x86_64-mont5.pl revision 295009
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# August 2011.
11#
12# Companion to x86_64-mont.pl that optimizes cache-timing attack
13# countermeasures. The subroutines are produced by replacing bp[i]
14# references in their x86_64-mont.pl counterparts with cache-neutral
15# references to powers table computed in BN_mod_exp_mont_consttime.
16# In addition subroutine that scatters elements of the powers table
17# is implemented, so that scatter-/gathering can be tuned without
18# bn_exp.c modifications.
19
20# August 2013.
21#
22# Add MULX/AD*X code paths and additional interfaces to optimize for
23# branch prediction unit. For input lengths that are multiples of 8
24# the np argument is not just modulus value, but one interleaved
25# with 0. This is to optimize post-condition...
26
# Command line is "[flavour] output". When only one argument is given and
# it contains a dot, it is taken to be the output file name and $flavour
# is left undefined.
27$flavour = shift;
28$output  = shift;
29if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
30
# $win64 selects the Win64-specific code emitted further down (different
# stack offset of the 7th argument, %xmm6/%xmm7 spill/reload). It is set
# for nasm/masm/mingw64 flavours or when the output file ends in ".asm".
31$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
32
# Find the x86_64-xlate.pl translator: first in the directory this script
# was invoked from ($0 with the file name stripped), then under
# ../../perlasm relative to it; give up if neither file exists.
33$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
34( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
35( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
36die "can't locate x86_64-xlate.pl";
37
# Route everything this script prints on STDOUT through the perlasm
# translator, run with the same perl binary ($^X). Abort with a
# diagnostic if the translator process cannot be started, rather than
# carrying on and silently producing no output.
38open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
39*STDOUT=*OUT;
40
# Decide whether the assembler in use is recent enough for the MULX/AD*X
# code paths; $addx stays false/undefined unless one of the probes below
# succeeds. Each later probe runs only if $addx is still unset.
#
# Probe 1: GNU assembler reached through $ENV{CC}; requires version >= 2.23.
# NOTE(review): $1 is compared as a decimal number, so a "2.9" would
# compare greater than 2.23 -- confirm such old versions are irrelevant.
41if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
42		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
43	$addx = ($1>=2.23);
44}
45
# Probe 2 (Win64 only): NASM, selected by flavour or $ENV{ASM};
# requires version >= 2.10 (same decimal comparison as above).
46if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
47	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
48	$addx = ($1>=2.10);
49}
50
# Probe 3 (Win64 only): Microsoft ml64, selected by flavour or $ENV{ASM};
# requires major version >= 12.
51if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
52	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
53	$addx = ($1>=12);
54}
55
# Probe 4: clang/LLVM-based $ENV{CC}. Major and minor are folded into
# major + minor/100 so that 3.10 sorts above 3.3; requires >= 3.3.
56if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
57	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
58	$addx = ($ver>=3.03);
59}
60
61# int bn_mul_mont_gather5(
62$rp="%rdi";	# BN_ULONG *rp,
63$ap="%rsi";	# const BN_ULONG *ap,
64$bp="%rdx";	# const BN_ULONG *bp,
65$np="%rcx";	# const BN_ULONG *np,
66$n0="%r8";	# const BN_ULONG *n0,
67$num="%r9";	# int num,
68		# int idx);	# 0 to 2^5-1, "index" in $bp holding
69				# pre-computed powers of a', interlaced
70				# in such manner that b[0] is $bp[idx],
71				# b[1] is [2^5+idx], etc.
72$lo0="%r10";
73$hi0="%r11";
74$hi1="%r13";
75$i="%r14";
76$j="%r15";
77$m0="%rbx";
78$m1="%rbp";
79
# bn_mul_mont_gather5 entry point. If num is not a multiple of 8 the
# generic code at .Lmul_enter is used; otherwise control is transferred
# to .Lmul4x_enter.
80$code=<<___;
81.text
82
83.extern	OPENSSL_ia32cap_P
84
85.globl	bn_mul_mont_gather5
86.type	bn_mul_mont_gather5,\@function,6
87.align	64
88bn_mul_mont_gather5:
89	test	\$7,${num}d
90	jnz	.Lmul_enter
91___
# Only when the MULX/AD*X paths are being generated: preload the dword at
# OPENSSL_ia32cap_P+8 into %r11d, which .Lmul4x_enter tests before
# choosing between the mul4x and mulx4x implementations.
92$code.=<<___ if ($addx);
93	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
94___
# Generic path: remember the incoming %rsp in %rax, fetch the 7th
# argument from the caller's stack (offset depends on the ABI) and save
# the six general-purpose registers this routine uses.
95$code.=<<___;
96	jmp	.Lmul4x_enter
97
98.align	16
99.Lmul_enter:
100	mov	${num}d,${num}d
101	mov	%rsp,%rax
102	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
103	push	%rbx
104	push	%rbp
105	push	%r12
106	push	%r13
107	push	%r14
108	push	%r15
109___
# Win64 only: reserve 0x28 bytes and spill %xmm6/%xmm7, which the gather
# code below overwrites with selection masks; they are reloaded from
# -88/-72 relative to the saved %rsp in the epilogue.
110$code.=<<___ if ($win64);
111	lea	-0x28(%rsp),%rsp
112	movaps	%xmm6,(%rsp)
113	movaps	%xmm7,0x10(%rsp)
114___
# Carve the temporary vector tp[num+2] out of the stack, align it down to
# a 1024-byte boundary and keep the original %rsp in its last slot so the
# epilogue can find the saved registers again.
115$code.=<<___;
116	lea	2($num),%r11
117	neg	%r11
118	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
119	and	\$-1024,%rsp		# minimize TLB usage
120
121	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
122.Lmul_body:
123	mov	$bp,%r12		# reassign $bp
124___
125		$bp="%r12";
126		$STRIDE=2**5*8;		# 5 is "window size"
127		$N=$STRIDE/4;		# should match cache line size
128$code.=<<___;
129	mov	%r10,%r11
130	shr	\$`log($N/8)/log(2)`,%r10
131	and	\$`$N/8-1`,%r11
132	not	%r10
133	lea	.Lmagic_masks(%rip),%rax
134	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
135	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
136	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
137	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
138	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
139	movq	24(%rax,%r10,8),%xmm7
140
141	movq	`0*$STRIDE/4-96`($bp),%xmm0
142	movq	`1*$STRIDE/4-96`($bp),%xmm1
143	pand	%xmm4,%xmm0
144	movq	`2*$STRIDE/4-96`($bp),%xmm2
145	pand	%xmm5,%xmm1
146	movq	`3*$STRIDE/4-96`($bp),%xmm3
147	pand	%xmm6,%xmm2
148	por	%xmm1,%xmm0
149	pand	%xmm7,%xmm3
150	por	%xmm2,%xmm0
151	lea	$STRIDE($bp),$bp
152	por	%xmm3,%xmm0
153
154	movq	%xmm0,$m0		# m0=bp[0]
155
156	mov	($n0),$n0		# pull n0[0] value
157	mov	($ap),%rax
158
159	xor	$i,$i			# i=0
160	xor	$j,$j			# j=0
161
162	movq	`0*$STRIDE/4-96`($bp),%xmm0
163	movq	`1*$STRIDE/4-96`($bp),%xmm1
164	pand	%xmm4,%xmm0
165	movq	`2*$STRIDE/4-96`($bp),%xmm2
166	pand	%xmm5,%xmm1
167
168	mov	$n0,$m1
169	mulq	$m0			# ap[0]*bp[0]
170	mov	%rax,$lo0
171	mov	($np),%rax
172
173	movq	`3*$STRIDE/4-96`($bp),%xmm3
174	pand	%xmm6,%xmm2
175	por	%xmm1,%xmm0
176	pand	%xmm7,%xmm3
177
178	imulq	$lo0,$m1		# "tp[0]"*n0
179	mov	%rdx,$hi0
180
181	por	%xmm2,%xmm0
182	lea	$STRIDE($bp),$bp
183	por	%xmm3,%xmm0
184
185	mulq	$m1			# np[0]*m1
186	add	%rax,$lo0		# discarded
187	mov	8($ap),%rax
188	adc	\$0,%rdx
189	mov	%rdx,$hi1
190
191	lea	1($j),$j		# j++
192	jmp	.L1st_enter
193
194.align	16
195.L1st:
196	add	%rax,$hi1
197	mov	($ap,$j,8),%rax
198	adc	\$0,%rdx
199	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
200	mov	$lo0,$hi0
201	adc	\$0,%rdx
202	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
203	mov	%rdx,$hi1
204
205.L1st_enter:
206	mulq	$m0			# ap[j]*bp[0]
207	add	%rax,$hi0
208	mov	($np,$j,8),%rax
209	adc	\$0,%rdx
210	lea	1($j),$j		# j++
211	mov	%rdx,$lo0
212
213	mulq	$m1			# np[j]*m1
214	cmp	$num,$j
215	jne	.L1st
216
217	movq	%xmm0,$m0		# bp[1]
218
219	add	%rax,$hi1
220	mov	($ap),%rax		# ap[0]
221	adc	\$0,%rdx
222	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
223	adc	\$0,%rdx
224	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
225	mov	%rdx,$hi1
226	mov	$lo0,$hi0
227
228	xor	%rdx,%rdx
229	add	$hi0,$hi1
230	adc	\$0,%rdx
231	mov	$hi1,-8(%rsp,$num,8)
232	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
233
234	lea	1($i),$i		# i++
235	jmp	.Louter
236.align	16
237.Louter:
238	xor	$j,$j			# j=0
239	mov	$n0,$m1
240	mov	(%rsp),$lo0
241
242	movq	`0*$STRIDE/4-96`($bp),%xmm0
243	movq	`1*$STRIDE/4-96`($bp),%xmm1
244	pand	%xmm4,%xmm0
245	movq	`2*$STRIDE/4-96`($bp),%xmm2
246	pand	%xmm5,%xmm1
247
248	mulq	$m0			# ap[0]*bp[i]
249	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
250	mov	($np),%rax
251	adc	\$0,%rdx
252
253	movq	`3*$STRIDE/4-96`($bp),%xmm3
254	pand	%xmm6,%xmm2
255	por	%xmm1,%xmm0
256	pand	%xmm7,%xmm3
257
258	imulq	$lo0,$m1		# tp[0]*n0
259	mov	%rdx,$hi0
260
261	por	%xmm2,%xmm0
262	lea	$STRIDE($bp),$bp
263	por	%xmm3,%xmm0
264
265	mulq	$m1			# np[0]*m1
266	add	%rax,$lo0		# discarded
267	mov	8($ap),%rax
268	adc	\$0,%rdx
269	mov	8(%rsp),$lo0		# tp[1]
270	mov	%rdx,$hi1
271
272	lea	1($j),$j		# j++
273	jmp	.Linner_enter
274
275.align	16
276.Linner:
277	add	%rax,$hi1
278	mov	($ap,$j,8),%rax
279	adc	\$0,%rdx
280	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
281	mov	(%rsp,$j,8),$lo0
282	adc	\$0,%rdx
283	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
284	mov	%rdx,$hi1
285
286.Linner_enter:
287	mulq	$m0			# ap[j]*bp[i]
288	add	%rax,$hi0
289	mov	($np,$j,8),%rax
290	adc	\$0,%rdx
291	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
292	mov	%rdx,$hi0
293	adc	\$0,$hi0
294	lea	1($j),$j		# j++
295
296	mulq	$m1			# np[j]*m1
297	cmp	$num,$j
298	jne	.Linner
299
300	movq	%xmm0,$m0		# bp[i+1]
301
302	add	%rax,$hi1
303	mov	($ap),%rax		# ap[0]
304	adc	\$0,%rdx
305	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
306	mov	(%rsp,$j,8),$lo0
307	adc	\$0,%rdx
308	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
309	mov	%rdx,$hi1
310
311	xor	%rdx,%rdx
312	add	$hi0,$hi1
313	adc	\$0,%rdx
314	add	$lo0,$hi1		# pull upmost overflow bit
315	adc	\$0,%rdx
316	mov	$hi1,-8(%rsp,$num,8)
317	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
318
319	lea	1($i),$i		# i++
320	cmp	$num,$i
321	jb	.Louter
322
323	xor	$i,$i			# i=0 and clear CF!
324	mov	(%rsp),%rax		# tp[0]
325	lea	(%rsp),$ap		# borrow ap for tp
326	mov	$num,$j			# j=num
327	jmp	.Lsub
328.align	16
329.Lsub:	sbb	($np,$i,8),%rax
330	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
331	mov	8($ap,$i,8),%rax	# tp[i+1]
332	lea	1($i),$i		# i++
333	dec	$j			# doesn't affect CF!
334	jnz	.Lsub
335
336	sbb	\$0,%rax		# handle upmost overflow bit
337	xor	$i,$i
338	and	%rax,$ap
339	not	%rax
340	mov	$rp,$np
341	and	%rax,$np
342	mov	$num,$j			# j=num
343	or	$np,$ap			# ap=borrow?tp:rp
344.align	16
345.Lcopy:					# copy or in-place refresh
346	mov	($ap,$i,8),%rax
347	mov	$i,(%rsp,$i,8)		# zap temporary vector
348	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
349	lea	1($i),$i
350	sub	\$1,$j
351	jnz	.Lcopy
352
353	mov	8(%rsp,$num,8),%rsi	# restore %rsp
354	mov	\$1,%rax
355___
356$code.=<<___ if ($win64);
357	movaps	-88(%rsi),%xmm6
358	movaps	-72(%rsi),%xmm7
359___
360$code.=<<___;
361	mov	-48(%rsi),%r15
362	mov	-40(%rsi),%r14
363	mov	-32(%rsi),%r13
364	mov	-24(%rsi),%r12
365	mov	-16(%rsi),%rbp
366	mov	-8(%rsi),%rbx
367	lea	(%rsi),%rsp
368.Lmul_epilogue:
369	ret
370.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
371___
372{{{
373my @A=("%r10","%r11");
374my @N=("%r13","%rdi");
375$code.=<<___;
376.type	bn_mul4x_mont_gather5,\@function,6
377.align	32
378bn_mul4x_mont_gather5:
379.Lmul4x_enter:
380___
381$code.=<<___ if ($addx);
382	and	\$0x80100,%r11d
383	cmp	\$0x80100,%r11d
384	je	.Lmulx4x_enter
385___
386$code.=<<___;
387	.byte	0x67
388	mov	%rsp,%rax
389	push	%rbx
390	push	%rbp
391	push	%r12
392	push	%r13
393	push	%r14
394	push	%r15
395___
396$code.=<<___ if ($win64);
397	lea	-0x28(%rsp),%rsp
398	movaps	%xmm6,(%rsp)
399	movaps	%xmm7,0x10(%rsp)
400___
401$code.=<<___;
402	.byte	0x67
403	mov	${num}d,%r10d
404	shl	\$3,${num}d
405	shl	\$3+2,%r10d		# 4*$num
406	neg	$num			# -$num
407
408	##############################################################
409	# ensure that stack frame doesn't alias with $aptr+4*$num
410	# modulo 4096, which covers ret[num], am[num] and n[2*num]
411	# (see bn_exp.c). this is done to allow memory disambiguation
412	# logic do its magic. [excessive frame is allocated in order
413	# to allow bn_from_mont8x to clear it.]
414	#
415	lea	-64(%rsp,$num,2),%r11
416	sub	$ap,%r11
417	and	\$4095,%r11
418	cmp	%r11,%r10
419	jb	.Lmul4xsp_alt
420	sub	%r11,%rsp		# align with $ap
421	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
422	jmp	.Lmul4xsp_done
423
424.align	32
425.Lmul4xsp_alt:
426	lea	4096-64(,$num,2),%r10
427	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
428	sub	%r10,%r11
429	mov	\$0,%r10
430	cmovc	%r10,%r11
431	sub	%r11,%rsp
432.Lmul4xsp_done:
433	and	\$-64,%rsp
434	neg	$num
435
436	mov	%rax,40(%rsp)
437.Lmul4x_body:
438
439	call	mul4x_internal
440
441	mov	40(%rsp),%rsi		# restore %rsp
442	mov	\$1,%rax
443___
444$code.=<<___ if ($win64);
445	movaps	-88(%rsi),%xmm6
446	movaps	-72(%rsi),%xmm7
447___
448$code.=<<___;
449	mov	-48(%rsi),%r15
450	mov	-40(%rsi),%r14
451	mov	-32(%rsi),%r13
452	mov	-24(%rsi),%r12
453	mov	-16(%rsi),%rbp
454	mov	-8(%rsi),%rbx
455	lea	(%rsi),%rsp
456.Lmul4x_epilogue:
457	ret
458.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
459
460.type	mul4x_internal,\@abi-omnipotent
461.align	32
462mul4x_internal:
463	shl	\$5,$num
464	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
465	lea	256(%rdx,$num),%r13
466	shr	\$5,$num		# restore $num
467___
468		$bp="%r12";
469		$STRIDE=2**5*8;		# 5 is "window size"
470		$N=$STRIDE/4;		# should match cache line size
471		$tp=$i;
472$code.=<<___;
473	mov	%r10,%r11
474	shr	\$`log($N/8)/log(2)`,%r10
475	and	\$`$N/8-1`,%r11
476	not	%r10
477	lea	.Lmagic_masks(%rip),%rax
478	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
479	lea	96(%rdx,%r11,8),$bp	# pointer within 1st cache line
480	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
481	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
482	add	\$7,%r11
483	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
484	movq	24(%rax,%r10,8),%xmm7
485	and	\$7,%r11
486
487	movq	`0*$STRIDE/4-96`($bp),%xmm0
488	lea	$STRIDE($bp),$tp	# borrow $tp
489	movq	`1*$STRIDE/4-96`($bp),%xmm1
490	pand	%xmm4,%xmm0
491	movq	`2*$STRIDE/4-96`($bp),%xmm2
492	pand	%xmm5,%xmm1
493	movq	`3*$STRIDE/4-96`($bp),%xmm3
494	pand	%xmm6,%xmm2
495	.byte	0x67
496	por	%xmm1,%xmm0
497	movq	`0*$STRIDE/4-96`($tp),%xmm1
498	.byte	0x67
499	pand	%xmm7,%xmm3
500	.byte	0x67
501	por	%xmm2,%xmm0
502	movq	`1*$STRIDE/4-96`($tp),%xmm2
503	.byte	0x67
504	pand	%xmm4,%xmm1
505	.byte	0x67
506	por	%xmm3,%xmm0
507	movq	`2*$STRIDE/4-96`($tp),%xmm3
508
509	movq	%xmm0,$m0		# m0=bp[0]
510	movq	`3*$STRIDE/4-96`($tp),%xmm0
511	mov	%r13,16+8(%rsp)		# save end of b[num]
512	mov	$rp, 56+8(%rsp)		# save $rp
513
514	mov	($n0),$n0		# pull n0[0] value
515	mov	($ap),%rax
516	lea	($ap,$num),$ap		# end of a[num]
517	neg	$num
518
519	mov	$n0,$m1
520	mulq	$m0			# ap[0]*bp[0]
521	mov	%rax,$A[0]
522	mov	($np),%rax
523
524	pand	%xmm5,%xmm2
525	pand	%xmm6,%xmm3
526	por	%xmm2,%xmm1
527
528	imulq	$A[0],$m1		# "tp[0]"*n0
529	##############################################################
530	# $tp is chosen so that writing to top-most element of the
531	# vector occurs just "above" references to powers table,
532	# "above" modulo cache-line size, which effectively precludes
533	# possibility of memory disambiguation logic failure when
534	# accessing the table.
535	#
536	lea	64+8(%rsp,%r11,8),$tp
537	mov	%rdx,$A[1]
538
539	pand	%xmm7,%xmm0
540	por	%xmm3,%xmm1
541	lea	2*$STRIDE($bp),$bp
542	por	%xmm1,%xmm0
543
544	mulq	$m1			# np[0]*m1
545	add	%rax,$A[0]		# discarded
546	mov	8($ap,$num),%rax
547	adc	\$0,%rdx
548	mov	%rdx,$N[1]
549
550	mulq	$m0
551	add	%rax,$A[1]
552	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
553	adc	\$0,%rdx
554	mov	%rdx,$A[0]
555
556	mulq	$m1
557	add	%rax,$N[1]
558	mov	16($ap,$num),%rax
559	adc	\$0,%rdx
560	add	$A[1],$N[1]
561	lea	4*8($num),$j		# j=4
562	lea	16*4($np),$np
563	adc	\$0,%rdx
564	mov	$N[1],($tp)
565	mov	%rdx,$N[0]
566	jmp	.L1st4x
567
568.align	32
569.L1st4x:
570	mulq	$m0			# ap[j]*bp[0]
571	add	%rax,$A[0]
572	mov	-16*2($np),%rax
573	lea	32($tp),$tp
574	adc	\$0,%rdx
575	mov	%rdx,$A[1]
576
577	mulq	$m1			# np[j]*m1
578	add	%rax,$N[0]
579	mov	-8($ap,$j),%rax
580	adc	\$0,%rdx
581	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
582	adc	\$0,%rdx
583	mov	$N[0],-24($tp)		# tp[j-1]
584	mov	%rdx,$N[1]
585
586	mulq	$m0			# ap[j]*bp[0]
587	add	%rax,$A[1]
588	mov	-16*1($np),%rax
589	adc	\$0,%rdx
590	mov	%rdx,$A[0]
591
592	mulq	$m1			# np[j]*m1
593	add	%rax,$N[1]
594	mov	($ap,$j),%rax
595	adc	\$0,%rdx
596	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
597	adc	\$0,%rdx
598	mov	$N[1],-16($tp)		# tp[j-1]
599	mov	%rdx,$N[0]
600
601	mulq	$m0			# ap[j]*bp[0]
602	add	%rax,$A[0]
603	mov	16*0($np),%rax
604	adc	\$0,%rdx
605	mov	%rdx,$A[1]
606
607	mulq	$m1			# np[j]*m1
608	add	%rax,$N[0]
609	mov	8($ap,$j),%rax
610	adc	\$0,%rdx
611	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
612	adc	\$0,%rdx
613	mov	$N[0],-8($tp)		# tp[j-1]
614	mov	%rdx,$N[1]
615
616	mulq	$m0			# ap[j]*bp[0]
617	add	%rax,$A[1]
618	mov	16*1($np),%rax
619	adc	\$0,%rdx
620	mov	%rdx,$A[0]
621
622	mulq	$m1			# np[j]*m1
623	add	%rax,$N[1]
624	mov	16($ap,$j),%rax
625	adc	\$0,%rdx
626	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
627	lea	16*4($np),$np
628	adc	\$0,%rdx
629	mov	$N[1],($tp)		# tp[j-1]
630	mov	%rdx,$N[0]
631
632	add	\$32,$j			# j+=4
633	jnz	.L1st4x
634
635	mulq	$m0			# ap[j]*bp[0]
636	add	%rax,$A[0]
637	mov	-16*2($np),%rax
638	lea	32($tp),$tp
639	adc	\$0,%rdx
640	mov	%rdx,$A[1]
641
642	mulq	$m1			# np[j]*m1
643	add	%rax,$N[0]
644	mov	-8($ap),%rax
645	adc	\$0,%rdx
646	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
647	adc	\$0,%rdx
648	mov	$N[0],-24($tp)		# tp[j-1]
649	mov	%rdx,$N[1]
650
651	mulq	$m0			# ap[j]*bp[0]
652	add	%rax,$A[1]
653	mov	-16*1($np),%rax
654	adc	\$0,%rdx
655	mov	%rdx,$A[0]
656
657	mulq	$m1			# np[j]*m1
658	add	%rax,$N[1]
659	mov	($ap,$num),%rax		# ap[0]
660	adc	\$0,%rdx
661	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
662	adc	\$0,%rdx
663	mov	$N[1],-16($tp)		# tp[j-1]
664	mov	%rdx,$N[0]
665
666	movq	%xmm0,$m0		# bp[1]
667	lea	($np,$num,2),$np	# rewind $np
668
669	xor	$N[1],$N[1]
670	add	$A[0],$N[0]
671	adc	\$0,$N[1]
672	mov	$N[0],-8($tp)
673
674	jmp	.Louter4x
675
676.align	32
677.Louter4x:
678	mov	($tp,$num),$A[0]
679	mov	$n0,$m1
680	mulq	$m0			# ap[0]*bp[i]
681	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
682	mov	($np),%rax
683	adc	\$0,%rdx
684
685	movq	`0*$STRIDE/4-96`($bp),%xmm0
686	movq	`1*$STRIDE/4-96`($bp),%xmm1
687	pand	%xmm4,%xmm0
688	movq	`2*$STRIDE/4-96`($bp),%xmm2
689	pand	%xmm5,%xmm1
690	movq	`3*$STRIDE/4-96`($bp),%xmm3
691
692	imulq	$A[0],$m1		# tp[0]*n0
693	.byte	0x67
694	mov	%rdx,$A[1]
695	mov	$N[1],($tp)		# store upmost overflow bit
696
697	pand	%xmm6,%xmm2
698	por	%xmm1,%xmm0
699	pand	%xmm7,%xmm3
700	por	%xmm2,%xmm0
701	lea	($tp,$num),$tp		# rewind $tp
702	lea	$STRIDE($bp),$bp
703	por	%xmm3,%xmm0
704
705	mulq	$m1			# np[0]*m1
706	add	%rax,$A[0]		# "$N[0]", discarded
707	mov	8($ap,$num),%rax
708	adc	\$0,%rdx
709	mov	%rdx,$N[1]
710
711	mulq	$m0			# ap[j]*bp[i]
712	add	%rax,$A[1]
713	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
714	adc	\$0,%rdx
715	add	8($tp),$A[1]		# +tp[1]
716	adc	\$0,%rdx
717	mov	%rdx,$A[0]
718
719	mulq	$m1			# np[j]*m1
720	add	%rax,$N[1]
721	mov	16($ap,$num),%rax
722	adc	\$0,%rdx
723	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
724	lea	4*8($num),$j		# j=4
725	lea	16*4($np),$np
726	adc	\$0,%rdx
727	mov	%rdx,$N[0]
728	jmp	.Linner4x
729
730.align	32
731.Linner4x:
732	mulq	$m0			# ap[j]*bp[i]
733	add	%rax,$A[0]
734	mov	-16*2($np),%rax
735	adc	\$0,%rdx
736	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
737	lea	32($tp),$tp
738	adc	\$0,%rdx
739	mov	%rdx,$A[1]
740
741	mulq	$m1			# np[j]*m1
742	add	%rax,$N[0]
743	mov	-8($ap,$j),%rax
744	adc	\$0,%rdx
745	add	$A[0],$N[0]
746	adc	\$0,%rdx
747	mov	$N[1],-32($tp)		# tp[j-1]
748	mov	%rdx,$N[1]
749
750	mulq	$m0			# ap[j]*bp[i]
751	add	%rax,$A[1]
752	mov	-16*1($np),%rax
753	adc	\$0,%rdx
754	add	-8($tp),$A[1]
755	adc	\$0,%rdx
756	mov	%rdx,$A[0]
757
758	mulq	$m1			# np[j]*m1
759	add	%rax,$N[1]
760	mov	($ap,$j),%rax
761	adc	\$0,%rdx
762	add	$A[1],$N[1]
763	adc	\$0,%rdx
764	mov	$N[0],-24($tp)		# tp[j-1]
765	mov	%rdx,$N[0]
766
767	mulq	$m0			# ap[j]*bp[i]
768	add	%rax,$A[0]
769	mov	16*0($np),%rax
770	adc	\$0,%rdx
771	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
772	adc	\$0,%rdx
773	mov	%rdx,$A[1]
774
775	mulq	$m1			# np[j]*m1
776	add	%rax,$N[0]
777	mov	8($ap,$j),%rax
778	adc	\$0,%rdx
779	add	$A[0],$N[0]
780	adc	\$0,%rdx
781	mov	$N[1],-16($tp)		# tp[j-1]
782	mov	%rdx,$N[1]
783
784	mulq	$m0			# ap[j]*bp[i]
785	add	%rax,$A[1]
786	mov	16*1($np),%rax
787	adc	\$0,%rdx
788	add	8($tp),$A[1]
789	adc	\$0,%rdx
790	mov	%rdx,$A[0]
791
792	mulq	$m1			# np[j]*m1
793	add	%rax,$N[1]
794	mov	16($ap,$j),%rax
795	adc	\$0,%rdx
796	add	$A[1],$N[1]
797	lea	16*4($np),$np
798	adc	\$0,%rdx
799	mov	$N[0],-8($tp)		# tp[j-1]
800	mov	%rdx,$N[0]
801
802	add	\$32,$j			# j+=4
803	jnz	.Linner4x
804
805	mulq	$m0			# ap[j]*bp[i]
806	add	%rax,$A[0]
807	mov	-16*2($np),%rax
808	adc	\$0,%rdx
809	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
810	lea	32($tp),$tp
811	adc	\$0,%rdx
812	mov	%rdx,$A[1]
813
814	mulq	$m1			# np[j]*m1
815	add	%rax,$N[0]
816	mov	-8($ap),%rax
817	adc	\$0,%rdx
818	add	$A[0],$N[0]
819	adc	\$0,%rdx
820	mov	$N[1],-32($tp)		# tp[j-1]
821	mov	%rdx,$N[1]
822
823	mulq	$m0			# ap[j]*bp[i]
824	add	%rax,$A[1]
825	mov	$m1,%rax
826	mov	-16*1($np),$m1
827	adc	\$0,%rdx
828	add	-8($tp),$A[1]
829	adc	\$0,%rdx
830	mov	%rdx,$A[0]
831
832	mulq	$m1			# np[j]*m1
833	add	%rax,$N[1]
834	mov	($ap,$num),%rax		# ap[0]
835	adc	\$0,%rdx
836	add	$A[1],$N[1]
837	adc	\$0,%rdx
838	mov	$N[0],-24($tp)		# tp[j-1]
839	mov	%rdx,$N[0]
840
841	movq	%xmm0,$m0		# bp[i+1]
842	mov	$N[1],-16($tp)		# tp[j-1]
843	lea	($np,$num,2),$np	# rewind $np
844
845	xor	$N[1],$N[1]
846	add	$A[0],$N[0]
847	adc	\$0,$N[1]
848	add	($tp),$N[0]		# pull upmost overflow bit
849	adc	\$0,$N[1]		# upmost overflow bit
850	mov	$N[0],-8($tp)
851
852	cmp	16+8(%rsp),$bp
853	jb	.Louter4x
854___
855if (1) {
856$code.=<<___;
857	sub	$N[0],$m1		# compare top-most words
858	adc	$j,$j			# $j is zero
859	or	$j,$N[1]
860	xor	\$1,$N[1]
861	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
862	lea	($np,$N[1],8),%rbp	# nptr in .sqr4x_sub
863	mov	%r9,%rcx
864	sar	\$3+2,%rcx		# cf=0
865	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
866	jmp	.Lsqr4x_sub
867___
868} else {
869my @ri=("%rax",$bp,$m0,$m1);
870my $rp="%rdx";
871$code.=<<___
872	xor	\$1,$N[1]
873	lea	($tp,$num),$tp		# rewind $tp
874	sar	\$5,$num		# cf=0
875	lea	($np,$N[1],8),$np
876	mov	56+8(%rsp),$rp		# restore $rp
877	jmp	.Lsub4x
878
879.align	32
880.Lsub4x:
881	.byte	0x66
882	mov	8*0($tp),@ri[0]
883	mov	8*1($tp),@ri[1]
884	.byte	0x66
885	sbb	16*0($np),@ri[0]
886	mov	8*2($tp),@ri[2]
887	sbb	16*1($np),@ri[1]
888	mov	3*8($tp),@ri[3]
889	lea	4*8($tp),$tp
890	sbb	16*2($np),@ri[2]
891	mov	@ri[0],8*0($rp)
892	sbb	16*3($np),@ri[3]
893	lea	16*4($np),$np
894	mov	@ri[1],8*1($rp)
895	mov	@ri[2],8*2($rp)
896	mov	@ri[3],8*3($rp)
897	lea	8*4($rp),$rp
898
899	inc	$num
900	jnz	.Lsub4x
901
902	ret
903___
904}
905$code.=<<___;
906.size	mul4x_internal,.-mul4x_internal
907___
908}}}
909{{{
910######################################################################
911# void bn_power5(
# Register assignment for bn_power5's arguments. The seventh argument
# (pwr) is not held in a register; mul4x_internal loads it from the
# caller's stack.
912my $rptr="%rdi";	# BN_ULONG *rptr,
913my $aptr="%rsi";	# const BN_ULONG *aptr,
914my $bptr="%rdx";	# const void *table,
915my $nptr="%rcx";	# const BN_ULONG *nptr,
916my $n0  ="%r8";		# const BN_ULONG *n0,
917my $num ="%r9";		# int num, has to be divisible by 8
918			# int pwr);
919
# Working registers of the squaring code. Note the aliasing: $j shares
# %rcx with $nptr and $tptr shares %rdi with $rptr, which is why
# bn_power5 parks $rptr/$nptr in %xmm1/%xmm2 before calling the
# squaring routine.
920my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
# Accumulator register pairs: @A0 collects products by $a0, @A1 products
# by $a1.
921my @A0=("%r10","%r11");
922my @A1=("%r12","%r13");
# $a0/$a1 hold the two current multiplier words of a[]; $ai holds the
# next a[] word being multiplied by them.
923my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
924
925$code.=<<___;
926.globl	bn_power5
927.type	bn_power5,\@function,6
928.align	32
929bn_power5:
930___
931$code.=<<___ if ($addx);
932	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
933	and	\$0x80100,%r11d
934	cmp	\$0x80100,%r11d
935	je	.Lpowerx5_enter
936___
937$code.=<<___;
938	mov	%rsp,%rax
939	push	%rbx
940	push	%rbp
941	push	%r12
942	push	%r13
943	push	%r14
944	push	%r15
945___
946$code.=<<___ if ($win64);
947	lea	-0x28(%rsp),%rsp
948	movaps	%xmm6,(%rsp)
949	movaps	%xmm7,0x10(%rsp)
950___
951$code.=<<___;
952	mov	${num}d,%r10d
953	shl	\$3,${num}d		# convert $num to bytes
954	shl	\$3+2,%r10d		# 4*$num
955	neg	$num
956	mov	($n0),$n0		# *n0
957
958	##############################################################
959	# ensure that stack frame doesn't alias with $aptr+4*$num
960	# modulo 4096, which covers ret[num], am[num] and n[2*num]
961	# (see bn_exp.c). this is done to allow memory disambiguation
962	# logic do its magic.
963	#
964	lea	-64(%rsp,$num,2),%r11
965	sub	$aptr,%r11
966	and	\$4095,%r11
967	cmp	%r11,%r10
968	jb	.Lpwr_sp_alt
969	sub	%r11,%rsp		# align with $aptr
970	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
971	jmp	.Lpwr_sp_done
972
973.align	32
974.Lpwr_sp_alt:
975	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
976	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
977	sub	%r10,%r11
978	mov	\$0,%r10
979	cmovc	%r10,%r11
980	sub	%r11,%rsp
981.Lpwr_sp_done:
982	and	\$-64,%rsp
983	mov	$num,%r10
984	neg	$num
985
986	##############################################################
987	# Stack layout
988	#
989	# +0	saved $num, used in reduction section
990	# +8	&t[2*$num], used in reduction section
991	# +32	saved *n0
992	# +40	saved %rsp
993	# +48	t[2*$num]
994	#
995	mov	$n0,  32(%rsp)
996	mov	%rax, 40(%rsp)		# save original %rsp
997.Lpower5_body:
998	movq	$rptr,%xmm1		# save $rptr
999	movq	$nptr,%xmm2		# save $nptr
1000	movq	%r10, %xmm3		# -$num
1001	movq	$bptr,%xmm4
1002
1003	call	__bn_sqr8x_internal
1004	call	__bn_sqr8x_internal
1005	call	__bn_sqr8x_internal
1006	call	__bn_sqr8x_internal
1007	call	__bn_sqr8x_internal
1008
1009	movq	%xmm2,$nptr
1010	movq	%xmm4,$bptr
1011	mov	$aptr,$rptr
1012	mov	40(%rsp),%rax
1013	lea	32(%rsp),$n0
1014
1015	call	mul4x_internal
1016
1017	mov	40(%rsp),%rsi		# restore %rsp
1018	mov	\$1,%rax
___
# Win64 only: the prologue spilled %xmm6/%xmm7 at -88/-72 relative to the
# original %rsp (now in %rsi), and mul4x_internal overwrites both with
# gather masks. Reload them before returning, exactly as
# bn_mul_mont_gather5 and bn_mul4x_mont_gather5 do; without this the
# caller's %xmm6/%xmm7, which Win64 requires to be preserved, are lost.
$code.=<<___ if ($win64);
	movaps	-88(%rsi),%xmm6
	movaps	-72(%rsi),%xmm7
___
# Common epilogue: reload the six general-purpose registers pushed in the
# prologue and switch back to the caller's stack pointer.
$code.=<<___;
1019	mov	-48(%rsi),%r15
1020	mov	-40(%rsi),%r14
1021	mov	-32(%rsi),%r13
1022	mov	-24(%rsi),%r12
1023	mov	-16(%rsi),%rbp
1024	mov	-8(%rsi),%rbx
1025	lea	(%rsi),%rsp
1026.Lpower5_epilogue:
1027	ret
1028.size	bn_power5,.-bn_power5
1029
1030.globl	bn_sqr8x_internal
1031.hidden	bn_sqr8x_internal
1032.type	bn_sqr8x_internal,\@abi-omnipotent
1033.align	32
1034bn_sqr8x_internal:
1035__bn_sqr8x_internal:
1036	##############################################################
1037	# Squaring part:
1038	#
1039	# a) multiply-n-add everything but a[i]*a[i];
1040	# b) shift result of a) by 1 to the left and accumulate
1041	#    a[i]*a[i] products;
1042	#
1043	##############################################################
1044	#                                                     a[1]a[0]
1045	#                                                 a[2]a[0]
1046	#                                             a[3]a[0]
1047	#                                             a[2]a[1]
1048	#                                         a[4]a[0]
1049	#                                         a[3]a[1]
1050	#                                     a[5]a[0]
1051	#                                     a[4]a[1]
1052	#                                     a[3]a[2]
1053	#                                 a[6]a[0]
1054	#                                 a[5]a[1]
1055	#                                 a[4]a[2]
1056	#                             a[7]a[0]
1057	#                             a[6]a[1]
1058	#                             a[5]a[2]
1059	#                             a[4]a[3]
1060	#                         a[7]a[1]
1061	#                         a[6]a[2]
1062	#                         a[5]a[3]
1063	#                     a[7]a[2]
1064	#                     a[6]a[3]
1065	#                     a[5]a[4]
1066	#                 a[7]a[3]
1067	#                 a[6]a[4]
1068	#             a[7]a[4]
1069	#             a[6]a[5]
1070	#         a[7]a[5]
1071	#     a[7]a[6]
1072	#                                                     a[1]a[0]
1073	#                                                 a[2]a[0]
1074	#                                             a[3]a[0]
1075	#                                         a[4]a[0]
1076	#                                     a[5]a[0]
1077	#                                 a[6]a[0]
1078	#                             a[7]a[0]
1079	#                                             a[2]a[1]
1080	#                                         a[3]a[1]
1081	#                                     a[4]a[1]
1082	#                                 a[5]a[1]
1083	#                             a[6]a[1]
1084	#                         a[7]a[1]
1085	#                                     a[3]a[2]
1086	#                                 a[4]a[2]
1087	#                             a[5]a[2]
1088	#                         a[6]a[2]
1089	#                     a[7]a[2]
1090	#                             a[4]a[3]
1091	#                         a[5]a[3]
1092	#                     a[6]a[3]
1093	#                 a[7]a[3]
1094	#                     a[5]a[4]
1095	#                 a[6]a[4]
1096	#             a[7]a[4]
1097	#             a[6]a[5]
1098	#         a[7]a[5]
1099	#     a[7]a[6]
1100	#                                                         a[0]a[0]
1101	#                                                 a[1]a[1]
1102	#                                         a[2]a[2]
1103	#                                 a[3]a[3]
1104	#                         a[4]a[4]
1105	#                 a[5]a[5]
1106	#         a[6]a[6]
1107	# a[7]a[7]
1108
1109	lea	32(%r10),$i		# $i=-($num-32)
1110	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
1111
1112	mov	$num,$j			# $j=$num
1113
1114					# comments apply to $num==8 case
1115	mov	-32($aptr,$i),$a0	# a[0]
1116	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1117	mov	-24($aptr,$i),%rax	# a[1]
1118	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1119	mov	-16($aptr,$i),$ai	# a[2]
1120	mov	%rax,$a1
1121
1122	mul	$a0			# a[1]*a[0]
1123	mov	%rax,$A0[0]		# a[1]*a[0]
1124	 mov	$ai,%rax		# a[2]
1125	mov	%rdx,$A0[1]
1126	mov	$A0[0],-24($tptr,$i)	# t[1]
1127
1128	mul	$a0			# a[2]*a[0]
1129	add	%rax,$A0[1]
1130	 mov	$ai,%rax
1131	adc	\$0,%rdx
1132	mov	$A0[1],-16($tptr,$i)	# t[2]
1133	mov	%rdx,$A0[0]
1134
1135
1136	 mov	-8($aptr,$i),$ai	# a[3]
1137	mul	$a1			# a[2]*a[1]
1138	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
1139	 mov	$ai,%rax
1140	mov	%rdx,$A1[1]
1141
1142	 lea	($i),$j
1143	mul	$a0			# a[3]*a[0]
1144	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1145	 mov	$ai,%rax
1146	mov	%rdx,$A0[1]
1147	adc	\$0,$A0[1]
1148	add	$A1[0],$A0[0]
1149	adc	\$0,$A0[1]
1150	mov	$A0[0],-8($tptr,$j)	# t[3]
1151	jmp	.Lsqr4x_1st
1152
1153.align	32
1154.Lsqr4x_1st:
1155	 mov	($aptr,$j),$ai		# a[4]
1156	mul	$a1			# a[3]*a[1]
1157	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
1158	 mov	$ai,%rax
1159	mov	%rdx,$A1[0]
1160	adc	\$0,$A1[0]
1161
1162	mul	$a0			# a[4]*a[0]
1163	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
1164	 mov	$ai,%rax		# a[3]
1165	 mov	8($aptr,$j),$ai		# a[5]
1166	mov	%rdx,$A0[0]
1167	adc	\$0,$A0[0]
1168	add	$A1[1],$A0[1]
1169	adc	\$0,$A0[0]
1170
1171
1172	mul	$a1			# a[4]*a[3]
1173	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
1174	 mov	$ai,%rax
1175	 mov	$A0[1],($tptr,$j)	# t[4]
1176	mov	%rdx,$A1[1]
1177	adc	\$0,$A1[1]
1178
1179	mul	$a0			# a[5]*a[2]
1180	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
1181	 mov	$ai,%rax
1182	 mov	16($aptr,$j),$ai	# a[6]
1183	mov	%rdx,$A0[1]
1184	adc	\$0,$A0[1]
1185	add	$A1[0],$A0[0]
1186	adc	\$0,$A0[1]
1187
1188	mul	$a1			# a[5]*a[3]
1189	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
1190	 mov	$ai,%rax
1191	 mov	$A0[0],8($tptr,$j)	# t[5]
1192	mov	%rdx,$A1[0]
1193	adc	\$0,$A1[0]
1194
1195	mul	$a0			# a[6]*a[2]
1196	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
1197	 mov	$ai,%rax		# a[3]
1198	 mov	24($aptr,$j),$ai	# a[7]
1199	mov	%rdx,$A0[0]
1200	adc	\$0,$A0[0]
1201	add	$A1[1],$A0[1]
1202	adc	\$0,$A0[0]
1203
1204
1205	mul	$a1			# a[6]*a[5]
1206	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
1207	 mov	$ai,%rax
1208	 mov	$A0[1],16($tptr,$j)	# t[6]
1209	mov	%rdx,$A1[1]
1210	adc	\$0,$A1[1]
1211	 lea	32($j),$j
1212
1213	mul	$a0			# a[7]*a[4]
1214	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
1215	 mov	$ai,%rax
1216	mov	%rdx,$A0[1]
1217	adc	\$0,$A0[1]
1218	add	$A1[0],$A0[0]
1219	adc	\$0,$A0[1]
1220	mov	$A0[0],-8($tptr,$j)	# t[7]
1221
1222	cmp	\$0,$j
1223	jne	.Lsqr4x_1st
1224
1225	mul	$a1			# a[7]*a[5]
1226	add	%rax,$A1[1]
1227	lea	16($i),$i
1228	adc	\$0,%rdx
1229	add	$A0[1],$A1[1]
1230	adc	\$0,%rdx
1231
1232	mov	$A1[1],($tptr)		# t[8]
1233	mov	%rdx,$A1[0]
1234	mov	%rdx,8($tptr)		# t[9]
1235	jmp	.Lsqr4x_outer
1236
1237.align	32
1238.Lsqr4x_outer:				# comments apply to $num==6 case
1239	mov	-32($aptr,$i),$a0	# a[0]
1240	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1241	mov	-24($aptr,$i),%rax	# a[1]
1242	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1243	mov	-16($aptr,$i),$ai	# a[2]
1244	mov	%rax,$a1
1245
1246	mul	$a0			# a[1]*a[0]
1247	mov	-24($tptr,$i),$A0[0]	# t[1]
1248	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
1249	 mov	$ai,%rax		# a[2]
1250	adc	\$0,%rdx
1251	mov	$A0[0],-24($tptr,$i)	# t[1]
1252	mov	%rdx,$A0[1]
1253
1254	mul	$a0			# a[2]*a[0]
1255	add	%rax,$A0[1]
1256	 mov	$ai,%rax
1257	adc	\$0,%rdx
1258	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
1259	mov	%rdx,$A0[0]
1260	adc	\$0,$A0[0]
1261	mov	$A0[1],-16($tptr,$i)	# t[2]
1262
1263	xor	$A1[0],$A1[0]
1264
1265	 mov	-8($aptr,$i),$ai	# a[3]
1266	mul	$a1			# a[2]*a[1]
1267	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
1268	 mov	$ai,%rax
1269	adc	\$0,%rdx
1270	add	-8($tptr,$i),$A1[0]
1271	mov	%rdx,$A1[1]
1272	adc	\$0,$A1[1]
1273
1274	mul	$a0			# a[3]*a[0]
1275	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1276	 mov	$ai,%rax
1277	adc	\$0,%rdx
1278	add	$A1[0],$A0[0]
1279	mov	%rdx,$A0[1]
1280	adc	\$0,$A0[1]
1281	mov	$A0[0],-8($tptr,$i)	# t[3]
1282
1283	lea	($i),$j
1284	jmp	.Lsqr4x_inner
1285
1286.align	32
1287.Lsqr4x_inner:
1288	 mov	($aptr,$j),$ai		# a[4]
1289	mul	$a1			# a[3]*a[1]
1290	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
1291	 mov	$ai,%rax
1292	mov	%rdx,$A1[0]
1293	adc	\$0,$A1[0]
1294	add	($tptr,$j),$A1[1]
1295	adc	\$0,$A1[0]
1296
1297	.byte	0x67
1298	mul	$a0			# a[4]*a[0]
1299	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
1300	 mov	$ai,%rax		# a[3]
1301	 mov	8($aptr,$j),$ai		# a[5]
1302	mov	%rdx,$A0[0]
1303	adc	\$0,$A0[0]
1304	add	$A1[1],$A0[1]
1305	adc	\$0,$A0[0]
1306
1307	mul	$a1			# a[4]*a[3]
1308	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
1309	mov	$A0[1],($tptr,$j)	# t[4]
1310	 mov	$ai,%rax
1311	mov	%rdx,$A1[1]
1312	adc	\$0,$A1[1]
1313	add	8($tptr,$j),$A1[0]
1314	lea	16($j),$j		# j++
1315	adc	\$0,$A1[1]
1316
1317	mul	$a0			# a[5]*a[2]
1318	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
1319	 mov	$ai,%rax
1320	adc	\$0,%rdx
1321	add	$A1[0],$A0[0]
1322	mov	%rdx,$A0[1]
1323	adc	\$0,$A0[1]
1324	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
1325
1326	cmp	\$0,$j
1327	jne	.Lsqr4x_inner
1328
1329	.byte	0x67
1330	mul	$a1			# a[5]*a[3]
1331	add	%rax,$A1[1]
1332	adc	\$0,%rdx
1333	add	$A0[1],$A1[1]
1334	adc	\$0,%rdx
1335
1336	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
1337	mov	%rdx,$A1[0]
1338	mov	%rdx,8($tptr)		# t[7], "preloaded t[3]" below
1339
1340	add	\$16,$i
1341	jnz	.Lsqr4x_outer
1342
1343					# comments apply to $num==4 case
1344	mov	-32($aptr),$a0		# a[0]
1345	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1346	mov	-24($aptr),%rax		# a[1]
1347	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1348	mov	-16($aptr),$ai		# a[2]
1349	mov	%rax,$a1
1350
1351	mul	$a0			# a[1]*a[0]
1352	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
1353	 mov	$ai,%rax		# a[2]
1354	mov	%rdx,$A0[1]
1355	adc	\$0,$A0[1]
1356
1357	mul	$a0			# a[2]*a[0]
1358	add	%rax,$A0[1]
1359	 mov	$ai,%rax
1360	 mov	$A0[0],-24($tptr)	# t[1]
1361	mov	%rdx,$A0[0]
1362	adc	\$0,$A0[0]
1363	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
1364	 mov	-8($aptr),$ai		# a[3]
1365	adc	\$0,$A0[0]
1366
1367	mul	$a1			# a[2]*a[1]
1368	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
1369	 mov	$ai,%rax
1370	 mov	$A0[1],-16($tptr)	# t[2]
1371	mov	%rdx,$A1[1]
1372	adc	\$0,$A1[1]
1373
1374	mul	$a0			# a[3]*a[0]
1375	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1376	 mov	$ai,%rax
1377	mov	%rdx,$A0[1]
1378	adc	\$0,$A0[1]
1379	add	$A1[0],$A0[0]
1380	adc	\$0,$A0[1]
1381	mov	$A0[0],-8($tptr)	# t[3]
1382
1383	mul	$a1			# a[3]*a[1]
1384	add	%rax,$A1[1]
1385	 mov	-16($aptr),%rax		# a[2]
1386	adc	\$0,%rdx
1387	add	$A0[1],$A1[1]
1388	adc	\$0,%rdx
1389
1390	mov	$A1[1],($tptr)		# t[4]
1391	mov	%rdx,$A1[0]
1392	mov	%rdx,8($tptr)		# t[5]
1393
1394	mul	$ai			# a[2]*a[3]
1395___
1396{
1397my ($shift,$carry)=($a0,$a1);
1398my @S=(@A1,$ai,$n0);
1399$code.=<<___;
1400	 add	\$16,$i
1401	 xor	$shift,$shift
1402	 sub	$num,$i			# $i=16-$num
1403	 xor	$carry,$carry
1404
1405	add	$A1[0],%rax		# t[5]
1406	adc	\$0,%rdx
1407	mov	%rax,8($tptr)		# t[5]
1408	mov	%rdx,16($tptr)		# t[6]
1409	mov	$carry,24($tptr)	# t[7]
1410
1411	 mov	-16($aptr,$i),%rax	# a[0]
1412	lea	48+8(%rsp),$tptr
1413	 xor	$A0[0],$A0[0]		# t[0]
1414	 mov	8($tptr),$A0[1]		# t[1]
1415
1416	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1417	shr	\$63,$A0[0]
1418	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1419	shr	\$63,$A0[1]
1420	or	$A0[0],$S[1]		# | t[2*i]>>63
1421	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1422	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1423	mul	%rax			# a[i]*a[i]
1424	neg	$carry			# mov $carry,cf
1425	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1426	adc	%rax,$S[0]
1427	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1428	mov	$S[0],($tptr)
1429	adc	%rdx,$S[1]
1430
1431	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1432	 mov	$S[1],8($tptr)
1433	 sbb	$carry,$carry		# mov cf,$carry
1434	shr	\$63,$A0[0]
1435	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1436	shr	\$63,$A0[1]
1437	or	$A0[0],$S[3]		# | t[2*i]>>63
1438	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
1439	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1440	mul	%rax			# a[i]*a[i]
1441	neg	$carry			# mov $carry,cf
1442	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1443	adc	%rax,$S[2]
1444	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1445	mov	$S[2],16($tptr)
1446	adc	%rdx,$S[3]
1447	lea	16($i),$i
1448	mov	$S[3],24($tptr)
1449	sbb	$carry,$carry		# mov cf,$carry
1450	lea	64($tptr),$tptr
1451	jmp	.Lsqr4x_shift_n_add
1452
1453.align	32
1454.Lsqr4x_shift_n_add:
1455	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1456	shr	\$63,$A0[0]
1457	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1458	shr	\$63,$A0[1]
1459	or	$A0[0],$S[1]		# | t[2*i]>>63
1460	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1461	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1462	mul	%rax			# a[i]*a[i]
1463	neg	$carry			# mov $carry,cf
1464	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1465	adc	%rax,$S[0]
1466	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1467	mov	$S[0],-32($tptr)
1468	adc	%rdx,$S[1]
1469
1470	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1471	 mov	$S[1],-24($tptr)
1472	 sbb	$carry,$carry		# mov cf,$carry
1473	shr	\$63,$A0[0]
1474	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1475	shr	\$63,$A0[1]
1476	or	$A0[0],$S[3]		# | t[2*i]>>63
1477	 mov	0($tptr),$A0[0]		# t[2*i+2]	# prefetch
1478	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1479	mul	%rax			# a[i]*a[i]
1480	neg	$carry			# mov $carry,cf
1481	 mov	8($tptr),$A0[1]		# t[2*i+2+1]	# prefetch
1482	adc	%rax,$S[2]
1483	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1484	mov	$S[2],-16($tptr)
1485	adc	%rdx,$S[3]
1486
1487	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1488	 mov	$S[3],-8($tptr)
1489	 sbb	$carry,$carry		# mov cf,$carry
1490	shr	\$63,$A0[0]
1491	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1492	shr	\$63,$A0[1]
1493	or	$A0[0],$S[1]		# | t[2*i]>>63
1494	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1495	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1496	mul	%rax			# a[i]*a[i]
1497	neg	$carry			# mov $carry,cf
1498	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1499	adc	%rax,$S[0]
1500	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
1501	mov	$S[0],0($tptr)
1502	adc	%rdx,$S[1]
1503
1504	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1505	 mov	$S[1],8($tptr)
1506	 sbb	$carry,$carry		# mov cf,$carry
1507	shr	\$63,$A0[0]
1508	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1509	shr	\$63,$A0[1]
1510	or	$A0[0],$S[3]		# | t[2*i]>>63
1511	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
1512	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1513	mul	%rax			# a[i]*a[i]
1514	neg	$carry			# mov $carry,cf
1515	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1516	adc	%rax,$S[2]
1517	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
1518	mov	$S[2],16($tptr)
1519	adc	%rdx,$S[3]
1520	mov	$S[3],24($tptr)
1521	sbb	$carry,$carry		# mov cf,$carry
1522	lea	64($tptr),$tptr
1523	add	\$32,$i
1524	jnz	.Lsqr4x_shift_n_add
1525
1526	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1527	.byte	0x67
1528	shr	\$63,$A0[0]
1529	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1530	shr	\$63,$A0[1]
1531	or	$A0[0],$S[1]		# | t[2*i]>>63
1532	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1533	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1534	mul	%rax			# a[i]*a[i]
1535	neg	$carry			# mov $carry,cf
1536	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1537	adc	%rax,$S[0]
1538	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
1539	mov	$S[0],-32($tptr)
1540	adc	%rdx,$S[1]
1541
1542	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
1543	 mov	$S[1],-24($tptr)
1544	 sbb	$carry,$carry		# mov cf,$carry
1545	shr	\$63,$A0[0]
1546	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1547	shr	\$63,$A0[1]
1548	or	$A0[0],$S[3]		# | t[2*i]>>63
1549	mul	%rax			# a[i]*a[i]
1550	neg	$carry			# mov $carry,cf
1551	adc	%rax,$S[2]
1552	adc	%rdx,$S[3]
1553	mov	$S[2],-16($tptr)
1554	mov	$S[3],-8($tptr)
1555___
1556}
1557######################################################################
1558# Montgomery reduction part, "word-by-word" algorithm.
1559#
1560# This new path is inspired by multiple submissions from Intel, by
1561# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
1562# Vinodh Gopal...
1563{
1564my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
1565
1566$code.=<<___;
1567	movq	%xmm2,$nptr
1568sqr8x_reduction:
1569	xor	%rax,%rax
1570	lea	($nptr,$num,2),%rcx	# end of n[]
1571	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
1572	mov	%rcx,0+8(%rsp)
1573	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
1574	mov	%rdx,8+8(%rsp)
1575	neg	$num
1576	jmp	.L8x_reduction_loop
1577
1578.align	32
1579.L8x_reduction_loop:
1580	lea	($tptr,$num),$tptr	# start of current t[] window
1581	.byte	0x66
1582	mov	8*0($tptr),$m0
1583	mov	8*1($tptr),%r9
1584	mov	8*2($tptr),%r10
1585	mov	8*3($tptr),%r11
1586	mov	8*4($tptr),%r12
1587	mov	8*5($tptr),%r13
1588	mov	8*6($tptr),%r14
1589	mov	8*7($tptr),%r15
1590	mov	%rax,(%rdx)		# store top-most carry bit
1591	lea	8*8($tptr),$tptr
1592
1593	.byte	0x67
1594	mov	$m0,%r8
1595	imulq	32+8(%rsp),$m0		# n0*a[0]
1596	mov	16*0($nptr),%rax	# n[0]
1597	mov	\$8,%ecx
1598	jmp	.L8x_reduce
1599
1600.align	32
1601.L8x_reduce:
1602	mulq	$m0
1603	 mov	16*1($nptr),%rax	# n[1]
1604	neg	%r8
1605	mov	%rdx,%r8
1606	adc	\$0,%r8
1607
1608	mulq	$m0
1609	add	%rax,%r9
1610	 mov	16*2($nptr),%rax
1611	adc	\$0,%rdx
1612	add	%r9,%r8
1613	 mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
1614	mov	%rdx,%r9
1615	adc	\$0,%r9
1616
1617	mulq	$m0
1618	add	%rax,%r10
1619	 mov	16*3($nptr),%rax
1620	adc	\$0,%rdx
1621	add	%r10,%r9
1622	 mov	32+8(%rsp),$carry	# pull n0, borrow $carry
1623	mov	%rdx,%r10
1624	adc	\$0,%r10
1625
1626	mulq	$m0
1627	add	%rax,%r11
1628	 mov	16*4($nptr),%rax
1629	adc	\$0,%rdx
1630	 imulq	%r8,$carry		# modulo-scheduled
1631	add	%r11,%r10
1632	mov	%rdx,%r11
1633	adc	\$0,%r11
1634
1635	mulq	$m0
1636	add	%rax,%r12
1637	 mov	16*5($nptr),%rax
1638	adc	\$0,%rdx
1639	add	%r12,%r11
1640	mov	%rdx,%r12
1641	adc	\$0,%r12
1642
1643	mulq	$m0
1644	add	%rax,%r13
1645	 mov	16*6($nptr),%rax
1646	adc	\$0,%rdx
1647	add	%r13,%r12
1648	mov	%rdx,%r13
1649	adc	\$0,%r13
1650
1651	mulq	$m0
1652	add	%rax,%r14
1653	 mov	16*7($nptr),%rax
1654	adc	\$0,%rdx
1655	add	%r14,%r13
1656	mov	%rdx,%r14
1657	adc	\$0,%r14
1658
1659	mulq	$m0
1660	 mov	$carry,$m0		# n0*a[i]
1661	add	%rax,%r15
1662	 mov	16*0($nptr),%rax	# n[0]
1663	adc	\$0,%rdx
1664	add	%r15,%r14
1665	mov	%rdx,%r15
1666	adc	\$0,%r15
1667
1668	dec	%ecx
1669	jnz	.L8x_reduce
1670
1671	lea	16*8($nptr),$nptr
1672	xor	%rax,%rax
1673	mov	8+8(%rsp),%rdx		# pull end of t[]
1674	cmp	0+8(%rsp),$nptr		# end of n[]?
1675	jae	.L8x_no_tail
1676
1677	.byte	0x66
1678	add	8*0($tptr),%r8
1679	adc	8*1($tptr),%r9
1680	adc	8*2($tptr),%r10
1681	adc	8*3($tptr),%r11
1682	adc	8*4($tptr),%r12
1683	adc	8*5($tptr),%r13
1684	adc	8*6($tptr),%r14
1685	adc	8*7($tptr),%r15
1686	sbb	$carry,$carry		# top carry
1687
1688	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
1689	mov	\$8,%ecx
1690	mov	16*0($nptr),%rax
1691	jmp	.L8x_tail
1692
1693.align	32
1694.L8x_tail:
1695	mulq	$m0
1696	add	%rax,%r8
1697	 mov	16*1($nptr),%rax
1698	 mov	%r8,($tptr)		# save result
1699	mov	%rdx,%r8
1700	adc	\$0,%r8
1701
1702	mulq	$m0
1703	add	%rax,%r9
1704	 mov	16*2($nptr),%rax
1705	adc	\$0,%rdx
1706	add	%r9,%r8
1707	 lea	8($tptr),$tptr		# $tptr++
1708	mov	%rdx,%r9
1709	adc	\$0,%r9
1710
1711	mulq	$m0
1712	add	%rax,%r10
1713	 mov	16*3($nptr),%rax
1714	adc	\$0,%rdx
1715	add	%r10,%r9
1716	mov	%rdx,%r10
1717	adc	\$0,%r10
1718
1719	mulq	$m0
1720	add	%rax,%r11
1721	 mov	16*4($nptr),%rax
1722	adc	\$0,%rdx
1723	add	%r11,%r10
1724	mov	%rdx,%r11
1725	adc	\$0,%r11
1726
1727	mulq	$m0
1728	add	%rax,%r12
1729	 mov	16*5($nptr),%rax
1730	adc	\$0,%rdx
1731	add	%r12,%r11
1732	mov	%rdx,%r12
1733	adc	\$0,%r12
1734
1735	mulq	$m0
1736	add	%rax,%r13
1737	 mov	16*6($nptr),%rax
1738	adc	\$0,%rdx
1739	add	%r13,%r12
1740	mov	%rdx,%r13
1741	adc	\$0,%r13
1742
1743	mulq	$m0
1744	add	%rax,%r14
1745	 mov	16*7($nptr),%rax
1746	adc	\$0,%rdx
1747	add	%r14,%r13
1748	mov	%rdx,%r14
1749	adc	\$0,%r14
1750
1751	mulq	$m0
1752	 mov	48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
1753	add	%rax,%r15
1754	adc	\$0,%rdx
1755	add	%r15,%r14
1756	 mov	16*0($nptr),%rax	# pull n[0]
1757	mov	%rdx,%r15
1758	adc	\$0,%r15
1759
1760	dec	%ecx
1761	jnz	.L8x_tail
1762
1763	lea	16*8($nptr),$nptr
1764	mov	8+8(%rsp),%rdx		# pull end of t[]
1765	cmp	0+8(%rsp),$nptr		# end of n[]?
1766	jae	.L8x_tail_done		# break out of loop
1767
1768	 mov	48+56+8(%rsp),$m0	# pull n0*a[0]
1769	neg	$carry
1770	 mov	8*0($nptr),%rax		# pull n[0]
1771	adc	8*0($tptr),%r8
1772	adc	8*1($tptr),%r9
1773	adc	8*2($tptr),%r10
1774	adc	8*3($tptr),%r11
1775	adc	8*4($tptr),%r12
1776	adc	8*5($tptr),%r13
1777	adc	8*6($tptr),%r14
1778	adc	8*7($tptr),%r15
1779	sbb	$carry,$carry		# top carry
1780
1781	mov	\$8,%ecx
1782	jmp	.L8x_tail
1783
1784.align	32
1785.L8x_tail_done:
1786	add	(%rdx),%r8		# can this overflow?
1787	adc	\$0,%r9
1788	adc	\$0,%r10
1789	adc	\$0,%r11
1790	adc	\$0,%r12
1791	adc	\$0,%r13
1792	adc	\$0,%r14
1793	adc	\$0,%r15		# can't overflow, because we
1794					# started with "overhung" part
1795					# of multiplication
1796	xor	%rax,%rax
1797
1798	neg	$carry
1799.L8x_no_tail:
1800	adc	8*0($tptr),%r8
1801	adc	8*1($tptr),%r9
1802	adc	8*2($tptr),%r10
1803	adc	8*3($tptr),%r11
1804	adc	8*4($tptr),%r12
1805	adc	8*5($tptr),%r13
1806	adc	8*6($tptr),%r14
1807	adc	8*7($tptr),%r15
1808	adc	\$0,%rax		# top-most carry
1809	 mov	-16($nptr),%rcx		# np[num-1]
1810	 xor	$carry,$carry
1811
1812	movq	%xmm2,$nptr		# restore $nptr
1813
1814	mov	%r8,8*0($tptr)		# store top 512 bits
1815	mov	%r9,8*1($tptr)
1816	 movq	%xmm3,$num		# $num is %r9, can't be moved upwards
1817	mov	%r10,8*2($tptr)
1818	mov	%r11,8*3($tptr)
1819	mov	%r12,8*4($tptr)
1820	mov	%r13,8*5($tptr)
1821	mov	%r14,8*6($tptr)
1822	mov	%r15,8*7($tptr)
1823	lea	8*8($tptr),$tptr
1824
1825	cmp	%rdx,$tptr		# end of t[]?
1826	jb	.L8x_reduction_loop
1827___
1828}
1829##############################################################
1830# Post-condition, 4x unrolled
1831#
1832{
1833my ($tptr,$nptr)=("%rbx","%rbp");
1834$code.=<<___;
1835	#xor	%rsi,%rsi		# %rsi was $carry above
1836	sub	%r15,%rcx		# compare top-most words
1837	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
1838	adc	%rsi,%rsi
1839	mov	$num,%rcx
1840	or	%rsi,%rax
1841	movq	%xmm1,$rptr		# restore $rptr
1842	xor	\$1,%rax
1843	movq	%xmm1,$aptr		# prepare for back-to-back call
1844	lea	($nptr,%rax,8),$nptr
1845	sar	\$3+2,%rcx		# cf=0
1846	jmp	.Lsqr4x_sub
1847
1848.align	32
1849.Lsqr4x_sub:
1850	.byte	0x66
1851	mov	8*0($tptr),%r12
1852	mov	8*1($tptr),%r13
1853	sbb	16*0($nptr),%r12
1854	mov	8*2($tptr),%r14
1855	sbb	16*1($nptr),%r13
1856	mov	8*3($tptr),%r15
1857	lea	8*4($tptr),$tptr
1858	sbb	16*2($nptr),%r14
1859	mov	%r12,8*0($rptr)
1860	sbb	16*3($nptr),%r15
1861	lea	16*4($nptr),$nptr
1862	mov	%r13,8*1($rptr)
1863	mov	%r14,8*2($rptr)
1864	mov	%r15,8*3($rptr)
1865	lea	8*4($rptr),$rptr
1866
1867	inc	%rcx			# pass %cf
1868	jnz	.Lsqr4x_sub
1869___
1870}
1871$code.=<<___;
1872	mov	$num,%r10		# prepare for back-to-back call
1873	neg	$num			# restore $num
1874	ret
1875.size	bn_sqr8x_internal,.-bn_sqr8x_internal
1876___
1877{
# bn_from_montgomery and its worker bn_from_mont8x.
#
# bn_from_montgomery is a thin dispatcher: it tests the low three bits
# of the num argument (%r9d, or the stack slot 48(%rsp) on Win64) and
# jumps to bn_from_mont8x when num is a multiple of 8; for any other
# num it returns 0 in %eax without doing any work.
#
# bn_from_mont8x builds a scratch frame t[2*num] on the stack, copies
# a[num] into its lower half while zeroing the upper half (.Lmul_by_1),
# and then calls one of the shared reduction routines: sqrx8x_reduction
# when this file was generated with $addx and both bits of the 0x80100
# mask are set in the dword at OPENSSL_ia32cap_P+8 (presumably the
# BMI2 and ADX feature bits -- confirm against the ia32cap layout),
# sqr8x_reduction otherwise.  Afterwards the scratch frame is wiped,
# the callee-saved general-purpose registers are reloaded from just
# below the %rsp value captured at entry, and 1 is returned in %rax.
#
# NOTE(review): the Win64 prologue spills %xmm6/%xmm7, but the epilogue
# of bn_from_mont8x never reloads them (bn_mulx4x_mont_gather5 does).
# Nothing in this function writes either register; confirm that the
# reduction routines do not, or that the unwind handler covers it.
1878$code.=<<___;
1879.globl	bn_from_montgomery
1880.type	bn_from_montgomery,\@abi-omnipotent
1881.align	32
1882bn_from_montgomery:
	# only num divisible by 8 is supported, anything else returns 0
1883	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
1884	jz	bn_from_mont8x
1885	xor	%eax,%eax
1886	ret
1887.size	bn_from_montgomery,.-bn_from_montgomery
1888
1889.type	bn_from_mont8x,\@function,6
1890.align	32
1891bn_from_mont8x:
1892	.byte	0x67
1893	mov	%rsp,%rax
1894	push	%rbx
1895	push	%rbp
1896	push	%r12
1897	push	%r13
1898	push	%r14
1899	push	%r15
1900___
1901$code.=<<___ if ($win64);
1902	lea	-0x28(%rsp),%rsp
1903	movaps	%xmm6,(%rsp)
1904	movaps	%xmm7,0x10(%rsp)
1905___
1906$code.=<<___;
1907	.byte	0x67
1908	mov	${num}d,%r10d
1909	shl	\$3,${num}d		# convert $num to bytes
1910	shl	\$3+2,%r10d		# 4*$num
1911	neg	$num
1912	mov	($n0),$n0		# *n0
1913
1914	##############################################################
1915	# ensure that stack frame doesn't alias with $aptr+4*$num
1916	# modulo 4096, which covers ret[num], am[num] and n[2*num]
1917	# (see bn_exp.c). this is done to allow memory disambiguation
1918	# logic do its magic.
1919	#
1920	lea	-64(%rsp,$num,2),%r11
1921	sub	$aptr,%r11
1922	and	\$4095,%r11
1923	cmp	%r11,%r10
1924	jb	.Lfrom_sp_alt
1925	sub	%r11,%rsp		# align with $aptr
1926	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
1927	jmp	.Lfrom_sp_done
1928
1929.align	32
1930.Lfrom_sp_alt:
1931	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
1932	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
1933	sub	%r10,%r11
1934	mov	\$0,%r10
1935	cmovc	%r10,%r11
1936	sub	%r11,%rsp
1937.Lfrom_sp_done:
1938	and	\$-64,%rsp
1939	mov	$num,%r10
1940	neg	$num
1941
1942	##############################################################
1943	# Stack layout
1944	#
1945	# +0	saved $num, used in reduction section
1946	# +8	&t[2*$num], used in reduction section
1947	# +32	saved *n0
1948	# +40	saved %rsp
1949	# +48	t[2*$num]
1950	#
1951	mov	$n0,  32(%rsp)
1952	mov	%rax, 40(%rsp)		# save original %rsp
1953.Lfrom_body:
	# %r11 = bytes of a[] still to copy, %rax = write cursor into t[]
1954	mov	$num,%r11
1955	lea	48(%rsp),%rax
1956	pxor	%xmm0,%xmm0
1957	jmp	.Lmul_by_1
1958
1959.align	32
1960.Lmul_by_1:
	# each pass copies 64 bytes of a[] into the lower half of t[] and
	# stores 64 zero bytes at the same offset in the upper half
1961	movdqu	($aptr),%xmm1
1962	movdqu	16($aptr),%xmm2
1963	movdqu	32($aptr),%xmm3
1964	movdqa	%xmm0,(%rax,$num)
1965	movdqu	48($aptr),%xmm4
1966	movdqa	%xmm0,16(%rax,$num)
1967	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea	64($aptr),$aptr
1968	movdqa	%xmm1,(%rax)
1969	movdqa	%xmm0,32(%rax,$num)
1970	movdqa	%xmm2,16(%rax)
1971	movdqa	%xmm0,48(%rax,$num)
1972	movdqa	%xmm3,32(%rax)
1973	movdqa	%xmm4,48(%rax)
1974	lea	64(%rax),%rax
1975	sub	\$64,%r11
1976	jnz	.Lmul_by_1
1977
	# park the values the reduction routine expects in xmm1..xmm3
1978	movq	$rptr,%xmm1
1979	movq	$nptr,%xmm2
1980	.byte	0x67
1981	mov	$nptr,%rbp
1982	movq	%r10, %xmm3		# -num
1983___
1984$code.=<<___ if ($addx);
	# use the MULX/ADX reduction only if both bits of the mask are set
1985	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
1986	and	\$0x80100,%r11d
1987	cmp	\$0x80100,%r11d
1988	jne	.Lfrom_mont_nox
1989
1990	lea	(%rax,$num),$rptr
1991	call	sqrx8x_reduction
1992
1993	pxor	%xmm0,%xmm0
1994	lea	48(%rsp),%rax
1995	mov	40(%rsp),%rsi		# restore %rsp
1996	jmp	.Lfrom_mont_zero
1997
1998.align	32
1999.Lfrom_mont_nox:
2000___
2001$code.=<<___;
2002	call	sqr8x_reduction
2003
2004	pxor	%xmm0,%xmm0
2005	lea	48(%rsp),%rax
2006	mov	40(%rsp),%rsi		# restore %rsp
2007	jmp	.Lfrom_mont_zero
2008
2009.align	32
2010.Lfrom_mont_zero:
	# wipe the scratch frame starting at t[0], 64 bytes per pass
2011	movdqa	%xmm0,16*0(%rax)
2012	movdqa	%xmm0,16*1(%rax)
2013	movdqa	%xmm0,16*2(%rax)
2014	movdqa	%xmm0,16*3(%rax)
2015	lea	16*4(%rax),%rax
2016	sub	\$32,$num
2017	jnz	.Lfrom_mont_zero
2018
	# %rsi = %rsp captured at entry; the six pushed registers sit below it
2019	mov	\$1,%rax
2020	mov	-48(%rsi),%r15
2021	mov	-40(%rsi),%r14
2022	mov	-32(%rsi),%r13
2023	mov	-24(%rsi),%r12
2024	mov	-16(%rsi),%rbp
2025	mov	-8(%rsi),%rbx
2026	lea	(%rsi),%rsp
2027.Lfrom_epilogue:
2028	ret
2029.size	bn_from_mont8x,.-bn_from_mont8x
2030___
2031}
2032}}}
2033
2034if ($addx) {{{
2035my $bp="%rdx";	# restore original value
2036
# bn_mulx4x_mont_gather5: entry point of the MULX/ADX multiplication
# path (this whole section is only emitted when $addx is set).  It
# saves the callee-saved registers (plus %xmm6/%xmm7 on Win64),
# converts num to a byte count, loads *n0, carves out a 64-byte aligned
# stack frame positioned so that it does not alias the input vectors
# modulo 4096, and then delegates all arithmetic to mulx4x_internal.
# On return it restores the saved registers and the original %rsp and
# returns 1 in %rax.
#
# The last heredoc of this group also emits the first instructions of
# mulx4x_internal, which still use the outer register names ($num,
# $bp, $rp) before they are re-declared for the rest of that routine.
2037$code.=<<___;
2038.type	bn_mulx4x_mont_gather5,\@function,6
2039.align	32
2040bn_mulx4x_mont_gather5:
2041.Lmulx4x_enter:
2042	.byte	0x67
2043	mov	%rsp,%rax
2044	push	%rbx
2045	push	%rbp
2046	push	%r12
2047	push	%r13
2048	push	%r14
2049	push	%r15
2050___
2051$code.=<<___ if ($win64);
2052	lea	-0x28(%rsp),%rsp
2053	movaps	%xmm6,(%rsp)
2054	movaps	%xmm7,0x10(%rsp)
2055___
2056$code.=<<___;
2057	.byte	0x67
2058	mov	${num}d,%r10d
2059	shl	\$3,${num}d		# convert $num to bytes
2060	shl	\$3+2,%r10d		# 4*$num
2061	neg	$num			# -$num
2062	mov	($n0),$n0		# *n0
2063
2064	##############################################################
2065	# ensure that stack frame doesn't alias with $aptr+4*$num
2066	# modulo 4096, which covers a[num], ret[num] and n[2*num]
2067	# (see bn_exp.c). this is done to allow memory disambiguation
2068	# logic do its magic. [excessive frame is allocated in order
2069	# to allow bn_from_mont8x to clear it.]
2070	#
2071	lea	-64(%rsp,$num,2),%r11
2072	sub	$ap,%r11
2073	and	\$4095,%r11
2074	cmp	%r11,%r10
2075	jb	.Lmulx4xsp_alt
2076	sub	%r11,%rsp		# align with $aptr
2077	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
2078	jmp	.Lmulx4xsp_done
2079
2080.align	32
2081.Lmulx4xsp_alt:
2082	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
2083	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
2084	sub	%r10,%r11
2085	mov	\$0,%r10
2086	cmovc	%r10,%r11
2087	sub	%r11,%rsp
2088.Lmulx4xsp_done:
2089	and	\$-64,%rsp		# ensure alignment
2090	##############################################################
2091	# Stack layout
2092	# +0	-num
2093	# +8	off-loaded &b[i]
2094	# +16	end of b[num]
2095	# +24	inner counter
2096	# +32	saved n0
2097	# +40	saved %rsp
2098	# +48
2099	# +56	saved rp
2100	# +64	tmp[num+1]
2101	#
2102	mov	$n0, 32(%rsp)		# save *n0
2103	mov	%rax,40(%rsp)		# save original %rsp
2104.Lmulx4x_body:
2105	call	mulx4x_internal
2106
2107	mov	40(%rsp),%rsi		# restore %rsp
2108	mov	\$1,%rax
2109___
2110$code.=<<___ if ($win64);
2111	movaps	-88(%rsi),%xmm6
2112	movaps	-72(%rsi),%xmm7
2113___
2114$code.=<<___;
2115	mov	-48(%rsi),%r15
2116	mov	-40(%rsi),%r14
2117	mov	-32(%rsi),%r13
2118	mov	-24(%rsi),%r12
2119	mov	-16(%rsi),%rbp
2120	mov	-8(%rsi),%rbx
2121	lea	(%rsi),%rsp
2122.Lmulx4x_epilogue:
2123	ret
2124.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2125
	##############################################################
	# mulx4x_internal: worker shared by bn_mulx4x_mont_gather5 and
	# bn_powerx5. It is entered by call, so the caller's frame slots
	# listed under "Stack layout" are addressed with an extra +8 to
	# step over the return address. On entry %rax must hold the
	# %rsp value captured at function entry; it is used to fetch the
	# 7th argument from the caller's stack.
	#
2126.type	mulx4x_internal,\@abi-omnipotent
2127.align	32
2128mulx4x_internal:
2129	.byte	0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00	# mov	$num,8(%rsp)		# save -$num
2130	.byte	0x67
2131	neg	$num			# restore $num
	# scale the byte count by 32 to step over the whole table, then
	# divide it back down to the word count divided by 4
2132	shl	\$5,$num
2133	lea	256($bp,$num),%r13
2134	shr	\$5+5,$num
2135	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
2136	sub	\$1,$num
2137	mov	%r13,16+8(%rsp)		# end of b[num]
2138	mov	$num,24+8(%rsp)		# inner counter
2139	mov	$rp, 56+8(%rsp)		# save $rp
2140___
# Remainder of mulx4x_internal.  The names declared below shadow the
# outer argument names, so from here on $num is %rax, $bptr/$rptr is
# %rdi, $nptr is %rcx, and so on.
2141my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
2142   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
2143my $rptr=$bptr;
2144my $STRIDE=2**5*8;		# 5 is "window size"
2145my $N=$STRIDE/4;		# should match cache line size
# With $STRIDE = 256 and $N = 64 the backtick expressions in the first
# instructions below evaluate to:
#   shr by log2($N/8)       = 3
#   and with $N/8-1          = 7
#   and with 2**5/($N/8)-1   = 3
# i.e. the 7th argument is split into a 3-bit word offset (%r11) and a
# 2-bit selector (%r10, inverted) that indexes .Lmagic_masks.
#
# Every multiplier word b[i] is gathered by loading one 8-byte
# candidate from each of four locations $N bytes apart, AND-ing each
# with its mask in %xmm4..%xmm7 and OR-ing the results, so the same
# four locations are read whatever the index is.  Successive b[i] rows
# are $STRIDE bytes apart.  (.Lmagic_masks itself is defined elsewhere
# in the file; presumably exactly one of the four masks is all-ones.)
#
# The code after .Lmulx4x_outer interleaves a[]*b[i] accumulation with
# the Montgomery reduction step using two independent carry chains
# (adcx on CF, adox on OF); the "cf=0"/"of=0" remarks track their state.
# The routine does not return by itself: it ends by jumping to
# .Lsqrx4x_sub, the post-condition code shared with the squaring path.
2146$code.=<<___;
	# %r10d = 7th argument, i.e. which table entry to gather
2147	mov	%r10,%r11
2148	shr	\$`log($N/8)/log(2)`,%r10
2149	and	\$`$N/8-1`,%r11
2150	not	%r10
2151	lea	.Lmagic_masks(%rip),%rax
2152	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
2153	lea	96($bp,%r11,8),$bptr	# pointer within 1st cache line
2154	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
2155	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
2156	add	\$7,%r11
2157	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
2158	movq	24(%rax,%r10,8),%xmm7
2159	and	\$7,%r11
2160
	# gather b[0] into %xmm0 and start gathering b[1] into %xmm1..%xmm3
2161	movq	`0*$STRIDE/4-96`($bptr),%xmm0
2162	lea	$STRIDE($bptr),$tptr	# borrow $tptr
2163	movq	`1*$STRIDE/4-96`($bptr),%xmm1
2164	pand	%xmm4,%xmm0
2165	movq	`2*$STRIDE/4-96`($bptr),%xmm2
2166	pand	%xmm5,%xmm1
2167	movq	`3*$STRIDE/4-96`($bptr),%xmm3
2168	pand	%xmm6,%xmm2
2169	por	%xmm1,%xmm0
2170	movq	`0*$STRIDE/4-96`($tptr),%xmm1
2171	pand	%xmm7,%xmm3
2172	por	%xmm2,%xmm0
2173	movq	`1*$STRIDE/4-96`($tptr),%xmm2
2174	por	%xmm3,%xmm0
2175	.byte	0x67,0x67
2176	pand	%xmm4,%xmm1
2177	movq	`2*$STRIDE/4-96`($tptr),%xmm3
2178
2179	movq	%xmm0,%rdx		# bp[0]
2180	movq	`3*$STRIDE/4-96`($tptr),%xmm0
2181	lea	2*$STRIDE($bptr),$bptr	# next &b[i]
2182	pand	%xmm5,%xmm2
2183	.byte	0x67,0x67
2184	pand	%xmm6,%xmm3
2185	##############################################################
2186	# $tptr is chosen so that writing to top-most element of the
2187	# vector occurs just "above" references to powers table,
2188	# "above" modulo cache-line size, which effectively precludes
2189	# possibility of memory disambiguation logic failure when
2190	# accessing the table.
2191	#
2192	lea	64+8*4+8(%rsp,%r11,8),$tptr
2193
2194	mov	%rdx,$bi
2195	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
2196	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
2197	add	%rax,%r11
2198	mulx	2*8($aptr),%rax,%r13	# ...
2199	adc	%rax,%r12
2200	adc	\$0,%r13
2201	mulx	3*8($aptr),%rax,%r14
2202
2203	mov	$mi,%r15
2204	imulq	32+8(%rsp),$mi		# "t[0]"*n0
2205	xor	$zero,$zero		# cf=0, of=0
2206	mov	$mi,%rdx
2207
2208	por	%xmm2,%xmm1
2209	pand	%xmm7,%xmm0
2210	por	%xmm3,%xmm1
2211	mov	$bptr,8+8(%rsp)		# off-load &b[i]
2212	por	%xmm1,%xmm0
2213
2214	.byte	0x48,0x8d,0xb6,0x20,0x00,0x00,0x00	# lea	4*8($aptr),$aptr
2215	adcx	%rax,%r13
2216	adcx	$zero,%r14		# cf=0
2217
2218	mulx	0*16($nptr),%rax,%r10
2219	adcx	%rax,%r15		# discarded
2220	adox	%r11,%r10
2221	mulx	1*16($nptr),%rax,%r11
2222	adcx	%rax,%r10
2223	adox	%r12,%r11
2224	mulx	2*16($nptr),%rax,%r12
2225	mov	24+8(%rsp),$bptr	# counter value
2226	.byte	0x66
2227	mov	%r10,-8*4($tptr)
2228	adcx	%rax,%r11
2229	adox	%r13,%r12
2230	mulx	3*16($nptr),%rax,%r15
2231	 .byte	0x67,0x67
2232	 mov	$bi,%rdx
2233	mov	%r11,-8*3($tptr)
2234	adcx	%rax,%r12
2235	adox	$zero,%r15		# of=0
2236	.byte	0x48,0x8d,0x89,0x40,0x00,0x00,0x00	# lea	4*16($nptr),$nptr
2237	mov	%r12,-8*2($tptr)
2238	#jmp	.Lmulx4x_1st
2239
2240.align	32
2241.Lmulx4x_1st:
2242	adcx	$zero,%r15		# cf=0, modulo-scheduled
2243	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
2244	adcx	%r14,%r10
2245	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
2246	adcx	%rax,%r11
2247	mulx	2*8($aptr),%r12,%rax	# ...
2248	adcx	%r14,%r12
2249	mulx	3*8($aptr),%r13,%r14
2250	 .byte	0x67,0x67
2251	 mov	$mi,%rdx
2252	adcx	%rax,%r13
2253	adcx	$zero,%r14		# cf=0
2254	lea	4*8($aptr),$aptr
2255	lea	4*8($tptr),$tptr
2256
2257	adox	%r15,%r10
2258	mulx	0*16($nptr),%rax,%r15
2259	adcx	%rax,%r10
2260	adox	%r15,%r11
2261	mulx	1*16($nptr),%rax,%r15
2262	adcx	%rax,%r11
2263	adox	%r15,%r12
2264	mulx	2*16($nptr),%rax,%r15
2265	mov	%r10,-5*8($tptr)
2266	adcx	%rax,%r12
2267	mov	%r11,-4*8($tptr)
2268	adox	%r15,%r13
2269	mulx	3*16($nptr),%rax,%r15
2270	 mov	$bi,%rdx
2271	mov	%r12,-3*8($tptr)
2272	adcx	%rax,%r13
2273	adox	$zero,%r15
2274	lea	4*16($nptr),$nptr
2275	mov	%r13,-2*8($tptr)
2276
2277	dec	$bptr			# of=0, pass cf
2278	jnz	.Lmulx4x_1st
2279
2280	mov	8(%rsp),$num		# load -num
2281	movq	%xmm0,%rdx		# bp[1]
2282	adc	$zero,%r15		# modulo-scheduled
2283	lea	($aptr,$num),$aptr	# rewind $aptr
2284	add	%r15,%r14
2285	mov	8+8(%rsp),$bptr		# re-load &b[i]
2286	adc	$zero,$zero		# top-most carry
2287	mov	%r14,-1*8($tptr)
2288	jmp	.Lmulx4x_outer
2289
2290.align	32
2291.Lmulx4x_outer:
2292	mov	$zero,($tptr)		# save top-most carry
2293	lea	4*8($tptr,$num),$tptr	# rewind $tptr
2294	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
2295	xor	$zero,$zero		# cf=0, of=0
2296	mov	%rdx,$bi
2297	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
2298	adox	-4*8($tptr),$mi		# +t[0]
2299	adcx	%r14,%r11
2300	mulx	2*8($aptr),%r15,%r13	# ...
2301	adox	-3*8($tptr),%r11
2302	adcx	%r15,%r12
2303	mulx	3*8($aptr),%rdx,%r14
2304	adox	-2*8($tptr),%r12
2305	adcx	%rdx,%r13
2306	lea	($nptr,$num,2),$nptr	# rewind $nptr
2307	lea	4*8($aptr),$aptr
2308	adox	-1*8($tptr),%r13
2309	adcx	$zero,%r14
2310	adox	$zero,%r14
2311
2312	.byte	0x67
2313	mov	$mi,%r15
2314	imulq	32+8(%rsp),$mi		# "t[0]"*n0
2315
	# gather the next multiplier word while the reduction proceeds
2316	movq	`0*$STRIDE/4-96`($bptr),%xmm0
2317	.byte	0x67,0x67
2318	mov	$mi,%rdx
2319	movq	`1*$STRIDE/4-96`($bptr),%xmm1
2320	.byte	0x67
2321	pand	%xmm4,%xmm0
2322	movq	`2*$STRIDE/4-96`($bptr),%xmm2
2323	.byte	0x67
2324	pand	%xmm5,%xmm1
2325	movq	`3*$STRIDE/4-96`($bptr),%xmm3
2326	add	\$$STRIDE,$bptr		# next &b[i]
2327	.byte	0x67
2328	pand	%xmm6,%xmm2
2329	por	%xmm1,%xmm0
2330	pand	%xmm7,%xmm3
2331	xor	$zero,$zero		# cf=0, of=0
2332	mov	$bptr,8+8(%rsp)		# off-load &b[i]
2333
2334	mulx	0*16($nptr),%rax,%r10
2335	adcx	%rax,%r15		# discarded
2336	adox	%r11,%r10
2337	mulx	1*16($nptr),%rax,%r11
2338	adcx	%rax,%r10
2339	adox	%r12,%r11
2340	mulx	2*16($nptr),%rax,%r12
2341	adcx	%rax,%r11
2342	adox	%r13,%r12
2343	mulx	3*16($nptr),%rax,%r15
2344	 mov	$bi,%rdx
2345	 por	%xmm2,%xmm0
2346	mov	24+8(%rsp),$bptr	# counter value
2347	mov	%r10,-8*4($tptr)
2348	 por	%xmm3,%xmm0
2349	adcx	%rax,%r12
2350	mov	%r11,-8*3($tptr)
2351	adox	$zero,%r15		# of=0
2352	mov	%r12,-8*2($tptr)
2353	lea	4*16($nptr),$nptr
2354	jmp	.Lmulx4x_inner
2355
2356.align	32
2357.Lmulx4x_inner:
2358	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
2359	adcx	$zero,%r15		# cf=0, modulo-scheduled
2360	adox	%r14,%r10
2361	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
2362	adcx	0*8($tptr),%r10
2363	adox	%rax,%r11
2364	mulx	2*8($aptr),%r12,%rax	# ...
2365	adcx	1*8($tptr),%r11
2366	adox	%r14,%r12
2367	mulx	3*8($aptr),%r13,%r14
2368	 mov	$mi,%rdx
2369	adcx	2*8($tptr),%r12
2370	adox	%rax,%r13
2371	adcx	3*8($tptr),%r13
2372	adox	$zero,%r14		# of=0
2373	lea	4*8($aptr),$aptr
2374	lea	4*8($tptr),$tptr
2375	adcx	$zero,%r14		# cf=0
2376
2377	adox	%r15,%r10
2378	mulx	0*16($nptr),%rax,%r15
2379	adcx	%rax,%r10
2380	adox	%r15,%r11
2381	mulx	1*16($nptr),%rax,%r15
2382	adcx	%rax,%r11
2383	adox	%r15,%r12
2384	mulx	2*16($nptr),%rax,%r15
2385	mov	%r10,-5*8($tptr)
2386	adcx	%rax,%r12
2387	adox	%r15,%r13
2388	mov	%r11,-4*8($tptr)
2389	mulx	3*16($nptr),%rax,%r15
2390	 mov	$bi,%rdx
2391	lea	4*16($nptr),$nptr
2392	mov	%r12,-3*8($tptr)
2393	adcx	%rax,%r13
2394	adox	$zero,%r15
2395	mov	%r13,-2*8($tptr)
2396
2397	dec	$bptr			# of=0, pass cf
2398	jnz	.Lmulx4x_inner
2399
2400	mov	0+8(%rsp),$num		# load -num
2401	movq	%xmm0,%rdx		# bp[i+1]
2402	adc	$zero,%r15		# modulo-scheduled
2403	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
2404	mov	8+8(%rsp),$bptr		# re-load &b[i]
2405	mov	16+8(%rsp),%r10
2406	adc	%r15,%r14
2407	lea	($aptr,$num),$aptr	# rewind $aptr
2408	adc	$zero,$zero		# top-most carry
2409	mov	%r14,-1*8($tptr)
2410
2411	cmp	%r10,$bptr
2412	jb	.Lmulx4x_outer
2413
	# all b[i] consumed: prepare registers for the shared subtraction tail
2414	mov	-16($nptr),%r10
2415	xor	%r15,%r15
2416	sub	%r14,%r10		# compare top-most words
2417	adc	%r15,%r15
2418	or	%r15,$zero
2419	xor	\$1,$zero
2420	lea	($tptr,$num),%rdi	# rewind $tptr
2421	lea	($nptr,$num,2),$nptr	# rewind $nptr
2422	.byte	0x67,0x67
2423	sar	\$3+2,$num		# cf=0
2424	lea	($nptr,$zero,8),%rbp
2425	mov	56+8(%rsp),%rdx		# restore rp
2426	mov	$num,%rcx
2427	jmp	.Lsqrx4x_sub		# common post-condition
2428.size	mulx4x_internal,.-mulx4x_internal
2429___
2430}{
2431######################################################################
# bn_powerx5: MULX/ADX flavour of bn_power5.  The prototype comment
# below is spelled bn_power5; the symbol emitted here is bn_powerx5,
# which has no .globl directive and exposes the .Lpowerx5_enter label
# (presumably the target bn_power5 branches to -- confirm at its
# definition).  The body performs five consecutive calls to
# __bn_sqrx8x_internal followed by one call to mulx4x_internal with the
# table pointer restored, and returns 1 in %rax.
2432# void bn_power5(
2433my $rptr="%rdi";	# BN_ULONG *rptr,
2434my $aptr="%rsi";	# const BN_ULONG *aptr,
2435my $bptr="%rdx";	# const void *table,
2436my $nptr="%rcx";	# const BN_ULONG *nptr,
2437my $n0  ="%r8";		# const BN_ULONG *n0);
2438my $num ="%r9";		# int num, has to be divisible by 8
2439			# int pwr);
2440
# Scratch register names; none of them is referenced by the bn_powerx5
# heredocs below (presumably they serve the squaring code that follows).
# Note that $tptr aliases $rptr (%rdi) and $j aliases $nptr (%rcx).
2441my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
2442my @A0=("%r10","%r11");
2443my @A1=("%r12","%r13");
2444my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
2445
2446$code.=<<___;
2447.type	bn_powerx5,\@function,6
2448.align	32
2449bn_powerx5:
2450.Lpowerx5_enter:
2451	.byte	0x67
2452	mov	%rsp,%rax
2453	push	%rbx
2454	push	%rbp
2455	push	%r12
2456	push	%r13
2457	push	%r14
2458	push	%r15
2459___
2460$code.=<<___ if ($win64);
2461	lea	-0x28(%rsp),%rsp
2462	movaps	%xmm6,(%rsp)
2463	movaps	%xmm7,0x10(%rsp)
2464___
2465$code.=<<___;
2466	.byte	0x67
2467	mov	${num}d,%r10d
2468	shl	\$3,${num}d		# convert $num to bytes
2469	shl	\$3+2,%r10d		# 4*$num
2470	neg	$num
2471	mov	($n0),$n0		# *n0
2472
2473	##############################################################
2474	# ensure that stack frame doesn't alias with $aptr+4*$num
2475	# modulo 4096, which covers ret[num], am[num] and n[2*num]
2476	# (see bn_exp.c). this is done to allow memory disambiguation
2477	# logic do its magic.
2478	#
2479	lea	-64(%rsp,$num,2),%r11
2480	sub	$aptr,%r11
2481	and	\$4095,%r11
2482	cmp	%r11,%r10
2483	jb	.Lpwrx_sp_alt
2484	sub	%r11,%rsp		# align with $aptr
2485	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
2486	jmp	.Lpwrx_sp_done
2487
2488.align	32
2489.Lpwrx_sp_alt:
2490	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
2491	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
2492	sub	%r10,%r11
2493	mov	\$0,%r10
2494	cmovc	%r10,%r11
2495	sub	%r11,%rsp
2496.Lpwrx_sp_done:
2497	and	\$-64,%rsp
2498	mov	$num,%r10
2499	neg	$num
2500
2501	##############################################################
2502	# Stack layout
2503	#
2504	# +0	saved $num, used in reduction section
2505	# +8	&t[2*$num], used in reduction section
2506	# +16	intermediate carry bit
2507	# +24	top-most carry bit, used in reduction section
2508	# +32	saved *n0
2509	# +40	saved %rsp
2510	# +48	t[2*$num]
2511	#
	# arguments that must survive the calls are parked in xmm1..xmm4
2512	pxor	%xmm0,%xmm0
2513	movq	$rptr,%xmm1		# save $rptr
2514	movq	$nptr,%xmm2		# save $nptr
2515	movq	%r10, %xmm3		# -$num
2516	movq	$bptr,%xmm4
2517	mov	$n0,  32(%rsp)
2518	mov	%rax, 40(%rsp)		# save original %rsp
2519.Lpowerx5_body:
2520
	# five back-to-back squarings ...
2521	call	__bn_sqrx8x_internal
2522	call	__bn_sqrx8x_internal
2523	call	__bn_sqrx8x_internal
2524	call	__bn_sqrx8x_internal
2525	call	__bn_sqrx8x_internal
2526
	# ... then one multiplication by the entry gathered from the table;
	# mulx4x_internal reads the 7th argument through %rax, hence the
	# reload of the original stack pointer below
2527	mov	%r10,$num		# -num
2528	mov	$aptr,$rptr
2529	movq	%xmm2,$nptr
2530	movq	%xmm4,$bptr
2531	mov	40(%rsp),%rax
2532
2533	call	mulx4x_internal
2534
2535	mov	40(%rsp),%rsi		# restore %rsp
2536	mov	\$1,%rax
2537___
2538$code.=<<___ if ($win64);
2539	movaps	-88(%rsi),%xmm6
2540	movaps	-72(%rsi),%xmm7
2541___
2542$code.=<<___;
2543	mov	-48(%rsi),%r15
2544	mov	-40(%rsi),%r14
2545	mov	-32(%rsi),%r13
2546	mov	-24(%rsi),%r12
2547	mov	-16(%rsi),%rbp
2548	mov	-8(%rsi),%rbx
2549	lea	(%rsi),%rsp
2550.Lpowerx5_epilogue:
2551	ret
2552.size	bn_powerx5,.-bn_powerx5
2553
# bn_sqrx8x_internal: squaring built on mulx/adcx/adox, followed by
# word-by-word Montgomery reduction and a final subtraction pass.
# bn_powerx5 sets things up before calling: xmm0 is zero, xmm1 keeps the
# result pointer, xmm2 the modulus pointer, xmm3 the negated length in
# bytes, and n0 is parked at 32(%rsp). Inside this routine every frame
# offset is written with an extra +8 to step over the return address.
# Two labels mark the same entry point: the exported, hidden symbol and
# the double-underscore alias used by the calls in bn_powerx5.
2554.globl	bn_sqrx8x_internal
2555.hidden	bn_sqrx8x_internal
2556.type	bn_sqrx8x_internal,\@abi-omnipotent
2557.align	32
2558bn_sqrx8x_internal:
2559__bn_sqrx8x_internal:
2560	##################################################################
2561	# Squaring part:
2562	#
2563	# a) multiply-n-add everything but a[i]*a[i];
2564	# b) shift result of a) by 1 to the left and accumulate
2565	#    a[i]*a[i] products;
2566	#
2567	##################################################################
2568	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2569	#                                                     a[1]a[0]
2570	#                                                 a[2]a[0]
2571	#                                             a[3]a[0]
2572	#                                             a[2]a[1]
2573	#                                         a[3]a[1]
2574	#                                     a[3]a[2]
2575	#
2576	#                                         a[4]a[0]
2577	#                                     a[5]a[0]
2578	#                                 a[6]a[0]
2579	#                             a[7]a[0]
2580	#                                     a[4]a[1]
2581	#                                 a[5]a[1]
2582	#                             a[6]a[1]
2583	#                         a[7]a[1]
2584	#                                 a[4]a[2]
2585	#                             a[5]a[2]
2586	#                         a[6]a[2]
2587	#                     a[7]a[2]
2588	#                             a[4]a[3]
2589	#                         a[5]a[3]
2590	#                     a[6]a[3]
2591	#                 a[7]a[3]
2592	#
2593	#                     a[5]a[4]
2594	#                 a[6]a[4]
2595	#             a[7]a[4]
2596	#             a[6]a[5]
2597	#         a[7]a[5]
2598	#     a[7]a[6]
2599	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2600___
2601{
# Register aliases for the cross-product phase of the squaring.
# $aaptr deliberately shares %rbp with $zero: the register is a pointer
# while the inner loop walks a[], and is cleared again (xor) whenever a
# constant zero operand is required for the adcx/adox chains.
2602my ($zero,$carry)=("%rbp","%rcx");
2603my $aaptr=$zero;
2604$code.=<<___;
	# Point at t[] in the frame, compute the end of a[] and stash both
	# the length and that end pointer in the first two frame slots.
2605	lea	48+8(%rsp),$tptr
2606	lea	($aptr,$num),$aaptr
2607	mov	$num,0+8(%rsp)			# save $num
2608	mov	$aaptr,8+8(%rsp)		# save end of $aptr
2609	jmp	.Lsqr8x_zero_start
2610
	# Zero the t[] buffer with xmm0 (cleared by the caller). Each full
	# pass clears 128 bytes while the length counter drops by 64. The
	# loop is entered half-way, so the very first 64 bytes are not
	# cleared here.
2611.align	32
	# The next line is raw padding (a long no-op encoding); together
	# with the alignment above it is what places the mid-loop entry
	# label on the boundary its own comment mentions.
2612.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2613.Lsqrx8x_zero:
2614	.byte	0x3e
2615	movdqa	%xmm0,0*8($tptr)
2616	movdqa	%xmm0,2*8($tptr)
2617	movdqa	%xmm0,4*8($tptr)
2618	movdqa	%xmm0,6*8($tptr)
2619.Lsqr8x_zero_start:			# aligned at 32
2620	movdqa	%xmm0,8*8($tptr)
2621	movdqa	%xmm0,10*8($tptr)
2622	movdqa	%xmm0,12*8($tptr)
2623	movdqa	%xmm0,14*8($tptr)
2624	lea	16*8($tptr),$tptr
2625	sub	\$64,$num
2626	jnz	.Lsqrx8x_zero
2627
	# Seed the outer loop: first multiplier word in rdx, accumulators
	# cleared (the length register already counted down to zero, hence
	# the commented-out xor), t[] pointer rewound, both carry flags
	# cleared.
2628	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
2629	#xor	%r9,%r9			# t[1], ex-$num, zero already
2630	xor	%r10,%r10
2631	xor	%r11,%r11
2632	xor	%r12,%r12
2633	xor	%r13,%r13
2634	xor	%r14,%r14
2635	xor	%r15,%r15
2636	lea	48+8(%rsp),$tptr
2637	xor	$zero,$zero		# cf=0, of=0
2638	jmp	.Lsqrx8x_outer_loop
2639
# One pass per 8-word block of a[]: accumulate the cross products
# a[i]*a[j], i<j, that fall inside the block (the triangle drawn in the
# header comment). On entry rdx holds the first word of the block,
# r9..r15 hold the running sums for the window, the zero register is 0
# and both CF and OF are clear. adcx drives the carry chain for the low
# halves and incoming sums, adox the chain for the high halves. Several
# mulx instructions are emitted as raw bytes; the instruction each one
# stands for is spelled out in its trailing comment.
2640.align	32
2641.Lsqrx8x_outer_loop:
	# row 0: first word of the block times words 1..7
2642	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
2643	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
2644	adox	%rax,%r10
2645	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
2646	adcx	%r10,%r9
2647	adox	%rax,%r11
2648	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
2649	adcx	%r11,%r10
2650	adox	%rax,%r12
2651	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
2652	adcx	%r12,%r11
2653	adox	%rax,%r13
2654	mulx	5*8($aptr),%r12,%rax
2655	adcx	%r13,%r12
2656	adox	%rax,%r14
2657	mulx	6*8($aptr),%r13,%rax
2658	adcx	%r14,%r13
2659	adox	%r15,%rax
2660	mulx	7*8($aptr),%r14,%r15
2661	 mov	1*8($aptr),%rdx		# a[1]
2662	adcx	%rax,%r14
2663	adox	$zero,%r15
	# fold in the ninth word of the window and remember the carry out
	# of this row as 0 or -1
2664	adc	8*8($tptr),%r15
2665	mov	%r8,1*8($tptr)		# t[1]
2666	mov	%r9,2*8($tptr)		# t[2]
2667	sbb	$carry,$carry		# mov %cf,$carry
2668	xor	$zero,$zero		# cf=0, of=0
2669
2670
	# row 1: second word of the block times words 2..7
2671	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
2672	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
2673	adcx	%r10,%r8
2674	adox	%rbx,%r9
2675	mulx	4*8($aptr),%r10,%rbx	# ...
2676	adcx	%r11,%r9
2677	adox	%rax,%r10
2678	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx	5*8($aptr),%r11,%rax
2679	adcx	%r12,%r10
2680	adox	%rbx,%r11
2681	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r12,%rbx
2682	adcx	%r13,%r11
2683	adox	%r14,%r12
2684	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r13,%r14
2685	 mov	2*8($aptr),%rdx		# a[2]
2686	adcx	%rax,%r12
2687	adox	%rbx,%r13
2688	adcx	%r15,%r13
2689	adox	$zero,%r14		# of=0
2690	adcx	$zero,%r14		# cf=0
2691
2692	mov	%r8,3*8($tptr)		# t[3]
2693	mov	%r9,4*8($tptr)		# t[4]
2694
	# row 2: third word of the block times words 3..7
2695	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
2696	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
2697	adcx	%r10,%r8
2698	adox	%rbx,%r9
2699	mulx	5*8($aptr),%r10,%rbx	# ...
2700	adcx	%r11,%r9
2701	adox	%rax,%r10
2702	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r11,%rax
2703	adcx	%r12,%r10
2704	adox	%r13,%r11
2705	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r12,%r13
2706	.byte	0x3e
2707	 mov	3*8($aptr),%rdx		# a[3]
2708	adcx	%rbx,%r11
2709	adox	%rax,%r12
2710	adcx	%r14,%r12
2711	mov	%r8,5*8($tptr)		# t[5]
2712	mov	%r9,6*8($tptr)		# t[6]
	# row 3 starts here, interleaved with the tail of row 2
2713	 mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
2714	adox	$zero,%r13		# of=0
2715	adcx	$zero,%r13		# cf=0
2716
2717	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
2718	adcx	%r10,%r8
2719	adox	%rax,%r9
2720	mulx	6*8($aptr),%r10,%rax	# ...
2721	adcx	%r11,%r9
2722	adox	%r12,%r10
2723	mulx	7*8($aptr),%r11,%r12
	# words 5 and 6 are pulled into registers for the remaining rows
2724	 mov	4*8($aptr),%rdx		# a[4]
2725	 mov	5*8($aptr),%r14		# a[5]
2726	adcx	%rbx,%r10
2727	adox	%rax,%r11
2728	 mov	6*8($aptr),%r15		# a[6]
2729	adcx	%r13,%r11
2730	adox	$zero,%r12		# of=0
2731	adcx	$zero,%r12		# cf=0
2732
2733	mov	%r8,7*8($tptr)		# t[7]
2734	mov	%r9,8*8($tptr)		# t[8]
2735
	# rows 4..6 work from registers: word 4 times words 5..7, word 5
	# times words 6..7, word 6 times word 7
2736	mulx	%r14,%r9,%rax		# a[5]*a[4]
2737	 mov	7*8($aptr),%r8		# a[7]
2738	adcx	%r10,%r9
2739	mulx	%r15,%r10,%rbx		# a[6]*a[4]
2740	adox	%rax,%r10
2741	adcx	%r11,%r10
2742	mulx	%r8,%r11,%rax		# a[7]*a[4]
2743	 mov	%r14,%rdx		# a[5]
2744	adox	%rbx,%r11
2745	adcx	%r12,%r11
2746	#adox	$zero,%rax		# of=0
2747	adcx	$zero,%rax		# cf=0
2748
2749	mulx	%r15,%r14,%rbx		# a[6]*a[5]
2750	mulx	%r8,%r12,%r13		# a[7]*a[5]
2751	 mov	%r15,%rdx		# a[6]
	# advance the input pointer to the next 8-word block
2752	 lea	8*8($aptr),$aptr
2753	adcx	%r14,%r11
2754	adox	%rbx,%r12
2755	adcx	%rax,%r12
2756	adox	$zero,%r13
2757
2758	.byte	0x67,0x67
2759	mulx	%r8,%r8,%r14		# a[7]*a[6]
2760	adcx	%r8,%r13
2761	adcx	$zero,%r14
2762
	# leave once the input pointer reaches the end saved in the frame
2763	cmp	8+8(%rsp),$aptr
2764	je	.Lsqrx8x_outer_break
2765
# Set-up for the rectangular part: the block just processed is now
# multiplied by every later 8-word block of a[]. The carry saved by the
# first row is turned back into CF, the upper eight words of the window
# are added to the accumulators, the walking pointer starts at the
# current input position, and the resulting carry plus the advanced t[]
# pointer are parked in the frame.
2766	neg	$carry			# mov $carry,%cf
2767	mov	\$-8,%rcx
2768	mov	$zero,%r15
2769	mov	8*8($tptr),%r8
2770	adcx	9*8($tptr),%r9		# +=t[9]
2771	adcx	10*8($tptr),%r10	# ...
2772	adcx	11*8($tptr),%r11
2773	adc	12*8($tptr),%r12
2774	adc	13*8($tptr),%r13
2775	adc	14*8($tptr),%r14
2776	adc	15*8($tptr),%r15
2777	lea	($aptr),$aaptr
2778	lea	2*64($tptr),$tptr
2779	sbb	%rax,%rax		# mov %cf,$carry
2780
	# the input pointer already moved on by 64 bytes, so -64 reaches
	# back to the first word of the block being multiplied
2781	mov	-64($aptr),%rdx		# a[0]
2782	mov	%rax,16+8(%rsp)		# offload $carry
2783	mov	$tptr,24+8(%rsp)
2784
2785	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
2786	xor	%eax,%eax		# cf=0, of=0
2787	jmp	.Lsqrx8x_loop
2788
	# Inner loop, eight iterations (rcx runs from -8 to 0). Each one
	# multiplies the word in rdx by the eight words at the walking
	# pointer, adds the products to r8..r15, stores the finished low
	# word through the t[] pointer and loads the next multiplier word.
	# rbx is reset to 0 inside the loop so it can close both chains.
2789.align	32
2790.Lsqrx8x_loop:
2791	mov	%r8,%rbx
2792	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
2793	adcx	%rax,%rbx		# +=t[8]
2794	adox	%r9,%r8
2795
2796	mulx	1*8($aaptr),%rax,%r9	# ...
2797	adcx	%rax,%r8
2798	adox	%r10,%r9
2799
2800	mulx	2*8($aaptr),%rax,%r10
2801	adcx	%rax,%r9
2802	adox	%r11,%r10
2803
2804	mulx	3*8($aaptr),%rax,%r11
2805	adcx	%rax,%r10
2806	adox	%r12,%r11
2807
2808	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	4*8($aaptr),%rax,%r12
2809	adcx	%rax,%r11
2810	adox	%r13,%r12
2811
2812	mulx	5*8($aaptr),%rax,%r13
2813	adcx	%rax,%r12
2814	adox	%r14,%r13
2815
2816	mulx	6*8($aaptr),%rax,%r14
2817	 mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
2818	 mov	\$0,%ebx
2819	adcx	%rax,%r13
2820	adox	%r15,%r14
2821
2822	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx	7*8($aaptr),%rax,%r15
2823	 mov	8($aptr,%rcx,8),%rdx	# a[i]
2824	adcx	%rax,%r14
2825	adox	%rbx,%r15		# %rbx is 0, of=0
2826	adcx	%rbx,%r15		# cf=0
2827
2828	.byte	0x67
2829	inc	%rcx			# of=0
2830	jnz	.Lsqrx8x_loop
2831
	# step the walking pointer to the next 8-word block; stop when it
	# reaches the end of a[] saved in the frame
2832	lea	8*8($aaptr),$aaptr
2833	mov	\$-8,%rcx
2834	cmp	8+8(%rsp),$aaptr	# done?
2835	je	.Lsqrx8x_break
2836
	# not done: turn the saved carry back into CF (rbx is 0 here), add
	# the next eight words of t[] to the accumulators, save the new
	# carry and go round again with the first word of the block
2837	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
2838	.byte	0x66
2839	mov	-64($aptr),%rdx
2840	adcx	0*8($tptr),%r8
2841	adcx	1*8($tptr),%r9
2842	adc	2*8($tptr),%r10
2843	adc	3*8($tptr),%r11
2844	adc	4*8($tptr),%r12
2845	adc	5*8($tptr),%r13
2846	adc	6*8($tptr),%r14
2847	adc	7*8($tptr),%r15
2848	lea	8*8($tptr),$tptr
2849	.byte	0x67
2850	sbb	%rax,%rax		# mov %cf,%rax
2851	xor	%ebx,%ebx		# cf=0, of=0
2852	mov	%rax,16+8(%rsp)		# offload carry
2853	jmp	.Lsqrx8x_loop
2854
# Reached when the inner loop has multiplied the current block by the
# last 8-word block of a[]. The carry parked at 16+8(%rsp) (0 or -1) is
# folded into the accumulators and the next outer pass is prepared.
#
# The carry has to ripple through the whole r8..r15 window: adding it to
# r8 alone loses the carry out of r8 whenever r8 is all-ones, which
# yields a wrong square for those inputs. rbx is 0 on arrival (the inner
# loop resets it on every iteration), so subtracting the saved 0/-1 from
# it reproduces the carry in CF, exactly as the not-done path of the
# inner loop does.
2855.align	32
2856.Lsqrx8x_break:
	xor	$zero,$zero		# zero operand for the chain, cf=0, of=0
	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
	adcx	$zero,%r8		# consume last carry ...
2858	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
	adcx	$zero,%r9		# ... and propagate it upwards
2859	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
	adc	\$0,%r10
2861	mov	%r8,0*8($tptr)
	adc	\$0,%r11
	adc	\$0,%r12
	adc	\$0,%r13
	adc	\$0,%r14
	adc	\$0,%r15
	# If the t[] pointer never moved past the window recorded at the
	# start of the inner loop, r9..r15 already are the sums the next
	# outer pass expects.
2862	cmp	$carry,$tptr		# cf=0, of=0
2863	je	.Lsqrx8x_outer_loop
2864
	# Otherwise flush r9..r15 to the current window and reload the
	# corresponding words of the recorded one, then rewind to it.
2865	mov	%r9,1*8($tptr)
2866	 mov	1*8($carry),%r9
2867	mov	%r10,2*8($tptr)
2868	 mov	2*8($carry),%r10
2869	mov	%r11,3*8($tptr)
2870	 mov	3*8($carry),%r11
2871	mov	%r12,4*8($tptr)
2872	 mov	4*8($carry),%r12
2873	mov	%r13,5*8($tptr)
2874	 mov	5*8($carry),%r13
2875	mov	%r14,6*8($tptr)
2876	 mov	6*8($carry),%r14
2877	mov	%r15,7*8($tptr)
2878	 mov	7*8($carry),%r15
2879	mov	$carry,$tptr
2880	jmp	.Lsqrx8x_outer_loop
2881
# Exit of the cross-product phase: the sums still held in registers are
# written to words 9..14 of the current window and rcx receives the
# negated byte length kept in xmm3.
2882.align	32
2883.Lsqrx8x_outer_break:
2884	mov	%r9,9*8($tptr)		# t[9]
2885	 movq	%xmm3,%rcx		# -$num
2886	mov	%r10,10*8($tptr)	# ...
2887	mov	%r11,11*8($tptr)
2888	mov	%r12,12*8($tptr)
2889	mov	%r13,13*8($tptr)
2890	mov	%r14,14*8($tptr)
2891___
2892}{
# Second phase of the squaring: double the accumulated cross products
# and add the squares a[i]*a[i]. $i is a negative byte index into a[]
# that counts up to zero; the input pointer sits at the end of a[] here.
2893my $i="%rcx";
2894$code.=<<___;
2895	lea	48+8(%rsp),$tptr
2896	mov	($aptr,$i),%rdx		# a[0]
2897
2898	mov	8($tptr),$A0[1]		# t[1]
2899	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
2900	mov	0+8(%rsp),$num		# restore $num
2901	adox	$A0[1],$A0[1]
2902	 mov	16($tptr),$A1[0]	# t[2]	# prefetch
2903	 mov	24($tptr),$A1[1]	# t[3]	# prefetch
2904	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned
2905
	# Four input words per pass, eight result words written. For each
	# word: mulx squares it, the adox pairs double two words of t[]
	# (OF carries the bit shifted out into the next word), and the adcx
	# pairs add the doubled words to the two halves of the square. Two
	# register pairs alternate so that the next words of t[] are loaded
	# while the current ones are consumed.
2906.align	32
2907.Lsqrx4x_shift_n_add:
2908	mulx	%rdx,%rax,%rbx
2909	 adox	$A1[0],$A1[0]
2910	adcx	$A0[0],%rax
2911	 .byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov	8($aptr,$i),%rdx	# a[i+1]	# prefetch
2912	 .byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov	32($tptr),$A0[0]	# t[2*i+4]	# prefetch
2913	 adox	$A1[1],$A1[1]
2914	adcx	$A0[1],%rbx
2915	 mov	40($tptr),$A0[1]		# t[2*i+4+1]	# prefetch
2916	mov	%rax,0($tptr)
2917	mov	%rbx,8($tptr)
2918
2919	mulx	%rdx,%rax,%rbx
2920	 adox	$A0[0],$A0[0]
2921	adcx	$A1[0],%rax
2922	 mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
2923	 mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
2924	 adox	$A0[1],$A0[1]
2925	adcx	$A1[1],%rbx
2926	 mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
2927	mov	%rax,16($tptr)
2928	mov	%rbx,24($tptr)
2929
2930	mulx	%rdx,%rax,%rbx
2931	 adox	$A1[0],$A1[0]
2932	adcx	$A0[0],%rax
2933	 mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
	# lea advances the index without touching the flags both chains
	# depend on
2934	 lea	32($i),$i
2935	 mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
2936	 adox	$A1[1],$A1[1]
2937	adcx	$A0[1],%rbx
2938	 mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
2939	mov	%rax,32($tptr)
2940	mov	%rbx,40($tptr)
2941
2942	mulx	%rdx,%rax,%rbx
2943	 adox	$A0[0],$A0[0]
2944	adcx	$A1[0],%rax
	# jrcxz tests the index for zero without modifying any flag
2945	jrcxz	.Lsqrx4x_shift_n_add_break
2946	 .byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov	0($aptr,$i),%rdx	# a[i+4]	# prefetch
2947	 adox	$A0[1],$A0[1]
2948	adcx	$A1[1],%rbx
2949	 mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
2950	 mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
2951	mov	%rax,48($tptr)
2952	mov	%rbx,56($tptr)
2953	lea	64($tptr),$tptr
2954	nop
2955	jmp	.Lsqrx4x_shift_n_add
2956
	# last pass: finish the top pair of words and leave the t[] pointer
	# just past the end of the buffer
2957.align	32
2958.Lsqrx4x_shift_n_add_break:
2959	adcx	$A1[1],%rbx
2960	mov	%rax,48($tptr)
2961	mov	%rbx,56($tptr)
2962	lea	64($tptr),$tptr		# end of t[] buffer
2963___
2964}
2965######################################################################
2966# Montgomery reduction part, "word-by-word" algorithm.
2967#
2968# This new path is inspired by multiple submissions from Intel, by
2969# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
2970# Vinodh Gopal...
2971{
# Register aliases for the reduction. $nptr now lives in %rbp and $carry
# in %rsi. $m0 is declared but never referenced in the template below
# (the multiplier register is written out as %rdx).
2972my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
2973
2974$code.=<<___;
2975	movq	%xmm2,$nptr
	# Modulus words are read at a 16-byte stride throughout, matching
	# the note in the file header about the modulus being interleaved
	# with zeros for this code path.
2976sqrx8x_reduction:
2977	xor	%eax,%eax		# initial top-most carry bit
2978	mov	32+8(%rsp),%rbx		# n0
2979	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
2980	lea	-128($nptr,$num,2),%rcx	# end of n[]
2981	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
2982	mov	%rcx, 0+8(%rsp)		# save end of n[]
2983	mov	$tptr,8+8(%rsp)		# save end of t[]
2984
2985	lea	48+8(%rsp),$tptr		# initial t[] window
2986	jmp	.Lsqrx8x_reduction_loop
2987
	# Outer reduction loop, one pass per 8-word window of t[]. rdx
	# arrives with the first word of the window and rax with the
	# top-most carry of the previous pass; the first multiplier is the
	# low 64 bits of that word times n0.
2988.align	32
2989.Lsqrx8x_reduction_loop:
2990	mov	8*1($tptr),%r9
2991	mov	8*2($tptr),%r10
2992	mov	8*3($tptr),%r11
2993	mov	8*4($tptr),%r12
2994	mov	%rdx,%r8
2995	imulq	%rbx,%rdx		# n0*a[i]
2996	mov	8*5($tptr),%r13
2997	mov	8*6($tptr),%r14
2998	mov	8*7($tptr),%r15
2999	mov	%rax,24+8(%rsp)		# store top-most carry bit
3000
3001	lea	8*8($tptr),$tptr
3002	xor	$carry,$carry		# cf=0,of=0
3003	mov	\$-8,%rcx
3004	jmp	.Lsqrx8x_reduce
3005
	# Eight iterations against the lowest eight modulus words. Each
	# one adds multiplier*n[0..7] to the window, which cancels the
	# lowest word and moves the window up by one word. The multiplier
	# for the next iteration is computed in the middle of the chain
	# from the new lowest word, and the current multiplier is put
	# aside at the start of the t[] area (slots 0..7) for the tail
	# loop.
3006.align	32
3007.Lsqrx8x_reduce:
3008	mov	%r8, %rbx
3009	mulx	16*0($nptr),%rax,%r8	# n[0]
3010	adcx	%rbx,%rax		# discarded
3011	adox	%r9,%r8
3012
3013	mulx	16*1($nptr),%rbx,%r9	# n[1]
3014	adcx	%rbx,%r8
3015	adox	%r10,%r9
3016
3017	mulx	16*2($nptr),%rbx,%r10
3018	adcx	%rbx,%r9
3019	adox	%r11,%r10
3020
3021	mulx	16*3($nptr),%rbx,%r11
3022	adcx	%rbx,%r10
3023	adox	%r12,%r11
3024
3025	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx	16*4($nptr),%rbx,%r12
	# park the multiplier in rax while rdx is borrowed to form the
	# next one (new lowest word times n0, low half kept in rbx)
3026	 mov	%rdx,%rax
3027	 mov	%r8,%rdx
3028	adcx	%rbx,%r11
3029	adox	%r13,%r12
3030
3031	 mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
3032	 mov	%rax,%rdx
3033	 mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]
3034
3035	mulx	16*5($nptr),%rax,%r13
3036	adcx	%rax,%r12
3037	adox	%r14,%r13
3038
3039	mulx	16*6($nptr),%rax,%r14
3040	adcx	%rax,%r13
3041	adox	%r15,%r14
3042
3043	mulx	16*7($nptr),%rax,%r15
3044	 mov	%rbx,%rdx
3045	adcx	%rax,%r14
3046	adox	$carry,%r15		# $carry is 0
3047	adcx	$carry,%r15		# cf=0
3048
3049	.byte	0x67,0x67,0x67
3050	inc	%rcx			# of=0
3051	jnz	.Lsqrx8x_reduce
3052
	# an 8-word modulus has no tail: skip straight to the final adds
3053	mov	$carry,%rax		# xor	%rax,%rax
3054	cmp	0+8(%rsp),$nptr		# end of n[]?
3055	jae	.Lsqrx8x_no_tail
3056
	# otherwise add the next eight words of t[] to the window, move on
	# to the next eight modulus words and replay the stored multipliers
3057	mov	48+8(%rsp),%rdx		# pull n0*a[0]
3058	add	8*0($tptr),%r8
3059	lea	16*8($nptr),$nptr
3060	mov	\$-8,%rcx
3061	adcx	8*1($tptr),%r9
3062	adcx	8*2($tptr),%r10
3063	adc	8*3($tptr),%r11
3064	adc	8*4($tptr),%r12
3065	adc	8*5($tptr),%r13
3066	adc	8*6($tptr),%r14
3067	adc	8*7($tptr),%r15
3068	lea	8*8($tptr),$tptr
3069	sbb	%rax,%rax		# top carry
3070
3071	xor	$carry,$carry		# of=0, cf=0
3072	mov	%rax,16+8(%rsp)
3073	jmp	.Lsqrx8x_tail
3074
	# Tail loop: the eight stored multipliers times the current group
	# of eight modulus words. Each iteration stores one finished word
	# through the t[] pointer and fetches the following multiplier.
3075.align	32
3076.Lsqrx8x_tail:
3077	mov	%r8,%rbx
3078	mulx	16*0($nptr),%rax,%r8
3079	adcx	%rax,%rbx
3080	adox	%r9,%r8
3081
3082	mulx	16*1($nptr),%rax,%r9
3083	adcx	%rax,%r8
3084	adox	%r10,%r9
3085
3086	mulx	16*2($nptr),%rax,%r10
3087	adcx	%rax,%r9
3088	adox	%r11,%r10
3089
3090	mulx	16*3($nptr),%rax,%r11
3091	adcx	%rax,%r10
3092	adox	%r12,%r11
3093
3094	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx	16*4($nptr),%rax,%r12
3095	adcx	%rax,%r11
3096	adox	%r13,%r12
3097
3098	mulx	16*5($nptr),%rax,%r13
3099	adcx	%rax,%r12
3100	adox	%r14,%r13
3101
3102	mulx	16*6($nptr),%rax,%r14
3103	adcx	%rax,%r13
3104	adox	%r15,%r14
3105
3106	mulx	16*7($nptr),%rax,%r15
3107	 mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
3108	adcx	%rax,%r14
3109	adox	$carry,%r15
3110	 mov	%rbx,($tptr,%rcx,8)	# save result
3111	 mov	%r8,%rbx
3112	adcx	$carry,%r15		# cf=0
3113
3114	inc	%rcx			# of=0
3115	jnz	.Lsqrx8x_tail
3116
3117	cmp	0+8(%rsp),$nptr		# end of n[]?
3118	jae	.Lsqrx8x_tail_done	# break out of loop
3119
	# more modulus words left: restore the saved carry into CF, add
	# the next eight words of t[], save the new carry and repeat
3120	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
3121	 mov	48+8(%rsp),%rdx		# pull n0*a[0]
3122	 lea	16*8($nptr),$nptr
3123	adc	8*0($tptr),%r8
3124	adc	8*1($tptr),%r9
3125	adc	8*2($tptr),%r10
3126	adc	8*3($tptr),%r11
3127	adc	8*4($tptr),%r12
3128	adc	8*5($tptr),%r13
3129	adc	8*6($tptr),%r14
3130	adc	8*7($tptr),%r15
3131	lea	8*8($tptr),$tptr
3132	sbb	%rax,%rax
3133	sub	\$8,%rcx		# mov	\$-8,%rcx
3134
3135	xor	$carry,$carry		# of=0, cf=0
3136	mov	%rax,16+8(%rsp)
3137	jmp	.Lsqrx8x_tail
3138
	# end of the modulus: add the top-most carry stored at the start
	# of this pass and ripple it through the whole window
3139.align	32
3140.Lsqrx8x_tail_done:
3141	add	24+8(%rsp),%r8		# can this overflow?
3142	adc	\$0,%r9
3143	adc	\$0,%r10
3144	adc	\$0,%r11
3145	adc	\$0,%r12
3146	adc	\$0,%r13
3147	adc	\$0,%r14
3148	adc	\$0,%r15		# can't overflow, because we
3149					# started with "overhung" part
3150					# of multiplication
3151	mov	$carry,%rax		# xor	%rax,%rax
3152
3153	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
	# Common finish of a pass: add the next eight words of t[] to the
	# window and store the result over them. Along the way rcx gets
	# the negated length back from xmm3, the carry register picks up
	# the last word of the current modulus group (compared against the
	# top result word after the loop), the modulus pointer is rewound,
	# and rax becomes the new top-most carry.
3154.Lsqrx8x_no_tail:			# %cf is 0 if jumped here
3155	adc	8*0($tptr),%r8
3156	 movq	%xmm3,%rcx
3157	adc	8*1($tptr),%r9
3158	 mov	16*7($nptr),$carry
3159	 movq	%xmm2,$nptr		# restore $nptr
3160	adc	8*2($tptr),%r10
3161	adc	8*3($tptr),%r11
3162	adc	8*4($tptr),%r12
3163	adc	8*5($tptr),%r13
3164	adc	8*6($tptr),%r14
3165	adc	8*7($tptr),%r15
3166	adc	%rax,%rax		# top-most carry
3167
3168	mov	32+8(%rsp),%rbx		# n0
3169	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"
3170
3171	mov	%r8,8*0($tptr)		# store top 512 bits
3172	 lea	8*8($tptr),%r8		# borrow %r8
3173	mov	%r9,8*1($tptr)
3174	mov	%r10,8*2($tptr)
3175	mov	%r11,8*3($tptr)
3176	mov	%r12,8*4($tptr)
3177	mov	%r13,8*5($tptr)
3178	mov	%r14,8*6($tptr)
3179	mov	%r15,8*7($tptr)
3180
3181	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
3182	cmp	8+8(%rsp),%r8		# end of t[]?
3183	jb	.Lsqrx8x_reduction_loop
3184___
3185}
3186##############################################################
3187# Post-condition, 4x unrolled
3188#
3189{
# Final subtraction pass. $rptr is re-homed to %rdx for this section.
# @ri and @ni are declared but not referenced in the template below.
3190my ($rptr,$nptr)=("%rdx","%rbp");
3191my @ri=map("%r$_",(10..13));
3192my @ni=map("%r$_",(14..15));
3193$code.=<<___;
	# Decide what to subtract. rbx becomes 1 when the top modulus word
	# (in rsi) is below the top result word (in r15); that is OR-ed
	# with the top-most carry in rax and inverted. The resulting 0/1
	# offsets the modulus pointer by zero or eight bytes. With the
	# 16-byte stride used below, the +8 variant reads the other half
	# of every pair, i.e. the interleaved zeros mentioned in the file
	# header, so the loop always runs and subtracts either the modulus
	# or nothing.
3194	xor	%ebx,%ebx
3195	sub	%r15,%rsi		# compare top-most words
3196	adc	%rbx,%rbx
3197	mov	%rcx,%r10		# -$num
3198	or	%rbx,%rax
3199	mov	%rcx,%r9		# -$num
3200	xor	\$1,%rax
	# negated byte length divided by 32: negative count of 4-word steps
3201	sar	\$3+2,%rcx		# cf=0
3202	#lea	48+8(%rsp,%r9),$tptr
3203	lea	($nptr,%rax,8),$nptr
3204	movq	%xmm1,$rptr		# restore $rptr
3205	movq	%xmm1,$aptr		# prepare for back-to-back call
3206	jmp	.Lsqrx4x_sub
3207
	# Four words per iteration: result = t - (selected operand), with
	# the borrow carried across iterations in CF. Only mov, lea and
	# inc sit between the sbb instructions, none of which alters CF.
3208.align	32
3209.Lsqrx4x_sub:
3210	.byte	0x66
3211	mov	8*0($tptr),%r12
3212	mov	8*1($tptr),%r13
3213	sbb	16*0($nptr),%r12
3214	mov	8*2($tptr),%r14
3215	sbb	16*1($nptr),%r13
3216	mov	8*3($tptr),%r15
3217	lea	8*4($tptr),$tptr
3218	sbb	16*2($nptr),%r14
3219	mov	%r12,8*0($rptr)
3220	sbb	16*3($nptr),%r15
3221	lea	16*4($nptr),$nptr
3222	mov	%r13,8*1($rptr)
3223	mov	%r14,8*2($rptr)
3224	mov	%r15,8*3($rptr)
3225	lea	8*4($rptr),$rptr
3226
3227	inc	%rcx
3228	jnz	.Lsqrx4x_sub
3229___
3230}
3231$code.=<<___;
	# r9 was loaded with the negated length above; flip it back before
	# returning so the length register is valid for the next call
3232	neg	%r9			# restore $num
3233
3234	ret
3235.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
3236___
3237}}}
3238{
# Argument registers for the small helpers below, chosen per calling
# convention. $out aliases $inp because bn_gather5 writes through its
# first argument. $STRIDE is the byte distance between consecutive words
# of one table entry (32 entries of 8 bytes); $N is a quarter of it.
3239my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
3240				("%rdi","%esi","%rdx","%ecx");  # Unix order
3241my $out=$inp;
3242my $STRIDE=2**5*8;
3243my $N=$STRIDE/4;
3244
3245$code.=<<___;
# bn_get_bits5(inp, bit offset): returns in eax the five bits that start
# at the given bit offset of the byte array at inp. The offset is split
# into a 16-bit word index (offset / 16) and a shift (offset mod 16). A
# shift above 11 would run past the 16-bit load, so in that case the
# load address is moved up one byte and the shift reduced by 8 (both
# via cmova).
3246.globl	bn_get_bits5
3247.type	bn_get_bits5,\@abi-omnipotent
3248.align	16
3249bn_get_bits5:
3250	lea	0($inp),%r10
3251	lea	1($inp),%r11
3252	mov	$num,%ecx
3253	shr	\$4,$num
3254	and	\$15,%ecx
3255	lea	-8(%ecx),%eax
3256	cmp	\$11,%ecx
3257	cmova	%r11,%r10
3258	cmova	%eax,%ecx
3259	movzw	(%r10,$num,2),%eax
3260	shrl	%cl,%eax
3261	and	\$31,%eax
3262	ret
3263.size	bn_get_bits5,.-bn_get_bits5
3264
# bn_scatter5(inp, num, tbl, idx): copies num 64-bit words from inp into
# the table, placing word i at byte offset (idx + 32*i)*8 from tbl, so
# the words of one entry end up 256 bytes apart. Returns at once when
# num is zero.
3265.globl	bn_scatter5
3266.type	bn_scatter5,\@abi-omnipotent
3267.align	16
3268bn_scatter5:
3269	cmp	\$0, $num
3270	jz	.Lscatter_epilogue
3271	lea	($tbl,$idx,8),$tbl
3272.Lscatter:
3273	mov	($inp),%rax
3274	lea	8($inp),$inp
3275	mov	%rax,($tbl)
3276	lea	32*8($tbl),$tbl
3277	sub	\$1,$num
3278	jnz	.Lscatter
3279.Lscatter_epilogue:
3280	ret
3281.size	bn_scatter5,.-bn_scatter5
3282
# bn_gather5(out, num, tbl, idx): reads entry idx back from the table
# written by bn_scatter5, num words in total. For every word it loads
# one 8-byte value from each of the four 64-byte lines of the 256-byte
# row, ANDs each with a mask and ORs the results, so the same four
# lines are touched whichever entry is wanted. Unlike bn_scatter5 there
# is no early exit: the loop body runs before num is tested.
#
# NOTE(review): the low three bits of idx select the 8-byte offset
# inside each line, so the addresses still depend on the index below
# cache-line granularity. Later upstream OpenSSL releases replaced this
# routine in response to CVE-2016-0702; check whether this copy should
# pick up that change.
3283.globl	bn_gather5
3284.type	bn_gather5,\@abi-omnipotent
3285.align	16
3286bn_gather5:
3287___
# Win64 only: xmm6 and xmm7 are saved around the body. The prologue is
# emitted as raw bytes so its length matches the unwind codes recorded
# in .LSEH_info_bn_gather5.
3288$code.=<<___ if ($win64);
3289.LSEH_begin_bn_gather5:
3290	# I can't trust assembler to use specific encoding:-(
3291	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
3292	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
3293	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
3294___
3295$code.=<<___;
	# r11 = idx mod 8 (word offset inside a line). The index register
	# is reduced to 3 - ((idx / 8) mod 4), which positions a four-mask
	# window over .Lmagic_masks so that exactly the mask belonging to
	# the wanted line is all-ones. The table pointer is biased by +128
	# and the loads below compensate with -128 displacements.
3296	mov	$idx,%r11d
3297	shr	\$`log($N/8)/log(2)`,$idx
3298	and	\$`$N/8-1`,%r11
3299	not	$idx
3300	lea	.Lmagic_masks(%rip),%rax
3301	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
3302	lea	128($tbl,%r11,8),$tbl	# pointer within 1st cache line
3303	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
3304	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
3305	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
3306	movq	24(%rax,$idx,8),%xmm7
3307	jmp	.Lgather
3308.align	16
	# one output word per iteration; the table pointer advances by a
	# full row each time
3309.Lgather:
3310	movq	`0*$STRIDE/4-128`($tbl),%xmm0
3311	movq	`1*$STRIDE/4-128`($tbl),%xmm1
3312	pand	%xmm4,%xmm0
3313	movq	`2*$STRIDE/4-128`($tbl),%xmm2
3314	pand	%xmm5,%xmm1
3315	movq	`3*$STRIDE/4-128`($tbl),%xmm3
3316	pand	%xmm6,%xmm2
3317	por	%xmm1,%xmm0
3318	pand	%xmm7,%xmm3
3319	.byte	0x67,0x67
3320	por	%xmm2,%xmm0
3321	lea	$STRIDE($tbl),$tbl
3322	por	%xmm3,%xmm0
3323
3324	movq	%xmm0,($out)		# m0=bp[0]
3325	lea	8($out),$out
3326	sub	\$1,$num
3327	jnz	.Lgather
3328___
3329$code.=<<___ if ($win64);
3330	movaps	(%rsp),%xmm6
3331	movaps	0x10(%rsp),%xmm7
3332	lea	0x28(%rsp),%rsp
3333___
# The end label below is emitted for every flavour, whereas the matching
# begin label exists only in the Win64 build.
3334$code.=<<___;
3335	ret
3336.LSEH_end_bn_gather5:
3337.size	bn_gather5,.-bn_gather5
3338___
3339}
# Mask table for bn_gather5 plus the module identification string.
3340$code.=<<___;
# Eight 64-bit masks, written as pairs of .long: three zeros, one
# all-ones, four zeros. bn_gather5 loads four consecutive masks starting
# at index 0..3, so the all-ones mask lands in one of its four mask
# registers and the other three are zero.
3341.align	64
3342.Lmagic_masks:
3343	.long	0,0, 0,0, 0,0, -1,-1
3344	.long	0,0, 0,0, 0,0,  0,0
3345.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3346___
3347
3348# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3349#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception support. mul_handler is the language-
# specific handler shared by the Montgomery routines listed in the
# .xdata section: it works out where the interrupted routine keeps its
# saved stack pointer, restores the non-volatile registers into the
# CONTEXT record and hands over to RtlVirtualUnwind.
3350if ($win64) {
3351$rec="%rcx";
3352$frame="%rdx";
3353$context="%r8";
3354$disp="%r9";
3355
3356$code.=<<___;
3357.extern	__imp_RtlVirtualUnwind
3358.type	mul_handler,\@abi-omnipotent
3359.align	16
3360mul_handler:
	# preserve every register the handler touches, the flags, and
	# reserve 64 bytes for the outgoing call arguments
3361	push	%rsi
3362	push	%rdi
3363	push	%rbx
3364	push	%rbp
3365	push	%r12
3366	push	%r13
3367	push	%r14
3368	push	%r15
3369	pushfq
3370	sub	\$64,%rsp
3371
3372	mov	120($context),%rax	# pull context->Rax
3373	mov	248($context),%rbx	# pull context->Rip
3374
3375	mov	8($disp),%rsi		# disp->ImageBase
3376	mov	56($disp),%r11		# disp->HandlerData
3377
	# HandlerData[] holds two image-relative labels per routine (see
	# the .xdata entries): end of prologue and start of epilogue.
	# Before the first one the value taken from context->Rax is used
	# as the frame as is; from the second one on context->Rsp is used
	# as is.
3378	mov	0(%r11),%r10d		# HandlerData[0]
3379	lea	(%rsi,%r10),%r10	# end of prologue label
3380	cmp	%r10,%rbx		# context->Rip<end of prologue label
3381	jb	.Lcommon_seh_tail
3382
3383	mov	152($context),%rax	# pull context->Rsp
3384
3385	mov	4(%r11),%r10d		# HandlerData[1]
3386	lea	(%rsi,%r10),%r10	# epilogue label
3387	cmp	%r10,%rbx		# context->Rip>=epilogue label
3388	jae	.Lcommon_seh_tail
3389
	# Inside a body the original stack pointer has to be fetched from
	# the frame. Two layouts are handled: a fixed slot at offset 40,
	# or a slot whose position depends on the length argument read
	# from the context record.
	# NOTE(review): bn_powerx5 stores its saved stack pointer at
	# 40(%rsp). Confirm that the branch direction below sends every
	# routine to the layout it really uses; that depends on where
	# .Lmul_epilogue sits relative to the other bodies, which cannot
	# be seen from this part of the file.
3390	lea	.Lmul_epilogue(%rip),%r10
3391	cmp	%r10,%rbx
3392	jb	.Lbody_40
3393
3394	mov	192($context),%r10	# pull $num
3395	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
3396	jmp	.Lbody_proceed
3397
3398.Lbody_40:
3399	mov	40(%rax),%rax		# pull saved stack pointer
3400.Lbody_proceed:
3401
	# rax is now the stack pointer at function entry; the saved
	# registers sit at fixed negative offsets from it
3402	movaps	-88(%rax),%xmm0
3403	movaps	-72(%rax),%xmm1
3404
3405	mov	-8(%rax),%rbx
3406	mov	-16(%rax),%rbp
3407	mov	-24(%rax),%r12
3408	mov	-32(%rax),%r13
3409	mov	-40(%rax),%r14
3410	mov	-48(%rax),%r15
3411	mov	%rbx,144($context)	# restore context->Rbx
3412	mov	%rbp,160($context)	# restore context->Rbp
3413	mov	%r12,216($context)	# restore context->R12
3414	mov	%r13,224($context)	# restore context->R13
3415	mov	%r14,232($context)	# restore context->R14
3416	mov	%r15,240($context)	# restore context->R15
3417	movups	%xmm0,512($context)	# restore context->Xmm6
3418	movups	%xmm1,528($context)	# restore context->Xmm7
3419
3420.Lcommon_seh_tail:
	# rsi and rdi are reloaded from the two slots just above the
	# frame address held in rax
3421	mov	8(%rax),%rdi
3422	mov	16(%rax),%rsi
3423	mov	%rax,152($context)	# restore context->Rsp
3424	mov	%rsi,168($context)	# restore context->Rsi
3425	mov	%rdi,176($context)	# restore context->Rdi
3426
	# copy the patched CONTEXT (154 quadwords) over the dispatcher's
	# own context record
3427	mov	40($disp),%rdi		# disp->ContextRecord
3428	mov	$context,%rsi		# context
3429	mov	\$154,%ecx		# sizeof(CONTEXT)
3430	.long	0xa548f3fc		# cld; rep movsq
3431
	# RtlVirtualUnwind takes eight arguments: four in registers, the
	# rest in the stack area reserved above
3432	mov	$disp,%rsi
3433	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3434	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3435	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3436	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3437	mov	40(%rsi),%r10		# disp->ContextRecord
3438	lea	56(%rsi),%r11		# &disp->HandlerData
3439	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3440	mov	%r10,32(%rsp)		# arg5
3441	mov	%r11,40(%rsp)		# arg6
3442	mov	%r12,48(%rsp)		# arg7
3443	mov	%rcx,56(%rsp)		# arg8, (NULL)
3444	call	*__imp_RtlVirtualUnwind(%rip)
3445
3446	mov	\$1,%eax		# ExceptionContinueSearch
3447	add	\$64,%rsp
3448	popfq
3449	pop	%r15
3450	pop	%r14
3451	pop	%r13
3452	pop	%r12
3453	pop	%rbp
3454	pop	%rbx
3455	pop	%rdi
3456	pop	%rsi
3457	ret
3458.size	mul_handler,.-mul_handler
3459
# Function table: one entry of three image-relative addresses per
# routine (begin label, end label, unwind info label).
3460.section	.pdata
3461.align	4
3462	.rva	.LSEH_begin_bn_mul_mont_gather5
3463	.rva	.LSEH_end_bn_mul_mont_gather5
3464	.rva	.LSEH_info_bn_mul_mont_gather5
3465
3466	.rva	.LSEH_begin_bn_mul4x_mont_gather5
3467	.rva	.LSEH_end_bn_mul4x_mont_gather5
3468	.rva	.LSEH_info_bn_mul4x_mont_gather5
3469
3470	.rva	.LSEH_begin_bn_power5
3471	.rva	.LSEH_end_bn_power5
3472	.rva	.LSEH_info_bn_power5
3473
3474	.rva	.LSEH_begin_bn_from_mont8x
3475	.rva	.LSEH_end_bn_from_mont8x
3476	.rva	.LSEH_info_bn_from_mont8x
3477___
# The MULX/ADX routines get table entries only when the assembler
# support flag is set.
3478$code.=<<___ if ($addx);
3479	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
3480	.rva	.LSEH_end_bn_mulx4x_mont_gather5
3481	.rva	.LSEH_info_bn_mulx4x_mont_gather5
3482
3483	.rva	.LSEH_begin_bn_powerx5
3484	.rva	.LSEH_end_bn_powerx5
3485	.rva	.LSEH_info_bn_powerx5
3486___
3487$code.=<<___;
3488	.rva	.LSEH_begin_bn_gather5
3489	.rva	.LSEH_end_bn_gather5
3490	.rva	.LSEH_info_bn_gather5
3491
	# Unwind info. The entries that name mul_handler share one shape:
	# a four-byte header, the handler address, then the two labels
	# that mul_handler reads as HandlerData[0] (end of prologue) and
	# HandlerData[1] (start of epilogue).
3492.section	.xdata
3493.align	8
3494.LSEH_info_bn_mul_mont_gather5:
3495	.byte	9,0,0,0
3496	.rva	mul_handler
3497	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
3498.align	8
3499.LSEH_info_bn_mul4x_mont_gather5:
3500	.byte	9,0,0,0
3501	.rva	mul_handler
3502	.rva	.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
3503.align	8
3504.LSEH_info_bn_power5:
3505	.byte	9,0,0,0
3506	.rva	mul_handler
3507	.rva	.Lpower5_body,.Lpower5_epilogue		# HandlerData[]
3508.align	8
3509.LSEH_info_bn_from_mont8x:
3510	.byte	9,0,0,0
3511	.rva	mul_handler
3512	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
3513___
3514$code.=<<___ if ($addx);
3515.align	8
3516.LSEH_info_bn_mulx4x_mont_gather5:
3517	.byte	9,0,0,0
3518	.rva	mul_handler
3519	.rva	.Lmulx4x_body,.Lmulx4x_epilogue		# HandlerData[]
3520.align	8
3521.LSEH_info_bn_powerx5:
3522	.byte	9,0,0,0
3523	.rva	mul_handler
3524	.rva	.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
3525___
3526$code.=<<___;
	# bn_gather5 names no handler; its prologue is described with
	# plain unwind codes instead, listed last instruction first. The
	# first byte of each code is the prologue offset just past the
	# corresponding hand-encoded instruction in bn_gather5.
3527.align	8
3528.LSEH_info_bn_gather5:
3529        .byte   0x01,0x0d,0x05,0x00
3530        .byte   0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
3531        .byte   0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
3532        .byte   0x04,0x42,0x00,0x00	#sub	rsp,0x28
3533.align	8
3534___
3535}
3536
# Expand every `...` span in the generated text by evaluating it as a
# Perl expression (this is how the computed shift counts, masks and
# displacements in the templates above become literals).
3537$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3538
# STDOUT is a pipe into the x86_64-xlate.pl translator. Output is
# buffered, and for a piped handle close() also waits for the child and
# reports its exit status, so a write failure or a failing translator
# shows up only here. Check it rather than silently leaving a truncated
# or empty output file behind.
3539print $code;
3540close STDOUT or die "error closing STDOUT: $! (child status $?)";
3541