1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005.
11#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
# than twice (>2x) as fast. The most common case, rsa1024 sign, is
# improved by a respectable 50%. It remains to be seen whether loop
# unrolling and a dedicated squaring routine can provide further
# improvement...
17
18# July 2011.
19#
# Add a dedicated squaring procedure. The performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
23
24# August 2011.
25#
# Unroll and modulo-schedule the inner loops in such a manner that they
# are "fallen through" for an input length of 8, which is critical for
# 1024-bit RSA *sign*. The average performance improvement in comparison
# to the *initial* 2005 version of this module is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31
32$flavour = shift;
33$output  = shift;
34if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35
36$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
37( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
38( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
39die "can't locate x86_64-xlate.pl";
40
41open OUT,"| \"$^X\" $xlate $flavour $output";
42*STDOUT=*OUT;
43
44# int bn_mul_mont(
45$rp="%rdi";	# BN_ULONG *rp,
46$ap="%rsi";	# const BN_ULONG *ap,
47$bp="%rdx";	# const BN_ULONG *bp,
48$np="%rcx";	# const BN_ULONG *np,
49$n0="%r8";	# const BN_ULONG *n0,
50$num="%r9";	# int num);
51$lo0="%r10";
52$hi0="%r11";
53$hi1="%r13";
54$i="%r14";
55$j="%r15";
56$m0="%rbx";
57$m1="%rbp";
58
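# For reference, the .L1st/.Louter code below implements classical
# word-by-word Montgomery multiplication. The following C sketch mirrors
# that structure; it is illustrative only (not the emitted code, and the
# helper name is made up here), and it assumes 64-bit limbs plus a
# compiler providing unsigned __int128. The scratch vector tp[]
# corresponds to the stack frame allocated in .Lmul_enter; it is simply
# zero-initialized here instead of being filled by a special first pass.
#
#	typedef unsigned long long u64;
#	typedef unsigned __int128 u128;
#
#	/* rp[] = ap[]*bp[]*2^(-64*num) mod np[] */
#	static void bn_mul_mont_ref(u64 *rp, const u64 *ap, const u64 *bp,
#				    const u64 *np, u64 n0, int num)
#	{
#		u64 tp[num+1];			/* tp[num] = overflow bit */
#		for (int k = 0; k <= num; k++) tp[k] = 0;
#
#		for (int i = 0; i < num; i++) {
#			u64  m0  = bp[i];
#			u128 acc = (u128)ap[0]*m0 + tp[0];
#			u64  m1  = (u64)acc*n0;			/* "tp[0]"*n0	 */
#			u128 red = (u128)np[0]*m1 + (u64)acc;	/* low discarded */
#			u64  hi0 = (u64)(acc>>64), hi1 = (u64)(red>>64);
#
#			for (int j = 1; j < num; j++) {
#				acc = (u128)ap[j]*m0 + hi0 + tp[j];
#				red = (u128)np[j]*m1 + hi1 + (u64)acc;
#				hi0 = (u64)(acc>>64);
#				hi1 = (u64)(red>>64);
#				tp[j-1] = (u64)red;
#			}
#			red = (u128)hi0 + hi1 + tp[num];
#			tp[num-1] = (u64)red;
#			tp[num]   = (u64)(red>>64);	/* upmost overflow bit */
#		}
#
#		/* at most one final subtraction of np[] is required */
#		u64 borrow = 0;
#		for (int j = 0; j < num; j++) {
#			u128 d = (u128)tp[j] - np[j] - borrow;
#			rp[j] = (u64)d;
#			borrow = (u64)(d>>64) & 1;
#		}
#		if (borrow > tp[num])		/* tp < np: keep tp */
#			for (int j = 0; j < num; j++) rp[j] = tp[j];
#	}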
59$code=<<___;
60.text
61
62.globl	bn_mul_mont
63.type	bn_mul_mont,\@function,6
64.align	16
65bn_mul_mont:
66	_CET_ENDBR
67	test	\$3,${num}d
68	jnz	.Lmul_enter
69	cmp	\$8,${num}d
70	jb	.Lmul_enter
71	cmp	$ap,$bp
72	jne	.Lmul4x_enter
73	jmp	.Lsqr4x_enter
74
75.align	16
76.Lmul_enter:
77	push	%rbx
78	push	%rbp
79	push	%r12
80	push	%r13
81	push	%r14
82	push	%r15
83
84	mov	${num}d,${num}d
85	lea	2($num),%r10
86	mov	%rsp,%r11
87	neg	%r10
88	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
89	and	\$-1024,%rsp		# minimize TLB usage
90
91	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
92.Lmul_body:
93	mov	$bp,%r12		# reassign $bp
94___
95		$bp="%r12";
96$code.=<<___;
97	mov	($n0),$n0		# pull n0[0] value
98	mov	($bp),$m0		# m0=bp[0]
99	mov	($ap),%rax
100
101	xor	$i,$i			# i=0
102	xor	$j,$j			# j=0
103
104	mov	$n0,$m1
105	mulq	$m0			# ap[0]*bp[0]
106	mov	%rax,$lo0
107	mov	($np),%rax
108
109	imulq	$lo0,$m1		# "tp[0]"*n0
110	mov	%rdx,$hi0
111
112	mulq	$m1			# np[0]*m1
113	add	%rax,$lo0		# discarded
114	mov	8($ap),%rax
115	adc	\$0,%rdx
116	mov	%rdx,$hi1
117
118	lea	1($j),$j		# j++
119	jmp	.L1st_enter
120
121.align	16
122.L1st:
123	add	%rax,$hi1
124	mov	($ap,$j,8),%rax
125	adc	\$0,%rdx
126	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
127	mov	$lo0,$hi0
128	adc	\$0,%rdx
129	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
130	mov	%rdx,$hi1
131
132.L1st_enter:
133	mulq	$m0			# ap[j]*bp[0]
134	add	%rax,$hi0
135	mov	($np,$j,8),%rax
136	adc	\$0,%rdx
137	lea	1($j),$j		# j++
138	mov	%rdx,$lo0
139
140	mulq	$m1			# np[j]*m1
141	cmp	$num,$j
142	jl	.L1st
143
144	add	%rax,$hi1
145	mov	($ap),%rax		# ap[0]
146	adc	\$0,%rdx
147	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
148	adc	\$0,%rdx
149	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
150	mov	%rdx,$hi1
151	mov	$lo0,$hi0
152
153	xor	%rdx,%rdx
154	add	$hi0,$hi1
155	adc	\$0,%rdx
156	mov	$hi1,-8(%rsp,$num,8)
157	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
158
159	lea	1($i),$i		# i++
160	jmp	.Louter
161.align	16
162.Louter:
163	mov	($bp,$i,8),$m0		# m0=bp[i]
164	xor	$j,$j			# j=0
165	mov	$n0,$m1
166	mov	(%rsp),$lo0
167	mulq	$m0			# ap[0]*bp[i]
168	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
169	mov	($np),%rax
170	adc	\$0,%rdx
171
172	imulq	$lo0,$m1		# tp[0]*n0
173	mov	%rdx,$hi0
174
175	mulq	$m1			# np[0]*m1
176	add	%rax,$lo0		# discarded
177	mov	8($ap),%rax
178	adc	\$0,%rdx
179	mov	8(%rsp),$lo0		# tp[1]
180	mov	%rdx,$hi1
181
182	lea	1($j),$j		# j++
183	jmp	.Linner_enter
184
185.align	16
186.Linner:
187	add	%rax,$hi1
188	mov	($ap,$j,8),%rax
189	adc	\$0,%rdx
190	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
191	mov	(%rsp,$j,8),$lo0
192	adc	\$0,%rdx
193	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
194	mov	%rdx,$hi1
195
196.Linner_enter:
197	mulq	$m0			# ap[j]*bp[i]
198	add	%rax,$hi0
199	mov	($np,$j,8),%rax
200	adc	\$0,%rdx
201	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
202	mov	%rdx,$hi0
203	adc	\$0,$hi0
204	lea	1($j),$j		# j++
205
206	mulq	$m1			# np[j]*m1
207	cmp	$num,$j
208	jl	.Linner
209
210	add	%rax,$hi1
211	mov	($ap),%rax		# ap[0]
212	adc	\$0,%rdx
213	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
214	mov	(%rsp,$j,8),$lo0
215	adc	\$0,%rdx
216	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
217	mov	%rdx,$hi1
218
219	xor	%rdx,%rdx
220	add	$hi0,$hi1
221	adc	\$0,%rdx
222	add	$lo0,$hi1		# pull upmost overflow bit
223	adc	\$0,%rdx
224	mov	$hi1,-8(%rsp,$num,8)
225	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
226
227	lea	1($i),$i		# i++
228	cmp	$num,$i
229	jl	.Louter
230
231	xor	$i,$i			# i=0 and clear CF!
232	mov	(%rsp),%rax		# tp[0]
233	lea	(%rsp),$ap		# borrow ap for tp
234	mov	$num,$j			# j=num
235	jmp	.Lsub
236.align	16
237.Lsub:	sbb	($np,$i,8),%rax
238	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
239	mov	8($ap,$i,8),%rax	# tp[i+1]
240	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
242	jnz	.Lsub
243
244	sbb	\$0,%rax		# handle upmost overflow bit
245	xor	$i,$i
246	and	%rax,$ap
247	not	%rax
248	mov	$rp,$np
249	and	%rax,$np
250	mov	$num,$j			# j=num
251	or	$np,$ap			# ap=borrow?tp:rp
252.align	16
253.Lcopy:					# copy or in-place refresh
254	mov	($ap,$i,8),%rax
255	mov	$i,(%rsp,$i,8)		# zap temporary vector
256	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
257	lea	1($i),$i
258	sub	\$1,$j
259	jnz	.Lcopy
260
261	mov	8(%rsp,$num,8),%rsi	# restore %rsp
262	mov	\$1,%rax
263	mov	(%rsi),%r15
264	mov	8(%rsi),%r14
265	mov	16(%rsi),%r13
266	mov	24(%rsi),%r12
267	mov	32(%rsi),%rbp
268	mov	40(%rsi),%rbx
269	lea	48(%rsi),%rsp
270.Lmul_epilogue:
271	ret
272.size	bn_mul_mont,.-bn_mul_mont
273___
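# The .Lsub/.Lcopy tail above (and its counterparts in the 4x and sqr4x
# code below) never branches on the comparison result: it always writes
# tp[]-np[] to rp[], turns the final borrow and the overflow word into a
# 0/all-ones mask, and copies from either tp[] or rp[] while zapping the
# on-stack vector. A hedged C equivalent of that select follows; the
# helper name and the zero fill are illustrative (the code above zaps
# with the index value and masks the source *pointer* via and/not/and/or
# instead of masking the values):
#
#	#include <stdint.h>
#
#	/* rp[] already holds tp[]-np[]; 'borrow' is the final borrow and
#	 * tp[num] the carried-out bit. Because the Montgomery result is
#	 * less than 2*np, tp[num]-borrow is either 0 or all-ones. */
#	static void mont_tail(uint64_t *rp, uint64_t *tp,
#			      uint64_t borrow, int num)
#	{
#		uint64_t mask = tp[num] - borrow;	/* sbb $0,%rax	  */
#		for (int j = 0; j < num; j++) {		/* .Lcopy	  */
#			rp[j] = (tp[j] & mask) | (rp[j] & ~mask);
#			tp[j] = 0;		/* zap temporary vector	  */
#		}
#	}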
274{{{
275my @A=("%r10","%r11");
276my @N=("%r13","%rdi");
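# bn_mul4x_mont is the same word-by-word algorithm as above, with the
# inner loops unrolled by four limbs; the @A and @N pairs play the role
# of hi0/hi1 in the reference sketch at the top of this file, each split
# across two rotating registers so that a product can be consumed while
# the next mulq is in flight. Schematically (illustrative only; STEP(j)
# stands for one ap[j]*bp[i] / np[j]*m1 pair of that sketch, and num is
# a multiple of 4, not less than 8):
#
#	STEP(1);				/* prologue		*/
#	for (int j = 4; j < num; j += 4) {	/* .L1st4x/.Linner4x	*/
#		STEP(j-2); STEP(j-1); STEP(j); STEP(j+1);
#	}
#	STEP(num-2); STEP(num-1);		/* epilogue		*/
#
# so for num==8 the loop body executes exactly once and "falls through",
# as noted in the August 2011 commentary above.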
277$code.=<<___;
278.type	bn_mul4x_mont,\@function,6
279.align	16
280bn_mul4x_mont:
281.Lmul4x_enter:
282	_CET_ENDBR
283	push	%rbx
284	push	%rbp
285	push	%r12
286	push	%r13
287	push	%r14
288	push	%r15
289
290	mov	${num}d,${num}d
291	lea	4($num),%r10
292	mov	%rsp,%r11
293	neg	%r10
294	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
295	and	\$-1024,%rsp		# minimize TLB usage
296
297	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
298.Lmul4x_body:
299	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
300	mov	%rdx,%r12		# reassign $bp
301___
302		$bp="%r12";
303$code.=<<___;
304	mov	($n0),$n0		# pull n0[0] value
305	mov	($bp),$m0		# m0=bp[0]
306	mov	($ap),%rax
307
308	xor	$i,$i			# i=0
309	xor	$j,$j			# j=0
310
311	mov	$n0,$m1
312	mulq	$m0			# ap[0]*bp[0]
313	mov	%rax,$A[0]
314	mov	($np),%rax
315
316	imulq	$A[0],$m1		# "tp[0]"*n0
317	mov	%rdx,$A[1]
318
319	mulq	$m1			# np[0]*m1
320	add	%rax,$A[0]		# discarded
321	mov	8($ap),%rax
322	adc	\$0,%rdx
323	mov	%rdx,$N[1]
324
325	mulq	$m0
326	add	%rax,$A[1]
327	mov	8($np),%rax
328	adc	\$0,%rdx
329	mov	%rdx,$A[0]
330
331	mulq	$m1
332	add	%rax,$N[1]
333	mov	16($ap),%rax
334	adc	\$0,%rdx
335	add	$A[1],$N[1]
336	lea	4($j),$j		# j++
337	adc	\$0,%rdx
338	mov	$N[1],(%rsp)
339	mov	%rdx,$N[0]
340	jmp	.L1st4x
341.align	16
342.L1st4x:
343	mulq	$m0			# ap[j]*bp[0]
344	add	%rax,$A[0]
345	mov	-16($np,$j,8),%rax
346	adc	\$0,%rdx
347	mov	%rdx,$A[1]
348
349	mulq	$m1			# np[j]*m1
350	add	%rax,$N[0]
351	mov	-8($ap,$j,8),%rax
352	adc	\$0,%rdx
353	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
354	adc	\$0,%rdx
355	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
356	mov	%rdx,$N[1]
357
358	mulq	$m0			# ap[j]*bp[0]
359	add	%rax,$A[1]
360	mov	-8($np,$j,8),%rax
361	adc	\$0,%rdx
362	mov	%rdx,$A[0]
363
364	mulq	$m1			# np[j]*m1
365	add	%rax,$N[1]
366	mov	($ap,$j,8),%rax
367	adc	\$0,%rdx
368	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
369	adc	\$0,%rdx
370	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
371	mov	%rdx,$N[0]
372
373	mulq	$m0			# ap[j]*bp[0]
374	add	%rax,$A[0]
375	mov	($np,$j,8),%rax
376	adc	\$0,%rdx
377	mov	%rdx,$A[1]
378
379	mulq	$m1			# np[j]*m1
380	add	%rax,$N[0]
381	mov	8($ap,$j,8),%rax
382	adc	\$0,%rdx
383	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
384	adc	\$0,%rdx
385	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
386	mov	%rdx,$N[1]
387
388	mulq	$m0			# ap[j]*bp[0]
389	add	%rax,$A[1]
390	mov	8($np,$j,8),%rax
391	adc	\$0,%rdx
392	lea	4($j),$j		# j++
393	mov	%rdx,$A[0]
394
395	mulq	$m1			# np[j]*m1
396	add	%rax,$N[1]
397	mov	-16($ap,$j,8),%rax
398	adc	\$0,%rdx
399	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
400	adc	\$0,%rdx
401	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
402	mov	%rdx,$N[0]
403	cmp	$num,$j
404	jl	.L1st4x
405
406	mulq	$m0			# ap[j]*bp[0]
407	add	%rax,$A[0]
408	mov	-16($np,$j,8),%rax
409	adc	\$0,%rdx
410	mov	%rdx,$A[1]
411
412	mulq	$m1			# np[j]*m1
413	add	%rax,$N[0]
414	mov	-8($ap,$j,8),%rax
415	adc	\$0,%rdx
416	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
417	adc	\$0,%rdx
418	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
419	mov	%rdx,$N[1]
420
421	mulq	$m0			# ap[j]*bp[0]
422	add	%rax,$A[1]
423	mov	-8($np,$j,8),%rax
424	adc	\$0,%rdx
425	mov	%rdx,$A[0]
426
427	mulq	$m1			# np[j]*m1
428	add	%rax,$N[1]
429	mov	($ap),%rax		# ap[0]
430	adc	\$0,%rdx
431	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
432	adc	\$0,%rdx
433	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
434	mov	%rdx,$N[0]
435
436	xor	$N[1],$N[1]
437	add	$A[0],$N[0]
438	adc	\$0,$N[1]
439	mov	$N[0],-8(%rsp,$j,8)
440	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
441
442	lea	1($i),$i		# i++
443.align	4
444.Louter4x:
445	mov	($bp,$i,8),$m0		# m0=bp[i]
446	xor	$j,$j			# j=0
447	mov	(%rsp),$A[0]
448	mov	$n0,$m1
449	mulq	$m0			# ap[0]*bp[i]
450	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
451	mov	($np),%rax
452	adc	\$0,%rdx
453
454	imulq	$A[0],$m1		# tp[0]*n0
455	mov	%rdx,$A[1]
456
457	mulq	$m1			# np[0]*m1
458	add	%rax,$A[0]		# "$N[0]", discarded
459	mov	8($ap),%rax
460	adc	\$0,%rdx
461	mov	%rdx,$N[1]
462
463	mulq	$m0			# ap[j]*bp[i]
464	add	%rax,$A[1]
465	mov	8($np),%rax
466	adc	\$0,%rdx
467	add	8(%rsp),$A[1]		# +tp[1]
468	adc	\$0,%rdx
469	mov	%rdx,$A[0]
470
471	mulq	$m1			# np[j]*m1
472	add	%rax,$N[1]
473	mov	16($ap),%rax
474	adc	\$0,%rdx
475	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
476	lea	4($j),$j		# j+=2
477	adc	\$0,%rdx
478	mov	$N[1],(%rsp)		# tp[j-1]
479	mov	%rdx,$N[0]
480	jmp	.Linner4x
481.align	16
482.Linner4x:
483	mulq	$m0			# ap[j]*bp[i]
484	add	%rax,$A[0]
485	mov	-16($np,$j,8),%rax
486	adc	\$0,%rdx
487	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
488	adc	\$0,%rdx
489	mov	%rdx,$A[1]
490
491	mulq	$m1			# np[j]*m1
492	add	%rax,$N[0]
493	mov	-8($ap,$j,8),%rax
494	adc	\$0,%rdx
495	add	$A[0],$N[0]
496	adc	\$0,%rdx
497	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
498	mov	%rdx,$N[1]
499
500	mulq	$m0			# ap[j]*bp[i]
501	add	%rax,$A[1]
502	mov	-8($np,$j,8),%rax
503	adc	\$0,%rdx
504	add	-8(%rsp,$j,8),$A[1]
505	adc	\$0,%rdx
506	mov	%rdx,$A[0]
507
508	mulq	$m1			# np[j]*m1
509	add	%rax,$N[1]
510	mov	($ap,$j,8),%rax
511	adc	\$0,%rdx
512	add	$A[1],$N[1]
513	adc	\$0,%rdx
514	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
515	mov	%rdx,$N[0]
516
517	mulq	$m0			# ap[j]*bp[i]
518	add	%rax,$A[0]
519	mov	($np,$j,8),%rax
520	adc	\$0,%rdx
521	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
522	adc	\$0,%rdx
523	mov	%rdx,$A[1]
524
525	mulq	$m1			# np[j]*m1
526	add	%rax,$N[0]
527	mov	8($ap,$j,8),%rax
528	adc	\$0,%rdx
529	add	$A[0],$N[0]
530	adc	\$0,%rdx
531	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
532	mov	%rdx,$N[1]
533
534	mulq	$m0			# ap[j]*bp[i]
535	add	%rax,$A[1]
536	mov	8($np,$j,8),%rax
537	adc	\$0,%rdx
538	add	8(%rsp,$j,8),$A[1]
539	adc	\$0,%rdx
540	lea	4($j),$j		# j++
541	mov	%rdx,$A[0]
542
543	mulq	$m1			# np[j]*m1
544	add	%rax,$N[1]
545	mov	-16($ap,$j,8),%rax
546	adc	\$0,%rdx
547	add	$A[1],$N[1]
548	adc	\$0,%rdx
549	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
550	mov	%rdx,$N[0]
551	cmp	$num,$j
552	jl	.Linner4x
553
554	mulq	$m0			# ap[j]*bp[i]
555	add	%rax,$A[0]
556	mov	-16($np,$j,8),%rax
557	adc	\$0,%rdx
558	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
559	adc	\$0,%rdx
560	mov	%rdx,$A[1]
561
562	mulq	$m1			# np[j]*m1
563	add	%rax,$N[0]
564	mov	-8($ap,$j,8),%rax
565	adc	\$0,%rdx
566	add	$A[0],$N[0]
567	adc	\$0,%rdx
568	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
569	mov	%rdx,$N[1]
570
571	mulq	$m0			# ap[j]*bp[i]
572	add	%rax,$A[1]
573	mov	-8($np,$j,8),%rax
574	adc	\$0,%rdx
575	add	-8(%rsp,$j,8),$A[1]
576	adc	\$0,%rdx
577	lea	1($i),$i		# i++
578	mov	%rdx,$A[0]
579
580	mulq	$m1			# np[j]*m1
581	add	%rax,$N[1]
582	mov	($ap),%rax		# ap[0]
583	adc	\$0,%rdx
584	add	$A[1],$N[1]
585	adc	\$0,%rdx
586	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
587	mov	%rdx,$N[0]
588
589	xor	$N[1],$N[1]
590	add	$A[0],$N[0]
591	adc	\$0,$N[1]
592	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
593	adc	\$0,$N[1]
594	mov	$N[0],-8(%rsp,$j,8)
595	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
596
597	cmp	$num,$i
598	jl	.Louter4x
599___
600{
601my @ri=("%rax","%rdx",$m0,$m1);
602$code.=<<___;
603	mov	16(%rsp,$num,8),$rp	# restore $rp
604	mov	0(%rsp),@ri[0]		# tp[0]
605	pxor	%xmm0,%xmm0
606	mov	8(%rsp),@ri[1]		# tp[1]
607	shr	\$2,$num		# num/=4
608	lea	(%rsp),$ap		# borrow ap for tp
609	xor	$i,$i			# i=0 and clear CF!
610
611	sub	0($np),@ri[0]
612	mov	16($ap),@ri[2]		# tp[2]
613	mov	24($ap),@ri[3]		# tp[3]
614	sbb	8($np),@ri[1]
615	lea	-1($num),$j		# j=num/4-1
616	jmp	.Lsub4x
617.align	16
618.Lsub4x:
619	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
620	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
621	sbb	16($np,$i,8),@ri[2]
622	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
623	mov	40($ap,$i,8),@ri[1]
624	sbb	24($np,$i,8),@ri[3]
625	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
626	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
627	sbb	32($np,$i,8),@ri[0]
628	mov	48($ap,$i,8),@ri[2]
629	mov	56($ap,$i,8),@ri[3]
630	sbb	40($np,$i,8),@ri[1]
631	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
633	jnz	.Lsub4x
634
635	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
636	mov	32($ap,$i,8),@ri[0]	# load overflow bit
637	sbb	16($np,$i,8),@ri[2]
638	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
639	sbb	24($np,$i,8),@ri[3]
640	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
641
642	sbb	\$0,@ri[0]		# handle upmost overflow bit
643	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
644	xor	$i,$i			# i=0
645	and	@ri[0],$ap
646	not	@ri[0]
647	mov	$rp,$np
648	and	@ri[0],$np
649	lea	-1($num),$j
650	or	$np,$ap			# ap=borrow?tp:rp
651
652	movdqu	($ap),%xmm1
653	movdqa	%xmm0,(%rsp)
654	movdqu	%xmm1,($rp)
655	jmp	.Lcopy4x
656.align	16
657.Lcopy4x:					# copy or in-place refresh
658	movdqu	16($ap,$i),%xmm2
659	movdqu	32($ap,$i),%xmm1
660	movdqa	%xmm0,16(%rsp,$i)
661	movdqu	%xmm2,16($rp,$i)
662	movdqa	%xmm0,32(%rsp,$i)
663	movdqu	%xmm1,32($rp,$i)
664	lea	32($i),$i
665	dec	$j
666	jnz	.Lcopy4x
667
668	shl	\$2,$num
669	movdqu	16($ap,$i),%xmm2
670	movdqa	%xmm0,16(%rsp,$i)
671	movdqu	%xmm2,16($rp,$i)
672___
673}
674$code.=<<___;
675	mov	8(%rsp,$num,8),%rsi	# restore %rsp
676	mov	\$1,%rax
677	mov	(%rsi),%r15
678	mov	8(%rsi),%r14
679	mov	16(%rsi),%r13
680	mov	24(%rsi),%r12
681	mov	32(%rsi),%rbp
682	mov	40(%rsi),%rbx
683	lea	48(%rsi),%rsp
684.Lmul4x_epilogue:
685	ret
686.size	bn_mul4x_mont,.-bn_mul4x_mont
687___
688}}}
689{{{
690######################################################################
691# void bn_sqr4x_mont(
692my $rptr="%rdi";	# const BN_ULONG *rptr,
693my $aptr="%rsi";	# const BN_ULONG *aptr,
694my $bptr="%rdx";	# not used
695my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0,
my $num ="%r9";		# int num); has to be divisible by 4
			# and not less than 8
699
700my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
701my @A0=("%r10","%r11");
702my @A1=("%r12","%r13");
703my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
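# The squaring part below follows the decomposition described in the
# a)/b) comments inside the code: accumulate the cross products
# a[i]*a[j], i<j, then double the whole vector while folding in the
# a[i]*a[i] terms (.Lsqr4x_shift_n_add). A reference C sketch of that
# decomposition, illustrative only (helper name is made up), with the
# same u64/u128 assumptions as the sketch at the top of this file:
#
#	/* t[0..2*num-1] = a[]^2 */
#	static void bn_sqr_ref(u64 *t, const u64 *a, int num)
#	{
#		/* a) cross products a[i]*a[j], i<j, accumulated into t[] */
#		for (int k = 0; k < 2*num; k++) t[k] = 0;
#		for (int i = 0; i < num; i++) {
#			u64 c = 0;
#			for (int j = i+1; j < num; j++) {
#				u128 p = (u128)a[i]*a[j] + t[i+j] + c;
#				t[i+j] = (u64)p;
#				c      = (u64)(p>>64);
#			}
#			t[i+num] = c;
#		}
#		/* b) t = 2*t + a[i]^2 terms, one shift-and-add pass */
#		u64 shift = 0, carry = 0;
#		for (int i = 0; i < num; i++) {
#			u128 sq = (u128)a[i]*a[i];
#			u64  lo = t[2*i], hi = t[2*i+1];
#			u128 s0 = (u128)((lo<<1) | shift) + (u64)sq + carry;
#			u128 s1 = (u128)((hi<<1) | (lo>>63)) + (u64)(sq>>64)
#				+ (u64)(s0>>64);
#			t[2*i]   = (u64)s0;
#			t[2*i+1] = (u64)s1;
#			shift    = hi>>63;
#			carry    = (u64)(s1>>64);
#		}
#	}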
704
705$code.=<<___;
706.type	bn_sqr4x_mont,\@function,6
707.align	16
708bn_sqr4x_mont:
709.Lsqr4x_enter:
710	_CET_ENDBR
711	push	%rbx
712	push	%rbp
713	push	%r12
714	push	%r13
715	push	%r14
716	push	%r15
717
718	shl	\$3,${num}d		# convert $num to bytes
719	xor	%r10,%r10
720	mov	%rsp,%r11		# put aside %rsp
721	sub	$num,%r10		# -$num
722	mov	($n0),$n0		# *n0
723	lea	-72(%rsp,%r10,2),%rsp	# alloca(frame+2*$num)
724	and	\$-1024,%rsp		# minimize TLB usage
725	##############################################################
726	# Stack layout
727	#
728	# +0	saved $num, used in reduction section
729	# +8	&t[2*$num], used in reduction section
730	# +32	saved $rptr
731	# +40	saved $nptr
732	# +48	saved *n0
733	# +56	saved %rsp
734	# +64	t[2*$num]
735	#
736	mov	$rptr,32(%rsp)		# save $rptr
737	mov	$nptr,40(%rsp)
738	mov	$n0,  48(%rsp)
739	mov	%r11, 56(%rsp)		# save original %rsp
740.Lsqr4x_body:
741	##############################################################
742	# Squaring part:
743	#
744	# a) multiply-n-add everything but a[i]*a[i];
745	# b) shift result of a) by 1 to the left and accumulate
746	#    a[i]*a[i] products;
747	#
748	lea	32(%r10),$i		# $i=-($num-32)
749	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
750
751	mov	$num,$j			# $j=$num
752
753					# comments apply to $num==8 case
754	mov	-32($aptr,$i),$a0	# a[0]
755	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
756	mov	-24($aptr,$i),%rax	# a[1]
757	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
758	mov	-16($aptr,$i),$ai	# a[2]
759	mov	%rax,$a1
760
761	mul	$a0			# a[1]*a[0]
762	mov	%rax,$A0[0]		# a[1]*a[0]
763	 mov	$ai,%rax		# a[2]
764	mov	%rdx,$A0[1]
765	mov	$A0[0],-24($tptr,$i)	# t[1]
766
767	xor	$A0[0],$A0[0]
768	mul	$a0			# a[2]*a[0]
769	add	%rax,$A0[1]
770	 mov	$ai,%rax
771	adc	%rdx,$A0[0]
772	mov	$A0[1],-16($tptr,$i)	# t[2]
773
774	lea	-16($i),$j		# j=-16
775
776
777	 mov	8($aptr,$j),$ai		# a[3]
778	mul	$a1			# a[2]*a[1]
779	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
780	 mov	$ai,%rax
781	mov	%rdx,$A1[1]
782
783	xor	$A0[1],$A0[1]
784	add	$A1[0],$A0[0]
785	 lea	16($j),$j
786	adc	\$0,$A0[1]
787	mul	$a0			# a[3]*a[0]
788	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
789	 mov	$ai,%rax
790	adc	%rdx,$A0[1]
791	mov	$A0[0],-8($tptr,$j)	# t[3]
792	jmp	.Lsqr4x_1st
793
794.align	16
795.Lsqr4x_1st:
796	 mov	($aptr,$j),$ai		# a[4]
797	xor	$A1[0],$A1[0]
798	mul	$a1			# a[3]*a[1]
799	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
800	 mov	$ai,%rax
801	adc	%rdx,$A1[0]
802
803	xor	$A0[0],$A0[0]
804	add	$A1[1],$A0[1]
805	adc	\$0,$A0[0]
806	mul	$a0			# a[4]*a[0]
807	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
808	 mov	$ai,%rax		# a[3]
809	adc	%rdx,$A0[0]
810	mov	$A0[1],($tptr,$j)	# t[4]
811
812
813	 mov	8($aptr,$j),$ai		# a[5]
814	xor	$A1[1],$A1[1]
815	mul	$a1			# a[4]*a[3]
816	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
817	 mov	$ai,%rax
818	adc	%rdx,$A1[1]
819
820	xor	$A0[1],$A0[1]
821	add	$A1[0],$A0[0]
822	adc	\$0,$A0[1]
823	mul	$a0			# a[5]*a[2]
824	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
825	 mov	$ai,%rax
826	adc	%rdx,$A0[1]
827	mov	$A0[0],8($tptr,$j)	# t[5]
828
829	 mov	16($aptr,$j),$ai	# a[6]
830	xor	$A1[0],$A1[0]
831	mul	$a1			# a[5]*a[3]
832	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
833	 mov	$ai,%rax
834	adc	%rdx,$A1[0]
835
836	xor	$A0[0],$A0[0]
837	add	$A1[1],$A0[1]
838	adc	\$0,$A0[0]
839	mul	$a0			# a[6]*a[2]
840	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
841	 mov	$ai,%rax		# a[3]
842	adc	%rdx,$A0[0]
843	mov	$A0[1],16($tptr,$j)	# t[6]
844
845
846	 mov	24($aptr,$j),$ai	# a[7]
847	xor	$A1[1],$A1[1]
848	mul	$a1			# a[6]*a[5]
849	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
850	 mov	$ai,%rax
851	adc	%rdx,$A1[1]
852
853	xor	$A0[1],$A0[1]
854	add	$A1[0],$A0[0]
855	 lea	32($j),$j
856	adc	\$0,$A0[1]
857	mul	$a0			# a[7]*a[4]
858	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
859	 mov	$ai,%rax
860	adc	%rdx,$A0[1]
861	mov	$A0[0],-8($tptr,$j)	# t[7]
862
863	cmp	\$0,$j
864	jne	.Lsqr4x_1st
865
866	xor	$A1[0],$A1[0]
867	add	$A0[1],$A1[1]
868	adc	\$0,$A1[0]
869	mul	$a1			# a[7]*a[5]
870	add	%rax,$A1[1]
871	adc	%rdx,$A1[0]
872
873	mov	$A1[1],($tptr)		# t[8]
874	lea	16($i),$i
875	mov	$A1[0],8($tptr)		# t[9]
876	jmp	.Lsqr4x_outer
877
878.align	16
879.Lsqr4x_outer:				# comments apply to $num==6 case
880	mov	-32($aptr,$i),$a0	# a[0]
881	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
882	mov	-24($aptr,$i),%rax	# a[1]
883	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
884	mov	-16($aptr,$i),$ai	# a[2]
885	mov	%rax,$a1
886
887	mov	-24($tptr,$i),$A0[0]	# t[1]
888	xor	$A0[1],$A0[1]
889	mul	$a0			# a[1]*a[0]
890	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
891	 mov	$ai,%rax		# a[2]
892	adc	%rdx,$A0[1]
893	mov	$A0[0],-24($tptr,$i)	# t[1]
894
895	xor	$A0[0],$A0[0]
896	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
897	adc	\$0,$A0[0]
898	mul	$a0			# a[2]*a[0]
899	add	%rax,$A0[1]
900	 mov	$ai,%rax
901	adc	%rdx,$A0[0]
902	mov	$A0[1],-16($tptr,$i)	# t[2]
903
904	lea	-16($i),$j		# j=-16
905	xor	$A1[0],$A1[0]
906
907
908	 mov	8($aptr,$j),$ai		# a[3]
909	xor	$A1[1],$A1[1]
910	add	8($tptr,$j),$A1[0]
911	adc	\$0,$A1[1]
912	mul	$a1			# a[2]*a[1]
913	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
914	 mov	$ai,%rax
915	adc	%rdx,$A1[1]
916
917	xor	$A0[1],$A0[1]
918	add	$A1[0],$A0[0]
919	adc	\$0,$A0[1]
920	mul	$a0			# a[3]*a[0]
921	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
922	 mov	$ai,%rax
923	adc	%rdx,$A0[1]
924	mov	$A0[0],8($tptr,$j)	# t[3]
925
926	lea	16($j),$j
927	jmp	.Lsqr4x_inner
928
929.align	16
930.Lsqr4x_inner:
931	 mov	($aptr,$j),$ai		# a[4]
932	xor	$A1[0],$A1[0]
933	add	($tptr,$j),$A1[1]
934	adc	\$0,$A1[0]
935	mul	$a1			# a[3]*a[1]
936	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
937	 mov	$ai,%rax
938	adc	%rdx,$A1[0]
939
940	xor	$A0[0],$A0[0]
941	add	$A1[1],$A0[1]
942	adc	\$0,$A0[0]
943	mul	$a0			# a[4]*a[0]
944	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
945	 mov	$ai,%rax		# a[3]
946	adc	%rdx,$A0[0]
947	mov	$A0[1],($tptr,$j)	# t[4]
948
949	 mov	8($aptr,$j),$ai		# a[5]
950	xor	$A1[1],$A1[1]
951	add	8($tptr,$j),$A1[0]
952	adc	\$0,$A1[1]
953	mul	$a1			# a[4]*a[3]
954	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
955	 mov	$ai,%rax
956	adc	%rdx,$A1[1]
957
958	xor	$A0[1],$A0[1]
959	add	$A1[0],$A0[0]
960	lea	16($j),$j		# j++
961	adc	\$0,$A0[1]
962	mul	$a0			# a[5]*a[2]
963	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
964	 mov	$ai,%rax
965	adc	%rdx,$A0[1]
966	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
967
968	cmp	\$0,$j
969	jne	.Lsqr4x_inner
970
971	xor	$A1[0],$A1[0]
972	add	$A0[1],$A1[1]
973	adc	\$0,$A1[0]
974	mul	$a1			# a[5]*a[3]
975	add	%rax,$A1[1]
976	adc	%rdx,$A1[0]
977
978	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
979	mov	$A1[0],8($tptr)		# t[7], "preloaded t[3]" below
980
981	add	\$16,$i
982	jnz	.Lsqr4x_outer
983
984					# comments apply to $num==4 case
985	mov	-32($aptr),$a0		# a[0]
986	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
987	mov	-24($aptr),%rax		# a[1]
988	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
989	mov	-16($aptr),$ai		# a[2]
990	mov	%rax,$a1
991
992	xor	$A0[1],$A0[1]
993	mul	$a0			# a[1]*a[0]
994	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
995	 mov	$ai,%rax		# a[2]
996	adc	%rdx,$A0[1]
997	mov	$A0[0],-24($tptr)	# t[1]
998
999	xor	$A0[0],$A0[0]
1000	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
1001	adc	\$0,$A0[0]
1002	mul	$a0			# a[2]*a[0]
1003	add	%rax,$A0[1]
1004	 mov	$ai,%rax
1005	adc	%rdx,$A0[0]
1006	mov	$A0[1],-16($tptr)	# t[2]
1007
1008	 mov	-8($aptr),$ai		# a[3]
1009	mul	$a1			# a[2]*a[1]
1010	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
1011	 mov	$ai,%rax
1012	adc	\$0,%rdx
1013
1014	xor	$A0[1],$A0[1]
1015	add	$A1[0],$A0[0]
1016	 mov	%rdx,$A1[1]
1017	adc	\$0,$A0[1]
1018	mul	$a0			# a[3]*a[0]
1019	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1020	 mov	$ai,%rax
1021	adc	%rdx,$A0[1]
1022	mov	$A0[0],-8($tptr)	# t[3]
1023
1024	xor	$A1[0],$A1[0]
1025	add	$A0[1],$A1[1]
1026	adc	\$0,$A1[0]
1027	mul	$a1			# a[3]*a[1]
1028	add	%rax,$A1[1]
1029	 mov	-16($aptr),%rax		# a[2]
1030	adc	%rdx,$A1[0]
1031
1032	mov	$A1[1],($tptr)		# t[4]
1033	mov	$A1[0],8($tptr)		# t[5]
1034
1035	mul	$ai			# a[2]*a[3]
1036___
1037{
1038my ($shift,$carry)=($a0,$a1);
1039my @S=(@A1,$ai,$n0);
1040$code.=<<___;
1041	 add	\$16,$i
1042	 xor	$shift,$shift
1043	 sub	$num,$i			# $i=16-$num
1044	 xor	$carry,$carry
1045
1046	add	$A1[0],%rax		# t[5]
1047	adc	\$0,%rdx
1048	mov	%rax,8($tptr)		# t[5]
1049	mov	%rdx,16($tptr)		# t[6]
1050	mov	$carry,24($tptr)	# t[7]
1051
1052	 mov	-16($aptr,$i),%rax	# a[0]
1053	lea	64(%rsp,$num,2),$tptr
1054	 xor	$A0[0],$A0[0]		# t[0]
1055	 mov	-24($tptr,$i,2),$A0[1]	# t[1]
1056
1057	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1058	shr	\$63,$A0[0]
1059	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1060	shr	\$63,$A0[1]
1061	or	$A0[0],$S[1]		# | t[2*i]>>63
1062	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1063	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1064	mul	%rax			# a[i]*a[i]
1065	neg	$carry			# mov $carry,cf
1066	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1067	adc	%rax,$S[0]
1068	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1069	mov	$S[0],-32($tptr,$i,2)
1070	adc	%rdx,$S[1]
1071
1072	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1073	 mov	$S[1],-24($tptr,$i,2)
1074	 sbb	$carry,$carry		# mov cf,$carry
1075	shr	\$63,$A0[0]
1076	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1077	shr	\$63,$A0[1]
1078	or	$A0[0],$S[3]		# | t[2*i]>>63
1079	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1080	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1081	mul	%rax			# a[i]*a[i]
1082	neg	$carry			# mov $carry,cf
1083	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1084	adc	%rax,$S[2]
1085	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1086	mov	$S[2],-16($tptr,$i,2)
1087	adc	%rdx,$S[3]
1088	lea	16($i),$i
1089	mov	$S[3],-40($tptr,$i,2)
1090	sbb	$carry,$carry		# mov cf,$carry
1091	jmp	.Lsqr4x_shift_n_add
1092
1093.align	16
1094.Lsqr4x_shift_n_add:
1095	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1096	shr	\$63,$A0[0]
1097	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1098	shr	\$63,$A0[1]
1099	or	$A0[0],$S[1]		# | t[2*i]>>63
1100	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1101	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1102	mul	%rax			# a[i]*a[i]
1103	neg	$carry			# mov $carry,cf
1104	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1105	adc	%rax,$S[0]
1106	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1107	mov	$S[0],-32($tptr,$i,2)
1108	adc	%rdx,$S[1]
1109
1110	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1111	 mov	$S[1],-24($tptr,$i,2)
1112	 sbb	$carry,$carry		# mov cf,$carry
1113	shr	\$63,$A0[0]
1114	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1115	shr	\$63,$A0[1]
1116	or	$A0[0],$S[3]		# | t[2*i]>>63
1117	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1118	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1119	mul	%rax			# a[i]*a[i]
1120	neg	$carry			# mov $carry,cf
1121	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1122	adc	%rax,$S[2]
1123	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1124	mov	$S[2],-16($tptr,$i,2)
1125	adc	%rdx,$S[3]
1126
1127	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1128	 mov	$S[3],-8($tptr,$i,2)
1129	 sbb	$carry,$carry		# mov cf,$carry
1130	shr	\$63,$A0[0]
1131	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1132	shr	\$63,$A0[1]
1133	or	$A0[0],$S[1]		# | t[2*i]>>63
1134	 mov	16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1135	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1136	mul	%rax			# a[i]*a[i]
1137	neg	$carry			# mov $carry,cf
1138	 mov	24($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1139	adc	%rax,$S[0]
1140	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
1141	mov	$S[0],0($tptr,$i,2)
1142	adc	%rdx,$S[1]
1143
1144	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1145	 mov	$S[1],8($tptr,$i,2)
1146	 sbb	$carry,$carry		# mov cf,$carry
1147	shr	\$63,$A0[0]
1148	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1149	shr	\$63,$A0[1]
1150	or	$A0[0],$S[3]		# | t[2*i]>>63
1151	 mov	32($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1152	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1153	mul	%rax			# a[i]*a[i]
1154	neg	$carry			# mov $carry,cf
1155	 mov	40($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1156	adc	%rax,$S[2]
1157	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
1158	mov	$S[2],16($tptr,$i,2)
1159	adc	%rdx,$S[3]
1160	mov	$S[3],24($tptr,$i,2)
1161	sbb	$carry,$carry		# mov cf,$carry
1162	add	\$32,$i
1163	jnz	.Lsqr4x_shift_n_add
1164
1165	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1166	shr	\$63,$A0[0]
1167	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1168	shr	\$63,$A0[1]
1169	or	$A0[0],$S[1]		# | t[2*i]>>63
1170	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1171	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1172	mul	%rax			# a[i]*a[i]
1173	neg	$carry			# mov $carry,cf
1174	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1175	adc	%rax,$S[0]
1176	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
1177	mov	$S[0],-32($tptr)
1178	adc	%rdx,$S[1]
1179
1180	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
1181	 mov	$S[1],-24($tptr)
1182	 sbb	$carry,$carry		# mov cf,$carry
1183	shr	\$63,$A0[0]
1184	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1185	shr	\$63,$A0[1]
1186	or	$A0[0],$S[3]		# | t[2*i]>>63
1187	mul	%rax			# a[i]*a[i]
1188	neg	$carry			# mov $carry,cf
1189	adc	%rax,$S[2]
1190	adc	%rdx,$S[3]
1191	mov	$S[2],-16($tptr)
1192	mov	$S[3],-8($tptr)
1193___
1194}
1195##############################################################
1196# Montgomery reduction part, "word-by-word" algorithm.
1197#
1198{
1199my ($topbit,$nptr)=("%rbp",$aptr);
1200my ($m0,$m1)=($a0,$a1);
1201my @Ni=("%rbx","%r9");
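# Word-by-word Montgomery reduction of the 2*num-limb square t[] modulo
# n[]. A reference C sketch, illustrative only (helper name is made up,
# same u64/u128 assumptions as above); it leaves the result in
# t[num..2*num-1] plus a possible top bit, which is the state the
# .Lsqr4x_sub tail below consumes:
#
#	static u64 sqr_mont_reduce_ref(u64 *t, const u64 *n, u64 n0, int num)
#	{
#		u64 topbit = 0;
#		for (int i = 0; i < num; i++) {
#			u64 m = t[i]*n0;		/* m0=t[0]*n0	*/
#			u64 c = 0;
#			for (int j = 0; j < num; j++) {
#				u128 p = (u128)m*n[j] + t[i+j] + c;
#				t[i+j] = (u64)p;	/* t[i] -> 0	*/
#				c      = (u64)(p>>64);
#			}
#			u128 s = (u128)t[i+num] + c + topbit;
#			t[i+num] = (u64)s;
#			topbit   = (u64)(s>>64);
#		}
#		return topbit;		/* consumed by the tail below	*/
#	}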
1202$code.=<<___;
1203	mov	40(%rsp),$nptr		# restore $nptr
1204	mov	48(%rsp),$n0		# restore *n0
1205	xor	$j,$j
1206	mov	$num,0(%rsp)		# save $num
1207	sub	$num,$j			# $j=-$num
1208	 mov	64(%rsp),$A0[0]		# t[0]		# modsched #
1209	 mov	$n0,$m0			#		# modsched #
1210	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
1211	lea	64(%rsp,$num),$tptr	# end of t[] window
1212	mov	%rax,8(%rsp)		# save end of t[] buffer
1213	lea	($nptr,$num),$nptr	# end of n[] buffer
1214	xor	$topbit,$topbit		# $topbit=0
1215
1216	mov	0($nptr,$j),%rax	# n[0]		# modsched #
1217	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
1218	 imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
1219	 mov	%rax,$Ni[0]		#		# modsched #
1220	jmp	.Lsqr4x_mont_outer
1221
1222.align	16
1223.Lsqr4x_mont_outer:
1224	xor	$A0[1],$A0[1]
1225	mul	$m0			# n[0]*m0
1226	add	%rax,$A0[0]		# n[0]*m0+t[0]
1227	 mov	$Ni[1],%rax
1228	adc	%rdx,$A0[1]
1229	mov	$n0,$m1
1230
1231	xor	$A0[0],$A0[0]
1232	add	8($tptr,$j),$A0[1]
1233	adc	\$0,$A0[0]
1234	mul	$m0			# n[1]*m0
1235	add	%rax,$A0[1]		# n[1]*m0+t[1]
1236	 mov	$Ni[0],%rax
1237	adc	%rdx,$A0[0]
1238
1239	imulq	$A0[1],$m1
1240
1241	mov	16($nptr,$j),$Ni[0]	# n[2]
1242	xor	$A1[1],$A1[1]
1243	add	$A0[1],$A1[0]
1244	adc	\$0,$A1[1]
1245	mul	$m1			# n[0]*m1
1246	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
1247	 mov	$Ni[0],%rax
1248	adc	%rdx,$A1[1]
1249	mov	$A1[0],8($tptr,$j)	# "t[1]"
1250
1251	xor	$A0[1],$A0[1]
1252	add	16($tptr,$j),$A0[0]
1253	adc	\$0,$A0[1]
1254	mul	$m0			# n[2]*m0
1255	add	%rax,$A0[0]		# n[2]*m0+t[2]
1256	 mov	$Ni[1],%rax
1257	adc	%rdx,$A0[1]
1258
1259	mov	24($nptr,$j),$Ni[1]	# n[3]
1260	xor	$A1[0],$A1[0]
1261	add	$A0[0],$A1[1]
1262	adc	\$0,$A1[0]
1263	mul	$m1			# n[1]*m1
1264	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
1265	 mov	$Ni[1],%rax
1266	adc	%rdx,$A1[0]
1267	mov	$A1[1],16($tptr,$j)	# "t[2]"
1268
1269	xor	$A0[0],$A0[0]
1270	add	24($tptr,$j),$A0[1]
1271	lea	32($j),$j
1272	adc	\$0,$A0[0]
1273	mul	$m0			# n[3]*m0
1274	add	%rax,$A0[1]		# n[3]*m0+t[3]
1275	 mov	$Ni[0],%rax
1276	adc	%rdx,$A0[0]
1277	jmp	.Lsqr4x_mont_inner
1278
1279.align	16
1280.Lsqr4x_mont_inner:
1281	mov	($nptr,$j),$Ni[0]	# n[4]
1282	xor	$A1[1],$A1[1]
1283	add	$A0[1],$A1[0]
1284	adc	\$0,$A1[1]
1285	mul	$m1			# n[2]*m1
1286	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
1287	 mov	$Ni[0],%rax
1288	adc	%rdx,$A1[1]
1289	mov	$A1[0],-8($tptr,$j)	# "t[3]"
1290
1291	xor	$A0[1],$A0[1]
1292	add	($tptr,$j),$A0[0]
1293	adc	\$0,$A0[1]
1294	mul	$m0			# n[4]*m0
1295	add	%rax,$A0[0]		# n[4]*m0+t[4]
1296	 mov	$Ni[1],%rax
1297	adc	%rdx,$A0[1]
1298
1299	mov	8($nptr,$j),$Ni[1]	# n[5]
1300	xor	$A1[0],$A1[0]
1301	add	$A0[0],$A1[1]
1302	adc	\$0,$A1[0]
1303	mul	$m1			# n[3]*m1
1304	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
1305	 mov	$Ni[1],%rax
1306	adc	%rdx,$A1[0]
1307	mov	$A1[1],($tptr,$j)	# "t[4]"
1308
1309	xor	$A0[0],$A0[0]
1310	add	8($tptr,$j),$A0[1]
1311	adc	\$0,$A0[0]
1312	mul	$m0			# n[5]*m0
1313	add	%rax,$A0[1]		# n[5]*m0+t[5]
1314	 mov	$Ni[0],%rax
1315	adc	%rdx,$A0[0]
1316
1317
1318	mov	16($nptr,$j),$Ni[0]	# n[6]
1319	xor	$A1[1],$A1[1]
1320	add	$A0[1],$A1[0]
1321	adc	\$0,$A1[1]
1322	mul	$m1			# n[4]*m1
1323	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
1324	 mov	$Ni[0],%rax
1325	adc	%rdx,$A1[1]
1326	mov	$A1[0],8($tptr,$j)	# "t[5]"
1327
1328	xor	$A0[1],$A0[1]
1329	add	16($tptr,$j),$A0[0]
1330	adc	\$0,$A0[1]
1331	mul	$m0			# n[6]*m0
1332	add	%rax,$A0[0]		# n[6]*m0+t[6]
1333	 mov	$Ni[1],%rax
1334	adc	%rdx,$A0[1]
1335
1336	mov	24($nptr,$j),$Ni[1]	# n[7]
1337	xor	$A1[0],$A1[0]
1338	add	$A0[0],$A1[1]
1339	adc	\$0,$A1[0]
1340	mul	$m1			# n[5]*m1
1341	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
1342	 mov	$Ni[1],%rax
1343	adc	%rdx,$A1[0]
1344	mov	$A1[1],16($tptr,$j)	# "t[6]"
1345
1346	xor	$A0[0],$A0[0]
1347	add	24($tptr,$j),$A0[1]
1348	lea	32($j),$j
1349	adc	\$0,$A0[0]
1350	mul	$m0			# n[7]*m0
1351	add	%rax,$A0[1]		# n[7]*m0+t[7]
1352	 mov	$Ni[0],%rax
1353	adc	%rdx,$A0[0]
1354	cmp	\$0,$j
1355	jne	.Lsqr4x_mont_inner
1356
1357	 sub	0(%rsp),$j		# $j=-$num	# modsched #
1358	 mov	$n0,$m0			#		# modsched #
1359
1360	xor	$A1[1],$A1[1]
1361	add	$A0[1],$A1[0]
1362	adc	\$0,$A1[1]
1363	mul	$m1			# n[6]*m1
1364	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
1365	mov	$Ni[1],%rax
1366	adc	%rdx,$A1[1]
1367	mov	$A1[0],-8($tptr)	# "t[7]"
1368
1369	xor	$A0[1],$A0[1]
1370	add	($tptr),$A0[0]		# +t[8]
1371	adc	\$0,$A0[1]
1372	 mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
1373	add	$topbit,$A0[0]
1374	adc	\$0,$A0[1]
1375
1376	 imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
1377	xor	$A1[0],$A1[0]
1378	 mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
1379	add	$A0[0],$A1[1]
1380	 mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
1381	adc	\$0,$A1[0]
1382	mul	$m1			# n[7]*m1
1383	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
1384	 mov	$Ni[0],%rax		#		# modsched #
1385	adc	%rdx,$A1[0]
1386	mov	$A1[1],($tptr)		# "t[8]"
1387
1388	xor	$topbit,$topbit
1389	add	8($tptr),$A1[0]		# +t[9]
1390	adc	$topbit,$topbit
1391	add	$A0[1],$A1[0]
1392	lea	16($tptr),$tptr		# "t[$num]>>128"
1393	adc	\$0,$topbit
1394	mov	$A1[0],-8($tptr)	# "t[9]"
1395	cmp	8(%rsp),$tptr		# are we done?
1396	jb	.Lsqr4x_mont_outer
1397
1398	mov	0(%rsp),$num		# restore $num
1399	mov	$topbit,($tptr)		# save $topbit
1400___
1401}
1402##############################################################
1403# Post-condition, 4x unrolled copy from bn_mul_mont
1404#
1405{
1406my ($tptr,$nptr)=("%rbx",$aptr);
1407my @ri=("%rax","%rdx","%r10","%r11");
1408$code.=<<___;
1409	mov	64(%rsp,$num),@ri[0]	# tp[0]
1410	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
1411	mov	40(%rsp),$nptr		# restore $nptr
1412	shr	\$5,$num		# num/4
1413	mov	8($tptr),@ri[1]		# t[1]
1414	xor	$i,$i			# i=0 and clear CF!
1415
1416	mov	32(%rsp),$rptr		# restore $rptr
1417	sub	0($nptr),@ri[0]
1418	mov	16($tptr),@ri[2]	# t[2]
1419	mov	24($tptr),@ri[3]	# t[3]
1420	sbb	8($nptr),@ri[1]
1421	lea	-1($num),$j		# j=num/4-1
1422	jmp	.Lsqr4x_sub
1423.align	16
1424.Lsqr4x_sub:
1425	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1426	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1427	sbb	16($nptr,$i,8),@ri[2]
1428	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
1429	mov	40($tptr,$i,8),@ri[1]
1430	sbb	24($nptr,$i,8),@ri[3]
1431	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1432	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1433	sbb	32($nptr,$i,8),@ri[0]
1434	mov	48($tptr,$i,8),@ri[2]
1435	mov	56($tptr,$i,8),@ri[3]
1436	sbb	40($nptr,$i,8),@ri[1]
1437	lea	4($i),$i		# i++
1438	dec	$j			# doesn't affect CF!
1439	jnz	.Lsqr4x_sub
1440
1441	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1442	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
1443	sbb	16($nptr,$i,8),@ri[2]
1444	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1445	sbb	24($nptr,$i,8),@ri[3]
1446	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1447
1448	sbb	\$0,@ri[0]		# handle upmost overflow bit
1449	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1450	xor	$i,$i			# i=0
1451	and	@ri[0],$tptr
1452	not	@ri[0]
1453	mov	$rptr,$nptr
1454	and	@ri[0],$nptr
1455	lea	-1($num),$j
1456	or	$nptr,$tptr		# tp=borrow?tp:rp
1457
1458	pxor	%xmm0,%xmm0
1459	lea	64(%rsp,$num,8),$nptr
1460	movdqu	($tptr),%xmm1
1461	lea	($nptr,$num,8),$nptr
1462	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
1463	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
1464	movdqu	%xmm1,($rptr)
1465	jmp	.Lsqr4x_copy
1466.align	16
1467.Lsqr4x_copy:				# copy or in-place refresh
1468	movdqu	16($tptr,$i),%xmm2
1469	movdqu	32($tptr,$i),%xmm1
1470	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
1471	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
1472	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
1473	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
1474	movdqu	%xmm2,16($rptr,$i)
1475	movdqu	%xmm1,32($rptr,$i)
1476	lea	32($i),$i
1477	dec	$j
1478	jnz	.Lsqr4x_copy
1479
1480	movdqu	16($tptr,$i),%xmm2
1481	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
1482	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
1483	movdqu	%xmm2,16($rptr,$i)
1484___
1485}
1486$code.=<<___;
1487	mov	56(%rsp),%rsi		# restore %rsp
1488	mov	\$1,%rax
1489	mov	0(%rsi),%r15
1490	mov	8(%rsi),%r14
1491	mov	16(%rsi),%r13
1492	mov	24(%rsi),%r12
1493	mov	32(%rsi),%rbp
1494	mov	40(%rsi),%rbx
1495	lea	48(%rsi),%rsp
1496.Lsqr4x_epilogue:
1497	ret
1498.size	bn_sqr4x_mont,.-bn_sqr4x_mont
1499___
1500}}}
1501
1502print $code;
1503close STDOUT;
1504