x86_64-mont5.pl revision 296317
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to the powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scattering/gathering can be tuned without
# modifying bn_exp.c.

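# The core idea, sketched in illustrative C-like pseudocode (not part of
# the generated code; constant_time_eq is a hypothetical helper that
# returns an all-ones word on equality and zero otherwise):
#
#	BN_ULONG gather(const BN_ULONG *tbl, int idx)
#	{
#		BN_ULONG acc = 0;
#		for (int k = 0; k < 32; k++)
#			acc |= tbl[k] & constant_time_eq(k, idx);
#		return acc;
#	}
#
# Every table element is touched on every call, so the memory access
# pattern is independent of the secret index.
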
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such a manner that b[0] is $bp[idx],
				# b[1] is $bp[2^5+idx], etc.
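# In other words, limb j of the power selected by idx lives at
# $bp[(j<<5)+idx], i.e. consecutive limbs of one power sit 2^5 words
# apart, so fetching one limb means selecting among all 32 candidates.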
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	mov	${num}d,${num}d
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	lea	.Linc(%rip),%r10
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

.Lmul_alloca:
	mov	%rsp,%rax
	lea	2($num),%r11
	neg	%r11
	lea	-264(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2)+256+8)
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	lea	128($bp),%r12		# reassign $bp (+size optimization)
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
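		# With a 5-bit window the table interlaces 2^5 = 32
		# powers, so consecutive limbs of any one power sit
		# $STRIDE = 32*8 = 256 bytes apart, and every gather pass
		# sweeps the whole stride, touching each cache line of the
		# table regardless of which power is selected.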
$code.=<<___;
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
	lea	24-112(%rsp,$num,8),%r10	# place the mask after tp[num+3] (+ICache optimization)
	and	\$-16,%r10

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
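# In effect (an illustrative sketch, not literal code): %xmm5 holds the
# index in all four lanes, counter vectors step through 0..31 in pairs,
# and each pcmpeqd leaves all-ones in the lanes where counter==index:
#
#	for (k = 0; k < 32; k++)
#		mask[k] = (k == index) ? ~0 : 0;	# spilled to stack
#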
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	.byte	0x67
	movdqa	%xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;				# last iteration can be optimized
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($k+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($k+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register

	pand	`16*($k+1)-128`($bp),%xmm1
	pand	`16*($k+2)-128`($bp),%xmm2
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	pand	`16*($k+3)-128`($bp),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm4
	movdqa	`16*($k+1)-128`($bp),%xmm5
	movdqa	`16*($k+2)-128`($bp),%xmm2
	pand	`16*($k+0)+112`(%r10),%xmm4
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+1)+112`(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	`16*($k+2)+112`(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	`16*($k+3)+112`(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
}
$code.=<<___;
	por	%xmm1,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	lea	$STRIDE($bp),$bp
	movq	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
	and	\$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($k=0;$k<$STRIDE/16;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm0
	movdqa	`16*($k+1)-128`($bp),%xmm1
	movdqa	`16*($k+2)-128`($bp),%xmm2
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+0)-128`(%rdx),%xmm0
	pand	`16*($k+1)-128`(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($k+2)-128`(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($k+3)-128`(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	lea	$STRIDE($bp),$bp
	movq	%xmm0,$m0		# m0=bp[i]

	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax

	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
	mov	${num}d,${num}d
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	lea	.Linc(%rip),%r10
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

.Lmul4x_alloca:
	mov	%rsp,%rax
	lea	4($num),%r11
	neg	%r11
	lea	-256(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4)+256)
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	lea	128(%rdx),%r12		# reassign $bp (+size optimization)
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
	lea	32-112(%rsp,$num,8),%r10	# place the mask after tp[num+4] (+ICache optimization)

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	.byte	0x67,0x67
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	.byte	0x67
	movdqa	%xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;				# last iteration can be optimized
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($k+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($k+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register

	pand	`16*($k+1)-128`($bp),%xmm1
	pand	`16*($k+2)-128`($bp),%xmm2
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	pand	`16*($k+3)-128`($bp),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm4
	movdqa	`16*($k+1)-128`($bp),%xmm5
	movdqa	`16*($k+2)-128`($bp),%xmm2
	pand	`16*($k+0)+112`(%r10),%xmm4
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+1)+112`(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	`16*($k+2)+112`(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	`16*($k+3)+112`(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
}
$code.=<<___;
	por	%xmm1,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	lea	$STRIDE($bp),$bp
	movq	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	lea	32+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($k=0;$k<$STRIDE/16;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm0
	movdqa	`16*($k+1)-128`($bp),%xmm1
	movdqa	`16*($k+2)-128`($bp),%xmm2
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+0)-128`(%rdx),%xmm0
	pand	`16*($k+1)-128`(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($k+2)-128`(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($k+3)-128`(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	lea	$STRIDE($bp),$bp
	movq	%xmm0,$m0		# m0=bp[i]

	xor	$j,$j			# j=0

	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-40(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jl	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$num		# num/=4
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i+=4
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:					# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	shl	\$2,$num
	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax

	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
___
}}}

{
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9d") : # Win64 order
				("%rdi","%rsi","%rdx","%ecx"); # Unix order
my $out=$inp;
my $STRIDE=2**5*8;
my $N=$STRIDE/4;

$code.=<<___;
.globl	bn_scatter5
.type	bn_scatter5,\@abi-omnipotent
.align	16
bn_scatter5:
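	# In effect (a hypothetical C sketch, not part of the original):
	#	for (i = 0; i < num; i++) tbl[idx + 32*i] = inp[i];
	# so limb i of the stored power lands 32 words past limb i-1.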
	cmp	\$0, $num
	jz	.Lscatter_epilogue
	lea	($tbl,$idx,8),$tbl
.Lscatter:
	mov	($inp),%rax
	lea	8($inp),$inp
	mov	%rax,($tbl)
	lea	32*8($tbl),$tbl
	sub	\$1,$num
	jnz	.Lscatter
.Lscatter_epilogue:
	ret
.size	bn_scatter5,.-bn_scatter5

.globl	bn_gather5
.type	bn_gather5,\@abi-omnipotent
.align	16
bn_gather5:
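	# Constant-time counterpart of bn_scatter5; in the same
	# hypothetical C sketch:
	#	for (i = 0; i < num; i++) out[i] = tbl[idx + 32*i];
	# except every table word is read and masked, so the access
	# pattern does not depend on idx.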
.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
	# I can't trust assembler to use specific encoding:-(
	.byte	0x4c,0x8d,0x14,0x24			# lea    (%rsp),%r10
	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# sub	$0x108,%rsp
	lea	.Linc(%rip),%rax
	and	\$-16,%rsp		# shouldn't be formally required

	movd	$idx,%xmm5
	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
	lea	128($tbl),%r11		# size optimization
	lea	128(%rsp),%rax		# size optimization

	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to $idx and save result to stack
#
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
___
$code.=<<___	if ($i);
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
___
$code.=<<___;
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($i+0)-128`(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($i+1)-128`(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($i+2)-128`(%rax)
	movdqa	%xmm4,%xmm2
___
}
$code.=<<___;
	movdqa	%xmm3,`16*($i-1)-128`(%rax)
	jmp	.Lgather

.align	32
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
	movdqa	`16*($i+0)-128`(%r11),%xmm0
	movdqa	`16*($i+1)-128`(%r11),%xmm1
	movdqa	`16*($i+2)-128`(%r11),%xmm2
	pand	`16*($i+0)-128`(%rax),%xmm0
	movdqa	`16*($i+3)-128`(%r11),%xmm3
	pand	`16*($i+1)-128`(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($i+2)-128`(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($i+3)-128`(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	lea	$STRIDE(%r11),%r11
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	movq	%xmm0,($out)		# m0=bp[0]
	lea	8($out),$out
	sub	\$1,$num
	jnz	.Lgather

	lea	(%r10),%rsp
	ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
___
}
$code.=<<___;
.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	lea	48(%rax),%rax

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# end of alloca label
	cmp	%r10,%rbx		# context->Rip<end of alloca label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	mul_handler,.-mul_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont_gather5
	.rva	.LSEH_end_bn_mul_mont_gather5
	.rva	.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_gather5:
	.byte	0x01,0x0b,0x03,0x0a
	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp), set_frame r10
.align	8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT;