# s390x-mont.pl revision 337982
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# April 2007.
#
# Performance improvement over vanilla C code varies from 85% to 45%
# depending on key length and benchmark. Unfortunately in this context
# these are not very impressive results [for code that utilizes "wide"
# 64x64=128-bit multiplication, which is not commonly available to C
# programmers]; at least a hand-coded bn_asm.c replacement is known to
# provide 30-40% better results for the longest keys. Well, on second
# thought it's not very surprising, because z-CPUs are single-issue
# and _strictly_ in-order execution, while bn_mul_mont is more or less
# dependent on the CPU's ability to pipeline instructions and have
# several of them "in-flight" at the same time. I mean while other
# methods, for example Karatsuba, aim to minimize the number of
# multiplications at the cost of an increase in other operations,
# bn_mul_mont aims to neatly "overlap" multiplications and the other
# operations [and on most platforms even minimize the amount of the
# other operations, in particular references to memory]. But it's
# possible to improve this module's performance by implementing a
# dedicated squaring code-path and possibly by unrolling loops...

# January 2009.
#
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". The latter implies that the code
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
# is achieved by swapping words after 64-bit loads, follow _dswap-s.
# On z990 it was measured to perform 2.6-2.2 times better than
# compiler-generated code, less for longer keys...

# Build flavour, taken from the first command-line argument.  A flavour
# containing "31" or "32" selects the 4-byte configuration used for the
# -m31 build; anything else (including no argument at all) selects the
# 8-byte configuration.
$flavour = shift;

# $SIZE_T is the size in bytes of one pointer-sized slot; it is used to
# compute stack-frame offsets.  $g is the suffix appended to the
# pointer-sized load/store/compare mnemonics of the template, so that
# "st${g}" becomes "st" or "stg".
if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Scan the remaining arguments for the output file name: the first one
# that looks like a bare "name.ext" (word characters and dashes, a single
# dot, then a word-character extension).  Arguments that do not match the
# pattern are skipped.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the output file.  When no file name was supplied the
# generated code goes to the inherited STDOUT.  A file that cannot be
# created is a hard error, so that the build does not continue without
# the assembly source it asked for.
if (defined($output) && length($output)) {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Size in bytes of the fixed part of the stack frame: sixteen
# pointer-sized slots plus four 8-byte slots.
$stdframe=16*$SIZE_T+4*8;

# Registers that do not carry an argument.
$mn0="%r0";	# tp[0]*n0, called "m1" in the template comments
$num="%r1";	# operand length, pulled from the caller's frame

# int bn_mul_mont(
$rp="%r2";		# BN_ULONG *rp,
$ap="%r3";		# const BN_ULONG *ap,
$bp="%r4";		# const BN_ULONG *bp,
$np="%r5";		# const BN_ULONG *np,
$n0="%r6";		# const BN_ULONG *n0,
#$num="160(%r15)"	# int num);

$bi="%r2";	# zaps rp
$j="%r7";	# index register, advanced by 8 per element in the loops

$ahi="%r8";	# register pair receiving the ap[j]*bp[i] product
$alo="%r9";
$nhi="%r10";	# register pair receiving the np[j]*m1 product
$nlo="%r11";
$AHI="%r12";	# running high word of the ap[] accumulation
$NHI="%r13";	# running high word of the np[] accumulation
$count="%r14";	# loop counter decremented by brct
$sp="%r15";	# stack pointer

# Assembly template for bn_mul_mont(), accumulated in $code.  The text is
# post-processed line by line before it is printed: expressions between
# `backticks` are evaluated by Perl, and _dswap is a pseudo-instruction
# that is expanded at that point.
#
# Entry sequence: pull num, convert it to a byte count and return 0 for
# operands shorter than 16 bytes.
$code.=<<___;
.text
.globl	bn_mul_mont
.type	bn_mul_mont,\@function
bn_mul_mont:
	lgf	$num,`$stdframe+$SIZE_T-4`($sp)	# pull $num
	sla	$num,`log($SIZE_T)/log(2)`	# $num to enumerate bytes
	la	$bp,0($num,$bp)

	st${g}	%r2,2*$SIZE_T($sp)

	cghi	$num,16		#
	lghi	%r2,0		#
	blr	%r14		# if($num<16) return 0;
___
# 31/32-bit flavours only: return 0 when the byte count has bit value 4
# set, i.e. for an odd number of 4-byte words.
$code.=<<___ if ($flavour =~ /3[12]/);
	tmll	$num,4
	bnzr	%r14		# if ($num&1) return 0;
___
# All other flavours: return 0 for byte counts above 96.
$code.=<<___ if ($flavour !~ /3[12]/);
	cghi	$num,96		#
	bhr	%r14		# if($num>96) return 0;
___
# Main body: save registers and allocate the temporary vector tp on the
# stack, run the first pass (.L1st) and the remaining passes
# (.Louter/.Linner), then subtract np (.Lsub) and select the result into
# rp with masks while overwriting tp (.Lcopy).
$code.=<<___;
	stm${g}	%r3,%r15,3*$SIZE_T($sp)

	lghi	$rp,-$stdframe-8	# leave room for carry bit
	lcgr	$j,$num		# -$num
	lgr	%r0,$sp
	la	$rp,0($rp,$sp)
	la	$sp,0($j,$rp)	# alloca
	st${g}	%r0,0($sp)	# back chain

	sra	$num,3		# restore $num
	la	$bp,0($j,$bp)	# restore $bp
	ahi	$num,-1		# adjust $num for inner loop
	lg	$n0,0($n0)	# pull n0
	_dswap	$n0

	lg	$bi,0($bp)
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[0]
	lgr	$AHI,$ahi

	lgr	$mn0,$alo	# "tp[0]"*n0
	msgr	$mn0,$n0

	lg	$nlo,0($np)	#
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8(%r0)	# j=1
	lr	$count,$num

.align	16
.L1st:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[0]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI	# +="tp[j]"
	algr	$nlo,$alo
	alcgr	$NHI,$nhi

	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.L1st

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI	# upmost overflow bit
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)
	la	$bp,8($bp)	# bp++

.Louter:
	lg	$bi,0($bp)	# bp[i]
	_dswap	$bi
	lg	$alo,0($ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[0]*bp[i]
	alg	$alo,$stdframe($sp)	# +=tp[0]
	lghi	$AHI,0
	alcgr	$AHI,$ahi

	lgr	$mn0,$alo
	msgr	$mn0,$n0	# tp[0]*n0

	lg	$nlo,0($np)	# np[0]
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[0]*m1
	algr	$nlo,$alo	# +="tp[0]"
	lghi	$NHI,0
	alcgr	$NHI,$nhi

	la	$j,8(%r0)	# j=1
	lr	$count,$num

.align	16
.Linner:
	lg	$alo,0($j,$ap)
	_dswap	$alo
	mlgr	$ahi,$bi	# ap[j]*bp[i]
	algr	$alo,$AHI
	lghi	$AHI,0
	alcgr	$ahi,$AHI
	alg	$alo,$stdframe($j,$sp)# +=tp[j]
	alcgr	$AHI,$ahi

	lg	$nlo,0($j,$np)
	_dswap	$nlo
	mlgr	$nhi,$mn0	# np[j]*m1
	algr	$nlo,$NHI
	lghi	$NHI,0
	alcgr	$nhi,$NHI
	algr	$nlo,$alo	# +="tp[j]"
	alcgr	$NHI,$nhi

	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
	la	$j,8($j)	# j++
	brct	$count,.Linner

	algr	$NHI,$AHI
	lghi	$AHI,0
	alcgr	$AHI,$AHI
	alg	$NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
	lghi	$ahi,0
	alcgr	$AHI,$ahi	# new upmost overflow bit
	stg	$NHI,$stdframe-8($j,$sp)
	stg	$AHI,$stdframe($j,$sp)

	la	$bp,8($bp)	# bp++
	cl${g}	$bp,`$stdframe+8+4*$SIZE_T`($j,$sp)	# compare to &bp[num]
	jne	.Louter

	l${g}	$rp,`$stdframe+8+2*$SIZE_T`($j,$sp)	# reincarnate rp
	la	$ap,$stdframe($sp)
	ahi	$num,1		# restore $num, incidentally clears "borrow"

	la	$j,0(%r0)
	lr	$count,$num
.Lsub:	lg	$alo,0($j,$ap)
	lg	$nlo,0($j,$np)
	_dswap	$nlo
	slbgr	$alo,$nlo
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lsub
	lghi	$ahi,0
	slbgr	$AHI,$ahi	# handle upmost carry
	lghi	$NHI,-1
	xgr	$NHI,$AHI

	la	$j,0(%r0)
	lgr	$count,$num
.Lcopy:	lg	$ahi,$stdframe($j,$sp)	# conditional copy
	lg	$alo,0($j,$rp)
	ngr	$ahi,$AHI
	ngr	$alo,$NHI
	ogr	$alo,$ahi
	_dswap	$alo
	stg	$j,$stdframe($j,$sp)	# zap tp
	stg	$alo,0($j,$rp)
	la	$j,8($j)
	brct	$count,.Lcopy

	la	%r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
	lm${g}	%r6,%r15,0(%r1)
	lghi	%r2,1		# signal "processed"
	br	%r14
.size	bn_mul_mont,.-bn_mul_mont
.string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Expand one line of the assembly template and return the result.
#
#   - every `expression` between backticks is replaced by the value Perl
#     computes for it;
#   - the first "_dswap %rN" on the line becomes "rllg %rN,%rN,32" when
#     $SIZE_T is 4, and is removed otherwise (whatever preceded it on the
#     line, such as the leading tab, is kept).
sub expand_asm_line {
	my ($line) = @_;
	$line =~ s/\`([^\`]*)\`/eval $1/ge;
	$line =~ s/_dswap\s+(%r[0-9]+)/$SIZE_T==4 ? sprintf("rllg\t%s,%s,32",$1,$1) : ""/e;
	return $line;
}

# Write the expanded template to STDOUT, one line at a time.
foreach my $line (split("\n",$code)) {
	print(expand_asm_line($line),"\n") or die "error writing output: $!";
}

# Buffered write errors may only be reported when the handle is closed;
# an incomplete assembly file must stop the build instead of going
# unnoticed.
close STDOUT or die "error closing STDOUT: $!";
