mips-mont.pl revision 337982
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# This module doesn't present direct interest for OpenSSL, because it
11# doesn't provide better performance for longer keys, at least not on
12# in-order-execution cores. While 512-bit RSA sign operations can be
13# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
14# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
15# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
16# verify:-( All comparisons are against bn_mul_mont-free assembler.
17# The module might be of interest to embedded system developers, as
18# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
19# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
20# code.
21
22######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
27#
# NUBI register names, each bound to the "$N" operand text that gets
# interpolated into the generated assembly. All 32 operand strings are
# built once and the named groups are taken as slices of that table.
my @gpr = map { "\$$_" } (0..31);
($zero,$at,$t0,$t1,$t2) = @gpr[0..2,24,25];
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7) = @gpr[4..11];
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11) = @gpr[12..23];
($gp,$tp,$sp,$fp,$ra) = @gpr[3,28..31];
32#
# The return value is placed in $a0. The following coding rules facilitate
# interoperability:
35#
36# - never ever touch $tp, "thread pointer", former $gp;
37# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
38#   old code];
39# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
40#
41# For reference here is register layout for N32/64 MIPS ABIs:
42#
43# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
44# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
45# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
46# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
47# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
48#
# The first command-line argument selects the ABI flavour and defaults to
# o32. Supported flavours are o32, n32, 64, nubi32 and nubi64.
$flavour = shift || "o32";

# Mnemonics for pointer arithmetic and for saving/restoring registers,
# plus the size in bytes of one register save slot. The 64-bit forms
# incidentally work even on n32.
($PTR_ADD,$PTR_SUB,$REG_S,$REG_L,$SZREG) = ($flavour =~ /64|n32/i)
	? ("dadd","dsub","sd","ld",8)
	: ("add", "sub", "sw","lw",4);

# Register bits advertised in the .mask directive: bits 16-23 by default,
# widened to bits 12-23 for the nubi flavours.
$SAVED_REGS_MASK = 0x00ff0000;
$SAVED_REGS_MASK = 0x00fff000 if ($flavour =~ /nubi/i);
65#
66# <appro@openssl.org>
67#
68######################################################################
69
# Consume the remaining arguments until one looks like a plain "name.ext"
# file name (or the arguments run out); that argument names the output
# file and everything before it is discarded.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect the generated assembly to that file. When no file name was
# given, STDOUT is left alone so the assembly goes to the caller's
# standard output. The three-argument open keeps the name from being
# parsed as a mode, and a failure to create the file is fatal instead of
# being silently ignored.
if ($output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}
72
# Mnemonics used for bignum word operations (load, store, unsigned
# multiply, add, subtract) and the size in bytes of one such word, as
# used to step through the ap/bp/np/tp vectors.
($LD,$ST,$MULTU,$ADDU,$SUBU,$BNSZ) = ($flavour =~ /64|n32/i)
	? ("ld","sd","dmultu","daddu","dsubu",8)
	: ("lw","sw","multu", "addu", "subu", 4);
88
# Register assignment for
#
#	int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap,
#			const BN_ULONG *bp, const BN_ULONG *np,
#			const BN_ULONG *n0, int num);
#
# The six arguments are bound to $a0-$a5 in declaration order (for o32
# the generated code loads the last two from the stack).
($rp,$ap,$bp,$np,$n0,$num) = ($a0,$a1,$a2,$a3,$a4,$a5);

# Low/high halves of the two running products.
($lo0,$hi0) = ($a6,$a7);
($lo1,$hi1) = ($t1,$t2);

# Working registers. Note that $tp is re-bound here: from this point on
# it names $s3 (the temporary-vector cursor), not the thread pointer.
($aj,$bi,$nj,$tp)      = ($s0,$s1,$s2,$s3);
($alo,$ahi,$nlo,$nhi)  = ($s4,$s5,$s6,$s7);
($tj,$i,$j,$m1)        = ($s8,$s9,$s10,$s11);

# Size of the register save area, counted in $SZREG-sized slots.
$FRAMESIZE = 14;
115
# Start the generated assembly: section and assembler directives followed
# by the public entry point bn_mul_mont. ".set noreorder" is in effect
# from here on, so the code that follows spells out its own branch delay
# slots.
$code=<<___;
.text

.set	noat
.set	noreorder

.align	5
.globl	bn_mul_mont
.ent	bn_mul_mont
bn_mul_mont:
___
# o32 flavours only: fetch n0 and num from the caller's stack (offsets
# 16 and 20 from $sp) into the registers assigned to them.
$code.=<<___ if ($flavour =~ /o32/i);
	lw	$n0,16($sp)
	lw	$num,20($sp)
___
# Size dispatch and prologue.
#
# bn_mul_mont returns 0 (in both $a0 and $t0) when num < 4 or num >= 17
# and otherwise branches to bn_mul_mont_internal; the upper bound is
# annotated "on in-order CPU" in the assembly.
# NOTE(review): a zero return presumably tells the caller to fall back to
# another implementation -- confirm against the callers.
#
# bn_mul_mont_internal then allocates $FRAMESIZE*$SZREG bytes of stack and
# saves $fp and $s11..$s4 from the top slot downwards. Expressions such as
# "($FRAMESIZE-1)*$SZREG" are only interpolated by Perl, not computed; the
# resulting arithmetic text is left for the assembler to evaluate.
# 0x40000000 in .mask is the bit for register 30 ($fp).
$code.=<<___;
	slt	$at,$num,4
	bnez	$at,1f
	li	$t0,0
	slt	$at,$num,17	# on in-order CPU
	bnez	$at,bn_mul_mont_internal
	nop
1:	jr	$ra
	li	$a0,0
.end	bn_mul_mont

.align	5
.ent	bn_mul_mont_internal
bn_mul_mont_internal:
	.frame	$fp,$FRAMESIZE*$SZREG,$ra
	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
	$PTR_SUB $sp,$FRAMESIZE*$SZREG
	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
# nubi flavours additionally save $s3..$s0 (registers 12-15), matching the
# wider $SAVED_REGS_MASK selected for them.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
# Main body of bn_mul_mont_internal. In order, the emitted code:
#
# - copies $sp to $fp so the frame can be recovered at the end, replaces
#   the n0 pointer by the word it points to, and loads bp[0], ap[0] and
#   np[0];
# - carves the temporary vector tp out of the stack: two extra words plus
#   num words, with num first scaled from words to bytes (the backquoted
#   log() expression is evaluated by Perl when the code is printed), and
#   $sp then masked with -4096;
# - .L1st: first pass using bp[0]. m1 is the low word of
#   lo(ap[0]*bp[0])*n0; each step combines ap[j]*bp[0] and np[j]*m1 with
#   carries and stores one word through the tp cursor. The two top words
#   are stored after the loop;
# - .Louter/.Linner: the same pass for each remaining bp[i], additionally
#   adding the words already held in tp;
# - .Lsub: rp[] = tp[] - np[], word by word, with the borrow kept in $hi0;
# - "handle upmost overflow bit": $hi0 = $hi1 - borrow and $hi1 = ~$hi0
#   form a pair of masks (presumably always 0 / all-ones -- TODO confirm);
# - .Lcopy: rp[i] = (tp[i] & $hi0) | (rp[i] & $hi1), selecting between the
#   two candidates without a branch, while overwriting each tp[i] with
#   zero;
# - sets the return value 1 in both $a0 and $t0, restores $sp from $fp and
#   reloads $fp and $s11..$s4 from the frame.
#
# The ".set noreorder" regions are hand-scheduled: the instruction after
# each branch there is its delay slot. No comments are added inside the
# heredoc because its text is emitted verbatim.
$code.=<<___;
	move	$fp,$sp

	.set	reorder
	$LD	$n0,0($n0)
	$LD	$bi,0($bp)	# bp[0]
	$LD	$aj,0($ap)	# ap[0]
	$LD	$nj,0($np)	# np[0]

	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
	sll	$num,`log($BNSZ)/log(2)`
	li	$at,-4096
	$PTR_SUB $sp,$num
	and	$sp,$at

	$MULTU	$aj,$bi
	$LD	$alo,$BNSZ($ap)
	$LD	$nlo,$BNSZ($np)
	mflo	$lo0
	mfhi	$hi0
	$MULTU	$lo0,$n0
	mflo	$m1

	$MULTU	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	$MULTU	$nj,$m1
	mflo	$lo1
	mfhi	$hi1
	$MULTU	$nlo,$m1
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	$nlo
	mfhi	$nhi

	move	$tp,$sp
	li	$j,2*$BNSZ
.align	4
.L1st:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	$aj,$bi
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	$alo
	mfhi	$ahi

	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$MULTU	$nj,$m1
	$ADDU	$hi1,$at
	addu	$j,$BNSZ
	$ST	$lo1,($tp)
	sltu	$t0,$j,$num
	mflo	$nlo
	mfhi	$nhi

	bnez	$t0,.L1st
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at

	$ADDU	$lo1,$nlo,$hi1
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi1,$nhi,$t0
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at

	$ST	$lo1,($tp)

	$ADDU	$hi1,$hi0
	sltu	$at,$hi1,$hi0
	$ST	$hi1,$BNSZ($tp)
	$ST	$at,2*$BNSZ($tp)

	li	$i,$BNSZ
.align	4
.Louter:
	$PTR_ADD $bi,$bp,$i
	$LD	$bi,($bi)
	$LD	$aj,($ap)
	$LD	$alo,$BNSZ($ap)
	$LD	$tj,($sp)

	$MULTU	$aj,$bi
	$LD	$nj,($np)
	$LD	$nlo,$BNSZ($np)
	mflo	$lo0
	mfhi	$hi0
	$ADDU	$lo0,$tj
	$MULTU	$lo0,$n0
	sltu	$at,$lo0,$tj
	$ADDU	$hi0,$at
	mflo	$m1

	$MULTU	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	$MULTU	$nj,$m1
	mflo	$lo1
	mfhi	$hi1

	$MULTU	$nlo,$m1
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	$nlo
	mfhi	$nhi

	move	$tp,$sp
	li	$j,2*$BNSZ
	$LD	$tj,$BNSZ($tp)
.align	4
.Linner:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	$aj,$bi
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	$alo
	mfhi	$ahi

	$ADDU	$lo0,$tj
	addu	$j,$BNSZ
	$MULTU	$nj,$m1
	sltu	$at,$lo0,$tj
	$ADDU	$lo1,$lo0
	$ADDU	$hi0,$at
	sltu	$t0,$lo1,$lo0
	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$hi1,$t0
	sltu	$at,$j,$num
	mflo	$nlo
	mfhi	$nhi
	$ST	$lo1,($tp)
	bnez	$at,.Linner
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at
	$ADDU	$lo0,$tj
	sltu	$t0,$lo0,$tj
	$ADDU	$hi0,$t0

	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo1,$hi1
	$ADDU	$hi1,$nhi,$at
	$ADDU	$lo1,$lo0
	sltu	$t0,$lo1,$lo0
	$ADDU	$hi1,$t0
	$ST	$lo1,($tp)

	$ADDU	$lo1,$hi1,$hi0
	sltu	$hi1,$lo1,$hi0
	$ADDU	$lo1,$tj
	sltu	$at,$lo1,$tj
	$ADDU	$hi1,$at
	$ST	$lo1,$BNSZ($tp)
	$ST	$hi1,2*$BNSZ($tp)

	addu	$i,$BNSZ
	sltu	$t0,$i,$num
	bnez	$t0,.Louter

	.set	noreorder
	$PTR_ADD $tj,$sp,$num	# &tp[num]
	move	$tp,$sp
	move	$ap,$sp
	li	$hi0,0		# clear borrow bit

.align	4
.Lsub:	$LD	$lo0,($tp)
	$LD	$lo1,($np)
	$PTR_ADD $tp,$BNSZ
	$PTR_ADD $np,$BNSZ
	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	$at,$lo1,$lo0
	$SUBU	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	$ST	$lo0,($rp)
	or	$hi0,$at
	sltu	$at,$tp,$tj
	bnez	$at,.Lsub
	$PTR_ADD $rp,$BNSZ

	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,$sp
	$PTR_SUB $rp,$num	# restore rp
	not	$hi1,$hi0

.Lcopy:	$LD	$nj,($tp)	# conditional move
	$LD	$aj,($rp)
	$ST	$zero,($tp)
	$PTR_ADD $tp,$BNSZ
	and	$nj,$hi0
	and	$aj,$hi1
	or	$aj,$nj
	sltu	$at,$tp,$tj
	$ST	$aj,($rp)
	bnez	$at,.Lcopy
	$PTR_ADD $rp,$BNSZ

	li	$a0,1
	li	$t0,1

	.set	noreorder
	move	$sp,$fp
	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
# nubi flavours only: reload $s3..$s0 from the same frame slots the
# prologue stored them in.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
# Return to the caller. The instruction after "jr" releases the
# $FRAMESIZE*$SZREG-byte frame; with ".set noreorder" still in effect it
# sits in the jump's delay slot. An identification string is then placed
# in .rdata; "\@" keeps Perl from interpolating an array into it.
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end	bn_mul_mont_internal
.rdata
.asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
420
# Replace every `...` span in the generated text with the result of
# evaluating its contents as Perl (used for compile-time constants such
# as the log2 shift count). The evaluated text comes from this script
# itself, not from external input.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors (full disk, closed pipe) only surface at close
# time, so a failed close must abort instead of silently leaving a
# truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
425