#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys. While 512-bit
# RSA private key operations are 40% faster, 1024-bit ones are hardly
# faster at all, while longer key operations are slower by up to 20%.
# It might be of interest to embedded system developers though, as
# it's smaller than 1KB, yet offers ~3x improvement over compiler
# generated code.
#
# The module targets N32 and N64 MIPS ABIs and currently is a bit
# IRIX-centric, i.e. is likely to require adaptation for other OSes.

# Register map. Every value is a MIPS register name that gets interpolated
# into the assembly template assigned to $code below.
#
# Arguments, in the order of the C prototype:
#
#	int bn_mul_mont(BN_ULONG *rp,		# a0
#			const BN_ULONG *ap,	# a1
#			const BN_ULONG *bp,	# a2
#			const BN_ULONG *np,	# a3
#			const BN_ULONG *n0,	# a4
#			int num);		# a5
($rp,$ap,$bp,$np,$n0,$num)=("a0","a1","a2","a3","a4","a5");

# Low/high halves of the two running sums (ap[]*bp[i] and np[]*m1).
($lo0,$hi0,$lo1,$hi1)=("a6","a7","v0","v1");

# Current ap[j], bp[i] and np[j] words, and the cursor into the temporary
# vector kept on the stack.
($aj,$bi,$nj,$tp)=("t0","t1","t2","t3");

# Pending products (low/high) of ap[j]*bp[i] and np[j]*m1.  These live in
# s-registers, which the generated prologue saves and the epilogue restores.
($alo,$ahi,$nlo,$nhi)=("s0","s1","s2","s3");

# Word loaded from the temporary vector, and the outer/inner loop byte offsets.
($tj,$i,$j)=("s4","s5","s6");

# Frame pointer for the register save area, and the per-pass multiplier
# computed as the low word of lo0*n0.
($fp,$m1)=("t8","t9");

# NOTE(review): $FRAME evaluates to 80 but is not referenced anywhere else in
# this script; the assembly prologue/epilogue use a literal 64-byte frame.
$FRAME=8*(2+8);

# Assembly template for bn_mul_mont.  The $-names are Perl variables holding
# register names and are interpolated when the heredoc is evaluated.
#
# Layout of the generated routine:
#
#   prologue   Reserves a 64-byte save area addressed through $fp.  When
#              num < 4 it returns 0 in v0 without doing any work.
#   .Lproceed  Loads the word n0 points to, converts num to a byte count
#              (num*8), carves num*8+16 bytes for the temporary vector off
#              the stack and rounds sp down to a 4096-byte boundary, then
#              saves s0-s7 in the save area.
#   .L1st      First pass, using bp[0]: m1 is the low word of
#              (ap[0]*bp[0])*n0; the loop stores the words of
#              ap[]*bp[0] + np[]*m1 into the temporary vector.
#   .Louter /  Remaining passes, one per further word of bp[], each adding
#   .Linner    ap[]*bp[i] + np[]*m1 to the temporary vector.
#   .Lsub      Stores tp[]-np[] into rp[], tracking the borrow.
#   .Lcopy     Picks tp[] or rp[] as the source depending on the borrow,
#              copies it to rp[] and overwrites tp[] with zeros.
#   epilogue   Restores s0-s7 and sp and returns 1 in v0.
#
# PTR_ADD/PTR_SUB and the symbolic register names are not defined here; they
# are presumably supplied by the <asm.h>/<regdef.h> includes -- confirm
# against the target toolchain.
$code=<<___;
#include <asm.h>
#include <regdef.h>

.text

.set	noat
.set	reorder

.align	5
.globl	bn_mul_mont
.ent	bn_mul_mont
bn_mul_mont:
	.set	noreorder
	PTR_SUB	sp,64
	move	$fp,sp
	.frame	$fp,64,ra
	slt	AT,$num,4
	li	v0,0
	beqzl	AT,.Lproceed
	nop
	jr	ra
	PTR_ADD	sp,$fp,64
	.set	reorder
.align	5
.Lproceed:
	ld	$n0,0($n0)
	ld	$bi,0($bp)	# bp[0]
	ld	$aj,0($ap)	# ap[0]
	ld	$nj,0($np)	# np[0]
	PTR_SUB	sp,16		# place for two extra words
	sll	$num,3
	li	AT,-4096
	PTR_SUB	sp,$num
	and	sp,AT

	sd	s0,0($fp)
	sd	s1,8($fp)
	sd	s2,16($fp)
	sd	s3,24($fp)
	sd	s4,32($fp)
	sd	s5,40($fp)
	sd	s6,48($fp)
	sd	s7,56($fp)

	dmultu	$aj,$bi
	ld	$alo,8($ap)
	ld	$nlo,8($np)
	mflo	$lo0
	mfhi	$hi0
	dmultu	$lo0,$n0
	mflo	$m1

	dmultu	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	dmultu	$nj,$m1
	mflo	$lo1
	mfhi	$hi1
	dmultu	$nlo,$m1
	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	daddu	$hi1,AT
	mflo	$nlo
	mfhi	$nhi

	move	$tp,sp
	li	$j,16
.align	4
.L1st:
	.set	noreorder
	PTR_ADD	$aj,$ap,$j
	ld	$aj,($aj)
	PTR_ADD	$nj,$np,$j
	ld	$nj,($nj)

	dmultu	$aj,$bi
	daddu	$lo0,$alo,$hi0
	daddu	$lo1,$nlo,$hi1
	sltu	AT,$lo0,$hi0
	sltu	s7,$lo1,$hi1
	daddu	$hi0,$ahi,AT
	daddu	$hi1,$nhi,s7
	mflo	$alo
	mfhi	$ahi

	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	dmultu	$nj,$m1
	daddu	$hi1,AT
	addu	$j,8
	sd	$lo1,($tp)
	sltu	s7,$j,$num
	mflo	$nlo
	mfhi	$nhi

	bnez	s7,.L1st
	PTR_ADD	$tp,8
	.set	reorder

	daddu	$lo0,$alo,$hi0
	sltu	AT,$lo0,$hi0
	daddu	$hi0,$ahi,AT

	daddu	$lo1,$nlo,$hi1
	sltu	s7,$lo1,$hi1
	daddu	$hi1,$nhi,s7
	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	daddu	$hi1,AT

	sd	$lo1,($tp)

	daddu	$hi1,$hi0
	sltu	AT,$hi1,$hi0
	sd	$hi1,8($tp)
	sd	AT,16($tp)

	li	$i,8
.align	4
.Louter:
	PTR_ADD	$bi,$bp,$i
	ld	$bi,($bi)
	ld	$aj,($ap)
	ld	$alo,8($ap)
	ld	$tj,(sp)

	dmultu	$aj,$bi
	ld	$nj,($np)
	ld	$nlo,8($np)
	mflo	$lo0
	mfhi	$hi0
	daddu	$lo0,$tj
	dmultu	$lo0,$n0
	sltu	AT,$lo0,$tj
	daddu	$hi0,AT
	mflo	$m1

	dmultu	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	dmultu	$nj,$m1
	mflo	$lo1
	mfhi	$hi1

	dmultu	$nlo,$m1
	daddu	$lo1,$lo0
	sltu	AT,$lo1,$lo0
	daddu	$hi1,AT
	mflo	$nlo
	mfhi	$nhi

	move	$tp,sp
	li	$j,16
	ld	$tj,8($tp)
.align	4
.Linner:
	.set	noreorder
	PTR_ADD	$aj,$ap,$j
	ld	$aj,($aj)
	PTR_ADD	$nj,$np,$j
	ld	$nj,($nj)

	dmultu	$aj,$bi
	daddu	$lo0,$alo,$hi0
	daddu	$lo1,$nlo,$hi1
	sltu	AT,$lo0,$hi0
	sltu	s7,$lo1,$hi1
	daddu	$hi0,$ahi,AT
	daddu	$hi1,$nhi,s7
	mflo	$alo
	mfhi	$ahi

	daddu	$lo0,$tj
	addu	$j,8
	dmultu	$nj,$m1
	sltu	AT,$lo0,$tj
	daddu	$lo1,$lo0
	daddu	$hi0,AT
	sltu	s7,$lo1,$lo0
	ld	$tj,16($tp)
	daddu	$hi1,s7
	sltu	AT,$j,$num
	mflo	$nlo
	mfhi	$nhi
	sd	$lo1,($tp)
	bnez	AT,.Linner
	PTR_ADD	$tp,8
	.set	reorder

	daddu	$lo0,$alo,$hi0
	sltu	AT,$lo0,$hi0
	daddu	$hi0,$ahi,AT
	daddu	$lo0,$tj
	sltu	s7,$lo0,$tj
	daddu	$hi0,s7

	ld	$tj,16($tp)
	daddu	$lo1,$nlo,$hi1
	sltu	AT,$lo1,$hi1
	daddu	$hi1,$nhi,AT
	daddu	$lo1,$lo0
	sltu	s7,$lo1,$lo0
	daddu	$hi1,s7
	sd	$lo1,($tp)

	daddu	$lo1,$hi1,$hi0
	sltu	$hi1,$lo1,$hi0
	daddu	$lo1,$tj
	sltu	AT,$lo1,$tj
	daddu	$hi1,AT
	sd	$lo1,8($tp)
	sd	$hi1,16($tp)

	addu	$i,8
	sltu	s7,$i,$num
	bnez	s7,.Louter

	.set	noreorder
	PTR_ADD	$tj,sp,$num	# &tp[num]
	move	$tp,sp
	move	$ap,sp
	li	$hi0,0		# clear borrow bit

.align	4
.Lsub:	ld	$lo0,($tp)
	ld	$lo1,($np)
	PTR_ADD	$tp,8
	PTR_ADD	$np,8
	dsubu	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	AT,$lo1,$lo0
	dsubu	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	sd	$lo0,($rp)
	or	$hi0,AT
	sltu	AT,$tp,$tj
	bnez	AT,.Lsub
	PTR_ADD	$rp,8

	dsubu	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,sp
	PTR_SUB	$rp,$num	# restore rp
	not	$hi1,$hi0

	and	$ap,$hi0,sp
	and	$bp,$hi1,$rp
	or	$ap,$ap,$bp	# ap=borrow?tp:rp

.align	4
.Lcopy:	ld	$aj,($ap)
	PTR_ADD	$ap,8
	PTR_ADD	$tp,8
	sd	zero,-8($tp)
	sltu	AT,$tp,$tj
	sd	$aj,($rp)
	bnez	AT,.Lcopy
	PTR_ADD	$rp,8

	ld	s0,0($fp)
	ld	s1,8($fp)
	ld	s2,16($fp)
	ld	s3,24($fp)
	ld	s4,32($fp)
	ld	s5,40($fp)
	ld	s6,48($fp)
	ld	s7,56($fp)
	li	v0,1
	jr	ra
	PTR_ADD	sp,$fp,64
	.set	reorder
END(bn_mul_mont)
.rdata
.asciiz	"Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
___

# Write the generated assembly to standard output.  Output is buffered, so a
# write failure (full disk, closed pipe) may only surface when the handle is
# closed; abort in that case instead of leaving a silently truncated file.
print $code;
close STDOUT or die "error closing STDOUT: $!";