#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations; ABI and instruction syntax are identical.

# Pick the output file name from the command line: arguments are
# consumed one by one until one looks like "name.ext" (a word character,
# then word characters or dashes, a dot and an extension).  If the
# arguments run out first, $output is left undefined.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when a file name was actually found; otherwise
# the generated assembly goes to the inherited STDOUT.  A failure to
# create the file is fatal instead of being silently ignored.
if (defined($output) && $output ne "") {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# Symbolic names for the ARM registers used by the assembly template
# below.  Note that $bp, $bi and $rp all name r2: the template uses the
# same register under different names at different stages.
$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
# Each of these is a "base,#offset" operand pair meant to be placed
# inside [...] in a load/store instruction.
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;	# the num slot is reused to hold the bp end pointer

# Assembly template for the global function bn_mul_mont (Montgomery
# multiplication, see the .asciz tag at the end).  The register and
# offset variables defined above are interpolated into the text.
#
# Outline of the emitted routine, in order of appearance:
#   * prologue: pushes r0 and r2, loads num from the stack and, if
#     num < 2, pops the two words again and returns 0 in r0 through
#     .Labrt without doing any work;
#   * otherwise saves r4-r12 and lr, reserves 4*num+4 bytes of stack
#     for the temporary vector tp and points $num at &tp[num-1], which
#     also serves as the base for the saved-argument offsets;
#   * .L1st: first pass, using bp[0];
#   * .Louter/.Linner: one pass per remaining word of bp, accumulating
#     into tp;
#   * .Lsub: stores tp[j]-np[j] with borrow propagation to rp;
#   * .Lcopy: copies either tp or the freshly written rp into rp,
#     selected by mask from the final borrow, while overwriting tp;
#   * epilogue: releases the stack, restores registers, returns 1 in r0.
#
# The final return is "tst lr,#1; moveq pc,lr; bx lr" so that the code
# works on plain ARMv4 and still interworks with Thumb callers.
$code=<<___;
.text

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	2
bn_mul_mont:
	stmdb	sp!,{r0,r2}		@ sp points at argument block
	ldr	$num,[sp,#3*4]		@ load num
	cmp	$num,#2
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Replace every "bx lr" in the template with a literal .word so that
# the output can be compiled with -march=armv4.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;

print $code;
# Buffered write errors only surface when the handle is closed, so a
# failed close must not go unnoticed.
close STDOUT or die "error closing STDOUT: $!";