#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2012.
#
# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
# onward. There are three new instructions used here: umulxhi,
# addxc[cc] and initializing store. On T3 RSA private key operations
# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
# lengths. This is without dedicated squaring procedure. On T4
# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
# for reference purposes, because T4 has dedicated Montgomery
# multiplication and squaring *instructions* that deliver even more.

# Detect a 64-bit target from the compiler flags forwarded on the command
# line; choose the matching SPARC ABI stack bias and register-save frame size.
$bits=32;
for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)  { $bias=2047; $frame=192; }
else            { $bias=0;    $frame=112; }

# %g2/%g3 must be declared scratch for 64-bit assemblers.
$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
.section	".text",#alloc,#execinstr
___

# Working registers for the multiply/reduce loops: %g1-%g5 and %o0-%o5,%o7.
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# Incoming argument registers, per the C prototype:
# int bn_mul_mont(
$rp="%o0";	# BN_ULONG *rp,
$ap="%o1";	# const BN_ULONG *ap,
$bp="%o2";	# const BN_ULONG *bp,
$np="%o3";	# const BN_ULONG *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is even
				# and >=6
# Function prologue: carve out 3 aligned buffers (tp[] plus the converted
# 64-bit copies of ap[] and np[]) below the current stack top, then `save`.
$code.=<<___;
.globl	bn_mul_mont_vis3
.align	32
bn_mul_mont_vis3:
	add	%sp,	$bias,	%g4	! real top of stack
	sll	$num,	2,	$num	! size in bytes
	add	$num,	63,	%g5
	andn	%g5,	63,	%g5	! buffer size rounded up to 64 bytes
	add	%g5,	%g5,	%g1
	add	%g5,	%g1,	%g1	! 3*buffer size
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1	! align at 64 byte
	sub	%g1,	$frame,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1

	save	%sp,	%g1,	%sp
___

#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<----- aligned at 64 bytes
#	| __int64 ap[1..0]		|	converted ap[]
#	+-------------------------------+
#	| __int64 np[1..0]		|	converted np[]
#	+-------------------------------+
#	| __int64 ap[3..2]		|
#	.				.
#	.				.
#	+-------------------------------+
# After `save`, the caller's %o registers are visible as %i registers.
# Locals %l0-%l7 hold temporaries and loop state; $bufsz (%l6) appears
# unused in the code below (reserved slot in the register map).
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
# Main body: convert 32-bit ap[]/np[] words to 64-bit on the fly, run the
# word-by-word Montgomery multiply (.L1st / .Louter / .Linner), then the
# conditional subtraction (.Lsub) and conditional copy-out (.Lcopy).
$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	add	%sp, $bias+$frame, $tp
	ld	[$n0p+4],	$t1
	add	$tp,	%g5,	$anp
	ld	[$bp+0],	$t2	! m0=bp[0]
	sllx	$t1,	32,	$n0
	ld	[$bp+4],	$t3
	or	$t0,	$n0,	$n0
	add	$bp,	8,	$bp

	ld	[$ap+0],	$t0	! ap[0]
	sllx	$t3,	32,	$m0
	ld	[$ap+4],	$t1
	or	$t2,	$m0,	$m0

	ld	[$ap+8],	$t2	! ap[1]
	sllx	$t1,	32,	$aj
	ld	[$ap+12],	$t3
	or	$t0,	$aj,	$aj
	add	$ap,	16,	$ap
	stx	$aj,	[$anp]		! converted ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ld	[$np+0],	$t0	! np[0]
	sllx	$t3,	32,	$aj
	ld	[$np+4],	$t1
	or	$t2,	$aj,	$aj

	ld	[$np+8],	$t2	! np[1]
	sllx	$t1,	32,	$nj
	ld	[$np+12],	$t3
	or	$t0, $nj,	$nj
	add	$np,	16,	$np
	stx	$nj,	[$anp+8]	! converted np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0
	stx	$aj,	[$anp+16]	! converted ap[1]

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	sllx	$t3,	32,	$nj
	or	$t2,	$nj,	$nj
	stx	$nj,	[$anp+24]	! converted np[1]
	add	$anp,	32,	$anp

	addcc	$lo0,	$lo1,	$lo1
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st:
	ld	[$ap+0],	$t0	! ap[j]
	addcc	$alo,	$hi0,	$lo0
	ld	[$ap+4],	$t1
	addxc	$aj,	%g0,	$hi0

	sllx	$t1,	32,	$aj
	add	$ap,	8,	$ap
	or	$t0,	$aj,	$aj
	stx	$aj,	[$anp]		! converted ap[j]

	ld	[$np+0],	$t2	! np[j]
	addcc	$nlo,	$hi1,	$lo1
	ld	[$np+4],	$t3
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	sllx	$t3,	32,	$nj
	add	$np,	8,	$np
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	or	$t2,	$nj,	$nj
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	stx	$nj,	[$anp+8]	! converted np[j]
	add	$anp,	16,	$anp	! anp++

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st
	sub	$cnt,	8,	$cnt	! j--
!.L1st
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stx	$hi1,	[$tp]
	add	$tp,	8,	$tp

	ba	.Louter
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter:
	ld	[$bp+0],	$t2	! m0=bp[i]
	ld	[$bp+4],	$t3

	sub	$anp,	$num,	$anp	! rewind
	sub	$tp,	$num,	$tp
	sub	$anp,	$num,	$anp

	add	$bp,	8,	$bp
	sllx	$t3,	32,	$m0
	ldx	[$anp+0],	$aj	! ap[0]
	or	$t2,	$m0,	$m0
	ldx	[$anp+8],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],		$tj	! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$anp+16],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$anp+24],	$nj	! np[1]
	add	$anp,	32,	$anp
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$anp+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$anp+8],	$nj	! np[j]
	add	$anp,	16,	$anp
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner
	sub	$cnt,	8,	$cnt
!.Linner
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter
	sub	$i,	8,	$i

	sub	$anp,	$num,	$anp	! rewind
	sub	$tp,	$num,	$tp
	sub	$anp,	$num,	$anp
	ba	.Lsub
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub:
	ldx	[$tp],		$tj
	add	$tp,	8,	$tp
	ldx	[$anp+8],	$nj
	add	$anp,	16,	$anp
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub
	sub	$cnt,	8,	$cnt

	sub	$anp,	$num,	$anp	! rewind
	sub	$tp,	$num,	$tp
	sub	$anp,	$num,	$anp
	sub	$rp,	$num,	$rp

	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	ba	.Lcopy
	sub	$num,	8,	$cnt

.align	16
.Lcopy:					! conditional copy
	ld	[$tp+0],	$t0
	ld	[$tp+4],	$t1
	ld	[$rp+0],	$t2
	ld	[$rp+4],	$t3
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	stx	%g0,	[$anp]		! zap
	stx	%g0,	[$anp+8]
	add	$anp,	16,	$anp
	movcs	%icc,	$t0,	$t2
	movcs	%icc,	$t1,	$t3
	st	$t3,	[$rp+0]		! flip order
	st	$t2,	[$rp+4]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_vis3, #function
.size	bn_mul_mont_vis3, .-bn_mul_mont_vis3
.asciz  "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
# unvis3($mnemonic, $rs1, $rs2, $rd) - hand-encode a VIS3 three-operand
# instruction as a raw ".word" so the output assembles without VIS-aware
# assembler flags. Known mnemonics are addxc/addxccc/umulxhi; anything
# else, or any operand that is not a %g/%o/%l/%i integer register, is
# returned verbatim as "$mnemonic\t$rs1,$rs2,$rd".
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Register-window bias: %gN=N, %oN=8+N, %lN=16+N, %iN=24+N.
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;	# replace name with its 5-bit register number
	}

	# IMPL-DEP opcode format: op=2, op3=0x36 (0x81b00000 base),
	# rd<<25, rs1<<14, opf<<5, rs2. Original text kept as a comment.
	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# Emit the generated assembly: evaluate any `...` Perl escapes, then rewrite
# VIS3 mnemonics with integer-register operands into raw .word encodings.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT;
374