sparcv9-mont.pl revision 337982
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# December 2005
11#
12# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13# for undertaking this effort are multiple. First of all, UltraSPARC is not
14# the whole SPARCv9 universe and other VIS-free implementations deserve
15# optimized code as much. Secondly, newly introduced UltraSPARC T1,
16# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
17# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
18# several integrated RSA/DSA accelerator circuits accessible through
19# kernel driver [only(*)], but having decent user-land software
20# implementation is important too. Finally, reasons like desire to
21# experiment with dedicated squaring procedure. Yes, this module
22# implements one, because it was easiest to draft it in SPARCv9
23# instructions...
24
25# (*)	Engine accessing the driver in question is on my TODO list.
26#	For reference, accelerator is estimated to give 6 to 10 times
27#	improvement on single-threaded RSA sign. It should be noted
28#	that 6-10x improvement coefficient does not actually mean
29#	something extraordinary in terms of absolute [single-threaded]
30#	performance, as SPARCv9 instruction set is by all means least
31#	suitable for high performance crypto among other 64 bit
32#	platforms. 6-10x factor simply places T1 in same performance
33#	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
34#	appear impressive at all, but it's the sign operation which is
35#	far more critical/interesting.
36
37# You might notice that inner loops are modulo-scheduled:-) This has
38# essentially negligible impact on UltraSPARC performance, it's
39# Fujitsu SPARC64 V users who should notice and hopefully appreciate
40# the advantage... Currently this module surpasses sparcv9a-mont.pl
41# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42# module still has hidden potential [see TODO list there], which is
43# estimated to be larger than 20%...
44
45# int bn_mul_mont(
# int bn_mul_mont(BN_ULONG *rp,		/* %i0 */
#                 const BN_ULONG *ap,	/* %i1 */
#                 const BN_ULONG *bp,	/* %i2 */
#                 const BN_ULONG *np,	/* %i3 */
#                 const BN_ULONG *n0,	/* %i4 */
#                 int num);		/* %i5 */
# Arguments arrive in the six input registers of the window that
# `save` establishes at .Lenter.
($rp,$ap,$bp,$np,$n0,$num)=map("%i$_",0..5);

# Select 32- vs 64-bit ABI from the compiler flags passed down by the
# build system: -m64 (gcc) or -xarch=v9 (Sun cc) mean 64-bit.
$bits=32;
$bits=64 if (grep(/\-m64|\-xarch\=v9/,@ARGV));
# Stack-pointer bias and minimum frame size mandated by each ABI.
($bias,$frame)=($bits==64) ? (2047,192) : (0,128);

# Scratch output registers.
($car0,$car1,$car2)=map("%o$_",0..2);	# carries; $car2 is a single bit
($acc0,$acc1)=("%o3","%o4");		# accumulators
$mask="%g1";				# 32 bits, what a waste...
($tmp0,$tmp1)=("%g4","%g5");

# Local registers.
($i,$j)=("%l0","%l1");			# outer/inner loop counters (byte offsets)
($mul0,$mul1)=("%l2","%l3");		# current multiplicands
$tp="%l4";				# running pointer into the temporary vector
($apj,$npj,$tpj)=("%l5","%l6","%l7");	# prefetched ap[j], np[j], tp[j]

$fname="bn_mul_mont_int";
77
# Assembly template for the generic Montgomery multiplication path.
# Perl scalars ($rp, $ap, $mask, ...) interpolate into register names;
# any `...`-quoted span is evaluated as Perl by the s///gem pass at the
# bottom of the file (used here to pick %icc vs %xcc based on $bits).
# "!" introduces an assembler comment in the emitted code; the inner
# loops are modulo-scheduled, hence the !prologue!/!epilogue! markers.
# When ap==bp the code branches to the dedicated squaring path
# .Lbn_sqr_mont, which is appended to $code further down.
$code=<<___;
.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	cmp	%o5,4			! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num		! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0		! bp[0]
	nop

	add	%sp,$bias,%o7		! real top of stack
	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj		! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1		! np[0]
	sub	%o7,$bias,%sp		! alloca
	ld	[$np+4],$npj		! np[1]
	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp		! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	mov	4,$i			! i++
	ld	[$bp+4],$mul0		! bp[1]
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0		! ap[0]
	ld	[$ap+4],$apj		! ap[1]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	ld	[$tp],$tmp1		! tp[0]
	ld	[$tp+4],$tpj		! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj		!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp		! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj		! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]		! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i			! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]

	srlx	$car1,32,$car2
	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0		! bp[i]
!.Louter

	add	$tp,12,$tp

.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp
	sub	%g0,$num,%o7		! k=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c
.align	16
.Lsub:
	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1		! tp[j]-np[j]
	add	$rp,%o7,$i
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	st	%o1,[$i]
	subccc	$car2,0,$car2		! handle upmost overflow bit
	sub	%g0,$num,%o7

.Lcopy:
	ld	[$tp+%o7],%o1		! conditional copy
	ld	[$rp+%o7],%o0
	st	%g0,[$tp+%o7]		! zap tp
	movcs	%icc,%o1,%o0
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	mov	1,%i0
	ret
	restore
___
286
########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without the dedicated squaring procedure.
########
# Extra scratch register used only on the squaring path: it carries the
# bit shifted out when $car0 is halved (and cross products are doubled),
# see the `and $car0,1,$sbit` / `srlx $car0,1,$car0` pairs below.
$sbit="%o5";
292
# Append the dedicated squaring path (.Lbn_sqr_mont), entered from the
# common prologue when ap==bp.  Same template conventions as the first
# heredoc: interpolated register scalars, "!" assembler comments, and
# modulo-scheduled loops.  It merges back into the shared .Ltail
# (subtract/conditional-copy) epilogue at the end.
# NOTE(review): the .asciz banner spells "Multipltication"; the typo is
# preserved byte-for-byte here because the string is part of the
# module's emitted output.
$code.=<<___;
.align	32
.Lbn_sqr_mont:
	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0		!prologue!
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj			!prologue!

	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1		!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj			!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0			!prologue!

.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j				! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp			! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0		! epilogue
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0		! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0			! ap[1]
	ld	[$ap+8],$apj			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj			! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j				! j++
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp			! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0			! ap[2]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj			! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj			! np[j]
	srlx	$car1,32,$tmp0
	and	$car1,$mask,$car1
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj			! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj			! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj			! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj			! tp[j]
	add	$sbit,$acc0,$acc0
	add	$j,4,$j				! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp			! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]			! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	add	$i,4,$i				! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0			! ap[j]
	ld	[$np],$car1			! np[0]
	ld	[$np+4],$npj			! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0

	cmp	$tmp0,$num			! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j

.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$acc0,$acc0
	srlx	$acc0,32,$tmp0
	and	$acc0,$mask,$acc0
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0		! recover $car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___
# Post-process the assembly template: each `...` span is evaluated as
# Perl code (the /e modifier) and replaced with its result — used above
# to pick %icc vs %xcc depending on $bits.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
# Check the close so write errors (disk full, broken pipe) surface as a
# non-zero exit instead of silently truncating the generated assembly.
close STDOUT or die "error closing STDOUT: $!";
611