sparcv9-mont.pl revision 325335
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# December 2005
11#
12# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13# for undertaken effort are multiple. First of all, UltraSPARC is not
14# the whole SPARCv9 universe and other VIS-free implementations deserve
15# optimized code as much. Secondly, newly introduced UltraSPARC T1,
16# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes,
17# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
18# several integrated RSA/DSA accelerator circuits accessible through
19# kernel driver [only(*)], but having decent user-land software
20# implementation is important too. Finally, reasons like desire to
21# experiment with dedicated squaring procedure. Yes, this module
22# implements one, because it was easiest to draft it in SPARCv9
23# instructions...
24
25# (*)	Engine accessing the driver in question is on my TODO list.
26#	For reference, acceleator is estimated to give 6 to 10 times
27#	improvement on single-threaded RSA sign. It should be noted
28#	that 6-10x improvement coefficient does not actually mean
29#	something extraordinary in terms of absolute [single-threaded]
30#	performance, as SPARCv9 instruction set is by all means least
31#	suitable for high performance crypto among other 64 bit
32#	platforms. 6-10x factor simply places T1 in same performance
33#	domain as say AMD64 and IA-64. Improvement of RSA verify don't
34#	appear impressive at all, but it's the sign operation which is
35#	far more critical/interesting.
36
37# You might notice that inner loops are modulo-scheduled:-) This has
38# essentially negligible impact on UltraSPARC performance, it's
39# Fujitsu SPARC64 V users who should notice and hopefully appreciate
40# the advantage... Currently this module surpasses sparcv9a-mont.pl
41# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42# module still have hidden potential [see TODO list there], which is
43# estimated to be larger than 20%...
44
45# int bn_mul_mont(
46$rp="%i0";	# BN_ULONG *rp,
47$ap="%i1";	# const BN_ULONG *ap,
48$bp="%i2";	# const BN_ULONG *bp,
49$np="%i3";	# const BN_ULONG *np,
50$n0="%i4";	# const BN_ULONG *n0,
51$num="%i5";	# int num);
52
53$bits=32;
54for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
55if ($bits==64)	{ $bias=2047; $frame=192; }
56else		{ $bias=0;    $frame=128; }
57
58$car0="%o0";
59$car1="%o1";
60$car2="%o2";	# 1 bit
61$acc0="%o3";
62$acc1="%o4";
63$mask="%g1";	# 32 bits, what a waste...
64$tmp0="%g4";
65$tmp1="%g5";
66
67$i="%l0";
68$j="%l1";
69$mul0="%l2";
70$mul1="%l3";
71$tp="%l4";
72$apj="%l5";
73$npj="%l6";
74$tpj="%l7";
75
76$fname="bn_mul_mont_int";
77
78$code=<<___;
79.section	".text",#alloc,#execinstr
80
81.global	$fname
82.align	32
83$fname:
84	cmp	%o5,4			! 128 bits minimum
85	bge,pt	%icc,.Lenter
86	sethi	%hi(0xffffffff),$mask
87	retl
88	clr	%o0
89.align	32
90.Lenter:
91	save	%sp,-$frame,%sp
92	sll	$num,2,$num		! num*=4
93	or	$mask,%lo(0xffffffff),$mask
94	ld	[$n0],$n0
95	cmp	$ap,$bp
96	and	$num,$mask,$num
97	ld	[$bp],$mul0		! bp[0]
98	nop
99
100	add	%sp,$bias,%o7		! real top of stack
101	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
102	sub	%o7,$num,%o7
103	ld	[$ap+4],$apj		! ap[1]
104	and	%o7,-1024,%o7
105	ld	[$np],$car1		! np[0]
106	sub	%o7,$bias,%sp		! alloca
107	ld	[$np+4],$npj		! np[1]
108	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
109	mov	12,$j
110
111	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
112	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
113	and	$car0,$mask,$acc0
114	add	%sp,$bias+$frame,$tp
115	ld	[$ap+8],$apj		!prologue!
116
117	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
118	and	$mul1,$mask,$mul1
119
120	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
121	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
122	srlx	$car0,32,$car0
123	add	$acc0,$car1,$car1
124	ld	[$np+8],$npj		!prologue!
125	srlx	$car1,32,$car1
126	mov	$tmp0,$acc0		!prologue!
127
128.L1st:
129	mulx	$apj,$mul0,$tmp0
130	mulx	$npj,$mul1,$tmp1
131	add	$acc0,$car0,$car0
132	ld	[$ap+$j],$apj		! ap[j]
133	and	$car0,$mask,$acc0
134	add	$acc1,$car1,$car1
135	ld	[$np+$j],$npj		! np[j]
136	srlx	$car0,32,$car0
137	add	$acc0,$car1,$car1
138	add	$j,4,$j			! j++
139	mov	$tmp0,$acc0
140	st	$car1,[$tp]
141	cmp	$j,$num
142	mov	$tmp1,$acc1
143	srlx	$car1,32,$car1
144	bl	%icc,.L1st
145	add	$tp,4,$tp		! tp++
146!.L1st
147
148	mulx	$apj,$mul0,$tmp0	!epilogue!
149	mulx	$npj,$mul1,$tmp1
150	add	$acc0,$car0,$car0
151	and	$car0,$mask,$acc0
152	add	$acc1,$car1,$car1
153	srlx	$car0,32,$car0
154	add	$acc0,$car1,$car1
155	st	$car1,[$tp]
156	srlx	$car1,32,$car1
157
158	add	$tmp0,$car0,$car0
159	and	$car0,$mask,$acc0
160	add	$tmp1,$car1,$car1
161	srlx	$car0,32,$car0
162	add	$acc0,$car1,$car1
163	st	$car1,[$tp+4]
164	srlx	$car1,32,$car1
165
166	add	$car0,$car1,$car1
167	st	$car1,[$tp+8]
168	srlx	$car1,32,$car2
169
170	mov	4,$i			! i++
171	ld	[$bp+4],$mul0		! bp[1]
172.Louter:
173	add	%sp,$bias+$frame,$tp
174	ld	[$ap],$car0		! ap[0]
175	ld	[$ap+4],$apj		! ap[1]
176	ld	[$np],$car1		! np[0]
177	ld	[$np+4],$npj		! np[1]
178	ld	[$tp],$tmp1		! tp[0]
179	ld	[$tp+4],$tpj		! tp[1]
180	mov	12,$j
181
182	mulx	$car0,$mul0,$car0
183	mulx	$apj,$mul0,$tmp0	!prologue!
184	add	$tmp1,$car0,$car0
185	ld	[$ap+8],$apj		!prologue!
186	and	$car0,$mask,$acc0
187
188	mulx	$n0,$acc0,$mul1
189	and	$mul1,$mask,$mul1
190
191	mulx	$car1,$mul1,$car1
192	mulx	$npj,$mul1,$acc1	!prologue!
193	srlx	$car0,32,$car0
194	add	$acc0,$car1,$car1
195	ld	[$np+8],$npj		!prologue!
196	srlx	$car1,32,$car1
197	mov	$tmp0,$acc0		!prologue!
198
199.Linner:
200	mulx	$apj,$mul0,$tmp0
201	mulx	$npj,$mul1,$tmp1
202	add	$tpj,$car0,$car0
203	ld	[$ap+$j],$apj		! ap[j]
204	add	$acc0,$car0,$car0
205	add	$acc1,$car1,$car1
206	ld	[$np+$j],$npj		! np[j]
207	and	$car0,$mask,$acc0
208	ld	[$tp+8],$tpj		! tp[j]
209	srlx	$car0,32,$car0
210	add	$acc0,$car1,$car1
211	add	$j,4,$j			! j++
212	mov	$tmp0,$acc0
213	st	$car1,[$tp]		! tp[j-1]
214	srlx	$car1,32,$car1
215	mov	$tmp1,$acc1
216	cmp	$j,$num
217	bl	%icc,.Linner
218	add	$tp,4,$tp		! tp++
219!.Linner
220
221	mulx	$apj,$mul0,$tmp0	!epilogue!
222	mulx	$npj,$mul1,$tmp1
223	add	$tpj,$car0,$car0
224	add	$acc0,$car0,$car0
225	ld	[$tp+8],$tpj		! tp[j]
226	and	$car0,$mask,$acc0
227	add	$acc1,$car1,$car1
228	srlx	$car0,32,$car0
229	add	$acc0,$car1,$car1
230	st	$car1,[$tp]		! tp[j-1]
231	srlx	$car1,32,$car1
232
233	add	$tpj,$car0,$car0
234	add	$tmp0,$car0,$car0
235	and	$car0,$mask,$acc0
236	add	$tmp1,$car1,$car1
237	add	$acc0,$car1,$car1
238	st	$car1,[$tp+4]		! tp[j-1]
239	srlx	$car0,32,$car0
240	add	$i,4,$i			! i++
241	srlx	$car1,32,$car1
242
243	add	$car0,$car1,$car1
244	cmp	$i,$num
245	add	$car2,$car1,$car1
246	st	$car1,[$tp+8]
247
248	srlx	$car1,32,$car2
249	bl,a	%icc,.Louter
250	ld	[$bp+$i],$mul0		! bp[i]
251!.Louter
252
253	add	$tp,12,$tp
254
255.Ltail:
256	add	$np,$num,$np
257	add	$rp,$num,$rp
258	mov	$tp,$ap
259	sub	%g0,$num,%o7		! k=-num
260	ba	.Lsub
261	subcc	%g0,%g0,%g0		! clear %icc.c
262.align	16
263.Lsub:
264	ld	[$tp+%o7],%o0
265	ld	[$np+%o7],%o1
266	subccc	%o0,%o1,%o1		! tp[j]-np[j]
267	add	$rp,%o7,$i
268	add	%o7,4,%o7
269	brnz	%o7,.Lsub
270	st	%o1,[$i]
271	subc	$car2,0,$car2		! handle upmost overflow bit
272	and	$tp,$car2,$ap
273	andn	$rp,$car2,$np
274	or	$ap,$np,$ap
275	sub	%g0,$num,%o7
276
277.Lcopy:
278	ld	[$ap+%o7],%o0		! copy or in-place refresh
279	st	%g0,[$tp+%o7]		! zap tp
280	st	%o0,[$rp+%o7]
281	add	%o7,4,%o7
282	brnz	%o7,.Lcopy
283	nop
284	mov	1,%i0
285	ret
286	restore
287___
288
289########
290######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
291######## code without following dedicated squaring procedure.
292########
293$sbit="%o5";
294
295$code.=<<___;
296.align	32
297.Lbn_sqr_mont:
298	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
299	mulx	$apj,$mul0,$tmp0		!prologue!
300	and	$car0,$mask,$acc0
301	add	%sp,$bias+$frame,$tp
302	ld	[$ap+8],$apj			!prologue!
303
304	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
305	srlx	$car0,32,$car0
306	and	$mul1,$mask,$mul1
307
308	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
309	mulx	$npj,$mul1,$acc1		!prologue!
310	and	$car0,1,$sbit
311	ld	[$np+8],$npj			!prologue!
312	srlx	$car0,1,$car0
313	add	$acc0,$car1,$car1
314	srlx	$car1,32,$car1
315	mov	$tmp0,$acc0			!prologue!
316
317.Lsqr_1st:
318	mulx	$apj,$mul0,$tmp0
319	mulx	$npj,$mul1,$tmp1
320	add	$acc0,$car0,$car0		! ap[j]*a0+c0
321	add	$acc1,$car1,$car1
322	ld	[$ap+$j],$apj			! ap[j]
323	and	$car0,$mask,$acc0
324	ld	[$np+$j],$npj			! np[j]
325	srlx	$car0,32,$car0
326	add	$acc0,$acc0,$acc0
327	or	$sbit,$acc0,$acc0
328	mov	$tmp1,$acc1
329	srlx	$acc0,32,$sbit
330	add	$j,4,$j				! j++
331	and	$acc0,$mask,$acc0
332	cmp	$j,$num
333	add	$acc0,$car1,$car1
334	st	$car1,[$tp]
335	mov	$tmp0,$acc0
336	srlx	$car1,32,$car1
337	bl	%icc,.Lsqr_1st
338	add	$tp,4,$tp			! tp++
339!.Lsqr_1st
340
341	mulx	$apj,$mul0,$tmp0		! epilogue
342	mulx	$npj,$mul1,$tmp1
343	add	$acc0,$car0,$car0		! ap[j]*a0+c0
344	add	$acc1,$car1,$car1
345	and	$car0,$mask,$acc0
346	srlx	$car0,32,$car0
347	add	$acc0,$acc0,$acc0
348	or	$sbit,$acc0,$acc0
349	srlx	$acc0,32,$sbit
350	and	$acc0,$mask,$acc0
351	add	$acc0,$car1,$car1
352	st	$car1,[$tp]
353	srlx	$car1,32,$car1
354
355	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
356	add	$tmp1,$car1,$car1
357	and	$car0,$mask,$acc0
358	srlx	$car0,32,$car0
359	add	$acc0,$acc0,$acc0
360	or	$sbit,$acc0,$acc0
361	srlx	$acc0,32,$sbit
362	and	$acc0,$mask,$acc0
363	add	$acc0,$car1,$car1
364	st	$car1,[$tp+4]
365	srlx	$car1,32,$car1
366
367	add	$car0,$car0,$car0
368	or	$sbit,$car0,$car0
369	add	$car0,$car1,$car1
370	st	$car1,[$tp+8]
371	srlx	$car1,32,$car2
372
373	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
374	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
375	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
376	ld	[$ap+4],$mul0			! ap[1]
377	ld	[$ap+8],$apj			! ap[2]
378	ld	[$np],$car1			! np[0]
379	ld	[$np+4],$npj			! np[1]
380	mulx	$n0,$tmp0,$mul1
381
382	mulx	$mul0,$mul0,$car0
383	and	$mul1,$mask,$mul1
384
385	mulx	$car1,$mul1,$car1
386	mulx	$npj,$mul1,$acc1
387	add	$tmp0,$car1,$car1
388	and	$car0,$mask,$acc0
389	ld	[$np+8],$npj			! np[2]
390	srlx	$car1,32,$car1
391	add	$tmp1,$car1,$car1
392	srlx	$car0,32,$car0
393	add	$acc0,$car1,$car1
394	and	$car0,1,$sbit
395	add	$acc1,$car1,$car1
396	srlx	$car0,1,$car0
397	mov	12,$j
398	st	$car1,[%sp+$bias+$frame]	! tp[0]=
399	srlx	$car1,32,$car1
400	add	%sp,$bias+$frame+4,$tp
401
402.Lsqr_2nd:
403	mulx	$apj,$mul0,$acc0
404	mulx	$npj,$mul1,$acc1
405	add	$acc0,$car0,$car0
406	add	$tpj,$sbit,$sbit
407	ld	[$ap+$j],$apj			! ap[j]
408	and	$car0,$mask,$acc0
409	ld	[$np+$j],$npj			! np[j]
410	srlx	$car0,32,$car0
411	add	$acc1,$car1,$car1
412	ld	[$tp+8],$tpj			! tp[j]
413	add	$acc0,$acc0,$acc0
414	add	$j,4,$j				! j++
415	add	$sbit,$acc0,$acc0
416	srlx	$acc0,32,$sbit
417	and	$acc0,$mask,$acc0
418	cmp	$j,$num
419	add	$acc0,$car1,$car1
420	st	$car1,[$tp]			! tp[j-1]
421	srlx	$car1,32,$car1
422	bl	%icc,.Lsqr_2nd
423	add	$tp,4,$tp			! tp++
424!.Lsqr_2nd
425
426	mulx	$apj,$mul0,$acc0
427	mulx	$npj,$mul1,$acc1
428	add	$acc0,$car0,$car0
429	add	$tpj,$sbit,$sbit
430	and	$car0,$mask,$acc0
431	srlx	$car0,32,$car0
432	add	$acc1,$car1,$car1
433	add	$acc0,$acc0,$acc0
434	add	$sbit,$acc0,$acc0
435	srlx	$acc0,32,$sbit
436	and	$acc0,$mask,$acc0
437	add	$acc0,$car1,$car1
438	st	$car1,[$tp]			! tp[j-1]
439	srlx	$car1,32,$car1
440
441	add	$car0,$car0,$car0
442	add	$sbit,$car0,$car0
443	add	$car0,$car1,$car1
444	add	$car2,$car1,$car1
445	st	$car1,[$tp+4]
446	srlx	$car1,32,$car2
447
448	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
449	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
450	ld	[$ap+8],$mul0			! ap[2]
451	ld	[$np],$car1			! np[0]
452	ld	[$np+4],$npj			! np[1]
453	mulx	$n0,$tmp1,$mul1
454	and	$mul1,$mask,$mul1
455	mov	8,$i
456
457	mulx	$mul0,$mul0,$car0
458	mulx	$car1,$mul1,$car1
459	and	$car0,$mask,$acc0
460	add	$tmp1,$car1,$car1
461	srlx	$car0,32,$car0
462	add	%sp,$bias+$frame,$tp
463	srlx	$car1,32,$car1
464	and	$car0,1,$sbit
465	srlx	$car0,1,$car0
466	mov	4,$j
467
468.Lsqr_outer:
469.Lsqr_inner1:
470	mulx	$npj,$mul1,$acc1
471	add	$tpj,$car1,$car1
472	add	$j,4,$j
473	ld	[$tp+8],$tpj
474	cmp	$j,$i
475	add	$acc1,$car1,$car1
476	ld	[$np+$j],$npj
477	st	$car1,[$tp]
478	srlx	$car1,32,$car1
479	bl	%icc,.Lsqr_inner1
480	add	$tp,4,$tp
481!.Lsqr_inner1
482
483	add	$j,4,$j
484	ld	[$ap+$j],$apj			! ap[j]
485	mulx	$npj,$mul1,$acc1
486	add	$tpj,$car1,$car1
487	ld	[$np+$j],$npj			! np[j]
488	add	$acc0,$car1,$car1
489	ld	[$tp+8],$tpj			! tp[j]
490	add	$acc1,$car1,$car1
491	st	$car1,[$tp]
492	srlx	$car1,32,$car1
493
494	add	$j,4,$j
495	cmp	$j,$num
496	be,pn	%icc,.Lsqr_no_inner2
497	add	$tp,4,$tp
498
499.Lsqr_inner2:
500	mulx	$apj,$mul0,$acc0
501	mulx	$npj,$mul1,$acc1
502	add	$tpj,$sbit,$sbit
503	add	$acc0,$car0,$car0
504	ld	[$ap+$j],$apj			! ap[j]
505	and	$car0,$mask,$acc0
506	ld	[$np+$j],$npj			! np[j]
507	srlx	$car0,32,$car0
508	add	$acc0,$acc0,$acc0
509	ld	[$tp+8],$tpj			! tp[j]
510	add	$sbit,$acc0,$acc0
511	add	$j,4,$j				! j++
512	srlx	$acc0,32,$sbit
513	and	$acc0,$mask,$acc0
514	cmp	$j,$num
515	add	$acc0,$car1,$car1
516	add	$acc1,$car1,$car1
517	st	$car1,[$tp]			! tp[j-1]
518	srlx	$car1,32,$car1
519	bl	%icc,.Lsqr_inner2
520	add	$tp,4,$tp			! tp++
521
522.Lsqr_no_inner2:
523	mulx	$apj,$mul0,$acc0
524	mulx	$npj,$mul1,$acc1
525	add	$tpj,$sbit,$sbit
526	add	$acc0,$car0,$car0
527	and	$car0,$mask,$acc0
528	srlx	$car0,32,$car0
529	add	$acc0,$acc0,$acc0
530	add	$sbit,$acc0,$acc0
531	srlx	$acc0,32,$sbit
532	and	$acc0,$mask,$acc0
533	add	$acc0,$car1,$car1
534	add	$acc1,$car1,$car1
535	st	$car1,[$tp]			! tp[j-1]
536	srlx	$car1,32,$car1
537
538	add	$car0,$car0,$car0
539	add	$sbit,$car0,$car0
540	add	$car0,$car1,$car1
541	add	$car2,$car1,$car1
542	st	$car1,[$tp+4]
543	srlx	$car1,32,$car2
544
545	add	$i,4,$i				! i++
546	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
547	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
548	ld	[$ap+$i],$mul0			! ap[j]
549	ld	[$np],$car1			! np[0]
550	ld	[$np+4],$npj			! np[1]
551	mulx	$n0,$tmp1,$mul1
552	and	$mul1,$mask,$mul1
553	add	$i,4,$tmp0
554
555	mulx	$mul0,$mul0,$car0
556	mulx	$car1,$mul1,$car1
557	and	$car0,$mask,$acc0
558	add	$tmp1,$car1,$car1
559	srlx	$car0,32,$car0
560	add	%sp,$bias+$frame,$tp
561	srlx	$car1,32,$car1
562	and	$car0,1,$sbit
563	srlx	$car0,1,$car0
564
565	cmp	$tmp0,$num			! i<num-1
566	bl	%icc,.Lsqr_outer
567	mov	4,$j
568
569.Lsqr_last:
570	mulx	$npj,$mul1,$acc1
571	add	$tpj,$car1,$car1
572	add	$j,4,$j
573	ld	[$tp+8],$tpj
574	cmp	$j,$i
575	add	$acc1,$car1,$car1
576	ld	[$np+$j],$npj
577	st	$car1,[$tp]
578	srlx	$car1,32,$car1
579	bl	%icc,.Lsqr_last
580	add	$tp,4,$tp
581!.Lsqr_last
582
583	mulx	$npj,$mul1,$acc1
584	add	$tpj,$acc0,$acc0
585	srlx	$acc0,32,$tmp0
586	and	$acc0,$mask,$acc0
587	add	$tmp0,$sbit,$sbit
588	add	$acc0,$car1,$car1
589	add	$acc1,$car1,$car1
590	st	$car1,[$tp]
591	srlx	$car1,32,$car1
592
593	add	$car0,$car0,$car0		! recover $car0
594	add	$sbit,$car0,$car0
595	add	$car0,$car1,$car1
596	add	$car2,$car1,$car1
597	st	$car1,[$tp+4]
598	srlx	$car1,32,$car2
599
600	ba	.Ltail
601	add	$tp,8,$tp
602.type	$fname,#function
603.size	$fname,(.-$fname)
604.asciz	"Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
605.align	32
606___
607$code =~ s/\`([^\`]*)\`/eval($1)/gem;
608print $code;
609close STDOUT;
610