#!/usr/bin/env perl

# ====================================================================
# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
# <appro@openssl.org>. The module is licensed under 2-clause BSD
# license. November 2012. All rights reserved.
# ====================================================================

######################################################################
# Montgomery squaring-n-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) a collection of "single-op" subroutines that perform a single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) a collection of "multi-op" subroutines that perform 5 squaring
#    operations and 1 multiplication on operands of the above lengths;
# 3) fall-back and helper VIS3 subroutines.
#
# RSA sign is dominated by the multi-op subroutines, while RSA verify
# and DSA are dominated by the single-op ones. A special note about the
# 4096-bit RSA verify result: the operands are too long for the
# dedicated hardware, so they are handled by the VIS3 code, which is
# why you don't see any improvement there. It could surely be improved
# [by deploying the 'mpmul' instruction], maybe in the future...
#
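# A hedged sketch of how the pieces are expected to fit together (only
# the bn_mul_mont_t4_* and bn_mul_mont_vis3 entry points named here are
# real; the dispatch itself is illustrative, not the actual glue code):
#
#	/* num = modulus length in 64-bit words */
#	switch (num) {
#	case 8:		/* 512 bits; 16, 24 and 32 work the same way */
#		if (bn_mul_mont_t4_8(rp, ap, bp, np, n0))
#			break;			/* T4 path succeeded */
#		/* invalid result, fall through to the software path */
#	default:	/* anything else, e.g. 4096-bit RSA verify */
#		bn_mul_mont_vis3(rp, ap, bp, np, n0, num);
#	}
#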
# Performance improvement.
#
# 64-bit process, VIS3:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000628s 0.000028s   1592.4  35434.4
# rsa 2048 bits 0.003282s 0.000106s    304.7   9438.3
# rsa 4096 bits 0.025866s 0.000340s     38.7   2940.9
# dsa 1024 bits 0.000301s 0.000332s   3323.7   3013.9
# dsa 2048 bits 0.001056s 0.001233s    946.9    810.8
#
# 64-bit process, this module:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000256s 0.000016s   3904.4  61411.9
# rsa 2048 bits 0.000946s 0.000029s   1056.8  34292.7
# rsa 4096 bits 0.005061s 0.000340s    197.6   2940.5
# dsa 1024 bits 0.000176s 0.000195s   5674.7   5130.5
# dsa 2048 bits 0.000296s 0.000354s   3383.2   2827.6
#
######################################################################
# 32-bit process, VIS3:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000665s 0.000028s   1504.8  35233.3
# rsa 2048 bits 0.003349s 0.000106s    298.6   9433.4
# rsa 4096 bits 0.025959s 0.000341s     38.5   2934.8
# dsa 1024 bits 0.000320s 0.000341s   3123.3   2929.6
# dsa 2048 bits 0.001101s 0.001260s    908.2    793.4
#
# 32-bit process, this module:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000301s 0.000017s   3317.1  60240.0
# rsa 2048 bits 0.001034s 0.000030s    966.9  33812.7
# rsa 4096 bits 0.005244s 0.000341s    190.7   2935.4
# dsa 1024 bits 0.000201s 0.000205s   4976.1   4879.2
# dsa 2048 bits 0.000328s 0.000360s   3051.1   2774.2
#
# 32-bit code is prone to performance degradation as the rate of
# interrupts dispatched to the CPU executing the code grows. This is
# because in the standard handling of an interrupt in a 32-bit process
# context the upper halves of most integer registers used as input or
# output are zeroed. This renders the result invalid, and the operation
# has to be re-run. If the CPU is "bothered" by timer interrupts only,
# the penalty is hardly measurable. But to mitigate the problem at
# higher interrupt rates, contemporary Linux kernels recognize the
# biased stack even in a 32-bit process context and preserve the full
# register contents. See
# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# for details.

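######################################################################
# How an invalidated result is detected: in 32-bit builds the code
# below plants a sentinel, 0xffffffff00000000, into %fp of every
# register window. On the way out each "and %fp,$sentinel,$sentinel"
# keeps the sentinel alive only if the upper register halves survived;
# if an interrupt zeroed them, the sentinel collapses to 0, the routine
# bails out through its abort label and returns 0 instead of 1, and the
# caller is expected to re-run the operation (ultimately through the
# VIS3 fall-back at the bottom of this file).
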
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

$code.=<<___;
#include "sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef	__PIC__
SPARC_PIC_THUNK(%g1)
#endif
___

########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));

########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
#			  const u64 *np,const BN_ULONG *n0);
#
sub generate_bn_mul_mont_t4() {
my $NUM=shift;
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_mul_mont_t4_$NUM
.align	32
bn_mul_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$rp
	mov	%i1,$ap
	mov	%i2,$bp
	mov	%i3,$np
	ld	[%i4+0],%f1	! load *n0
	ld	[%i4+4],%f0
	fsrc2	%f0,%f60
___

# load ap[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],@A[$i]
	sllx	@A[$i],32,@A[$i]
	or	$lo,@A[$i],@A[$i]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],$hi
	fsrc2	$hi,@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	cmp	$ap,$bp
	be	SIZE_T_CC,.Lmsquare_$NUM
	nop
___

# load bp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
# magic ################################################################
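# montmul and montsqr are emitted as raw .word constants so that the
# module assembles even with tools that do not know the T4 instructions;
# the low bits of each opcode hold the operand size in 64-bit words
# minus one.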
$code.=<<___;
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
.Lmresume_$NUM:
	fbu,pn	%fcc3,.Lmabort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort_$NUM
#endif
	nop
#ifdef	__arch64__
	restore
	restore
	restore
	restore
	restore
#else
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Lmabort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	 and	%fp,$sentinel,$sentinel
	restore
	 and	$sentinel,1,%o7
	 and	%fp,$sentinel,$sentinel
	 srl	%fp,0,%fp		! just in case?
	 or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Lmdone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
$code.=<<___;
	st	$lo,[$rp+$i*8+0]
	st	@R[$i],[$rp+$i*8+4]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	fsrc2	@R[$i],$hi
	st	$lo,[$rp+$i*8+0]
	st	$hi,[$rp+$i*8+4]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Lmdone_$NUM:
	ret
	restore

.Lmabort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Lmabort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore

.align	32
.Lmsquare_$NUM:
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	.word   0x81b02940+$NUM-1	! montsqr	$NUM-1
	ba	.Lmresume_$NUM
	nop
.type	bn_mul_mont_t4_$NUM, #function
.size	bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_mul_mont_t4($i);
}

########################################################################
#
sub load_ccr {
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
$code.=<<___;
	srl	$pwr,	2,	%o4
	and	$pwr,	3,	%o5
	and	%o4,	7,	%o4
	sll	%o5,	3,	%o5	! offset within first cache line
	add	%o5,	$ptbl,	$ptbl	! of the pwrtbl
	or	%g0,	1,	%o5
	sll	%o5,	%o4,	$ccr
___
$code.=<<___	if (!$skip_wr);
	wr	$ccr,	%g0,	%ccr
___
}
sub load_b_pair {
my ($pwrtbl,$B0,$B1)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$B0
	ldx	[$pwrtbl+8*32],	$B1
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+9*32],	%o5
	movvs	%icc,	%o4,	$B0
	ldx	[$pwrtbl+2*32],	%o4
	movvs	%icc,	%o5,	$B1
	ldx	[$pwrtbl+10*32],%o5
	move	%icc,	%o4,	$B0
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc,	%o5,	$B1
	ldx	[$pwrtbl+11*32],%o5
	movneg	%icc,	%o4,	$B0
	ldx	[$pwrtbl+4*32],	%o4
	movneg	%icc,	%o5,	$B1
	ldx	[$pwrtbl+12*32],%o5
	movcs	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+5*32],%o4
	movcs	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+13*32],%o5
	movvs	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+6*32],	%o4
	movvs	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+14*32],%o5
	move	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+15*32],%o5
	movneg	%xcc,	%o4,	$B0
	add	$pwrtbl,16*32,	$pwrtbl
	movneg	%xcc,	%o5,	$B1
___
}
sub load_b {
my ($pwrtbl,$Bi)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$Bi
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+2*32],	%o5
	movvs	%icc,	%o4,	$Bi
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc,	%o5,	$Bi
	ldx	[$pwrtbl+4*32],	%o5
	movneg	%icc,	%o4,	$Bi
	ldx	[$pwrtbl+5*32],	%o4
	movcs	%xcc,	%o5,	$Bi
	ldx	[$pwrtbl+6*32],	%o5
	movvs	%xcc,	%o4,	$Bi
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc,	%o5,	$Bi
	add	$pwrtbl,8*32,	$pwrtbl
	movneg	%xcc,	%o4,	$Bi
___
}
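
########################################################################
# Constant-time selection of a power-table entry. The table is laid out
# (by bn_flip_n_scatter5_t4 at the bottom of this file) with a stride of
# 32*8 bytes between limbs, so the 32 candidate entries for one limb
# occupy 256 consecutive bytes. load_ccr splits the secret index pwr
# into a byte offset, (pwr&3)*8, and a one-hot flag, 1<<((pwr>>2)&7),
# written to %ccr; load_b/load_b_pair then read all eight candidates
# 32 bytes apart -- touching every cache line of the row regardless of
# pwr -- and keep exactly one of them via conditional moves on the
# %icc/%xcc flags. Worked example (an illustration only): pwr=22 gives
# offset (22&3)*8=16 and %ccr=1<<(22>>2)=0x20, i.e. xcc.v set, so the
# "movvs %xcc" moves retain the entry at [$pwrtbl+5*32] (and at
# [$pwrtbl+13*32] for the second limb in load_b_pair), and indeed
# 5*4 + (22&3) = 22.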

########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
#			   const u64 *pwrtbl,int pwr,int stride);
#
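# Each pass of the .Lstride_$NUM loop below effectively computes, in
# Montgomery representation,
#
#	tp = tp^(2^5) * pwrtbl[window] mod np
#
# i.e. one step of a fixed 5-bit-window exponentiation ladder; pwr and
# stride are packed into a single 64-bit value carrying the exponent
# window bits and the bit position of the window to process, which is
# stepped down by 5 on every pass.
#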
sub generate_bn_pwr5_mont_t4() {
my $NUM=shift;
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_pwr5_mont_t4_$NUM
.align	32
bn_pwr5_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$tp
	mov	%i1,$np
	ld	[%i2+0],%f1	! load *n0
	ld	[%i2+4],%f0
	mov	%i3,$pwrtbl
	srl	%i4,%g0,%i4	! pack last arguments
	sllx	%i5,32,$pwr
	or	%i4,$pwr,$pwr
	fsrc2	%f0,%f60
___

# load tp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldd	[$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] #####################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp

	srlx	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%o7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	&load_ccr("%o7","%o5","%o4");
$code.=<<___;
	b	.Lstride_$NUM
	nop
.align	16
.Lstride_$NUM:
___
for($i=0; $i<14 && $i<$NUM; $i+=2) {
	&load_b_pair("%o7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
	&load_b_pair("%i7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	srax	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%i7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	&load_ccr("%i7","%o5","%o4",1);

# magic ################################################################
for($i=0; $i<5; $i++) {
$code.=<<___;
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif
	nop
___
}
$code.=<<___;
	wr	%o4,	%g0,	%ccr
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif

	srax	$pwr,	32,	%o4
#ifdef	__arch64__
	brgez	%o4,.Lstride_$NUM
	restore
	restore
	restore
	restore
	restore
#else
	brgez	%o4,.Lstride_$NUM
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Labort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	 and	%fp,$sentinel,$sentinel
	restore
	 and	$sentinel,1,%o7
	 and	%fp,$sentinel,$sentinel
	 srl	%fp,0,%fp		! just in case?
	 or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Ldone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
	std	@R[$i],[$tp+$i*8]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Ldone_$NUM:
	ret
	restore

.Labort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Labort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore
.type	bn_pwr5_mont_t4_$NUM, #function
.size	bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_pwr5_mont_t4($i);
}

{
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# int bn_mul_mont(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *bp,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is >=3
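#
# A hedged C model of the word-by-word Montgomery multiplication the
# .L1st/.Louter/.Linner code below implements ("u128" stands for an
# unsigned 128-bit accumulator, names mirror the registers; this is an
# illustration of the algorithm, not OpenSSL code):
#
#	ovf = 0;
#	for (i = 0; i < num; i++) {
#		m0  = bp[i];
#		m1  = ((i ? tp[0] : 0) + ap[0]*m0) * n0;  /* mod 2^64 */
#		hi0 = hi1 = 0;
#		for (j = 0; j < num; j++) {
#			u128 a = (u128)ap[j]*m0 + hi0 + (i ? tp[j] : 0);
#			u128 n = (u128)np[j]*m1 + hi1 + (BN_ULONG)a;
#			hi0 = (BN_ULONG)(a >> 64);
#			hi1 = (BN_ULONG)(n >> 64);
#			if (j) tp[j-1] = (BN_ULONG)n;	/* word 0 is zero
#							   by construction */
#		}
#		tp[num-1] = hi0 + hi1 + ovf;		/* mod 2^64 */
#		ovf = carry out of the line above;
#	}
#	/* .Lsub/.Lcopy: subtract np once and keep the difference iff
#	   (ovf:tp) >= np, copying in constant time */
#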
$code.=<<___;
.globl	bn_mul_mont_t4
.align	32
bn_mul_mont_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 byte
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1

	save	%sp,	%g1,	%sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	ldx	[$bp+0],	$m0	! m0=bp[0]
	sllx	$t1,	32,	$n0
	add	$bp,	8,	$bp
	or	$t0,	$n0,	$n0

	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st
	sub	$cnt,	8,	$cnt	! j--
!.L1st
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp

	ba	.Louter
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter:
	ldx	[$bp+0],	$m0	! m0=bp[i]
	add	$bp,	8,	$bp

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],		$tj	! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner
	sub	$cnt,	8,	$cnt
!.Linner
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter
	sub	$i,	8,	$i

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub:
	ldx	[$tp],		$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	ba	.Lcopy
	sub	$num,	8,	$cnt

.align	16
.Lcopy:					! conditional copy
	ldx	[$tp],		$tj
	ldx	[$rp+0],	$t2
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	movcs	%icc,	$tj,	$t2
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_t4, #function
.size	bn_mul_mont_t4, .-bn_mul_mont_t4
___

# int bn_mul_mont_gather5(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *pwrtbl,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num,	# caller ensures that num is >=3
		# int power);
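#
# Identical to bn_mul_mont_t4 above except that bp is the scattered
# power table and each bp[i] is fetched with the constant-time
# load_ccr/load_b selection, keyed off the 7th argument, power.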
$code.=<<___;
.globl	bn_mul_mont_gather5_t4
.align	32
bn_mul_mont_gather5_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 byte
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1
	LDPTR	[%sp+STACK_7thARG],	%g4	! load power, 7th argument

	save	%sp,	%g1,	%sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
	&load_ccr($bp,"%g4",$ccr);
	&load_b($bp,$m0,"%o7");		# m0=bp[0]

$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	sllx	$t1,	32,	$n0
	or	$t0,	$n0,	$n0

	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st_g5
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st_g5:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st_g5
	sub	$cnt,	8,	$cnt	! j--
!.L1st_g5
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp

	ba	.Louter_g5
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter_g5:
	wr	$ccr,	%g0,	%ccr
___
	&load_b($bp,$m0);		# m0=bp[i]
$code.=<<___;
	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],		$tj	! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner_g5
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner_g5:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner_g5
	sub	$cnt,	8,	$cnt
!.Linner_g5
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter_g5
	sub	$i,	8,	$i

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub_g5
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub_g5:
	ldx	[$tp],		$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub_g5
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	ba	.Lcopy_g5
	sub	$num,	8,	$cnt

.align	16
.Lcopy_g5:				! conditional copy
	ldx	[$tp],		$tj
	ldx	[$rp+0],	$t2
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	movcs	%icc,	$tj,	$t2
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy_g5
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_gather5_t4, #function
.size	bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
___
}

$code.=<<___;
.globl	bn_flip_t4
.align	32
bn_flip_t4:
.Loop_flip:
	ld	[%o1+0],	%o4
	sub	%o2,	1,	%o2
	ld	[%o1+4],	%o5
	add	%o1,	8,	%o1
	st	%o5,	[%o0+0]
	st	%o4,	[%o0+4]
	brnz	%o2,	.Loop_flip
	add	%o0,	8,	%o0
	retl
	nop
.type	bn_flip_t4, #function
.size	bn_flip_t4, .-bn_flip_t4

.globl	bn_flip_n_scatter5_t4
.align	32
bn_flip_n_scatter5_t4:
	sll	%o3,	3,	%o3
	srl	%o1,	1,	%o1
	add	%o3,	%o2,	%o2	! &pwrtbl[pwr]
	sub	%o1,	1,	%o1
.Loop_flip_n_scatter5:
	ld	[%o0+0],	%o4	! inp[i]
	ld	[%o0+4],	%o5
	add	%o0,	8,	%o0
	sllx	%o5,	32,	%o5
	or	%o4,	%o5,	%o5
	stx	%o5,	[%o2]
	add	%o2,	32*8,	%o2
	brnz	%o1,	.Loop_flip_n_scatter5
	sub	%o1,	1,	%o1
	retl
	nop
.type	bn_flip_n_scatter5_t4, #function
.size	bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4

.globl	bn_gather5_t4
.align	32
bn_gather5_t4:
___
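# bn_gather5_t4 copies one num-word entry out of the scattered power
# table: %o0 is the destination, %o1 the word count, %o2 the table and
# %o3 the entry index, selected with the constant-time load_ccr/load_b
# machinery above so the access pattern does not depend on the index.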
	&load_ccr("%o2","%o3","%g1");
$code.=<<___;
	sub	%o1,	1,	%o1
.Loop_gather5:
___
	&load_b("%o2","%g1");
$code.=<<___;
	stx	%g1,	[%o0]
	add	%o0,	8,	%o0
	brnz	%o1,	.Loop_gather5
	sub	%o1,	1,	%o1

	retl
	nop
.type	bn_gather5_t4, #function
.size	bn_gather5_t4, .-bn_gather5_t4

.asciz	"Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
.align	4
___

&emit_assembler();

close STDOUT;