1238384Sjkim#!/usr/bin/env perl
2238384Sjkim
3238384Sjkim# ====================================================================
4238384Sjkim# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and
6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further
7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/.
8238384Sjkim# ====================================================================
9238384Sjkim
10238384Sjkim# On PA-7100LC this module performs ~90-50% better, less for longer
11238384Sjkim# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
12238384Sjkim# that compiler utilized xmpyu instruction to perform 32x32=64-bit
13238384Sjkim# multiplication, which in turn means that "baseline" performance was
14238384Sjkim# optimal in respect to instruction set capabilities. Fair comparison
15238384Sjkim# with vendor compiler is problematic, because OpenSSL doesn't define
16238384Sjkim# BN_LLONG [presumably] for historical reasons, which drives compiler
17238384Sjkim# toward 4 times 16x16=32-bit multiplications [plus complementary
18238384Sjkim# shifts and additions] instead. This means that you should observe
19238384Sjkim# several times improvement over code generated by vendor compiler
20238384Sjkim# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
21238384Sjkim# improvement coefficient was never collected on PA-7100LC, or any
22238384Sjkim# other 1.1 CPU, because I don't have access to such machine with
23238384Sjkim# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
24238384Sjkim# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
25238384Sjkim# of ~5x on PA-8600.
26238384Sjkim#
27238384Sjkim# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
28238384Sjkim# reportedly ~2x faster than vendor compiler generated code [according
29238384Sjkim# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
30238384Sjkim# this implementation is actually 32-bit one, in the sense that it
31238384Sjkim# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
32238384Sjkim# 64-bit BN_LONGs... How do they interoperate then? No problem. This
33238384Sjkim# module picks halves of 64-bit values in reverse order and pretends
34238384Sjkim# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
35238384Sjkim# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
36238384Sjkim# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
37238384Sjkim# i.e. there is no "wider" multiplication like on most other 64-bit
38238384Sjkim# platforms. This means that even being effectively 32-bit, this
39238384Sjkim# implementation performs "64-bit" computational task in same amount
40238384Sjkim# of arithmetic operations, most notably multiplications. It requires
41238384Sjkim# more memory references, most notably to tp[num], but this doesn't
42238384Sjkim# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
43261037Sjkim# 2.0 code path provides virtually same performance as pa-risc2[W].s:
44238384Sjkim# it's ~10% better for shortest key length and ~10% worse for longest
45238384Sjkim# one.
46238384Sjkim#
47238384Sjkim# In case it wasn't clear. The module has two distinct code paths:
48238384Sjkim# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
49238384Sjkim# additions and 64-bit integer loads, not to mention specific
50238384Sjkim# instruction scheduling. In 64-bit build naturally only 2.0 code path
51238384Sjkim# is assembled. In 32-bit application context both code paths are
52238384Sjkim# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
53238384Sjkim# is taken automatically. Also, in 32-bit build the module imposes
54238384Sjkim# a couple of limitations: vector lengths have to be even and vector
55238384Sjkim# addresses have to be 64-bit aligned. Normally neither is a problem:
56238384Sjkim# most common key lengths are even and vectors are commonly malloc-ed,
57238384Sjkim# which ensures alignment.
58238384Sjkim#
59238384Sjkim# Special thanks to polarhome.com for providing HP-UX account on
60238384Sjkim# PA-RISC 1.1 machine, and to correspondent who chose to remain
61238384Sjkim# anonymous for testing the code on PA-RISC 2.0 machine.
62238384Sjkim
# Capture the directory part of the script path ($0), including its trailing
# slash or backslash. $dir is used further down to locate opensslconf.h.
63238384Sjkim$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
64238384Sjkim
# Command line: the first argument is the flavour (a value containing "64"
# selects the 64-bit parameters below), the second is the output file name.
65238384Sjkim$flavour = shift;
66238384Sjkim$output = shift;
67238384Sjkim
# Redirect STDOUT to the requested output file.
# NOTE(review): two-argument open whose result is not checked - if $output is
# missing or cannot be created the failure is silent. Confirm whether that
# fallback is relied upon before tightening this.
68238384Sjkimopen STDOUT,">$output";
69238384Sjkim
# Select the ABI-dependent parameters from the flavour string.
#   $LEVEL        - operand of the .LEVEL directive in the generated code
#   $SIZE_T       - size in bytes of one saved general-register slot
#   $FRAME_MARKER - size in bytes of the frame marker area
#   $SAVED_RP     - distance below %sp at which %r2 is saved in the prologue
#   $PUSH/$POP    - store/load mnemonics for a register-sized value;
#                   $PUSHMA/$POPMB are the base-modifying forms
#   $BN_SZ        - size in bytes of a BN_LONG; selects which of the
#                   "if ($BN_SZ==4)" / "if ($BN_SZ==8)" fragments are emitted
70238384Sjkimif ($flavour =~ /64/) {
71238384Sjkim	$LEVEL		="2.0W";
72238384Sjkim	$SIZE_T		=8;
73238384Sjkim	$FRAME_MARKER	=80;
74238384Sjkim	$SAVED_RP	=16;
75238384Sjkim	$PUSH		="std";
76238384Sjkim	$PUSHMA		="std,ma";
77238384Sjkim	$POP		="ldd";
78238384Sjkim	$POPMB		="ldd,mb";
79238384Sjkim	$BN_SZ		=$SIZE_T;
80238384Sjkim} else {
81238384Sjkim	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
82238384Sjkim	$SIZE_T		=4;
83238384Sjkim	$FRAME_MARKER	=48;
84238384Sjkim	$SAVED_RP	=20;
85238384Sjkim	$PUSH		="stw";
86238384Sjkim	$PUSHMA		="stwm";
87238384Sjkim	$POP		="ldw";
88238384Sjkim	$POPMB		="ldwm";
89238384Sjkim	$BN_SZ		=$SIZE_T;
	# A 32-bit build may still use 8-byte BN_LONGs: scan opensslconf.h
	# (two directories above this script) for a SIXTY_FOUR_BIT define and,
	# if found, switch to 8-byte limbs and .LEVEL 2.0. If the header cannot
	# be opened, the 4-byte defaults set above are kept.
90238384Sjkim	if (open CONF,"<${dir}../../opensslconf.h") {
91238384Sjkim	    while(<CONF>) {
92238384Sjkim		if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
93238384Sjkim		    $BN_SZ=8;
94238384Sjkim		    $LEVEL="2.0";
95238384Sjkim		    last;
96238384Sjkim		}
97238384Sjkim	    }
98238384Sjkim	    close CONF;
99238384Sjkim	}
100238384Sjkim}
101238384Sjkim
# Stack frame size. $LOCALS (= 8*$SIZE_T) is the offset from $fp at which the
# 32 bytes of local variables begin, i.e. just past the eight saved-register
# slots written by the prologue.
102238384Sjkim$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
103238384Sjkim				#                [+ argument transfer]
104238384Sjkim$LOCALS=$FRAME-$FRAME_MARKER;
105238384Sjkim$FRAME+=32;			# local variables
106238384Sjkim
# Pointer into the temporary vector tp[] and two scratch registers that
# receive words loaded from it.
107238384Sjkim$tp="%r31";
108238384Sjkim$ti1="%r29";
109238384Sjkim$ti0="%r28";
110238384Sjkim
# Arguments: rp, ap, bp and np arrive in general registers (see the .EXPORT
# line in the generated code); n0 and num are loaded from the stack when
# $SIZE_T is 4.
111238384Sjkim$rp="%r26";
112238384Sjkim$ap="%r25";
113238384Sjkim$bp="%r24";
114238384Sjkim$np="%r23";
115238384Sjkim$n0="%r22";	# passed through stack in 32-bit
116238384Sjkim$num="%r21";	# passed through stack in 32-bit
# $idx is the running (negative, counting up to zero) byte index into ap/np;
# $arrsz is the vector size in bytes (4 * $num).
117238384Sjkim$idx="%r20";
118238384Sjkim$arrsz="%r19";
119238384Sjkim
# Integer copies of the 64-bit products read back from the transfer area:
# nm0/nm1 hold np[j]*m, ab0/ab1 hold ap[j]*bp[i].
120238384Sjkim$nm1="%r7";
121238384Sjkim$nm0="%r6";
122238384Sjkim$ab1="%r5";
123238384Sjkim$ab0="%r4";
124238384Sjkim
# Frame pointer and the two carry/high-word registers. %r2 (return pointer)
# is saved by the prologue before being reused as $hi1.
125238384Sjkim$fp="%r3";
126238384Sjkim$hi1="%r2";
127238384Sjkim$hi0="%r1";
128238384Sjkim
# $xfer shares its register with $n0: n0 is consumed (loaded into the FPU)
# before $xfer is first set up.
129238384Sjkim$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s
130238384Sjkim
# Floating-point registers: m (and, aliased, the tp[0] temporary), bp[i] and
# n0 in the two halves of %fr5, the ap[]/np[] word pairs, and their products.
131238384Sjkim$fm0="%fr4";	$fti=$fm0;
132238384Sjkim$fbi="%fr5L";
133238384Sjkim$fn0="%fr5R";
134238384Sjkim$fai="%fr6";	$fab0="%fr7";	$fab1="%fr8";
135238384Sjkim$fni="%fr9";	$fnm0="%fr10";	$fnm1="%fr11";
136238384Sjkim
# Start of the generated assembly: section/level directives, the exported
# bn_mul_mont entry point and its prologue. The prologue saves %r2 and
# %r3..%r10, allocates $FRAME bytes of stack and points $fp at the base of
# the new frame. Expressions in backticks are left as text here; presumably
# they are evaluated by a later pass outside this view - TODO confirm.
137238384Sjkim$code=<<___;
138238384Sjkim	.LEVEL	$LEVEL
139238384Sjkim	.SPACE	\$TEXT\$
140238384Sjkim	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
141238384Sjkim
142238384Sjkim	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
143238384Sjkim	.ALIGN	64
144238384Sjkimbn_mul_mont
145238384Sjkim	.PROC
146238384Sjkim	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
147238384Sjkim	.ENTRY
148238384Sjkim	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
149238384Sjkim	$PUSHMA	%r3,$FRAME(%sp)
150238384Sjkim	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
151238384Sjkim	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
152238384Sjkim	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
153238384Sjkim	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
154238384Sjkim	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
155238384Sjkim	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
156238384Sjkim	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
157238384Sjkim	ldo	-$FRAME(%sp),$fp
158238384Sjkim___
# Emitted only when $SIZE_T is 4: n0 and num are not in registers, so load
# them from the stack at fixed offsets below $fp. The two nops only pad for
# alignment.
159238384Sjkim$code.=<<___ if ($SIZE_T==4);
160238384Sjkim	ldw	`-$FRAME_MARKER-4`($fp),$n0
161238384Sjkim	ldw	`-$FRAME_MARKER-8`($fp),$num
162238384Sjkim	nop
163238384Sjkim	nop					; alignment
164238384Sjkim___
# 4-byte BN_LONG build: branch to L$abort with 0 in %r28 ("unhandled") unless
# num passes the length check (compared against 6), num is even, and both ap
# and np are 64-bit aligned. Then load n0[0] and bp[0] into the FPU, advancing
# bp by one word.
165238384Sjkim$code.=<<___ if ($BN_SZ==4);
166238384Sjkim	comiclr,<=	6,$num,%r0		; are vectors long enough?
167238384Sjkim	b		L\$abort
168238384Sjkim	ldi		0,%r28			; signal "unhandled"
169238384Sjkim	add,ev		%r0,$num,$num		; is $num even?
170238384Sjkim	b		L\$abort
171238384Sjkim	nop
172238384Sjkim	or		$ap,$np,$ti1
173238384Sjkim	extru,=		$ti1,31,3,%r0		; are ap and np 64-bit aligned?
174238384Sjkim	b		L\$abort
175238384Sjkim	nop
176238384Sjkim	nop					; alignment
177238384Sjkim	nop
178238384Sjkim
179238384Sjkim	fldws		0($n0),${fn0}
180238384Sjkim	fldws,ma	4($bp),${fbi}		; bp[0]
181238384Sjkim___
# 8-byte BN_LONG build: branch to L$abort with 0 in %r28 when the vectors are
# too short (compared against 3), then double num because the core works on
# 32-bit words. Only the low 32-bit half of n0 is used, and bp[0] is read from
# offset 4, so the 64-bit limbs are consumed in flipped word order.
182238384Sjkim$code.=<<___ if ($BN_SZ==8);
183238384Sjkim	comib,>		3,$num,L\$abort		; are vectors long enough?
184238384Sjkim	ldi		0,%r28			; signal "unhandled"
185238384Sjkim	addl		$num,$num,$num		; I operate on 32-bit values
186238384Sjkim
187238384Sjkim	fldws		4($n0),${fn0}		; only low part of n0
188238384Sjkim	fldws		4($bp),${fbi}		; bp[0] in flipped word order
189238384Sjkim___
# Common setup for both code paths:
#  - load ap[0,1] and np[0,1];
#  - $arrsz = 4*num; grow the stack by (arrsz+36) rounded down to a multiple
#    of 32 to hold tp[num+1], and save $fp just below the new %sp;
#  - point $xfer into the locals area and $tp just past it;
#  - start ap[0]*bp[0], ap[1]*bp[0] and m = n0 * low word of ap[0]*bp[0];
#  - move ap and np to the end of their vectors so a negative $idx counting
#    up to zero walks them;
#  - spill the first four products to the transfer area and prefetch
#    ap[2,3] / np[2,3].
190238384Sjkim$code.=<<___;
191238384Sjkim	fldds		0($ap),${fai}		; ap[0,1]
192238384Sjkim	fldds		0($np),${fni}		; np[0,1]
193238384Sjkim
194238384Sjkim	sh2addl		$num,%r0,$arrsz
195238384Sjkim	ldi		31,$hi0
196238384Sjkim	ldo		36($arrsz),$hi1		; space for tp[num+1]
197238384Sjkim	andcm		$hi1,$hi0,$hi1		; align
198238384Sjkim	addl		$hi1,%sp,%sp
199238384Sjkim	$PUSH		$fp,-$SIZE_T(%sp)
200238384Sjkim
201238384Sjkim	ldo		`$LOCALS+16`($fp),$xfer
202238384Sjkim	ldo		`$LOCALS+32+4`($fp),$tp
203238384Sjkim
204238384Sjkim	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
205238384Sjkim	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
206238384Sjkim	xmpyu		${fn0},${fab0}R,${fm0}
207238384Sjkim
208238384Sjkim	addl		$arrsz,$ap,$ap		; point at the end
209238384Sjkim	addl		$arrsz,$np,$np
210238384Sjkim	subi		0,$arrsz,$idx		; j=0
211238384Sjkim	ldo		8($idx),$idx		; j++++
212238384Sjkim
213238384Sjkim	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
214238384Sjkim	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
215238384Sjkim	fstds		${fab0},-16($xfer)
216238384Sjkim	fstds		${fnm0},-8($xfer)
217238384Sjkim	fstds		${fab1},0($xfer)
218238384Sjkim	fstds		${fnm1},8($xfer)
219238384Sjkim	 flddx		$idx($ap),${fai}	; ap[2,3]
220238384Sjkim	 flddx		$idx($np),${fni}	; np[2,3]
221238384Sjkim___
# 4-byte BN_LONG build contains both code paths, so probe the CPU at run time
# and branch to L$parisc11 unless the PA-RISC 2.0 path that follows can be
# used. The probe relies on $hi0 still holding 31 from the setup code.
222238384Sjkim$code.=<<___ if ($BN_SZ==4);
223238384Sjkim	mtctl		$hi0,%cr11		; $hi0 still holds 31
224238384Sjkim	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
225238384Sjkim	b		L\$parisc11
226238384Sjkim	nop
227238384Sjkim___
# PA-RISC 2.0 path, first pass (bp[0]). Products are computed in the FPU with
# xmpyu, passed through the $xfer area into general registers with ldd, split
# into high and low 32-bit parts with extrd,u and accumulated with carry-free
# 64-bit adds. The L$1st loop handles two 32-bit words per iteration and
# stores the reduced words to tp[]. The code after the loop finishes the last
# word pair, decrements the outer counter and resets $idx for the next pass.
# Instruction order is hand-scheduled; do not reorder.
228238384Sjkim$code.=<<___;					# PA-RISC 2.0 code-path
229238384Sjkim	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
230238384Sjkim	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
231238384Sjkim	ldd		-16($xfer),$ab0
232238384Sjkim	fstds		${fab0},-16($xfer)
233238384Sjkim
234238384Sjkim	extrd,u		$ab0,31,32,$hi0
235238384Sjkim	extrd,u		$ab0,63,32,$ab0
236238384Sjkim	ldd		-8($xfer),$nm0
237238384Sjkim	fstds		${fnm0},-8($xfer)
238238384Sjkim	 ldo		8($idx),$idx		; j++++
239238384Sjkim	 addl		$ab0,$nm0,$nm0		; low part is discarded
240238384Sjkim	 extrd,u	$nm0,31,32,$hi1
241238384Sjkim
242238384SjkimL\$1st
243238384Sjkim	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
244238384Sjkim	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
245238384Sjkim	ldd		0($xfer),$ab1
246238384Sjkim	fstds		${fab1},0($xfer)
247238384Sjkim	 addl		$hi0,$ab1,$ab1
248238384Sjkim	 extrd,u	$ab1,31,32,$hi0
249238384Sjkim	ldd		8($xfer),$nm1
250238384Sjkim	fstds		${fnm1},8($xfer)
251238384Sjkim	 extrd,u	$ab1,63,32,$ab1
252238384Sjkim	 addl		$hi1,$nm1,$nm1
253238384Sjkim	flddx		$idx($ap),${fai}	; ap[j,j+1]
254238384Sjkim	flddx		$idx($np),${fni}	; np[j,j+1]
255238384Sjkim	 addl		$ab1,$nm1,$nm1
256238384Sjkim	 extrd,u	$nm1,31,32,$hi1
257238384Sjkim
258238384Sjkim	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
259238384Sjkim	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
260238384Sjkim	ldd		-16($xfer),$ab0
261238384Sjkim	fstds		${fab0},-16($xfer)
262238384Sjkim	 addl		$hi0,$ab0,$ab0
263238384Sjkim	 extrd,u	$ab0,31,32,$hi0
264238384Sjkim	ldd		-8($xfer),$nm0
265238384Sjkim	fstds		${fnm0},-8($xfer)
266238384Sjkim	 extrd,u	$ab0,63,32,$ab0
267238384Sjkim	 addl		$hi1,$nm0,$nm0
268238384Sjkim	stw		$nm1,-4($tp)		; tp[j-1]
269238384Sjkim	 addl		$ab0,$nm0,$nm0
270238384Sjkim	 stw,ma		$nm0,8($tp)		; tp[j-1]
271238384Sjkim	addib,<>	8,$idx,L\$1st		; j++++
272238384Sjkim	 extrd,u	$nm0,31,32,$hi1
273238384Sjkim
274238384Sjkim	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
275238384Sjkim	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
276238384Sjkim	ldd		0($xfer),$ab1
277238384Sjkim	fstds		${fab1},0($xfer)
278238384Sjkim	 addl		$hi0,$ab1,$ab1
279238384Sjkim	 extrd,u	$ab1,31,32,$hi0
280238384Sjkim	ldd		8($xfer),$nm1
281238384Sjkim	fstds		${fnm1},8($xfer)
282238384Sjkim	 extrd,u	$ab1,63,32,$ab1
283238384Sjkim	 addl		$hi1,$nm1,$nm1
284238384Sjkim	ldd		-16($xfer),$ab0
285238384Sjkim	 addl		$ab1,$nm1,$nm1
286238384Sjkim	ldd		-8($xfer),$nm0
287238384Sjkim	 extrd,u	$nm1,31,32,$hi1
288238384Sjkim
289238384Sjkim	 addl		$hi0,$ab0,$ab0
290238384Sjkim	 extrd,u	$ab0,31,32,$hi0
291238384Sjkim	stw		$nm1,-4($tp)		; tp[j-1]
292238384Sjkim	 extrd,u	$ab0,63,32,$ab0
293238384Sjkim	 addl		$hi1,$nm0,$nm0
294238384Sjkim	ldd		0($xfer),$ab1
295238384Sjkim	 addl		$ab0,$nm0,$nm0
296238384Sjkim	ldd,mb		8($xfer),$nm1
297238384Sjkim	 extrd,u	$nm0,31,32,$hi1
298238384Sjkim	stw,ma		$nm0,8($tp)		; tp[j-1]
299238384Sjkim
300238384Sjkim	ldo		-1($num),$num		; i--
301238384Sjkim	subi		0,$arrsz,$idx		; j=0
302238384Sjkim___
# Load bp[1] for the second pass. With 4-byte BN_LONGs bp is simply advanced
# by one word; with 8-byte BN_LONGs the other half of the first limb is read
# from offset 0, because bp[0] was taken from offset 4.
303238384Sjkim$code.=<<___ if ($BN_SZ==4);
304238384Sjkim	fldws,ma	4($bp),${fbi}		; bp[1]
305238384Sjkim___
306238384Sjkim$code.=<<___ if ($BN_SZ==8);
307238384Sjkim	fldws		0($bp),${fbi}		; bp[1] in flipped word order
308238384Sjkim___
# PA-RISC 2.0 path, passes for bp[i] with i >= 1.
# Before L$outer: the last word of the previous pass is folded into tp[],
# ap[0]*bp[i] + tp[0] is formed in the FPU through integer<->double
# conversion (giving a 33-bit value), and m is derived from it with n0.
# L$outer seeds the first word pair, accounting for the carry bit of that
# 33-bit value. L$inner mirrors the first-pass loop with tp[j] added into
# each ap[j]*bp[i] word. The closing addib,= leaves for L$outerdone when the
# outer counter reaches zero; its delay slot resets $idx.
# Instruction order is hand-scheduled; do not reorder.
309238384Sjkim$code.=<<___;
310238384Sjkim	 flddx		$idx($ap),${fai}	; ap[0,1]
311238384Sjkim	 flddx		$idx($np),${fni}	; np[0,1]
312238384Sjkim	 fldws		8($xfer),${fti}R	; tp[0]
313238384Sjkim	addl		$hi0,$ab1,$ab1
314238384Sjkim	 extrd,u	$ab1,31,32,$hi0
315238384Sjkim	 extrd,u	$ab1,63,32,$ab1
316238384Sjkim	 ldo		8($idx),$idx		; j++++
317238384Sjkim	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
318238384Sjkim	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
319238384Sjkim	addl		$hi1,$nm1,$nm1
320238384Sjkim	addl		$ab1,$nm1,$nm1
321238384Sjkim	extrd,u		$nm1,31,32,$hi1
322238384Sjkim	 fstws,mb	${fab0}L,-8($xfer)	; save high part
323238384Sjkim	stw		$nm1,-4($tp)		; tp[j-1]
324238384Sjkim
325238384Sjkim	 fcpy,sgl	%fr0,${fti}L		; zero high part
326238384Sjkim	 fcpy,sgl	%fr0,${fab0}L
327238384Sjkim	addl		$hi1,$hi0,$hi0
328238384Sjkim	extrd,u		$hi0,31,32,$hi1
329238384Sjkim	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
330238384Sjkim	 fcnvxf,dbl,dbl	${fab0},${fab0}
331238384Sjkim	stw		$hi0,0($tp)
332238384Sjkim	stw		$hi1,4($tp)
333238384Sjkim
334238384Sjkim	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
335238384Sjkim	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
336238384Sjkim	xmpyu		${fn0},${fab0}R,${fm0}
337238384Sjkim	ldo		`$LOCALS+32+4`($fp),$tp
338238384SjkimL\$outer
339238384Sjkim	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
340238384Sjkim	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
341238384Sjkim	fstds		${fab0},-16($xfer)	; 33-bit value
342238384Sjkim	fstds		${fnm0},-8($xfer)
343238384Sjkim	 flddx		$idx($ap),${fai}	; ap[2]
344238384Sjkim	 flddx		$idx($np),${fni}	; np[2]
345238384Sjkim	 ldo		8($idx),$idx		; j++++
346238384Sjkim	ldd		-16($xfer),$ab0		; 33-bit value
347238384Sjkim	ldd		-8($xfer),$nm0
348238384Sjkim	ldw		0($xfer),$hi0		; high part
349238384Sjkim
350238384Sjkim	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
351238384Sjkim	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
352238384Sjkim	 extrd,u	$ab0,31,32,$ti0		; carry bit
353238384Sjkim	 extrd,u	$ab0,63,32,$ab0
354238384Sjkim	fstds		${fab1},0($xfer)
355238384Sjkim	 addl		$ti0,$hi0,$hi0		; account carry bit
356238384Sjkim	fstds		${fnm1},8($xfer)
357238384Sjkim	 addl		$ab0,$nm0,$nm0		; low part is discarded
358238384Sjkim	ldw		0($tp),$ti1		; tp[1]
359238384Sjkim	 extrd,u	$nm0,31,32,$hi1
360238384Sjkim	fstds		${fab0},-16($xfer)
361238384Sjkim	fstds		${fnm0},-8($xfer)
362238384Sjkim
363238384SjkimL\$inner
364238384Sjkim	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
365238384Sjkim	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
366238384Sjkim	ldd		0($xfer),$ab1
367238384Sjkim	fstds		${fab1},0($xfer)
368238384Sjkim	 addl		$hi0,$ti1,$ti1
369238384Sjkim	 addl		$ti1,$ab1,$ab1
370238384Sjkim	ldd		8($xfer),$nm1
371238384Sjkim	fstds		${fnm1},8($xfer)
372238384Sjkim	 extrd,u	$ab1,31,32,$hi0
373238384Sjkim	 extrd,u	$ab1,63,32,$ab1
374238384Sjkim	flddx		$idx($ap),${fai}	; ap[j,j+1]
375238384Sjkim	flddx		$idx($np),${fni}	; np[j,j+1]
376238384Sjkim	 addl		$hi1,$nm1,$nm1
377238384Sjkim	 addl		$ab1,$nm1,$nm1
378238384Sjkim	ldw		4($tp),$ti0		; tp[j]
379238384Sjkim	stw		$nm1,-4($tp)		; tp[j-1]
380238384Sjkim
381238384Sjkim	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
382238384Sjkim	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
383238384Sjkim	ldd		-16($xfer),$ab0
384238384Sjkim	fstds		${fab0},-16($xfer)
385238384Sjkim	 addl		$hi0,$ti0,$ti0
386238384Sjkim	 addl		$ti0,$ab0,$ab0
387238384Sjkim	ldd		-8($xfer),$nm0
388238384Sjkim	fstds		${fnm0},-8($xfer)
389238384Sjkim	 extrd,u	$ab0,31,32,$hi0
390238384Sjkim	 extrd,u	$nm1,31,32,$hi1
391238384Sjkim	ldw		8($tp),$ti1		; tp[j]
392238384Sjkim	 extrd,u	$ab0,63,32,$ab0
393238384Sjkim	 addl		$hi1,$nm0,$nm0
394238384Sjkim	 addl		$ab0,$nm0,$nm0
395238384Sjkim	 stw,ma		$nm0,8($tp)		; tp[j-1]
396238384Sjkim	addib,<>	8,$idx,L\$inner		; j++++
397238384Sjkim	 extrd,u	$nm0,31,32,$hi1
398238384Sjkim
399238384Sjkim	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
400238384Sjkim	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
401238384Sjkim	ldd		0($xfer),$ab1
402238384Sjkim	fstds		${fab1},0($xfer)
403238384Sjkim	 addl		$hi0,$ti1,$ti1
404238384Sjkim	 addl		$ti1,$ab1,$ab1
405238384Sjkim	ldd		8($xfer),$nm1
406238384Sjkim	fstds		${fnm1},8($xfer)
407238384Sjkim	 extrd,u	$ab1,31,32,$hi0
408238384Sjkim	 extrd,u	$ab1,63,32,$ab1
409238384Sjkim	ldw		4($tp),$ti0		; tp[j]
410238384Sjkim	 addl		$hi1,$nm1,$nm1
411238384Sjkim	 addl		$ab1,$nm1,$nm1
412238384Sjkim	ldd		-16($xfer),$ab0
413238384Sjkim	ldd		-8($xfer),$nm0
414238384Sjkim	 extrd,u	$nm1,31,32,$hi1
415238384Sjkim
416238384Sjkim	addl		$hi0,$ab0,$ab0
417238384Sjkim	 addl		$ti0,$ab0,$ab0
418238384Sjkim	 stw		$nm1,-4($tp)		; tp[j-1]
419238384Sjkim	 extrd,u	$ab0,31,32,$hi0
420238384Sjkim	ldw		8($tp),$ti1		; tp[j]
421238384Sjkim	 extrd,u	$ab0,63,32,$ab0
422238384Sjkim	 addl		$hi1,$nm0,$nm0
423238384Sjkim	ldd		0($xfer),$ab1
424238384Sjkim	 addl		$ab0,$nm0,$nm0
425238384Sjkim	ldd,mb		8($xfer),$nm1
426238384Sjkim	 extrd,u	$nm0,31,32,$hi1
427238384Sjkim	 stw,ma		$nm0,8($tp)		; tp[j-1]
428238384Sjkim
429238384Sjkim	addib,=		-1,$num,L\$outerdone	; i--
430238384Sjkim	subi		0,$arrsz,$idx		; j=0
431238384Sjkim___
# Fetch the next bp[i]. With 4-byte BN_LONGs bp advances by one word. With
# 8-byte BN_LONGs bp is stepped by 12 or by -4, chosen by the parity of the
# remaining count in $num, so that the two 32-bit halves of each limb are
# visited in flipped word order.
432238384Sjkim$code.=<<___ if ($BN_SZ==4);
433238384Sjkim	fldws,ma	4($bp),${fbi}		; bp[i]
434238384Sjkim___
435238384Sjkim$code.=<<___ if ($BN_SZ==8);
436238384Sjkim	ldi		12,$ti0			; bp[i] in flipped word order
437238384Sjkim	addl,ev		%r0,$num,$num
438238384Sjkim	ldi		-4,$ti0
439238384Sjkim	addl		$ti0,$bp,$bp
440238384Sjkim	fldws		0($bp),${fbi}
441238384Sjkim___
# PA-RISC 2.0 path, end of an outer iteration: fold in the last word pair
# (including tp[j]), store the top two words of tp[], prepare
# ap[0]*bp[i] + tp[0] and m for the next pass exactly as before the loop, and
# branch back to L$outer (the delay slot rewinds $tp).
# L$outerdone is reached after the final pass: it performs the same last-word
# folding without starting a new pass, then points $tp at the start of tp[]
# and clears the borrow for the subtraction that follows.
442238384Sjkim$code.=<<___;
443238384Sjkim	 flddx		$idx($ap),${fai}	; ap[0]
444238384Sjkim	addl		$hi0,$ab1,$ab1
445238384Sjkim	 flddx		$idx($np),${fni}	; np[0]
446238384Sjkim	 fldws		8($xfer),${fti}R	; tp[0]
447238384Sjkim	addl		$ti1,$ab1,$ab1
448238384Sjkim	extrd,u		$ab1,31,32,$hi0
449238384Sjkim	extrd,u		$ab1,63,32,$ab1
450238384Sjkim
451238384Sjkim	 ldo		8($idx),$idx		; j++++
452238384Sjkim	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
453238384Sjkim	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
454238384Sjkim	ldw		4($tp),$ti0		; tp[j]
455238384Sjkim
456238384Sjkim	addl		$hi1,$nm1,$nm1
457238384Sjkim	 fstws,mb	${fab0}L,-8($xfer)	; save high part
458238384Sjkim	addl		$ab1,$nm1,$nm1
459238384Sjkim	extrd,u		$nm1,31,32,$hi1
460238384Sjkim	 fcpy,sgl	%fr0,${fti}L		; zero high part
461238384Sjkim	 fcpy,sgl	%fr0,${fab0}L
462238384Sjkim	stw		$nm1,-4($tp)		; tp[j-1]
463238384Sjkim
464238384Sjkim	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
465238384Sjkim	 fcnvxf,dbl,dbl	${fab0},${fab0}
466238384Sjkim	addl		$hi1,$hi0,$hi0
467238384Sjkim	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
468238384Sjkim	addl		$ti0,$hi0,$hi0
469238384Sjkim	extrd,u		$hi0,31,32,$hi1
470238384Sjkim	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
471238384Sjkim	stw		$hi0,0($tp)
472238384Sjkim	stw		$hi1,4($tp)
473238384Sjkim	 xmpyu		${fn0},${fab0}R,${fm0}
474238384Sjkim
475238384Sjkim	b		L\$outer
476238384Sjkim	ldo		`$LOCALS+32+4`($fp),$tp
477238384Sjkim
478238384SjkimL\$outerdone
479238384Sjkim	addl		$hi0,$ab1,$ab1
480238384Sjkim	addl		$ti1,$ab1,$ab1
481238384Sjkim	extrd,u		$ab1,31,32,$hi0
482238384Sjkim	extrd,u		$ab1,63,32,$ab1
483238384Sjkim
484238384Sjkim	ldw		4($tp),$ti0		; tp[j]
485238384Sjkim
486238384Sjkim	addl		$hi1,$nm1,$nm1
487238384Sjkim	addl		$ab1,$nm1,$nm1
488238384Sjkim	extrd,u		$nm1,31,32,$hi1
489238384Sjkim	stw		$nm1,-4($tp)		; tp[j-1]
490238384Sjkim
491238384Sjkim	addl		$hi1,$hi0,$hi0
492238384Sjkim	addl		$ti0,$hi0,$hi0
493238384Sjkim	extrd,u		$hi0,31,32,$hi1
494238384Sjkim	stw		$hi0,0($tp)
495238384Sjkim	stw		$hi1,4($tp)
496238384Sjkim
497238384Sjkim	ldo		`$LOCALS+32`($fp),$tp
498238384Sjkim	sub		%r0,%r0,%r0		; clear borrow
499238384Sjkim___
# 4-byte BN_LONG build, final subtraction rp[] = tp[] - np[] with borrow, one
# word per L$sub iteration. If rp is not 64-bit aligned, control goes to
# L$sub_pa11 instead. After the loop the top word of tp minus the outstanding
# borrow is left in $hi1 for the copy step that follows.
500238384Sjkim$code.=<<___ if ($BN_SZ==4);
501238384Sjkim	ldws,ma		4($tp),$ti0
502238384Sjkim	extru,=		$rp,31,3,%r0		; is rp 64-bit aligned?
503238384Sjkim	b		L\$sub_pa11
504238384Sjkim	addl		$tp,$arrsz,$tp
505238384SjkimL\$sub
506238384Sjkim	ldwx		$idx($np),$hi0
507238384Sjkim	subb		$ti0,$hi0,$hi1
508238384Sjkim	ldwx		$idx($tp),$ti0
509238384Sjkim	addib,<>	4,$idx,L\$sub
510238384Sjkim	stws,ma		$hi1,4($rp)
511238384Sjkim
512238384Sjkim	subb		$ti0,%r0,$hi1
513238384Sjkim___
# 8-byte BN_LONG build, final subtraction rp[] = tp[] - np[] with borrow, one
# 64-bit limb per L$sub iteration. tp[] was produced as 32-bit words in
# flipped order, so each limb is rotated by 32 bits first and the flipped
# value is written back to tp[] for the copy step. After the loop the carry
# word is extracted and, less the outstanding borrow, left in $hi1.
514238384Sjkim$code.=<<___ if ($BN_SZ==8);
515238384Sjkim	ldd,ma		8($tp),$ti0
516238384SjkimL\$sub
517238384Sjkim	ldd		$idx($np),$hi0
518238384Sjkim	shrpd		$ti0,$ti0,32,$ti0	; flip word order
519238384Sjkim	std		$ti0,-8($tp)		; save flipped value
520238384Sjkim	sub,db		$ti0,$hi0,$hi1
521238384Sjkim	ldd,ma		8($tp),$ti0
522238384Sjkim	addib,<>	8,$idx,L\$sub
523238384Sjkim	std,ma		$hi1,8($rp)
524238384Sjkim
525238384Sjkim	extrd,u		$ti0,31,32,$ti0		; carry in flipped word order
526238384Sjkim	sub,db		$ti0,%r0,$hi1
527238384Sjkim___
# Final select-and-wipe. rp is rewound to its start and L$copy walks tp[] and
# rp[] eight bytes at a time. Each step loads both values, overwrites the tp
# slot with zero, and stores back to rp either the difference already there
# (when $hi1 is zero) or the tp value (when $hi1 is non-zero). Both loads and
# the store happen on every iteration regardless of which value is kept.
528238384Sjkim$code.=<<___;
529337982Sjkim	ldo		`$LOCALS+32`($fp),$tp
530238384Sjkim	sub		$rp,$arrsz,$rp		; rewind rp
531238384Sjkim	subi		0,$arrsz,$idx
532238384SjkimL\$copy
533337982Sjkim	ldd		0($tp),$ti0
534337982Sjkim	ldd		0($rp),$hi0
535238384Sjkim	std,ma		%r0,8($tp)
536337982Sjkim	comiclr,=	0,$hi1,%r0
537337982Sjkim	copy		$ti0,$hi0
538337982Sjkim	addib,<>	8,$idx,L\$copy
539337982Sjkim	std,ma		$hi0,8($rp)
540238384Sjkim___
541238384Sjkim
542238384Sjkimif ($BN_SZ==4) {				# PA-RISC 1.1 code-path
543238384Sjkim$ablo=$ab0;
544238384Sjkim$abhi=$ab1;
545238384Sjkim$nmlo0=$nm0;
546238384Sjkim$nmhi0=$nm1;
547238384Sjkim$nmlo1="%r9";
548238384Sjkim$nmhi1="%r8";
549238384Sjkim
550238384Sjkim$code.=<<___;
551238384Sjkim	b		L\$done
552238384Sjkim	nop
553238384Sjkim
554238384Sjkim	.ALIGN		8
555238384SjkimL\$parisc11
556238384Sjkim	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
557238384Sjkim	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
558238384Sjkim	ldw		-12($xfer),$ablo
559238384Sjkim	ldw		-16($xfer),$hi0
560238384Sjkim	ldw		-4($xfer),$nmlo0
561238384Sjkim	ldw		-8($xfer),$nmhi0
562238384Sjkim	fstds		${fab0},-16($xfer)
563238384Sjkim	fstds		${fnm0},-8($xfer)
564238384Sjkim
565238384Sjkim	 ldo		8($idx),$idx		; j++++
566238384Sjkim	 add		$ablo,$nmlo0,$nmlo0	; discarded
567238384Sjkim	 addc		%r0,$nmhi0,$hi1
568238384Sjkim	ldw		4($xfer),$ablo
569238384Sjkim	ldw		0($xfer),$abhi
570238384Sjkim	nop
571238384Sjkim
572238384SjkimL\$1st_pa11
573238384Sjkim	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
574238384Sjkim	flddx		$idx($ap),${fai}	; ap[j,j+1]
575238384Sjkim	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
576238384Sjkim	flddx		$idx($np),${fni}	; np[j,j+1]
577238384Sjkim	 add		$hi0,$ablo,$ablo
578238384Sjkim	ldw		12($xfer),$nmlo1
579238384Sjkim	 addc		%r0,$abhi,$hi0
580238384Sjkim	ldw		8($xfer),$nmhi1
581238384Sjkim	 add		$ablo,$nmlo1,$nmlo1
582238384Sjkim	fstds		${fab1},0($xfer)
583238384Sjkim	 addc		%r0,$nmhi1,$nmhi1
584238384Sjkim	fstds		${fnm1},8($xfer)
585238384Sjkim	 add		$hi1,$nmlo1,$nmlo1
586238384Sjkim	ldw		-12($xfer),$ablo
587238384Sjkim	 addc		%r0,$nmhi1,$hi1
588238384Sjkim	ldw		-16($xfer),$abhi
589238384Sjkim
590238384Sjkim	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
591238384Sjkim	ldw		-4($xfer),$nmlo0
592238384Sjkim	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
593238384Sjkim	ldw		-8($xfer),$nmhi0
594238384Sjkim	 add		$hi0,$ablo,$ablo
595238384Sjkim	stw		$nmlo1,-4($tp)		; tp[j-1]
596238384Sjkim	 addc		%r0,$abhi,$hi0
597238384Sjkim	fstds		${fab0},-16($xfer)
598238384Sjkim	 add		$ablo,$nmlo0,$nmlo0
599238384Sjkim	fstds		${fnm0},-8($xfer)
600238384Sjkim	 addc		%r0,$nmhi0,$nmhi0
601238384Sjkim	ldw		0($xfer),$abhi
602238384Sjkim	 add		$hi1,$nmlo0,$nmlo0
603238384Sjkim	ldw		4($xfer),$ablo
604238384Sjkim	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
605238384Sjkim	addib,<>	8,$idx,L\$1st_pa11	; j++++
606238384Sjkim	 addc		%r0,$nmhi0,$hi1
607238384Sjkim
608238384Sjkim	 ldw		8($xfer),$nmhi1
609238384Sjkim	 ldw		12($xfer),$nmlo1
610238384Sjkim	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
611238384Sjkim	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
612238384Sjkim	 add		$hi0,$ablo,$ablo
613238384Sjkim	fstds		${fab1},0($xfer)
614238384Sjkim	 addc		%r0,$abhi,$hi0
615238384Sjkim	fstds		${fnm1},8($xfer)
616238384Sjkim	 add		$ablo,$nmlo1,$nmlo1
617238384Sjkim	ldw		-16($xfer),$abhi
618238384Sjkim	 addc		%r0,$nmhi1,$nmhi1
619238384Sjkim	ldw		-12($xfer),$ablo
620238384Sjkim	 add		$hi1,$nmlo1,$nmlo1
621238384Sjkim	ldw		-8($xfer),$nmhi0
622238384Sjkim	 addc		%r0,$nmhi1,$hi1
623238384Sjkim	ldw		-4($xfer),$nmlo0
624238384Sjkim
625238384Sjkim	 add		$hi0,$ablo,$ablo
626238384Sjkim	stw		$nmlo1,-4($tp)		; tp[j-1]
627238384Sjkim	 addc		%r0,$abhi,$hi0
628238384Sjkim	ldw		0($xfer),$abhi
629238384Sjkim	 add		$ablo,$nmlo0,$nmlo0
630238384Sjkim	ldw		4($xfer),$ablo
631238384Sjkim	 addc		%r0,$nmhi0,$nmhi0
632238384Sjkim	ldws,mb		8($xfer),$nmhi1
633238384Sjkim	 add		$hi1,$nmlo0,$nmlo0
634238384Sjkim	ldw		4($xfer),$nmlo1
635238384Sjkim	 addc		%r0,$nmhi0,$hi1
636238384Sjkim	stws,ma		$nmlo0,8($tp)		; tp[j-1]
637238384Sjkim
638238384Sjkim	ldo		-1($num),$num		; i--
639238384Sjkim	subi		0,$arrsz,$idx		; j=0
640238384Sjkim
641238384Sjkim	 fldws,ma	4($bp),${fbi}		; bp[1]
642238384Sjkim	 flddx		$idx($ap),${fai}	; ap[0,1]
643238384Sjkim	 flddx		$idx($np),${fni}	; np[0,1]
644238384Sjkim	 fldws		8($xfer),${fti}R	; tp[0]
645238384Sjkim	add		$hi0,$ablo,$ablo
646238384Sjkim	addc		%r0,$abhi,$hi0
647238384Sjkim	 ldo		8($idx),$idx		; j++++
648238384Sjkim	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
649238384Sjkim	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
650238384Sjkim	add		$hi1,$nmlo1,$nmlo1
651238384Sjkim	addc		%r0,$nmhi1,$nmhi1
652238384Sjkim	add		$ablo,$nmlo1,$nmlo1
653238384Sjkim	addc		%r0,$nmhi1,$hi1
654238384Sjkim	 fstws,mb	${fab0}L,-8($xfer)	; save high part
655238384Sjkim	stw		$nmlo1,-4($tp)		; tp[j-1]
656238384Sjkim
657238384Sjkim	 fcpy,sgl	%fr0,${fti}L		; zero high part
658238384Sjkim	 fcpy,sgl	%fr0,${fab0}L
659238384Sjkim	add		$hi1,$hi0,$hi0
660238384Sjkim	addc		%r0,%r0,$hi1
661238384Sjkim	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
662238384Sjkim	 fcnvxf,dbl,dbl	${fab0},${fab0}
663238384Sjkim	stw		$hi0,0($tp)
664238384Sjkim	stw		$hi1,4($tp)
665238384Sjkim
666238384Sjkim	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
667238384Sjkim	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
668238384Sjkim	xmpyu		${fn0},${fab0}R,${fm0}
669238384Sjkim	ldo		`$LOCALS+32+4`($fp),$tp
670238384SjkimL\$outer_pa11
671238384Sjkim	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
672238384Sjkim	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
673238384Sjkim	fstds		${fab0},-16($xfer)	; 33-bit value
674238384Sjkim	fstds		${fnm0},-8($xfer)
675238384Sjkim	 flddx		$idx($ap),${fai}	; ap[2,3]
676238384Sjkim	 flddx		$idx($np),${fni}	; np[2,3]
677238384Sjkim	ldw		-16($xfer),$abhi	; carry bit actually
678238384Sjkim	 ldo		8($idx),$idx		; j++++
679238384Sjkim	ldw		-12($xfer),$ablo
680238384Sjkim	ldw		-8($xfer),$nmhi0
681238384Sjkim	ldw		-4($xfer),$nmlo0
682238384Sjkim	ldw		0($xfer),$hi0		; high part
683238384Sjkim
684238384Sjkim	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
685238384Sjkim	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
686238384Sjkim	fstds		${fab1},0($xfer)
687238384Sjkim	 addl		$abhi,$hi0,$hi0		; account carry bit
688238384Sjkim	fstds		${fnm1},8($xfer)
689238384Sjkim	 add		$ablo,$nmlo0,$nmlo0	; discarded
690238384Sjkim	ldw		0($tp),$ti1		; tp[1]
691238384Sjkim	 addc		%r0,$nmhi0,$hi1
692238384Sjkim	fstds		${fab0},-16($xfer)
693238384Sjkim	fstds		${fnm0},-8($xfer)
694238384Sjkim	ldw		4($xfer),$ablo
695238384Sjkim	ldw		0($xfer),$abhi
696238384Sjkim
697238384SjkimL\$inner_pa11
698238384Sjkim	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
699238384Sjkim	flddx		$idx($ap),${fai}	; ap[j,j+1]
700238384Sjkim	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
701238384Sjkim	flddx		$idx($np),${fni}	; np[j,j+1]
702238384Sjkim	 add		$hi0,$ablo,$ablo
703238384Sjkim	ldw		4($tp),$ti0		; tp[j]
704238384Sjkim	 addc		%r0,$abhi,$abhi
705238384Sjkim	ldw		12($xfer),$nmlo1
706238384Sjkim	 add		$ti1,$ablo,$ablo
707238384Sjkim	ldw		8($xfer),$nmhi1
708238384Sjkim	 addc		%r0,$abhi,$hi0
709238384Sjkim	fstds		${fab1},0($xfer)
710238384Sjkim	 add		$ablo,$nmlo1,$nmlo1
711238384Sjkim	fstds		${fnm1},8($xfer)
712238384Sjkim	 addc		%r0,$nmhi1,$nmhi1
713238384Sjkim	ldw		-12($xfer),$ablo
714238384Sjkim	 add		$hi1,$nmlo1,$nmlo1
715238384Sjkim	ldw		-16($xfer),$abhi
716238384Sjkim	 addc		%r0,$nmhi1,$hi1
717238384Sjkim
718238384Sjkim	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
719238384Sjkim	ldw		8($tp),$ti1		; tp[j]
720238384Sjkim	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
721238384Sjkim	ldw		-4($xfer),$nmlo0
722238384Sjkim	 add		$hi0,$ablo,$ablo
723238384Sjkim	ldw		-8($xfer),$nmhi0
724238384Sjkim	 addc		%r0,$abhi,$abhi
725238384Sjkim	stw		$nmlo1,-4($tp)		; tp[j-1]
726238384Sjkim	 add		$ti0,$ablo,$ablo
727238384Sjkim	fstds		${fab0},-16($xfer)
728238384Sjkim	 addc		%r0,$abhi,$hi0
729238384Sjkim	fstds		${fnm0},-8($xfer)
730238384Sjkim	 add		$ablo,$nmlo0,$nmlo0
731238384Sjkim	ldw		4($xfer),$ablo
732238384Sjkim	 addc		%r0,$nmhi0,$nmhi0
733238384Sjkim	ldw		0($xfer),$abhi
734238384Sjkim	 add		$hi1,$nmlo0,$nmlo0
735238384Sjkim	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
736238384Sjkim	addib,<>	8,$idx,L\$inner_pa11	; j++++
737238384Sjkim	 addc		%r0,$nmhi0,$hi1
738238384Sjkim
739238384Sjkim	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
740238384Sjkim	ldw		12($xfer),$nmlo1
741238384Sjkim	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
742238384Sjkim	ldw		8($xfer),$nmhi1
743238384Sjkim	 add		$hi0,$ablo,$ablo
744238384Sjkim	ldw		4($tp),$ti0		; tp[j]
745238384Sjkim	 addc		%r0,$abhi,$abhi
746238384Sjkim	fstds		${fab1},0($xfer)
747238384Sjkim	 add		$ti1,$ablo,$ablo
748238384Sjkim	fstds		${fnm1},8($xfer)
749238384Sjkim	 addc		%r0,$abhi,$hi0
750238384Sjkim	ldw		-16($xfer),$abhi
751238384Sjkim	 add		$ablo,$nmlo1,$nmlo1
752238384Sjkim	ldw		-12($xfer),$ablo
753238384Sjkim	 addc		%r0,$nmhi1,$nmhi1
754238384Sjkim	ldw		-8($xfer),$nmhi0
755238384Sjkim	 add		$hi1,$nmlo1,$nmlo1
756238384Sjkim	ldw		-4($xfer),$nmlo0
757238384Sjkim	 addc		%r0,$nmhi1,$hi1
758238384Sjkim
759238384Sjkim	add		$hi0,$ablo,$ablo
760238384Sjkim	 stw		$nmlo1,-4($tp)		; tp[j-1]
761238384Sjkim	addc		%r0,$abhi,$abhi
762238384Sjkim	 add		$ti0,$ablo,$ablo
763238384Sjkim	ldw		8($tp),$ti1		; tp[j]
764238384Sjkim	 addc		%r0,$abhi,$hi0
765238384Sjkim	ldw		0($xfer),$abhi
766238384Sjkim	 add		$ablo,$nmlo0,$nmlo0
767238384Sjkim	ldw		4($xfer),$ablo
768238384Sjkim	 addc		%r0,$nmhi0,$nmhi0
769238384Sjkim	ldws,mb		8($xfer),$nmhi1
770238384Sjkim	 add		$hi1,$nmlo0,$nmlo0
771238384Sjkim	ldw		4($xfer),$nmlo1
772238384Sjkim	 addc		%r0,$nmhi0,$hi1
773238384Sjkim	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
774238384Sjkim
775238384Sjkim	addib,=		-1,$num,L\$outerdone_pa11; i--
776238384Sjkim	subi		0,$arrsz,$idx		; j=0
777238384Sjkim
778238384Sjkim	 fldws,ma	4($bp),${fbi}		; bp[i]
779238384Sjkim	 flddx		$idx($ap),${fai}	; ap[0]
780238384Sjkim	add		$hi0,$ablo,$ablo
781238384Sjkim	addc		%r0,$abhi,$abhi
782238384Sjkim	 flddx		$idx($np),${fni}	; np[0]
783238384Sjkim	 fldws		8($xfer),${fti}R	; tp[0]
784238384Sjkim	add		$ti1,$ablo,$ablo
785238384Sjkim	addc		%r0,$abhi,$hi0
786238384Sjkim
787238384Sjkim	 ldo		8($idx),$idx		; j++++
788238384Sjkim	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
789238384Sjkim	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
790238384Sjkim	ldw		4($tp),$ti0		; tp[j]
791238384Sjkim
792238384Sjkim	add		$hi1,$nmlo1,$nmlo1
793238384Sjkim	addc		%r0,$nmhi1,$nmhi1
794238384Sjkim	 fstws,mb	${fab0}L,-8($xfer)	; save high part
795238384Sjkim	add		$ablo,$nmlo1,$nmlo1
796238384Sjkim	addc		%r0,$nmhi1,$hi1
797238384Sjkim	 fcpy,sgl	%fr0,${fti}L		; zero high part
798238384Sjkim	 fcpy,sgl	%fr0,${fab0}L
799238384Sjkim	stw		$nmlo1,-4($tp)		; tp[j-1]
800238384Sjkim
801238384Sjkim	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
802238384Sjkim	 fcnvxf,dbl,dbl	${fab0},${fab0}
803238384Sjkim	add		$hi1,$hi0,$hi0
804238384Sjkim	addc		%r0,%r0,$hi1
805238384Sjkim	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
806238384Sjkim	add		$ti0,$hi0,$hi0
807238384Sjkim	addc		%r0,$hi1,$hi1
808238384Sjkim	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
809238384Sjkim	stw		$hi0,0($tp)
810238384Sjkim	stw		$hi1,4($tp)
811238384Sjkim	 xmpyu		${fn0},${fab0}R,${fm0}
812238384Sjkim
813238384Sjkim	b		L\$outer_pa11
814238384Sjkim	ldo		`$LOCALS+32+4`($fp),$tp
815238384Sjkim
816238384SjkimL\$outerdone_pa11
817238384Sjkim	add		$hi0,$ablo,$ablo
818238384Sjkim	addc		%r0,$abhi,$abhi
819238384Sjkim	add		$ti1,$ablo,$ablo
820238384Sjkim	addc		%r0,$abhi,$hi0
821238384Sjkim
822238384Sjkim	ldw		4($tp),$ti0		; tp[j]
823238384Sjkim
824238384Sjkim	add		$hi1,$nmlo1,$nmlo1
825238384Sjkim	addc		%r0,$nmhi1,$nmhi1
826238384Sjkim	add		$ablo,$nmlo1,$nmlo1
827238384Sjkim	addc		%r0,$nmhi1,$hi1
828238384Sjkim	stw		$nmlo1,-4($tp)		; tp[j-1]
829238384Sjkim
830238384Sjkim	add		$hi1,$hi0,$hi0
831238384Sjkim	addc		%r0,%r0,$hi1
832238384Sjkim	add		$ti0,$hi0,$hi0
833238384Sjkim	addc		%r0,$hi1,$hi1
834238384Sjkim	stw		$hi0,0($tp)
835238384Sjkim	stw		$hi1,4($tp)
836238384Sjkim
837238384Sjkim	ldo		`$LOCALS+32+4`($fp),$tp
838238384Sjkim	sub		%r0,%r0,%r0		; clear borrow
839238384Sjkim	ldw		-4($tp),$ti0
840238384Sjkim	addl		$tp,$arrsz,$tp
841238384SjkimL\$sub_pa11
842238384Sjkim	ldwx		$idx($np),$hi0
843238384Sjkim	subb		$ti0,$hi0,$hi1
844238384Sjkim	ldwx		$idx($tp),$ti0
845238384Sjkim	addib,<>	4,$idx,L\$sub_pa11
846238384Sjkim	stws,ma		$hi1,4($rp)
847238384Sjkim
848238384Sjkim	subb		$ti0,%r0,$hi1
849238384Sjkim
850337982Sjkim	ldo		`$LOCALS+32`($fp),$tp
851238384Sjkim	sub		$rp,$arrsz,$rp		; rewind rp
852238384Sjkim	subi		0,$arrsz,$idx
853238384SjkimL\$copy_pa11
854337982Sjkim	ldw		0($tp),$ti0
855337982Sjkim	ldw		0($rp),$hi0
856238384Sjkim	stws,ma		%r0,4($tp)
857337982Sjkim	comiclr,=	0,$hi1,%r0
858337982Sjkim	copy		$ti0,$hi0
859238384Sjkim	addib,<>	4,$idx,L\$copy_pa11
860238384Sjkim	stws,ma		$hi0,4($rp)
861238384Sjkim
862238384Sjkim	nop					; alignment
863238384SjkimL\$done
864238384Sjkim___
865238384Sjkim}
866238384Sjkim
# Function epilogue appended to $code: load 1 into %r28 ("handled"),
# reset %sp from $fp and restore %r2 and %r4-%r10 from the frame.
# L$abort returns through %r2; the $POPMB that follows the bv restores
# %r3.  Backtick spans are evaluated later by the output loop.
$code.=<<___;
	ldi		1,%r28			; signal "handled"
	ldo		$FRAME($fp),%sp		; destroy tp[num+1]

	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
886238384Sjkim
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
891238384Sjkim
# Hand-assemble an "ldd" instruction into a raw .WORD directive.
#
#   $mod   completer text that followed the mnemonic ("", ",ma", ",mb", ...)
#   $args  operand text: "%rX(%rB),%rT" (format 4) or
#          "disp(%rB),%rT" (format 5)
#
# Returns one tab-indented assembler line: the encoded word with the
# original instruction as a trailing comment, or the instruction text
# unchanged when the operands cannot be encoded here.
my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
    {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
	return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    if (my ($disp,$base,$dst) =
	    $args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
    {	# Only five bits of the displacement are encoded below, so a
	# value outside -16..15 would be silently truncated into a
	# different instruction; leave it for the assembler instead.
	if ($disp>=-16 && $disp<=15)
	{   my $opcode=(0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$dst;
	    $opcode|=(($disp&0xF)<<17)|(($disp&0x10)<<12);	# encode offset
	    $opcode|=(1<<5)  if ($mod =~ /^,m/);		# any ,m* completer
	    $opcode|=(1<<13) if ($mod =~ /^,mb/);		# ,mb only
	    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
	}
    }

    return "\t".$orig;
};
909238384Sjkim
# Hand-assemble an "std" instruction into a raw .WORD directive.
#
#   $mod   completer text that followed the mnemonic ("", ",ma", ",mb", ...)
#   $args  operand text of the form "%rS,disp(%rB)" (format 6)
#
# Returns one tab-indented assembler line: the encoded word with the
# original instruction as a trailing comment, or the instruction text
# unchanged when the operands cannot be encoded here.
my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    if (my ($src,$disp,$base) =
	    $args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)	# format 6
    {	# Only five bits of the displacement are encoded below, so a
	# value outside -16..15 would be silently truncated (16 would
	# come out as -16); leave it for the assembler instead.
	if ($disp>=-16 && $disp<=15)
	{   my $opcode=(0x03<<26)|($base<<21)|($src<<16)|(1<<12)|(0xB<<6);
	    $opcode|=(($disp&0xF)<<1)|(($disp&0x10)>>4);	# encode offset
	    $opcode|=(1<<5)  if ($mod =~ /^,m/);		# any ,m* completer
	    $opcode|=(1<<13) if ($mod =~ /^,mb/);		# ,mb only
	    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
	}
    }

    return "\t".$orig;
};
923238384Sjkim
# Hand-assemble an "extrd" instruction into a raw .WORD directive.
#
#   $mod   completer text that followed the mnemonic; only consulted
#          for a "=" condition in the %sar form
#   $args  operand text: "%rS,pos,len,%rT" (format 15) or
#          "%rS,%sar,len,%rT" (format 12)
#
# Returns one tab-indented assembler line: the encoded word with the
# original instruction as a trailing comment, or the instruction text
# unchanged when neither operand form matches.
my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if (my ($src,$pos,$width,$dst) =
	    $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    {	my $opcode=(0x36<<26)|($src<<21)|($dst<<16);
	my $len=32-$width;
	$opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);		# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
	return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    if (my ($src,$width,$dst) =
	    $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    {	my $opcode=(0x34<<26)|($src<<21)|($dst<<16)|(2<<11)|(1<<9);
	my $len=32-$width;
	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
	$opcode |= (1<<13) if ($mod =~ /,\**=/);		# "=" condition
	return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    return "\t".$orig;
};
945238384Sjkim
# Hand-assemble a "shrpd" instruction into a raw .WORD directive.
#
#   $mod   completer text that followed the mnemonic (not used for the
#          encoding, only echoed in the trailing comment)
#   $args  operand text of the form "%rA,%rB,sa,%rT" (format 14)
#
# Returns one tab-indented assembler line: the encoded word with the
# original instruction as a trailing comment, or the instruction text
# unchanged when the operands do not match.
my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";

    if (my ($hi,$lo,$shift,$dst) =
	    $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    {	my $opcode=(0x34<<26)|($lo<<21)|($hi<<16)|(1<<10)|$dst;
	my $cpos=63-$shift;		# the field stores 63 minus the shift amount
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
	return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    return "\t".$orig;
};
958238384Sjkim
# Hand-assemble "sub,db" into a raw .WORD directive; every other form
# of "sub" is passed through as text.
#
#   $mod   completer text that followed the mnemonic; must be exactly
#          ",db" for the instruction to be encoded
#   $args  operand text of the form "%rA,%rB,%rT"
#
# Returns one tab-indented assembler line.
my $sub = sub {
  my ($mod,$args) = @_;
  my $orig = "sub$mod\t$args";

    return "\t".$orig unless $mod eq ",db";

    if (my ($r1,$r2,$dst) = $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
	my $opcode=(0x02<<26)|($r2<<21)|($r1<<16)|$dst;
	$opcode|=(1<<10);	# e1
	$opcode|=(1<<8);	# e2
	$opcode|=(1<<5);	# d
	return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    return "\t".$orig;
};
972238384Sjkim
# Rewrite one parsed instruction, hand-encoding it when an encoder
# exists for its mnemonic.
#
#   $mnemonic  instruction name without completers
#   $mod       completer text that followed the mnemonic (may be empty)
#   $args      operand text
#
# Returns the encoder's output when $mnemonic has one, otherwise the
# instruction re-joined as "\t<mnemonic><mod>\t<args>".
sub assemble {
  my ($mnemonic,$mod,$args)=@_;

    # Explicit dispatch table instead of a string eval of "\$$mnemonic"
    # on every line; the keys are quoted because "sub" is a keyword.
    my %encoder_for = (
	'ldd'   => $ldd,
	'std'   => $std,
	'extrd' => $extrd,
	'shrpd' => $shrpd,
	'sub'   => $sub,
    );
    my $encoder = $encoder_for{$mnemonic};

    return ref($encoder) eq 'CODE' ? $encoder->($mod,$args)
				   : "\t$mnemonic$mod\t$args";
}
979238384Sjkim
# Post-process the accumulated assembler text one line at a time and
# write it to STDOUT.
foreach (split("\n",$code)) {
	# replace each `...` span with the result of evaluating it as Perl
	s/\`([^\`]*)\`/eval $1/ge;
	# flip word order in 64-bit mode...
	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
	# assemble 2.0 instructions in 32-bit mode...
	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e if ($BN_SZ==4);

	# rewrite "bv" as "bve" when $SIZE_T is 8
	s/\bbv\b/bve/gm	if ($SIZE_T==8);

	print $_,"\n";
}
# Buffered write errors only surface at close, so do not ignore them.
close STDOUT or die "error closing STDOUT: $!";
992