ppc64-mont.pl revision 337982
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2007

# The reason for undertaking this effort is basically the following.
# Even though the Power 6 CPU operates at an incredible 4.7GHz clock
# frequency, its PKI performance was observed to be less than
# impressive, essentially as fast as a 1.8GHz PPC970, or 2.6 times(!)
# slower than one would hope. Well, it's not surprising that IBM had
# to make some sacrifices to boost the clock frequency that much, but
# no overall improvement? Having observed how much difference
# switching to the FPU made on UltraSPARC, playing the same stunt on
# Power 6 appeared appropriate... Unfortunately the resulting
# performance improvement is not as impressive, ~30%, and in absolute
# terms is still very far from what one would expect from a 4.7GHz
# CPU. There is a chance that I'm doing something wrong, but in the
# absence of assembler-level micro-profiling data, or at least a
# decent platform guide, I can't tell... Or better results might be
# achieved with VMX... Anyway, this module provides *worse*
# performance on other PowerPC implementations, ~15-40% slower on
# PPC970 depending on key length and ~40% slower on Power 5 for all
# key lengths. As it's obviously inappropriate as the "best
# all-round" alternative, it has to be complemented with run-time CPU
# family detection. Oh! It should also be noted that, unlike on other
# PowerPC implementations, the IALU ppc-mont.pl module performs
# *suboptimally* on >=1024-bit key lengths on Power 6. Note also that
# *everything* said so far applies to 64-bit builds! As far as 32-bit
# applications executed on 64-bit CPUs go, this module is likely to
# become the preferred choice, because it's easy to adapt for such a
# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.

# February 2008

# Micro-profiling-assisted optimization results in a ~15% improvement
# over the original ppc64-mont.pl version, or an overall ~50%
# improvement over the ppc.pl module on Power 6. If compared to
# ppc-mont.pl on the same Power 6 CPU, this module is 5-150% faster
# depending on key length, [hereafter] more for longer keys. But if
# compared to ppc-mont.pl on a 1.8GHz PPC970, it's only 5-55% faster.
# Still far from impressive in absolute terms, but it's apparently
# the way Power 6 is...

# December 2009

# Adapted for the 32-bit build, this module delivers a 25-120%
# performance improvement, yes, more than *twice* as fast for longer
# keys, over 32-bit ppc-mont.pl on a 1.8GHz PPC970. However! This
# implementation relies on 64-bit integer operations even in the
# 32-bit build, and the trouble is that most PPC operating systems
# don't preserve the upper halves of general purpose registers upon
# 32-bit signal delivery. They do preserve them upon context switch,
# but not upon signalling:-( This means that asynchronous signals
# have to be blocked upon entry to this subroutine. Signal masking
# (and of course complementary unmasking) has quite an impact on
# performance, naturally larger for shorter keys. It's so severe that
# 512-bit key performance can be as low as 1/3 of the expected value.
# This is why, on such OSes, this routine can be engaged only for
# longer-key operations; see crypto/ppccap.c for further details.
# Mac OS X is an exception: it doesn't require signal masking, and
# that's where the above improvement coefficients were collected. For
# the others, the alternative would be to break the dependence on the
# upper halves of the GPRs by sticking to 32-bit integer operations...

# December 2012

# Remove the above-mentioned dependence on the GPRs' upper halves in
# the 32-bit build. There is no signal-masking overhead, but integer
# instructions are *more* numerous... It's still "universally" faster
# than 32-bit ppc-mont.pl, but the improvement coefficient is not as
# impressive for longer keys...

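# A note on the arithmetic (not in the original header): every 64-bit
# operand is cut into four 16-bit limbs, each limb is converted to an
# IEEE double (exactly, being far below the 53-bit mantissa limit),
# the multiply-accumulate work is done with fmul/fmadd, and fctid
# turns the integral sums back into 64-bit integers. A minimal sketch
# of the limb split in plain Perl, illustrative only and assuming a
# 64-bit perl:
#
#	sub limbs16 {
#	    my $x = shift;	# 64-bit unsigned value
#	    # four 16-bit limbs, least significant first, mirroring
#	    # the extrdi ...,16,48/32/16/0 sequences below
#	    return map { ($x >> 16*$_) & 0xffff } 0..3;
#	}
#	my @d = map { $_ + 0.0 } limbs16(0xdeadbeefcafebabe);	# exact
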
$flavour = shift;

if ($flavour =~ /32/) {
	$SIZE_T=4;
	$RZONE=	224;
	$fname=	"bn_mul_mont_fpu64";

	$STUX=	"stwux";	# store indexed and update
	$PUSH=	"stw";
	$POP=	"lwz";
} elsif ($flavour =~ /64/) {
	$SIZE_T=8;
	$RZONE=	288;
	$fname=	"bn_mul_mont_fpu64";

	# same as above, but 64-bit mnemonics...
	$STUX=	"stdux";	# store indexed and update
	$PUSH=	"std";
	$POP=	"ld";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$FRAME=64;	# padded frame header
$TRANSFER=16*8;

$carry="r0";
$sp="r1";
$toc="r2";
$rp="r3";	$ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9";	# $rp is reassigned
$tp="r10";
$j="r11";
$i="r12";
# non-volatile registers
$c1="r19";
$n1="r20";
$a1="r21";
$nap_d="r22";	# interleaved ap and np in double format
$a0="r23";	# ap[0]
$t0="r24";	# temporary registers
$t1="r25";
$t2="r26";
$t3="r27";
$t4="r28";
$t5="r29";
$t6="r30";
$t7="r31";

# PPC offers enough register bank capacity to unroll inner loops twice
#
#     ..A3A2A1A0
#           dcba
#    -----------
#            A0a
#           A0b
#          A0c
#         A0d
#          A1a
#         A1b
#        A1c
#       A1d
#        A2a
#       A2b
#      A2c
#     A2d
#      A3a
#     A3b
#    A3c
#   A3d
#    ..a
#   ..b
#
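# (In register terms, $A0..$A3 hold the 32-bit halves of a[j] and
# a[j+1] as doubles and a..d are the four 16-bit limbs of the current
# "b" word; each row above is one fmul/fmadd partial product, so every
# column sum stays well within a double's 53-bit mantissa.)
#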
$ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
$na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
$dota="f8";	$dotb="f9";
$A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
$N0="f20";	$N1="f21";	$N2="f22";	$N3="f23";
$T0a="f24";	$T0b="f25";
$T1a="f26";	$T1b="f27";
$T2a="f28";	$T2b="f29";
$T3a="f30";	$T3b="f31";

# sp----------->+-------------------------------+
#		| saved sp			|
#		+-------------------------------+
#		.				.
#   +64		+-------------------------------+
#		| 16 gpr<->fpr transfer zone	|
#		.				.
#		.				.
#   +16*8	+-------------------------------+
#		| __int64 tmp[-1]		|
#		+-------------------------------+
#		| __int64 tmp[num]		|
#		.				.
#		.				.
#		.				.
#   +(num+1)*8	+-------------------------------+
#		| padding to 64 byte boundary	|
#		.				.
#   +X		+-------------------------------+
#		| double nap_d[4*num]		|
#		.				.
#		.				.
#		.				.
#		+-------------------------------+
#		.				.
#   -13*size_t	+-------------------------------+
#		| 13 saved gpr, r19-r31		|
#		.				.
#		.				.
#   -12*8	+-------------------------------+
#		| 12 saved fpr, f20-f31		|
#		.				.
#		.				.
#		+-------------------------------+

$code=<<___;
.machine "any"
.text

.globl	.$fname
.align	5
.$fname:
	cmpwi	$num,`3*8/$SIZE_T`
	mr	$rp,r3		; $rp is reassigned
	li	r3,0		; possible "not handled" return code
	bltlr-
	andi.	r0,$num,`16/$SIZE_T-1`		; $num has to be "even"
	bnelr-

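	; Stack math below: reserve the gpr<->fpr transfer zone,
	; tp[num+1] and the 64-byte-aligned nap_d[4*num] on top of the
	; frame header and red zone, then round the new stack pointer
	; down to a 4KB page boundary ("minimize TLB usage").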
	slwi	$num,$num,`log($SIZE_T)/log(2)`	; num*=sizeof(BN_LONG)
	li	$i,-4096
	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
	add	$tp,$tp,$num	; place for tp[num+1]
	addi	$tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
	subf	$tp,$tp,$sp	; $sp-$tp
	and	$tp,$tp,$i	; minimize TLB usage
	subf	$tp,$sp,$tp	; $tp-$sp
	mr	$i,$sp
	$STUX	$sp,$sp,$tp	; alloca

	$PUSH	r19,`-12*8-13*$SIZE_T`($i)
	$PUSH	r20,`-12*8-12*$SIZE_T`($i)
	$PUSH	r21,`-12*8-11*$SIZE_T`($i)
	$PUSH	r22,`-12*8-10*$SIZE_T`($i)
	$PUSH	r23,`-12*8-9*$SIZE_T`($i)
	$PUSH	r24,`-12*8-8*$SIZE_T`($i)
	$PUSH	r25,`-12*8-7*$SIZE_T`($i)
	$PUSH	r26,`-12*8-6*$SIZE_T`($i)
	$PUSH	r27,`-12*8-5*$SIZE_T`($i)
	$PUSH	r28,`-12*8-4*$SIZE_T`($i)
	$PUSH	r29,`-12*8-3*$SIZE_T`($i)
	$PUSH	r30,`-12*8-2*$SIZE_T`($i)
	$PUSH	r31,`-12*8-1*$SIZE_T`($i)
	stfd	f20,`-12*8`($i)
	stfd	f21,`-11*8`($i)
	stfd	f22,`-10*8`($i)
	stfd	f23,`-9*8`($i)
	stfd	f24,`-8*8`($i)
	stfd	f25,`-7*8`($i)
	stfd	f26,`-6*8`($i)
	stfd	f27,`-5*8`($i)
	stfd	f28,`-4*8`($i)
	stfd	f29,`-3*8`($i)
	stfd	f30,`-2*8`($i)
	stfd	f31,`-1*8`($i)

	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
	li	$i,-64
	add	$nap_d,$tp,$num
	and	$nap_d,$nap_d,$i	; align to 64 bytes
	; nap_d is off by 1, because it's used with stfdu/lfdu
	addi	$nap_d,$nap_d,-8
	srwi	$j,$num,`3+1`	; counter register, num/2
	addi	$j,$j,-1
	addi	$tp,$sp,`$FRAME+$TRANSFER-8`
	li	$carry,0
	mtctr	$j
___

$code.=<<___ if ($SIZE_T==8);
	ld	$a0,0($ap)		; pull ap[0] value
	ld	$t3,0($bp)		; bp[0]
	ld	$n0,0($n0)		; pull n0[0] value

	mulld	$t7,$a0,$t3		; ap[0]*bp[0]
	; transfer bp[0] to FPU as 4x16-bit values
	extrdi	$t0,$t3,16,48
	extrdi	$t1,$t3,16,32
	extrdi	$t2,$t3,16,16
	extrdi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mulld	$t7,$t7,$n0		; tp[0]*n0
	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
	extrdi	$t4,$t7,16,48
	extrdi	$t5,$t7,16,32
	extrdi	$t6,$t7,16,16
	extrdi	$t7,$t7,16,0
	std	$t4,`$FRAME+32`($sp)
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)

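	; $t4-$t7 (stored above) carry m = (ap[0]*bp[0])*n0 mod 2^64,
	; the Montgomery multiplier for this pass, as 16-bit limbs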
	extrdi	$t0,$a0,32,32		; lwz	$t0,4($ap)
	extrdi	$t1,$a0,32,0		; lwz	$t1,0($ap)
	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[1] as 32-bit word pair
	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[0] as 32-bit word pair
	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[1] as 32-bit word pair
	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
	lwz	$a0,0($ap)		; pull ap[0,1] value
	mr	$n1,$n0
	lwz	$a1,4($ap)
	li	$c1,0
	lwz	$t1,0($bp)		; bp[0,1]
	lwz	$t3,4($bp)
	lwz	$n0,0($n1)		; pull n0[0,1] value
	lwz	$n1,4($n1)

	mullw	$t4,$a0,$t1		; mulld ap[0]*bp[0]
	mulhwu	$t5,$a0,$t1
	mullw	$t6,$a1,$t1
	mullw	$t7,$a0,$t3
	add	$t5,$t5,$t6
	add	$t5,$t5,$t7
	; transfer bp[0] to FPU as 4x16-bit values
	extrwi	$t0,$t1,16,16
	extrwi	$t1,$t1,16,0
	extrwi	$t2,$t3,16,16
	extrwi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)	; yes, std in 32-bit build
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mullw	$t0,$t4,$n0		; mulld tp[0]*n0
	mulhwu	$t1,$t4,$n0
	mullw	$t2,$t5,$n0
	mullw	$t3,$t4,$n1
	add	$t1,$t1,$t2
	add	$t1,$t1,$t3
	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
	extrwi	$t4,$t0,16,16
	extrwi	$t5,$t0,16,0
	extrwi	$t6,$t1,16,16
	extrwi	$t7,$t1,16,0
	std	$t4,`$FRAME+32`($sp)	; yes, std in 32-bit build
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)

	mr	$t0,$a0			; lwz	$t0,0($ap)
	mr	$t1,$a1			; lwz	$t1,4($ap)
	lwz	$t2,8($ap)		; load a[j..j+3] as 32-bit word pairs
	lwz	$t3,12($ap)
	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
	lwz	$t5,4($np)
	lwz	$t6,8($np)
	lwz	$t7,12($np)
___
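# Both paths stage the values as 8-byte memory words so that lfd/fcfid
# below can turn them into doubles; fcfid is exact here since every
# staged value fits in well under 53 bits.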
$code.=<<___;
	lfd	$ba,`$FRAME+0`($sp)
	lfd	$bb,`$FRAME+8`($sp)
	lfd	$bc,`$FRAME+16`($sp)
	lfd	$bd,`$FRAME+24`($sp)
	lfd	$na,`$FRAME+32`($sp)
	lfd	$nb,`$FRAME+40`($sp)
	lfd	$nc,`$FRAME+48`($sp)
	lfd	$nd,`$FRAME+56`($sp)
	std	$t0,`$FRAME+64`($sp)	; yes, std even in 32-bit build
	std	$t1,`$FRAME+72`($sp)
	std	$t2,`$FRAME+80`($sp)
	std	$t3,`$FRAME+88`($sp)
	std	$t4,`$FRAME+96`($sp)
	std	$t5,`$FRAME+104`($sp)
	std	$t6,`$FRAME+112`($sp)
	std	$t7,`$FRAME+120`($sp)
	fcfid	$ba,$ba
	fcfid	$bb,$bb
	fcfid	$bc,$bc
	fcfid	$bd,$bd
	fcfid	$na,$na
	fcfid	$nb,$nb
	fcfid	$nc,$nc
	fcfid	$nd,$nd

	lfd	$A0,`$FRAME+64`($sp)
	lfd	$A1,`$FRAME+72`($sp)
	lfd	$A2,`$FRAME+80`($sp)
	lfd	$A3,`$FRAME+88`($sp)
	lfd	$N0,`$FRAME+96`($sp)
	lfd	$N1,`$FRAME+104`($sp)
	lfd	$N2,`$FRAME+112`($sp)
	lfd	$N3,`$FRAME+120`($sp)
	fcfid	$A0,$A0
	fcfid	$A1,$A1
	fcfid	$A2,$A2
	fcfid	$A3,$A3
	fcfid	$N0,$N0
	fcfid	$N1,$N1
	fcfid	$N2,$N2
	fcfid	$N3,$N3
	addi	$ap,$ap,16
	addi	$np,$np,16

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	stfd	$A0,8($nap_d)		; save a[j] in double format
	stfd	$A1,16($nap_d)
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	stfd	$A2,24($nap_d)		; save a[j+1] in double format
	stfd	$A3,32($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	stfd	$N0,40($nap_d)		; save n[j] in double format
	stfd	$N1,48($nap_d)
	fmul	$T0a,$A0,$ba
	fmul	$T0b,$A0,$bb
	stfd	$N2,56($nap_d)		; save n[j+1] in double format
	stfdu	$N3,64($nap_d)

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd

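	; after the n-side fmadds below, each T[0-3]{a,b} has summed
	; four 32x16-bit products, i.e. stays below 2^50, safely within
	; the 53-bit mantissa of a double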
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b

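	; fctid is exact as well: the sums are integral and below 2^50,
	; so conversion back to 64-bit integers loses nothing regardless
	; of rounding mode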
	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)

.align	5
L1st:
___
$code.=<<___ if ($SIZE_T==8);
	lwz	$t0,`4^$LITTLE_ENDIAN`($ap)	; load a[j] as 32-bit word pair
	lwz	$t1,`0^$LITTLE_ENDIAN`($ap)
	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[j+1] as 32-bit word pair
	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[j] as 32-bit word pair
	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[j+1] as 32-bit word pair
	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
	lwz	$t1,4($ap)
	lwz	$t2,8($ap)
	lwz	$t3,12($ap)
	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
	lwz	$t5,4($np)
	lwz	$t6,8($np)
	lwz	$t7,12($np)
___
$code.=<<___;
	std	$t0,`$FRAME+64`($sp)	; yes, std even in 32-bit build
	std	$t1,`$FRAME+72`($sp)
	std	$t2,`$FRAME+80`($sp)
	std	$t3,`$FRAME+88`($sp)
	std	$t4,`$FRAME+96`($sp)
	std	$t5,`$FRAME+104`($sp)
	std	$t6,`$FRAME+112`($sp)
	std	$t7,`$FRAME+120`($sp)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
___
}
$code.=<<___;
	lfd	$A0,`$FRAME+64`($sp)
	lfd	$A1,`$FRAME+72`($sp)
	lfd	$A2,`$FRAME+80`($sp)
	lfd	$A3,`$FRAME+88`($sp)
	lfd	$N0,`$FRAME+96`($sp)
	lfd	$N1,`$FRAME+104`($sp)
	lfd	$N2,`$FRAME+112`($sp)
	lfd	$N3,`$FRAME+120`($sp)
	fcfid	$A0,$A0
	fcfid	$A1,$A1
	fcfid	$A2,$A2
	fcfid	$A3,$A3
	fcfid	$N0,$N0
	fcfid	$N1,$N1
	fcfid	$N2,$N2
	fcfid	$N3,$N3
	addi	$ap,$ap,16
	addi	$np,$np,16

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	stfd	$A0,8($nap_d)		; save a[j] in double format
	stfd	$A1,16($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmadd	$T0a,$A0,$ba,$dota
	fmadd	$T0b,$A0,$bb,$dotb
	stfd	$A2,24($nap_d)		; save a[j+1] in double format
	stfd	$A3,32($nap_d)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	stfd	$N0,40($nap_d)		; save n[j] in double format
	stfd	$N1,48($nap_d)
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	 add	$t0,$t0,$carry		; can not overflow
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	stfd	$N2,56($nap_d)		; save n[j+1] in double format
	stfdu	$N3,64($nap_d)
	 srdi	$carry,$t0,16
	 add	$t1,$t1,$carry
	 srdi	$carry,$t1,16

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	 insrdi	$t0,$t1,16,32
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	 add	$t2,$t2,$carry
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	 srdi	$carry,$t2,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	 insrdi	$t0,$t2,16,16
	 add	$t3,$t3,$carry
	 srdi	$carry,$t3,16

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	 insrdi	$t0,$t3,16,0		; 0..63 bits
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	 add	$t4,$t4,$carry
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	 srdi	$carry,$t4,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	 add	$t5,$t5,$carry
	 srdi	$carry,$t5,16
	 insrdi	$t4,$t5,16,32

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	 add	$t6,$t6,$carry
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	 srdi	$carry,$t6,16
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	 insrdi	$t4,$t6,16,16
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	 add	$t7,$t7,$carry
	 insrdi	$t4,$t7,16,0		; 64..127 bits
	 srdi	$carry,$t7,16		; upper 33 bits

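	; the interleaved integer ops above are base-2^16 carry
	; propagation over the previous iteration's limb sums, packing
	; them back into the two 64-bit words tp[j-1] and tp[j]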
	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	 std	$t0,8($tp)		; tp[j-1]
	 stdu	$t4,16($tp)		; tp[j]
___
} else {
$code.=<<___;
	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	 addc	$t0,$t0,$carry
	 adde	$t1,$t1,$c1
	 srwi	$carry,$t0,16
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	stfd	$N0,40($nap_d)		; save n[j] in double format
	stfd	$N1,48($nap_d)
	 srwi	$c1,$t1,16
	 insrwi	$carry,$t1,16,0
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	 addc	$t2,$t2,$carry
	 adde	$t3,$t3,$c1
	 srwi	$carry,$t2,16
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	stfd	$N2,56($nap_d)		; save n[j+1] in double format
	stfdu	$N3,64($nap_d)
	 insrwi	$t0,$t2,16,0		; 0..31 bits
	 srwi	$c1,$t3,16
	 insrwi	$carry,$t3,16,0

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	 lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	 lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	 addc	$t4,$t4,$carry
	 adde	$t5,$t5,$c1
	 srwi	$carry,$t4,16
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	 srwi	$c1,$t5,16
	 insrwi	$carry,$t5,16,0
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	 addc	$t6,$t6,$carry
	 adde	$t7,$t7,$c1
	 srwi	$carry,$t6,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	 insrwi	$t4,$t6,16,0		; 32..63 bits
	 srwi	$c1,$t7,16
	 insrwi	$carry,$t7,16,0

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	 lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	 lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	 addc	$t2,$t2,$carry
	 adde	$t3,$t3,$c1
	 srwi	$carry,$t2,16
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	 stw	$t0,12($tp)		; tp[j-1]
	 stw	$t4,8($tp)
	 srwi	$c1,$t3,16
	 insrwi	$carry,$t3,16,0
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	 lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	 lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	 addc	$t6,$t6,$carry
	 adde	$t7,$t7,$c1
	 srwi	$carry,$t6,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	 insrwi	$t2,$t6,16,0		; 64..95 bits
	 srwi	$c1,$t7,16
	 insrwi	$carry,$t7,16,0

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	 lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	 lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
	 addc	$t0,$t0,$carry
	 adde	$t1,$t1,$c1
	 srwi	$carry,$t0,16
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	 srwi	$c1,$t1,16
	 insrwi	$carry,$t1,16,0
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	 addc	$t4,$t4,$carry
	 adde	$t5,$t5,$c1
	 srwi	$carry,$t4,16
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	 insrwi	$t0,$t4,16,0		; 96..127 bits
	 srwi	$c1,$t5,16
	 insrwi	$carry,$t5,16,0

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	 stw	$t2,20($tp)		; tp[j]
	 stwu	$t0,16($tp)
___
}
$code.=<<___;
	bdnz	L1st

	fctid	$dota,$dota
	fctid	$dotb,$dotb
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	add	$t0,$t0,$carry		; can not overflow
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	insrdi	$t0,$t1,16,32
	add	$t2,$t2,$carry
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0		; 0..63 bits
	add	$t4,$t4,$carry
	srdi	$carry,$t4,16
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits
	ld	$t6,`$FRAME+64`($sp)
	ld	$t7,`$FRAME+72`($sp)

	std	$t0,8($tp)		; tp[j-1]
	stdu	$t4,16($tp)		; tp[j]

	add	$t6,$t6,$carry		; can not overflow
	srdi	$carry,$t6,16
	add	$t7,$t7,$carry
	insrdi	$t6,$t7,48,0
	srdi	$ovf,$t7,48
	std	$t6,8($tp)		; tp[num-1]
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	 insrwi	$t0,$t2,16,0		; 0..31 bits
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	 insrwi	$t4,$t6,16,0		; 32..63 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	 stw	$t0,12($tp)		; tp[j-1]
	 stw	$t4,8($tp)

	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6

	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	 insrwi	$t2,$t6,16,0		; 64..95 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	 insrwi	$t0,$t4,16,0		; 96..127 bits
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	 stw	$t2,20($tp)		; tp[j]
	 stwu	$t0,16($tp)

	lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)

	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1

	insrwi	$t6,$t4,16,0
	srwi	$t4,$t4,16
	insrwi	$t4,$t5,16,0
	srwi	$ovf,$t5,16
	stw	$t6,12($tp)		; tp[num-1]
	stw	$t4,8($tp)
___
}
$code.=<<___;
	slwi	$t7,$num,2
	subf	$nap_d,$t7,$nap_d	; rewind pointer

	li	$i,8			; i=1
.align	5
Louter:
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	li	$carry,0
	mtctr	$j
___
$code.=<<___ if ($SIZE_T==8);
	ldx	$t3,$bp,$i		; bp[i]

	ld	$t6,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
	mulld	$t7,$a0,$t3		; ap[0]*bp[i]
	add	$t7,$t7,$t6		; ap[0]*bp[i]+tp[0]
	; transfer bp[i] to FPU as 4x16-bit values
	extrdi	$t0,$t3,16,48
	extrdi	$t1,$t3,16,32
	extrdi	$t2,$t3,16,16
	extrdi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mulld	$t7,$t7,$n0		; tp[0]*n0
	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
	extrdi	$t4,$t7,16,48
	extrdi	$t5,$t7,16,32
	extrdi	$t6,$t7,16,16
	extrdi	$t7,$t7,16,0
	std	$t4,`$FRAME+32`($sp)
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)
___
$code.=<<___ if ($SIZE_T==4);
	add	$t0,$bp,$i
	li	$c1,0
	lwz	$t1,0($t0)		; bp[i,i+1]
	lwz	$t3,4($t0)

	mullw	$t4,$a0,$t1		; ap[0]*bp[i]
	lwz	$t0,`$FRAME+$TRANSFER+8+4`($sp)	; tp[0]
	mulhwu	$t5,$a0,$t1
	lwz	$t2,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
	mullw	$t6,$a1,$t1
	mullw	$t7,$a0,$t3
	add	$t5,$t5,$t6
	add	$t5,$t5,$t7
	addc	$t4,$t4,$t0		; ap[0]*bp[i]+tp[0]
	adde	$t5,$t5,$t2
	; transfer bp[i] to FPU as 4x16-bit values
	extrwi	$t0,$t1,16,16
	extrwi	$t1,$t1,16,0
	extrwi	$t2,$t3,16,16
	extrwi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)	; yes, std in 32-bit build
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mullw	$t0,$t4,$n0		; mulld tp[0]*n0
	mulhwu	$t1,$t4,$n0
	mullw	$t2,$t5,$n0
	mullw	$t3,$t4,$n1
	add	$t1,$t1,$t2
	add	$t1,$t1,$t3
	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
	extrwi	$t4,$t0,16,16
	extrwi	$t5,$t0,16,0
	extrwi	$t6,$t1,16,16
	extrwi	$t7,$t1,16,0
	std	$t4,`$FRAME+32`($sp)	; yes, std in 32-bit build
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)
___
$code.=<<___;
	lfd	$A0,8($nap_d)		; load a[j] in double format
	lfd	$A1,16($nap_d)
	lfd	$A2,24($nap_d)		; load a[j+1] in double format
	lfd	$A3,32($nap_d)
	lfd	$N0,40($nap_d)		; load n[j] in double format
	lfd	$N1,48($nap_d)
	lfd	$N2,56($nap_d)		; load n[j+1] in double format
	lfdu	$N3,64($nap_d)

	lfd	$ba,`$FRAME+0`($sp)
	lfd	$bb,`$FRAME+8`($sp)
	lfd	$bc,`$FRAME+16`($sp)
	lfd	$bd,`$FRAME+24`($sp)
	lfd	$na,`$FRAME+32`($sp)
	lfd	$nb,`$FRAME+40`($sp)
	lfd	$nc,`$FRAME+48`($sp)
	lfd	$nd,`$FRAME+56`($sp)

	fcfid	$ba,$ba
	fcfid	$bb,$bb
	fcfid	$bc,$bc
	fcfid	$bd,$bd
	fcfid	$na,$na
	fcfid	$nb,$nb
	fcfid	$nc,$nc
	fcfid	$nd,$nd

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmul	$T0a,$A0,$ba
	fmul	$T0b,$A0,$bb

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	 lfd	$A0,8($nap_d)		; load a[j] in double format
	 lfd	$A1,16($nap_d)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
	 lfd	$A3,32($nap_d)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)

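	; Linner runs the same FPU pipeline as L1st, additionally
	; folding in the previous pass's tp[j] words fetched below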
.align	5
Linner:
	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	lfd	$N0,40($nap_d)		; load n[j] in double format
	lfd	$N1,48($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmadd	$T0a,$A0,$ba,$dota
	fmadd	$T0b,$A0,$bb,$dotb
	lfd	$N2,56($nap_d)		; load n[j+1] in double format
	lfdu	$N3,64($nap_d)

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	 lfd	$A0,8($nap_d)		; load a[j] in double format
	 lfd	$A1,16($nap_d)
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
	 lfd	$A3,32($nap_d)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	 ld	$t0,`$FRAME+0`($sp)
	 ld	$t1,`$FRAME+8`($sp)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	 ld	$t2,`$FRAME+16`($sp)
	 ld	$t3,`$FRAME+24`($sp)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	 add	$t0,$t0,$carry		; can not overflow
	 ld	$t4,`$FRAME+32`($sp)
	 ld	$t5,`$FRAME+40`($sp)
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	 srdi	$carry,$t0,16
	 add	$t1,$t1,$carry
	 srdi	$carry,$t1,16
	 ld	$t6,`$FRAME+48`($sp)
	 ld	$t7,`$FRAME+56`($sp)

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	 insrdi	$t0,$t1,16,32
	 ld	$t1,8($tp)		; tp[j]
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	 add	$t2,$t2,$carry
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	 srdi	$carry,$t2,16
	 insrdi	$t0,$t2,16,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	 add	$t3,$t3,$carry
	 ldu	$t2,16($tp)		; tp[j+1]
	 srdi	$carry,$t3,16
	 insrdi	$t0,$t3,16,0		; 0..63 bits
	 add	$t4,$t4,$carry

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	 srdi	$carry,$t4,16
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	 add	$t5,$t5,$carry
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	 srdi	$carry,$t5,16
	 insrdi	$t4,$t5,16,32
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	 add	$t6,$t6,$carry
	 srdi	$carry,$t6,16
	 insrdi	$t4,$t6,16,16

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	 add	$t7,$t7,$carry
	 addc	$t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t0,$t0,32,0
	extrdi	$t1,$t1,32,0
	adde	$t0,$t0,$t1
___
$code.=<<___;
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	 insrdi	$t4,$t7,16,0		; 64..127 bits
	 srdi	$carry,$t7,16		; upper 33 bits
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	 adde	$t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t4,$t4,32,0
	extrdi	$t2,$t2,32,0
	adde	$t4,$t4,$t2
___
$code.=<<___;
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	 addze	$carry,$carry
	 std	$t3,-16($tp)		; tp[j-1]
	 std	$t5,-8($tp)		; tp[j]
___
} else {
$code.=<<___;
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	 lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	 lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	 lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	 lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	 lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	 lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	 addc	$t0,$t0,$carry
	 adde	$t1,$t1,$c1
	 srwi	$carry,$t0,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	 lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	 lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	 srwi	$c1,$t1,16
	 insrwi	$carry,$t1,16,0

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	 addc	$t2,$t2,$carry
	 adde	$t3,$t3,$c1
	 srwi	$carry,$t2,16
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	 insrwi	$t0,$t2,16,0		; 0..31 bits
	 srwi	$c1,$t3,16
	 insrwi	$carry,$t3,16,0
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	 lwz	$t2,12($tp)		; tp[j]
	 lwz	$t3,8($tp)
	 addc	$t4,$t4,$carry
	 adde	$t5,$t5,$c1
	 srwi	$carry,$t4,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	 srwi	$c1,$t5,16
	 insrwi	$carry,$t5,16,0

	fctid	$T0a,$T0a
	 addc	$t6,$t6,$carry
	 adde	$t7,$t7,$c1
	 srwi	$carry,$t6,16
	fctid	$T0b,$T0b
	 insrwi	$t4,$t6,16,0		; 32..63 bits
	 srwi	$c1,$t7,16
	 insrwi	$carry,$t7,16,0
	fctid	$T1a,$T1a
	 addc	$t0,$t0,$t2
	 adde	$t4,$t4,$t3
	 lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	 lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	fctid	$T1b,$T1b
	 addze	$carry,$carry
	 addze	$c1,$c1
	 stw	$t0,4($tp)		; tp[j-1]
	 stw	$t4,0($tp)
	fctid	$T2a,$T2a
	 addc	$t2,$t2,$carry
	 adde	$t3,$t3,$c1
	 srwi	$carry,$t2,16
	 lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	 lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	fctid	$T2b,$T2b
	 srwi	$c1,$t3,16
	 insrwi	$carry,$t3,16,0
	 lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	 lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	fctid	$T3a,$T3a
	 addc	$t6,$t6,$carry
	 adde	$t7,$t7,$c1
	 srwi	$carry,$t6,16
	 lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	 lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
	fctid	$T3b,$T3b

	 insrwi	$t2,$t6,16,0		; 64..95 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	 lwz	$t6,20($tp)
	 lwzu	$t7,16($tp)
	addc	$t0,$t0,$carry
	 stfd	$T0a,`$FRAME+0`($sp)
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	 stfd	$T0b,`$FRAME+8`($sp)
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	 stfd	$T1a,`$FRAME+16`($sp)
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	 insrwi	$t0,$t4,16,0		; 96..127 bits
	 stfd	$T1b,`$FRAME+24`($sp)
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16

	addc	$t2,$t2,$t6
	 stfd	$T2a,`$FRAME+32`($sp)
	adde	$t0,$t0,$t7
	 stfd	$T2b,`$FRAME+40`($sp)
	addze	$carry,$carry
	 stfd	$T3a,`$FRAME+48`($sp)
	addze	$c1,$c1
	 stfd	$T3b,`$FRAME+56`($sp)
	 stw	$t2,-4($tp)		; tp[j]
	 stw	$t0,-8($tp)
___
}
$code.=<<___;
	bdnz	Linner

	fctid	$dota,$dota
	fctid	$dotb,$dotb
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	add	$t0,$t0,$carry		; can not overflow
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	insrdi	$t0,$t1,16,32
	add	$t2,$t2,$carry
	ld	$t1,8($tp)		; tp[j]
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	ldu	$t2,16($tp)		; tp[j+1]
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0		; 0..63 bits
	add	$t4,$t4,$carry
	srdi	$carry,$t4,16
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits
	ld	$t6,`$FRAME+64`($sp)
	ld	$t7,`$FRAME+72`($sp)

	addc	$t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t0,$t0,32,0
	extrdi	$t1,$t1,32,0
	adde	$t0,$t0,$t1
___
$code.=<<___;
	adde	$t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t4,$t4,32,0
	extrdi	$t2,$t2,32,0
	adde	$t4,$t4,$t2
___
$code.=<<___;
	addze	$carry,$carry

	std	$t3,-16($tp)		; tp[j-1]
	std	$t5,-8($tp)		; tp[j]

	add	$carry,$carry,$ovf	; consume upmost overflow
	add	$t6,$t6,$carry		; can not overflow
	srdi	$carry,$t6,16
	add	$t7,$t7,$carry
	insrdi	$t6,$t7,48,0
	srdi	$ovf,$t7,48
	std	$t6,0($tp)		; tp[num-1]
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	 insrwi	$t0,$t2,16,0		; 0..31 bits
	 lwz	$t2,12($tp)		; tp[j]
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	 lwz	$t3,8($tp)
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	 insrwi	$t4,$t6,16,0		; 32..63 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16

	addc	$t0,$t0,$t2
	adde	$t4,$t4,$t3
	addze	$carry,$carry
	addze	$c1,$c1
	 stw	$t0,4($tp)		; tp[j-1]
	 stw	$t4,0($tp)

	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6

	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	 insrwi	$t2,$t6,16,0		; 64..95 bits
	 lwz	$t6,20($tp)
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	 lwzu	$t7,16($tp)
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	 insrwi	$t0,$t4,16,0		; 96..127 bits
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16

	addc	$t2,$t2,$t6
	adde	$t0,$t0,$t7
	 lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
	 lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
	addze	$carry,$carry
	addze	$c1,$c1
	 lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
	 lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)

	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	 stw	$t2,-4($tp)		; tp[j]
	 stw	$t0,-8($tp)
	addc	$t6,$t6,$ovf
	addze	$t7,$t7
	srwi	$carry,$t6,16
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1

	insrwi	$t6,$t4,16,0
	srwi	$t4,$t4,16
	insrwi	$t4,$t5,16,0
	srwi	$ovf,$t5,16
	stw	$t6,4($tp)		; tp[num-1]
	stw	$t4,0($tp)
___
}
$code.=<<___;
	slwi	$t7,$num,2
	addi	$i,$i,8
	subf	$nap_d,$t7,$nap_d	; rewind pointer
	cmpw	$i,$num
	blt-	Louter
___

$code.=<<___ if ($SIZE_T==8);
	subf	$np,$num,$np	; rewind np
	addi	$j,$j,1		; restore counter
	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,`$FRAME+$TRANSFER+8`
	addi	$t4,$sp,`$FRAME+$TRANSFER+16`
	addi	$t5,$np,8
	addi	$t6,$rp,8
	mtctr	$j

.align	4
Lsub:	ldx	$t0,$tp,$i
	ldx	$t1,$np,$i
	ldx	$t2,$t4,$i
	ldx	$t3,$t5,$i
	subfe	$t0,$t1,$t0	; tp[j]-np[j]
	subfe	$t2,$t3,$t2	; tp[j+1]-np[j+1]
	stdx	$t0,$rp,$i
	stdx	$t2,$t6,$i
	addi	$i,$i,16
	bdnz	Lsub

	li	$i,0
	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
	mtctr	$j

.align	4
Lcopy:				; conditional copy
	ldx	$t0,$tp,$i
	ldx	$t1,$t4,$i
	ldx	$t2,$rp,$i
	ldx	$t3,$t6,$i
	std	$i,8($nap_d)	; zap nap_d
	std	$i,16($nap_d)
	std	$i,24($nap_d)
	std	$i,32($nap_d)
	std	$i,40($nap_d)
	std	$i,48($nap_d)
	std	$i,56($nap_d)
	stdu	$i,64($nap_d)
	and	$t0,$t0,$ovf
	and	$t1,$t1,$ovf
	andc	$t2,$t2,$ovf
	andc	$t3,$t3,$ovf
	or	$t0,$t0,$t2
	or	$t1,$t1,$t3
	stdx	$t0,$rp,$i
	stdx	$t1,$t6,$i
	stdx	$i,$tp,$i	; zap tp at once
	stdx	$i,$t4,$i
	addi	$i,$i,16
	bdnz	Lcopy
___
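# Lsub computes tp-np; Lcopy then selects either tp or tp-np with an
# and/andc/or mask derived from the final borrow, a branch-free
# (constant-time) choice, while zapping tp and nap_d so no secret
# intermediates are left behind on the stack.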
$code.=<<___ if ($SIZE_T==4);
	subf	$np,$num,$np	; rewind np
	addi	$j,$j,1		; restore counter
	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	addi	$np,$np,-4
	addi	$rp,$rp,-4
	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
	mtctr	$j

.align	4
Lsub:	lwz	$t0,12($tp)	; load tp[j..j+3] in 64-bit word order
	lwz	$t1,8($tp)
	lwz	$t2,20($tp)
	lwzu	$t3,16($tp)
	lwz	$t4,4($np)	; load np[j..j+3] in 32-bit word order
	lwz	$t5,8($np)
	lwz	$t6,12($np)
	lwzu	$t7,16($np)
	subfe	$t4,$t4,$t0	; tp[j]-np[j]
	 stw	$t0,4($ap)	; save tp[j..j+3] in 32-bit word order
	subfe	$t5,$t5,$t1	; tp[j+1]-np[j+1]
	 stw	$t1,8($ap)
	subfe	$t6,$t6,$t2	; tp[j+2]-np[j+2]
	 stw	$t2,12($ap)
	subfe	$t7,$t7,$t3	; tp[j+3]-np[j+3]
	 stwu	$t3,16($ap)
	stw	$t4,4($rp)
	stw	$t5,8($rp)
	stw	$t6,12($rp)
	stwu	$t7,16($rp)
	bdnz	Lsub

	li	$i,0
	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
	subf	$rp,$num,$rp	; rewind rp
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	mtctr	$j

.align	4
Lcopy:				; conditional copy
	lwz	$t0,4($ap)
	lwz	$t1,8($ap)
	lwz	$t2,12($ap)
	lwzu	$t3,16($ap)
	lwz	$t4,4($rp)
	lwz	$t5,8($rp)
	lwz	$t6,12($rp)
	lwz	$t7,16($rp)
	std	$i,8($nap_d)	; zap nap_d
	std	$i,16($nap_d)
	std	$i,24($nap_d)
	std	$i,32($nap_d)
	std	$i,40($nap_d)
	std	$i,48($nap_d)
	std	$i,56($nap_d)
	stdu	$i,64($nap_d)
	and	$t0,$t0,$ovf
	and	$t1,$t1,$ovf
	and	$t2,$t2,$ovf
	and	$t3,$t3,$ovf
	andc	$t4,$t4,$ovf
	andc	$t5,$t5,$ovf
	andc	$t6,$t6,$ovf
	andc	$t7,$t7,$ovf
	or	$t0,$t0,$t4
	or	$t1,$t1,$t5
	or	$t2,$t2,$t6
	or	$t3,$t3,$t7
	stw	$t0,4($rp)
	stw	$t1,8($rp)
	stw	$t2,12($rp)
	stwu	$t3,16($rp)
	std	$i,8($tp)	; zap tp at once
	stdu	$i,16($tp)
	bdnz	Lcopy
___

$code.=<<___;
	$POP	$i,0($sp)
	li	r3,1	; signal "handled"
	$POP	r19,`-12*8-13*$SIZE_T`($i)
	$POP	r20,`-12*8-12*$SIZE_T`($i)
	$POP	r21,`-12*8-11*$SIZE_T`($i)
	$POP	r22,`-12*8-10*$SIZE_T`($i)
	$POP	r23,`-12*8-9*$SIZE_T`($i)
	$POP	r24,`-12*8-8*$SIZE_T`($i)
	$POP	r25,`-12*8-7*$SIZE_T`($i)
	$POP	r26,`-12*8-6*$SIZE_T`($i)
	$POP	r27,`-12*8-5*$SIZE_T`($i)
	$POP	r28,`-12*8-4*$SIZE_T`($i)
	$POP	r29,`-12*8-3*$SIZE_T`($i)
	$POP	r30,`-12*8-2*$SIZE_T`($i)
	$POP	r31,`-12*8-1*$SIZE_T`($i)
	lfd	f20,`-12*8`($i)
	lfd	f21,`-11*8`($i)
	lfd	f22,`-10*8`($i)
	lfd	f23,`-9*8`($i)
	lfd	f24,`-8*8`($i)
	lfd	f25,`-7*8`($i)
	lfd	f26,`-6*8`($i)
	lfd	f27,`-5*8`($i)
	lfd	f28,`-4*8`($i)
	lfd	f29,`-3*8`($i)
	lfd	f30,`-2*8`($i)
	lfd	f31,`-1*8`($i)
	mr	$sp,$i
	blr
	.long	0
	.byte	0,12,4,0,0x8c,13,6,0
	.long	0
.size	.$fname,.-.$fname

.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

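# Usage note (an editorial addition, not part of the upstream file):
# the script is typically driven through ppc-xlate.pl, with the
# flavour argument (e.g. linux32, linux64, linux64le, aix64, osx64)
# selecting ABI and mnemonics and the remaining argument handed to
# the translator, e.g.:
#
#	perl ppc64-mont.pl linux64 ppc64-mont.s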