189837Skris.explicit
289837Skris.text
3142425Snectar.ident	"ia64.S, Version 2.1"
489837Skris.ident	"IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
589837Skris
689837Skris//
789837Skris// ====================================================================
889837Skris// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
989837Skris// project.
1089837Skris//
1189837Skris// Rights for redistribution and usage in source and binary forms are
1289837Skris// granted according to the OpenSSL license. Warranty of any kind is
1389837Skris// disclaimed.
1489837Skris// ====================================================================
1589837Skris//
16111147Snectar// Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is
17111147Snectar// different from Itanium to this module viewpoint. Most notably, is it
18111147Snectar// "wider" than Itanium? Can you experience loop scalability as
19111147Snectar// discussed in commentary sections? Not really:-( Itanium2 has 6
20111147Snectar// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
21111147Snectar// spin twice as fast, as I need 8 IALU ports. Amount of floating point
22111147Snectar// ports is the same, i.e. 2, while I need 4. In other words, to this
23111147Snectar// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
24111147Snectar// essentially different in respect to this module, and a re-tune was
25111147Snectar// required. Well, because some instruction latencies have changed. Most
26111147Snectar// noticeably those intensively used:
27111147Snectar//
28111147Snectar//			Itanium	Itanium2
29111147Snectar//	ldf8		9	6		L2 hit
30111147Snectar//	ld8		2	1		L1 hit
31111147Snectar//	getf		2	5
32111147Snectar//	xma[->getf]	7[+1]	4[+0]
33111147Snectar//	add[->st8]	1[+1]	1[+0]
34111147Snectar//
35111147Snectar// What does it mean? You might ratiocinate that the original code
36111147Snectar// should run just faster... Because sum of latencies is smaller...
37111147Snectar// Wrong! Note that getf latency increased. This means that if a loop is
38142425Snectar// scheduled for lower latency (as they were), then it will suffer from
39111147Snectar// stall condition and the code will therefore turn anti-scalable, e.g.
40111147Snectar// original bn_mul_words spun at 5*n or 2.5 times slower than expected
41111147Snectar// on Itanium2! What to do? Reschedule loops for Itanium2? But then
42111147Snectar// Itanium would exhibit anti-scalability. So I've chosen to reschedule
43111147Snectar// for worst latency for every instruction aiming for best *all-round*
44111147Snectar// performance.
4589837Skris
4689837Skris// Q.	How much faster does it get?
4789837Skris// A.	Here is the output from 'openssl speed rsa dsa' for vanilla
4889837Skris//	0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
4989837Skris//	Linux 7.1 2.96-81):
5089837Skris//
5189837Skris//	                  sign    verify    sign/s verify/s
5289837Skris//	rsa  512 bits   0.0036s   0.0003s    275.3   2999.2
5389837Skris//	rsa 1024 bits   0.0203s   0.0011s     49.3    894.1
5489837Skris//	rsa 2048 bits   0.1331s   0.0040s      7.5    250.9
5589837Skris//	rsa 4096 bits   0.9270s   0.0147s      1.1     68.1
5689837Skris//	                  sign    verify    sign/s verify/s
5789837Skris//	dsa  512 bits   0.0035s   0.0043s    288.3    234.8
5889837Skris//	dsa 1024 bits   0.0111s   0.0135s     90.0     74.2
5989837Skris//
6089837Skris//	And here is similar output but for this assembler
6189837Skris//	implementation:-)
6289837Skris//
6389837Skris//	                  sign    verify    sign/s verify/s
6489837Skris//	rsa  512 bits   0.0021s   0.0001s    549.4   9638.5
6589837Skris//	rsa 1024 bits   0.0055s   0.0002s    183.8   4481.1
6689837Skris//	rsa 2048 bits   0.0244s   0.0006s     41.4   1726.3
6789837Skris//	rsa 4096 bits   0.1295s   0.0018s      7.7    561.5
6889837Skris//	                  sign    verify    sign/s verify/s
6989837Skris//	dsa  512 bits   0.0012s   0.0013s    891.9    756.6
7089837Skris//	dsa 1024 bits   0.0023s   0.0028s    440.4    376.2
7189837Skris//
7289837Skris//	Yes, you may argue that it's not fair comparison as it's
7389837Skris//	possible to craft the C implementation with BN_UMULT_HIGH
7489837Skris//	inline assembler macro. But of course! Here is the output
7589837Skris//	with the macro:
7689837Skris//
7789837Skris//	                  sign    verify    sign/s verify/s
7889837Skris//	rsa  512 bits   0.0020s   0.0002s    495.0   6561.0
7989837Skris//	rsa 1024 bits   0.0086s   0.0004s    116.2   2235.7
8089837Skris//	rsa 2048 bits   0.0519s   0.0015s     19.3    667.3
8189837Skris//	rsa 4096 bits   0.3464s   0.0053s      2.9    187.7
8289837Skris//	                  sign    verify    sign/s verify/s
8389837Skris//	dsa  512 bits   0.0016s   0.0020s    613.1    510.5
8489837Skris//	dsa 1024 bits   0.0045s   0.0054s    221.0    183.9
8589837Skris//
8689837Skris//	My code is still way faster, huh:-) And I believe that even
8789837Skris//	higher performance can be achieved. Note that as keys get
8889837Skris//	longer, performance gain is larger. Why? According to the
8989837Skris//	profiler there is another player in the field, namely
9089837Skris//	BN_from_montgomery consuming larger and larger portion of CPU
9189837Skris//	time as keysize decreases. I therefore consider putting effort
9289837Skris//	to assembler implementation of the following routine:
9389837Skris//
9489837Skris//	void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
9589837Skris//	{
9689837Skris//	int      i,j;
9789837Skris//	BN_ULONG v;
9889837Skris//
9989837Skris//	for (i=0; i<nl; i++)
10089837Skris//		{
10189837Skris//		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
10289837Skris//		nrp++;
10389837Skris//		rp++;
10489837Skris//		if (((nrp[-1]+=v)&BN_MASK2) < v)
10589837Skris//			for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
10689837Skris//		}
10789837Skris//	}
10889837Skris//
10989837Skris//	It might as well be beneficial to implement even combaX
11089837Skris//	variants, as it appears as it can literally unleash the
11189837Skris//	performance (see comment section to bn_mul_comba8 below).
11289837Skris//
11389837Skris//	And finally for your reference the output for 0.9.6a compiled
11489837Skris//	with SGIcc version 0.01.0-12 (keep in mind that for the moment
11589837Skris//	of this writing it's not possible to convince SGIcc to use
11689837Skris//	BN_UMULT_HIGH inline assembler macro, yet the code is fast,
11789837Skris//	i.e. for a compiler generated one:-):
11889837Skris//
11989837Skris//	                  sign    verify    sign/s verify/s
12089837Skris//	rsa  512 bits   0.0022s   0.0002s    452.7   5894.3
12189837Skris//	rsa 1024 bits   0.0097s   0.0005s    102.7   2002.9
12289837Skris//	rsa 2048 bits   0.0578s   0.0017s     17.3    600.2
12389837Skris//	rsa 4096 bits   0.3838s   0.0061s      2.6    164.5
12489837Skris//	                  sign    verify    sign/s verify/s
12589837Skris//	dsa  512 bits   0.0018s   0.0022s    547.3    459.6
12689837Skris//	dsa 1024 bits   0.0051s   0.0062s    196.6    161.3
12789837Skris//
12889837Skris//	Oh! Benchmarks were performed on 733MHz Lion-class Itanium
12989837Skris//	system running Redhat Linux 7.1 (very special thanks to Ray
13089837Skris//	McCaffity of Williams Communications for providing an account).
13189837Skris//
13289837Skris// Q.	What's the heck with 'rum 1<<5' at the end of every function?
13389837Skris// A.	Well, by clearing the "upper FP registers written" bit of the
13489837Skris//	User Mask I want to excuse the kernel from preserving upper
13589837Skris//	(f32-f127) FP register bank over process context switch, thus
13689837Skris//	minimizing bus bandwidth consumption during the switch (i.e.
13789837Skris//	after PKI operation completes and the program is off doing
13889837Skris//	something else like bulk symmetric encryption). Having said
13989837Skris//	this, I also want to point out that it might be good idea
14089837Skris//	to compile the whole toolkit (as well as majority of the
14189837Skris//	programs for that matter) with -mfixed-range=f32-f127 command
14289837Skris//	line option. No, it doesn't prevent the compiler from writing
14389837Skris//	to upper bank, but at least discourages to do so. If you don't
14489837Skris//	like the idea you have the option to compile the module with
14589837Skris//	-Drum=nop.m in command line.
14689837Skris//
14789837Skris
148142425Snectar#if defined(_HPUX_SOURCE) && !defined(_LP64)
149142425Snectar#define	ADDP	addp4
150142425Snectar#else
151142425Snectar#define	ADDP	add
152142425Snectar#endif
153142425Snectar
15489837Skris#if 1
15589837Skris//
15689837Skris// bn_[add|sub]_words routines.
15789837Skris//
15889837Skris// Loops are spinning in 2*(n+5) ticks on Itanium (provided that the
15989837Skris// data reside in L1 cache, i.e. 2 ticks away). It's possible to
16089837Skris// compress the epilogue and get down to 2*n+6, but at the cost of
16189837Skris// scalability (the neat feature of this implementation is that it
16289837Skris// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
16389837Skris// I consider that the epilogue is short enough as it is to trade tiny
16489837Skris// performance loss on Itanium for scalability.
16589837Skris//
16689837Skris// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
16789837Skris//
// bn_add_words -- add two num-word vectors, propagating the carry.
//   in:  r32 = rp, r33 = ap, r34 = bp, r35 = num (compared as 32-bit int)
//   out: r8  = final carry; 0 is returned at once when num <= 0
// ar.lc and pr are saved in r3 and r9 and put back before returning.
16889837Skris.global	bn_add_words#
16989837Skris.proc	bn_add_words#
17089837Skris.align	64
17189837Skris.skip	32	// makes the loop body aligned at 64-byte boundary
17289837Skrisbn_add_words:
17389837Skris	.prologue
17489837Skris	.save	ar.pfs,r2
// Frame: 4 inputs, 12 locals, no outputs, 16 rotating registers.
// p6 = (num <= 0) selects the early return with r8 = 0.
17589837Skris{ .mii;	alloc		r2=ar.pfs,4,12,0,16
17689837Skris	cmp4.le		p6,p0=r35,r0	};;
17789837Skris{ .mfb;	mov		r8=r0			// return value
17889837Skris(p6)	br.ret.spnt.many	b0	};;
17989837Skris
// r10 = num-1 becomes the loop count. The three pointers are copied to
// r14-r16 because r32 and up are rotated (renamed) by the loop below.
180194206Ssimon{ .mib;	sub		r10=r35,r0,1
18189837Skris	.save	ar.lc,r3
18289837Skris	mov		r3=ar.lc
18389837Skris	brp.loop.imp	.L_bn_add_words_ctop,.L_bn_add_words_cend-16
18489837Skris					}
185142425Snectar{ .mib;	ADDP		r14=0,r32		// rp
186194206Ssimon	.save	pr,r9
18789837Skris	mov		r9=pr		};;
188194206Ssimon	.body
// ar.ec=6 gives the epilogue count for draining the pipeline, and
// pr.rot=1<<16 starts the loop with only p16 (the first stage) set.
189142425Snectar{ .mii;	ADDP		r15=0,r33		// ap
19089837Skris	mov		ar.lc=r10
19189837Skris	mov		ar.ec=6		}
192142425Snectar{ .mib;	ADDP		r16=0,r34		// bp
19389837Skris	mov		pr.rot=1<<16	};;
19489837Skris
// Pipeline stages, selected by the rotating predicates:
//   (p16) load b and a;  (p18) sum = a + b;  (p19) p56 = (sum < a), i.e.
//   carry out of the plain addition;  (p21) store the finished word.
// (p58) is the carry out of the previous word as seen by the word that is
// in stage p20: it adds 1 to that sum and, if the sum was all ones, ORs a
// new carry into p57 (the carry of the current word).
19589837Skris.L_bn_add_words_ctop:
19689837Skris{ .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
19789837Skris	(p18)	add		r39=r37,r34
19889837Skris	(p19)	cmp.ltu.unc	p56,p0=r40,r38	}
19989837Skris{ .mfb;	(p0)	nop.m		0x0
20089837Skris	(p0)	nop.f		0x0
20189837Skris	(p0)	nop.b		0x0		}
20289837Skris{ .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
20389837Skris	(p58)	cmp.eq.or	p57,p0=-1,r41	  // (p20)
20489837Skris	(p58)	add		r41=1,r41	} // (p20)
20589837Skris{ .mfb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
20689837Skris	(p0)	nop.f		0x0
20789837Skris	br.ctop.sptk	.L_bn_add_words_ctop	};;
20889837Skris.L_bn_add_words_cend:
20989837Skris
// After the loop p59 holds the carry out of the last word; it becomes the
// return value. Then the caller predicates and ar.lc are restored.
21089837Skris{ .mii;
21189837Skris(p59)	add		r8=1,r8		// return value
212111147Snectar	mov		pr=r9,0x1ffff
21389837Skris	mov		ar.lc=r3	}
21489837Skris{ .mbb;	nop.b		0x0
21589837Skris	br.ret.sptk.many	b0	};;
21689837Skris.endp	bn_add_words#
21789837Skris
21889837Skris//
21989837Skris// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
22089837Skris//
// bn_sub_words -- subtract two num-word vectors, propagating the borrow.
//   in:  r32 = rp, r33 = ap, r34 = bp, r35 = num (compared as 32-bit int)
//   out: r8  = final borrow; 0 is returned at once when num <= 0
// ar.lc and pr are saved in r3 and r9 and put back before returning.
22189837Skris.global	bn_sub_words#
22289837Skris.proc	bn_sub_words#
22389837Skris.align	64
22489837Skris.skip	32	// makes the loop body aligned at 64-byte boundary
22589837Skrisbn_sub_words:
22689837Skris	.prologue
22789837Skris	.save	ar.pfs,r2
// Frame: 4 inputs, 12 locals, no outputs, 16 rotating registers.
// p6 = (num <= 0) selects the early return with r8 = 0.
22889837Skris{ .mii;	alloc		r2=ar.pfs,4,12,0,16
22989837Skris	cmp4.le		p6,p0=r35,r0	};;
23089837Skris{ .mfb;	mov		r8=r0			// return value
23189837Skris(p6)	br.ret.spnt.many	b0	};;
23289837Skris
// r10 = num-1 becomes the loop count. The three pointers are copied to
// r14-r16 because r32 and up are rotated (renamed) by the loop below.
233194206Ssimon{ .mib;	sub		r10=r35,r0,1
23489837Skris	.save	ar.lc,r3
23589837Skris	mov		r3=ar.lc
23689837Skris	brp.loop.imp	.L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
23789837Skris					}
238142425Snectar{ .mib;	ADDP		r14=0,r32		// rp
239194206Ssimon	.save	pr,r9
24089837Skris	mov		r9=pr		};;
241194206Ssimon	.body
// ar.ec=6 gives the epilogue count for draining the pipeline, and
// pr.rot=1<<16 starts the loop with only p16 (the first stage) set.
242142425Snectar{ .mii;	ADDP		r15=0,r33		// ap
24389837Skris	mov		ar.lc=r10
24489837Skris	mov		ar.ec=6		}
245142425Snectar{ .mib;	ADDP		r16=0,r34		// bp
24689837Skris	mov		pr.rot=1<<16	};;
24789837Skris
// Pipeline stages, selected by the rotating predicates:
//   (p16) load b and a;  (p18) diff = a - b;  (p19) p56 = (diff > a), i.e.
//   borrow out of the plain subtraction;  (p21) store the finished word.
// (p58) is the borrow out of the previous word as seen by the word that is
// in stage p20: it subtracts 1 from that difference and, if the difference
// was zero, ORs a new borrow into p57 (the borrow of the current word).
24889837Skris.L_bn_sub_words_ctop:
24989837Skris{ .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
25089837Skris	(p18)	sub		r39=r37,r34
25189837Skris	(p19)	cmp.gtu.unc	p56,p0=r40,r38	}
25289837Skris{ .mfb;	(p0)	nop.m		0x0
25389837Skris	(p0)	nop.f		0x0
25489837Skris	(p0)	nop.b		0x0		}
25589837Skris{ .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
25689837Skris	(p58)	cmp.eq.or	p57,p0=0,r41	  // (p20)
25789837Skris	(p58)	add		r41=-1,r41	} // (p20)
25889837Skris{ .mbb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
25989837Skris	(p0)	nop.b		0x0
26089837Skris	br.ctop.sptk	.L_bn_sub_words_ctop	};;
26189837Skris.L_bn_sub_words_cend:
26289837Skris
// After the loop p59 holds the borrow out of the last word; it becomes the
// return value. Then the caller predicates and ar.lc are restored.
26389837Skris{ .mii;
26489837Skris(p59)	add		r8=1,r8		// return value
265111147Snectar	mov		pr=r9,0x1ffff
26689837Skris	mov		ar.lc=r3	}
26789837Skris{ .mbb;	nop.b		0x0
26889837Skris	br.ret.sptk.many	b0	};;
26989837Skris.endp	bn_sub_words#
27089837Skris#endif
27189837Skris
27289837Skris#if 0
27389837Skris#define XMA_TEMPTATION
27489837Skris#endif
27589837Skris
27689837Skris#if 1
27789837Skris//
27889837Skris// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
27989837Skris//
// bn_mul_words -- multiply num words of ap by the single word w.
//   in:  r32 = rp, r33 = ap, r34 = num (compared as 32-bit int), r35 = w
//   out: r8  = r36 plus the pending carry after the loop (presumably the
//              top carry word of the product -- confirm against the C
//              bn_mul_words); 0 is returned at once when num <= 0
// ar.lc and pr are saved in r3 and r9 and put back before returning.
// Two loop bodies exist; the XMA_TEMPTATION one is only built when that
// macro is defined.
28089837Skris.global	bn_mul_words#
28189837Skris.proc	bn_mul_words#
28289837Skris.align	64
28389837Skris.skip	32	// makes the loop body aligned at 64-byte boundary
28489837Skrisbn_mul_words:
28589837Skris	.prologue
28689837Skris	.save	ar.pfs,r2
// The production loop needs 12 locals with 16 rotating general registers;
// the XMA_TEMPTATION loop keeps everything in FP registers and needs none.
28789837Skris#ifdef XMA_TEMPTATION
28889837Skris{ .mfi;	alloc		r2=ar.pfs,4,0,0,0	};;
28989837Skris#else
290111147Snectar{ .mfi;	alloc		r2=ar.pfs,4,12,0,16	};;
29189837Skris#endif
// p6 = (num <= 0) selects the early return with r8 = 0.
29289837Skris{ .mib;	mov		r8=r0			// return value
29389837Skris	cmp4.le		p6,p0=r34,r0
29489837Skris(p6)	br.ret.spnt.many	b0		};;
29589837Skris
// r10 = num-1 becomes the loop count.
296194206Ssimon{ .mii;	sub	r10=r34,r0,1
29789837Skris	.save	ar.lc,r3
29889837Skris	mov	r3=ar.lc
299194206Ssimon	.save	pr,r9
30089837Skris	mov	r9=pr			};;
30189837Skris
30289837Skris	.body
// w is moved to f8 for the FP multiplier. 0x800001<<16 sets p16 (first
// stage) and p39 in the rotating predicates.
30389837Skris{ .mib;	setf.sig	f8=r35	// w
304111147Snectar	mov		pr.rot=0x800001<<16
305111147Snectar			// ------^----- serves as (p50) at first (p27)
30689837Skris	brp.loop.imp	.L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
30789837Skris					}
30889837Skris
30989837Skris#ifndef XMA_TEMPTATION
31089837Skris
// rp and ap are copied to r14/r15 because r32 and up rotate in the loop.
311142425Snectar{ .mmi;	ADDP		r14=0,r32	// rp
312142425Snectar	ADDP		r15=0,r33	// ap
31389837Skris	mov		ar.lc=r10	}
314142425Snectar{ .mmi;	mov		r40=0		// serves as r35 at first (p27)
315111147Snectar	mov		ar.ec=13	};;
31689837Skris
317111147Snectar// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
318111147Snectar// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
31989837Skris// bypass L1 cache and L2 latency is actually best-case scenario for
320111147Snectar// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
321111147Snectar// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
32289837Skris// would give us ~5% in *overall* performance improvement on "wider"
32389837Skris// IA-64, but would hurt Itanium for about same because of longer
32489837Skris// epilogue. As it's a matter of few percents in either case I've
32589837Skris// chosen to trade the scalability for development time (you can see
32689837Skris// this very instruction sequence in bn_mul_add_words loop which in
32789837Skris// turn is scalable).
//
// Pipeline stages, selected by the rotating predicates:
//   (p16) load a word of ap into an FP register;
//   (p21) xmpy.lu / xmpy.hu form the low and high halves of word * w;
//   (p25) getf.sig moves both halves to general registers;
//   (p50)/(p54), mutually exclusive, add two of those words without or
//         with a carry of 1;
//   (p28) cmp.ltu derives the next p54/p50 pair and st8 stores the word.
32889837Skris.L_bn_mul_words_ctop:
329111147Snectar{ .mfi;	(p25)	getf.sig	r36=f52			// low
330111147Snectar	(p21)	xmpy.lu		f48=f37,f8
331111147Snectar	(p28)	cmp.ltu		p54,p50=r41,r39	}
33289837Skris{ .mfi;	(p16)	ldf8		f32=[r15],8
333111147Snectar	(p21)	xmpy.hu		f40=f37,f8
33489837Skris	(p0)	nop.i		0x0		};;
335111147Snectar{ .mii;	(p25)	getf.sig	r32=f44			// high
336111147Snectar	.pred.rel	"mutex",p50,p54
337111147Snectar	(p50)	add		r40=r38,r35		// (p27)
338111147Snectar	(p54)	add		r40=r38,r35,1	}	// (p27)
339111147Snectar{ .mfb;	(p28)	st8		[r14]=r41,8
34089837Skris	(p0)	nop.f		0x0
34189837Skris	br.ctop.sptk	.L_bn_mul_words_ctop	};;
34289837Skris.L_bn_mul_words_cend:
34389837Skris
// Return value: r36, plus 1 when the carry predicate p55 is set (p51 and
// p55 are the p50/p54 pair after one more rotation).
34489837Skris{ .mii;	nop.m		0x0
345111147Snectar.pred.rel	"mutex",p51,p55
346111147Snectar(p51)	add		r8=r36,r0
347111147Snectar(p55)	add		r8=r36,r0,1	}
34889837Skris{ .mfb;	nop.m	0x0
34989837Skris	nop.f	0x0
35089837Skris	nop.b	0x0			}
35189837Skris
35289837Skris#else	// XMA_TEMPTATION
35389837Skris
35489837Skris	setf.sig	f37=r0	// serves as carry at (p18) tick
35589837Skris	mov		ar.lc=r10
35689837Skris	mov		ar.ec=5;;
35789837Skris
35889837Skris// Most of you examining this code very likely wonder why in the name
35989837Skris// of Intel the following loop is commented out? Indeed, it looks so
36089837Skris// neat that you find it hard to believe that it's something wrong
36189837Skris// with it, right? The catch is that every iteration depends on the
36289837Skris// result from previous one and the latter isn't available instantly.
36389837Skris// The loop therefore spins at the latency of xma minus 1, or in other
36489837Skris// words at 6*(n+4) ticks:-( Compare to the "production" loop above
36589837Skris// that runs in 2*(n+12) where the low latency problem is worked around
36689837Skris// by moving the dependency to one-tick latent integer ALU. Note that
36789837Skris// "distance" between ldf8 and xma is not latency of ldf8, but the
36889837Skris// *difference* between xma and ldf8 latencies.
36989837Skris.L_bn_mul_words_ctop:
37089837Skris{ .mfi;	(p16)	ldf8		f32=[r33],8
37189837Skris	(p18)	xma.hu		f38=f34,f8,f39	}
37289837Skris{ .mfb;	(p20)	stf8		[r32]=f37,8
37389837Skris	(p18)	xma.lu		f35=f34,f8,f39
37489837Skris	br.ctop.sptk	.L_bn_mul_words_ctop	};;
37589837Skris.L_bn_mul_words_cend:
37689837Skris
37789837Skris	getf.sig	r8=f41		// the return value
37889837Skris
37989837Skris#endif	// XMA_TEMPTATION
38089837Skris
// Common exit: restore the caller predicates and ar.lc, then clear um.mfh
// so the upper FP bank is not flagged as written.
38189837Skris{ .mii;	nop.m		0x0
382111147Snectar	mov		pr=r9,0x1ffff
38389837Skris	mov		ar.lc=r3	}
38489837Skris{ .mfb;	rum		1<<5		// clear um.mfh
38589837Skris	nop.f		0x0
38689837Skris	br.ret.sptk.many	b0	};;
38789837Skris.endp	bn_mul_words#
38889837Skris#endif
38989837Skris
39089837Skris#if 1
39189837Skris//
39289837Skris// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
39389837Skris//
// bn_mul_add_words -- multiply num words of ap by w and accumulate the
// products into the words already stored at rp.
//   in:  r32 = rp, r33 = ap, r34 = num (compared as 32-bit int), r35 = w
//   out: r8  = r35 (rotated, not the input w) plus the pending carry after
//              the loop (presumably the top carry word -- confirm against
//              the C bn_mul_add_words); 0 at once when num <= 0
// ar.lc and pr are saved in r3 and r9 and put back before returning.
39489837Skris.global	bn_mul_add_words#
39589837Skris.proc	bn_mul_add_words#
39689837Skris.align	64
397142425Snectar.skip	48	// makes the loop body aligned at 64-byte boundary
39889837Skrisbn_mul_add_words:
39989837Skris	.prologue
40089837Skris	.save	ar.pfs,r2
// Frame: 4 inputs, 4 locals, no outputs, 8 rotating registers.
// p6 = (num <= 0) selects the early return with r8 = 0; r10 = num-1
// becomes the loop count.
401142425Snectar{ .mmi;	alloc		r2=ar.pfs,4,4,0,8
402142425Snectar	cmp4.le		p6,p0=r34,r0
403194206Ssimon	.save	ar.lc,r3
404142425Snectar	mov		r3=ar.lc	};;
405142425Snectar{ .mib;	mov		r8=r0		// return value
406142425Snectar	sub		r10=r34,r0,1
40789837Skris(p6)	br.ret.spnt.many	b0	};;
40889837Skris
// w is moved to f8 for the FP multiplier.
409142425Snectar{ .mib;	setf.sig	f8=r35		// w
410194206Ssimon	.save	pr,r9
411142425Snectar	mov		r9=pr
41289837Skris	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
41389837Skris					}
414194206Ssimon	.body
// rp is copied twice: r16 walks rp for the loads, r14 walks it for the
// stores. 0x2001<<16 sets p16 (first stage) and p29 in the rotating
// predicates.
415142425Snectar{ .mmi;	ADDP		r14=0,r32	// rp
416142425Snectar	ADDP		r15=0,r33	// ap
41789837Skris	mov		ar.lc=r10	}
418142425Snectar{ .mii;	ADDP		r16=0,r32	// rp copy
419142425Snectar	mov		pr.rot=0x2001<<16
420142425Snectar			// ------^----- serves as (p40) at first (p27)
421142425Snectar	mov		ar.ec=11	};;
42289837Skris
423142425Snectar// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
424142425Snectar// Itanium 2. Yes, unlike previous versions it scales:-) Previous
425142425Snectar// version was performing *all* additions in IALU and was starving
426142425Snectar// for those even on Itanium 2. In this version one addition is
427142425Snectar// moved to FPU and is folded with multiplication. This is at cost
428142425Snectar// of propagating the result from previous call to this subroutine
429142425Snectar// to L2 cache... In other words negligible even for shorter keys.
430142425Snectar// *Overall* performance improvement [over previous version] varies
431142425Snectar// from 11 to 22 percent depending on key length.
//
// Pipeline stages, selected by the rotating predicates:
//   (p16) load one word from ap and one from rp into FP registers;
//   (p20) xma.lu / xma.hu compute the low and high halves of
//         ap word * w + rp word;
//   (p23)/(p24) getf.sig moves the low and high halves to general
//         registers;
//   (p40)/(p42), mutually exclusive, add without or with a carry of 1 and
//         derive the next carry pair p41/p39 with cmp.ltu / cmp.leu;
//   (p26) st8 stores the finished word back through r14.
43289837Skris.L_bn_mul_add_words_ctop:
433142425Snectar.pred.rel	"mutex",p40,p42
434142425Snectar{ .mfi;	(p23)	getf.sig	r36=f45			// low
435142425Snectar	(p20)	xma.lu		f42=f36,f8,f50		// low
436142425Snectar	(p40)	add		r39=r39,r35	}	// (p27)
437142425Snectar{ .mfi;	(p16)	ldf8		f32=[r15],8		// *(ap++)
438142425Snectar	(p20)	xma.hu		f36=f36,f8,f50		// high
439142425Snectar	(p42)	add		r39=r39,r35,1	};;	// (p27)
440142425Snectar{ .mmi;	(p24)	getf.sig	r32=f40			// high
441142425Snectar	(p16)	ldf8		f46=[r16],8		// *(rp1++)
442142425Snectar	(p40)	cmp.ltu		p41,p39=r39,r35	}	// (p27)
443142425Snectar{ .mib;	(p26)	st8		[r14]=r39,8		// *(rp2++)
444142425Snectar	(p42)	cmp.leu		p41,p39=r39,r35		// (p27)
44589837Skris	br.ctop.sptk	.L_bn_mul_add_words_ctop};;
44689837Skris.L_bn_mul_add_words_cend:
44789837Skris
// Return value: r35, plus 1 when the carry predicate p42 is set. Then the
// caller predicates and ar.lc are restored and um.mfh is cleared.
448142425Snectar{ .mmi;	.pred.rel	"mutex",p40,p42
449142425Snectar(p40)	add		r8=r35,r0
450142425Snectar(p42)	add		r8=r35,r0,1
451142425Snectar	mov		pr=r9,0x1ffff	}
452142425Snectar{ .mib;	rum		1<<5		// clear um.mfh
453142425Snectar	mov		ar.lc=r3
45489837Skris	br.ret.sptk.many	b0	};;
45589837Skris.endp	bn_mul_add_words#
45689837Skris#endif
45789837Skris
45889837Skris#if 1
45989837Skris//
46089837Skris// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
46189837Skris//
// bn_sqr_words -- square each of num words of ap, writing two result
// words (low half, then high half) per input word to rp.
//   in:  r32 = rp, r33 = ap, r34 = num (sign-extended from 32 bits)
//   out: r8 is set to 0; the routine returns at once when num <= 0
// ar.lc and pr are saved in r3 and r9 and put back before returning.
46289837Skris.global	bn_sqr_words#
46389837Skris.proc	bn_sqr_words#
46489837Skris.align	64
46589837Skris.skip	32	// makes the loop body aligned at 64-byte boundary
46689837Skrisbn_sqr_words:
46789837Skris	.prologue
46889837Skris	.save	ar.pfs,r2
// No rotating general registers are requested: the loop data lives in the
// FP registers. num is sign-extended first so the 64-bit cmp.le and the
// later num-1 work on the full register.
46989837Skris{ .mii;	alloc		r2=ar.pfs,3,0,0,0
47089837Skris	sxt4		r34=r34		};;
47189837Skris{ .mii;	cmp.le		p6,p0=r34,r0
47289837Skris	mov		r8=r0		}	// return value
473142425Snectar{ .mfb;	ADDP		r32=0,r32
474142425Snectar	nop.f		0x0
47589837Skris(p6)	br.ret.spnt.many	b0	};;
47689837Skris
// r10 = num-1 becomes the loop count.
477194206Ssimon{ .mii;	sub	r10=r34,r0,1
47889837Skris	.save	ar.lc,r3
47989837Skris	mov	r3=ar.lc
480194206Ssimon	.save	pr,r9
48189837Skris	mov	r9=pr			};;
48289837Skris
48389837Skris	.body
// pr.rot=1<<16 starts the loop with only p16 (the first stage) set.
484142425Snectar{ .mib;	ADDP		r33=0,r33
48589837Skris	mov		pr.rot=1<<16
48689837Skris	brp.loop.imp	.L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
48789837Skris					}
// num is no longer needed, so r34 is reused as a second store pointer at
// rp+8. Both store pointers advance by 16, so r32 writes the even result
// words and r34 the odd ones.
48889837Skris{ .mii;	add		r34=8,r32
48989837Skris	mov		ar.lc=r10
49089837Skris	mov		ar.ec=18	};;
49189837Skris
49289837Skris// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
49389837Skris// possible to compress the epilogue (I'm getting tired to write this
49489837Skris// comment over and over) and get down to 2*n+16 at the cost of
49589837Skris// scalability. The decision will very likely be reconsidered after the
49689837Skris// benchmark program is profiled. I.e. if performance gain on Itanium
49789837Skris// will appear larger than loss on "wider" IA-64, then the loop should
49889837Skris// be explicitly split and the epilogue compressed.
//
// Pipeline stages, selected by the rotating predicates:
//   (p16) load a word of ap into an FP register;
//   (p25) xmpy.lu / xmpy.hu form the low and high halves of word * word;
//   (p33) stf8 stores the low half through r32 and the high half through
//         r34 (f50 and f60 are f42 and f52 after eight rotations).
49989837Skris.L_bn_sqr_words_ctop:
50089837Skris{ .mfi;	(p16)	ldf8		f32=[r33],8
50189837Skris	(p25)	xmpy.lu		f42=f41,f41
50289837Skris	(p0)	nop.i		0x0		}
50389837Skris{ .mib;	(p33)	stf8		[r32]=f50,16
50489837Skris	(p0)	nop.i		0x0
50589837Skris	(p0)	nop.b		0x0		}
50689837Skris{ .mfi;	(p0)	nop.m		0x0
50789837Skris	(p25)	xmpy.hu		f52=f41,f41
50889837Skris	(p0)	nop.i		0x0		}
50989837Skris{ .mib;	(p33)	stf8		[r34]=f60,16
51089837Skris	(p0)	nop.i		0x0
51189837Skris	br.ctop.sptk	.L_bn_sqr_words_ctop	};;
51289837Skris.L_bn_sqr_words_cend:
51389837Skris
// Restore the caller predicates and ar.lc, then clear um.mfh so the upper
// FP bank is not flagged as written.
51489837Skris{ .mii;	nop.m		0x0
515111147Snectar	mov		pr=r9,0x1ffff
51689837Skris	mov		ar.lc=r3	}
51789837Skris{ .mfb;	rum		1<<5		// clear um.mfh
51889837Skris	nop.f		0x0
51989837Skris	br.ret.sptk.many	b0	};;
52089837Skris.endp	bn_sqr_words#
52189837Skris#endif
52289837Skris
52389837Skris#if 1
52489837Skris// Apparently we win nothing by implementing special bn_sqr_comba8.
52589837Skris// Yes, it is possible to reduce the number of multiplications by
52689837Skris// almost factor of two, but then the amount of additions would
52789837Skris// increase by factor of two (as we would have to perform those
52889837Skris// otherwise performed by xma ourselves). Normally we would trade
52989837Skris// anyway as multiplications are way more expensive, but not this
53089837Skris// time... Multiplication kernel is fully pipelined and as we drain
53189837Skris// one 128-bit multiplication result per clock cycle multiplications
53289837Skris// are effectively as inexpensive as additions. Special implementation
53389837Skris// might become of interest for "wider" IA-64 implementation as you'll
53489837Skris// be able to get through the multiplication phase faster (there won't
53589837Skris// be any stall issues as discussed in the commentary section below and
53689837Skris// you therefore will be able to employ all 4 FP units)... But these
53789837Skris// Itanium days it's simply too hard to justify the effort so I just
53889837Skris// drop down to bn_mul_comba8 code:-)
53989837Skris//
54089837Skris// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
54189837Skris//
// bn_sqr_comba8 -- thin front end that reuses the bn_mul_comba8 body.
//   in:  r32 = r, r33 = a
// It makes the second multiplicand equal to the first (r34 = r33), sets up
// the same helper pointers bn_mul_comba8 computes before its label
// .L_cheat_entry_point8 (r14 = a+8, r15 = a+16, r16 = a+24, r17 = b+8,
// r18 = b+16) and then branches to that label. It never returns on its
// own; the return happens in the code reached through that branch.
54289837Skris.global	bn_sqr_comba8#
54389837Skris.proc	bn_sqr_comba8#
54489837Skris.align	64
54589837Skrisbn_sqr_comba8:
54689837Skris	.prologue
54789837Skris	.save	ar.pfs,r2
// Frame: 2 inputs plus 1 local, which makes r34 available as the copy of
// a. On 32-bit HP-UX the incoming pointers are first widened with addp4.
548142425Snectar#if defined(_HPUX_SOURCE) && !defined(_LP64)
54989837Skris{ .mii;	alloc	r2=ar.pfs,2,1,0,0
550111147Snectar	addp4	r33=0,r33
551111147Snectar	addp4	r32=0,r32		};;
552111147Snectar{ .mii;
553111147Snectar#else
554111147Snectar{ .mii;	alloc	r2=ar.pfs,2,1,0,0
555111147Snectar#endif
55689837Skris	mov	r34=r33
55789837Skris	add	r14=8,r33		};;
55889837Skris	.body
55989837Skris{ .mii;	add	r17=8,r34
56089837Skris	add	r15=16,r33
56189837Skris	add	r18=16,r34		}
56289837Skris{ .mfb;	add	r16=24,r33
56389837Skris	br	.L_cheat_entry_point8	};;
56489837Skris.endp	bn_sqr_comba8#
56589837Skris#endif
56689837Skris
56789837Skris#if 1
56889837Skris// I've estimated this routine to run in ~120 ticks, but in reality
56989837Skris// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
57089837Skris// cycles consumed for instructions fetch? Or did I misinterpret some
57189837Skris// clause in Itanium micro-architecture manual? Comments are welcomed and
57289837Skris// highly appreciated.
57389837Skris//
574142425Snectar// On Itanium 2 it takes ~190 ticks. This is because of stalls on
575142425Snectar// result from getf.sig. I do nothing about it at this point for
576142425Snectar// reasons depicted below.
577142425Snectar//
57889837Skris// However! It should be noted that even 160 ticks is darn good result
57989837Skris// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
58089837Skris// C version (compiled with gcc with inline assembler). I really
58189837Skris// kicked compiler's butt here, didn't I? Yeah! This brings us to the
58289837Skris// following statement. It's damn shame that this routine isn't called
58389837Skris// very often nowadays! According to the profiler most CPU time is
58489837Skris// consumed by bn_mul_add_words called from BN_from_montgomery. In
58589837Skris// order to estimate what we're missing, I've compared the performance
58689837Skris// of this routine against "traditional" implementation, i.e. against
58789837Skris// following routine:
58889837Skris//
58989837Skris// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
59089837Skris// {	r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
59189837Skris//	r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
59289837Skris//	r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
59389837Skris//	r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
59489837Skris//	r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
59589837Skris//	r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
59689837Skris//	r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
59789837Skris//	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
59889837Skris// }
59989837Skris//
60089837Skris// The one below is over 8 times faster than the one above:-( Even
60189837Skris// more reasons to "combafy" bn_mul_add_mont...
60289837Skris//
60389837Skris// And yes, this routine really made me wish there were an optimizing
60489837Skris// assembler! It also feels like it deserves a dedication.
60589837Skris//
60689837Skris//	To my wife for being there and to my kids...
60789837Skris//
60889837Skris// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
60989837Skris//
61089837Skris#define	carry1	r14
61189837Skris#define	carry2	r15
61289837Skris#define	carry3	r34
61389837Skris.global	bn_mul_comba8#
61489837Skris.proc	bn_mul_comba8#
61589837Skris.align	64
61689837Skrisbn_mul_comba8:
61789837Skris	.prologue
61889837Skris	.save	ar.pfs,r2
619142425Snectar#if defined(_HPUX_SOURCE) && !defined(_LP64)
62089837Skris{ .mii;	alloc	r2=ar.pfs,3,0,0,0
621111147Snectar	addp4	r33=0,r33
622111147Snectar	addp4	r34=0,r34		};;
623111147Snectar{ .mii;	addp4	r32=0,r32
624111147Snectar#else
625111147Snectar{ .mii;	alloc   r2=ar.pfs,3,0,0,0
626111147Snectar#endif
62789837Skris	add	r14=8,r33
62889837Skris	add	r17=8,r34		}
62989837Skris	.body
63089837Skris{ .mii;	add	r15=16,r33
63189837Skris	add	r18=16,r34
63289837Skris	add	r16=24,r33		}
63389837Skris.L_cheat_entry_point8:
63489837Skris{ .mmi;	add	r19=24,r34
63589837Skris
63689837Skris	ldf8	f32=[r33],32		};;
63789837Skris
63889837Skris{ .mmi;	ldf8	f120=[r34],32
63989837Skris	ldf8	f121=[r17],32		}
64089837Skris{ .mmi;	ldf8	f122=[r18],32
64189837Skris	ldf8	f123=[r19],32		};;
64289837Skris{ .mmi;	ldf8	f124=[r34]
64389837Skris	ldf8	f125=[r17]		}
64489837Skris{ .mmi;	ldf8	f126=[r18]
64589837Skris	ldf8	f127=[r19]		}
64689837Skris
64789837Skris{ .mmi;	ldf8	f33=[r14],32
64889837Skris	ldf8	f34=[r15],32		}
64989837Skris{ .mmi;	ldf8	f35=[r16],32;;
65089837Skris	ldf8	f36=[r33]		}
65189837Skris{ .mmi;	ldf8	f37=[r14]
65289837Skris	ldf8	f38=[r15]		}
65389837Skris{ .mfi;	ldf8	f39=[r16]
65489837Skris// -------\ Entering multiplier's heaven /-------
65589837Skris// ------------\                    /------------
65689837Skris// -----------------\          /-----------------
65789837Skris// ----------------------\/----------------------
65889837Skris		xma.hu	f41=f32,f120,f0		}
65989837Skris{ .mfi;		xma.lu	f40=f32,f120,f0		};; // (*)
66089837Skris{ .mfi;		xma.hu	f51=f32,f121,f0		}
66189837Skris{ .mfi;		xma.lu	f50=f32,f121,f0		};;
66289837Skris{ .mfi;		xma.hu	f61=f32,f122,f0		}
66389837Skris{ .mfi;		xma.lu	f60=f32,f122,f0		};;
66489837Skris{ .mfi;		xma.hu	f71=f32,f123,f0		}
66589837Skris{ .mfi;		xma.lu	f70=f32,f123,f0		};;
66689837Skris{ .mfi;		xma.hu	f81=f32,f124,f0		}
66789837Skris{ .mfi;		xma.lu	f80=f32,f124,f0		};;
66889837Skris{ .mfi;		xma.hu	f91=f32,f125,f0		}
66989837Skris{ .mfi;		xma.lu	f90=f32,f125,f0		};;
67089837Skris{ .mfi;		xma.hu	f101=f32,f126,f0	}
67189837Skris{ .mfi;		xma.lu	f100=f32,f126,f0	};;
67289837Skris{ .mfi;		xma.hu	f111=f32,f127,f0	}
67389837Skris{ .mfi;		xma.lu	f110=f32,f127,f0	};;//
67489837Skris// (*)	You can argue that splitting at every second bundle would
67589837Skris//	prevent "wider" IA-64 implementations from achieving the peak
67689837Skris//	performance. Well, not really... The catch is that if you
67789837Skris//	intend to keep 4 FP units busy by splitting at every fourth
67889837Skris//	bundle and thus perform these 16 multiplications in 4 ticks,
67989837Skris//	the first bundle *below* would stall because the result from
68089837Skris//	the first xma bundle *above* won't be available for another 3
68189837Skris//	ticks (if not more, being an optimist, I assume that "wider"
68289837Skris//	implementation will have same latency:-). This stall will hold
68389837Skris//	you back and the performance would be as if every second bundle
68489837Skris//	were split *anyway*...
68589837Skris{ .mfi;	getf.sig	r16=f40
68689837Skris		xma.hu	f42=f33,f120,f41
68789837Skris	add		r33=8,r32		}
68889837Skris{ .mfi;		xma.lu	f41=f33,f120,f41	};;
68989837Skris{ .mfi;	getf.sig	r24=f50
69089837Skris		xma.hu	f52=f33,f121,f51	}
69189837Skris{ .mfi;		xma.lu	f51=f33,f121,f51	};;
69289837Skris{ .mfi;	st8		[r32]=r16,16
69389837Skris		xma.hu	f62=f33,f122,f61	}
69489837Skris{ .mfi;		xma.lu	f61=f33,f122,f61	};;
69589837Skris{ .mfi;		xma.hu	f72=f33,f123,f71	}
69689837Skris{ .mfi;		xma.lu	f71=f33,f123,f71	};;
69789837Skris{ .mfi;		xma.hu	f82=f33,f124,f81	}
69889837Skris{ .mfi;		xma.lu	f81=f33,f124,f81	};;
69989837Skris{ .mfi;		xma.hu	f92=f33,f125,f91	}
70089837Skris{ .mfi;		xma.lu	f91=f33,f125,f91	};;
70189837Skris{ .mfi;		xma.hu	f102=f33,f126,f101	}
70289837Skris{ .mfi;		xma.lu	f101=f33,f126,f101	};;
70389837Skris{ .mfi;		xma.hu	f112=f33,f127,f111	}
70489837Skris{ .mfi;		xma.lu	f111=f33,f127,f111	};;//
70589837Skris//-------------------------------------------------//
70689837Skris{ .mfi;	getf.sig	r25=f41
70789837Skris		xma.hu	f43=f34,f120,f42	}
70889837Skris{ .mfi;		xma.lu	f42=f34,f120,f42	};;
70989837Skris{ .mfi;	getf.sig	r16=f60
71089837Skris		xma.hu	f53=f34,f121,f52	}
71189837Skris{ .mfi;		xma.lu	f52=f34,f121,f52	};;
71289837Skris{ .mfi;	getf.sig	r17=f51
71389837Skris		xma.hu	f63=f34,f122,f62
71489837Skris	add		r25=r25,r24		}
71589837Skris{ .mfi;		xma.lu	f62=f34,f122,f62
71689837Skris	mov		carry1=0		};;
71789837Skris{ .mfi;	cmp.ltu		p6,p0=r25,r24
71889837Skris		xma.hu	f73=f34,f123,f72	}
71989837Skris{ .mfi;		xma.lu	f72=f34,f123,f72	};;
72089837Skris{ .mfi;	st8		[r33]=r25,16
72189837Skris		xma.hu	f83=f34,f124,f82
72289837Skris(p6)	add		carry1=1,carry1		}
72389837Skris{ .mfi;		xma.lu	f82=f34,f124,f82	};;
72489837Skris{ .mfi;		xma.hu	f93=f34,f125,f92	}
72589837Skris{ .mfi;		xma.lu	f92=f34,f125,f92	};;
72689837Skris{ .mfi;		xma.hu	f103=f34,f126,f102	}
72789837Skris{ .mfi;		xma.lu	f102=f34,f126,f102	};;
72889837Skris{ .mfi;		xma.hu	f113=f34,f127,f112	}
72989837Skris{ .mfi;		xma.lu	f112=f34,f127,f112	};;//
73089837Skris//-------------------------------------------------//
73189837Skris{ .mfi;	getf.sig	r18=f42
73289837Skris		xma.hu	f44=f35,f120,f43
73389837Skris	add		r17=r17,r16		}
73489837Skris{ .mfi;		xma.lu	f43=f35,f120,f43	};;
73589837Skris{ .mfi;	getf.sig	r24=f70
73689837Skris		xma.hu	f54=f35,f121,f53	}
73789837Skris{ .mfi;	mov		carry2=0
73889837Skris		xma.lu	f53=f35,f121,f53	};;
73989837Skris{ .mfi;	getf.sig	r25=f61
74089837Skris		xma.hu	f64=f35,f122,f63
74189837Skris	cmp.ltu		p7,p0=r17,r16		}
74289837Skris{ .mfi;	add		r18=r18,r17
74389837Skris		xma.lu	f63=f35,f122,f63	};;
74489837Skris{ .mfi;	getf.sig	r26=f52
74589837Skris		xma.hu	f74=f35,f123,f73
74689837Skris(p7)	add		carry2=1,carry2		}
74789837Skris{ .mfi;	cmp.ltu		p7,p0=r18,r17
74889837Skris		xma.lu	f73=f35,f123,f73
74989837Skris	add		r18=r18,carry1		};;
75089837Skris{ .mfi;
75189837Skris		xma.hu	f84=f35,f124,f83
75289837Skris(p7)	add		carry2=1,carry2		}
75389837Skris{ .mfi;	cmp.ltu		p7,p0=r18,carry1
75489837Skris		xma.lu	f83=f35,f124,f83	};;
75589837Skris{ .mfi;	st8		[r32]=r18,16
75689837Skris		xma.hu	f94=f35,f125,f93
75789837Skris(p7)	add		carry2=1,carry2		}
75889837Skris{ .mfi;		xma.lu	f93=f35,f125,f93	};;
75989837Skris{ .mfi;		xma.hu	f104=f35,f126,f103	}
76089837Skris{ .mfi;		xma.lu	f103=f35,f126,f103	};;
76189837Skris{ .mfi;		xma.hu	f114=f35,f127,f113	}
76289837Skris{ .mfi;	mov		carry1=0
76389837Skris		xma.lu	f113=f35,f127,f113
76489837Skris	add		r25=r25,r24		};;//
76589837Skris//-------------------------------------------------//
76689837Skris{ .mfi;	getf.sig	r27=f43
76789837Skris		xma.hu	f45=f36,f120,f44
76889837Skris	cmp.ltu		p6,p0=r25,r24		}
76989837Skris{ .mfi;		xma.lu	f44=f36,f120,f44
77089837Skris	add		r26=r26,r25		};;
77189837Skris{ .mfi;	getf.sig	r16=f80
77289837Skris		xma.hu	f55=f36,f121,f54
77389837Skris(p6)	add		carry1=1,carry1		}
77489837Skris{ .mfi;		xma.lu	f54=f36,f121,f54	};;
77589837Skris{ .mfi;	getf.sig	r17=f71
77689837Skris		xma.hu	f65=f36,f122,f64
77789837Skris	cmp.ltu		p6,p0=r26,r25		}
77889837Skris{ .mfi;		xma.lu	f64=f36,f122,f64
77989837Skris	add		r27=r27,r26		};;
78089837Skris{ .mfi;	getf.sig	r18=f62
78189837Skris		xma.hu	f75=f36,f123,f74
78289837Skris(p6)	add		carry1=1,carry1		}
78389837Skris{ .mfi;	cmp.ltu		p6,p0=r27,r26
78489837Skris		xma.lu	f74=f36,f123,f74
78589837Skris	add		r27=r27,carry2		};;
78689837Skris{ .mfi;	getf.sig	r19=f53
78789837Skris		xma.hu	f85=f36,f124,f84
78889837Skris(p6)	add		carry1=1,carry1		}
78989837Skris{ .mfi;		xma.lu	f84=f36,f124,f84
79089837Skris	cmp.ltu		p6,p0=r27,carry2	};;
79189837Skris{ .mfi;	st8		[r33]=r27,16
79289837Skris		xma.hu	f95=f36,f125,f94
79389837Skris(p6)	add		carry1=1,carry1		}
79489837Skris{ .mfi;		xma.lu	f94=f36,f125,f94	};;
79589837Skris{ .mfi;		xma.hu	f105=f36,f126,f104	}
79689837Skris{ .mfi;	mov		carry2=0
79789837Skris		xma.lu	f104=f36,f126,f104
79889837Skris	add		r17=r17,r16		};;
79989837Skris{ .mfi;		xma.hu	f115=f36,f127,f114
80089837Skris	cmp.ltu		p7,p0=r17,r16		}
80189837Skris{ .mfi;		xma.lu	f114=f36,f127,f114
80289837Skris	add		r18=r18,r17		};;//
80389837Skris//-------------------------------------------------//
80489837Skris{ .mfi;	getf.sig	r20=f44
80589837Skris		xma.hu	f46=f37,f120,f45
80689837Skris(p7)	add		carry2=1,carry2		}
80789837Skris{ .mfi;	cmp.ltu		p7,p0=r18,r17
80889837Skris		xma.lu	f45=f37,f120,f45
80989837Skris	add		r19=r19,r18		};;
81089837Skris{ .mfi;	getf.sig	r24=f90
81189837Skris		xma.hu	f56=f37,f121,f55	}
81289837Skris{ .mfi;		xma.lu	f55=f37,f121,f55	};;
81389837Skris{ .mfi;	getf.sig	r25=f81
81489837Skris		xma.hu	f66=f37,f122,f65
81589837Skris(p7)	add		carry2=1,carry2		}
81689837Skris{ .mfi;	cmp.ltu		p7,p0=r19,r18
81789837Skris		xma.lu	f65=f37,f122,f65
81889837Skris	add		r20=r20,r19		};;
81989837Skris{ .mfi;	getf.sig	r26=f72
82089837Skris		xma.hu	f76=f37,f123,f75
82189837Skris(p7)	add		carry2=1,carry2		}
82289837Skris{ .mfi;	cmp.ltu		p7,p0=r20,r19
82389837Skris		xma.lu	f75=f37,f123,f75
82489837Skris	add		r20=r20,carry1		};;
82589837Skris{ .mfi;	getf.sig	r27=f63
82689837Skris		xma.hu	f86=f37,f124,f85
82789837Skris(p7)	add		carry2=1,carry2		}
82889837Skris{ .mfi;		xma.lu	f85=f37,f124,f85
82989837Skris	cmp.ltu		p7,p0=r20,carry1	};;
83089837Skris{ .mfi;	getf.sig	r28=f54
83189837Skris		xma.hu	f96=f37,f125,f95
83289837Skris(p7)	add		carry2=1,carry2		}
83389837Skris{ .mfi;	st8		[r32]=r20,16
83489837Skris		xma.lu	f95=f37,f125,f95	};;
83589837Skris{ .mfi;		xma.hu	f106=f37,f126,f105	}
83689837Skris{ .mfi;	mov		carry1=0
83789837Skris		xma.lu	f105=f37,f126,f105
83889837Skris	add		r25=r25,r24		};;
83989837Skris{ .mfi;		xma.hu	f116=f37,f127,f115
84089837Skris	cmp.ltu		p6,p0=r25,r24		}
84189837Skris{ .mfi;		xma.lu	f115=f37,f127,f115
84289837Skris	add		r26=r26,r25		};;//
84389837Skris//-------------------------------------------------//
84489837Skris{ .mfi;	getf.sig	r29=f45
84589837Skris		xma.hu	f47=f38,f120,f46
84689837Skris(p6)	add		carry1=1,carry1		}
84789837Skris{ .mfi;	cmp.ltu		p6,p0=r26,r25
84889837Skris		xma.lu	f46=f38,f120,f46
84989837Skris	add		r27=r27,r26		};;
85089837Skris{ .mfi;	getf.sig	r16=f100
85189837Skris		xma.hu	f57=f38,f121,f56
85289837Skris(p6)	add		carry1=1,carry1		}
85389837Skris{ .mfi;	cmp.ltu		p6,p0=r27,r26
85489837Skris		xma.lu	f56=f38,f121,f56
85589837Skris	add		r28=r28,r27		};;
85689837Skris{ .mfi;	getf.sig	r17=f91
85789837Skris		xma.hu	f67=f38,f122,f66
85889837Skris(p6)	add		carry1=1,carry1		}
85989837Skris{ .mfi;	cmp.ltu		p6,p0=r28,r27
86089837Skris		xma.lu	f66=f38,f122,f66
86189837Skris	add		r29=r29,r28		};;
86289837Skris{ .mfi;	getf.sig	r18=f82
86389837Skris		xma.hu	f77=f38,f123,f76
86489837Skris(p6)	add		carry1=1,carry1		}
86589837Skris{ .mfi;	cmp.ltu		p6,p0=r29,r28
86689837Skris		xma.lu	f76=f38,f123,f76
86789837Skris	add		r29=r29,carry2		};;
86889837Skris{ .mfi;	getf.sig	r19=f73
86989837Skris		xma.hu	f87=f38,f124,f86
87089837Skris(p6)	add		carry1=1,carry1		}
87189837Skris{ .mfi;		xma.lu	f86=f38,f124,f86
87289837Skris	cmp.ltu		p6,p0=r29,carry2	};;
87389837Skris{ .mfi;	getf.sig	r20=f64
87489837Skris		xma.hu	f97=f38,f125,f96
87589837Skris(p6)	add		carry1=1,carry1		}
87689837Skris{ .mfi;	st8		[r33]=r29,16
87789837Skris		xma.lu	f96=f38,f125,f96	};;
87889837Skris{ .mfi;	getf.sig	r21=f55
87989837Skris		xma.hu	f107=f38,f126,f106	}
88089837Skris{ .mfi;	mov		carry2=0
88189837Skris		xma.lu	f106=f38,f126,f106
88289837Skris	add		r17=r17,r16		};;
88389837Skris{ .mfi;		xma.hu	f117=f38,f127,f116
88489837Skris	cmp.ltu		p7,p0=r17,r16		}
88589837Skris{ .mfi;		xma.lu	f116=f38,f127,f116
88689837Skris	add		r18=r18,r17		};;//
88789837Skris//-------------------------------------------------//
88889837Skris{ .mfi;	getf.sig	r22=f46
88989837Skris		xma.hu	f48=f39,f120,f47
89089837Skris(p7)	add		carry2=1,carry2		}
89189837Skris{ .mfi;	cmp.ltu		p7,p0=r18,r17
89289837Skris		xma.lu	f47=f39,f120,f47
89389837Skris	add		r19=r19,r18		};;
89489837Skris{ .mfi;	getf.sig	r24=f110
89589837Skris		xma.hu	f58=f39,f121,f57
89689837Skris(p7)	add		carry2=1,carry2		}
89789837Skris{ .mfi;	cmp.ltu		p7,p0=r19,r18
89889837Skris		xma.lu	f57=f39,f121,f57
89989837Skris	add		r20=r20,r19		};;
90089837Skris{ .mfi;	getf.sig	r25=f101
90189837Skris		xma.hu	f68=f39,f122,f67
90289837Skris(p7)	add		carry2=1,carry2		}
90389837Skris{ .mfi;	cmp.ltu		p7,p0=r20,r19
90489837Skris		xma.lu	f67=f39,f122,f67
90589837Skris	add		r21=r21,r20		};;
90689837Skris{ .mfi;	getf.sig	r26=f92
90789837Skris		xma.hu	f78=f39,f123,f77
90889837Skris(p7)	add		carry2=1,carry2		}
90989837Skris{ .mfi;	cmp.ltu		p7,p0=r21,r20
91089837Skris		xma.lu	f77=f39,f123,f77
91189837Skris	add		r22=r22,r21		};;
91289837Skris{ .mfi;	getf.sig	r27=f83
91389837Skris		xma.hu	f88=f39,f124,f87
91489837Skris(p7)	add		carry2=1,carry2		}
91589837Skris{ .mfi;	cmp.ltu		p7,p0=r22,r21
91689837Skris		xma.lu	f87=f39,f124,f87
91789837Skris	add		r22=r22,carry1		};;
91889837Skris{ .mfi;	getf.sig	r28=f74
91989837Skris		xma.hu	f98=f39,f125,f97
92089837Skris(p7)	add		carry2=1,carry2		}
92189837Skris{ .mfi;		xma.lu	f97=f39,f125,f97
92289837Skris	cmp.ltu		p7,p0=r22,carry1	};;
92389837Skris{ .mfi;	getf.sig	r29=f65
92489837Skris		xma.hu	f108=f39,f126,f107
92589837Skris(p7)	add		carry2=1,carry2		}
92689837Skris{ .mfi;	st8		[r32]=r22,16
92789837Skris		xma.lu	f107=f39,f126,f107	};;
92889837Skris{ .mfi;	getf.sig	r30=f56
92989837Skris		xma.hu	f118=f39,f127,f117	}
93089837Skris{ .mfi;		xma.lu	f117=f39,f127,f117	};;//
93189837Skris//-------------------------------------------------//
93289837Skris// Leaving multiplier's heaven... Quite a ride, huh?
93389837Skris
93489837Skris{ .mii;	getf.sig	r31=f47
93589837Skris	add		r25=r25,r24
93689837Skris	mov		carry1=0		};;
93789837Skris{ .mii;		getf.sig	r16=f111
93889837Skris	cmp.ltu		p6,p0=r25,r24
93989837Skris	add		r26=r26,r25		};;
94089837Skris{ .mfb;		getf.sig	r17=f102	}
94189837Skris{ .mii;
94289837Skris(p6)	add		carry1=1,carry1
94389837Skris	cmp.ltu		p6,p0=r26,r25
94489837Skris	add		r27=r27,r26		};;
94589837Skris{ .mfb;	nop.m	0x0				}
94689837Skris{ .mii;
94789837Skris(p6)	add		carry1=1,carry1
94889837Skris	cmp.ltu		p6,p0=r27,r26
94989837Skris	add		r28=r28,r27		};;
95089837Skris{ .mii;		getf.sig	r18=f93
95189837Skris		add		r17=r17,r16
95289837Skris		mov		carry3=0	}
95389837Skris{ .mii;
95489837Skris(p6)	add		carry1=1,carry1
95589837Skris	cmp.ltu		p6,p0=r28,r27
95689837Skris	add		r29=r29,r28		};;
95789837Skris{ .mii;		getf.sig	r19=f84
95889837Skris		cmp.ltu		p7,p0=r17,r16	}
95989837Skris{ .mii;
96089837Skris(p6)	add		carry1=1,carry1
96189837Skris	cmp.ltu		p6,p0=r29,r28
96289837Skris	add		r30=r30,r29		};;
96389837Skris{ .mii;		getf.sig	r20=f75
96489837Skris		add		r18=r18,r17	}
96589837Skris{ .mii;
96689837Skris(p6)	add		carry1=1,carry1
96789837Skris	cmp.ltu		p6,p0=r30,r29
96889837Skris	add		r31=r31,r30		};;
96989837Skris{ .mfb;		getf.sig	r21=f66		}
97089837Skris{ .mii;	(p7)	add		carry3=1,carry3
97189837Skris		cmp.ltu		p7,p0=r18,r17
97289837Skris		add		r19=r19,r18	}
97389837Skris{ .mfb;	nop.m	0x0				}
97489837Skris{ .mii;
97589837Skris(p6)	add		carry1=1,carry1
97689837Skris	cmp.ltu		p6,p0=r31,r30
97789837Skris	add		r31=r31,carry2		};;
97889837Skris{ .mfb;		getf.sig	r22=f57		}
97989837Skris{ .mii;	(p7)	add		carry3=1,carry3
98089837Skris		cmp.ltu		p7,p0=r19,r18
98189837Skris		add		r20=r20,r19	}
98289837Skris{ .mfb;	nop.m	0x0				}
98389837Skris{ .mii;
98489837Skris(p6)	add		carry1=1,carry1
98589837Skris	cmp.ltu		p6,p0=r31,carry2	};;
98689837Skris{ .mfb;		getf.sig	r23=f48		}
98789837Skris{ .mii;	(p7)	add		carry3=1,carry3
98889837Skris		cmp.ltu		p7,p0=r20,r19
98989837Skris		add		r21=r21,r20	}
99089837Skris{ .mii;
99189837Skris(p6)	add		carry1=1,carry1		}
99289837Skris{ .mfb;	st8		[r33]=r31,16		};;
99389837Skris
99489837Skris{ .mfb;	getf.sig	r24=f112		}
99589837Skris{ .mii;	(p7)	add		carry3=1,carry3
99689837Skris		cmp.ltu		p7,p0=r21,r20
99789837Skris		add		r22=r22,r21	};;
99889837Skris{ .mfb;	getf.sig	r25=f103		}
99989837Skris{ .mii;	(p7)	add		carry3=1,carry3
100089837Skris		cmp.ltu		p7,p0=r22,r21
100189837Skris		add		r23=r23,r22	};;
100289837Skris{ .mfb;	getf.sig	r26=f94			}
100389837Skris{ .mii;	(p7)	add		carry3=1,carry3
100489837Skris		cmp.ltu		p7,p0=r23,r22
100589837Skris		add		r23=r23,carry1	};;
100689837Skris{ .mfb;	getf.sig	r27=f85			}
100789837Skris{ .mii;	(p7)	add		carry3=1,carry3
100889837Skris		cmp.ltu		p7,p8=r23,carry1};;
100989837Skris{ .mii;	getf.sig	r28=f76
101089837Skris	add		r25=r25,r24
101189837Skris	mov		carry1=0		}
101289837Skris{ .mii;		st8		[r32]=r23,16
101389837Skris	(p7)	add		carry2=1,carry3
101489837Skris	(p8)	add		carry2=0,carry3	};;
101589837Skris
101689837Skris{ .mfb;	nop.m	0x0				}
101789837Skris{ .mii;	getf.sig	r29=f67
101889837Skris	cmp.ltu		p6,p0=r25,r24
101989837Skris	add		r26=r26,r25		};;
102089837Skris{ .mfb;	getf.sig	r30=f58			}
102189837Skris{ .mii;
102289837Skris(p6)	add		carry1=1,carry1
102389837Skris	cmp.ltu		p6,p0=r26,r25
102489837Skris	add		r27=r27,r26		};;
102589837Skris{ .mfb;		getf.sig	r16=f113	}
102689837Skris{ .mii;
102789837Skris(p6)	add		carry1=1,carry1
102889837Skris	cmp.ltu		p6,p0=r27,r26
102989837Skris	add		r28=r28,r27		};;
103089837Skris{ .mfb;		getf.sig	r17=f104	}
103189837Skris{ .mii;
103289837Skris(p6)	add		carry1=1,carry1
103389837Skris	cmp.ltu		p6,p0=r28,r27
103489837Skris	add		r29=r29,r28		};;
103589837Skris{ .mfb;		getf.sig	r18=f95		}
103689837Skris{ .mii;
103789837Skris(p6)	add		carry1=1,carry1
103889837Skris	cmp.ltu		p6,p0=r29,r28
103989837Skris	add		r30=r30,r29		};;
104089837Skris{ .mii;		getf.sig	r19=f86
104189837Skris		add		r17=r17,r16
104289837Skris		mov		carry3=0	}
104389837Skris{ .mii;
104489837Skris(p6)	add		carry1=1,carry1
104589837Skris	cmp.ltu		p6,p0=r30,r29
104689837Skris	add		r30=r30,carry2		};;
104789837Skris{ .mii;		getf.sig	r20=f77
104889837Skris		cmp.ltu		p7,p0=r17,r16
104989837Skris		add		r18=r18,r17	}
105089837Skris{ .mii;
105189837Skris(p6)	add		carry1=1,carry1
105289837Skris	cmp.ltu		p6,p0=r30,carry2	};;
105389837Skris{ .mfb;		getf.sig	r21=f68		}
105489837Skris{ .mii;	st8		[r33]=r30,16
105589837Skris(p6)	add		carry1=1,carry1		};;
105689837Skris
105789837Skris{ .mfb;	getf.sig	r24=f114		}
105889837Skris{ .mii;	(p7)	add		carry3=1,carry3
105989837Skris		cmp.ltu		p7,p0=r18,r17
106089837Skris		add		r19=r19,r18	};;
106189837Skris{ .mfb;	getf.sig	r25=f105		}
106289837Skris{ .mii;	(p7)	add		carry3=1,carry3
106389837Skris		cmp.ltu		p7,p0=r19,r18
106489837Skris		add		r20=r20,r19	};;
106589837Skris{ .mfb;	getf.sig	r26=f96			}
106689837Skris{ .mii;	(p7)	add		carry3=1,carry3
106789837Skris		cmp.ltu		p7,p0=r20,r19
106889837Skris		add		r21=r21,r20	};;
106989837Skris{ .mfb;	getf.sig	r27=f87			}
107089837Skris{ .mii;	(p7)	add		carry3=1,carry3
107189837Skris		cmp.ltu		p7,p0=r21,r20
107289837Skris		add		r21=r21,carry1	};;
107389837Skris{ .mib;	getf.sig	r28=f78
107489837Skris	add		r25=r25,r24		}
107589837Skris{ .mib;	(p7)	add		carry3=1,carry3
107689837Skris		cmp.ltu		p7,p8=r21,carry1};;
107789837Skris{ .mii;		st8		[r32]=r21,16
107889837Skris	(p7)	add		carry2=1,carry3
107989837Skris	(p8)	add		carry2=0,carry3	}
108089837Skris
108189837Skris{ .mii;	mov		carry1=0
108289837Skris	cmp.ltu		p6,p0=r25,r24
108389837Skris	add		r26=r26,r25		};;
108489837Skris{ .mfb;		getf.sig	r16=f115	}
108589837Skris{ .mii;
108689837Skris(p6)	add		carry1=1,carry1
108789837Skris	cmp.ltu		p6,p0=r26,r25
108889837Skris	add		r27=r27,r26		};;
108989837Skris{ .mfb;		getf.sig	r17=f106	}
109089837Skris{ .mii;
109189837Skris(p6)	add		carry1=1,carry1
109289837Skris	cmp.ltu		p6,p0=r27,r26
109389837Skris	add		r28=r28,r27		};;
109489837Skris{ .mfb;		getf.sig	r18=f97		}
109589837Skris{ .mii;
109689837Skris(p6)	add		carry1=1,carry1
109789837Skris	cmp.ltu		p6,p0=r28,r27
109889837Skris	add		r28=r28,carry2		};;
109989837Skris{ .mib;		getf.sig	r19=f88
110089837Skris		add		r17=r17,r16	}
110189837Skris{ .mib;
110289837Skris(p6)	add		carry1=1,carry1
110389837Skris	cmp.ltu		p6,p0=r28,carry2	};;
110489837Skris{ .mii;	st8		[r33]=r28,16
110589837Skris(p6)	add		carry1=1,carry1		}
110689837Skris
110789837Skris{ .mii;		mov		carry2=0
110889837Skris		cmp.ltu		p7,p0=r17,r16
110989837Skris		add		r18=r18,r17	};;
111089837Skris{ .mfb;	getf.sig	r24=f116		}
111189837Skris{ .mii;	(p7)	add		carry2=1,carry2
111289837Skris		cmp.ltu		p7,p0=r18,r17
111389837Skris		add		r19=r19,r18	};;
111489837Skris{ .mfb;	getf.sig	r25=f107		}
111589837Skris{ .mii;	(p7)	add		carry2=1,carry2
111689837Skris		cmp.ltu		p7,p0=r19,r18
111789837Skris		add		r19=r19,carry1	};;
111889837Skris{ .mfb;	getf.sig	r26=f98			}
111989837Skris{ .mii;	(p7)	add		carry2=1,carry2
112089837Skris		cmp.ltu		p7,p0=r19,carry1};;
112189837Skris{ .mii;		st8		[r32]=r19,16
112289837Skris	(p7)	add		carry2=1,carry2	}
112389837Skris
112489837Skris{ .mfb;	add		r25=r25,r24		};;
112589837Skris
112689837Skris{ .mfb;		getf.sig	r16=f117	}
112789837Skris{ .mii;	mov		carry1=0
112889837Skris	cmp.ltu		p6,p0=r25,r24
112989837Skris	add		r26=r26,r25		};;
113089837Skris{ .mfb;		getf.sig	r17=f108	}
113189837Skris{ .mii;
113289837Skris(p6)	add		carry1=1,carry1
113389837Skris	cmp.ltu		p6,p0=r26,r25
113489837Skris	add		r26=r26,carry2		};;
113589837Skris{ .mfb;	nop.m	0x0				}
113689837Skris{ .mii;
113789837Skris(p6)	add		carry1=1,carry1
113889837Skris	cmp.ltu		p6,p0=r26,carry2	};;
113989837Skris{ .mii;	st8		[r33]=r26,16
114089837Skris(p6)	add		carry1=1,carry1		}
114189837Skris
114289837Skris{ .mfb;		add		r17=r17,r16	};;
114389837Skris{ .mfb;	getf.sig	r24=f118		}
114489837Skris{ .mii;		mov		carry2=0
114589837Skris		cmp.ltu		p7,p0=r17,r16
114689837Skris		add		r17=r17,carry1	};;
114789837Skris{ .mii;	(p7)	add		carry2=1,carry2
114889837Skris		cmp.ltu		p7,p0=r17,carry1};;
114989837Skris{ .mii;		st8		[r32]=r17
115089837Skris	(p7)	add		carry2=1,carry2	};;
115189837Skris{ .mfb;	add		r24=r24,carry2		};;
115289837Skris{ .mib;	st8		[r33]=r24		}
115389837Skris
115489837Skris{ .mib;	rum		1<<5		// clear um.mfh
115589837Skris	br.ret.sptk.many	b0	};;
115689837Skris.endp	bn_mul_comba8#
115789837Skris#undef	carry3
115889837Skris#undef	carry2
115989837Skris#undef	carry1
116089837Skris#endif
116189837Skris
116289837Skris#if 1
116389837Skris// It's possible to make it faster (see comment to bn_sqr_comba8), but
116489837Skris// I reckon it isn't worth the effort. Basically because the routine
116589837Skris// (actually both of them) is practically never called... So I just play
116689837Skris// the same trick as with bn_sqr_comba8.
116789837Skris//
116889837Skris// void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
116989837Skris//
// The routine contains no multiplication code of its own. It copies the
// `a' pointer (r33) into r34, i.e. into the register bn_mul_comba4 uses
// for its third argument, initialises the same address registers that
// bn_mul_comba4's prologue sets up (r14-r18) and branches to
// .L_cheat_entry_point4 inside bn_mul_comba4. The work is therefore done
// by the bn_mul_comba4 body with both operands pointing at `a'.
//
117089837Skris.global	bn_sqr_comba4#
117189837Skris.proc	bn_sqr_comba4#
117289837Skris.align	64
117389837Skrisbn_sqr_comba4:
117489837Skris	.prologue
117589837Skris	.save	ar.pfs,r2
// On 32-bit HP-UX the pointer arguments are passed through addp4 first
// (presumably to widen 32-bit pointers -- confirm against the HP-UX ABI).
1176142425Snectar#if defined(_HPUX_SOURCE) && !defined(_LP64)
1177111147Snectar{ .mii;	alloc   r2=ar.pfs,2,1,0,0
1178111147Snectar	addp4	r32=0,r32
1179111147Snectar	addp4	r33=0,r33		};;
1180111147Snectar{ .mii;
1181111147Snectar#else
118289837Skris{ .mii;	alloc	r2=ar.pfs,2,1,0,0
1183111147Snectar#endif
// r34 is the single local register requested by alloc; it takes the
// place of bn_mul_comba4's `b' argument.
118489837Skris	mov	r34=r33
118589837Skris	add	r14=8,r33		};;
118689837Skris	.body
// Remaining operand addresses: r14/r15/r16 = a+8/a+16/a+24 and
// r17/r18 = r34+8/r34+16. r34+24 is computed at the shared entry point.
118789837Skris{ .mii;	add	r17=8,r34
118889837Skris	add	r15=16,r33
118989837Skris	add	r18=16,r34		}
119089837Skris{ .mfb;	add	r16=24,r33
119189837Skris	br	.L_cheat_entry_point4	};;
119289837Skris.endp	bn_sqr_comba4#
119389837Skris#endif
119489837Skris
119589837Skris#if 1
119689837Skris// Runs in ~115 cycles and ~4.5 times faster than C. Well, whatever...
119789837Skris//
119889837Skris// void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
119989837Skris//
// Stores eight 64-bit words through r (r32): the product of the four
// words loaded from a (r33) and the four words loaded from b (r34).
//
// Register layout used below:
//   f32..f35    words loaded from a, a+8, a+16, a+24
//   f120..f123  words loaded from b, b+8, b+16, b+24
//   f4x,f5x,f6x,f7x  one row per b word: each xma.lu/xma.hu pair yields
//               the low/high half of a[i]*b[j] plus the high half
//               produced by the previous pair of the same row
//   r16..r27    row words moved to integer registers with getf.sig and
//               summed column by column
//   carry1, carry2  carry accumulators, used alternately for successive
//               output words; a carry is detected with cmp.ltu after
//               each addition (sum < addend means wrap-around)
//   r32, r33    output pointers, r33 = r32+8, each post-incremented by
//               16, so they store even and odd result words respectively
//
// .L_cheat_entry_point4 is also branched to from bn_sqr_comba4, which
// arrives with r14-r18 and r34 already set up.
//
120089837Skris#define	carry1	r14
120189837Skris#define	carry2	r15
120289837Skris.global	bn_mul_comba4#
120389837Skris.proc	bn_mul_comba4#
120489837Skris.align	64
120589837Skrisbn_mul_comba4:
120689837Skris	.prologue
120789837Skris	.save	ar.pfs,r2
// On 32-bit HP-UX the pointer arguments are passed through addp4 first
// (presumably to widen 32-bit pointers -- confirm against the HP-UX ABI).
1208142425Snectar#if defined(_HPUX_SOURCE) && !defined(_LP64)
1209111147Snectar{ .mii;	alloc   r2=ar.pfs,3,0,0,0
1210111147Snectar	addp4	r33=0,r33
1211111147Snectar	addp4	r34=0,r34		};;
1212111147Snectar{ .mii;	addp4	r32=0,r32
1213111147Snectar#else
121489837Skris{ .mii;	alloc	r2=ar.pfs,3,0,0,0
1215111147Snectar#endif
// Addresses of the remaining operand words. r14/r15 double as
// carry1/carry2 once the loads through them have been issued.
121689837Skris	add	r14=8,r33
121789837Skris	add	r17=8,r34		}
121889837Skris	.body
121989837Skris{ .mii;	add	r15=16,r33
122089837Skris	add	r18=16,r34
122189837Skris	add	r16=24,r33		};;
122289837Skris.L_cheat_entry_point4:
122389837Skris{ .mmi;	add	r19=24,r34
122489837Skris
122589837Skris	ldf8	f32=[r33]		}
122689837Skris
122789837Skris{ .mmi;	ldf8	f120=[r34]
122889837Skris	ldf8	f121=[r17]		};;
122989837Skris{ .mmi;	ldf8	f122=[r18]
123089837Skris	ldf8	f123=[r19]		}
123189837Skris
123289837Skris{ .mmi;	ldf8	f33=[r14]
123389837Skris	ldf8	f34=[r15]		}
123489837Skris{ .mfi;	ldf8	f35=[r16]
123589837Skris
// First pass: f32 (first word of a) times each of f120..f123, addend f0.
123689837Skris		xma.hu	f41=f32,f120,f0		}
123789837Skris{ .mfi;		xma.lu	f40=f32,f120,f0		};;
123889837Skris{ .mfi;		xma.hu	f51=f32,f121,f0		}
123989837Skris{ .mfi;		xma.lu	f50=f32,f121,f0		};;
124089837Skris{ .mfi;		xma.hu	f61=f32,f122,f0		}
124189837Skris{ .mfi;		xma.lu	f60=f32,f122,f0		};;
124289837Skris{ .mfi;		xma.hu	f71=f32,f123,f0		}
124389837Skris{ .mfi;		xma.lu	f70=f32,f123,f0		};;//
124489837Skris// Major stall takes place here, and 3 more places below. Result from
124589837Skris// first xma is not available for another 3 ticks.
// Second pass: f33 times each b word, adding the previous high halves.
// r33 is re-purposed here as the pointer to the odd result words.
124689837Skris{ .mfi;	getf.sig	r16=f40
124789837Skris		xma.hu	f42=f33,f120,f41
124889837Skris	add		r33=8,r32		}
124989837Skris{ .mfi;		xma.lu	f41=f33,f120,f41	};;
125089837Skris{ .mfi;	getf.sig	r24=f50
125189837Skris		xma.hu	f52=f33,f121,f51	}
125289837Skris{ .mfi;		xma.lu	f51=f33,f121,f51	};;
// Result word 0 is f40 as is.
125389837Skris{ .mfi;	st8		[r32]=r16,16
125489837Skris		xma.hu	f62=f33,f122,f61	}
125589837Skris{ .mfi;		xma.lu	f61=f33,f122,f61	};;
125689837Skris{ .mfi;		xma.hu	f72=f33,f123,f71	}
125789837Skris{ .mfi;		xma.lu	f71=f33,f123,f71	};;//
125889837Skris//-------------------------------------------------//
// Third pass (f34), interleaved with result word 1 = f41 + f50.
125989837Skris{ .mfi;	getf.sig	r25=f41
126089837Skris		xma.hu	f43=f34,f120,f42	}
126189837Skris{ .mfi;		xma.lu	f42=f34,f120,f42	};;
126289837Skris{ .mfi;	getf.sig	r16=f60
126389837Skris		xma.hu	f53=f34,f121,f52	}
126489837Skris{ .mfi;		xma.lu	f52=f34,f121,f52	};;
126589837Skris{ .mfi;	getf.sig	r17=f51
126689837Skris		xma.hu	f63=f34,f122,f62
126789837Skris	add		r25=r25,r24		}
126889837Skris{ .mfi;	mov		carry1=0
126989837Skris		xma.lu	f62=f34,f122,f62	};;
127089837Skris{ .mfi;	st8		[r33]=r25,16
127189837Skris		xma.hu	f73=f34,f123,f72
127289837Skris	cmp.ltu		p6,p0=r25,r24		}
127389837Skris{ .mfi;		xma.lu	f72=f34,f123,f72	};;//
127489837Skris//-------------------------------------------------//
// Fourth pass (f35), interleaved with result word 2 =
// f51 + f60 + f42 + carry1; carries out of it are counted in carry2.
127589837Skris{ .mfi;	getf.sig	r18=f42
127689837Skris		xma.hu	f44=f35,f120,f43
127789837Skris(p6)	add		carry1=1,carry1		}
127889837Skris{ .mfi;	add		r17=r17,r16
127989837Skris		xma.lu	f43=f35,f120,f43
128089837Skris	mov		carry2=0		};;
128189837Skris{ .mfi;	getf.sig	r24=f70
128289837Skris		xma.hu	f54=f35,f121,f53
128389837Skris	cmp.ltu		p7,p0=r17,r16		}
128489837Skris{ .mfi;		xma.lu	f53=f35,f121,f53	};;
128589837Skris{ .mfi;	getf.sig	r25=f61
128689837Skris		xma.hu	f64=f35,f122,f63
128789837Skris	add		r18=r18,r17		}
128889837Skris{ .mfi;		xma.lu	f63=f35,f122,f63
128989837Skris(p7)	add		carry2=1,carry2		};;
129089837Skris{ .mfi;	getf.sig	r26=f52
129189837Skris		xma.hu	f74=f35,f123,f73
129289837Skris	cmp.ltu		p7,p0=r18,r17		}
129389837Skris{ .mfi;		xma.lu	f73=f35,f123,f73
129489837Skris	add		r18=r18,carry1		};;
129589837Skris//-------------------------------------------------//
// All multiplications have been issued; only integer summation remains.
129689837Skris{ .mii;	st8		[r32]=r18,16
129789837Skris(p7)	add		carry2=1,carry2
129889837Skris	cmp.ltu		p7,p0=r18,carry1	};;
129989837Skris
// Result word 3 = f61 + f70 + f52 + f43 + carry2, carries into carry1.
130089837Skris{ .mfi;	getf.sig	r27=f43	// last major stall
130189837Skris(p7)	add		carry2=1,carry2		};;
130289837Skris{ .mii;		getf.sig	r16=f71
130389837Skris	add		r25=r25,r24
130489837Skris	mov		carry1=0		};;
130589837Skris{ .mii;		getf.sig	r17=f62
130689837Skris	cmp.ltu		p6,p0=r25,r24
130789837Skris	add		r26=r26,r25		};;
130889837Skris{ .mii;
130989837Skris(p6)	add		carry1=1,carry1
131089837Skris	cmp.ltu		p6,p0=r26,r25
131189837Skris	add		r27=r27,r26		};;
131289837Skris{ .mii;
131389837Skris(p6)	add		carry1=1,carry1
131489837Skris	cmp.ltu		p6,p0=r27,r26
131589837Skris	add		r27=r27,carry2		};;
131689837Skris{ .mii;		getf.sig	r18=f53
131789837Skris(p6)	add		carry1=1,carry1
131889837Skris	cmp.ltu		p6,p0=r27,carry2	};;
131989837Skris{ .mfi;	st8		[r33]=r27,16
132089837Skris(p6)	add		carry1=1,carry1		}
132189837Skris
// Result word 4 = f62 + f71 + f53 + f44 + carry1, carries into carry2.
132289837Skris{ .mii;		getf.sig	r19=f44
132389837Skris		add		r17=r17,r16
132489837Skris		mov		carry2=0	};;
132589837Skris{ .mii;	getf.sig	r24=f72
132689837Skris		cmp.ltu		p7,p0=r17,r16
132789837Skris		add		r18=r18,r17	};;
132889837Skris{ .mii;	(p7)	add		carry2=1,carry2
132989837Skris		cmp.ltu		p7,p0=r18,r17
133089837Skris		add		r19=r19,r18	};;
133189837Skris{ .mii;	(p7)	add		carry2=1,carry2
133289837Skris		cmp.ltu		p7,p0=r19,r18
133389837Skris		add		r19=r19,carry1	};;
133489837Skris{ .mii;	getf.sig	r25=f63
133589837Skris	(p7)	add		carry2=1,carry2
133689837Skris		cmp.ltu		p7,p0=r19,carry1};;
133789837Skris{ .mii;		st8		[r32]=r19,16
133889837Skris	(p7)	add		carry2=1,carry2	}
133989837Skris
// Result word 5 = f63 + f72 + f54 + carry2, carries into carry1.
134089837Skris{ .mii;	getf.sig	r26=f54
134189837Skris	add		r25=r25,r24
134289837Skris	mov		carry1=0		};;
134389837Skris{ .mii;		getf.sig	r16=f73
134489837Skris	cmp.ltu		p6,p0=r25,r24
134589837Skris	add		r26=r26,r25		};;
134689837Skris{ .mii;
134789837Skris(p6)	add		carry1=1,carry1
134889837Skris	cmp.ltu		p6,p0=r26,r25
134989837Skris	add		r26=r26,carry2		};;
135089837Skris{ .mii;		getf.sig	r17=f64
135189837Skris(p6)	add		carry1=1,carry1
135289837Skris	cmp.ltu		p6,p0=r26,carry2	};;
135389837Skris{ .mii;	st8		[r33]=r26,16
135489837Skris(p6)	add		carry1=1,carry1		}
135589837Skris
// Result word 6 = f64 + f73 + carry1, carries into carry2.
135689837Skris{ .mii;	getf.sig	r24=f74
135789837Skris		add		r17=r17,r16
135889837Skris		mov		carry2=0	};;
135989837Skris{ .mii;		cmp.ltu		p7,p0=r17,r16
136089837Skris		add		r17=r17,carry1	};;
136189837Skris
136289837Skris{ .mii;	(p7)	add		carry2=1,carry2
136389837Skris		cmp.ltu		p7,p0=r17,carry1};;
136489837Skris{ .mii;		st8		[r32]=r17,16
136589837Skris	(p7)	add		carry2=1,carry2	};;
136689837Skris
// Result word 7 = f74 + carry2; no carry check follows this addition.
136789837Skris{ .mii;	add		r24=r24,carry2		};;
136889837Skris{ .mii;	st8		[r33]=r24		}
136989837Skris
137089837Skris{ .mib;	rum		1<<5		// clear um.mfh
137189837Skris	br.ret.sptk.many	b0	};;
137289837Skris.endp	bn_mul_comba4#
137389837Skris#undef	carry2
137489837Skris#undef	carry1
137589837Skris#endif
137689837Skris
137789837Skris#if 1
137889837Skris//
137989837Skris// BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
138089837Skris//
138189837Skris// In a nutshell it's a port of my MIPS III/IV implementation.
138289837Skris//
// Arguments arrive in r32 (h), r33 (l) and r34 (d). The quotient is
// built in r8 from two 32-bit digits; r9 receives the remainder.
//
// Outline of the code below:
//   1. d == 0: return with r8 = -1.
//   2. .L_divw_shift doubles d until the signed test 0 < d fails,
//      counting the doublings; h:l is then shifted left by the same
//      count. If any bits of h are shifted out, abort() is called.
//   3. If D <= H, D is subtracted from H once.
//   4. Two identical rounds each produce one 32-bit quotient digit. The
//      estimate is 0xffffffff when the upper halves of H and D are
//      equal, otherwise it comes from the floating-point helper
//      .L_udiv64_32_b6 (called through b6; f6 and f7 in, f8 out). A
//      br.wtop loop then adjusts the estimate before H is reduced by
//      the low word of estimate*D.
//   5. The remainder is H shifted right by the count from step 2.
//
138389837Skris#define	AT	r14
138489837Skris#define	H	r16
138589837Skris#define	HH	r20
138689837Skris#define	L	r17
138789837Skris#define	D	r18
138889837Skris#define	DH	r22
138989837Skris#define	I	r21
139089837Skris
139189837Skris#if 0
1392142425Snectar// Some preprocessors (most notably HP-UX) appear to be allergic to
1393142425Snectar// macros enclosed in parentheses [as these three were].
139489837Skris#define	cont	p16
139589837Skris#define	break	p0	// p20
139689837Skris#define	equ	p24
139789837Skris#else
// Assembler symbol assignments are used instead of cpp macros.
139889837Skriscont=p16
139989837Skrisbreak=p0
140089837Skrisequ=p24
140189837Skris#endif
140289837Skris
140389837Skris.global	abort#
140489837Skris.global	bn_div_words#
140589837Skris.proc	bn_div_words#
140689837Skris.align	64
140789837Skrisbn_div_words:
140889837Skris	.prologue
140989837Skris	.save	ar.pfs,r2
// Frame: 3 inputs, 5 locals, no outputs, 8 rotating registers (r32-r39).
// ar.pfs, b0 and pr are copied to r2, r3 and r10 respectively.
1410194206Ssimon{ .mii;	alloc		r2=ar.pfs,3,5,0,8
141189837Skris	.save	b0,r3
141289837Skris	mov		r3=b0
1413194206Ssimon	.save	pr,r10
141489837Skris	mov		r10=pr		};;
// Division by zero: return immediately with r8 = -1.
141589837Skris{ .mmb;	cmp.eq		p6,p0=r34,r0
141689837Skris	mov		r8=-1
141789837Skris(p6)	br.ret.spnt.many	b0	};;
141889837Skris
141989837Skris	.body
142089837Skris{ .mii;	mov		H=r32		// save h
142189837Skris	mov		ar.ec=0		// don't rotate at exit
142289837Skris	mov		pr.rot=0	}
142389837Skris{ .mii;	mov		L=r33		// save l
142489837Skris	mov		r36=r0		};;
142589837Skris
// Normalisation loop. Each trip writes 2*d to r33 and count+1 to r35;
// the code after the loop reads them back as r34 and r36, i.e. one
// register up, which is the effect of the register rotation performed
// by br.wtop. r36 starts at 0 and ends as the number of doublings.
142689837Skris.L_divw_shift:	// -vv- note signed comparison
142789837Skris{ .mfi;	(p0)	cmp.lt		p16,p0=r0,r34	// d
142889837Skris	(p0)	shladd		r33=r34,1,r0	}
142989837Skris{ .mfb;	(p0)	add		r35=1,r36
143089837Skris	(p0)	nop.f		0x0
143189837Skris(p16)	br.wtop.dpnt		.L_divw_shift	};;
143289837Skris
// D = normalised divisor, DH = its upper 32 bits (also sent to f7),
// I = shift count, r35 = 64 - shift count.
143389837Skris{ .mii;	mov		D=r34
143489837Skris	shr.u		DH=r34,32
143589837Skris	sub		r35=64,r36		};;
143689837Skris{ .mii;	setf.sig	f7=DH
143789837Skris	shr.u		AT=H,r35
143889837Skris	mov		I=r36			};;
// AT holds the bits of H that the left shift would discard.
143989837Skris{ .mib;	cmp.ne		p6,p0=r0,AT
144089837Skris	shl		H=H,r36
144189837Skris(p6)	br.call.spnt.clr	b0=abort	};;	// overflow, die...
144289837Skris
// Finish shifting H:L left by the count; f7 becomes (double)DH.
144389837Skris{ .mfi;	fcvt.xuf.s1	f7=f7
144489837Skris	shr.u		AT=L,r35		};;
144589837Skris{ .mii;	shl		L=L,r36
144689837Skris	or		H=H,AT			};;
144789837Skris
// If D <= H, subtract D from H once.
144889837Skris{ .mii;	nop.m		0x0
144989837Skris	cmp.leu		p6,p0=D,H;;
145089837Skris(p6)	sub		H=H,D			}
145189837Skris
// f14 = D for the xmpy instructions below; AT = 0xffffffff is the
// quotient digit used when HH == DH.
145289837Skris{ .mlx;	setf.sig	f14=D
145389837Skris	movl		AT=0xffffffff		};;
145489837Skris///////////////////////////////////////////////////////////
// First round: estimate the upper 32-bit quotient digit into f8.
145589837Skris{ .mii;	setf.sig	f6=H
145689837Skris	shr.u		HH=H,32;;
145789837Skris	cmp.eq		p6,p7=HH,DH		};;
145889837Skris{ .mfb;
145989837Skris(p6)	setf.sig	f8=AT
146089837Skris(p7)	fcvt.xuf.s1	f6=f6
146189837Skris(p7)	br.call.sptk	b6=.L_udiv64_32_b6	};;
146289837Skris
// r33 = digit estimate; f9/f10 = low/high word of estimate * D.
// shrp sets H to the 64 bits of H:L that start 32 bits up.
146389837Skris{ .mfi;	getf.sig	r33=f8				// q
146489837Skris	xmpy.lu		f9=f8,f14		}
146589837Skris{ .mfi;	xmpy.hu		f10=f8,f14
146689837Skris	shrp		H=H,L,32		};;
146789837Skris
146889837Skris{ .mmi;	getf.sig	r35=f9				// tl
146989837Skris	getf.sig	r31=f10			};;	// th
147089837Skris
// Correction loop. Each trip prepares q-1 in r32 and tl-D in r34, and
// decrements th when tl < D; after a br.wtop rotation these become the
// r33 and r35 read at the top. The loop continues while `cont' is set.
// NOTE(review): the exact exit condition follows from the predicated
// compares on HH, th (r31), tl (r35) and H -- check against the ISA
// manual before changing anything here.
147189837Skris.L_divw_1st_iter:
147289837Skris{ .mii;	(p0)	add		r32=-1,r33
147389837Skris	(p0)	cmp.eq		equ,cont=HH,r31		};;
147489837Skris{ .mii;	(p0)	cmp.ltu		p8,p0=r35,D
147589837Skris	(p0)	sub		r34=r35,D
147689837Skris	(equ)	cmp.leu		break,cont=r35,H	};;
147789837Skris{ .mib;	(cont)	cmp.leu		cont,break=HH,r31
147889837Skris	(p8)	add		r31=-1,r31
147989837Skris(cont)	br.wtop.spnt		.L_divw_1st_iter	};;
148089837Skris///////////////////////////////////////////////////////////
// Reduce H by tl, place the first digit in the upper half of r8 and
// move the low half of L up for the second round.
148189837Skris{ .mii;	sub		H=H,r35
148289837Skris	shl		r8=r33,32
148389837Skris	shl		L=L,32			};;
148489837Skris///////////////////////////////////////////////////////////
// Second round: same sequence as the first, for the lower digit.
148589837Skris{ .mii;	setf.sig	f6=H
148689837Skris	shr.u		HH=H,32;;
148789837Skris	cmp.eq		p6,p7=HH,DH		};;
148889837Skris{ .mfb;
148989837Skris(p6)	setf.sig	f8=AT
149089837Skris(p7)	fcvt.xuf.s1	f6=f6
149189837Skris(p7)	br.call.sptk	b6=.L_udiv64_32_b6	};;
149289837Skris
149389837Skris{ .mfi;	getf.sig	r33=f8				// q
149489837Skris	xmpy.lu		f9=f8,f14		}
149589837Skris{ .mfi;	xmpy.hu		f10=f8,f14
149689837Skris	shrp		H=H,L,32		};;
149789837Skris
149889837Skris{ .mmi;	getf.sig	r35=f9				// tl
149989837Skris	getf.sig	r31=f10			};;	// th
150089837Skris
150189837Skris.L_divw_2nd_iter:
150289837Skris{ .mii;	(p0)	add		r32=-1,r33
150389837Skris	(p0)	cmp.eq		equ,cont=HH,r31		};;
150489837Skris{ .mii;	(p0)	cmp.ltu		p8,p0=r35,D
150589837Skris	(p0)	sub		r34=r35,D
150689837Skris	(equ)	cmp.leu		break,cont=r35,H	};;
150789837Skris{ .mib;	(cont)	cmp.leu		cont,break=HH,r31
150889837Skris	(p8)	add		r31=-1,r31
150989837Skris(cont)	br.wtop.spnt		.L_divw_2nd_iter	};;
151089837Skris///////////////////////////////////////////////////////////
// Epilogue: merge the second digit into r8, restore ar.pfs and the
// predicates saved in the prologue, and undo the normalisation shift
// on the remainder.
151189837Skris{ .mii;	sub	H=H,r35
151289837Skris	or	r8=r8,r33
151389837Skris	mov	ar.pfs=r2		};;
151489837Skris{ .mii;	shr.u	r9=H,I			// remainder if anybody wants it
1515111147Snectar	mov	pr=r10,0x1ffff		}
151689837Skris{ .mfb;	br.ret.sptk.many	b0	};;
151789837Skris
151889837Skris// Unsigned 64-bit by 32-bit (well, by 64-bit for the moment) integer
151989837Skris// division procedure.
152089837Skris//
152189837Skris// inputs:	f6 = (double)a, f7 = (double)b
152289837Skris// output:	f8 = (int)(a/b)
152389837Skris// clobbered:	f8,f9,f10,f11,pred
152489837Skrispred=p15
1525142425Snectar// One can argue that this snippet is copyrighted to Intel
1526142425Snectar// Corporation, as it's essentially identical to one of those
1527142425Snectar// found in the "Divide, Square Root and Remainder" section at
1528142425Snectar// http://www.intel.com/software/products/opensource/libraries/num.htm.
1529142425Snectar// Yes, I admit that the referred code was used as a template,
1530142425Snectar// but only after I realized that there is hardly any other instruction
1531142425Snectar// sequence which would perform this operation. I mean, I figure that
1532142425Snectar// any independent attempt to implement high-performance division
1533142425Snectar// will result in code virtually identical to the Intel code. It
1534142425Snectar// should be noted though that the division kernel below is 1 cycle
1535142425Snectar// faster than the Intel one (note the commented splits:-), not to mention
1536142425Snectar// the original prologue (rather the lack of one) and epilogue.
153789837Skris.align	32
153889837Skris.skip	16
153989837Skris.L_udiv64_32_b6:
// Local division subroutine: entered with br.call through b6 and left
// with br.ret b6. frcpa seeds f8 and writes pred; every refinement
// step below is predicated on pred, so the whole chain is skipped when
// frcpa leaves pred clear. The bracketed numbers in the trailing
// comments look like the issue cycle of each instruction group
// (TODO confirm against the target's FP latency).
154089837Skris	frcpa.s1	f8,pred=f6,f7;;		// [0]  y0 = 1 / b
154189837Skris
154289837Skris(pred)	fnma.s1		f9=f7,f8,f1		// [5]  e0 = 1 - b * y0
154389837Skris(pred)	fmpy.s1		f10=f6,f8;;		// [5]  q0 = a * y0
154489837Skris(pred)	fmpy.s1		f11=f9,f9		// [10] e1 = e0 * e0
154589837Skris(pred)	fma.s1		f10=f9,f10,f10;;	// [10] q1 = q0 + e0 * q0
// The ';;' stops on the y1 and y2 lines are commented out on purpose:
// each of those instructions therefore shares an instruction group
// with the q2 / r2 computation that follows it.
154689837Skris(pred)	fma.s1		f8=f9,f8,f8	//;;	// [15] y1 = y0 + e0 * y0
154789837Skris(pred)	fma.s1		f9=f11,f10,f10;;	// [15] q2 = q1 + e1 * q1
154889837Skris(pred)	fma.s1		f8=f11,f8,f8	//;;	// [20] y2 = y1 + e1 * y1
154989837Skris(pred)	fnma.s1		f10=f7,f9,f6;;		// [20] r2 = a - b * q2
155089837Skris(pred)	fma.s1		f8=f10,f8,f9;;		// [25] q3 = q2 + r2 * y2
155189837Skris
// Not predicated: the conversion of f8 to an unsigned integer
// (truncating) runs whether or not the refinement above executed.
155289837Skris	fcvt.fxu.trunc.s1	f8=f8		// [30] q = trunc(q3)
155389837Skris	br.ret.sptk.many	b6;;
155489837Skris.endp	bn_div_words#
155589837Skris#endif
1556