ia64.S revision 89837
189837Skris.explicit
289837Skris.text
389837Skris.ident	"ia64.S, Version 1.1"
489837Skris.ident	"IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
589837Skris
689837Skris//
789837Skris// ====================================================================
889837Skris// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
989837Skris// project.
1089837Skris//
1189837Skris// Rights for redistribution and usage in source and binary forms are
1289837Skris// granted according to the OpenSSL license. Warranty of any kind is
1389837Skris// disclaimed.
1489837Skris// ====================================================================
1589837Skris//
1689837Skris
1789837Skris// Q.	How much faster does it get?
1889837Skris// A.	Here is the output from 'openssl speed rsa dsa' for vanilla
1989837Skris//	0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
2089837Skris//	Linux 7.1 2.96-81):
2189837Skris//
2289837Skris//	                  sign    verify    sign/s verify/s
2389837Skris//	rsa  512 bits   0.0036s   0.0003s    275.3   2999.2
2489837Skris//	rsa 1024 bits   0.0203s   0.0011s     49.3    894.1
2589837Skris//	rsa 2048 bits   0.1331s   0.0040s      7.5    250.9
2689837Skris//	rsa 4096 bits   0.9270s   0.0147s      1.1     68.1
2789837Skris//	                  sign    verify    sign/s verify/s
2889837Skris//	dsa  512 bits   0.0035s   0.0043s    288.3    234.8
2989837Skris//	dsa 1024 bits   0.0111s   0.0135s     90.0     74.2
3089837Skris//
3189837Skris//	And here is similar output but for this assembler
3289837Skris//	implementation:-)
3389837Skris//
3489837Skris//	                  sign    verify    sign/s verify/s
3589837Skris//	rsa  512 bits   0.0021s   0.0001s    549.4   9638.5
3689837Skris//	rsa 1024 bits   0.0055s   0.0002s    183.8   4481.1
3789837Skris//	rsa 2048 bits   0.0244s   0.0006s     41.4   1726.3
3889837Skris//	rsa 4096 bits   0.1295s   0.0018s      7.7    561.5
3989837Skris//	                  sign    verify    sign/s verify/s
4089837Skris//	dsa  512 bits   0.0012s   0.0013s    891.9    756.6
4189837Skris//	dsa 1024 bits   0.0023s   0.0028s    440.4    376.2
4289837Skris//
4389837Skris//	Yes, you may argue that it's not fair comparison as it's
4489837Skris//	possible to craft the C implementation with BN_UMULT_HIGH
4589837Skris//	inline assembler macro. But of course! Here is the output
4689837Skris//	with the macro:
4789837Skris//
4889837Skris//	                  sign    verify    sign/s verify/s
4989837Skris//	rsa  512 bits   0.0020s   0.0002s    495.0   6561.0
5089837Skris//	rsa 1024 bits   0.0086s   0.0004s    116.2   2235.7
5189837Skris//	rsa 2048 bits   0.0519s   0.0015s     19.3    667.3
5289837Skris//	rsa 4096 bits   0.3464s   0.0053s      2.9    187.7
5389837Skris//	                  sign    verify    sign/s verify/s
5489837Skris//	dsa  512 bits   0.0016s   0.0020s    613.1    510.5
5589837Skris//	dsa 1024 bits   0.0045s   0.0054s    221.0    183.9
5689837Skris//
5789837Skris//	My code is still way faster, huh:-) And I believe that even
5889837Skris//	higher performance can be achieved. Note that as keys get
5989837Skris//	longer, performance gain is larger. Why? According to the
6089837Skris//	profiler there is another player in the field, namely
6189837Skris//	BN_from_montgomery consuming larger and larger portion of CPU
6289837Skris//	time as keysize decreases. I therefore consider putting effort
6389837Skris//	to assembler implementation of the following routine:
6489837Skris//
6589837Skris//	void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
6689837Skris//	{
6789837Skris//	int      i,j;
6889837Skris//	BN_ULONG v;
6989837Skris//
7089837Skris//	for (i=0; i<nl; i++)
7189837Skris//		{
7289837Skris//		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
7389837Skris//		nrp++;
7489837Skris//		rp++;
7589837Skris//		if (((nrp[-1]+=v)&BN_MASK2) < v)
7689837Skris//			for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
7789837Skris//		}
7889837Skris//	}
7989837Skris//
8089837Skris//	It might as well be beneficial to implement even combaX
8189837Skris//	variants, as it appears as it can literally unleash the
8289837Skris//	performance (see comment section to bn_mul_comba8 below).
8389837Skris//
8489837Skris//	And finally for your reference the output for 0.9.6a compiled
8589837Skris//	with SGIcc version 0.01.0-12 (keep in mind that for the moment
8689837Skris//	of this writing it's not possible to convince SGIcc to use
8789837Skris//	BN_UMULT_HIGH inline assembler macro, yet the code is fast,
8889837Skris//	i.e. for a compiler generated one:-):
8989837Skris//
9089837Skris//	                  sign    verify    sign/s verify/s
9189837Skris//	rsa  512 bits   0.0022s   0.0002s    452.7   5894.3
9289837Skris//	rsa 1024 bits   0.0097s   0.0005s    102.7   2002.9
9389837Skris//	rsa 2048 bits   0.0578s   0.0017s     17.3    600.2
9489837Skris//	rsa 4096 bits   0.3838s   0.0061s      2.6    164.5
9589837Skris//	                  sign    verify    sign/s verify/s
9689837Skris//	dsa  512 bits   0.0018s   0.0022s    547.3    459.6
9789837Skris//	dsa 1024 bits   0.0051s   0.0062s    196.6    161.3
9889837Skris//
9989837Skris//	Oh! Benchmarks were performed on 733MHz Lion-class Itanium
10089837Skris//	system running Redhat Linux 7.1 (very special thanks to Ray
10189837Skris//	McCaffity of Williams Communications for providing an account).
10289837Skris//
10389837Skris// Q.	What's the heck with 'rum 1<<5' at the end of every function?
10489837Skris// A.	Well, by clearing the "upper FP registers written" bit of the
10589837Skris//	User Mask I want to excuse the kernel from preserving upper
10689837Skris//	(f32-f128) FP register bank over process context switch, thus
10789837Skris//	minimizing bus bandwidth consumption during the switch (i.e.
10889837Skris//	after PKI operation completes and the program is off doing
10989837Skris//	something else like bulk symmetric encryption). Having said
11089837Skris//	this, I also want to point out that it might be good idea
11189837Skris//	to compile the whole toolkit (as well as majority of the
11289837Skris//	programs for that matter) with -mfixed-range=f32-f127 command
11389837Skris//	line option. No, it doesn't prevent the compiler from writing
11489837Skris//	to upper bank, but at least discourages to do so. If you don't
11589837Skris//	like the idea you have the option to compile the module with
11689837Skris//	-Drum=nop.m in command line.
11789837Skris//
11889837Skris
11989837Skris#if 1
12089837Skris//
12189837Skris// bn_[add|sub]_words routines.
12289837Skris//
12389837Skris// Loops are spinning in 2*(n+5) ticks on Itanium (provided that the
12489837Skris// data reside in L1 cache, i.e. 2 ticks away). It's possible to
12589837Skris// compress the epilogue and get down to 2*n+6, but at the cost of
12689837Skris// scalability (the neat feature of this implementation is that it
12789837Skris// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
12889837Skris// I consider that the epilogue is short enough as it is to trade tiny
12989837Skris// performance loss on Itanium for scalability.
13089837Skris//
13189837Skris// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
13289837Skris//
// bn_add_words: multi-word addition, rp[i] = ap[i] + bp[i] with the carry
// passed from word to word.
//   in:  r32 = rp, r33 = ap, r34 = bp, r35 = num (compared as 32 bits)
//   out: r8  = 0, or 1 when p59 is set once the loop has drained (the
//        carry out of the last word); 0 straight away when num <= 0
// ar.pfs, ar.lc and pr are copied to r2, r3 and r9; pr and ar.lc are
// written back from r9/r3 just before the return.
13389837Skris.global	bn_add_words#
13489837Skris.proc	bn_add_words#
13589837Skris.align	64
13689837Skris.skip	32	// makes the loop body aligned at 64-byte boundary
13789837Skrisbn_add_words:
13889837Skris	.prologue
13989837Skris	.fframe	0
14089837Skris	.save	ar.pfs,r2
// Frame: 4 inputs, 12 locals, 0 outputs, 16 rotating registers.
// p6 = (num <= 0) selects the early return below.
14189837Skris{ .mii;	alloc		r2=ar.pfs,4,12,0,16
14289837Skris	cmp4.le		p6,p0=r35,r0	};;
14389837Skris{ .mfb;	mov		r8=r0			// return value
14489837Skris(p6)	br.ret.spnt.many	b0	};;
14589837Skris
14689837Skris	.save	ar.lc,r3
// r10 = num-1 is the value loaded into ar.lc further down.
14789837Skris{ .mib;	sub		r10=r35,r0,1
14889837Skris	mov		r3=ar.lc
14989837Skris	brp.loop.imp	.L_bn_add_words_ctop,.L_bn_add_words_cend-16
15089837Skris					}
15189837Skris	.body
// The three pointers move to r14-r16 because the loop itself writes
// r32 and r35, which belong to the rotating region requested by alloc.
// NOTE(review): pr is copied to r9 here, after .body, and no .save pr
// directive is present - confirm whether the unwind info needs one.
15289837Skris{ .mib;	mov		r14=r32			// rp
15389837Skris	mov		r9=pr		};;
15489837Skris{ .mii;	mov		r15=r33			// ap
15589837Skris	mov		ar.lc=r10
15689837Skris	mov		ar.ec=6		}
// pr.rot=1<<16 leaves p16 as the only rotating predicate that is set;
// together with ar.ec=6 this gives the six stages p16..p21 used below.
15789837Skris{ .mib;	mov		r16=r34			// bp
15889837Skris	mov		pr.rot=1<<16	};;
15989837Skris
// Pipelined loop body. Stages, keyed by qualifying predicate:
//   p16        load b and a
//   p18        sum = a + b
//   p19        p56 = (sum < a), i.e. the addition wrapped
//   p58 (p20)  carry coming in from the preceding word: add 1 to the
//              sum, and OR a further carry into p57 when the sum was -1
//   p21        store the finished word through r14
16089837Skris.L_bn_add_words_ctop:
16189837Skris{ .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
16289837Skris	(p18)	add		r39=r37,r34
16389837Skris	(p19)	cmp.ltu.unc	p56,p0=r40,r38	}
16489837Skris{ .mfb;	(p0)	nop.m		0x0
16589837Skris	(p0)	nop.f		0x0
16689837Skris	(p0)	nop.b		0x0		}
16789837Skris{ .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
16889837Skris	(p58)	cmp.eq.or	p57,p0=-1,r41	  // (p20)
16989837Skris	(p58)	add		r41=1,r41	} // (p20)
17089837Skris{ .mfb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
17189837Skris	(p0)	nop.f		0x0
17289837Skris	br.ctop.sptk	.L_bn_add_words_ctop	};;
17389837Skris.L_bn_add_words_cend:
17489837Skris
// r8 is still 0 from the prologue; p59 turns it into 1. The predicate
// registers and ar.lc are then restored from r9 and r3.
17589837Skris{ .mii;
17689837Skris(p59)	add		r8=1,r8		// return value
17789837Skris	mov		pr=r9,-1
17889837Skris	mov		ar.lc=r3	}
17989837Skris{ .mbb;	nop.b		0x0
18089837Skris	br.ret.sptk.many	b0	};;
18189837Skris.endp	bn_add_words#
18289837Skris
18389837Skris//
18489837Skris// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
18589837Skris//
// bn_sub_words: multi-word subtraction, rp[i] = ap[i] - bp[i] with the
// borrow passed from word to word.
//   in:  r32 = rp, r33 = ap, r34 = bp, r35 = num (compared as 32 bits)
//   out: r8  = 0, or 1 when p59 is set once the loop has drained (the
//        borrow out of the last word); 0 straight away when num <= 0
// ar.pfs, ar.lc and pr are copied to r2, r3 and r9; pr and ar.lc are
// written back from r9/r3 just before the return.
18689837Skris.global	bn_sub_words#
18789837Skris.proc	bn_sub_words#
18889837Skris.align	64
18989837Skris.skip	32	// makes the loop body aligned at 64-byte boundary
19089837Skrisbn_sub_words:
19189837Skris	.prologue
19289837Skris	.fframe	0
19389837Skris	.save	ar.pfs,r2
// Frame: 4 inputs, 12 locals, 0 outputs, 16 rotating registers.
// p6 = (num <= 0) selects the early return below.
19489837Skris{ .mii;	alloc		r2=ar.pfs,4,12,0,16
19589837Skris	cmp4.le		p6,p0=r35,r0	};;
19689837Skris{ .mfb;	mov		r8=r0			// return value
19789837Skris(p6)	br.ret.spnt.many	b0	};;
19889837Skris
19989837Skris	.save	ar.lc,r3
// r10 = num-1 is the value loaded into ar.lc further down.
20089837Skris{ .mib;	sub		r10=r35,r0,1
20189837Skris	mov		r3=ar.lc
20289837Skris	brp.loop.imp	.L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
20389837Skris					}
20489837Skris	.body
// The three pointers move to r14-r16 because the loop itself writes
// r32 and r35, which belong to the rotating region requested by alloc.
// NOTE(review): pr is copied to r9 here, after .body, and no .save pr
// directive is present - confirm whether the unwind info needs one.
20589837Skris{ .mib;	mov		r14=r32			// rp
20689837Skris	mov		r9=pr		};;
20789837Skris{ .mii;	mov		r15=r33			// ap
20889837Skris	mov		ar.lc=r10
20989837Skris	mov		ar.ec=6		}
// pr.rot=1<<16 leaves p16 as the only rotating predicate that is set;
// together with ar.ec=6 this gives the six stages p16..p21 used below.
21089837Skris{ .mib;	mov		r16=r34			// bp
21189837Skris	mov		pr.rot=1<<16	};;
21289837Skris
// Pipelined loop body. Stages, keyed by qualifying predicate:
//   p16        load b and a
//   p18        diff = a - b
//   p19        p56 = (diff > a), i.e. the subtraction wrapped
//   p58 (p20)  borrow coming in from the preceding word: subtract 1 from
//              the diff, and OR a further borrow into p57 when it was 0
//   p21        store the finished word through r14
21389837Skris.L_bn_sub_words_ctop:
21489837Skris{ .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
21589837Skris	(p18)	sub		r39=r37,r34
21689837Skris	(p19)	cmp.gtu.unc	p56,p0=r40,r38	}
21789837Skris{ .mfb;	(p0)	nop.m		0x0
21889837Skris	(p0)	nop.f		0x0
21989837Skris	(p0)	nop.b		0x0		}
22089837Skris{ .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
22189837Skris	(p58)	cmp.eq.or	p57,p0=0,r41	  // (p20)
22289837Skris	(p58)	add		r41=-1,r41	} // (p20)
22389837Skris{ .mbb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
22489837Skris	(p0)	nop.b		0x0
22589837Skris	br.ctop.sptk	.L_bn_sub_words_ctop	};;
22689837Skris.L_bn_sub_words_cend:
22789837Skris
// r8 is still 0 from the prologue; p59 turns it into 1. The predicate
// registers and ar.lc are then restored from r9 and r3.
22889837Skris{ .mii;
22989837Skris(p59)	add		r8=1,r8		// return value
23089837Skris	mov		pr=r9,-1
23189837Skris	mov		ar.lc=r3	}
23289837Skris{ .mbb;	nop.b		0x0
23389837Skris	br.ret.sptk.many	b0	};;
23489837Skris.endp	bn_sub_words#
23589837Skris#endif
23689837Skris
23789837Skris#if 0
23889837Skris#define XMA_TEMPTATION
23989837Skris#endif
24089837Skris
24189837Skris#if 1
24289837Skris//
24389837Skris// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
24489837Skris//
// bn_mul_words: multiplies each word of ap[] by w and writes one result
// word per input word to rp[], carrying the high part of each product
// into the next word.
//   in:  r32 = rp, r33 = ap, r34 = num (compared as 32 bits), r35 = w
//   out: r8  = value left in r34 after the loop, plus 1 when p53 is set
//        (presumably the carry word out of the last product - confirm);
//        0 straight away when num <= 0
// Two variants are selected by XMA_TEMPTATION; the default (macro not
// defined) is the integer-carry loop. ar.lc and pr are saved in r3/r9 and
// restored before returning; um.mfh is cleared with rum on the way out.
24589837Skris.global	bn_mul_words#
24689837Skris.proc	bn_mul_words#
24789837Skris.align	64
24889837Skris.skip	32	// makes the loop body aligned at 64-byte boundary
24989837Skrisbn_mul_words:
25089837Skris	.prologue
25189837Skris	.fframe	0
25289837Skris	.save	ar.pfs,r2
// The default variant asks for 4 locals and 8 rotating general registers;
// the XMA variant asks for no locals and no rotating general registers.
25389837Skris#ifdef XMA_TEMPTATION
25489837Skris{ .mfi;	alloc		r2=ar.pfs,4,0,0,0	};;
25589837Skris#else
25689837Skris{ .mfi;	alloc		r2=ar.pfs,4,4,0,8	};;
25789837Skris#endif
25889837Skris{ .mib;	mov		r8=r0			// return value
25989837Skris	cmp4.le		p6,p0=r34,r0
26089837Skris(p6)	br.ret.spnt.many	b0		};;
26189837Skris
26289837Skris	.save	ar.lc,r3
// r10 = num-1 is the value loaded into ar.lc further down.
26389837Skris{ .mii;	sub	r10=r34,r0,1
26489837Skris	mov	r3=ar.lc
26589837Skris	mov	r9=pr			};;
26689837Skris
26789837Skris	.body
// w is moved into f8 so that it can feed the FP multiplier.
// 0x400001<<16 sets two rotating predicates: p16 and p38.
26889837Skris{ .mib;	setf.sig	f8=r35	// w
26989837Skris	mov		pr.rot=0x400001<<16
27089837Skris			// ------^----- serves as (p48) at first (p26)
27189837Skris	brp.loop.imp	.L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
27289837Skris					}
27389837Skris
27489837Skris#ifndef XMA_TEMPTATION
27589837Skris
// rp and ap are copied to r14/r15 because r32.. rotate in this variant.
27689837Skris{ .mii;	mov		r14=r32	// rp
27789837Skris	mov		r15=r33	// ap
27889837Skris	mov		ar.lc=r10	}
27989837Skris{ .mii;	mov		r39=0	// serves as r33 at first (p26)
28089837Skris	mov		ar.ec=12	};;
28189837Skris
28289837Skris// This loop spins in 2*(n+11) ticks. It's scheduled for data in L2
28389837Skris// cache (i.e. 9 ticks away) as floating point load/store instructions
28489837Skris// bypass L1 cache and L2 latency is actually best-case scenario for
28589837Skris// ldf8. The loop is not scalable and shall run in 2*(n+11) even on
28689837Skris// "wider" IA-64 implementations. It's a trade-off here. n+22 loop
28789837Skris// would give us ~5% in *overall* performance improvement on "wider"
28889837Skris// IA-64, but would hurt Itanium for about same because of longer
28989837Skris// epilogue. As it's a matter of few percents in either case I've
29089837Skris// chosen to trade the scalability for development time (you can see
29189837Skris// this very instruction sequence in bn_mul_add_words loop which in
29289837Skris// turn is scalable).
//
// Stages, keyed by qualifying predicate:
//   p16      ldf8 of the next ap word
//   p21      xmpy.lu / xmpy.hu: low and high halves of word * w
//   p25      low half moved to a general register
//   p26      high half moved to a general register; p48/p52 (mutually
//            exclusive) pick the add without or with an extra 1
//   p27      cmp.ltu produces the p52/p48 pair; result word is stored
29389837Skris.L_bn_mul_words_ctop:
29489837Skris{ .mfi;	(p25)	getf.sig	r36=f49			// low
29589837Skris	(p21)	xmpy.lu		f45=f37,f8
29689837Skris	(p27)	cmp.ltu		p52,p48=r39,r38	}
29789837Skris{ .mfi;	(p16)	ldf8		f32=[r15],8
29889837Skris	(p21)	xmpy.hu		f38=f37,f8
29989837Skris	(p0)	nop.i		0x0		};;
30089837Skris{ .mii;	(p26)	getf.sig	r32=f43			// high
30189837Skris	.pred.rel	"mutex",p48,p52
30289837Skris	(p48)	add		r38=r37,r33		// (p26)
30389837Skris	(p52)	add		r38=r37,r33,1	}	// (p26)
30489837Skris{ .mfb;	(p27)	st8		[r14]=r39,8
30589837Skris	(p0)	nop.f		0x0
30689837Skris	br.ctop.sptk	.L_bn_mul_words_ctop	};;
30789837Skris.L_bn_mul_words_cend:
30889837Skris
// Return value: r34, incremented when p53 rather than p49 is set.
30989837Skris{ .mii;	nop.m		0x0
31089837Skris.pred.rel	"mutex",p49,p53
31189837Skris(p49)	add		r8=r34,r0
31289837Skris(p53)	add		r8=r34,r0,1	}
31389837Skris{ .mfb;	nop.m	0x0
31489837Skris	nop.f	0x0
31589837Skris	nop.b	0x0			}
31689837Skris
31789837Skris#else	// XMA_TEMPTATION
31889837Skris
31989837Skris	setf.sig	f37=r0	// serves as carry at (p18) tick
32089837Skris	mov		ar.lc=r10
32189837Skris	mov		ar.ec=5;;
32289837Skris
32389837Skris// Most of you examining this code very likely wonder why in the name
32489837Skris// of Intel the following loop is commented out? Indeed, it looks so
32589837Skris// neat that you find it hard to believe that it's something wrong
32689837Skris// with it, right? The catch is that every iteration depends on the
32789837Skris// result from previous one and the latter isn't available instantly.
32889837Skris// The loop therefore spins at the latency of xma minus 1, or in other
32989837Skris// words at 6*(n+4) ticks:-( Compare to the "production" loop above
33089837Skris// that runs in 2*(n+11) where the low latency problem is worked around
33189837Skris// by moving the dependency to one-tick latent integer ALU. Note that
33289837Skris// "distance" between ldf8 and xma is not latency of ldf8, but the
33389837Skris// *difference* between xma and ldf8 latencies.
//
// This variant addresses memory through r33/r32 directly (its alloc
// requests no rotating general registers) and keeps the running carry in
// the rotating FP registers.
33489837Skris.L_bn_mul_words_ctop:
33589837Skris{ .mfi;	(p16)	ldf8		f32=[r33],8
33689837Skris	(p18)	xma.hu		f38=f34,f8,f39	}
33789837Skris{ .mfb;	(p20)	stf8		[r32]=f37,8
33889837Skris	(p18)	xma.lu		f35=f34,f8,f39
33989837Skris	br.ctop.sptk	.L_bn_mul_words_ctop	};;
34089837Skris.L_bn_mul_words_cend:
34189837Skris
34289837Skris	getf.sig	r8=f41		// the return value
34389837Skris
34489837Skris#endif	// XMA_TEMPTATION
34589837Skris
// Common exit: restore predicates and ar.lc from r9/r3.
34689837Skris{ .mii;	nop.m		0x0
34789837Skris	mov		pr=r9,-1
34889837Skris	mov		ar.lc=r3	}
34989837Skris{ .mfb;	rum		1<<5		// clear um.mfh
35089837Skris	nop.f		0x0
35189837Skris	br.ret.sptk.many	b0	};;
35289837Skris.endp	bn_mul_words#
35389837Skris#endif
35489837Skris
35589837Skris#if 1
35689837Skris//
35789837Skris// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
35889837Skris//
// bn_mul_add_words: multiplies each word of ap[] by w, adds the result
// into the matching word already stored at rp[] and writes it back,
// carrying between words.
//   in:  r32 = rp, r33 = ap, r34 = num (compared as 32 bits), r35 = w
//   out: r8  = value left in r36 after the loop, plus 1 when p55 is set,
//        plus 1 more when p59 is set (presumably the final carry word -
//        confirm); 0 straight away when num <= 0
// ar.lc and pr are saved in r3/r9 and restored before returning; um.mfh
// is cleared with rum on the way out.
35989837Skris.global	bn_mul_add_words#
36089837Skris.proc	bn_mul_add_words#
36189837Skris.align	64
36289837Skris//.skip	0	// makes the loop split at 64-byte boundary
36389837Skrisbn_mul_add_words:
36489837Skris	.prologue
36589837Skris	.fframe	0
36689837Skris	.save	ar.pfs,r2
// Frame: 4 inputs, 12 locals, 0 outputs, 16 rotating registers.
36789837Skris{ .mii;	alloc		r2=ar.pfs,4,12,0,16
36889837Skris	cmp4.le		p6,p0=r34,r0	};;
36989837Skris{ .mfb;	mov		r8=r0			// return value
37089837Skris(p6)	br.ret.spnt.many	b0	};;
37189837Skris
37289837Skris	.save	ar.lc,r3
// r10 = num-1 is the value loaded into ar.lc further down.
37389837Skris{ .mii;	sub	r10=r34,r0,1
37489837Skris	mov	r3=ar.lc
37589837Skris	mov	r9=pr			};;
37689837Skris
37789837Skris	.body
// w is moved into f8 so that it can feed the FP multiplier.
// 0x400001<<16 sets two rotating predicates: p16 and p38.
37889837Skris{ .mib;	setf.sig	f8=r35	// w
37989837Skris	mov		pr.rot=0x400001<<16
38089837Skris			// ------^----- serves as (p48) at first (p26)
38189837Skris	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
38289837Skris					}
// Two copies of rp are kept: r18 is the read pointer (ld8 at p26) and
// r14 the write pointer (st8 at p29).
38389837Skris{ .mii;	mov		r14=r32	// rp
38489837Skris	mov		r15=r33	// ap
38589837Skris	mov		ar.lc=r10	}
38689837Skris{ .mii;	mov		r39=0	// serves as r33 at first (p26)
38789837Skris	mov		r18=r32	// rp copy
38889837Skris	mov		ar.ec=14	};;
38989837Skris
39089837Skris// This loop spins in 3*(n+13) ticks on Itanium and should spin in
39189837Skris// 2*(n+13) on "wider" IA-64 implementations (to be verified with new
39289837Skris// micro-architecture manuals as they become available). As usual it's
39389837Skris// possible to compress the epilogue, down to 10 in this case, at the
39489837Skris// cost of scalability. Compressed (and therefore non-scalable) loop
39589837Skris// running at 3*(n+10) would buy you ~10% on Itanium but take ~35%
39689837Skris// from "wider" IA-64 so let it be scalable! Special attention was
39789837Skris// paid for having the loop body split at 64-byte boundary. ld8 is
39889837Skris// scheduled for L1 cache as the data is more than likely there.
39989837Skris// Indeed, bn_mul_words has put it there a moment ago:-)
//
// Stages, keyed by qualifying predicate:
//   p16      ldf8 of the next ap word
//   p21      xmpy.lu / xmpy.hu: low and high halves of word * w
//   p25      low half moved to a general register
//   p26      high half moved to a general register; existing rp word
//            loaded through r18; p48/p52 (mutually exclusive) pick the
//            add without or with an extra 1
//   p27      cmp.ltu produces the p52/p48 pair; r39 is added into the
//            loaded rp word (r43) and p56 records whether that wrapped
//   p58      carry-in handling: add 1 to r44 and OR a further carry into
//            p57 when r44 was -1
//   p29      store the finished word through r14
40089837Skris.L_bn_mul_add_words_ctop:
40189837Skris{ .mfi;	(p25)	getf.sig	r36=f49			// low
40289837Skris	(p21)	xmpy.lu		f45=f37,f8
40389837Skris	(p27)	cmp.ltu		p52,p48=r39,r38	}
40489837Skris{ .mfi;	(p16)	ldf8		f32=[r15],8
40589837Skris	(p21)	xmpy.hu		f38=f37,f8
40689837Skris	(p27)	add		r43=r43,r39	};;
40789837Skris{ .mii;	(p26)	getf.sig	r32=f43			// high
40889837Skris	.pred.rel	"mutex",p48,p52
40989837Skris	(p48)	add		r38=r37,r33		// (p26)
41089837Skris	(p52)	add		r38=r37,r33,1	}	// (p26)
41189837Skris{ .mfb;	(p27)	cmp.ltu.unc	p56,p0=r43,r39
41289837Skris	(p0)	nop.f		0x0
41389837Skris	(p0)	nop.b		0x0		}
41489837Skris{ .mii;	(p26)	ld8		r42=[r18],8
41589837Skris	(p58)	cmp.eq.or	p57,p0=-1,r44
41689837Skris	(p58)	add		r44=1,r44	}
41789837Skris{ .mfb;	(p29)	st8		[r14]=r45,8
41889837Skris	(p0)	nop.f		0x0
41989837Skris	br.ctop.sptk	.L_bn_mul_add_words_ctop};;
42089837Skris.L_bn_mul_add_words_cend:
42189837Skris
// Return value is assembled in two steps: r36 (+1 when p55 rather than
// p51 is set), then another +1 when p59 is set.
42289837Skris{ .mii;	nop.m		0x0
42389837Skris.pred.rel	"mutex",p51,p55
42489837Skris(p51)	add		r8=r36,r0
42589837Skris(p55)	add		r8=r36,r0,1	}
42689837Skris{ .mfb;	nop.m	0x0
42789837Skris	nop.f	0x0
42889837Skris	nop.b	0x0			};;
42989837Skris{ .mii;
43089837Skris(p59)	add		r8=1,r8
43189837Skris	mov		pr=r9,-1
43289837Skris	mov		ar.lc=r3	}
43389837Skris{ .mfb;	rum		1<<5		// clear um.mfh
43489837Skris	nop.f		0x0
43589837Skris	br.ret.sptk.many	b0	};;
43689837Skris.endp	bn_mul_add_words#
43789837Skris#endif
43889837Skris
43989837Skris#if 1
44089837Skris//
44189837Skris// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
44289837Skris//
// bn_sqr_words: squares each word of ap[] and stores the 128-bit result
// as two consecutive words in rp[] (low half first, then high half).
//   in:  r32 = rp, r33 = ap, r34 = num (sign-extended from 32 bits)
//   out: r8 is set to 0 although the C prototype above is void
// Returns immediately when num <= 0. ar.lc and pr are saved in r3/r9 and
// restored before returning; um.mfh is cleared with rum on the way out.
44389837Skris.global	bn_sqr_words#
44489837Skris.proc	bn_sqr_words#
44589837Skris.align	64
44689837Skris.skip	32	// makes the loop body aligned at 64-byte boundary
44789837Skrisbn_sqr_words:
44889837Skris	.prologue
44989837Skris	.fframe	0
45089837Skris	.save	ar.pfs,r2
// Frame: 3 inputs and nothing else - no rotating general registers, so
// r32/r33/r34 can be used as plain pointers inside the loop.
45189837Skris{ .mii;	alloc		r2=ar.pfs,3,0,0,0
45289837Skris	sxt4		r34=r34		};;
45389837Skris{ .mii;	cmp.le		p6,p0=r34,r0
45489837Skris	mov		r8=r0		}	// return value
45589837Skris{ .mfb;	nop.f		0x0
45689837Skris(p6)	br.ret.spnt.many	b0	};;
45789837Skris
45889837Skris	.save	ar.lc,r3
// r10 = num-1 is the value loaded into ar.lc further down.
45989837Skris{ .mii;	sub	r10=r34,r0,1
46089837Skris	mov	r3=ar.lc
46189837Skris	mov	r9=pr			};;
46289837Skris
46389837Skris	.body
46489837Skris{ .mib;
46589837Skris	mov		pr.rot=1<<16
46689837Skris	brp.loop.imp	.L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
46789837Skris					}
// num is no longer needed, so r34 is reused as a second store pointer,
// rp+8. r32 and r34 each advance by 16 per store.
46889837Skris{ .mii;	add		r34=8,r32
46989837Skris	mov		ar.lc=r10
47089837Skris	mov		ar.ec=18	};;
47189837Skris
47289837Skris// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
47389837Skris// possible to compress the epilogue (I'm getting tired to write this
47489837Skris// comment over and over) and get down to 2*n+16 at the cost of
47589837Skris// scalability. The decision will very likely be reconsidered after the
47689837Skris// benchmark program is profiled. I.e. if performance gain on Itanium
47789837Skris// will appear larger than loss on "wider" IA-64, then the loop should
47889837Skris// be explicitly split and the epilogue compressed.
//
// Stages, keyed by qualifying predicate:
//   p16      ldf8 of the next ap word (r33 post-incremented by 8)
//   p25      xmpy.lu / xmpy.hu of the word with itself
//   p33      stf8 of the low half through r32 and the high half through r34
47989837Skris.L_bn_sqr_words_ctop:
48089837Skris{ .mfi;	(p16)	ldf8		f32=[r33],8
48189837Skris	(p25)	xmpy.lu		f42=f41,f41
48289837Skris	(p0)	nop.i		0x0		}
48389837Skris{ .mib;	(p33)	stf8		[r32]=f50,16
48489837Skris	(p0)	nop.i		0x0
48589837Skris	(p0)	nop.b		0x0		}
48689837Skris{ .mfi;	(p0)	nop.m		0x0
48789837Skris	(p25)	xmpy.hu		f52=f41,f41
48889837Skris	(p0)	nop.i		0x0		}
48989837Skris{ .mib;	(p33)	stf8		[r34]=f60,16
49089837Skris	(p0)	nop.i		0x0
49189837Skris	br.ctop.sptk	.L_bn_sqr_words_ctop	};;
49289837Skris.L_bn_sqr_words_cend:
49389837Skris
// Restore predicates and ar.lc from r9/r3 and return.
49489837Skris{ .mii;	nop.m		0x0
49589837Skris	mov		pr=r9,-1
49689837Skris	mov		ar.lc=r3	}
49789837Skris{ .mfb;	rum		1<<5		// clear um.mfh
49889837Skris	nop.f		0x0
49989837Skris	br.ret.sptk.many	b0	};;
50089837Skris.endp	bn_sqr_words#
50189837Skris#endif
50289837Skris
50389837Skris#if 1
50489837Skris// Apparently we win nothing by implementing special bn_sqr_comba8.
50589837Skris// Yes, it is possible to reduce the number of multiplications by
50689837Skris// almost factor of two, but then the amount of additions would
50789837Skris// increase by factor of two (as we would have to perform those
50889837Skris// otherwise performed by xma ourselves). Normally we would trade
50989837Skris// anyway as multiplications are way more expensive, but not this
51089837Skris// time... Multiplication kernel is fully pipelined and as we drain
51189837Skris// one 128-bit multiplication result per clock cycle multiplications
51289837Skris// are effectively as inexpensive as additions. Special implementation
51389837Skris// might become of interest for "wider" IA-64 implementation as you'll
51489837Skris// be able to get through the multiplication phase faster (there won't
51589837Skris// be any stall issues as discussed in the commentary section below and
51689837Skris// you therefore will be able to employ all 4 FP units)... But these
51789837Skris// Itanium days it's simply too hard to justify the effort so I just
51889837Skris// drop down to bn_mul_comba8 code:-)
51989837Skris//
52089837Skris// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
52189837Skris//
// bn_sqr_comba8: squaring done by reusing the bn_mul_comba8 body with
// both operands pointing at a[].
//   in:  r32 = r, r33 = a
// The frame gets one local (r34), which is loaded with a so that it
// stands in for the third argument of bn_mul_comba8. r14-r18 are then set
// to the same offsets bn_mul_comba8 computes ahead of
// .L_cheat_entry_point8 (r14/r15/r16 = r33+8/16/24, r17/r18 = r34+8/16)
// and control is transferred there; this routine has no return of its own.
52289837Skris.global	bn_sqr_comba8#
52389837Skris.proc	bn_sqr_comba8#
52489837Skris.align	64
52589837Skrisbn_sqr_comba8:
52689837Skris	.prologue
52789837Skris	.fframe	0
52889837Skris	.save	ar.pfs,r2
52989837Skris{ .mii;	alloc	r2=ar.pfs,2,1,0,0
53089837Skris	mov	r34=r33
53189837Skris	add	r14=8,r33		};;
53289837Skris	.body
// r34 is read only after the stop above, once the copy from r33 is done.
53389837Skris{ .mii;	add	r17=8,r34
53489837Skris	add	r15=16,r33
53589837Skris	add	r18=16,r34		}
53689837Skris{ .mfb;	add	r16=24,r33
53789837Skris	br	.L_cheat_entry_point8	};;
53889837Skris.endp	bn_sqr_comba8#
53989837Skris#endif
54089837Skris
54189837Skris#if 1
54289837Skris// I've estimated this routine to run in ~120 ticks, but in reality
54389837Skris// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
54489837Skris// cycles consumed for instructions fetch? Or did I misinterpret some
54589837Skris// clause in Itanium micro-architecture manual? Comments are welcomed and
54689837Skris// highly appreciated.
54789837Skris//
54889837Skris// However! It should be noted that even 160 ticks is darn good result
54989837Skris// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
55089837Skris// C version (compiled with gcc with inline assembler). I really
55189837Skris// kicked compiler's butt here, didn't I? Yeah! This brings us to the
55289837Skris// following statement. It's damn shame that this routine isn't called
55389837Skris// very often nowadays! According to the profiler most CPU time is
55489837Skris// consumed by bn_mul_add_words called from BN_from_montgomery. In
55589837Skris// order to estimate what we're missing, I've compared the performance
55689837Skris// of this routine against "traditional" implementation, i.e. against
55789837Skris// following routine:
55889837Skris//
55989837Skris// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
56089837Skris// {	r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
56189837Skris//	r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
56289837Skris//	r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
56389837Skris//	r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
56489837Skris//	r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
56589837Skris//	r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
56689837Skris//	r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
56789837Skris//	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
56889837Skris// }
56989837Skris//
57089837Skris// The one below is over 8 times faster than the one above:-( Even
57189837Skris// more reasons to "combafy" bn_mul_add_mont...
57289837Skris//
57389837Skris// And yes, this routine really made me wish there were an optimizing
57489837Skris// assembler! It also feels like it deserves a dedication.
57589837Skris//
57689837Skris//	To my wife for being there and to my kids...
57789837Skris//
57889837Skris// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
57989837Skris//
58089837Skris#define	carry1	r14
58189837Skris#define	carry2	r15
58289837Skris#define	carry3	r34
58389837Skris.global	bn_mul_comba8#
58489837Skris.proc	bn_mul_comba8#
58589837Skris.align	64
58689837Skrisbn_mul_comba8:
58789837Skris	.prologue
58889837Skris	.fframe	0
58989837Skris	.save	ar.pfs,r2
59089837Skris{ .mii;	alloc	r2=ar.pfs,3,0,0,0
59189837Skris	add	r14=8,r33
59289837Skris	add	r17=8,r34		}
59389837Skris	.body
59489837Skris{ .mii;	add	r15=16,r33
59589837Skris	add	r18=16,r34
59689837Skris	add	r16=24,r33		}
59789837Skris.L_cheat_entry_point8:
59889837Skris{ .mmi;	add	r19=24,r34
59989837Skris
60089837Skris	ldf8	f32=[r33],32		};;
60189837Skris
60289837Skris{ .mmi;	ldf8	f120=[r34],32
60389837Skris	ldf8	f121=[r17],32		}
60489837Skris{ .mmi;	ldf8	f122=[r18],32
60589837Skris	ldf8	f123=[r19],32		};;
60689837Skris{ .mmi;	ldf8	f124=[r34]
60789837Skris	ldf8	f125=[r17]		}
60889837Skris{ .mmi;	ldf8	f126=[r18]
60989837Skris	ldf8	f127=[r19]		}
61089837Skris
61189837Skris{ .mmi;	ldf8	f33=[r14],32
61289837Skris	ldf8	f34=[r15],32		}
61389837Skris{ .mmi;	ldf8	f35=[r16],32;;
61489837Skris	ldf8	f36=[r33]		}
61589837Skris{ .mmi;	ldf8	f37=[r14]
61689837Skris	ldf8	f38=[r15]		}
61789837Skris{ .mfi;	ldf8	f39=[r16]
61889837Skris// -------\ Entering multiplier's heaven /-------
61989837Skris// ------------\                    /------------
62089837Skris// -----------------\          /-----------------
62189837Skris// ----------------------\/----------------------
62289837Skris		xma.hu	f41=f32,f120,f0		}
62389837Skris{ .mfi;		xma.lu	f40=f32,f120,f0		};; // (*)
62489837Skris{ .mfi;		xma.hu	f51=f32,f121,f0		}
62589837Skris{ .mfi;		xma.lu	f50=f32,f121,f0		};;
62689837Skris{ .mfi;		xma.hu	f61=f32,f122,f0		}
62789837Skris{ .mfi;		xma.lu	f60=f32,f122,f0		};;
62889837Skris{ .mfi;		xma.hu	f71=f32,f123,f0		}
62989837Skris{ .mfi;		xma.lu	f70=f32,f123,f0		};;
63089837Skris{ .mfi;		xma.hu	f81=f32,f124,f0		}
63189837Skris{ .mfi;		xma.lu	f80=f32,f124,f0		};;
63289837Skris{ .mfi;		xma.hu	f91=f32,f125,f0		}
63389837Skris{ .mfi;		xma.lu	f90=f32,f125,f0		};;
63489837Skris{ .mfi;		xma.hu	f101=f32,f126,f0	}
63589837Skris{ .mfi;		xma.lu	f100=f32,f126,f0	};;
63689837Skris{ .mfi;		xma.hu	f111=f32,f127,f0	}
63789837Skris{ .mfi;		xma.lu	f110=f32,f127,f0	};;//
63889837Skris// (*)	You can argue that splitting at every second bundle would
63989837Skris//	prevent "wider" IA-64 implementations from achieving the peak
64089837Skris//	performance. Well, not really... The catch is that if you
64189837Skris//	intend to keep 4 FP units busy by splitting at every fourth
64289837Skris//	bundle and thus perform these 16 multiplications in 4 ticks,
64389837Skris//	the first bundle *below* would stall because the result from
64489837Skris//	the first xma bundle *above* won't be available for another 3
64589837Skris//	ticks (if not more, being an optimist, I assume that "wider"
64689837Skris//	implementation will have same latency:-). This stall will hold
64789837Skris//	you back and the performance would be as if every second bundle
64889837Skris//	were split *anyway*...
64989837Skris{ .mfi;	getf.sig	r16=f40
65089837Skris		xma.hu	f42=f33,f120,f41
65189837Skris	add		r33=8,r32		}
65289837Skris{ .mfi;		xma.lu	f41=f33,f120,f41	};;
65389837Skris{ .mfi;	getf.sig	r24=f50
65489837Skris		xma.hu	f52=f33,f121,f51	}
65589837Skris{ .mfi;		xma.lu	f51=f33,f121,f51	};;
65689837Skris{ .mfi;	st8		[r32]=r16,16
65789837Skris		xma.hu	f62=f33,f122,f61	}
65889837Skris{ .mfi;		xma.lu	f61=f33,f122,f61	};;
65989837Skris{ .mfi;		xma.hu	f72=f33,f123,f71	}
66089837Skris{ .mfi;		xma.lu	f71=f33,f123,f71	};;
66189837Skris{ .mfi;		xma.hu	f82=f33,f124,f81	}
66289837Skris{ .mfi;		xma.lu	f81=f33,f124,f81	};;
66389837Skris{ .mfi;		xma.hu	f92=f33,f125,f91	}
66489837Skris{ .mfi;		xma.lu	f91=f33,f125,f91	};;
66589837Skris{ .mfi;		xma.hu	f102=f33,f126,f101	}
66689837Skris{ .mfi;		xma.lu	f101=f33,f126,f101	};;
66789837Skris{ .mfi;		xma.hu	f112=f33,f127,f111	}
66889837Skris{ .mfi;		xma.lu	f111=f33,f127,f111	};;//
66989837Skris//-------------------------------------------------//
67089837Skris{ .mfi;	getf.sig	r25=f41
67189837Skris		xma.hu	f43=f34,f120,f42	}
67289837Skris{ .mfi;		xma.lu	f42=f34,f120,f42	};;
67389837Skris{ .mfi;	getf.sig	r16=f60
67489837Skris		xma.hu	f53=f34,f121,f52	}
67589837Skris{ .mfi;		xma.lu	f52=f34,f121,f52	};;
67689837Skris{ .mfi;	getf.sig	r17=f51
67789837Skris		xma.hu	f63=f34,f122,f62
67889837Skris	add		r25=r25,r24		}
67989837Skris{ .mfi;		xma.lu	f62=f34,f122,f62
68089837Skris	mov		carry1=0		};;
68189837Skris{ .mfi;	cmp.ltu		p6,p0=r25,r24
68289837Skris		xma.hu	f73=f34,f123,f72	}
68389837Skris{ .mfi;		xma.lu	f72=f34,f123,f72	};;
68489837Skris{ .mfi;	st8		[r33]=r25,16
68589837Skris		xma.hu	f83=f34,f124,f82
68689837Skris(p6)	add		carry1=1,carry1		}
68789837Skris{ .mfi;		xma.lu	f82=f34,f124,f82	};;
68889837Skris{ .mfi;		xma.hu	f93=f34,f125,f92	}
68989837Skris{ .mfi;		xma.lu	f92=f34,f125,f92	};;
69089837Skris{ .mfi;		xma.hu	f103=f34,f126,f102	}
69189837Skris{ .mfi;		xma.lu	f102=f34,f126,f102	};;
69289837Skris{ .mfi;		xma.hu	f113=f34,f127,f112	}
69389837Skris{ .mfi;		xma.lu	f112=f34,f127,f112	};;//
69489837Skris//-------------------------------------------------//
69589837Skris{ .mfi;	getf.sig	r18=f42
69689837Skris		xma.hu	f44=f35,f120,f43
69789837Skris	add		r17=r17,r16		}
69889837Skris{ .mfi;		xma.lu	f43=f35,f120,f43	};;
69989837Skris{ .mfi;	getf.sig	r24=f70
70089837Skris		xma.hu	f54=f35,f121,f53	}
70189837Skris{ .mfi;	mov		carry2=0
70289837Skris		xma.lu	f53=f35,f121,f53	};;
70389837Skris{ .mfi;	getf.sig	r25=f61
70489837Skris		xma.hu	f64=f35,f122,f63
70589837Skris	cmp.ltu		p7,p0=r17,r16		}
70689837Skris{ .mfi;	add		r18=r18,r17
70789837Skris		xma.lu	f63=f35,f122,f63	};;
70889837Skris{ .mfi;	getf.sig	r26=f52
70989837Skris		xma.hu	f74=f35,f123,f73
71089837Skris(p7)	add		carry2=1,carry2		}
71189837Skris{ .mfi;	cmp.ltu		p7,p0=r18,r17
71289837Skris		xma.lu	f73=f35,f123,f73
71389837Skris	add		r18=r18,carry1		};;
71489837Skris{ .mfi;
71589837Skris		xma.hu	f84=f35,f124,f83
71689837Skris(p7)	add		carry2=1,carry2		}
71789837Skris{ .mfi;	cmp.ltu		p7,p0=r18,carry1
71889837Skris		xma.lu	f83=f35,f124,f83	};;
71989837Skris{ .mfi;	st8		[r32]=r18,16
72089837Skris		xma.hu	f94=f35,f125,f93
72189837Skris(p7)	add		carry2=1,carry2		}
72289837Skris{ .mfi;		xma.lu	f93=f35,f125,f93	};;
72389837Skris{ .mfi;		xma.hu	f104=f35,f126,f103	}
72489837Skris{ .mfi;		xma.lu	f103=f35,f126,f103	};;
72589837Skris{ .mfi;		xma.hu	f114=f35,f127,f113	}
72689837Skris{ .mfi;	mov		carry1=0
72789837Skris		xma.lu	f113=f35,f127,f113
72889837Skris	add		r25=r25,r24		};;//
72989837Skris//-------------------------------------------------//
73089837Skris{ .mfi;	getf.sig	r27=f43
73189837Skris		xma.hu	f45=f36,f120,f44
73289837Skris	cmp.ltu		p6,p0=r25,r24		}
73389837Skris{ .mfi;		xma.lu	f44=f36,f120,f44
73489837Skris	add		r26=r26,r25		};;
73589837Skris{ .mfi;	getf.sig	r16=f80
73689837Skris		xma.hu	f55=f36,f121,f54
73789837Skris(p6)	add		carry1=1,carry1		}
73889837Skris{ .mfi;		xma.lu	f54=f36,f121,f54	};;
73989837Skris{ .mfi;	getf.sig	r17=f71
74089837Skris		xma.hu	f65=f36,f122,f64
74189837Skris	cmp.ltu		p6,p0=r26,r25		}
74289837Skris{ .mfi;		xma.lu	f64=f36,f122,f64
74389837Skris	add		r27=r27,r26		};;
74489837Skris{ .mfi;	getf.sig	r18=f62
74589837Skris		xma.hu	f75=f36,f123,f74
74689837Skris(p6)	add		carry1=1,carry1		}
74789837Skris{ .mfi;	cmp.ltu		p6,p0=r27,r26
74889837Skris		xma.lu	f74=f36,f123,f74
74989837Skris	add		r27=r27,carry2		};;
75089837Skris{ .mfi;	getf.sig	r19=f53
75189837Skris		xma.hu	f85=f36,f124,f84
75289837Skris(p6)	add		carry1=1,carry1		}
75389837Skris{ .mfi;		xma.lu	f84=f36,f124,f84
75489837Skris	cmp.ltu		p6,p0=r27,carry2	};;
75589837Skris{ .mfi;	st8		[r33]=r27,16
75689837Skris		xma.hu	f95=f36,f125,f94
75789837Skris(p6)	add		carry1=1,carry1		}
75889837Skris{ .mfi;		xma.lu	f94=f36,f125,f94	};;
75989837Skris{ .mfi;		xma.hu	f105=f36,f126,f104	}
76089837Skris{ .mfi;	mov		carry2=0
76189837Skris		xma.lu	f104=f36,f126,f104
76289837Skris	add		r17=r17,r16		};;
76389837Skris{ .mfi;		xma.hu	f115=f36,f127,f114
76489837Skris	cmp.ltu		p7,p0=r17,r16		}
76589837Skris{ .mfi;		xma.lu	f114=f36,f127,f114
76689837Skris	add		r18=r18,r17		};;//
76789837Skris//-------------------------------------------------//
76889837Skris{ .mfi;	getf.sig	r20=f44
76989837Skris		xma.hu	f46=f37,f120,f45
77089837Skris(p7)	add		carry2=1,carry2		}
77189837Skris{ .mfi;	cmp.ltu		p7,p0=r18,r17
77289837Skris		xma.lu	f45=f37,f120,f45
77389837Skris	add		r19=r19,r18		};;
77489837Skris{ .mfi;	getf.sig	r24=f90
77589837Skris		xma.hu	f56=f37,f121,f55	}
77689837Skris{ .mfi;		xma.lu	f55=f37,f121,f55	};;
77789837Skris{ .mfi;	getf.sig	r25=f81
77889837Skris		xma.hu	f66=f37,f122,f65
77989837Skris(p7)	add		carry2=1,carry2		}
78089837Skris{ .mfi;	cmp.ltu		p7,p0=r19,r18
78189837Skris		xma.lu	f65=f37,f122,f65
78289837Skris	add		r20=r20,r19		};;
78389837Skris{ .mfi;	getf.sig	r26=f72
78489837Skris		xma.hu	f76=f37,f123,f75
78589837Skris(p7)	add		carry2=1,carry2		}
78689837Skris{ .mfi;	cmp.ltu		p7,p0=r20,r19
78789837Skris		xma.lu	f75=f37,f123,f75
78889837Skris	add		r20=r20,carry1		};;
78989837Skris{ .mfi;	getf.sig	r27=f63
79089837Skris		xma.hu	f86=f37,f124,f85
79189837Skris(p7)	add		carry2=1,carry2		}
79289837Skris{ .mfi;		xma.lu	f85=f37,f124,f85
79389837Skris	cmp.ltu		p7,p0=r20,carry1	};;
79489837Skris{ .mfi;	getf.sig	r28=f54
79589837Skris		xma.hu	f96=f37,f125,f95
79689837Skris(p7)	add		carry2=1,carry2		}
79789837Skris{ .mfi;	st8		[r32]=r20,16
79889837Skris		xma.lu	f95=f37,f125,f95	};;
79989837Skris{ .mfi;		xma.hu	f106=f37,f126,f105	}
80089837Skris{ .mfi;	mov		carry1=0
80189837Skris		xma.lu	f105=f37,f126,f105
80289837Skris	add		r25=r25,r24		};;
80389837Skris{ .mfi;		xma.hu	f116=f37,f127,f115
80489837Skris	cmp.ltu		p6,p0=r25,r24		}
80589837Skris{ .mfi;		xma.lu	f115=f37,f127,f115
80689837Skris	add		r26=r26,r25		};;//
80789837Skris//-------------------------------------------------//
80889837Skris{ .mfi;	getf.sig	r29=f45
80989837Skris		xma.hu	f47=f38,f120,f46
81089837Skris(p6)	add		carry1=1,carry1		}
81189837Skris{ .mfi;	cmp.ltu		p6,p0=r26,r25
81289837Skris		xma.lu	f46=f38,f120,f46
81389837Skris	add		r27=r27,r26		};;
81489837Skris{ .mfi;	getf.sig	r16=f100
81589837Skris		xma.hu	f57=f38,f121,f56
81689837Skris(p6)	add		carry1=1,carry1		}
81789837Skris{ .mfi;	cmp.ltu		p6,p0=r27,r26
81889837Skris		xma.lu	f56=f38,f121,f56
81989837Skris	add		r28=r28,r27		};;
82089837Skris{ .mfi;	getf.sig	r17=f91
82189837Skris		xma.hu	f67=f38,f122,f66
82289837Skris(p6)	add		carry1=1,carry1		}
82389837Skris{ .mfi;	cmp.ltu		p6,p0=r28,r27
82489837Skris		xma.lu	f66=f38,f122,f66
82589837Skris	add		r29=r29,r28		};;
82689837Skris{ .mfi;	getf.sig	r18=f82
82789837Skris		xma.hu	f77=f38,f123,f76
82889837Skris(p6)	add		carry1=1,carry1		}
82989837Skris{ .mfi;	cmp.ltu		p6,p0=r29,r28
83089837Skris		xma.lu	f76=f38,f123,f76
83189837Skris	add		r29=r29,carry2		};;
83289837Skris{ .mfi;	getf.sig	r19=f73
83389837Skris		xma.hu	f87=f38,f124,f86
83489837Skris(p6)	add		carry1=1,carry1		}
83589837Skris{ .mfi;		xma.lu	f86=f38,f124,f86
83689837Skris	cmp.ltu		p6,p0=r29,carry2	};;
83789837Skris{ .mfi;	getf.sig	r20=f64
83889837Skris		xma.hu	f97=f38,f125,f96
83989837Skris(p6)	add		carry1=1,carry1		}
84089837Skris{ .mfi;	st8		[r33]=r29,16
84189837Skris		xma.lu	f96=f38,f125,f96	};;
84289837Skris{ .mfi;	getf.sig	r21=f55
84389837Skris		xma.hu	f107=f38,f126,f106	}
84489837Skris{ .mfi;	mov		carry2=0
84589837Skris		xma.lu	f106=f38,f126,f106
84689837Skris	add		r17=r17,r16		};;
84789837Skris{ .mfi;		xma.hu	f117=f38,f127,f116
84889837Skris	cmp.ltu		p7,p0=r17,r16		}
84989837Skris{ .mfi;		xma.lu	f116=f38,f127,f116
85089837Skris	add		r18=r18,r17		};;//
85189837Skris//-------------------------------------------------//
85289837Skris{ .mfi;	getf.sig	r22=f46
85389837Skris		xma.hu	f48=f39,f120,f47
85489837Skris(p7)	add		carry2=1,carry2		}
85589837Skris{ .mfi;	cmp.ltu		p7,p0=r18,r17
85689837Skris		xma.lu	f47=f39,f120,f47
85789837Skris	add		r19=r19,r18		};;
85889837Skris{ .mfi;	getf.sig	r24=f110
85989837Skris		xma.hu	f58=f39,f121,f57
86089837Skris(p7)	add		carry2=1,carry2		}
86189837Skris{ .mfi;	cmp.ltu		p7,p0=r19,r18
86289837Skris		xma.lu	f57=f39,f121,f57
86389837Skris	add		r20=r20,r19		};;
86489837Skris{ .mfi;	getf.sig	r25=f101
86589837Skris		xma.hu	f68=f39,f122,f67
86689837Skris(p7)	add		carry2=1,carry2		}
86789837Skris{ .mfi;	cmp.ltu		p7,p0=r20,r19
86889837Skris		xma.lu	f67=f39,f122,f67
86989837Skris	add		r21=r21,r20		};;
87089837Skris{ .mfi;	getf.sig	r26=f92
87189837Skris		xma.hu	f78=f39,f123,f77
87289837Skris(p7)	add		carry2=1,carry2		}
87389837Skris{ .mfi;	cmp.ltu		p7,p0=r21,r20
87489837Skris		xma.lu	f77=f39,f123,f77
87589837Skris	add		r22=r22,r21		};;
87689837Skris{ .mfi;	getf.sig	r27=f83
87789837Skris		xma.hu	f88=f39,f124,f87
87889837Skris(p7)	add		carry2=1,carry2		}
87989837Skris{ .mfi;	cmp.ltu		p7,p0=r22,r21
88089837Skris		xma.lu	f87=f39,f124,f87
88189837Skris	add		r22=r22,carry1		};;
88289837Skris{ .mfi;	getf.sig	r28=f74
88389837Skris		xma.hu	f98=f39,f125,f97
88489837Skris(p7)	add		carry2=1,carry2		}
88589837Skris{ .mfi;		xma.lu	f97=f39,f125,f97
88689837Skris	cmp.ltu		p7,p0=r22,carry1	};;
88789837Skris{ .mfi;	getf.sig	r29=f65
88889837Skris		xma.hu	f108=f39,f126,f107
88989837Skris(p7)	add		carry2=1,carry2		}
89089837Skris{ .mfi;	st8		[r32]=r22,16
89189837Skris		xma.lu	f107=f39,f126,f107	};;
89289837Skris{ .mfi;	getf.sig	r30=f56
89389837Skris		xma.hu	f118=f39,f127,f117	}
89489837Skris{ .mfi;		xma.lu	f117=f39,f127,f117	};;//
89589837Skris//-------------------------------------------------//
89689837Skris// Leaving multiplier's heaven... Quite a ride, huh?
89789837Skris
89889837Skris{ .mii;	getf.sig	r31=f47
89989837Skris	add		r25=r25,r24
90089837Skris	mov		carry1=0		};;
90189837Skris{ .mii;		getf.sig	r16=f111
90289837Skris	cmp.ltu		p6,p0=r25,r24
90389837Skris	add		r26=r26,r25		};;
90489837Skris{ .mfb;		getf.sig	r17=f102	}
90589837Skris{ .mii;
90689837Skris(p6)	add		carry1=1,carry1
90789837Skris	cmp.ltu		p6,p0=r26,r25
90889837Skris	add		r27=r27,r26		};;
90989837Skris{ .mfb;	nop.m	0x0				}
91089837Skris{ .mii;
91189837Skris(p6)	add		carry1=1,carry1
91289837Skris	cmp.ltu		p6,p0=r27,r26
91389837Skris	add		r28=r28,r27		};;
91489837Skris{ .mii;		getf.sig	r18=f93
91589837Skris		add		r17=r17,r16
91689837Skris		mov		carry3=0	}
91789837Skris{ .mii;
91889837Skris(p6)	add		carry1=1,carry1
91989837Skris	cmp.ltu		p6,p0=r28,r27
92089837Skris	add		r29=r29,r28		};;
92189837Skris{ .mii;		getf.sig	r19=f84
92289837Skris		cmp.ltu		p7,p0=r17,r16	}
92389837Skris{ .mii;
92489837Skris(p6)	add		carry1=1,carry1
92589837Skris	cmp.ltu		p6,p0=r29,r28
92689837Skris	add		r30=r30,r29		};;
92789837Skris{ .mii;		getf.sig	r20=f75
92889837Skris		add		r18=r18,r17	}
92989837Skris{ .mii;
93089837Skris(p6)	add		carry1=1,carry1
93189837Skris	cmp.ltu		p6,p0=r30,r29
93289837Skris	add		r31=r31,r30		};;
93389837Skris{ .mfb;		getf.sig	r21=f66		}
93489837Skris{ .mii;	(p7)	add		carry3=1,carry3
93589837Skris		cmp.ltu		p7,p0=r18,r17
93689837Skris		add		r19=r19,r18	}
93789837Skris{ .mfb;	nop.m	0x0				}
93889837Skris{ .mii;
93989837Skris(p6)	add		carry1=1,carry1
94089837Skris	cmp.ltu		p6,p0=r31,r30
94189837Skris	add		r31=r31,carry2		};;
94289837Skris{ .mfb;		getf.sig	r22=f57		}
94389837Skris{ .mii;	(p7)	add		carry3=1,carry3
94489837Skris		cmp.ltu		p7,p0=r19,r18
94589837Skris		add		r20=r20,r19	}
94689837Skris{ .mfb;	nop.m	0x0				}
94789837Skris{ .mii;
94889837Skris(p6)	add		carry1=1,carry1
94989837Skris	cmp.ltu		p6,p0=r31,carry2	};;
95089837Skris{ .mfb;		getf.sig	r23=f48		}
95189837Skris{ .mii;	(p7)	add		carry3=1,carry3
95289837Skris		cmp.ltu		p7,p0=r20,r19
95389837Skris		add		r21=r21,r20	}
95489837Skris{ .mii;
95589837Skris(p6)	add		carry1=1,carry1		}
95689837Skris{ .mfb;	st8		[r33]=r31,16		};;
95789837Skris
95889837Skris{ .mfb;	getf.sig	r24=f112		}
95989837Skris{ .mii;	(p7)	add		carry3=1,carry3
96089837Skris		cmp.ltu		p7,p0=r21,r20
96189837Skris		add		r22=r22,r21	};;
96289837Skris{ .mfb;	getf.sig	r25=f103		}
96389837Skris{ .mii;	(p7)	add		carry3=1,carry3
96489837Skris		cmp.ltu		p7,p0=r22,r21
96589837Skris		add		r23=r23,r22	};;
96689837Skris{ .mfb;	getf.sig	r26=f94			}
96789837Skris{ .mii;	(p7)	add		carry3=1,carry3
96889837Skris		cmp.ltu		p7,p0=r23,r22
96989837Skris		add		r23=r23,carry1	};;
97089837Skris{ .mfb;	getf.sig	r27=f85			}
97189837Skris{ .mii;	(p7)	add		carry3=1,carry3
97289837Skris		cmp.ltu		p7,p8=r23,carry1};;
97389837Skris{ .mii;	getf.sig	r28=f76
97489837Skris	add		r25=r25,r24
97589837Skris	mov		carry1=0		}
97689837Skris{ .mii;		st8		[r32]=r23,16
97789837Skris	(p7)	add		carry2=1,carry3
97889837Skris	(p8)	add		carry2=0,carry3	};;
97989837Skris
98089837Skris{ .mfb;	nop.m	0x0				}
98189837Skris{ .mii;	getf.sig	r29=f67
98289837Skris	cmp.ltu		p6,p0=r25,r24
98389837Skris	add		r26=r26,r25		};;
98489837Skris{ .mfb;	getf.sig	r30=f58			}
98589837Skris{ .mii;
98689837Skris(p6)	add		carry1=1,carry1
98789837Skris	cmp.ltu		p6,p0=r26,r25
98889837Skris	add		r27=r27,r26		};;
98989837Skris{ .mfb;		getf.sig	r16=f113	}
99089837Skris{ .mii;
99189837Skris(p6)	add		carry1=1,carry1
99289837Skris	cmp.ltu		p6,p0=r27,r26
99389837Skris	add		r28=r28,r27		};;
99489837Skris{ .mfb;		getf.sig	r17=f104	}
99589837Skris{ .mii;
99689837Skris(p6)	add		carry1=1,carry1
99789837Skris	cmp.ltu		p6,p0=r28,r27
99889837Skris	add		r29=r29,r28		};;
99989837Skris{ .mfb;		getf.sig	r18=f95		}
100089837Skris{ .mii;
100189837Skris(p6)	add		carry1=1,carry1
100289837Skris	cmp.ltu		p6,p0=r29,r28
100389837Skris	add		r30=r30,r29		};;
100489837Skris{ .mii;		getf.sig	r19=f86
100589837Skris		add		r17=r17,r16
100689837Skris		mov		carry3=0	}
100789837Skris{ .mii;
100889837Skris(p6)	add		carry1=1,carry1
100989837Skris	cmp.ltu		p6,p0=r30,r29
101089837Skris	add		r30=r30,carry2		};;
101189837Skris{ .mii;		getf.sig	r20=f77
101289837Skris		cmp.ltu		p7,p0=r17,r16
101389837Skris		add		r18=r18,r17	}
101489837Skris{ .mii;
101589837Skris(p6)	add		carry1=1,carry1
101689837Skris	cmp.ltu		p6,p0=r30,carry2	};;
101789837Skris{ .mfb;		getf.sig	r21=f68		}
101889837Skris{ .mii;	st8		[r33]=r30,16
101989837Skris(p6)	add		carry1=1,carry1		};;
102089837Skris
102189837Skris{ .mfb;	getf.sig	r24=f114		}
102289837Skris{ .mii;	(p7)	add		carry3=1,carry3
102389837Skris		cmp.ltu		p7,p0=r18,r17
102489837Skris		add		r19=r19,r18	};;
102589837Skris{ .mfb;	getf.sig	r25=f105		}
102689837Skris{ .mii;	(p7)	add		carry3=1,carry3
102789837Skris		cmp.ltu		p7,p0=r19,r18
102889837Skris		add		r20=r20,r19	};;
102989837Skris{ .mfb;	getf.sig	r26=f96			}
103089837Skris{ .mii;	(p7)	add		carry3=1,carry3
103189837Skris		cmp.ltu		p7,p0=r20,r19
103289837Skris		add		r21=r21,r20	};;
103389837Skris{ .mfb;	getf.sig	r27=f87			}
103489837Skris{ .mii;	(p7)	add		carry3=1,carry3
103589837Skris		cmp.ltu		p7,p0=r21,r20
103689837Skris		add		r21=r21,carry1	};;
103789837Skris{ .mib;	getf.sig	r28=f78
103889837Skris	add		r25=r25,r24		}
103989837Skris{ .mib;	(p7)	add		carry3=1,carry3
104089837Skris		cmp.ltu		p7,p8=r21,carry1};;
104189837Skris{ .mii;		st8		[r32]=r21,16
104289837Skris	(p7)	add		carry2=1,carry3
104389837Skris	(p8)	add		carry2=0,carry3	}
104489837Skris
104589837Skris{ .mii;	mov		carry1=0
104689837Skris	cmp.ltu		p6,p0=r25,r24
104789837Skris	add		r26=r26,r25		};;
104889837Skris{ .mfb;		getf.sig	r16=f115	}
104989837Skris{ .mii;
105089837Skris(p6)	add		carry1=1,carry1
105189837Skris	cmp.ltu		p6,p0=r26,r25
105289837Skris	add		r27=r27,r26		};;
105389837Skris{ .mfb;		getf.sig	r17=f106	}
105489837Skris{ .mii;
105589837Skris(p6)	add		carry1=1,carry1
105689837Skris	cmp.ltu		p6,p0=r27,r26
105789837Skris	add		r28=r28,r27		};;
105889837Skris{ .mfb;		getf.sig	r18=f97		}
105989837Skris{ .mii;
106089837Skris(p6)	add		carry1=1,carry1
106189837Skris	cmp.ltu		p6,p0=r28,r27
106289837Skris	add		r28=r28,carry2		};;
106389837Skris{ .mib;		getf.sig	r19=f88
106489837Skris		add		r17=r17,r16	}
106589837Skris{ .mib;
106689837Skris(p6)	add		carry1=1,carry1
106789837Skris	cmp.ltu		p6,p0=r28,carry2	};;
106889837Skris{ .mii;	st8		[r33]=r28,16
106989837Skris(p6)	add		carry1=1,carry1		}
107089837Skris
107189837Skris{ .mii;		mov		carry2=0
107289837Skris		cmp.ltu		p7,p0=r17,r16
107389837Skris		add		r18=r18,r17	};;
107489837Skris{ .mfb;	getf.sig	r24=f116		}
107589837Skris{ .mii;	(p7)	add		carry2=1,carry2
107689837Skris		cmp.ltu		p7,p0=r18,r17
107789837Skris		add		r19=r19,r18	};;
107889837Skris{ .mfb;	getf.sig	r25=f107		}
107989837Skris{ .mii;	(p7)	add		carry2=1,carry2
108089837Skris		cmp.ltu		p7,p0=r19,r18
108189837Skris		add		r19=r19,carry1	};;
108289837Skris{ .mfb;	getf.sig	r26=f98			}
108389837Skris{ .mii;	(p7)	add		carry2=1,carry2
108489837Skris		cmp.ltu		p7,p0=r19,carry1};;
108589837Skris{ .mii;		st8		[r32]=r19,16
108689837Skris	(p7)	add		carry2=1,carry2	}
108789837Skris
108889837Skris{ .mfb;	add		r25=r25,r24		};;
108989837Skris
109089837Skris{ .mfb;		getf.sig	r16=f117	}
109189837Skris{ .mii;	mov		carry1=0
109289837Skris	cmp.ltu		p6,p0=r25,r24
109389837Skris	add		r26=r26,r25		};;
109489837Skris{ .mfb;		getf.sig	r17=f108	}
109589837Skris{ .mii;
109689837Skris(p6)	add		carry1=1,carry1
109789837Skris	cmp.ltu		p6,p0=r26,r25
109889837Skris	add		r26=r26,carry2		};;
109989837Skris{ .mfb;	nop.m	0x0				}
110089837Skris{ .mii;
110189837Skris(p6)	add		carry1=1,carry1
110289837Skris	cmp.ltu		p6,p0=r26,carry2	};;
110389837Skris{ .mii;	st8		[r33]=r26,16
110489837Skris(p6)	add		carry1=1,carry1		}
110589837Skris
110689837Skris{ .mfb;		add		r17=r17,r16	};;
110789837Skris{ .mfb;	getf.sig	r24=f118		}
110889837Skris{ .mii;		mov		carry2=0
110989837Skris		cmp.ltu		p7,p0=r17,r16
111089837Skris		add		r17=r17,carry1	};;
111189837Skris{ .mii;	(p7)	add		carry2=1,carry2
111289837Skris		cmp.ltu		p7,p0=r17,carry1};;
111389837Skris{ .mii;		st8		[r32]=r17
111489837Skris	(p7)	add		carry2=1,carry2	};;
111589837Skris{ .mfb;	add		r24=r24,carry2		};;
111689837Skris{ .mib;	st8		[r33]=r24		}
111789837Skris
111889837Skris{ .mib;	rum		1<<5		// clear um.mfh
111989837Skris	br.ret.sptk.many	b0	};;
112089837Skris.endp	bn_mul_comba8#
112189837Skris#undef	carry3
112289837Skris#undef	carry2
112389837Skris#undef	carry1
112489837Skris#endif
112589837Skris
112689837Skris#if 1
112789837Skris// It's possible to make it faster (see comment to bn_sqr_comba8), but
112889837Skris// I reckon it isn't worth the effort, basically because the routine
112989837Skris// (actually both of them) is practically never called... So I just play
113089837Skris// the same trick as with bn_sqr_comba8.
113189837Skris//
113289837Skris// void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
113389837Skris//
// The square is computed as the plain product a*a:
//   - r33 (a) is copied into r34, the register from which the code at
//     .L_cheat_entry_point4 (inside bn_mul_comba4) loads its second
//     operand;
//   - r14,r15,r16 = a+8,a+16,a+24 and r17,r18 = r34+8,r34+16 are set up,
//     i.e. the same operand addresses bn_mul_comba4's own prologue
//     computes before reaching that label;
//   - control is passed with a plain br (not br.call), so b0 is left
//     untouched and the br.ret at the end of bn_mul_comba4 returns to
//     the caller of this routine.
113489837Skris.global	bn_sqr_comba4#
113589837Skris.proc	bn_sqr_comba4#
113689837Skris.align	64
113789837Skrisbn_sqr_comba4:
113889837Skris	.prologue
113989837Skris	.fframe	0
114089837Skris	.save	ar.pfs,r2
// Frame: 2 inputs (r32,r33) plus 1 local (r34) that receives the copy of a.
114189837Skris{ .mii;	alloc	r2=ar.pfs,2,1,0,0
114289837Skris	mov	r34=r33
114389837Skris	add	r14=8,r33		};;
114489837Skris	.body
// r34 is only read after the stop above, i.e. after the copy is complete.
114589837Skris{ .mii;	add	r17=8,r34
114689837Skris	add	r15=16,r33
114789837Skris	add	r18=16,r34		}
114889837Skris{ .mfb;	add	r16=24,r33
114989837Skris	br	.L_cheat_entry_point4	};;
115089837Skris.endp	bn_sqr_comba4#
115189837Skris#endif
115289837Skris
115389837Skris#if 1
115489837Skris// Runs in ~115 cycles and ~4.5 times faster than C. Well, whatever...
115589837Skris//
115689837Skris// void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
115789837Skris//
// Computes the 8-word product r[0..7] = a[0..3] * b[0..3].
//
// Register usage, as established by the code below:
//   r32,r33,r34     r, a, b on entry (assumes the usual IA-64 convention
//                   of passing arguments in r32..; matches the alloc)
//   r14,r15,r16     a+8, a+16, a+24 (addresses of a[1..3])
//   r17,r18,r19     b+8, b+16, b+24 (addresses of b[1..3])
//   f32..f35        a[0..3]
//   f120..f123      b[0..3]
//   f40..f44        the five words of a*b[0], least significant first;
//   f50..f54        likewise a*b[1]; f60..f64 a*b[2]; f70..f74 a*b[3].
//                   Each xma.lu/xma.hu pair multiplies one a[i] by one
//                   b[j] and adds the high word left by a[i-1]*b[j].
//   r32 / r33       once the operands are loaded, r33 is re-pointed at
//                   r+8; r32 then stores the even result words and r33
//                   the odd ones, both post-incrementing by 16.
//   carry1, carry2  alias r14/r15, which are free again after the loads;
//                   they alternate between "carry into this word" and
//                   "carry being collected for the next word".
//
// Summation scheme carried out by the integer unit:
//   r[0] = f40
//   r[1] = f41 + f50
//   r[2] = f42 + f51 + f60             + carries out of r[1]
//   r[3] = f43 + f52 + f61 + f70       + carries out of r[2]
//   r[4] = f44 + f53 + f62 + f71       + carries out of r[3]
//   r[5] =       f54 + f63 + f72       + carries out of r[4]
//   r[6] =             f64 + f73       + carries out of r[5]
//   r[7] =                   f74       + carries out of r[6]
// A carry out of an addition is detected with cmp.ltu (sum < addend).
115889837Skris#define	carry1	r14
115989837Skris#define	carry2	r15
116089837Skris.global	bn_mul_comba4#
116189837Skris.proc	bn_mul_comba4#
116289837Skris.align	64
116389837Skrisbn_mul_comba4:
116489837Skris	.prologue
116589837Skris	.fframe	0
116689837Skris	.save	ar.pfs,r2
116789837Skris{ .mii;	alloc	r2=ar.pfs,3,0,0,0
116889837Skris	add	r14=8,r33
116989837Skris	add	r17=8,r34		}
117089837Skris	.body
117189837Skris{ .mii;	add	r15=16,r33
117289837Skris	add	r18=16,r34
117389837Skris	add	r16=24,r33		};;
// bn_sqr_comba4 branches here with r34 = r33 and r14-r18 already set up.
117489837Skris.L_cheat_entry_point4:
117589837Skris{ .mmi;	add	r19=24,r34
117689837Skris
117789837Skris	ldf8	f32=[r33]		}
117889837Skris
117989837Skris{ .mmi;	ldf8	f120=[r34]
118089837Skris	ldf8	f121=[r17]		};;
118189837Skris{ .mmi;	ldf8	f122=[r18]
118289837Skris	ldf8	f123=[r19]		}
118389837Skris
118489837Skris{ .mmi;	ldf8	f33=[r14]
118589837Skris	ldf8	f34=[r15]		}
118689837Skris{ .mfi;	ldf8	f35=[r16]
118789837Skris
// Row a[0]: hi:lo of a[0]*b[j] with nothing added (f0 is the third operand).
118889837Skris		xma.hu	f41=f32,f120,f0		}
118989837Skris{ .mfi;		xma.lu	f40=f32,f120,f0		};;
119089837Skris{ .mfi;		xma.hu	f51=f32,f121,f0		}
119189837Skris{ .mfi;		xma.lu	f50=f32,f121,f0		};;
119289837Skris{ .mfi;		xma.hu	f61=f32,f122,f0		}
119389837Skris{ .mfi;		xma.lu	f60=f32,f122,f0		};;
119489837Skris{ .mfi;		xma.hu	f71=f32,f123,f0		}
119589837Skris{ .mfi;		xma.lu	f70=f32,f123,f0		};;//
119689837Skris// Major stall takes place here, and 3 more places below. Result from
119789837Skris// first xma is not available for another 3 ticks.
// Row a[1]: a[1]*b[j] plus the high word of the a[0] row.  r[0] is stored
// and r33 is turned into the pointer for the odd result words.
119889837Skris{ .mfi;	getf.sig	r16=f40
119989837Skris		xma.hu	f42=f33,f120,f41
120089837Skris	add		r33=8,r32		}
120189837Skris{ .mfi;		xma.lu	f41=f33,f120,f41	};;
120289837Skris{ .mfi;	getf.sig	r24=f50
120389837Skris		xma.hu	f52=f33,f121,f51	}
120489837Skris{ .mfi;		xma.lu	f51=f33,f121,f51	};;
120589837Skris{ .mfi;	st8		[r32]=r16,16
120689837Skris		xma.hu	f62=f33,f122,f61	}
120789837Skris{ .mfi;		xma.lu	f61=f33,f122,f61	};;
120889837Skris{ .mfi;		xma.hu	f72=f33,f123,f71	}
120989837Skris{ .mfi;		xma.lu	f71=f33,f123,f71	};;//
121089837Skris//-------------------------------------------------//
// Row a[2], interleaved with r[1] = f41 + f50 (carry collected in carry1).
121189837Skris{ .mfi;	getf.sig	r25=f41
121289837Skris		xma.hu	f43=f34,f120,f42	}
121389837Skris{ .mfi;		xma.lu	f42=f34,f120,f42	};;
121489837Skris{ .mfi;	getf.sig	r16=f60
121589837Skris		xma.hu	f53=f34,f121,f52	}
121689837Skris{ .mfi;		xma.lu	f52=f34,f121,f52	};;
121789837Skris{ .mfi;	getf.sig	r17=f51
121889837Skris		xma.hu	f63=f34,f122,f62
121989837Skris	add		r25=r25,r24		}
122089837Skris{ .mfi;	mov		carry1=0
122189837Skris		xma.lu	f62=f34,f122,f62	};;
122289837Skris{ .mfi;	st8		[r33]=r25,16
122389837Skris		xma.hu	f73=f34,f123,f72
122489837Skris	cmp.ltu		p6,p0=r25,r24		}
122589837Skris{ .mfi;		xma.lu	f72=f34,f123,f72	};;//
122689837Skris//-------------------------------------------------//
// Row a[3], interleaved with r[2] = f42 + f51 + f60 + carry1
// (carries collected in carry2).
122789837Skris{ .mfi;	getf.sig	r18=f42
122889837Skris		xma.hu	f44=f35,f120,f43
122989837Skris(p6)	add		carry1=1,carry1		}
123089837Skris{ .mfi;	add		r17=r17,r16
123189837Skris		xma.lu	f43=f35,f120,f43
123289837Skris	mov		carry2=0		};;
123389837Skris{ .mfi;	getf.sig	r24=f70
123489837Skris		xma.hu	f54=f35,f121,f53
123589837Skris	cmp.ltu		p7,p0=r17,r16		}
123689837Skris{ .mfi;		xma.lu	f53=f35,f121,f53	};;
123789837Skris{ .mfi;	getf.sig	r25=f61
123889837Skris		xma.hu	f64=f35,f122,f63
123989837Skris	add		r18=r18,r17		}
124089837Skris{ .mfi;		xma.lu	f63=f35,f122,f63
124189837Skris(p7)	add		carry2=1,carry2		};;
124289837Skris{ .mfi;	getf.sig	r26=f52
124389837Skris		xma.hu	f74=f35,f123,f73
124489837Skris	cmp.ltu		p7,p0=r18,r17		}
124589837Skris{ .mfi;		xma.lu	f73=f35,f123,f73
124689837Skris	add		r18=r18,carry1		};;
124789837Skris//-------------------------------------------------//
// All multiplications are issued; from here on only the integer unit is
// busy.  In the bundles below a predicated add and the cmp.ltu that
// rewrites the same predicate share an instruction group: the add acts on
// the predicate value from the previous group.
124889837Skris{ .mii;	st8		[r32]=r18,16
124989837Skris(p7)	add		carry2=1,carry2
125089837Skris	cmp.ltu		p7,p0=r18,carry1	};;
125189837Skris
// r[3] = f43 + f52 + f61 + f70 + carry2 (carries collected in carry1).
125289837Skris{ .mfi;	getf.sig	r27=f43	// last major stall
125389837Skris(p7)	add		carry2=1,carry2		};;
125489837Skris{ .mii;		getf.sig	r16=f71
125589837Skris	add		r25=r25,r24
125689837Skris	mov		carry1=0		};;
125789837Skris{ .mii;		getf.sig	r17=f62
125889837Skris	cmp.ltu		p6,p0=r25,r24
125989837Skris	add		r26=r26,r25		};;
126089837Skris{ .mii;
126189837Skris(p6)	add		carry1=1,carry1
126289837Skris	cmp.ltu		p6,p0=r26,r25
126389837Skris	add		r27=r27,r26		};;
126489837Skris{ .mii;
126589837Skris(p6)	add		carry1=1,carry1
126689837Skris	cmp.ltu		p6,p0=r27,r26
126789837Skris	add		r27=r27,carry2		};;
126889837Skris{ .mii;		getf.sig	r18=f53
126989837Skris(p6)	add		carry1=1,carry1
127089837Skris	cmp.ltu		p6,p0=r27,carry2	};;
127189837Skris{ .mfi;	st8		[r33]=r27,16
127289837Skris(p6)	add		carry1=1,carry1		}
127389837Skris
// r[4] = f44 + f53 + f62 + f71 + carry1 (carries collected in carry2).
127489837Skris{ .mii;		getf.sig	r19=f44
127589837Skris		add		r17=r17,r16
127689837Skris		mov		carry2=0	};;
127789837Skris{ .mii;	getf.sig	r24=f72
127889837Skris		cmp.ltu		p7,p0=r17,r16
127989837Skris		add		r18=r18,r17	};;
128089837Skris{ .mii;	(p7)	add		carry2=1,carry2
128189837Skris		cmp.ltu		p7,p0=r18,r17
128289837Skris		add		r19=r19,r18	};;
128389837Skris{ .mii;	(p7)	add		carry2=1,carry2
128489837Skris		cmp.ltu		p7,p0=r19,r18
128589837Skris		add		r19=r19,carry1	};;
128689837Skris{ .mii;	getf.sig	r25=f63
128789837Skris	(p7)	add		carry2=1,carry2
128889837Skris		cmp.ltu		p7,p0=r19,carry1};;
128989837Skris{ .mii;		st8		[r32]=r19,16
129089837Skris	(p7)	add		carry2=1,carry2	}
129189837Skris
// r[5] = f54 + f63 + f72 + carry2 (carries collected in carry1).
129289837Skris{ .mii;	getf.sig	r26=f54
129389837Skris	add		r25=r25,r24
129489837Skris	mov		carry1=0		};;
129589837Skris{ .mii;		getf.sig	r16=f73
129689837Skris	cmp.ltu		p6,p0=r25,r24
129789837Skris	add		r26=r26,r25		};;
129889837Skris{ .mii;
129989837Skris(p6)	add		carry1=1,carry1
130089837Skris	cmp.ltu		p6,p0=r26,r25
130189837Skris	add		r26=r26,carry2		};;
130289837Skris{ .mii;		getf.sig	r17=f64
130389837Skris(p6)	add		carry1=1,carry1
130489837Skris	cmp.ltu		p6,p0=r26,carry2	};;
130589837Skris{ .mii;	st8		[r33]=r26,16
130689837Skris(p6)	add		carry1=1,carry1		}
130789837Skris
// r[6] = f64 + f73 + carry1 (carries collected in carry2).
130889837Skris{ .mii;	getf.sig	r24=f74
130989837Skris		add		r17=r17,r16
131089837Skris		mov		carry2=0	};;
131189837Skris{ .mii;		cmp.ltu		p7,p0=r17,r16
131289837Skris		add		r17=r17,carry1	};;
131389837Skris
131489837Skris{ .mii;	(p7)	add		carry2=1,carry2
131589837Skris		cmp.ltu		p7,p0=r17,carry1};;
131689837Skris{ .mii;		st8		[r32]=r17,16
131789837Skris	(p7)	add		carry2=1,carry2	};;
131889837Skris
// r[7] = f74 + carry2; no carry check is made on this last addition.
131989837Skris{ .mii;	add		r24=r24,carry2		};;
132089837Skris{ .mii;	st8		[r33]=r24		}
132189837Skris
132289837Skris{ .mib;	rum		1<<5		// clear um.mfh
132389837Skris	br.ret.sptk.many	b0	};;
132489837Skris.endp	bn_mul_comba4#
132589837Skris#undef	carry2
132689837Skris#undef	carry1
132789837Skris#endif
132889837Skris
132989837Skris#if 1
133089837Skris//
133189837Skris// BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
133289837Skris//
133389837Skris// In a nutshell it's a port of my MIPS III/IV implementation.
133489837Skris//
// Outline of what the code below does (h, l, d arrive in r32, r33, r34):
//   1. d == 0: return at once with r8 = -1.
//   2. Shift d left until its top bit is set, counting the shifts; shift
//      h:l left by the same count.  abort() is called if non-zero bits of
//      h would be shifted out.  If D <= H after the shift, D is
//      subtracted from H once.
//   3. Produce the quotient as two 32-bit digits.  For each digit an
//      estimate q is taken (0xffffffff if the upper halves of H and D are
//      equal, otherwise H divided by the upper half of D via
//      .L_udiv64_32_b6) and then decremented in a correction loop for as
//      long as q*D is larger than the current dividend window.
//   4. Return the two digits combined in r8.  r9 receives the remaining
//      H shifted back right by the normalisation count.
//
// Register roles:
//   AT  scratch; holds 0xffffffff from the end of step 2 onwards
//   H,L dividend (shifted in place as digits are produced)
//   HH  upper 32 bits of H
//   D   normalised divisor, DH its upper 32 bits
//   I   normalisation shift count
//   r2, r3, r10  copies of ar.pfs, b0 and pr taken on entry
//   r32-r39      rotating registers used by the three br.wtop loops
133589837Skris#define	AT	r14
133689837Skris#define	H	r16
133789837Skris#define	HH	r20
133889837Skris#define	L	r17
133989837Skris#define	D	r18
134089837Skris#define	DH	r22
134189837Skris#define	I	r21
134289837Skris
134389837Skris#if 0
134489837Skris// Some preprocessors (most notably HP-UX) appear to be allergic to
134589837Skris// macros enclosed in parentheses, as these three would be.
134689837Skris#define	cont	p16
134789837Skris#define	break	p0	// p20
134889837Skris#define	equ	p24
134989837Skris#else
// Assembler symbol assignments are used instead of cpp macros.
// "break" is p0, so compares that target it only produce "cont".
135089837Skriscont=p16
135189837Skrisbreak=p0
135289837Skrisequ=p24
135389837Skris#endif
135489837Skris
135589837Skris.global	abort#
135689837Skris.global	bn_div_words#
135789837Skris.proc	bn_div_words#
135889837Skris.align	64
135989837Skrisbn_div_words:
136089837Skris	.prologue
136189837Skris	.fframe	0
136289837Skris	.save	ar.pfs,r2
136389837Skris	.save	b0,r3
// Frame: 3 inputs, 5 locals, no outputs, 8 rotating registers.
136489837Skris{ .mii;	alloc		r2=ar.pfs,3,5,0,8
136589837Skris	mov		r3=b0
136689837Skris	mov		r10=pr		};;
// Division by zero: hand back -1 without doing anything else.
136789837Skris{ .mmb;	cmp.eq		p6,p0=r34,r0
136889837Skris	mov		r8=-1
136989837Skris(p6)	br.ret.spnt.many	b0	};;
137089837Skris
137189837Skris	.body
137289837Skris{ .mii;	mov		H=r32		// save h
137389837Skris	mov		ar.ec=0		// don't rotate at exit
137489837Skris	mov		pr.rot=0	}
137589837Skris{ .mii;	mov		L=r33		// save l
137689837Skris	mov		r36=r0		};;
137789837Skris
// Normalisation loop.  Each pass writes d<<1 to r33 and count+1 to r35;
// the register rotation done by a taken br.wtop makes them the next
// pass's r34 and r36.  The loop runs while d is positive as a signed
// number, i.e. while its top bit is clear.  On the final (fall-through)
// pass r34 and r36 still hold the normalised d and the shift count.
137889837Skris.L_divw_shift:	// -vv- note signed comparison
137989837Skris{ .mfi;	(p0)	cmp.lt		p16,p0=r0,r34	// d
138089837Skris	(p0)	shladd		r33=r34,1,r0	}
138189837Skris{ .mfb;	(p0)	add		r35=1,r36
138289837Skris	(p0)	nop.f		0x0
138389837Skris(p16)	br.wtop.dpnt		.L_divw_shift	};;
138489837Skris
// D = normalised d, DH = its upper half, r35 = 64 - shift count.
138589837Skris{ .mii;	mov		D=r34
138689837Skris	shr.u		DH=r34,32
138789837Skris	sub		r35=64,r36		};;
// AT = the bits of H that the left shift is about to push out.
138889837Skris{ .mii;	setf.sig	f7=DH
138989837Skris	shr.u		AT=H,r35
139089837Skris	mov		I=r36			};;
139189837Skris{ .mib;	cmp.ne		p6,p0=r0,AT
139289837Skris	shl		H=H,r36
139389837Skris(p6)	br.call.spnt.clr	b0=abort	};;	// overflow, die...
139489837Skris
// f7 = DH converted to floating point (divisor for .L_udiv64_32_b6);
// meanwhile the top bits of L are moved into the bottom of H.
139589837Skris{ .mfi;	fcvt.xuf.s1	f7=f7
139689837Skris	shr.u		AT=L,r35		};;
139789837Skris{ .mii;	shl		L=L,r36
139889837Skris	or		H=H,AT			};;
139989837Skris
// If D <= H, subtract D from H once.
140089837Skris{ .mii;	nop.m		0x0
140189837Skris	cmp.leu		p6,p0=D,H;;
140289837Skris(p6)	sub		H=H,D			}
140389837Skris
// f14 = D for the q*D products; AT = largest possible 32-bit digit.
140489837Skris{ .mlx;	setf.sig	f14=D
140589837Skris	movl		AT=0xffffffff		};;
140689837Skris///////////////////////////////////////////////////////////
// First (upper) quotient digit.
// Estimate: q = 0xffffffff if HH == DH, else q = H / DH (returned in f8).
140789837Skris{ .mii;	setf.sig	f6=H
140889837Skris	shr.u		HH=H,32;;
140989837Skris	cmp.eq		p6,p7=HH,DH		};;
141089837Skris{ .mfb;
141189837Skris(p6)	setf.sig	f8=AT
141289837Skris(p7)	fcvt.xuf.s1	f6=f6
141389837Skris(p7)	br.call.sptk	b6=.L_udiv64_32_b6	};;
141489837Skris
// th:tl = q*D; H:L is advanced by 32 bits (H = low 64 bits of H:L >> 32).
141589837Skris{ .mfi;	getf.sig	r33=f8				// q
141689837Skris	xmpy.lu		f9=f8,f14		}
141789837Skris{ .mfi;	xmpy.hu		f10=f8,f14
141889837Skris	shrp		H=H,L,32		};;
141989837Skris
142089837Skris{ .mmi;	getf.sig	r35=f9				// tl
142189837Skris	getf.sig	r31=f10			};;	// th
142289837Skris
// Correction loop: while th:tl is larger than HH:H, take q-1 (written to
// r32) and th:tl - D (low word written to r34, th decremented on borrow);
// rotation by the taken br.wtop makes r32/r34 the new r33/r35.
142389837Skris.L_divw_1st_iter:
142489837Skris{ .mii;	(p0)	add		r32=-1,r33
142589837Skris	(p0)	cmp.eq		equ,cont=HH,r31		};;
142689837Skris{ .mii;	(p0)	cmp.ltu		p8,p0=r35,D
142789837Skris	(p0)	sub		r34=r35,D
142889837Skris	(equ)	cmp.leu		break,cont=r35,H	};;
142989837Skris{ .mib;	(cont)	cmp.leu		cont,break=HH,r31
143089837Skris	(p8)	add		r31=-1,r31
143189837Skris(cont)	br.wtop.spnt		.L_divw_1st_iter	};;
143289837Skris///////////////////////////////////////////////////////////
// H -= tl; the first digit becomes the upper half of the result.
143389837Skris{ .mii;	sub		H=H,r35
143489837Skris	shl		r8=r33,32
143589837Skris	shl		L=L,32			};;
143689837Skris///////////////////////////////////////////////////////////
// Second (lower) quotient digit: same estimate and correction as above.
143789837Skris{ .mii;	setf.sig	f6=H
143889837Skris	shr.u		HH=H,32;;
143989837Skris	cmp.eq		p6,p7=HH,DH		};;
144089837Skris{ .mfb;
144189837Skris(p6)	setf.sig	f8=AT
144289837Skris(p7)	fcvt.xuf.s1	f6=f6
144389837Skris(p7)	br.call.sptk	b6=.L_udiv64_32_b6	};;
144489837Skris
144589837Skris{ .mfi;	getf.sig	r33=f8				// q
144689837Skris	xmpy.lu		f9=f8,f14		}
144789837Skris{ .mfi;	xmpy.hu		f10=f8,f14
144889837Skris	shrp		H=H,L,32		};;
144989837Skris
145089837Skris{ .mmi;	getf.sig	r35=f9				// tl
145189837Skris	getf.sig	r31=f10			};;	// th
145289837Skris
145389837Skris.L_divw_2nd_iter:
145489837Skris{ .mii;	(p0)	add		r32=-1,r33
145589837Skris	(p0)	cmp.eq		equ,cont=HH,r31		};;
145689837Skris{ .mii;	(p0)	cmp.ltu		p8,p0=r35,D
145789837Skris	(p0)	sub		r34=r35,D
145889837Skris	(equ)	cmp.leu		break,cont=r35,H	};;
145989837Skris{ .mib;	(cont)	cmp.leu		cont,break=HH,r31
146089837Skris	(p8)	add		r31=-1,r31
146189837Skris(cont)	br.wtop.spnt		.L_divw_2nd_iter	};;
146289837Skris///////////////////////////////////////////////////////////
// Merge the second digit into r8, then restore ar.pfs and the predicates
// saved on entry before returning.
146389837Skris{ .mii;	sub	H=H,r35
146489837Skris	or	r8=r8,r33
146589837Skris	mov	ar.pfs=r2		};;
146689837Skris{ .mii;	shr.u	r9=H,I			// remainder if anybody wants it
146789837Skris	mov	pr=r10,-1		}
146889837Skris{ .mfb;	br.ret.sptk.many	b0	};;
146989837Skris
147089837Skris// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
147189837Skris// procedure.
147289837Skris//
147389837Skris// inputs:	f6 = (double)a, f7 = (double)b
147489837Skris// output:	f8 = (int)(a/b)
147589837Skris// clobbered:	f8,f9,f10,f11,pred
// Entered with br.call through b6 from the two digit estimates above and
// left with br.ret b6.
147689837Skrispred=p15
147789837Skris// This procedure is essentially Intel code and therefore is
147889837Skris// copyrighted to Intel Corporation (I suppose...). It's slightly
147989837Skris// modified for specific needs.
148089837Skris.align	32
148189837Skris.skip	16
148289837Skris.L_udiv64_32_b6:
148389837Skris	frcpa.s1	f8,pred=f6,f7;;		// [0]  y0 = 1 / b
148489837Skris
148589837Skris(pred)	fnma.s1		f9=f7,f8,f1		// [5]  e0 = 1 - b * y0
148689837Skris(pred)	fmpy.s1		f10=f6,f8;;		// [5]  q0 = a * y0
148789837Skris(pred)	fmpy.s1		f11=f9,f9		// [10] e1 = e0 * e0
148889837Skris(pred)	fma.s1		f10=f9,f10,f10;;	// [10] q1 = q0 + e0 * q0
148989837Skris(pred)	fma.s1		f8=f9,f8,f8	//;;	// [15] y1 = y0 + e0 * y0
149089837Skris(pred)	fma.s1		f9=f11,f10,f10;;	// [15] q2 = q1 + e1 * q1
149189837Skris(pred)	fma.s1		f8=f11,f8,f8	//;;	// [20] y2 = y1 + e1 * y1
149289837Skris(pred)	fnma.s1		f10=f7,f9,f6;;		// [20] r2 = a - b * q2
149389837Skris(pred)	fma.s1		f8=f10,f8,f9;;		// [25] q3 = q2 + r2 * y2
149489837Skris
149589837Skris	fcvt.fxu.trunc.s1	f8=f8		// [30] q = trunc(q3)
149689837Skris	br.ret.sptk.many	b6;;
149789837Skris.endp	bn_div_words#
149889837Skris#endif
1499