x86_64-gcc.c revision 109998
1109998Smarkm/*
2109998Smarkm * x86_64 BIGNUM accelerator version 0.1, December 2002.
3109998Smarkm *
4109998Smarkm * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5109998Smarkm * project.
6109998Smarkm *
7109998Smarkm * Rights for redistribution and usage in source and binary forms are
8109998Smarkm * granted according to the OpenSSL license. Warranty of any kind is
9109998Smarkm * disclaimed.
10109998Smarkm *
11109998Smarkm * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
12109998Smarkm *    versions, like 1.0...
13109998Smarkm * A. Well, that's because this code is basically a quick-n-dirty
14109998Smarkm *    proof-of-concept hack. As you can see it's implemented with
15109998Smarkm *    inline assembler, which means that you're bound to GCC and that
16109998Smarkm *    there must be a room for fine-tuning.
17109998Smarkm *
18109998Smarkm * Q. Why inline assembler?
19109998Smarkm * A. x86_64 features own ABI I'm not familiar with. Which is why
20109998Smarkm *    I decided to let the compiler take care of subroutine
21109998Smarkm *    prologue/epilogue as well as register allocation.
22109998Smarkm *
23109998Smarkm * Q. How much faster does it get?
24109998Smarkm * A. Unfortunately people sitting on x86_64 hardware are prohibited
25109998Smarkm *    to disclose the performance numbers, so they (SuSE labs to be
26109998Smarkm *    specific) wouldn't tell me. However! Very similar coding technique
27109998Smarkm *    (reaching out for 128-bit result from 64x64-bit multiplication)
28109998Smarkm *    results in >3 times performance improvement on MIPS and I see no
29109998Smarkm *    reason why gain on x86_64 would be so much different:-)
30109998Smarkm */
31109998Smarkm
32109998Smarkm#define BN_ULONG unsigned long
33109998Smarkm
/*
 * "m"(a), "+m"(r)	is the way to favor DirectPath µ-code;
 * "g"(0)		lets the compiler decide where it wants
 *			to keep the value of zero;
 */
/*
 * mul_add(r,a,word,carry) -- (carry,r) = a*word + r + carry
 *
 * r      memory lvalue, read and updated in place ("+m");
 * a      memory operand multiplied by word ("m");
 * word   multiplier, loaded into %rax;
 * carry  lvalue holding the incoming carry word; on exit it holds
 *        the high word of the sum.
 */
#define mul_add(r,a,word,carry) do {				\
	register BN_ULONG mul_hi_,mul_lo_;			\
	/* mul_hi_:mul_lo_ = a*word */				\
	__asm__ ("mulq %[mem]"					\
		: "=a"(mul_lo_),"=d"(mul_hi_)			\
		: "a"(word),[mem]"m"(a)				\
		: "cc");					\
	/* carry += mul_lo_, propagate CF into mul_hi_ */	\
	__asm__ ("addq %[lo],%[acc]; adcq %[zero],%[hi]"	\
		: [acc]"+r"(carry),[hi]"+d"(mul_hi_)		\
		: [lo]"a"(mul_lo_),[zero]"g"(0)			\
		: "cc");					\
	/* r += carry, propagate CF into mul_hi_ */		\
	__asm__ ("addq %[add],%[dst]; adcq %[zero],%[hi]"	\
		: [dst]"+m"(r),[hi]"+d"(mul_hi_)		\
		: [add]"r"(carry),[zero]"g"(0)			\
		: "cc");					\
	carry=mul_hi_;						\
	} while (0)
55109998Smarkm
/*
 * mul(r,a,word,carry) -- (carry,r) = a*word + carry
 *
 * r      lvalue that receives the low word of the result;
 * a      multiplicand ("g" operand of mulq);
 * word   multiplier, loaded into %rax;
 * carry  lvalue holding the incoming carry word; on exit it holds
 *        the high word of the result.
 */
#define mul(r,a,word,carry) do {				\
	register BN_ULONG mul_hi_,mul_lo_;			\
	/* mul_hi_:mul_lo_ = a*word */				\
	__asm__ ("mulq %[src]"					\
		: "=a"(mul_lo_),"=d"(mul_hi_)			\
		: "a"(word),[src]"g"(a)				\
		: "cc");					\
	/* carry += mul_lo_, propagate CF into mul_hi_ */	\
	__asm__ ("addq %[lo],%[acc]; adcq %[zero],%[hi]"	\
		: [acc]"+r"(carry),[hi]"+d"(mul_hi_)		\
		: [lo]"a"(mul_lo_),[zero]"g"(0)			\
		: "cc");					\
	(r)=carry;						\
	carry=mul_hi_;						\
	} while (0)
68109998Smarkm
/*
 * sqr(r0,r1,a) -- (r1,r0) = a*a
 *
 * r0 receives the low word and r1 the high word of the 128-bit square.
 * Wrapped in do { } while (0), with no trailing semicolon in the
 * definition, so that the macro behaves as a single statement in every
 * context (e.g. as the body of an if with an else branch).
 */
#define sqr(r0,r1,a) do {			\
	__asm__ ("mulq %2"			\
		: "=a"(r0),"=d"(r1)		\
		: "a"(a)			\
		: "cc");			\
	} while (0)
74109998Smarkm
75109998SmarkmBN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
76109998Smarkm	{
77109998Smarkm	BN_ULONG c1=0;
78109998Smarkm
79109998Smarkm	if (num <= 0) return(c1);
80109998Smarkm
81109998Smarkm	while (num&~3)
82109998Smarkm		{
83109998Smarkm		mul_add(rp[0],ap[0],w,c1);
84109998Smarkm		mul_add(rp[1],ap[1],w,c1);
85109998Smarkm		mul_add(rp[2],ap[2],w,c1);
86109998Smarkm		mul_add(rp[3],ap[3],w,c1);
87109998Smarkm		ap+=4; rp+=4; num-=4;
88109998Smarkm		}
89109998Smarkm	if (num)
90109998Smarkm		{
91109998Smarkm		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
92109998Smarkm		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
93109998Smarkm		mul_add(rp[2],ap[2],w,c1); return c1;
94109998Smarkm		}
95109998Smarkm
96109998Smarkm	return(c1);
97109998Smarkm	}
98109998Smarkm
99109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
100109998Smarkm	{
101109998Smarkm	BN_ULONG c1=0;
102109998Smarkm
103109998Smarkm	if (num <= 0) return(c1);
104109998Smarkm
105109998Smarkm	while (num&~3)
106109998Smarkm		{
107109998Smarkm		mul(rp[0],ap[0],w,c1);
108109998Smarkm		mul(rp[1],ap[1],w,c1);
109109998Smarkm		mul(rp[2],ap[2],w,c1);
110109998Smarkm		mul(rp[3],ap[3],w,c1);
111109998Smarkm		ap+=4; rp+=4; num-=4;
112109998Smarkm		}
113109998Smarkm	if (num)
114109998Smarkm		{
115109998Smarkm		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
116109998Smarkm		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
117109998Smarkm		mul(rp[2],ap[2],w,c1);
118109998Smarkm		}
119109998Smarkm	return(c1);
120109998Smarkm	}
121109998Smarkm
122109998Smarkmvoid bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
123109998Smarkm        {
124109998Smarkm	if (n <= 0) return;
125109998Smarkm
126109998Smarkm	while (n&~3)
127109998Smarkm		{
128109998Smarkm		sqr(r[0],r[1],a[0]);
129109998Smarkm		sqr(r[2],r[3],a[1]);
130109998Smarkm		sqr(r[4],r[5],a[2]);
131109998Smarkm		sqr(r[6],r[7],a[3]);
132109998Smarkm		a+=4; r+=8; n-=4;
133109998Smarkm		}
134109998Smarkm	if (n)
135109998Smarkm		{
136109998Smarkm		sqr(r[0],r[1],a[0]); if (--n == 0) return;
137109998Smarkm		sqr(r[2],r[3],a[1]); if (--n == 0) return;
138109998Smarkm		sqr(r[4],r[5],a[2]);
139109998Smarkm		}
140109998Smarkm	}
141109998Smarkm
142109998SmarkmBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
143109998Smarkm{	BN_ULONG ret,waste;
144109998Smarkm
145109998Smarkm	asm ("divq	%3"
146109998Smarkm		: "=a"(ret),"=d"(waste)
147109998Smarkm		: "a"(l),"d"(h),"g"(d)
148109998Smarkm		: "cc");
149109998Smarkm
150109998Smarkm	return ret;
151109998Smarkm}
152109998Smarkm
153109998SmarkmBN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
154109998Smarkm{ BN_ULONG ret,i;
155109998Smarkm
156109998Smarkm	if (n <= 0) return 0;
157109998Smarkm
158109998Smarkm	asm (
159109998Smarkm	"	subq	%2,%2		\n"
160109998Smarkm	".align 16			\n"
161109998Smarkm	"1:	movq	(%4,%2,8),%0	\n"
162109998Smarkm	"	adcq	(%5,%2,8),%0	\n"
163109998Smarkm	"	movq	%0,(%3,%2,8)	\n"
164109998Smarkm	"	leaq	1(%2),%2	\n"
165109998Smarkm	"	loop	1b		\n"
166109998Smarkm	"	sbbq	%0,%0		\n"
167109998Smarkm		: "+a"(ret),"+c"(n),"+r"(i)
168109998Smarkm		: "r"(rp),"r"(ap),"r"(bp)
169109998Smarkm		: "cc"
170109998Smarkm	);
171109998Smarkm
172109998Smarkm  return ret&1;
173109998Smarkm}
174109998Smarkm
175109998Smarkm#ifndef SIMICS
176109998SmarkmBN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
177109998Smarkm{ BN_ULONG ret,i;
178109998Smarkm
179109998Smarkm	if (n <= 0) return 0;
180109998Smarkm
181109998Smarkm	asm (
182109998Smarkm	"	subq	%2,%2		\n"
183109998Smarkm	".align 16			\n"
184109998Smarkm	"1:	movq	(%4,%2,8),%0	\n"
185109998Smarkm	"	sbbq	(%5,%2,8),%0	\n"
186109998Smarkm	"	movq	%0,(%3,%2,8)	\n"
187109998Smarkm	"	leaq	1(%2),%2	\n"
188109998Smarkm	"	loop	1b		\n"
189109998Smarkm	"	sbbq	%0,%0		\n"
190109998Smarkm		: "+a"(ret),"+c"(n),"+r"(i)
191109998Smarkm		: "r"(rp),"r"(ap),"r"(bp)
192109998Smarkm		: "cc"
193109998Smarkm	);
194109998Smarkm
195109998Smarkm  return ret&1;
196109998Smarkm}
197109998Smarkm#else
198109998Smarkm/* Simics 1.4<7 has buggy sbbq:-( */
199109998Smarkm#define BN_MASK2 0xffffffffffffffffL
200109998SmarkmBN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
201109998Smarkm        {
202109998Smarkm	BN_ULONG t1,t2;
203109998Smarkm	int c=0;
204109998Smarkm
205109998Smarkm	if (n <= 0) return((BN_ULONG)0);
206109998Smarkm
207109998Smarkm	for (;;)
208109998Smarkm		{
209109998Smarkm		t1=a[0]; t2=b[0];
210109998Smarkm		r[0]=(t1-t2-c)&BN_MASK2;
211109998Smarkm		if (t1 != t2) c=(t1 < t2);
212109998Smarkm		if (--n <= 0) break;
213109998Smarkm
214109998Smarkm		t1=a[1]; t2=b[1];
215109998Smarkm		r[1]=(t1-t2-c)&BN_MASK2;
216109998Smarkm		if (t1 != t2) c=(t1 < t2);
217109998Smarkm		if (--n <= 0) break;
218109998Smarkm
219109998Smarkm		t1=a[2]; t2=b[2];
220109998Smarkm		r[2]=(t1-t2-c)&BN_MASK2;
221109998Smarkm		if (t1 != t2) c=(t1 < t2);
222109998Smarkm		if (--n <= 0) break;
223109998Smarkm
224109998Smarkm		t1=a[3]; t2=b[3];
225109998Smarkm		r[3]=(t1-t2-c)&BN_MASK2;
226109998Smarkm		if (t1 != t2) c=(t1 < t2);
227109998Smarkm		if (--n <= 0) break;
228109998Smarkm
229109998Smarkm		a+=4;
230109998Smarkm		b+=4;
231109998Smarkm		r+=4;
232109998Smarkm		}
233109998Smarkm	return(c);
234109998Smarkm	}
235109998Smarkm#endif
236109998Smarkm
/* mul_add_c(a,b,c0,c1,c2)    -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)   -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)    -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
241109998Smarkm
242109998Smarkm#if 0
243109998Smarkm/* original macros are kept for reference purposes */
244109998Smarkm#define mul_add_c(a,b,c0,c1,c2) {	\
245109998Smarkm	BN_ULONG ta=(a),tb=(b);		\
246109998Smarkm	t1 = ta * tb;			\
247109998Smarkm	t2 = BN_UMULT_HIGH(ta,tb);	\
248109998Smarkm	c0 += t1; t2 += (c0<t1)?1:0;	\
249109998Smarkm	c1 += t2; c2 += (c1<t2)?1:0;	\
250109998Smarkm	}
251109998Smarkm
252109998Smarkm#define mul_add_c2(a,b,c0,c1,c2) {	\
253109998Smarkm	BN_ULONG ta=(a),tb=(b),t0;	\
254109998Smarkm	t1 = BN_UMULT_HIGH(ta,tb);	\
255109998Smarkm	t0 = ta * tb;			\
256109998Smarkm	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
257109998Smarkm	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
258109998Smarkm	c0 += t1; t2 += (c0<t1)?1:0;	\
259109998Smarkm	c1 += t2; c2 += (c1<t2)?1:0;	\
260109998Smarkm	}
261109998Smarkm#else
/*
 * mul_add_c(a,b,c0,c1,c2) -- (c2,c1,c0) += a*b
 *
 * a is loaded into %rax, b must be a memory operand.  The macro uses
 * two BN_ULONG scratch variables named t1 and t2 that must be declared
 * in the enclosing scope.
 */
#define mul_add_c(a,b,c0,c1,c2)	do {				\
	/* t2:t1 = a*b */					\
	__asm__ ("mulq %[mem]"					\
		: "=a"(t1),"=d"(t2)				\
		: "a"(a),[mem]"m"(b)				\
		: "cc");					\
	/* c0 += t1, propagate CF into t2 */			\
	__asm__ ("addq %[lo],%[w0]; adcq %[zero],%[hi]"		\
		: [w0]"+r"(c0),[hi]"+d"(t2)			\
		: [lo]"a"(t1),[zero]"g"(0)			\
		: "cc");					\
	/* c1 += t2, propagate CF into c2 */			\
	__asm__ ("addq %[hi],%[w1]; adcq %[zero],%[w2]"		\
		: [w1]"+r"(c1),[w2]"+r"(c2)			\
		: [hi]"d"(t2),[zero]"g"(0)			\
		: "cc");					\
	} while (0)
276109998Smarkm
/*
 * sqr_add_c(a,i,c0,c1,c2) -- (c2,c1,c0) += a[i]*a[i]
 *
 * a[i] is loaded into %rax and multiplied by itself.  The macro uses
 * two BN_ULONG scratch variables named t1 and t2 that must be declared
 * in the enclosing scope.
 */
#define sqr_add_c(a,i,c0,c1,c2)	do {				\
	/* t2:t1 = a[i]*a[i] */					\
	__asm__ ("mulq %[src]"					\
		: "=a"(t1),"=d"(t2)				\
		: [src]"a"(a[i])				\
		: "cc");					\
	/* c0 += t1, propagate CF into t2 */			\
	__asm__ ("addq %[lo],%[w0]; adcq %[zero],%[hi]"		\
		: [w0]"+r"(c0),[hi]"+d"(t2)			\
		: [lo]"a"(t1),[zero]"g"(0)			\
		: "cc");					\
	/* c1 += t2, propagate CF into c2 */			\
	__asm__ ("addq %[hi],%[w1]; adcq %[zero],%[w2]"		\
		: [w1]"+r"(c1),[w2]"+r"(c2)			\
		: [hi]"d"(t2),[zero]"g"(0)			\
		: "cc");					\
	} while (0)
291109998Smarkm
/*
 * mul_add_c2(a,b,c0,c1,c2) -- (c2,c1,c0) += 2*a*b
 *
 * a is loaded into %rax, b must be a memory operand.  The macro uses
 * two BN_ULONG scratch variables named t1 and t2 that must be declared
 * in the enclosing scope.
 *
 * The product t2:t1 is added to the accumulator twice, each time with a
 * full three-word carry chain.  Doubling the product in place first and
 * then folding carries into t2 is not safe: when the doubled high word
 * is 2^64-1 and the addition into c0 carries, t2 wraps to zero and the
 * carry that belongs in c2 is lost.
 */
#define mul_add_c2(a,b,c0,c1,c2) do {			\
	/* t2:t1 = a*b */				\
	__asm__ ("mulq %3"				\
		: "=a"(t1),"=d"(t2)			\
		: "a"(a),"m"(b)				\
		: "cc");				\
	/* (c2,c1,c0) += t2:t1, first time */		\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	/* (c2,c1,c0) += t2:t1, second time */		\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	} while (0)
314109998Smarkm#endif
315109998Smarkm
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- (c2,c1,c0) += 2*a[i]*a[j]
 * Thin wrapper that forwards the two array elements to mul_add_c2().
 */
#define sqr_add_c2(a,i,j,c0,c1,c2) mul_add_c2((a)[i],(a)[j],c0,c1,c2)
318109998Smarkm
319109998Smarkmvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
320109998Smarkm	{
321109998Smarkm	BN_ULONG bl,bh;
322109998Smarkm	BN_ULONG t1,t2;
323109998Smarkm	BN_ULONG c1,c2,c3;
324109998Smarkm
325109998Smarkm	c1=0;
326109998Smarkm	c2=0;
327109998Smarkm	c3=0;
328109998Smarkm	mul_add_c(a[0],b[0],c1,c2,c3);
329109998Smarkm	r[0]=c1;
330109998Smarkm	c1=0;
331109998Smarkm	mul_add_c(a[0],b[1],c2,c3,c1);
332109998Smarkm	mul_add_c(a[1],b[0],c2,c3,c1);
333109998Smarkm	r[1]=c2;
334109998Smarkm	c2=0;
335109998Smarkm	mul_add_c(a[2],b[0],c3,c1,c2);
336109998Smarkm	mul_add_c(a[1],b[1],c3,c1,c2);
337109998Smarkm	mul_add_c(a[0],b[2],c3,c1,c2);
338109998Smarkm	r[2]=c3;
339109998Smarkm	c3=0;
340109998Smarkm	mul_add_c(a[0],b[3],c1,c2,c3);
341109998Smarkm	mul_add_c(a[1],b[2],c1,c2,c3);
342109998Smarkm	mul_add_c(a[2],b[1],c1,c2,c3);
343109998Smarkm	mul_add_c(a[3],b[0],c1,c2,c3);
344109998Smarkm	r[3]=c1;
345109998Smarkm	c1=0;
346109998Smarkm	mul_add_c(a[4],b[0],c2,c3,c1);
347109998Smarkm	mul_add_c(a[3],b[1],c2,c3,c1);
348109998Smarkm	mul_add_c(a[2],b[2],c2,c3,c1);
349109998Smarkm	mul_add_c(a[1],b[3],c2,c3,c1);
350109998Smarkm	mul_add_c(a[0],b[4],c2,c3,c1);
351109998Smarkm	r[4]=c2;
352109998Smarkm	c2=0;
353109998Smarkm	mul_add_c(a[0],b[5],c3,c1,c2);
354109998Smarkm	mul_add_c(a[1],b[4],c3,c1,c2);
355109998Smarkm	mul_add_c(a[2],b[3],c3,c1,c2);
356109998Smarkm	mul_add_c(a[3],b[2],c3,c1,c2);
357109998Smarkm	mul_add_c(a[4],b[1],c3,c1,c2);
358109998Smarkm	mul_add_c(a[5],b[0],c3,c1,c2);
359109998Smarkm	r[5]=c3;
360109998Smarkm	c3=0;
361109998Smarkm	mul_add_c(a[6],b[0],c1,c2,c3);
362109998Smarkm	mul_add_c(a[5],b[1],c1,c2,c3);
363109998Smarkm	mul_add_c(a[4],b[2],c1,c2,c3);
364109998Smarkm	mul_add_c(a[3],b[3],c1,c2,c3);
365109998Smarkm	mul_add_c(a[2],b[4],c1,c2,c3);
366109998Smarkm	mul_add_c(a[1],b[5],c1,c2,c3);
367109998Smarkm	mul_add_c(a[0],b[6],c1,c2,c3);
368109998Smarkm	r[6]=c1;
369109998Smarkm	c1=0;
370109998Smarkm	mul_add_c(a[0],b[7],c2,c3,c1);
371109998Smarkm	mul_add_c(a[1],b[6],c2,c3,c1);
372109998Smarkm	mul_add_c(a[2],b[5],c2,c3,c1);
373109998Smarkm	mul_add_c(a[3],b[4],c2,c3,c1);
374109998Smarkm	mul_add_c(a[4],b[3],c2,c3,c1);
375109998Smarkm	mul_add_c(a[5],b[2],c2,c3,c1);
376109998Smarkm	mul_add_c(a[6],b[1],c2,c3,c1);
377109998Smarkm	mul_add_c(a[7],b[0],c2,c3,c1);
378109998Smarkm	r[7]=c2;
379109998Smarkm	c2=0;
380109998Smarkm	mul_add_c(a[7],b[1],c3,c1,c2);
381109998Smarkm	mul_add_c(a[6],b[2],c3,c1,c2);
382109998Smarkm	mul_add_c(a[5],b[3],c3,c1,c2);
383109998Smarkm	mul_add_c(a[4],b[4],c3,c1,c2);
384109998Smarkm	mul_add_c(a[3],b[5],c3,c1,c2);
385109998Smarkm	mul_add_c(a[2],b[6],c3,c1,c2);
386109998Smarkm	mul_add_c(a[1],b[7],c3,c1,c2);
387109998Smarkm	r[8]=c3;
388109998Smarkm	c3=0;
389109998Smarkm	mul_add_c(a[2],b[7],c1,c2,c3);
390109998Smarkm	mul_add_c(a[3],b[6],c1,c2,c3);
391109998Smarkm	mul_add_c(a[4],b[5],c1,c2,c3);
392109998Smarkm	mul_add_c(a[5],b[4],c1,c2,c3);
393109998Smarkm	mul_add_c(a[6],b[3],c1,c2,c3);
394109998Smarkm	mul_add_c(a[7],b[2],c1,c2,c3);
395109998Smarkm	r[9]=c1;
396109998Smarkm	c1=0;
397109998Smarkm	mul_add_c(a[7],b[3],c2,c3,c1);
398109998Smarkm	mul_add_c(a[6],b[4],c2,c3,c1);
399109998Smarkm	mul_add_c(a[5],b[5],c2,c3,c1);
400109998Smarkm	mul_add_c(a[4],b[6],c2,c3,c1);
401109998Smarkm	mul_add_c(a[3],b[7],c2,c3,c1);
402109998Smarkm	r[10]=c2;
403109998Smarkm	c2=0;
404109998Smarkm	mul_add_c(a[4],b[7],c3,c1,c2);
405109998Smarkm	mul_add_c(a[5],b[6],c3,c1,c2);
406109998Smarkm	mul_add_c(a[6],b[5],c3,c1,c2);
407109998Smarkm	mul_add_c(a[7],b[4],c3,c1,c2);
408109998Smarkm	r[11]=c3;
409109998Smarkm	c3=0;
410109998Smarkm	mul_add_c(a[7],b[5],c1,c2,c3);
411109998Smarkm	mul_add_c(a[6],b[6],c1,c2,c3);
412109998Smarkm	mul_add_c(a[5],b[7],c1,c2,c3);
413109998Smarkm	r[12]=c1;
414109998Smarkm	c1=0;
415109998Smarkm	mul_add_c(a[6],b[7],c2,c3,c1);
416109998Smarkm	mul_add_c(a[7],b[6],c2,c3,c1);
417109998Smarkm	r[13]=c2;
418109998Smarkm	c2=0;
419109998Smarkm	mul_add_c(a[7],b[7],c3,c1,c2);
420109998Smarkm	r[14]=c3;
421109998Smarkm	r[15]=c1;
422109998Smarkm	}
423109998Smarkm
424109998Smarkmvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
425109998Smarkm	{
426109998Smarkm	BN_ULONG bl,bh;
427109998Smarkm	BN_ULONG t1,t2;
428109998Smarkm	BN_ULONG c1,c2,c3;
429109998Smarkm
430109998Smarkm	c1=0;
431109998Smarkm	c2=0;
432109998Smarkm	c3=0;
433109998Smarkm	mul_add_c(a[0],b[0],c1,c2,c3);
434109998Smarkm	r[0]=c1;
435109998Smarkm	c1=0;
436109998Smarkm	mul_add_c(a[0],b[1],c2,c3,c1);
437109998Smarkm	mul_add_c(a[1],b[0],c2,c3,c1);
438109998Smarkm	r[1]=c2;
439109998Smarkm	c2=0;
440109998Smarkm	mul_add_c(a[2],b[0],c3,c1,c2);
441109998Smarkm	mul_add_c(a[1],b[1],c3,c1,c2);
442109998Smarkm	mul_add_c(a[0],b[2],c3,c1,c2);
443109998Smarkm	r[2]=c3;
444109998Smarkm	c3=0;
445109998Smarkm	mul_add_c(a[0],b[3],c1,c2,c3);
446109998Smarkm	mul_add_c(a[1],b[2],c1,c2,c3);
447109998Smarkm	mul_add_c(a[2],b[1],c1,c2,c3);
448109998Smarkm	mul_add_c(a[3],b[0],c1,c2,c3);
449109998Smarkm	r[3]=c1;
450109998Smarkm	c1=0;
451109998Smarkm	mul_add_c(a[3],b[1],c2,c3,c1);
452109998Smarkm	mul_add_c(a[2],b[2],c2,c3,c1);
453109998Smarkm	mul_add_c(a[1],b[3],c2,c3,c1);
454109998Smarkm	r[4]=c2;
455109998Smarkm	c2=0;
456109998Smarkm	mul_add_c(a[2],b[3],c3,c1,c2);
457109998Smarkm	mul_add_c(a[3],b[2],c3,c1,c2);
458109998Smarkm	r[5]=c3;
459109998Smarkm	c3=0;
460109998Smarkm	mul_add_c(a[3],b[3],c1,c2,c3);
461109998Smarkm	r[6]=c1;
462109998Smarkm	r[7]=c2;
463109998Smarkm	}
464109998Smarkm
465109998Smarkmvoid bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
466109998Smarkm	{
467109998Smarkm	BN_ULONG bl,bh;
468109998Smarkm	BN_ULONG t1,t2;
469109998Smarkm	BN_ULONG c1,c2,c3;
470109998Smarkm
471109998Smarkm	c1=0;
472109998Smarkm	c2=0;
473109998Smarkm	c3=0;
474109998Smarkm	sqr_add_c(a,0,c1,c2,c3);
475109998Smarkm	r[0]=c1;
476109998Smarkm	c1=0;
477109998Smarkm	sqr_add_c2(a,1,0,c2,c3,c1);
478109998Smarkm	r[1]=c2;
479109998Smarkm	c2=0;
480109998Smarkm	sqr_add_c(a,1,c3,c1,c2);
481109998Smarkm	sqr_add_c2(a,2,0,c3,c1,c2);
482109998Smarkm	r[2]=c3;
483109998Smarkm	c3=0;
484109998Smarkm	sqr_add_c2(a,3,0,c1,c2,c3);
485109998Smarkm	sqr_add_c2(a,2,1,c1,c2,c3);
486109998Smarkm	r[3]=c1;
487109998Smarkm	c1=0;
488109998Smarkm	sqr_add_c(a,2,c2,c3,c1);
489109998Smarkm	sqr_add_c2(a,3,1,c2,c3,c1);
490109998Smarkm	sqr_add_c2(a,4,0,c2,c3,c1);
491109998Smarkm	r[4]=c2;
492109998Smarkm	c2=0;
493109998Smarkm	sqr_add_c2(a,5,0,c3,c1,c2);
494109998Smarkm	sqr_add_c2(a,4,1,c3,c1,c2);
495109998Smarkm	sqr_add_c2(a,3,2,c3,c1,c2);
496109998Smarkm	r[5]=c3;
497109998Smarkm	c3=0;
498109998Smarkm	sqr_add_c(a,3,c1,c2,c3);
499109998Smarkm	sqr_add_c2(a,4,2,c1,c2,c3);
500109998Smarkm	sqr_add_c2(a,5,1,c1,c2,c3);
501109998Smarkm	sqr_add_c2(a,6,0,c1,c2,c3);
502109998Smarkm	r[6]=c1;
503109998Smarkm	c1=0;
504109998Smarkm	sqr_add_c2(a,7,0,c2,c3,c1);
505109998Smarkm	sqr_add_c2(a,6,1,c2,c3,c1);
506109998Smarkm	sqr_add_c2(a,5,2,c2,c3,c1);
507109998Smarkm	sqr_add_c2(a,4,3,c2,c3,c1);
508109998Smarkm	r[7]=c2;
509109998Smarkm	c2=0;
510109998Smarkm	sqr_add_c(a,4,c3,c1,c2);
511109998Smarkm	sqr_add_c2(a,5,3,c3,c1,c2);
512109998Smarkm	sqr_add_c2(a,6,2,c3,c1,c2);
513109998Smarkm	sqr_add_c2(a,7,1,c3,c1,c2);
514109998Smarkm	r[8]=c3;
515109998Smarkm	c3=0;
516109998Smarkm	sqr_add_c2(a,7,2,c1,c2,c3);
517109998Smarkm	sqr_add_c2(a,6,3,c1,c2,c3);
518109998Smarkm	sqr_add_c2(a,5,4,c1,c2,c3);
519109998Smarkm	r[9]=c1;
520109998Smarkm	c1=0;
521109998Smarkm	sqr_add_c(a,5,c2,c3,c1);
522109998Smarkm	sqr_add_c2(a,6,4,c2,c3,c1);
523109998Smarkm	sqr_add_c2(a,7,3,c2,c3,c1);
524109998Smarkm	r[10]=c2;
525109998Smarkm	c2=0;
526109998Smarkm	sqr_add_c2(a,7,4,c3,c1,c2);
527109998Smarkm	sqr_add_c2(a,6,5,c3,c1,c2);
528109998Smarkm	r[11]=c3;
529109998Smarkm	c3=0;
530109998Smarkm	sqr_add_c(a,6,c1,c2,c3);
531109998Smarkm	sqr_add_c2(a,7,5,c1,c2,c3);
532109998Smarkm	r[12]=c1;
533109998Smarkm	c1=0;
534109998Smarkm	sqr_add_c2(a,7,6,c2,c3,c1);
535109998Smarkm	r[13]=c2;
536109998Smarkm	c2=0;
537109998Smarkm	sqr_add_c(a,7,c3,c1,c2);
538109998Smarkm	r[14]=c3;
539109998Smarkm	r[15]=c1;
540109998Smarkm	}
541109998Smarkm
542109998Smarkmvoid bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
543109998Smarkm	{
544109998Smarkm	BN_ULONG bl,bh;
545109998Smarkm	BN_ULONG t1,t2;
546109998Smarkm	BN_ULONG c1,c2,c3;
547109998Smarkm
548109998Smarkm	c1=0;
549109998Smarkm	c2=0;
550109998Smarkm	c3=0;
551109998Smarkm	sqr_add_c(a,0,c1,c2,c3);
552109998Smarkm	r[0]=c1;
553109998Smarkm	c1=0;
554109998Smarkm	sqr_add_c2(a,1,0,c2,c3,c1);
555109998Smarkm	r[1]=c2;
556109998Smarkm	c2=0;
557109998Smarkm	sqr_add_c(a,1,c3,c1,c2);
558109998Smarkm	sqr_add_c2(a,2,0,c3,c1,c2);
559109998Smarkm	r[2]=c3;
560109998Smarkm	c3=0;
561109998Smarkm	sqr_add_c2(a,3,0,c1,c2,c3);
562109998Smarkm	sqr_add_c2(a,2,1,c1,c2,c3);
563109998Smarkm	r[3]=c1;
564109998Smarkm	c1=0;
565109998Smarkm	sqr_add_c(a,2,c2,c3,c1);
566109998Smarkm	sqr_add_c2(a,3,1,c2,c3,c1);
567109998Smarkm	r[4]=c2;
568109998Smarkm	c2=0;
569109998Smarkm	sqr_add_c2(a,3,2,c3,c1,c2);
570109998Smarkm	r[5]=c3;
571109998Smarkm	c3=0;
572109998Smarkm	sqr_add_c(a,3,c1,c2,c3);
573109998Smarkm	r[6]=c1;
574109998Smarkm	r[7]=c2;
575109998Smarkm	}
576