x86_64-gcc.c revision 205128
1205128Ssimon#include "../bn_lcl.h"
2162911Ssimon#ifdef __SUNPRO_C
3162911Ssimon# include "../bn_asm.c"	/* kind of dirty hack for Sun Studio */
4162911Ssimon#else
5109998Smarkm/*
6109998Smarkm * x86_64 BIGNUM accelerator version 0.1, December 2002.
7109998Smarkm *
8109998Smarkm * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9109998Smarkm * project.
10109998Smarkm *
11109998Smarkm * Rights for redistribution and usage in source and binary forms are
12109998Smarkm * granted according to the OpenSSL license. Warranty of any kind is
13109998Smarkm * disclaimed.
14109998Smarkm *
15109998Smarkm * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
16109998Smarkm *    versions, like 1.0...
17109998Smarkm * A. Well, that's because this code is basically a quick-n-dirty
18109998Smarkm *    proof-of-concept hack. As you can see it's implemented with
19109998Smarkm *    inline assembler, which means that you're bound to GCC and that
20160814Ssimon *    there might be enough room for further improvement.
21109998Smarkm *
22109998Smarkm * Q. Why inline assembler?
 * A. x86_64 features its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements an ABI for AMD64 that differs from the Linux one.
27109998Smarkm *
28109998Smarkm * Q. How much faster does it get?
29160814Ssimon * A. 'apps/openssl speed rsa dsa' output with no-asm:
30160814Ssimon *
31160814Ssimon *	                  sign    verify    sign/s verify/s
32160814Ssimon *	rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
33160814Ssimon *	rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
34160814Ssimon *	rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
35160814Ssimon *	rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
36160814Ssimon *	                  sign    verify    sign/s verify/s
37160814Ssimon *	dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
38160814Ssimon *	dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
39160814Ssimon *	dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
40160814Ssimon *
41160814Ssimon *    'apps/openssl speed rsa dsa' output with this module:
42160814Ssimon *
43160814Ssimon *	                  sign    verify    sign/s verify/s
44160814Ssimon *	rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
45160814Ssimon *	rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
46160814Ssimon *	rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
47160814Ssimon *	rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
48160814Ssimon *	                  sign    verify    sign/s verify/s
49160814Ssimon *	dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
50160814Ssimon *	dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
51160814Ssimon *	dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
52160814Ssimon *
 *    For reference, the IA-32 assembler implementation performs
 *    very much like 64-bit code compiled with no-asm on the same
 *    machine.
56109998Smarkm */
57109998Smarkm
#define BN_ULONG unsigned long

#undef mul
#undef mul_add

/*
 * mul_add(r,a,word,carry) -- (carry,r) = a*word + r + carry
 *
 * r and a must be lvalues (they are used as memory operands); carry is
 * read and then overwritten with the high word of the result.
 *
 * "m"(a), "+m"(r)	is the way to favor DirectPath micro-code;
 * "g"(0)		lets the compiler decide where it wants to
 *			keep the value of zero;
 *
 * __asm__ is used instead of asm because the plain keyword is a GNU
 * extension that is not recognized when compiling with -std=c99/c11.
 */
#define mul_add(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	__asm__ ("mulq %3"		\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"m"(a)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+m"(r),"+d"(high)	\
		: "r"(carry),"g"(0)	\
		: "cc");		\
	carry=high;			\
	} while (0)
84109998Smarkm
/*
 * mul(r,a,word,carry) -- (carry,r) = a*word + carry
 *
 * r is assigned the low word, carry is replaced by the high word.
 * The multiplicand uses the "rm" constraint: mulq only accepts a register
 * or memory operand, so an immediate (which "g" would permit for a
 * constant argument) must never be selected.
 */
#define mul(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	__asm__ ("mulq %3"		\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"rm"(a)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	(r)=carry, carry=high;		\
	} while (0)
97109998Smarkm
/*
 * sqr(r0,r1,a) -- (r1,r0) = a*a, r0 receiving the low word.
 *
 * Wrapped in do { } while (0) with no trailing semicolon, like the other
 * macros in this file, so that "sqr(...);" is exactly one statement and
 * can be used as the body of an if/else.
 */
#define sqr(r0,r1,a) do {		\
	__asm__ ("mulq %2"		\
		: "=a"(r0),"=d"(r1)	\
		: "a"(a)		\
		: "cc");		\
	} while (0)
103109998Smarkm
104205128SsimonBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
105109998Smarkm	{
106109998Smarkm	BN_ULONG c1=0;
107109998Smarkm
108109998Smarkm	if (num <= 0) return(c1);
109109998Smarkm
110109998Smarkm	while (num&~3)
111109998Smarkm		{
112109998Smarkm		mul_add(rp[0],ap[0],w,c1);
113109998Smarkm		mul_add(rp[1],ap[1],w,c1);
114109998Smarkm		mul_add(rp[2],ap[2],w,c1);
115109998Smarkm		mul_add(rp[3],ap[3],w,c1);
116109998Smarkm		ap+=4; rp+=4; num-=4;
117109998Smarkm		}
118109998Smarkm	if (num)
119109998Smarkm		{
120109998Smarkm		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
121109998Smarkm		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
122109998Smarkm		mul_add(rp[2],ap[2],w,c1); return c1;
123109998Smarkm		}
124109998Smarkm
125109998Smarkm	return(c1);
126109998Smarkm	}
127109998Smarkm
128205128SsimonBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
129109998Smarkm	{
130109998Smarkm	BN_ULONG c1=0;
131109998Smarkm
132109998Smarkm	if (num <= 0) return(c1);
133109998Smarkm
134109998Smarkm	while (num&~3)
135109998Smarkm		{
136109998Smarkm		mul(rp[0],ap[0],w,c1);
137109998Smarkm		mul(rp[1],ap[1],w,c1);
138109998Smarkm		mul(rp[2],ap[2],w,c1);
139109998Smarkm		mul(rp[3],ap[3],w,c1);
140109998Smarkm		ap+=4; rp+=4; num-=4;
141109998Smarkm		}
142109998Smarkm	if (num)
143109998Smarkm		{
144109998Smarkm		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
145109998Smarkm		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
146109998Smarkm		mul(rp[2],ap[2],w,c1);
147109998Smarkm		}
148109998Smarkm	return(c1);
149109998Smarkm	}
150109998Smarkm
151205128Ssimonvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
152109998Smarkm        {
153109998Smarkm	if (n <= 0) return;
154109998Smarkm
155109998Smarkm	while (n&~3)
156109998Smarkm		{
157109998Smarkm		sqr(r[0],r[1],a[0]);
158109998Smarkm		sqr(r[2],r[3],a[1]);
159109998Smarkm		sqr(r[4],r[5],a[2]);
160109998Smarkm		sqr(r[6],r[7],a[3]);
161109998Smarkm		a+=4; r+=8; n-=4;
162109998Smarkm		}
163109998Smarkm	if (n)
164109998Smarkm		{
165109998Smarkm		sqr(r[0],r[1],a[0]); if (--n == 0) return;
166109998Smarkm		sqr(r[2],r[3],a[1]); if (--n == 0) return;
167109998Smarkm		sqr(r[4],r[5],a[2]);
168109998Smarkm		}
169109998Smarkm	}
170109998Smarkm
171109998SmarkmBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
172109998Smarkm{	BN_ULONG ret,waste;
173109998Smarkm
174127128Snectar	asm ("divq	%4"
175109998Smarkm		: "=a"(ret),"=d"(waste)
176109998Smarkm		: "a"(l),"d"(h),"g"(d)
177109998Smarkm		: "cc");
178109998Smarkm
179109998Smarkm	return ret;
180109998Smarkm}
181109998Smarkm
182205128SsimonBN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
183160814Ssimon{ BN_ULONG ret=0,i=0;
184109998Smarkm
185109998Smarkm	if (n <= 0) return 0;
186109998Smarkm
187109998Smarkm	asm (
188109998Smarkm	"	subq	%2,%2		\n"
189109998Smarkm	".align 16			\n"
190109998Smarkm	"1:	movq	(%4,%2,8),%0	\n"
191109998Smarkm	"	adcq	(%5,%2,8),%0	\n"
192109998Smarkm	"	movq	%0,(%3,%2,8)	\n"
193109998Smarkm	"	leaq	1(%2),%2	\n"
194109998Smarkm	"	loop	1b		\n"
195109998Smarkm	"	sbbq	%0,%0		\n"
196160814Ssimon		: "=&a"(ret),"+c"(n),"=&r"(i)
197109998Smarkm		: "r"(rp),"r"(ap),"r"(bp)
198109998Smarkm		: "cc"
199109998Smarkm	);
200109998Smarkm
201109998Smarkm  return ret&1;
202109998Smarkm}
203109998Smarkm
204109998Smarkm#ifndef SIMICS
205205128SsimonBN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
206160814Ssimon{ BN_ULONG ret=0,i=0;
207109998Smarkm
208109998Smarkm	if (n <= 0) return 0;
209109998Smarkm
210109998Smarkm	asm (
211109998Smarkm	"	subq	%2,%2		\n"
212109998Smarkm	".align 16			\n"
213109998Smarkm	"1:	movq	(%4,%2,8),%0	\n"
214109998Smarkm	"	sbbq	(%5,%2,8),%0	\n"
215109998Smarkm	"	movq	%0,(%3,%2,8)	\n"
216109998Smarkm	"	leaq	1(%2),%2	\n"
217109998Smarkm	"	loop	1b		\n"
218109998Smarkm	"	sbbq	%0,%0		\n"
219160814Ssimon		: "=&a"(ret),"+c"(n),"=&r"(i)
220109998Smarkm		: "r"(rp),"r"(ap),"r"(bp)
221109998Smarkm		: "cc"
222109998Smarkm	);
223109998Smarkm
224109998Smarkm  return ret&1;
225109998Smarkm}
226109998Smarkm#else
227109998Smarkm/* Simics 1.4<7 has buggy sbbq:-( */
228109998Smarkm#define BN_MASK2 0xffffffffffffffffL
229109998SmarkmBN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
230109998Smarkm        {
231109998Smarkm	BN_ULONG t1,t2;
232109998Smarkm	int c=0;
233109998Smarkm
234109998Smarkm	if (n <= 0) return((BN_ULONG)0);
235109998Smarkm
236109998Smarkm	for (;;)
237109998Smarkm		{
238109998Smarkm		t1=a[0]; t2=b[0];
239109998Smarkm		r[0]=(t1-t2-c)&BN_MASK2;
240109998Smarkm		if (t1 != t2) c=(t1 < t2);
241109998Smarkm		if (--n <= 0) break;
242109998Smarkm
243109998Smarkm		t1=a[1]; t2=b[1];
244109998Smarkm		r[1]=(t1-t2-c)&BN_MASK2;
245109998Smarkm		if (t1 != t2) c=(t1 < t2);
246109998Smarkm		if (--n <= 0) break;
247109998Smarkm
248109998Smarkm		t1=a[2]; t2=b[2];
249109998Smarkm		r[2]=(t1-t2-c)&BN_MASK2;
250109998Smarkm		if (t1 != t2) c=(t1 < t2);
251109998Smarkm		if (--n <= 0) break;
252109998Smarkm
253109998Smarkm		t1=a[3]; t2=b[3];
254109998Smarkm		r[3]=(t1-t2-c)&BN_MASK2;
255109998Smarkm		if (t1 != t2) c=(t1 < t2);
256109998Smarkm		if (--n <= 0) break;
257109998Smarkm
258109998Smarkm		a+=4;
259109998Smarkm		b+=4;
260109998Smarkm		r+=4;
261109998Smarkm		}
262109998Smarkm	return(c);
263109998Smarkm	}
264109998Smarkm#endif
265109998Smarkm
/* mul_add_c(a,b,c0,c1,c2)      -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)     -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)      -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2)   -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
270109998Smarkm
271109998Smarkm#if 0
/*
 * Original C macros, kept for reference purposes.  They rely on
 * BN_UMULT_HIGH() and on BN_ULONG t1,t2 being declared by the caller.
 *
 * mul_add_c2 adds the product twice instead of doubling it first: a
 * doubled high word plus its carry can be 2^64-1, and adding one more
 * carry to that would wrap to zero and lose a carry into c2.
 */
#define mul_add_c(a,b,c0,c1,c2) do {	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	} while (0)

#define mul_add_c2(a,b,c0,c1,c2) do {	\
	BN_ULONG ta=(a),tb=(b),tt;	\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; tt = t2+((c0<t1)?1:0);	\
	c1 += tt; c2 += (c1<tt)?1:0;	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	} while (0)

#define sqr_add_c(a,i,c0,c1,c2)		\
	mul_add_c((a)[i],(a)[i],c0,c1,c2)
290109998Smarkm#else
/*
 * mul_add_c(a,b,c0,c1,c2) -- (c2,c1,c0) += a*b
 *
 * b must be an lvalue (it is used as a memory operand).  The caller must
 * have BN_ULONG t1,t2 in scope; they receive the low and high word of the
 * product.  The product is added with a single add/adc/adc chain.
 * __asm__ is used because plain asm is unavailable under -std=c99/c11.
 */
#define mul_add_c(a,b,c0,c1,c2)	do {	\
	__asm__ ("mulq %3"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	} while (0)
305109998Smarkm
/*
 * sqr_add_c(a,i,c0,c1,c2) -- (c2,c1,c0) += a[i]*a[i]
 *
 * The caller must have BN_ULONG t1,t2 in scope; they receive the low and
 * high word of the square.  The array argument is parenthesized, as in
 * sqr_add_c2, so that an expression may be passed for it.
 * __asm__ is used because plain asm is unavailable under -std=c99/c11.
 */
#define sqr_add_c(a,i,c0,c1,c2)	do {	\
	__asm__ ("mulq %2"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"((a)[i])		\
		: "cc");		\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	} while (0)
320109998Smarkm
/*
 * mul_add_c2(a,b,c0,c1,c2) -- (c2,c1,c0) += 2*a*b
 *
 * b must be an lvalue (it is used as a memory operand).  The caller must
 * have BN_ULONG t1,t2 in scope; they receive the low and high word of the
 * product a*b.
 *
 * The product is added to the accumulator twice, each time with a full
 * add/adc/adc chain.  Doubling (t2,t1) in place first is not safe: the
 * doubled high word plus the carry out of the doubled low word can be
 * 2^64-1, and folding the carry from the c0 addition into it then wraps
 * to zero, silently dropping a carry that belongs in c2.
 */
#define mul_add_c2(a,b,c0,c1,c2) do {	\
	__asm__ ("mulq %3"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	} while (0)
343109998Smarkm#endif
344109998Smarkm
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- (c2,c1,c0) += 2*a[i]*a[j],
 * expressed through mul_add_c2 on the two array elements.
 */
#define sqr_add_c2(a,i,j,c0,c1,c2) mul_add_c2((a)[i],(a)[j],c0,c1,c2)
347109998Smarkm
348109998Smarkmvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
349109998Smarkm	{
350109998Smarkm	BN_ULONG t1,t2;
351109998Smarkm	BN_ULONG c1,c2,c3;
352109998Smarkm
353109998Smarkm	c1=0;
354109998Smarkm	c2=0;
355109998Smarkm	c3=0;
356109998Smarkm	mul_add_c(a[0],b[0],c1,c2,c3);
357109998Smarkm	r[0]=c1;
358109998Smarkm	c1=0;
359109998Smarkm	mul_add_c(a[0],b[1],c2,c3,c1);
360109998Smarkm	mul_add_c(a[1],b[0],c2,c3,c1);
361109998Smarkm	r[1]=c2;
362109998Smarkm	c2=0;
363109998Smarkm	mul_add_c(a[2],b[0],c3,c1,c2);
364109998Smarkm	mul_add_c(a[1],b[1],c3,c1,c2);
365109998Smarkm	mul_add_c(a[0],b[2],c3,c1,c2);
366109998Smarkm	r[2]=c3;
367109998Smarkm	c3=0;
368109998Smarkm	mul_add_c(a[0],b[3],c1,c2,c3);
369109998Smarkm	mul_add_c(a[1],b[2],c1,c2,c3);
370109998Smarkm	mul_add_c(a[2],b[1],c1,c2,c3);
371109998Smarkm	mul_add_c(a[3],b[0],c1,c2,c3);
372109998Smarkm	r[3]=c1;
373109998Smarkm	c1=0;
374109998Smarkm	mul_add_c(a[4],b[0],c2,c3,c1);
375109998Smarkm	mul_add_c(a[3],b[1],c2,c3,c1);
376109998Smarkm	mul_add_c(a[2],b[2],c2,c3,c1);
377109998Smarkm	mul_add_c(a[1],b[3],c2,c3,c1);
378109998Smarkm	mul_add_c(a[0],b[4],c2,c3,c1);
379109998Smarkm	r[4]=c2;
380109998Smarkm	c2=0;
381109998Smarkm	mul_add_c(a[0],b[5],c3,c1,c2);
382109998Smarkm	mul_add_c(a[1],b[4],c3,c1,c2);
383109998Smarkm	mul_add_c(a[2],b[3],c3,c1,c2);
384109998Smarkm	mul_add_c(a[3],b[2],c3,c1,c2);
385109998Smarkm	mul_add_c(a[4],b[1],c3,c1,c2);
386109998Smarkm	mul_add_c(a[5],b[0],c3,c1,c2);
387109998Smarkm	r[5]=c3;
388109998Smarkm	c3=0;
389109998Smarkm	mul_add_c(a[6],b[0],c1,c2,c3);
390109998Smarkm	mul_add_c(a[5],b[1],c1,c2,c3);
391109998Smarkm	mul_add_c(a[4],b[2],c1,c2,c3);
392109998Smarkm	mul_add_c(a[3],b[3],c1,c2,c3);
393109998Smarkm	mul_add_c(a[2],b[4],c1,c2,c3);
394109998Smarkm	mul_add_c(a[1],b[5],c1,c2,c3);
395109998Smarkm	mul_add_c(a[0],b[6],c1,c2,c3);
396109998Smarkm	r[6]=c1;
397109998Smarkm	c1=0;
398109998Smarkm	mul_add_c(a[0],b[7],c2,c3,c1);
399109998Smarkm	mul_add_c(a[1],b[6],c2,c3,c1);
400109998Smarkm	mul_add_c(a[2],b[5],c2,c3,c1);
401109998Smarkm	mul_add_c(a[3],b[4],c2,c3,c1);
402109998Smarkm	mul_add_c(a[4],b[3],c2,c3,c1);
403109998Smarkm	mul_add_c(a[5],b[2],c2,c3,c1);
404109998Smarkm	mul_add_c(a[6],b[1],c2,c3,c1);
405109998Smarkm	mul_add_c(a[7],b[0],c2,c3,c1);
406109998Smarkm	r[7]=c2;
407109998Smarkm	c2=0;
408109998Smarkm	mul_add_c(a[7],b[1],c3,c1,c2);
409109998Smarkm	mul_add_c(a[6],b[2],c3,c1,c2);
410109998Smarkm	mul_add_c(a[5],b[3],c3,c1,c2);
411109998Smarkm	mul_add_c(a[4],b[4],c3,c1,c2);
412109998Smarkm	mul_add_c(a[3],b[5],c3,c1,c2);
413109998Smarkm	mul_add_c(a[2],b[6],c3,c1,c2);
414109998Smarkm	mul_add_c(a[1],b[7],c3,c1,c2);
415109998Smarkm	r[8]=c3;
416109998Smarkm	c3=0;
417109998Smarkm	mul_add_c(a[2],b[7],c1,c2,c3);
418109998Smarkm	mul_add_c(a[3],b[6],c1,c2,c3);
419109998Smarkm	mul_add_c(a[4],b[5],c1,c2,c3);
420109998Smarkm	mul_add_c(a[5],b[4],c1,c2,c3);
421109998Smarkm	mul_add_c(a[6],b[3],c1,c2,c3);
422109998Smarkm	mul_add_c(a[7],b[2],c1,c2,c3);
423109998Smarkm	r[9]=c1;
424109998Smarkm	c1=0;
425109998Smarkm	mul_add_c(a[7],b[3],c2,c3,c1);
426109998Smarkm	mul_add_c(a[6],b[4],c2,c3,c1);
427109998Smarkm	mul_add_c(a[5],b[5],c2,c3,c1);
428109998Smarkm	mul_add_c(a[4],b[6],c2,c3,c1);
429109998Smarkm	mul_add_c(a[3],b[7],c2,c3,c1);
430109998Smarkm	r[10]=c2;
431109998Smarkm	c2=0;
432109998Smarkm	mul_add_c(a[4],b[7],c3,c1,c2);
433109998Smarkm	mul_add_c(a[5],b[6],c3,c1,c2);
434109998Smarkm	mul_add_c(a[6],b[5],c3,c1,c2);
435109998Smarkm	mul_add_c(a[7],b[4],c3,c1,c2);
436109998Smarkm	r[11]=c3;
437109998Smarkm	c3=0;
438109998Smarkm	mul_add_c(a[7],b[5],c1,c2,c3);
439109998Smarkm	mul_add_c(a[6],b[6],c1,c2,c3);
440109998Smarkm	mul_add_c(a[5],b[7],c1,c2,c3);
441109998Smarkm	r[12]=c1;
442109998Smarkm	c1=0;
443109998Smarkm	mul_add_c(a[6],b[7],c2,c3,c1);
444109998Smarkm	mul_add_c(a[7],b[6],c2,c3,c1);
445109998Smarkm	r[13]=c2;
446109998Smarkm	c2=0;
447109998Smarkm	mul_add_c(a[7],b[7],c3,c1,c2);
448109998Smarkm	r[14]=c3;
449109998Smarkm	r[15]=c1;
450109998Smarkm	}
451109998Smarkm
452109998Smarkmvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
453109998Smarkm	{
454109998Smarkm	BN_ULONG t1,t2;
455109998Smarkm	BN_ULONG c1,c2,c3;
456109998Smarkm
457109998Smarkm	c1=0;
458109998Smarkm	c2=0;
459109998Smarkm	c3=0;
460109998Smarkm	mul_add_c(a[0],b[0],c1,c2,c3);
461109998Smarkm	r[0]=c1;
462109998Smarkm	c1=0;
463109998Smarkm	mul_add_c(a[0],b[1],c2,c3,c1);
464109998Smarkm	mul_add_c(a[1],b[0],c2,c3,c1);
465109998Smarkm	r[1]=c2;
466109998Smarkm	c2=0;
467109998Smarkm	mul_add_c(a[2],b[0],c3,c1,c2);
468109998Smarkm	mul_add_c(a[1],b[1],c3,c1,c2);
469109998Smarkm	mul_add_c(a[0],b[2],c3,c1,c2);
470109998Smarkm	r[2]=c3;
471109998Smarkm	c3=0;
472109998Smarkm	mul_add_c(a[0],b[3],c1,c2,c3);
473109998Smarkm	mul_add_c(a[1],b[2],c1,c2,c3);
474109998Smarkm	mul_add_c(a[2],b[1],c1,c2,c3);
475109998Smarkm	mul_add_c(a[3],b[0],c1,c2,c3);
476109998Smarkm	r[3]=c1;
477109998Smarkm	c1=0;
478109998Smarkm	mul_add_c(a[3],b[1],c2,c3,c1);
479109998Smarkm	mul_add_c(a[2],b[2],c2,c3,c1);
480109998Smarkm	mul_add_c(a[1],b[3],c2,c3,c1);
481109998Smarkm	r[4]=c2;
482109998Smarkm	c2=0;
483109998Smarkm	mul_add_c(a[2],b[3],c3,c1,c2);
484109998Smarkm	mul_add_c(a[3],b[2],c3,c1,c2);
485109998Smarkm	r[5]=c3;
486109998Smarkm	c3=0;
487109998Smarkm	mul_add_c(a[3],b[3],c1,c2,c3);
488109998Smarkm	r[6]=c1;
489109998Smarkm	r[7]=c2;
490109998Smarkm	}
491109998Smarkm
492205128Ssimonvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
493109998Smarkm	{
494109998Smarkm	BN_ULONG t1,t2;
495109998Smarkm	BN_ULONG c1,c2,c3;
496109998Smarkm
497109998Smarkm	c1=0;
498109998Smarkm	c2=0;
499109998Smarkm	c3=0;
500109998Smarkm	sqr_add_c(a,0,c1,c2,c3);
501109998Smarkm	r[0]=c1;
502109998Smarkm	c1=0;
503109998Smarkm	sqr_add_c2(a,1,0,c2,c3,c1);
504109998Smarkm	r[1]=c2;
505109998Smarkm	c2=0;
506109998Smarkm	sqr_add_c(a,1,c3,c1,c2);
507109998Smarkm	sqr_add_c2(a,2,0,c3,c1,c2);
508109998Smarkm	r[2]=c3;
509109998Smarkm	c3=0;
510109998Smarkm	sqr_add_c2(a,3,0,c1,c2,c3);
511109998Smarkm	sqr_add_c2(a,2,1,c1,c2,c3);
512109998Smarkm	r[3]=c1;
513109998Smarkm	c1=0;
514109998Smarkm	sqr_add_c(a,2,c2,c3,c1);
515109998Smarkm	sqr_add_c2(a,3,1,c2,c3,c1);
516109998Smarkm	sqr_add_c2(a,4,0,c2,c3,c1);
517109998Smarkm	r[4]=c2;
518109998Smarkm	c2=0;
519109998Smarkm	sqr_add_c2(a,5,0,c3,c1,c2);
520109998Smarkm	sqr_add_c2(a,4,1,c3,c1,c2);
521109998Smarkm	sqr_add_c2(a,3,2,c3,c1,c2);
522109998Smarkm	r[5]=c3;
523109998Smarkm	c3=0;
524109998Smarkm	sqr_add_c(a,3,c1,c2,c3);
525109998Smarkm	sqr_add_c2(a,4,2,c1,c2,c3);
526109998Smarkm	sqr_add_c2(a,5,1,c1,c2,c3);
527109998Smarkm	sqr_add_c2(a,6,0,c1,c2,c3);
528109998Smarkm	r[6]=c1;
529109998Smarkm	c1=0;
530109998Smarkm	sqr_add_c2(a,7,0,c2,c3,c1);
531109998Smarkm	sqr_add_c2(a,6,1,c2,c3,c1);
532109998Smarkm	sqr_add_c2(a,5,2,c2,c3,c1);
533109998Smarkm	sqr_add_c2(a,4,3,c2,c3,c1);
534109998Smarkm	r[7]=c2;
535109998Smarkm	c2=0;
536109998Smarkm	sqr_add_c(a,4,c3,c1,c2);
537109998Smarkm	sqr_add_c2(a,5,3,c3,c1,c2);
538109998Smarkm	sqr_add_c2(a,6,2,c3,c1,c2);
539109998Smarkm	sqr_add_c2(a,7,1,c3,c1,c2);
540109998Smarkm	r[8]=c3;
541109998Smarkm	c3=0;
542109998Smarkm	sqr_add_c2(a,7,2,c1,c2,c3);
543109998Smarkm	sqr_add_c2(a,6,3,c1,c2,c3);
544109998Smarkm	sqr_add_c2(a,5,4,c1,c2,c3);
545109998Smarkm	r[9]=c1;
546109998Smarkm	c1=0;
547109998Smarkm	sqr_add_c(a,5,c2,c3,c1);
548109998Smarkm	sqr_add_c2(a,6,4,c2,c3,c1);
549109998Smarkm	sqr_add_c2(a,7,3,c2,c3,c1);
550109998Smarkm	r[10]=c2;
551109998Smarkm	c2=0;
552109998Smarkm	sqr_add_c2(a,7,4,c3,c1,c2);
553109998Smarkm	sqr_add_c2(a,6,5,c3,c1,c2);
554109998Smarkm	r[11]=c3;
555109998Smarkm	c3=0;
556109998Smarkm	sqr_add_c(a,6,c1,c2,c3);
557109998Smarkm	sqr_add_c2(a,7,5,c1,c2,c3);
558109998Smarkm	r[12]=c1;
559109998Smarkm	c1=0;
560109998Smarkm	sqr_add_c2(a,7,6,c2,c3,c1);
561109998Smarkm	r[13]=c2;
562109998Smarkm	c2=0;
563109998Smarkm	sqr_add_c(a,7,c3,c1,c2);
564109998Smarkm	r[14]=c3;
565109998Smarkm	r[15]=c1;
566109998Smarkm	}
567109998Smarkm
568205128Ssimonvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
569109998Smarkm	{
570109998Smarkm	BN_ULONG t1,t2;
571109998Smarkm	BN_ULONG c1,c2,c3;
572109998Smarkm
573109998Smarkm	c1=0;
574109998Smarkm	c2=0;
575109998Smarkm	c3=0;
576109998Smarkm	sqr_add_c(a,0,c1,c2,c3);
577109998Smarkm	r[0]=c1;
578109998Smarkm	c1=0;
579109998Smarkm	sqr_add_c2(a,1,0,c2,c3,c1);
580109998Smarkm	r[1]=c2;
581109998Smarkm	c2=0;
582109998Smarkm	sqr_add_c(a,1,c3,c1,c2);
583109998Smarkm	sqr_add_c2(a,2,0,c3,c1,c2);
584109998Smarkm	r[2]=c3;
585109998Smarkm	c3=0;
586109998Smarkm	sqr_add_c2(a,3,0,c1,c2,c3);
587109998Smarkm	sqr_add_c2(a,2,1,c1,c2,c3);
588109998Smarkm	r[3]=c1;
589109998Smarkm	c1=0;
590109998Smarkm	sqr_add_c(a,2,c2,c3,c1);
591109998Smarkm	sqr_add_c2(a,3,1,c2,c3,c1);
592109998Smarkm	r[4]=c2;
593109998Smarkm	c2=0;
594109998Smarkm	sqr_add_c2(a,3,2,c3,c1,c2);
595109998Smarkm	r[5]=c3;
596109998Smarkm	c3=0;
597109998Smarkm	sqr_add_c(a,3,c1,c2,c3);
598109998Smarkm	r[6]=c1;
599109998Smarkm	r[7]=c2;
600109998Smarkm	}
601162911Ssimon#endif
602