1205128Ssimon#include "../bn_lcl.h"
2238405Sjkim#if !(defined(__GNUC__) && __GNUC__>=2)
3162911Ssimon# include "../bn_asm.c"	/* kind of dirty hack for Sun Studio */
4162911Ssimon#else
/*
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see, it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *	                  sign    verify    sign/s verify/s
 *	rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
 *	rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
 *	rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
 *	rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
 *	                  sign    verify    sign/s verify/s
 *	dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
 *	dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
 *	dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *	                  sign    verify    sign/s verify/s
 *	rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
 *	rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
 *	rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
 *	rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
 *	                  sign    verify    sign/s verify/s
 *	dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
 *	dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
 *	dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
 *
 *    For reference, the IA-32 assembler implementation performs
 *    very much like 64-bit code compiled with no-asm on the same
 *    machine.
 */
57109998Smarkm
/*
 * Every routine below operates on 64-bit words.  "unsigned long" is only
 * 64 bits wide on LP64 targets, so Win64 (LLP64) and ILP32-on-x86_64 (x32)
 * builds must use "unsigned long long" instead.
 */
#if defined(_WIN64) || !defined(__LP64__)
#define BN_ULONG unsigned long long
#else
#define BN_ULONG unsigned long
#endif

/*
 * Discard any definitions of these helper macros that are already in
 * effect (presumably from bn_lcl.h) so the inline-assembler versions
 * defined below take their place.
 */
#undef mul
#undef mul_add
#undef sqr
/*
 * "m"(a), "+m"(r)	is the way to favor DirectPath micro-code;
 * "g"(0)		lets the compiler decide where it wants to
 *			keep the value of zero;
 */
/*
 * mul_add(r,a,word,carry)
 *
 * Computes the 128-bit value  r + a*word + carry,  stores its low 64 bits
 * back into r and leaves its high 64 bits in carry.
 *
 *   r      memory lvalue, read and written ("+m")
 *   a      memory lvalue, read only ("m")
 *   word   64-bit multiplier, loaded into %rax
 *   carry  lvalue holding the incoming carry; receives the outgoing one
 *
 * The three steps are: (high:low) = word*a; carry += low with the carry
 * flag folded into high; r += carry with the carry flag folded into high.
 * The locals are called high and low, so do not pass arguments with those
 * names.
 */
#define mul_add(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	__asm__ ("mulq %3"		\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"m"(a)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+m"(r),"+d"(high)	\
		: "r"(carry),"g"(0)	\
		: "cc");		\
	carry=high;			\
	} while (0)
89109998Smarkm
/*
 * mul(r,a,word,carry)
 *
 * Computes the 128-bit value  a*word + carry,  stores its low 64 bits in r
 * and leaves its high 64 bits in carry.
 *
 *   r      lvalue that receives the low word (plain C assignment)
 *   a      64-bit multiplicand
 *   word   64-bit multiplier, loaded into %rax
 *   carry  lvalue holding the incoming carry; receives the outgoing one
 *
 * The multiplicand uses the "rm" constraint: mulq has no immediate form,
 * so a constraint that admits immediates (such as "g") could make the
 * compiler emit an instruction the assembler rejects.
 * The locals are called high and low, so do not pass arguments with those
 * names.
 */
#define mul(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	__asm__ ("mulq %3"		\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"rm"(a)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	(r)=carry, carry=high;		\
	} while (0)
102109998Smarkm
/*
 * sqr(r0,r1,a)
 *
 * Squares the 64-bit value a: the low 64 bits of a*a go to r0 and the high
 * 64 bits to r1.  a is loaded into %rax and multiplied by itself.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement;
 * the caller supplies the terminating semicolon.
 */
#define sqr(r0,r1,a) do {		\
	__asm__ ("mulq %2"		\
		: "=a"(r0),"=d"(r1)	\
		: "a"(a)		\
		: "cc");		\
	} while (0)
108109998Smarkm
109205128SsimonBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
110109998Smarkm	{
111109998Smarkm	BN_ULONG c1=0;
112109998Smarkm
113109998Smarkm	if (num <= 0) return(c1);
114109998Smarkm
115109998Smarkm	while (num&~3)
116109998Smarkm		{
117109998Smarkm		mul_add(rp[0],ap[0],w,c1);
118109998Smarkm		mul_add(rp[1],ap[1],w,c1);
119109998Smarkm		mul_add(rp[2],ap[2],w,c1);
120109998Smarkm		mul_add(rp[3],ap[3],w,c1);
121109998Smarkm		ap+=4; rp+=4; num-=4;
122109998Smarkm		}
123109998Smarkm	if (num)
124109998Smarkm		{
125109998Smarkm		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
126109998Smarkm		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
127109998Smarkm		mul_add(rp[2],ap[2],w,c1); return c1;
128109998Smarkm		}
129109998Smarkm
130109998Smarkm	return(c1);
131109998Smarkm	}
132109998Smarkm
133205128SsimonBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
134109998Smarkm	{
135109998Smarkm	BN_ULONG c1=0;
136109998Smarkm
137109998Smarkm	if (num <= 0) return(c1);
138109998Smarkm
139109998Smarkm	while (num&~3)
140109998Smarkm		{
141109998Smarkm		mul(rp[0],ap[0],w,c1);
142109998Smarkm		mul(rp[1],ap[1],w,c1);
143109998Smarkm		mul(rp[2],ap[2],w,c1);
144109998Smarkm		mul(rp[3],ap[3],w,c1);
145109998Smarkm		ap+=4; rp+=4; num-=4;
146109998Smarkm		}
147109998Smarkm	if (num)
148109998Smarkm		{
149109998Smarkm		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
150109998Smarkm		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
151109998Smarkm		mul(rp[2],ap[2],w,c1);
152109998Smarkm		}
153109998Smarkm	return(c1);
154109998Smarkm	}
155109998Smarkm
156205128Ssimonvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
157109998Smarkm        {
158109998Smarkm	if (n <= 0) return;
159109998Smarkm
160109998Smarkm	while (n&~3)
161109998Smarkm		{
162109998Smarkm		sqr(r[0],r[1],a[0]);
163109998Smarkm		sqr(r[2],r[3],a[1]);
164109998Smarkm		sqr(r[4],r[5],a[2]);
165109998Smarkm		sqr(r[6],r[7],a[3]);
166109998Smarkm		a+=4; r+=8; n-=4;
167109998Smarkm		}
168109998Smarkm	if (n)
169109998Smarkm		{
170109998Smarkm		sqr(r[0],r[1],a[0]); if (--n == 0) return;
171109998Smarkm		sqr(r[2],r[3],a[1]); if (--n == 0) return;
172109998Smarkm		sqr(r[4],r[5],a[2]);
173109998Smarkm		}
174109998Smarkm	}
175109998Smarkm
176109998SmarkmBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
177109998Smarkm{	BN_ULONG ret,waste;
178109998Smarkm
179127128Snectar	asm ("divq	%4"
180109998Smarkm		: "=a"(ret),"=d"(waste)
181109998Smarkm		: "a"(l),"d"(h),"g"(d)
182109998Smarkm		: "cc");
183109998Smarkm
184109998Smarkm	return ret;
185109998Smarkm}
186109998Smarkm
187205128SsimonBN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
188160814Ssimon{ BN_ULONG ret=0,i=0;
189109998Smarkm
190109998Smarkm	if (n <= 0) return 0;
191109998Smarkm
192279264Sdelphij	asm volatile (
193109998Smarkm	"	subq	%2,%2		\n"
194238405Sjkim	".p2align 4			\n"
195109998Smarkm	"1:	movq	(%4,%2,8),%0	\n"
196109998Smarkm	"	adcq	(%5,%2,8),%0	\n"
197109998Smarkm	"	movq	%0,(%3,%2,8)	\n"
198109998Smarkm	"	leaq	1(%2),%2	\n"
199109998Smarkm	"	loop	1b		\n"
200109998Smarkm	"	sbbq	%0,%0		\n"
201160814Ssimon		: "=&a"(ret),"+c"(n),"=&r"(i)
202109998Smarkm		: "r"(rp),"r"(ap),"r"(bp)
203279264Sdelphij		: "cc", "memory"
204109998Smarkm	);
205109998Smarkm
206109998Smarkm  return ret&1;
207109998Smarkm}
208109998Smarkm
209109998Smarkm#ifndef SIMICS
210205128SsimonBN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
211160814Ssimon{ BN_ULONG ret=0,i=0;
212109998Smarkm
213109998Smarkm	if (n <= 0) return 0;
214109998Smarkm
215279264Sdelphij	asm volatile (
216109998Smarkm	"	subq	%2,%2		\n"
217238405Sjkim	".p2align 4			\n"
218109998Smarkm	"1:	movq	(%4,%2,8),%0	\n"
219109998Smarkm	"	sbbq	(%5,%2,8),%0	\n"
220109998Smarkm	"	movq	%0,(%3,%2,8)	\n"
221109998Smarkm	"	leaq	1(%2),%2	\n"
222109998Smarkm	"	loop	1b		\n"
223109998Smarkm	"	sbbq	%0,%0		\n"
224160814Ssimon		: "=&a"(ret),"+c"(n),"=&r"(i)
225109998Smarkm		: "r"(rp),"r"(ap),"r"(bp)
226279264Sdelphij		: "cc", "memory"
227109998Smarkm	);
228109998Smarkm
229109998Smarkm  return ret&1;
230109998Smarkm}
231109998Smarkm#else
232109998Smarkm/* Simics 1.4<7 has buggy sbbq:-( */
233109998Smarkm#define BN_MASK2 0xffffffffffffffffL
234109998SmarkmBN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
235109998Smarkm        {
236109998Smarkm	BN_ULONG t1,t2;
237109998Smarkm	int c=0;
238109998Smarkm
239109998Smarkm	if (n <= 0) return((BN_ULONG)0);
240109998Smarkm
241109998Smarkm	for (;;)
242109998Smarkm		{
243109998Smarkm		t1=a[0]; t2=b[0];
244109998Smarkm		r[0]=(t1-t2-c)&BN_MASK2;
245109998Smarkm		if (t1 != t2) c=(t1 < t2);
246109998Smarkm		if (--n <= 0) break;
247109998Smarkm
248109998Smarkm		t1=a[1]; t2=b[1];
249109998Smarkm		r[1]=(t1-t2-c)&BN_MASK2;
250109998Smarkm		if (t1 != t2) c=(t1 < t2);
251109998Smarkm		if (--n <= 0) break;
252109998Smarkm
253109998Smarkm		t1=a[2]; t2=b[2];
254109998Smarkm		r[2]=(t1-t2-c)&BN_MASK2;
255109998Smarkm		if (t1 != t2) c=(t1 < t2);
256109998Smarkm		if (--n <= 0) break;
257109998Smarkm
258109998Smarkm		t1=a[3]; t2=b[3];
259109998Smarkm		r[3]=(t1-t2-c)&BN_MASK2;
260109998Smarkm		if (t1 != t2) c=(t1 < t2);
261109998Smarkm		if (--n <= 0) break;
262109998Smarkm
263109998Smarkm		a+=4;
264109998Smarkm		b+=4;
265109998Smarkm		r+=4;
266109998Smarkm		}
267109998Smarkm	return(c);
268109998Smarkm	}
269109998Smarkm#endif
270109998Smarkm
/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */

/*
 * Keep in mind that carrying into the high part of a multiplication result
 * cannot overflow, because the high part can never be all-ones.
 */
#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	t1 = BN_UMULT_HIGH(ta,tb);	\
	t0 = ta * tb;			\
	c0 += t0; t2 = t1+((c0<t0)?1:0);\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	c0 += t0; t1 += (c0<t0)?1:0;	\
	c1 += t1; c2 += (c1<t1)?1:0;	\
	}
#else
/*
 * All three macros below use two scratch variables named t1 and t2 that
 * must be declared (as 64-bit words) in the scope where the macro is
 * expanded.  c0, c1 and c2 must be lvalues; together they form the
 * three-word accumulator (c2,c1,c0), c0 being the least significant word.
 */

/*
 * mul_add_c(a,b,c0,c1,c2): (c2,c1,c0) += a*b.
 * a is loaded into %rax; b must be a memory operand.
 * (t2:t1) = a*b; c0 += t1 with the carry folded into t2; then c1 += t2
 * with the carry folded into c2.
 */
#define mul_add_c(a,b,c0,c1,c2)	do {	\
	__asm__ ("mulq %3"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)

/*
 * sqr_add_c(a,i,c0,c1,c2): (c2,c1,c0) += a[i]*a[i].
 * a is an array (or pointer) expression and i the element index.
 */
#define sqr_add_c(a,i,c0,c1,c2)	do {	\
	__asm__ ("mulq %2"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"((a)[i])		\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)

/*
 * mul_add_c2(a,b,c0,c1,c2): (c2,c1,c0) += 2*a*b.
 * The product (t2:t1) is added to the accumulator twice rather than being
 * doubled first, so no bit of the product is lost.
 */
#define mul_add_c2(a,b,c0,c1,c2) do {	\
	__asm__ ("mulq %3"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	} while (0)
#endif
345109998Smarkm
/*
 * sqr_add_c2(a,i,j,c0,c1,c2): (c2,c1,c0) += 2*a[i]*a[j].
 * Thin wrapper that forwards the two array elements to mul_add_c2.
 */
#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
348109998Smarkm
349109998Smarkmvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
350109998Smarkm	{
351109998Smarkm	BN_ULONG t1,t2;
352109998Smarkm	BN_ULONG c1,c2,c3;
353109998Smarkm
354109998Smarkm	c1=0;
355109998Smarkm	c2=0;
356109998Smarkm	c3=0;
357109998Smarkm	mul_add_c(a[0],b[0],c1,c2,c3);
358109998Smarkm	r[0]=c1;
359109998Smarkm	c1=0;
360109998Smarkm	mul_add_c(a[0],b[1],c2,c3,c1);
361109998Smarkm	mul_add_c(a[1],b[0],c2,c3,c1);
362109998Smarkm	r[1]=c2;
363109998Smarkm	c2=0;
364109998Smarkm	mul_add_c(a[2],b[0],c3,c1,c2);
365109998Smarkm	mul_add_c(a[1],b[1],c3,c1,c2);
366109998Smarkm	mul_add_c(a[0],b[2],c3,c1,c2);
367109998Smarkm	r[2]=c3;
368109998Smarkm	c3=0;
369109998Smarkm	mul_add_c(a[0],b[3],c1,c2,c3);
370109998Smarkm	mul_add_c(a[1],b[2],c1,c2,c3);
371109998Smarkm	mul_add_c(a[2],b[1],c1,c2,c3);
372109998Smarkm	mul_add_c(a[3],b[0],c1,c2,c3);
373109998Smarkm	r[3]=c1;
374109998Smarkm	c1=0;
375109998Smarkm	mul_add_c(a[4],b[0],c2,c3,c1);
376109998Smarkm	mul_add_c(a[3],b[1],c2,c3,c1);
377109998Smarkm	mul_add_c(a[2],b[2],c2,c3,c1);
378109998Smarkm	mul_add_c(a[1],b[3],c2,c3,c1);
379109998Smarkm	mul_add_c(a[0],b[4],c2,c3,c1);
380109998Smarkm	r[4]=c2;
381109998Smarkm	c2=0;
382109998Smarkm	mul_add_c(a[0],b[5],c3,c1,c2);
383109998Smarkm	mul_add_c(a[1],b[4],c3,c1,c2);
384109998Smarkm	mul_add_c(a[2],b[3],c3,c1,c2);
385109998Smarkm	mul_add_c(a[3],b[2],c3,c1,c2);
386109998Smarkm	mul_add_c(a[4],b[1],c3,c1,c2);
387109998Smarkm	mul_add_c(a[5],b[0],c3,c1,c2);
388109998Smarkm	r[5]=c3;
389109998Smarkm	c3=0;
390109998Smarkm	mul_add_c(a[6],b[0],c1,c2,c3);
391109998Smarkm	mul_add_c(a[5],b[1],c1,c2,c3);
392109998Smarkm	mul_add_c(a[4],b[2],c1,c2,c3);
393109998Smarkm	mul_add_c(a[3],b[3],c1,c2,c3);
394109998Smarkm	mul_add_c(a[2],b[4],c1,c2,c3);
395109998Smarkm	mul_add_c(a[1],b[5],c1,c2,c3);
396109998Smarkm	mul_add_c(a[0],b[6],c1,c2,c3);
397109998Smarkm	r[6]=c1;
398109998Smarkm	c1=0;
399109998Smarkm	mul_add_c(a[0],b[7],c2,c3,c1);
400109998Smarkm	mul_add_c(a[1],b[6],c2,c3,c1);
401109998Smarkm	mul_add_c(a[2],b[5],c2,c3,c1);
402109998Smarkm	mul_add_c(a[3],b[4],c2,c3,c1);
403109998Smarkm	mul_add_c(a[4],b[3],c2,c3,c1);
404109998Smarkm	mul_add_c(a[5],b[2],c2,c3,c1);
405109998Smarkm	mul_add_c(a[6],b[1],c2,c3,c1);
406109998Smarkm	mul_add_c(a[7],b[0],c2,c3,c1);
407109998Smarkm	r[7]=c2;
408109998Smarkm	c2=0;
409109998Smarkm	mul_add_c(a[7],b[1],c3,c1,c2);
410109998Smarkm	mul_add_c(a[6],b[2],c3,c1,c2);
411109998Smarkm	mul_add_c(a[5],b[3],c3,c1,c2);
412109998Smarkm	mul_add_c(a[4],b[4],c3,c1,c2);
413109998Smarkm	mul_add_c(a[3],b[5],c3,c1,c2);
414109998Smarkm	mul_add_c(a[2],b[6],c3,c1,c2);
415109998Smarkm	mul_add_c(a[1],b[7],c3,c1,c2);
416109998Smarkm	r[8]=c3;
417109998Smarkm	c3=0;
418109998Smarkm	mul_add_c(a[2],b[7],c1,c2,c3);
419109998Smarkm	mul_add_c(a[3],b[6],c1,c2,c3);
420109998Smarkm	mul_add_c(a[4],b[5],c1,c2,c3);
421109998Smarkm	mul_add_c(a[5],b[4],c1,c2,c3);
422109998Smarkm	mul_add_c(a[6],b[3],c1,c2,c3);
423109998Smarkm	mul_add_c(a[7],b[2],c1,c2,c3);
424109998Smarkm	r[9]=c1;
425109998Smarkm	c1=0;
426109998Smarkm	mul_add_c(a[7],b[3],c2,c3,c1);
427109998Smarkm	mul_add_c(a[6],b[4],c2,c3,c1);
428109998Smarkm	mul_add_c(a[5],b[5],c2,c3,c1);
429109998Smarkm	mul_add_c(a[4],b[6],c2,c3,c1);
430109998Smarkm	mul_add_c(a[3],b[7],c2,c3,c1);
431109998Smarkm	r[10]=c2;
432109998Smarkm	c2=0;
433109998Smarkm	mul_add_c(a[4],b[7],c3,c1,c2);
434109998Smarkm	mul_add_c(a[5],b[6],c3,c1,c2);
435109998Smarkm	mul_add_c(a[6],b[5],c3,c1,c2);
436109998Smarkm	mul_add_c(a[7],b[4],c3,c1,c2);
437109998Smarkm	r[11]=c3;
438109998Smarkm	c3=0;
439109998Smarkm	mul_add_c(a[7],b[5],c1,c2,c3);
440109998Smarkm	mul_add_c(a[6],b[6],c1,c2,c3);
441109998Smarkm	mul_add_c(a[5],b[7],c1,c2,c3);
442109998Smarkm	r[12]=c1;
443109998Smarkm	c1=0;
444109998Smarkm	mul_add_c(a[6],b[7],c2,c3,c1);
445109998Smarkm	mul_add_c(a[7],b[6],c2,c3,c1);
446109998Smarkm	r[13]=c2;
447109998Smarkm	c2=0;
448109998Smarkm	mul_add_c(a[7],b[7],c3,c1,c2);
449109998Smarkm	r[14]=c3;
450109998Smarkm	r[15]=c1;
451109998Smarkm	}
452109998Smarkm
453109998Smarkmvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
454109998Smarkm	{
455109998Smarkm	BN_ULONG t1,t2;
456109998Smarkm	BN_ULONG c1,c2,c3;
457109998Smarkm
458109998Smarkm	c1=0;
459109998Smarkm	c2=0;
460109998Smarkm	c3=0;
461109998Smarkm	mul_add_c(a[0],b[0],c1,c2,c3);
462109998Smarkm	r[0]=c1;
463109998Smarkm	c1=0;
464109998Smarkm	mul_add_c(a[0],b[1],c2,c3,c1);
465109998Smarkm	mul_add_c(a[1],b[0],c2,c3,c1);
466109998Smarkm	r[1]=c2;
467109998Smarkm	c2=0;
468109998Smarkm	mul_add_c(a[2],b[0],c3,c1,c2);
469109998Smarkm	mul_add_c(a[1],b[1],c3,c1,c2);
470109998Smarkm	mul_add_c(a[0],b[2],c3,c1,c2);
471109998Smarkm	r[2]=c3;
472109998Smarkm	c3=0;
473109998Smarkm	mul_add_c(a[0],b[3],c1,c2,c3);
474109998Smarkm	mul_add_c(a[1],b[2],c1,c2,c3);
475109998Smarkm	mul_add_c(a[2],b[1],c1,c2,c3);
476109998Smarkm	mul_add_c(a[3],b[0],c1,c2,c3);
477109998Smarkm	r[3]=c1;
478109998Smarkm	c1=0;
479109998Smarkm	mul_add_c(a[3],b[1],c2,c3,c1);
480109998Smarkm	mul_add_c(a[2],b[2],c2,c3,c1);
481109998Smarkm	mul_add_c(a[1],b[3],c2,c3,c1);
482109998Smarkm	r[4]=c2;
483109998Smarkm	c2=0;
484109998Smarkm	mul_add_c(a[2],b[3],c3,c1,c2);
485109998Smarkm	mul_add_c(a[3],b[2],c3,c1,c2);
486109998Smarkm	r[5]=c3;
487109998Smarkm	c3=0;
488109998Smarkm	mul_add_c(a[3],b[3],c1,c2,c3);
489109998Smarkm	r[6]=c1;
490109998Smarkm	r[7]=c2;
491109998Smarkm	}
492109998Smarkm
493205128Ssimonvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
494109998Smarkm	{
495109998Smarkm	BN_ULONG t1,t2;
496109998Smarkm	BN_ULONG c1,c2,c3;
497109998Smarkm
498109998Smarkm	c1=0;
499109998Smarkm	c2=0;
500109998Smarkm	c3=0;
501109998Smarkm	sqr_add_c(a,0,c1,c2,c3);
502109998Smarkm	r[0]=c1;
503109998Smarkm	c1=0;
504109998Smarkm	sqr_add_c2(a,1,0,c2,c3,c1);
505109998Smarkm	r[1]=c2;
506109998Smarkm	c2=0;
507109998Smarkm	sqr_add_c(a,1,c3,c1,c2);
508109998Smarkm	sqr_add_c2(a,2,0,c3,c1,c2);
509109998Smarkm	r[2]=c3;
510109998Smarkm	c3=0;
511109998Smarkm	sqr_add_c2(a,3,0,c1,c2,c3);
512109998Smarkm	sqr_add_c2(a,2,1,c1,c2,c3);
513109998Smarkm	r[3]=c1;
514109998Smarkm	c1=0;
515109998Smarkm	sqr_add_c(a,2,c2,c3,c1);
516109998Smarkm	sqr_add_c2(a,3,1,c2,c3,c1);
517109998Smarkm	sqr_add_c2(a,4,0,c2,c3,c1);
518109998Smarkm	r[4]=c2;
519109998Smarkm	c2=0;
520109998Smarkm	sqr_add_c2(a,5,0,c3,c1,c2);
521109998Smarkm	sqr_add_c2(a,4,1,c3,c1,c2);
522109998Smarkm	sqr_add_c2(a,3,2,c3,c1,c2);
523109998Smarkm	r[5]=c3;
524109998Smarkm	c3=0;
525109998Smarkm	sqr_add_c(a,3,c1,c2,c3);
526109998Smarkm	sqr_add_c2(a,4,2,c1,c2,c3);
527109998Smarkm	sqr_add_c2(a,5,1,c1,c2,c3);
528109998Smarkm	sqr_add_c2(a,6,0,c1,c2,c3);
529109998Smarkm	r[6]=c1;
530109998Smarkm	c1=0;
531109998Smarkm	sqr_add_c2(a,7,0,c2,c3,c1);
532109998Smarkm	sqr_add_c2(a,6,1,c2,c3,c1);
533109998Smarkm	sqr_add_c2(a,5,2,c2,c3,c1);
534109998Smarkm	sqr_add_c2(a,4,3,c2,c3,c1);
535109998Smarkm	r[7]=c2;
536109998Smarkm	c2=0;
537109998Smarkm	sqr_add_c(a,4,c3,c1,c2);
538109998Smarkm	sqr_add_c2(a,5,3,c3,c1,c2);
539109998Smarkm	sqr_add_c2(a,6,2,c3,c1,c2);
540109998Smarkm	sqr_add_c2(a,7,1,c3,c1,c2);
541109998Smarkm	r[8]=c3;
542109998Smarkm	c3=0;
543109998Smarkm	sqr_add_c2(a,7,2,c1,c2,c3);
544109998Smarkm	sqr_add_c2(a,6,3,c1,c2,c3);
545109998Smarkm	sqr_add_c2(a,5,4,c1,c2,c3);
546109998Smarkm	r[9]=c1;
547109998Smarkm	c1=0;
548109998Smarkm	sqr_add_c(a,5,c2,c3,c1);
549109998Smarkm	sqr_add_c2(a,6,4,c2,c3,c1);
550109998Smarkm	sqr_add_c2(a,7,3,c2,c3,c1);
551109998Smarkm	r[10]=c2;
552109998Smarkm	c2=0;
553109998Smarkm	sqr_add_c2(a,7,4,c3,c1,c2);
554109998Smarkm	sqr_add_c2(a,6,5,c3,c1,c2);
555109998Smarkm	r[11]=c3;
556109998Smarkm	c3=0;
557109998Smarkm	sqr_add_c(a,6,c1,c2,c3);
558109998Smarkm	sqr_add_c2(a,7,5,c1,c2,c3);
559109998Smarkm	r[12]=c1;
560109998Smarkm	c1=0;
561109998Smarkm	sqr_add_c2(a,7,6,c2,c3,c1);
562109998Smarkm	r[13]=c2;
563109998Smarkm	c2=0;
564109998Smarkm	sqr_add_c(a,7,c3,c1,c2);
565109998Smarkm	r[14]=c3;
566109998Smarkm	r[15]=c1;
567109998Smarkm	}
568109998Smarkm
569205128Ssimonvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
570109998Smarkm	{
571109998Smarkm	BN_ULONG t1,t2;
572109998Smarkm	BN_ULONG c1,c2,c3;
573109998Smarkm
574109998Smarkm	c1=0;
575109998Smarkm	c2=0;
576109998Smarkm	c3=0;
577109998Smarkm	sqr_add_c(a,0,c1,c2,c3);
578109998Smarkm	r[0]=c1;
579109998Smarkm	c1=0;
580109998Smarkm	sqr_add_c2(a,1,0,c2,c3,c1);
581109998Smarkm	r[1]=c2;
582109998Smarkm	c2=0;
583109998Smarkm	sqr_add_c(a,1,c3,c1,c2);
584109998Smarkm	sqr_add_c2(a,2,0,c3,c1,c2);
585109998Smarkm	r[2]=c3;
586109998Smarkm	c3=0;
587109998Smarkm	sqr_add_c2(a,3,0,c1,c2,c3);
588109998Smarkm	sqr_add_c2(a,2,1,c1,c2,c3);
589109998Smarkm	r[3]=c1;
590109998Smarkm	c1=0;
591109998Smarkm	sqr_add_c(a,2,c2,c3,c1);
592109998Smarkm	sqr_add_c2(a,3,1,c2,c3,c1);
593109998Smarkm	r[4]=c2;
594109998Smarkm	c2=0;
595109998Smarkm	sqr_add_c2(a,3,2,c3,c1,c2);
596109998Smarkm	r[5]=c3;
597109998Smarkm	c3=0;
598109998Smarkm	sqr_add_c(a,3,c1,c2,c3);
599109998Smarkm	r[6]=c1;
600109998Smarkm	r[7]=c2;
601109998Smarkm	}
602162911Ssimon#endif
603