/* x86_64-gcc.c revision 160814 */
1/*
2 * x86_64 BIGNUM accelerator version 0.1, December 2002.
3 *
4 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 * project.
6 *
7 * Rights for redistribution and usage in source and binary forms are
8 * granted according to the OpenSSL license. Warranty of any kind is
9 * disclaimed.
10 *
11 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
12 *    versions, like 1.0...
13 * A. Well, that's because this code is basically a quick-n-dirty
14 *    proof-of-concept hack. As you can see it's implemented with
15 *    inline assembler, which means that you're bound to GCC and that
16 *    there might be enough room for further improvement.
17 *
18 * Q. Why inline assembler?
19 * A. x86_64 features own ABI which I'm not familiar with. This is
20 *    why I decided to let the compiler take care of subroutine
21 *    prologue/epilogue as well as register allocation. For reference.
22 *    Win64 implements different ABI for AMD64, different from Linux.
23 *
24 * Q. How much faster does it get?
25 * A. 'apps/openssl speed rsa dsa' output with no-asm:
26 *
27 *	                  sign    verify    sign/s verify/s
28 *	rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
29 *	rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
30 *	rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
31 *	rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
32 *	                  sign    verify    sign/s verify/s
33 *	dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
34 *	dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
35 *	dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
36 *
37 *    'apps/openssl speed rsa dsa' output with this module:
38 *
39 *	                  sign    verify    sign/s verify/s
40 *	rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
41 *	rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
42 *	rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
43 *	rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
44 *	                  sign    verify    sign/s verify/s
45 *	dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
46 *	dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
47 *	dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
48 *
49 *    For the reference. IA-32 assembler implementation performs
50 *    very much like 64-bit code compiled with no-asm on the same
51 *    machine.
52 */
53
/* BN_ULONG: one bignum limb -- unsigned long, which is 64 bits wide on
 * the LP64 x86_64 targets this inline-asm file is written for. */
#define BN_ULONG unsigned long
55
56/*
 * "m"(a), "+m"(r)	is the way to favor DirectPath µ-code;
58 * "g"(0)		let the compiler to decide where does it
59 *			want to keep the value of zero;
60 */
/*
 * mul_add(r,a,word,carry):  r += a*word + carry;
 * carry receives the resulting high word.
 *
 * Step 1: (high:low) = a * word  (mulq; `a` taken from memory).
 * Step 2: carry += low; high += carry-out.
 * Step 3: r += carry; high += carry-out; carry = high.
 */
#define mul_add(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	asm ("mulq %3"			\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"m"(a)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+m"(r),"+d"(high)	\
		: "r"(carry),"g"(0)	\
		: "cc");		\
	carry=high;			\
	} while (0)
77
/*
 * mul(r,a,word,carry):  (carry:r) = a*word + carry,
 * i.e. r = low(a*word) + carry and carry = high(a*word) + carry-out.
 */
#define mul(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	asm ("mulq %3"			\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"g"(a)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	(r)=carry, carry=high;		\
	} while (0)
90
/*
 * sqr(r0,r1,a):  (r1:r0) = a*a  (r0 = low word, r1 = high word);
 * mulq with rax as both factors.
 * NOTE(review): not wrapped in do-while and already carries a trailing
 * ';', so it must be used as a full statement -- as the callers below do.
 */
#define sqr(r0,r1,a)			\
	asm ("mulq %2"			\
		: "=a"(r0),"=d"(r1)	\
		: "a"(a)		\
		: "cc");
96
97BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
98	{
99	BN_ULONG c1=0;
100
101	if (num <= 0) return(c1);
102
103	while (num&~3)
104		{
105		mul_add(rp[0],ap[0],w,c1);
106		mul_add(rp[1],ap[1],w,c1);
107		mul_add(rp[2],ap[2],w,c1);
108		mul_add(rp[3],ap[3],w,c1);
109		ap+=4; rp+=4; num-=4;
110		}
111	if (num)
112		{
113		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
114		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
115		mul_add(rp[2],ap[2],w,c1); return c1;
116		}
117
118	return(c1);
119	}
120
121BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
122	{
123	BN_ULONG c1=0;
124
125	if (num <= 0) return(c1);
126
127	while (num&~3)
128		{
129		mul(rp[0],ap[0],w,c1);
130		mul(rp[1],ap[1],w,c1);
131		mul(rp[2],ap[2],w,c1);
132		mul(rp[3],ap[3],w,c1);
133		ap+=4; rp+=4; num-=4;
134		}
135	if (num)
136		{
137		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
138		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
139		mul(rp[2],ap[2],w,c1);
140		}
141	return(c1);
142	}
143
144void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
145        {
146	if (n <= 0) return;
147
148	while (n&~3)
149		{
150		sqr(r[0],r[1],a[0]);
151		sqr(r[2],r[3],a[1]);
152		sqr(r[4],r[5],a[2]);
153		sqr(r[6],r[7],a[3]);
154		a+=4; r+=8; n-=4;
155		}
156	if (n)
157		{
158		sqr(r[0],r[1],a[0]); if (--n == 0) return;
159		sqr(r[2],r[3],a[1]); if (--n == 0) return;
160		sqr(r[4],r[5],a[2]);
161		}
162	}
163
/*
 * bn_div_words -- 128-by-64-bit division: returns the quotient of the
 * double word (h:l) divided by d; the remainder (left in %rdx) is
 * discarded via `waste`.
 *
 * NOTE(review): divq raises #DE when d == 0 or when the quotient does
 * not fit in 64 bits -- callers presumably guarantee h < d; confirm.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{	BN_ULONG ret,waste;

	/* divq %4: divide %rdx:%rax (= h:l) by d; quotient -> %rax, remainder -> %rdx */
	asm ("divq	%4"
		: "=a"(ret),"=d"(waste)
		: "a"(l),"d"(h),"g"(d)
		: "cc");

	return ret;
}
174
175BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
176{ BN_ULONG ret=0,i=0;
177
178	if (n <= 0) return 0;
179
180	asm (
181	"	subq	%2,%2		\n"
182	".align 16			\n"
183	"1:	movq	(%4,%2,8),%0	\n"
184	"	adcq	(%5,%2,8),%0	\n"
185	"	movq	%0,(%3,%2,8)	\n"
186	"	leaq	1(%2),%2	\n"
187	"	loop	1b		\n"
188	"	sbbq	%0,%0		\n"
189		: "=&a"(ret),"+c"(n),"=&r"(i)
190		: "r"(rp),"r"(ap),"r"(bp)
191		: "cc"
192	);
193
194  return ret&1;
195}
196
197#ifndef SIMICS
198BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
199{ BN_ULONG ret=0,i=0;
200
201	if (n <= 0) return 0;
202
203	asm (
204	"	subq	%2,%2		\n"
205	".align 16			\n"
206	"1:	movq	(%4,%2,8),%0	\n"
207	"	sbbq	(%5,%2,8),%0	\n"
208	"	movq	%0,(%3,%2,8)	\n"
209	"	leaq	1(%2),%2	\n"
210	"	loop	1b		\n"
211	"	sbbq	%0,%0		\n"
212		: "=&a"(ret),"+c"(n),"=&r"(i)
213		: "r"(rp),"r"(ap),"r"(bp)
214		: "cc"
215	);
216
217  return ret&1;
218}
219#else
220/* Simics 1.4<7 has buggy sbbq:-( */
221#define BN_MASK2 0xffffffffffffffffL
222BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
223        {
224	BN_ULONG t1,t2;
225	int c=0;
226
227	if (n <= 0) return((BN_ULONG)0);
228
229	for (;;)
230		{
231		t1=a[0]; t2=b[0];
232		r[0]=(t1-t2-c)&BN_MASK2;
233		if (t1 != t2) c=(t1 < t2);
234		if (--n <= 0) break;
235
236		t1=a[1]; t2=b[1];
237		r[1]=(t1-t2-c)&BN_MASK2;
238		if (t1 != t2) c=(t1 < t2);
239		if (--n <= 0) break;
240
241		t1=a[2]; t2=b[2];
242		r[2]=(t1-t2-c)&BN_MASK2;
243		if (t1 != t2) c=(t1 < t2);
244		if (--n <= 0) break;
245
246		t1=a[3]; t2=b[3];
247		r[3]=(t1-t2-c)&BN_MASK2;
248		if (t1 != t2) c=(t1 < t2);
249		if (--n <= 0) break;
250
251		a+=4;
252		b+=4;
253		r+=4;
254		}
255	return(c);
256	}
257#endif
258
259/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
260/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
261/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
262/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
263
#if 0
/* original macros are kept for reference purposes */
/* NOTE: this branch is compiled out; the live asm versions are in the #else below. */
#define mul_add_c(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	t1 = BN_UMULT_HIGH(ta,tb);	\
	t0 = ta * tb;			\
	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}
#else
/*
 * mul_add_c(a,b,c0,c1,c2): (c2,c1,c0) += a*b.
 * (t2:t1) = a*b; then c0 += t1 with the carry added into t2;
 * then c1 += t2 with the carry added into c2.
 * t1/t2 are referenced by name (non-hygienic) -- callers declare them.
 */
#define mul_add_c(a,b,c0,c1,c2)	do {	\
	asm ("mulq %3"			\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)

/*
 * sqr_add_c(a,i,c0,c1,c2): (c2,c1,c0) += a[i]^2.
 * Same carry chain as mul_add_c, with a[i] squared via mulq on rax.
 */
#define sqr_add_c(a,i,c0,c1,c2)	do {	\
	asm ("mulq %2"			\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a[i])		\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)

/*
 * mul_add_c2(a,b,c0,c1,c2): (c2,c1,c0) += 2*a*b.
 * (t2:t1) = a*b, then both halves are doubled with their carry-outs
 * folded into c2/t2 before the same add chain as mul_add_c.
 */
#define mul_add_c2(a,b,c0,c1,c2) do {	\
	asm ("mulq %3"			\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	asm ("addq %0,%0; adcq %2,%1"	\
		: "+d"(t2),"+r"(c2)	\
		: "g"(0)		\
		: "cc");		\
	asm ("addq %0,%0; adcq %2,%1"	\
		: "+a"(t1),"+d"(t2)	\
		: "g"(0)		\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	asm ("addq %2,%0; adcq %3,%1"	\
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)
#endif
337
/* sqr_add_c2(a,i,j,c0,c1,c2): (c2,c1,c0) += 2*a[i]*a[j] -- the doubled
 * off-diagonal term of a squaring, expressed via mul_add_c2. */
#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
340
/*
 * bn_mul_comba8 -- r[0..15] = a[0..7] * b[0..7].
 *
 * Schoolbook multiplication evaluated column by column (Comba order):
 * column k accumulates every product a[i]*b[j] with i+j == k into the
 * rotating accumulator triple (c1,c2,c3); the column's low word is
 * stored to r[k] and the just-freed variable is zeroed to become the
 * new top accumulator for the next column.
 *
 * t1/t2 are scratch words referenced by name inside mul_add_c (the
 * macro is non-hygienic), so they must be declared here.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	/* column 0 */
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[4],b[0],c2,c3,c1);
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	mul_add_c(a[0],b[4],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[0],b[5],c3,c1,c2);
	mul_add_c(a[1],b[4],c3,c1,c2);
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	mul_add_c(a[4],b[1],c3,c1,c2);
	mul_add_c(a[5],b[0],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[6],b[0],c1,c2,c3);
	mul_add_c(a[5],b[1],c1,c2,c3);
	mul_add_c(a[4],b[2],c1,c2,c3);
	mul_add_c(a[3],b[3],c1,c2,c3);
	mul_add_c(a[2],b[4],c1,c2,c3);
	mul_add_c(a[1],b[5],c1,c2,c3);
	mul_add_c(a[0],b[6],c1,c2,c3);
	r[6]=c1;
	c1=0;
	/* column 7 -- widest column, eight products */
	mul_add_c(a[0],b[7],c2,c3,c1);
	mul_add_c(a[1],b[6],c2,c3,c1);
	mul_add_c(a[2],b[5],c2,c3,c1);
	mul_add_c(a[3],b[4],c2,c3,c1);
	mul_add_c(a[4],b[3],c2,c3,c1);
	mul_add_c(a[5],b[2],c2,c3,c1);
	mul_add_c(a[6],b[1],c2,c3,c1);
	mul_add_c(a[7],b[0],c2,c3,c1);
	r[7]=c2;
	c2=0;
	/* columns 8..14 narrow back down */
	mul_add_c(a[7],b[1],c3,c1,c2);
	mul_add_c(a[6],b[2],c3,c1,c2);
	mul_add_c(a[5],b[3],c3,c1,c2);
	mul_add_c(a[4],b[4],c3,c1,c2);
	mul_add_c(a[3],b[5],c3,c1,c2);
	mul_add_c(a[2],b[6],c3,c1,c2);
	mul_add_c(a[1],b[7],c3,c1,c2);
	r[8]=c3;
	c3=0;
	mul_add_c(a[2],b[7],c1,c2,c3);
	mul_add_c(a[3],b[6],c1,c2,c3);
	mul_add_c(a[4],b[5],c1,c2,c3);
	mul_add_c(a[5],b[4],c1,c2,c3);
	mul_add_c(a[6],b[3],c1,c2,c3);
	mul_add_c(a[7],b[2],c1,c2,c3);
	r[9]=c1;
	c1=0;
	mul_add_c(a[7],b[3],c2,c3,c1);
	mul_add_c(a[6],b[4],c2,c3,c1);
	mul_add_c(a[5],b[5],c2,c3,c1);
	mul_add_c(a[4],b[6],c2,c3,c1);
	mul_add_c(a[3],b[7],c2,c3,c1);
	r[10]=c2;
	c2=0;
	mul_add_c(a[4],b[7],c3,c1,c2);
	mul_add_c(a[5],b[6],c3,c1,c2);
	mul_add_c(a[6],b[5],c3,c1,c2);
	mul_add_c(a[7],b[4],c3,c1,c2);
	r[11]=c3;
	c3=0;
	mul_add_c(a[7],b[5],c1,c2,c3);
	mul_add_c(a[6],b[6],c1,c2,c3);
	mul_add_c(a[5],b[7],c1,c2,c3);
	r[12]=c1;
	c1=0;
	mul_add_c(a[6],b[7],c2,c3,c1);
	mul_add_c(a[7],b[6],c2,c3,c1);
	r[13]=c2;
	c2=0;
	mul_add_c(a[7],b[7],c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}
444
/*
 * bn_mul_comba4 -- r[0..7] = a[0..3] * b[0..3].
 *
 * Same column-wise (Comba) scheme as bn_mul_comba8: column k sums
 * a[i]*b[j] with i+j == k into the rotating triple (c1,c2,c3), stores
 * the low word to r[k] and recycles it as the new top accumulator.
 * t1/t2 are scratch words used by name inside mul_add_c.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	/* column 0 */
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[3],b[3],c1,c2,c3);
	r[6]=c1;
	r[7]=c2;
	}
484
/*
 * bn_sqr_comba8 -- r[0..15] = a[0..7]^2.
 *
 * Column-wise (Comba) squaring: diagonal terms a[i]^2 enter via
 * sqr_add_c, off-diagonal cross terms enter doubled via sqr_add_c2
 * (= 2*a[i]*a[j]), so each pair is computed once. Accumulators
 * (c1,c2,c3) rotate exactly as in bn_mul_comba8.
 * t1/t2 are scratch words used by name inside the macros.
 */
void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	/* column 0: diagonal only */
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	sqr_add_c2(a,4,0,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,5,0,c3,c1,c2);
	sqr_add_c2(a,4,1,c3,c1,c2);
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	sqr_add_c2(a,4,2,c1,c2,c3);
	sqr_add_c2(a,5,1,c1,c2,c3);
	sqr_add_c2(a,6,0,c1,c2,c3);
	r[6]=c1;
	c1=0;
	sqr_add_c2(a,7,0,c2,c3,c1);
	sqr_add_c2(a,6,1,c2,c3,c1);
	sqr_add_c2(a,5,2,c2,c3,c1);
	sqr_add_c2(a,4,3,c2,c3,c1);
	r[7]=c2;
	c2=0;
	sqr_add_c(a,4,c3,c1,c2);
	sqr_add_c2(a,5,3,c3,c1,c2);
	sqr_add_c2(a,6,2,c3,c1,c2);
	sqr_add_c2(a,7,1,c3,c1,c2);
	r[8]=c3;
	c3=0;
	sqr_add_c2(a,7,2,c1,c2,c3);
	sqr_add_c2(a,6,3,c1,c2,c3);
	sqr_add_c2(a,5,4,c1,c2,c3);
	r[9]=c1;
	c1=0;
	sqr_add_c(a,5,c2,c3,c1);
	sqr_add_c2(a,6,4,c2,c3,c1);
	sqr_add_c2(a,7,3,c2,c3,c1);
	r[10]=c2;
	c2=0;
	sqr_add_c2(a,7,4,c3,c1,c2);
	sqr_add_c2(a,6,5,c3,c1,c2);
	r[11]=c3;
	c3=0;
	sqr_add_c(a,6,c1,c2,c3);
	sqr_add_c2(a,7,5,c1,c2,c3);
	r[12]=c1;
	c1=0;
	sqr_add_c2(a,7,6,c2,c3,c1);
	r[13]=c2;
	c2=0;
	sqr_add_c(a,7,c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}
560
/*
 * bn_sqr_comba4 -- r[0..7] = a[0..3]^2.
 *
 * Same column-wise squaring scheme as bn_sqr_comba8: diagonals via
 * sqr_add_c, doubled cross terms via sqr_add_c2, accumulators
 * (c1,c2,c3) rotating per column.
 * t1/t2 are scratch words used by name inside the macros.
 */
void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	/* column 0: diagonal only */
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	r[6]=c1;
	r[7]=c2;
	}
594