/* x86_64-gcc.c, revision 279265 */
#include "../bn_lcl.h"
#ifdef __SUNPRO_C
# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
#else
/*
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features own ABI which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference.
 *    Win64 implements different ABI for AMD64, different from Linux.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *                        sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
 *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
 *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
 *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
 *                        sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
 *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
 *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *                        sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
 *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
 *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
 *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
 *                        sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
 *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
 *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
 *
 *    For the reference. IA-32 assembler implementation performs
 *    very much like 64-bit code compiled with no-asm on the same
 *    machine.
 */

/* A BIGNUM limb is one full 64-bit machine word on x86_64. */
#define BN_ULONG unsigned long

#undef mul
#undef mul_add
#undef sqr

/*
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) let the compiler to decide where does it
 * want to keep the value of zero;
 */

/*
 * mul_add(r,a,word,carry): r += a*word + carry (mod 2^64).
 * mulq leaves the 128-bit product a*word in rdx:rax (high:low);
 * the two add/adc pairs fold the incoming carry and the store into r,
 * accumulating both carry-outs into the high word, which becomes the
 * outgoing carry.
 */
#define mul_add(r,a,word,carry) do {    \
    register BN_ULONG high,low;         \
    asm ("mulq %3"                      \
        : "=a"(low),"=d"(high)          \
        : "a"(word),"m"(a)              \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(carry),"+d"(high)        \
        : "a"(low),"g"(0)               \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+m"(r),"+d"(high)            \
        : "r"(carry),"g"(0)             \
        : "cc");                        \
    carry=high;                         \
    } while (0)

/*
 * mul(r,a,word,carry): r = a*word + carry (mod 2^64); the high word of
 * the product (plus the carry-out of the addition) becomes the new carry.
 */
#define mul(r,a,word,carry) do {        \
    register BN_ULONG high,low;         \
    asm ("mulq %3"                      \
        : "=a"(low),"=d"(high)          \
        : "a"(word),"g"(a)              \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(carry),"+d"(high)        \
        : "a"(low),"g"(0)               \
        : "cc");                        \
    (r)=carry, carry=high;              \
    } while (0)

/*
 * sqr(r0,r1,a): (r1:r0) = a^2.  mulq with rax as both operands squares
 * the value; no carry is propagated.
 */
#define sqr(r0,r1,a)                    \
    asm ("mulq %2"                      \
        : "=a"(r0),"=d"(r1)             \
        : "a"(a)                        \
        : "cc");

/*
 * rp[i] += ap[i]*w for i in [0,num); returns the final carry limb.
 * The main loop is unrolled four ways; the tail handles num%4 limbs.
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
    {
    BN_ULONG c1=0;

    if (num <= 0) return(c1);

    while (num&~3)
        {
        mul_add(rp[0],ap[0],w,c1);
        mul_add(rp[1],ap[1],w,c1);
        mul_add(rp[2],ap[2],w,c1);
        mul_add(rp[3],ap[3],w,c1);
        ap+=4; rp+=4; num-=4;
        }
    if (num)
        {
        mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
        mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
        mul_add(rp[2],ap[2],w,c1); return c1;
        }

    return(c1);
    }

/*
 * rp[i] = ap[i]*w for i in [0,num), propagating the high words as a
 * running carry; returns the final carry limb.  Four-way unrolled.
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
    {
    BN_ULONG c1=0;

    if (num <= 0) return(c1);

    while (num&~3)
        {
        mul(rp[0],ap[0],w,c1);
        mul(rp[1],ap[1],w,c1);
        mul(rp[2],ap[2],w,c1);
        mul(rp[3],ap[3],w,c1);
        ap+=4; rp+=4; num-=4;
        }
    if (num)
        {
        mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
        mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
        mul(rp[2],ap[2],w,c1);
        }
    return(c1);
    }

/*
 * (r[2i+1]:r[2i]) = a[i]^2 for i in [0,n): each input limb yields two
 * output limbs, with no carry between squares.  Note r advances by 8
 * per 4 input limbs.
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
    {
    if (n <= 0) return;

    while (n&~3)
        {
        sqr(r[0],r[1],a[0]);
        sqr(r[2],r[3],a[1]);
        sqr(r[4],r[5],a[2]);
        sqr(r[6],r[7],a[3]);
        a+=4; r+=8; n-=4;
        }
    if (n)
        {
        sqr(r[0],r[1],a[0]); if (--n == 0) return;
        sqr(r[2],r[3],a[1]); if (--n == 0) return;
        sqr(r[4],r[5],a[2]);
        }
    }

/*
 * 128-by-64-bit divide: returns floor((h:l)/d); the remainder ("waste")
 * is discarded.  NOTE(review): divq faults (#DE) if d == 0 or the
 * quotient does not fit in 64 bits — callers are presumably expected to
 * guarantee h < d; confirm against bn_div_words call sites.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{   BN_ULONG ret,waste;

    asm ("divq %4"
        : "=a"(ret),"=d"(waste)
        : "a"(l),"d"(h),"g"(d)
        : "cc");

    return ret;
}

/*
 * rp[i] = ap[i] + bp[i] + carry for i in [0,n); returns the final carry
 * (0 or 1).  The "subq %2,%2" both zeroes the index register and clears
 * CF before the adc chain; "loop" decrements rcx (= n) each iteration,
 * and the trailing "sbbq %0,%0" materializes the last carry flag into
 * ret (masked with &1 on return).  Requires n > 0, which the guard
 * ensures.
 */
BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
{   BN_ULONG ret=0,i=0;

    if (n <= 0) return 0;

    asm volatile (
    "       subq    %2,%2           \n"
    ".align 16                      \n"
    "1:     movq    (%4,%2,8),%0    \n"
    "       adcq    (%5,%2,8),%0    \n"
    "       movq    %0,(%3,%2,8)    \n"
    "       leaq    1(%2),%2        \n"
    "       loop    1b              \n"
    "       sbbq    %0,%0           \n"
        : "=&a"(ret),"+c"(n),"=&r"(i)
        : "r"(rp),"r"(ap),"r"(bp)
        : "cc", "memory"
    );

    return ret&1;
}

#ifndef SIMICS
/*
 * rp[i] = ap[i] - bp[i] - borrow for i in [0,n); returns the final
 * borrow (0 or 1).  Same loop skeleton as bn_add_words with sbbq in
 * place of adcq.
 */
BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
{   BN_ULONG ret=0,i=0;

    if (n <= 0) return 0;

    asm volatile (
    "       subq    %2,%2           \n"
    ".align 16                      \n"
    "1:     movq    (%4,%2,8),%0    \n"
    "       sbbq    (%5,%2,8),%0    \n"
    "       movq    %0,(%3,%2,8)    \n"
    "       leaq    1(%2),%2        \n"
    "       loop    1b              \n"
    "       sbbq    %0,%0           \n"
        : "=&a"(ret),"+c"(n),"=&r"(i)
        : "r"(rp),"r"(ap),"r"(bp)
        : "cc", "memory"
    );

    return ret&1;
}
#else
/* Simics 1.4<7 has buggy sbbq:-( */
/* Portable C fallback for the simulator: same contract as the asm
 * bn_sub_words above — word-wise subtract with borrow, returns the
 * final borrow.  Unrolled four ways; borrow c is recomputed only when
 * t1 != t2 (when t1 == t2 the previous borrow passes through unchanged). */
#define BN_MASK2 0xffffffffffffffffL
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
    {
    BN_ULONG t1,t2;
    int c=0;

    if (n <= 0) return((BN_ULONG)0);

    for (;;)
        {
        t1=a[0]; t2=b[0];
        r[0]=(t1-t2-c)&BN_MASK2;
        if (t1 != t2) c=(t1 < t2);
        if (--n <= 0) break;

        t1=a[1]; t2=b[1];
        r[1]=(t1-t2-c)&BN_MASK2;
        if (t1 != t2) c=(t1 < t2);
        if (--n <= 0) break;

        t1=a[2]; t2=b[2];
        r[2]=(t1-t2-c)&BN_MASK2;
        if (t1 != t2) c=(t1 < t2);
        if (--n <= 0) break;

        t1=a[3]; t2=b[3];
        r[3]=(t1-t2-c)&BN_MASK2;
        if (t1 != t2) c=(t1 < t2);
        if (--n <= 0) break;

        a+=4;
        b+=4;
        r+=4;
        }
    return(c);
    }
#endif

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */

/*
 * Keep in mind that carrying into high part of multiplication result
 * can not overflow, because it cannot be all-ones.
 */
#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) {       \
    BN_ULONG ta=(a),tb=(b);             \
    t1 = ta * tb;                       \
    t2 = BN_UMULT_HIGH(ta,tb);          \
    c0 += t1; t2 += (c0<t1)?1:0;        \
    c1 += t2; c2 += (c1<t2)?1:0;        \
    }

#define mul_add_c2(a,b,c0,c1,c2) {      \
    BN_ULONG ta=(a),tb=(b),t0;          \
    t1 = BN_UMULT_HIGH(ta,tb);          \
    t0 = ta * tb;                       \
    c0 += t0; t2 = t1+((c0<t0)?1:0);    \
    c1 += t2; c2 += (c1<t2)?1:0;        \
    c0 += t0; t1 += (c0<t0)?1:0;        \
    c1 += t1; c2 += (c1<t1)?1:0;        \
    }
#else
/* Inline-asm versions of the comba primitives above.  All of them rely
 * on t1 and t2 being BN_ULONG temporaries declared in the enclosing
 * function. */
#define mul_add_c(a,b,c0,c1,c2) do {    \
    asm ("mulq %3"                      \
        : "=a"(t1),"=d"(t2)             \
        : "a"(a),"m"(b)                 \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(c0),"+d"(t2)             \
        : "a"(t1),"g"(0)                \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(c1),"+r"(c2)             \
        : "d"(t2),"g"(0)                \
        : "cc");                        \
    } while (0)

#define sqr_add_c(a,i,c0,c1,c2) do {    \
    asm ("mulq %2"                      \
        : "=a"(t1),"=d"(t2)             \
        : "a"(a[i])                     \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(c0),"+d"(t2)             \
        : "a"(t1),"g"(0)                \
        : "cc");                        \
    asm ("addq %2,%0; adcq %3,%1"       \
        : "+r"(c1),"+r"(c2)             \
        : "d"(t2),"g"(0)                \
        : "cc");                        \
    } while (0)

/* Adds the product twice (c += 2*a*b) by repeating the same
 * three-limb add/adc/adc chain. */
#define mul_add_c2(a,b,c0,c1,c2) do {           \
    asm ("mulq %3"                              \
        : "=a"(t1),"=d"(t2)                     \
        : "a"(a),"m"(b)                         \
        : "cc");                                \
    asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
        : "+r"(c0),"+r"(c1),"+r"(c2)            \
        : "r"(t1),"r"(t2),"g"(0)                \
        : "cc");                                \
    asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
        : "+r"(c0),"+r"(c1),"+r"(c2)            \
        : "r"(t1),"r"(t2),"g"(0)                \
        : "cc");                                \
    } while (0)
#endif

#define sqr_add_c2(a,i,j,c0,c1,c2) \
    mul_add_c2((a)[i],(a)[j],c0,c1,c2)

/*
 * Product-scanning (comba) multiplication: r[0..15] = a[0..7] * b[0..7].
 * Each output limb is produced by summing all partial products of equal
 * weight into the rotating three-limb accumulator (c1,c2,c3).
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
    {
    BN_ULONG t1,t2;     /* scratch for the mul_add_c asm macros */
    BN_ULONG c1,c2,c3;  /* rotating three-limb accumulator */

    c1=0;
    c2=0;
    c3=0;
    mul_add_c(a[0],b[0],c1,c2,c3);
    r[0]=c1;
    c1=0;
    mul_add_c(a[0],b[1],c2,c3,c1);
    mul_add_c(a[1],b[0],c2,c3,c1);
    r[1]=c2;
    c2=0;
    mul_add_c(a[2],b[0],c3,c1,c2);
    mul_add_c(a[1],b[1],c3,c1,c2);
    mul_add_c(a[0],b[2],c3,c1,c2);
    r[2]=c3;
    c3=0;
    mul_add_c(a[0],b[3],c1,c2,c3);
    mul_add_c(a[1],b[2],c1,c2,c3);
    mul_add_c(a[2],b[1],c1,c2,c3);
    mul_add_c(a[3],b[0],c1,c2,c3);
    r[3]=c1;
    c1=0;
    mul_add_c(a[4],b[0],c2,c3,c1);
    mul_add_c(a[3],b[1],c2,c3,c1);
    mul_add_c(a[2],b[2],c2,c3,c1);
    mul_add_c(a[1],b[3],c2,c3,c1);
    mul_add_c(a[0],b[4],c2,c3,c1);
    r[4]=c2;
    c2=0;
    mul_add_c(a[0],b[5],c3,c1,c2);
    mul_add_c(a[1],b[4],c3,c1,c2);
    mul_add_c(a[2],b[3],c3,c1,c2);
    mul_add_c(a[3],b[2],c3,c1,c2);
    mul_add_c(a[4],b[1],c3,c1,c2);
    mul_add_c(a[5],b[0],c3,c1,c2);
    r[5]=c3;
    c3=0;
    mul_add_c(a[6],b[0],c1,c2,c3);
    mul_add_c(a[5],b[1],c1,c2,c3);
    mul_add_c(a[4],b[2],c1,c2,c3);
    mul_add_c(a[3],b[3],c1,c2,c3);
    mul_add_c(a[2],b[4],c1,c2,c3);
    mul_add_c(a[1],b[5],c1,c2,c3);
    mul_add_c(a[0],b[6],c1,c2,c3);
    r[6]=c1;
    c1=0;
    mul_add_c(a[0],b[7],c2,c3,c1);
    mul_add_c(a[1],b[6],c2,c3,c1);
    mul_add_c(a[2],b[5],c2,c3,c1);
    mul_add_c(a[3],b[4],c2,c3,c1);
    mul_add_c(a[4],b[3],c2,c3,c1);
    mul_add_c(a[5],b[2],c2,c3,c1);
    mul_add_c(a[6],b[1],c2,c3,c1);
    mul_add_c(a[7],b[0],c2,c3,c1);
    r[7]=c2;
    c2=0;
    mul_add_c(a[7],b[1],c3,c1,c2);
    mul_add_c(a[6],b[2],c3,c1,c2);
    mul_add_c(a[5],b[3],c3,c1,c2);
    mul_add_c(a[4],b[4],c3,c1,c2);
    mul_add_c(a[3],b[5],c3,c1,c2);
    mul_add_c(a[2],b[6],c3,c1,c2);
    mul_add_c(a[1],b[7],c3,c1,c2);
    r[8]=c3;
    c3=0;
    mul_add_c(a[2],b[7],c1,c2,c3);
    mul_add_c(a[3],b[6],c1,c2,c3);
    mul_add_c(a[4],b[5],c1,c2,c3);
    mul_add_c(a[5],b[4],c1,c2,c3);
    mul_add_c(a[6],b[3],c1,c2,c3);
    mul_add_c(a[7],b[2],c1,c2,c3);
    r[9]=c1;
    c1=0;
    mul_add_c(a[7],b[3],c2,c3,c1);
    mul_add_c(a[6],b[4],c2,c3,c1);
    mul_add_c(a[5],b[5],c2,c3,c1);
    mul_add_c(a[4],b[6],c2,c3,c1);
    mul_add_c(a[3],b[7],c2,c3,c1);
    r[10]=c2;
    c2=0;
    mul_add_c(a[4],b[7],c3,c1,c2);
    mul_add_c(a[5],b[6],c3,c1,c2);
    mul_add_c(a[6],b[5],c3,c1,c2);
    mul_add_c(a[7],b[4],c3,c1,c2);
    r[11]=c3;
    c3=0;
    mul_add_c(a[7],b[5],c1,c2,c3);
    mul_add_c(a[6],b[6],c1,c2,c3);
    mul_add_c(a[5],b[7],c1,c2,c3);
    r[12]=c1;
    c1=0;
    mul_add_c(a[6],b[7],c2,c3,c1);
    mul_add_c(a[7],b[6],c2,c3,c1);
    r[13]=c2;
    c2=0;
    mul_add_c(a[7],b[7],c3,c1,c2);
    r[14]=c3;
    r[15]=c1;
    }

/*
 * Product-scanning (comba) multiplication: r[0..7] = a[0..3] * b[0..3].
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
    {
    BN_ULONG t1,t2;     /* scratch for the mul_add_c asm macros */
    BN_ULONG c1,c2,c3;  /* rotating three-limb accumulator */

    c1=0;
    c2=0;
    c3=0;
    mul_add_c(a[0],b[0],c1,c2,c3);
    r[0]=c1;
    c1=0;
    mul_add_c(a[0],b[1],c2,c3,c1);
    mul_add_c(a[1],b[0],c2,c3,c1);
    r[1]=c2;
    c2=0;
    mul_add_c(a[2],b[0],c3,c1,c2);
    mul_add_c(a[1],b[1],c3,c1,c2);
    mul_add_c(a[0],b[2],c3,c1,c2);
    r[2]=c3;
    c3=0;
    mul_add_c(a[0],b[3],c1,c2,c3);
    mul_add_c(a[1],b[2],c1,c2,c3);
    mul_add_c(a[2],b[1],c1,c2,c3);
    mul_add_c(a[3],b[0],c1,c2,c3);
    r[3]=c1;
    c1=0;
    mul_add_c(a[3],b[1],c2,c3,c1);
    mul_add_c(a[2],b[2],c2,c3,c1);
    mul_add_c(a[1],b[3],c2,c3,c1);
    r[4]=c2;
    c2=0;
    mul_add_c(a[2],b[3],c3,c1,c2);
    mul_add_c(a[3],b[2],c3,c1,c2);
    r[5]=c3;
    c3=0;
    mul_add_c(a[3],b[3],c1,c2,c3);
    r[6]=c1;
    r[7]=c2;
    }

/*
 * Comba squaring: r[0..15] = a[0..7]^2.  Off-diagonal partial products
 * a[i]*a[j] (i != j) occur twice and use sqr_add_c2; diagonal squares
 * use sqr_add_c.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
    {
    BN_ULONG t1,t2;     /* scratch for the asm macros */
    BN_ULONG c1,c2,c3;  /* rotating three-limb accumulator */

    c1=0;
    c2=0;
    c3=0;
    sqr_add_c(a,0,c1,c2,c3);
    r[0]=c1;
    c1=0;
    sqr_add_c2(a,1,0,c2,c3,c1);
    r[1]=c2;
    c2=0;
    sqr_add_c(a,1,c3,c1,c2);
    sqr_add_c2(a,2,0,c3,c1,c2);
    r[2]=c3;
    c3=0;
    sqr_add_c2(a,3,0,c1,c2,c3);
    sqr_add_c2(a,2,1,c1,c2,c3);
    r[3]=c1;
    c1=0;
    sqr_add_c(a,2,c2,c3,c1);
    sqr_add_c2(a,3,1,c2,c3,c1);
    sqr_add_c2(a,4,0,c2,c3,c1);
    r[4]=c2;
    c2=0;
    sqr_add_c2(a,5,0,c3,c1,c2);
    sqr_add_c2(a,4,1,c3,c1,c2);
    sqr_add_c2(a,3,2,c3,c1,c2);
    r[5]=c3;
    c3=0;
    sqr_add_c(a,3,c1,c2,c3);
    sqr_add_c2(a,4,2,c1,c2,c3);
    sqr_add_c2(a,5,1,c1,c2,c3);
    sqr_add_c2(a,6,0,c1,c2,c3);
    r[6]=c1;
    c1=0;
    sqr_add_c2(a,7,0,c2,c3,c1);
    sqr_add_c2(a,6,1,c2,c3,c1);
    sqr_add_c2(a,5,2,c2,c3,c1);
    sqr_add_c2(a,4,3,c2,c3,c1);
    r[7]=c2;
    c2=0;
    sqr_add_c(a,4,c3,c1,c2);
    sqr_add_c2(a,5,3,c3,c1,c2);
    sqr_add_c2(a,6,2,c3,c1,c2);
    sqr_add_c2(a,7,1,c3,c1,c2);
    r[8]=c3;
    c3=0;
    sqr_add_c2(a,7,2,c1,c2,c3);
    sqr_add_c2(a,6,3,c1,c2,c3);
    sqr_add_c2(a,5,4,c1,c2,c3);
    r[9]=c1;
    c1=0;
    sqr_add_c(a,5,c2,c3,c1);
    sqr_add_c2(a,6,4,c2,c3,c1);
    sqr_add_c2(a,7,3,c2,c3,c1);
    r[10]=c2;
    c2=0;
    sqr_add_c2(a,7,4,c3,c1,c2);
    sqr_add_c2(a,6,5,c3,c1,c2);
    r[11]=c3;
    c3=0;
    sqr_add_c(a,6,c1,c2,c3);
    sqr_add_c2(a,7,5,c1,c2,c3);
    r[12]=c1;
    c1=0;
    sqr_add_c2(a,7,6,c2,c3,c1);
    r[13]=c2;
    c2=0;
    sqr_add_c(a,7,c3,c1,c2);
    r[14]=c3;
    r[15]=c1;
    }

/*
 * Comba squaring: r[0..7] = a[0..3]^2.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
    {
    BN_ULONG t1,t2;     /* scratch for the asm macros */
    BN_ULONG c1,c2,c3;  /* rotating three-limb accumulator */

    c1=0;
    c2=0;
    c3=0;
    sqr_add_c(a,0,c1,c2,c3);
    r[0]=c1;
    c1=0;
    sqr_add_c2(a,1,0,c2,c3,c1);
    r[1]=c2;
    c2=0;
    sqr_add_c(a,1,c3,c1,c2);
    sqr_add_c2(a,2,0,c3,c1,c2);
    r[2]=c3;
    c3=0;
    sqr_add_c2(a,3,0,c1,c2,c3);
    sqr_add_c2(a,2,1,c1,c2,c3);
    r[3]=c1;
    c1=0;
    sqr_add_c(a,2,c2,c3,c1);
    sqr_add_c2(a,3,1,c2,c3,c1);
    r[4]=c2;
    c2=0;
    sqr_add_c2(a,3,2,c3,c1,c2);
    r[5]=c3;
    c3=0;
    sqr_add_c(a,3,c1,c2,c3);
    r[6]=c1;
    r[7]=c2;
    }
#endif