/* x86_64-gcc.c revision 340704 */
1#include "../bn_lcl.h" 2#if !(defined(__GNUC__) && __GNUC__>=2) 3# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ 4#else 5/*- 6 * x86_64 BIGNUM accelerator version 0.1, December 2002. 7 * 8 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 9 * project. 10 * 11 * Rights for redistribution and usage in source and binary forms are 12 * granted according to the OpenSSL license. Warranty of any kind is 13 * disclaimed. 14 * 15 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real 16 * versions, like 1.0... 17 * A. Well, that's because this code is basically a quick-n-dirty 18 * proof-of-concept hack. As you can see it's implemented with 19 * inline assembler, which means that you're bound to GCC and that 20 * there might be enough room for further improvement. 21 * 22 * Q. Why inline assembler? 23 * A. x86_64 features own ABI which I'm not familiar with. This is 24 * why I decided to let the compiler take care of subroutine 25 * prologue/epilogue as well as register allocation. For reference. 26 * Win64 implements different ABI for AMD64, different from Linux. 27 * 28 * Q. How much faster does it get? 29 * A. 
'apps/openssl speed rsa dsa' output with no-asm: 30 * 31 * sign verify sign/s verify/s 32 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 33 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 34 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 35 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 36 * sign verify sign/s verify/s 37 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 38 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 39 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 40 * 41 * 'apps/openssl speed rsa dsa' output with this module: 42 * 43 * sign verify sign/s verify/s 44 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 45 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 46 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 47 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 48 * sign verify sign/s verify/s 49 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 50 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 51 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 52 * 53 * For the reference. IA-32 assembler implementation performs 54 * very much like 64-bit code compiled with no-asm on the same 55 * machine. 
 */

# undef mul
# undef mul_add

/*-
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) let the compiler to decide where does it
 * want to keep the value of zero;
 */
/*
 * mul_add(r,a,word,carry): r += a*word + carry, with the new carry word
 * (high half of the product plus carries out of both additions) left in
 * 'carry'.  One MULQ followed by two add/adc pairs.
 */
# define mul_add(r,a,word,carry) do {   \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"m"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+m"(r),"+d"(high)    \
                : "r"(carry),"g"(0)     \
                : "cc");                \
        carry=high;                     \
        } while (0)

/*
 * mul(r,a,word,carry): r = a*word + carry, new carry left in 'carry'.
 * Same structure as mul_add, but r is overwritten rather than
 * accumulated into.
 */
# define mul(r,a,word,carry) do {       \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"g"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        (r)=carry, carry=high;          \
        } while (0)
# undef sqr
/* sqr(r0,r1,a): (r1,r0) = a*a -- full double-word square of one word. */
# define sqr(r0,r1,a)                   \
        asm ("mulq %2"                  \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
                : "cc");

/*
 * rp[0..num-1] += ap[0..num-1] * w; returns the final carry word.
 * Main loop is unrolled four-fold; the tail handles the remaining
 * num % 4 words.
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return (c1);

    while (num & ~3) {          /* four words per iteration */
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
    if (num) {                  /* remaining 1..3 words */
        mul_add(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[2], ap[2], w, c1);
        return c1;
    }

    return (c1);
}

/*
 * rp[0..num-1] = ap[0..num-1] * w; returns the final carry word.
 * Same unrolling scheme as bn_mul_add_words.
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return (c1);

    while (num & ~3) {          /* four words per iteration */
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
    if (num) {                  /* remaining 1..3 words */
        mul(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[2], ap[2], w, c1);
    }
    return (c1);
}

/*
 * (r[2i+1],r[2i]) = a[i]^2 for i in [0,n): every input word produces a
 * two-word square, so r[] must provide 2*n words of room.
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    if (n <= 0)
        return;

    while (n & ~3) {            /* four squares per iteration */
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
    if (n) {                    /* remaining 1..3 squares */
        sqr(r[0], r[1], a[0]);
        if (--n == 0)
            return;
        sqr(r[2], r[3], a[1]);
        if (--n == 0)
            return;
        sqr(r[4], r[5], a[2]);
    }
}

/*
 * Returns the quotient of the double word (h:l) divided by d, via a
 * single DIVQ (rdx:rax / operand -> quotient in rax, remainder in rdx,
 * which is discarded into 'waste').
 * NOTE(review): DIVQ raises #DE when d == 0 or when the quotient does
 * not fit in 64 bits -- callers presumably guarantee h < d; confirm.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG ret, waste;

    asm("divq %4":"=a"(ret), "=d"(waste)
        : "a"(l), "d"(h), "r"(d)
        : "cc");

    return ret;
}

/*
 * rp[0..n-1] = ap[0..n-1] + bp[0..n-1]; returns the carry out (0/1).
 * The word loop is a single ADC chain: LOOP is used as the loop
 * instruction because it does not modify the flags, so the carry
 * propagates across iterations; the final SBB materializes CF in ret.
 * NOTE(review): n is an int bound to the count register ("+c") while
 * LOOP decrements the full-width counter in 64-bit mode -- verify the
 * compiler zero-extends it / the upper counter bits are clear.
 */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret;
    size_t i = 0;

    if (n <= 0)
        return 0;

    asm volatile (" subq %0,%0 \n" /* clear carry */
                  " jmp 1f \n"
                  ".p2align 4 \n"
                  "1: movq (%4,%2,8),%0 \n"
                  " adcq (%5,%2,8),%0 \n"
                  " movq %0,(%3,%2,8) \n"
                  " lea 1(%2),%2 \n"
                  " loop 1b \n"
                  " sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
                  "+r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}

# ifndef SIMICS
/*
 * rp[0..n-1] = ap[0..n-1] - bp[0..n-1]; returns the borrow out (0/1).
 * Mirror image of bn_add_words with SBB in place of ADC; the same
 * LOOP-preserves-flags trick keeps the borrow chain intact.
 */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret;
    size_t i = 0;

    if (n <= 0)
        return 0;

    asm volatile (" subq %0,%0 \n" /* clear borrow */
                  " jmp 1f \n"
                  ".p2align 4 \n"
                  "1: movq (%4,%2,8),%0 \n"
                  " sbbq (%5,%2,8),%0 \n"
                  " movq %0,(%3,%2,8) \n"
                  " lea 1(%2),%2 \n"
                  " loop 1b \n"
                  " sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
                  "+r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}
# else
/* Simics 1.4<7 has buggy sbbq:-( */
# define BN_MASK2 0xffffffffffffffffL
/* Pure-C fallback: word-wise subtract with manual borrow propagation. */
BN_ULONG
bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
    BN_ULONG t1, t2;
    int c = 0;                  /* running borrow, always 0 or 1 */

    if (n <= 0)
        return ((BN_ULONG)0);

    /* Unrolled four-fold; each step computes t1 - t2 - c and derives
     * the next borrow.  When t1 == t2 the old borrow propagates
     * unchanged, hence the 'if (t1 != t2)' guard. */
    for (;;) {
        t1 = a[0];
        t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[1];
        t2 = b[1];
        r[1] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[2];
        t2 = b[2];
        r[2] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[3];
        t2 = b[3];
        r[3] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        a += 4;
        b += 4;
        r += 4;
    }
    return (c);                 /* final borrow out, 0 or 1 */
}
# endif

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */

/*
 * Keep in mind that carrying into high part of multiplication result
 * can not overflow, because it cannot be all-ones.
 */
# if 0
/* original macros are kept for reference purposes */
# define mul_add_c(a,b,c0,c1,c2) do {   \
        BN_ULONG ta = (a), tb = (b);    \
        BN_ULONG lo, hi;                \
        BN_UMULT_LOHI(lo,hi,ta,tb);     \
        c0 += lo; hi += (c0<lo)?1:0;    \
        c1 += hi; c2 += (c1<hi)?1:0;    \
        } while(0)

# define mul_add_c2(a,b,c0,c1,c2) do {  \
        BN_ULONG ta = (a), tb = (b);    \
        BN_ULONG lo, hi, tt;            \
        BN_UMULT_LOHI(lo,hi,ta,tb);     \
        c0 += lo; tt = hi+((c0<lo)?1:0);\
        c1 += tt; c2 += (c1<tt)?1:0;    \
        c0 += lo; hi += (c0<lo)?1:0;    \
        c1 += hi; c2 += (c1<hi)?1:0;    \
        } while(0)

# define sqr_add_c(a,i,c0,c1,c2) do {   \
        BN_ULONG ta = (a)[i];           \
        BN_ULONG lo, hi;                \
        BN_UMULT_LOHI(lo,hi,ta,ta);     \
        c0 += lo; hi += (c0<lo)?1:0;    \
        c1 += hi; c2 += (c1<hi)?1:0;    \
        } while(0)
# else
/*
 * (c2,c1,c0) += a*b: one MULQ, then a single three-word
 * add/adc/adc chain to fold (t2,t1) into the accumulator.
 */
# define mul_add_c(a,b,c0,c1,c2) do {   \
        BN_ULONG t1,t2;                 \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)

/* (c2,c1,c0) += a[i]^2, same accumulation chain as mul_add_c. */
# define sqr_add_c(a,i,c0,c1,c2) do {   \
        BN_ULONG t1,t2;                 \
        asm ("mulq %2"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a[i])             \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)

/*
 * (c2,c1,c0) += 2*a*b: one MULQ, then the (t2,t1) product is folded
 * into the accumulator twice (cheaper than doubling with a shift,
 * which would need its own carry handling).
 */
# define mul_add_c2(a,b,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)
# endif

/* (c2,c1,c0) += 2*a[i]*a[j]: cross term of a square, via mul_add_c2. */
# define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

/*
 * 8x8 -> 16 word comba multiplication: r[0..15] = a[0..7] * b[0..7].
 * Computed column by column (column k sums all a[i]*b[j] with
 * i + j == k); (c1,c2,c3) rotate roles as the three-word accumulator,
 * the variable zeroed after each store becoming the next column's
 * highest word.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 */
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* columns 9..14 of the 8x8 comba multiplication (continued) */
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14; c1 then holds the final carry word r[15] */
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/*
 * 4x4 -> 8 word comba multiplication: r[0..7] = a[0..3] * b[0..3].
 * Same column-by-column scheme as bn_mul_comba8 with the (c1,c2,c3)
 * accumulator rotation.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6; c2 then holds the final carry word r[7] */
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

/*
 * 8-word comba squaring: r[0..15] = a[0..7]^2.  Diagonal terms use
 * sqr_add_c, symmetric cross terms are counted once and doubled via
 * sqr_add_c2.
 */
void bn_sqr_comba8(BN_ULONG *r, const
BN_ULONG *a) 526{ 527 BN_ULONG c1, c2, c3; 528 529 c1 = 0; 530 c2 = 0; 531 c3 = 0; 532 sqr_add_c(a, 0, c1, c2, c3); 533 r[0] = c1; 534 c1 = 0; 535 sqr_add_c2(a, 1, 0, c2, c3, c1); 536 r[1] = c2; 537 c2 = 0; 538 sqr_add_c(a, 1, c3, c1, c2); 539 sqr_add_c2(a, 2, 0, c3, c1, c2); 540 r[2] = c3; 541 c3 = 0; 542 sqr_add_c2(a, 3, 0, c1, c2, c3); 543 sqr_add_c2(a, 2, 1, c1, c2, c3); 544 r[3] = c1; 545 c1 = 0; 546 sqr_add_c(a, 2, c2, c3, c1); 547 sqr_add_c2(a, 3, 1, c2, c3, c1); 548 sqr_add_c2(a, 4, 0, c2, c3, c1); 549 r[4] = c2; 550 c2 = 0; 551 sqr_add_c2(a, 5, 0, c3, c1, c2); 552 sqr_add_c2(a, 4, 1, c3, c1, c2); 553 sqr_add_c2(a, 3, 2, c3, c1, c2); 554 r[5] = c3; 555 c3 = 0; 556 sqr_add_c(a, 3, c1, c2, c3); 557 sqr_add_c2(a, 4, 2, c1, c2, c3); 558 sqr_add_c2(a, 5, 1, c1, c2, c3); 559 sqr_add_c2(a, 6, 0, c1, c2, c3); 560 r[6] = c1; 561 c1 = 0; 562 sqr_add_c2(a, 7, 0, c2, c3, c1); 563 sqr_add_c2(a, 6, 1, c2, c3, c1); 564 sqr_add_c2(a, 5, 2, c2, c3, c1); 565 sqr_add_c2(a, 4, 3, c2, c3, c1); 566 r[7] = c2; 567 c2 = 0; 568 sqr_add_c(a, 4, c3, c1, c2); 569 sqr_add_c2(a, 5, 3, c3, c1, c2); 570 sqr_add_c2(a, 6, 2, c3, c1, c2); 571 sqr_add_c2(a, 7, 1, c3, c1, c2); 572 r[8] = c3; 573 c3 = 0; 574 sqr_add_c2(a, 7, 2, c1, c2, c3); 575 sqr_add_c2(a, 6, 3, c1, c2, c3); 576 sqr_add_c2(a, 5, 4, c1, c2, c3); 577 r[9] = c1; 578 c1 = 0; 579 sqr_add_c(a, 5, c2, c3, c1); 580 sqr_add_c2(a, 6, 4, c2, c3, c1); 581 sqr_add_c2(a, 7, 3, c2, c3, c1); 582 r[10] = c2; 583 c2 = 0; 584 sqr_add_c2(a, 7, 4, c3, c1, c2); 585 sqr_add_c2(a, 6, 5, c3, c1, c2); 586 r[11] = c3; 587 c3 = 0; 588 sqr_add_c(a, 6, c1, c2, c3); 589 sqr_add_c2(a, 7, 5, c1, c2, c3); 590 r[12] = c1; 591 c1 = 0; 592 sqr_add_c2(a, 7, 6, c2, c3, c1); 593 r[13] = c2; 594 c2 = 0; 595 sqr_add_c(a, 7, c3, c1, c2); 596 r[14] = c3; 597 r[15] = c1; 598} 599 600void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 601{ 602 BN_ULONG c1, c2, c3; 603 604 c1 = 0; 605 c2 = 0; 606 c3 = 0; 607 sqr_add_c(a, 0, c1, c2, c3); 608 r[0] = c1; 609 c1 = 0; 610 
sqr_add_c2(a, 1, 0, c2, c3, c1); 611 r[1] = c2; 612 c2 = 0; 613 sqr_add_c(a, 1, c3, c1, c2); 614 sqr_add_c2(a, 2, 0, c3, c1, c2); 615 r[2] = c3; 616 c3 = 0; 617 sqr_add_c2(a, 3, 0, c1, c2, c3); 618 sqr_add_c2(a, 2, 1, c1, c2, c3); 619 r[3] = c1; 620 c1 = 0; 621 sqr_add_c(a, 2, c2, c3, c1); 622 sqr_add_c2(a, 3, 1, c2, c3, c1); 623 r[4] = c2; 624 c2 = 0; 625 sqr_add_c2(a, 3, 2, c3, c1, c2); 626 r[5] = c3; 627 c3 = 0; 628 sqr_add_c(a, 3, c1, c2, c3); 629 r[6] = c1; 630 r[7] = c2; 631} 632#endif 633