x86_64-gcc.c revision 306195
#include "../bn_lcl.h"
#if !(defined(__GNUC__) && __GNUC__>=2)
# include "../bn_asm.c"         /* kind of dirty hack for Sun Studio */
#else
/*-
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features its own ABI which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64, different from Linux.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *                          sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
 *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
 *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
 *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
 *                          sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
 *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
 *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *                          sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
 *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
 *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
 *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
 *                          sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
 *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
 *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
 *
 * For the reference, the IA-32 assembler implementation performs
 * very much like 64-bit code compiled with no-asm on the same
 * machine.
 */

/*
 * BN_ULONG is the 64-bit limb type.  Win64 is LLP64 ("long" is only 32
 * bits there) and !__LP64__ covers other ILP32 data models, hence the
 * "long long" fallback on those targets.
 */
# if defined(_WIN64) || !defined(__LP64__)
#  define BN_ULONG unsigned long long
# else
#  define BN_ULONG unsigned long
# endif

# undef mul
# undef mul_add

/*-
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) let the compiler to decide where does it
 * want to keep the value of zero;
 */
/*
 * mul_add(r,a,word,carry): one multiply-accumulate step,
 *     { carry, r } = r + a*word + carry.
 * mulq leaves the 128-bit product in low/high (rax/rdx); the two
 * addq/adcq pairs fold the incoming carry and the old value of r in,
 * each time propagating a possible carry into the high half, which
 * becomes the carry out.
 */
# define mul_add(r,a,word,carry) do {   \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"m"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+m"(r),"+d"(high)    \
                : "r"(carry),"g"(0)     \
                : "cc");                \
        carry=high;                     \
        } while (0)

/*
 * mul(r,a,word,carry): like mul_add() but r is simply overwritten
 * rather than accumulated into: { carry, r } = a*word + carry.
 */
# define mul(r,a,word,carry) do {       \
        register BN_ULONG high,low;     \
        asm ("mulq %3"                  \
                : "=a"(low),"=d"(high)  \
                : "a"(word),"g"(a)      \
                : "cc");                \
        asm ("addq %2,%0; adcq %3,%1"   \
                : "+r"(carry),"+d"(high)\
                : "a"(low),"g"(0)       \
                : "cc");                \
        (r)=carry, carry=high;          \
        } while (0)
# undef sqr
/* sqr(r0,r1,a): full 128-bit square of one limb, { r1, r0 } = a^2 */
# define sqr(r0,r1,a)                   \
        asm ("mulq %2"                  \
                : "=a"(r0),"=d"(r1)     \
                : "a"(a)                \
                : "cc");

/*
 * rp[] += ap[] * w over "num" limbs; returns the final carry limb.
 * Main loop is unrolled four-fold; the if-chain handles num % 4.
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return (c1);

    while (num & ~3) {
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
    if (num) {
        mul_add(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[2], ap[2], w, c1);
        return c1;
    }

    return (c1);
}

/*
 * rp[] = ap[] * w over "num" limbs; returns the final carry limb.
 * Same unrolling scheme as bn_mul_add_words().
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return (c1);

    while (num & ~3) {
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
    if (num) {
        mul(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[2], ap[2], w, c1);
    }
    return (c1);
}

/*
 * r[2*i], r[2*i+1] = a[i]^2 for i in [0, n): every input limb squares
 * into two output limbs, so r[] must provide 2*n limbs.
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    if (n <= 0)
        return;

    while (n & ~3) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
    if (n) {
        sqr(r[0], r[1], a[0]);
        if (--n == 0)
            return;
        sqr(r[2], r[3], a[1]);
        if (--n == 0)
            return;
        sqr(r[4], r[5], a[2]);
    }
}

/*
 * Returns the 64-bit quotient of the 128-bit value h:l divided by d;
 * the remainder (rdx after divq) is discarded in "waste".
 * NOTE(review): divq raises #DE if d is zero or the quotient exceeds
 * 64 bits -- callers are expected to guarantee h < d.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG ret, waste;

    asm("divq %4":"=a"(ret), "=d"(waste)
        : "a"(l), "d"(h), "r"(d)
        : "cc");

    return ret;
}

/*
 * rp[] = ap[] + bp[] over n limbs; returns the carry out (0 or 1).
 * n is forced into rcx ("+c") for the "loop" instruction; the loop
 * works because neither "lea" nor "loop" modifies the carry flag, so
 * the adcq chain survives across iterations.  The trailing sbbq turns
 * CF into an all-ones/zero mask, reduced to 0/1 by "& 1" on return.
 */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret;
    size_t i = 0;

    if (n <= 0)
        return 0;

    asm volatile ("       subq    %0,%0           \n" /* clear carry */
                  "       jmp     1f              \n"
                  ".p2align 4                     \n"
                  "1:     movq    (%4,%2,8),%0    \n"
                  "       adcq    (%5,%2,8),%0    \n"
                  "       movq    %0,(%3,%2,8)    \n"
                  "       lea     1(%2),%2        \n"
                  "       loop    1b              \n"
                  "       sbbq    %0,%0           \n":"=&r" (ret), "+c"(n),
                  "+r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}

# ifndef SIMICS
/*
 * rp[] = ap[] - bp[] over n limbs; returns the borrow out (0 or 1).
 * Mirror image of bn_add_words() with sbbq in place of adcq.
 */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n)
{
    BN_ULONG ret;
    size_t i = 0;

    if (n <= 0)
        return 0;

    asm volatile ("       subq    %0,%0           \n" /* clear borrow */
                  "       jmp     1f              \n"
                  ".p2align 4                     \n"
                  "1:     movq    (%4,%2,8),%0    \n"
                  "       sbbq    (%5,%2,8),%0    \n"
                  "       movq    %0,(%3,%2,8)    \n"
                  "       lea     1(%2),%2        \n"
                  "       loop    1b              \n"
                  "       sbbq    %0,%0           \n":"=&r" (ret), "+c"(n),
                  "+r"(i)
                  :"r"(rp), "r"(ap), "r"(bp)
                  :"cc", "memory");

    return ret & 1;
}
# else
/* Simics 1.4<7 has buggy sbbq:-( */
#  define BN_MASK2 0xffffffffffffffffL
/*
 * Portable C fallback for bn_sub_words(): four-fold unrolled
 * subtract-with-borrow, the borrow tracked manually in "c".  The
 * "if (t1 != t2)" guard keeps the old borrow when the limbs are equal
 * (t1 - t2 - c then either borrows via c or not, and c is unchanged).
 */
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
    BN_ULONG t1, t2;
    int c = 0;

    if (n <= 0)
        return ((BN_ULONG)0);

    for (;;) {
        t1 = a[0];
        t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[1];
        t2 = b[1];
        r[1] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[2];
        t2 = b[2];
        r[2] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        t1 = a[3];
        t2 = b[3];
        r[3] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        if (--n <= 0)
            break;

        a += 4;
        b += 4;
        r += 4;
    }
    return (c);
}
# endif

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */

/*
 * Keep in mind that carrying into high part of multiplication result
 * can not overflow, because it cannot be all-ones.
 */
# if 0
/* original macros are kept for reference purposes */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi, tt;                    \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; tt = hi+((c0<lo)?1:0);        \
        c1 += tt; c2 += (c1<tt)?1:0;            \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,ta);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)
# else
/*
 * mul_add_c(a,b,c0,c1,c2): (c2,c1,c0) += a*b.  The 128-bit product
 * lands in t1 (low, rax) and t2 (high, rdx); a single
 * addq/adcq/adcq chain then ripples it through the three
 * accumulator limbs.
 */
#  define mul_add_c(a,b,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)

/* sqr_add_c(a,i,c0,c1,c2): (c2,c1,c0) += a[i]^2 */
#  define sqr_add_c(a,i,c0,c1,c2) do {  \
        BN_ULONG t1,t2;                 \
        asm ("mulq %2"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a[i])             \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)

/*
 * mul_add_c2(a,b,c0,c1,c2): (c2,c1,c0) += 2*a*b, implemented by
 * adding the same 128-bit product into the accumulator twice
 * (see the "carrying ... cannot be all-ones" note above for why
 * the doubled carry chain cannot overflow c2).
 */
#  define mul_add_c2(a,b,c0,c1,c2) do { \
        BN_ULONG t1,t2;                 \
        asm ("mulq %3"                  \
                : "=a"(t1),"=d"(t2)     \
                : "a"(a),"m"(b)         \
                : "cc");                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
                : "+r"(c0),"+r"(c1),"+r"(c2)            \
                : "r"(t1),"r"(t2),"g"(0)                \
                : "cc");                                \
        } while (0)
# endif

/* sqr_add_c2(a,i,j,c0,c1,c2): (c2,c1,c0) += 2*a[i]*a[j] */
# define sqr_add_c2(a,i,j,c0,c1,c2)      \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

/*
 * Comba (column-wise) 8x8-limb multiplication: r[0..15] = a[0..7] *
 * b[0..7].  Each group of mul_add_c() calls accumulates one output
 * column sum_{i+j==k} a[i]*b[j] into a rotating three-limb
 * accumulator; the low limb is stored to r[k] and then recycled as
 * the new top accumulator limb, cleared to 0.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/*
 * Comba 4x4-limb multiplication: r[0..7] = a[0..3] * b[0..3].
 * Same column-accumulator scheme as bn_mul_comba8().
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

/*
 * Comba squaring: r[0..15] = a[0..7]^2.  Cross products a[i]*a[j]
 * (i != j) appear twice in a square, hence sqr_add_c2(); the diagonal
 * terms a[i]^2 appear once via sqr_add_c().
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/*
 * Comba squaring: r[0..7] = a[0..3]^2.  Same scheme as
 * bn_sqr_comba8().
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
#endif