1/* 2 * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved. 3 * 4 * Licensed under the Apache License 2.0 (the "License"). You may not use 5 * this file except in compliance with the License. You can obtain a copy 6 * in the file LICENSE in the source distribution or at 7 * https://www.openssl.org/source/license.html 8 */ 9 10#include "../bn_local.h" 11#if !(defined(__GNUC__) && __GNUC__>=2) 12# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ 13#else 14/*- 15 * x86_64 BIGNUM accelerator version 0.1, December 2002. 16 * 17 * Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL 18 * project. 19 * 20 * Rights for redistribution and usage in source and binary forms are 21 * granted according to the License. Warranty of any kind is disclaimed. 22 * 23 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real 24 * versions, like 1.0... 25 * A. Well, that's because this code is basically a quick-n-dirty 26 * proof-of-concept hack. As you can see it's implemented with 27 * inline assembler, which means that you're bound to GCC and that 28 * there might be enough room for further improvement. 29 * 30 * Q. Why inline assembler? 31 * A. x86_64 features own ABI which I'm not familiar with. This is 32 * why I decided to let the compiler take care of subroutine 33 * prologue/epilogue as well as register allocation. For reference. 34 * Win64 implements different ABI for AMD64, different from Linux. 35 * 36 * Q. How much faster does it get? 37 * A. 'apps/openssl speed rsa dsa' output with no-asm: 38 * 39 * sign verify sign/s verify/s 40 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 41 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 42 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 43 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 44 * sign verify sign/s verify/s 45 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 46 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 47 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 48 * 49 * 'apps/openssl speed rsa dsa' output with this module: 50 * 51 * sign verify sign/s verify/s 52 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 53 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 54 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 55 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 56 * sign verify sign/s verify/s 57 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 58 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 59 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 60 * 61 * For the reference. IA-32 assembler implementation performs 62 * very much like 64-bit code compiled with no-asm on the same 63 * machine. 64 */ 65 66# undef mul 67# undef mul_add 68 69/*- 70 * "m"(a), "+m"(r) is the way to favor DirectPath ��-code; 71 * "g"(0) let the compiler to decide where does it 72 * want to keep the value of zero; 73 */ 74# define mul_add(r,a,word,carry) do { \ 75 register BN_ULONG high,low; \ 76 asm ("mulq %3" \ 77 : "=a"(low),"=d"(high) \ 78 : "a"(word),"m"(a) \ 79 : "cc"); \ 80 asm ("addq %2,%0; adcq %3,%1" \ 81 : "+r"(carry),"+d"(high)\ 82 : "a"(low),"g"(0) \ 83 : "cc"); \ 84 asm ("addq %2,%0; adcq %3,%1" \ 85 : "+m"(r),"+d"(high) \ 86 : "r"(carry),"g"(0) \ 87 : "cc"); \ 88 carry=high; \ 89 } while (0) 90 91# define mul(r,a,word,carry) do { \ 92 register BN_ULONG high,low; \ 93 asm ("mulq %3" \ 94 : "=a"(low),"=d"(high) \ 95 : "a"(word),"g"(a) \ 96 : "cc"); \ 97 asm ("addq %2,%0; adcq %3,%1" \ 98 : "+r"(carry),"+d"(high)\ 99 : "a"(low),"g"(0) \ 100 : "cc"); \ 101 (r)=carry, carry=high; \ 102 } while (0) 103# undef sqr 104# define sqr(r0,r1,a) \ 105 asm ("mulq %2" \ 106 : "=a"(r0),"=d"(r1) \ 107 : "a"(a) \ 108 : "cc"); 109 110BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 111 BN_ULONG w) 112{ 113 BN_ULONG c1 = 0; 114 115 if (num <= 0) 116 return c1; 117 118 while (num & ~3) { 119 mul_add(rp[0], ap[0], w, c1); 120 mul_add(rp[1], ap[1], w, c1); 121 mul_add(rp[2], ap[2], w, c1); 122 mul_add(rp[3], ap[3], w, c1); 123 ap += 4; 124 rp += 4; 125 num -= 4; 126 } 127 if (num) { 128 mul_add(rp[0], ap[0], w, c1); 129 if (--num == 0) 130 return c1; 131 mul_add(rp[1], ap[1], w, c1); 132 if (--num == 0) 133 return c1; 134 mul_add(rp[2], ap[2], w, c1); 135 return c1; 136 } 137 138 return c1; 139} 140 141BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 142{ 143 BN_ULONG c1 = 0; 144 145 if (num <= 0) 146 return c1; 147 148 while (num & ~3) { 149 mul(rp[0], ap[0], w, c1); 150 mul(rp[1], ap[1], w, c1); 151 mul(rp[2], ap[2], w, c1); 152 mul(rp[3], ap[3], w, c1); 153 ap += 4; 154 rp += 4; 155 num -= 4; 156 } 157 if (num) { 158 mul(rp[0], ap[0], w, c1); 159 if (--num == 0) 160 return c1; 161 mul(rp[1], ap[1], w, c1); 162 if (--num == 0) 163 return c1; 164 mul(rp[2], ap[2], w, c1); 165 } 166 return c1; 167} 168 169void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 170{ 171 if (n <= 0) 172 return; 173 174 while (n & ~3) { 175 sqr(r[0], r[1], a[0]); 176 sqr(r[2], r[3], a[1]); 177 sqr(r[4], r[5], a[2]); 178 sqr(r[6], r[7], a[3]); 179 a += 4; 180 r += 8; 181 n -= 4; 182 } 183 if (n) { 184 sqr(r[0], r[1], a[0]); 185 if (--n == 0) 186 return; 187 sqr(r[2], r[3], a[1]); 188 if (--n == 0) 189 return; 190 sqr(r[4], r[5], a[2]); 191 } 192} 193 194BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 195{ 196 BN_ULONG ret, waste; 197 198 asm("divq %4":"=a"(ret), "=d"(waste) 199 : "a"(l), "d"(h), "r"(d) 200 : "cc"); 201 202 return ret; 203} 204 205BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 206 int n) 207{ 208 BN_ULONG ret; 209 size_t i = 0; 210 211 if (n <= 0) 212 return 0; 213 214 asm volatile (" subq %0,%0 \n" /* clear carry */ 215 " jmp 1f \n" 216 ".p2align 4 \n" 217 "1: movq (%4,%2,8),%0 \n" 218 " adcq (%5,%2,8),%0 \n" 219 " movq %0,(%3,%2,8) \n" 220 " lea 1(%2),%2 \n" 221 " dec %1 \n" 222 " jnz 1b \n" 223 " sbbq %0,%0 \n" 224 :"=&r" (ret), "+c"(n), "+r"(i) 225 :"r"(rp), "r"(ap), "r"(bp) 226 :"cc", "memory"); 227 228 return ret & 1; 229} 230 231# ifndef SIMICS 232BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 233 int n) 234{ 235 BN_ULONG ret; 236 size_t i = 0; 237 238 if (n <= 0) 239 return 0; 240 241 asm volatile (" subq %0,%0 \n" /* clear borrow */ 242 " jmp 1f \n" 243 ".p2align 4 \n" 244 "1: movq (%4,%2,8),%0 \n" 245 " sbbq (%5,%2,8),%0 \n" 246 " movq %0,(%3,%2,8) \n" 247 " lea 1(%2),%2 \n" 248 " dec %1 \n" 249 " jnz 1b \n" 250 " sbbq %0,%0 \n" 251 :"=&r" (ret), "+c"(n), "+r"(i) 252 :"r"(rp), "r"(ap), "r"(bp) 253 :"cc", "memory"); 254 255 return ret & 1; 256} 257# else 258/* Simics 1.4<7 has buggy sbbq:-( */ 259# define BN_MASK2 0xffffffffffffffffL 260BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 261{ 262 BN_ULONG t1, t2; 263 int c = 0; 264 265 if (n <= 0) 266 return (BN_ULONG)0; 267 268 for (;;) { 269 t1 = a[0]; 270 t2 = b[0]; 271 r[0] = (t1 - t2 - c) & BN_MASK2; 272 if (t1 != t2) 273 c = (t1 < t2); 274 if (--n <= 0) 275 break; 276 277 t1 = a[1]; 278 t2 = b[1]; 279 r[1] = (t1 - t2 - c) & BN_MASK2; 280 if (t1 != t2) 281 c = (t1 < t2); 282 if (--n <= 0) 283 break; 284 285 t1 = a[2]; 286 t2 = b[2]; 287 r[2] = (t1 - t2 - c) & BN_MASK2; 288 if (t1 != t2) 289 c = (t1 < t2); 290 if (--n <= 0) 291 break; 292 293 t1 = a[3]; 294 t2 = b[3]; 295 r[3] = (t1 - t2 - c) & BN_MASK2; 296 if (t1 != t2) 297 c = (t1 < t2); 298 if (--n <= 0) 299 break; 300 301 a += 4; 302 b += 4; 303 r += 4; 304 } 305 return c; 306} 307# endif 308 309/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 310/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 311/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 312/* 313 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number 314 * c=(c2,c1,c0) 315 */ 316 317/* 318 * Keep in mind that carrying into high part of multiplication result 319 * can not overflow, because it cannot be all-ones. 320 */ 321# if 0 322/* original macros are kept for reference purposes */ 323# define mul_add_c(a,b,c0,c1,c2) do { \ 324 BN_ULONG ta = (a), tb = (b); \ 325 BN_ULONG lo, hi; \ 326 BN_UMULT_LOHI(lo,hi,ta,tb); \ 327 c0 += lo; hi += (c0<lo)?1:0; \ 328 c1 += hi; c2 += (c1<hi)?1:0; \ 329 } while(0) 330 331# define mul_add_c2(a,b,c0,c1,c2) do { \ 332 BN_ULONG ta = (a), tb = (b); \ 333 BN_ULONG lo, hi, tt; \ 334 BN_UMULT_LOHI(lo,hi,ta,tb); \ 335 c0 += lo; tt = hi+((c0<lo)?1:0); \ 336 c1 += tt; c2 += (c1<tt)?1:0; \ 337 c0 += lo; hi += (c0<lo)?1:0; \ 338 c1 += hi; c2 += (c1<hi)?1:0; \ 339 } while(0) 340 341# define sqr_add_c(a,i,c0,c1,c2) do { \ 342 BN_ULONG ta = (a)[i]; \ 343 BN_ULONG lo, hi; \ 344 BN_UMULT_LOHI(lo,hi,ta,ta); \ 345 c0 += lo; hi += (c0<lo)?1:0; \ 346 c1 += hi; c2 += (c1<hi)?1:0; \ 347 } while(0) 348# else 349# define mul_add_c(a,b,c0,c1,c2) do { \ 350 BN_ULONG t1,t2; \ 351 asm ("mulq %3" \ 352 : "=a"(t1),"=d"(t2) \ 353 : "a"(a),"m"(b) \ 354 : "cc"); \ 355 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ 356 : "+r"(c0),"+r"(c1),"+r"(c2) \ 357 : "r"(t1),"r"(t2),"g"(0) \ 358 : "cc"); \ 359 } while (0) 360 361# define sqr_add_c(a,i,c0,c1,c2) do { \ 362 BN_ULONG t1,t2; \ 363 asm ("mulq %2" \ 364 : "=a"(t1),"=d"(t2) \ 365 : "a"(a[i]) \ 366 : "cc"); \ 367 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ 368 : "+r"(c0),"+r"(c1),"+r"(c2) \ 369 : "r"(t1),"r"(t2),"g"(0) \ 370 : "cc"); \ 371 } while (0) 372 373# define mul_add_c2(a,b,c0,c1,c2) do { \ 374 BN_ULONG t1,t2; \ 375 asm ("mulq %3" \ 376 : "=a"(t1),"=d"(t2) \ 377 : "a"(a),"m"(b) \ 378 : "cc"); \ 379 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ 380 : "+r"(c0),"+r"(c1),"+r"(c2) \ 381 : "r"(t1),"r"(t2),"g"(0) \ 382 : "cc"); \ 383 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ 384 : "+r"(c0),"+r"(c1),"+r"(c2) \ 385 : "r"(t1),"r"(t2),"g"(0) \ 386 : "cc"); \ 387 } while (0) 388# endif 389 390# define sqr_add_c2(a,i,j,c0,c1,c2) \ 391 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 392 393void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 394{ 395 BN_ULONG c1, c2, c3; 396 397 c1 = 0; 398 c2 = 0; 399 c3 = 0; 400 mul_add_c(a[0], b[0], c1, c2, c3); 401 r[0] = c1; 402 c1 = 0; 403 mul_add_c(a[0], b[1], c2, c3, c1); 404 mul_add_c(a[1], b[0], c2, c3, c1); 405 r[1] = c2; 406 c2 = 0; 407 mul_add_c(a[2], b[0], c3, c1, c2); 408 mul_add_c(a[1], b[1], c3, c1, c2); 409 mul_add_c(a[0], b[2], c3, c1, c2); 410 r[2] = c3; 411 c3 = 0; 412 mul_add_c(a[0], b[3], c1, c2, c3); 413 mul_add_c(a[1], b[2], c1, c2, c3); 414 mul_add_c(a[2], b[1], c1, c2, c3); 415 mul_add_c(a[3], b[0], c1, c2, c3); 416 r[3] = c1; 417 c1 = 0; 418 mul_add_c(a[4], b[0], c2, c3, c1); 419 mul_add_c(a[3], b[1], c2, c3, c1); 420 mul_add_c(a[2], b[2], c2, c3, c1); 421 mul_add_c(a[1], b[3], c2, c3, c1); 422 mul_add_c(a[0], b[4], c2, c3, c1); 423 r[4] = c2; 424 c2 = 0; 425 mul_add_c(a[0], b[5], c3, c1, c2); 426 mul_add_c(a[1], b[4], c3, c1, c2); 427 mul_add_c(a[2], b[3], c3, c1, c2); 428 mul_add_c(a[3], b[2], c3, c1, c2); 429 mul_add_c(a[4], b[1], c3, c1, c2); 430 mul_add_c(a[5], b[0], c3, c1, c2); 431 r[5] = c3; 432 c3 = 0; 433 mul_add_c(a[6], b[0], c1, c2, c3); 434 mul_add_c(a[5], b[1], c1, c2, c3); 435 mul_add_c(a[4], b[2], c1, c2, c3); 436 mul_add_c(a[3], b[3], c1, c2, c3); 437 mul_add_c(a[2], b[4], c1, c2, c3); 438 mul_add_c(a[1], b[5], c1, c2, c3); 439 mul_add_c(a[0], b[6], c1, c2, c3); 440 r[6] = c1; 441 c1 = 0; 442 mul_add_c(a[0], b[7], c2, c3, c1); 443 mul_add_c(a[1], b[6], c2, c3, c1); 444 mul_add_c(a[2], b[5], c2, c3, c1); 445 mul_add_c(a[3], b[4], c2, c3, c1); 446 mul_add_c(a[4], b[3], c2, c3, c1); 447 mul_add_c(a[5], b[2], c2, c3, c1); 448 mul_add_c(a[6], b[1], c2, c3, c1); 449 mul_add_c(a[7], b[0], c2, c3, c1); 450 r[7] = c2; 451 c2 = 0; 452 mul_add_c(a[7], b[1], c3, c1, c2); 453 mul_add_c(a[6], b[2], c3, c1, c2); 454 mul_add_c(a[5], b[3], c3, c1, c2); 455 mul_add_c(a[4], b[4], c3, c1, c2); 456 mul_add_c(a[3], b[5], c3, c1, c2); 457 mul_add_c(a[2], b[6], c3, c1, c2); 458 mul_add_c(a[1], b[7], c3, c1, c2); 459 r[8] = c3; 460 c3 = 0; 461 mul_add_c(a[2], b[7], c1, c2, c3); 462 mul_add_c(a[3], b[6], c1, c2, c3); 463 mul_add_c(a[4], b[5], c1, c2, c3); 464 mul_add_c(a[5], b[4], c1, c2, c3); 465 mul_add_c(a[6], b[3], c1, c2, c3); 466 mul_add_c(a[7], b[2], c1, c2, c3); 467 r[9] = c1; 468 c1 = 0; 469 mul_add_c(a[7], b[3], c2, c3, c1); 470 mul_add_c(a[6], b[4], c2, c3, c1); 471 mul_add_c(a[5], b[5], c2, c3, c1); 472 mul_add_c(a[4], b[6], c2, c3, c1); 473 mul_add_c(a[3], b[7], c2, c3, c1); 474 r[10] = c2; 475 c2 = 0; 476 mul_add_c(a[4], b[7], c3, c1, c2); 477 mul_add_c(a[5], b[6], c3, c1, c2); 478 mul_add_c(a[6], b[5], c3, c1, c2); 479 mul_add_c(a[7], b[4], c3, c1, c2); 480 r[11] = c3; 481 c3 = 0; 482 mul_add_c(a[7], b[5], c1, c2, c3); 483 mul_add_c(a[6], b[6], c1, c2, c3); 484 mul_add_c(a[5], b[7], c1, c2, c3); 485 r[12] = c1; 486 c1 = 0; 487 mul_add_c(a[6], b[7], c2, c3, c1); 488 mul_add_c(a[7], b[6], c2, c3, c1); 489 r[13] = c2; 490 c2 = 0; 491 mul_add_c(a[7], b[7], c3, c1, c2); 492 r[14] = c3; 493 r[15] = c1; 494} 495 496void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 497{ 498 BN_ULONG c1, c2, c3; 499 500 c1 = 0; 501 c2 = 0; 502 c3 = 0; 503 mul_add_c(a[0], b[0], c1, c2, c3); 504 r[0] = c1; 505 c1 = 0; 506 mul_add_c(a[0], b[1], c2, c3, c1); 507 mul_add_c(a[1], b[0], c2, c3, c1); 508 r[1] = c2; 509 c2 = 0; 510 mul_add_c(a[2], b[0], c3, c1, c2); 511 mul_add_c(a[1], b[1], c3, c1, c2); 512 mul_add_c(a[0], b[2], c3, c1, c2); 513 r[2] = c3; 514 c3 = 0; 515 mul_add_c(a[0], b[3], c1, c2, c3); 516 mul_add_c(a[1], b[2], c1, c2, c3); 517 mul_add_c(a[2], b[1], c1, c2, c3); 518 mul_add_c(a[3], b[0], c1, c2, c3); 519 r[3] = c1; 520 c1 = 0; 521 mul_add_c(a[3], b[1], c2, c3, c1); 522 mul_add_c(a[2], b[2], c2, c3, c1); 523 mul_add_c(a[1], b[3], c2, c3, c1); 524 r[4] = c2; 525 c2 = 0; 526 mul_add_c(a[2], b[3], c3, c1, c2); 527 mul_add_c(a[3], b[2], c3, c1, c2); 528 r[5] = c3; 529 c3 = 0; 530 mul_add_c(a[3], b[3], c1, c2, c3); 531 r[6] = c1; 532 r[7] = c2; 533} 534 535void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 536{ 537 BN_ULONG c1, c2, c3; 538 539 c1 = 0; 540 c2 = 0; 541 c3 = 0; 542 sqr_add_c(a, 0, c1, c2, c3); 543 r[0] = c1; 544 c1 = 0; 545 sqr_add_c2(a, 1, 0, c2, c3, c1); 546 r[1] = c2; 547 c2 = 0; 548 sqr_add_c(a, 1, c3, c1, c2); 549 sqr_add_c2(a, 2, 0, c3, c1, c2); 550 r[2] = c3; 551 c3 = 0; 552 sqr_add_c2(a, 3, 0, c1, c2, c3); 553 sqr_add_c2(a, 2, 1, c1, c2, c3); 554 r[3] = c1; 555 c1 = 0; 556 sqr_add_c(a, 2, c2, c3, c1); 557 sqr_add_c2(a, 3, 1, c2, c3, c1); 558 sqr_add_c2(a, 4, 0, c2, c3, c1); 559 r[4] = c2; 560 c2 = 0; 561 sqr_add_c2(a, 5, 0, c3, c1, c2); 562 sqr_add_c2(a, 4, 1, c3, c1, c2); 563 sqr_add_c2(a, 3, 2, c3, c1, c2); 564 r[5] = c3; 565 c3 = 0; 566 sqr_add_c(a, 3, c1, c2, c3); 567 sqr_add_c2(a, 4, 2, c1, c2, c3); 568 sqr_add_c2(a, 5, 1, c1, c2, c3); 569 sqr_add_c2(a, 6, 0, c1, c2, c3); 570 r[6] = c1; 571 c1 = 0; 572 sqr_add_c2(a, 7, 0, c2, c3, c1); 573 sqr_add_c2(a, 6, 1, c2, c3, c1); 574 sqr_add_c2(a, 5, 2, c2, c3, c1); 575 sqr_add_c2(a, 4, 3, c2, c3, c1); 576 r[7] = c2; 577 c2 = 0; 578 sqr_add_c(a, 4, c3, c1, c2); 579 sqr_add_c2(a, 5, 3, c3, c1, c2); 580 sqr_add_c2(a, 6, 2, c3, c1, c2); 581 sqr_add_c2(a, 7, 1, c3, c1, c2); 582 r[8] = c3; 583 c3 = 0; 584 sqr_add_c2(a, 7, 2, c1, c2, c3); 585 sqr_add_c2(a, 6, 3, c1, c2, c3); 586 sqr_add_c2(a, 5, 4, c1, c2, c3); 587 r[9] = c1; 588 c1 = 0; 589 sqr_add_c(a, 5, c2, c3, c1); 590 sqr_add_c2(a, 6, 4, c2, c3, c1); 591 sqr_add_c2(a, 7, 3, c2, c3, c1); 592 r[10] = c2; 593 c2 = 0; 594 sqr_add_c2(a, 7, 4, c3, c1, c2); 595 sqr_add_c2(a, 6, 5, c3, c1, c2); 596 r[11] = c3; 597 c3 = 0; 598 sqr_add_c(a, 6, c1, c2, c3); 599 sqr_add_c2(a, 7, 5, c1, c2, c3); 600 r[12] = c1; 601 c1 = 0; 602 sqr_add_c2(a, 7, 6, c2, c3, c1); 603 r[13] = c2; 604 c2 = 0; 605 sqr_add_c(a, 7, c3, c1, c2); 606 r[14] = c3; 607 r[15] = c1; 608} 609 610void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 611{ 612 BN_ULONG c1, c2, c3; 613 614 c1 = 0; 615 c2 = 0; 616 c3 = 0; 617 sqr_add_c(a, 0, c1, c2, c3); 618 r[0] = c1; 619 c1 = 0; 620 sqr_add_c2(a, 1, 0, c2, c3, c1); 621 r[1] = c2; 622 c2 = 0; 623 sqr_add_c(a, 1, c3, c1, c2); 624 sqr_add_c2(a, 2, 0, c3, c1, c2); 625 r[2] = c3; 626 c3 = 0; 627 sqr_add_c2(a, 3, 0, c1, c2, c3); 628 sqr_add_c2(a, 2, 1, c1, c2, c3); 629 r[3] = c1; 630 c1 = 0; 631 sqr_add_c(a, 2, c2, c3, c1); 632 sqr_add_c2(a, 3, 1, c2, c3, c1); 633 r[4] = c2; 634 c2 = 0; 635 sqr_add_c2(a, 3, 2, c3, c1, c2); 636 r[5] = c3; 637 c3 = 0; 638 sqr_add_c(a, 3, c1, c2, c3); 639 r[6] = c1; 640 r[7] = c2; 641} 642#endif 643