/* bn_asm.c revision 296465 */
1/* crypto/bn/bn_asm.c */ 2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 3 * All rights reserved. 4 * 5 * This package is an SSL implementation written 6 * by Eric Young (eay@cryptsoft.com). 7 * The implementation was written so as to conform with Netscapes SSL. 8 * 9 * This library is free for commercial and non-commercial use as long as 10 * the following conditions are aheared to. The following conditions 11 * apply to all code found in this distribution, be it the RC4, RSA, 12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation 13 * included with this distribution is covered by the same copyright terms 14 * except that the holder is Tim Hudson (tjh@cryptsoft.com). 15 * 16 * Copyright remains Eric Young's, and as such any Copyright notices in 17 * the code are not to be removed. 18 * If this package is used in a product, Eric Young should be given attribution 19 * as the author of the parts of the library used. 20 * This can be in the form of a textual message at program startup or 21 * in documentation (online or textual) provided with the package. 22 * 23 * Redistribution and use in source and binary forms, with or without 24 * modification, are permitted provided that the following conditions 25 * are met: 26 * 1. Redistributions of source code must retain the copyright 27 * notice, this list of conditions and the following disclaimer. 28 * 2. Redistributions in binary form must reproduce the above copyright 29 * notice, this list of conditions and the following disclaimer in the 30 * documentation and/or other materials provided with the distribution. 31 * 3. All advertising materials mentioning features or use of this software 32 * must display the following acknowledgement: 33 * "This product includes cryptographic software written by 34 * Eric Young (eay@cryptsoft.com)" 35 * The word 'cryptographic' can be left out if the rouines from the library 36 * being used are not cryptographic related :-). 37 * 4. 
If you include any Windows specific code (or a derivative thereof) from 38 * the apps directory (application code) you must include an acknowledgement: 39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" 40 * 41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND 42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 51 * SUCH DAMAGE. 52 * 53 * The licence and distribution terms for any publically available version or 54 * derivative of this code cannot be changed. i.e. this code cannot simply be 55 * copied and put under another distribution licence 56 * [including the GNU Public Licence.] 
57 */ 58 59#ifndef BN_DEBUG 60# undef NDEBUG /* avoid conflicting definitions */ 61# define NDEBUG 62#endif 63 64#include <stdio.h> 65#include <assert.h> 66#include "cryptlib.h" 67#include "bn_lcl.h" 68 69#if defined(BN_LLONG) || defined(BN_UMULT_HIGH) 70 71BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 72 BN_ULONG w) 73{ 74 BN_ULONG c1 = 0; 75 76 assert(num >= 0); 77 if (num <= 0) 78 return (c1); 79 80 while (num & ~3) { 81 mul_add(rp[0], ap[0], w, c1); 82 mul_add(rp[1], ap[1], w, c1); 83 mul_add(rp[2], ap[2], w, c1); 84 mul_add(rp[3], ap[3], w, c1); 85 ap += 4; 86 rp += 4; 87 num -= 4; 88 } 89 if (num) { 90 mul_add(rp[0], ap[0], w, c1); 91 if (--num == 0) 92 return c1; 93 mul_add(rp[1], ap[1], w, c1); 94 if (--num == 0) 95 return c1; 96 mul_add(rp[2], ap[2], w, c1); 97 return c1; 98 } 99 100 return (c1); 101} 102 103BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 104{ 105 BN_ULONG c1 = 0; 106 107 assert(num >= 0); 108 if (num <= 0) 109 return (c1); 110 111 while (num & ~3) { 112 mul(rp[0], ap[0], w, c1); 113 mul(rp[1], ap[1], w, c1); 114 mul(rp[2], ap[2], w, c1); 115 mul(rp[3], ap[3], w, c1); 116 ap += 4; 117 rp += 4; 118 num -= 4; 119 } 120 if (num) { 121 mul(rp[0], ap[0], w, c1); 122 if (--num == 0) 123 return c1; 124 mul(rp[1], ap[1], w, c1); 125 if (--num == 0) 126 return c1; 127 mul(rp[2], ap[2], w, c1); 128 } 129 return (c1); 130} 131 132void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 133{ 134 assert(n >= 0); 135 if (n <= 0) 136 return; 137 while (n & ~3) { 138 sqr(r[0], r[1], a[0]); 139 sqr(r[2], r[3], a[1]); 140 sqr(r[4], r[5], a[2]); 141 sqr(r[6], r[7], a[3]); 142 a += 4; 143 r += 8; 144 n -= 4; 145 } 146 if (n) { 147 sqr(r[0], r[1], a[0]); 148 if (--n == 0) 149 return; 150 sqr(r[2], r[3], a[1]); 151 if (--n == 0) 152 return; 153 sqr(r[4], r[5], a[2]); 154 } 155} 156 157#else /* !(defined(BN_LLONG) || 158 * defined(BN_UMULT_HIGH)) */ 159 160BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG 
*ap, int num, 161 BN_ULONG w) 162{ 163 BN_ULONG c = 0; 164 BN_ULONG bl, bh; 165 166 assert(num >= 0); 167 if (num <= 0) 168 return ((BN_ULONG)0); 169 170 bl = LBITS(w); 171 bh = HBITS(w); 172 173 for (;;) { 174 mul_add(rp[0], ap[0], bl, bh, c); 175 if (--num == 0) 176 break; 177 mul_add(rp[1], ap[1], bl, bh, c); 178 if (--num == 0) 179 break; 180 mul_add(rp[2], ap[2], bl, bh, c); 181 if (--num == 0) 182 break; 183 mul_add(rp[3], ap[3], bl, bh, c); 184 if (--num == 0) 185 break; 186 ap += 4; 187 rp += 4; 188 } 189 return (c); 190} 191 192BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 193{ 194 BN_ULONG carry = 0; 195 BN_ULONG bl, bh; 196 197 assert(num >= 0); 198 if (num <= 0) 199 return ((BN_ULONG)0); 200 201 bl = LBITS(w); 202 bh = HBITS(w); 203 204 for (;;) { 205 mul(rp[0], ap[0], bl, bh, carry); 206 if (--num == 0) 207 break; 208 mul(rp[1], ap[1], bl, bh, carry); 209 if (--num == 0) 210 break; 211 mul(rp[2], ap[2], bl, bh, carry); 212 if (--num == 0) 213 break; 214 mul(rp[3], ap[3], bl, bh, carry); 215 if (--num == 0) 216 break; 217 ap += 4; 218 rp += 4; 219 } 220 return (carry); 221} 222 223void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 224{ 225 assert(n >= 0); 226 if (n <= 0) 227 return; 228 for (;;) { 229 sqr64(r[0], r[1], a[0]); 230 if (--n == 0) 231 break; 232 233 sqr64(r[2], r[3], a[1]); 234 if (--n == 0) 235 break; 236 237 sqr64(r[4], r[5], a[2]); 238 if (--n == 0) 239 break; 240 241 sqr64(r[6], r[7], a[3]); 242 if (--n == 0) 243 break; 244 245 a += 4; 246 r += 8; 247 } 248} 249 250#endif /* !(defined(BN_LLONG) || 251 * defined(BN_UMULT_HIGH)) */ 252 253#if defined(BN_LLONG) && defined(BN_DIV2W) 254 255BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 256{ 257 return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d)); 258} 259 260#else 261 262/* Divide h,l by d and return the result. 
*/ 263/* I need to test this some more :-( */ 264BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 265{ 266 BN_ULONG dh, dl, q, ret = 0, th, tl, t; 267 int i, count = 2; 268 269 if (d == 0) 270 return (BN_MASK2); 271 272 i = BN_num_bits_word(d); 273 assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); 274 275 i = BN_BITS2 - i; 276 if (h >= d) 277 h -= d; 278 279 if (i) { 280 d <<= i; 281 h = (h << i) | (l >> (BN_BITS2 - i)); 282 l <<= i; 283 } 284 dh = (d & BN_MASK2h) >> BN_BITS4; 285 dl = (d & BN_MASK2l); 286 for (;;) { 287 if ((h >> BN_BITS4) == dh) 288 q = BN_MASK2l; 289 else 290 q = h / dh; 291 292 th = q * dh; 293 tl = dl * q; 294 for (;;) { 295 t = h - th; 296 if ((t & BN_MASK2h) || 297 ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) 298 break; 299 q--; 300 th -= dh; 301 tl -= dl; 302 } 303 t = (tl >> BN_BITS4); 304 tl = (tl << BN_BITS4) & BN_MASK2h; 305 th += t; 306 307 if (l < tl) 308 th++; 309 l -= tl; 310 if (h < th) { 311 h += d; 312 q--; 313 } 314 h -= th; 315 316 if (--count == 0) 317 break; 318 319 ret = q << BN_BITS4; 320 h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2; 321 l = (l & BN_MASK2l) << BN_BITS4; 322 } 323 ret |= q; 324 return (ret); 325} 326#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ 327 328#ifdef BN_LLONG 329BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 330 int n) 331{ 332 BN_ULLONG ll = 0; 333 334 assert(n >= 0); 335 if (n <= 0) 336 return ((BN_ULONG)0); 337 338 for (;;) { 339 ll += (BN_ULLONG) a[0] + b[0]; 340 r[0] = (BN_ULONG)ll & BN_MASK2; 341 ll >>= BN_BITS2; 342 if (--n <= 0) 343 break; 344 345 ll += (BN_ULLONG) a[1] + b[1]; 346 r[1] = (BN_ULONG)ll & BN_MASK2; 347 ll >>= BN_BITS2; 348 if (--n <= 0) 349 break; 350 351 ll += (BN_ULLONG) a[2] + b[2]; 352 r[2] = (BN_ULONG)ll & BN_MASK2; 353 ll >>= BN_BITS2; 354 if (--n <= 0) 355 break; 356 357 ll += (BN_ULLONG) a[3] + b[3]; 358 r[3] = (BN_ULONG)ll & BN_MASK2; 359 ll >>= BN_BITS2; 360 if (--n <= 0) 361 break; 362 363 a += 4; 364 
b += 4; 365 r += 4; 366 } 367 return ((BN_ULONG)ll); 368} 369#else /* !BN_LLONG */ 370BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 371 int n) 372{ 373 BN_ULONG c, l, t; 374 375 assert(n >= 0); 376 if (n <= 0) 377 return ((BN_ULONG)0); 378 379 c = 0; 380 for (;;) { 381 t = a[0]; 382 t = (t + c) & BN_MASK2; 383 c = (t < c); 384 l = (t + b[0]) & BN_MASK2; 385 c += (l < t); 386 r[0] = l; 387 if (--n <= 0) 388 break; 389 390 t = a[1]; 391 t = (t + c) & BN_MASK2; 392 c = (t < c); 393 l = (t + b[1]) & BN_MASK2; 394 c += (l < t); 395 r[1] = l; 396 if (--n <= 0) 397 break; 398 399 t = a[2]; 400 t = (t + c) & BN_MASK2; 401 c = (t < c); 402 l = (t + b[2]) & BN_MASK2; 403 c += (l < t); 404 r[2] = l; 405 if (--n <= 0) 406 break; 407 408 t = a[3]; 409 t = (t + c) & BN_MASK2; 410 c = (t < c); 411 l = (t + b[3]) & BN_MASK2; 412 c += (l < t); 413 r[3] = l; 414 if (--n <= 0) 415 break; 416 417 a += 4; 418 b += 4; 419 r += 4; 420 } 421 return ((BN_ULONG)c); 422} 423#endif /* !BN_LLONG */ 424 425BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 426 int n) 427{ 428 BN_ULONG t1, t2; 429 int c = 0; 430 431 assert(n >= 0); 432 if (n <= 0) 433 return ((BN_ULONG)0); 434 435 for (;;) { 436 t1 = a[0]; 437 t2 = b[0]; 438 r[0] = (t1 - t2 - c) & BN_MASK2; 439 if (t1 != t2) 440 c = (t1 < t2); 441 if (--n <= 0) 442 break; 443 444 t1 = a[1]; 445 t2 = b[1]; 446 r[1] = (t1 - t2 - c) & BN_MASK2; 447 if (t1 != t2) 448 c = (t1 < t2); 449 if (--n <= 0) 450 break; 451 452 t1 = a[2]; 453 t2 = b[2]; 454 r[2] = (t1 - t2 - c) & BN_MASK2; 455 if (t1 != t2) 456 c = (t1 < t2); 457 if (--n <= 0) 458 break; 459 460 t1 = a[3]; 461 t2 = b[3]; 462 r[3] = (t1 - t2 - c) & BN_MASK2; 463 if (t1 != t2) 464 c = (t1 < t2); 465 if (--n <= 0) 466 break; 467 468 a += 4; 469 b += 4; 470 r += 4; 471 } 472 return (c); 473} 474 475#ifdef BN_MUL_COMBA 476 477# undef bn_mul_comba8 478# undef bn_mul_comba4 479# undef bn_sqr_comba8 480# undef bn_sqr_comba4 481 482/* 
mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 483/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 484/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 485/* 486 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number 487 * c=(c2,c1,c0) 488 */ 489 490/* 491 * Keep in mind that carrying into high part of multiplication result 492 * can not overflow, because it cannot be all-ones. 493 */ 494# ifdef BN_LLONG 495# define mul_add_c(a,b,c0,c1,c2) \ 496 t=(BN_ULLONG)a*b; \ 497 t1=(BN_ULONG)Lw(t); \ 498 t2=(BN_ULONG)Hw(t); \ 499 c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 500 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 501 502# define mul_add_c2(a,b,c0,c1,c2) \ 503 t=(BN_ULLONG)a*b; \ 504 tt=(t+t)&BN_MASK; \ 505 if (tt < t) c2++; \ 506 t1=(BN_ULONG)Lw(tt); \ 507 t2=(BN_ULONG)Hw(tt); \ 508 c0=(c0+t1)&BN_MASK2; \ 509 if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ 510 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 511 512# define sqr_add_c(a,i,c0,c1,c2) \ 513 t=(BN_ULLONG)a[i]*a[i]; \ 514 t1=(BN_ULONG)Lw(t); \ 515 t2=(BN_ULONG)Hw(t); \ 516 c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 517 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 518 519# define sqr_add_c2(a,i,j,c0,c1,c2) \ 520 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 521 522# elif defined(BN_UMULT_LOHI) 523 524# define mul_add_c(a,b,c0,c1,c2) { \ 525 BN_ULONG ta=(a),tb=(b); \ 526 BN_UMULT_LOHI(t1,t2,ta,tb); \ 527 c0 += t1; t2 += (c0<t1)?1:0; \ 528 c1 += t2; c2 += (c1<t2)?1:0; \ 529 } 530 531# define mul_add_c2(a,b,c0,c1,c2) { \ 532 BN_ULONG ta=(a),tb=(b),t0; \ 533 BN_UMULT_LOHI(t0,t1,ta,tb); \ 534 c0 += t0; t2 = t1+((c0<t0)?1:0);\ 535 c1 += t2; c2 += (c1<t2)?1:0; \ 536 c0 += t0; t1 += (c0<t0)?1:0; \ 537 c1 += t1; c2 += (c1<t1)?1:0; \ 538 } 539 540# define sqr_add_c(a,i,c0,c1,c2) { \ 541 BN_ULONG ta=(a)[i]; \ 542 BN_UMULT_LOHI(t1,t2,ta,ta); \ 543 c0 += t1; t2 += (c0<t1)?1:0; \ 544 c1 += t2; c2 += (c1<t2)?1:0; \ 545 } 546 547# define 
sqr_add_c2(a,i,j,c0,c1,c2) \ 548 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 549 550# elif defined(BN_UMULT_HIGH) 551 552# define mul_add_c(a,b,c0,c1,c2) { \ 553 BN_ULONG ta=(a),tb=(b); \ 554 t1 = ta * tb; \ 555 t2 = BN_UMULT_HIGH(ta,tb); \ 556 c0 += t1; t2 += (c0<t1)?1:0; \ 557 c1 += t2; c2 += (c1<t2)?1:0; \ 558 } 559 560# define mul_add_c2(a,b,c0,c1,c2) { \ 561 BN_ULONG ta=(a),tb=(b),t0; \ 562 t1 = BN_UMULT_HIGH(ta,tb); \ 563 t0 = ta * tb; \ 564 c0 += t0; t2 = t1+((c0<t0)?1:0);\ 565 c1 += t2; c2 += (c1<t2)?1:0; \ 566 c0 += t0; t1 += (c0<t0)?1:0; \ 567 c1 += t1; c2 += (c1<t1)?1:0; \ 568 } 569 570# define sqr_add_c(a,i,c0,c1,c2) { \ 571 BN_ULONG ta=(a)[i]; \ 572 t1 = ta * ta; \ 573 t2 = BN_UMULT_HIGH(ta,ta); \ 574 c0 += t1; t2 += (c0<t1)?1:0; \ 575 c1 += t2; c2 += (c1<t2)?1:0; \ 576 } 577 578# define sqr_add_c2(a,i,j,c0,c1,c2) \ 579 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 580 581# else /* !BN_LLONG */ 582# define mul_add_c(a,b,c0,c1,c2) \ 583 t1=LBITS(a); t2=HBITS(a); \ 584 bl=LBITS(b); bh=HBITS(b); \ 585 mul64(t1,t2,bl,bh); \ 586 c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 587 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 588 589# define mul_add_c2(a,b,c0,c1,c2) \ 590 t1=LBITS(a); t2=HBITS(a); \ 591 bl=LBITS(b); bh=HBITS(b); \ 592 mul64(t1,t2,bl,bh); \ 593 if (t2 & BN_TBIT) c2++; \ 594 t2=(t2+t2)&BN_MASK2; \ 595 if (t1 & BN_TBIT) t2++; \ 596 t1=(t1+t1)&BN_MASK2; \ 597 c0=(c0+t1)&BN_MASK2; \ 598 if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ 599 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 600 601# define sqr_add_c(a,i,c0,c1,c2) \ 602 sqr64(t1,t2,(a)[i]); \ 603 c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 604 c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 605 606# define sqr_add_c2(a,i,j,c0,c1,c2) \ 607 mul_add_c2((a)[i],(a)[j],c0,c1,c2) 608# endif /* !BN_LLONG */ 609 610void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 611{ 612# ifdef BN_LLONG 613 BN_ULLONG t; 614# else 615 BN_ULONG bl, bh; 616# endif 617 BN_ULONG t1, t2; 618 BN_ULONG c1, c2, c3; 619 620 c1 = 0; 621 c2 = 0; 622 
c3 = 0; 623 mul_add_c(a[0], b[0], c1, c2, c3); 624 r[0] = c1; 625 c1 = 0; 626 mul_add_c(a[0], b[1], c2, c3, c1); 627 mul_add_c(a[1], b[0], c2, c3, c1); 628 r[1] = c2; 629 c2 = 0; 630 mul_add_c(a[2], b[0], c3, c1, c2); 631 mul_add_c(a[1], b[1], c3, c1, c2); 632 mul_add_c(a[0], b[2], c3, c1, c2); 633 r[2] = c3; 634 c3 = 0; 635 mul_add_c(a[0], b[3], c1, c2, c3); 636 mul_add_c(a[1], b[2], c1, c2, c3); 637 mul_add_c(a[2], b[1], c1, c2, c3); 638 mul_add_c(a[3], b[0], c1, c2, c3); 639 r[3] = c1; 640 c1 = 0; 641 mul_add_c(a[4], b[0], c2, c3, c1); 642 mul_add_c(a[3], b[1], c2, c3, c1); 643 mul_add_c(a[2], b[2], c2, c3, c1); 644 mul_add_c(a[1], b[3], c2, c3, c1); 645 mul_add_c(a[0], b[4], c2, c3, c1); 646 r[4] = c2; 647 c2 = 0; 648 mul_add_c(a[0], b[5], c3, c1, c2); 649 mul_add_c(a[1], b[4], c3, c1, c2); 650 mul_add_c(a[2], b[3], c3, c1, c2); 651 mul_add_c(a[3], b[2], c3, c1, c2); 652 mul_add_c(a[4], b[1], c3, c1, c2); 653 mul_add_c(a[5], b[0], c3, c1, c2); 654 r[5] = c3; 655 c3 = 0; 656 mul_add_c(a[6], b[0], c1, c2, c3); 657 mul_add_c(a[5], b[1], c1, c2, c3); 658 mul_add_c(a[4], b[2], c1, c2, c3); 659 mul_add_c(a[3], b[3], c1, c2, c3); 660 mul_add_c(a[2], b[4], c1, c2, c3); 661 mul_add_c(a[1], b[5], c1, c2, c3); 662 mul_add_c(a[0], b[6], c1, c2, c3); 663 r[6] = c1; 664 c1 = 0; 665 mul_add_c(a[0], b[7], c2, c3, c1); 666 mul_add_c(a[1], b[6], c2, c3, c1); 667 mul_add_c(a[2], b[5], c2, c3, c1); 668 mul_add_c(a[3], b[4], c2, c3, c1); 669 mul_add_c(a[4], b[3], c2, c3, c1); 670 mul_add_c(a[5], b[2], c2, c3, c1); 671 mul_add_c(a[6], b[1], c2, c3, c1); 672 mul_add_c(a[7], b[0], c2, c3, c1); 673 r[7] = c2; 674 c2 = 0; 675 mul_add_c(a[7], b[1], c3, c1, c2); 676 mul_add_c(a[6], b[2], c3, c1, c2); 677 mul_add_c(a[5], b[3], c3, c1, c2); 678 mul_add_c(a[4], b[4], c3, c1, c2); 679 mul_add_c(a[3], b[5], c3, c1, c2); 680 mul_add_c(a[2], b[6], c3, c1, c2); 681 mul_add_c(a[1], b[7], c3, c1, c2); 682 r[8] = c3; 683 c3 = 0; 684 mul_add_c(a[2], b[7], c1, c2, c3); 685 mul_add_c(a[3], b[6], c1, 
c2, c3); 686 mul_add_c(a[4], b[5], c1, c2, c3); 687 mul_add_c(a[5], b[4], c1, c2, c3); 688 mul_add_c(a[6], b[3], c1, c2, c3); 689 mul_add_c(a[7], b[2], c1, c2, c3); 690 r[9] = c1; 691 c1 = 0; 692 mul_add_c(a[7], b[3], c2, c3, c1); 693 mul_add_c(a[6], b[4], c2, c3, c1); 694 mul_add_c(a[5], b[5], c2, c3, c1); 695 mul_add_c(a[4], b[6], c2, c3, c1); 696 mul_add_c(a[3], b[7], c2, c3, c1); 697 r[10] = c2; 698 c2 = 0; 699 mul_add_c(a[4], b[7], c3, c1, c2); 700 mul_add_c(a[5], b[6], c3, c1, c2); 701 mul_add_c(a[6], b[5], c3, c1, c2); 702 mul_add_c(a[7], b[4], c3, c1, c2); 703 r[11] = c3; 704 c3 = 0; 705 mul_add_c(a[7], b[5], c1, c2, c3); 706 mul_add_c(a[6], b[6], c1, c2, c3); 707 mul_add_c(a[5], b[7], c1, c2, c3); 708 r[12] = c1; 709 c1 = 0; 710 mul_add_c(a[6], b[7], c2, c3, c1); 711 mul_add_c(a[7], b[6], c2, c3, c1); 712 r[13] = c2; 713 c2 = 0; 714 mul_add_c(a[7], b[7], c3, c1, c2); 715 r[14] = c3; 716 r[15] = c1; 717} 718 719void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 720{ 721# ifdef BN_LLONG 722 BN_ULLONG t; 723# else 724 BN_ULONG bl, bh; 725# endif 726 BN_ULONG t1, t2; 727 BN_ULONG c1, c2, c3; 728 729 c1 = 0; 730 c2 = 0; 731 c3 = 0; 732 mul_add_c(a[0], b[0], c1, c2, c3); 733 r[0] = c1; 734 c1 = 0; 735 mul_add_c(a[0], b[1], c2, c3, c1); 736 mul_add_c(a[1], b[0], c2, c3, c1); 737 r[1] = c2; 738 c2 = 0; 739 mul_add_c(a[2], b[0], c3, c1, c2); 740 mul_add_c(a[1], b[1], c3, c1, c2); 741 mul_add_c(a[0], b[2], c3, c1, c2); 742 r[2] = c3; 743 c3 = 0; 744 mul_add_c(a[0], b[3], c1, c2, c3); 745 mul_add_c(a[1], b[2], c1, c2, c3); 746 mul_add_c(a[2], b[1], c1, c2, c3); 747 mul_add_c(a[3], b[0], c1, c2, c3); 748 r[3] = c1; 749 c1 = 0; 750 mul_add_c(a[3], b[1], c2, c3, c1); 751 mul_add_c(a[2], b[2], c2, c3, c1); 752 mul_add_c(a[1], b[3], c2, c3, c1); 753 r[4] = c2; 754 c2 = 0; 755 mul_add_c(a[2], b[3], c3, c1, c2); 756 mul_add_c(a[3], b[2], c3, c1, c2); 757 r[5] = c3; 758 c3 = 0; 759 mul_add_c(a[3], b[3], c1, c2, c3); 760 r[6] = c1; 761 r[7] = c2; 762} 763 764void 
bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 765{ 766# ifdef BN_LLONG 767 BN_ULLONG t, tt; 768# else 769 BN_ULONG bl, bh; 770# endif 771 BN_ULONG t1, t2; 772 BN_ULONG c1, c2, c3; 773 774 c1 = 0; 775 c2 = 0; 776 c3 = 0; 777 sqr_add_c(a, 0, c1, c2, c3); 778 r[0] = c1; 779 c1 = 0; 780 sqr_add_c2(a, 1, 0, c2, c3, c1); 781 r[1] = c2; 782 c2 = 0; 783 sqr_add_c(a, 1, c3, c1, c2); 784 sqr_add_c2(a, 2, 0, c3, c1, c2); 785 r[2] = c3; 786 c3 = 0; 787 sqr_add_c2(a, 3, 0, c1, c2, c3); 788 sqr_add_c2(a, 2, 1, c1, c2, c3); 789 r[3] = c1; 790 c1 = 0; 791 sqr_add_c(a, 2, c2, c3, c1); 792 sqr_add_c2(a, 3, 1, c2, c3, c1); 793 sqr_add_c2(a, 4, 0, c2, c3, c1); 794 r[4] = c2; 795 c2 = 0; 796 sqr_add_c2(a, 5, 0, c3, c1, c2); 797 sqr_add_c2(a, 4, 1, c3, c1, c2); 798 sqr_add_c2(a, 3, 2, c3, c1, c2); 799 r[5] = c3; 800 c3 = 0; 801 sqr_add_c(a, 3, c1, c2, c3); 802 sqr_add_c2(a, 4, 2, c1, c2, c3); 803 sqr_add_c2(a, 5, 1, c1, c2, c3); 804 sqr_add_c2(a, 6, 0, c1, c2, c3); 805 r[6] = c1; 806 c1 = 0; 807 sqr_add_c2(a, 7, 0, c2, c3, c1); 808 sqr_add_c2(a, 6, 1, c2, c3, c1); 809 sqr_add_c2(a, 5, 2, c2, c3, c1); 810 sqr_add_c2(a, 4, 3, c2, c3, c1); 811 r[7] = c2; 812 c2 = 0; 813 sqr_add_c(a, 4, c3, c1, c2); 814 sqr_add_c2(a, 5, 3, c3, c1, c2); 815 sqr_add_c2(a, 6, 2, c3, c1, c2); 816 sqr_add_c2(a, 7, 1, c3, c1, c2); 817 r[8] = c3; 818 c3 = 0; 819 sqr_add_c2(a, 7, 2, c1, c2, c3); 820 sqr_add_c2(a, 6, 3, c1, c2, c3); 821 sqr_add_c2(a, 5, 4, c1, c2, c3); 822 r[9] = c1; 823 c1 = 0; 824 sqr_add_c(a, 5, c2, c3, c1); 825 sqr_add_c2(a, 6, 4, c2, c3, c1); 826 sqr_add_c2(a, 7, 3, c2, c3, c1); 827 r[10] = c2; 828 c2 = 0; 829 sqr_add_c2(a, 7, 4, c3, c1, c2); 830 sqr_add_c2(a, 6, 5, c3, c1, c2); 831 r[11] = c3; 832 c3 = 0; 833 sqr_add_c(a, 6, c1, c2, c3); 834 sqr_add_c2(a, 7, 5, c1, c2, c3); 835 r[12] = c1; 836 c1 = 0; 837 sqr_add_c2(a, 7, 6, c2, c3, c1); 838 r[13] = c2; 839 c2 = 0; 840 sqr_add_c(a, 7, c3, c1, c2); 841 r[14] = c3; 842 r[15] = c1; 843} 844 845void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 
846{ 847# ifdef BN_LLONG 848 BN_ULLONG t, tt; 849# else 850 BN_ULONG bl, bh; 851# endif 852 BN_ULONG t1, t2; 853 BN_ULONG c1, c2, c3; 854 855 c1 = 0; 856 c2 = 0; 857 c3 = 0; 858 sqr_add_c(a, 0, c1, c2, c3); 859 r[0] = c1; 860 c1 = 0; 861 sqr_add_c2(a, 1, 0, c2, c3, c1); 862 r[1] = c2; 863 c2 = 0; 864 sqr_add_c(a, 1, c3, c1, c2); 865 sqr_add_c2(a, 2, 0, c3, c1, c2); 866 r[2] = c3; 867 c3 = 0; 868 sqr_add_c2(a, 3, 0, c1, c2, c3); 869 sqr_add_c2(a, 2, 1, c1, c2, c3); 870 r[3] = c1; 871 c1 = 0; 872 sqr_add_c(a, 2, c2, c3, c1); 873 sqr_add_c2(a, 3, 1, c2, c3, c1); 874 r[4] = c2; 875 c2 = 0; 876 sqr_add_c2(a, 3, 2, c3, c1, c2); 877 r[5] = c3; 878 c3 = 0; 879 sqr_add_c(a, 3, c1, c2, c3); 880 r[6] = c1; 881 r[7] = c2; 882} 883#else /* !BN_MUL_COMBA */ 884 885/* hmm... is it faster just to do a multiply? */ 886# undef bn_sqr_comba4 887void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) 888{ 889 BN_ULONG t[8]; 890 bn_sqr_normal(r, a, 4, t); 891} 892 893# undef bn_sqr_comba8 894void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) 895{ 896 BN_ULONG t[16]; 897 bn_sqr_normal(r, a, 8, t); 898} 899 900void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 901{ 902 r[4] = bn_mul_words(&(r[0]), a, 4, b[0]); 903 r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]); 904 r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]); 905 r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]); 906} 907 908void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 909{ 910 r[8] = bn_mul_words(&(r[0]), a, 8, b[0]); 911 r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]); 912 r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]); 913 r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]); 914 r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]); 915 r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]); 916 r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]); 917 r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]); 918} 919 920#endif /* !BN_MUL_COMBA */ 921