155714Skris/* crypto/bn/bn_asm.c */ 255714Skris/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 355714Skris * All rights reserved. 455714Skris * 555714Skris * This package is an SSL implementation written 655714Skris * by Eric Young (eay@cryptsoft.com). 755714Skris * The implementation was written so as to conform with Netscapes SSL. 8296341Sdelphij * 955714Skris * This library is free for commercial and non-commercial use as long as 1055714Skris * the following conditions are aheared to. The following conditions 1155714Skris * apply to all code found in this distribution, be it the RC4, RSA, 1255714Skris * lhash, DES, etc., code; not just the SSL code. The SSL documentation 1355714Skris * included with this distribution is covered by the same copyright terms 1455714Skris * except that the holder is Tim Hudson (tjh@cryptsoft.com). 15296341Sdelphij * 1655714Skris * Copyright remains Eric Young's, and as such any Copyright notices in 1755714Skris * the code are not to be removed. 1855714Skris * If this package is used in a product, Eric Young should be given attribution 1955714Skris * as the author of the parts of the library used. 2055714Skris * This can be in the form of a textual message at program startup or 2155714Skris * in documentation (online or textual) provided with the package. 22296341Sdelphij * 2355714Skris * Redistribution and use in source and binary forms, with or without 2455714Skris * modification, are permitted provided that the following conditions 2555714Skris * are met: 2655714Skris * 1. Redistributions of source code must retain the copyright 2755714Skris * notice, this list of conditions and the following disclaimer. 2855714Skris * 2. Redistributions in binary form must reproduce the above copyright 2955714Skris * notice, this list of conditions and the following disclaimer in the 3055714Skris * documentation and/or other materials provided with the distribution. 3155714Skris * 3. All advertising materials mentioning features or use of this software 3255714Skris * must display the following acknowledgement: 3355714Skris * "This product includes cryptographic software written by 3455714Skris * Eric Young (eay@cryptsoft.com)" 3555714Skris * The word 'cryptographic' can be left out if the rouines from the library 3655714Skris * being used are not cryptographic related :-). 37296341Sdelphij * 4. If you include any Windows specific code (or a derivative thereof) from 3855714Skris * the apps directory (application code) you must include an acknowledgement: 3955714Skris * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" 40296341Sdelphij * 4155714Skris * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND 4255714Skris * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 4355714Skris * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 4455714Skris * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 4555714Skris * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 4655714Skris * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 4755714Skris * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 4855714Skris * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 4955714Skris * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 5055714Skris * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 5155714Skris * SUCH DAMAGE. 52296341Sdelphij * 5355714Skris * The licence and distribution terms for any publically available version or 5455714Skris * derivative of this code cannot be changed. i.e. this code cannot simply be 5555714Skris * copied and put under another distribution licence 5655714Skris * [including the GNU Public Licence.] 5755714Skris */ 5855714Skris 5959191Skris#ifndef BN_DEBUG 60296341Sdelphij# undef NDEBUG /* avoid conflicting definitions */ 6159191Skris# define NDEBUG 6259191Skris#endif 6359191Skris 6455714Skris#include <stdio.h> 6559191Skris#include <assert.h> 6655714Skris#include "cryptlib.h" 6755714Skris#include "bn_lcl.h" 6855714Skris 6959191Skris#if defined(BN_LLONG) || defined(BN_UMULT_HIGH) 7055714Skris 71296341SdelphijBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 72296341Sdelphij BN_ULONG w) 73296341Sdelphij{ 74296341Sdelphij BN_ULONG c1 = 0; 7555714Skris 76296341Sdelphij assert(num >= 0); 77296341Sdelphij if (num <= 0) 78296341Sdelphij return (c1); 7955714Skris 80296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT 81296341Sdelphij while (num & ~3) { 82296341Sdelphij mul_add(rp[0], ap[0], w, c1); 83296341Sdelphij mul_add(rp[1], ap[1], w, c1); 84296341Sdelphij mul_add(rp[2], ap[2], w, c1); 85296341Sdelphij mul_add(rp[3], ap[3], w, c1); 86296341Sdelphij ap += 4; 87296341Sdelphij rp += 4; 88296341Sdelphij num -= 4; 89296341Sdelphij } 90296341Sdelphij# endif 91296341Sdelphij while (num) { 92296341Sdelphij mul_add(rp[0], ap[0], w, c1); 93296341Sdelphij ap++; 94296341Sdelphij rp++; 95296341Sdelphij num--; 96296341Sdelphij } 9755714Skris 98296341Sdelphij return (c1); 99296341Sdelphij} 100296341Sdelphij 101109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 102296341Sdelphij{ 103296341Sdelphij BN_ULONG c1 = 0; 10455714Skris 105296341Sdelphij assert(num >= 0); 106296341Sdelphij if (num <= 0) 107296341Sdelphij return (c1); 10855714Skris 109296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT 110296341Sdelphij while (num & ~3) { 111296341Sdelphij mul(rp[0], ap[0], w, c1); 112296341Sdelphij mul(rp[1], ap[1], w, c1); 113296341Sdelphij mul(rp[2], ap[2], w, c1); 114296341Sdelphij mul(rp[3], ap[3], w, c1); 115296341Sdelphij ap += 4; 116296341Sdelphij rp += 4; 117296341Sdelphij num -= 4; 118296341Sdelphij } 119296341Sdelphij# endif 120296341Sdelphij while (num) { 121296341Sdelphij mul(rp[0], ap[0], w, c1); 122296341Sdelphij ap++; 123296341Sdelphij rp++; 124296341Sdelphij num--; 125296341Sdelphij } 126296341Sdelphij return (c1); 127296341Sdelphij} 12855714Skris 129109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 130296341Sdelphij{ 131296341Sdelphij assert(n >= 0); 132296341Sdelphij if (n <= 0) 133296341Sdelphij return; 134238405Sjkim 135296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT 136296341Sdelphij while (n & ~3) { 137296341Sdelphij sqr(r[0], r[1], a[0]); 138296341Sdelphij sqr(r[2], r[3], a[1]); 139296341Sdelphij sqr(r[4], r[5], a[2]); 140296341Sdelphij sqr(r[6], r[7], a[3]); 141296341Sdelphij a += 4; 142296341Sdelphij r += 8; 143296341Sdelphij n -= 4; 144296341Sdelphij } 145296341Sdelphij# endif 146296341Sdelphij while (n) { 147296341Sdelphij sqr(r[0], r[1], a[0]); 148296341Sdelphij a++; 149296341Sdelphij r += 2; 150296341Sdelphij n--; 151296341Sdelphij } 152296341Sdelphij} 15355714Skris 154296341Sdelphij#else /* !(defined(BN_LLONG) || 155296341Sdelphij * defined(BN_UMULT_HIGH)) */ 15655714Skris 157296341SdelphijBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 158296341Sdelphij BN_ULONG w) 159296341Sdelphij{ 160296341Sdelphij BN_ULONG c = 0; 161296341Sdelphij BN_ULONG bl, bh; 16255714Skris 163296341Sdelphij assert(num >= 0); 164296341Sdelphij if (num <= 0) 165296341Sdelphij return ((BN_ULONG)0); 16655714Skris 167296341Sdelphij bl = LBITS(w); 168296341Sdelphij bh = HBITS(w); 16955714Skris 170296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT 171296341Sdelphij while (num & ~3) { 172296341Sdelphij mul_add(rp[0], ap[0], bl, bh, c); 173296341Sdelphij mul_add(rp[1], ap[1], bl, bh, c); 174296341Sdelphij mul_add(rp[2], ap[2], bl, bh, c); 175296341Sdelphij mul_add(rp[3], ap[3], bl, bh, c); 176296341Sdelphij ap += 4; 177296341Sdelphij rp += 4; 178296341Sdelphij num -= 4; 179296341Sdelphij } 180296341Sdelphij# endif 181296341Sdelphij while (num) { 182296341Sdelphij mul_add(rp[0], ap[0], bl, bh, c); 183296341Sdelphij ap++; 184296341Sdelphij rp++; 185296341Sdelphij num--; 186296341Sdelphij } 187296341Sdelphij return (c); 188296341Sdelphij} 18955714Skris 190109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 191296341Sdelphij{ 192296341Sdelphij BN_ULONG carry = 0; 193296341Sdelphij BN_ULONG bl, bh; 19455714Skris 195296341Sdelphij assert(num >= 0); 196296341Sdelphij if (num <= 0) 197296341Sdelphij return ((BN_ULONG)0); 19855714Skris 199296341Sdelphij bl = LBITS(w); 200296341Sdelphij bh = HBITS(w); 20155714Skris 202296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT 203296341Sdelphij while (num & ~3) { 204296341Sdelphij mul(rp[0], ap[0], bl, bh, carry); 205296341Sdelphij mul(rp[1], ap[1], bl, bh, carry); 206296341Sdelphij mul(rp[2], ap[2], bl, bh, carry); 207296341Sdelphij mul(rp[3], ap[3], bl, bh, carry); 208296341Sdelphij ap += 4; 209296341Sdelphij rp += 4; 210296341Sdelphij num -= 4; 211296341Sdelphij } 212296341Sdelphij# endif 213296341Sdelphij while (num) { 214296341Sdelphij mul(rp[0], ap[0], bl, bh, carry); 215296341Sdelphij ap++; 216296341Sdelphij rp++; 217296341Sdelphij num--; 218296341Sdelphij } 219296341Sdelphij return (carry); 220296341Sdelphij} 22155714Skris 222109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 223296341Sdelphij{ 224296341Sdelphij assert(n >= 0); 225296341Sdelphij if (n <= 0) 226296341Sdelphij return; 227238405Sjkim 228296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT 229296341Sdelphij while (n & ~3) { 230296341Sdelphij sqr64(r[0], r[1], a[0]); 231296341Sdelphij sqr64(r[2], r[3], a[1]); 232296341Sdelphij sqr64(r[4], r[5], a[2]); 233296341Sdelphij sqr64(r[6], r[7], a[3]); 234296341Sdelphij a += 4; 235296341Sdelphij r += 8; 236296341Sdelphij n -= 4; 237296341Sdelphij } 238296341Sdelphij# endif 239296341Sdelphij while (n) { 240296341Sdelphij sqr64(r[0], r[1], a[0]); 241296341Sdelphij a++; 242296341Sdelphij r += 2; 243296341Sdelphij n--; 244296341Sdelphij } 245296341Sdelphij} 24655714Skris 247296341Sdelphij#endif /* !(defined(BN_LLONG) || 248296341Sdelphij * defined(BN_UMULT_HIGH)) */ 24955714Skris 25055714Skris#if defined(BN_LLONG) && defined(BN_DIV2W) 25155714Skris 25255714SkrisBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 253296341Sdelphij{ 254296341Sdelphij return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d)); 255296341Sdelphij} 25655714Skris 25755714Skris#else 25855714Skris 25968651Skris/* Divide h,l by d and return the result. */ 26055714Skris/* I need to test this some more :-( */ 26155714SkrisBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 262296341Sdelphij{ 263296341Sdelphij BN_ULONG dh, dl, q, ret = 0, th, tl, t; 264296341Sdelphij int i, count = 2; 26555714Skris 266296341Sdelphij if (d == 0) 267296341Sdelphij return (BN_MASK2); 26855714Skris 269296341Sdelphij i = BN_num_bits_word(d); 270296341Sdelphij assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); 27168651Skris 272296341Sdelphij i = BN_BITS2 - i; 273296341Sdelphij if (h >= d) 274296341Sdelphij h -= d; 27555714Skris 276296341Sdelphij if (i) { 277296341Sdelphij d <<= i; 278296341Sdelphij h = (h << i) | (l >> (BN_BITS2 - i)); 279296341Sdelphij l <<= i; 280296341Sdelphij } 281296341Sdelphij dh = (d & BN_MASK2h) >> BN_BITS4; 282296341Sdelphij dl = (d & BN_MASK2l); 283296341Sdelphij for (;;) { 284296341Sdelphij if ((h >> BN_BITS4) == dh) 285296341Sdelphij q = BN_MASK2l; 286296341Sdelphij else 287296341Sdelphij q = h / dh; 28855714Skris 289296341Sdelphij th = q * dh; 290296341Sdelphij tl = dl * q; 291296341Sdelphij for (;;) { 292296341Sdelphij t = h - th; 293296341Sdelphij if ((t & BN_MASK2h) || 294296341Sdelphij ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) 295296341Sdelphij break; 296296341Sdelphij q--; 297296341Sdelphij th -= dh; 298296341Sdelphij tl -= dl; 299296341Sdelphij } 300296341Sdelphij t = (tl >> BN_BITS4); 301296341Sdelphij tl = (tl << BN_BITS4) & BN_MASK2h; 302296341Sdelphij th += t; 30355714Skris 304296341Sdelphij if (l < tl) 305296341Sdelphij th++; 306296341Sdelphij l -= tl; 307296341Sdelphij if (h < th) { 308296341Sdelphij h += d; 309296341Sdelphij q--; 310296341Sdelphij } 311296341Sdelphij h -= th; 31255714Skris 313296341Sdelphij if (--count == 0) 314296341Sdelphij break; 31555714Skris 316296341Sdelphij ret = q << BN_BITS4; 317296341Sdelphij h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2; 318296341Sdelphij l = (l & BN_MASK2l) << BN_BITS4; 319296341Sdelphij } 320296341Sdelphij ret |= q; 321296341Sdelphij return (ret); 322296341Sdelphij} 323296341Sdelphij#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ 32455714Skris 32555714Skris#ifdef BN_LLONG 326296341SdelphijBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 327296341Sdelphij int n) 328296341Sdelphij{ 329296341Sdelphij BN_ULLONG ll = 0; 33055714Skris 331296341Sdelphij assert(n >= 0); 332296341Sdelphij if (n <= 0) 333296341Sdelphij return ((BN_ULONG)0); 33455714Skris 335296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT 336296341Sdelphij while (n & ~3) { 337296341Sdelphij ll += (BN_ULLONG) a[0] + b[0]; 338296341Sdelphij r[0] = (BN_ULONG)ll & BN_MASK2; 339296341Sdelphij ll >>= BN_BITS2; 340296341Sdelphij ll += (BN_ULLONG) a[1] + b[1]; 341296341Sdelphij r[1] = (BN_ULONG)ll & BN_MASK2; 342296341Sdelphij ll >>= BN_BITS2; 343296341Sdelphij ll += (BN_ULLONG) a[2] + b[2]; 344296341Sdelphij r[2] = (BN_ULONG)ll & BN_MASK2; 345296341Sdelphij ll >>= BN_BITS2; 346296341Sdelphij ll += (BN_ULLONG) a[3] + b[3]; 347296341Sdelphij r[3] = (BN_ULONG)ll & BN_MASK2; 348296341Sdelphij ll >>= BN_BITS2; 349296341Sdelphij a += 4; 350296341Sdelphij b += 4; 351296341Sdelphij r += 4; 352296341Sdelphij n -= 4; 353296341Sdelphij } 354296341Sdelphij# endif 355296341Sdelphij while (n) { 356296341Sdelphij ll += (BN_ULLONG) a[0] + b[0]; 357296341Sdelphij r[0] = (BN_ULONG)ll & BN_MASK2; 358296341Sdelphij ll >>= BN_BITS2; 359296341Sdelphij a++; 360296341Sdelphij b++; 361296341Sdelphij r++; 362296341Sdelphij n--; 363296341Sdelphij } 364296341Sdelphij return ((BN_ULONG)ll); 365296341Sdelphij} 366296341Sdelphij#else /* !BN_LLONG */ 367296341SdelphijBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 368296341Sdelphij int n) 369296341Sdelphij{ 370296341Sdelphij BN_ULONG c, l, t; 37155714Skris 372296341Sdelphij assert(n >= 0); 373296341Sdelphij if (n <= 0) 374296341Sdelphij return ((BN_ULONG)0); 37555714Skris 376296341Sdelphij c = 0; 377296341Sdelphij# ifndef OPENSSL_SMALL_FOOTPRINT 378296341Sdelphij while (n & ~3) { 379296341Sdelphij t = a[0]; 380296341Sdelphij t = (t + c) & BN_MASK2; 381296341Sdelphij c = (t < c); 382296341Sdelphij l = (t + b[0]) & BN_MASK2; 383296341Sdelphij c += (l < t); 384296341Sdelphij r[0] = l; 385296341Sdelphij t = a[1]; 386296341Sdelphij t = (t + c) & BN_MASK2; 387296341Sdelphij c = (t < c); 388296341Sdelphij l = (t + b[1]) & BN_MASK2; 389296341Sdelphij c += (l < t); 390296341Sdelphij r[1] = l; 391296341Sdelphij t = a[2]; 392296341Sdelphij t = (t + c) & BN_MASK2; 393296341Sdelphij c = (t < c); 394296341Sdelphij l = (t + b[2]) & BN_MASK2; 395296341Sdelphij c += (l < t); 396296341Sdelphij r[2] = l; 397296341Sdelphij t = a[3]; 398296341Sdelphij t = (t + c) & BN_MASK2; 399296341Sdelphij c = (t < c); 400296341Sdelphij l = (t + b[3]) & BN_MASK2; 401296341Sdelphij c += (l < t); 402296341Sdelphij r[3] = l; 403296341Sdelphij a += 4; 404296341Sdelphij b += 4; 405296341Sdelphij r += 4; 406296341Sdelphij n -= 4; 407296341Sdelphij } 408296341Sdelphij# endif 409296341Sdelphij while (n) { 410296341Sdelphij t = a[0]; 411296341Sdelphij t = (t + c) & BN_MASK2; 412296341Sdelphij c = (t < c); 413296341Sdelphij l = (t + b[0]) & BN_MASK2; 414296341Sdelphij c += (l < t); 415296341Sdelphij r[0] = l; 416296341Sdelphij a++; 417296341Sdelphij b++; 418296341Sdelphij r++; 419296341Sdelphij n--; 420296341Sdelphij } 421296341Sdelphij return ((BN_ULONG)c); 422296341Sdelphij} 423296341Sdelphij#endif /* !BN_LLONG */ 42455714Skris 425296341SdelphijBN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 426296341Sdelphij int n) 427296341Sdelphij{ 428296341Sdelphij BN_ULONG t1, t2; 429296341Sdelphij int c = 0; 43055714Skris 431296341Sdelphij assert(n >= 0); 432296341Sdelphij if (n <= 0) 433296341Sdelphij return ((BN_ULONG)0); 43455714Skris 435238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT 436296341Sdelphij while (n & ~3) { 437296341Sdelphij t1 = a[0]; 438296341Sdelphij t2 = b[0]; 439296341Sdelphij r[0] = (t1 - t2 - c) & BN_MASK2; 440296341Sdelphij if (t1 != t2) 441296341Sdelphij c = (t1 < t2); 442296341Sdelphij t1 = a[1]; 443296341Sdelphij t2 = b[1]; 444296341Sdelphij r[1] = (t1 - t2 - c) & BN_MASK2; 445296341Sdelphij if (t1 != t2) 446296341Sdelphij c = (t1 < t2); 447296341Sdelphij t1 = a[2]; 448296341Sdelphij t2 = b[2]; 449296341Sdelphij r[2] = (t1 - t2 - c) & BN_MASK2; 450296341Sdelphij if (t1 != t2) 451296341Sdelphij c = (t1 < t2); 452296341Sdelphij t1 = a[3]; 453296341Sdelphij t2 = b[3]; 454296341Sdelphij r[3] = (t1 - t2 - c) & BN_MASK2; 455296341Sdelphij if (t1 != t2) 456296341Sdelphij c = (t1 < t2); 457296341Sdelphij a += 4; 458296341Sdelphij b += 4; 459296341Sdelphij r += 4; 460296341Sdelphij n -= 4; 461296341Sdelphij } 462238405Sjkim#endif 463296341Sdelphij while (n) { 464296341Sdelphij t1 = a[0]; 465296341Sdelphij t2 = b[0]; 466296341Sdelphij r[0] = (t1 - t2 - c) & BN_MASK2; 467296341Sdelphij if (t1 != t2) 468296341Sdelphij c = (t1 < t2); 469296341Sdelphij a++; 470296341Sdelphij b++; 471296341Sdelphij r++; 472296341Sdelphij n--; 473296341Sdelphij } 474296341Sdelphij return (c); 475296341Sdelphij} 47655714Skris 477238405Sjkim#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) 47855714Skris 479296341Sdelphij# undef bn_mul_comba8 480296341Sdelphij# undef bn_mul_comba4 481296341Sdelphij# undef bn_sqr_comba8 482296341Sdelphij# undef bn_sqr_comba4 48355714Skris 48459191Skris/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 48559191Skris/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 48659191Skris/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 487296341Sdelphij/* 488296341Sdelphij * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number 489296341Sdelphij * c=(c2,c1,c0) 490296341Sdelphij */ 49159191Skris 492277195Sdelphij/* 493277195Sdelphij * Keep in mind that carrying into high part of multiplication result 494277195Sdelphij * can not overflow, because it cannot be all-ones. 495277195Sdelphij */ 496296341Sdelphij# ifdef BN_LLONG 497296341Sdelphij# define mul_add_c(a,b,c0,c1,c2) \ 498296341Sdelphij t=(BN_ULLONG)a*b; \ 499296341Sdelphij t1=(BN_ULONG)Lw(t); \ 500296341Sdelphij t2=(BN_ULONG)Hw(t); \ 501296341Sdelphij c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 502296341Sdelphij c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 50355714Skris 504296341Sdelphij# define mul_add_c2(a,b,c0,c1,c2) \ 505296341Sdelphij t=(BN_ULLONG)a*b; \ 506296341Sdelphij tt=(t+t)&BN_MASK; \ 507296341Sdelphij if (tt < t) c2++; \ 508296341Sdelphij t1=(BN_ULONG)Lw(tt); \ 509296341Sdelphij t2=(BN_ULONG)Hw(tt); \ 510296341Sdelphij c0=(c0+t1)&BN_MASK2; \ 511296341Sdelphij if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ 512296341Sdelphij c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 51355714Skris 514296341Sdelphij# define sqr_add_c(a,i,c0,c1,c2) \ 515296341Sdelphij t=(BN_ULLONG)a[i]*a[i]; \ 516296341Sdelphij t1=(BN_ULONG)Lw(t); \ 517296341Sdelphij t2=(BN_ULONG)Hw(t); \ 518296341Sdelphij c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 519296341Sdelphij c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 52055714Skris 521296341Sdelphij# define sqr_add_c2(a,i,j,c0,c1,c2) \ 522296341Sdelphij mul_add_c2((a)[i],(a)[j],c0,c1,c2) 52359191Skris 524296341Sdelphij# elif defined(BN_UMULT_LOHI) 525160814Ssimon 526296341Sdelphij# define mul_add_c(a,b,c0,c1,c2) { \ 527296341Sdelphij BN_ULONG ta=(a),tb=(b); \ 528296341Sdelphij BN_UMULT_LOHI(t1,t2,ta,tb); \ 529296341Sdelphij c0 += t1; t2 += (c0<t1)?1:0; \ 530296341Sdelphij c1 += t2; c2 += (c1<t2)?1:0; \ 531296341Sdelphij } 532160814Ssimon 533296341Sdelphij# define mul_add_c2(a,b,c0,c1,c2) { \ 534296341Sdelphij BN_ULONG ta=(a),tb=(b),t0; \ 535296341Sdelphij BN_UMULT_LOHI(t0,t1,ta,tb); \ 536296341Sdelphij c0 += t0; t2 = t1+((c0<t0)?1:0);\ 537296341Sdelphij c1 += t2; c2 += (c1<t2)?1:0; \ 538296341Sdelphij c0 += t0; t1 += (c0<t0)?1:0; \ 539296341Sdelphij c1 += t1; c2 += (c1<t1)?1:0; \ 540296341Sdelphij } 541160814Ssimon 542296341Sdelphij# define sqr_add_c(a,i,c0,c1,c2) { \ 543296341Sdelphij BN_ULONG ta=(a)[i]; \ 544296341Sdelphij BN_UMULT_LOHI(t1,t2,ta,ta); \ 545296341Sdelphij c0 += t1; t2 += (c0<t1)?1:0; \ 546296341Sdelphij c1 += t2; c2 += (c1<t2)?1:0; \ 547296341Sdelphij } 548160814Ssimon 549296341Sdelphij# define sqr_add_c2(a,i,j,c0,c1,c2) \ 550296341Sdelphij mul_add_c2((a)[i],(a)[j],c0,c1,c2) 551160814Ssimon 552296341Sdelphij# elif defined(BN_UMULT_HIGH) 55359191Skris 554296341Sdelphij# define mul_add_c(a,b,c0,c1,c2) { \ 555296341Sdelphij BN_ULONG ta=(a),tb=(b); \ 556296341Sdelphij t1 = ta * tb; \ 557296341Sdelphij t2 = BN_UMULT_HIGH(ta,tb); \ 558296341Sdelphij c0 += t1; t2 += (c0<t1)?1:0; \ 559296341Sdelphij c1 += t2; c2 += (c1<t2)?1:0; \ 560296341Sdelphij } 56159191Skris 562296341Sdelphij# define mul_add_c2(a,b,c0,c1,c2) { \ 563296341Sdelphij BN_ULONG ta=(a),tb=(b),t0; \ 564296341Sdelphij t1 = BN_UMULT_HIGH(ta,tb); \ 565296341Sdelphij t0 = ta * tb; \ 566296341Sdelphij c0 += t0; t2 = t1+((c0<t0)?1:0);\ 567296341Sdelphij c1 += t2; c2 += (c1<t2)?1:0; \ 568296341Sdelphij c0 += t0; t1 += (c0<t0)?1:0; \ 569296341Sdelphij c1 += t1; c2 += (c1<t1)?1:0; \ 570296341Sdelphij } 57159191Skris 572296341Sdelphij# define sqr_add_c(a,i,c0,c1,c2) { \ 573296341Sdelphij BN_ULONG ta=(a)[i]; \ 574296341Sdelphij t1 = ta * ta; \ 575296341Sdelphij t2 = BN_UMULT_HIGH(ta,ta); \ 576296341Sdelphij c0 += t1; t2 += (c0<t1)?1:0; \ 577296341Sdelphij c1 += t2; c2 += (c1<t2)?1:0; \ 578296341Sdelphij } 57959191Skris 580296341Sdelphij# define sqr_add_c2(a,i,j,c0,c1,c2) \ 581296341Sdelphij mul_add_c2((a)[i],(a)[j],c0,c1,c2) 58259191Skris 583296341Sdelphij# else /* !BN_LLONG */ 584296341Sdelphij# define mul_add_c(a,b,c0,c1,c2) \ 585296341Sdelphij t1=LBITS(a); t2=HBITS(a); \ 586296341Sdelphij bl=LBITS(b); bh=HBITS(b); \ 587296341Sdelphij mul64(t1,t2,bl,bh); \ 588296341Sdelphij c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 589296341Sdelphij c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 59055714Skris 591296341Sdelphij# define mul_add_c2(a,b,c0,c1,c2) \ 592296341Sdelphij t1=LBITS(a); t2=HBITS(a); \ 593296341Sdelphij bl=LBITS(b); bh=HBITS(b); \ 594296341Sdelphij mul64(t1,t2,bl,bh); \ 595296341Sdelphij if (t2 & BN_TBIT) c2++; \ 596296341Sdelphij t2=(t2+t2)&BN_MASK2; \ 597296341Sdelphij if (t1 & BN_TBIT) t2++; \ 598296341Sdelphij t1=(t1+t1)&BN_MASK2; \ 599296341Sdelphij c0=(c0+t1)&BN_MASK2; \ 600296341Sdelphij if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ 601296341Sdelphij c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 60255714Skris 603296341Sdelphij# define sqr_add_c(a,i,c0,c1,c2) \ 604296341Sdelphij sqr64(t1,t2,(a)[i]); \ 605296341Sdelphij c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 606296341Sdelphij c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 60755714Skris 608296341Sdelphij# define sqr_add_c2(a,i,j,c0,c1,c2) \ 609296341Sdelphij mul_add_c2((a)[i],(a)[j],c0,c1,c2) 610296341Sdelphij# endif /* !BN_LLONG */ 61155714Skris 61255714Skrisvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 613296341Sdelphij{ 614296341Sdelphij# ifdef BN_LLONG 615296341Sdelphij BN_ULLONG t; 616296341Sdelphij# else 617296341Sdelphij BN_ULONG bl, bh; 618296341Sdelphij# endif 619296341Sdelphij BN_ULONG t1, t2; 620296341Sdelphij BN_ULONG c1, c2, c3; 62155714Skris 622296341Sdelphij c1 = 0; 623296341Sdelphij c2 = 0; 624296341Sdelphij c3 = 0; 625296341Sdelphij mul_add_c(a[0], b[0], c1, c2, c3); 626296341Sdelphij r[0] = c1; 627296341Sdelphij c1 = 0; 628296341Sdelphij mul_add_c(a[0], b[1], c2, c3, c1); 629296341Sdelphij mul_add_c(a[1], b[0], c2, c3, c1); 630296341Sdelphij r[1] = c2; 631296341Sdelphij c2 = 0; 632296341Sdelphij mul_add_c(a[2], b[0], c3, c1, c2); 633296341Sdelphij mul_add_c(a[1], b[1], c3, c1, c2); 634296341Sdelphij mul_add_c(a[0], b[2], c3, c1, c2); 635296341Sdelphij r[2] = c3; 636296341Sdelphij c3 = 0; 637296341Sdelphij mul_add_c(a[0], b[3], c1, c2, c3); 638296341Sdelphij mul_add_c(a[1], b[2], c1, c2, c3); 639296341Sdelphij mul_add_c(a[2], b[1], c1, c2, c3); 640296341Sdelphij mul_add_c(a[3], b[0], c1, c2, c3); 641296341Sdelphij r[3] = c1; 642296341Sdelphij c1 = 0; 643296341Sdelphij mul_add_c(a[4], b[0], c2, c3, c1); 644296341Sdelphij mul_add_c(a[3], b[1], c2, c3, c1); 645296341Sdelphij mul_add_c(a[2], b[2], c2, c3, c1); 646296341Sdelphij mul_add_c(a[1], b[3], c2, c3, c1); 647296341Sdelphij mul_add_c(a[0], b[4], c2, c3, c1); 648296341Sdelphij r[4] = c2; 649296341Sdelphij c2 = 0; 650296341Sdelphij mul_add_c(a[0], b[5], c3, c1, c2); 651296341Sdelphij mul_add_c(a[1], b[4], c3, c1, c2); 652296341Sdelphij mul_add_c(a[2], b[3], c3, c1, c2); 653296341Sdelphij mul_add_c(a[3], b[2], c3, c1, c2); 654296341Sdelphij mul_add_c(a[4], b[1], c3, c1, c2); 655296341Sdelphij mul_add_c(a[5], b[0], c3, c1, c2); 656296341Sdelphij r[5] = c3; 657296341Sdelphij c3 = 0; 658296341Sdelphij mul_add_c(a[6], b[0], c1, c2, c3); 659296341Sdelphij mul_add_c(a[5], b[1], c1, c2, c3); 660296341Sdelphij mul_add_c(a[4], b[2], c1, c2, c3); 661296341Sdelphij mul_add_c(a[3], b[3], c1, c2, c3); 662296341Sdelphij mul_add_c(a[2], b[4], c1, c2, c3); 663296341Sdelphij mul_add_c(a[1], b[5], c1, c2, c3); 664296341Sdelphij mul_add_c(a[0], b[6], c1, c2, c3); 665296341Sdelphij r[6] = c1; 666296341Sdelphij c1 = 0; 667296341Sdelphij mul_add_c(a[0], b[7], c2, c3, c1); 668296341Sdelphij mul_add_c(a[1], b[6], c2, c3, c1); 669296341Sdelphij mul_add_c(a[2], b[5], c2, c3, c1); 670296341Sdelphij mul_add_c(a[3], b[4], c2, c3, c1); 671296341Sdelphij mul_add_c(a[4], b[3], c2, c3, c1); 672296341Sdelphij mul_add_c(a[5], b[2], c2, c3, c1); 673296341Sdelphij mul_add_c(a[6], b[1], c2, c3, c1); 674296341Sdelphij mul_add_c(a[7], b[0], c2, c3, c1); 675296341Sdelphij r[7] = c2; 676296341Sdelphij c2 = 0; 677296341Sdelphij mul_add_c(a[7], b[1], c3, c1, c2); 678296341Sdelphij mul_add_c(a[6], b[2], c3, c1, c2); 679296341Sdelphij mul_add_c(a[5], b[3], c3, c1, c2); 680296341Sdelphij mul_add_c(a[4], b[4], c3, c1, c2); 681296341Sdelphij mul_add_c(a[3], b[5], c3, c1, c2); 682296341Sdelphij mul_add_c(a[2], b[6], c3, c1, c2); 683296341Sdelphij mul_add_c(a[1], b[7], c3, c1, c2); 684296341Sdelphij r[8] = c3; 685296341Sdelphij c3 = 0; 686296341Sdelphij mul_add_c(a[2], b[7], c1, c2, c3); 687296341Sdelphij mul_add_c(a[3], b[6], c1, c2, c3); 688296341Sdelphij mul_add_c(a[4], b[5], c1, c2, c3); 689296341Sdelphij mul_add_c(a[5], b[4], c1, c2, c3); 690296341Sdelphij mul_add_c(a[6], b[3], c1, c2, c3); 691296341Sdelphij mul_add_c(a[7], b[2], c1, c2, c3); 692296341Sdelphij r[9] = c1; 693296341Sdelphij c1 = 0; 694296341Sdelphij mul_add_c(a[7], b[3], c2, c3, c1); 695296341Sdelphij mul_add_c(a[6], b[4], c2, c3, c1); 696296341Sdelphij mul_add_c(a[5], b[5], c2, c3, c1); 697296341Sdelphij mul_add_c(a[4], b[6], c2, c3, c1); 698296341Sdelphij mul_add_c(a[3], b[7], c2, c3, c1); 699296341Sdelphij r[10] = c2; 700296341Sdelphij c2 = 0; 701296341Sdelphij mul_add_c(a[4], b[7], c3, c1, c2); 702296341Sdelphij mul_add_c(a[5], b[6], c3, c1, c2); 703296341Sdelphij mul_add_c(a[6], b[5], c3, c1, c2); 704296341Sdelphij mul_add_c(a[7], b[4], c3, c1, c2); 705296341Sdelphij r[11] = c3; 706296341Sdelphij c3 = 0; 707296341Sdelphij mul_add_c(a[7], b[5], c1, c2, c3); 708296341Sdelphij mul_add_c(a[6], b[6], c1, c2, c3); 709296341Sdelphij mul_add_c(a[5], b[7], c1, c2, c3); 710296341Sdelphij r[12] = c1; 711296341Sdelphij c1 = 0; 712296341Sdelphij mul_add_c(a[6], b[7], c2, c3, c1); 713296341Sdelphij mul_add_c(a[7], b[6], c2, c3, c1); 714296341Sdelphij r[13] = c2; 715296341Sdelphij c2 = 0; 716296341Sdelphij mul_add_c(a[7], b[7], c3, c1, c2); 717296341Sdelphij r[14] = c3; 718296341Sdelphij r[15] = c1; 719296341Sdelphij} 72055714Skris 72155714Skrisvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 722296341Sdelphij{ 723296341Sdelphij# ifdef BN_LLONG 724296341Sdelphij BN_ULLONG t; 725296341Sdelphij# else 726296341Sdelphij BN_ULONG bl, bh; 727296341Sdelphij# endif 728296341Sdelphij BN_ULONG t1, t2; 729296341Sdelphij BN_ULONG c1, c2, c3; 73055714Skris 731296341Sdelphij c1 = 0; 732296341Sdelphij c2 = 0; 733296341Sdelphij c3 = 0; 734296341Sdelphij mul_add_c(a[0], b[0], c1, c2, c3); 735296341Sdelphij r[0] = c1; 736296341Sdelphij c1 = 0; 737296341Sdelphij mul_add_c(a[0], b[1], c2, c3, c1); 738296341Sdelphij mul_add_c(a[1], b[0], c2, c3, c1); 739296341Sdelphij r[1] = c2; 740296341Sdelphij c2 = 0; 741296341Sdelphij mul_add_c(a[2], b[0], c3, c1, c2); 742296341Sdelphij mul_add_c(a[1], b[1], c3, c1, c2); 743296341Sdelphij mul_add_c(a[0], b[2], c3, c1, c2); 744296341Sdelphij r[2] = c3; 745296341Sdelphij c3 = 0; 746296341Sdelphij mul_add_c(a[0], b[3], c1, c2, c3); 747296341Sdelphij mul_add_c(a[1], b[2], c1, c2, c3); 748296341Sdelphij mul_add_c(a[2], b[1], c1, c2, c3); 749296341Sdelphij mul_add_c(a[3], b[0], c1, c2, c3); 750296341Sdelphij r[3] = c1; 751296341Sdelphij c1 = 0; 752296341Sdelphij mul_add_c(a[3], b[1], c2, c3, c1); 753296341Sdelphij mul_add_c(a[2], b[2], c2, c3, c1); 754296341Sdelphij mul_add_c(a[1], b[3], c2, c3, c1); 755296341Sdelphij r[4] = c2; 756296341Sdelphij c2 = 0; 757296341Sdelphij mul_add_c(a[2], b[3], c3, c1, c2); 758296341Sdelphij mul_add_c(a[3], b[2], c3, c1, c2); 759296341Sdelphij r[5] = c3; 760296341Sdelphij c3 = 0; 761296341Sdelphij mul_add_c(a[3], b[3], c1, c2, c3); 762296341Sdelphij r[6] = c1; 763296341Sdelphij r[7] = c2; 764296341Sdelphij} 76555714Skris 766109998Smarkmvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 767296341Sdelphij{ 768296341Sdelphij# ifdef BN_LLONG 769296341Sdelphij BN_ULLONG t, tt; 770296341Sdelphij# else 771296341Sdelphij BN_ULONG bl, bh; 772296341Sdelphij# endif 773296341Sdelphij BN_ULONG t1, t2; 774296341Sdelphij BN_ULONG c1, c2, c3; 77555714Skris 776296341Sdelphij c1 = 0; 777296341Sdelphij c2 = 0; 778296341Sdelphij c3 = 0; 779296341Sdelphij sqr_add_c(a, 0, c1, c2, c3); 780296341Sdelphij r[0] = c1; 781296341Sdelphij c1 = 0; 782296341Sdelphij sqr_add_c2(a, 1, 0, c2, c3, c1); 783296341Sdelphij r[1] = c2; 784296341Sdelphij c2 = 0; 785296341Sdelphij sqr_add_c(a, 1, c3, c1, c2); 786296341Sdelphij sqr_add_c2(a, 2, 0, c3, c1, c2); 787296341Sdelphij r[2] = c3; 788296341Sdelphij c3 = 0; 789296341Sdelphij sqr_add_c2(a, 3, 0, c1, c2, c3); 790296341Sdelphij sqr_add_c2(a, 2, 1, c1, c2, c3); 791296341Sdelphij r[3] = c1; 792296341Sdelphij c1 = 0; 793296341Sdelphij sqr_add_c(a, 2, c2, c3, c1); 794296341Sdelphij sqr_add_c2(a, 3, 1, c2, c3, c1); 795296341Sdelphij sqr_add_c2(a, 4, 0, c2, c3, c1); 796296341Sdelphij r[4] = c2; 797296341Sdelphij c2 = 0; 798296341Sdelphij sqr_add_c2(a, 5, 0, c3, c1, c2); 799296341Sdelphij sqr_add_c2(a, 4, 1, c3, c1, c2); 800296341Sdelphij sqr_add_c2(a, 3, 2, c3, c1, c2); 801296341Sdelphij r[5] = c3; 802296341Sdelphij c3 = 0; 803296341Sdelphij sqr_add_c(a, 3, c1, c2, c3); 804296341Sdelphij sqr_add_c2(a, 4, 2, c1, c2, c3); 805296341Sdelphij sqr_add_c2(a, 5, 1, c1, c2, c3); 806296341Sdelphij sqr_add_c2(a, 6, 0, c1, c2, c3); 807296341Sdelphij r[6] = c1; 808296341Sdelphij c1 = 0; 809296341Sdelphij sqr_add_c2(a, 7, 0, c2, c3, c1); 810296341Sdelphij sqr_add_c2(a, 6, 1, c2, c3, c1); 811296341Sdelphij sqr_add_c2(a, 5, 2, c2, c3, c1); 812296341Sdelphij sqr_add_c2(a, 4, 3, c2, c3, c1); 813296341Sdelphij r[7] = c2; 814296341Sdelphij c2 = 0; 815296341Sdelphij sqr_add_c(a, 4, c3, c1, c2); 816296341Sdelphij sqr_add_c2(a, 5, 3, c3, c1, c2); 817296341Sdelphij sqr_add_c2(a, 6, 2, c3, c1, c2); 818296341Sdelphij sqr_add_c2(a, 7, 1, c3, c1, c2); 819296341Sdelphij r[8] = c3; 820296341Sdelphij c3 = 0; 821296341Sdelphij sqr_add_c2(a, 7, 2, c1, c2, c3); 822296341Sdelphij sqr_add_c2(a, 6, 3, c1, c2, c3); 823296341Sdelphij sqr_add_c2(a, 5, 4, c1, c2, c3); 824296341Sdelphij r[9] = c1; 825296341Sdelphij c1 = 0; 826296341Sdelphij sqr_add_c(a, 5, c2, c3, c1); 827296341Sdelphij sqr_add_c2(a, 6, 4, c2, c3, c1); 828296341Sdelphij sqr_add_c2(a, 7, 3, c2, c3, c1); 829296341Sdelphij r[10] = c2; 830296341Sdelphij c2 = 0; 831296341Sdelphij sqr_add_c2(a, 7, 4, c3, c1, c2); 832296341Sdelphij sqr_add_c2(a, 6, 5, c3, c1, c2); 833296341Sdelphij r[11] = c3; 834296341Sdelphij c3 = 0; 835296341Sdelphij sqr_add_c(a, 6, c1, c2, c3); 836296341Sdelphij sqr_add_c2(a, 7, 5, c1, c2, c3); 837296341Sdelphij r[12] = c1; 838296341Sdelphij c1 = 0; 839296341Sdelphij sqr_add_c2(a, 7, 6, c2, c3, c1); 840296341Sdelphij r[13] = c2; 841296341Sdelphij c2 = 0; 842296341Sdelphij sqr_add_c(a, 7, c3, c1, c2); 843296341Sdelphij r[14] = c3; 844296341Sdelphij r[15] = c1; 845296341Sdelphij} 84655714Skris 847109998Smarkmvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 848296341Sdelphij{ 849296341Sdelphij# ifdef BN_LLONG 850296341Sdelphij BN_ULLONG t, tt; 851296341Sdelphij# else 852296341Sdelphij BN_ULONG bl, bh; 853296341Sdelphij# endif 854296341Sdelphij BN_ULONG t1, t2; 855296341Sdelphij BN_ULONG c1, c2, c3; 85655714Skris 857296341Sdelphij c1 = 0; 858296341Sdelphij c2 = 0; 859296341Sdelphij c3 = 0; 860296341Sdelphij sqr_add_c(a, 0, c1, c2, c3); 861296341Sdelphij r[0] = c1; 862296341Sdelphij c1 = 0; 863296341Sdelphij sqr_add_c2(a, 1, 0, c2, c3, c1); 864296341Sdelphij r[1] = c2; 865296341Sdelphij c2 = 0; 866296341Sdelphij sqr_add_c(a, 1, c3, c1, c2); 867296341Sdelphij sqr_add_c2(a, 2, 0, c3, c1, c2); 868296341Sdelphij r[2] = c3; 869296341Sdelphij c3 = 0; 870296341Sdelphij sqr_add_c2(a, 3, 0, c1, c2, c3); 871296341Sdelphij sqr_add_c2(a, 2, 1, c1, c2, c3); 872296341Sdelphij r[3] = c1; 873296341Sdelphij c1 = 0; 874296341Sdelphij sqr_add_c(a, 2, c2, c3, c1); 875296341Sdelphij sqr_add_c2(a, 3, 1, c2, c3, c1); 876296341Sdelphij r[4] = c2; 877296341Sdelphij c2 = 0; 878296341Sdelphij sqr_add_c2(a, 3, 2, c3, c1, c2); 879296341Sdelphij r[5] = c3; 880296341Sdelphij c3 = 0; 881296341Sdelphij sqr_add_c(a, 3, c1, c2, c3); 882296341Sdelphij r[6] = c1; 883296341Sdelphij r[7] = c2; 884296341Sdelphij} 885238405Sjkim 886296341Sdelphij# ifdef OPENSSL_NO_ASM 887296341Sdelphij# ifdef OPENSSL_BN_ASM_MONT 888296341Sdelphij# include <alloca.h> 889238405Sjkim/* 890238405Sjkim * This is essentially reference implementation, which may or may not 891238405Sjkim * result in performance improvement. E.g. on IA-32 this routine was 892238405Sjkim * observed to give 40% faster rsa1024 private key operations and 10% 893238405Sjkim * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only 894238405Sjkim * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a 895238405Sjkim * reference implementation, one to be used as starting point for 896238405Sjkim * platform-specific assembler. Mentioned numbers apply to compiler 897238405Sjkim * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and 898238405Sjkim * can vary not only from platform to platform, but even for compiler 899238405Sjkim * versions. Assembler vs. assembler improvement coefficients can 900238405Sjkim * [and are known to] differ and are to be documented elsewhere. 901238405Sjkim */ 902296341Sdelphijint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 903296341Sdelphij const BN_ULONG *np, const BN_ULONG *n0p, int num) 904296341Sdelphij{ 905296341Sdelphij BN_ULONG c0, c1, ml, *tp, n0; 906296341Sdelphij# ifdef mul64 907296341Sdelphij BN_ULONG mh; 908296341Sdelphij# endif 909296341Sdelphij volatile BN_ULONG *vp; 910296341Sdelphij int i = 0, j; 911238405Sjkim 912296341Sdelphij# if 0 /* template for platform-specific 913296341Sdelphij * implementation */ 914296341Sdelphij if (ap == bp) 915296341Sdelphij return bn_sqr_mont(rp, ap, np, n0p, num); 916296341Sdelphij# endif 917296341Sdelphij vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); 918238405Sjkim 919296341Sdelphij n0 = *n0p; 920238405Sjkim 921296341Sdelphij c0 = 0; 922296341Sdelphij ml = bp[0]; 923296341Sdelphij# ifdef mul64 924296341Sdelphij mh = HBITS(ml); 925296341Sdelphij ml = LBITS(ml); 926296341Sdelphij for (j = 0; j < num; ++j) 927296341Sdelphij mul(tp[j], ap[j], ml, mh, c0); 928296341Sdelphij# else 929296341Sdelphij for (j = 0; j < num; ++j) 930296341Sdelphij mul(tp[j], ap[j], ml, c0); 931296341Sdelphij# endif 932238405Sjkim 933296341Sdelphij tp[num] = c0; 934296341Sdelphij tp[num + 1] = 0; 935296341Sdelphij goto enter; 936238405Sjkim 937296341Sdelphij for (i = 0; i < num; i++) { 938296341Sdelphij c0 = 0; 939296341Sdelphij ml = bp[i]; 940296341Sdelphij# ifdef mul64 941296341Sdelphij mh = HBITS(ml); 942296341Sdelphij ml = LBITS(ml); 943296341Sdelphij for (j = 0; j < num; ++j) 944296341Sdelphij mul_add(tp[j], ap[j], ml, mh, c0); 945296341Sdelphij# else 946296341Sdelphij for (j = 0; j < num; ++j) 947296341Sdelphij mul_add(tp[j], ap[j], ml, c0); 948296341Sdelphij# endif 949296341Sdelphij c1 = (tp[num] + c0) & BN_MASK2; 950296341Sdelphij tp[num] = c1; 951296341Sdelphij tp[num + 1] = (c1 < c0 ? 1 : 0); 952296341Sdelphij enter: 953296341Sdelphij c1 = tp[0]; 954296341Sdelphij ml = (c1 * n0) & BN_MASK2; 955296341Sdelphij c0 = 0; 956296341Sdelphij# ifdef mul64 957296341Sdelphij mh = HBITS(ml); 958296341Sdelphij ml = LBITS(ml); 959296341Sdelphij mul_add(c1, np[0], ml, mh, c0); 960296341Sdelphij# else 961296341Sdelphij mul_add(c1, ml, np[0], c0); 962296341Sdelphij# endif 963296341Sdelphij for (j = 1; j < num; j++) { 964296341Sdelphij c1 = tp[j]; 965296341Sdelphij# ifdef mul64 966296341Sdelphij mul_add(c1, np[j], ml, mh, c0); 967296341Sdelphij# else 968296341Sdelphij mul_add(c1, ml, np[j], c0); 969296341Sdelphij# endif 970296341Sdelphij tp[j - 1] = c1 & BN_MASK2; 971296341Sdelphij } 972296341Sdelphij c1 = (tp[num] + c0) & BN_MASK2; 973296341Sdelphij tp[num - 1] = c1; 974296341Sdelphij tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0); 975296341Sdelphij } 976238405Sjkim 977296341Sdelphij if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 978296341Sdelphij c0 = bn_sub_words(rp, tp, np, num); 979296341Sdelphij if (tp[num] != 0 || c0 == 0) { 980296341Sdelphij for (i = 0; i < num + 2; i++) 981296341Sdelphij vp[i] = 0; 982296341Sdelphij return 1; 983296341Sdelphij } 984296341Sdelphij } 985296341Sdelphij for (i = 0; i < num; i++) 986296341Sdelphij rp[i] = tp[i], vp[i] = 0; 987296341Sdelphij vp[num] = 0; 988296341Sdelphij vp[num + 1] = 0; 989296341Sdelphij return 1; 990296341Sdelphij} 991296341Sdelphij# else 992238405Sjkim/* 993238405Sjkim * Return value of 0 indicates that multiplication/convolution was not 994238405Sjkim * performed to signal the caller to fall down to alternative/original 995238405Sjkim * code-path. 996238405Sjkim */ 997296341Sdelphijint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 998296341Sdelphij const BN_ULONG *np, const BN_ULONG *n0, int num) 999296341Sdelphij{ 1000296341Sdelphij return 0; 1001296341Sdelphij} 1002296341Sdelphij# endif /* OPENSSL_BN_ASM_MONT */ 1003296341Sdelphij# endif 1004238405Sjkim 1005296341Sdelphij#else /* !BN_MUL_COMBA */ 100655714Skris 100755714Skris/* hmm... is it faster just to do a multiply? */ 1008296341Sdelphij# undef bn_sqr_comba4 1009238405Sjkimvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 1010296341Sdelphij{ 1011296341Sdelphij BN_ULONG t[8]; 1012296341Sdelphij bn_sqr_normal(r, a, 4, t); 1013296341Sdelphij} 101455714Skris 1015296341Sdelphij# undef bn_sqr_comba8 1016238405Sjkimvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 1017296341Sdelphij{ 1018296341Sdelphij BN_ULONG t[16]; 1019296341Sdelphij bn_sqr_normal(r, a, 8, t); 1020296341Sdelphij} 102155714Skris 102255714Skrisvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1023296341Sdelphij{ 1024296341Sdelphij r[4] = bn_mul_words(&(r[0]), a, 4, b[0]); 1025296341Sdelphij r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]); 1026296341Sdelphij r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]); 1027296341Sdelphij r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]); 1028296341Sdelphij} 102955714Skris 103055714Skrisvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1031296341Sdelphij{ 1032296341Sdelphij r[8] = bn_mul_words(&(r[0]), a, 8, b[0]); 1033296341Sdelphij r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]); 1034296341Sdelphij r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]); 1035296341Sdelphij r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]); 1036296341Sdelphij r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]); 1037296341Sdelphij r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]); 1038296341Sdelphij r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]); 1039296341Sdelphij r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]); 1040296341Sdelphij} 104155714Skris 1042296341Sdelphij# ifdef OPENSSL_NO_ASM 1043296341Sdelphij# ifdef OPENSSL_BN_ASM_MONT 1044296341Sdelphij# include <alloca.h> 1045296341Sdelphijint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 1046296341Sdelphij const BN_ULONG *np, const BN_ULONG *n0p, int num) 1047296341Sdelphij{ 1048296341Sdelphij BN_ULONG c0, c1, *tp, n0 = *n0p; 1049296341Sdelphij volatile BN_ULONG *vp; 1050296341Sdelphij int i = 0, j; 1051238405Sjkim 1052296341Sdelphij vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); 1053238405Sjkim 1054296341Sdelphij for (i = 0; i <= num; i++) 1055296341Sdelphij tp[i] = 0; 1056238405Sjkim 1057296341Sdelphij for (i = 0; i < num; i++) { 1058296341Sdelphij c0 = bn_mul_add_words(tp, ap, num, bp[i]); 1059296341Sdelphij c1 = (tp[num] + c0) & BN_MASK2; 1060296341Sdelphij tp[num] = c1; 1061296341Sdelphij tp[num + 1] = (c1 < c0 ? 1 : 0); 1062238405Sjkim 1063296341Sdelphij c0 = bn_mul_add_words(tp, np, num, tp[0] * n0); 1064296341Sdelphij c1 = (tp[num] + c0) & BN_MASK2; 1065296341Sdelphij tp[num] = c1; 1066296341Sdelphij tp[num + 1] += (c1 < c0 ? 1 : 0); 1067296341Sdelphij for (j = 0; j <= num; j++) 1068296341Sdelphij tp[j] = tp[j + 1]; 1069296341Sdelphij } 1070238405Sjkim 1071296341Sdelphij if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 1072296341Sdelphij c0 = bn_sub_words(rp, tp, np, num); 1073296341Sdelphij if (tp[num] != 0 || c0 == 0) { 1074296341Sdelphij for (i = 0; i < num + 2; i++) 1075296341Sdelphij vp[i] = 0; 1076296341Sdelphij return 1; 1077296341Sdelphij } 1078296341Sdelphij } 1079296341Sdelphij for (i = 0; i < num; i++) 1080296341Sdelphij rp[i] = tp[i], vp[i] = 0; 1081296341Sdelphij vp[num] = 0; 1082296341Sdelphij vp[num + 1] = 0; 1083296341Sdelphij return 1; 1084296341Sdelphij} 1085296341Sdelphij# else 1086296341Sdelphijint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 1087296341Sdelphij const BN_ULONG *np, const BN_ULONG *n0, int num) 1088296341Sdelphij{ 1089296341Sdelphij return 0; 1090296341Sdelphij} 1091296341Sdelphij# endif /* OPENSSL_BN_ASM_MONT */ 1092296341Sdelphij# endif 1093238405Sjkim 1094296341Sdelphij#endif /* !BN_MUL_COMBA */ 1095