155714Skris/* crypto/bn/bn_asm.c */ 255714Skris/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 355714Skris * All rights reserved. 455714Skris * 555714Skris * This package is an SSL implementation written 655714Skris * by Eric Young (eay@cryptsoft.com). 755714Skris * The implementation was written so as to conform with Netscapes SSL. 8280297Sjkim * 955714Skris * This library is free for commercial and non-commercial use as long as 1055714Skris * the following conditions are aheared to. The following conditions 1155714Skris * apply to all code found in this distribution, be it the RC4, RSA, 1255714Skris * lhash, DES, etc., code; not just the SSL code. The SSL documentation 1355714Skris * included with this distribution is covered by the same copyright terms 1455714Skris * except that the holder is Tim Hudson (tjh@cryptsoft.com). 15280297Sjkim * 1655714Skris * Copyright remains Eric Young's, and as such any Copyright notices in 1755714Skris * the code are not to be removed. 1855714Skris * If this package is used in a product, Eric Young should be given attribution 1955714Skris * as the author of the parts of the library used. 2055714Skris * This can be in the form of a textual message at program startup or 2155714Skris * in documentation (online or textual) provided with the package. 22280297Sjkim * 2355714Skris * Redistribution and use in source and binary forms, with or without 2455714Skris * modification, are permitted provided that the following conditions 2555714Skris * are met: 2655714Skris * 1. Redistributions of source code must retain the copyright 2755714Skris * notice, this list of conditions and the following disclaimer. 2855714Skris * 2. Redistributions in binary form must reproduce the above copyright 2955714Skris * notice, this list of conditions and the following disclaimer in the 3055714Skris * documentation and/or other materials provided with the distribution. 3155714Skris * 3. 
All advertising materials mentioning features or use of this software 3255714Skris * must display the following acknowledgement: 3355714Skris * "This product includes cryptographic software written by 3455714Skris * Eric Young (eay@cryptsoft.com)" 3555714Skris * The word 'cryptographic' can be left out if the rouines from the library 3655714Skris * being used are not cryptographic related :-). 37280297Sjkim * 4. If you include any Windows specific code (or a derivative thereof) from 3855714Skris * the apps directory (application code) you must include an acknowledgement: 3955714Skris * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" 40280297Sjkim * 4155714Skris * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND 4255714Skris * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 4355714Skris * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 4455714Skris * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 4555714Skris * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 4655714Skris * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 4755714Skris * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 4855714Skris * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 4955714Skris * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 5055714Skris * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 5155714Skris * SUCH DAMAGE. 52280297Sjkim * 5355714Skris * The licence and distribution terms for any publically available version or 5455714Skris * derivative of this code cannot be changed. i.e. this code cannot simply be 5555714Skris * copied and put under another distribution licence 5655714Skris * [including the GNU Public Licence.] 
5755714Skris */ 5855714Skris 5959191Skris#ifndef BN_DEBUG 60280297Sjkim# undef NDEBUG /* avoid conflicting definitions */ 6159191Skris# define NDEBUG 6259191Skris#endif 6359191Skris 6455714Skris#include <stdio.h> 6559191Skris#include <assert.h> 6655714Skris#include "cryptlib.h" 6755714Skris#include "bn_lcl.h" 6855714Skris 6959191Skris#if defined(BN_LLONG) || defined(BN_UMULT_HIGH) 7055714Skris 71280297SjkimBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 72280297Sjkim BN_ULONG w) 73280297Sjkim{ 74280297Sjkim BN_ULONG c1 = 0; 7555714Skris 76280297Sjkim assert(num >= 0); 77280297Sjkim if (num <= 0) 78280297Sjkim return (c1); 7955714Skris 80280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT 81280297Sjkim while (num & ~3) { 82280297Sjkim mul_add(rp[0], ap[0], w, c1); 83280297Sjkim mul_add(rp[1], ap[1], w, c1); 84280297Sjkim mul_add(rp[2], ap[2], w, c1); 85280297Sjkim mul_add(rp[3], ap[3], w, c1); 86280297Sjkim ap += 4; 87280297Sjkim rp += 4; 88280297Sjkim num -= 4; 89280297Sjkim } 90280297Sjkim# endif 91280297Sjkim while (num) { 92280297Sjkim mul_add(rp[0], ap[0], w, c1); 93280297Sjkim ap++; 94280297Sjkim rp++; 95280297Sjkim num--; 96280297Sjkim } 9755714Skris 98280297Sjkim return (c1); 99280297Sjkim} 100280297Sjkim 101109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 102280297Sjkim{ 103280297Sjkim BN_ULONG c1 = 0; 10455714Skris 105280297Sjkim assert(num >= 0); 106280297Sjkim if (num <= 0) 107280297Sjkim return (c1); 10855714Skris 109280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT 110280297Sjkim while (num & ~3) { 111280297Sjkim mul(rp[0], ap[0], w, c1); 112280297Sjkim mul(rp[1], ap[1], w, c1); 113280297Sjkim mul(rp[2], ap[2], w, c1); 114280297Sjkim mul(rp[3], ap[3], w, c1); 115280297Sjkim ap += 4; 116280297Sjkim rp += 4; 117280297Sjkim num -= 4; 118280297Sjkim } 119280297Sjkim# endif 120280297Sjkim while (num) { 121280297Sjkim mul(rp[0], ap[0], w, c1); 122280297Sjkim ap++; 123280297Sjkim rp++; 124280297Sjkim num--; 
125280297Sjkim } 126280297Sjkim return (c1); 127280297Sjkim} 12855714Skris 129109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 130280297Sjkim{ 131280297Sjkim assert(n >= 0); 132280297Sjkim if (n <= 0) 133280297Sjkim return; 134238405Sjkim 135280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT 136280297Sjkim while (n & ~3) { 137280297Sjkim sqr(r[0], r[1], a[0]); 138280297Sjkim sqr(r[2], r[3], a[1]); 139280297Sjkim sqr(r[4], r[5], a[2]); 140280297Sjkim sqr(r[6], r[7], a[3]); 141280297Sjkim a += 4; 142280297Sjkim r += 8; 143280297Sjkim n -= 4; 144280297Sjkim } 145280297Sjkim# endif 146280297Sjkim while (n) { 147280297Sjkim sqr(r[0], r[1], a[0]); 148280297Sjkim a++; 149280297Sjkim r += 2; 150280297Sjkim n--; 151280297Sjkim } 152280297Sjkim} 15355714Skris 154280297Sjkim#else /* !(defined(BN_LLONG) || 155280297Sjkim * defined(BN_UMULT_HIGH)) */ 15655714Skris 157280297SjkimBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 158280297Sjkim BN_ULONG w) 159280297Sjkim{ 160280297Sjkim BN_ULONG c = 0; 161280297Sjkim BN_ULONG bl, bh; 16255714Skris 163280297Sjkim assert(num >= 0); 164280297Sjkim if (num <= 0) 165280297Sjkim return ((BN_ULONG)0); 16655714Skris 167280297Sjkim bl = LBITS(w); 168280297Sjkim bh = HBITS(w); 16955714Skris 170280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT 171280297Sjkim while (num & ~3) { 172280297Sjkim mul_add(rp[0], ap[0], bl, bh, c); 173280297Sjkim mul_add(rp[1], ap[1], bl, bh, c); 174280297Sjkim mul_add(rp[2], ap[2], bl, bh, c); 175280297Sjkim mul_add(rp[3], ap[3], bl, bh, c); 176280297Sjkim ap += 4; 177280297Sjkim rp += 4; 178280297Sjkim num -= 4; 179280297Sjkim } 180280297Sjkim# endif 181280297Sjkim while (num) { 182280297Sjkim mul_add(rp[0], ap[0], bl, bh, c); 183280297Sjkim ap++; 184280297Sjkim rp++; 185280297Sjkim num--; 186280297Sjkim } 187280297Sjkim return (c); 188280297Sjkim} 18955714Skris 190109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 191280297Sjkim{ 192280297Sjkim 
BN_ULONG carry = 0; 193280297Sjkim BN_ULONG bl, bh; 19455714Skris 195280297Sjkim assert(num >= 0); 196280297Sjkim if (num <= 0) 197280297Sjkim return ((BN_ULONG)0); 19855714Skris 199280297Sjkim bl = LBITS(w); 200280297Sjkim bh = HBITS(w); 20155714Skris 202280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT 203280297Sjkim while (num & ~3) { 204280297Sjkim mul(rp[0], ap[0], bl, bh, carry); 205280297Sjkim mul(rp[1], ap[1], bl, bh, carry); 206280297Sjkim mul(rp[2], ap[2], bl, bh, carry); 207280297Sjkim mul(rp[3], ap[3], bl, bh, carry); 208280297Sjkim ap += 4; 209280297Sjkim rp += 4; 210280297Sjkim num -= 4; 211280297Sjkim } 212280297Sjkim# endif 213280297Sjkim while (num) { 214280297Sjkim mul(rp[0], ap[0], bl, bh, carry); 215280297Sjkim ap++; 216280297Sjkim rp++; 217280297Sjkim num--; 218280297Sjkim } 219280297Sjkim return (carry); 220280297Sjkim} 22155714Skris 222109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 223280297Sjkim{ 224280297Sjkim assert(n >= 0); 225280297Sjkim if (n <= 0) 226280297Sjkim return; 227238405Sjkim 228280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT 229280297Sjkim while (n & ~3) { 230280297Sjkim sqr64(r[0], r[1], a[0]); 231280297Sjkim sqr64(r[2], r[3], a[1]); 232280297Sjkim sqr64(r[4], r[5], a[2]); 233280297Sjkim sqr64(r[6], r[7], a[3]); 234280297Sjkim a += 4; 235280297Sjkim r += 8; 236280297Sjkim n -= 4; 237280297Sjkim } 238280297Sjkim# endif 239280297Sjkim while (n) { 240280297Sjkim sqr64(r[0], r[1], a[0]); 241280297Sjkim a++; 242280297Sjkim r += 2; 243280297Sjkim n--; 244280297Sjkim } 245280297Sjkim} 24655714Skris 247280297Sjkim#endif /* !(defined(BN_LLONG) || 248280297Sjkim * defined(BN_UMULT_HIGH)) */ 24955714Skris 25055714Skris#if defined(BN_LLONG) && defined(BN_DIV2W) 25155714Skris 25255714SkrisBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 253280297Sjkim{ 254280297Sjkim return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d)); 255280297Sjkim} 25655714Skris 25755714Skris#else 25855714Skris 25968651Skris/* 
 * Divide h,l by d and return the result.
 *
 * Portable variant used when no double-width type/divide is available.
 * (h,l) is the high and low word of the dividend.  The quotient is built
 * in two passes of the outer loop (count == 2); each pass produces one
 * BN_BITS4-wide piece of it (BN_BITS4 is presumably half of BN_BITS2 --
 * confirm in bn.h).  Returns BN_MASK2 when d == 0.
 */
/* I need to test this some more :-( */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG dh, dl, q, ret = 0, th, tl, t;
    int i, count = 2;

    /* Division by zero: hand back an all-ones word instead of trapping. */
    if (d == 0)
        return (BN_MASK2);

    /* Presumably the bit length of d -- confirm BN_num_bits_word(). */
    i = BN_num_bits_word(d);
    /* Debug-only range check on h (compiled out unless BN_DEBUG is set). */
    assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

    /* From here on i is the left-shift that moves d's top bit to the MSB. */
    i = BN_BITS2 - i;
    if (h >= d)
        h -= d;

    /*
     * Shift divisor and dividend left by the same amount.  Guarded by
     * if (i) so that l is never shifted right by the full word width.
     */
    if (i) {
        d <<= i;
        h = (h << i) | (l >> (BN_BITS2 - i));
        l <<= i;
    }
    /* Upper and lower parts of the shifted divisor. */
    dh = (d & BN_MASK2h) >> BN_BITS4;
    dl = (d & BN_MASK2l);
    for (;;) {
        /* First guess at this quotient piece from the leading parts. */
        if ((h >> BN_BITS4) == dh)
            q = BN_MASK2l;
        else
            q = h / dh;

        /* th = q * dh and tl = q * dl are kept in step with q below. */
        th = q * dh;
        tl = dl * q;
        /* Walk q down until q * d no longer exceeds the current remainder. */
        for (;;) {
            t = h - th;
            if ((t & BN_MASK2h) ||
                ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
                break;
            q--;
            th -= dh;
            tl -= dl;
        }
        /* Realign tl against (h,l): its upper part is folded into th. */
        t = (tl >> BN_BITS4);
        tl = (tl << BN_BITS4) & BN_MASK2h;
        th += t;

        /* Subtract (th,tl) from (h,l), propagating the borrow out of l. */
        if (l < tl)
            th++;
        l -= tl;
        /* Guess was still one too large: add d back and lower q. */
        if (h < th) {
            h += d;
            q--;
        }
        h -= th;

        if (--count == 0)
            break;

        /* First pass only: stash q as the upper piece, shift the remainder. */
        ret = q << BN_BITS4;
        h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
        l = (l & BN_MASK2l) << BN_BITS4;
    }
    /* Second pass: q is the lower piece of the quotient. */
    ret |= q;
    return (ret);
}
#endif                          /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
32455714Skris 32555714Skris#ifdef BN_LLONG 326280297SjkimBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 327280297Sjkim int n) 328280297Sjkim{ 329280297Sjkim BN_ULLONG ll = 0; 33055714Skris 331280297Sjkim assert(n >= 0); 332280297Sjkim if (n <= 0) 333280297Sjkim return ((BN_ULONG)0); 33455714Skris 335280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT 336280297Sjkim while (n & ~3) { 337280297Sjkim ll += (BN_ULLONG) a[0] + b[0]; 338280297Sjkim r[0] = (BN_ULONG)ll & BN_MASK2; 339280297Sjkim ll >>= BN_BITS2; 340280297Sjkim ll += (BN_ULLONG) a[1] + b[1]; 341280297Sjkim r[1] = (BN_ULONG)ll & BN_MASK2; 342280297Sjkim ll >>= BN_BITS2; 343280297Sjkim ll += (BN_ULLONG) a[2] + b[2]; 344280297Sjkim r[2] = (BN_ULONG)ll & BN_MASK2; 345280297Sjkim ll >>= BN_BITS2; 346280297Sjkim ll += (BN_ULLONG) a[3] + b[3]; 347280297Sjkim r[3] = (BN_ULONG)ll & BN_MASK2; 348280297Sjkim ll >>= BN_BITS2; 349280297Sjkim a += 4; 350280297Sjkim b += 4; 351280297Sjkim r += 4; 352280297Sjkim n -= 4; 353280297Sjkim } 354280297Sjkim# endif 355280297Sjkim while (n) { 356280297Sjkim ll += (BN_ULLONG) a[0] + b[0]; 357280297Sjkim r[0] = (BN_ULONG)ll & BN_MASK2; 358280297Sjkim ll >>= BN_BITS2; 359280297Sjkim a++; 360280297Sjkim b++; 361280297Sjkim r++; 362280297Sjkim n--; 363280297Sjkim } 364280297Sjkim return ((BN_ULONG)ll); 365280297Sjkim} 366280297Sjkim#else /* !BN_LLONG */ 367280297SjkimBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 368280297Sjkim int n) 369280297Sjkim{ 370280297Sjkim BN_ULONG c, l, t; 37155714Skris 372280297Sjkim assert(n >= 0); 373280297Sjkim if (n <= 0) 374280297Sjkim return ((BN_ULONG)0); 37555714Skris 376280297Sjkim c = 0; 377280297Sjkim# ifndef OPENSSL_SMALL_FOOTPRINT 378280297Sjkim while (n & ~3) { 379280297Sjkim t = a[0]; 380280297Sjkim t = (t + c) & BN_MASK2; 381280297Sjkim c = (t < c); 382280297Sjkim l = (t + b[0]) & BN_MASK2; 383280297Sjkim c += (l < t); 384280297Sjkim r[0] = l; 385280297Sjkim t = a[1]; 386280297Sjkim t = (t + c) & 
BN_MASK2; 387280297Sjkim c = (t < c); 388280297Sjkim l = (t + b[1]) & BN_MASK2; 389280297Sjkim c += (l < t); 390280297Sjkim r[1] = l; 391280297Sjkim t = a[2]; 392280297Sjkim t = (t + c) & BN_MASK2; 393280297Sjkim c = (t < c); 394280297Sjkim l = (t + b[2]) & BN_MASK2; 395280297Sjkim c += (l < t); 396280297Sjkim r[2] = l; 397280297Sjkim t = a[3]; 398280297Sjkim t = (t + c) & BN_MASK2; 399280297Sjkim c = (t < c); 400280297Sjkim l = (t + b[3]) & BN_MASK2; 401280297Sjkim c += (l < t); 402280297Sjkim r[3] = l; 403280297Sjkim a += 4; 404280297Sjkim b += 4; 405280297Sjkim r += 4; 406280297Sjkim n -= 4; 407280297Sjkim } 408280297Sjkim# endif 409280297Sjkim while (n) { 410280297Sjkim t = a[0]; 411280297Sjkim t = (t + c) & BN_MASK2; 412280297Sjkim c = (t < c); 413280297Sjkim l = (t + b[0]) & BN_MASK2; 414280297Sjkim c += (l < t); 415280297Sjkim r[0] = l; 416280297Sjkim a++; 417280297Sjkim b++; 418280297Sjkim r++; 419280297Sjkim n--; 420280297Sjkim } 421280297Sjkim return ((BN_ULONG)c); 422280297Sjkim} 423280297Sjkim#endif /* !BN_LLONG */ 42455714Skris 425280297SjkimBN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 426280297Sjkim int n) 427280297Sjkim{ 428280297Sjkim BN_ULONG t1, t2; 429280297Sjkim int c = 0; 43055714Skris 431280297Sjkim assert(n >= 0); 432280297Sjkim if (n <= 0) 433280297Sjkim return ((BN_ULONG)0); 43455714Skris 435238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT 436280297Sjkim while (n & ~3) { 437280297Sjkim t1 = a[0]; 438280297Sjkim t2 = b[0]; 439280297Sjkim r[0] = (t1 - t2 - c) & BN_MASK2; 440280297Sjkim if (t1 != t2) 441280297Sjkim c = (t1 < t2); 442280297Sjkim t1 = a[1]; 443280297Sjkim t2 = b[1]; 444280297Sjkim r[1] = (t1 - t2 - c) & BN_MASK2; 445280297Sjkim if (t1 != t2) 446280297Sjkim c = (t1 < t2); 447280297Sjkim t1 = a[2]; 448280297Sjkim t2 = b[2]; 449280297Sjkim r[2] = (t1 - t2 - c) & BN_MASK2; 450280297Sjkim if (t1 != t2) 451280297Sjkim c = (t1 < t2); 452280297Sjkim t1 = a[3]; 453280297Sjkim t2 = b[3]; 454280297Sjkim r[3] = (t1 - 
t2 - c) & BN_MASK2; 455280297Sjkim if (t1 != t2) 456280297Sjkim c = (t1 < t2); 457280297Sjkim a += 4; 458280297Sjkim b += 4; 459280297Sjkim r += 4; 460280297Sjkim n -= 4; 461280297Sjkim } 462238405Sjkim#endif 463280297Sjkim while (n) { 464280297Sjkim t1 = a[0]; 465280297Sjkim t2 = b[0]; 466280297Sjkim r[0] = (t1 - t2 - c) & BN_MASK2; 467280297Sjkim if (t1 != t2) 468280297Sjkim c = (t1 < t2); 469280297Sjkim a++; 470280297Sjkim b++; 471280297Sjkim r++; 472280297Sjkim n--; 473280297Sjkim } 474280297Sjkim return (c); 475280297Sjkim} 47655714Skris 477238405Sjkim#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) 47855714Skris 479280297Sjkim# undef bn_mul_comba8 480280297Sjkim# undef bn_mul_comba4 481280297Sjkim# undef bn_sqr_comba8 482280297Sjkim# undef bn_sqr_comba4 48355714Skris 48459191Skris/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 48559191Skris/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 48659191Skris/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 487280297Sjkim/* 488280297Sjkim * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number 489280297Sjkim * c=(c2,c1,c0) 490280297Sjkim */ 49159191Skris 492290207Sjkim# ifdef BN_LLONG 493276861Sjkim/* 494290207Sjkim * Keep in mind that additions to multiplication result can not 495290207Sjkim * overflow, because its high half cannot be all-ones. 
496276861Sjkim */ 497290207Sjkim# define mul_add_c(a,b,c0,c1,c2) do { \ 498290207Sjkim BN_ULONG hi; \ 499290207Sjkim BN_ULLONG t = (BN_ULLONG)(a)*(b); \ 500290207Sjkim t += c0; /* no carry */ \ 501290207Sjkim c0 = (BN_ULONG)Lw(t); \ 502290207Sjkim hi = (BN_ULONG)Hw(t); \ 503290207Sjkim c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 504290207Sjkim } while(0) 50555714Skris 506290207Sjkim# define mul_add_c2(a,b,c0,c1,c2) do { \ 507290207Sjkim BN_ULONG hi; \ 508290207Sjkim BN_ULLONG t = (BN_ULLONG)(a)*(b); \ 509290207Sjkim BN_ULLONG tt = t+c0; /* no carry */ \ 510290207Sjkim c0 = (BN_ULONG)Lw(tt); \ 511290207Sjkim hi = (BN_ULONG)Hw(tt); \ 512290207Sjkim c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 513290207Sjkim t += c0; /* no carry */ \ 514290207Sjkim c0 = (BN_ULONG)Lw(t); \ 515290207Sjkim hi = (BN_ULONG)Hw(t); \ 516290207Sjkim c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 517290207Sjkim } while(0) 51855714Skris 519290207Sjkim# define sqr_add_c(a,i,c0,c1,c2) do { \ 520290207Sjkim BN_ULONG hi; \ 521290207Sjkim BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \ 522290207Sjkim t += c0; /* no carry */ \ 523290207Sjkim c0 = (BN_ULONG)Lw(t); \ 524290207Sjkim hi = (BN_ULONG)Hw(t); \ 525290207Sjkim c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 526290207Sjkim } while(0) 52755714Skris 528280297Sjkim# define sqr_add_c2(a,i,j,c0,c1,c2) \ 529280297Sjkim mul_add_c2((a)[i],(a)[j],c0,c1,c2) 53059191Skris 531280297Sjkim# elif defined(BN_UMULT_LOHI) 532290207Sjkim/* 533290207Sjkim * Keep in mind that additions to hi can not overflow, because 534290207Sjkim * the high word of a multiplication result cannot be all-ones. 
535290207Sjkim */ 536290207Sjkim# define mul_add_c(a,b,c0,c1,c2) do { \ 537290207Sjkim BN_ULONG ta = (a), tb = (b); \ 538290207Sjkim BN_ULONG lo, hi; \ 539290207Sjkim BN_UMULT_LOHI(lo,hi,ta,tb); \ 540290207Sjkim c0 += lo; hi += (c0<lo)?1:0; \ 541290207Sjkim c1 += hi; c2 += (c1<hi)?1:0; \ 542290207Sjkim } while(0) 543160814Ssimon 544290207Sjkim# define mul_add_c2(a,b,c0,c1,c2) do { \ 545290207Sjkim BN_ULONG ta = (a), tb = (b); \ 546290207Sjkim BN_ULONG lo, hi, tt; \ 547290207Sjkim BN_UMULT_LOHI(lo,hi,ta,tb); \ 548290207Sjkim c0 += lo; tt = hi+((c0<lo)?1:0); \ 549290207Sjkim c1 += tt; c2 += (c1<tt)?1:0; \ 550290207Sjkim c0 += lo; hi += (c0<lo)?1:0; \ 551290207Sjkim c1 += hi; c2 += (c1<hi)?1:0; \ 552290207Sjkim } while(0) 553160814Ssimon 554290207Sjkim# define sqr_add_c(a,i,c0,c1,c2) do { \ 555290207Sjkim BN_ULONG ta = (a)[i]; \ 556290207Sjkim BN_ULONG lo, hi; \ 557290207Sjkim BN_UMULT_LOHI(lo,hi,ta,ta); \ 558290207Sjkim c0 += lo; hi += (c0<lo)?1:0; \ 559290207Sjkim c1 += hi; c2 += (c1<hi)?1:0; \ 560290207Sjkim } while(0) 561160814Ssimon 562280297Sjkim# define sqr_add_c2(a,i,j,c0,c1,c2) \ 563280297Sjkim mul_add_c2((a)[i],(a)[j],c0,c1,c2) 564160814Ssimon 565280297Sjkim# elif defined(BN_UMULT_HIGH) 566290207Sjkim/* 567290207Sjkim * Keep in mind that additions to hi can not overflow, because 568290207Sjkim * the high word of a multiplication result cannot be all-ones. 
569290207Sjkim */ 570290207Sjkim# define mul_add_c(a,b,c0,c1,c2) do { \ 571290207Sjkim BN_ULONG ta = (a), tb = (b); \ 572290207Sjkim BN_ULONG lo = ta * tb; \ 573290207Sjkim BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ 574290207Sjkim c0 += lo; hi += (c0<lo)?1:0; \ 575290207Sjkim c1 += hi; c2 += (c1<hi)?1:0; \ 576290207Sjkim } while(0) 57759191Skris 578290207Sjkim# define mul_add_c2(a,b,c0,c1,c2) do { \ 579290207Sjkim BN_ULONG ta = (a), tb = (b), tt; \ 580290207Sjkim BN_ULONG lo = ta * tb; \ 581290207Sjkim BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ 582290207Sjkim c0 += lo; tt = hi + ((c0<lo)?1:0); \ 583290207Sjkim c1 += tt; c2 += (c1<tt)?1:0; \ 584290207Sjkim c0 += lo; hi += (c0<lo)?1:0; \ 585290207Sjkim c1 += hi; c2 += (c1<hi)?1:0; \ 586290207Sjkim } while(0) 58759191Skris 588290207Sjkim# define sqr_add_c(a,i,c0,c1,c2) do { \ 589290207Sjkim BN_ULONG ta = (a)[i]; \ 590290207Sjkim BN_ULONG lo = ta * ta; \ 591290207Sjkim BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \ 592290207Sjkim c0 += lo; hi += (c0<lo)?1:0; \ 593290207Sjkim c1 += hi; c2 += (c1<hi)?1:0; \ 594290207Sjkim } while(0) 59559191Skris 596280297Sjkim# define sqr_add_c2(a,i,j,c0,c1,c2) \ 597280297Sjkim mul_add_c2((a)[i],(a)[j],c0,c1,c2) 59859191Skris 599280297Sjkim# else /* !BN_LLONG */ 600290207Sjkim/* 601290207Sjkim * Keep in mind that additions to hi can not overflow, because 602290207Sjkim * the high word of a multiplication result cannot be all-ones. 
 */
/*
 * Fallback variants built on mul64()/sqr64(): both operands are split with
 * LBITS/HBITS, mul64() leaves the two-word product in (hi,lo), and that
 * product is added into the three-word accumulator (c2,c1,c0).  c0, c1 and
 * c2 must be modifiable BN_ULONG lvalues.
 */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
        BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
        mul64(lo,hi,bl,bh);                     \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* Adds the product twice; tt takes the first carry so hi is reusable. */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG tt;                            \
        BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
        BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
        mul64(lo,hi,bl,bh);                     \
        tt = hi;                                \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
        c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG lo, hi;                        \
        sqr64(lo,hi,(a)[i]);                    \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# endif                         /* !BN_LLONG */

/*
 * r[0..15] = a[0..7] * b[0..7], computed one output column at a time.
 *
 * Column k accumulates every product a[i]*b[j] with i + j == k.  c1, c2, c3
 * form a rotating three-word accumulator: the lowest word is stored to r[k]
 * and then zeroed, after which it serves as the top word for the next
 * column.  That is why the argument order of mul_add_c() rotates between
 * columns.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 (the widest: eight products) */
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14; the remaining carry word becomes r[15] */
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/*
 * r[0..7] = a[0..3] * b[0..3]; same column-wise scheme as bn_mul_comba8().
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6; the remaining carry word becomes r[7] */
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

/*
 * r[0..15] = a[0..7]^2, column by column.
 *
 * Each off-diagonal pair (i,j), i > j, is added once through sqr_add_c2()
 * (2*a[i]*a[j]); the diagonal term a[i]^2 is added through sqr_add_c() in
 * the even columns.  The accumulator rotation matches bn_mul_comba8().
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 */
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14; the remaining carry word becomes r[15] */
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/*
 * r[0..7] = a[0..3]^2; same scheme as bn_sqr_comba8().
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6; the remaining carry word becomes r[7] */
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

# ifdef OPENSSL_NO_ASM
#  ifdef OPENSSL_BN_ASM_MONT
#   include <alloca.h>
/*
 * This is essentially reference implementation, which may or may not
 * result in performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 * reference implementation, one to be used as starting point for
 * platform-specific assembler. Mentioned numbers apply to compiler
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 * can vary not only from platform to platform, but even for compiler
 * versions. Assembler vs. assembler improvement coefficients can
 * [and are known to] differ and are to be documented elsewhere.
900238405Sjkim */ 901280297Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 902280297Sjkim const BN_ULONG *np, const BN_ULONG *n0p, int num) 903280297Sjkim{ 904280297Sjkim BN_ULONG c0, c1, ml, *tp, n0; 905280297Sjkim# ifdef mul64 906280297Sjkim BN_ULONG mh; 907280297Sjkim# endif 908280297Sjkim volatile BN_ULONG *vp; 909280297Sjkim int i = 0, j; 910238405Sjkim 911280297Sjkim# if 0 /* template for platform-specific 912280297Sjkim * implementation */ 913280297Sjkim if (ap == bp) 914280297Sjkim return bn_sqr_mont(rp, ap, np, n0p, num); 915280297Sjkim# endif 916280297Sjkim vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); 917238405Sjkim 918280297Sjkim n0 = *n0p; 919238405Sjkim 920280297Sjkim c0 = 0; 921280297Sjkim ml = bp[0]; 922280297Sjkim# ifdef mul64 923280297Sjkim mh = HBITS(ml); 924280297Sjkim ml = LBITS(ml); 925280297Sjkim for (j = 0; j < num; ++j) 926280297Sjkim mul(tp[j], ap[j], ml, mh, c0); 927280297Sjkim# else 928280297Sjkim for (j = 0; j < num; ++j) 929280297Sjkim mul(tp[j], ap[j], ml, c0); 930280297Sjkim# endif 931238405Sjkim 932280297Sjkim tp[num] = c0; 933280297Sjkim tp[num + 1] = 0; 934280297Sjkim goto enter; 935238405Sjkim 936280297Sjkim for (i = 0; i < num; i++) { 937280297Sjkim c0 = 0; 938280297Sjkim ml = bp[i]; 939280297Sjkim# ifdef mul64 940280297Sjkim mh = HBITS(ml); 941280297Sjkim ml = LBITS(ml); 942280297Sjkim for (j = 0; j < num; ++j) 943280297Sjkim mul_add(tp[j], ap[j], ml, mh, c0); 944280297Sjkim# else 945280297Sjkim for (j = 0; j < num; ++j) 946280297Sjkim mul_add(tp[j], ap[j], ml, c0); 947280297Sjkim# endif 948280297Sjkim c1 = (tp[num] + c0) & BN_MASK2; 949280297Sjkim tp[num] = c1; 950280297Sjkim tp[num + 1] = (c1 < c0 ? 
1 : 0); 951280297Sjkim enter: 952280297Sjkim c1 = tp[0]; 953280297Sjkim ml = (c1 * n0) & BN_MASK2; 954280297Sjkim c0 = 0; 955280297Sjkim# ifdef mul64 956280297Sjkim mh = HBITS(ml); 957280297Sjkim ml = LBITS(ml); 958280297Sjkim mul_add(c1, np[0], ml, mh, c0); 959280297Sjkim# else 960280297Sjkim mul_add(c1, ml, np[0], c0); 961280297Sjkim# endif 962280297Sjkim for (j = 1; j < num; j++) { 963280297Sjkim c1 = tp[j]; 964280297Sjkim# ifdef mul64 965280297Sjkim mul_add(c1, np[j], ml, mh, c0); 966280297Sjkim# else 967280297Sjkim mul_add(c1, ml, np[j], c0); 968280297Sjkim# endif 969280297Sjkim tp[j - 1] = c1 & BN_MASK2; 970280297Sjkim } 971280297Sjkim c1 = (tp[num] + c0) & BN_MASK2; 972280297Sjkim tp[num - 1] = c1; 973280297Sjkim tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0); 974280297Sjkim } 975238405Sjkim 976280297Sjkim if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 977280297Sjkim c0 = bn_sub_words(rp, tp, np, num); 978280297Sjkim if (tp[num] != 0 || c0 == 0) { 979280297Sjkim for (i = 0; i < num + 2; i++) 980280297Sjkim vp[i] = 0; 981280297Sjkim return 1; 982280297Sjkim } 983280297Sjkim } 984280297Sjkim for (i = 0; i < num; i++) 985280297Sjkim rp[i] = tp[i], vp[i] = 0; 986280297Sjkim vp[num] = 0; 987280297Sjkim vp[num + 1] = 0; 988280297Sjkim return 1; 989280297Sjkim} 990280297Sjkim# else 991238405Sjkim/* 992238405Sjkim * Return value of 0 indicates that multiplication/convolution was not 993238405Sjkim * performed to signal the caller to fall down to alternative/original 994238405Sjkim * code-path. 995238405Sjkim */ 996280297Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 997280297Sjkim const BN_ULONG *np, const BN_ULONG *n0, int num) 998280297Sjkim{ 999280297Sjkim return 0; 1000280297Sjkim} 1001280297Sjkim# endif /* OPENSSL_BN_ASM_MONT */ 1002280297Sjkim# endif 1003238405Sjkim 1004280297Sjkim#else /* !BN_MUL_COMBA */ 100555714Skris 100655714Skris/* hmm... is it faster just to do a multiply? 
*/ 1007280297Sjkim# undef bn_sqr_comba4 1008238405Sjkimvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 1009280297Sjkim{ 1010280297Sjkim BN_ULONG t[8]; 1011280297Sjkim bn_sqr_normal(r, a, 4, t); 1012280297Sjkim} 101355714Skris 1014280297Sjkim# undef bn_sqr_comba8 1015238405Sjkimvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 1016280297Sjkim{ 1017280297Sjkim BN_ULONG t[16]; 1018280297Sjkim bn_sqr_normal(r, a, 8, t); 1019280297Sjkim} 102055714Skris 102155714Skrisvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1022280297Sjkim{ 1023280297Sjkim r[4] = bn_mul_words(&(r[0]), a, 4, b[0]); 1024280297Sjkim r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]); 1025280297Sjkim r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]); 1026280297Sjkim r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]); 1027280297Sjkim} 102855714Skris 102955714Skrisvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 1030280297Sjkim{ 1031280297Sjkim r[8] = bn_mul_words(&(r[0]), a, 8, b[0]); 1032280297Sjkim r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]); 1033280297Sjkim r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]); 1034280297Sjkim r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]); 1035280297Sjkim r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]); 1036280297Sjkim r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]); 1037280297Sjkim r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]); 1038280297Sjkim r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]); 1039280297Sjkim} 104055714Skris 1041280297Sjkim# ifdef OPENSSL_NO_ASM 1042280297Sjkim# ifdef OPENSSL_BN_ASM_MONT 1043280297Sjkim# include <alloca.h> 1044280297Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 1045280297Sjkim const BN_ULONG *np, const BN_ULONG *n0p, int num) 1046280297Sjkim{ 1047280297Sjkim BN_ULONG c0, c1, *tp, n0 = *n0p; 1048280297Sjkim volatile BN_ULONG *vp; 1049280297Sjkim int i = 0, j; 1050238405Sjkim 1051280297Sjkim vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); 1052238405Sjkim 1053280297Sjkim for (i = 0; i <= num; i++) 1054280297Sjkim 
tp[i] = 0; 1055238405Sjkim 1056280297Sjkim for (i = 0; i < num; i++) { 1057280297Sjkim c0 = bn_mul_add_words(tp, ap, num, bp[i]); 1058280297Sjkim c1 = (tp[num] + c0) & BN_MASK2; 1059280297Sjkim tp[num] = c1; 1060280297Sjkim tp[num + 1] = (c1 < c0 ? 1 : 0); 1061238405Sjkim 1062280297Sjkim c0 = bn_mul_add_words(tp, np, num, tp[0] * n0); 1063280297Sjkim c1 = (tp[num] + c0) & BN_MASK2; 1064280297Sjkim tp[num] = c1; 1065280297Sjkim tp[num + 1] += (c1 < c0 ? 1 : 0); 1066280297Sjkim for (j = 0; j <= num; j++) 1067280297Sjkim tp[j] = tp[j + 1]; 1068280297Sjkim } 1069238405Sjkim 1070280297Sjkim if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 1071280297Sjkim c0 = bn_sub_words(rp, tp, np, num); 1072280297Sjkim if (tp[num] != 0 || c0 == 0) { 1073280297Sjkim for (i = 0; i < num + 2; i++) 1074280297Sjkim vp[i] = 0; 1075280297Sjkim return 1; 1076280297Sjkim } 1077280297Sjkim } 1078280297Sjkim for (i = 0; i < num; i++) 1079280297Sjkim rp[i] = tp[i], vp[i] = 0; 1080280297Sjkim vp[num] = 0; 1081280297Sjkim vp[num + 1] = 0; 1082280297Sjkim return 1; 1083280297Sjkim} 1084280297Sjkim# else 1085280297Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 1086280297Sjkim const BN_ULONG *np, const BN_ULONG *n0, int num) 1087280297Sjkim{ 1088280297Sjkim return 0; 1089280297Sjkim} 1090280297Sjkim# endif /* OPENSSL_BN_ASM_MONT */ 1091280297Sjkim# endif 1092238405Sjkim 1093280297Sjkim#endif /* !BN_MUL_COMBA */ 1094