155714Skris/* crypto/bn/bn_asm.c */ 255714Skris/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 355714Skris * All rights reserved. 455714Skris * 555714Skris * This package is an SSL implementation written 655714Skris * by Eric Young (eay@cryptsoft.com). 755714Skris * The implementation was written so as to conform with Netscapes SSL. 855714Skris * 955714Skris * This library is free for commercial and non-commercial use as long as 1055714Skris * the following conditions are aheared to. The following conditions 1155714Skris * apply to all code found in this distribution, be it the RC4, RSA, 1255714Skris * lhash, DES, etc., code; not just the SSL code. The SSL documentation 1355714Skris * included with this distribution is covered by the same copyright terms 1455714Skris * except that the holder is Tim Hudson (tjh@cryptsoft.com). 1555714Skris * 1655714Skris * Copyright remains Eric Young's, and as such any Copyright notices in 1755714Skris * the code are not to be removed. 1855714Skris * If this package is used in a product, Eric Young should be given attribution 1955714Skris * as the author of the parts of the library used. 2055714Skris * This can be in the form of a textual message at program startup or 2155714Skris * in documentation (online or textual) provided with the package. 2255714Skris * 2355714Skris * Redistribution and use in source and binary forms, with or without 2455714Skris * modification, are permitted provided that the following conditions 2555714Skris * are met: 2655714Skris * 1. Redistributions of source code must retain the copyright 2755714Skris * notice, this list of conditions and the following disclaimer. 2855714Skris * 2. Redistributions in binary form must reproduce the above copyright 2955714Skris * notice, this list of conditions and the following disclaimer in the 3055714Skris * documentation and/or other materials provided with the distribution. 3155714Skris * 3. All advertising materials mentioning features or use of this software 3255714Skris * must display the following acknowledgement: 3355714Skris * "This product includes cryptographic software written by 3455714Skris * Eric Young (eay@cryptsoft.com)" 3555714Skris * The word 'cryptographic' can be left out if the rouines from the library 3655714Skris * being used are not cryptographic related :-). 3755714Skris * 4. If you include any Windows specific code (or a derivative thereof) from 3855714Skris * the apps directory (application code) you must include an acknowledgement: 3955714Skris * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" 4055714Skris * 4155714Skris * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND 4255714Skris * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 4355714Skris * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 4455714Skris * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 4555714Skris * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 4655714Skris * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 4755714Skris * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 4855714Skris * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 4955714Skris * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 5055714Skris * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 5155714Skris * SUCH DAMAGE. 5255714Skris * 5355714Skris * The licence and distribution terms for any publically available version or 5455714Skris * derivative of this code cannot be changed. i.e. this code cannot simply be 5555714Skris * copied and put under another distribution licence 5655714Skris * [including the GNU Public Licence.] 5755714Skris */ 5855714Skris 5959191Skris#ifndef BN_DEBUG 6059191Skris# undef NDEBUG /* avoid conflicting definitions */ 6159191Skris# define NDEBUG 6259191Skris#endif 6359191Skris 6455714Skris#include <stdio.h> 6559191Skris#include <assert.h> 6655714Skris#include "cryptlib.h" 6755714Skris#include "bn_lcl.h" 6855714Skris 6959191Skris#if defined(BN_LLONG) || defined(BN_UMULT_HIGH) 7055714Skris 71109998SmarkmBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 7255714Skris { 7355714Skris BN_ULONG c1=0; 7455714Skris 7559191Skris assert(num >= 0); 7655714Skris if (num <= 0) return(c1); 7755714Skris 78238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT 7959191Skris while (num&~3) 8055714Skris { 8155714Skris mul_add(rp[0],ap[0],w,c1); 8255714Skris mul_add(rp[1],ap[1],w,c1); 8355714Skris mul_add(rp[2],ap[2],w,c1); 8455714Skris mul_add(rp[3],ap[3],w,c1); 8559191Skris ap+=4; rp+=4; num-=4; 8655714Skris } 87238405Sjkim#endif 88238405Sjkim while (num) 8959191Skris { 90238405Sjkim mul_add(rp[0],ap[0],w,c1); 91238405Sjkim ap++; rp++; num--; 9259191Skris } 9355714Skris 9455714Skris return(c1); 9555714Skris } 9655714Skris 97109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 9855714Skris { 9955714Skris BN_ULONG c1=0; 10055714Skris 10159191Skris assert(num >= 0); 10255714Skris if (num <= 0) return(c1); 10355714Skris 104238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT 10559191Skris while (num&~3) 10655714Skris { 10755714Skris mul(rp[0],ap[0],w,c1); 10855714Skris mul(rp[1],ap[1],w,c1); 10955714Skris mul(rp[2],ap[2],w,c1); 11055714Skris mul(rp[3],ap[3],w,c1); 11159191Skris ap+=4; rp+=4; num-=4; 11255714Skris } 113238405Sjkim#endif 114238405Sjkim while (num) 11559191Skris { 116238405Sjkim mul(rp[0],ap[0],w,c1); 117238405Sjkim ap++; rp++; num--; 11859191Skris } 11955714Skris return(c1); 12055714Skris } 12155714Skris 122109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 12355714Skris { 12459191Skris assert(n >= 0); 12555714Skris if (n <= 0) return; 126238405Sjkim 127238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT 12859191Skris while (n&~3) 12955714Skris { 13059191Skris sqr(r[0],r[1],a[0]); 13159191Skris sqr(r[2],r[3],a[1]); 13259191Skris sqr(r[4],r[5],a[2]); 13359191Skris sqr(r[6],r[7],a[3]); 13459191Skris a+=4; r+=8; n-=4; 13555714Skris } 136238405Sjkim#endif 137238405Sjkim while (n) 13859191Skris { 139238405Sjkim sqr(r[0],r[1],a[0]); 140238405Sjkim a++; r+=2; n--; 14159191Skris } 14255714Skris } 14355714Skris 14459191Skris#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ 14555714Skris 146109998SmarkmBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 14755714Skris { 14855714Skris BN_ULONG c=0; 14955714Skris BN_ULONG bl,bh; 15055714Skris 15159191Skris assert(num >= 0); 15255714Skris if (num <= 0) return((BN_ULONG)0); 15355714Skris 15455714Skris bl=LBITS(w); 15555714Skris bh=HBITS(w); 15655714Skris 157238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT 158238405Sjkim while (num&~3) 15955714Skris { 16055714Skris mul_add(rp[0],ap[0],bl,bh,c); 16155714Skris mul_add(rp[1],ap[1],bl,bh,c); 16255714Skris mul_add(rp[2],ap[2],bl,bh,c); 16355714Skris mul_add(rp[3],ap[3],bl,bh,c); 164238405Sjkim ap+=4; rp+=4; num-=4; 16555714Skris } 166238405Sjkim#endif 167238405Sjkim while (num) 168238405Sjkim { 169238405Sjkim mul_add(rp[0],ap[0],bl,bh,c); 170238405Sjkim ap++; rp++; num--; 171238405Sjkim } 17255714Skris return(c); 17355714Skris } 17455714Skris 175109998SmarkmBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 17655714Skris { 17755714Skris BN_ULONG carry=0; 17855714Skris BN_ULONG bl,bh; 17955714Skris 18059191Skris assert(num >= 0); 18155714Skris if (num <= 0) return((BN_ULONG)0); 18255714Skris 18355714Skris bl=LBITS(w); 18455714Skris bh=HBITS(w); 18555714Skris 186238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT 187238405Sjkim while (num&~3) 18855714Skris { 18955714Skris mul(rp[0],ap[0],bl,bh,carry); 19055714Skris mul(rp[1],ap[1],bl,bh,carry); 19155714Skris mul(rp[2],ap[2],bl,bh,carry); 19255714Skris mul(rp[3],ap[3],bl,bh,carry); 193238405Sjkim ap+=4; rp+=4; num-=4; 19455714Skris } 195238405Sjkim#endif 196238405Sjkim while (num) 197238405Sjkim { 198238405Sjkim mul(rp[0],ap[0],bl,bh,carry); 199238405Sjkim ap++; rp++; num--; 200238405Sjkim } 20155714Skris return(carry); 20255714Skris } 20355714Skris 204109998Smarkmvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 20555714Skris { 20659191Skris assert(n >= 0); 20755714Skris if (n <= 0) return; 208238405Sjkim 209238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT 210238405Sjkim while (n&~3) 21155714Skris { 21255714Skris sqr64(r[0],r[1],a[0]); 21355714Skris sqr64(r[2],r[3],a[1]); 21455714Skris sqr64(r[4],r[5],a[2]); 21555714Skris sqr64(r[6],r[7],a[3]); 216238405Sjkim a+=4; r+=8; n-=4; 21755714Skris } 218238405Sjkim#endif 219238405Sjkim while (n) 220238405Sjkim { 221238405Sjkim sqr64(r[0],r[1],a[0]); 222238405Sjkim a++; r+=2; n--; 223238405Sjkim } 22455714Skris } 22555714Skris 22659191Skris#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */ 22755714Skris 22855714Skris#if defined(BN_LLONG) && defined(BN_DIV2W) 22955714Skris 23055714SkrisBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 23155714Skris { 23255714Skris return((BN_ULONG)(((((BN_ULLONG)h)<<BN_BITS2)|l)/(BN_ULLONG)d)); 23355714Skris } 23455714Skris 23555714Skris#else 23655714Skris 23768651Skris/* Divide h,l by d and return the result. */ 23855714Skris/* I need to test this some more :-( */ 23955714SkrisBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 24055714Skris { 24155714Skris BN_ULONG dh,dl,q,ret=0,th,tl,t; 24255714Skris int i,count=2; 24355714Skris 24455714Skris if (d == 0) return(BN_MASK2); 24555714Skris 24655714Skris i=BN_num_bits_word(d); 247160814Ssimon assert((i == BN_BITS2) || (h <= (BN_ULONG)1<<i)); 24868651Skris 24955714Skris i=BN_BITS2-i; 25055714Skris if (h >= d) h-=d; 25155714Skris 25255714Skris if (i) 25355714Skris { 25455714Skris d<<=i; 25555714Skris h=(h<<i)|(l>>(BN_BITS2-i)); 25655714Skris l<<=i; 25755714Skris } 25855714Skris dh=(d&BN_MASK2h)>>BN_BITS4; 25955714Skris dl=(d&BN_MASK2l); 26055714Skris for (;;) 26155714Skris { 26255714Skris if ((h>>BN_BITS4) == dh) 26355714Skris q=BN_MASK2l; 26455714Skris else 26555714Skris q=h/dh; 26655714Skris 26755714Skris th=q*dh; 26855714Skris tl=dl*q; 26955714Skris for (;;) 27055714Skris { 27155714Skris t=h-th; 27255714Skris if ((t&BN_MASK2h) || 27355714Skris ((tl) <= ( 27455714Skris (t<<BN_BITS4)| 27555714Skris ((l&BN_MASK2h)>>BN_BITS4)))) 27655714Skris break; 27755714Skris q--; 27855714Skris th-=dh; 27955714Skris tl-=dl; 28055714Skris } 28155714Skris t=(tl>>BN_BITS4); 28255714Skris tl=(tl<<BN_BITS4)&BN_MASK2h; 28355714Skris th+=t; 28455714Skris 28555714Skris if (l < tl) th++; 28655714Skris l-=tl; 28755714Skris if (h < th) 28855714Skris { 28955714Skris h+=d; 29055714Skris q--; 29155714Skris } 29255714Skris h-=th; 29355714Skris 29455714Skris if (--count == 0) break; 29555714Skris 29655714Skris ret=q<<BN_BITS4; 29755714Skris h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2; 29855714Skris l=(l&BN_MASK2l)<<BN_BITS4; 29955714Skris } 30055714Skris ret|=q; 30155714Skris return(ret); 30255714Skris } 30359191Skris#endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ 30455714Skris 30555714Skris#ifdef BN_LLONG 306109998SmarkmBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) 30755714Skris { 30855714Skris BN_ULLONG ll=0; 30955714Skris 31059191Skris assert(n >= 0); 31155714Skris if (n <= 0) return((BN_ULONG)0); 31255714Skris 313238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT 314238405Sjkim while (n&~3) 31555714Skris { 31655714Skris ll+=(BN_ULLONG)a[0]+b[0]; 31755714Skris r[0]=(BN_ULONG)ll&BN_MASK2; 31855714Skris ll>>=BN_BITS2; 31955714Skris ll+=(BN_ULLONG)a[1]+b[1]; 32055714Skris r[1]=(BN_ULONG)ll&BN_MASK2; 32155714Skris ll>>=BN_BITS2; 32255714Skris ll+=(BN_ULLONG)a[2]+b[2]; 32355714Skris r[2]=(BN_ULONG)ll&BN_MASK2; 32455714Skris ll>>=BN_BITS2; 32555714Skris ll+=(BN_ULLONG)a[3]+b[3]; 32655714Skris r[3]=(BN_ULONG)ll&BN_MASK2; 32755714Skris ll>>=BN_BITS2; 328238405Sjkim a+=4; b+=4; r+=4; n-=4; 32955714Skris } 330238405Sjkim#endif 331238405Sjkim while (n) 332238405Sjkim { 333238405Sjkim ll+=(BN_ULLONG)a[0]+b[0]; 334238405Sjkim r[0]=(BN_ULONG)ll&BN_MASK2; 335238405Sjkim ll>>=BN_BITS2; 336238405Sjkim a++; b++; r++; n--; 337238405Sjkim } 33855714Skris return((BN_ULONG)ll); 33955714Skris } 34059191Skris#else /* !BN_LLONG */ 341109998SmarkmBN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) 34255714Skris { 34355714Skris BN_ULONG c,l,t; 34455714Skris 34559191Skris assert(n >= 0); 34655714Skris if (n <= 0) return((BN_ULONG)0); 34755714Skris 34855714Skris c=0; 349238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT 350238405Sjkim while (n&~3) 35155714Skris { 35255714Skris t=a[0]; 35355714Skris t=(t+c)&BN_MASK2; 35455714Skris c=(t < c); 35555714Skris l=(t+b[0])&BN_MASK2; 35655714Skris c+=(l < t); 35755714Skris r[0]=l; 35855714Skris t=a[1]; 35955714Skris t=(t+c)&BN_MASK2; 36055714Skris c=(t < c); 36155714Skris l=(t+b[1])&BN_MASK2; 36255714Skris c+=(l < t); 36355714Skris r[1]=l; 36455714Skris t=a[2]; 36555714Skris t=(t+c)&BN_MASK2; 36655714Skris c=(t < c); 36755714Skris l=(t+b[2])&BN_MASK2; 36855714Skris c+=(l < t); 36955714Skris r[2]=l; 37055714Skris t=a[3]; 37155714Skris t=(t+c)&BN_MASK2; 37255714Skris c=(t < c); 37355714Skris l=(t+b[3])&BN_MASK2; 37455714Skris c+=(l < t); 37555714Skris r[3]=l; 376238405Sjkim a+=4; b+=4; r+=4; n-=4; 37755714Skris } 378238405Sjkim#endif 379238405Sjkim while(n) 380238405Sjkim { 381238405Sjkim t=a[0]; 382238405Sjkim t=(t+c)&BN_MASK2; 383238405Sjkim c=(t < c); 384238405Sjkim l=(t+b[0])&BN_MASK2; 385238405Sjkim c+=(l < t); 386238405Sjkim r[0]=l; 387238405Sjkim a++; b++; r++; n--; 388238405Sjkim } 38955714Skris return((BN_ULONG)c); 39055714Skris } 39159191Skris#endif /* !BN_LLONG */ 39255714Skris 393109998SmarkmBN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n) 39455714Skris { 39555714Skris BN_ULONG t1,t2; 39655714Skris int c=0; 39755714Skris 39859191Skris assert(n >= 0); 39955714Skris if (n <= 0) return((BN_ULONG)0); 40055714Skris 401238405Sjkim#ifndef OPENSSL_SMALL_FOOTPRINT 402238405Sjkim while (n&~3) 40355714Skris { 40455714Skris t1=a[0]; t2=b[0]; 40555714Skris r[0]=(t1-t2-c)&BN_MASK2; 40655714Skris if (t1 != t2) c=(t1 < t2); 40755714Skris t1=a[1]; t2=b[1]; 40855714Skris r[1]=(t1-t2-c)&BN_MASK2; 40955714Skris if (t1 != t2) c=(t1 < t2); 41055714Skris t1=a[2]; t2=b[2]; 41155714Skris r[2]=(t1-t2-c)&BN_MASK2; 41255714Skris if (t1 != t2) c=(t1 < t2); 41355714Skris t1=a[3]; t2=b[3]; 41455714Skris r[3]=(t1-t2-c)&BN_MASK2; 41555714Skris if (t1 != t2) c=(t1 < t2); 416238405Sjkim a+=4; b+=4; r+=4; n-=4; 41755714Skris } 418238405Sjkim#endif 419238405Sjkim while (n) 420238405Sjkim { 421238405Sjkim t1=a[0]; t2=b[0]; 422238405Sjkim r[0]=(t1-t2-c)&BN_MASK2; 423238405Sjkim if (t1 != t2) c=(t1 < t2); 424238405Sjkim a++; b++; r++; n--; 425238405Sjkim } 42655714Skris return(c); 42755714Skris } 42855714Skris 429238405Sjkim#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) 43055714Skris 43155714Skris#undef bn_mul_comba8 43255714Skris#undef bn_mul_comba4 43355714Skris#undef bn_sqr_comba8 43455714Skris#undef bn_sqr_comba4 43555714Skris 43659191Skris/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 43759191Skris/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 43859191Skris/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 43959191Skris/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */ 44059191Skris 441277195Sdelphij/* 442277195Sdelphij * Keep in mind that carrying into high part of multiplication result 443277195Sdelphij * can not overflow, because it cannot be all-ones. 444277195Sdelphij */ 44555714Skris#ifdef BN_LLONG 44655714Skris#define mul_add_c(a,b,c0,c1,c2) \ 44755714Skris t=(BN_ULLONG)a*b; \ 44855714Skris t1=(BN_ULONG)Lw(t); \ 44955714Skris t2=(BN_ULONG)Hw(t); \ 45055714Skris c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 45155714Skris c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 45255714Skris 45355714Skris#define mul_add_c2(a,b,c0,c1,c2) \ 45455714Skris t=(BN_ULLONG)a*b; \ 45555714Skris tt=(t+t)&BN_MASK; \ 45655714Skris if (tt < t) c2++; \ 45755714Skris t1=(BN_ULONG)Lw(tt); \ 45855714Skris t2=(BN_ULONG)Hw(tt); \ 45955714Skris c0=(c0+t1)&BN_MASK2; \ 46055714Skris if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ 46155714Skris c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 46255714Skris 46355714Skris#define sqr_add_c(a,i,c0,c1,c2) \ 46455714Skris t=(BN_ULLONG)a[i]*a[i]; \ 46555714Skris t1=(BN_ULONG)Lw(t); \ 46655714Skris t2=(BN_ULONG)Hw(t); \ 46755714Skris c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 46855714Skris c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 46955714Skris 47055714Skris#define sqr_add_c2(a,i,j,c0,c1,c2) \ 47155714Skris mul_add_c2((a)[i],(a)[j],c0,c1,c2) 47259191Skris 473160814Ssimon#elif defined(BN_UMULT_LOHI) 474160814Ssimon 475160814Ssimon#define mul_add_c(a,b,c0,c1,c2) { \ 476160814Ssimon BN_ULONG ta=(a),tb=(b); \ 477160814Ssimon BN_UMULT_LOHI(t1,t2,ta,tb); \ 478160814Ssimon c0 += t1; t2 += (c0<t1)?1:0; \ 479160814Ssimon c1 += t2; c2 += (c1<t2)?1:0; \ 480160814Ssimon } 481160814Ssimon 482160814Ssimon#define mul_add_c2(a,b,c0,c1,c2) { \ 483160814Ssimon BN_ULONG ta=(a),tb=(b),t0; \ 484160814Ssimon BN_UMULT_LOHI(t0,t1,ta,tb); \ 485277195Sdelphij c0 += t0; t2 = t1+((c0<t0)?1:0);\ 486160814Ssimon c1 += t2; c2 += (c1<t2)?1:0; \ 487277195Sdelphij c0 += t0; t1 += (c0<t0)?1:0; \ 488277195Sdelphij c1 += t1; c2 += (c1<t1)?1:0; \ 489160814Ssimon } 490160814Ssimon 491160814Ssimon#define sqr_add_c(a,i,c0,c1,c2) { \ 492160814Ssimon BN_ULONG ta=(a)[i]; \ 493160814Ssimon BN_UMULT_LOHI(t1,t2,ta,ta); \ 494160814Ssimon c0 += t1; t2 += (c0<t1)?1:0; \ 495160814Ssimon c1 += t2; c2 += (c1<t2)?1:0; \ 496160814Ssimon } 497160814Ssimon 498160814Ssimon#define sqr_add_c2(a,i,j,c0,c1,c2) \ 499160814Ssimon mul_add_c2((a)[i],(a)[j],c0,c1,c2) 500160814Ssimon 50159191Skris#elif defined(BN_UMULT_HIGH) 50259191Skris 50359191Skris#define mul_add_c(a,b,c0,c1,c2) { \ 50459191Skris BN_ULONG ta=(a),tb=(b); \ 50559191Skris t1 = ta * tb; \ 50659191Skris t2 = BN_UMULT_HIGH(ta,tb); \ 50759191Skris c0 += t1; t2 += (c0<t1)?1:0; \ 50859191Skris c1 += t2; c2 += (c1<t2)?1:0; \ 50959191Skris } 51059191Skris 51159191Skris#define mul_add_c2(a,b,c0,c1,c2) { \ 51259191Skris BN_ULONG ta=(a),tb=(b),t0; \ 51359191Skris t1 = BN_UMULT_HIGH(ta,tb); \ 51459191Skris t0 = ta * tb; \ 515277195Sdelphij c0 += t0; t2 = t1+((c0<t0)?1:0);\ 51659191Skris c1 += t2; c2 += (c1<t2)?1:0; \ 517277195Sdelphij c0 += t0; t1 += (c0<t0)?1:0; \ 518277195Sdelphij c1 += t1; c2 += (c1<t1)?1:0; \ 51959191Skris } 52059191Skris 52159191Skris#define sqr_add_c(a,i,c0,c1,c2) { \ 52259191Skris BN_ULONG ta=(a)[i]; \ 52359191Skris t1 = ta * ta; \ 52459191Skris t2 = BN_UMULT_HIGH(ta,ta); \ 52559191Skris c0 += t1; t2 += (c0<t1)?1:0; \ 52659191Skris c1 += t2; c2 += (c1<t2)?1:0; \ 52759191Skris } 52859191Skris 52959191Skris#define sqr_add_c2(a,i,j,c0,c1,c2) \ 53059191Skris mul_add_c2((a)[i],(a)[j],c0,c1,c2) 53159191Skris 53259191Skris#else /* !BN_LLONG */ 53355714Skris#define mul_add_c(a,b,c0,c1,c2) \ 53455714Skris t1=LBITS(a); t2=HBITS(a); \ 53555714Skris bl=LBITS(b); bh=HBITS(b); \ 53655714Skris mul64(t1,t2,bl,bh); \ 53755714Skris c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 53855714Skris c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 53955714Skris 54055714Skris#define mul_add_c2(a,b,c0,c1,c2) \ 54155714Skris t1=LBITS(a); t2=HBITS(a); \ 54255714Skris bl=LBITS(b); bh=HBITS(b); \ 54355714Skris mul64(t1,t2,bl,bh); \ 54455714Skris if (t2 & BN_TBIT) c2++; \ 54555714Skris t2=(t2+t2)&BN_MASK2; \ 54655714Skris if (t1 & BN_TBIT) t2++; \ 54755714Skris t1=(t1+t1)&BN_MASK2; \ 54855714Skris c0=(c0+t1)&BN_MASK2; \ 54955714Skris if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \ 55055714Skris c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 55155714Skris 55255714Skris#define sqr_add_c(a,i,c0,c1,c2) \ 55355714Skris sqr64(t1,t2,(a)[i]); \ 55455714Skris c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \ 55555714Skris c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++; 55655714Skris 55755714Skris#define sqr_add_c2(a,i,j,c0,c1,c2) \ 55855714Skris mul_add_c2((a)[i],(a)[j],c0,c1,c2) 55959191Skris#endif /* !BN_LLONG */ 56055714Skris 56155714Skrisvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 56255714Skris { 56355714Skris#ifdef BN_LLONG 56455714Skris BN_ULLONG t; 56555714Skris#else 56655714Skris BN_ULONG bl,bh; 56755714Skris#endif 56855714Skris BN_ULONG t1,t2; 56955714Skris BN_ULONG c1,c2,c3; 57055714Skris 57155714Skris c1=0; 57255714Skris c2=0; 57355714Skris c3=0; 57455714Skris mul_add_c(a[0],b[0],c1,c2,c3); 57555714Skris r[0]=c1; 57655714Skris c1=0; 57755714Skris mul_add_c(a[0],b[1],c2,c3,c1); 57855714Skris mul_add_c(a[1],b[0],c2,c3,c1); 57955714Skris r[1]=c2; 58055714Skris c2=0; 58155714Skris mul_add_c(a[2],b[0],c3,c1,c2); 58255714Skris mul_add_c(a[1],b[1],c3,c1,c2); 58355714Skris mul_add_c(a[0],b[2],c3,c1,c2); 58455714Skris r[2]=c3; 58555714Skris c3=0; 58655714Skris mul_add_c(a[0],b[3],c1,c2,c3); 58755714Skris mul_add_c(a[1],b[2],c1,c2,c3); 58855714Skris mul_add_c(a[2],b[1],c1,c2,c3); 58955714Skris mul_add_c(a[3],b[0],c1,c2,c3); 59055714Skris r[3]=c1; 59155714Skris c1=0; 59255714Skris mul_add_c(a[4],b[0],c2,c3,c1); 59355714Skris mul_add_c(a[3],b[1],c2,c3,c1); 59455714Skris mul_add_c(a[2],b[2],c2,c3,c1); 59555714Skris mul_add_c(a[1],b[3],c2,c3,c1); 59655714Skris mul_add_c(a[0],b[4],c2,c3,c1); 59755714Skris r[4]=c2; 59855714Skris c2=0; 59955714Skris mul_add_c(a[0],b[5],c3,c1,c2); 60055714Skris mul_add_c(a[1],b[4],c3,c1,c2); 60155714Skris mul_add_c(a[2],b[3],c3,c1,c2); 60255714Skris mul_add_c(a[3],b[2],c3,c1,c2); 60355714Skris mul_add_c(a[4],b[1],c3,c1,c2); 60455714Skris mul_add_c(a[5],b[0],c3,c1,c2); 60555714Skris r[5]=c3; 60655714Skris c3=0; 60755714Skris mul_add_c(a[6],b[0],c1,c2,c3); 60855714Skris mul_add_c(a[5],b[1],c1,c2,c3); 60955714Skris mul_add_c(a[4],b[2],c1,c2,c3); 61055714Skris mul_add_c(a[3],b[3],c1,c2,c3); 61155714Skris mul_add_c(a[2],b[4],c1,c2,c3); 61255714Skris mul_add_c(a[1],b[5],c1,c2,c3); 61355714Skris mul_add_c(a[0],b[6],c1,c2,c3); 61455714Skris r[6]=c1; 61555714Skris c1=0; 61655714Skris mul_add_c(a[0],b[7],c2,c3,c1); 61755714Skris mul_add_c(a[1],b[6],c2,c3,c1); 61855714Skris mul_add_c(a[2],b[5],c2,c3,c1); 61955714Skris mul_add_c(a[3],b[4],c2,c3,c1); 62055714Skris mul_add_c(a[4],b[3],c2,c3,c1); 62155714Skris mul_add_c(a[5],b[2],c2,c3,c1); 62255714Skris mul_add_c(a[6],b[1],c2,c3,c1); 62355714Skris mul_add_c(a[7],b[0],c2,c3,c1); 62455714Skris r[7]=c2; 62555714Skris c2=0; 62655714Skris mul_add_c(a[7],b[1],c3,c1,c2); 62755714Skris mul_add_c(a[6],b[2],c3,c1,c2); 62855714Skris mul_add_c(a[5],b[3],c3,c1,c2); 62955714Skris mul_add_c(a[4],b[4],c3,c1,c2); 63055714Skris mul_add_c(a[3],b[5],c3,c1,c2); 63155714Skris mul_add_c(a[2],b[6],c3,c1,c2); 63255714Skris mul_add_c(a[1],b[7],c3,c1,c2); 63355714Skris r[8]=c3; 63455714Skris c3=0; 63555714Skris mul_add_c(a[2],b[7],c1,c2,c3); 63655714Skris mul_add_c(a[3],b[6],c1,c2,c3); 63755714Skris mul_add_c(a[4],b[5],c1,c2,c3); 63855714Skris mul_add_c(a[5],b[4],c1,c2,c3); 63955714Skris mul_add_c(a[6],b[3],c1,c2,c3); 64055714Skris mul_add_c(a[7],b[2],c1,c2,c3); 64155714Skris r[9]=c1; 64255714Skris c1=0; 64355714Skris mul_add_c(a[7],b[3],c2,c3,c1); 64455714Skris mul_add_c(a[6],b[4],c2,c3,c1); 64555714Skris mul_add_c(a[5],b[5],c2,c3,c1); 64655714Skris mul_add_c(a[4],b[6],c2,c3,c1); 64755714Skris mul_add_c(a[3],b[7],c2,c3,c1); 64855714Skris r[10]=c2; 64955714Skris c2=0; 65055714Skris mul_add_c(a[4],b[7],c3,c1,c2); 65155714Skris mul_add_c(a[5],b[6],c3,c1,c2); 65255714Skris mul_add_c(a[6],b[5],c3,c1,c2); 65355714Skris mul_add_c(a[7],b[4],c3,c1,c2); 65455714Skris r[11]=c3; 65555714Skris c3=0; 65655714Skris mul_add_c(a[7],b[5],c1,c2,c3); 65755714Skris mul_add_c(a[6],b[6],c1,c2,c3); 65855714Skris mul_add_c(a[5],b[7],c1,c2,c3); 65955714Skris r[12]=c1; 66055714Skris c1=0; 66155714Skris mul_add_c(a[6],b[7],c2,c3,c1); 66255714Skris mul_add_c(a[7],b[6],c2,c3,c1); 66355714Skris r[13]=c2; 66455714Skris c2=0; 66555714Skris mul_add_c(a[7],b[7],c3,c1,c2); 66655714Skris r[14]=c3; 66755714Skris r[15]=c1; 66855714Skris } 66955714Skris 67055714Skrisvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 67155714Skris { 67255714Skris#ifdef BN_LLONG 67355714Skris BN_ULLONG t; 67455714Skris#else 67555714Skris BN_ULONG bl,bh; 67655714Skris#endif 67755714Skris BN_ULONG t1,t2; 67855714Skris BN_ULONG c1,c2,c3; 67955714Skris 68055714Skris c1=0; 68155714Skris c2=0; 68255714Skris c3=0; 68355714Skris mul_add_c(a[0],b[0],c1,c2,c3); 68455714Skris r[0]=c1; 68555714Skris c1=0; 68655714Skris mul_add_c(a[0],b[1],c2,c3,c1); 68755714Skris mul_add_c(a[1],b[0],c2,c3,c1); 68855714Skris r[1]=c2; 68955714Skris c2=0; 69055714Skris mul_add_c(a[2],b[0],c3,c1,c2); 69155714Skris mul_add_c(a[1],b[1],c3,c1,c2); 69255714Skris mul_add_c(a[0],b[2],c3,c1,c2); 69355714Skris r[2]=c3; 69455714Skris c3=0; 69555714Skris mul_add_c(a[0],b[3],c1,c2,c3); 69655714Skris mul_add_c(a[1],b[2],c1,c2,c3); 69755714Skris mul_add_c(a[2],b[1],c1,c2,c3); 69855714Skris mul_add_c(a[3],b[0],c1,c2,c3); 69955714Skris r[3]=c1; 70055714Skris c1=0; 70155714Skris mul_add_c(a[3],b[1],c2,c3,c1); 70255714Skris mul_add_c(a[2],b[2],c2,c3,c1); 70355714Skris mul_add_c(a[1],b[3],c2,c3,c1); 70455714Skris r[4]=c2; 70555714Skris c2=0; 70655714Skris mul_add_c(a[2],b[3],c3,c1,c2); 70755714Skris mul_add_c(a[3],b[2],c3,c1,c2); 70855714Skris r[5]=c3; 70955714Skris c3=0; 71055714Skris mul_add_c(a[3],b[3],c1,c2,c3); 71155714Skris r[6]=c1; 71255714Skris r[7]=c2; 71355714Skris } 71455714Skris 715109998Smarkmvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 71655714Skris { 71755714Skris#ifdef BN_LLONG 71855714Skris BN_ULLONG t,tt; 71955714Skris#else 72055714Skris BN_ULONG bl,bh; 72155714Skris#endif 72255714Skris BN_ULONG t1,t2; 72355714Skris BN_ULONG c1,c2,c3; 72455714Skris 72555714Skris c1=0; 72655714Skris c2=0; 72755714Skris c3=0; 72855714Skris sqr_add_c(a,0,c1,c2,c3); 72955714Skris r[0]=c1; 73055714Skris c1=0; 73155714Skris sqr_add_c2(a,1,0,c2,c3,c1); 73255714Skris r[1]=c2; 73355714Skris c2=0; 73455714Skris sqr_add_c(a,1,c3,c1,c2); 73555714Skris sqr_add_c2(a,2,0,c3,c1,c2); 73655714Skris r[2]=c3; 73755714Skris c3=0; 73855714Skris sqr_add_c2(a,3,0,c1,c2,c3); 73955714Skris sqr_add_c2(a,2,1,c1,c2,c3); 74055714Skris r[3]=c1; 74155714Skris c1=0; 74255714Skris sqr_add_c(a,2,c2,c3,c1); 74355714Skris sqr_add_c2(a,3,1,c2,c3,c1); 74455714Skris sqr_add_c2(a,4,0,c2,c3,c1); 74555714Skris r[4]=c2; 74655714Skris c2=0; 74755714Skris sqr_add_c2(a,5,0,c3,c1,c2); 74855714Skris sqr_add_c2(a,4,1,c3,c1,c2); 74955714Skris sqr_add_c2(a,3,2,c3,c1,c2); 75055714Skris r[5]=c3; 75155714Skris c3=0; 75255714Skris sqr_add_c(a,3,c1,c2,c3); 75355714Skris sqr_add_c2(a,4,2,c1,c2,c3); 75455714Skris sqr_add_c2(a,5,1,c1,c2,c3); 75555714Skris sqr_add_c2(a,6,0,c1,c2,c3); 75655714Skris r[6]=c1; 75755714Skris c1=0; 75855714Skris sqr_add_c2(a,7,0,c2,c3,c1); 75955714Skris sqr_add_c2(a,6,1,c2,c3,c1); 76055714Skris sqr_add_c2(a,5,2,c2,c3,c1); 76155714Skris sqr_add_c2(a,4,3,c2,c3,c1); 76255714Skris r[7]=c2; 76355714Skris c2=0; 76455714Skris sqr_add_c(a,4,c3,c1,c2); 76555714Skris sqr_add_c2(a,5,3,c3,c1,c2); 76655714Skris sqr_add_c2(a,6,2,c3,c1,c2); 76755714Skris sqr_add_c2(a,7,1,c3,c1,c2); 76855714Skris r[8]=c3; 76955714Skris c3=0; 77055714Skris sqr_add_c2(a,7,2,c1,c2,c3); 77155714Skris sqr_add_c2(a,6,3,c1,c2,c3); 77255714Skris sqr_add_c2(a,5,4,c1,c2,c3); 77355714Skris r[9]=c1; 77455714Skris c1=0; 77555714Skris sqr_add_c(a,5,c2,c3,c1); 77655714Skris sqr_add_c2(a,6,4,c2,c3,c1); 77755714Skris sqr_add_c2(a,7,3,c2,c3,c1); 77855714Skris r[10]=c2; 77955714Skris c2=0; 78055714Skris sqr_add_c2(a,7,4,c3,c1,c2); 78155714Skris sqr_add_c2(a,6,5,c3,c1,c2); 78255714Skris r[11]=c3; 78355714Skris c3=0; 78455714Skris sqr_add_c(a,6,c1,c2,c3); 78555714Skris sqr_add_c2(a,7,5,c1,c2,c3); 78655714Skris r[12]=c1; 78755714Skris c1=0; 78855714Skris sqr_add_c2(a,7,6,c2,c3,c1); 78955714Skris r[13]=c2; 79055714Skris c2=0; 79155714Skris sqr_add_c(a,7,c3,c1,c2); 79255714Skris r[14]=c3; 79355714Skris r[15]=c1; 79455714Skris } 79555714Skris 796109998Smarkmvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 79755714Skris { 79855714Skris#ifdef BN_LLONG 79955714Skris BN_ULLONG t,tt; 80055714Skris#else 80155714Skris BN_ULONG bl,bh; 80255714Skris#endif 80355714Skris BN_ULONG t1,t2; 80455714Skris BN_ULONG c1,c2,c3; 80555714Skris 80655714Skris c1=0; 80755714Skris c2=0; 80855714Skris c3=0; 80955714Skris sqr_add_c(a,0,c1,c2,c3); 81055714Skris r[0]=c1; 81155714Skris c1=0; 81255714Skris sqr_add_c2(a,1,0,c2,c3,c1); 81355714Skris r[1]=c2; 81455714Skris c2=0; 81555714Skris sqr_add_c(a,1,c3,c1,c2); 81655714Skris sqr_add_c2(a,2,0,c3,c1,c2); 81755714Skris r[2]=c3; 81855714Skris c3=0; 81955714Skris sqr_add_c2(a,3,0,c1,c2,c3); 82055714Skris sqr_add_c2(a,2,1,c1,c2,c3); 82155714Skris r[3]=c1; 82255714Skris c1=0; 82355714Skris sqr_add_c(a,2,c2,c3,c1); 82455714Skris sqr_add_c2(a,3,1,c2,c3,c1); 82555714Skris r[4]=c2; 82655714Skris c2=0; 82755714Skris sqr_add_c2(a,3,2,c3,c1,c2); 82855714Skris r[5]=c3; 82955714Skris c3=0; 83055714Skris sqr_add_c(a,3,c1,c2,c3); 83155714Skris r[6]=c1; 83255714Skris r[7]=c2; 83355714Skris } 834238405Sjkim 835238405Sjkim#ifdef OPENSSL_NO_ASM 836238405Sjkim#ifdef OPENSSL_BN_ASM_MONT 837238405Sjkim#include <alloca.h> 838238405Sjkim/* 839238405Sjkim * This is essentially reference implementation, which may or may not 840238405Sjkim * result in performance improvement. E.g. on IA-32 this routine was 841238405Sjkim * observed to give 40% faster rsa1024 private key operations and 10% 842238405Sjkim * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only 843238405Sjkim * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a 844238405Sjkim * reference implementation, one to be used as starting point for 845238405Sjkim * platform-specific assembler. Mentioned numbers apply to compiler 846238405Sjkim * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and 847238405Sjkim * can vary not only from platform to platform, but even for compiler 848238405Sjkim * versions. Assembler vs. assembler improvement coefficients can 849238405Sjkim * [and are known to] differ and are to be documented elsewhere. 850238405Sjkim */ 851238405Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num) 852238405Sjkim { 853238405Sjkim BN_ULONG c0,c1,ml,*tp,n0; 854238405Sjkim#ifdef mul64 855238405Sjkim BN_ULONG mh; 856238405Sjkim#endif 857238405Sjkim volatile BN_ULONG *vp; 858238405Sjkim int i=0,j; 859238405Sjkim 860238405Sjkim#if 0 /* template for platform-specific implementation */ 861238405Sjkim if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num); 862238405Sjkim#endif 863238405Sjkim vp = tp = alloca((num+2)*sizeof(BN_ULONG)); 864238405Sjkim 865238405Sjkim n0 = *n0p; 866238405Sjkim 867238405Sjkim c0 = 0; 868238405Sjkim ml = bp[0]; 869238405Sjkim#ifdef mul64 870238405Sjkim mh = HBITS(ml); 871238405Sjkim ml = LBITS(ml); 872238405Sjkim for (j=0;j<num;++j) 873238405Sjkim mul(tp[j],ap[j],ml,mh,c0); 874238405Sjkim#else 875238405Sjkim for (j=0;j<num;++j) 876238405Sjkim mul(tp[j],ap[j],ml,c0); 877238405Sjkim#endif 878238405Sjkim 879238405Sjkim tp[num] = c0; 880238405Sjkim tp[num+1] = 0; 881238405Sjkim goto enter; 882238405Sjkim 883238405Sjkim for(i=0;i<num;i++) 884238405Sjkim { 885238405Sjkim c0 = 0; 886238405Sjkim ml = bp[i]; 887238405Sjkim#ifdef mul64 888238405Sjkim mh = HBITS(ml); 889238405Sjkim ml = LBITS(ml); 890238405Sjkim for (j=0;j<num;++j) 891238405Sjkim mul_add(tp[j],ap[j],ml,mh,c0); 892238405Sjkim#else 893238405Sjkim for (j=0;j<num;++j) 894238405Sjkim mul_add(tp[j],ap[j],ml,c0); 895238405Sjkim#endif 896238405Sjkim c1 = (tp[num] + c0)&BN_MASK2; 897238405Sjkim tp[num] = c1; 898238405Sjkim tp[num+1] = (c1<c0?1:0); 899238405Sjkim enter: 900238405Sjkim c1 = tp[0]; 901238405Sjkim ml = (c1*n0)&BN_MASK2; 902238405Sjkim c0 = 0; 903238405Sjkim#ifdef mul64 904238405Sjkim mh = HBITS(ml); 905238405Sjkim ml = LBITS(ml); 906238405Sjkim mul_add(c1,np[0],ml,mh,c0); 907238405Sjkim#else 908238405Sjkim mul_add(c1,ml,np[0],c0); 909238405Sjkim#endif 910238405Sjkim for(j=1;j<num;j++) 911238405Sjkim { 912238405Sjkim c1 = tp[j]; 913238405Sjkim#ifdef mul64 914238405Sjkim mul_add(c1,np[j],ml,mh,c0); 915238405Sjkim#else 916238405Sjkim mul_add(c1,ml,np[j],c0); 917238405Sjkim#endif 918238405Sjkim tp[j-1] = c1&BN_MASK2; 919238405Sjkim } 920238405Sjkim c1 = (tp[num] + c0)&BN_MASK2; 921238405Sjkim tp[num-1] = c1; 922238405Sjkim tp[num] = tp[num+1] + (c1<c0?1:0); 923238405Sjkim } 924238405Sjkim 925238405Sjkim if (tp[num]!=0 || tp[num-1]>=np[num-1]) 926238405Sjkim { 927238405Sjkim c0 = bn_sub_words(rp,tp,np,num); 928238405Sjkim if (tp[num]!=0 || c0==0) 929238405Sjkim { 930238405Sjkim for(i=0;i<num+2;i++) vp[i] = 0; 931238405Sjkim return 1; 932238405Sjkim } 933238405Sjkim } 934238405Sjkim for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0; 935238405Sjkim vp[num] = 0; 936238405Sjkim vp[num+1] = 0; 937238405Sjkim return 1; 938238405Sjkim } 939238405Sjkim#else 940238405Sjkim/* 941238405Sjkim * Return value of 0 indicates that multiplication/convolution was not 942238405Sjkim * performed to signal the caller to fall down to alternative/original 943238405Sjkim * code-path. 944238405Sjkim */ 945238405Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num) 946238405Sjkim{ return 0; } 947238405Sjkim#endif /* OPENSSL_BN_ASM_MONT */ 948238405Sjkim#endif 949238405Sjkim 95059191Skris#else /* !BN_MUL_COMBA */ 95155714Skris 95255714Skris/* hmm... is it faster just to do a multiply? */ 95355714Skris#undef bn_sqr_comba4 954238405Sjkimvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 95555714Skris { 95655714Skris BN_ULONG t[8]; 95755714Skris bn_sqr_normal(r,a,4,t); 95855714Skris } 95955714Skris 96055714Skris#undef bn_sqr_comba8 961238405Sjkimvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 96255714Skris { 96355714Skris BN_ULONG t[16]; 96455714Skris bn_sqr_normal(r,a,8,t); 96555714Skris } 96655714Skris 96755714Skrisvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 96855714Skris { 96955714Skris r[4]=bn_mul_words( &(r[0]),a,4,b[0]); 97055714Skris r[5]=bn_mul_add_words(&(r[1]),a,4,b[1]); 97155714Skris r[6]=bn_mul_add_words(&(r[2]),a,4,b[2]); 97255714Skris r[7]=bn_mul_add_words(&(r[3]),a,4,b[3]); 97355714Skris } 97455714Skris 97555714Skrisvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 97655714Skris { 97755714Skris r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]); 97855714Skris r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]); 97955714Skris r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]); 98055714Skris r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]); 98155714Skris r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]); 98255714Skris r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]); 98355714Skris r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]); 98455714Skris r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]); 98555714Skris } 98655714Skris 987238405Sjkim#ifdef OPENSSL_NO_ASM 988238405Sjkim#ifdef OPENSSL_BN_ASM_MONT 989238405Sjkim#include <alloca.h> 990238405Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num) 991238405Sjkim { 992238405Sjkim BN_ULONG c0,c1,*tp,n0=*n0p; 993238405Sjkim volatile BN_ULONG *vp; 994238405Sjkim int i=0,j; 995238405Sjkim 996238405Sjkim vp = tp = alloca((num+2)*sizeof(BN_ULONG)); 997238405Sjkim 998238405Sjkim for(i=0;i<=num;i++) tp[i]=0; 999238405Sjkim 1000238405Sjkim for(i=0;i<num;i++) 1001238405Sjkim { 1002238405Sjkim c0 = bn_mul_add_words(tp,ap,num,bp[i]); 1003238405Sjkim c1 = (tp[num] + c0)&BN_MASK2; 1004238405Sjkim tp[num] = c1; 1005238405Sjkim tp[num+1] = (c1<c0?1:0); 1006238405Sjkim 1007238405Sjkim c0 = bn_mul_add_words(tp,np,num,tp[0]*n0); 1008238405Sjkim c1 = (tp[num] + c0)&BN_MASK2; 1009238405Sjkim tp[num] = c1; 1010238405Sjkim tp[num+1] += (c1<c0?1:0); 1011238405Sjkim for(j=0;j<=num;j++) tp[j]=tp[j+1]; 1012238405Sjkim } 1013238405Sjkim 1014238405Sjkim if (tp[num]!=0 || tp[num-1]>=np[num-1]) 1015238405Sjkim { 1016238405Sjkim c0 = bn_sub_words(rp,tp,np,num); 1017238405Sjkim if (tp[num]!=0 || c0==0) 1018238405Sjkim { 1019238405Sjkim for(i=0;i<num+2;i++) vp[i] = 0; 1020238405Sjkim return 1; 1021238405Sjkim } 1022238405Sjkim } 1023238405Sjkim for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0; 1024238405Sjkim vp[num] = 0; 1025238405Sjkim vp[num+1] = 0; 1026238405Sjkim return 1; 1027238405Sjkim } 1028238405Sjkim#else 1029238405Sjkimint bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num) 1030238405Sjkim{ return 0; } 1031238405Sjkim#endif /* OPENSSL_BN_ASM_MONT */ 1032238405Sjkim#endif 1033238405Sjkim 103459191Skris#endif /* !BN_MUL_COMBA */ 1035