1205128Ssimon#include "../bn_lcl.h" 2238405Sjkim#if !(defined(__GNUC__) && __GNUC__>=2) 3162911Ssimon# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ 4162911Ssimon#else 5109998Smarkm/* 6109998Smarkm * x86_64 BIGNUM accelerator version 0.1, December 2002. 7109998Smarkm * 8109998Smarkm * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 9109998Smarkm * project. 10109998Smarkm * 11109998Smarkm * Rights for redistribution and usage in source and binary forms are 12109998Smarkm * granted according to the OpenSSL license. Warranty of any kind is 13109998Smarkm * disclaimed. 14109998Smarkm * 15109998Smarkm * Q. Version 0.1? It doesn't sound like Andy, he used to assign real 16109998Smarkm * versions, like 1.0... 17109998Smarkm * A. Well, that's because this code is basically a quick-n-dirty 18109998Smarkm * proof-of-concept hack. As you can see it's implemented with 19109998Smarkm * inline assembler, which means that you're bound to GCC and that 20160814Ssimon * there might be enough room for further improvement. 21109998Smarkm * 22109998Smarkm * Q. Why inline assembler? 23160814Ssimon * A. x86_64 features own ABI which I'm not familiar with. This is 24160814Ssimon * why I decided to let the compiler take care of subroutine 25160814Ssimon * prologue/epilogue as well as register allocation. For reference. 26160814Ssimon * Win64 implements different ABI for AMD64, different from Linux. 27109998Smarkm * 28109998Smarkm * Q. How much faster does it get? 29160814Ssimon * A. 
'apps/openssl speed rsa dsa' output with no-asm: 30160814Ssimon * 31160814Ssimon * sign verify sign/s verify/s 32160814Ssimon * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 33160814Ssimon * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 34160814Ssimon * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 35160814Ssimon * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 36160814Ssimon * sign verify sign/s verify/s 37160814Ssimon * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 38160814Ssimon * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 39160814Ssimon * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 40160814Ssimon * 41160814Ssimon * 'apps/openssl speed rsa dsa' output with this module: 42160814Ssimon * 43160814Ssimon * sign verify sign/s verify/s 44160814Ssimon * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 45160814Ssimon * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 46160814Ssimon * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 47160814Ssimon * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 48160814Ssimon * sign verify sign/s verify/s 49160814Ssimon * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 50160814Ssimon * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 51160814Ssimon * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 52160814Ssimon * 53160814Ssimon * For the reference. IA-32 assembler implementation performs 54160814Ssimon * very much like 64-bit code compiled with no-asm on the same 55160814Ssimon * machine. 
56109998Smarkm */ 57109998Smarkm 58238405Sjkim#ifdef _WIN64 59238405Sjkim#define BN_ULONG unsigned long long 60238405Sjkim#else 61109998Smarkm#define BN_ULONG unsigned long 62238405Sjkim#endif 63109998Smarkm 64205128Ssimon#undef mul 65205128Ssimon#undef mul_add 66206046Ssimon#undef sqr 67205128Ssimon 68109998Smarkm/* 69109998Smarkm * "m"(a), "+m"(r) is the way to favor DirectPath �-code; 70109998Smarkm * "g"(0) let the compiler to decide where does it 71109998Smarkm * want to keep the value of zero; 72109998Smarkm */ 73109998Smarkm#define mul_add(r,a,word,carry) do { \ 74109998Smarkm register BN_ULONG high,low; \ 75109998Smarkm asm ("mulq %3" \ 76109998Smarkm : "=a"(low),"=d"(high) \ 77109998Smarkm : "a"(word),"m"(a) \ 78109998Smarkm : "cc"); \ 79109998Smarkm asm ("addq %2,%0; adcq %3,%1" \ 80109998Smarkm : "+r"(carry),"+d"(high)\ 81109998Smarkm : "a"(low),"g"(0) \ 82109998Smarkm : "cc"); \ 83109998Smarkm asm ("addq %2,%0; adcq %3,%1" \ 84109998Smarkm : "+m"(r),"+d"(high) \ 85109998Smarkm : "r"(carry),"g"(0) \ 86109998Smarkm : "cc"); \ 87109998Smarkm carry=high; \ 88109998Smarkm } while (0) 89109998Smarkm 90109998Smarkm#define mul(r,a,word,carry) do { \ 91109998Smarkm register BN_ULONG high,low; \ 92109998Smarkm asm ("mulq %3" \ 93109998Smarkm : "=a"(low),"=d"(high) \ 94109998Smarkm : "a"(word),"g"(a) \ 95109998Smarkm : "cc"); \ 96109998Smarkm asm ("addq %2,%0; adcq %3,%1" \ 97109998Smarkm : "+r"(carry),"+d"(high)\ 98109998Smarkm : "a"(low),"g"(0) \ 99109998Smarkm : "cc"); \ 100109998Smarkm (r)=carry, carry=high; \ 101109998Smarkm } while (0) 102109998Smarkm 103109998Smarkm#define sqr(r0,r1,a) \ 104109998Smarkm asm ("mulq %2" \ 105109998Smarkm : "=a"(r0),"=d"(r1) \ 106109998Smarkm : "a"(a) \ 107109998Smarkm : "cc"); 108109998Smarkm 109205128SsimonBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 110109998Smarkm { 111109998Smarkm BN_ULONG c1=0; 112109998Smarkm 113109998Smarkm if (num <= 0) return(c1); 114109998Smarkm 115109998Smarkm 
while (num&~3) 116109998Smarkm { 117109998Smarkm mul_add(rp[0],ap[0],w,c1); 118109998Smarkm mul_add(rp[1],ap[1],w,c1); 119109998Smarkm mul_add(rp[2],ap[2],w,c1); 120109998Smarkm mul_add(rp[3],ap[3],w,c1); 121109998Smarkm ap+=4; rp+=4; num-=4; 122109998Smarkm } 123109998Smarkm if (num) 124109998Smarkm { 125109998Smarkm mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1; 126109998Smarkm mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1; 127109998Smarkm mul_add(rp[2],ap[2],w,c1); return c1; 128109998Smarkm } 129109998Smarkm 130109998Smarkm return(c1); 131109998Smarkm } 132109998Smarkm 133205128SsimonBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 134109998Smarkm { 135109998Smarkm BN_ULONG c1=0; 136109998Smarkm 137109998Smarkm if (num <= 0) return(c1); 138109998Smarkm 139109998Smarkm while (num&~3) 140109998Smarkm { 141109998Smarkm mul(rp[0],ap[0],w,c1); 142109998Smarkm mul(rp[1],ap[1],w,c1); 143109998Smarkm mul(rp[2],ap[2],w,c1); 144109998Smarkm mul(rp[3],ap[3],w,c1); 145109998Smarkm ap+=4; rp+=4; num-=4; 146109998Smarkm } 147109998Smarkm if (num) 148109998Smarkm { 149109998Smarkm mul(rp[0],ap[0],w,c1); if (--num == 0) return c1; 150109998Smarkm mul(rp[1],ap[1],w,c1); if (--num == 0) return c1; 151109998Smarkm mul(rp[2],ap[2],w,c1); 152109998Smarkm } 153109998Smarkm return(c1); 154109998Smarkm } 155109998Smarkm 156205128Ssimonvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 157109998Smarkm { 158109998Smarkm if (n <= 0) return; 159109998Smarkm 160109998Smarkm while (n&~3) 161109998Smarkm { 162109998Smarkm sqr(r[0],r[1],a[0]); 163109998Smarkm sqr(r[2],r[3],a[1]); 164109998Smarkm sqr(r[4],r[5],a[2]); 165109998Smarkm sqr(r[6],r[7],a[3]); 166109998Smarkm a+=4; r+=8; n-=4; 167109998Smarkm } 168109998Smarkm if (n) 169109998Smarkm { 170109998Smarkm sqr(r[0],r[1],a[0]); if (--n == 0) return; 171109998Smarkm sqr(r[2],r[3],a[1]); if (--n == 0) return; 172109998Smarkm sqr(r[4],r[5],a[2]); 173109998Smarkm } 174109998Smarkm } 175109998Smarkm 
/*
 * BN_ULONG is normally #define'd near the top of this file.  The guarded
 * fallback below is a no-op in that case; it only keeps this section
 * self-contained.
 */
#ifndef BN_ULONG
# ifdef _WIN64
#  define BN_ULONG unsigned long long
# else
#  define BN_ULONG unsigned long
# endif
#endif

/*
 * Divide the two-word value (h:l) by d and return the quotient; the
 * remainder is discarded.
 *
 * NOTE: the x86 DIV instruction raises a divide error when d is zero or
 * when the quotient does not fit in one word (i.e. h >= d), so callers
 * must guarantee h < d.  The divisor uses "r" rather than "g" because
 * divq has no immediate-operand form.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG ret, waste;

    __asm__ ("divq %4"
             : "=a"(ret), "=d"(waste)
             : "a"(l), "d"(h), "r"(d)
             : "cc");

    return ret;
}

/*
 * rp[i] = ap[i] + bp[i] + carry for i in [0, n).
 * Returns the final carry (0 or 1); returns 0 when n <= 0.
 *
 * The loop counter is decremented with decl/jnz on the register bound to
 * the int n.  The previous "loop" instruction counts down the full 64-bit
 * RCX, while the "+c"(n) constraint only defines its low 32 bits.  Neither
 * lea nor dec modifies CF, so the carry chain between adcq's is preserved.
 */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int n)
{
    BN_ULONG ret, i;

    if (n <= 0)
        return 0;

    __asm__ __volatile__ (
        "       subq    %2,%2           \n"   /* i = 0, CF = 0 */
        ".p2align 4                     \n"
        "1:     movq    (%4,%2,8),%0    \n"
        "       adcq    (%5,%2,8),%0    \n"
        "       movq    %0,(%3,%2,8)    \n"
        "       leaq    1(%2),%2        \n"
        "       decl    %1              \n"
        "       jnz     1b              \n"
        "       sbbq    %0,%0           \n"   /* ret = CF ? -1 : 0 */
        : "=&a"(ret), "+c"(n), "=&r"(i)
        : "r"(rp), "r"(ap), "r"(bp)
        : "cc", "memory"
    );

    return ret & 1;
}

#ifndef SIMICS
/*
 * rp[i] = ap[i] - bp[i] - borrow for i in [0, n).
 * Returns the final borrow (0 or 1); returns 0 when n <= 0.
 *
 * Same loop structure as bn_add_words: decl/jnz leaves CF untouched, so
 * the borrow chain between sbbq's is preserved.
 */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int n)
{
    BN_ULONG ret, i;

    if (n <= 0)
        return 0;

    __asm__ __volatile__ (
        "       subq    %2,%2           \n"   /* i = 0, CF = 0 */
        ".p2align 4                     \n"
        "1:     movq    (%4,%2,8),%0    \n"
        "       sbbq    (%5,%2,8),%0    \n"
        "       movq    %0,(%3,%2,8)    \n"
        "       leaq    1(%2),%2        \n"
        "       decl    %1              \n"
        "       jnz     1b              \n"
        "       sbbq    %0,%0           \n"   /* ret = CF ? -1 : 0 */
        : "=&a"(ret), "+c"(n), "=&r"(i)
        : "r"(rp), "r"(ap), "r"(bp)
        : "cc", "memory"
    );

    return ret & 1;
}
#else
/* Simics 1.4<7 has buggy sbbq:-( */
#define BN_MASK2 0xffffffffffffffffL
/*
 * Portable C replacement: r[i] = a[i] - b[i] - borrow for i in [0, n).
 * Returns the final borrow; returns 0 when n <= 0.
 * The inputs are const-qualified to match the assembler variant above.
 */
BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
{
    BN_ULONG t1, t2;
    int c = 0;

    if (n <= 0)
        return (BN_ULONG)0;

    for (;;) {
        /* When t1 == t2 the incoming borrow is passed through unchanged. */
        t1 = a[0]; t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2) c = (t1 < t2);
        if (--n <= 0) break;

        t1 = a[1]; t2 = b[1];
        r[1] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2) c = (t1 < t2);
        if (--n <= 0) break;

        t1 = a[2]; t2 = b[2];
        r[2] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2) c = (t1 < t2);
        if (--n <= 0) break;

        t1 = a[3]; t2 = b[3];
        r[3] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2) c = (t1 < t2);
        if (--n <= 0) break;

        a += 4;
        b += 4;
        r += 4;
    }
    return c;
}
#endif

/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */

/*
 * Keep in mind that carrying into high part of multiplication result
 * can not overflow, because it cannot be all-ones.
 *
 * All of the macros below use t1 and t2 as scratch; the enclosing
 * function must declare them as BN_ULONG.
 */
#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) {       \
    BN_ULONG ta=(a),tb=(b);             \
    t1 = ta * tb;                       \
    t2 = BN_UMULT_HIGH(ta,tb);          \
    c0 += t1; t2 += (c0<t1)?1:0;        \
    c1 += t2; c2 += (c1<t2)?1:0;        \
    }

#define mul_add_c2(a,b,c0,c1,c2) {      \
    BN_ULONG ta=(a),tb=(b),t0;          \
    t1 = BN_UMULT_HIGH(ta,tb);          \
    t0 = ta * tb;                       \
    c0 += t0; t2 = t1+((c0<t0)?1:0);    \
    c1 += t2; c2 += (c1<t2)?1:0;        \
    c0 += t0; t1 += (c0<t0)?1:0;        \
    c1 += t1; c2 += (c1<t1)?1:0;        \
    }
#else
#define mul_add_c(a,b,c0,c1,c2) do {    \
    __asm__ ("mulq %3"                  \
        : "=a"(t1),"=d"(t2)             \
        : "a"(a),"m"(b)                 \
        : "cc");                        \
    __asm__ ("addq %2,%0; adcq %3,%1"   \
        : "+r"(c0),"+d"(t2)             \
        : "a"(t1),"g"(0)                \
        : "cc");                        \
    __asm__ ("addq %2,%0; adcq %3,%1"   \
        : "+r"(c1),"+r"(c2)             \
        : "d"(t2),"g"(0)                \
        : "cc");                        \
    } while (0)

#define sqr_add_c(a,i,c0,c1,c2) do {    \
    __asm__ ("mulq %2"                  \
        : "=a"(t1),"=d"(t2)             \
        : "a"((a)[i])                   \
        : "cc");                        \
    __asm__ ("addq %2,%0; adcq %3,%1"   \
        : "+r"(c0),"+d"(t2)             \
        : "a"(t1),"g"(0)                \
        : "cc");                        \
    __asm__ ("addq %2,%0; adcq %3,%1"   \
        : "+r"(c1),"+r"(c2)             \
        : "d"(t2),"g"(0)                \
        : "cc");                        \
    } while (0)

/* The product (t2:t1) is added into (c2,c1,c0) twice. */
#define mul_add_c2(a,b,c0,c1,c2) do {   \
    __asm__ ("mulq %3"                  \
        : "=a"(t1),"=d"(t2)             \
        : "a"(a),"m"(b)                 \
        : "cc");                        \
    __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
        : "+r"(c0),"+r"(c1),"+r"(c2)    \
        : "r"(t1),"r"(t2),"g"(0)        \
        : "cc");                        \
    __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
        : "+r"(c0),"+r"(c1),"+r"(c2)    \
        : "r"(t1),"r"(t2),"g"(0)        \
        : "cc");                        \
    } while (0)
#endif

#define sqr_add_c2(a,i,j,c0,c1,c2)      \
    mul_add_c2((a)[i],(a)[j],c0,c1,c2)

/*
 * r[0..15] = a[0..7] * b[0..7], computed column by column: for each
 * output word k, every product a[i]*b[j] with i+j == k is accumulated
 * into a rotating three-word accumulator (c1,c2,c3), whose lowest word is
 * then stored to r[k].
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1, t2;            /* scratch used by mul_add_c */
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/*
 * r[0..7] = a[0..3] * b[0..3], using the same column-wise accumulation
 * as bn_mul_comba8.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1, t2;            /* scratch used by mul_add_c */
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

493205128Ssimonvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 494109998Smarkm { 495109998Smarkm BN_ULONG t1,t2; 496109998Smarkm BN_ULONG c1,c2,c3; 497109998Smarkm 498109998Smarkm c1=0; 499109998Smarkm c2=0; 500109998Smarkm c3=0; 501109998Smarkm sqr_add_c(a,0,c1,c2,c3); 502109998Smarkm r[0]=c1; 503109998Smarkm c1=0; 504109998Smarkm sqr_add_c2(a,1,0,c2,c3,c1); 505109998Smarkm r[1]=c2; 506109998Smarkm c2=0; 507109998Smarkm sqr_add_c(a,1,c3,c1,c2); 508109998Smarkm sqr_add_c2(a,2,0,c3,c1,c2); 509109998Smarkm r[2]=c3; 510109998Smarkm c3=0; 511109998Smarkm sqr_add_c2(a,3,0,c1,c2,c3); 512109998Smarkm sqr_add_c2(a,2,1,c1,c2,c3); 513109998Smarkm r[3]=c1; 514109998Smarkm c1=0; 515109998Smarkm sqr_add_c(a,2,c2,c3,c1); 516109998Smarkm sqr_add_c2(a,3,1,c2,c3,c1); 517109998Smarkm sqr_add_c2(a,4,0,c2,c3,c1); 518109998Smarkm r[4]=c2; 519109998Smarkm c2=0; 520109998Smarkm sqr_add_c2(a,5,0,c3,c1,c2); 521109998Smarkm sqr_add_c2(a,4,1,c3,c1,c2); 522109998Smarkm sqr_add_c2(a,3,2,c3,c1,c2); 523109998Smarkm r[5]=c3; 524109998Smarkm c3=0; 525109998Smarkm sqr_add_c(a,3,c1,c2,c3); 526109998Smarkm sqr_add_c2(a,4,2,c1,c2,c3); 527109998Smarkm sqr_add_c2(a,5,1,c1,c2,c3); 528109998Smarkm sqr_add_c2(a,6,0,c1,c2,c3); 529109998Smarkm r[6]=c1; 530109998Smarkm c1=0; 531109998Smarkm sqr_add_c2(a,7,0,c2,c3,c1); 532109998Smarkm sqr_add_c2(a,6,1,c2,c3,c1); 533109998Smarkm sqr_add_c2(a,5,2,c2,c3,c1); 534109998Smarkm sqr_add_c2(a,4,3,c2,c3,c1); 535109998Smarkm r[7]=c2; 536109998Smarkm c2=0; 537109998Smarkm sqr_add_c(a,4,c3,c1,c2); 538109998Smarkm sqr_add_c2(a,5,3,c3,c1,c2); 539109998Smarkm sqr_add_c2(a,6,2,c3,c1,c2); 540109998Smarkm sqr_add_c2(a,7,1,c3,c1,c2); 541109998Smarkm r[8]=c3; 542109998Smarkm c3=0; 543109998Smarkm sqr_add_c2(a,7,2,c1,c2,c3); 544109998Smarkm sqr_add_c2(a,6,3,c1,c2,c3); 545109998Smarkm sqr_add_c2(a,5,4,c1,c2,c3); 546109998Smarkm r[9]=c1; 547109998Smarkm c1=0; 548109998Smarkm sqr_add_c(a,5,c2,c3,c1); 549109998Smarkm sqr_add_c2(a,6,4,c2,c3,c1); 550109998Smarkm 
sqr_add_c2(a,7,3,c2,c3,c1); 551109998Smarkm r[10]=c2; 552109998Smarkm c2=0; 553109998Smarkm sqr_add_c2(a,7,4,c3,c1,c2); 554109998Smarkm sqr_add_c2(a,6,5,c3,c1,c2); 555109998Smarkm r[11]=c3; 556109998Smarkm c3=0; 557109998Smarkm sqr_add_c(a,6,c1,c2,c3); 558109998Smarkm sqr_add_c2(a,7,5,c1,c2,c3); 559109998Smarkm r[12]=c1; 560109998Smarkm c1=0; 561109998Smarkm sqr_add_c2(a,7,6,c2,c3,c1); 562109998Smarkm r[13]=c2; 563109998Smarkm c2=0; 564109998Smarkm sqr_add_c(a,7,c3,c1,c2); 565109998Smarkm r[14]=c3; 566109998Smarkm r[15]=c1; 567109998Smarkm } 568109998Smarkm 569205128Ssimonvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 570109998Smarkm { 571109998Smarkm BN_ULONG t1,t2; 572109998Smarkm BN_ULONG c1,c2,c3; 573109998Smarkm 574109998Smarkm c1=0; 575109998Smarkm c2=0; 576109998Smarkm c3=0; 577109998Smarkm sqr_add_c(a,0,c1,c2,c3); 578109998Smarkm r[0]=c1; 579109998Smarkm c1=0; 580109998Smarkm sqr_add_c2(a,1,0,c2,c3,c1); 581109998Smarkm r[1]=c2; 582109998Smarkm c2=0; 583109998Smarkm sqr_add_c(a,1,c3,c1,c2); 584109998Smarkm sqr_add_c2(a,2,0,c3,c1,c2); 585109998Smarkm r[2]=c3; 586109998Smarkm c3=0; 587109998Smarkm sqr_add_c2(a,3,0,c1,c2,c3); 588109998Smarkm sqr_add_c2(a,2,1,c1,c2,c3); 589109998Smarkm r[3]=c1; 590109998Smarkm c1=0; 591109998Smarkm sqr_add_c(a,2,c2,c3,c1); 592109998Smarkm sqr_add_c2(a,3,1,c2,c3,c1); 593109998Smarkm r[4]=c2; 594109998Smarkm c2=0; 595109998Smarkm sqr_add_c2(a,3,2,c3,c1,c2); 596109998Smarkm r[5]=c3; 597109998Smarkm c3=0; 598109998Smarkm sqr_add_c(a,3,c1,c2,c3); 599109998Smarkm r[6]=c1; 600109998Smarkm r[7]=c2; 601109998Smarkm } 602162911Ssimon#endif 603