/* x86_64-gcc.c, revision 205128 */
#include "../bn_lcl.h"
#ifdef __SUNPRO_C
# include "../bn_asm.c"	/* kind of dirty hack for Sun Studio */
#else
/*
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features own ABI which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference.
 *    Win64 implements different ABI for AMD64, different from Linux.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *	                  sign    verify    sign/s verify/s
 *	rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
 *	rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
 *	rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
 *	rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
 *	                  sign    verify    sign/s verify/s
 *	dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
 *	dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
 *	dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *	                  sign    verify    sign/s verify/s
 *	rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
 *	rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
 *	rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
 *	rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
 *	                  sign    verify    sign/s verify/s
 *	dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
 *	dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
 *	dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
 *
 *    For the reference. IA-32 assembler implementation performs
 *    very much like 64-bit code compiled with no-asm on the same
 *    machine.
 */
/*
 * Word-level bignum primitives implemented with GCC extended inline asm.
 *
 * The q-suffixed instructions used throughout operate on 64-bit operands,
 * so this code relies on unsigned long being 64 bits wide (LP64 ABIs).
 * __asm__ is spelled with underscores so the file also builds in strict
 * ISO modes (-std=c99, -std=c11), where GCC does not recognise plain asm.
 */
#define BN_ULONG unsigned long

/*
 * Drop any earlier definitions of the helper macros (presumably made by
 * ../bn_lcl.h, which is not visible from here) so that the versions below
 * take effect without redefinition diagnostics.
 */
#undef mul
#undef mul_add
#undef sqr

/*
 * "m"(a), "+m"(r) is the way to favor DirectPath micro-code;
 * "g"(0) let the compiler to decide where does it
 * want to keep the value of zero;
 */

/*
 * mul_add(r,a,word,carry): (carry,r) = r + a*word + carry.
 * r must be an lvalue in memory, a must be a memory operand, carry must be
 * a BN_ULONG variable. The sum cannot overflow two words, so the final
 * carry-out of the high word is always zero.
 */
#define mul_add(r,a,word,carry) do {            \
    register BN_ULONG high,low;                 \
    __asm__ ("mulq %3"                          \
        : "=a"(low),"=d"(high)                  \
        : "a"(word),"m"(a)                      \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+r"(carry),"+d"(high)                \
        : "a"(low),"g"(0)                       \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+m"(r),"+d"(high)                    \
        : "r"(carry),"g"(0)                     \
        : "cc");                                \
    carry=high;                                 \
    } while (0)

/*
 * mul(r,a,word,carry): (carry,r) = a*word + carry.
 * The multiplicand uses "rm" rather than "g": mulq has no immediate form,
 * so the compiler must never be allowed to substitute a constant.
 */
#define mul(r,a,word,carry) do {                \
    register BN_ULONG high,low;                 \
    __asm__ ("mulq %3"                          \
        : "=a"(low),"=d"(high)                  \
        : "a"(word),"rm"(a)                     \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+r"(carry),"+d"(high)                \
        : "a"(low),"g"(0)                       \
        : "cc");                                \
    (r)=carry, carry=high;                      \
    } while (0)

/*
 * sqr(r0,r1,a): (r1,r0) = a*a. %rax is both the implicit and the explicit
 * mulq operand. No trailing semicolon: the caller supplies it.
 */
#define sqr(r0,r1,a)                            \
    __asm__ ("mulq %2"                          \
        : "=a"(r0),"=d"(r1)                     \
        : "a"(a)                                \
        : "cc")

/*
 * rp[i] += ap[i] * w for i in [0, num), propagating the carry between
 * words. Returns the carry out of the last word; returns 0 when num <= 0.
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return c1;

    /* Four words per iteration while at least four remain. */
    while (num & ~3) {
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }

    /* One to three trailing words. */
    if (num) {
        mul_add(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[2], ap[2], w, c1);
        return c1;
    }

    return c1;
}

/*
 * rp[i] = low word of (ap[i] * w + carry) for i in [0, num).
 * Returns the carry out of the last word; returns 0 when num <= 0.
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return c1;

    while (num & ~3) {
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }

    if (num) {
        mul(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[2], ap[2], w, c1);
    }

    return c1;
}

/*
 * (r[2*i+1], r[2*i]) = a[i] * a[i] for i in [0, n). r must therefore have
 * room for 2*n words. Does nothing when n <= 0.
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    if (n <= 0)
        return;

    while (n & ~3) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }

    if (n) {
        sqr(r[0], r[1], a[0]);
        if (--n == 0)
            return;
        sqr(r[2], r[3], a[1]);
        if (--n == 0)
            return;
        sqr(r[4], r[5], a[2]);
    }
}

/*
 * Returns the quotient of the two-word value (h,l) divided by d; the
 * remainder is discarded.
 *
 * NOTE: divq raises a divide error when d is zero or when the quotient
 * does not fit in one word, so callers must guarantee h < d.
 *
 * The divisor uses "r" rather than "g": divq has no immediate form.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG ret, waste;

    __asm__ ("divq %4"
        : "=a"(ret), "=d"(waste)
        : "a"(l), "d"(h), "r"(d)
        : "cc");

    return ret;
}

/*
 * rp[i] = ap[i] + bp[i] + carry for i in [0, n). Returns the final carry
 * (0 or 1); returns 0 when n <= 0.
 *
 * - The counter is widened to a full word because "loop" decrements all
 *   64 bits of %rcx, whereas an int operand only defines its low half.
 * - volatile + "memory": the asm stores through rp and loads through
 *   ap/bp, which the operand list alone does not tell the compiler.
 * - subq zeroes the index and clears CF; leaq and loop leave CF intact,
 *   so the carry chain survives the loop-control instructions.
 */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int n)
{
    BN_ULONG ret, i;
    BN_ULONG cnt;

    if (n <= 0)
        return 0;

    cnt = (BN_ULONG)n;

    __asm__ __volatile__ (
    "       subq    %2,%2           \n"
    ".p2align 4                     \n"
    "1:     movq    (%4,%2,8),%0    \n"
    "       adcq    (%5,%2,8),%0    \n"
    "       movq    %0,(%3,%2,8)    \n"
    "       leaq    1(%2),%2        \n"
    "       loop    1b              \n"
    "       sbbq    %0,%0           \n"
        : "=&a"(ret), "+c"(cnt), "=&r"(i)
        : "r"(rp), "r"(ap), "r"(bp)
        : "cc", "memory"
    );

    return ret & 1;
}

#ifndef SIMICS
/*
 * rp[i] = ap[i] - bp[i] - borrow for i in [0, n). Returns the final borrow
 * (0 or 1); returns 0 when n <= 0. Same structure and constraints as
 * bn_add_words, with sbbq in place of adcq.
 */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, int n)
{
    BN_ULONG ret, i;
    BN_ULONG cnt;

    if (n <= 0)
        return 0;

    cnt = (BN_ULONG)n;

    __asm__ __volatile__ (
    "       subq    %2,%2           \n"
    ".p2align 4                     \n"
    "1:     movq    (%4,%2,8),%0    \n"
    "       sbbq    (%5,%2,8),%0    \n"
    "       movq    %0,(%3,%2,8)    \n"
    "       leaq    1(%2),%2        \n"
    "       loop    1b              \n"
    "       sbbq    %0,%0           \n"
        : "=&a"(ret), "+c"(cnt), "=&r"(i)
        : "r"(rp), "r"(ap), "r"(bp)
        : "cc", "memory"
    );

    return ret & 1;
}
#else
/* Simics 1.4<7 has buggy sbbq:-( */
#define BN_MASK2 0xffffffffffffffffL
/*
 * Portable C fallback for bn_sub_words. The source operands are
 * const-qualified so the signature matches the assembler variant.
 * The borrow c is only updated when the two words differ: when they are
 * equal the incoming borrow simply passes through.
 */
BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
{
    BN_ULONG t1, t2;
    int c = 0;

    if (n <= 0)
        return (BN_ULONG)0;

    for (;;) {
        t1 = a[0]; t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2) c = (t1 < t2);
        if (--n <= 0) break;

        t1 = a[1]; t2 = b[1];
        r[1] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2) c = (t1 < t2);
        if (--n <= 0) break;

        t1 = a[2]; t2 = b[2];
        r[2] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2) c = (t1 < t2);
        if (--n <= 0) break;

        t1 = a[3]; t2 = b[3];
        r[3] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2) c = (t1 < t2);
        if (--n <= 0) break;

        a += 4;
        b += 4;
        r += 4;
    }
    return c;
}
#endif

/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */

/*
 * All of these macros use two BN_ULONG scratch variables named t1 and t2
 * that must be declared in the enclosing scope.
 */
#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) {               \
    BN_ULONG ta=(a),tb=(b);                     \
    t1 = ta * tb;                               \
    t2 = BN_UMULT_HIGH(ta,tb);                  \
    c0 += t1; t2 += (c0<t1)?1:0;                \
    c1 += t2; c2 += (c1<t2)?1:0;                \
    }

/*
 * The product is added twice instead of being doubled first: doubling the
 * high word and then folding two low-word carries into it can wrap and
 * silently drop a carry.
 */
#define mul_add_c2(a,b,c0,c1,c2) {              \
    BN_ULONG ta=(a),tb=(b),tt;                  \
    t1 = ta * tb;                               \
    t2 = BN_UMULT_HIGH(ta,tb);                  \
    c0 += t1; tt = t2+((c0<t1)?1:0);            \
    c1 += tt; c2 += (c1<tt)?1:0;                \
    c0 += t1; t2 += (c0<t1)?1:0;                \
    c1 += t2; c2 += (c1<t2)?1:0;                \
    }
#else
#define mul_add_c(a,b,c0,c1,c2) do {            \
    __asm__ ("mulq %3"                          \
        : "=a"(t1),"=d"(t2)                     \
        : "a"(a),"m"(b)                         \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+r"(c0),"+d"(t2)                     \
        : "a"(t1),"g"(0)                        \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+r"(c1),"+r"(c2)                     \
        : "d"(t2),"g"(0)                        \
        : "cc");                                \
    } while (0)

#define sqr_add_c(a,i,c0,c1,c2) do {            \
    __asm__ ("mulq %2"                          \
        : "=a"(t1),"=d"(t2)                     \
        : "a"(a[i])                             \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+r"(c0),"+d"(t2)                     \
        : "a"(t1),"g"(0)                        \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+r"(c1),"+r"(c2)                     \
        : "d"(t2),"g"(0)                        \
        : "cc");                                \
    } while (0)

/*
 * c += 2*a*b, computed as two full three-word additions of the product
 * (t2,t1). The earlier formulation doubled (t2,t1) in place and then added
 * the carry from c0 += t1 into t2; when t2 was already all-ones that
 * addition wrapped to zero and the carry into c2 was lost, giving a wrong
 * result for some inputs.
 */
#define mul_add_c2(a,b,c0,c1,c2) do {               \
    __asm__ ("mulq %3"                              \
        : "=a"(t1),"=d"(t2)                         \
        : "a"(a),"m"(b)                             \
        : "cc");                                    \
    __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
        : "+r"(c0),"+r"(c1),"+r"(c2)                \
        : "r"(t1),"r"(t2),"g"(0)                    \
        : "cc");                                    \
    __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"   \
        : "+r"(c0),"+r"(c1),"+r"(c2)                \
        : "r"(t1),"r"(t2),"g"(0)                    \
        : "cc");                                    \
    } while (0)
#endif

345109998Smarkm#define sqr_add_c2(a,i,j,c0,c1,c2) \ 346109998Smarkm mul_add_c2((a)[i],(a)[j],c0,c1,c2) 347109998Smarkm 348109998Smarkmvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 349109998Smarkm { 350109998Smarkm BN_ULONG t1,t2; 351109998Smarkm BN_ULONG c1,c2,c3; 352109998Smarkm 353109998Smarkm c1=0; 354109998Smarkm c2=0; 355109998Smarkm c3=0; 356109998Smarkm mul_add_c(a[0],b[0],c1,c2,c3); 357109998Smarkm r[0]=c1; 358109998Smarkm c1=0; 359109998Smarkm mul_add_c(a[0],b[1],c2,c3,c1); 360109998Smarkm mul_add_c(a[1],b[0],c2,c3,c1); 361109998Smarkm r[1]=c2; 362109998Smarkm c2=0; 363109998Smarkm mul_add_c(a[2],b[0],c3,c1,c2); 364109998Smarkm mul_add_c(a[1],b[1],c3,c1,c2); 365109998Smarkm mul_add_c(a[0],b[2],c3,c1,c2); 366109998Smarkm r[2]=c3; 367109998Smarkm c3=0; 368109998Smarkm mul_add_c(a[0],b[3],c1,c2,c3); 369109998Smarkm mul_add_c(a[1],b[2],c1,c2,c3); 370109998Smarkm mul_add_c(a[2],b[1],c1,c2,c3); 371109998Smarkm mul_add_c(a[3],b[0],c1,c2,c3); 372109998Smarkm r[3]=c1; 373109998Smarkm c1=0; 374109998Smarkm mul_add_c(a[4],b[0],c2,c3,c1); 375109998Smarkm mul_add_c(a[3],b[1],c2,c3,c1); 376109998Smarkm mul_add_c(a[2],b[2],c2,c3,c1); 377109998Smarkm mul_add_c(a[1],b[3],c2,c3,c1); 378109998Smarkm mul_add_c(a[0],b[4],c2,c3,c1); 379109998Smarkm r[4]=c2; 380109998Smarkm c2=0; 381109998Smarkm mul_add_c(a[0],b[5],c3,c1,c2); 382109998Smarkm mul_add_c(a[1],b[4],c3,c1,c2); 383109998Smarkm mul_add_c(a[2],b[3],c3,c1,c2); 384109998Smarkm mul_add_c(a[3],b[2],c3,c1,c2); 385109998Smarkm mul_add_c(a[4],b[1],c3,c1,c2); 386109998Smarkm mul_add_c(a[5],b[0],c3,c1,c2); 387109998Smarkm r[5]=c3; 388109998Smarkm c3=0; 389109998Smarkm mul_add_c(a[6],b[0],c1,c2,c3); 390109998Smarkm mul_add_c(a[5],b[1],c1,c2,c3); 391109998Smarkm mul_add_c(a[4],b[2],c1,c2,c3); 392109998Smarkm mul_add_c(a[3],b[3],c1,c2,c3); 393109998Smarkm mul_add_c(a[2],b[4],c1,c2,c3); 394109998Smarkm mul_add_c(a[1],b[5],c1,c2,c3); 395109998Smarkm mul_add_c(a[0],b[6],c1,c2,c3); 396109998Smarkm r[6]=c1; 397109998Smarkm 
c1=0; 398109998Smarkm mul_add_c(a[0],b[7],c2,c3,c1); 399109998Smarkm mul_add_c(a[1],b[6],c2,c3,c1); 400109998Smarkm mul_add_c(a[2],b[5],c2,c3,c1); 401109998Smarkm mul_add_c(a[3],b[4],c2,c3,c1); 402109998Smarkm mul_add_c(a[4],b[3],c2,c3,c1); 403109998Smarkm mul_add_c(a[5],b[2],c2,c3,c1); 404109998Smarkm mul_add_c(a[6],b[1],c2,c3,c1); 405109998Smarkm mul_add_c(a[7],b[0],c2,c3,c1); 406109998Smarkm r[7]=c2; 407109998Smarkm c2=0; 408109998Smarkm mul_add_c(a[7],b[1],c3,c1,c2); 409109998Smarkm mul_add_c(a[6],b[2],c3,c1,c2); 410109998Smarkm mul_add_c(a[5],b[3],c3,c1,c2); 411109998Smarkm mul_add_c(a[4],b[4],c3,c1,c2); 412109998Smarkm mul_add_c(a[3],b[5],c3,c1,c2); 413109998Smarkm mul_add_c(a[2],b[6],c3,c1,c2); 414109998Smarkm mul_add_c(a[1],b[7],c3,c1,c2); 415109998Smarkm r[8]=c3; 416109998Smarkm c3=0; 417109998Smarkm mul_add_c(a[2],b[7],c1,c2,c3); 418109998Smarkm mul_add_c(a[3],b[6],c1,c2,c3); 419109998Smarkm mul_add_c(a[4],b[5],c1,c2,c3); 420109998Smarkm mul_add_c(a[5],b[4],c1,c2,c3); 421109998Smarkm mul_add_c(a[6],b[3],c1,c2,c3); 422109998Smarkm mul_add_c(a[7],b[2],c1,c2,c3); 423109998Smarkm r[9]=c1; 424109998Smarkm c1=0; 425109998Smarkm mul_add_c(a[7],b[3],c2,c3,c1); 426109998Smarkm mul_add_c(a[6],b[4],c2,c3,c1); 427109998Smarkm mul_add_c(a[5],b[5],c2,c3,c1); 428109998Smarkm mul_add_c(a[4],b[6],c2,c3,c1); 429109998Smarkm mul_add_c(a[3],b[7],c2,c3,c1); 430109998Smarkm r[10]=c2; 431109998Smarkm c2=0; 432109998Smarkm mul_add_c(a[4],b[7],c3,c1,c2); 433109998Smarkm mul_add_c(a[5],b[6],c3,c1,c2); 434109998Smarkm mul_add_c(a[6],b[5],c3,c1,c2); 435109998Smarkm mul_add_c(a[7],b[4],c3,c1,c2); 436109998Smarkm r[11]=c3; 437109998Smarkm c3=0; 438109998Smarkm mul_add_c(a[7],b[5],c1,c2,c3); 439109998Smarkm mul_add_c(a[6],b[6],c1,c2,c3); 440109998Smarkm mul_add_c(a[5],b[7],c1,c2,c3); 441109998Smarkm r[12]=c1; 442109998Smarkm c1=0; 443109998Smarkm mul_add_c(a[6],b[7],c2,c3,c1); 444109998Smarkm mul_add_c(a[7],b[6],c2,c3,c1); 445109998Smarkm r[13]=c2; 446109998Smarkm c2=0; 447109998Smarkm 
mul_add_c(a[7],b[7],c3,c1,c2); 448109998Smarkm r[14]=c3; 449109998Smarkm r[15]=c1; 450109998Smarkm } 451109998Smarkm 452109998Smarkmvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 453109998Smarkm { 454109998Smarkm BN_ULONG t1,t2; 455109998Smarkm BN_ULONG c1,c2,c3; 456109998Smarkm 457109998Smarkm c1=0; 458109998Smarkm c2=0; 459109998Smarkm c3=0; 460109998Smarkm mul_add_c(a[0],b[0],c1,c2,c3); 461109998Smarkm r[0]=c1; 462109998Smarkm c1=0; 463109998Smarkm mul_add_c(a[0],b[1],c2,c3,c1); 464109998Smarkm mul_add_c(a[1],b[0],c2,c3,c1); 465109998Smarkm r[1]=c2; 466109998Smarkm c2=0; 467109998Smarkm mul_add_c(a[2],b[0],c3,c1,c2); 468109998Smarkm mul_add_c(a[1],b[1],c3,c1,c2); 469109998Smarkm mul_add_c(a[0],b[2],c3,c1,c2); 470109998Smarkm r[2]=c3; 471109998Smarkm c3=0; 472109998Smarkm mul_add_c(a[0],b[3],c1,c2,c3); 473109998Smarkm mul_add_c(a[1],b[2],c1,c2,c3); 474109998Smarkm mul_add_c(a[2],b[1],c1,c2,c3); 475109998Smarkm mul_add_c(a[3],b[0],c1,c2,c3); 476109998Smarkm r[3]=c1; 477109998Smarkm c1=0; 478109998Smarkm mul_add_c(a[3],b[1],c2,c3,c1); 479109998Smarkm mul_add_c(a[2],b[2],c2,c3,c1); 480109998Smarkm mul_add_c(a[1],b[3],c2,c3,c1); 481109998Smarkm r[4]=c2; 482109998Smarkm c2=0; 483109998Smarkm mul_add_c(a[2],b[3],c3,c1,c2); 484109998Smarkm mul_add_c(a[3],b[2],c3,c1,c2); 485109998Smarkm r[5]=c3; 486109998Smarkm c3=0; 487109998Smarkm mul_add_c(a[3],b[3],c1,c2,c3); 488109998Smarkm r[6]=c1; 489109998Smarkm r[7]=c2; 490109998Smarkm } 491109998Smarkm 492205128Ssimonvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 493109998Smarkm { 494109998Smarkm BN_ULONG t1,t2; 495109998Smarkm BN_ULONG c1,c2,c3; 496109998Smarkm 497109998Smarkm c1=0; 498109998Smarkm c2=0; 499109998Smarkm c3=0; 500109998Smarkm sqr_add_c(a,0,c1,c2,c3); 501109998Smarkm r[0]=c1; 502109998Smarkm c1=0; 503109998Smarkm sqr_add_c2(a,1,0,c2,c3,c1); 504109998Smarkm r[1]=c2; 505109998Smarkm c2=0; 506109998Smarkm sqr_add_c(a,1,c3,c1,c2); 507109998Smarkm sqr_add_c2(a,2,0,c3,c1,c2); 508109998Smarkm r[2]=c3; 
509109998Smarkm c3=0; 510109998Smarkm sqr_add_c2(a,3,0,c1,c2,c3); 511109998Smarkm sqr_add_c2(a,2,1,c1,c2,c3); 512109998Smarkm r[3]=c1; 513109998Smarkm c1=0; 514109998Smarkm sqr_add_c(a,2,c2,c3,c1); 515109998Smarkm sqr_add_c2(a,3,1,c2,c3,c1); 516109998Smarkm sqr_add_c2(a,4,0,c2,c3,c1); 517109998Smarkm r[4]=c2; 518109998Smarkm c2=0; 519109998Smarkm sqr_add_c2(a,5,0,c3,c1,c2); 520109998Smarkm sqr_add_c2(a,4,1,c3,c1,c2); 521109998Smarkm sqr_add_c2(a,3,2,c3,c1,c2); 522109998Smarkm r[5]=c3; 523109998Smarkm c3=0; 524109998Smarkm sqr_add_c(a,3,c1,c2,c3); 525109998Smarkm sqr_add_c2(a,4,2,c1,c2,c3); 526109998Smarkm sqr_add_c2(a,5,1,c1,c2,c3); 527109998Smarkm sqr_add_c2(a,6,0,c1,c2,c3); 528109998Smarkm r[6]=c1; 529109998Smarkm c1=0; 530109998Smarkm sqr_add_c2(a,7,0,c2,c3,c1); 531109998Smarkm sqr_add_c2(a,6,1,c2,c3,c1); 532109998Smarkm sqr_add_c2(a,5,2,c2,c3,c1); 533109998Smarkm sqr_add_c2(a,4,3,c2,c3,c1); 534109998Smarkm r[7]=c2; 535109998Smarkm c2=0; 536109998Smarkm sqr_add_c(a,4,c3,c1,c2); 537109998Smarkm sqr_add_c2(a,5,3,c3,c1,c2); 538109998Smarkm sqr_add_c2(a,6,2,c3,c1,c2); 539109998Smarkm sqr_add_c2(a,7,1,c3,c1,c2); 540109998Smarkm r[8]=c3; 541109998Smarkm c3=0; 542109998Smarkm sqr_add_c2(a,7,2,c1,c2,c3); 543109998Smarkm sqr_add_c2(a,6,3,c1,c2,c3); 544109998Smarkm sqr_add_c2(a,5,4,c1,c2,c3); 545109998Smarkm r[9]=c1; 546109998Smarkm c1=0; 547109998Smarkm sqr_add_c(a,5,c2,c3,c1); 548109998Smarkm sqr_add_c2(a,6,4,c2,c3,c1); 549109998Smarkm sqr_add_c2(a,7,3,c2,c3,c1); 550109998Smarkm r[10]=c2; 551109998Smarkm c2=0; 552109998Smarkm sqr_add_c2(a,7,4,c3,c1,c2); 553109998Smarkm sqr_add_c2(a,6,5,c3,c1,c2); 554109998Smarkm r[11]=c3; 555109998Smarkm c3=0; 556109998Smarkm sqr_add_c(a,6,c1,c2,c3); 557109998Smarkm sqr_add_c2(a,7,5,c1,c2,c3); 558109998Smarkm r[12]=c1; 559109998Smarkm c1=0; 560109998Smarkm sqr_add_c2(a,7,6,c2,c3,c1); 561109998Smarkm r[13]=c2; 562109998Smarkm c2=0; 563109998Smarkm sqr_add_c(a,7,c3,c1,c2); 564109998Smarkm r[14]=c3; 565109998Smarkm r[15]=c1; 
566109998Smarkm } 567109998Smarkm 568205128Ssimonvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 569109998Smarkm { 570109998Smarkm BN_ULONG t1,t2; 571109998Smarkm BN_ULONG c1,c2,c3; 572109998Smarkm 573109998Smarkm c1=0; 574109998Smarkm c2=0; 575109998Smarkm c3=0; 576109998Smarkm sqr_add_c(a,0,c1,c2,c3); 577109998Smarkm r[0]=c1; 578109998Smarkm c1=0; 579109998Smarkm sqr_add_c2(a,1,0,c2,c3,c1); 580109998Smarkm r[1]=c2; 581109998Smarkm c2=0; 582109998Smarkm sqr_add_c(a,1,c3,c1,c2); 583109998Smarkm sqr_add_c2(a,2,0,c3,c1,c2); 584109998Smarkm r[2]=c3; 585109998Smarkm c3=0; 586109998Smarkm sqr_add_c2(a,3,0,c1,c2,c3); 587109998Smarkm sqr_add_c2(a,2,1,c1,c2,c3); 588109998Smarkm r[3]=c1; 589109998Smarkm c1=0; 590109998Smarkm sqr_add_c(a,2,c2,c3,c1); 591109998Smarkm sqr_add_c2(a,3,1,c2,c3,c1); 592109998Smarkm r[4]=c2; 593109998Smarkm c2=0; 594109998Smarkm sqr_add_c2(a,3,2,c3,c1,c2); 595109998Smarkm r[5]=c3; 596109998Smarkm c3=0; 597109998Smarkm sqr_add_c(a,3,c1,c2,c3); 598109998Smarkm r[6]=c1; 599109998Smarkm r[7]=c2; 600109998Smarkm } 601162911Ssimon#endif 602