/* x86_64-gcc.c revision 109998 */
/*
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there must be a room for fine-tuning.
 *
 * Q. Why inline assembler?
 * A. x86_64 features own ABI I'm not familiar with. Which is why
 *    I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation.
 *
 * Q. How much faster does it get?
 * A. Unfortunately people sitting on x86_64 hardware are prohibited
 *    to disclose the performance numbers, so they (SuSE labs to be
 *    specific) wouldn't tell me. However! Very similar coding technique
 *    (reaching out for 128-bit result from 64x64-bit multiplication)
 *    results in >3 times performance improvement on MIPS and I see no
 *    reason why gain on x86_64 would be so much different:-)
 */

#define BN_ULONG unsigned long

/*
 * "m"(a), "+m"(r)  is the way to favor DirectPath micro-code;
 * "g"(0)           let the compiler to decide where does it
 *                  want to keep the value of zero;
 *
 * The keyword is spelled __asm__ so that the file also builds when the
 * compiler is run in a strict ISO mode (-std=c99, -std=c11, -ansi), where
 * plain "asm" is not recognised.
 */

/*
 * mul_add(r,a,word,carry): (carry, r) = a * word + r + carry.
 * r and a must be memory lvalues, carry a BN_ULONG lvalue.
 * The sum always fits in 128 bits, so no carry is lost.
 */
#define mul_add(r,a,word,carry) do {            \
    BN_ULONG high, low;                         \
    __asm__ ("mulq %3"                          \
        : "=a"(low), "=d"(high)                 \
        : "a"(word), "m"(a)                     \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+r"(carry), "+d"(high)               \
        : "a"(low), "g"(0)                      \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+m"(r), "+d"(high)                   \
        : "r"(carry), "g"(0)                    \
        : "cc");                                \
    (carry) = high;                             \
    } while (0)

/*
 * mul(r,a,word,carry): (carry, r) = a * word + carry.
 * The multiplicand uses "rm" rather than "g": mulq has no immediate form,
 * so the compiler must never be allowed to pick an immediate operand.
 */
#define mul(r,a,word,carry) do {                \
    BN_ULONG high, low;                         \
    __asm__ ("mulq %3"                          \
        : "=a"(low), "=d"(high)                 \
        : "a"(word), "rm"(a)                    \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+r"(carry), "+d"(high)               \
        : "a"(low), "g"(0)                      \
        : "cc");                                \
    (r) = (carry), (carry) = high;              \
    } while (0)

/*
 * sqr(r0,r1,a): (r1, r0) = a * a.
 * a is loaded into %rax and multiplied by itself (operand 2 is %rax).
 */
#define sqr(r0,r1,a) do {                       \
    __asm__ ("mulq %2"                          \
        : "=a"(r0), "=d"(r1)                    \
        : "a"(a)                                \
        : "cc");                                \
    } while (0)

/*
 * rp[i] += ap[i] * w for i in [0, num), propagating the carry upwards.
 * Returns the carry word out of the most significant position, or 0
 * when num <= 0 (in which case nothing is touched).
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return c1;

    /* Four words per iteration while at least four remain. */
    while (num & ~3) {
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }

    /* One to three trailing words. */
    if (num) {
        mul_add(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[2], ap[2], w, c1);
        return c1;
    }

    return c1;
}

/*
 * rp[i] = low word of (ap[i] * w + carry) for i in [0, num).
 * Returns the final carry word, or 0 when num <= 0.
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return c1;

    while (num & ~3) {
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }

    if (num) {
        mul(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[2], ap[2], w, c1);
    }

    return c1;
}

/*
 * For each i in [0, n): (r[2*i+1], r[2*i]) = a[i] * a[i].
 * r therefore receives 2*n words. Does nothing when n <= 0.
 */
void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
{
    if (n <= 0)
        return;

    while (n & ~3) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }

    if (n) {
        sqr(r[0], r[1], a[0]);
        if (--n == 0)
            return;
        sqr(r[2], r[3], a[1]);
        if (--n == 0)
            return;
        sqr(r[4], r[5], a[2]);
    }
}
/*
 * BN_ULONG is normally defined near the top of this file; the fallback
 * keeps this section self-contained.
 */
#ifndef BN_ULONG
# define BN_ULONG unsigned long
#endif

/*
 * Divide the 128-bit value (h:l) by d and return the 64-bit quotient;
 * the remainder is discarded.
 *
 * divq takes its dividend in %rdx:%rax and faults (#DE) when d is zero or
 * the quotient does not fit in 64 bits, so callers must pass h < d.
 *
 * The divisor is operand 4. (Operand 3 is h, which already lives in %rdx;
 * dividing by it can never produce a representable quotient.) It is given
 * the "r" constraint because divq has no immediate form.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG ret, waste;

    __asm__ ("divq %4"
        : "=a"(ret), "=d"(waste)
        : "a"(l), "d"(h), "r"(d)
        : "cc");

    (void)waste;    /* remainder is not needed */
    return ret;
}

/*
 * rp[i] = ap[i] + bp[i] + carry for i in [0, n).
 * Returns the carry out of the top word (0 or 1); returns 0 when n <= 0.
 *
 * Notes on the asm:
 *  - "subq %2,%2" zeroes the index and clears CF before the first adcq;
 *    movq, leaq and loop leave CF untouched between iterations.
 *  - The counter is a 64-bit local because "loop" decrements the whole
 *    of %rcx, not just the low 32 bits that hold an int.
 *  - ret and i are written before the inputs are last read, hence the
 *    early-clobber "=&r" outputs.
 *  - The asm stores through rp, which the operand list cannot express,
 *    hence volatile and the "memory" clobber.
 */
BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp, int n)
{
    BN_ULONG ret, i, cnt;

    if (n <= 0)
        return 0;

    cnt = (BN_ULONG)n;

    __asm__ volatile (
        "   subq    %2,%2           \n"
        ".p2align 4                 \n"
        "1: movq    (%4,%2,8),%0    \n"
        "   adcq    (%5,%2,8),%0    \n"
        "   movq    %0,(%3,%2,8)    \n"
        "   leaq    1(%2),%2        \n"
        "   loop    1b              \n"
        "   sbbq    %0,%0           \n"
        : "=&r"(ret), "+c"(cnt), "=&r"(i)
        : "r"(rp), "r"(ap), "r"(bp)
        : "cc", "memory");

    return ret & 1;
}

#ifndef SIMICS
/*
 * rp[i] = ap[i] - bp[i] - borrow for i in [0, n).
 * Returns the borrow out of the top word (0 or 1); returns 0 when n <= 0.
 * Same structure and operand constraints as bn_add_words, with sbbq in
 * place of adcq.
 */
BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp, int n)
{
    BN_ULONG ret, i, cnt;

    if (n <= 0)
        return 0;

    cnt = (BN_ULONG)n;

    __asm__ volatile (
        "   subq    %2,%2           \n"
        ".p2align 4                 \n"
        "1: movq    (%4,%2,8),%0    \n"
        "   sbbq    (%5,%2,8),%0    \n"
        "   movq    %0,(%3,%2,8)    \n"
        "   leaq    1(%2),%2        \n"
        "   loop    1b              \n"
        "   sbbq    %0,%0           \n"
        : "=&r"(ret), "+c"(cnt), "=&r"(i)
        : "r"(rp), "r"(ap), "r"(bp)
        : "cc", "memory");

    return ret & 1;
}
#else
/* Simics 1.4<7 has buggy sbbq:-( */
#define BN_MASK2 0xffffffffffffffffL
/*
 * Portable C replacement for the asm version above:
 * r[i] = a[i] - b[i] - borrow for i in [0, n); returns the final borrow.
 */
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
    BN_ULONG t1, t2;
    int c = 0;
    int k;

    if (n <= 0)
        return (BN_ULONG)0;

    for (k = 0; k < n; k++) {
        t1 = a[k];
        t2 = b[k];
        r[k] = (t1 - t2 - c) & BN_MASK2;
        /* When the words are equal the incoming borrow passes through. */
        if (t1 != t2)
            c = (t1 < t2);
    }
    return c;
}
#endif

/* mul_add_c(a,b,c0,c1,c2)    -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)   -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)    -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */

/*
 * All of the macros below use the BN_ULONG scratch variables t1 and t2,
 * which must be declared in the calling function.
 */

#if 0
/* original macros are kept for reference purposes */
/*
 * NOTE: this reference mul_add_c2 can drop a carry: t2 may already be all
 * ones when the carry out of "c0 += t1" is added to it.
 */
#define mul_add_c(a,b,c0,c1,c2) {       \
    BN_ULONG ta=(a),tb=(b);             \
    t1 = ta * tb;                       \
    t2 = BN_UMULT_HIGH(ta,tb);          \
    c0 += t1; t2 += (c0<t1)?1:0;        \
    c1 += t2; c2 += (c1<t2)?1:0;        \
    }

#define mul_add_c2(a,b,c0,c1,c2) {      \
    BN_ULONG ta=(a),tb=(b),t0;          \
    t1 = BN_UMULT_HIGH(ta,tb);          \
    t0 = ta * tb;                       \
    t2 = t1+t1; c2 += (t2<t1)?1:0;      \
    t1 = t0+t0; t2 += (t1<t0)?1:0;      \
    c0 += t1; t2 += (c0<t1)?1:0;        \
    c1 += t2; c2 += (c1<t2)?1:0;        \
    }
#else
/*
 * (t2,t1) = a*b; c0 += t1 with the carry folded into t2 (which cannot
 * overflow, the high word of a product is at most 2^64-2); then
 * c1 += t2 with the carry into c2.
 */
#define mul_add_c(a,b,c0,c1,c2) do {            \
    __asm__ ("mulq %3"                          \
        : "=a"(t1), "=d"(t2)                    \
        : "a"(a), "m"(b)                        \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+r"(c0), "+d"(t2)                    \
        : "a"(t1), "g"(0)                       \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+r"(c1), "+r"(c2)                    \
        : "d"(t2), "g"(0)                       \
        : "cc");                                \
    } while (0)

/* Same as mul_add_c with both factors equal to a[i]. */
#define sqr_add_c(a,i,c0,c1,c2) do {            \
    __asm__ ("mulq %2"                          \
        : "=a"(t1), "=d"(t2)                    \
        : "a"((a)[i])                           \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+r"(c0), "+d"(t2)                    \
        : "a"(t1), "g"(0)                       \
        : "cc");                                \
    __asm__ ("addq %2,%0; adcq %3,%1"           \
        : "+r"(c1), "+r"(c2)                    \
        : "d"(t2), "g"(0)                       \
        : "cc");                                \
    } while (0)

/*
 * c += 2*a*b, done as two additions of the product (t2,t1) into
 * (c2,c1,c0), each with a full three-word carry chain.
 *
 * Doubling (t2,t1) in place first and then folding the carry of
 * "c0 += t1" into t2 is NOT safe: the doubled high word can already be
 * all ones, so that carry would wrap t2 to zero and be lost.
 */
#define mul_add_c2(a,b,c0,c1,c2) do {                   \
    __asm__ ("mulq %3"                                  \
        : "=a"(t1), "=d"(t2)                            \
        : "a"(a), "m"(b)                                \
        : "cc");                                        \
    __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
        : "+r"(c0), "+r"(c1), "+r"(c2)                  \
        : "r"(t1), "r"(t2), "g"(0)                      \
        : "cc");                                        \
    __asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
        : "+r"(c0), "+r"(c1), "+r"(c2)                  \
        : "r"(t1), "r"(t2), "g"(0)                      \
        : "cc");                                        \
    } while (0)
#endif

#define sqr_add_c2(a,i,j,c0,c1,c2)      \
    mul_add_c2((a)[i],(a)[j],c0,c1,c2)

/*
 * r[0..15] = a[0..7] * b[0..7], computed column by column: every product
 * a[i]*b[j] with i+j == k is accumulated into a three-word accumulator,
 * whose low word becomes r[k]; the accumulator roles of c1/c2/c3 then
 * rotate for the next column.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1, t2;        /* scratch used by the mul_add_c macros */
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/* r[0..7] = a[0..3] * b[0..3]; same column scheme as bn_mul_comba8. */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG t1, t2;        /* scratch used by the mul_add_c macros */
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

/*
 * r[0..15] = a[0..7]^2. Each column k adds a[i]^2 once (when 2*i == k)
 * and every cross product a[i]*a[j], i > j, i+j == k, twice.
 */
void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
{
    BN_ULONG t1, t2;        /* scratch used by the sqr_add_c macros */
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/* r[0..7] = a[0..3]^2; same column scheme as bn_sqr_comba8. */
void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
{
    BN_ULONG t1, t2;        /* scratch used by the sqr_add_c macros */
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}