1205128Ssimon#include "../bn_lcl.h" 2162911Ssimon#ifdef __SUNPRO_C 3296465Sdelphij# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ 4162911Ssimon#else 5296465Sdelphij/*- 6109998Smarkm * x86_64 BIGNUM accelerator version 0.1, December 2002. 7109998Smarkm * 8109998Smarkm * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 9109998Smarkm * project. 10109998Smarkm * 11109998Smarkm * Rights for redistribution and usage in source and binary forms are 12109998Smarkm * granted according to the OpenSSL license. Warranty of any kind is 13109998Smarkm * disclaimed. 14109998Smarkm * 15109998Smarkm * Q. Version 0.1? It doesn't sound like Andy, he used to assign real 16109998Smarkm * versions, like 1.0... 17109998Smarkm * A. Well, that's because this code is basically a quick-n-dirty 18109998Smarkm * proof-of-concept hack. As you can see it's implemented with 19109998Smarkm * inline assembler, which means that you're bound to GCC and that 20160814Ssimon * there might be enough room for further improvement. 21109998Smarkm * 22109998Smarkm * Q. Why inline assembler? 23160814Ssimon * A. x86_64 features own ABI which I'm not familiar with. This is 24160814Ssimon * why I decided to let the compiler take care of subroutine 25160814Ssimon * prologue/epilogue as well as register allocation. For reference. 26160814Ssimon * Win64 implements different ABI for AMD64, different from Linux. 27109998Smarkm * 28109998Smarkm * Q. How much faster does it get? 29160814Ssimon * A. 'apps/openssl speed rsa dsa' output with no-asm: 30160814Ssimon * 31296465Sdelphij * sign verify sign/s verify/s 32296465Sdelphij * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 33296465Sdelphij * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 34296465Sdelphij * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 35296465Sdelphij * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 36296465Sdelphij * sign verify sign/s verify/s 37296465Sdelphij * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 38296465Sdelphij * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 39296465Sdelphij * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 40160814Ssimon * 41160814Ssimon * 'apps/openssl speed rsa dsa' output with this module: 42160814Ssimon * 43296465Sdelphij * sign verify sign/s verify/s 44296465Sdelphij * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 45296465Sdelphij * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 46296465Sdelphij * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 47296465Sdelphij * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 48296465Sdelphij * sign verify sign/s verify/s 49296465Sdelphij * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 50296465Sdelphij * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 51296465Sdelphij * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 52160814Ssimon * 53160814Ssimon * For the reference. IA-32 assembler implementation performs 54160814Ssimon * very much like 64-bit code compiled with no-asm on the same 55160814Ssimon * machine. 56109998Smarkm */ 57109998Smarkm 58296465Sdelphij# define BN_ULONG unsigned long 59109998Smarkm 60296465Sdelphij# undef mul 61296465Sdelphij# undef mul_add 62296465Sdelphij# undef sqr 63205128Ssimon 64296465Sdelphij/*- 65296465Sdelphij * "m"(a), "+m"(r) is the way to favor DirectPath �-code; 66296465Sdelphij * "g"(0) let the compiler to decide where does it 67296465Sdelphij * want to keep the value of zero; 68109998Smarkm */ 69296465Sdelphij# define mul_add(r,a,word,carry) do { \ 70296465Sdelphij register BN_ULONG high,low; \ 71296465Sdelphij asm ("mulq %3" \ 72296465Sdelphij : "=a"(low),"=d"(high) \ 73296465Sdelphij : "a"(word),"m"(a) \ 74296465Sdelphij : "cc"); \ 75296465Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 76296465Sdelphij : "+r"(carry),"+d"(high)\ 77296465Sdelphij : "a"(low),"g"(0) \ 78296465Sdelphij : "cc"); \ 79296465Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 80296465Sdelphij : "+m"(r),"+d"(high) \ 81296465Sdelphij : "r"(carry),"g"(0) \ 82296465Sdelphij : "cc"); \ 83296465Sdelphij carry=high; \ 84296465Sdelphij } while (0) 85109998Smarkm 86296465Sdelphij# define mul(r,a,word,carry) do { \ 87296465Sdelphij register BN_ULONG high,low; \ 88296465Sdelphij asm ("mulq %3" \ 89296465Sdelphij : "=a"(low),"=d"(high) \ 90296465Sdelphij : "a"(word),"g"(a) \ 91296465Sdelphij : "cc"); \ 92296465Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 93296465Sdelphij : "+r"(carry),"+d"(high)\ 94296465Sdelphij : "a"(low),"g"(0) \ 95296465Sdelphij : "cc"); \ 96296465Sdelphij (r)=carry, carry=high; \ 97296465Sdelphij } while (0) 98109998Smarkm 99296465Sdelphij# define sqr(r0,r1,a) \ 100296465Sdelphij asm ("mulq %2" \ 101296465Sdelphij : "=a"(r0),"=d"(r1) \ 102296465Sdelphij : "a"(a) \ 103296465Sdelphij : "cc"); 104109998Smarkm 105296465SdelphijBN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 106296465Sdelphij BN_ULONG w) 107296465Sdelphij{ 108296465Sdelphij BN_ULONG c1 = 0; 109109998Smarkm 110296465Sdelphij if (num <= 0) 111296465Sdelphij return (c1); 112109998Smarkm 113296465Sdelphij while (num & ~3) { 114296465Sdelphij mul_add(rp[0], ap[0], w, c1); 115296465Sdelphij mul_add(rp[1], ap[1], w, c1); 116296465Sdelphij mul_add(rp[2], ap[2], w, c1); 117296465Sdelphij mul_add(rp[3], ap[3], w, c1); 118296465Sdelphij ap += 4; 119296465Sdelphij rp += 4; 120296465Sdelphij num -= 4; 121296465Sdelphij } 122296465Sdelphij if (num) { 123296465Sdelphij mul_add(rp[0], ap[0], w, c1); 124296465Sdelphij if (--num == 0) 125296465Sdelphij return c1; 126296465Sdelphij mul_add(rp[1], ap[1], w, c1); 127296465Sdelphij if (--num == 0) 128296465Sdelphij return c1; 129296465Sdelphij mul_add(rp[2], ap[2], w, c1); 130296465Sdelphij return c1; 131296465Sdelphij } 132109998Smarkm 133296465Sdelphij return (c1); 134296465Sdelphij} 135296465Sdelphij 136205128SsimonBN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 137296465Sdelphij{ 138296465Sdelphij BN_ULONG c1 = 0; 139109998Smarkm 140296465Sdelphij if (num <= 0) 141296465Sdelphij return (c1); 142109998Smarkm 143296465Sdelphij while (num & ~3) { 144296465Sdelphij mul(rp[0], ap[0], w, c1); 145296465Sdelphij mul(rp[1], ap[1], w, c1); 146296465Sdelphij mul(rp[2], ap[2], w, c1); 147296465Sdelphij mul(rp[3], ap[3], w, c1); 148296465Sdelphij ap += 4; 149296465Sdelphij rp += 4; 150296465Sdelphij num -= 4; 151296465Sdelphij } 152296465Sdelphij if (num) { 153296465Sdelphij mul(rp[0], ap[0], w, c1); 154296465Sdelphij if (--num == 0) 155296465Sdelphij return c1; 156296465Sdelphij mul(rp[1], ap[1], w, c1); 157296465Sdelphij if (--num == 0) 158296465Sdelphij return c1; 159296465Sdelphij mul(rp[2], ap[2], w, c1); 160296465Sdelphij } 161296465Sdelphij return (c1); 162296465Sdelphij} 163109998Smarkm 164205128Ssimonvoid bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 165296465Sdelphij{ 166296465Sdelphij if (n <= 0) 167296465Sdelphij return; 168109998Smarkm 169296465Sdelphij while (n & ~3) { 170296465Sdelphij sqr(r[0], r[1], a[0]); 171296465Sdelphij sqr(r[2], r[3], a[1]); 172296465Sdelphij sqr(r[4], r[5], a[2]); 173296465Sdelphij sqr(r[6], r[7], a[3]); 174296465Sdelphij a += 4; 175296465Sdelphij r += 8; 176296465Sdelphij n -= 4; 177296465Sdelphij } 178296465Sdelphij if (n) { 179296465Sdelphij sqr(r[0], r[1], a[0]); 180296465Sdelphij if (--n == 0) 181296465Sdelphij return; 182296465Sdelphij sqr(r[2], r[3], a[1]); 183296465Sdelphij if (--n == 0) 184296465Sdelphij return; 185296465Sdelphij sqr(r[4], r[5], a[2]); 186296465Sdelphij } 187296465Sdelphij} 188109998Smarkm 189109998SmarkmBN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 190296465Sdelphij{ 191296465Sdelphij BN_ULONG ret, waste; 192109998Smarkm 193296465Sdelphij asm("divq %4":"=a"(ret), "=d"(waste) 194296465Sdelphij : "a"(l), "d"(h), "g"(d) 195296465Sdelphij : "cc"); 196109998Smarkm 197296465Sdelphij return ret; 198109998Smarkm} 199109998Smarkm 200296465SdelphijBN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 201296465Sdelphij int n) 202296465Sdelphij{ 203296465Sdelphij BN_ULONG ret = 0, i = 0; 204109998Smarkm 205296465Sdelphij if (n <= 0) 206296465Sdelphij return 0; 207109998Smarkm 208296465Sdelphij asm volatile (" subq %2,%2 \n" 209296465Sdelphij ".align 16 \n" 210296465Sdelphij "1: movq (%4,%2,8),%0 \n" 211296465Sdelphij " adcq (%5,%2,8),%0 \n" 212296465Sdelphij " movq %0,(%3,%2,8) \n" 213296465Sdelphij " leaq 1(%2),%2 \n" 214296465Sdelphij " loop 1b \n" 215296465Sdelphij " sbbq %0,%0 \n":"=&a" (ret), "+c"(n), 216296465Sdelphij "=&r"(i) 217296465Sdelphij :"r"(rp), "r"(ap), "r"(bp) 218296465Sdelphij :"cc", "memory"); 219109998Smarkm 220296465Sdelphij return ret & 1; 221109998Smarkm} 222109998Smarkm 223296465Sdelphij# ifndef SIMICS 224296465SdelphijBN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 225296465Sdelphij int n) 226296465Sdelphij{ 227296465Sdelphij BN_ULONG ret = 0, i = 0; 228109998Smarkm 229296465Sdelphij if (n <= 0) 230296465Sdelphij return 0; 231109998Smarkm 232296465Sdelphij asm volatile (" subq %2,%2 \n" 233296465Sdelphij ".align 16 \n" 234296465Sdelphij "1: movq (%4,%2,8),%0 \n" 235296465Sdelphij " sbbq (%5,%2,8),%0 \n" 236296465Sdelphij " movq %0,(%3,%2,8) \n" 237296465Sdelphij " leaq 1(%2),%2 \n" 238296465Sdelphij " loop 1b \n" 239296465Sdelphij " sbbq %0,%0 \n":"=&a" (ret), "+c"(n), 240296465Sdelphij "=&r"(i) 241296465Sdelphij :"r"(rp), "r"(ap), "r"(bp) 242296465Sdelphij :"cc", "memory"); 243109998Smarkm 244296465Sdelphij return ret & 1; 245109998Smarkm} 246296465Sdelphij# else 247109998Smarkm/* Simics 1.4<7 has buggy sbbq:-( */ 248296465Sdelphij# define BN_MASK2 0xffffffffffffffffL 249109998SmarkmBN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) 250296465Sdelphij{ 251296465Sdelphij BN_ULONG t1, t2; 252296465Sdelphij int c = 0; 253109998Smarkm 254296465Sdelphij if (n <= 0) 255296465Sdelphij return ((BN_ULONG)0); 256109998Smarkm 257296465Sdelphij for (;;) { 258296465Sdelphij t1 = a[0]; 259296465Sdelphij t2 = b[0]; 260296465Sdelphij r[0] = (t1 - t2 - c) & BN_MASK2; 261296465Sdelphij if (t1 != t2) 262296465Sdelphij c = (t1 < t2); 263296465Sdelphij if (--n <= 0) 264296465Sdelphij break; 265109998Smarkm 266296465Sdelphij t1 = a[1]; 267296465Sdelphij t2 = b[1]; 268296465Sdelphij r[1] = (t1 - t2 - c) & BN_MASK2; 269296465Sdelphij if (t1 != t2) 270296465Sdelphij c = (t1 < t2); 271296465Sdelphij if (--n <= 0) 272296465Sdelphij break; 273109998Smarkm 274296465Sdelphij t1 = a[2]; 275296465Sdelphij t2 = b[2]; 276296465Sdelphij r[2] = (t1 - t2 - c) & BN_MASK2; 277296465Sdelphij if (t1 != t2) 278296465Sdelphij c = (t1 < t2); 279296465Sdelphij if (--n <= 0) 280296465Sdelphij break; 281109998Smarkm 282296465Sdelphij t1 = a[3]; 283296465Sdelphij t2 = b[3]; 284296465Sdelphij r[3] = (t1 - t2 - c) & BN_MASK2; 285296465Sdelphij if (t1 != t2) 286296465Sdelphij c = (t1 < t2); 287296465Sdelphij if (--n <= 0) 288296465Sdelphij break; 289109998Smarkm 290296465Sdelphij a += 4; 291296465Sdelphij b += 4; 292296465Sdelphij r += 4; 293296465Sdelphij } 294296465Sdelphij return (c); 295296465Sdelphij} 296296465Sdelphij# endif 297109998Smarkm 298109998Smarkm/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 299109998Smarkm/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 300109998Smarkm/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 301296465Sdelphij/* 302296465Sdelphij * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number 303296465Sdelphij * c=(c2,c1,c0) 304296465Sdelphij */ 305109998Smarkm 306277195Sdelphij/* 307277195Sdelphij * Keep in mind that carrying into high part of multiplication result 308277195Sdelphij * can not overflow, because it cannot be all-ones. 309277195Sdelphij */ 310296465Sdelphij# if 0 311109998Smarkm/* original macros are kept for reference purposes */ 312296465Sdelphij# define mul_add_c(a,b,c0,c1,c2) { \ 313296465Sdelphij BN_ULONG ta=(a),tb=(b); \ 314296465Sdelphij t1 = ta * tb; \ 315296465Sdelphij t2 = BN_UMULT_HIGH(ta,tb); \ 316296465Sdelphij c0 += t1; t2 += (c0<t1)?1:0; \ 317296465Sdelphij c1 += t2; c2 += (c1<t2)?1:0; \ 318296465Sdelphij } 319109998Smarkm 320296465Sdelphij# define mul_add_c2(a,b,c0,c1,c2) { \ 321296465Sdelphij BN_ULONG ta=(a),tb=(b),t0; \ 322296465Sdelphij t1 = BN_UMULT_HIGH(ta,tb); \ 323296465Sdelphij t0 = ta * tb; \ 324296465Sdelphij c0 += t0; t2 = t1+((c0<t0)?1:0);\ 325296465Sdelphij c1 += t2; c2 += (c1<t2)?1:0; \ 326296465Sdelphij c0 += t0; t1 += (c0<t0)?1:0; \ 327296465Sdelphij c1 += t1; c2 += (c1<t1)?1:0; \ 328296465Sdelphij } 329296465Sdelphij# else 330296465Sdelphij# define mul_add_c(a,b,c0,c1,c2) do { \ 331296465Sdelphij asm ("mulq %3" \ 332296465Sdelphij : "=a"(t1),"=d"(t2) \ 333296465Sdelphij : "a"(a),"m"(b) \ 334296465Sdelphij : "cc"); \ 335296465Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 336296465Sdelphij : "+r"(c0),"+d"(t2) \ 337296465Sdelphij : "a"(t1),"g"(0) \ 338296465Sdelphij : "cc"); \ 339296465Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 340296465Sdelphij : "+r"(c1),"+r"(c2) \ 341296465Sdelphij : "d"(t2),"g"(0) \ 342296465Sdelphij : "cc"); \ 343296465Sdelphij } while (0) 344109998Smarkm 345296465Sdelphij# define sqr_add_c(a,i,c0,c1,c2) do { \ 346296465Sdelphij asm ("mulq %2" \ 347296465Sdelphij : "=a"(t1),"=d"(t2) \ 348296465Sdelphij : "a"(a[i]) \ 349296465Sdelphij : "cc"); \ 350296465Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 351296465Sdelphij : "+r"(c0),"+d"(t2) \ 352296465Sdelphij : "a"(t1),"g"(0) \ 353296465Sdelphij : "cc"); \ 354296465Sdelphij asm ("addq %2,%0; adcq %3,%1" \ 355296465Sdelphij : "+r"(c1),"+r"(c2) \ 356296465Sdelphij : "d"(t2),"g"(0) \ 357296465Sdelphij : "cc"); \ 358296465Sdelphij } while (0) 359109998Smarkm 360296465Sdelphij# define mul_add_c2(a,b,c0,c1,c2) do { \ 361296465Sdelphij asm ("mulq %3" \ 362296465Sdelphij : "=a"(t1),"=d"(t2) \ 363296465Sdelphij : "a"(a),"m"(b) \ 364296465Sdelphij : "cc"); \ 365296465Sdelphij asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ 366296465Sdelphij : "+r"(c0),"+r"(c1),"+r"(c2) \ 367296465Sdelphij : "r"(t1),"r"(t2),"g"(0) \ 368296465Sdelphij : "cc"); \ 369296465Sdelphij asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \ 370296465Sdelphij : "+r"(c0),"+r"(c1),"+r"(c2) \ 371296465Sdelphij : "r"(t1),"r"(t2),"g"(0) \ 372296465Sdelphij : "cc"); \ 373296465Sdelphij } while (0) 374296465Sdelphij# endif 375109998Smarkm 376296465Sdelphij# define sqr_add_c2(a,i,j,c0,c1,c2) \ 377296465Sdelphij mul_add_c2((a)[i],(a)[j],c0,c1,c2) 378109998Smarkm 379109998Smarkmvoid bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 380296465Sdelphij{ 381296465Sdelphij BN_ULONG t1, t2; 382296465Sdelphij BN_ULONG c1, c2, c3; 383109998Smarkm 384296465Sdelphij c1 = 0; 385296465Sdelphij c2 = 0; 386296465Sdelphij c3 = 0; 387296465Sdelphij mul_add_c(a[0], b[0], c1, c2, c3); 388296465Sdelphij r[0] = c1; 389296465Sdelphij c1 = 0; 390296465Sdelphij mul_add_c(a[0], b[1], c2, c3, c1); 391296465Sdelphij mul_add_c(a[1], b[0], c2, c3, c1); 392296465Sdelphij r[1] = c2; 393296465Sdelphij c2 = 0; 394296465Sdelphij mul_add_c(a[2], b[0], c3, c1, c2); 395296465Sdelphij mul_add_c(a[1], b[1], c3, c1, c2); 396296465Sdelphij mul_add_c(a[0], b[2], c3, c1, c2); 397296465Sdelphij r[2] = c3; 398296465Sdelphij c3 = 0; 399296465Sdelphij mul_add_c(a[0], b[3], c1, c2, c3); 400296465Sdelphij mul_add_c(a[1], b[2], c1, c2, c3); 401296465Sdelphij mul_add_c(a[2], b[1], c1, c2, c3); 402296465Sdelphij mul_add_c(a[3], b[0], c1, c2, c3); 403296465Sdelphij r[3] = c1; 404296465Sdelphij c1 = 0; 405296465Sdelphij mul_add_c(a[4], b[0], c2, c3, c1); 406296465Sdelphij mul_add_c(a[3], b[1], c2, c3, c1); 407296465Sdelphij mul_add_c(a[2], b[2], c2, c3, c1); 408296465Sdelphij mul_add_c(a[1], b[3], c2, c3, c1); 409296465Sdelphij mul_add_c(a[0], b[4], c2, c3, c1); 410296465Sdelphij r[4] = c2; 411296465Sdelphij c2 = 0; 412296465Sdelphij mul_add_c(a[0], b[5], c3, c1, c2); 413296465Sdelphij mul_add_c(a[1], b[4], c3, c1, c2); 414296465Sdelphij mul_add_c(a[2], b[3], c3, c1, c2); 415296465Sdelphij mul_add_c(a[3], b[2], c3, c1, c2); 416296465Sdelphij mul_add_c(a[4], b[1], c3, c1, c2); 417296465Sdelphij mul_add_c(a[5], b[0], c3, c1, c2); 418296465Sdelphij r[5] = c3; 419296465Sdelphij c3 = 0; 420296465Sdelphij mul_add_c(a[6], b[0], c1, c2, c3); 421296465Sdelphij mul_add_c(a[5], b[1], c1, c2, c3); 422296465Sdelphij mul_add_c(a[4], b[2], c1, c2, c3); 423296465Sdelphij mul_add_c(a[3], b[3], c1, c2, c3); 424296465Sdelphij mul_add_c(a[2], b[4], c1, c2, c3); 425296465Sdelphij mul_add_c(a[1], b[5], c1, c2, c3); 426296465Sdelphij mul_add_c(a[0], b[6], c1, c2, c3); 427296465Sdelphij r[6] = c1; 428296465Sdelphij c1 = 0; 429296465Sdelphij mul_add_c(a[0], b[7], c2, c3, c1); 430296465Sdelphij mul_add_c(a[1], b[6], c2, c3, c1); 431296465Sdelphij mul_add_c(a[2], b[5], c2, c3, c1); 432296465Sdelphij mul_add_c(a[3], b[4], c2, c3, c1); 433296465Sdelphij mul_add_c(a[4], b[3], c2, c3, c1); 434296465Sdelphij mul_add_c(a[5], b[2], c2, c3, c1); 435296465Sdelphij mul_add_c(a[6], b[1], c2, c3, c1); 436296465Sdelphij mul_add_c(a[7], b[0], c2, c3, c1); 437296465Sdelphij r[7] = c2; 438296465Sdelphij c2 = 0; 439296465Sdelphij mul_add_c(a[7], b[1], c3, c1, c2); 440296465Sdelphij mul_add_c(a[6], b[2], c3, c1, c2); 441296465Sdelphij mul_add_c(a[5], b[3], c3, c1, c2); 442296465Sdelphij mul_add_c(a[4], b[4], c3, c1, c2); 443296465Sdelphij mul_add_c(a[3], b[5], c3, c1, c2); 444296465Sdelphij mul_add_c(a[2], b[6], c3, c1, c2); 445296465Sdelphij mul_add_c(a[1], b[7], c3, c1, c2); 446296465Sdelphij r[8] = c3; 447296465Sdelphij c3 = 0; 448296465Sdelphij mul_add_c(a[2], b[7], c1, c2, c3); 449296465Sdelphij mul_add_c(a[3], b[6], c1, c2, c3); 450296465Sdelphij mul_add_c(a[4], b[5], c1, c2, c3); 451296465Sdelphij mul_add_c(a[5], b[4], c1, c2, c3); 452296465Sdelphij mul_add_c(a[6], b[3], c1, c2, c3); 453296465Sdelphij mul_add_c(a[7], b[2], c1, c2, c3); 454296465Sdelphij r[9] = c1; 455296465Sdelphij c1 = 0; 456296465Sdelphij mul_add_c(a[7], b[3], c2, c3, c1); 457296465Sdelphij mul_add_c(a[6], b[4], c2, c3, c1); 458296465Sdelphij mul_add_c(a[5], b[5], c2, c3, c1); 459296465Sdelphij mul_add_c(a[4], b[6], c2, c3, c1); 460296465Sdelphij mul_add_c(a[3], b[7], c2, c3, c1); 461296465Sdelphij r[10] = c2; 462296465Sdelphij c2 = 0; 463296465Sdelphij mul_add_c(a[4], b[7], c3, c1, c2); 464296465Sdelphij mul_add_c(a[5], b[6], c3, c1, c2); 465296465Sdelphij mul_add_c(a[6], b[5], c3, c1, c2); 466296465Sdelphij mul_add_c(a[7], b[4], c3, c1, c2); 467296465Sdelphij r[11] = c3; 468296465Sdelphij c3 = 0; 469296465Sdelphij mul_add_c(a[7], b[5], c1, c2, c3); 470296465Sdelphij mul_add_c(a[6], b[6], c1, c2, c3); 471296465Sdelphij mul_add_c(a[5], b[7], c1, c2, c3); 472296465Sdelphij r[12] = c1; 473296465Sdelphij c1 = 0; 474296465Sdelphij mul_add_c(a[6], b[7], c2, c3, c1); 475296465Sdelphij mul_add_c(a[7], b[6], c2, c3, c1); 476296465Sdelphij r[13] = c2; 477296465Sdelphij c2 = 0; 478296465Sdelphij mul_add_c(a[7], b[7], c3, c1, c2); 479296465Sdelphij r[14] = c3; 480296465Sdelphij r[15] = c1; 481296465Sdelphij} 482109998Smarkm 483109998Smarkmvoid bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 484296465Sdelphij{ 485296465Sdelphij BN_ULONG t1, t2; 486296465Sdelphij BN_ULONG c1, c2, c3; 487109998Smarkm 488296465Sdelphij c1 = 0; 489296465Sdelphij c2 = 0; 490296465Sdelphij c3 = 0; 491296465Sdelphij mul_add_c(a[0], b[0], c1, c2, c3); 492296465Sdelphij r[0] = c1; 493296465Sdelphij c1 = 0; 494296465Sdelphij mul_add_c(a[0], b[1], c2, c3, c1); 495296465Sdelphij mul_add_c(a[1], b[0], c2, c3, c1); 496296465Sdelphij r[1] = c2; 497296465Sdelphij c2 = 0; 498296465Sdelphij mul_add_c(a[2], b[0], c3, c1, c2); 499296465Sdelphij mul_add_c(a[1], b[1], c3, c1, c2); 500296465Sdelphij mul_add_c(a[0], b[2], c3, c1, c2); 501296465Sdelphij r[2] = c3; 502296465Sdelphij c3 = 0; 503296465Sdelphij mul_add_c(a[0], b[3], c1, c2, c3); 504296465Sdelphij mul_add_c(a[1], b[2], c1, c2, c3); 505296465Sdelphij mul_add_c(a[2], b[1], c1, c2, c3); 506296465Sdelphij mul_add_c(a[3], b[0], c1, c2, c3); 507296465Sdelphij r[3] = c1; 508296465Sdelphij c1 = 0; 509296465Sdelphij mul_add_c(a[3], b[1], c2, c3, c1); 510296465Sdelphij mul_add_c(a[2], b[2], c2, c3, c1); 511296465Sdelphij mul_add_c(a[1], b[3], c2, c3, c1); 512296465Sdelphij r[4] = c2; 513296465Sdelphij c2 = 0; 514296465Sdelphij mul_add_c(a[2], b[3], c3, c1, c2); 515296465Sdelphij mul_add_c(a[3], b[2], c3, c1, c2); 516296465Sdelphij r[5] = c3; 517296465Sdelphij c3 = 0; 518296465Sdelphij mul_add_c(a[3], b[3], c1, c2, c3); 519296465Sdelphij r[6] = c1; 520296465Sdelphij r[7] = c2; 521296465Sdelphij} 522109998Smarkm 523205128Ssimonvoid bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 524296465Sdelphij{ 525296465Sdelphij BN_ULONG t1, t2; 526296465Sdelphij BN_ULONG c1, c2, c3; 527109998Smarkm 528296465Sdelphij c1 = 0; 529296465Sdelphij c2 = 0; 530296465Sdelphij c3 = 0; 531296465Sdelphij sqr_add_c(a, 0, c1, c2, c3); 532296465Sdelphij r[0] = c1; 533296465Sdelphij c1 = 0; 534296465Sdelphij sqr_add_c2(a, 1, 0, c2, c3, c1); 535296465Sdelphij r[1] = c2; 536296465Sdelphij c2 = 0; 537296465Sdelphij sqr_add_c(a, 1, c3, c1, c2); 538296465Sdelphij sqr_add_c2(a, 2, 0, c3, c1, c2); 539296465Sdelphij r[2] = c3; 540296465Sdelphij c3 = 0; 541296465Sdelphij sqr_add_c2(a, 3, 0, c1, c2, c3); 542296465Sdelphij sqr_add_c2(a, 2, 1, c1, c2, c3); 543296465Sdelphij r[3] = c1; 544296465Sdelphij c1 = 0; 545296465Sdelphij sqr_add_c(a, 2, c2, c3, c1); 546296465Sdelphij sqr_add_c2(a, 3, 1, c2, c3, c1); 547296465Sdelphij sqr_add_c2(a, 4, 0, c2, c3, c1); 548296465Sdelphij r[4] = c2; 549296465Sdelphij c2 = 0; 550296465Sdelphij sqr_add_c2(a, 5, 0, c3, c1, c2); 551296465Sdelphij sqr_add_c2(a, 4, 1, c3, c1, c2); 552296465Sdelphij sqr_add_c2(a, 3, 2, c3, c1, c2); 553296465Sdelphij r[5] = c3; 554296465Sdelphij c3 = 0; 555296465Sdelphij sqr_add_c(a, 3, c1, c2, c3); 556296465Sdelphij sqr_add_c2(a, 4, 2, c1, c2, c3); 557296465Sdelphij sqr_add_c2(a, 5, 1, c1, c2, c3); 558296465Sdelphij sqr_add_c2(a, 6, 0, c1, c2, c3); 559296465Sdelphij r[6] = c1; 560296465Sdelphij c1 = 0; 561296465Sdelphij sqr_add_c2(a, 7, 0, c2, c3, c1); 562296465Sdelphij sqr_add_c2(a, 6, 1, c2, c3, c1); 563296465Sdelphij sqr_add_c2(a, 5, 2, c2, c3, c1); 564296465Sdelphij sqr_add_c2(a, 4, 3, c2, c3, c1); 565296465Sdelphij r[7] = c2; 566296465Sdelphij c2 = 0; 567296465Sdelphij sqr_add_c(a, 4, c3, c1, c2); 568296465Sdelphij sqr_add_c2(a, 5, 3, c3, c1, c2); 569296465Sdelphij sqr_add_c2(a, 6, 2, c3, c1, c2); 570296465Sdelphij sqr_add_c2(a, 7, 1, c3, c1, c2); 571296465Sdelphij r[8] = c3; 572296465Sdelphij c3 = 0; 573296465Sdelphij sqr_add_c2(a, 7, 2, c1, c2, c3); 574296465Sdelphij sqr_add_c2(a, 6, 3, c1, c2, c3); 575296465Sdelphij sqr_add_c2(a, 5, 4, c1, c2, c3); 576296465Sdelphij r[9] = c1; 577296465Sdelphij c1 = 0; 578296465Sdelphij sqr_add_c(a, 5, c2, c3, c1); 579296465Sdelphij sqr_add_c2(a, 6, 4, c2, c3, c1); 580296465Sdelphij sqr_add_c2(a, 7, 3, c2, c3, c1); 581296465Sdelphij r[10] = c2; 582296465Sdelphij c2 = 0; 583296465Sdelphij sqr_add_c2(a, 7, 4, c3, c1, c2); 584296465Sdelphij sqr_add_c2(a, 6, 5, c3, c1, c2); 585296465Sdelphij r[11] = c3; 586296465Sdelphij c3 = 0; 587296465Sdelphij sqr_add_c(a, 6, c1, c2, c3); 588296465Sdelphij sqr_add_c2(a, 7, 5, c1, c2, c3); 589296465Sdelphij r[12] = c1; 590296465Sdelphij c1 = 0; 591296465Sdelphij sqr_add_c2(a, 7, 6, c2, c3, c1); 592296465Sdelphij r[13] = c2; 593296465Sdelphij c2 = 0; 594296465Sdelphij sqr_add_c(a, 7, c3, c1, c2); 595296465Sdelphij r[14] = c3; 596296465Sdelphij r[15] = c1; 597296465Sdelphij} 598109998Smarkm 599205128Ssimonvoid bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 600296465Sdelphij{ 601296465Sdelphij BN_ULONG t1, t2; 602296465Sdelphij BN_ULONG c1, c2, c3; 603109998Smarkm 604296465Sdelphij c1 = 0; 605296465Sdelphij c2 = 0; 606296465Sdelphij c3 = 0; 607296465Sdelphij sqr_add_c(a, 0, c1, c2, c3); 608296465Sdelphij r[0] = c1; 609296465Sdelphij c1 = 0; 610296465Sdelphij sqr_add_c2(a, 1, 0, c2, c3, c1); 611296465Sdelphij r[1] = c2; 612296465Sdelphij c2 = 0; 613296465Sdelphij sqr_add_c(a, 1, c3, c1, c2); 614296465Sdelphij sqr_add_c2(a, 2, 0, c3, c1, c2); 615296465Sdelphij r[2] = c3; 616296465Sdelphij c3 = 0; 617296465Sdelphij sqr_add_c2(a, 3, 0, c1, c2, c3); 618296465Sdelphij sqr_add_c2(a, 2, 1, c1, c2, c3); 619296465Sdelphij r[3] = c1; 620296465Sdelphij c1 = 0; 621296465Sdelphij sqr_add_c(a, 2, c2, c3, c1); 622296465Sdelphij sqr_add_c2(a, 3, 1, c2, c3, c1); 623296465Sdelphij r[4] = c2; 624296465Sdelphij c2 = 0; 625296465Sdelphij sqr_add_c2(a, 3, 2, c3, c1, c2); 626296465Sdelphij r[5] = c3; 627296465Sdelphij c3 = 0; 628296465Sdelphij sqr_add_c(a, 3, c1, c2, c3); 629296465Sdelphij r[6] = c1; 630296465Sdelphij r[7] = c2; 631296465Sdelphij} 632162911Ssimon#endif 633