/*
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there must be a room for fine-tuning.
 *
 * Q. Why inline assembler?
 * A. x86_64 features own ABI I'm not familiar with. Which is why
 *    I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation.
 *
 * Q. How much faster does it get?
 * A. Unfortunately people sitting on x86_64 hardware are prohibited
 *    to disclose the performance numbers, so they (SuSE labs to be
 *    specific) wouldn't tell me. However! Very similar coding technique
 *    (reaching out for 128-bit result from 64x64-bit multiplication)
 *    results in >3 times performance improvement on MIPS and I see no
 *    reason why gain on x86_64 would be so much different:-)
 */

/* One bignum "word" is a full 64-bit machine word. */
#define BN_ULONG unsigned long

/*
 * NOTE: __asm__ (not the `asm` extension keyword) is used throughout so
 * the file also compiles under strict -std= modes.
 *
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) lets the compiler decide where it wants to keep
 * the value of zero;
 */

/* (r, carry) = r + a*word + carry; the 64x64 product is 128 bits wide. */
#define mul_add(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	__asm__ ("mulq %3"		\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"m"(a)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+m"(r),"+d"(high)	\
		: "r"(carry),"g"(0)	\
		: "cc");		\
	carry=high;			\
	} while (0)

/* (r, carry) = a*word + carry. */
#define mul(r,a,word,carry) do {	\
	register BN_ULONG high,low;	\
	__asm__ ("mulq %3"		\
		: "=a"(low),"=d"(high)	\
		: "a"(word),"g"(a)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(carry),"+d"(high)\
		: "a"(low),"g"(0)	\
		: "cc");		\
	(r)=carry, carry=high;		\
	} while (0)

/* (r1:r0) = a^2, the full 128-bit square of one word. */
#define sqr(r0,r1,a)			\
	__asm__ ("mulq %2"		\
		: "=a"(r0),"=d"(r1)	\
		: "a"(a)		\
		: "cc");

/*
 * rp[] += ap[] * w over num words; returns the final carry word.
 * The main loop is unrolled four-fold; the tail handles num%4 words.
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
	{
	BN_ULONG c1=0;

	if (num <= 0) return(c1);

	while (num&~3)
		{
		mul_add(rp[0],ap[0],w,c1);
		mul_add(rp[1],ap[1],w,c1);
		mul_add(rp[2],ap[2],w,c1);
		mul_add(rp[3],ap[3],w,c1);
		ap+=4; rp+=4; num-=4;
		}
	if (num)
		{
		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
		mul_add(rp[2],ap[2],w,c1); return c1;
		}

	return(c1);
	}

/*
 * rp[] = ap[] * w over num words; returns the final carry word.
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
	{
	BN_ULONG c1=0;

	if (num <= 0) return(c1);

	while (num&~3)
		{
		mul(rp[0],ap[0],w,c1);
		mul(rp[1],ap[1],w,c1);
		mul(rp[2],ap[2],w,c1);
		mul(rp[3],ap[3],w,c1);
		ap+=4; rp+=4; num-=4;
		}
	if (num)
		{
		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
		mul(rp[2],ap[2],w,c1);
		}
	return(c1);
	}

/*
 * r[2*i], r[2*i+1] = a[i]^2 for n words; r[] must hold 2*n words.
 */
void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
	{
	if (n <= 0) return;

	while (n&~3)
		{
		sqr(r[0],r[1],a[0]);
		sqr(r[2],r[3],a[1]);
		sqr(r[4],r[5],a[2]);
		sqr(r[6],r[7],a[3]);
		a+=4; r+=8; n-=4;
		}
	if (n)
		{
		sqr(r[0],r[1],a[0]); if (--n == 0) return;
		sqr(r[2],r[3],a[1]); if (--n == 0) return;
		sqr(r[4],r[5],a[2]);
		}
	}

/*
 * Returns floor((h:l) / d).  The remainder (left by divq in %rdx) is
 * deliberately discarded; `waste` only exists to tell the compiler that
 * %rdx is overwritten.  Caller must guarantee h < d (no #DE overflow).
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{	BN_ULONG ret,waste;

	__asm__ ("divq	%4"
		: "=a"(ret),"=d"(waste)
		: "a"(l),"d"(h),"g"(d)
		: "cc");

	return ret;
}

/*
 * rp[] = ap[] + bp[] over n words; returns the final carry (0 or 1).
 *
 * `subq %2,%2` both zeroes the index register and clears CF, so the
 * first adcq adds no stray carry; leaq advances the index without
 * touching the flags and `loop` iterates without disturbing CF; the
 * closing sbbq materialises the carry as 0/-1 in ret.
 */
BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
{	BN_ULONG ret=0,i=0;
	BN_ULONG cnt;

	if (n <= 0) return 0;
	/* `loop` decrements the full 64-bit %rcx, so feed it a 64-bit
	 * counter rather than the 32-bit int n (the upper half of the
	 * register holding an int is not guaranteed to be zero). */
	cnt = (BN_ULONG)n;

	__asm__ (
	"	subq	%2,%2		\n"
	".align 16			\n"
	"1:	movq	(%4,%2,8),%0	\n"
	"	adcq	(%5,%2,8),%0	\n"
	"	movq	%0,(%3,%2,8)	\n"
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
	: "+a"(ret),"+c"(cnt),"+r"(i)
	: "r"(rp),"r"(ap),"r"(bp)
	: "cc","memory"	/* rp[] is stored to behind the compiler's back */
	);

	return ret&1;
}

#ifndef SIMICS
/*
 * rp[] = ap[] - bp[] over n words; returns the final borrow (0 or 1).
 * Same loop skeleton as bn_add_words, with adcq replaced by sbbq.
 */
BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
{	BN_ULONG ret=0,i=0;
	BN_ULONG cnt;

	if (n <= 0) return 0;
	/* see bn_add_words: `loop` uses the full 64-bit %rcx */
	cnt = (BN_ULONG)n;

	__asm__ (
	"	subq	%2,%2		\n"
	".align 16			\n"
	"1:	movq	(%4,%2,8),%0	\n"
	"	sbbq	(%5,%2,8),%0	\n"
	"	movq	%0,(%3,%2,8)	\n"
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
	: "+a"(ret),"+c"(cnt),"+r"(i)
	: "r"(rp),"r"(ap),"r"(bp)
	: "cc","memory"	/* rp[] is stored to behind the compiler's back */
	);

	return ret&1;
}
#else
/* Simics 1.4<7 has buggy sbbq:-( so fall back to plain C. */
#define BN_MASK2 0xffffffffffffffffL
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
	{
	BN_ULONG t1,t2;
	int c=0;	/* running borrow, 0 or 1 */

	if (n <= 0) return((BN_ULONG)0);

	/* four-fold unrolled subtract-with-borrow */
	for (;;)
		{
		t1=a[0]; t2=b[0];
		r[0]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		t1=a[1]; t2=b[1];
		r[1]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		t1=a[2]; t2=b[2];
		r[2]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		t1=a[3]; t2=b[3];
		r[3]=(t1-t2-c)&BN_MASK2;
		if (t1 != t2) c=(t1 < t2);
		if (--n <= 0) break;

		a+=4;
		b+=4;
		r+=4;
		}
	return(c);
	}
#endif

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */

#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b);		\
	t1 = ta * tb;			\
	t2 = BN_UMULT_HIGH(ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	t1 = BN_UMULT_HIGH(ta,tb);	\
	t0 = ta * tb;			\
	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}
#else
/* (c2,c1,c0) += a*b; t1,t2 must be in scope at the call site. */
#define mul_add_c(a,b,c0,c1,c2) do {	\
	__asm__ ("mulq %3"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)

/* (c2,c1,c0) += a[i]^2; t1,t2 must be in scope at the call site. */
#define sqr_add_c(a,i,c0,c1,c2) do {	\
	__asm__ ("mulq %2"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a[i])		\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)

/* (c2,c1,c0) += 2*a*b; the 128-bit product is doubled before adding,
 * with both doubling carries propagated upward. */
#define mul_add_c2(a,b,c0,c1,c2) do {	\
	__asm__ ("mulq %3"		\
		: "=a"(t1),"=d"(t2)	\
		: "a"(a),"m"(b)		\
		: "cc");		\
	__asm__ ("addq %0,%0; adcq %2,%1" \
		: "+d"(t2),"+r"(c2)	\
		: "g"(0)		\
		: "cc");		\
	__asm__ ("addq %0,%0; adcq %2,%1" \
		: "+a"(t1),"+d"(t2)	\
		: "g"(0)		\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(c0),"+d"(t2)	\
		: "a"(t1),"g"(0)	\
		: "cc");		\
	__asm__ ("addq %2,%0; adcq %3,%1" \
		: "+r"(c1),"+r"(c2)	\
		: "d"(t2),"g"(0)	\
		: "cc");		\
	} while (0)
#endif

#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

/*
 * r[0..15] = a[0..7] * b[0..7], schoolbook Comba multiplication:
 * one output word per column, with a three-word running accumulator
 * (c3,c2,c1) rotated after each column.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[4],b[0],c2,c3,c1);
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	mul_add_c(a[0],b[4],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[0],b[5],c3,c1,c2);
	mul_add_c(a[1],b[4],c3,c1,c2);
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	mul_add_c(a[4],b[1],c3,c1,c2);
	mul_add_c(a[5],b[0],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[6],b[0],c1,c2,c3);
	mul_add_c(a[5],b[1],c1,c2,c3);
	mul_add_c(a[4],b[2],c1,c2,c3);
	mul_add_c(a[3],b[3],c1,c2,c3);
	mul_add_c(a[2],b[4],c1,c2,c3);
	mul_add_c(a[1],b[5],c1,c2,c3);
	mul_add_c(a[0],b[6],c1,c2,c3);
	r[6]=c1;
	c1=0;
	mul_add_c(a[0],b[7],c2,c3,c1);
	mul_add_c(a[1],b[6],c2,c3,c1);
	mul_add_c(a[2],b[5],c2,c3,c1);
	mul_add_c(a[3],b[4],c2,c3,c1);
	mul_add_c(a[4],b[3],c2,c3,c1);
	mul_add_c(a[5],b[2],c2,c3,c1);
	mul_add_c(a[6],b[1],c2,c3,c1);
	mul_add_c(a[7],b[0],c2,c3,c1);
	r[7]=c2;
	c2=0;
	mul_add_c(a[7],b[1],c3,c1,c2);
	mul_add_c(a[6],b[2],c3,c1,c2);
	mul_add_c(a[5],b[3],c3,c1,c2);
	mul_add_c(a[4],b[4],c3,c1,c2);
	mul_add_c(a[3],b[5],c3,c1,c2);
	mul_add_c(a[2],b[6],c3,c1,c2);
	mul_add_c(a[1],b[7],c3,c1,c2);
	r[8]=c3;
	c3=0;
	mul_add_c(a[2],b[7],c1,c2,c3);
	mul_add_c(a[3],b[6],c1,c2,c3);
	mul_add_c(a[4],b[5],c1,c2,c3);
	mul_add_c(a[5],b[4],c1,c2,c3);
	mul_add_c(a[6],b[3],c1,c2,c3);
	mul_add_c(a[7],b[2],c1,c2,c3);
	r[9]=c1;
	c1=0;
	mul_add_c(a[7],b[3],c2,c3,c1);
	mul_add_c(a[6],b[4],c2,c3,c1);
	mul_add_c(a[5],b[5],c2,c3,c1);
	mul_add_c(a[4],b[6],c2,c3,c1);
	mul_add_c(a[3],b[7],c2,c3,c1);
	r[10]=c2;
	c2=0;
	mul_add_c(a[4],b[7],c3,c1,c2);
	mul_add_c(a[5],b[6],c3,c1,c2);
	mul_add_c(a[6],b[5],c3,c1,c2);
	mul_add_c(a[7],b[4],c3,c1,c2);
	r[11]=c3;
	c3=0;
	mul_add_c(a[7],b[5],c1,c2,c3);
	mul_add_c(a[6],b[6],c1,c2,c3);
	mul_add_c(a[5],b[7],c1,c2,c3);
	r[12]=c1;
	c1=0;
	mul_add_c(a[6],b[7],c2,c3,c1);
	mul_add_c(a[7],b[6],c2,c3,c1);
	r[13]=c2;
	c2=0;
	mul_add_c(a[7],b[7],c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}

/*
 * r[0..7] = a[0..3] * b[0..3], Comba multiplication (see bn_mul_comba8).
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	mul_add_c(a[0],b[0],c1,c2,c3);
	r[0]=c1;
	c1=0;
	mul_add_c(a[0],b[1],c2,c3,c1);
	mul_add_c(a[1],b[0],c2,c3,c1);
	r[1]=c2;
	c2=0;
	mul_add_c(a[2],b[0],c3,c1,c2);
	mul_add_c(a[1],b[1],c3,c1,c2);
	mul_add_c(a[0],b[2],c3,c1,c2);
	r[2]=c3;
	c3=0;
	mul_add_c(a[0],b[3],c1,c2,c3);
	mul_add_c(a[1],b[2],c1,c2,c3);
	mul_add_c(a[2],b[1],c1,c2,c3);
	mul_add_c(a[3],b[0],c1,c2,c3);
	r[3]=c1;
	c1=0;
	mul_add_c(a[3],b[1],c2,c3,c1);
	mul_add_c(a[2],b[2],c2,c3,c1);
	mul_add_c(a[1],b[3],c2,c3,c1);
	r[4]=c2;
	c2=0;
	mul_add_c(a[2],b[3],c3,c1,c2);
	mul_add_c(a[3],b[2],c3,c1,c2);
	r[5]=c3;
	c3=0;
	mul_add_c(a[3],b[3],c1,c2,c3);
	r[6]=c1;
	r[7]=c2;
	}

/*
 * r[0..15] = a[0..7]^2, Comba squaring: off-diagonal products a[i]*a[j]
 * (i>j) are counted once and doubled (sqr_add_c2), diagonal squares are
 * added once (sqr_add_c).
 */
void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	sqr_add_c2(a,4,0,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,5,0,c3,c1,c2);
	sqr_add_c2(a,4,1,c3,c1,c2);
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	sqr_add_c2(a,4,2,c1,c2,c3);
	sqr_add_c2(a,5,1,c1,c2,c3);
	sqr_add_c2(a,6,0,c1,c2,c3);
	r[6]=c1;
	c1=0;
	sqr_add_c2(a,7,0,c2,c3,c1);
	sqr_add_c2(a,6,1,c2,c3,c1);
	sqr_add_c2(a,5,2,c2,c3,c1);
	sqr_add_c2(a,4,3,c2,c3,c1);
	r[7]=c2;
	c2=0;
	sqr_add_c(a,4,c3,c1,c2);
	sqr_add_c2(a,5,3,c3,c1,c2);
	sqr_add_c2(a,6,2,c3,c1,c2);
	sqr_add_c2(a,7,1,c3,c1,c2);
	r[8]=c3;
	c3=0;
	sqr_add_c2(a,7,2,c1,c2,c3);
	sqr_add_c2(a,6,3,c1,c2,c3);
	sqr_add_c2(a,5,4,c1,c2,c3);
	r[9]=c1;
	c1=0;
	sqr_add_c(a,5,c2,c3,c1);
	sqr_add_c2(a,6,4,c2,c3,c1);
	sqr_add_c2(a,7,3,c2,c3,c1);
	r[10]=c2;
	c2=0;
	sqr_add_c2(a,7,4,c3,c1,c2);
	sqr_add_c2(a,6,5,c3,c1,c2);
	r[11]=c3;
	c3=0;
	sqr_add_c(a,6,c1,c2,c3);
	sqr_add_c2(a,7,5,c1,c2,c3);
	r[12]=c1;
	c1=0;
	sqr_add_c2(a,7,6,c2,c3,c1);
	r[13]=c2;
	c2=0;
	sqr_add_c(a,7,c3,c1,c2);
	r[14]=c3;
	r[15]=c1;
	}

/*
 * r[0..7] = a[0..3]^2, Comba squaring (see bn_sqr_comba8).
 */
void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
	{
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

	c1=0;
	c2=0;
	c3=0;
	sqr_add_c(a,0,c1,c2,c3);
	r[0]=c1;
	c1=0;
	sqr_add_c2(a,1,0,c2,c3,c1);
	r[1]=c2;
	c2=0;
	sqr_add_c(a,1,c3,c1,c2);
	sqr_add_c2(a,2,0,c3,c1,c2);
	r[2]=c3;
	c3=0;
	sqr_add_c2(a,3,0,c1,c2,c3);
	sqr_add_c2(a,2,1,c1,c2,c3);
	r[3]=c1;
	c1=0;
	sqr_add_c(a,2,c2,c3,c1);
	sqr_add_c2(a,3,1,c2,c3,c1);
	r[4]=c2;
	c2=0;
	sqr_add_c2(a,3,2,c3,c1,c2);
	r[5]=c3;
	c3=0;
	sqr_add_c(a,3,c1,c2,c3);
	r[6]=c1;
	r[7]=c2;
	}