#!/usr/local/bin/perl

# Generator for the 32-bit x86 bignum word primitives:
#
#   bn_mul_add_words, bn_mul_words, bn_sqr_words,
#   bn_div_words, bn_add_words, bn_sub_words
#
# The instructions are emitted through the perlasm helpers loaded from
# x86asm.pl, which is searched for next to this script and in
# ../../perlasm relative to it.  $ARGV[0] is handed to asm_init().  If any
# argument contains -DOPENSSL_IA32_SSE2, the multiply/square routines also
# get an MMX/SSE2 code path that is selected at run time by testing
# OPENSSL_ia32cap_P.
#
# NOTE: this file intentionally relies on package globals ($a, $r, $c, ...)
# and on the "&name(...)" call form: several emitted mnemonics (sub, and,
# xor, push, pop, ...) collide with Perl keywords/builtins and can only be
# called with the leading '&'.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of $0, separator included
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");

&asm_finish();

# Emit $name(r, a, num, w): r[i] += a[i]*w for i in 0..num-1, propagating
# the carry between words; the final carry is returned in eax.
# Arguments are read with wparam(0..3) in the order r, a, num, w.
#
# With $sse2 set, an MMX/SSE2 path (8-way unrolled loop plus a one-word
# loop for the remainder) is emitted first and is taken when the SSE2 bit
# of OPENSSL_ia32cap_P is set; otherwise control falls to the integer path.
sub bn_mul_add_words
	{
	local($name)=@_;

	&function_begin_B($name,"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("maw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		# num == 0: leave without touching r[] or a[].  Without this
		# guard the one-word loop below would run once, decrement the
		# count past zero and keep going.  mm1 is zero here, so the
		# exit path returns 0, same as the integer path does.
		&test($c,$c);
		&jz(&label("maw_sse2_exit"));
		&jmp(&label("maw_sse2_entry"));

	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$r,"",0),"mm1");
		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
		&add($a,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$r,"",0),"mm1");
		&lea($r,&DWP(32,$r));
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($c,8);
		&jz(&label("maw_sse2_exit"));
	&set_label("maw_sse2_entry");
		&test($c,0xfffffff8);		# at least 8 words left?
		&jnz(&label("maw_sse2_unrolled"));

	# One word per iteration for the remaining 1..7 words.
	&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));		# lea keeps the flags from sub
		&jnz(&label("maw_sse2_loop"));
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ebp";
	$r="edi";
	$c="esi";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	# r

	&mov("ecx",&wparam(2));	# num
	&mov($a,&wparam(1));	# a

	&and("ecx",0xfffffff8);	# num rounded down to a multiple of 8
	&mov($w,&wparam(3));	# w

	&push("ecx");		# Up the stack for a tmp variable

	&jz(&label("maw_finish"));	# tests the 'and' above; mov/push leave flags alone

	&set_label("maw_loop",16);

	# Eight words per pass; $i is the byte offset of the word.
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+= c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&sub("ecx",8);
	&lea($a,&DWP(32,$a));
	&lea($r,&DWP(32,$r));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));	# get num
	&and("ecx",7);
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	# Up to seven leftover words; ecx counts them down and the jz after
	# each round leaves as soon as the count reaches zero.
	&set_label("maw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &dec("ecx") if ($i != 7-1);
		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
		 &mov($c,"edx");		# c=  H(t);
		&jz(&label("maw_end")) if ($i != 7-1);
		}
	&set_label("maw_end",0);
	&mov("eax",$c);			# return the carry

	&pop("ecx");			# drop the temporary pushed above

	&function_end($name);
	}

# Emit $name(r, a, num, w): r[i] = low word of (a[i]*w + carry) for
# i in 0..num-1; the final carry is returned in eax.
# Arguments are read with wparam(0..3) in the order r, a, num, w.
#
# With $sse2 set, a one-word-per-iteration MMX/SSE2 loop is emitted first
# and is taken when the SSE2 bit of OPENSSL_ia32cap_P is set.
sub bn_mul_words
	{
	local($name)=@_;

	&function_begin_B($name,"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("mw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0
		# num == 0: skip the loop, which would otherwise decrement
		# the count past zero.  The carry in mm1 is 0, so 0 is
		# returned, same as the integer path does.
		&test($c,$c);
		&jz(&label("mw_sse2_exit"));

	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));		# lea keeps the flags from sub
		&jnz(&label("mw_sse2_loop"));

	&set_label("mw_sse2_exit");
		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ecx";
	$r="edi";
	$c="esi";
	$num="ebp";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	# r
	&mov($a,&wparam(1));	# a
	&mov($num,&wparam(2));	# num
	&mov($w,&wparam(3));	# w

	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
	&jz(&label("mw_finish"));

	&set_label("mw_loop",0);
	# Eight words per pass; $i is the byte offset of the word.
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX

		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);

		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	# Up to seven leftover words; $num counts them down and the jz after
	# each round leaves as soon as the count reaches zero.
	&set_label("mw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a,"",0));# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		 &dec($num) if ($i != 7-1);
		&jz(&label("mw_end")) if ($i != 7-1);
		}
	&set_label("mw_end",0);
	&mov("eax",$c);			# return the carry

	&function_end($name);
	}

# Emit $name(r, a, num): for i in 0..num-1 store the double-word square of
# a[i] into r[2*i] (low word) and r[2*i+1] (high word).
# Arguments are read with wparam(0..2) in the order r, a, num.
#
# With $sse2 set, a one-word-per-iteration MMX/SSE2 loop is emitted first
# and is taken when the SSE2 bit of OPENSSL_ia32cap_P is set.
sub bn_sqr_words
	{
	local($name)=@_;

	&function_begin_B($name,"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("sqr_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		# num == 0: skip the loop, which would otherwise decrement
		# the count past zero; nothing is read or written.
		&test($c,$c);
		&jz(&label("sqr_sse2_exit"));

	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($a,&DWP(4,$a));		# a++
		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
		&sub($c,1);
		&lea($r,&DWP(8,$r));		# r += 2
		&jnz(&label("sqr_sse2_loop"));

	&set_label("sqr_sse2_exit");
		&emms();
		&ret();
	&set_label("sqr_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$r="esi";
	$a="edi";
	$num="ebx";

	&mov($r,&wparam(0));	# r
	&mov($a,&wparam(1));	# a
	&mov($num,&wparam(2));	# num

	&and($num,0xfffffff8);	# num rounded down to a multiple of 8
	&jz(&label("sw_finish"));

	&set_label("sw_loop",0);
	# Eight words per pass; $i is the byte offset into a[], and each input
	# word produces two output words, hence the $i*2 offsets into r[].
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");
		&mov("eax",&DWP($i,$a,"",0)); 	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*2,$r,"",0),"eax");	# low word
		 &mov(&DWP($i*2+4,$r,"",0),"edx");# high word
		}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jz(&label("sw_end"));

	# Up to seven leftover words.  The mov between dec and jz does not
	# alter the flags, so the jz still tests the result of the dec.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*8,$r,"",0),"eax");	# low word
		 &dec($num) if ($i != 7-1);
		&mov(&DWP($i*8+4,$r,"",0),"edx");	# high word
		 &jz(&label("sw_end")) if ($i != 7-1);
		}
	&set_label("sw_end",0);

	&function_end($name);
	}

# Emit $name(h, l, d): load edx from wparam(0), eax from wparam(1) and ecx
# from wparam(2), then execute a single 'div ecx', i.e. divide the
# double word edx:eax by ecx.  The quotient is left in eax as the result.
# No registers are saved, so only the bare begin/end helpers are used.
sub bn_div_words
	{
	local($name)=@_;

	&function_begin_B($name,"");
	&mov("edx",&wparam(0));	# high word of the dividend
	&mov("eax",&wparam(1));	# low word of the dividend
	&mov("ecx",&wparam(2));	# divisor
	&div("ecx");
	&ret();
	&function_end_B($name);
	}

# Emit $name(r, a, b, num): r[i] = a[i] + b[i] + carry for i in 0..num-1;
# the final carry (0 or 1) is returned in eax.
# Arguments are read with wparam(0..3) in the order r, a, b, num.
sub bn_add_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	# Eight words per pass.  The carry is kept in $c as 0/1:
	# "mov $c,0" (unlike xor) leaves the flags alone, so the following
	# "adc $c,$c" captures the carry of the first add, and the final
	# "adc $c,0" adds the carry of the second add.
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&add($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &add($tmp1,$tmp2);
		&adc($c,0);
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	 &jz(&label("aw_end"));

	# Up to seven leftover words.  The mov between dec and jz does not
	# alter the flags, so the jz still tests the result of the dec.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&add($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &add($tmp1,$tmp2);
		&adc($c,0);
		 &dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}

# Emit $name(r, a, b, num): r[i] = a[i] - b[i] - borrow for i in 0..num-1;
# the final borrow (0 or 1) is returned in eax.
# Arguments are read with wparam(0..3) in the order r, a, b, num.
# The label names are the same "aw_*" names that bn_add_words uses.
sub bn_sub_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	 &and($num,0xfffffff8);	# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	# Eight words per pass.  The borrow is kept in $c as 0/1:
	# "mov $c,0" (unlike xor) leaves the flags alone, so the following
	# "adc $c,$c" captures the borrow of the first sub, and the final
	# "adc $c,0" adds the borrow of the second sub.
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	 &jz(&label("aw_end"));

	# Up to seven leftover words.  The mov between dec and jz does not
	# alter the flags, so the jz still tests the result of the dec.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		 &dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}