#!/usr/local/bin/perl

# Generator for the 32-bit x86 big-number word primitives.
#
# Running this script does not perform any arithmetic itself: every
# &mov/&add/&mul/... call below goes through the helpers loaded from
# x86asm.pl and emits one assembler instruction.  The routines generated
# are bn_mul_add_words, bn_mul_words, bn_sqr_words, bn_div_words,
# bn_add_words, bn_sub_words and bn_sub_part_words.
#
# Conventions used throughout (helpers live in x86asm.pl, not shown here):
#   &wparam(n)  - operand for the n-th word-sized argument of the generated
#                 function (presumably the C caller's n-th parameter)
#   &swtmp(n)   - operand for the n-th scratch word pushed on the stack
#   &DWP(d,b,"",0) - dword memory operand at displacement d from register b
#   &label(x) / &set_label(x,n) - reference / define a jump target
#
# NOTE: the register-name variables ($a, $b, $r, $c, ...) are deliberately
# left as package globals; $a and $b are special in Perl and should not be
# declared with "my".

push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# The SSE2 code path is only generated when the build passes
# -DOPENSSL_IA32_SSE2 on the command line.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# Emit <name>(r, a, num, w): for each of num words, r[i] += a[i]*w with
# carry propagation; the final carry word is returned in eax.
#   wparam(0) = r, wparam(1) = a, wparam(2) = num, wparam(3) = w
# The bulk is processed 8 words per iteration, the remaining num%8 words
# by an unrolled tail.  When $sse2 is set, an alternative bulk loop using
# pmuludq/paddq on mm registers is emitted and selected at run time.
sub bn_mul_add_words
    {
    my ($name) = @_;

    &function_begin($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

    &comment("");
    $Low="eax";
    $High="edx";
    $a="ebx";
    $w="ebp";
    $r="edi";
    $c="esi";

    &xor($c,$c);                        # clear carry
    &mov($r,&wparam(0));                # r

    &mov("ecx",&wparam(2));             # num
    &mov($a,&wparam(1));                # a

    &and("ecx",0xfffffff8);             # num & ~7: words handled 8 at a time
    &mov($w,&wparam(3));                # w

    &push("ecx");                       # Up the stack for a tmp variable

    # Flags are still those of the "and" above (mov/push leave them alone).
    &jz(&label("maw_finish"));

    if ($sse2) {
        # Run-time dispatch: test bit 26 of OPENSSL_ia32cap_P (presumably
        # the CPUID SSE2 feature flag) and fall back to the integer loop
        # when it is clear.
        &picmeup("eax","OPENSSL_ia32cap_P");
        &bt(&DWP(0,"eax"),26);
        &jnc(&label("maw_loop"));

        &movd("mm0",$w);                # mm0 = w
        &pxor("mm1","mm1");             # mm1 = carry_in

        &set_label("maw_sse2_loop",0);
        &movd("mm3",&DWP(0,$r,"",0));   # mm3 = r[0]
        &paddq("mm1","mm3");            # mm1 = carry_in + r[0]
        &movd("mm2",&DWP(0,$a,"",0));   # mm2 = a[0]
        &pmuludq("mm2","mm0");          # mm2 = w*a[0]
        &movd("mm4",&DWP(4,$a,"",0));   # mm4 = a[1]
        &pmuludq("mm4","mm0");          # mm4 = w*a[1]
        &movd("mm6",&DWP(8,$a,"",0));   # mm6 = a[2]
        &pmuludq("mm6","mm0");          # mm6 = w*a[2]
        &movd("mm7",&DWP(12,$a,"",0));  # mm7 = a[3]
        &pmuludq("mm7","mm0");          # mm7 = w*a[3]
        &paddq("mm1","mm2");            # mm1 = carry_in + r[0] + w*a[0]
        &movd("mm3",&DWP(4,$r,"",0));   # mm3 = r[1]
        &paddq("mm3","mm4");            # mm3 = r[1] + w*a[1]
        &movd("mm5",&DWP(8,$r,"",0));   # mm5 = r[2]
        &paddq("mm5","mm6");            # mm5 = r[2] + w*a[2]
        &movd("mm4",&DWP(12,$r,"",0));  # mm4 = r[3]
        &paddq("mm7","mm4");            # mm7 = r[3] + w*a[3]
        &movd(&DWP(0,$r,"",0),"mm1");
        &movd("mm2",&DWP(16,$a,"",0));  # mm2 = a[4]
        &pmuludq("mm2","mm0");          # mm2 = w*a[4]
        &psrlq("mm1",32);               # mm1 = carry0
        &movd("mm4",&DWP(20,$a,"",0));  # mm4 = a[5]
        &pmuludq("mm4","mm0");          # mm4 = w*a[5]
        &paddq("mm1","mm3");            # mm1 = carry0 + r[1] + w*a[1]
        &movd("mm6",&DWP(24,$a,"",0));  # mm6 = a[6]
        &pmuludq("mm6","mm0");          # mm6 = w*a[6]
        &movd(&DWP(4,$r,"",0),"mm1");
        &psrlq("mm1",32);               # mm1 = carry1
        &movd("mm3",&DWP(28,$a,"",0));  # mm3 = a[7]
        &add($a,32);
        &pmuludq("mm3","mm0");          # mm3 = w*a[7]
        &paddq("mm1","mm5");            # mm1 = carry1 + r[2] + w*a[2]
        &movd("mm5",&DWP(16,$r,"",0));  # mm5 = r[4]
        &paddq("mm2","mm5");            # mm2 = r[4] + w*a[4]
        &movd(&DWP(8,$r,"",0),"mm1");
        &psrlq("mm1",32);               # mm1 = carry2
        &paddq("mm1","mm7");            # mm1 = carry2 + r[3] + w*a[3]
        &movd("mm5",&DWP(20,$r,"",0));  # mm5 = r[5]
        &paddq("mm4","mm5");            # mm4 = r[5] + w*a[5]
        &movd(&DWP(12,$r,"",0),"mm1");
        &psrlq("mm1",32);               # mm1 = carry3
        &paddq("mm1","mm2");            # mm1 = carry3 + r[4] + w*a[4]
        &movd("mm5",&DWP(24,$r,"",0));  # mm5 = r[6]
        &paddq("mm6","mm5");            # mm6 = r[6] + w*a[6]
        &movd(&DWP(16,$r,"",0),"mm1");
        &psrlq("mm1",32);               # mm1 = carry4
        &paddq("mm1","mm4");            # mm1 = carry4 + r[5] + w*a[5]
        &movd("mm5",&DWP(28,$r,"",0));  # mm5 = r[7]
        &paddq("mm3","mm5");            # mm3 = r[7] + w*a[7]
        &movd(&DWP(20,$r,"",0),"mm1");
        &psrlq("mm1",32);               # mm1 = carry5
        &paddq("mm1","mm6");            # mm1 = carry5 + r[6] + w*a[6]
        &movd(&DWP(24,$r,"",0),"mm1");
        &psrlq("mm1",32);               # mm1 = carry6
        &paddq("mm1","mm3");            # mm1 = carry6 + r[7] + w*a[7]
        &movd(&DWP(28,$r,"",0),"mm1");
        &add($r,32);
        &psrlq("mm1",32);               # mm1 = carry_out

        &sub("ecx",8);
        &jnz(&label("maw_sse2_loop"));

        &movd($c,"mm1");                # c = carry_out
        &emms();

        &jmp(&label("maw_finish"));
    }

    &set_label("maw_loop",0);

    &mov(&swtmp(0),"ecx");              # park remaining count in the tmp slot

    for (my $i=0; $i<32; $i+=4)
        {
        &comment("Round $i");

        &mov("eax",&DWP($i,$a,"",0));   # *a
        &mul($w);                       # *a * w
        &add("eax",$c);                 # L(t)+=c
        &mov($c,&DWP($i,$r,"",0));      # load *r (c is free until reassigned)
        &adc("edx",0);                  # H(t)+=carry
        &add("eax",$c);                 # L(t)+= *r
        &adc("edx",0);                  # H(t)+=carry
        &mov(&DWP($i,$r,"",0),"eax");   # *r= L(t);
        &mov($c,"edx");                 # c=  H(t);
        }

    &comment("");
    &mov("ecx",&swtmp(0));              # reload remaining count
    &add($a,32);
    &add($r,32);
    &sub("ecx",8);
    &jnz(&label("maw_loop"));

    &set_label("maw_finish",0);
    &mov("ecx",&wparam(2));             # get num
    &and("ecx",7);                      # words left for the tail
    &jnz(&label("maw_finish2"));        # helps branch prediction
    &jmp(&label("maw_end"));

    &set_label("maw_finish2",1);
    for (my $i=0; $i<7; $i++)
        {
        &comment("Tail Round $i");
        &mov("eax",&DWP($i*4,$a,"",0)); # *a
        &mul($w);                       # *a * w
        &add("eax",$c);                 # L(t)+=c
        &mov($c,&DWP($i*4,$r,"",0));    # load *r
        &adc("edx",0);                  # H(t)+=carry
        &add("eax",$c);                 # L(t)+= *r
        &adc("edx",0);                  # H(t)+=carry
        # dec sets ZF for the jz below; the two movs in between do not
        # alter flags.  The 7th round needs no test: at most 7 words remain.
        &dec("ecx") if ($i != 6);
        &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
        &mov($c,"edx");                 # c=  H(t);
        &jz(&label("maw_end")) if ($i != 6);
        }
    &set_label("maw_end",0);
    &mov("eax",$c);                     # return the final carry

    &pop("ecx");                        # clear variable from the stack

    &function_end($name);
    }

# Emit <name>(r, a, num, w): for each of num words, r[i] = low(a[i]*w + c),
# c = high(...); the final carry word is returned in eax.
#   wparam(0) = r, wparam(1) = a, wparam(2) = num, wparam(3) = w
sub bn_mul_words
    {
    my ($name) = @_;

    &function_begin($name,"");

    &comment("");
    $Low="eax";
    $High="edx";
    $a="ebx";
    $w="ecx";
    $r="edi";
    $c="esi";
    $num="ebp";

    &xor($c,$c);                        # clear carry
    &mov($r,&wparam(0));                # r
    &mov($a,&wparam(1));                # a
    &mov($num,&wparam(2));              # num
    &mov($w,&wparam(3));                # w

    &and($num,0xfffffff8);              # num & ~7: words handled 8 at a time
    &jz(&label("mw_finish"));

    &set_label("mw_loop",0);
    for (my $i=0; $i<32; $i+=4)
        {
        &comment("Round $i");

        &mov("eax",&DWP($i,$a,"",0));   # *a
        &mul($w);                       # *a * w
        &add("eax",$c);                 # L(t)+=c
        &adc("edx",0);                  # H(t)+=carry
        &mov(&DWP($i,$r,"",0),"eax");   # *r= L(t);
        &mov($c,"edx");                 # c=  H(t);
        }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    # Loop while words remain and fall through into mw_finish otherwise,
    # matching the loop tail used by the other routines in this file.
    &jnz(&label("mw_loop"));

    &set_label("mw_finish",0);
    &mov($num,&wparam(2));              # get num
    &and($num,7);                       # words left for the tail
    &jnz(&label("mw_finish2"));
    &jmp(&label("mw_end"));

    &set_label("mw_finish2",1);
    for (my $i=0; $i<7; $i++)
        {
        &comment("Tail Round $i");
        &mov("eax",&DWP($i*4,$a,"",0)); # *a
        &mul($w);                       # *a * w
        &add("eax",$c);                 # L(t)+=c
        &adc("edx",0);                  # H(t)+=carry
        &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
        &mov($c,"edx");                 # c=  H(t);
        &dec($num) if ($i != 6);
        &jz(&label("mw_end")) if ($i != 6);
        }
    &set_label("mw_end",0);
    &mov("eax",$c);                     # return the final carry

    &function_end($name);
    }

# Emit <name>(r, a, num): for each of num words, store the 64-bit square
# of a[i] into r[2*i] (low) and r[2*i+1] (high).
#   wparam(0) = r, wparam(1) = a, wparam(2) = num
sub bn_sqr_words
    {
    my ($name) = @_;

    &function_begin($name,"");

    &comment("");
    $r="esi";
    $a="edi";
    $num="ebx";

    &mov($r,&wparam(0));                # r
    &mov($a,&wparam(1));                # a
    &mov($num,&wparam(2));              # num

    &and($num,0xfffffff8);              # num & ~7: words handled 8 at a time
    &jz(&label("sw_finish"));

    &set_label("sw_loop",0);
    for (my $i=0; $i<32; $i+=4)
        {
        &comment("Round $i");
        &mov("eax",&DWP($i,$a,"",0));       # *a
        &mul("eax");                        # *a * *a
        &mov(&DWP($i*2,$r,"",0),"eax");     # low word of the square
        &mov(&DWP($i*2+4,$r,"",0),"edx");   # high word of the square
        }

    &comment("");
    &add($a,32);
    &add($r,64);                        # output advances two words per input
    &sub($num,8);
    &jnz(&label("sw_loop"));

    &set_label("sw_finish",0);
    &mov($num,&wparam(2));              # get num
    &and($num,7);                       # words left for the tail
    &jz(&label("sw_end"));

    for (my $i=0; $i<7; $i++)
        {
        &comment("Tail Round $i");
        &mov("eax",&DWP($i*4,$a,"",0));     # *a
        &mul("eax");                        # *a * *a
        &mov(&DWP($i*8,$r,"",0),"eax");     # low word of the square
        &dec($num) if ($i != 6);            # ZF survives the mov below
        &mov(&DWP($i*8+4,$r,"",0),"edx");   # high word of the square
        &jz(&label("sw_end")) if ($i != 6);
        }
    &set_label("sw_end",0);

    &function_end($name);
    }

# Emit <name>(h, l, d): divide the double word h:l (edx:eax) by d using a
# single "div"; the quotient is left in eax by the instruction.
#   wparam(0) = h, wparam(1) = l, wparam(2) = d
sub bn_div_words
    {
    my ($name) = @_;

    &function_begin($name,"");
    &mov("edx",&wparam(0));             # high word of the dividend
    &mov("eax",&wparam(1));             # low word of the dividend
    &mov("ebx",&wparam(2));             # divisor
    &div("ebx");
    &function_end($name);
    }

# Emit <name>(r, a, b, num): r[i] = a[i] + b[i] + carry for num words; the
# final carry (0 or 1) is returned in eax.
#   wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num
sub bn_add_words
    {
    my ($name) = @_;

    &function_begin($name,"");

    &comment("");
    $a="esi";
    $b="edi";
    $c="eax";
    $r="ebx";
    $tmp1="ecx";
    $tmp2="edx";
    $num="ebp";

    &mov($r,&wparam(0));                # get r
    &mov($a,&wparam(1));                # get a
    &mov($b,&wparam(2));                # get b
    &mov($num,&wparam(3));              # get num
    &xor($c,$c);                        # clear carry
    &and($num,0xfffffff8);              # num & ~7: words handled 8 at a time

    &jz(&label("aw_finish"));

    &set_label("aw_loop",0);
    for (my $i=0; $i<8; $i++)
        {
        &comment("Round $i");

        &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
        &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
        &add($tmp1,$c);                 # *a + carry_in
        &mov($c,0);                     # mov keeps CF intact
        &adc($c,$c);                    # c = carry out of the first add
        &add($tmp1,$tmp2);              # + *b
        &adc($c,0);                     # c += carry out of the second add
        &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
        }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish",0);
    &mov($num,&wparam(3));              # get num
    &and($num,7);                       # words left for the tail
    &jz(&label("aw_end"));

    for (my $i=0; $i<7; $i++)
        {
        &comment("Tail Round $i");
        &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
        &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
        &add($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &add($tmp1,$tmp2);
        &adc($c,0);
        &dec($num) if ($i != 6);        # dec leaves CF alone and sets ZF
        &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
        &jz(&label("aw_end")) if ($i != 6);
        }
    &set_label("aw_end",0);

    # No explicit "mov eax,c" needed: $c is "eax".

    &function_end($name);
    }

# Emit <name>(r, a, b, num): r[i] = a[i] - b[i] - borrow for num words; the
# final borrow (0 or 1) is returned in eax.
#   wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num
sub bn_sub_words
    {
    my ($name) = @_;

    &function_begin($name,"");

    &comment("");
    $a="esi";
    $b="edi";
    $c="eax";
    $r="ebx";
    $tmp1="ecx";
    $tmp2="edx";
    $num="ebp";

    &mov($r,&wparam(0));                # get r
    &mov($a,&wparam(1));                # get a
    &mov($b,&wparam(2));                # get b
    &mov($num,&wparam(3));              # get num
    &xor($c,$c);                        # clear borrow
    &and($num,0xfffffff8);              # num & ~7: words handled 8 at a time

    &jz(&label("aw_finish"));

    &set_label("aw_loop",0);
    for (my $i=0; $i<8; $i++)
        {
        &comment("Round $i");

        &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
        &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
        &sub($tmp1,$c);                 # *a - borrow_in
        &mov($c,0);                     # mov keeps CF intact
        &adc($c,$c);                    # c = borrow out of the first sub
        &sub($tmp1,$tmp2);              # - *b
        &adc($c,0);                     # c += borrow out of the second sub
        &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
        }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish",0);
    &mov($num,&wparam(3));              # get num
    &and($num,7);                       # words left for the tail
    &jz(&label("aw_end"));

    for (my $i=0; $i<7; $i++)
        {
        &comment("Tail Round $i");
        &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
        &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
        &sub($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &sub($tmp1,$tmp2);
        &adc($c,0);
        &dec($num) if ($i != 6);        # dec leaves CF alone and sets ZF
        &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
        &jz(&label("aw_end")) if ($i != 6);
        }
    &set_label("aw_end",0);

    # No explicit "mov eax,c" needed: $c is "eax".

    &function_end($name);
    }

# Emit <name>(r, a, b, cl, dl): subtract two operands of unequal length.
#   wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = cl, wparam(4) = dl
# Phase 1 computes r[i] = a[i] - b[i] - borrow for the cl common words,
# advancing the a/b/r pointers as it goes.  Phase 2 depends on the sign of dl:
#   dl <  0: -dl further words are taken from b only   (r[i] = 0 - b[i] - borrow)
#   dl >  0:  dl further words are taken from a only   (r[i] = a[i] - borrow);
#             once the borrow is absorbed the code jumps into a plain copy
#             loop (the pw_nc* labels) for the remaining words
#   dl == 0: nothing more to do
# The final borrow (0 or 1) is returned in eax.
sub bn_sub_part_words
    {
    my ($name) = @_;

    &function_begin($name,"");

    &comment("");
    $a="esi";
    $b="edi";
    $c="eax";
    $r="ebx";
    $tmp1="ecx";
    $tmp2="edx";
    $num="ebp";

    &mov($r,&wparam(0));                # get r
    &mov($a,&wparam(1));                # get a
    &mov($b,&wparam(2));                # get b
    &mov($num,&wparam(3));              # get num
    &xor($c,$c);                        # clear borrow
    &and($num,0xfffffff8);              # num & ~7: words handled 8 at a time

    &jz(&label("aw_finish"));

    &set_label("aw_loop",0);
    for (my $i=0; $i<8; $i++)
        {
        &comment("Round $i");

        &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
        &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
        &sub($tmp1,$c);                 # *a - borrow_in
        &mov($c,0);                     # mov keeps CF intact
        &adc($c,$c);                    # c = borrow out of the first sub
        &sub($tmp1,$tmp2);              # - *b
        &adc($c,0);                     # c += borrow out of the second sub
        &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
        }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish",0);
    &mov($num,&wparam(3));              # get num
    &and($num,7);                       # words left for the tail
    &jz(&label("aw_end"));

    # Unlike bn_sub_words, the tail bumps the pointers after every word so
    # that a/b/r point just past the common part when phase 2 starts.
    for (my $i=0; $i<7; $i++)
        {
        &comment("Tail Round $i");
        &mov($tmp1,&DWP(0,$a,"",0));    # *a
        &mov($tmp2,&DWP(0,$b,"",0));    # *b
        &sub($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &sub($tmp1,$tmp2);
        &adc($c,0);
        &mov(&DWP(0,$r,"",0),$tmp1);    # *r
        &add($a, 4);
        &add($b, 4);
        &add($r, 4);
        &dec($num) if ($i != 6);
        &jz(&label("aw_end")) if ($i != 6);
        }
    &set_label("aw_end",0);

    # Dispatch on dl.  A single compare serves both branches: je handles
    # dl == 0 and jge then separates dl > 0 from dl < 0.
    &mov($num,&wparam(4));              # get dl
    &cmp($num,0);
    &je(&label("pw_end"));
    &jge(&label("pw_pos"));

    &comment("pw_neg");
    &mov($tmp2,0);
    &sub($tmp2,$num);
    &mov($num,$tmp2);                   # num = -dl
    &and($num,0xfffffff8);              # (-dl) & ~7
    &jz(&label("pw_neg_finish"));

    &set_label("pw_neg_loop",0);
    for (my $i=0; $i<8; $i++)
        {
        &comment("dl<0 Round $i");

        &mov($tmp1,0);                  # the missing a word counts as 0
        &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
        &sub($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &sub($tmp1,$tmp2);
        &adc($c,0);
        &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
        }

    &comment("");
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("pw_neg_loop"));

    &set_label("pw_neg_finish",0);
    &mov($tmp2,&wparam(4));             # get dl
    &mov($num,0);
    &sub($num,$tmp2);                   # num = -dl
    &and($num,7);                       # words left for the tail
    &jz(&label("pw_end"));

    for (my $i=0; $i<7; $i++)
        {
        &comment("dl<0 Tail Round $i");
        &mov($tmp1,0);
        &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
        &sub($tmp1,$c);
        &mov($c,0);
        &adc($c,$c);
        &sub($tmp1,$tmp2);
        &adc($c,0);
        &dec($num) if ($i != 6);
        &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
        &jz(&label("pw_end")) if ($i != 6);
        }

    &jmp(&label("pw_end"));

    &set_label("pw_pos",0);

    &and($num,0xfffffff8);              # dl & ~7
    &jz(&label("pw_pos_finish"));

    &set_label("pw_pos_loop",0);

    for (my $i=0; $i<8; $i++)
        {
        &comment("dl>0 Round $i");

        &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
        &sub($tmp1,$c);                 # *a - borrow
        &mov(&DWP($i*4,$r,"",0),$tmp1); # *r (mov keeps CF for the jnc)
        # Borrow absorbed: continue in the copy loop right after word $i.
        &jnc(&label("pw_nc".$i));
        }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("pw_pos_loop"));

    &set_label("pw_pos_finish",0);
    &mov($num,&wparam(4));              # get dl
    &and($num,7);                       # words left for the tail
    &jz(&label("pw_end"));

    for (my $i=0; $i<7; $i++)
        {
        &comment("dl>0 Tail Round $i");
        &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
        &sub($tmp1,$c);
        &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
        &jnc(&label("pw_tail_nc".$i));
        &dec($num) if ($i != 6);
        &jz(&label("pw_end")) if ($i != 6);
        }
    &mov($c,1);                         # every word borrowed: borrow out is 1
    &jmp(&label("pw_end"));

    # Plain copy of a into r, entered through the pw_nc<i> labels once the
    # borrow has been absorbed.  Each label sits after the copy of word <i>
    # because that word was already stored by the borrowing loop.
    &set_label("pw_nc_loop",0);
    for (my $i=0; $i<8; $i++)
        {
        &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
        &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
        &set_label("pw_nc".$i,0);
        }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("pw_nc_loop"));

    &mov($num,&wparam(4));              # get dl
    &and($num,7);                       # words left for the tail
    &jz(&label("pw_nc_end"));

    for (my $i=0; $i<7; $i++)
        {
        &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
        &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
        &set_label("pw_tail_nc".$i,0);
        &dec($num) if ($i != 6);
        &jz(&label("pw_nc_end")) if ($i != 6);
        }

    &set_label("pw_nc_end",0);
    &mov($c,0);                         # borrow was absorbed: return 0

    &set_label("pw_end",0);

    # No explicit "mov eax,c" needed: $c is "eax".

    &function_end($name);
    }