#!/usr/bin/env perl
# x86-mont.pl revision 337982

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)

# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...

# Locate the perlasm helpers relative to this script and load them.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# The SSE2 code path is generated only when -DOPENSSL_IA32_SSE2 appears
# among the command-line arguments.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

# bn_mul_mont(rp, ap, bp, np, n0, num)
#
# Argument order follows the "load argument block" section below.
# The generated routine leaves eax=0 and returns immediately when
# num<4; on the paths that run to completion eax is set to 1.
&function_begin("bn_mul_mont");

$i="edx";
$j="ecx";
$ap="esi";	$tp="esi";		# overlapping variables!!!
$rp="edi";	$bp="edi";		# overlapping variables!!!
$np="ebp";
$num="ebx";

$_num=&DWP(4*0,"esp");			# stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32;				# size of above frame rounded up to 16n

	&xor	("eax","eax");
	&mov	("edi",&wparam(5));	# int num
	&cmp	("edi",4);
	&jl	(&label("just_leave"));

	&lea	("esi",&wparam(0));	# put aside pointer to argument block
	&lea	("edx",&wparam(1));	# load ap
	&add	("edi",2);		# extra two words on top of tp
	&neg	("edi");
	&lea	("ebp",&DWP(-$frame,"esp","edi",4));	# future alloca($frame+4*(num+2))
	&neg	("edi");

	# minimize cache contention by arranging 2K window between stack
	# pointer and ap argument [np is also position sensitive vector,
	# but it's assumed to be near ap, as it's allocated at ~same
	# time].
	&mov	("eax","ebp");
	&sub	("eax","edx");
	&and	("eax",2047);
	&sub	("ebp","eax");		# this aligns sp and ap modulo 2048

	&xor	("edx","ebp");
	&and	("edx",2048);
	&xor	("edx",2048);
	&sub	("ebp","edx");		# this splits them apart modulo 4096

	&and	("ebp",-64);		# align to cache line

	# Some OSes, *cough*-dows, insist on stack being "wired" to
	# physical memory in strictly sequential manner, i.e. if stack
	# allocation spans two pages, then reference to farmost one can
	# be punishable by SEGV. But page walking can do good even on
	# other OSes, because it guarantees that villain thread hits
	# the guard page before it can make damage to innocent one...
	&mov	("eax","esp");
	&sub	("eax","ebp");
	&and	("eax",-4096);
	&mov	("edx","esp");		# saved stack pointer!
	&lea	("esp",&DWP(0,"ebp","eax"));
	&mov	("eax",&DWP(0,"esp"));
	&cmp	("esp","ebp");
	&ja	(&label("page_walk"));
	&jmp	(&label("page_walk_done"));

&set_label("page_walk",16);
	&lea	("esp",&DWP(-4096,"esp"));
	&mov	("eax",&DWP(0,"esp"));
	&cmp	("esp","ebp");
	&ja	(&label("page_walk"));
&set_label("page_walk_done");

	################################# load argument block...
	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
	&mov	("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
	#&mov	("edi",&DWP(5*4,"esi"));# int num

	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
	&mov	($_rp,"eax");		# ... save a copy of argument block
	&mov	($_ap,"ebx");
	&mov	($_bp,"ecx");
	&mov	($_np,"ebp");
	&mov	($_n0,"esi");
	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
	#&mov	($_num,$num);		# redundant as $num is not reused
	&mov	($_sp,"edx");		# saved stack pointer!
# SSE2 path: generated only when $sse2 is set. At run time it is taken
# only if bit 26 of the first OPENSSL_ia32cap_P word is set; otherwise
# control transfers to the "non_sse2" label below.
if($sse2) {
$acc0="mm0";	# mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";

	&picmeup("eax","OPENSSL_ia32cap_P");
	&bt	(&DWP(0,"eax"),26);
	&jnc	(&label("non_sse2"));

	&mov	("eax",-1);
	&movd	($mask,"eax");		# mask 32 lower bits

	&mov	($ap,$_ap);		# load input pointers
	&mov	($bp,$_bp);
	&mov	($np,$_np);

	&xor	($i,$i);		# i=0
	&xor	($j,$j);		# j=0

	&movd	($mul0,&DWP(0,$bp));	# bp[0]
	&movd	($mul1,&DWP(0,$ap));	# ap[0]
	&movd	($car1,&DWP(0,$np));	# np[0]

	&pmuludq($mul1,$mul0);		# ap[0]*bp[0]
	&movq	($car0,$mul1);
	&movq	($acc0,$mul1);		# I wish movd worked for
	&pand	($acc0,$mask);		# inter-register transfers

	&pmuludq($mul1,$_n0q);		# *=n0

	&pmuludq($car1,$mul1);		# "t[0]"*np[0]*n0
	&paddq	($car1,$acc0);

	&movd	($acc1,&DWP(4,$np));	# np[1]
	&movd	($acc0,&DWP(4,$ap));	# ap[1]

	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&inc	($j);			# j++
&set_label("1st",16);
	&pmuludq($acc0,$mul0);		# ap[j]*bp[0]
	&pmuludq($acc1,$mul1);		# np[j]*m1
	&paddq	($car0,$acc0);		# +=c0
	&paddq	($car1,$acc1);		# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
	&paddq	($car1,$acc0);		# +=ap[j]*bp[0];
	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
	&psrlq	($car0,32);
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
	&psrlq	($car1,32);

	&lea	($j,&DWP(1,$j));
	&cmp	($j,$num);
	&jl	(&label("1st"));

	&pmuludq($acc0,$mul0);		# ap[num-1]*bp[0]
	&pmuludq($acc1,$mul1);		# np[num-1]*m1
	&paddq	($car0,$acc0);		# +=c0
	&paddq	($car1,$acc1);		# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&paddq	($car1,$acc0);		# +=ap[num-1]*bp[0];
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=

	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&paddq	($car1,$car0);
	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]

	&inc	($i);				# i++
&set_label("outer");
	&xor	($j,$j);			# j=0

	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
	&movd	($mul1,&DWP(0,$ap));		# ap[0]
	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
	&movd	($car1,&DWP(0,$np));		# np[0]
	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]

	&paddq	($mul1,$temp);			# +=tp[0]
	&movq	($acc0,$mul1);
	&movq	($car0,$mul1);
	&pand	($acc0,$mask);

	&pmuludq($mul1,$_n0q);			# *=n0

	&pmuludq($car1,$mul1);
	&paddq	($car1,$acc0);

	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
	&movd	($acc1,&DWP(4,$np));		# np[1]
	&movd	($acc0,&DWP(4,$ap));		# ap[1]

	&psrlq	($car0,32);
	&psrlq	($car1,32);
	&paddq	($car0,$temp);			# +=tp[1]

	&inc	($j);				# j++
	&dec	($num);
&set_label("inner");
	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
	&pmuludq($acc1,$mul1);			# np[j]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
	&pand	($acc0,$mask);
	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
	&psrlq	($car0,32);
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
	&psrlq	($car1,32);
	&paddq	($car0,$temp);			# +=tp[j+1]

	&dec	($num);
	&lea	($j,&DWP(1,$j));		# j++
	&jnz	(&label("inner"));

	&mov	($num,$j);
	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
	&pmuludq($acc1,$mul1);			# np[num-1]*m1
	&paddq	($car0,$acc0);			# +=c0
	&paddq	($car1,$acc1);			# +=c1

	&movq	($acc0,$car0);
	&pand	($acc0,$mask);
	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
	&psrlq	($car0,32);
	&psrlq	($car1,32);

	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
	&paddq	($car1,$car0);
	&paddq	($car1,$temp);
	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]

	&lea	($i,&DWP(1,$i));		# i++
	&cmp	($i,$num);
	&jle	(&label("outer"));

	&emms	();				# done with mmx bank
	&jmp	(&label("common_tail"));

&set_label("non_sse2",16);
}

# Integer-only path. The "if (0)" branch is never generated; it is kept
# as the switch that would disable the integer code altogether.
if (0) {
	&mov	("esp",$_sp);
	&xor	("eax","eax");	# signal "not fast enough [yet]"
	&jmp	(&label("just_leave"));
	# While the below code provides competitive performance for
	# all key lengths on modern Intel cores, it's still more
	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
	# means compared to the original integer-only assembler.
	# 512-bit RSA sign is better by ~40%, but that's about all
	# one can say about all CPUs...
} else {
$inp="esi";	# integer path uses these registers differently
$word="edi";
$carry="ebp";

	# The "or" below leaves ZF set only when num is even and ap==bp;
	# the intervening "mov" does not touch flags, so "jz" then selects
	# the dedicated squaring code.
	&mov	($inp,$_ap);
	&lea	($carry,&DWP(1,$num));
	&mov	($word,$_bp);
	&xor	($j,$j);				# j=0
	&mov	("edx",$inp);
	&and	($carry,1);				# see if num is even
	&sub	("edx",$word);				# see if ap==bp
	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
	&or	($carry,"edx");
	&mov	($word,&DWP(0,$word));			# bp[0]
	&jz	(&label("bn_sqr_mont"));
	&mov	($_bpend,"eax");
	&mov	("eax",&DWP(0,$inp));
	&xor	("edx","edx");

&set_label("mull",16);
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*bp[0]
	&add	($carry,"eax");
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
	&cmp	($j,$num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("mull"));

	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*bp[0]
	&mov	($word,$_n0);
	&add	("eax",$carry);
	&mov	($inp,$_np);
	&adc	("edx",0);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
	&xor	($j,$j);
	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=

	&mov	("eax",&DWP(0,$inp));			# np[0]
	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&adc	("edx",0);
	&inc	($j);

	&jmp	(&label("2ndmadd"));

&set_label("1stmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*bp[i]
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("1stmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*bp[i]
	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&mov	($word,$_n0);
	&adc	("edx",0);
	&mov	($inp,$_np);
	&add	($carry,"eax");
	&adc	("edx",0);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&xor	($j,$j);
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
	&adc	($j,0);
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&adc	("edx",0);
	&mov	($j,1);

&set_label("2ndmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
	&jl	(&label("2ndmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&adc	("edx",0);
	&add	($carry,"eax");
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=

	&xor	("eax","eax");
	&mov	($j,$_bp);				# &bp[i]
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
	&lea	($j,&DWP(4,$j));
	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
	&cmp	($j,$_bpend);
	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
	&je	(&label("common_tail"));

	&mov	($word,&DWP(0,$j));			# bp[i+1]
	&mov	($inp,$_ap);
	&mov	($_bp,$j);				# &bp[++i]
	&xor	($j,$j);
	&xor	("edx","edx");
	&mov	("eax",&DWP(0,$inp));
	&jmp	(&label("1stmadd"));

&set_label("bn_sqr_mont",16);
$sbit=$num;	# $sbit aliases the $num register from here on
	&mov	($_num,$num);
	&mov	($_bp,$j);				# i=0

	&mov	("eax",$word);				# ap[0]
	&mul	($word);				# ap[0]*ap[0]
	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
	&mov	($sbit,"edx");
	&shr	("edx",1);
	&and	($sbit,1);
	&inc	($j);
&set_label("sqr",16);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*ap[0]
	&add	("eax",$carry);
	&lea	($j,&DWP(1,$j));
	&adc	("edx",0);
	&lea	($carry,&DWP(0,$sbit,"eax",2));
	&shr	("eax",31);
	&cmp	($j,$_num);
	&mov	($sbit,"eax");
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("sqr"));

	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
	&mov	($carry,"edx");
	&mul	($word);				# ap[num-1]*ap[0]
	&add	("eax",$carry);
	&mov	($word,$_n0);
	&adc	("edx",0);
	&mov	($inp,$_np);
	&lea	($carry,&DWP(0,$sbit,"eax",2));
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
	&shr	("eax",31);
	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=

	&lea	($carry,&DWP(0,"eax","edx",2));
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&shr	("edx",31);
	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&mov	($num,$j);
	&adc	("edx",0);
	&mov	("eax",&DWP(4,$inp));			# np[1]
	&mov	($j,1);

&set_label("3rdmadd",16);
	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=

	&mov	($carry,"edx");
	&mul	($word);				# np[j+1]*m
	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
	&lea	($j,&DWP(2,$j));
	&adc	("edx",0);
	&add	($carry,"eax");
	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
	&adc	("edx",0);
	&cmp	($j,$num);
	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
	&jl	(&label("3rdmadd"));

	&mov	($carry,"edx");
	&mul	($word);				# np[j]*m
	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
	&adc	("edx",0);
	&add	($carry,"eax");
	&adc	("edx",0);
	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=

	&mov	($j,$_bp);				# i
	&xor	("eax","eax");
	&mov	($inp,$_ap);
	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
	&cmp	($j,$num);
	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
	&je	(&label("common_tail"));

	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
	&lea	($j,&DWP(1,$j));
	&mov	("eax",$word);
	&mov	($_bp,$j);				# ++i
	&mul	($word);				# ap[i]*ap[i]
	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
	&adc	("edx",0);
	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
	&xor	($carry,$carry);
	&cmp	($j,$num);
	&lea	($j,&DWP(1,$j));
	&je	(&label("sqrlast"));

	&mov	($sbit,"edx");				# zaps $num
	&shr	("edx",1);
	&and	($sbit,1);
&set_label("sqradd",16);
	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
	&mov	($carry,"edx");
	&mul	($word);				# ap[j]*ap[i]
	&add	("eax",$carry);
	&lea	($carry,&DWP(0,"eax","eax"));
	&adc	("edx",0);
	&shr	("eax",31);
	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
	&lea	($j,&DWP(1,$j));
	&adc	("eax",0);
	&add	($carry,$sbit);
	&adc	("eax",0);
	&cmp	($j,$_num);
	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
	&mov	($sbit,"eax");
	&jle	(&label("sqradd"));

	&mov	($carry,"edx");
	&add	("edx","edx");
	&shr	($carry,31);
	&add	("edx",$sbit);
	&adc	($carry,0);
&set_label("sqrlast");
	&mov	($word,$_n0);
	&mov	($inp,$_np);
	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]

	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
	&mov	("eax",&DWP(0,$inp));			# np[0]
	&adc	($carry,0);
	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=

	&mul	($word);				# np[0]*m
	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
	&lea	($num,&DWP(-1,$j));
	&adc	("edx",0);
	&mov	($j,1);
	&mov	("eax",&DWP(4,$inp));			# np[1]

	&jmp	(&label("3rdmadd"));
}

# Shared epilogue for both paths: rp = tp - np word by word, then a
# mask-based (branch-free) merge of tp and rp back into rp while the
# temporary vector is overwritten, then the caller's stack pointer is
# restored and eax is set to 1.
&set_label("common_tail",16);
	&mov	($np,$_np);			# load modulus pointer
	&mov	($rp,$_rp);			# load result pointer
	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]

	&mov	("eax",&DWP(0,$tp));		# tp[0]
	&mov	($j,$num);			# j=num-1
	&xor	($i,$i);			# i=0 and clear CF!

&set_label("sub",16);
	&sbb	("eax",&DWP(0,$np,$i,4));
	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
	&dec	($j);				# doesn't affect CF!
	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
	&lea	($i,&DWP(1,$i));		# i++
	&jge	(&label("sub"));

	&sbb	("eax",0);			# handle upmost overflow bit
	&mov	("edx",-1);
	&xor	("edx","eax");			# edx = ~eax, the complementary mask
	&jmp	(&label("copy"));

&set_label("copy",16);				# conditional copy
	&mov	($tp,&DWP($frame,"esp",$num,4));
	&mov	($np,&DWP(0,$rp,$num,4));
	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
	&and	($tp,"eax");
	&and	($np,"edx");
	&or	($np,$tp);
	&mov	(&DWP(0,$rp,$num,4),$np);
	&dec	($num);
	&jge	(&label("copy"));

	&mov	("esp",$_sp);			# pull saved stack pointer
	&mov	("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");

&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();