#!/usr/local/bin/perl
#
# Generator for 32-bit x86 assembler implementations of the bignum
# word-level primitives.  It drives the perlasm helper "x86asm.pl":
# each sub below emits one assembler function through that module.
#
# Arguments: $ARGV[0] is handed to asm_init() together with $0.  If any
# argument matches -DOPENSSL_IA32_SSE2, the multiply/square routines
# additionally get an MMX/SSE2 code path that is selected at run time by
# testing bit 26 of OPENSSL_ia32cap_P.
#
# NOTE: this file was restored from a VCS "annotate" listing; the
# per-line "<line><revision>S<author>" prefixes have been removed and the
# original one-statement-per-line layout reinstated.  No instruction was
# changed.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# bn_mul_add_words($name)
#
# Emits the function $name(r, a, num, w) (stack arguments 0..3):
# for each of the num words, r[i] = low32(a[i]*w + r[i] + carry) and the
# high word becomes the next carry.  The final carry is returned in eax.
#
# With $sse2 set, an MMX/SSE2 path (pmuludq/paddq) is emitted first and
# taken when bit 26 of OPENSSL_ia32cap_P is set; it processes 8 words per
# iteration while num has bits above the low three set, then one word at
# a time.  Otherwise the integer path below is used (8-way unrolled main
# loop plus an up-to-7-word tail).
#
# NOTE(review): the SSE2 single-word loop is entered without testing for
# num == 0 (it decrements first and loops until zero); callers are
# presumably required to pass num > 0 -- confirm.
sub bn_mul_add_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	# Register assignment for the SSE2 path only; reassigned further down
	# for the integer path.
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("maw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		&jmp(&label("maw_sse2_entry"));

	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$r,"",0),"mm1");
		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
		&add($a,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$r,"",0),"mm1");
		&lea($r,&DWP(32,$r));
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($c,8);
		&jz(&label("maw_sse2_exit"));
	&set_label("maw_sse2_entry");
		&test($c,0xfffffff8);		# 8 or more words left?
		&jnz(&label("maw_sse2_unrolled"));

	&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));
		&jnz(&label("maw_sse2_loop"));
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ebp";
	$r="edi";
	$c="esi";

	&xor($c,$c);			# clear carry
	&mov($r,&wparam(0));		#

	&mov("ecx",&wparam(2));		#
	&mov($a,&wparam(1));		#

	&and("ecx",0xfffffff8);		# num / 8
	&mov($w,&wparam(3));		#

	&push("ecx");			# Up the stack for a tmp variable

	# The flags tested here are still those of the "and" above.
	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		&mov("eax",&DWP($i,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+= c
		&adc("edx",0);			# H(t)+=carry
		&add("eax",&DWP($i,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		&mov(&DWP($i,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&sub("ecx",8);
	&lea($a,&DWP(32,$a));		# lea leaves the flags from "sub" intact
	&lea($r,&DWP(32,$r));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));		# get num
	&and("ecx",7);
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	&set_label("maw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		&add("eax",&DWP($i*4,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		&dec("ecx") if ($i != 7-1);
		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		&jz(&label("maw_end")) if ($i != 7-1);
		}
	&set_label("maw_end",0);
	&mov("eax",$c);

	&pop("ecx");			# discard the tmp variable pushed above

	&function_end($name);
	}

# bn_mul_words($name)
#
# Emits the function $name(r, a, num, w) (stack arguments 0..3):
# for each of the num words, r[i] = low32(a[i]*w + carry) and the high
# word becomes the next carry.  The final carry is returned in eax.
#
# With $sse2 set, a one-word-per-iteration pmuludq loop is emitted first
# and taken when bit 26 of OPENSSL_ia32cap_P is set.
#
# NOTE(review): that SSE2 loop decrements num before testing it, so it
# does not terminate promptly for num == 0; callers are presumably
# required to pass num > 0 -- confirm.
sub bn_mul_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("mw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0

	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));
		&jnz(&label("mw_sse2_loop"));

		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ecx";
	$r="edi";
	$c="esi";
	$num="ebp";

	&xor($c,$c);			# clear carry
	&mov($r,&wparam(0));		#
	&mov($a,&wparam(1));		#
	&mov($num,&wparam(2));		#
	&mov($w,&wparam(3));		#

	&and($num,0xfffffff8);		# num / 8
	&jz(&label("mw_finish"));

	&set_label("mw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		&mov("eax",&DWP($i,$a,"",0));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		# XXX

		&adc("edx",0);			# H(t)+=carry
		&mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);

		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));		# get num
	&and($num,7);
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	&set_label("mw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		# XXX
		&adc("edx",0);			# H(t)+=carry
		&mov(&DWP($i*4,$r,"",0),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		&dec($num) if ($i != 7-1);
		&jz(&label("mw_end")) if ($i != 7-1);
		}
	&set_label("mw_end",0);
	&mov("eax",$c);

	&function_end($name);
	}

# NOTE: the subs below were restored from a VCS "annotate" listing; the
# per-line "<line><revision>S<author>" prefixes have been removed and the
# original one-statement-per-line layout reinstated.  No instruction was
# changed.

# bn_sqr_words($name)
#
# Emits the function $name(r, a, num) (stack arguments 0..2): for each of
# the num words of a, the 64-bit square a[i]*a[i] is stored as two words,
# low at r[2*i] and high at r[2*i+1].  Nothing is loaded into eax as a
# return value.
#
# With $sse2 set, a pmuludq loop is emitted first and taken when bit 26
# of OPENSSL_ia32cap_P is set.
#
# NOTE(review): that SSE2 loop decrements num before testing it, so it
# does not terminate promptly for num == 0; callers are presumably
# required to pass num > 0 -- confirm.
sub bn_sqr_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("sqr_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));

	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($a,&DWP(4,$a));		# a++
		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
		&sub($c,1);
		&lea($r,&DWP(8,$r));		# r += 2
		&jnz(&label("sqr_sse2_loop"));

		&emms();
		&ret();
	&set_label("sqr_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$r="esi";
	$a="edi";
	$num="ebx";

	&mov($r,&wparam(0));		#
	&mov($a,&wparam(1));		#
	&mov($num,&wparam(2));		#

	&and($num,0xfffffff8);		# num / 8
	&jz(&label("sw_finish"));

	&set_label("sw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");
		&mov("eax",&DWP($i,$a,"",0));		# *a
		# XXX
		&mul("eax");				# *a * *a
		&mov(&DWP($i*2,$r,"",0),"eax");		#
		&mov(&DWP($i*2+4,$r,"",0),"edx");	#
		}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));		# get num
	&and($num,7);
	&jz(&label("sw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));		# *a
		# XXX
		&mul("eax");				# *a * *a
		&mov(&DWP($i*8,$r,"",0),"eax");		#
		&dec($num) if ($i != 7-1);
		&mov(&DWP($i*8+4,$r,"",0),"edx");
		&jz(&label("sw_end")) if ($i != 7-1);
		}
	&set_label("sw_end",0);

	&function_end($name);
	}

# bn_div_words($name)
#
# Emits the function $name with three stack arguments: argument 0 is
# loaded into edx, argument 1 into eax and argument 2 into ecx, then
# "div ecx" divides the 64-bit value edx:eax by ecx, leaving the quotient
# in eax as the return value.  No registers are saved (function_begin_B /
# function_end_B are used instead of the full prologue/epilogue).
sub bn_div_words
	{
	local($name)=@_;

	&function_begin_B($name,"");
	&mov("edx",&wparam(0));		#
	&mov("eax",&wparam(1));		#
	&mov("ecx",&wparam(2));		#
	&div("ecx");
	&ret();
	&function_end_B($name);
	}

# bn_add_words($name)
#
# Emits the function $name(r, a, b, num) (stack arguments 0..3):
# r[i] = a[i] + b[i] + carry for each of the num words.  The carry is
# kept in eax throughout, so it is also the return value.
sub bn_add_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));		# get r
	&mov($a,&wparam(1));		# get a
	&mov($b,&wparam(2));		# get b
	&mov($num,&wparam(3));		# get num
	&xor($c,$c);			# clear carry
	&and($num,0xfffffff8);		# num / 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&add($tmp1,$c);
		&mov($c,0);			# mov does not touch the flags ...
		&adc($c,$c);			# ... so c = carry of a+c
		&add($tmp1,$tmp2);
		&adc($c,0);			# c += carry of (a+c)+b
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));		# get num
	&and($num,7);
	&jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&add($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&add($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}

# bn_sub_words($name)
#
# Emits the function $name(r, a, b, num) (stack arguments 0..3):
# r[i] = a[i] - b[i] - borrow for each of the num words.  The borrow is
# kept in eax throughout, so it is also the return value.
#
# NOTE(review): the label names ("aw_loop", "aw_finish", "aw_end") are the
# same strings used in bn_add_words; presumably x86asm.pl scopes labels
# per emitted function -- confirm before renaming.
sub bn_sub_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));		# get r
	&mov($a,&wparam(1));		# get a
	&mov($b,&wparam(2));		# get b
	&mov($num,&wparam(3));		# get num
	&xor($c,$c);			# clear carry
	&and($num,0xfffffff8);		# num / 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);			# mov does not touch the flags ...
		&adc($c,$c);			# ... so c = borrow of a-c
		&sub($tmp1,$tmp2);
		&adc($c,0);			# c += borrow of (a-c)-b
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));		# get num
	&and($num,7);
	&jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}

# bn_sub_part_words($name)
#
# Emits the function $name(r, a, b, num, dl) (stack arguments 0..4).
#
# Common part: r[i] = a[i] - b[i] - borrow for num words, exactly as in
# bn_sub_words, except that the tail advances the a/b/r pointers word by
# word so they point past the common part afterwards.
#
# Extra part, selected by the sign of dl:
#   dl == 0: nothing more is done.
#   dl <  0: for -dl further words, r[i] = 0 - b[i] - borrow.
#   dl >  0: for dl further words, r[i] = a[i] - borrow; as soon as a
#            subtraction produces no borrow the code jumps into a plain
#            copy loop (labels pw_nc*/pw_tail_nc*) for the remaining
#            words and finishes with borrow = 0.
#
# The borrow lives in eax throughout, so it is also the return value.
#
# NOTE(review): the "aw_*" label names are the same strings used in
# bn_add_words/bn_sub_words; presumably x86asm.pl scopes labels per
# emitted function -- confirm before renaming.
sub bn_sub_part_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));		# get r
	&mov($a,&wparam(1));		# get a
	&mov($b,&wparam(2));		# get b
	&mov($num,&wparam(3));		# get num
	&xor($c,$c);			# clear carry
	&and($num,0xfffffff8);		# num / 8

	&jz(&label("aw_finish"));

	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));		# get num
	&and($num,7);
	&jz(&label("aw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP(0,$a,"",0));	# *a
		&mov($tmp2,&DWP(0,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
		&add($a, 4);
		&add($b, 4);
		&add($r, 4);
		&dec($num) if ($i != 6);
		&jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

	&cmp(&wparam(4),0);
	&je(&label("pw_end"));

	&mov($num,&wparam(4));		# get dl
	&cmp($num,0);
	&je(&label("pw_end"));
	&jge(&label("pw_pos"));

	&comment("pw_neg");
	&mov($tmp2,0);
	&sub($tmp2,$num);		# tmp2 = -dl
	&mov($num,$tmp2);
	&and($num,0xfffffff8);		# num / 8
	&jz(&label("pw_neg_finish"));

	&set_label("pw_neg_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("dl<0 Round $i");

		&mov($tmp1,0);
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_neg_loop"));

	&set_label("pw_neg_finish",0);
	&mov($tmp2,&wparam(4));		# get dl
	&mov($num,0);
	&sub($num,$tmp2);
	&and($num,7);
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("dl<0 Tail Round $i");
		&mov($tmp1,0);
		&mov($tmp2,&DWP($i*4,$b,"",0));	# *b
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		&dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jz(&label("pw_end")) if ($i != 6);
		}

	&jmp(&label("pw_end"));

	&set_label("pw_pos",0);

	&and($num,0xfffffff8);		# num / 8
	&jz(&label("pw_pos_finish"));

	&set_label("pw_pos_loop",0);

	for ($i=0; $i<8; $i++)
		{
		&comment("dl>0 Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&sub($tmp1,$c);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jnc(&label("pw_nc".$i));	# no borrow: continue in copy loop
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_pos_loop"));

	&set_label("pw_pos_finish",0);
	&mov($num,&wparam(4));		# get dl
	&and($num,7);
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
		{
		&comment("dl>0 Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&sub($tmp1,$c);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		&jnc(&label("pw_tail_nc".$i));	# no borrow: continue in copy tail
		&dec($num) if ($i != 6);
		&jz(&label("pw_end")) if ($i != 6);
		}
	&mov($c,1);
	&jmp(&label("pw_end"));

	# Plain copy loop; the pw_nc<i> labels are entry points placed after
	# the i-th copy so the borrow loop above can jump in mid-iteration.
	&set_label("pw_nc_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	&set_label("pw_nc".$i,0);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_nc_loop"));

	&mov($num,&wparam(4));		# get dl
	&and($num,7);
	&jz(&label("pw_nc_end"));

	for ($i=0; $i<7; $i++)
		{
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	&set_label("pw_tail_nc".$i,0);
		&dec($num) if ($i != 6);
		&jz(&label("pw_nc_end")) if ($i != 6);
		}

	&set_label("pw_nc_end",0);
	&mov($c,0);

	&set_label("pw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}
