#!/usr/local/bin/perl
#
# Generator script for the 32-bit x86 BIGNUM word primitives.  It loads the
# x86asm.pl helper library and then calls one generator sub per routine;
# each generator emits its instructions through the helpers (&mov, &add,
# &set_label, ...).
#
# Command line, as used by this script:
#   $ARGV[0]               handed to asm_init() together with $0
#                          (presumably selects the output flavour -- confirm
#                          against x86asm.pl).
#   -DOPENSSL_IA32_SSE2    if any argument contains this string, the SSE2
#                          code paths are emitted as well.

# Take the directory part of the script path so that x86asm.pl can be found
# either next to this script or in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# $sse2 is a global flag consulted by the generators below.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# The SSE2 paths test OPENSSL_ia32cap_P at run time, so the symbol has to be
# declared as external before any routine refers to it.
&external_label("OPENSSL_ia32cap_P") if ($sse2);

# Emit the routines; the argument is the name of the generated function.
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# bn_mul_add_words($name)
#
# Emits the routine $name.  The generated code takes four stack arguments,
# read with wparam(0..3): r, a, num and w.  For every word it computes
# r[i] + a[i]*w + carry, stores the low 32 bits back into r[i] and keeps the
# high 32 bits as the carry for the next word.  The final carry is returned
# in eax.
#
# If the global $sse2 is set, a path using the mm registers (pmuludq/paddq)
# is emitted first.  At run time it is taken only when bit 26 of the first
# dword of OPENSSL_ia32cap_P is set; otherwise execution continues at
# "maw_non_sse2" with the plain integer implementation.
#
# NOTE(review): the SSE2 path has no guard for num == 0 -- the single-word
# loop runs once before its counter is tested.  Presumably callers never
# pass 0; confirm.
sub bn_mul_add_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	# Register assignment for the SSE2 path; no callee-saved register is
	# used there, so that path needs no prologue.
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("maw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		&jmp(&label("maw_sse2_entry"));

	# Eight words per iteration; multiplies for later words are issued
	# while the carry chain of earlier words is still being resolved.
	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$r,"",0),"mm1");
		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
		&add($a,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$r,"",0),"mm1");
		&lea($r,&DWP(32,$r));
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($c,8);
		&jz(&label("maw_sse2_exit"));
	&set_label("maw_sse2_entry");
		# Stay in the unrolled loop while at least 8 words remain.
		&test($c,0xfffffff8);
		&jnz(&label("maw_sse2_unrolled"));

	# One word per iteration for the remaining (fewer than 8) words.
	&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));		# lea leaves the flags from sub intact
		&jnz(&label("maw_sse2_loop"));
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
	}

	# function_begin prologue
	# (done by hand because function_begin_B was used above so that the
	# SSE2 path could return without touching these registers)
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register assignment for the integer path.
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ebp";
	$r="edi";
	$c="esi";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#

	&mov("ecx",&wparam(2));	#
	&mov($a,&wparam(1));	#

	&and("ecx",0xfffffff8);	# num / 8
	&mov($w,&wparam(3));	#

	&push("ecx");		# Up the stack for a tmp variable

	# Flags are still those of the "and" above: skip the unrolled loop when
	# fewer than 8 words were requested.
	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+= c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&sub("ecx",8);
	&lea($a,&DWP(32,$a));
	&lea($r,&DWP(32,$r));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));	# get num
	&and("ecx",7);
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	# Up to 7 leftover words, fully unrolled; every round except the last
	# decrements the counter and leaves as soon as it reaches zero.
	&set_label("maw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a));	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		 &dec("ecx") if ($i != 7-1);
		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
		 &mov($c,"edx");		# c=  H(t);
		&jz(&label("maw_end")) if ($i != 7-1);
		}
	&set_label("maw_end",0);
	&mov("eax",$c);		# return the final carry

	&pop("ecx");	# clear variable from stack

	&function_end($name);
	}

# bn_mul_words($name)
#
# Emits the routine $name.  The generated code takes four stack arguments,
# read with wparam(0..3): r, a, num and w.  For every word it computes
# a[i]*w + carry, stores the low 32 bits into r[i] and keeps the high
# 32 bits as the next carry.  The final carry is returned in eax.
#
# If the global $sse2 is set, a one-word-per-iteration path using the mm
# registers is emitted first; at run time it is taken only when bit 26 of
# the first dword of OPENSSL_ia32cap_P is set, otherwise execution continues
# at "mw_non_sse2" with the integer implementation.
#
# NOTE(review): the SSE2 loop runs once before its counter is tested, so
# num == 0 is not handled there.  Presumably callers never pass 0; confirm.
sub bn_mul_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	# Register assignment for the SSE2 path.
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("mw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0

	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));		# lea leaves the flags from sub intact
		&jnz(&label("mw_sse2_loop"));

		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
	}

	# function_begin prologue
	# (done by hand because function_begin_B was used above so that the
	# SSE2 path could return without touching these registers)
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register assignment for the integer path.
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ecx";
	$r="edi";
	$c="esi";
	$num="ebp";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#
	&mov($w,&wparam(3));	#

	&and($num,0xfffffff8);	# num / 8
	&jz(&label("mw_finish"));

	# Eight words per iteration.
	&set_label("mw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX

		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);

		&mov($c,"edx");			# c=  H(t);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	# Up to 7 leftover words, fully unrolled; every round except the last
	# decrements the counter and leaves as soon as it reaches zero.
	&set_label("mw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a,"",0));# *a
		&mul($w);			# *a * w
		&add("eax",$c);			# L(t)+=c
		 # XXX
		&adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
		&mov($c,"edx");			# c=  H(t);
		 &dec($num) if ($i != 7-1);
		&jz(&label("mw_end")) if ($i != 7-1);
		}
	&set_label("mw_end",0);
	&mov("eax",$c);		# return the final carry

	&function_end($name);
	}

# bn_sqr_words($name)
#
# Emits the routine $name.  The generated code takes three stack arguments,
# read with wparam(0..2): r, a and num.  For every input word it squares
# a[i] and stores the 64-bit product as two words, low half at r[2*i] and
# high half at r[2*i+1].
#
# If the global $sse2 is set, a one-word-per-iteration path using the mm
# registers is emitted first; at run time it is taken only when bit 26 of
# the first dword of OPENSSL_ia32cap_P is set, otherwise execution continues
# at "sqr_non_sse2" with the integer implementation.
#
# NOTE(review): the SSE2 loop runs once before its counter is tested, so
# num == 0 is not handled there.  Presumably callers never pass 0; confirm.
sub bn_sqr_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	# Register assignment for the SSE2 path.
	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("sqr_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));

	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($a,&DWP(4,$a));		# a++
		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
		&sub($c,1);
		&lea($r,&DWP(8,$r));		# r += 2
		&jnz(&label("sqr_sse2_loop"));

		&emms();
		&ret();
	&set_label("sqr_non_sse2",16);
	}

	# function_begin prologue
	# (done by hand because function_begin_B was used above so that the
	# SSE2 path could return without touching these registers)
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	# Register assignment for the integer path.
	$r="esi";
	$a="edi";
	$num="ebx";

	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#

	&and($num,0xfffffff8);	# num / 8
	&jz(&label("sw_finish"));

	# Eight input words (sixteen output words) per iteration.
	&set_label("sw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");
		&mov("eax",&DWP($i,$a,"",0)); 	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*2,$r,"",0),"eax");	#
		 &mov(&DWP($i*2+4,$r,"",0),"edx");#
		}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jz(&label("sw_end"));

	# Up to 7 leftover words, fully unrolled; every round except the last
	# decrements the counter and leaves as soon as it reaches zero.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));	# *a
		 # XXX
		&mul("eax");			# *a * *a
		&mov(&DWP($i*8,$r,"",0),"eax");	#
		 &dec($num) if ($i != 7-1);
		&mov(&DWP($i*8+4,$r,"",0),"edx");
		 &jz(&label("sw_end")) if ($i != 7-1);
		}
	&set_label("sw_end",0);

	&function_end($name);
	}

# bn_div_words($name)
#
# Emits the routine $name.  The generated code loads its three stack
# arguments, read with wparam(0..2), into edx, eax and ecx respectively and
# executes "div ecx", i.e. it divides the 64-bit value edx:eax (first
# argument = high word, second = low word) by the third argument.  The x86
# div instruction leaves the quotient in eax, which is what the routine
# returns.  No callee-saved register is touched, so there is no prologue.
sub bn_div_words
	{
	local($name)=@_;

	&function_begin_B($name,"");
	&mov("edx",&wparam(0));	#
	&mov("eax",&wparam(1));	#
	&mov("ecx",&wparam(2));	#
	&div("ecx");
	&ret();
	&function_end_B($name);
	}

# bn_add_words($name)
#
# Emits the routine $name.  The generated code takes four stack arguments,
# read with wparam(0..3): r, a, b and num.  For every word it computes
# a[i] + b[i] + carry, stores the 32-bit sum into r[i] and keeps the
# carry-out (0 or 1) for the next word.  The carry lives in eax, so the
# final carry is the return value without any extra move.
sub bn_add_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	 &and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	# Eight words per iteration.  Each round adds in two steps and collects
	# the carry of both: "mov c,0; adc c,c" captures the carry of the first
	# add (mov does not change the flags), "adc c,0" adds that of the second.
	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&add($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &add($tmp1,$tmp2);
		&adc($c,0);
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	 &jz(&label("aw_end"));

	# Up to 7 leftover words, fully unrolled; every round except the last
	# decrements the counter and leaves as soon as it reaches zero.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&add($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &add($tmp1,$tmp2);
		&adc($c,0);
		 &dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}

# bn_sub_words($name)
#
# Emits the routine $name.  The generated code takes four stack arguments,
# read with wparam(0..3): r, a, b and num.  For every word it computes
# a[i] - b[i] - borrow, stores the 32-bit difference into r[i] and keeps
# the borrow-out (0 or 1) for the next word.  The borrow lives in eax, so
# the final borrow is the return value without any extra move.
#
# The label names reuse the "aw_" prefix that bn_add_words also uses.
sub bn_sub_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	 &and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	# Eight words per iteration.  Each round subtracts in two steps and
	# collects the borrow of both: "mov c,0; adc c,c" captures the borrow
	# of the first sub (mov does not change the flags), "adc c,0" adds that
	# of the second.
	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	 &jz(&label("aw_end"));

	# Up to 7 leftover words, fully unrolled; every round except the last
	# decrements the counter and leaves as soon as it reaches zero.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		 &dec($num) if ($i != 6);
		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}

# bn_sub_part_words($name)
#
# Emits the routine $name.  The generated code takes five stack arguments,
# read with wparam(0..4): r, a, b, num and dl.  It works in two phases:
#
#   1. For num words: r[i] = a[i] - b[i] - borrow, exactly as in the plain
#      word-wise subtraction, with the borrow kept in eax.  All three
#      pointers end up just past the processed words.
#   2. Depending on the sign of dl:
#        dl == 0  nothing more to do;
#        dl <  0  for -dl further words: r[i] = 0 - b[i] - borrow;
#        dl >  0  for dl further words: r[i] = a[i] - borrow; as soon as a
#                 subtraction produces no borrow the code switches to the
#                 "pw_nc" section, which simply copies words from a to r.
#
# The borrow lives in eax and is the return value.  The first phase reuses
# the "aw_" label prefix that the add/sub generators also use.
sub bn_sub_part_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	$a="esi";
	$b="edi";
	$c="eax";
	$r="ebx";
	$tmp1="ecx";
	$tmp2="edx";
	$num="ebp";

	&mov($r,&wparam(0));	# get r
	 &mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	 &mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	 &and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	# Phase 1, eight words per iteration.  "mov c,0; adc c,c" captures the
	# borrow of the first sub (mov does not change the flags), "adc c,0"
	# adds that of the second.
	&set_label("aw_loop",0);
	for ($i=0; $i<8; $i++)
		{
		&comment("Round $i");

		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
		}

	&comment("");
	&add($a,32);
	 &add($b,32);
	&add($r,32);
	 &sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	 &jz(&label("aw_end"));

	# Phase 1 tail: up to 7 words.  Unlike the 8-word loop, the pointers
	# are advanced after every word here, so that they are correct for
	# phase 2 no matter at which round the tail is left.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov($tmp1,&DWP(0,$a,"",0));	# *a
		 &mov($tmp2,&DWP(0,$b,"",0));# *b
		&sub($tmp1,$c);
		 &mov($c,0);
		&adc($c,$c);
		 &sub($tmp1,$tmp2);
		&adc($c,0);
		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
		&add($a, 4);
		&add($b, 4);
		&add($r, 4);
		 &dec($num) if ($i != 6);
		 &jz(&label("aw_end")) if ($i != 6);
		}
	&set_label("aw_end",0);

	# Phase 2: dispatch on dl.
	&cmp(&wparam(4),0);
	&je(&label("pw_end"));

	&mov($num,&wparam(4));	# get dl
	&cmp($num,0);
	&je(&label("pw_end"));
	&jge(&label("pw_pos"));

	&comment("pw_neg");
	# dl < 0: negate it to get the number of remaining words of b.
	&mov($tmp2,0);
	&sub($tmp2,$num);
	&mov($num,$tmp2);
	&and($num,0xfffffff8);	# num / 8
	&jz(&label("pw_neg_finish"));

	&set_label("pw_neg_loop",0);
	for ($i=0; $i<8; $i++)
	{
	    &comment("dl<0 Round $i");

	    &mov($tmp1,0);
	    &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
	    &sub($tmp1,$c);
	    &mov($c,0);
	    &adc($c,$c);
	    &sub($tmp1,$tmp2);
	    &adc($c,0);
	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
	}

	&comment("");
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_neg_loop"));

	&set_label("pw_neg_finish",0);
	&mov($tmp2,&wparam(4));	# get dl
	&mov($num,0);
	&sub($num,$tmp2);
	&and($num,7);
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
	{
	    &comment("dl<0 Tail Round $i");
	    &mov($tmp1,0);
	    &mov($tmp2,&DWP($i*4,$b,"",0));# *b
	    &sub($tmp1,$c);
	    &mov($c,0);
	    &adc($c,$c);
	    &sub($tmp1,$tmp2);
	    &adc($c,0);
	    &dec($num) if ($i != 6);
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &jz(&label("pw_end")) if ($i != 6);
	}

	&jmp(&label("pw_end"));

	# dl > 0: only words of a remain; propagate the borrow through them.
	&set_label("pw_pos",0);

	&and($num,0xfffffff8);	# num / 8
	&jz(&label("pw_pos_finish"));

	&set_label("pw_pos_loop",0);

	for ($i=0; $i<8; $i++)
	{
	    &comment("dl>0 Round $i");

	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &sub($tmp1,$c);
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    # No borrow out of this word: continue in the copy loop right
	    # after the copy of word $i (mov left the flags of sub intact).
	    &jnc(&label("pw_nc".$i));
	}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_pos_loop"));

	&set_label("pw_pos_finish",0);
	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_end"));

	for ($i=0; $i<7; $i++)
	{
	    &comment("dl>0 Tail Round $i");
	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &sub($tmp1,$c);
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &jnc(&label("pw_tail_nc".$i));
	    &dec($num) if ($i != 6);
	    &jz(&label("pw_end")) if ($i != 6);
	}
	# Every remaining word produced a borrow: report borrow = 1.
	&mov($c,1);
	&jmp(&label("pw_end"));

	# Copy section, entered through the pw_nc<i> / pw_tail_nc<i> labels
	# once the borrow has been absorbed: r[i] = a[i] for the rest.
	&set_label("pw_nc_loop",0);
	for ($i=0; $i<8; $i++)
	{
	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &set_label("pw_nc".$i,0);
	}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_nc_loop"));

	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_nc_end"));

	for ($i=0; $i<7; $i++)
	{
	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
	    &set_label("pw_tail_nc".$i,0);
	    &dec($num) if ($i != 6);
	    &jz(&label("pw_nc_end")) if ($i != 6);
	}

	&set_label("pw_nc_end",0);
	&mov($c,0);		# the borrow was absorbed: report 0

	&set_label("pw_end",0);

#	&mov("eax",$c);		# $c is "eax"

	&function_end($name);
	}
