1#!/usr/local/bin/perl
2
3$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
4push(@INC,"${dir}","${dir}../../perlasm");
5require "x86asm.pl";
6
&asm_init($ARGV[0],$0);

# SSE2 code paths are generated only when -DOPENSSL_IA32_SSE2 appears among
# the command-line arguments.
$sse2 = (grep { /-DOPENSSL_IA32_SSE2/ } @ARGV) ? 1 : 0;

if ($sse2)
	{
	&external_label("OPENSSL_ia32cap_P");
	}

# Emit every routine in turn.  Each generator receives the symbol name it
# should emit, which here is always the generator's own name.
foreach my $routine (
	[ \&bn_mul_add_words, "bn_mul_add_words" ],
	[ \&bn_mul_words,     "bn_mul_words"     ],
	[ \&bn_sqr_words,     "bn_sqr_words"     ],
	[ \&bn_div_words,     "bn_div_words"     ],
	[ \&bn_add_words,     "bn_add_words"     ],
	[ \&bn_sub_words,     "bn_sub_words"     ],
	)
	{
	my ($emit,$symbol) = @$routine;
	&$emit($symbol);
	}

&asm_finish();
22
# Emits bn_mul_add_words.  For each of the num words the generated code
# computes r[i] + a[i]*w + carry, stores the low 32 bits back to r[i] and
# carries the high 32 bits into the next word.  The final carry is returned
# in eax.
#
# Stack arguments, as read through wparam(): 0 = r, 1 = a, 2 = num, 3 = w.
#
# When $sse2 is set, the generated code tests the SSE2 capability bit at run
# time and uses an MMX/SSE2 path (an 8-way unrolled loop plus a one-word
# loop).  When $sse2 is unset or the bit is clear it uses the integer
# mul/adc path.
#
# The SSE2 path returns a zero carry immediately when num is 0.  Its one-word
# loop is bottom-tested, so without that check a zero count would process one
# word and then keep counting down from 0xffffffff.  The integer path already
# returns 0 for num == 0.
#
# The emitters are called with a leading '&' because several of them (push,
# pop, sub, and, xor) share their names with Perl built-ins.
sub bn_mul_add_words
	{
	my ($name) = @_;

	&function_begin_B($name,"");

	if ($sse2)
		{
		my ($rp,$ap,$cnt) = ("eax","edx","ecx");

		# NOTE(review): picsetup/picsymbol are defined outside this file;
		# presumably they leave the address of OPENSSL_ia32cap_P in eax.
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("maw_non_sse2"));

		&mov($rp,&wparam(0));
		&mov($ap,&wparam(1));
		&mov($cnt,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		&test($cnt,$cnt);		# num == 0: nothing to do,
		&jz(&label("maw_sse2_exit"));	# return the zero carry
		&jmp(&label("maw_sse2_entry"));

	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$rp,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$ap,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$ap,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$ap,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$ap,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$rp,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$rp,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$rp,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$rp,"",0),"mm1");
		&movd("mm2",&DWP(16,$ap,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$ap,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$ap,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$rp,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$ap,"",0));	# mm3 = a[7]
		&add($ap,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$rp,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$rp,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$rp,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$rp,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$rp,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$rp,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$rp,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$rp,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$rp,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$rp,"",0),"mm1");
		&lea($rp,&DWP(32,$rp));
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($cnt,8);
		&jz(&label("maw_sse2_exit"));
	&set_label("maw_sse2_entry");
		# At least 8 words left: take the unrolled body.  Otherwise the
		# 1..7 remaining words go through the one-word loop below.
		&test($cnt,0xfffffff8);
		&jnz(&label("maw_sse2_unrolled"));

	&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$ap));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$rp));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($ap,&DWP(4,$ap));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$rp),"mm1");	# r[i] = carry_low
		&sub($cnt,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($rp,&DWP(4,$rp));		# lea/psrlq keep the flags from sub
		&jnz(&label("maw_sse2_loop"));
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
		}

	# Integer path.  Hand-written copy of the function_begin prologue.
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	my ($ap,$w,$rp,$carry) = ("ebx","ebp","edi","esi");

	&xor($carry,$carry);		# clear carry
	&mov($rp,&wparam(0));

	&mov("ecx",&wparam(2));
	&mov($ap,&wparam(1));

	&and("ecx",0xfffffff8);		# num rounded down to a multiple of 8
	&mov($w,&wparam(3));

	# Extra stack slot; it is popped at the end and never read in between.
	&push("ecx");

	# mov and push leave the flags from the 'and' untouched.
	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	foreach my $off (map { 4*$_ } 0..7)
		{
		&comment("Round $off");

		&mov("eax",&DWP($off,$ap));	# *a
		&mul($w);			# *a * w
		&add("eax",$carry);		# L(t)+= c
		&adc("edx",0);			# H(t)+=carry
		&add("eax",&DWP($off,$rp));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		&mov(&DWP($off,$rp),"eax");	# *r= L(t);
		&mov($carry,"edx");		# c=  H(t);
		}

	&comment("");
	&sub("ecx",8);
	&lea($ap,&DWP(32,$ap));		# lea does not disturb the flags
	&lea($rp,&DWP(32,$rp));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));		# get num
	&and("ecx",7);
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	# Up to seven leftover words, straight-line.  ecx counts them down and
	# every round but the last can leave early.
	&set_label("maw_finish2",1);
	foreach my $k (0..6)
		{
		my $last = ($k == 6);
		my $off = $k*4;

		&comment("Tail Round $k");
		&mov("eax",&DWP($off,$ap));	# *a
		&mul($w);			# *a * w
		&add("eax",$carry);		# L(t)+=c
		&adc("edx",0);			# H(t)+=carry
		&add("eax",&DWP($off,$rp));	# L(t)+= *r
		&adc("edx",0);			# H(t)+=carry
		&dec("ecx") unless $last;
		&mov(&DWP($off,$rp),"eax");	# *r= L(t);
		&mov($carry,"edx");		# c=  H(t);
		&jz(&label("maw_end")) unless $last;	# flags still from dec
		}
	&set_label("maw_end",0);
	&mov("eax",$carry);

	&pop("ecx");			# drop the extra stack slot

	&function_end($name);
	}
209
# Emits bn_mul_words.  For each of the num words the generated code computes
# a[i]*w + carry, stores the low 32 bits to r[i] and carries the high 32 bits
# into the next word.  The final carry is returned in eax.
#
# Stack arguments, as read through wparam(): 0 = r, 1 = a, 2 = num, 3 = w.
#
# When $sse2 is set, the generated code tests the SSE2 capability bit at run
# time and uses a pmuludq loop.  When $sse2 is unset or the bit is clear it
# uses the integer mul path.
#
# The SSE2 loop is bottom-tested, so num == 0 is checked up front and returns
# a zero carry.  Without the check a zero count would process one word and
# then keep counting down from 0xffffffff.  The integer path already returns
# 0 for num == 0.
#
# The emitters are called with a leading '&' because several of them (push,
# sub, and, xor) share their names with Perl built-ins.
sub bn_mul_words
	{
	my ($name) = @_;

	&function_begin_B($name,"");

	if ($sse2)
		{
		my ($rp,$ap,$cnt) = ("eax","edx","ecx");

		# NOTE(review): picsetup/picsymbol are defined outside this file;
		# presumably they leave the address of OPENSSL_ia32cap_P in eax.
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("mw_non_sse2"));

		&mov($rp,&wparam(0));
		&mov($ap,&wparam(1));
		&mov($cnt,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0
		&test($cnt,$cnt);		# num == 0: nothing to do,
		&jz(&label("mw_sse2_exit"));	# return the zero carry

	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$ap));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($ap,&DWP(4,$ap));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$rp),"mm1");	# r[i] = carry_low
		&sub($cnt,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($rp,&DWP(4,$rp));		# lea/psrlq keep the flags from sub
		&jnz(&label("mw_sse2_loop"));

	&set_label("mw_sse2_exit");
		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
		}

	# Integer path.  Hand-written copy of the function_begin prologue.
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	my ($ap,$w,$rp,$carry,$num) = ("ebx","ecx","edi","esi","ebp");

	&xor($carry,$carry);		# clear carry
	&mov($rp,&wparam(0));
	&mov($ap,&wparam(1));
	&mov($num,&wparam(2));
	&mov($w,&wparam(3));

	&and($num,0xfffffff8);		# num rounded down to a multiple of 8
	&jz(&label("mw_finish"));

	&set_label("mw_loop",0);
	foreach my $off (map { 4*$_ } 0..7)
		{
		&comment("Round $off");

		&mov("eax",&DWP($off,$ap,"",0));	# *a
		&mul($w);				# *a * w
		&add("eax",$carry);			# L(t)+=c
		&adc("edx",0);				# H(t)+=carry
		&mov(&DWP($off,$rp,"",0),"eax");	# *r= L(t);
		&mov($carry,"edx");			# c=  H(t);
		}

	&comment("");
	&add($ap,32);
	&add($rp,32);
	&sub($num,8);
	# Loop while words remain, then fall through into mw_finish.  This is
	# one conditional branch in place of a jz/jmp pair.
	&jnz(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));		# get num
	&and($num,7);
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	# Up to seven leftover words, straight-line.  $num counts them down and
	# every round but the last can leave early.
	&set_label("mw_finish2",1);
	foreach my $k (0..6)
		{
		my $last = ($k == 6);
		my $off = $k*4;

		&comment("Tail Round $k");
		&mov("eax",&DWP($off,$ap,"",0));	# *a
		&mul($w);				# *a * w
		&add("eax",$carry);			# L(t)+=c
		&adc("edx",0);				# H(t)+=carry
		&mov(&DWP($off,$rp,"",0),"eax");	# *r= L(t);
		&mov($carry,"edx");			# c=  H(t);
		&dec($num) unless $last;
		&jz(&label("mw_end")) unless $last;
		}
	&set_label("mw_end",0);
	&mov("eax",$carry);

	&function_end($name);
	}
321
# Emits bn_sqr_words.  For each of the num input words the generated code
# squares a[i] and stores the 64-bit product in r[2*i] (low half) and
# r[2*i+1] (high half).  No return value is set up.
#
# Stack arguments, as read through wparam(): 0 = r, 1 = a, 2 = num.
#
# When $sse2 is set, the generated code tests the SSE2 capability bit at run
# time and uses a pmuludq loop.  When $sse2 is unset or the bit is clear it
# uses the integer mul path.
#
# The SSE2 loop is bottom-tested, so num == 0 is checked up front.  Without
# the check a zero count would process one word and then keep counting down
# from 0xffffffff.  The integer path already does nothing for num == 0.
#
# The emitters are called with a leading '&' because several of them (push,
# sub, and) share their names with Perl built-ins.
sub bn_sqr_words
	{
	my ($name) = @_;

	&function_begin_B($name,"");

	if ($sse2)
		{
		my ($rp,$ap,$cnt) = ("eax","edx","ecx");

		# NOTE(review): picsetup/picsymbol are defined outside this file;
		# presumably they leave the address of OPENSSL_ia32cap_P in eax.
		&picsetup("eax");
		&picsymbol("eax", "OPENSSL_ia32cap_P", "eax");
		&bt(&DWP(0,"eax"),"\$IA32CAP_BIT0_SSE2");
		&jnc(&label("sqr_non_sse2"));

		&mov($rp,&wparam(0));
		&mov($ap,&wparam(1));
		&mov($cnt,&wparam(2));
		&test($cnt,$cnt);		# num == 0: nothing to do
		&jz(&label("sqr_sse2_exit"));

	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$ap));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($ap,&DWP(4,$ap));		# a++
		&movq(&QWP(0,$rp),"mm0");	# r[i] = a[i]*a[i]
		&sub($cnt,1);
		&lea($rp,&DWP(8,$rp));		# r += 2, flags from sub are kept
		&jnz(&label("sqr_sse2_loop"));

	&set_label("sqr_sse2_exit");
		&emms();
		&ret();
	&set_label("sqr_non_sse2",16);
		}

	# Integer path.  Hand-written copy of the function_begin prologue.
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	my ($rp,$ap,$num) = ("esi","edi","ebx");

	&mov($rp,&wparam(0));
	&mov($ap,&wparam(1));
	&mov($num,&wparam(2));

	&and($num,0xfffffff8);		# num rounded down to a multiple of 8
	&jz(&label("sw_finish"));

	&set_label("sw_loop",0);
	foreach my $off (map { 4*$_ } 0..7)
		{
		# Input word at byte offset $off, 64-bit result at 2*$off.
		&comment("Round $off");
		&mov("eax",&DWP($off,$ap,"",0));	# *a
		&mul("eax");				# *a * *a
		&mov(&DWP($off*2,$rp,"",0),"eax");	# low half
		&mov(&DWP($off*2+4,$rp,"",0),"edx");	# high half
		}

	&comment("");
	&add($ap,32);
	&add($rp,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));		# get num
	&and($num,7);
	&jz(&label("sw_end"));

	# Up to seven leftover words, straight-line.  $num counts them down and
	# every round but the last can leave early.
	foreach my $k (0..6)
		{
		my $last = ($k == 6);

		&comment("Tail Round $k");
		&mov("eax",&DWP($k*4,$ap,"",0));	# *a
		&mul("eax");				# *a * *a
		&mov(&DWP($k*8,$rp,"",0),"eax");	# low half
		&dec($num) unless $last;
		&mov(&DWP($k*8+4,$rp,"",0),"edx");	# high half, flags untouched
		&jz(&label("sw_end")) unless $last;
		}
	&set_label("sw_end",0);

	&function_end($name);
	}
411
# Emits bn_div_words: loads stack argument 0 into edx, argument 1 into eax
# and argument 2 into ecx, then issues a single 'div ecx'.  On x86 that
# divides edx:eax by ecx and leaves the quotient in eax, which is what the
# generated function returns.  No check on the divisor is emitted.
sub bn_div_words {
	my ($name) = @_;
	my ($high,$low,$divisor) = ("edx","eax","ecx");

	&function_begin_B($name,"");

	# Arguments are fetched in stack order: high word, low word, divisor.
	my $slot = 0;
	foreach my $reg ($high,$low,$divisor) {
		&mov($reg,&wparam($slot));
		$slot++;
	}

	&div($divisor);
	&ret();
	&function_end_B($name);
}
424
# Emits bn_add_words: r[i] = a[i] + b[i] + carry for num words, with the
# final carry (0 or 1) left in eax as the return value.
#
# Stack arguments, as read through wparam(): 0 = r, 1 = a, 2 = b, 3 = num.
#
# Words are handled eight at a time in an unrolled loop, followed by a
# straight-line tail of up to seven words.
sub bn_add_words {
	my ($name) = @_;

	&function_begin($name,"");

	&comment("");
	my ($ap,$bp,$rp)  = ("esi","edi","ebx");
	my ($carry,$num)  = ("eax","ebp");
	my ($word,$other) = ("ecx","edx");

	&mov($rp,&wparam(0));	# get r
	&mov($ap,&wparam(1));	# get a
	&mov($bp,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($carry,$carry);	# clear carry
	&and($num,0xfffffff8);	# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	# One word of the sum, minus the store.  $word = a + carry_in + b, and
	# $carry is rebuilt from the two carry flags: 'mov' leaves the flags
	# alone, so 'adc c,c' on a zeroed register captures the first carry and
	# 'adc c,0' adds the second.
	my $sum_word = sub {
		my ($off) = @_;
		&mov($word,&DWP($off,$ap,"",0));	# *a
		&mov($other,&DWP($off,$bp,"",0));	# *b
		&add($word,$carry);
		&mov($carry,0);
		&adc($carry,$carry);
		&add($word,$other);
		&adc($carry,0);
	};

	&set_label("aw_loop",0);
	foreach my $k (0..7) {
		&comment("Round $k");

		$sum_word->($k*4);
		&mov(&DWP($k*4,$rp,"",0),$word);	# *r
	}

	&comment("");
	&add($ap,32);
	&add($bp,32);
	&add($rp,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	# Leftover words.  $num counts them down; the dec sits before the store
	# so that the jz after the store still sees its flags.
	foreach my $k (0..6) {
		my $more = ($k != 6);

		&comment("Tail Round $k");
		$sum_word->($k*4);
		&dec($num) if $more;
		&mov(&DWP($k*4,$rp,"",0),$word);	# *r
		&jz(&label("aw_end")) if $more;
	}
	&set_label("aw_end",0);

	# The carry is already in eax, so no move is needed before returning.

	&function_end($name);
}
496
# Emits bn_sub_words: r[i] = a[i] - b[i] - borrow for num words, with the
# final borrow (0 or 1) left in eax as the return value.
#
# Stack arguments, as read through wparam(): 0 = r, 1 = a, 2 = b, 3 = num.
#
# Words are handled eight at a time in an unrolled loop, followed by a
# straight-line tail of up to seven words.
#
# The label names carry the same "aw_" prefix that bn_add_words uses.
# NOTE(review): presumably the perlasm back end scopes labels per function,
# so the two sets do not collide.  Confirm before renaming them.
sub bn_sub_words {
	my ($name) = @_;

	&function_begin($name,"");

	&comment("");
	my ($ap,$bp,$rp)  = ("esi","edi","ebx");
	my ($borrow,$num) = ("eax","ebp");
	my ($word,$other) = ("ecx","edx");

	&mov($rp,&wparam(0));	# get r
	&mov($ap,&wparam(1));	# get a
	&mov($bp,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($borrow,$borrow);	# clear borrow
	&and($num,0xfffffff8);	# num rounded down to a multiple of 8

	&jz(&label("aw_finish"));

	# One word of the difference, minus the store.  $word = a - borrow_in
	# - b, and $borrow is rebuilt from the two carry flags: 'mov' leaves
	# the flags alone, so 'adc c,c' on a zeroed register captures the first
	# borrow and 'adc c,0' adds the second.
	my $diff_word = sub {
		my ($off) = @_;
		&mov($word,&DWP($off,$ap,"",0));	# *a
		&mov($other,&DWP($off,$bp,"",0));	# *b
		&sub($word,$borrow);
		&mov($borrow,0);
		&adc($borrow,$borrow);
		&sub($word,$other);
		&adc($borrow,0);
	};

	&set_label("aw_loop",0);
	foreach my $k (0..7) {
		&comment("Round $k");

		$diff_word->($k*4);
		&mov(&DWP($k*4,$rp,"",0),$word);	# *r
	}

	&comment("");
	&add($ap,32);
	&add($bp,32);
	&add($rp,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	# Leftover words.  $num counts them down; the dec sits before the store
	# so that the jz after the store still sees its flags.
	foreach my $k (0..6) {
		my $more = ($k != 6);

		&comment("Tail Round $k");
		$diff_word->($k*4);
		&dec($num) if $more;
		&mov(&DWP($k*4,$rp,"",0),$word);	# *r
		&jz(&label("aw_end")) if $more;
	}
	&set_label("aw_end",0);

	# The borrow is already in eax, so no move is needed before returning.

	&function_end($name);
}
568