sha1-586.pl revision 160815
1#!/usr/local/bin/perl
2
3# It was noted that Intel IA-32 C compiler generates code which
4# performs ~30% *faster* on P4 CPU than original *hand-coded*
5# SHA1 assembler implementation. To address this problem (and
6# prove that humans are still better than machines:-), the
7# original code was overhauled, which resulted in following
8# performance changes:
9#
10#		compared with original	compared with Intel cc
11#		assembler impl.		generated code
12# Pentium	-16%			+48%
13# PIII/AMD	+8%			+16%
14# P4		+85%(!)			+45%
15#
# As you can see Pentium came out as loser:-( Yet I reckoned that
# improvement on P4 outweighs the loss and incorporated this
# re-tuned code to 0.9.7 and later.
19# ----------------------------------------------------------------
20# Those who for any particular reason absolutely must score on
21# Pentium can replace this module with one from 0.9.6 distribution.
22# This "offer" shall be revoked the moment programming interface to
23# this module is changed, in which case this paragraph should be
24# removed.
25# ----------------------------------------------------------------
26#					<appro@fy.chalmers.se>
27
# When non-zero, sha1_block_data does not emit its "start" loop label.
$normal=0;

# Make the perlasm helpers findable, then load the x86 assembler front end.
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";

# The first command-line argument is handed to asm_init unchanged; the
# third asm_init argument is true only when the last command-line
# argument is the literal string "386".
&asm_init($ARGV[0],"sha1-586.pl",$ARGV[-1] eq "386");

# Register names used for the working variables A..E, the extra
# rotating slot T, and the scratch register tmp1.
($A,$B,$C,$D,$E,$T,$tmp1)=("eax","ecx","ebx","edx","edi","esi","ebp");

# Not referenced anywhere in the visible part of this file.
$off=9*4;

# Additive constants handed to the round generators: $K[0] for rounds
# 0-19, $K[1] for 20-39, $K[2] for 40-59 and $K[3] for 60-79.
@K=(
	0x5a827999,
	0x6ed9eba1,
	0x8f1bbcdc,
	0xca62c1d6,
);

# Generate the data-order entry point (which in turn generates the
# host-order one) and finish the output.
&sha1_block_data("sha1_block_asm_data_order");

&asm_finish();
50
# Nn(reg) - return the register that sits one position before `reg` in
# the cyclic order A,B,C,D,E,T (A maps to T, B to A, ... T to E).
# Yields undef when `reg` is not one of those six register names.
# Not called anywhere in the visible part of this file.
sub Nn
	{
	my ($reg)=@_;
	my %before=(
		$A => $T,
		$B => $A,
		$C => $B,
		$D => $C,
		$E => $D,
		$T => $E,
	);
	return $before{$reg};
	}
57
# Np(reg) - return the register that sits one position after `reg` in
# the cyclic order A,B,C,D,E,T (A maps to B, B to C, ... T to A).
# Yields undef when `reg` is not one of those six register names.
# Not called anywhere in the visible part of this file.
#
# The earlier version first filled the table with the Nn mapping and
# then immediately replaced it; only the second table ever took effect,
# so the dead first assignment has been dropped.
sub Np
	{
	my ($reg)=@_;
	my %after=(
		$A => $B,
		$B => $C,
		$C => $D,
		$D => $E,
		$E => $T,
		$T => $A,
	);
	return $after{$reg};
	}
65
# Na(n) - for round number `n`, return five slot indices into the
# 16-entry X array, each reduced modulo 16:
#   n, n+2, n+8, n+13 and n+1.
# The round generators xor the first four slots together and write the
# rotated result back into the first one; they do not use the fifth.
sub Na
	{
	my ($round)=@_;
	my @slots=map { ($round+$_)&0x0f } (0,2,8,13,1);

	# Return through a slice so that a scalar-context caller still gets
	# the last element, exactly as a parenthesised list would give.
	return @slots[0..$#slots];
	}
75
# X_expand(in) - emit code that copies sixteen 32-bit words from the
# memory addressed by register `in` into stack slots swtmp(0..15),
# byte-swapping each word on the way. Words are handled two at a time
# through $A and $B, so both registers are clobbered by the emitted
# code. The global $i is used as the loop counter and is left at 16.
sub X_expand
	{
	my ($src)=@_;

	&comment("First, load the words onto the stack in network byte order");

	$i=0;
	while ($i<16)
		{
		my ($even,$odd)=($i,$i+1);

		# Loads, swaps and stores of the two words are interleaved.
		&mov($A,&DWP($even*4,$src,"",0));
		&mov($B,&DWP($odd*4,$src,"",0));
		&bswap($A);
		&bswap($B);
		&mov(&swtmp($even),$A);
		&mov(&swtmp($odd),$B);

		$i+=2;
		}

	&comment("We now have the X array on the stack");
	&comment("starting at sp-4");
	}
94
95# Rules of engagement
96# F is always trashable at the start, the running total.
97# E becomes the next F so it can be trashed after it has been 'accumulated'
98# F becomes A in the next round.  We don't need to access it much.
99# During the X update part, the result ends up in $X[$n0].
100
# BODY_00_15(pos,K,X,n,a,b,c,d,e,f) - emit one of rounds 0..15.
#
#   pos, X  accepted for signature compatibility, not used here
#   K       additive constant folded in through lea
#   n       round number; also the swtmp slot the input word is read from
#   a..f    register names for this round's working variables
#
# The emitted code computes ((c^d)&b)^d in f, rotates b right by 2, and
# sums ROTATE(a,5) + e + K + X[n] in the scratch register $tmp1.
# Where the running sum lives between rounds:
#   - round 0 copies a into $tmp1 before rotating;
#   - later rounds expect the previous round's result in $tmp1 and copy
#     it into a first;
#   - rounds other than 15 leave the final sum in $tmp1, round 15 leaves
#     it in f.
# The order of the emitted instructions is kept exactly as scheduled.
sub BODY_00_15
	{
	my (undef,$k,undef,$round,$ra,$rb,$rc,$rd,$re,$rf)=@_;

	&comment("00_15 $round");

	&mov($rf,$rc);				# f = c
	if ($round!=0)	{ &mov($ra,$tmp1); }	# pick up previous result
	else		{ &mov($tmp1,$ra); }	# very first round: seed tmp1
	&rotl($tmp1,5);				# tmp1 = ROTATE(a,5)
	&xor($rf,$rd);				# f = c^d
	&and($rf,$rb);				# f = (c^d)&b
	&add($tmp1,$re);			# tmp1 += e
	&mov($re,&swtmp($round));		# e is free now: load X[n]
	&xor($rf,$rd);				# f = ((c^d)&b)^d
	&rotr($rb,2);				# b = ROTATE(b,30)
	&lea($tmp1,&DWP($k,$tmp1,$re,1));	# tmp1 += K + X[n]

	# Only the last round of this group accumulates into f.
	my @sum=($round==15) ? ($rf,$tmp1) : ($tmp1,$rf);
	&add(@sum);
	}
123
# BODY_16_19(pos,K,X,n,a,b,c,d,e,f) - emit one of rounds 16..19.
#
#   pos, X  accepted for signature compatibility, not used here
#   K       additive constant folded in through lea
#   n       round number; Na(n) supplies the X-array slots involved
#   a..f    register names for this round's working variables
#
# The emitted code
#   - xors four X slots into f, rotates f left by 1 and stores it back
#     into the first slot,
#   - computes ((c^d)&b)^d in $tmp1,
#   - rotates b right by 2,
#   - leaves f = f + K + e + $tmp1 + ROTATE(a,5); e is overwritten with
#     ROTATE(a,5) in the process.
# The order of the emitted instructions is kept exactly as scheduled.
sub BODY_16_19
	{
	my (undef,$k,undef,$round,$ra,$rb,$rc,$rd,$re,$rf)=@_;
	my ($s0,$s1,$s2,$s3)=&Na($round);

	&comment("16_19 $round");

	&mov($rf,&swtmp($s1));			# f starts the X update
	&mov($tmp1,$rc);			# tmp1 = c
	&xor($rf,&swtmp($s0));
	&xor($tmp1,$rd);			# tmp1 = c^d
	&xor($rf,&swtmp($s2));
	&and($tmp1,$rb);			# tmp1 = (c^d)&b
	&rotr($rb,2);				# b = ROTATE(b,30)
	&xor($rf,&swtmp($s3));			# f = xor of the four slots
	&rotl($rf,1);				# f = ROTATE(f,1)
	&xor($tmp1,$rd);			# tmp1 = ((c^d)&b)^d
	&mov(&swtmp($s0),$rf);			# store the new X word
	&lea($rf,&DWP($k,$rf,$re,1));		# f += K + e
	&mov($re,$ra);				# e is free now
	&rotl($re,5);				# e = ROTATE(a,5)
	&add($rf,$tmp1);			# f += tmp1
	&add($rf,$re);				# f += ROTATE(a,5)
	}
148
# BODY_20_39(pos,K,X,n,a,b,c,d,e,f) - emit one round of the xor-based
# stage (also reused for rounds 60..79 through BODY_60_79).
#
#   pos, X  accepted for signature compatibility, not used here
#   K       additive constant folded in through lea
#   n       round number; Na(n) supplies the X-array slots involved
#   a..f    register names for this round's working variables
#
# The emitted code
#   - xors four X slots into f, rotates f left by 1 and stores it back
#     into the first slot,
#   - computes b^c^d in $tmp1 (b is read before it is rotated) and adds
#     e to it,
#   - rotates b right by 2,
#   - leaves f = f + K + $tmp1 + ROTATE(a,5); e is overwritten with
#     ROTATE(a,5) in the process.
# The order of the emitted instructions is kept exactly as scheduled.
sub BODY_20_39
	{
	my (undef,$k,undef,$round,$ra,$rb,$rc,$rd,$re,$rf)=@_;
	my ($s0,$s1,$s2,$s3)=&Na($round);

	&comment("20_39 $round");

	&mov($tmp1,$rb);			# tmp1 = b
	&mov($rf,&swtmp($s0));			# f starts the X update
	&rotr($rb,2);				# b = ROTATE(b,30)
	&xor($rf,&swtmp($s1));
	&xor($tmp1,$rc);			# tmp1 = b^c
	&xor($rf,&swtmp($s2));
	&xor($tmp1,$rd);			# tmp1 = b^c^d
	&xor($rf,&swtmp($s3));			# f = xor of the four slots
	&rotl($rf,1);				# f = ROTATE(f,1)
	&add($tmp1,$re);			# tmp1 += e
	&mov(&swtmp($s0),$rf);			# store the new X word
	&mov($re,$ra);				# e is free now
	&rotl($re,5);				# e = ROTATE(a,5)
	&lea($rf,&DWP($k,$rf,$tmp1,1));		# f += K + tmp1
	&add($rf,$re);				# f += ROTATE(a,5)
	}
172
# BODY_40_59(pos,K,X,n,a,b,c,d,e,f) - emit one of rounds 40..59.
#
#   pos, X  accepted for signature compatibility, not used here
#   K       additive constant folded in through lea
#   n       round number; Na(n) supplies the X-array slots involved
#   a..f    register names for this round's working variables
#
# The emitted code
#   - xors four X slots into f (each extra slot is first loaded into
#     $tmp1), rotates f left by 1 and stores it back into the first slot,
#   - computes ((b|c)&d)|(b&c) in $tmp1, borrowing e for the b&c term
#     once e has been added into f; b is read before it is rotated,
#   - rotates b right by 2,
#   - leaves f = f + K + e + $tmp1 + ROTATE(a,5); e ends up holding
#     ROTATE(a,5).
# The order of the emitted instructions is kept exactly as scheduled.
sub BODY_40_59
	{
	my (undef,$k,undef,$round,$ra,$rb,$rc,$rd,$re,$rf)=@_;
	my ($s0,$s1,$s2,$s3)=&Na($round);

	&comment("40_59 $round");

	&mov($rf,&swtmp($s0));			# f starts the X update
	&mov($tmp1,&swtmp($s1));
	&xor($rf,$tmp1);
	&mov($tmp1,&swtmp($s2));
	&xor($rf,$tmp1);
	&mov($tmp1,&swtmp($s3));
	&xor($rf,$tmp1);			# f = xor of the four slots
	&mov($tmp1,$rb);			# tmp1 = b
	&rotl($rf,1);				# f = ROTATE(f,1)
	&or($tmp1,$rc);				# tmp1 = b|c
	&mov(&swtmp($s0),$rf);			# store the new X word
	&and($tmp1,$rd);			# tmp1 = (b|c)&d
	&lea($rf,&DWP($k,$rf,$re,1));		# f += K + e
	&mov($re,$rb);				# e is free now: e = b
	&rotr($rb,2);				# b = ROTATE(b,30)
	&and($re,$rc);				# e = b&c (pre-rotation b)
	&or($tmp1,$re);				# tmp1 = ((b|c)&d)|(b&c)
	&mov($re,$ra);
	&rotl($re,5);				# e = ROTATE(a,5)
	&add($rf,$tmp1);			# f += tmp1
	&add($rf,$re);				# f += ROTATE(a,5)
	}
203
# BODY_60_79(...) - rounds 60..79 emit exactly the same instruction
# pattern as rounds 20..39, so all arguments are forwarded untouched to
# BODY_20_39 (the caller supplies the stage's own constant K).
sub BODY_60_79
	{
	return &BODY_20_39(@_);
	}
208
# sha1_block_host(name, sclabel) - emit the host-order entry point.
#
#   name     symbol name of the generated function
#   sclabel  label inside the data-order function to jump to once the
#            input has been copied
#
# The generated function takes three word parameters:
#   wparam(0)  pointer to the hash state; the words read here are
#              C at offset 8, D at 12 and E at 16
#   wparam(1)  pointer to the input words
#   wparam(2)  count, shifted left by 6 and added to the input pointer
#              to form the end pointer kept in swtmp(17)
# It saves esi/ebp/ebx/edi, reserves scratch space with
# stack_push(18+9), copies sixteen input words into swtmp(0..15)
# without byte-swapping, and then jumps to `sclabel`.
#
# NOTE(review): the prologue mirrors sha1_block_data instruction for
# instruction, presumably so the frame matches at `sclabel` - confirm
# before changing either one.
# NOTE(review): the loop-back branch in sha1_block_data targets its
# "start" label, which byte-swaps; it looks like any block after the
# first would be treated as data-order - confirm callers only pass one.
sub sha1_block_host
	{
	my ($name,$sclabel)=@_;

	&function_begin_B($name,"");

	# Prologue: parameter loads are interleaved with the register
	# saves; the emit order below is kept exactly as scheduled.
	&mov("ecx",&wparam(2));
	&push("esi");
	&shl("ecx",6);
	&mov("esi",&wparam(1));
	&push("ebp");
	&add("ecx","esi");			# ecx = end of input
	&push("ebx");
	&mov("ebp",&wparam(0));
	&push("edi");
	&mov($D,&DWP(12,"ebp","",0));
	&stack_push(18+9);
	&mov($E,&DWP(16,"ebp","",0));
	&mov($C,&DWP(8,"ebp","",0));
	&mov(&swtmp(17),"ecx");

	&comment("First we need to setup the X array");

	# Plain copy, two words per iteration via $A and $B. The global $i
	# is the loop counter and is left at 16.
	$i=0;
	while ($i<16)
		{
		my ($even,$odd)=($i,$i+1);

		&mov($A,&DWP($even*4,"esi","",0));
		&mov($B,&DWP($odd*4,"esi","",0));
		&mov(&swtmp($even),$A);
		&mov(&swtmp($odd),$B);

		$i+=2;
		}

	&jmp($sclabel);
	&function_end_B($name);
	}
249
250
# sha1_block_data(name) - emit the data-order block function `name`,
# followed by the host-order entry point "sha1_block_asm_host_order".
#
# The generated function takes three word parameters:
#   wparam(0)  pointer to the hash state, with words A at offset 0,
#              B at 4, C at 8, D at 12 and E at 16
#   wparam(1)  pointer to the input; rewritten with the current block
#              address on every pass
#   wparam(2)  count, shifted left by 6 and added to the input pointer
#              to form the end pointer kept in swtmp(17)
#
# Layout of the generated code:
#   prologue  save esi/ebp/ebx/edi, reserve scratch space with
#             stack_push(18+9), preload C, D and E
#   "start"   (emitted unless $normal is set) X_expand copies the
#             current block into swtmp(0..15) with byte-swapping
#   "shortcut" load A and B, then 80 rounds
#   epilogue  add the round results into the state, advance esi by 64
#             and branch back to "start" while esi is below the end
#             pointer; otherwise unwind and return
sub sha1_block_data
	{
	my ($name)=@_;

	&function_begin_B($name,"");

	# Prologue: parameter loads are interleaved with the register
	# saves; the emit order below is kept exactly as scheduled.
	&mov("ecx",&wparam(2));
	&push("esi");
	&shl("ecx",6);
	&mov("esi",&wparam(1));
	&push("ebp");
	&add("ecx","esi");			# ecx = end of input
	&push("ebx");
	&mov("ebp",&wparam(0));
	&push("edi");
	&mov($D,&DWP(12,"ebp","",0));
	&stack_push(18+9);
	&mov($E,&DWP(16,"ebp","",0));
	&mov($C,&DWP(8,"ebp","",0));
	&mov(&swtmp(17),"ecx");

	&comment("First we need to setup the X array");

	if (!$normal)
		{
		&set_label("start");
		}

	&X_expand("esi");
	&mov(&wparam(1),"esi");

	&set_label("shortcut", 0, 1);
	&comment("");
	&comment("Start processing");

	&mov($A,&DWP(0,"ebp","",0));
	&mov($B,&DWP(4,"ebp","",0));
	$X="esp";

	# First argument handed to the round generators. It is non-zero only
	# on the first and last round of each stage; the generators visible
	# in this file ignore it, but the values are kept as they were.
	my %edge_pos=(
		 0 => -2, 15 => 1,
		16 => -1, 19 => 1,
		20 => -1, 39 => 1,
		40 => -1, 59 => 1,
		60 => -1, 79 => 2,
	);

	# Each round's result register becomes the next round's `a`, so the
	# six register names rotate one place to the right after every round.
	my @regs=($A,$B,$C,$D,$E,$T);
	for my $round (0..79)
		{
		my $pos=exists($edge_pos{$round}) ? $edge_pos{$round} : 0;

		if    ($round<16) { &BODY_00_15($pos,$K[0],$X,$round,@regs); }
		elsif ($round<20) { &BODY_16_19($pos,$K[0],$X,$round,@regs); }
		elsif ($round<40) { &BODY_20_39($pos,$K[1],$X,$round,@regs); }
		elsif ($round<60) { &BODY_40_59($pos,$K[2],$X,$round,@regs); }
		else              { &BODY_60_79($pos,$K[3],$X,$round,@regs); }

		unshift(@regs,pop(@regs));
		}

	&comment("End processing");
	&comment("");

	# After round 79 the working variables live in these registers:
	#   new A in $E, new B in $T, new C in $A, new D in $B, new E in $C
	# ($D is free and is used as the first temporary below).
	&mov($tmp1,&wparam(0));

	&mov($D,&DWP(12,$tmp1,"",0));
	&add($D,$B);				# state D += new D
	&mov($B,&DWP(4,$tmp1,"",0));
	&add($B,$T);				# state B += new B
	&mov($T,$A);				# park new C in $T
	&mov($A,&DWP(0,$tmp1,"",0));
	&mov(&DWP(12,$tmp1,"",0),$D);

	&add($A,$E);				# state A += new A
	&mov($E,&DWP(16,$tmp1,"",0));
	&add($E,$C);				# state E += new E
	&mov($C,&DWP(8,$tmp1,"",0));
	&add($C,$T);				# state C += new C

	# Store the remaining words while advancing to the next block.
	&mov(&DWP(0,$tmp1,"",0),$A);
	&mov("esi",&wparam(1));
	&mov(&DWP(8,$tmp1,"",0),$C);
	&add("esi",64);
	&mov("eax",&swtmp(17));			# end pointer
	&mov(&DWP(16,$tmp1,"",0),$E);
	&cmp("esi","eax");
	&mov(&DWP(4,$tmp1,"",0),$B);
	&jb(&label("start"));

	&stack_pop(18+9);
	&pop("edi");
	&pop("ebx");
	&pop("ebp");
	&pop("esi");
	&ret();

	# Fetch the shortcut label before the function is closed so it can
	# be handed to the host-order entry point generated next.
	my $sclabel=&label("shortcut");

	&function_end_B($name);
	# Putting this here avoids problems with MASM in debugging mode
	&sha1_block_host("sha1_block_asm_host_order",$sclabel);
	}
430
431