1183234Ssimon#!/usr/bin/env perl
2183234Ssimon#
3183234Ssimon# ====================================================================
4183234Ssimon# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5183234Ssimon# project. The module is, however, dual licensed under OpenSSL and
6183234Ssimon# CRYPTOGAMS licenses depending on where you obtain it. For further
7183234Ssimon# details see http://www.openssl.org/~appro/cryptogams/.
8183234Ssimon# ====================================================================
9183234Ssimon#
10183234Ssimon# sha1_block procedure for x86_64.
11183234Ssimon#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind the 32-bit assembler implementation. This is unlike
# Opteron, where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate the 32-bit code, but
# I dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While the 64-bit code does perform better
# on Opteron, I failed to beat the 32-bit assembler on the EM64T core.
# Well, x86_64 does offer a larger *addressable* bank, but an
# out-of-order core reaches for even more registers through dynamic
# aliasing, and the EM64T core must have managed to run-time optimize
# even 32-bit code just as well as 64-bit code. The performance
# improvement is summarized in the following table:
26183234Ssimon#
27183234Ssimon#		gcc 3.4		32-bit asm	cycles/byte
28183234Ssimon# Opteron	+45%		+20%		6.8
29183234Ssimon# Xeon P4	+65%		+0%		9.9
30183234Ssimon# Core2		+60%		+10%		7.0
31183234Ssimon
# The first command-line argument is handed on to x86_64-xlate.pl as its
# output argument.
$output=shift;

# Look for x86_64-xlate.pl next to this script first, then in the shared
# perlasm directory.  When $0 has no directory component the match fails,
# so use an explicit empty prefix instead of reading $1 after a failed match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# From here on everything printed to STDOUT is piped through the translator.
# Stop right away if the pipe cannot be set up rather than generating
# output that goes nowhere.
open STDOUT,"| $^X $xlate $output" or die "can't call $xlate: $!";

# The three arguments arrive in %rdi, %rsi and %rdx (1st, 2nd and 3rd
# argument).  PROLOGUE copies them into the registers below; they are
# reassigned in order to produce more compact code.
$ctx="%r8";
$inp="%r9";
$num="%r10";

# Scratch registers: $xi holds the current message schedule word, $t0 and
# $t1 hold intermediate values of the round function.
$xi="%eax";
$t0="%ebx";
$t1="%ecx";

# Five working variables plus one spare.  Each round writes its result into
# the spare register and the driver then rotates @V by one position, so no
# register-to-register moves are needed between rounds.
$A="%edx";
$B="%esi";
$C="%edi";
$D="%ebp";
$E="%r11d";
$T="%r12d";

@V=($A,$B,$C,$D,$E,$T);
61183234Ssimon
# Append the entry sequence of function $func to $code.
#
# The emitted code declares the symbol, saves %rbx, %rbp and %r12, copies
# the three incoming arguments into $ctx/$inp/$num, carves out a stack area
# aligned down to 64 bytes (16*4 bytes of schedule words followed by the
# saved stack pointer at offset 16*4) and loads the five state words from
# $ctx into $A..$E.  Backtick expressions are left in place; they are
# evaluated when the whole of $code is post-processed.
sub PROLOGUE {
    my ($name) = @_;

    my $entry = <<EOT;
.globl	$name
.type	$name,\@function,3
.align	16
$name:
	push	%rbx
	push	%rbp
	push	%r12
	mov	%rsp,%rax
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
EOT

    $code .= $entry;
}
87183234Ssimon
# Append the exit sequence of function $func to $code: reload %rsp from the
# slot at offset 16*4 of the aligned stack area, pop %r12, %rbp and %rbx,
# return, and close the symbol with a .size directive.
sub EPILOGUE {
    my ($name) = @_;

    my @lines = (
        "\tmov\t`16*4`(%rsp),%rsp",
        "\tpop\t%r12",
        "\tpop\t%rbp",
        "\tpop\t%rbx",
        "\tret",
        ".size\t$name,.-$name",
    );

    $code .= join("", map { "$_\n" } @lines);
}
99183234Ssimon
# Append the code for round $i (0..19) to $code.
#
# Arguments: the round number, the registers currently holding a, b, c, d
# and e, the spare register $f, and an optional $host flag.
#
# The emitted round computes
#     f = 0x5a827999 + X[i] + e + rol(a,5) + (((c ^ d) & b) ^ d)
# into $f, using $e as scratch for rol(a,5) and $t0 for the boolean term,
# and rotates b left by 30.  X[i] is expected in $xi on entry (round 0 loads
# it itself); on exit $xi holds X[i+1]:
#   - rounds 0..14 load word i+1 from $inp, byte-swap it unless $host is
#     defined, and save it at 4*(i+1)(%rsp);
#   - rounds 15..19 compute it as rol(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1)
#     with j=i+1 and all indices taken modulo 16 in the on-stack buffer.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
my $j=$i+1;
# Decide about the byte swap here, while $host is in scope.  Leaving the
# test to the deferred backtick evaluation does not work: the lexical $host
# is interpolated into the text first, so an undefined $host turns the
# guard into "defined()", which looks at $_ at post-processing time.
my $bswap=defined($host)?"":"bswap\t$xi";
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi
	$bswap
	mov	$xi,`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	lea	0x5a827999($xi,$e),$f
	mov	$c,$t0
	mov	`4*$j`($inp),$xi
	mov	$a,$e
	xor	$d,$t0
	$bswap
	rol	\$5,$e
	and	$b,$t0
	mov	$xi,`4*$j`(%rsp)
	add	$e,$f
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$f
___
$code.=<<___ if ($i>=15);
	lea	0x5a827999($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$c,$t0
	mov	$a,$e
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$5,$e
	xor	`4*(($j+8)%16)`(%rsp),$xi
	and	$b,$t0
	add	$e,$f
	xor	`4*(($j+13)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$f
	rol	\$1,$xi
	mov	$xi,`4*($j%16)`(%rsp)
___
}
142183234Ssimon
# Append the code for round $i to $code; the driver uses this generator for
# rounds 20..39 and 60..79.
#
# Arguments: the round number, the registers currently holding a, b, c, d
# and e, and the spare register that receives
#     f = K + X[i] + e + rol(a,5) + (b ^ c ^ d)
# where K is 0x6ed9eba1 for rounds below 40 and 0xca62c1d6 otherwise.  The
# e register is used as scratch for rol(a,5) and b is rotated left by 30.
#
# Rounds below 79 also compute the next schedule word into $xi as
# rol(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1) with j=i+1 and indices modulo 16;
# it is written back to the on-stack buffer only for rounds below 76.
# Round 79 emits the bare round with no schedule update, and nothing is
# emitted for round numbers above 79.
sub BODY_20_39 {
    my ($round, $va, $vb, $vc, $vd, $ve, $vf) = @_;
    my $next = $round + 1;
    my $k    = 0xca62c1d6;
    $k = 0x6ed9eba1 if ($round < 40);

    if ($round < 79) {
        $code .= <<EOT;
	lea	$k($xi,$ve),$vf
	mov	`4*($next%16)`(%rsp),$xi
	mov	$vc,$t0
	mov	$va,$ve
	xor	`4*(($next+2)%16)`(%rsp),$xi
	xor	$vb,$t0
	rol	\$5,$ve
	xor	`4*(($next+8)%16)`(%rsp),$xi
	xor	$vd,$t0
	add	$ve,$vf
	xor	`4*(($next+13)%16)`(%rsp),$xi
	rol	\$30,$vb
	add	$t0,$vf
	rol	\$1,$xi
EOT
        if ($round < 76) {
            $code .= <<EOT;
	mov	$xi,`4*($next%16)`(%rsp)
EOT
        }
    }
    elsif ($round == 79) {
        $code .= <<EOT;
	lea	$k($xi,$ve),$vf
	mov	$vc,$t0
	mov	$va,$ve
	xor	$vb,$t0
	rol	\$5,$ve
	xor	$vd,$t0
	add	$ve,$vf
	rol	\$30,$vb
	add	$t0,$vf
EOT
    }
}
178183234Ssimon
# Append the code for round $i to $code; the driver uses this generator for
# rounds 40..59.
#
# Arguments: the round number, the registers currently holding a, b, c, d
# and e, and the spare register that receives
#     f = 0x8f1bbcdc + X[i] + e + rol(a,5) + ((b & c) | ((b | c) & d))
# The e register is used as scratch for rol(a,5), $t0 and $t1 hold the two
# halves of the boolean term, and b is rotated left by 30.  The next
# schedule word rol(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1), with j=i+1 and
# indices modulo 16, is left in $xi and written back to the on-stack buffer.
sub BODY_40_59 {
    my $round = shift;
    my ($va, $vb, $vc, $vd, $ve, $vf) = @_;
    my $next = $round + 1;

    my $asm = <<EOT;
	lea	0x8f1bbcdc($xi,$ve),$vf
	mov	`4*($next%16)`(%rsp),$xi
	mov	$vb,$t0
	mov	$vb,$t1
	xor	`4*(($next+2)%16)`(%rsp),$xi
	mov	$va,$ve
	and	$vc,$t0
	xor	`4*(($next+8)%16)`(%rsp),$xi
	or	$vc,$t1
	rol	\$5,$ve
	xor	`4*(($next+13)%16)`(%rsp),$xi
	and	$vd,$t1
	add	$ve,$vf
	rol	\$1,$xi
	or	$t1,$t0
	rol	\$30,$vb
	mov	$xi,`4*($next%16)`(%rsp)
	add	$t0,$vf
EOT

    $code .= $asm;
}
203183234Ssimon
# Assemble the whole module in $code: prologue, one fully unrolled 80-round
# block loop, epilogue and an identification string.
$code=".text\n";

PROLOGUE("sha1_block_data_order");
$code.=".align	4\n.Lloop:\n";
# After every round the register list is rotated right by one position, so
# the register that just received the round result becomes "a" for the next
# round and the old "e" register becomes the new spare.  $i deliberately
# carries over from one loop to the next.
for($i=0;$i<20;$i++)	{ BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# 80 rotations of the six-entry list amount to a rotation by two, so at this
# point a..e live in $E,$T,$A,$B,$C.  The code below adds the previous state
# to them, stores the result, and uses four xchg instructions to bring the
# values back into $A..$E for the next block.  Then it advances $inp by 64
# bytes and loops until $num reaches zero.
$code.=<<___;
	add	0($ctx),$E
	add	4($ctx),$T
	add	8($ctx),$A
	add	12($ctx),$B
	add	16($ctx),$C
	mov	$E,0($ctx)
	mov	$T,4($ctx)
	mov	$A,8($ctx)
	mov	$B,12($ctx)
	mov	$C,16($ctx)

	xchg	$E,$A	# mov	$E,$A
	xchg	$T,$B	# mov	$T,$B
	xchg	$E,$C	# mov	$A,$C
	xchg	$T,$D	# mov	$B,$D
			# mov	$C,$E
	lea	`16*4`($inp),$inp
	sub	\$1,$num
	jnz	.Lloop
___
EPILOGUE("sha1_block_data_order");
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

####################################################################

# Replace every `...` span with the value of the Perl expression inside it
# (constant offsets such as 16*4), then hand the text to the translator.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to the translator: close() waits for it and reports a
# non-zero exit status, which would otherwise leave a truncated or empty
# output file behind without any error.
close STDOUT or die "error closing pipe to $xlate: ".($! ? $! : "exit status $?");
243