• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /asuswrt-rt-n18u-9.0.0.4.380.2695/release/src-rt-6.x.4708/router/openssl-1.0.0q/crypto/sha/asm/
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# sha1_block procedure for x86_64.
11#
12# It was brought to my attention that on EM64T compiler-generated code
13# was far behind 32-bit assembler implementation. This is unlike on
14# Opteron where compiler-generated code was only 15% behind 32-bit
15# assembler, which originally made it hard to motivate the effort.
16# There was suggestion to mechanically translate 32-bit code, but I
17# dismissed it, reasoning that x86_64 offers enough register bank
18# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
20# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21# x86_64 does offer larger *addressable* bank, but out-of-order core
22# reaches for even more registers through dynamic aliasing, and EM64T
23# core must have managed to run-time optimize even 32-bit code just as
24# good as 64-bit one. Performance improvement is summarized in the
25# following table:
26#
27#		gcc 3.4		32-bit asm	cycles/byte
28# Opteron	+45%		+20%		6.8
29# Xeon P4	+65%		+0%		9.9
30# Core2		+60%		+10%		7.0
31
# Command line is "[flavour] output"; a lone argument containing a dot is
# taken to be the output file name rather than a flavour.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 assembler flavours (or an .asm output file) enable the extra
# exception-handler code emitted near the end of this script.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of this script's own path, including the trailing
# separator; explicitly empty when the script was invoked without one
# instead of depending on whatever $1 happened to hold.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The translator path is quoted so that directories containing spaces work,
# and failure to start the pipe is fatal instead of being silently ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;
45
# Registers in which the 1st, 2nd and 3rd argument arrive ...
($ctx,$inp,$num)=("%rdi","%rsi","%rdx");

# ... reassigned in order to produce more compact code.
($ctx,$inp,$num)=("%r8","%r9","%r10");

# Current message word and two temporaries.
($xi,$t0,$t1)=("%eax","%ebx","%ecx");

# Working registers; @V is the rotating view of them handed to the round
# generators.
($A,$B,$C,$D,$E,$T)=("%edx","%esi","%edi","%ebp","%r11d","%r12d");

@V=($A,$B,$C,$D,$E,$T);
66
# PROLOGUE($name) - append the entry sequence of function $name to $code.
#
# The emitted code saves %rbx, %rbp and %r12, copies the three incoming
# arguments into $ctx/$inp/$num, reserves `8+16*4` bytes of stack aligned
# down to 64 bytes, stores the pre-adjustment stack pointer at `16*4`(%rsp)
# and finally loads five 32-bit words from $ctx into $A..$E.
sub PROLOGUE {
my ($name)=@_;
$code.=<<___;
.globl	$name
.type	$name,\@function,3
.align	16
$name:
	push	%rbx
	push	%rbp
	push	%r12
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
___
}
93
# EPILOGUE($name) - append the exit sequence of function $name to $code.
#
# The emitted code fetches the stack pointer value stored at `16*4`(%rsp),
# reloads %r12, %rbp and %rbx from the three quadwords it points at, sets
# %rsp just past them and returns.
sub EPILOGUE {
my ($name)=@_;
$code.=<<___;
	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue:
	ret
.size	$name,.-$name
___
}
107
# BODY_00_19($i,$a,$b,$c,$d,$e,$f[,$host]) - append round $i (0..19) to $code.
#
# The six register names are the current view of the rotating working set.
# The emitted round computes
#	f = 0x5a827999 + X[i] + e + rol(a,5) + (((c^d)&b)^d)
#	b = rol(b,30)
# and leaves rol(a,5) in e.  For $i<15 the next input word is fetched from
# $inp and stored in the 16-word buffer at (%rsp); for $i>=15 the next word
# is derived from the buffer as rol(X[j]^X[j+2]^X[j+8]^X[j+13],1), indices
# taken modulo 16, with j=$i+1.
#
# $host is optional: when it is defined the input is taken as is, otherwise
# every loaded word is byte-swapped.
sub BODY_00_19 {
my ($i,$va,$vb,$vc,$vd,$ve,$vf,$host)=@_;
my $j=$i+1;
# Decide about byte-swapping right here.  Deferring the test into a `...`
# expression does not work: $host would be interpolated into the text
# first, so the expression evaluated later no longer refers to the argument.
my $bswap=defined($host)?"":"bswap\t$xi";

if ($i==0) {
# very first round also has to fetch X[0]
$code.=<<___;
	mov	`4*$i`($inp),$xi
	$bswap
	mov	$xi,`4*$i`(%rsp)
___
}
if ($i<15) {
$code.=<<___;
	lea	0x5a827999($xi,$ve),$vf
	mov	$vc,$t0
	mov	`4*$j`($inp),$xi
	mov	$va,$ve
	xor	$vd,$t0
	$bswap
	rol	\$5,$ve
	and	$vb,$t0
	mov	$xi,`4*$j`(%rsp)
	add	$ve,$vf
	xor	$vd,$t0
	rol	\$30,$vb
	add	$t0,$vf
___
} else {
$code.=<<___;
	lea	0x5a827999($xi,$ve),$vf
	mov	`4*($j%16)`(%rsp),$xi
	mov	$vc,$t0
	mov	$va,$ve
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$vd,$t0
	rol	\$5,$ve
	xor	`4*(($j+8)%16)`(%rsp),$xi
	and	$vb,$t0
	add	$ve,$vf
	xor	`4*(($j+13)%16)`(%rsp),$xi
	xor	$vd,$t0
	rol	\$30,$vb
	add	$t0,$vf
	rol	\$1,$xi
	mov	$xi,`4*($j%16)`(%rsp)
___
}
}
150
# BODY_20_39($i,$a,$b,$c,$d,$e,$f) - append round $i to $code; used for
# rounds 20..39 as well as 60..79.
#
# The emitted round computes
#	f = K + X[i] + e + rol(a,5) + (b^c^d)
#	b = rol(b,30)
# with K being 0x6ed9eba1 below round 40 and 0xca62c1d6 from there on.
# Up to round 78 the next schedule word is derived as well, but it is
# written back to the buffer at (%rsp) only while $i<76.  Round 79 emits
# the bare round function.
sub BODY_20_39 {
my ($i,$va,$vb,$vc,$vd,$ve,$vf)=@_;
my $j=$i+1;
my $K=0xca62c1d6;
$K=0x6ed9eba1 if ($i<40);

if ($i<79) {
$code.=<<___;
	lea	$K($xi,$ve),$vf
	mov	`4*($j%16)`(%rsp),$xi
	mov	$vc,$t0
	mov	$va,$ve
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$vb,$t0
	rol	\$5,$ve
	xor	`4*(($j+8)%16)`(%rsp),$xi
	xor	$vd,$t0
	add	$ve,$vf
	xor	`4*(($j+13)%16)`(%rsp),$xi
	rol	\$30,$vb
	add	$t0,$vf
	rol	\$1,$xi
___
$code.=<<___ if ($i<76);
	mov	$xi,`4*($j%16)`(%rsp)
___
} elsif ($i==79) {
$code.=<<___;
	lea	$K($xi,$ve),$vf
	mov	$vc,$t0
	mov	$va,$ve
	xor	$vb,$t0
	rol	\$5,$ve
	xor	$vd,$t0
	add	$ve,$vf
	rol	\$30,$vb
	add	$t0,$vf
___
}
}
186
# BODY_40_59($i,$a,$b,$c,$d,$e,$f) - append round $i (40..59) to $code.
#
# The emitted round computes
#	f = 0x8f1bbcdc + X[i] + e + rol(a,5) + ((b&c)|((b|c)&d))
#	b = rol(b,30)
# and derives the next schedule word rol(X[j]^X[j+2]^X[j+8]^X[j+13],1),
# indices modulo 16 with j=$i+1, storing it back to the buffer at (%rsp).
sub BODY_40_59 {
my ($i,$va,$vb,$vc,$vd,$ve,$vf)=@_;
my $j=$i+1;
$code.=<<___;
	lea	0x8f1bbcdc($xi,$ve),$vf
	mov	`4*($j%16)`(%rsp),$xi
	mov	$vb,$t0
	mov	$vb,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi
	mov	$va,$ve
	and	$vc,$t0
	xor	`4*(($j+8)%16)`(%rsp),$xi
	or	$vc,$t1
	rol	\$5,$ve
	xor	`4*(($j+13)%16)`(%rsp),$xi
	and	$vd,$t1
	add	$ve,$vf
	rol	\$1,$xi
	or	$t1,$t0
	rol	\$30,$vb
	mov	$xi,`4*($j%16)`(%rsp)
	add	$t0,$vf
___
}
211
$code=".text\n";

PROLOGUE("sha1_block_data_order");
$code.=".align	4\n.Lloop:\n";

# Emit the 80 rounds.  Rounds 20..39 and 60..79 share one generator.  After
# every round the register list is rotated right by one position, so that
# the register which just received the round result is passed first next
# time.
for ($i=0;$i<80;$i++) {
	my $round = $i<20 ? \&BODY_00_19
		  : $i<40 ? \&BODY_20_39
		  : $i<60 ? \&BODY_40_59
		  :         \&BODY_20_39;
	$round->($i,@V);
	unshift(@V,pop(@V));
}

# 80 rotations of a six-element list leave it rotated by two, hence the
# five results sit in $E,$T,$A,$B,$C.  They are added to the context words,
# stored back and shuffled into $A..$E for the next block; then the input
# pointer advances by 64 bytes and the block counter is decremented.
$code.=<<___;
	add	0($ctx),$E
	add	4($ctx),$T
	add	8($ctx),$A
	add	12($ctx),$B
	add	16($ctx),$C
	mov	$E,0($ctx)
	mov	$T,4($ctx)
	mov	$A,8($ctx)
	mov	$B,12($ctx)
	mov	$C,16($ctx)

	xchg	$E,$A	# mov	$E,$A
	xchg	$T,$B	# mov	$T,$B
	xchg	$E,$C	# mov	$A,$C
	xchg	$T,$D	# mov	$B,$D
			# mov	$C,$E
	lea	`16*4`($inp),$inp
	sub	\$1,$num
	jnz	.Lloop
___
EPILOGUE("sha1_block_data_order");
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___
246
# Win64 only: emit an exception handler together with the .pdata/.xdata
# records that attach it to sha1_block_data_order.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
# registers carrying the four handler arguments
($rec,$frame,$context,$disp)=("%rcx","%rdx","%r8","%r9");

# When the faulting Rip lies between .Lprologue and .Lepilogue the handler
# recovers the stack pointer saved at `16*4`(%rsp) and from it the saved
# %rbx, %rbp and %r12; in every case it then fixes up Rsp/Rsi/Rdi in the
# context, copies the context to disp->ContextRecord and calls
# RtlVirtualUnwind before returning ExceptionContinueSearch.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*4`(%rax),%rax	# pull saved stack pointer
	lea	24(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order

.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
}
347
348####################################################################
349
# Replace every `...` expression embedded in the generated text by the
# result of evaluating it as Perl, then emit the text.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# The output may be a pipe to another process, in which case write errors
# and a failing reader are reported only by close(); treat either as fatal
# rather than leaving truncated output behind unnoticed.
close STDOUT or die "error closing STDOUT: $! (exit status $?)";
353