1183234Ssimon#!/usr/bin/env perl
2183234Ssimon#
3183234Ssimon# ====================================================================
4183234Ssimon# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5183234Ssimon# project. The module is, however, dual licensed under OpenSSL and
6183234Ssimon# CRYPTOGAMS licenses depending on where you obtain it. For further
7183234Ssimon# details see http://www.openssl.org/~appro/cryptogams/.
8183234Ssimon# ====================================================================
9183234Ssimon#
10183234Ssimon# sha1_block procedure for x86_64.
11183234Ssimon#
12183234Ssimon# It was brought to my attention that on EM64T compiler-generated code
13183234Ssimon# was far behind 32-bit assembler implementation. This is unlike on
14183234Ssimon# Opteron where compiler-generated code was only 15% behind 32-bit
15183234Ssimon# assembler, which originally made it hard to motivate the effort.
16183234Ssimon# There was a suggestion to mechanically translate 32-bit code, but I
17183234Ssimon# dismissed it, reasoning that x86_64 offers enough register bank
18183234Ssimon# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19238405Sjkim# implementation:-) However! While 64-bit code does perform better
20183234Ssimon# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21183234Ssimon# x86_64 does offer larger *addressable* bank, but out-of-order core
22183234Ssimon# reaches for even more registers through dynamic aliasing, and EM64T
23183234Ssimon# core must have managed to run-time optimize even 32-bit code just as
24183234Ssimon# good as 64-bit one. Performance improvement is summarized in the
25183234Ssimon# following table:
26183234Ssimon#
27183234Ssimon#		gcc 3.4		32-bit asm	cycles/byte
28183234Ssimon# Opteron	+45%		+20%		6.8
29183234Ssimon# Xeon P4	+65%		+0%		9.9
30183234Ssimon# Core2		+60%		+10%		7.0
31183234Ssimon
32238405Sjkim# August 2009.
33238405Sjkim#
34238405Sjkim# The code was revised to minimize code size and to maximize
35238405Sjkim# "distance" between instructions producing input to 'lea'
36238405Sjkim# instruction and the 'lea' instruction itself, which is essential
37238405Sjkim# for Intel Atom core.
38183234Ssimon
39238405Sjkim# October 2010.
40238405Sjkim#
41238405Sjkim# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
42238405Sjkim# is to offload message schedule denoted by Wt in NIST specification,
43238405Sjkim# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
44238405Sjkim# for background and implementation details. The only difference from
45238405Sjkim# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
46238405Sjkim# to free temporary registers.
47238405Sjkim
48238405Sjkim# April 2011.
49238405Sjkim#
50238405Sjkim# Add AVX code path. See sha1-586.pl for further information.
51238405Sjkim
52238405Sjkim######################################################################
53238405Sjkim# Current performance is summarized in the following table. Numbers are
54238405Sjkim# CPU clock cycles spent to process single byte (less is better).
55238405Sjkim#
56238405Sjkim#		x86_64		SSSE3		AVX
57238405Sjkim# P4		9.8		-
58238405Sjkim# Opteron	6.6		-
59238405Sjkim# Core2		6.7		6.1/+10%	-
60238405Sjkim# Atom		11.0		9.7/+13%	-
61238405Sjkim# Westmere	7.1		5.6/+27%	-
62238405Sjkim# Sandy Bridge	7.9		6.3/+25%	5.2/+51%
63238405Sjkim
# Command line: [flavour] [output]. If the first argument contains a dot it
# is taken to be the output file name and the flavour is left undefined.
64238405Sjkim$flavour = shift;
65238405Sjkim$output  = shift;
66238405Sjkimif ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
67238405Sjkim
# Win64 is selected when the flavour names nasm/masm/mingw64 or when the
# output file name ends in ".asm".
68238405Sjkim$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
69238405Sjkim
# Locate x86_64-xlate.pl either in the directory of this script or in
# ../../perlasm relative to it; give up if neither file exists.
70183234Ssimon$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
71183234Ssimon( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
72183234Ssimon( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
73183234Ssimondie "can't locate x86_64-xlate.pl";
74183234Ssimon
# $avx enables generation of the AVX code path. It is set only when the
# assembler reports a new enough version: GNU as >= 2.19 (probed through
# $ENV{CC}), or, on Win64, nasm >= 2.09 or ml64 major version >= 10.
75238405Sjkim$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
76238405Sjkim		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
77238405Sjkim	   $1>=2.19);
78238405Sjkim$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
79238405Sjkim	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
80238405Sjkim	   $1>=2.09);
81238405Sjkim$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
82238405Sjkim	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
83238405Sjkim	   $1>=10);
84183234Ssimon
# Send everything printed to STDOUT through the x86_64-xlate.pl translator,
# run with the same perl binary ($^X). Abort if the pipe cannot be opened
# rather than carrying on with STDOUT aliased to an unopened handle.
85246772Sjkimopen OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
86246772Sjkim*STDOUT=*OUT;
87238405Sjkim
# Registers in which the three arguments arrive (see the inline comments).
88183234Ssimon$ctx="%rdi";	# 1st arg
89183234Ssimon$inp="%rsi";	# 2nd arg
90183234Ssimon$num="%rdx";	# 3rd arg
91183234Ssimon
92183234Ssimon# reassign arguments in order to produce more compact code
# (the generated prologue copies %rdi/%rsi/%rdx into these registers)
93183234Ssimon$ctx="%r8";
94183234Ssimon$inp="%r9";
95183234Ssimon$num="%r10";
96183234Ssimon
# Scratch registers used by the BODY_* round generators.
97238405Sjkim$t0="%eax";
98238405Sjkim$t1="%ebx";
99238405Sjkim$t2="%ecx";
# Two registers that alternate between holding the current message word
# ($xi[0]) and the next one being prepared ($xi[1]); each BODY_* sub swaps
# them when it finishes.
100238405Sjkim@xi=("%edx","%ebp");
# The five 32-bit working variables loaded from / stored to the context.
101238405Sjkim$A="%esi";
102238405Sjkim$B="%edi";
103238405Sjkim$C="%r11d";
104238405Sjkim$D="%r12d";
105238405Sjkim$E="%r13d";
106183234Ssimon
# Rotated by one position after every generated round by the caller.
107238405Sjkim@V=($A,$B,$C,$D,$E);
108183234Ssimon
# BODY_00_19($i,$a,$b,$c,$d,$e)
#
# Appends to $code the integer-unit instructions for round $i (0..19).
# $a..$e are the registers holding the five working variables in this round.
#
# The round itself adds to $e: the constant 0x5a827999, the current message
# word in $xi[0], $a rotated left by 5 and (($c ^ $d) & $b) ^ $d; $b is then
# rotated left by 30.
#
# Interleaved with that, the NEXT message word (index $j = $i+1) is prepared
# in $xi[1]:
#   $i <  15: loaded from the input block, byte-swapped and saved in stack
#             slot $j;
#   $i >= 15: xor of stack slots $j, $j+2, $j+8 and $j+13 (all mod 16),
#             rotated left by 1 and written back to slot $j mod 16.
# Round 0 additionally loads, byte-swaps and saves word 0 first.
109183234Ssimonsub BODY_00_19 {
110238405Sjkimmy ($i,$a,$b,$c,$d,$e)=@_;
111183234Ssimonmy $j=$i+1;
112183234Ssimon$code.=<<___ if ($i==0);
113238405Sjkim	mov	`4*$i`($inp),$xi[0]
114238405Sjkim	bswap	$xi[0]
115238405Sjkim	mov	$xi[0],`4*$i`(%rsp)
116183234Ssimon___
117183234Ssimon$code.=<<___ if ($i<15);
118183234Ssimon	mov	$c,$t0
119238405Sjkim	mov	`4*$j`($inp),$xi[1]
120238405Sjkim	mov	$a,$t2
121183234Ssimon	xor	$d,$t0
122238405Sjkim	bswap	$xi[1]
123238405Sjkim	rol	\$5,$t2
124238405Sjkim	lea	0x5a827999($xi[0],$e),$e
125183234Ssimon	and	$b,$t0
126238405Sjkim	mov	$xi[1],`4*$j`(%rsp)
127238405Sjkim	add	$t2,$e
128183234Ssimon	xor	$d,$t0
129183234Ssimon	rol	\$30,$b
130238405Sjkim	add	$t0,$e
131183234Ssimon___
132183234Ssimon$code.=<<___ if ($i>=15);
133238405Sjkim	mov	`4*($j%16)`(%rsp),$xi[1]
134183234Ssimon	mov	$c,$t0
135238405Sjkim	mov	$a,$t2
136238405Sjkim	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
137183234Ssimon	xor	$d,$t0
138238405Sjkim	rol	\$5,$t2
139238405Sjkim	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
140183234Ssimon	and	$b,$t0
141238405Sjkim	lea	0x5a827999($xi[0],$e),$e
142238405Sjkim	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
143183234Ssimon	xor	$d,$t0
144238405Sjkim	rol	\$1,$xi[1]
145238405Sjkim	add	$t2,$e
146183234Ssimon	rol	\$30,$b
147238405Sjkim	mov	$xi[1],`4*($j%16)`(%rsp)
148238405Sjkim	add	$t0,$e
149183234Ssimon___
# Swap the two @xi registers: the word just prepared becomes $xi[0], i.e.
# the current word of the following round.
150238405Sjkimunshift(@xi,pop(@xi));
151183234Ssimon}
152183234Ssimon
# BODY_20_39($i,$a,$b,$c,$d,$e)
#
# Appends to $code the integer-unit instructions for round $i. The caller
# uses it for rounds 20..39 and again for rounds 60..79; the additive
# constant is 0x6ed9eba1 for $i < 40 and 0xca62c1d6 otherwise.
#
# The round adds to $e: the constant, the current message word in $xi[0],
# $a rotated left by 5 and $b ^ $c ^ $d; $b is then rotated left by 30.
# For $i < 79 the next message word (index $j = $i+1) is computed in $xi[1]
# from stack slots $j, $j+2, $j+8 and $j+13 (mod 16), rotated left by 1.
153183234Ssimonsub BODY_20_39 {
154238405Sjkimmy ($i,$a,$b,$c,$d,$e)=@_;
155183234Ssimonmy $j=$i+1;
156183234Ssimonmy $K=($i<40)?0x6ed9eba1:0xca62c1d6;
157183234Ssimon$code.=<<___ if ($i<79);
158238405Sjkim	mov	`4*($j%16)`(%rsp),$xi[1]
159183234Ssimon	mov	$c,$t0
160238405Sjkim	mov	$a,$t2
161238405Sjkim	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
162183234Ssimon	xor	$b,$t0
163238405Sjkim	rol	\$5,$t2
164238405Sjkim	lea	$K($xi[0],$e),$e
165238405Sjkim	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
166183234Ssimon	xor	$d,$t0
167238405Sjkim	add	$t2,$e
168238405Sjkim	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
169183234Ssimon	rol	\$30,$b
170238405Sjkim	add	$t0,$e
171238405Sjkim	rol	\$1,$xi[1]
172183234Ssimon___
# The new word is written back to the stack only while $i < 76. Words 77..79
# are never read from memory again: the last word computed (79) reads the
# slots of words 63, 65, 71 and 76.
173183234Ssimon$code.=<<___ if ($i<76);
174238405Sjkim	mov	$xi[1],`4*($j%16)`(%rsp)
175183234Ssimon___
# Final round: there is no further message word to prepare.
176183234Ssimon$code.=<<___ if ($i==79);
177183234Ssimon	mov	$c,$t0
178238405Sjkim	mov	$a,$t2
179183234Ssimon	xor	$b,$t0
180238405Sjkim	lea	$K($xi[0],$e),$e
181238405Sjkim	rol	\$5,$t2
182183234Ssimon	xor	$d,$t0
183238405Sjkim	add	$t2,$e
184183234Ssimon	rol	\$30,$b
185238405Sjkim	add	$t0,$e
186183234Ssimon___
# Swap the two @xi registers so the prepared word becomes the current one.
187238405Sjkimunshift(@xi,pop(@xi));
188183234Ssimon}
189183234Ssimon
# BODY_40_59($i,$a,$b,$c,$d,$e)
#
# Appends to $code the integer-unit instructions for round $i (the caller
# uses it for rounds 40..59).
#
# The round adds to $e: the constant 0x8f1bbcdc, the current message word in
# $xi[0], $a rotated left by 5, ($c & $d) and (($c ^ $d) & $b); $b is then
# rotated left by 30. The next message word (index $j = $i+1) is computed in
# $xi[1] from stack slots $j, $j+2, $j+8 and $j+13 (mod 16), rotated left by
# 1 and written back to slot $j mod 16.
190183234Ssimonsub BODY_40_59 {
191238405Sjkimmy ($i,$a,$b,$c,$d,$e)=@_;
192183234Ssimonmy $j=$i+1;
193183234Ssimon$code.=<<___;
194238405Sjkim	mov	`4*($j%16)`(%rsp),$xi[1]
195238405Sjkim	mov	$c,$t0
196238405Sjkim	mov	$c,$t1
197238405Sjkim	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
198238405Sjkim	and	$d,$t0
199238405Sjkim	mov	$a,$t2
200238405Sjkim	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
201238405Sjkim	xor	$d,$t1
202238405Sjkim	lea	0x8f1bbcdc($xi[0],$e),$e
203238405Sjkim	rol	\$5,$t2
204238405Sjkim	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
205238405Sjkim	add	$t0,$e
206238405Sjkim	and	$b,$t1
207238405Sjkim	rol	\$1,$xi[1]
208238405Sjkim	add	$t1,$e
209183234Ssimon	rol	\$30,$b
210238405Sjkim	mov	$xi[1],`4*($j%16)`(%rsp)
211238405Sjkim	add	$t2,$e
212183234Ssimon___
# Swap the two @xi registers so the prepared word becomes the current one.
213238405Sjkimunshift(@xi,pop(@xi));
214183234Ssimon}
215183234Ssimon
# Entry point sha1_block_data_order: reads two words of OPENSSL_ia32cap_P
# and falls through to the integer-only code at .Lialu when bit 9 of the
# second word (SSSE3, per the inline comment) is clear.
216238405Sjkim$code.=<<___;
217238405Sjkim.text
218238405Sjkim.extern	OPENSSL_ia32cap_P
219183234Ssimon
220238405Sjkim.globl	sha1_block_data_order
221238405Sjkim.type	sha1_block_data_order,\@function,3
222238405Sjkim.align	16
223238405Sjkimsha1_block_data_order:
224238405Sjkim	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
225238405Sjkim	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
226238405Sjkim	test	\$`1<<9`,%r8d		# check SSSE3 bit
227238405Sjkim	jz	.Lialu
228238405Sjkim___
# Only when the assembler supports AVX: take the AVX path if both the AVX
# bit and the "Intel CPU" bit are set.
229238405Sjkim$code.=<<___ if ($avx);
230238405Sjkim	and	\$`1<<28`,%r8d		# mask AVX bit
231238405Sjkim	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
232238405Sjkim	or	%r9d,%r8d
233238405Sjkim	cmp	\$`1<<28|1<<30`,%r8d
234238405Sjkim	je	_avx_shortcut
235238405Sjkim___
# Otherwise use the SSSE3 path. The integer-only path follows: it saves
# %rbx, %rbp, %r12 and %r13, reserves a 64-byte-aligned 16-word scratch area
# on the stack, keeps the pre-alignment stack pointer at offset 64 of that
# area, and loads the five context words into $A..$E.
236238405Sjkim$code.=<<___;
237238405Sjkim	jmp	_ssse3_shortcut
238238405Sjkim
239238405Sjkim.align	16
240238405Sjkim.Lialu:
241238405Sjkim	push	%rbx
242238405Sjkim	push	%rbp
243238405Sjkim	push	%r12
244238405Sjkim	push	%r13
245238405Sjkim	mov	%rsp,%r11
246238405Sjkim	mov	%rdi,$ctx	# reassigned argument
247238405Sjkim	sub	\$`8+16*4`,%rsp
248238405Sjkim	mov	%rsi,$inp	# reassigned argument
249238405Sjkim	and	\$-64,%rsp
250238405Sjkim	mov	%rdx,$num	# reassigned argument
251238405Sjkim	mov	%r11,`16*4`(%rsp)
252238405Sjkim.Lprologue:
253238405Sjkim
254238405Sjkim	mov	0($ctx),$A
255238405Sjkim	mov	4($ctx),$B
256238405Sjkim	mov	8($ctx),$C
257238405Sjkim	mov	12($ctx),$D
258238405Sjkim	mov	16($ctx),$E
259238405Sjkim	jmp	.Lloop
260238405Sjkim
261238405Sjkim.align	16
262238405Sjkim.Lloop:
263238405Sjkim___
# Fully unroll the 80 rounds. After each round @V is rotated by one so that
# the register that was $e (and now holds the freshly computed value)
# becomes $a of the next round.
264183234Ssimonfor($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
265183234Ssimonfor(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
266183234Ssimonfor(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
267183234Ssimonfor(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Add the previous context to the working variables and store the result,
# advance the input pointer by 64 bytes and loop while the block count is
# non-zero; then restore the saved registers and the stack pointer.
268183234Ssimon$code.=<<___;
269238405Sjkim	add	0($ctx),$A
270238405Sjkim	add	4($ctx),$B
271238405Sjkim	add	8($ctx),$C
272238405Sjkim	add	12($ctx),$D
273238405Sjkim	add	16($ctx),$E
274238405Sjkim	mov	$A,0($ctx)
275238405Sjkim	mov	$B,4($ctx)
276238405Sjkim	mov	$C,8($ctx)
277238405Sjkim	mov	$D,12($ctx)
278238405Sjkim	mov	$E,16($ctx)
279183234Ssimon
280238405Sjkim	sub	\$1,$num
281183234Ssimon	lea	`16*4`($inp),$inp
282183234Ssimon	jnz	.Lloop
283238405Sjkim
284238405Sjkim	mov	`16*4`(%rsp),%rsi
285238405Sjkim	mov	(%rsi),%r13
286238405Sjkim	mov	8(%rsi),%r12
287238405Sjkim	mov	16(%rsi),%rbp
288238405Sjkim	mov	24(%rsi),%rbx
289238405Sjkim	lea	32(%rsi),%rsp
290238405Sjkim.Lepilogue:
291238405Sjkim	ret
292238405Sjkim.size	sha1_block_data_order,.-sha1_block_data_order
293183234Ssimon___
# Scope for the SIMD code paths; the matching close is not within this part
# of the file.
294238405Sjkim{{{
# $Xi counts message-schedule steps and starts at 4.
295238405Sjkimmy $Xi=4;
# @X is the rotating window of eight vector registers holding schedule
# words; @Tx are three vector temporaries.
296238405Sjkimmy @X=map("%xmm$_",(4..7,0..3));
297238405Sjkimmy @Tx=map("%xmm$_",(8..10));
# Note that this also overwrites the file-level $A..$E with the new
# register names.
298238405Sjkimmy @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
# Two scalar temporaries, swapped after every round by the body_* snippets.
299238405Sjkimmy @T=("%esi","%edi");
# Round counter used by the body_* snippets to pick the stack slot.
300238405Sjkimmy $j=0;
301238405Sjkimmy $K_XX_XX="%r11";
302238405Sjkim
# Rotate helpers used by the body_* snippets. &rol and &ror are not defined
# as subs, so the calls are handled by AUTOLOAD and emit plain rol/ror.
303238405Sjkimmy $_rol=sub { &rol(@_) };
304238405Sjkimmy $_ror=sub { &ror(@_) };
305238405Sjkim
# SSSE3 entry: save %rbx, %rbp and %r12 and reserve 64 bytes of stack, plus
# 5*16 bytes on Win64.
306183234Ssimon$code.=<<___;
307238405Sjkim.type	sha1_block_data_order_ssse3,\@function,3
308238405Sjkim.align	16
309238405Sjkimsha1_block_data_order_ssse3:
310238405Sjkim_ssse3_shortcut:
311238405Sjkim	push	%rbx
312238405Sjkim	push	%rbp
313238405Sjkim	push	%r12
314238405Sjkim	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
315238405Sjkim___
# Win64 only: save %xmm6..%xmm10 in the extra stack space. Note that the
# .Lprologue_ssse3 label is emitted only in this case.
316238405Sjkim$code.=<<___ if ($win64);
317238405Sjkim	movaps	%xmm6,64+0(%rsp)
318238405Sjkim	movaps	%xmm7,64+16(%rsp)
319238405Sjkim	movaps	%xmm8,64+32(%rsp)
320238405Sjkim	movaps	%xmm9,64+48(%rsp)
321238405Sjkim	movaps	%xmm10,64+64(%rsp)
322238405Sjkim.Lprologue_ssse3:
323238405Sjkim___
# Turn $num from a block count into an end-of-input pointer
# ($inp + 64*$num), load the context, load and byte-swap the first 64-byte
# block, and store the first three vectors with K_00_19 added on the stack
# for the integer rounds. @T[0] receives a copy of $B.
324238405Sjkim$code.=<<___;
325238405Sjkim	mov	%rdi,$ctx	# reassigned argument
326238405Sjkim	mov	%rsi,$inp	# reassigned argument
327238405Sjkim	mov	%rdx,$num	# reassigned argument
328238405Sjkim
329238405Sjkim	shl	\$6,$num
330238405Sjkim	add	$inp,$num
331238405Sjkim	lea	K_XX_XX(%rip),$K_XX_XX
332238405Sjkim
333238405Sjkim	mov	0($ctx),$A		# load context
334238405Sjkim	mov	4($ctx),$B
335238405Sjkim	mov	8($ctx),$C
336238405Sjkim	mov	12($ctx),$D
337238405Sjkim	mov	$B,@T[0]		# magic seed
338238405Sjkim	mov	16($ctx),$E
339238405Sjkim
340238405Sjkim	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
341238405Sjkim	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
342238405Sjkim	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
343238405Sjkim	movdqu	16($inp),@X[-3&7]
344238405Sjkim	movdqu	32($inp),@X[-2&7]
345238405Sjkim	movdqu	48($inp),@X[-1&7]
346238405Sjkim	pshufb	@X[2],@X[-4&7]		# byte swap
347238405Sjkim	add	\$64,$inp
348238405Sjkim	pshufb	@X[2],@X[-3&7]
349238405Sjkim	pshufb	@X[2],@X[-2&7]
350238405Sjkim	pshufb	@X[2],@X[-1&7]
351238405Sjkim	paddd	@Tx[1],@X[-4&7]		# add K_00_19
352238405Sjkim	paddd	@Tx[1],@X[-3&7]
353238405Sjkim	paddd	@Tx[1],@X[-2&7]
354238405Sjkim	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
355238405Sjkim	psubd	@Tx[1],@X[-4&7]		# restore X[]
356238405Sjkim	movdqa	@X[-3&7],16(%rsp)
357238405Sjkim	psubd	@Tx[1],@X[-3&7]
358238405Sjkim	movdqa	@X[-2&7],32(%rsp)
359238405Sjkim	psubd	@Tx[1],@X[-2&7]
360238405Sjkim	jmp	.Loop_ssse3
361238405Sjkim___
362238405Sjkim
# Catch-all for calls to otherwise undefined subs such as &movdqa(...) or
# &add(...): the sub name, with its package qualifier stripped, is used as
# the instruction mnemonic. Callers pass operands destination-first; the
# operands are written out in reverse order (last argument first). A last
# argument that survives a numeric round trip unchanged is treated as an
# immediate and gets a '$' prefix. The resulting line is appended to $code.
363238405Sjkimsub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
364238405Sjkim{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
365238405Sjkim  my $arg = pop;
366238405Sjkim    $arg = "\$$arg" if ($arg*1 eq $arg);
367238405Sjkim    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
368238405Sjkim}
369238405Sjkim
# Xupdate_ssse3_16_31($body)
#
# Emits one message-schedule step for the SSSE3 path, interleaved with four
# integer rounds. $body is a reference to one of the body_* subs; it is
# called four times and the returned snippets are eval'ed one by one between
# the vector instructions, each snippet emitting one scalar instruction.
#
# The vector instructions build the next four schedule words in @X[0] from
# earlier entries of @X (see the inline comments), store the previously
# prepared X[]+K vector held in @Tx[1] to stack slot 16*(($Xi-1)&3), and
# load the constant vector at index $Xi/5 of the K_XX_XX table into @Tx[2].
# On return $Xi has been incremented and @X and @Tx rotated by one.
370238405Sjkimsub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
371238405Sjkim{ use integer;
372238405Sjkim  my $body = shift;
373238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  # Lexicals assigned by the '($a,$b,$c,$d,$e)=@V' part of the eval'ed
  # snippets.
374238405Sjkim  my ($a,$b,$c,$d,$e);
375238405Sjkim
376238405Sjkim	&movdqa	(@X[0],@X[-3&7]);
377238405Sjkim	 eval(shift(@insns));
378238405Sjkim	 eval(shift(@insns));
379238405Sjkim	&movdqa	(@Tx[0],@X[-1&7]);
380238405Sjkim	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
381238405Sjkim	 eval(shift(@insns));
382238405Sjkim	 eval(shift(@insns));
383238405Sjkim
384238405Sjkim	  &paddd	(@Tx[1],@X[-1&7]);
385238405Sjkim	 eval(shift(@insns));
386238405Sjkim	 eval(shift(@insns));
387238405Sjkim	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
388238405Sjkim	 eval(shift(@insns));
389238405Sjkim	 eval(shift(@insns));
390238405Sjkim	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
391238405Sjkim	 eval(shift(@insns));
392238405Sjkim	 eval(shift(@insns));
393238405Sjkim
394238405Sjkim	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
395238405Sjkim	 eval(shift(@insns));
396238405Sjkim	 eval(shift(@insns));
397238405Sjkim	 eval(shift(@insns));
398238405Sjkim	 eval(shift(@insns));
399238405Sjkim
400238405Sjkim	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
401238405Sjkim	 eval(shift(@insns));
402238405Sjkim	 eval(shift(@insns));
403238405Sjkim	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
404238405Sjkim	 eval(shift(@insns));
405238405Sjkim	 eval(shift(@insns));
406238405Sjkim
407238405Sjkim	&movdqa	(@Tx[2],@X[0]);
408238405Sjkim	&movdqa	(@Tx[0],@X[0]);
409238405Sjkim	 eval(shift(@insns));
410238405Sjkim	 eval(shift(@insns));
411238405Sjkim	 eval(shift(@insns));
412238405Sjkim	 eval(shift(@insns));
413238405Sjkim
414238405Sjkim	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
415238405Sjkim	&paddd	(@X[0],@X[0]);
416238405Sjkim	 eval(shift(@insns));
417238405Sjkim	 eval(shift(@insns));
418238405Sjkim	 eval(shift(@insns));
419238405Sjkim	 eval(shift(@insns));
420238405Sjkim
421238405Sjkim	&psrld	(@Tx[0],31);
422238405Sjkim	 eval(shift(@insns));
423238405Sjkim	 eval(shift(@insns));
424238405Sjkim	&movdqa	(@Tx[1],@Tx[2]);
425238405Sjkim	 eval(shift(@insns));
426238405Sjkim	 eval(shift(@insns));
427238405Sjkim
428238405Sjkim	&psrld	(@Tx[2],30);
429238405Sjkim	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
430238405Sjkim	 eval(shift(@insns));
431238405Sjkim	 eval(shift(@insns));
432238405Sjkim	 eval(shift(@insns));
433238405Sjkim	 eval(shift(@insns));
434238405Sjkim
435238405Sjkim	&pslld	(@Tx[1],2);
436238405Sjkim	&pxor	(@X[0],@Tx[2]);
437238405Sjkim	 eval(shift(@insns));
438238405Sjkim	 eval(shift(@insns));
439238405Sjkim	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
440238405Sjkim	 eval(shift(@insns));
441238405Sjkim	 eval(shift(@insns));
442238405Sjkim
443238405Sjkim	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
444238405Sjkim
445238405Sjkim	 foreach (@insns) { eval; }	# remaining instructions [if any]
446238405Sjkim
447238405Sjkim  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
448238405Sjkim		push(@Tx,shift(@Tx));
449238405Sjkim}
450238405Sjkim
# Xupdate_ssse3_32_79($body)
#
# Emits one message-schedule step for the SSSE3 path, interleaved with four
# integer rounds produced by $body (a reference to one of the body_* subs,
# called four times; each returned snippet emits one scalar instruction when
# eval'ed).
#
# @X[0] is xor-ed with three other schedule vectors (see the inline
# comments) and rotated left by 2. The previously prepared X[]+K vector in
# @Tx[1] is stored to stack slot 16*(($Xi-1)&3). The K constant vector is
# either carried over from @Tx[1] or, when $Xi is a multiple of 5, loaded
# from index $Xi/5 of the K_XX_XX table. On return $Xi has been incremented
# and @X and @Tx rotated by one.
451238405Sjkimsub Xupdate_ssse3_32_79()
452238405Sjkim{ use integer;
453238405Sjkim  my $body = shift;
454238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  # Lexicals assigned by the '($a,$b,$c,$d,$e)=@V' part of the eval'ed
  # snippets.
455238405Sjkim  my ($a,$b,$c,$d,$e);
456238405Sjkim
457238405Sjkim	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
458238405Sjkim	 eval(shift(@insns));		# body_20_39
459238405Sjkim	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
460238405Sjkim	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
461238405Sjkim	 eval(shift(@insns));
462238405Sjkim	 eval(shift(@insns));
463238405Sjkim	 eval(shift(@insns));		# rol
464238405Sjkim
465238405Sjkim	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
466238405Sjkim	 eval(shift(@insns));
	# Consume one more snippet here only if it is not a rotate.
467238405Sjkim	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
468238405Sjkim	if ($Xi%5) {
469238405Sjkim	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
470238405Sjkim	} else {			# ... or load next one
471238405Sjkim	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
472238405Sjkim	}
473238405Sjkim	  &paddd	(@Tx[1],@X[-1&7]);
474238405Sjkim	 eval(shift(@insns));		# ror
475238405Sjkim	 eval(shift(@insns));
476238405Sjkim
477238405Sjkim	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
478238405Sjkim	 eval(shift(@insns));		# body_20_39
479238405Sjkim	 eval(shift(@insns));
480238405Sjkim	 eval(shift(@insns));
481238405Sjkim	 eval(shift(@insns));		# rol
482238405Sjkim
483238405Sjkim	&movdqa	(@Tx[0],@X[0]);
484238405Sjkim	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
485238405Sjkim	 eval(shift(@insns));
486238405Sjkim	 eval(shift(@insns));
487238405Sjkim	 eval(shift(@insns));		# ror
488238405Sjkim	 eval(shift(@insns));
489238405Sjkim
490238405Sjkim	&pslld	(@X[0],2);
491238405Sjkim	 eval(shift(@insns));		# body_20_39
492238405Sjkim	 eval(shift(@insns));
493238405Sjkim	&psrld	(@Tx[0],30);
494238405Sjkim	 eval(shift(@insns));
495238405Sjkim	 eval(shift(@insns));		# rol
496238405Sjkim	 eval(shift(@insns));
497238405Sjkim	 eval(shift(@insns));
498238405Sjkim	 eval(shift(@insns));		# ror
499238405Sjkim	 eval(shift(@insns));
500238405Sjkim
501238405Sjkim	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
502238405Sjkim	 eval(shift(@insns));		# body_20_39
503238405Sjkim	 eval(shift(@insns));
504238405Sjkim	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
505238405Sjkim	 eval(shift(@insns));
506238405Sjkim	 eval(shift(@insns));		# rol
507238405Sjkim	 eval(shift(@insns));
508238405Sjkim	 eval(shift(@insns));
509238405Sjkim	 eval(shift(@insns));		# rol
510238405Sjkim	 eval(shift(@insns));
511238405Sjkim
512238405Sjkim	 foreach (@insns) { eval; }	# remaining instructions
513238405Sjkim
514238405Sjkim  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
515238405Sjkim		push(@Tx,shift(@Tx));
516238405Sjkim}
517238405Sjkim
# Xuplast_ssse3_80($body)
#
# Emits four integer rounds produced by $body together with the last X[]+K
# transfer to the stack. It then emits a compare of $inp with $num (set in
# the prologue to the end of the input) and a jump to .Ldone_ssse3 when they
# are equal. On the fall-through path it starts on the next block: it
# reloads the byte-swap mask and K_00_19, loads the next 64 input bytes,
# byte-swaps the first vector and advances $inp by 64. @Tx is rotated back
# by one and $Xi is reset to 0 for the Xloop_ssse3 calls that follow.
518238405Sjkimsub Xuplast_ssse3_80()
519238405Sjkim{ use integer;
520238405Sjkim  my $body = shift;
521238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  # Lexicals assigned by the '($a,$b,$c,$d,$e)=@V' part of the eval'ed
  # snippets.
522238405Sjkim  my ($a,$b,$c,$d,$e);
523238405Sjkim
524238405Sjkim	 eval(shift(@insns));
525238405Sjkim	  &paddd	(@Tx[1],@X[-1&7]);
526238405Sjkim	 eval(shift(@insns));
527238405Sjkim	 eval(shift(@insns));
528238405Sjkim	 eval(shift(@insns));
529238405Sjkim	 eval(shift(@insns));
530238405Sjkim
531238405Sjkim	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
532238405Sjkim
533238405Sjkim	 foreach (@insns) { eval; }		# remaining instructions
534238405Sjkim
535238405Sjkim	&cmp	($inp,$num);
536238405Sjkim	&je	(".Ldone_ssse3");
537238405Sjkim
538238405Sjkim	unshift(@Tx,pop(@Tx));
539238405Sjkim
540238405Sjkim	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
541238405Sjkim	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
542238405Sjkim	&movdqu	(@X[-4&7],"0($inp)");		# load input
543238405Sjkim	&movdqu	(@X[-3&7],"16($inp)");
544238405Sjkim	&movdqu	(@X[-2&7],"32($inp)");
545238405Sjkim	&movdqu	(@X[-1&7],"48($inp)");
546238405Sjkim	&pshufb	(@X[-4&7],@X[2]);		# byte swap
547238405Sjkim	&add	($inp,64);
548238405Sjkim
549238405Sjkim  $Xi=0;
550238405Sjkim}
551238405Sjkim
# Xloop_ssse3($body)
#
# Emits four integer rounds produced by $body, interleaved with preparation
# of the next block's data: byte-swap vector ($Xi-3)&7 of @X, add the K
# vector in @Tx[1] to vector ($Xi-4)&7, store that sum to stack slot 16*$Xi,
# then subtract K again to restore the vector. Increments $Xi.
552238405Sjkimsub Xloop_ssse3()
553238405Sjkim{ use integer;
554238405Sjkim  my $body = shift;
555238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  # Lexicals assigned by the '($a,$b,$c,$d,$e)=@V' part of the eval'ed
  # snippets.
556238405Sjkim  my ($a,$b,$c,$d,$e);
557238405Sjkim
558238405Sjkim	 eval(shift(@insns));
559238405Sjkim	 eval(shift(@insns));
560238405Sjkim	&pshufb	(@X[($Xi-3)&7],@X[2]);
561238405Sjkim	 eval(shift(@insns));
562238405Sjkim	 eval(shift(@insns));
563238405Sjkim	&paddd	(@X[($Xi-4)&7],@Tx[1]);
564238405Sjkim	 eval(shift(@insns));
565238405Sjkim	 eval(shift(@insns));
566238405Sjkim	 eval(shift(@insns));
567238405Sjkim	 eval(shift(@insns));
568238405Sjkim	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
569238405Sjkim	 eval(shift(@insns));
570238405Sjkim	 eval(shift(@insns));
571238405Sjkim	&psubd	(@X[($Xi-4)&7],@Tx[1]);
572238405Sjkim
573238405Sjkim	foreach (@insns) { eval; }
574238405Sjkim  $Xi++;
575238405Sjkim}
576238405Sjkim
# Xtail_ssse3($body)
#
# Emits four integer rounds produced by $body with no vector instructions
# in between. The caller uses it after the .Ldone_ssse3 label.
577238405Sjkimsub Xtail_ssse3()
578238405Sjkim{ use integer;
579238405Sjkim  my $body = shift;
580238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  # Lexicals assigned by the '($a,$b,$c,$d,$e)=@V' part of the eval'ed
  # snippets.
581238405Sjkim  my ($a,$b,$c,$d,$e);
582238405Sjkim
583238405Sjkim	foreach (@insns) { eval; }
584238405Sjkim}
585238405Sjkim
# Returns the ten Perl snippets making up one integer round of the first
# group for the SIMD code paths. Each snippet, when eval'ed by an Xupdate_*
# sub, emits one instruction. The first snippet also loads ($a..$e) from @V;
# the last one increments $j and rotates @V and @T.
#
# @T[0] enters the round holding a copy of $b and is turned into
# ($b & ($c ^ $d)) ^ $d. @T[1] receives a copy of $a before $a is rotated
# left by 5 in place; after @T is rotated that copy is the next round's
# @T[0]. Because the register that is $b here was rotated left by 5 while it
# was $a in the previous round, it is rotated right by 7 (net: right by 2);
# in the very first round ($j == 0) it has not been rotated yet, so the
# count is 2.
586238405Sjkimsub body_00_19 () {
587238405Sjkim	(
588238405Sjkim	'($a,$b,$c,$d,$e)=@V;'.
589238405Sjkim	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
590238405Sjkim	'&xor	($c,$d);',
591238405Sjkim	'&mov	(@T[1],$a);',	# $b in next round
592238405Sjkim	'&$_rol	($a,5);',
593238405Sjkim	'&and	(@T[0],$c);',	# ($b&($c^$d))
594238405Sjkim	'&xor	($c,$d);',	# restore $c
595238405Sjkim	'&xor	(@T[0],$d);',
596238405Sjkim	'&add	($e,$a);',
597238405Sjkim	'&$_ror	($b,$j?7:2);',	# $b>>>2
598238405Sjkim	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
599238405Sjkim	);
600238405Sjkim}
601238405Sjkim
# Returns the eight Perl snippets making up one integer round that combines
# $b, $c and $d by xor, for the SIMD code paths. Each snippet, when eval'ed
# by an Xupdate_* sub, emits one instruction. The first snippet also loads
# ($a..$e) from @V and post-increments $j; the last one rotates @V and @T.
#
# @T[0] enters the round holding a copy of $b and is turned into
# $b ^ $d ^ $c. @T[1] receives a copy of $a before $a is rotated left by 5
# in place. The rotate right by 7 of $b undoes the rotate left by 5 applied
# while that register was $a in the previous round and applies the rotate
# right by 2.
602238405Sjkimsub body_20_39 () {
603238405Sjkim	(
604238405Sjkim	'($a,$b,$c,$d,$e)=@V;'.
605238405Sjkim	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
606238405Sjkim	'&xor	(@T[0],$d);',	# ($b^$d)
607238405Sjkim	'&mov	(@T[1],$a);',	# $b in next round
608238405Sjkim	'&$_rol	($a,5);',
609238405Sjkim	'&xor	(@T[0],$c);',	# ($b^$d^$c)
610238405Sjkim	'&add	($e,$a);',
611238405Sjkim	'&$_ror	($b,7);',	# $b>>>2
612238405Sjkim	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
613238405Sjkim	);
614238405Sjkim}
615238405Sjkim
# Returns the twelve Perl snippets making up one integer round that adds
# both ($c & $d) and ($b & ($c ^ $d)) to $e, for the SIMD code paths. Each
# snippet, when eval'ed by an Xupdate_* sub, emits one instruction. The
# first snippet also loads ($a..$e) from @V; the last one rotates @V and @T.
#
# @T[1] is first used for $c & $d and afterwards receives the copy of $a
# taken before $a is rotated left by 5 in place. @T[0] enters holding a copy
# of $b and becomes $b & ($c ^ $d). $c is temporarily xor-ed with $d and
# restored by the second xor.
616238405Sjkimsub body_40_59 () {
617238405Sjkim	(
618238405Sjkim	'($a,$b,$c,$d,$e)=@V;'.
619238405Sjkim	'&mov	(@T[1],$c);',
620238405Sjkim	'&xor	($c,$d);',
621238405Sjkim	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
622238405Sjkim	'&and	(@T[1],$d);',
623238405Sjkim	'&and	(@T[0],$c);',	# ($b&($c^$d))
624238405Sjkim	'&$_ror	($b,7);',	# $b>>>2
625238405Sjkim	'&add	($e,@T[1]);',
626238405Sjkim	'&mov	(@T[1],$a);',	# $b in next round
627238405Sjkim	'&$_rol	($a,5);',
628238405Sjkim	'&add	($e,@T[0]);',
629238405Sjkim	'&xor	($c,$d);',	# restore $c
630238405Sjkim	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
631238405Sjkim	);
632238405Sjkim}
# Main SSSE3 loop: 17 schedule steps of four rounds each (rounds 0..67).
633238405Sjkim$code.=<<___;
634238405Sjkim.align	16
635238405Sjkim.Loop_ssse3:
636238405Sjkim___
637238405Sjkim	&Xupdate_ssse3_16_31(\&body_00_19);
638238405Sjkim	&Xupdate_ssse3_16_31(\&body_00_19);
639238405Sjkim	&Xupdate_ssse3_16_31(\&body_00_19);
640238405Sjkim	&Xupdate_ssse3_16_31(\&body_00_19);
641238405Sjkim	&Xupdate_ssse3_32_79(\&body_00_19);
642238405Sjkim	&Xupdate_ssse3_32_79(\&body_20_39);
643238405Sjkim	&Xupdate_ssse3_32_79(\&body_20_39);
644238405Sjkim	&Xupdate_ssse3_32_79(\&body_20_39);
645238405Sjkim	&Xupdate_ssse3_32_79(\&body_20_39);
646238405Sjkim	&Xupdate_ssse3_32_79(\&body_20_39);
647238405Sjkim	&Xupdate_ssse3_32_79(\&body_40_59);
648238405Sjkim	&Xupdate_ssse3_32_79(\&body_40_59);
649238405Sjkim	&Xupdate_ssse3_32_79(\&body_40_59);
650238405Sjkim	&Xupdate_ssse3_32_79(\&body_40_59);
651238405Sjkim	&Xupdate_ssse3_32_79(\&body_40_59);
652238405Sjkim	&Xupdate_ssse3_32_79(\&body_20_39);
653238405Sjkim	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
654238405Sjkim
# Remember the generator state at the point where the emitted code may
# branch to .Ldone_ssse3, so that the tail generated further down starts
# from the same round counter and register rotation.
655238405Sjkim				$saved_j=$j; @saved_V=@V;
656238405Sjkim
# Last 12 rounds on the "more input" path, overlapped with preparing the
# next block.
657238405Sjkim	&Xloop_ssse3(\&body_20_39);
658238405Sjkim	&Xloop_ssse3(\&body_20_39);
659238405Sjkim	&Xloop_ssse3(\&body_20_39);
660238405Sjkim
# Fold the working variables into the context and loop. @T[0] is used in
# place of $B for the update and is then copied into $B.
661238405Sjkim$code.=<<___;
662238405Sjkim	add	0($ctx),$A			# update context
663238405Sjkim	add	4($ctx),@T[0]
664238405Sjkim	add	8($ctx),$C
665238405Sjkim	add	12($ctx),$D
666238405Sjkim	mov	$A,0($ctx)
667238405Sjkim	add	16($ctx),$E
668238405Sjkim	mov	@T[0],4($ctx)
669238405Sjkim	mov	@T[0],$B			# magic seed
670238405Sjkim	mov	$C,8($ctx)
671238405Sjkim	mov	$D,12($ctx)
672238405Sjkim	mov	$E,16($ctx)
673238405Sjkim	jmp	.Loop_ssse3
674238405Sjkim
675238405Sjkim.align	16
676238405Sjkim.Ldone_ssse3:
677238405Sjkim___
# Restore the generator state saved above before emitting the tail.
678238405Sjkim				$j=$saved_j; @V=@saved_V;
679238405Sjkim
# Last 12 rounds on the "no more input" path: scalar instructions only.
680238405Sjkim	&Xtail_ssse3(\&body_20_39);
681238405Sjkim	&Xtail_ssse3(\&body_20_39);
682238405Sjkim	&Xtail_ssse3(\&body_20_39);
683238405Sjkim
# Final context update.
684238405Sjkim$code.=<<___;
685238405Sjkim	add	0($ctx),$A			# update context
686238405Sjkim	add	4($ctx),@T[0]
687238405Sjkim	add	8($ctx),$C
688238405Sjkim	mov	$A,0($ctx)
689238405Sjkim	add	12($ctx),$D
690238405Sjkim	mov	@T[0],4($ctx)
691238405Sjkim	add	16($ctx),$E
692238405Sjkim	mov	$C,8($ctx)
693238405Sjkim	mov	$D,12($ctx)
694238405Sjkim	mov	$E,16($ctx)
695238405Sjkim___
# Win64 only: restore %xmm6..%xmm10 saved by the prologue.
696238405Sjkim$code.=<<___ if ($win64);
697238405Sjkim	movaps	64+0(%rsp),%xmm6
698238405Sjkim	movaps	64+16(%rsp),%xmm7
699238405Sjkim	movaps	64+32(%rsp),%xmm8
700238405Sjkim	movaps	64+48(%rsp),%xmm9
701238405Sjkim	movaps	64+64(%rsp),%xmm10
702238405Sjkim___
# Release the stack frame and restore %r12, %rbp and %rbx in the reverse of
# the order in which the prologue pushed them.
703238405Sjkim$code.=<<___;
704238405Sjkim	lea	`64+($win64?5*16:0)`(%rsp),%rsi
705238405Sjkim	mov	0(%rsi),%r12
706238405Sjkim	mov	8(%rsi),%rbp
707238405Sjkim	mov	16(%rsi),%rbx
708238405Sjkim	lea	24(%rsi),%rsp
709238405Sjkim.Lepilogue_ssse3:
710238405Sjkim	ret
711238405Sjkim.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
712238405Sjkim___
713238405Sjkim
# AVX code path, generated only when the assembler probe set $avx. The
# lexicals below shadow the SSSE3 ones of the same name and start from the
# same initial values.
714238405Sjkimif ($avx) {
715238405Sjkimmy $Xi=4;
716238405Sjkimmy @X=map("%xmm$_",(4..7,0..3));
717238405Sjkimmy @Tx=map("%xmm$_",(8..10));
718238405Sjkimmy @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
719238405Sjkimmy @T=("%esi","%edi");
720238405Sjkimmy $j=0;
721238405Sjkimmy $K_XX_XX="%r11";
722238405Sjkim
# In this path the rotate helpers emit shld/shrd with the same register
# passed as both register operands, instead of rol/ror.
723238405Sjkimmy $_rol=sub { &shld(@_[0],@_) };
724238405Sjkimmy $_ror=sub { &shrd(@_[0],@_) };
725238405Sjkim
# AVX entry: save %rbx, %rbp and %r12 and reserve 64 bytes of stack, plus
# 5*16 bytes on Win64.
726238405Sjkim$code.=<<___;
727238405Sjkim.type	sha1_block_data_order_avx,\@function,3
728238405Sjkim.align	16
729238405Sjkimsha1_block_data_order_avx:
730238405Sjkim_avx_shortcut:
731238405Sjkim	push	%rbx
732238405Sjkim	push	%rbp
733238405Sjkim	push	%r12
734238405Sjkim	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
735238405Sjkim___
# Win64 only: save %xmm6..%xmm10 in the extra stack space. Note that the
# .Lprologue_avx label is emitted only in this case.
736238405Sjkim$code.=<<___ if ($win64);
737238405Sjkim	movaps	%xmm6,64+0(%rsp)
738238405Sjkim	movaps	%xmm7,64+16(%rsp)
739238405Sjkim	movaps	%xmm8,64+32(%rsp)
740238405Sjkim	movaps	%xmm9,64+48(%rsp)
741238405Sjkim	movaps	%xmm10,64+64(%rsp)
742238405Sjkim.Lprologue_avx:
743238405Sjkim___
# Turn $num into an end-of-input pointer ($inp + 64*$num), load the context,
# load and byte-swap the first 64-byte block, and store the first three
# vectors with K_00_19 added on the stack. Unlike the SSSE3 prologue, the
# three-operand vpaddd writes the sums to separate registers, so no
# subtraction is needed to restore the inputs.
744238405Sjkim$code.=<<___;
745238405Sjkim	mov	%rdi,$ctx	# reassigned argument
746238405Sjkim	mov	%rsi,$inp	# reassigned argument
747238405Sjkim	mov	%rdx,$num	# reassigned argument
748279264Sdelphij	vzeroupper
749238405Sjkim
750238405Sjkim	shl	\$6,$num
751238405Sjkim	add	$inp,$num
752238405Sjkim	lea	K_XX_XX(%rip),$K_XX_XX
753238405Sjkim
754238405Sjkim	mov	0($ctx),$A		# load context
755238405Sjkim	mov	4($ctx),$B
756238405Sjkim	mov	8($ctx),$C
757238405Sjkim	mov	12($ctx),$D
758238405Sjkim	mov	$B,@T[0]		# magic seed
759238405Sjkim	mov	16($ctx),$E
760238405Sjkim
761238405Sjkim	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
762238405Sjkim	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
763238405Sjkim	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
764238405Sjkim	vmovdqu	16($inp),@X[-3&7]
765238405Sjkim	vmovdqu	32($inp),@X[-2&7]
766238405Sjkim	vmovdqu	48($inp),@X[-1&7]
767238405Sjkim	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
768238405Sjkim	add	\$64,$inp
769238405Sjkim	vpshufb	@X[2],@X[-3&7],@X[-3&7]
770238405Sjkim	vpshufb	@X[2],@X[-2&7],@X[-2&7]
771238405Sjkim	vpshufb	@X[2],@X[-1&7],@X[-1&7]
772238405Sjkim	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
773238405Sjkim	vpaddd	@Tx[1],@X[-3&7],@X[1]
774238405Sjkim	vpaddd	@Tx[1],@X[-2&7],@X[2]
775238405Sjkim	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
776238405Sjkim	vmovdqa	@X[1],16(%rsp)
777238405Sjkim	vmovdqa	@X[2],32(%rsp)
778238405Sjkim	jmp	.Loop_avx
779238405Sjkim___
780238405Sjkim
# Xupdate_avx_16_31($body)
#
# AVX counterpart of Xupdate_ssse3_16_31: emits one message-schedule step
# interleaved with four integer rounds. $body is a reference to one of the
# body_* subs; it is called four times and the returned snippets are eval'ed
# one by one between the vector instructions.
#
# The three-operand AVX forms take the destination as a separate operand,
# so the register-to-register copies of the SSSE3 version do not appear
# here. The step stores the previously prepared X[]+K vector in @Tx[1] to
# stack slot 16*(($Xi-1)&3) and loads the constant vector at index $Xi/5 of
# the K_XX_XX table into @Tx[2]. On return $Xi has been incremented and @X
# and @Tx rotated by one.
781238405Sjkimsub Xupdate_avx_16_31()		# recall that $Xi starts with 4
782238405Sjkim{ use integer;
783238405Sjkim  my $body = shift;
784238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  # Lexicals assigned by the '($a,$b,$c,$d,$e)=@V' part of the eval'ed
  # snippets.
785238405Sjkim  my ($a,$b,$c,$d,$e);
786238405Sjkim
787238405Sjkim	 eval(shift(@insns));
788238405Sjkim	 eval(shift(@insns));
789238405Sjkim	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
790238405Sjkim	 eval(shift(@insns));
791238405Sjkim	 eval(shift(@insns));
792238405Sjkim
793238405Sjkim	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
794238405Sjkim	 eval(shift(@insns));
795238405Sjkim	 eval(shift(@insns));
796238405Sjkim	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
797238405Sjkim	 eval(shift(@insns));
798238405Sjkim	 eval(shift(@insns));
799238405Sjkim	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
800238405Sjkim	 eval(shift(@insns));
801238405Sjkim	 eval(shift(@insns));
802238405Sjkim
803238405Sjkim	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
804238405Sjkim	 eval(shift(@insns));
805238405Sjkim	 eval(shift(@insns));
806238405Sjkim	 eval(shift(@insns));
807238405Sjkim	 eval(shift(@insns));
808238405Sjkim
809238405Sjkim	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
810238405Sjkim	 eval(shift(@insns));
811238405Sjkim	 eval(shift(@insns));
812238405Sjkim	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
813238405Sjkim	 eval(shift(@insns));
814238405Sjkim	 eval(shift(@insns));
815238405Sjkim
816238405Sjkim	&vpsrld	(@Tx[0],@X[0],31);
817238405Sjkim	 eval(shift(@insns));
818238405Sjkim	 eval(shift(@insns));
819238405Sjkim	 eval(shift(@insns));
820238405Sjkim	 eval(shift(@insns));
821238405Sjkim
822238405Sjkim	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
823238405Sjkim	&vpaddd	(@X[0],@X[0],@X[0]);
824238405Sjkim	 eval(shift(@insns));
825238405Sjkim	 eval(shift(@insns));
826238405Sjkim	 eval(shift(@insns));
827238405Sjkim	 eval(shift(@insns));
828238405Sjkim
829238405Sjkim	&vpsrld	(@Tx[1],@Tx[2],30);
830238405Sjkim	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
831238405Sjkim	 eval(shift(@insns));
832238405Sjkim	 eval(shift(@insns));
833238405Sjkim	 eval(shift(@insns));
834238405Sjkim	 eval(shift(@insns));
835238405Sjkim
836238405Sjkim	&vpslld	(@Tx[2],@Tx[2],2);
837238405Sjkim	&vpxor	(@X[0],@X[0],@Tx[1]);
838238405Sjkim	 eval(shift(@insns));
839238405Sjkim	 eval(shift(@insns));
840238405Sjkim	 eval(shift(@insns));
841238405Sjkim	 eval(shift(@insns));
842238405Sjkim
843238405Sjkim	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
844238405Sjkim	 eval(shift(@insns));
845238405Sjkim	 eval(shift(@insns));
846238405Sjkim	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
847238405Sjkim	 eval(shift(@insns));
848238405Sjkim	 eval(shift(@insns));
849238405Sjkim
850238405Sjkim
851238405Sjkim	 foreach (@insns) { eval; }	# remaining instructions [if any]
852238405Sjkim
853238405Sjkim  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
854238405Sjkim		push(@Tx,shift(@Tx));
855238405Sjkim}
856238405Sjkim
# Xupdate_avx_32_79(\&body)
#
# Emits one AVX message-schedule step with the scalar round code woven in
# between the vector instructions.  $body is a code reference that is called
# four times; the strings it returns are eval'ed one at a time at the marked
# positions, and whatever is left over is flushed at the end.
#
# Per the inline annotations the vector side computes
#   X[0] = (X[-32] ^ X[-16] ^ X[-28] ^ X[-6]) <<< 2
# copies or reloads the K_XX_XX vector (a reload from $K_XX_XX happens when
# $Xi is a multiple of 5), adds X[-1] into @Tx[1] and stores that "X[]+K"
# value to the 16-byte stack slot selected by ($Xi-1)&3.  On exit $Xi is
# incremented and both @X and @Tx are rotated by one position.
#
# NOTE(review): $a..$e are declared but never referenced directly in this
# sub; presumably the eval'ed body strings assign to them -- confirm against
# the body_* generators before touching the declaration.
857238405Sjkimsub Xupdate_avx_32_79()
858238405Sjkim{ use integer;
859238405Sjkim  my $body = shift;
860238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
861238405Sjkim  my ($a,$b,$c,$d,$e);
862238405Sjkim
863238405Sjkim	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
864238405Sjkim	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
865238405Sjkim	 eval(shift(@insns));		# body_20_39
866238405Sjkim	 eval(shift(@insns));
867238405Sjkim	 eval(shift(@insns));
868238405Sjkim	 eval(shift(@insns));		# rol
869238405Sjkim
870238405Sjkim	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
871238405Sjkim	 eval(shift(@insns));
	# Hold the next scalar instruction back if it is a rotate (&ror/&rol);
	# it is then consumed by the eval that follows the vpaddd below.
872238405Sjkim	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
873238405Sjkim	if ($Xi%5) {
874238405Sjkim	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
875238405Sjkim	} else {			# ... or load next one
876238405Sjkim	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
877238405Sjkim	}
878238405Sjkim	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
879238405Sjkim	 eval(shift(@insns));		# ror
880238405Sjkim	 eval(shift(@insns));
881238405Sjkim
882238405Sjkim	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
883238405Sjkim	 eval(shift(@insns));		# body_20_39
884238405Sjkim	 eval(shift(@insns));
885238405Sjkim	 eval(shift(@insns));
886238405Sjkim	 eval(shift(@insns));		# rol
887238405Sjkim
	# The rotate-left by 2 is built from a right shift by 30, a left shift
	# by 2 and a final OR.
888238405Sjkim	&vpsrld	(@Tx[0],@X[0],30);
889238405Sjkim	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
890238405Sjkim	 eval(shift(@insns));
891238405Sjkim	 eval(shift(@insns));
892238405Sjkim	 eval(shift(@insns));		# ror
893238405Sjkim	 eval(shift(@insns));
894238405Sjkim
895238405Sjkim	&vpslld	(@X[0],@X[0],2);
896238405Sjkim	 eval(shift(@insns));		# body_20_39
897238405Sjkim	 eval(shift(@insns));
898238405Sjkim	 eval(shift(@insns));
899238405Sjkim	 eval(shift(@insns));		# rol
900238405Sjkim	 eval(shift(@insns));
901238405Sjkim	 eval(shift(@insns));
902238405Sjkim	 eval(shift(@insns));		# ror
903238405Sjkim	 eval(shift(@insns));
904238405Sjkim
905238405Sjkim	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
906238405Sjkim	 eval(shift(@insns));		# body_20_39
907238405Sjkim	 eval(shift(@insns));
	# The copy of the fresh X[0] into @Tx[1] is emitted only while $Xi<19.
908238405Sjkim	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
909238405Sjkim	 eval(shift(@insns));
910238405Sjkim	 eval(shift(@insns));		# rol
911238405Sjkim	 eval(shift(@insns));
912238405Sjkim	 eval(shift(@insns));
913238405Sjkim	 eval(shift(@insns));		# rol
914238405Sjkim	 eval(shift(@insns));
915238405Sjkim
916238405Sjkim	 foreach (@insns) { eval; }	# remaining instructions
917238405Sjkim
918238405Sjkim  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
919238405Sjkim		push(@Tx,shift(@Tx));
920238405Sjkim}
921238405Sjkim
# Xuplast_avx_80(\&body)
#
# Emits the last schedule-related step of a block: the final X[-1]+K sum is
# formed in @Tx[1] and stored to the stack slot selected by ($Xi-1)&3, with
# the scalar round strings returned by four calls of $body eval'ed around it.
#
# It then emits the end-of-input test (cmp $inp,$num / je .Ldone_avx).  For
# the fall-through path it undoes one rotation of @Tx, reloads the byte-swap
# mask (64($K_XX_XX)) and K_00_19 (0($K_XX_XX)), loads the next 64 input
# bytes into X[-4..-1], byte-swaps the first of them, advances $inp by 64
# and resets $Xi to 0.
#
# The stack store uses the VEX-encoded vmovdqa like every other store in the
# AVX code path; the previous legacy-SSE movdqa wrote the same 128 bits but
# mixed a non-VEX instruction into otherwise pure AVX code.
#
# NOTE(review): $a..$e are declared but never referenced directly in this
# sub; presumably the eval'ed body strings assign to them -- confirm against
# the body_* generators before touching the declaration.
922238405Sjkimsub Xuplast_avx_80()
923238405Sjkim{ use integer;
924238405Sjkim  my $body = shift;
925238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
926238405Sjkim  my ($a,$b,$c,$d,$e);
927238405Sjkim
928238405Sjkim	 eval(shift(@insns));
929238405Sjkim	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
930238405Sjkim	 eval(shift(@insns));
931238405Sjkim	 eval(shift(@insns));
932238405Sjkim	 eval(shift(@insns));
933238405Sjkim	 eval(shift(@insns));
934238405Sjkim
935238405Sjkim	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
936238405Sjkim
937238405Sjkim	 foreach (@insns) { eval; }		# remaining instructions
938238405Sjkim
939238405Sjkim	&cmp	($inp,$num);
940238405Sjkim	&je	(".Ldone_avx");
941238405Sjkim
	# Undo one rotation of @Tx before the reloads below.
942238405Sjkim	unshift(@Tx,pop(@Tx));
943238405Sjkim
944238405Sjkim	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
945238405Sjkim	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
946238405Sjkim	&vmovdqu(@X[-4&7],"0($inp)");		# load input
947238405Sjkim	&vmovdqu(@X[-3&7],"16($inp)");
948238405Sjkim	&vmovdqu(@X[-2&7],"32($inp)");
949238405Sjkim	&vmovdqu(@X[-1&7],"48($inp)");
950238405Sjkim	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
951238405Sjkim	&add	($inp,64);
952238405Sjkim
953238405Sjkim  $Xi=0;
954238405Sjkim}
955238405Sjkim
# Xloop_avx(\&body)
#
# Emits the scalar round strings from four calls of $body, interleaved with
# the preparation of the next input block: the vector at X[($Xi-3)&7] is
# byte-swapped with the mask held in @X[2], @Tx[1] is added to the vector at
# X[($Xi-4)&7] with the result placed in X[$Xi&7], and that sum is stored to
# the stack at offset 16*$Xi.  $Xi is incremented on exit; unlike the
# Xupdate_* subs, @X and @Tx are not rotated here.
#
# NOTE(review): $a..$e are declared but never referenced directly in this
# sub; presumably the eval'ed body strings assign to them -- confirm.
956238405Sjkimsub Xloop_avx()
957238405Sjkim{ use integer;
958238405Sjkim  my $body = shift;
959238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
960238405Sjkim  my ($a,$b,$c,$d,$e);
961238405Sjkim
962238405Sjkim	 eval(shift(@insns));
963238405Sjkim	 eval(shift(@insns));
964238405Sjkim	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
965238405Sjkim	 eval(shift(@insns));
966238405Sjkim	 eval(shift(@insns));
967238405Sjkim	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
968238405Sjkim	 eval(shift(@insns));
969238405Sjkim	 eval(shift(@insns));
970238405Sjkim	 eval(shift(@insns));
971238405Sjkim	 eval(shift(@insns));
972238405Sjkim	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
973238405Sjkim	 eval(shift(@insns));
974238405Sjkim	 eval(shift(@insns));
975238405Sjkim
	# Flush whatever scalar instructions are still queued.
976238405Sjkim	foreach (@insns) { eval; }
977238405Sjkim  $Xi++;
978238405Sjkim}
979238405Sjkim
# Xtail_avx(\&body)
#
# Emits only the scalar round strings returned by four calls of $body, with
# no vector instructions interleaved.  It is used for the .Ldone_avx path,
# where no further input block is prepared.
#
# NOTE(review): $a..$e are declared but never referenced directly in this
# sub; presumably the eval'ed body strings assign to them -- confirm.
980238405Sjkimsub Xtail_avx()
981238405Sjkim{ use integer;
982238405Sjkim  my $body = shift;
983238405Sjkim  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
984238405Sjkim  my ($a,$b,$c,$d,$e);
985238405Sjkim
986238405Sjkim	foreach (@insns) { eval; }
987238405Sjkim}
988238405Sjkim
# Emit the main AVX loop.  Each generator call below invokes its body
# generator four times, so the 17 Xupdate/Xuplast calls plus the 3
# Xloop (or Xtail) calls add up to 20 calls in total.
989238405Sjkim$code.=<<___;
990238405Sjkim.align	16
991238405Sjkim.Loop_avx:
992238405Sjkim___
993238405Sjkim	&Xupdate_avx_16_31(\&body_00_19);
994238405Sjkim	&Xupdate_avx_16_31(\&body_00_19);
995238405Sjkim	&Xupdate_avx_16_31(\&body_00_19);
996238405Sjkim	&Xupdate_avx_16_31(\&body_00_19);
997238405Sjkim	&Xupdate_avx_32_79(\&body_00_19);
998238405Sjkim	&Xupdate_avx_32_79(\&body_20_39);
999238405Sjkim	&Xupdate_avx_32_79(\&body_20_39);
1000238405Sjkim	&Xupdate_avx_32_79(\&body_20_39);
1001238405Sjkim	&Xupdate_avx_32_79(\&body_20_39);
1002238405Sjkim	&Xupdate_avx_32_79(\&body_20_39);
1003238405Sjkim	&Xupdate_avx_32_79(\&body_40_59);
1004238405Sjkim	&Xupdate_avx_32_79(\&body_40_59);
1005238405Sjkim	&Xupdate_avx_32_79(\&body_40_59);
1006238405Sjkim	&Xupdate_avx_32_79(\&body_40_59);
1007238405Sjkim	&Xupdate_avx_32_79(\&body_40_59);
1008238405Sjkim	&Xupdate_avx_32_79(\&body_20_39);
1009238405Sjkim	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
1010238405Sjkim
				# Snapshot the generator state here: the same
				# final rounds are emitted twice, once for the
				# loop-back path and once for .Ldone_avx.
1011238405Sjkim				$saved_j=$j; @saved_V=@V;
1012238405Sjkim
	# Loop-back path: the remaining rounds overlap with preparing the
	# next input block.
1013238405Sjkim	&Xloop_avx(\&body_20_39);
1014238405Sjkim	&Xloop_avx(\&body_20_39);
1015238405Sjkim	&Xloop_avx(\&body_20_39);
1016238405Sjkim
1017238405Sjkim$code.=<<___;
1018238405Sjkim	add	0($ctx),$A			# update context
1019238405Sjkim	add	4($ctx),@T[0]
1020238405Sjkim	add	8($ctx),$C
1021238405Sjkim	add	12($ctx),$D
1022238405Sjkim	mov	$A,0($ctx)
1023238405Sjkim	add	16($ctx),$E
1024238405Sjkim	mov	@T[0],4($ctx)
1025238405Sjkim	mov	@T[0],$B			# magic seed
1026238405Sjkim	mov	$C,8($ctx)
1027238405Sjkim	mov	$D,12($ctx)
1028238405Sjkim	mov	$E,16($ctx)
1029238405Sjkim	jmp	.Loop_avx
1030238405Sjkim
1031238405Sjkim.align	16
1032238405Sjkim.Ldone_avx:
1033238405Sjkim___
				# Rewind to the snapshot so the tail is
				# generated from the same state as the loop
				# path above.
1034238405Sjkim				$j=$saved_j; @V=@saved_V;
1035238405Sjkim
	# Exit path: same rounds, scalar code only.
1036238405Sjkim	&Xtail_avx(\&body_20_39);
1037238405Sjkim	&Xtail_avx(\&body_20_39);
1038238405Sjkim	&Xtail_avx(\&body_20_39);
1039238405Sjkim
1040238405Sjkim$code.=<<___;
1041279264Sdelphij	vzeroupper
1042238405Sjkim
1043238405Sjkim	add	0($ctx),$A			# update context
1044238405Sjkim	add	4($ctx),@T[0]
1045238405Sjkim	add	8($ctx),$C
1046238405Sjkim	mov	$A,0($ctx)
1047238405Sjkim	add	12($ctx),$D
1048238405Sjkim	mov	@T[0],4($ctx)
1049238405Sjkim	add	16($ctx),$E
1050238405Sjkim	mov	$C,8($ctx)
1051238405Sjkim	mov	$D,12($ctx)
1052238405Sjkim	mov	$E,16($ctx)
1053238405Sjkim___
# Win64 only: reload %xmm6-%xmm10 from the save area at 64(%rsp).
1054238405Sjkim$code.=<<___ if ($win64);
1055238405Sjkim	movaps	64+0(%rsp),%xmm6
1056238405Sjkim	movaps	64+16(%rsp),%xmm7
1057238405Sjkim	movaps	64+32(%rsp),%xmm8
1058238405Sjkim	movaps	64+48(%rsp),%xmm9
1059238405Sjkim	movaps	64+64(%rsp),%xmm10
1060238405Sjkim___
# Epilogue: reload %r12/%rbp/%rbx from just above the frame (the offset
# includes the 5*16-byte XMM save area on Win64) and release the stack.
1061238405Sjkim$code.=<<___;
1062238405Sjkim	lea	`64+($win64?5*16:0)`(%rsp),%rsi
1063238405Sjkim	mov	0(%rsi),%r12
1064238405Sjkim	mov	8(%rsi),%rbp
1065238405Sjkim	mov	16(%rsi),%rbx
1066238405Sjkim	lea	24(%rsi),%rsp
1067238405Sjkim.Lepilogue_avx:
1068238405Sjkim	ret
1069238405Sjkim.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
1070238405Sjkim___
1071238405Sjkim}
# Constant table addressed through $K_XX_XX: four round constants, each
# replicated across a 16-byte vector (offsets 0, 16, 32, 48), followed by
# the byte-swap shuffle mask at offset 64.
1072238405Sjkim$code.=<<___;
1073238405Sjkim.align	64
1074238405SjkimK_XX_XX:
1075238405Sjkim.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1076238405Sjkim.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1077238405Sjkim.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1078238405Sjkim.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1079238405Sjkim.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
1080238405Sjkim___
1081238405Sjkim}}}
# Identification string embedded in the generated object, followed by
# 64-byte alignment padding.
1082238405Sjkim$code.=<<___;
1083183234Ssimon.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1084238405Sjkim.align	64
1085183234Ssimon___
1086183234Ssimon
# Win64 structured-exception-handling support; everything in this block is
# emitted only when $win64 is set.  Both handlers follow this prototype:
1087238405Sjkim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1088238405Sjkim#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1089238405Sjkimif ($win64) {
# Registers carrying the four handler arguments, in prototype order.
1090238405Sjkim$rec="%rcx";
1091238405Sjkim$frame="%rdx";
1092238405Sjkim$context="%r8";
1093238405Sjkim$disp="%r9";
1094238405Sjkim
# se_handler is registered for sha1_block_data_order and compares
# context->Rip against the fixed .Lprologue/.Lepilogue labels.
# ssse3_handler is registered for the _ssse3 and _avx routines: it takes its
# prologue/epilogue labels from HandlerData[] and additionally copies ten
# quadwords from the frame into the CONTEXT record at offset 512.
# Both end in .Lcommon_seh_tail, which copies the CONTEXT to
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
1095238405Sjkim$code.=<<___;
1096238405Sjkim.extern	__imp_RtlVirtualUnwind
1097238405Sjkim.type	se_handler,\@abi-omnipotent
1098238405Sjkim.align	16
1099238405Sjkimse_handler:
1100238405Sjkim	push	%rsi
1101238405Sjkim	push	%rdi
1102238405Sjkim	push	%rbx
1103238405Sjkim	push	%rbp
1104238405Sjkim	push	%r12
1105238405Sjkim	push	%r13
1106238405Sjkim	push	%r14
1107238405Sjkim	push	%r15
1108238405Sjkim	pushfq
1109238405Sjkim	sub	\$64,%rsp
1110238405Sjkim
1111238405Sjkim	mov	120($context),%rax	# pull context->Rax
1112238405Sjkim	mov	248($context),%rbx	# pull context->Rip
1113238405Sjkim
1114238405Sjkim	lea	.Lprologue(%rip),%r10
1115238405Sjkim	cmp	%r10,%rbx		# context->Rip<.Lprologue
1116238405Sjkim	jb	.Lcommon_seh_tail
1117238405Sjkim
1118238405Sjkim	mov	152($context),%rax	# pull context->Rsp
1119238405Sjkim
1120238405Sjkim	lea	.Lepilogue(%rip),%r10
1121238405Sjkim	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1122238405Sjkim	jae	.Lcommon_seh_tail
1123238405Sjkim
1124238405Sjkim	mov	`16*4`(%rax),%rax	# pull saved stack pointer
1125238405Sjkim	lea	32(%rax),%rax
1126238405Sjkim
1127238405Sjkim	mov	-8(%rax),%rbx
1128238405Sjkim	mov	-16(%rax),%rbp
1129238405Sjkim	mov	-24(%rax),%r12
1130238405Sjkim	mov	-32(%rax),%r13
1131238405Sjkim	mov	%rbx,144($context)	# restore context->Rbx
1132238405Sjkim	mov	%rbp,160($context)	# restore context->Rbp
1133238405Sjkim	mov	%r12,216($context)	# restore context->R12
1134238405Sjkim	mov	%r13,224($context)	# restore context->R13
1135238405Sjkim
1136238405Sjkim	jmp	.Lcommon_seh_tail
1137238405Sjkim.size	se_handler,.-se_handler
1138238405Sjkim
1139238405Sjkim.type	ssse3_handler,\@abi-omnipotent
1140238405Sjkim.align	16
1141238405Sjkimssse3_handler:
1142238405Sjkim	push	%rsi
1143238405Sjkim	push	%rdi
1144238405Sjkim	push	%rbx
1145238405Sjkim	push	%rbp
1146238405Sjkim	push	%r12
1147238405Sjkim	push	%r13
1148238405Sjkim	push	%r14
1149238405Sjkim	push	%r15
1150238405Sjkim	pushfq
1151238405Sjkim	sub	\$64,%rsp
1152238405Sjkim
1153238405Sjkim	mov	120($context),%rax	# pull context->Rax
1154238405Sjkim	mov	248($context),%rbx	# pull context->Rip
1155238405Sjkim
1156238405Sjkim	mov	8($disp),%rsi		# disp->ImageBase
1157238405Sjkim	mov	56($disp),%r11		# disp->HandlerData
1158238405Sjkim
1159238405Sjkim	mov	0(%r11),%r10d		# HandlerData[0]
1160238405Sjkim	lea	(%rsi,%r10),%r10	# prologue label
1161238405Sjkim	cmp	%r10,%rbx		# context->Rip<prologue label
1162238405Sjkim	jb	.Lcommon_seh_tail
1163238405Sjkim
1164238405Sjkim	mov	152($context),%rax	# pull context->Rsp
1165238405Sjkim
1166238405Sjkim	mov	4(%r11),%r10d		# HandlerData[1]
1167238405Sjkim	lea	(%rsi,%r10),%r10	# epilogue label
1168238405Sjkim	cmp	%r10,%rbx		# context->Rip>=epilogue label
1169238405Sjkim	jae	.Lcommon_seh_tail
1170238405Sjkim
1171238405Sjkim	lea	64(%rax),%rsi
1172238405Sjkim	lea	512($context),%rdi	# &context.Xmm6
1173238405Sjkim	mov	\$10,%ecx
1174238405Sjkim	.long	0xa548f3fc		# cld; rep movsq
1175238405Sjkim	lea	`24+64+5*16`(%rax),%rax	# adjust stack pointer
1176238405Sjkim
1177238405Sjkim	mov	-8(%rax),%rbx
1178238405Sjkim	mov	-16(%rax),%rbp
1179238405Sjkim	mov	-24(%rax),%r12
1180238405Sjkim	mov	%rbx,144($context)	# restore context->Rbx
1181238405Sjkim	mov	%rbp,160($context)	# restore context->Rbp
1182238405Sjkim	mov	%r12,216($context)	# restore cotnext->R12
1183238405Sjkim
1184238405Sjkim.Lcommon_seh_tail:
1185238405Sjkim	mov	8(%rax),%rdi
1186238405Sjkim	mov	16(%rax),%rsi
1187238405Sjkim	mov	%rax,152($context)	# restore context->Rsp
1188238405Sjkim	mov	%rsi,168($context)	# restore context->Rsi
1189238405Sjkim	mov	%rdi,176($context)	# restore context->Rdi
1190238405Sjkim
1191238405Sjkim	mov	40($disp),%rdi		# disp->ContextRecord
1192238405Sjkim	mov	$context,%rsi		# context
1193238405Sjkim	mov	\$154,%ecx		# sizeof(CONTEXT)
1194238405Sjkim	.long	0xa548f3fc		# cld; rep movsq
1195238405Sjkim
1196238405Sjkim	mov	$disp,%rsi
1197238405Sjkim	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1198238405Sjkim	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1199238405Sjkim	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1200238405Sjkim	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1201238405Sjkim	mov	40(%rsi),%r10		# disp->ContextRecord
1202238405Sjkim	lea	56(%rsi),%r11		# &disp->HandlerData
1203238405Sjkim	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1204238405Sjkim	mov	%r10,32(%rsp)		# arg5
1205238405Sjkim	mov	%r11,40(%rsp)		# arg6
1206238405Sjkim	mov	%r12,48(%rsp)		# arg7
1207238405Sjkim	mov	%rcx,56(%rsp)		# arg8, (NULL)
1208238405Sjkim	call	*__imp_RtlVirtualUnwind(%rip)
1209238405Sjkim
1210238405Sjkim	mov	\$1,%eax		# ExceptionContinueSearch
1211238405Sjkim	add	\$64,%rsp
1212238405Sjkim	popfq
1213238405Sjkim	pop	%r15
1214238405Sjkim	pop	%r14
1215238405Sjkim	pop	%r13
1216238405Sjkim	pop	%r12
1217238405Sjkim	pop	%rbp
1218238405Sjkim	pop	%rbx
1219238405Sjkim	pop	%rdi
1220238405Sjkim	pop	%rsi
1221238405Sjkim	ret
1222238405Sjkim.size	ssse3_handler,.-ssse3_handler
1223238405Sjkim
1224238405Sjkim.section	.pdata
1225238405Sjkim.align	4
1226238405Sjkim	.rva	.LSEH_begin_sha1_block_data_order
1227238405Sjkim	.rva	.LSEH_end_sha1_block_data_order
1228238405Sjkim	.rva	.LSEH_info_sha1_block_data_order
1229238405Sjkim	.rva	.LSEH_begin_sha1_block_data_order_ssse3
1230238405Sjkim	.rva	.LSEH_end_sha1_block_data_order_ssse3
1231238405Sjkim	.rva	.LSEH_info_sha1_block_data_order_ssse3
1232238405Sjkim___
# The .pdata triple (begin, end, info) for the AVX routine is added only
# when $avx is set.
1233238405Sjkim$code.=<<___ if ($avx);
1234238405Sjkim	.rva	.LSEH_begin_sha1_block_data_order_avx
1235238405Sjkim	.rva	.LSEH_end_sha1_block_data_order_avx
1236238405Sjkim	.rva	.LSEH_info_sha1_block_data_order_avx
1237238405Sjkim___
# .xdata records: each names its handler; the ssse3_handler users also
# carry the prologue/epilogue label pair the handler reads as HandlerData[].
1238238405Sjkim$code.=<<___;
1239238405Sjkim.section	.xdata
1240238405Sjkim.align	8
1241238405Sjkim.LSEH_info_sha1_block_data_order:
1242238405Sjkim	.byte	9,0,0,0
1243238405Sjkim	.rva	se_handler
1244238405Sjkim.LSEH_info_sha1_block_data_order_ssse3:
1245238405Sjkim	.byte	9,0,0,0
1246238405Sjkim	.rva	ssse3_handler
1247238405Sjkim	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
1248238405Sjkim___
# The AVX routine reuses ssse3_handler with its own label pair.
1249238405Sjkim$code.=<<___ if ($avx);
1250238405Sjkim.LSEH_info_sha1_block_data_order_avx:
1251238405Sjkim	.byte	9,0,0,0
1252238405Sjkim	.rva	ssse3_handler
1253238405Sjkim	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
1254238405Sjkim___
1255238405Sjkim}
1256238405Sjkim
1257183234Ssimon####################################################################
1258183234Ssimon
# Expand every `...` span in the accumulated text by evaluating its contents
# as Perl (this is how computed offsets such as `16*4` become literals), then
# write the finished assembly to STDOUT.
1259183234Ssimon$code =~ s/\`([^\`]*)\`/eval $1/gem;
1260183234Ssimonprint $code;
# Buffered write errors only surface at close, so fail loudly rather than
# leave truncated output behind with a success exit status.
1261183234Ssimonclose STDOUT or die "error closing STDOUT: $!";
1262