sha1-x86_64.pl revision 306195
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# well as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.

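# For reference, the scalar recurrence that the SIMD paths below
# vectorize is (a documentation-only sketch; rotl32 is a hypothetical
# helper named here purely for illustration):
#
#	W[t] = rotl32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
#
# The vector code computes four W[t] values per 128-bit register and
# spills W[t]+K to the stack for the integer rounds to pick up.
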
# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
# and loading pair of consecutive blocks to 256-bit %ymm registers)
# did not provide impressive performance improvement till a crucial
# hint regarding the number of Xupdate iterations to pre-compute in
# advance was provided by Ilya Albrekht of Intel Corp.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in the following table. Numbers are
# CPU clock cycles spent to process a single byte (less is better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Bulldozer	9.11		5.95/+53%
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([2-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

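# At this point $avx holds 0, 1 or 2: 0 - the assembler is only trusted
# with SSSE3, 1 - the AVX path is compiled in as well, 2 - AVX2+BMI too
# (compare the "if ($avx)" and "if ($avx>1)" guards below).
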
$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}

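# A pure-Perl rendition of one F_00_19 round, added as a hedged,
# documentation-only sketch of what BODY_00_19 emits; it is never
# called by the generator. Note that ((c^d)&b)^d used above is just
# Ch(b,c,d)=(b&c)|(~b&d) rewritten to need a single temporary.
sub _ref_body_00_19 {
	my ($a,$b,$c,$d,$e,$w) = @_;	# $w = big-endian message word X[i]
	my $rol = sub { my ($x,$n)=@_; (($x<<$n)|($x>>(32-$n))) & 0xffffffff };
	my $f = (($c ^ $d) & $b) ^ $d;	# Ch(b,c,d)
	$e = ($e + $rol->($a,5) + $f + $w + 0x5a827999) & 0xffffffff;
	return ($e, $a, $rol->($b,30), $c, $d);	# rotated (a,b,c,d,e)
}
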
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}

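# BODY_40_59 feeds Maj(b,c,d) into $e as two separate 'add's rather
# than one 'or': Maj(b,c,d) = (c&d) | (b&(c^d)), and the two terms can
# never share a set bit, so '|' may be replaced by '+' and the two
# dependency chains kept apart. A throwaway self-check (documentation
# only, never called):
sub _check_maj_split {
	for my $v (0..7) {
		my ($b,$c,$d) = map { ($v>>$_)&1 } 2,1,0;
		my $maj = ($b&$c)|($b&$d)|($c&$d);
		die "bad Maj split" if $maj != (($c&$d) + ($b&($c^$d)));
	}
	return 1;
}
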
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
___
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec		$num
	lea		0x40($inp),%r8		# next input block
	paddd		@MSG[0],$E
	cmovne		%r8,$inp
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
my $K_XX_XX="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16(%rax)
	movaps	%xmm7,-40-5*16(%rax)
	movaps	%xmm8,-40-4*16(%rax)
	movaps	%xmm9,-40-3*16(%rax)
	movaps	%xmm10,-40-2*16(%rax)
	movaps	%xmm11,-40-1*16(%rax)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	%rax,%r14	# original %rsp
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

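# Example: &pxor(@X[0],@Tx[1]) finds no pxor sub, lands in AUTOLOAD
# above and appends "\tpxor\t%xmm9,%xmm4\n" to $code (with the initial
# @X/@Tx assignments; operands come out in AT&T order). A bare numeric
# last argument gains a '$' prefix, so &psrld(@Tx[0],31) emits
# "psrld $31,...".
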
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',	# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',	# $c^$d for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor	($b,$c)',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round

	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c

	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
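
# Note on the scheduling scheme: the body_* subs above return lists of
# Perl snippets instead of emitting code directly. Each Xupdate_ssse3_*
# sub collects four round bodies into @insns and eval()s one snippet
# between SIMD instructions, e.g. (shape only):
#
#	my @insns = (&$body,&$body,&$body,&$body);
#	&pshufd(...); eval(shift(@insns)); eval(shift(@insns));
#
# so integer rounds and message-schedule updates interleave in the
# emitted stream.
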
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16(%r14),%xmm6
	movaps	-40-5*16(%r14),%xmm7
	movaps	-40-4*16(%r14),%xmm8
	movaps	-40-3*16(%r14),%xmm9
	movaps	-40-2*16(%r14),%xmm10
	movaps	-40-1*16(%r14),%xmm11
___
$code.=<<___;
	lea	(%r14),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

if ($avx) {
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16(%rax)
	vmovaps	%xmm7,-40-5*16(%rax)
	vmovaps	%xmm8,-40-4*16(%rax)
	vmovaps	%xmm9,-40-3*16(%rax)
	vmovaps	%xmm10,-40-2*16(%rax)
	vmovaps	%xmm11,-40-1*16(%rax)
.Lprologue_avx:
___
$code.=<<___;
	mov	%rax,%r14	# original %rsp
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16(%r14),%xmm6
	movaps	-40-5*16(%r14),%xmm7
	movaps	-40-4*16(%r14),%xmm8
	movaps	-40-3*16(%r14),%xmm9
	movaps	-40-2*16(%r14),%xmm10
	movaps	-40-1*16(%r14),%xmm11
___
$code.=<<___;
	lea	(%r14),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___

if ($avx>1) {
use integer;
$Xi=4;					# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

$code.=<<___;
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
_avx2_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16(%rax)
	vmovaps	%xmm7,-40-5*16(%rax)
	vmovaps	%xmm8,-40-4*16(%rax)
	vmovaps	%xmm9,-40-3*16(%rax)
	vmovaps	%xmm10,-40-2*16(%rax)
	vmovaps	%xmm11,-40-1*16(%rax)
.Lprologue_avx2:
___
$code.=<<___;
	mov	%rax,%r14		# original %rsp
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	 lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	 cmp	$num,$frame
	 cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu		($inp),%xmm0
	vmovdqu		16($inp),%xmm1
	vmovdqu		32($inp),%xmm2
	vmovdqu		48($inp),%xmm3
	lea		64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb		@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb		@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb		@X[2],@X[-2&7],@X[-2&7]
	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
	vpshufb		@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
___
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',

	'&lea	($e,"($e,$f)")',		# e+=b^c^d
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
	'&xor	($a,$b)		if ($j<79)',	# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round

	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 in next round
	'&xor	($a,$b)',			# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

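# The bodyx_* rounds above lean on BMI1/BMI2: andn forms ~b&d in one
# non-destructive instruction, and rorx rotates into a separate
# destination register, so the b>>>2 needed by the next round can be
# produced off the critical path. That is what yields the 2-3 cycle
# critical paths quoted in the comments.
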
sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 foreach (@insns) { eval; }
}

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi			# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame			# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___

$Xi=4;				# reset variables
@X=map("%ymm$_",(4..7,0..3));

$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
	cmp	$num,%rdi			# borrowed $t0
	ja	.Last_avx2

	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
	vmovdqu		-48(%rdi),%xmm1
	vmovdqu		-32(%rdi),%xmm2
	vmovdqu		-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);

	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&Xloop_avx2	(\&bodyx_20_39);
	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("0(%rsp)",@Tx[0]);
	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("32(%rsp)",@Tx[1]);
	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
	  &vpaddd	(@X[2],@X[-2&7],$Kx);

	&Xloop_avx2	(\&bodyx_40_59);
	&align32	();
	  &vmovdqu	("64(%rsp)",@X[2]);
	  &vpaddd	(@X[3],@X[-1&7],$Kx);
	&Xloop_avx2	(\&bodyx_40_59);
	  &vmovdqu	("96(%rsp)",@X[3]);
	&Xloop_avx2	(\&bodyx_40_59);
	&Xupdate_avx2_16_31(\&bodyx_40_59);

	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xloop_avx2	(\&bodyx_20_39);

$code.=<<___;
	lea	128(%rsp),$frame

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-40-6*16(%r14),%xmm6
	movaps	-40-5*16(%r14),%xmm7
	movaps	-40-4*16(%r14),%xmm8
	movaps	-40-3*16(%r14),%xmm9
	movaps	-40-2*16(%r14),%xmm10
	movaps	-40-1*16(%r14),%xmm11
___
$code.=<<___;
	lea	(%r14),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue_avx2:
	ret
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
}
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	232($context),%rax	# pull context->R14

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}

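# Example of the fallback encoder above (assuming these operands):
# "sha1rnds4 $3,%xmm1,%xmm0" comes back as ".byte 15,58,204,193,3",
# i.e. 0x0f,0x3a,0xcc in decimal, ModR/M 0xc1 (xmm1 in r/m, xmm0 in
# reg) and immediate 3 selecting the K_60_79 round function.
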
sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
  		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
      my $rex=0;
	$rex|=0x04			if ($2>=8);
	$rex|=0x01			if ($1>=8);
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;