#! /usr/bin/env perl
# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind the 32-bit assembler implementation. This is unlike
# Opteron, where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate the 32-bit code,
# but I dismissed it, reasoning that x86_64 offers enough register
# bank capacity to fully utilize SHA-1 parallelism. Therefore this
# fresh implementation:-) However! While 64-bit code does perform
# better on Opteron, I failed to beat 32-bit assembler on EM64T core.
# Well, x86_64 does offer a larger *addressable* bank, but the
# out-of-order core reaches for even more registers through dynamic
# aliasing, and the EM64T core must have managed to run-time optimize
# even 32-bit code just as well as 64-bit one. Performance improvement
# is summarized in the following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.
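#
# To illustrate the idea (a sketch, not the exact schedule used
# below): instead of emitting the dependent pair back to back,
#
#	bswap	$xi[0]
#	lea	0x5a827999($xi[0],$e),$e
#
# the round bodies issue the load and byte swap of an X[] word
# several instructions ahead of the 'lea' that consumes it.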

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload the message schedule, denoted by Wt in the NIST
# specification, or Xupdate in the OpenSSL source, to the SIMD unit.
# See the sha1-586.pl module for background and implementation
# details. The only difference from the 32-bit code is that 64-bit
# code doesn't have to spill @X[] elements to free temporary registers.
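#
# For reference, the schedule being offloaded is the FIPS 180
# recurrence, computed four elements at a time in the SIMD code:
#
#	W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) <<< 1,  16<=t<80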

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
# and loading pair of consecutive blocks to 256-bit %ymm registers)
# did not provide impressive performance improvement till a crucial
# hint regarding the number of Xupdate iterations to pre-compute in
# advance was provided by Ilya Albrekht of Intel Corp.
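#
# Concretely, each 256-bit %ymm register in the AVX2 path holds the
# same quantity for two consecutive blocks, one per 128-bit lane (see
# the vinserti128 loads below), so a single Xupdate pass serves both.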

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in the following table. Numbers are
# CPU clock cycles spent to process a single byte (less is better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Skylake	5.18		4.06/+28%	3.54/+46%
# Bulldozer	9.11		5.95/+53%
# Ryzen		4.75		3.80/+24%	1.93/+150%(**)
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
# Knights L	13.2(*)		9.68/+36%	8.30/+59%
# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;
# (**)	SHAEXT result

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
# did not tie them together until after $shaext was added.
$avx = 1;

# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
$shaext=0;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

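# Scalar round bodies. Each 20-round group applies its own boolean
# function F and constant K from FIPS 180:
#	rounds  0..19:	F=(b&c)|(~b&d) = ((c^d)&b)^d	K=0x5a827999
#	rounds 20..39:	F=b^c^d				K=0x6ed9eba1
#	rounds 40..59:	F=(b&c)|(b&d)|(c&d)		K=0x8f1bbcdc
#	rounds 60..79:	F=b^c^d				K=0xca62c1d6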
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}

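# Rounds 20-39 and 60-79 share the parity function b^c^d; only the
# constant $K differs, so a single body covers both groups.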
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}

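# Majority function for rounds 40-59, computed as ($c&$d)+(($c^$d)&$b):
# the two terms are bitwise disjoint, so their sum equals their OR,
# i.e. MAJ(b,c,d), and both can simply be added into $e below.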
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

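# OPENSSL_ia32cap_P layout follows the rest of the tree: word 0 is
# EDX of CPUID leaf 1 (with bit 30 doubling as the "Intel CPU" flag),
# word 1 is ECX of leaf 1, word 2 is EBX of leaf 7.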
.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	leaq	OPENSSL_ia32cap_P(%rip),%r10
	mov	0(%r10),%r9d
	mov	4(%r10),%r8d
	mov	8(%r10),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
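# Each round body permutes @V instead of moving data: the registers
# keep their contents while their a/b/c/d/e roles rotate, which avoids
# explicit register-to-register moves between rounds.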
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
___
$code.=<<___ if ($win64);
	mov	%rsp,%rax	# %rax must carry the caller's %rsp for
				# the epilogue and SEH handler below
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec		$num
	lea		0x40($inp),%r8		# next input block
	paddd		@MSG[0],$E
	cmovne		%r8,$inp
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
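# 16 of the 20 sha1rnds4 (four rounds each) are emitted by this loop;
# the immediate `int($i/5)` selects the 20-round group, i.e. which
# F/K pair the instruction applies. The last four follow unrolled.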
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
my $K_XX_XX="%r14";
my $fp="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	mov	%rsp,$fp	# frame pointer
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

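# The Xupdate_* stages below produce four message-schedule elements
# per call while scalar round code, interleaved via @insns, consumes
# previously prepared X[]+K words from the stack. Within a 4-wide
# vector W[t+3] depends on W[t], so the fourth element is first
# computed with that term missing and then patched via the <<<2
# correction noted in the comments.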
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',	# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',	# $c^$d for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor	($b,$c)',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round

	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c

	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

if ($avx) {
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };
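# shld/shrd with identical first and second register operands are
# effectively plain rotates; routing $_rol/$_ror through them lets the
# shared body_* generators emit double-shift encodings in this path
# while the SSSE3 path keeps ordinary rol/ror.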

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	mov	%rsp,$fp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_avx:
	ret
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___

if ($avx>1) {
use integer;
$Xi=4;					# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

$code.=<<___;
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
_avx2_shortcut:
	mov	%rsp,$fp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	 lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	 cmp	$num,$frame
	 cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu		($inp),%xmm0
	vmovdqu		16($inp),%xmm1
	vmovdqu		32($inp),%xmm2
	vmovdqu		48($inp),%xmm3
	lea		64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb		@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb		@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb		@X[2],@X[-2&7],@X[-2&7]
	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
	vpshufb		@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
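	# the four instructions above seed the first round: $B=b>>>2
	# via rorx, and $F=(b&c)^(~b&d) assembled with andn/and/xor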
___
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',

	'&lea	($e,"($e,$f)")',		# e+=b^c^d
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
	'&xor	($a,$b)		if ($j<79)',	# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round

	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 in next round
	'&xor	($a,$b)',			# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 foreach (@insns) { eval; }
}

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi			# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame			# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___

$Xi=4;				# reset variables
@X=map("%ymm$_",(4..7,0..3));

$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
	cmp	$num,%rdi			# borrowed $t0
	ja	.Last_avx2

	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
	vmovdqu		-48(%rdi),%xmm1
	vmovdqu		-32(%rdi),%xmm2
	vmovdqu		-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);

	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&Xloop_avx2	(\&bodyx_20_39);
	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("0(%rsp)",@Tx[0]);
	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("32(%rsp)",@Tx[1]);
	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
	  &vpaddd	(@X[2],@X[-2&7],$Kx);

	&Xloop_avx2	(\&bodyx_40_59);
	&align32	();
	  &vmovdqu	("64(%rsp)",@X[2]);
	  &vpaddd	(@X[3],@X[-1&7],$Kx);
	&Xloop_avx2	(\&bodyx_40_59);
	  &vmovdqu	("96(%rsp)",@X[3]);
	&Xloop_avx2	(\&bodyx_40_59);
	&Xupdate_avx2_16_31(\&bodyx_40_59);

	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xloop_avx2	(\&bodyx_20_39);

$code.=<<___;
	lea	128(%rsp),$frame

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_avx2:
	ret
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
}
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

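# The two helpers below hand-assemble SHA-NI instructions into .byte
# sequences so that the module still assembles with toolchains that
# predate the extension: sha1rnds4 is encoded as 0f 3a cc /r ib, while
# the 0f 38-based forms (sha1nexte/sha1msg1/sha1msg2) go through
# sha1op38().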
sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
  		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
      my $rex=0;
	$rex|=0x04			if ($2>=8);
	$rex|=0x01			if ($1>=8);
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
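
# For example, "sha1msg1	%xmm4,%xmm0" is rewritten by the filter
# below as ".byte	0x0f,0x38,0xc9,0xc4", where ModR/M 0xc4 =
# 0xc0|4|(0<<3) encodes src %xmm4 in r/m and dst %xmm0 in reg.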

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;
