1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# sha256/512_block procedure for x86_64.
10#
11# 40% improvement over compiler-generated code on Opteron. On EM64T
12# sha256 was observed to run >80% faster and sha512 - >40%. No magical
13# tricks, just straight implementation... I really wonder why gcc
14# [being armed with inline assembler] fails to generate as fast code.
15# The only thing which is cool about this module is that it's the very
16# same instruction sequence used for both SHA-256 and SHA-512. In the
17# former case the instructions operate on 32-bit operands, while in the
18# latter on 64-bit ones. All I had to do was to get one flavor right,
19# the other one passed the test right away:-)
20#
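# For reference, the per-flavor parameterization set up further down in
# this file: apart from the operand size ($SZ, 4 vs. 8 bytes), the round
# count (64 vs. 80) and the K table, the flavors differ essentially only
# in their rotation/shift constants, e.g.
#
#	SHA-256: Sigma0 = ROR 2,13,22	Sigma1 = ROR 6,11,25
#	SHA-512: Sigma0 = ROR 28,34,39	Sigma1 = ROR 14,18,41
#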
21# sha256_block runs in ~1005 cycles on Opteron, which gives you
22# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
23# frequency in GHz. sha512_block runs in ~1275 cycles, which results
24# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
25# Well, if you compare it to the IA-64 implementation, which maintains
26# X[16] in the register bank[!], sustains close to 4 instructions per
27# CPU clock cycle and runs in 1003 cycles, then 1275 is a very good result
28# for the 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
29# there is a way to improve it, *then* the only way would be to try to
30# offload X[16] updates to SSE unit, but that would require "deeper"
31# loop unroll, which in turn would naturally cause size blow-up, not
32# to mention increased complexity! And once again, only *if* it's
33# actually possible to noticeably improve overall ILP, instruction
34# level parallelism, on a given CPU implementation in this case.
35#
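# [The arithmetic above, generalized: with a 64-byte SHA-256 block and a
#  128-byte SHA-512 block,
#
#	MBps per GHz = block_size_in_bytes * 1000 / cycles_per_block
#
#  hence 64*1000/1005 ~= 63.7 and 128*1000/1275 ~= 100.]
#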
36# Special note on Intel EM64T. While Opteron CPU exhibits perfect
37# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
38# [currently available] EM64T CPUs apparently are far from it. On the
39# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
40# sha256_block:-( This is presumably because 64-bit shifts/rotates
41# are not atomic instructions, but are implemented in microcode.
42#
43# May 2012.
44#
45# Optimization including one of Pavel Semjanov's ideas, alternative
46# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
47# unfortunately -2% SHA512 on P4 [which nobody should care about
48# that much].
49#
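# [The "alternative Maj" is the identity Maj(a,b,c) = b ^ ((a^b)&(b^c)),
#  which lets the a^b value computed in one round be reused as b^c in the
#  next one; see the "a^b, b^c in next round" comments in the round
#  bodies below.]
#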
50# June 2012.
51#
52# Add SIMD code paths, see below for improvement coefficients. An SSSE3
53# code path was not attempted for SHA512, because the estimated
54# improvement, noticeably less than 9%, is not high enough to justify
55# the effort, at least not on pre-AVX processors. [The obvious exception
56# is VIA Nano, but it has a SHA512 instruction that is faster and
57# should be used instead.] For reference, the corresponding estimated
58# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
59# higher coefficients are observed on VIA Nano and Bulldozer has more
60# to do with specifics of their architecture [which is a topic for a
61# separate discussion].
62#
63# November 2012.
64#
65# Add AVX2 code path. Two consecutive input blocks are loaded into
66# 256-bit %ymm registers, with data from the first block in the least
67# significant 128-bit halves and data from the second in the most
68# significant ones. The data is then processed with the same SIMD
69# instruction sequence as for AVX, but with %ymm registers as operands.
70# A side effect is an increased stack frame, 448 additional bytes for
71# SHA256 and 1152 for SHA512, and a 1.2KB code size increase.
72#
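# As a sketch of the layout described above, the .Loop_avx2 code below
# builds each 256-bit register roughly as follows (offsets symbolic):
#
#	vmovdqu		<block0 chunk>($inp),%xmm0		# low 128 bits
#	vinserti128	$1,<block1 chunk>(%r12),%ymm0,%ymm0	# high 128 bits
#
# and then runs the same vpshufb/vpadd[dq] schedule set-up as the AVX
# path, only on %ymm registers.
#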
73# March 2014.
74#
75# Add support for Intel SHA Extensions.
76
77######################################################################
78# Current performance in cycles per processed byte (less is better):
79#
80#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
81#
82# AMD K8	14.9	-	    -		    9.57    -
83# P4		17.3	-	    -		    30.8    -
84# Core 2	15.6	13.8(+13%)  -		    9.97    -
85# Westmere	14.8	12.3(+19%)  -		    9.58    -
86# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
87# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
88# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
89# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
90# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
91# Atom		23.0	18.9(+22%)  -		    14.7    -
92# Silvermont	27.4	20.6(+33%)  -               17.5    -
93#
94# (*)	whichever is best applicable;
95# (**)	switch from ror to shrd accounts for a fair share of improvement;
96# (***)	execution time is fully determined by the remaining integer-only
97#	part, body_00_15; reducing the amount of SIMD instructions
98#	below a certain limit makes no difference/sense; to conserve
99#	space the SHA256 XOP code path is therefore omitted;
100
101$flavour = shift;
102$output  = shift;
103if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
104
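# Typical invocation, via the perlasm framework (illustrative; the exact
# file names come from the build system): the first argument is the
# assembler flavour, the second the output file, e.g.
#
#	perl sha512-x86_64.pl elf  sha512-x86_64.s
#	perl sha512-x86_64.pl nasm sha256-x86_64.asm
#
# A single argument containing a dot is taken as the output file name
# with the flavour left undefined; whether SHA256 or SHA512 code is
# emitted depends on the output name matching /512/ (see below).
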
105$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
106
107$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
108( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
109( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
110die "can't locate x86_64-xlate.pl";
111
112if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
113		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
114	$avx = ($1>=2.19) + ($1>=2.22);
115}
116
117if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
118	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
119	$avx = ($1>=2.09) + ($1>=2.10);
120}
121
122if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
123	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
124	$avx = ($1>=10) + ($1>=11);
125}
126
127if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
128	$avx = ($2>=3.0) + ($2>3.0);
129}
130
131$shaext=1;	### set to zero if compiling for 1.0.1
132$avx=1		if (!$shaext && $avx);
133
134open OUT,"| \"$^X\" $xlate $flavour $output";
135*STDOUT=*OUT;
136
137if ($output =~ /512/) {
138	$func="sha512_block_data_order";
139	$TABLE="K512";
140	$SZ=8;
141	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
142					"%r8", "%r9", "%r10","%r11");
143	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
144	@Sigma0=(28,34,39);
145	@Sigma1=(14,18,41);
146	@sigma0=(1,  8, 7);
147	@sigma1=(19,61, 6);
148	$rounds=80;
149} else {
150	$func="sha256_block_data_order";
151	$TABLE="K256";
152	$SZ=4;
153	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
154					"%r8d","%r9d","%r10d","%r11d");
155	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
156	@Sigma0=( 2,13,22);
157	@Sigma1=( 6,11,25);
158	@sigma0=( 7,18, 3);
159	@sigma1=(17,19,10);
160	$rounds=64;
161}
162
163$ctx="%rdi";	# 1st arg, zapped by $a3
164$inp="%rsi";	# 2nd arg
165$Tbl="%rbp";
166
167$_ctx="16*$SZ+0*8(%rsp)";
168$_inp="16*$SZ+1*8(%rsp)";
169$_end="16*$SZ+2*8(%rsp)";
170$_rsp="16*$SZ+3*8(%rsp)";
171$framesz="16*$SZ+4*8";
172
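# A sketch of the stack frame implied by the offsets above (integer-only
# code path; the SIMD paths extend it for XMM spills on Win64):
#
#	 0(%rsp)		X[0..15] message schedule, 16*$SZ bytes
#	16*$SZ+0*8(%rsp)	saved ctx pointer (1st arg)
#	16*$SZ+1*8(%rsp)	saved inp pointer (2nd arg)
#	16*$SZ+2*8(%rsp)	end-of-input pointer
#	16*$SZ+3*8(%rsp)	caller's %rsp
#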
173
174sub ROUND_00_15()
175{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
176  my $STRIDE=$SZ;
177     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
178
179$code.=<<___;
180	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
181	mov	$f,$a2
182
183	xor	$e,$a0
184	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
185	xor	$g,$a2			# f^g
186
187	mov	$T1,`$SZ*($i&0xf)`(%rsp)
188	xor	$a,$a1
189	and	$e,$a2			# (f^g)&e
190
191	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
192	add	$h,$T1			# T1+=h
193	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
194
195	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
196	xor	$e,$a0
197	add	$a2,$T1			# T1+=Ch(e,f,g)
198
199	mov	$a,$a2
200	add	($Tbl),$T1		# T1+=K[round]
201	xor	$a,$a1
202
203	xor	$b,$a2			# a^b, b^c in next round
204	ror	\$$Sigma1[0],$a0	# Sigma1(e)
205	mov	$b,$h
206
207	and	$a2,$a3
208	ror	\$$Sigma0[0],$a1	# Sigma0(a)
209	add	$a0,$T1			# T1+=Sigma1(e)
210
211	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
212	add	$T1,$d			# d+=T1
213	add	$T1,$h			# h+=T1
214
215	lea	$STRIDE($Tbl),$Tbl	# round++
216___
217$code.=<<___ if ($i<15);
218	add	$a1,$h			# h+=Sigma0(a)
219___
220	($a2,$a3) = ($a3,$a2);
221}
222
223sub ROUND_16_XX()
224{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
225
226$code.=<<___;
227	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
228	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
229
230	mov	$a0,$T1
231	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
232	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
233	mov	$a2,$a1
234	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
235
236	xor	$T1,$a0
237	shr	\$$sigma0[2],$T1
238	ror	\$$sigma0[0],$a0
239	xor	$a1,$a2
240	shr	\$$sigma1[2],$a1
241
242	ror	\$$sigma1[0],$a2
243	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
244	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
245	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
246
247	add	`$SZ*($i&0xf)`(%rsp),$T1
248	mov	$e,$a0
249	add	$a2,$T1
250	mov	$a,$a1
251___
252	&ROUND_00_15(@_);
253}
254
255$code=<<___;
256.text
257
258.extern	OPENSSL_ia32cap_P
259.globl	$func
260.type	$func,\@function,3
261.align	16
262$func:
263___
264$code.=<<___ if ($SZ==4 || $avx);
265	lea	OPENSSL_ia32cap_P(%rip),%r11
266	mov	0(%r11),%r9d
267	mov	4(%r11),%r10d
268	mov	8(%r11),%r11d
269___
270$code.=<<___ if ($SZ==4 && $shaext);
271	test	\$`1<<29`,%r11d		# check for SHA
272	jnz	_shaext_shortcut
273___
274$code.=<<___ if ($avx && $SZ==8);
275	test	\$`1<<11`,%r10d		# check for XOP
276	jnz	.Lxop_shortcut
277___
278$code.=<<___ if ($avx>1);
279	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
280	cmp	\$`1<<8|1<<5|1<<3`,%r11d
281	je	.Lavx2_shortcut
282___
283$code.=<<___ if ($avx);
284	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
285	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
286	or	%r9d,%r10d
287	cmp	\$`1<<28|1<<9|1<<30`,%r10d
288	je	.Lavx_shortcut
289___
290$code.=<<___ if ($SZ==4);
291	test	\$`1<<9`,%r10d
292	jnz	.Lssse3_shortcut
293___
294$code.=<<___;
295	push	%rbx
296	push	%rbp
297	push	%r12
298	push	%r13
299	push	%r14
300	push	%r15
301	mov	%rsp,%r11		# copy %rsp
302	shl	\$4,%rdx		# num*16
303	sub	\$$framesz,%rsp
304	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
305	and	\$-64,%rsp		# align stack frame
306	mov	$ctx,$_ctx		# save ctx, 1st arg
307	mov	$inp,$_inp		# save inp, 2nd arg
308	mov	%rdx,$_end		# save end pointer, "3rd" arg
309	mov	%r11,$_rsp		# save copy of %rsp
310.Lprologue:
311
312	mov	$SZ*0($ctx),$A
313	mov	$SZ*1($ctx),$B
314	mov	$SZ*2($ctx),$C
315	mov	$SZ*3($ctx),$D
316	mov	$SZ*4($ctx),$E
317	mov	$SZ*5($ctx),$F
318	mov	$SZ*6($ctx),$G
319	mov	$SZ*7($ctx),$H
320	jmp	.Lloop
321
322.align	16
323.Lloop:
324	mov	$B,$a3
325	lea	$TABLE(%rip),$Tbl
326	xor	$C,$a3			# magic
327___
328	for($i=0;$i<16;$i++) {
329		$code.="	mov	$SZ*$i($inp),$T1\n";
330		$code.="	mov	@ROT[4],$a0\n";
331		$code.="	mov	@ROT[0],$a1\n";
332		$code.="	bswap	$T1\n";
333		&ROUND_00_15($i,@ROT);
334		unshift(@ROT,pop(@ROT));
335	}
336$code.=<<___;
337	jmp	.Lrounds_16_xx
338.align	16
339.Lrounds_16_xx:
340___
341	for(;$i<32;$i++) {
342		&ROUND_16_XX($i,@ROT);
343		unshift(@ROT,pop(@ROT));
344	}
345
346$code.=<<___;
347	cmpb	\$0,`$SZ-1`($Tbl)
348	jnz	.Lrounds_16_xx
349
350	mov	$_ctx,$ctx
351	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
352	lea	16*$SZ($inp),$inp
353
354	add	$SZ*0($ctx),$A
355	add	$SZ*1($ctx),$B
356	add	$SZ*2($ctx),$C
357	add	$SZ*3($ctx),$D
358	add	$SZ*4($ctx),$E
359	add	$SZ*5($ctx),$F
360	add	$SZ*6($ctx),$G
361	add	$SZ*7($ctx),$H
362
363	cmp	$_end,$inp
364
365	mov	$A,$SZ*0($ctx)
366	mov	$B,$SZ*1($ctx)
367	mov	$C,$SZ*2($ctx)
368	mov	$D,$SZ*3($ctx)
369	mov	$E,$SZ*4($ctx)
370	mov	$F,$SZ*5($ctx)
371	mov	$G,$SZ*6($ctx)
372	mov	$H,$SZ*7($ctx)
373	jb	.Lloop
374
375	mov	$_rsp,%rsi
376	mov	(%rsi),%r15
377	mov	8(%rsi),%r14
378	mov	16(%rsi),%r13
379	mov	24(%rsi),%r12
380	mov	32(%rsi),%rbp
381	mov	40(%rsi),%rbx
382	lea	48(%rsi),%rsp
383.Lepilogue:
384	ret
385.size	$func,.-$func
386___
387
388if ($SZ==4) {
389$code.=<<___;
390.align	64
391.type	$TABLE,\@object
392$TABLE:
393	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
394	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
395	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
396	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
397	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
398	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
399	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
400	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
401	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
402	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
403	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
404	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
405	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
406	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
407	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
408	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
409	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
410	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
411	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
412	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
413	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
414	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
415	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
416	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
417	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
418	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
419	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
420	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
421	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
422	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
423	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
424	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
425
426	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
427	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
428	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
429	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
430	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
431	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
432	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
433___
434} else {
435$code.=<<___;
436.align	64
437.type	$TABLE,\@object
438$TABLE:
439	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
440	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
441	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
442	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
443	.quad	0x3956c25bf348b538,0x59f111f1b605d019
444	.quad	0x3956c25bf348b538,0x59f111f1b605d019
445	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
446	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
447	.quad	0xd807aa98a3030242,0x12835b0145706fbe
448	.quad	0xd807aa98a3030242,0x12835b0145706fbe
449	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
450	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
451	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
452	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
453	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
454	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
455	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
456	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
457	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
458	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
459	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
460	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
461	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
462	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
463	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
464	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
465	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
466	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
467	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
468	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
469	.quad	0x06ca6351e003826f,0x142929670a0e6e70
470	.quad	0x06ca6351e003826f,0x142929670a0e6e70
471	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
472	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
473	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
474	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
475	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
476	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
477	.quad	0x81c2c92e47edaee6,0x92722c851482353b
478	.quad	0x81c2c92e47edaee6,0x92722c851482353b
479	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
480	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
481	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
482	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
483	.quad	0xd192e819d6ef5218,0xd69906245565a910
484	.quad	0xd192e819d6ef5218,0xd69906245565a910
485	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
486	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
487	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
488	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
489	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
490	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
491	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
492	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
493	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
494	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
495	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
496	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
497	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
498	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
499	.quad	0x90befffa23631e28,0xa4506cebde82bde9
500	.quad	0x90befffa23631e28,0xa4506cebde82bde9
501	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
502	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
503	.quad	0xca273eceea26619c,0xd186b8c721c0c207
504	.quad	0xca273eceea26619c,0xd186b8c721c0c207
505	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
506	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
507	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
508	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
509	.quad	0x113f9804bef90dae,0x1b710b35131c471b
510	.quad	0x113f9804bef90dae,0x1b710b35131c471b
511	.quad	0x28db77f523047d84,0x32caab7b40c72493
512	.quad	0x28db77f523047d84,0x32caab7b40c72493
513	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
514	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
515	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
516	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
517	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
518	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
519
520	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
521	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
522	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
523___
524}
525
526######################################################################
527# SIMD code paths
528#
529if ($SZ==4 && $shaext) {{{
530######################################################################
531# Intel SHA Extensions implementation of SHA256 update function.
532#
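# A brief note on the instruction roles (generic SHA-NI behaviour, not
# specific to this file): each sha256rnds2 performs two rounds, taking
# schedule words pre-added to the round constants from the implicit
# %xmm0 operand ($Wi below); sha256msg1/sha256msg2 assist in computing
# the next four schedule words, with the remaining W[t-7] addition done
# via palignr/paddd.
#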
533my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
534
535my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
536my @MSG=map("%xmm$_",(3..6));
537
538$code.=<<___;
539.type	sha256_block_data_order_shaext,\@function,3
540.align	64
541sha256_block_data_order_shaext:
542_shaext_shortcut:
543___
544$code.=<<___ if ($win64);
545	lea	`-8-5*16`(%rsp),%rsp
546	movaps	%xmm6,-8-5*16(%rax)
547	movaps	%xmm7,-8-4*16(%rax)
548	movaps	%xmm8,-8-3*16(%rax)
549	movaps	%xmm9,-8-2*16(%rax)
550	movaps	%xmm10,-8-1*16(%rax)
551.Lprologue_shaext:
552___
553$code.=<<___;
554	lea		K256+0x80(%rip),$Tbl
555	movdqu		($ctx),$ABEF		# DCBA
556	movdqu		16($ctx),$CDGH		# HGFE
557	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
558
559	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
560	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
561	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
562	movdqa		$TMP,$BSWAP		# offload
563	palignr		\$8,$CDGH,$ABEF		# ABEF
564	punpcklqdq	$Wi,$CDGH		# CDGH
565	jmp		.Loop_shaext
566
567.align	16
568.Loop_shaext:
569	movdqu		($inp),@MSG[0]
570	movdqu		0x10($inp),@MSG[1]
571	movdqu		0x20($inp),@MSG[2]
572	pshufb		$TMP,@MSG[0]
573	movdqu		0x30($inp),@MSG[3]
574
575	movdqa		0*32-0x80($Tbl),$Wi
576	paddd		@MSG[0],$Wi
577	pshufb		$TMP,@MSG[1]
578	movdqa		$CDGH,$CDGH_SAVE	# offload
579	sha256rnds2	$ABEF,$CDGH		# 0-3
580	pshufd		\$0x0e,$Wi,$Wi
581	nop
582	movdqa		$ABEF,$ABEF_SAVE	# offload
583	sha256rnds2	$CDGH,$ABEF
584
585	movdqa		1*32-0x80($Tbl),$Wi
586	paddd		@MSG[1],$Wi
587	pshufb		$TMP,@MSG[2]
588	sha256rnds2	$ABEF,$CDGH		# 4-7
589	pshufd		\$0x0e,$Wi,$Wi
590	lea		0x40($inp),$inp
591	sha256msg1	@MSG[1],@MSG[0]
592	sha256rnds2	$CDGH,$ABEF
593
594	movdqa		2*32-0x80($Tbl),$Wi
595	paddd		@MSG[2],$Wi
596	pshufb		$TMP,@MSG[3]
597	sha256rnds2	$ABEF,$CDGH		# 8-11
598	pshufd		\$0x0e,$Wi,$Wi
599	movdqa		@MSG[3],$TMP
600	palignr		\$4,@MSG[2],$TMP
601	nop
602	paddd		$TMP,@MSG[0]
603	sha256msg1	@MSG[2],@MSG[1]
604	sha256rnds2	$CDGH,$ABEF
605
606	movdqa		3*32-0x80($Tbl),$Wi
607	paddd		@MSG[3],$Wi
608	sha256msg2	@MSG[3],@MSG[0]
609	sha256rnds2	$ABEF,$CDGH		# 12-15
610	pshufd		\$0x0e,$Wi,$Wi
611	movdqa		@MSG[0],$TMP
612	palignr		\$4,@MSG[3],$TMP
613	nop
614	paddd		$TMP,@MSG[1]
615	sha256msg1	@MSG[3],@MSG[2]
616	sha256rnds2	$CDGH,$ABEF
617___
618for($i=4;$i<16-3;$i++) {
619$code.=<<___;
620	movdqa		$i*32-0x80($Tbl),$Wi
621	paddd		@MSG[0],$Wi
622	sha256msg2	@MSG[0],@MSG[1]
623	sha256rnds2	$ABEF,$CDGH		# 16-19...
624	pshufd		\$0x0e,$Wi,$Wi
625	movdqa		@MSG[1],$TMP
626	palignr		\$4,@MSG[0],$TMP
627	nop
628	paddd		$TMP,@MSG[2]
629	sha256msg1	@MSG[0],@MSG[3]
630	sha256rnds2	$CDGH,$ABEF
631___
632	push(@MSG,shift(@MSG));
633}
634$code.=<<___;
635	movdqa		13*32-0x80($Tbl),$Wi
636	paddd		@MSG[0],$Wi
637	sha256msg2	@MSG[0],@MSG[1]
638	sha256rnds2	$ABEF,$CDGH		# 52-55
639	pshufd		\$0x0e,$Wi,$Wi
640	movdqa		@MSG[1],$TMP
641	palignr		\$4,@MSG[0],$TMP
642	sha256rnds2	$CDGH,$ABEF
643	paddd		$TMP,@MSG[2]
644
645	movdqa		14*32-0x80($Tbl),$Wi
646	paddd		@MSG[1],$Wi
647	sha256rnds2	$ABEF,$CDGH		# 56-59
648	pshufd		\$0x0e,$Wi,$Wi
649	sha256msg2	@MSG[1],@MSG[2]
650	movdqa		$BSWAP,$TMP
651	sha256rnds2	$CDGH,$ABEF
652
653	movdqa		15*32-0x80($Tbl),$Wi
654	paddd		@MSG[2],$Wi
655	nop
656	sha256rnds2	$ABEF,$CDGH		# 60-63
657	pshufd		\$0x0e,$Wi,$Wi
658	dec		$num
659	nop
660	sha256rnds2	$CDGH,$ABEF
661
662	paddd		$CDGH_SAVE,$CDGH
663	paddd		$ABEF_SAVE,$ABEF
664	jnz		.Loop_shaext
665
666	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
667	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
668	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
669	punpckhqdq	$CDGH,$ABEF		# DCBA
670	palignr		\$8,$TMP,$CDGH		# HGFE
671
672	movdqu	$ABEF,($ctx)
673	movdqu	$CDGH,16($ctx)
674___
675$code.=<<___ if ($win64);
676	movaps	-8-5*16(%rax),%xmm6
677	movaps	-8-4*16(%rax),%xmm7
678	movaps	-8-3*16(%rax),%xmm8
679	movaps	-8-2*16(%rax),%xmm9
680	movaps	-8-1*16(%rax),%xmm10
681	mov	%rax,%rsp
682.Lepilogue_shaext:
683___
684$code.=<<___;
685	ret
686.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
687___
688}}}
689{{{
690
691my $a4=$T1;
692my ($a,$b,$c,$d,$e,$f,$g,$h);
693
694sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
695{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
696  my $arg = pop;
697    $arg = "\$$arg" if ($arg*1 eq $arg);
698    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
699}
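# An illustration of the thunk above: a call such as &ror($a0,14)
# resolves through AUTOLOAD with $opcode="ror"; the numeric argument
# gets a '$' prefix and the remaining arguments are reversed, so the
# string "\tror\t\$14,<whatever register $a0 holds>\n" is appended to
# $code - i.e. AT&T order with the destination last.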
700
701sub body_00_15 () {
702	(
703	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
704
705	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
706	'&mov	($a,$a1)',
707	'&mov	($a4,$f)',
708
709	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
710	'&xor	($a0,$e)',
711	'&xor	($a4,$g)',			# f^g
712
713	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
714	'&xor	($a1,$a)',
715	'&and	($a4,$e)',			# (f^g)&e
716
717	'&xor	($a0,$e)',
718	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
719	'&mov	($a2,$a)',
720
721	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
722	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
723	'&xor	($a2,$b)',			# a^b, b^c in next round
724
725	'&add	($h,$a4)',			# h+=Ch(e,f,g)
726	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
727	'&and	($a3,$a2)',			# (b^c)&(a^b)
728
729	'&xor	($a1,$a)',
730	'&add	($h,$a0)',			# h+=Sigma1(e)
731	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
732
733	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
734	'&add	($d,$h)',			# d+=h
735	'&add	($h,$a3)',			# h+=Maj(a,b,c)
736
737	'&mov	($a0,$d)',
738	'&add	($a1,$h);'.			# h+=Sigma0(a)
739	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
740	);
741}
742
743######################################################################
744# SSSE3 code path
745#
746if ($SZ==4) {	# SHA256 only
747my @X = map("%xmm$_",(0..3));
748my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
749
750$code.=<<___;
751.type	${func}_ssse3,\@function,3
752.align	64
753${func}_ssse3:
754.Lssse3_shortcut:
755	push	%rbx
756	push	%rbp
757	push	%r12
758	push	%r13
759	push	%r14
760	push	%r15
761	mov	%rsp,%r11		# copy %rsp
762	shl	\$4,%rdx		# num*16
763	sub	\$`$framesz+$win64*16*4`,%rsp
764	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
765	and	\$-64,%rsp		# align stack frame
766	mov	$ctx,$_ctx		# save ctx, 1st arg
767	mov	$inp,$_inp		# save inp, 2nd arg
768	mov	%rdx,$_end		# save end pointer, "3rd" arg
769	mov	%r11,$_rsp		# save copy of %rsp
770___
771$code.=<<___ if ($win64);
772	movaps	%xmm6,16*$SZ+32(%rsp)
773	movaps	%xmm7,16*$SZ+48(%rsp)
774	movaps	%xmm8,16*$SZ+64(%rsp)
775	movaps	%xmm9,16*$SZ+80(%rsp)
776___
777$code.=<<___;
778.Lprologue_ssse3:
779
780	mov	$SZ*0($ctx),$A
781	mov	$SZ*1($ctx),$B
782	mov	$SZ*2($ctx),$C
783	mov	$SZ*3($ctx),$D
784	mov	$SZ*4($ctx),$E
785	mov	$SZ*5($ctx),$F
786	mov	$SZ*6($ctx),$G
787	mov	$SZ*7($ctx),$H
788___
789
790$code.=<<___;
791	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
792	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
793	jmp	.Lloop_ssse3
794.align	16
795.Lloop_ssse3:
796	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
797	movdqu	0x00($inp),@X[0]
798	movdqu	0x10($inp),@X[1]
799	movdqu	0x20($inp),@X[2]
800	pshufb	$t3,@X[0]
801	movdqu	0x30($inp),@X[3]
802	lea	$TABLE(%rip),$Tbl
803	pshufb	$t3,@X[1]
804	movdqa	0x00($Tbl),$t0
805	movdqa	0x20($Tbl),$t1
806	pshufb	$t3,@X[2]
807	paddd	@X[0],$t0
808	movdqa	0x40($Tbl),$t2
809	pshufb	$t3,@X[3]
810	movdqa	0x60($Tbl),$t3
811	paddd	@X[1],$t1
812	paddd	@X[2],$t2
813	paddd	@X[3],$t3
814	movdqa	$t0,0x00(%rsp)
815	mov	$A,$a1
816	movdqa	$t1,0x10(%rsp)
817	mov	$B,$a3
818	movdqa	$t2,0x20(%rsp)
819	xor	$C,$a3			# magic
820	movdqa	$t3,0x30(%rsp)
821	mov	$E,$a0
822	jmp	.Lssse3_00_47
823
824.align	16
825.Lssse3_00_47:
826	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
827___
828sub Xupdate_256_SSSE3 () {
829	(
830	'&movdqa	($t0,@X[1]);',
831	'&movdqa	($t3,@X[3])',
832	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
833	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
834	'&movdqa	($t1,$t0)',
835	'&movdqa	($t2,$t0);',
836	'&psrld		($t0,$sigma0[2])',
837	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
838	'&psrld		($t2,$sigma0[0])',
839	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
840	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
841	'&pxor		($t0,$t2)',
842	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
843	'&pxor		($t0,$t1)',
844	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
845	'&pxor		($t0,$t2);',
846	 '&movdqa	($t2,$t3)',
847	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
848	 '&psrld	($t3,$sigma1[2])',
849	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
850	 '&psrlq	($t2,$sigma1[0])',
851	 '&pxor		($t3,$t2);',
852	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
853	 '&pxor		($t3,$t2)',
854	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
855	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
856	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
857	 '&movdqa	($t2,$t3);',
858	 '&psrld	($t3,$sigma1[2])',
859	 '&psrlq	($t2,$sigma1[0])',
860	 '&pxor		($t3,$t2);',
861	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
862	 '&pxor		($t3,$t2);',
863	'&movdqa	($t2,16*2*$j."($Tbl)")',
864	 '&pshufb	($t3,$t5)',
865	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
866	);
867}
868
869sub SSSE3_256_00_47 () {
870my $j = shift;
871my $body = shift;
872my @X = @_;
873my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
874
875    if (0) {
876	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
877	    eval;
878	    eval(shift(@insns));
879	    eval(shift(@insns));
880	    eval(shift(@insns));
881	}
882    } else {			# squeeze extra 4% on Westmere and 19% on Atom
883	  eval(shift(@insns));	#@
884	&movdqa		($t0,@X[1]);
885	  eval(shift(@insns));
886	  eval(shift(@insns));
887	&movdqa		($t3,@X[3]);
888	  eval(shift(@insns));	#@
889	  eval(shift(@insns));
890	  eval(shift(@insns));
891	  eval(shift(@insns));	#@
892	  eval(shift(@insns));
893	&palignr	($t0,@X[0],$SZ);	# X[1..4]
894	  eval(shift(@insns));
895	  eval(shift(@insns));
896	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
897	  eval(shift(@insns));
898	  eval(shift(@insns));
899	  eval(shift(@insns));
900	  eval(shift(@insns));	#@
901	&movdqa		($t1,$t0);
902	  eval(shift(@insns));
903	  eval(shift(@insns));
904	&movdqa		($t2,$t0);
905	  eval(shift(@insns));	#@
906	  eval(shift(@insns));
907	&psrld		($t0,$sigma0[2]);
908	  eval(shift(@insns));
909	  eval(shift(@insns));
910	  eval(shift(@insns));
911	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
912	  eval(shift(@insns));	#@
913	  eval(shift(@insns));
914	&psrld		($t2,$sigma0[0]);
915	  eval(shift(@insns));
916	  eval(shift(@insns));
917	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
918	  eval(shift(@insns));
919	  eval(shift(@insns));	#@
920	&pslld		($t1,8*$SZ-$sigma0[1]);
921	  eval(shift(@insns));
922	  eval(shift(@insns));
923	&pxor		($t0,$t2);
924	  eval(shift(@insns));	#@
925	  eval(shift(@insns));
926	  eval(shift(@insns));
927	  eval(shift(@insns));	#@
928	&psrld		($t2,$sigma0[1]-$sigma0[0]);
929	  eval(shift(@insns));
930	&pxor		($t0,$t1);
931	  eval(shift(@insns));
932	  eval(shift(@insns));
933	&pslld		($t1,$sigma0[1]-$sigma0[0]);
934	  eval(shift(@insns));
935	  eval(shift(@insns));
936	&pxor		($t0,$t2);
937	  eval(shift(@insns));
938	  eval(shift(@insns));	#@
939	 &movdqa	($t2,$t3);
940	  eval(shift(@insns));
941	  eval(shift(@insns));
942	&pxor		($t0,$t1);		# sigma0(X[1..4])
943	  eval(shift(@insns));	#@
944	  eval(shift(@insns));
945	  eval(shift(@insns));
946	 &psrld		($t3,$sigma1[2]);
947	  eval(shift(@insns));
948	  eval(shift(@insns));
949	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
950	  eval(shift(@insns));	#@
951	  eval(shift(@insns));
952	 &psrlq		($t2,$sigma1[0]);
953	  eval(shift(@insns));
954	  eval(shift(@insns));
955	  eval(shift(@insns));
956	 &pxor		($t3,$t2);
957	  eval(shift(@insns));	#@
958	  eval(shift(@insns));
959	  eval(shift(@insns));
960	  eval(shift(@insns));	#@
961	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
962	  eval(shift(@insns));
963	  eval(shift(@insns));
964	 &pxor		($t3,$t2);
965	  eval(shift(@insns));	#@
966	  eval(shift(@insns));
967	  eval(shift(@insns));
968	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
969	 &pshufd	($t3,$t3,0b10000000);
970	  eval(shift(@insns));
971	  eval(shift(@insns));
972	  eval(shift(@insns));
973	 &psrldq	($t3,8);
974	  eval(shift(@insns));
975	  eval(shift(@insns));	#@
976	  eval(shift(@insns));
977	  eval(shift(@insns));
978	  eval(shift(@insns));	#@
979	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
980	  eval(shift(@insns));
981	  eval(shift(@insns));
982	  eval(shift(@insns));
983	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
984	  eval(shift(@insns));
985	  eval(shift(@insns));	#@
986	  eval(shift(@insns));
987	 &movdqa	($t2,$t3);
988	  eval(shift(@insns));
989	  eval(shift(@insns));
990	 &psrld		($t3,$sigma1[2]);
991	  eval(shift(@insns));
992	  eval(shift(@insns));	#@
993	 &psrlq		($t2,$sigma1[0]);
994	  eval(shift(@insns));
995	  eval(shift(@insns));
996	 &pxor		($t3,$t2);
997	  eval(shift(@insns));	#@
998	  eval(shift(@insns));
999	  eval(shift(@insns));
1000	  eval(shift(@insns));	#@
1001	  eval(shift(@insns));
1002	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1003	  eval(shift(@insns));
1004	  eval(shift(@insns));
1005	  eval(shift(@insns));
1006	 &pxor		($t3,$t2);
1007	  eval(shift(@insns));
1008	  eval(shift(@insns));
1009	  eval(shift(@insns));	#@
1010	 #&pshufb	($t3,$t5);
1011	 &pshufd	($t3,$t3,0b00001000);
1012	  eval(shift(@insns));
1013	  eval(shift(@insns));
1014	&movdqa		($t2,16*2*$j."($Tbl)");
1015	  eval(shift(@insns));	#@
1016	  eval(shift(@insns));
1017	 &pslldq	($t3,8);
1018	  eval(shift(@insns));
1019	  eval(shift(@insns));
1020	  eval(shift(@insns));
1021	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
1022	  eval(shift(@insns));	#@
1023	  eval(shift(@insns));
1024	  eval(shift(@insns));
1025    }
1026	&paddd		($t2,@X[0]);
1027	  foreach (@insns) { eval; }		# remaining instructions
1028	&movdqa		(16*$j."(%rsp)",$t2);
1029}
1030
1031    for ($i=0,$j=0; $j<4; $j++) {
1032	&SSSE3_256_00_47($j,\&body_00_15,@X);
1033	push(@X,shift(@X));			# rotate(@X)
1034    }
1035	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1036	&jne	(".Lssse3_00_47");
1037
1038    for ($i=0; $i<16; ) {
1039	foreach(body_00_15()) { eval; }
1040    }
1041$code.=<<___;
1042	mov	$_ctx,$ctx
1043	mov	$a1,$A
1044
1045	add	$SZ*0($ctx),$A
1046	lea	16*$SZ($inp),$inp
1047	add	$SZ*1($ctx),$B
1048	add	$SZ*2($ctx),$C
1049	add	$SZ*3($ctx),$D
1050	add	$SZ*4($ctx),$E
1051	add	$SZ*5($ctx),$F
1052	add	$SZ*6($ctx),$G
1053	add	$SZ*7($ctx),$H
1054
1055	cmp	$_end,$inp
1056
1057	mov	$A,$SZ*0($ctx)
1058	mov	$B,$SZ*1($ctx)
1059	mov	$C,$SZ*2($ctx)
1060	mov	$D,$SZ*3($ctx)
1061	mov	$E,$SZ*4($ctx)
1062	mov	$F,$SZ*5($ctx)
1063	mov	$G,$SZ*6($ctx)
1064	mov	$H,$SZ*7($ctx)
1065	jb	.Lloop_ssse3
1066
1067	mov	$_rsp,%rsi
1068___
1069$code.=<<___ if ($win64);
1070	movaps	16*$SZ+32(%rsp),%xmm6
1071	movaps	16*$SZ+48(%rsp),%xmm7
1072	movaps	16*$SZ+64(%rsp),%xmm8
1073	movaps	16*$SZ+80(%rsp),%xmm9
1074___
1075$code.=<<___;
1076	mov	(%rsi),%r15
1077	mov	8(%rsi),%r14
1078	mov	16(%rsi),%r13
1079	mov	24(%rsi),%r12
1080	mov	32(%rsi),%rbp
1081	mov	40(%rsi),%rbx
1082	lea	48(%rsi),%rsp
1083.Lepilogue_ssse3:
1084	ret
1085.size	${func}_ssse3,.-${func}_ssse3
1086___
1087}
1088
1089if ($avx) {{
1090######################################################################
1091# XOP code path
1092#
1093if ($SZ==8) {	# SHA512 only
1094$code.=<<___;
1095.type	${func}_xop,\@function,3
1096.align	64
1097${func}_xop:
1098.Lxop_shortcut:
1099	push	%rbx
1100	push	%rbp
1101	push	%r12
1102	push	%r13
1103	push	%r14
1104	push	%r15
1105	mov	%rsp,%r11		# copy %rsp
1106	shl	\$4,%rdx		# num*16
1107	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1108	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1109	and	\$-64,%rsp		# align stack frame
1110	mov	$ctx,$_ctx		# save ctx, 1st arg
1111	mov	$inp,$_inp		# save inp, 2nd arg
1112	mov	%rdx,$_end		# save end pointer, "3rd" arg
1113	mov	%r11,$_rsp		# save copy of %rsp
1114___
1115$code.=<<___ if ($win64);
1116	movaps	%xmm6,16*$SZ+32(%rsp)
1117	movaps	%xmm7,16*$SZ+48(%rsp)
1118	movaps	%xmm8,16*$SZ+64(%rsp)
1119	movaps	%xmm9,16*$SZ+80(%rsp)
1120___
1121$code.=<<___ if ($win64 && $SZ>4);
1122	movaps	%xmm10,16*$SZ+96(%rsp)
1123	movaps	%xmm11,16*$SZ+112(%rsp)
1124___
1125$code.=<<___;
1126.Lprologue_xop:
1127
1128	vzeroupper
1129	mov	$SZ*0($ctx),$A
1130	mov	$SZ*1($ctx),$B
1131	mov	$SZ*2($ctx),$C
1132	mov	$SZ*3($ctx),$D
1133	mov	$SZ*4($ctx),$E
1134	mov	$SZ*5($ctx),$F
1135	mov	$SZ*6($ctx),$G
1136	mov	$SZ*7($ctx),$H
1137	jmp	.Lloop_xop
1138___
1139					if ($SZ==4) {	# SHA256
1140    my @X = map("%xmm$_",(0..3));
1141    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1142
1143$code.=<<___;
1144.align	16
1145.Lloop_xop:
1146	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1147	vmovdqu	0x00($inp),@X[0]
1148	vmovdqu	0x10($inp),@X[1]
1149	vmovdqu	0x20($inp),@X[2]
1150	vmovdqu	0x30($inp),@X[3]
1151	vpshufb	$t3,@X[0],@X[0]
1152	lea	$TABLE(%rip),$Tbl
1153	vpshufb	$t3,@X[1],@X[1]
1154	vpshufb	$t3,@X[2],@X[2]
1155	vpaddd	0x00($Tbl),@X[0],$t0
1156	vpshufb	$t3,@X[3],@X[3]
1157	vpaddd	0x20($Tbl),@X[1],$t1
1158	vpaddd	0x40($Tbl),@X[2],$t2
1159	vpaddd	0x60($Tbl),@X[3],$t3
1160	vmovdqa	$t0,0x00(%rsp)
1161	mov	$A,$a1
1162	vmovdqa	$t1,0x10(%rsp)
1163	mov	$B,$a3
1164	vmovdqa	$t2,0x20(%rsp)
1165	xor	$C,$a3			# magic
1166	vmovdqa	$t3,0x30(%rsp)
1167	mov	$E,$a0
1168	jmp	.Lxop_00_47
1169
1170.align	16
1171.Lxop_00_47:
1172	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1173___
1174sub XOP_256_00_47 () {
1175my $j = shift;
1176my $body = shift;
1177my @X = @_;
1178my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1179
1180	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
1181	  eval(shift(@insns));
1182	  eval(shift(@insns));
1183	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
1184	  eval(shift(@insns));
1185	  eval(shift(@insns));
1186	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
1187	  eval(shift(@insns));
1188	  eval(shift(@insns));
1189	&vpsrld		($t0,$t0,$sigma0[2]);
1190	  eval(shift(@insns));
1191	  eval(shift(@insns));
1192	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
1193	  eval(shift(@insns));
1194	  eval(shift(@insns));
1195	  eval(shift(@insns));
1196	  eval(shift(@insns));
1197	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
1198	  eval(shift(@insns));
1199	  eval(shift(@insns));
1200	&vpxor		($t0,$t0,$t1);
1201	  eval(shift(@insns));
1202	  eval(shift(@insns));
1203	  eval(shift(@insns));
1204	  eval(shift(@insns));
1205	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
1206	  eval(shift(@insns));
1207	  eval(shift(@insns));
1208	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
1209	  eval(shift(@insns));
1210	  eval(shift(@insns));
1211	 &vpsrld	($t2,@X[3],$sigma1[2]);
1212	  eval(shift(@insns));
1213	  eval(shift(@insns));
1214	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
1215	  eval(shift(@insns));
1216	  eval(shift(@insns));
1217	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1218	  eval(shift(@insns));
1219	  eval(shift(@insns));
1220	 &vpxor		($t3,$t3,$t2);
1221	  eval(shift(@insns));
1222	  eval(shift(@insns));
1223	  eval(shift(@insns));
1224	  eval(shift(@insns));
1225	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1226	  eval(shift(@insns));
1227	  eval(shift(@insns));
1228	  eval(shift(@insns));
1229	  eval(shift(@insns));
1230	&vpsrldq	($t3,$t3,8);
1231	  eval(shift(@insns));
1232	  eval(shift(@insns));
1233	  eval(shift(@insns));
1234	  eval(shift(@insns));
1235	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1236	  eval(shift(@insns));
1237	  eval(shift(@insns));
1238	  eval(shift(@insns));
1239	  eval(shift(@insns));
1240	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
1241	  eval(shift(@insns));
1242	  eval(shift(@insns));
1243	 &vpsrld	($t2,@X[0],$sigma1[2]);
1244	  eval(shift(@insns));
1245	  eval(shift(@insns));
1246	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1247	  eval(shift(@insns));
1248	  eval(shift(@insns));
1249	 &vpxor		($t3,$t3,$t2);
1250	  eval(shift(@insns));
1251	  eval(shift(@insns));
1252	  eval(shift(@insns));
1253	  eval(shift(@insns));
1254	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
1255	  eval(shift(@insns));
1256	  eval(shift(@insns));
1257	  eval(shift(@insns));
1258	  eval(shift(@insns));
1259	&vpslldq	($t3,$t3,8);		# 22 instructions
1260	  eval(shift(@insns));
1261	  eval(shift(@insns));
1262	  eval(shift(@insns));
1263	  eval(shift(@insns));
1264	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
1265	  eval(shift(@insns));
1266	  eval(shift(@insns));
1267	  eval(shift(@insns));
1268	  eval(shift(@insns));
1269	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1270	  foreach (@insns) { eval; }		# remaining instructions
1271	&vmovdqa	(16*$j."(%rsp)",$t2);
1272}
1273
1274    for ($i=0,$j=0; $j<4; $j++) {
1275	&XOP_256_00_47($j,\&body_00_15,@X);
1276	push(@X,shift(@X));			# rotate(@X)
1277    }
1278	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1279	&jne	(".Lxop_00_47");
1280
1281    for ($i=0; $i<16; ) {
1282	foreach(body_00_15()) { eval; }
1283    }
1284
1285					} else {	# SHA512
1286    my @X = map("%xmm$_",(0..7));
1287    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1288
1289$code.=<<___;
1290.align	16
1291.Lloop_xop:
1292	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1293	vmovdqu	0x00($inp),@X[0]
1294	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1295	vmovdqu	0x10($inp),@X[1]
1296	vmovdqu	0x20($inp),@X[2]
1297	vpshufb	$t3,@X[0],@X[0]
1298	vmovdqu	0x30($inp),@X[3]
1299	vpshufb	$t3,@X[1],@X[1]
1300	vmovdqu	0x40($inp),@X[4]
1301	vpshufb	$t3,@X[2],@X[2]
1302	vmovdqu	0x50($inp),@X[5]
1303	vpshufb	$t3,@X[3],@X[3]
1304	vmovdqu	0x60($inp),@X[6]
1305	vpshufb	$t3,@X[4],@X[4]
1306	vmovdqu	0x70($inp),@X[7]
1307	vpshufb	$t3,@X[5],@X[5]
1308	vpaddq	-0x80($Tbl),@X[0],$t0
1309	vpshufb	$t3,@X[6],@X[6]
1310	vpaddq	-0x60($Tbl),@X[1],$t1
1311	vpshufb	$t3,@X[7],@X[7]
1312	vpaddq	-0x40($Tbl),@X[2],$t2
1313	vpaddq	-0x20($Tbl),@X[3],$t3
1314	vmovdqa	$t0,0x00(%rsp)
1315	vpaddq	0x00($Tbl),@X[4],$t0
1316	vmovdqa	$t1,0x10(%rsp)
1317	vpaddq	0x20($Tbl),@X[5],$t1
1318	vmovdqa	$t2,0x20(%rsp)
1319	vpaddq	0x40($Tbl),@X[6],$t2
1320	vmovdqa	$t3,0x30(%rsp)
1321	vpaddq	0x60($Tbl),@X[7],$t3
1322	vmovdqa	$t0,0x40(%rsp)
1323	mov	$A,$a1
1324	vmovdqa	$t1,0x50(%rsp)
1325	mov	$B,$a3
1326	vmovdqa	$t2,0x60(%rsp)
1327	xor	$C,$a3			# magic
1328	vmovdqa	$t3,0x70(%rsp)
1329	mov	$E,$a0
1330	jmp	.Lxop_00_47
1331
1332.align	16
1333.Lxop_00_47:
1334	add	\$`16*2*$SZ`,$Tbl
1335___
1336sub XOP_512_00_47 () {
1337my $j = shift;
1338my $body = shift;
1339my @X = @_;
1340my @insns = (&$body,&$body);			# 52 instructions
1341
1342	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
1343	  eval(shift(@insns));
1344	  eval(shift(@insns));
1345	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
1346	  eval(shift(@insns));
1347	  eval(shift(@insns));
1348	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
1349	  eval(shift(@insns));
1350	  eval(shift(@insns));
1351	&vpsrlq		($t0,$t0,$sigma0[2]);
1352	  eval(shift(@insns));
1353	  eval(shift(@insns));
1354	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
1355	  eval(shift(@insns));
1356	  eval(shift(@insns));
1357	  eval(shift(@insns));
1358	  eval(shift(@insns));
1359	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
1360	  eval(shift(@insns));
1361	  eval(shift(@insns));
1362	&vpxor		($t0,$t0,$t1);
1363	  eval(shift(@insns));
1364	  eval(shift(@insns));
1365	  eval(shift(@insns));
1366	  eval(shift(@insns));
1367	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
1368	  eval(shift(@insns));
1369	  eval(shift(@insns));
1370	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
1371	  eval(shift(@insns));
1372	  eval(shift(@insns));
1373	 &vpsrlq	($t2,@X[7],$sigma1[2]);
1374	  eval(shift(@insns));
1375	  eval(shift(@insns));
1376	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
1377	  eval(shift(@insns));
1378	  eval(shift(@insns));
1379	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
1380	  eval(shift(@insns));
1381	  eval(shift(@insns));
1382	 &vpxor		($t3,$t3,$t2);
1383	  eval(shift(@insns));
1384	  eval(shift(@insns));
1385	  eval(shift(@insns));
1386	  eval(shift(@insns));
1387	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1388	  eval(shift(@insns));
1389	  eval(shift(@insns));
1390	  eval(shift(@insns));
1391	  eval(shift(@insns));
1392	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1393	  eval(shift(@insns));
1394	  eval(shift(@insns));
1395	  eval(shift(@insns));
1396	  eval(shift(@insns));
1397	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1398	  foreach (@insns) { eval; }		# remaining instructions
1399	&vmovdqa	(16*$j."(%rsp)",$t2);
1400}
1401
1402    for ($i=0,$j=0; $j<8; $j++) {
1403	&XOP_512_00_47($j,\&body_00_15,@X);
1404	push(@X,shift(@X));			# rotate(@X)
1405    }
1406	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1407	&jne	(".Lxop_00_47");
1408
1409    for ($i=0; $i<16; ) {
1410	foreach(body_00_15()) { eval; }
1411    }
1412}
1413$code.=<<___;
1414	mov	$_ctx,$ctx
1415	mov	$a1,$A
1416
1417	add	$SZ*0($ctx),$A
1418	lea	16*$SZ($inp),$inp
1419	add	$SZ*1($ctx),$B
1420	add	$SZ*2($ctx),$C
1421	add	$SZ*3($ctx),$D
1422	add	$SZ*4($ctx),$E
1423	add	$SZ*5($ctx),$F
1424	add	$SZ*6($ctx),$G
1425	add	$SZ*7($ctx),$H
1426
1427	cmp	$_end,$inp
1428
1429	mov	$A,$SZ*0($ctx)
1430	mov	$B,$SZ*1($ctx)
1431	mov	$C,$SZ*2($ctx)
1432	mov	$D,$SZ*3($ctx)
1433	mov	$E,$SZ*4($ctx)
1434	mov	$F,$SZ*5($ctx)
1435	mov	$G,$SZ*6($ctx)
1436	mov	$H,$SZ*7($ctx)
1437	jb	.Lloop_xop
1438
1439	mov	$_rsp,%rsi
1440	vzeroupper
1441___
1442$code.=<<___ if ($win64);
1443	movaps	16*$SZ+32(%rsp),%xmm6
1444	movaps	16*$SZ+48(%rsp),%xmm7
1445	movaps	16*$SZ+64(%rsp),%xmm8
1446	movaps	16*$SZ+80(%rsp),%xmm9
1447___
1448$code.=<<___ if ($win64 && $SZ>4);
1449	movaps	16*$SZ+96(%rsp),%xmm10
1450	movaps	16*$SZ+112(%rsp),%xmm11
1451___
1452$code.=<<___;
1453	mov	(%rsi),%r15
1454	mov	8(%rsi),%r14
1455	mov	16(%rsi),%r13
1456	mov	24(%rsi),%r12
1457	mov	32(%rsi),%rbp
1458	mov	40(%rsi),%rbx
1459	lea	48(%rsi),%rsp
1460.Lepilogue_xop:
1461	ret
1462.size	${func}_xop,.-${func}_xop
1463___
1464}
1465######################################################################
1466# AVX+shrd code path
1467#
1468local *ror = sub { &shrd(@_[0],@_) };
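# With the override above a perlasm "&ror($reg,$n)" is emitted as
# "shrd $n,$reg,$reg", i.e. a double-precision shift of a register with
# itself, which is an equivalent rotate; per the (**) footnote at the
# top this trade accounts for a fair share of the AVX-path gain on
# Sandy Bridge.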
1469
1470$code.=<<___;
1471.type	${func}_avx,\@function,3
1472.align	64
1473${func}_avx:
1474.Lavx_shortcut:
1475	push	%rbx
1476	push	%rbp
1477	push	%r12
1478	push	%r13
1479	push	%r14
1480	push	%r15
1481	mov	%rsp,%r11		# copy %rsp
1482	shl	\$4,%rdx		# num*16
1483	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1484	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1485	and	\$-64,%rsp		# align stack frame
1486	mov	$ctx,$_ctx		# save ctx, 1st arg
1487	mov	$inp,$_inp		# save inp, 2nd arg
1488	mov	%rdx,$_end		# save end pointer, "3rd" arg
1489	mov	%r11,$_rsp		# save copy of %rsp
1490___
1491$code.=<<___ if ($win64);
1492	movaps	%xmm6,16*$SZ+32(%rsp)
1493	movaps	%xmm7,16*$SZ+48(%rsp)
1494	movaps	%xmm8,16*$SZ+64(%rsp)
1495	movaps	%xmm9,16*$SZ+80(%rsp)
1496___
1497$code.=<<___ if ($win64 && $SZ>4);
1498	movaps	%xmm10,16*$SZ+96(%rsp)
1499	movaps	%xmm11,16*$SZ+112(%rsp)
1500___
1501$code.=<<___;
1502.Lprologue_avx:
1503
1504	vzeroupper
1505	mov	$SZ*0($ctx),$A
1506	mov	$SZ*1($ctx),$B
1507	mov	$SZ*2($ctx),$C
1508	mov	$SZ*3($ctx),$D
1509	mov	$SZ*4($ctx),$E
1510	mov	$SZ*5($ctx),$F
1511	mov	$SZ*6($ctx),$G
1512	mov	$SZ*7($ctx),$H
1513___
1514					if ($SZ==4) {	# SHA256
1515    my @X = map("%xmm$_",(0..3));
1516    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1517
1518$code.=<<___;
1519	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1520	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1521	jmp	.Lloop_avx
1522.align	16
1523.Lloop_avx:
1524	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1525	vmovdqu	0x00($inp),@X[0]
1526	vmovdqu	0x10($inp),@X[1]
1527	vmovdqu	0x20($inp),@X[2]
1528	vmovdqu	0x30($inp),@X[3]
1529	vpshufb	$t3,@X[0],@X[0]
1530	lea	$TABLE(%rip),$Tbl
1531	vpshufb	$t3,@X[1],@X[1]
1532	vpshufb	$t3,@X[2],@X[2]
1533	vpaddd	0x00($Tbl),@X[0],$t0
1534	vpshufb	$t3,@X[3],@X[3]
1535	vpaddd	0x20($Tbl),@X[1],$t1
1536	vpaddd	0x40($Tbl),@X[2],$t2
1537	vpaddd	0x60($Tbl),@X[3],$t3
1538	vmovdqa	$t0,0x00(%rsp)
1539	mov	$A,$a1
1540	vmovdqa	$t1,0x10(%rsp)
1541	mov	$B,$a3
1542	vmovdqa	$t2,0x20(%rsp)
1543	xor	$C,$a3			# magic
1544	vmovdqa	$t3,0x30(%rsp)
1545	mov	$E,$a0
1546	jmp	.Lavx_00_47
1547
1548.align	16
1549.Lavx_00_47:
1550	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1551___
1552sub Xupdate_256_AVX () {
1553	(
1554	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
1555	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
1556	'&vpsrld	($t2,$t0,$sigma0[0]);',
1557	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
1558	'&vpsrld	($t3,$t0,$sigma0[2])',
1559	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
1560	'&vpxor		($t0,$t3,$t2)',
1561	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1562	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1563	'&vpxor		($t0,$t0,$t1)',
1564	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1565	'&vpxor		($t0,$t0,$t2)',
1566	 '&vpsrld	($t2,$t3,$sigma1[2]);',
1567	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
1568	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
1569	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
1570	 '&vpxor	($t2,$t2,$t3);',
1571	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1572	 '&vpxor	($t2,$t2,$t3)',
1573	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
1574	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
1575	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1576	 '&vpsrld	($t2,$t3,$sigma1[2])',
1577	 '&vpsrlq	($t3,$t3,$sigma1[0])',
1578	 '&vpxor	($t2,$t2,$t3);',
1579	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1580	 '&vpxor	($t2,$t2,$t3)',
1581	 '&vpshufb	($t2,$t2,$t5)',
1582	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
1583	);
1584}
1585
1586sub AVX_256_00_47 () {
1587my $j = shift;
1588my $body = shift;
1589my @X = @_;
1590my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1591
1592	foreach (Xupdate_256_AVX()) {		# 29 instructions
1593	    eval;
1594	    eval(shift(@insns));
1595	    eval(shift(@insns));
1596	    eval(shift(@insns));
1597	}
1598	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1599	  foreach (@insns) { eval; }		# remaining instructions
1600	&vmovdqa	(16*$j."(%rsp)",$t2);
1601}
1602
1603    for ($i=0,$j=0; $j<4; $j++) {
1604	&AVX_256_00_47($j,\&body_00_15,@X);
1605	push(@X,shift(@X));			# rotate(@X)
1606    }
1607	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1608	&jne	(".Lavx_00_47");
1609
1610    for ($i=0; $i<16; ) {
1611	foreach(body_00_15()) { eval; }
1612    }
1613
1614					} else {	# SHA512
1615    my @X = map("%xmm$_",(0..7));
1616    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1617
1618$code.=<<___;
1619	jmp	.Lloop_avx
1620.align	16
1621.Lloop_avx:
1622	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1623	vmovdqu	0x00($inp),@X[0]
1624	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1625	vmovdqu	0x10($inp),@X[1]
1626	vmovdqu	0x20($inp),@X[2]
1627	vpshufb	$t3,@X[0],@X[0]
1628	vmovdqu	0x30($inp),@X[3]
1629	vpshufb	$t3,@X[1],@X[1]
1630	vmovdqu	0x40($inp),@X[4]
1631	vpshufb	$t3,@X[2],@X[2]
1632	vmovdqu	0x50($inp),@X[5]
1633	vpshufb	$t3,@X[3],@X[3]
1634	vmovdqu	0x60($inp),@X[6]
1635	vpshufb	$t3,@X[4],@X[4]
1636	vmovdqu	0x70($inp),@X[7]
1637	vpshufb	$t3,@X[5],@X[5]
1638	vpaddq	-0x80($Tbl),@X[0],$t0
1639	vpshufb	$t3,@X[6],@X[6]
1640	vpaddq	-0x60($Tbl),@X[1],$t1
1641	vpshufb	$t3,@X[7],@X[7]
1642	vpaddq	-0x40($Tbl),@X[2],$t2
1643	vpaddq	-0x20($Tbl),@X[3],$t3
1644	vmovdqa	$t0,0x00(%rsp)
1645	vpaddq	0x00($Tbl),@X[4],$t0
1646	vmovdqa	$t1,0x10(%rsp)
1647	vpaddq	0x20($Tbl),@X[5],$t1
1648	vmovdqa	$t2,0x20(%rsp)
1649	vpaddq	0x40($Tbl),@X[6],$t2
1650	vmovdqa	$t3,0x30(%rsp)
1651	vpaddq	0x60($Tbl),@X[7],$t3
1652	vmovdqa	$t0,0x40(%rsp)
1653	mov	$A,$a1
1654	vmovdqa	$t1,0x50(%rsp)
1655	mov	$B,$a3
1656	vmovdqa	$t2,0x60(%rsp)
1657	xor	$C,$a3			# magic
1658	vmovdqa	$t3,0x70(%rsp)
1659	mov	$E,$a0
1660	jmp	.Lavx_00_47
1661
1662.align	16
1663.Lavx_00_47:
1664	add	\$`16*2*$SZ`,$Tbl
1665___
1666sub Xupdate_512_AVX () {
1667	(
1668	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
1669	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
1670	'&vpsrlq	($t2,$t0,$sigma0[0])',
1671	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
1672	'&vpsrlq	($t3,$t0,$sigma0[2])',
1673	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
1674	 '&vpxor	($t0,$t3,$t2)',
1675	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1676	 '&vpxor	($t0,$t0,$t1)',
1677	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1678	 '&vpxor	($t0,$t0,$t2)',
1679	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
1680	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
1681	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
1682	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
1683	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
1684	 '&vpxor	($t3,$t3,$t2)',
1685	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
1686	 '&vpxor	($t3,$t3,$t1)',
1687	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
1688	 '&vpxor	($t3,$t3,$t2)',
1689	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
1690	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
1691	);
1692}
1693
1694sub AVX_512_00_47 () {
1695my $j = shift;
1696my $body = shift;
1697my @X = @_;
1698my @insns = (&$body,&$body);			# 52 instructions
1699
1700	foreach (Xupdate_512_AVX()) {		# 23 instructions
1701	    eval;
1702	    eval(shift(@insns));
1703	    eval(shift(@insns));
1704	}
1705	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1706	  foreach (@insns) { eval; }		# remaining instructions
1707	&vmovdqa	(16*$j."(%rsp)",$t2);
1708}
1709
1710    for ($i=0,$j=0; $j<8; $j++) {
1711	&AVX_512_00_47($j,\&body_00_15,@X);
1712	push(@X,shift(@X));			# rotate(@X)
1713    }
1714	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1715	&jne	(".Lavx_00_47");
1716
1717    for ($i=0; $i<16; ) {
1718	foreach(body_00_15()) { eval; }
1719    }
1720}
1721$code.=<<___;
1722	mov	$_ctx,$ctx
1723	mov	$a1,$A
1724
1725	add	$SZ*0($ctx),$A
1726	lea	16*$SZ($inp),$inp
1727	add	$SZ*1($ctx),$B
1728	add	$SZ*2($ctx),$C
1729	add	$SZ*3($ctx),$D
1730	add	$SZ*4($ctx),$E
1731	add	$SZ*5($ctx),$F
1732	add	$SZ*6($ctx),$G
1733	add	$SZ*7($ctx),$H
1734
1735	cmp	$_end,$inp
1736
1737	mov	$A,$SZ*0($ctx)
1738	mov	$B,$SZ*1($ctx)
1739	mov	$C,$SZ*2($ctx)
1740	mov	$D,$SZ*3($ctx)
1741	mov	$E,$SZ*4($ctx)
1742	mov	$F,$SZ*5($ctx)
1743	mov	$G,$SZ*6($ctx)
1744	mov	$H,$SZ*7($ctx)
1745	jb	.Lloop_avx
1746
1747	mov	$_rsp,%rsi
1748	vzeroupper
1749___
1750$code.=<<___ if ($win64);
1751	movaps	16*$SZ+32(%rsp),%xmm6
1752	movaps	16*$SZ+48(%rsp),%xmm7
1753	movaps	16*$SZ+64(%rsp),%xmm8
1754	movaps	16*$SZ+80(%rsp),%xmm9
1755___
1756$code.=<<___ if ($win64 && $SZ>4);
1757	movaps	16*$SZ+96(%rsp),%xmm10
1758	movaps	16*$SZ+112(%rsp),%xmm11
1759___
1760$code.=<<___;
1761	mov	(%rsi),%r15
1762	mov	8(%rsi),%r14
1763	mov	16(%rsi),%r13
1764	mov	24(%rsi),%r12
1765	mov	32(%rsi),%rbp
1766	mov	40(%rsi),%rbx
1767	lea	48(%rsi),%rsp
1768.Lepilogue_avx:
1769	ret
1770.size	${func}_avx,.-${func}_avx
1771___
1772
1773if ($avx>1) {{
1774######################################################################
1775# AVX2+BMI code path
1776#
1777my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
1778my $PUSH8=8*2*$SZ;
1779use integer;
1780
1781sub bodyx_00_15 () {
1782	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
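	# [The Ch rewrite relies on (e&f) and (~e&g) having no bits in
	#  common, so (e&f)^(~e&g) = (e&f)+(~e&g), which is why plain
	#  lea/add can accumulate it into h below.]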
1783	(
1784	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1785
1786	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
1787	'&and	($a4,$e)',		# f&e
1788	'&rorx	($a0,$e,$Sigma1[2])',
1789	'&rorx	($a2,$e,$Sigma1[1])',
1790
1791	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
1792	'&lea	($h,"($h,$a4)")',
1793	'&andn	($a4,$e,$g)',		# ~e&g
1794	'&xor	($a0,$a2)',
1795
1796	'&rorx	($a1,$e,$Sigma1[0])',
1797	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
1798	'&xor	($a0,$a1)',		# Sigma1(e)
1799	'&mov	($a2,$a)',
1800
1801	'&rorx	($a4,$a,$Sigma0[2])',
1802	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
1803	'&xor	($a2,$b)',		# a^b, b^c in next round
1804	'&rorx	($a1,$a,$Sigma0[1])',
1805
1806	'&rorx	($a0,$a,$Sigma0[0])',
1807	'&lea	($d,"($d,$h)")',	# d+=h
1808	'&and	($a3,$a2)',		# (b^c)&(a^b)
1809	'&xor	($a1,$a4)',
1810
1811	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
1812	'&xor	($a1,$a0)',		# Sigma0(a)
1813	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
1814	'&mov	($a4,$e)',		# copy of f in future
1815
1816	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1817	);
1818	# and at the finish one has to $a+=$a1
1819}
1820
1821$code.=<<___;
1822.type	${func}_avx2,\@function,3
1823.align	64
1824${func}_avx2:
1825.Lavx2_shortcut:
1826	push	%rbx
1827	push	%rbp
1828	push	%r12
1829	push	%r13
1830	push	%r14
1831	push	%r15
1832	mov	%rsp,%r11		# copy %rsp
1833	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1834	shl	\$4,%rdx		# num*16
1835	and	\$-256*$SZ,%rsp		# align stack frame
1836	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1837	add	\$`2*$SZ*($rounds-8)`,%rsp
1838	mov	$ctx,$_ctx		# save ctx, 1st arg
1839	mov	$inp,$_inp		# save inp, 2nd arg
1840	mov	%rdx,$_end		# save end pointer, "3rd" arg
1841	mov	%r11,$_rsp		# save copy of %rsp
1842___
1843$code.=<<___ if ($win64);
1844	movaps	%xmm6,16*$SZ+32(%rsp)
1845	movaps	%xmm7,16*$SZ+48(%rsp)
1846	movaps	%xmm8,16*$SZ+64(%rsp)
1847	movaps	%xmm9,16*$SZ+80(%rsp)
1848___
1849$code.=<<___ if ($win64 && $SZ>4);
1850	movaps	%xmm10,16*$SZ+96(%rsp)
1851	movaps	%xmm11,16*$SZ+112(%rsp)
1852___
1853$code.=<<___;
1854.Lprologue_avx2:
1855
1856	vzeroupper
1857	sub	\$-16*$SZ,$inp		# inp++, size optimization
1858	mov	$SZ*0($ctx),$A
1859	mov	$inp,%r12		# borrow $T1
1860	mov	$SZ*1($ctx),$B
1861	cmp	%rdx,$inp		# $_end
1862	mov	$SZ*2($ctx),$C
1863	cmove	%rsp,%r12		# next block or random data
1864	mov	$SZ*3($ctx),$D
1865	mov	$SZ*4($ctx),$E
1866	mov	$SZ*5($ctx),$F
1867	mov	$SZ*6($ctx),$G
1868	mov	$SZ*7($ctx),$H
1869___
1870					if ($SZ==4) {	# SHA256
1871    my @X = map("%ymm$_",(0..3));
1872    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1873
1874$code.=<<___;
1875	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1876	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1877	jmp	.Loop_avx2
1878.align	16
1879.Loop_avx2:
1880	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1881	vmovdqu	-16*$SZ+0($inp),%xmm0
1882	vmovdqu	-16*$SZ+16($inp),%xmm1
1883	vmovdqu	-16*$SZ+32($inp),%xmm2
1884	vmovdqu	-16*$SZ+48($inp),%xmm3
1885	#mov		$inp,$_inp	# offload $inp
1886	vinserti128	\$1,(%r12),@X[0],@X[0]
1887	vinserti128	\$1,16(%r12),@X[1],@X[1]
1888	vpshufb		$t3,@X[0],@X[0]
1889	vinserti128	\$1,32(%r12),@X[2],@X[2]
1890	vpshufb		$t3,@X[1],@X[1]
1891	vinserti128	\$1,48(%r12),@X[3],@X[3]
1892
1893	lea	$TABLE(%rip),$Tbl
1894	vpshufb	$t3,@X[2],@X[2]
1895	vpaddd	0x00($Tbl),@X[0],$t0
1896	vpshufb	$t3,@X[3],@X[3]
1897	vpaddd	0x20($Tbl),@X[1],$t1
1898	vpaddd	0x40($Tbl),@X[2],$t2
1899	vpaddd	0x60($Tbl),@X[3],$t3
1900	vmovdqa	$t0,0x00(%rsp)
1901	xor	$a1,$a1
1902	vmovdqa	$t1,0x20(%rsp)
1903	lea	-$PUSH8(%rsp),%rsp
1904	mov	$B,$a3
1905	vmovdqa	$t2,0x00(%rsp)
1906	xor	$C,$a3			# magic
1907	vmovdqa	$t3,0x20(%rsp)
1908	mov	$F,$a4
1909	sub	\$-16*2*$SZ,$Tbl	# size optimization
1910	jmp	.Lavx2_00_47
1911
1912.align	16
1913.Lavx2_00_47:
1914___
1915
1916sub AVX2_256_00_47 () {
1917my $j = shift;
1918my $body = shift;
1919my @X = @_;
1920my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1921my $base = "+2*$PUSH8(%rsp)";
1922
1923	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
1924	foreach (Xupdate_256_AVX()) {		# 29 instructions
1925	    eval;
1926	    eval(shift(@insns));
1927	    eval(shift(@insns));
1928	    eval(shift(@insns));
1929	}
1930	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1931	  foreach (@insns) { eval; }		# remaining instructions
1932	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1933}
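# Each AVX2_256_00_47() call interleaves the 29-instruction
# Xupdate_256_AVX schedule update with four copies of bodyx_00_15
# (96 scalar instructions, roughly three per vector one), then pre-adds
# the round constants with vpaddd and parks the result in the frame with
# vmovdqa for the rounds that will consume it; %rsp slides down by
# $PUSH8 on every even $j so the spill area keeps up.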
1934
1935    for ($i=0,$j=0; $j<4; $j++) {
1936	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1937	push(@X,shift(@X));			# rotate(@X)
1938    }
1939	&lea	($Tbl,16*2*$SZ."($Tbl)");
1940	&cmpb	(($SZ-1)."($Tbl)",0);
1941	&jne	(".Lavx2_00_47");
1942
1943    for ($i=0; $i<16; ) {
1944	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1945	foreach(bodyx_00_15()) { eval; }
1946    }
1947					} else {	# SHA512
1948    my @X = map("%ymm$_",(0..7));
1949    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
1950
1951$code.=<<___;
1952	jmp	.Loop_avx2
1953.align	16
1954.Loop_avx2:
1955	vmovdqu	-16*$SZ($inp),%xmm0
1956	vmovdqu	-16*$SZ+16($inp),%xmm1
1957	vmovdqu	-16*$SZ+32($inp),%xmm2
1958	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1959	vmovdqu	-16*$SZ+48($inp),%xmm3
1960	vmovdqu	-16*$SZ+64($inp),%xmm4
1961	vmovdqu	-16*$SZ+80($inp),%xmm5
1962	vmovdqu	-16*$SZ+96($inp),%xmm6
1963	vmovdqu	-16*$SZ+112($inp),%xmm7
1964	#mov	$inp,$_inp	# offload $inp
1965	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
1966	vinserti128	\$1,(%r12),@X[0],@X[0]
1967	vinserti128	\$1,16(%r12),@X[1],@X[1]
1968	 vpshufb	$t2,@X[0],@X[0]
1969	vinserti128	\$1,32(%r12),@X[2],@X[2]
1970	 vpshufb	$t2,@X[1],@X[1]
1971	vinserti128	\$1,48(%r12),@X[3],@X[3]
1972	 vpshufb	$t2,@X[2],@X[2]
1973	vinserti128	\$1,64(%r12),@X[4],@X[4]
1974	 vpshufb	$t2,@X[3],@X[3]
1975	vinserti128	\$1,80(%r12),@X[5],@X[5]
1976	 vpshufb	$t2,@X[4],@X[4]
1977	vinserti128	\$1,96(%r12),@X[6],@X[6]
1978	 vpshufb	$t2,@X[5],@X[5]
1979	vinserti128	\$1,112(%r12),@X[7],@X[7]
1980
1981	vpaddq	-0x80($Tbl),@X[0],$t0
1982	vpshufb	$t2,@X[6],@X[6]
1983	vpaddq	-0x60($Tbl),@X[1],$t1
1984	vpshufb	$t2,@X[7],@X[7]
1985	vpaddq	-0x40($Tbl),@X[2],$t2
1986	vpaddq	-0x20($Tbl),@X[3],$t3
1987	vmovdqa	$t0,0x00(%rsp)
1988	vpaddq	0x00($Tbl),@X[4],$t0
1989	vmovdqa	$t1,0x20(%rsp)
1990	vpaddq	0x20($Tbl),@X[5],$t1
1991	vmovdqa	$t2,0x40(%rsp)
1992	vpaddq	0x40($Tbl),@X[6],$t2
1993	vmovdqa	$t3,0x60(%rsp)
1994	lea	-$PUSH8(%rsp),%rsp
1995	vpaddq	0x60($Tbl),@X[7],$t3
1996	vmovdqa	$t0,0x00(%rsp)
1997	xor	$a1,$a1
1998	vmovdqa	$t1,0x20(%rsp)
1999	mov	$B,$a3
2000	vmovdqa	$t2,0x40(%rsp)
2001	xor	$C,$a3			# magic
2002	vmovdqa	$t3,0x60(%rsp)
2003	mov	$F,$a4
2004	add	\$16*2*$SZ,$Tbl
2005	jmp	.Lavx2_00_47
2006
2007.align	16
2008.Lavx2_00_47:
2009___
2010
2011sub AVX2_512_00_47 () {
2012my $j = shift;
2013my $body = shift;
2014my @X = @_;
2015my @insns = (&$body,&$body);			# 48 instructions
2016my $base = "+2*$PUSH8(%rsp)";
2017
2018	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
2019	foreach (Xupdate_512_AVX()) {		# 23 instructions
2020	    eval;
2021	    if ($_ !~ /\;$/) {
2022		eval(shift(@insns));
2023		eval(shift(@insns));
2024		eval(shift(@insns));
2025	    }
2026	}
2027	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
2028	  foreach (@insns) { eval; }		# remaining instructions
2029	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
2030}
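# The SHA512 flavour follows the same pattern with halved interleaving:
# Xupdate_512_AVX is 23 instructions and only two copies of the round
# body (48 instructions) are woven in, %rsp slides only on every fourth
# $j, and the $Tbl offsets are biased by -0x80 (the "size optimization"
# above) so they fit in one-byte displacements.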
2031
2032    for ($i=0,$j=0; $j<8; $j++) {
2033	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
2034	push(@X,shift(@X));			# rotate(@X)
2035    }
2036	&lea	($Tbl,16*2*$SZ."($Tbl)");
2037	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
2038	&jne	(".Lavx2_00_47");
2039
2040    for ($i=0; $i<16; ) {
2041	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2042	foreach(bodyx_00_15()) { eval; }
2043    }
2044}
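# Common tail for both flavours: fold the first block's digest into the
# hash state, and unless $inp has already reached $_end, run the
# .Lower_avx2 rounds over the X[i]+K[i] values still parked in the frame
# to process the block that travelled in the upper ymm lanes (their
# copies sit 16 bytes higher in each row, hence the "+16($Tbl)" base),
# walking $Tbl down the stack until it meets %rsp.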
2045$code.=<<___;
2046	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2047	add	$a1,$A
2048	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2049	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
2050
2051	add	$SZ*0($ctx),$A
2052	add	$SZ*1($ctx),$B
2053	add	$SZ*2($ctx),$C
2054	add	$SZ*3($ctx),$D
2055	add	$SZ*4($ctx),$E
2056	add	$SZ*5($ctx),$F
2057	add	$SZ*6($ctx),$G
2058	add	$SZ*7($ctx),$H
2059
2060	mov	$A,$SZ*0($ctx)
2061	mov	$B,$SZ*1($ctx)
2062	mov	$C,$SZ*2($ctx)
2063	mov	$D,$SZ*3($ctx)
2064	mov	$E,$SZ*4($ctx)
2065	mov	$F,$SZ*5($ctx)
2066	mov	$G,$SZ*6($ctx)
2067	mov	$H,$SZ*7($ctx)
2068
2069	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
2070	je	.Ldone_avx2
2071
2072	xor	$a1,$a1
2073	mov	$B,$a3
2074	xor	$C,$a3			# magic
2075	mov	$F,$a4
2076	jmp	.Lower_avx2
2077.align	16
2078.Lower_avx2:
2079___
2080    for ($i=0; $i<8; ) {
2081	my $base="+16($Tbl)";
2082	foreach(bodyx_00_15()) { eval; }
2083    }
2084$code.=<<___;
2085	lea	-$PUSH8($Tbl),$Tbl
2086	cmp	%rsp,$Tbl
2087	jae	.Lower_avx2
2088
2089	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2090	add	$a1,$A
2091	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2092	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
2093
2094	add	$SZ*0($ctx),$A
2095	add	$SZ*1($ctx),$B
2096	add	$SZ*2($ctx),$C
2097	add	$SZ*3($ctx),$D
2098	add	$SZ*4($ctx),$E
2099	add	$SZ*5($ctx),$F
2100	lea	`2*16*$SZ`($inp),$inp	# inp+=2
2101	add	$SZ*6($ctx),$G
2102	mov	$inp,%r12
2103	add	$SZ*7($ctx),$H
2104	cmp	$_end,$inp
2105
2106	mov	$A,$SZ*0($ctx)
2107	cmove	%rsp,%r12		# next block or stale data
2108	mov	$B,$SZ*1($ctx)
2109	mov	$C,$SZ*2($ctx)
2110	mov	$D,$SZ*3($ctx)
2111	mov	$E,$SZ*4($ctx)
2112	mov	$F,$SZ*5($ctx)
2113	mov	$G,$SZ*6($ctx)
2114	mov	$H,$SZ*7($ctx)
2115
2116	jbe	.Loop_avx2
2117	lea	(%rsp),$Tbl
2118
2119.Ldone_avx2:
2120	lea	($Tbl),%rsp
2121	mov	$_rsp,%rsi
2122	vzeroupper
2123___
2124$code.=<<___ if ($win64);
2125	movaps	16*$SZ+32(%rsp),%xmm6
2126	movaps	16*$SZ+48(%rsp),%xmm7
2127	movaps	16*$SZ+64(%rsp),%xmm8
2128	movaps	16*$SZ+80(%rsp),%xmm9
2129___
2130$code.=<<___ if ($win64 && $SZ>4);
2131	movaps	16*$SZ+96(%rsp),%xmm10
2132	movaps	16*$SZ+112(%rsp),%xmm11
2133___
2134$code.=<<___;
2135	mov	(%rsi),%r15
2136	mov	8(%rsi),%r14
2137	mov	16(%rsi),%r13
2138	mov	24(%rsi),%r12
2139	mov	32(%rsi),%rbp
2140	mov	40(%rsi),%rbx
2141	lea	48(%rsi),%rsp
2142.Lepilogue_avx2:
2143	ret
2144.size	${func}_avx2,.-${func}_avx2
2145___
2146}}
2147}}}}}
2148
2149# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2150#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2151if ($win64) {
2152$rec="%rcx";
2153$frame="%rdx";
2154$context="%r8";
2155$disp="%r9";
2156
2157$code.=<<___;
2158.extern	__imp_RtlVirtualUnwind
2159.type	se_handler,\@abi-omnipotent
2160.align	16
2161se_handler:
2162	push	%rsi
2163	push	%rdi
2164	push	%rbx
2165	push	%rbp
2166	push	%r12
2167	push	%r13
2168	push	%r14
2169	push	%r15
2170	pushfq
2171	sub	\$64,%rsp
2172
2173	mov	120($context),%rax	# pull context->Rax
2174	mov	248($context),%rbx	# pull context->Rip
2175
2176	mov	8($disp),%rsi		# disp->ImageBase
2177	mov	56($disp),%r11		# disp->HandlerData
2178
2179	mov	0(%r11),%r10d		# HandlerData[0]
2180	lea	(%rsi,%r10),%r10	# prologue label
2181	cmp	%r10,%rbx		# context->Rip<prologue label
2182	jb	.Lin_prologue
2183
2184	mov	152($context),%rax	# pull context->Rsp
2185
2186	mov	4(%r11),%r10d		# HandlerData[1]
2187	lea	(%rsi,%r10),%r10	# epilogue label
2188	cmp	%r10,%rbx		# context->Rip>=epilogue label
2189	jae	.Lin_prologue
2190___
2191$code.=<<___ if ($avx>1);
2192	lea	.Lavx2_shortcut(%rip),%r10
2193	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
2194	jb	.Lnot_in_avx2
2195
2196	and	\$-256*$SZ,%rax
2197	add	\$`2*$SZ*($rounds-8)`,%rax
2198.Lnot_in_avx2:
2199___
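# Inside the AVX2 code %rsp keeps sliding during the rounds, so the
# saved-register area cannot be found at a fixed offset from it; instead
# the handler re-derives the frame base exactly as the prologue built
# it: align down to 256*$SZ and redo the 2*$SZ*($rounds-8) adjustment.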
2200$code.=<<___;
2201	mov	%rax,%rsi		# put aside Rsp
2202	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
2203	lea	48(%rax),%rax
2204
2205	mov	-8(%rax),%rbx
2206	mov	-16(%rax),%rbp
2207	mov	-24(%rax),%r12
2208	mov	-32(%rax),%r13
2209	mov	-40(%rax),%r14
2210	mov	-48(%rax),%r15
2211	mov	%rbx,144($context)	# restore context->Rbx
2212	mov	%rbp,160($context)	# restore context->Rbp
2213	mov	%r12,216($context)	# restore context->R12
2214	mov	%r13,224($context)	# restore context->R13
2215	mov	%r14,232($context)	# restore context->R14
2216	mov	%r15,240($context)	# restore context->R15
2217
2218	lea	.Lepilogue(%rip),%r10
2219	cmp	%r10,%rbx
2220	jb	.Lin_prologue		# non-AVX code
2221
2222	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
2223	lea	512($context),%rdi	# &context.Xmm6
2224	mov	\$`$SZ==4?8:12`,%ecx
2225	.long	0xa548f3fc		# cld; rep movsq
2226
2227.Lin_prologue:
2228	mov	8(%rax),%rdi
2229	mov	16(%rax),%rsi
2230	mov	%rax,152($context)	# restore context->Rsp
2231	mov	%rsi,168($context)	# restore context->Rsi
2232	mov	%rdi,176($context)	# restore context->Rdi
2233
2234	mov	40($disp),%rdi		# disp->ContextRecord
2235	mov	$context,%rsi		# context
2236	mov	\$154,%ecx		# sizeof(CONTEXT)
2237	.long	0xa548f3fc		# cld; rep movsq
2238
2239	mov	$disp,%rsi
2240	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2241	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2242	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2243	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2244	mov	40(%rsi),%r10		# disp->ContextRecord
2245	lea	56(%rsi),%r11		# &disp->HandlerData
2246	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2247	mov	%r10,32(%rsp)		# arg5
2248	mov	%r11,40(%rsp)		# arg6
2249	mov	%r12,48(%rsp)		# arg7
2250	mov	%rcx,56(%rsp)		# arg8, (NULL)
2251	call	*__imp_RtlVirtualUnwind(%rip)
2252
2253	mov	\$1,%eax		# ExceptionContinueSearch
2254	add	\$64,%rsp
2255	popfq
2256	pop	%r15
2257	pop	%r14
2258	pop	%r13
2259	pop	%r12
2260	pop	%rbp
2261	pop	%rbx
2262	pop	%rdi
2263	pop	%rsi
2264	ret
2265.size	se_handler,.-se_handler
2266___
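# In short: se_handler locates the frame (directly, or via the AVX2
# fix-up above), pulls the saved %rsp from the $_rsp slot, restores the
# six non-volatile GPRs into the CONTEXT record, additionally copies the
# xmm save area when RIP lies beyond the scalar .Lepilogue (i.e. in one
# of the SIMD paths), and then lets RtlVirtualUnwind finish the job,
# returning ExceptionContinueSearch.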
2267
2268$code.=<<___ if ($SZ==4 && $shaext);
2269.type	shaext_handler,\@abi-omnipotent
2270.align	16
2271shaext_handler:
2272	push	%rsi
2273	push	%rdi
2274	push	%rbx
2275	push	%rbp
2276	push	%r12
2277	push	%r13
2278	push	%r14
2279	push	%r15
2280	pushfq
2281	sub	\$64,%rsp
2282
2283	mov	120($context),%rax	# pull context->Rax
2284	mov	248($context),%rbx	# pull context->Rip
2285
2286	lea	.Lprologue_shaext(%rip),%r10
2287	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
2288	jb	.Lin_prologue
2289
2290	lea	.Lepilogue_shaext(%rip),%r10
2291	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
2292	jae	.Lin_prologue
2293
2294	lea	-8-5*16(%rax),%rsi
2295	lea	512($context),%rdi	# &context.Xmm6
2296	mov	\$10,%ecx
2297	.long	0xa548f3fc		# cld; rep movsq
2298
2299	jmp	.Lin_prologue
2300.size	shaext_handler,.-shaext_handler
2301___
2302
2303$code.=<<___;
2304.section	.pdata
2305.align	4
2306	.rva	.LSEH_begin_$func
2307	.rva	.LSEH_end_$func
2308	.rva	.LSEH_info_$func
2309___
2310$code.=<<___ if ($SZ==4 && $shaext);
2311	.rva	.LSEH_begin_${func}_shaext
2312	.rva	.LSEH_end_${func}_shaext
2313	.rva	.LSEH_info_${func}_shaext
2314___
2315$code.=<<___ if ($SZ==4);
2316	.rva	.LSEH_begin_${func}_ssse3
2317	.rva	.LSEH_end_${func}_ssse3
2318	.rva	.LSEH_info_${func}_ssse3
2319___
2320$code.=<<___ if ($avx && $SZ==8);
2321	.rva	.LSEH_begin_${func}_xop
2322	.rva	.LSEH_end_${func}_xop
2323	.rva	.LSEH_info_${func}_xop
2324___
2325$code.=<<___ if ($avx);
2326	.rva	.LSEH_begin_${func}_avx
2327	.rva	.LSEH_end_${func}_avx
2328	.rva	.LSEH_info_${func}_avx
2329___
2330$code.=<<___ if ($avx>1);
2331	.rva	.LSEH_begin_${func}_avx2
2332	.rva	.LSEH_end_${func}_avx2
2333	.rva	.LSEH_info_${func}_avx2
2334___
2335$code.=<<___;
2336.section	.xdata
2337.align	8
2338.LSEH_info_$func:
2339	.byte	9,0,0,0
2340	.rva	se_handler
2341	.rva	.Lprologue,.Lepilogue			# HandlerData[]
2342___
2343$code.=<<___ if ($SZ==4 && $shaext);
2344.LSEH_info_${func}_shaext:
2345	.byte	9,0,0,0
2346	.rva	shaext_handler
2347___
2348$code.=<<___ if ($SZ==4);
2349.LSEH_info_${func}_ssse3:
2350	.byte	9,0,0,0
2351	.rva	se_handler
2352	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
2353___
2354$code.=<<___ if ($avx && $SZ==8);
2355.LSEH_info_${func}_xop:
2356	.byte	9,0,0,0
2357	.rva	se_handler
2358	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
2359___
2360$code.=<<___ if ($avx);
2361.LSEH_info_${func}_avx:
2362	.byte	9,0,0,0
2363	.rva	se_handler
2364	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2365___
2366$code.=<<___ if ($avx>1);
2367.LSEH_info_${func}_avx2:
2368	.byte	9,0,0,0
2369	.rva	se_handler
2370	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2371___
2372}
2373
2374sub sha256op38 {
2375    my $instr = shift;
2376    my %opcodelet = (
2377		"sha256rnds2" => 0xcb,
2378  		"sha256msg1"  => 0xcc,
2379		"sha256msg2"  => 0xcd	);
2380
2381    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2382	my @opcode=(0x0f,0x38);
2383	push @opcode,$opcodelet{$instr};
2384	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
2385	return ".byte\t".join(',',@opcode);
2386    } else {
2387	return $instr."\t".$_[0];
2388    }
2389}
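# sha256op38() hand-assembles the SHA-NI mnemonics, presumably so the
# module still builds with assemblers that predate the SHA extensions.
# Worked example (not copied from the output above): "sha256rnds2
# %xmm0,%xmm1" matches with $1=0, $2=1 and is emitted as
# ".byte 0x0f,0x38,0xcb,0xc8" -- opcode 0f 38 cb plus ModR/M
# 0xc0|0|(1<<3).  Operands outside %xmm0-%xmm7 fall through to the
# plain-text branch unchanged.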
2390
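# The output pass below expands every backquoted expression with eval
# and rewrites sha256* mnemonics through sha256op38() before printing
# each line to stdout.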
2391foreach (split("\n",$code)) {
2392	s/\`([^\`]*)\`/eval $1/geo;
2393
2394	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2395
2396	print $_,"\n";
2397}
2398close STDOUT;
2399