1#! /usr/bin/env perl
2# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. Rights for redistribution and usage in source and binary
13# forms are granted according to the OpenSSL license.
14# ====================================================================
15#
16# sha256/512_block procedure for x86_64.
17#
18# 40% improvement over compiler-generated code on Opteron. On EM64T
19# sha256 was observed to run >80% faster and sha512 - >40%. No magical
20# tricks, just straight implementation... I really wonder why gcc
21# [being armed with inline assembler] fails to generate as fast code.
22# The only thing which is cool about this module is that it's very
23# same instruction sequence used for both SHA-256 and SHA-512. In
24# former case the instructions operate on 32-bit operands, while in
25# latter - on 64-bit ones. All I had to do is to get one flavor right,
26# the other one passed the test right away:-)
27#
28# sha256_block runs in ~1005 cycles on Opteron, which gives you
29# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
30# frequency in GHz. sha512_block runs in ~1275 cycles, which results
31# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
32# Well, if you compare it to IA-64 implementation, which maintains
33# X[16] in register bank[!], tends to 4 instructions per CPU clock
34# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
35# issue Opteron pipeline and X[16] maintained in memory. So that *if*
36# there is a way to improve it, *then* the only way would be to try to
37# offload X[16] updates to SSE unit, but that would require "deeper"
38# loop unroll, which in turn would naturally cause size blow-up, not
39# to mention increased complexity! And once again, only *if* it's
40# actually possible to noticeably improve overall ILP, instruction
41# level parallelism, on a given CPU implementation in this case.
42#
43# Special note on Intel EM64T. While Opteron CPU exhibits perfect
44# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
45# [currently available] EM64T CPUs apparently are far from it. On the
46# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
47# sha256_block:-( This is presumably because 64-bit shifts/rotates
48# apparently are not atomic instructions, but implemented in microcode.
49#
50# May 2012.
51#
52# Optimization including one of Pavel Semjanov's ideas, alternative
53# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
54# unfortunately -2% SHA512 on P4 [which nobody should care about
55# that much].
56#
57# June 2012.
58#
59# Add SIMD code paths, see below for improvement coefficients. SSSE3
60# code path was not attempted for SHA512, because improvement is not
61# estimated to be high enough, noticeably less than 9%, to justify
62# the effort, not on pre-AVX processors. [Obviously with exclusion
63# for VIA Nano, but it has SHA512 instruction that is faster and
64# should be used instead.] For reference, corresponding estimated
65# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
66# higher coefficients are observed on VIA Nano and Bulldozer has more
67# to do with specifics of their architecture [which is topic for
68# separate discussion].
69#
70# November 2012.
71#
72# Add AVX2 code path. Two consecutive input blocks are loaded to
73# 256-bit %ymm registers, with data from first block to least
74# significant 128-bit halves and data from second to most significant.
75# The data is then processed with same SIMD instruction sequence as
76# for AVX, but with %ymm as operands. Side effect is increased stack
77# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
78# code size increase.
79#
80# March 2014.
81#
82# Add support for Intel SHA Extensions.
83
84######################################################################
85# Current performance in cycles per processed byte (less is better):
86#
87#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
88#
89# AMD K8	14.9	-	    -		    9.57    -
90# P4		17.3	-	    -		    30.8    -
91# Core 2	15.6	13.8(+13%)  -		    9.97    -
92# Westmere	14.8	12.3(+19%)  -		    9.58    -
93# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
94# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
95# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
96# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
97# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
98# Ryzen		11.0	9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
99# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
100# Atom		23.0	18.9(+22%)  -		    14.7    -
101# Silvermont	27.4	20.6(+33%)  -               17.5    -
102# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
103# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
104#
105# (*)	whichever best applicable, including SHAEXT;
106# (**)	switch from ror to shrd stands for fair share of improvement;
107# (***)	execution time is fully determined by remaining integer-only
108#	part, body_00_15; reducing the amount of SIMD instructions
109#	below certain limit makes no difference/sense; to conserve
110#	space SHA256 XOP code path is therefore omitted;
111
112$flavour = shift;
113$output  = shift;
114if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
115
116$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
117
118$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
119( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
120( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
121die "can't locate x86_64-xlate.pl";
122
123# In upstream, this is controlled by shelling out to the compiler to check
124# versions, but BoringSSL is intended to be used with pre-generated perlasm
125# output, so this isn't useful anyway.
126#
127# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
128# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
129# did not tie them together until after $shaext was added.
130$avx = 1;
131
132# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
133# been tested.
134$shaext=0;	### set to zero if compiling for 1.0.1
135$avx=1		if (!$shaext && $avx);
136
137open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
138*STDOUT=*OUT;
139
140if ($output =~ /512/) {
141	$func="sha512_block_data_order";
142	$TABLE="K512";
143	$SZ=8;
144	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
145					"%r8", "%r9", "%r10","%r11");
146	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
147	@Sigma0=(28,34,39);
148	@Sigma1=(14,18,41);
149	@sigma0=(1,  8, 7);
150	@sigma1=(19,61, 6);
151	$rounds=80;
152} else {
153	$func="sha256_block_data_order";
154	$TABLE="K256";
155	$SZ=4;
156	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
157					"%r8d","%r9d","%r10d","%r11d");
158	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
159	@Sigma0=( 2,13,22);
160	@Sigma1=( 6,11,25);
161	@sigma0=( 7,18, 3);
162	@sigma1=(17,19,10);
163	$rounds=64;
164}
165
166$ctx="%rdi";	# 1st arg, zapped by $a3
167$inp="%rsi";	# 2nd arg
168$Tbl="%rbp";
169
170$_ctx="16*$SZ+0*8(%rsp)";
171$_inp="16*$SZ+1*8(%rsp)";
172$_end="16*$SZ+2*8(%rsp)";
173$_rsp="16*$SZ+3*8(%rsp)";
174$framesz="16*$SZ+4*8";
175
176
177sub ROUND_00_15()
178{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
179  my $STRIDE=$SZ;
180     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
181
182$code.=<<___;
183	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
184	mov	$f,$a2
185
186	xor	$e,$a0
187	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
188	xor	$g,$a2			# f^g
189
190	mov	$T1,`$SZ*($i&0xf)`(%rsp)
191	xor	$a,$a1
192	and	$e,$a2			# (f^g)&e
193
194	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
195	add	$h,$T1			# T1+=h
196	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
197
198	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
199	xor	$e,$a0
200	add	$a2,$T1			# T1+=Ch(e,f,g)
201
202	mov	$a,$a2
203	add	($Tbl),$T1		# T1+=K[round]
204	xor	$a,$a1
205
206	xor	$b,$a2			# a^b, b^c in next round
207	ror	\$$Sigma1[0],$a0	# Sigma1(e)
208	mov	$b,$h
209
210	and	$a2,$a3
211	ror	\$$Sigma0[0],$a1	# Sigma0(a)
212	add	$a0,$T1			# T1+=Sigma1(e)
213
214	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
215	add	$T1,$d			# d+=T1
216	add	$T1,$h			# h+=T1
217
218	lea	$STRIDE($Tbl),$Tbl	# round++
219___
220$code.=<<___ if ($i<15);
221	add	$a1,$h			# h+=Sigma0(a)
222___
223	($a2,$a3) = ($a3,$a2);
224}
225
226sub ROUND_16_XX()
227{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
228
229$code.=<<___;
230	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
231	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
232
233	mov	$a0,$T1
234	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
235	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
236	mov	$a2,$a1
237	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
238
239	xor	$T1,$a0
240	shr	\$$sigma0[2],$T1
241	ror	\$$sigma0[0],$a0
242	xor	$a1,$a2
243	shr	\$$sigma1[2],$a1
244
245	ror	\$$sigma1[0],$a2
246	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
247	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
248	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
249
250	add	`$SZ*($i&0xf)`(%rsp),$T1
251	mov	$e,$a0
252	add	$a2,$T1
253	mov	$a,$a1
254___
255	&ROUND_00_15(@_);
256}
257
258$code=<<___;
259.text
260
261.extern	OPENSSL_ia32cap_P
262.globl	$func
263.type	$func,\@function,3
264.align	16
265$func:
266___
267$code.=<<___ if ($SZ==4 || $avx);
268	leaq	OPENSSL_ia32cap_P(%rip),%r11
269	mov	0(%r11),%r9d
270	mov	4(%r11),%r10d
271	mov	8(%r11),%r11d
272___
273$code.=<<___ if ($SZ==4 && $shaext);
274	test	\$`1<<29`,%r11d		# check for SHA
275	jnz	_shaext_shortcut
276___
277$code.=<<___ if ($avx && $SZ==8);
278	test	\$`1<<11`,%r10d		# check for XOP
279	jnz	.Lxop_shortcut
280___
281$code.=<<___ if ($avx>1);
282	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
283	cmp	\$`1<<8|1<<5|1<<3`,%r11d
284	je	.Lavx2_shortcut
285___
286$code.=<<___ if ($avx);
287	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
288	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
289	or	%r9d,%r10d
290	cmp	\$`1<<28|1<<9|1<<30`,%r10d
291	je	.Lavx_shortcut
292___
293$code.=<<___ if ($SZ==4);
294	test	\$`1<<9`,%r10d
295	jnz	.Lssse3_shortcut
296___
297$code.=<<___;
298	mov	%rsp,%rax		# copy %rsp
299	push	%rbx
300	push	%rbp
301	push	%r12
302	push	%r13
303	push	%r14
304	push	%r15
305	shl	\$4,%rdx		# num*16
306	sub	\$$framesz,%rsp
307	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
308	and	\$-64,%rsp		# align stack frame
309	mov	$ctx,$_ctx		# save ctx, 1st arg
310	mov	$inp,$_inp		# save inp, 2nd arh
311	mov	%rdx,$_end		# save end pointer, "3rd" arg
312	mov	%rax,$_rsp		# save copy of %rsp
313.Lprologue:
314
315	mov	$SZ*0($ctx),$A
316	mov	$SZ*1($ctx),$B
317	mov	$SZ*2($ctx),$C
318	mov	$SZ*3($ctx),$D
319	mov	$SZ*4($ctx),$E
320	mov	$SZ*5($ctx),$F
321	mov	$SZ*6($ctx),$G
322	mov	$SZ*7($ctx),$H
323	jmp	.Lloop
324
325.align	16
326.Lloop:
327	mov	$B,$a3
328	lea	$TABLE(%rip),$Tbl
329	xor	$C,$a3			# magic
330___
331	for($i=0;$i<16;$i++) {
332		$code.="	mov	$SZ*$i($inp),$T1\n";
333		$code.="	mov	@ROT[4],$a0\n";
334		$code.="	mov	@ROT[0],$a1\n";
335		$code.="	bswap	$T1\n";
336		&ROUND_00_15($i,@ROT);
337		unshift(@ROT,pop(@ROT));
338	}
339$code.=<<___;
340	jmp	.Lrounds_16_xx
341.align	16
342.Lrounds_16_xx:
343___
344	for(;$i<32;$i++) {
345		&ROUND_16_XX($i,@ROT);
346		unshift(@ROT,pop(@ROT));
347	}
348
349$code.=<<___;
350	cmpb	\$0,`$SZ-1`($Tbl)
351	jnz	.Lrounds_16_xx
352
353	mov	$_ctx,$ctx
354	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
355	lea	16*$SZ($inp),$inp
356
357	add	$SZ*0($ctx),$A
358	add	$SZ*1($ctx),$B
359	add	$SZ*2($ctx),$C
360	add	$SZ*3($ctx),$D
361	add	$SZ*4($ctx),$E
362	add	$SZ*5($ctx),$F
363	add	$SZ*6($ctx),$G
364	add	$SZ*7($ctx),$H
365
366	cmp	$_end,$inp
367
368	mov	$A,$SZ*0($ctx)
369	mov	$B,$SZ*1($ctx)
370	mov	$C,$SZ*2($ctx)
371	mov	$D,$SZ*3($ctx)
372	mov	$E,$SZ*4($ctx)
373	mov	$F,$SZ*5($ctx)
374	mov	$G,$SZ*6($ctx)
375	mov	$H,$SZ*7($ctx)
376	jb	.Lloop
377
378	mov	$_rsp,%rsi
379	mov	-48(%rsi),%r15
380	mov	-40(%rsi),%r14
381	mov	-32(%rsi),%r13
382	mov	-24(%rsi),%r12
383	mov	-16(%rsi),%rbp
384	mov	-8(%rsi),%rbx
385	lea	(%rsi),%rsp
386.Lepilogue:
387	ret
388.size	$func,.-$func
389___
390
391if ($SZ==4) {
392$code.=<<___;
393.align	64
394.type	$TABLE,\@object
395$TABLE:
396	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
397	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
398	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
399	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
400	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
401	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
402	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
403	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
404	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
405	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
406	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
407	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
408	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
409	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
410	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
411	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
412	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
413	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
414	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
415	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
416	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
417	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
418	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
419	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
420	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
421	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
422	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
423	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
424	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
425	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
426	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
427	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
428
429	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
430	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
431	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
432	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
433	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
434	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
435	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
436___
437} else {
438$code.=<<___;
439.align	64
440.type	$TABLE,\@object
441$TABLE:
442	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
443	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
444	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
445	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
446	.quad	0x3956c25bf348b538,0x59f111f1b605d019
447	.quad	0x3956c25bf348b538,0x59f111f1b605d019
448	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
449	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
450	.quad	0xd807aa98a3030242,0x12835b0145706fbe
451	.quad	0xd807aa98a3030242,0x12835b0145706fbe
452	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
453	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
454	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
455	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
456	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
457	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
458	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
459	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
460	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
461	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
462	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
463	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
464	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
465	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
466	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
467	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
468	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
469	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
470	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
471	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
472	.quad	0x06ca6351e003826f,0x142929670a0e6e70
473	.quad	0x06ca6351e003826f,0x142929670a0e6e70
474	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
475	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
476	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
477	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
478	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
479	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
480	.quad	0x81c2c92e47edaee6,0x92722c851482353b
481	.quad	0x81c2c92e47edaee6,0x92722c851482353b
482	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
483	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
484	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
485	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
486	.quad	0xd192e819d6ef5218,0xd69906245565a910
487	.quad	0xd192e819d6ef5218,0xd69906245565a910
488	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
489	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
490	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
491	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
492	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
493	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
494	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
495	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
496	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
497	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
498	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
499	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
500	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
501	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
502	.quad	0x90befffa23631e28,0xa4506cebde82bde9
503	.quad	0x90befffa23631e28,0xa4506cebde82bde9
504	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
505	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
506	.quad	0xca273eceea26619c,0xd186b8c721c0c207
507	.quad	0xca273eceea26619c,0xd186b8c721c0c207
508	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
509	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
510	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
511	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
512	.quad	0x113f9804bef90dae,0x1b710b35131c471b
513	.quad	0x113f9804bef90dae,0x1b710b35131c471b
514	.quad	0x28db77f523047d84,0x32caab7b40c72493
515	.quad	0x28db77f523047d84,0x32caab7b40c72493
516	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
517	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
518	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
519	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
520	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
521	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
522
523	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
524	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
525	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
526___
527}
528
529######################################################################
530# SIMD code paths
531#
532if ($SZ==4 && $shaext) {{{
533######################################################################
534# Intel SHA Extensions implementation of SHA256 update function.
535#
536my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
537
538my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
539my @MSG=map("%xmm$_",(3..6));
540
541$code.=<<___;
542.type	sha256_block_data_order_shaext,\@function,3
543.align	64
544sha256_block_data_order_shaext:
545_shaext_shortcut:
546___
547$code.=<<___ if ($win64);
548	lea	`-8-5*16`(%rsp),%rsp
549	movaps	%xmm6,-8-5*16(%rax)
550	movaps	%xmm7,-8-4*16(%rax)
551	movaps	%xmm8,-8-3*16(%rax)
552	movaps	%xmm9,-8-2*16(%rax)
553	movaps	%xmm10,-8-1*16(%rax)
554.Lprologue_shaext:
555___
556$code.=<<___;
557	lea		K256+0x80(%rip),$Tbl
558	movdqu		($ctx),$ABEF		# DCBA
559	movdqu		16($ctx),$CDGH		# HGFE
560	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
561
562	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
563	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
564	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
565	movdqa		$TMP,$BSWAP		# offload
566	palignr		\$8,$CDGH,$ABEF		# ABEF
567	punpcklqdq	$Wi,$CDGH		# CDGH
568	jmp		.Loop_shaext
569
570.align	16
571.Loop_shaext:
572	movdqu		($inp),@MSG[0]
573	movdqu		0x10($inp),@MSG[1]
574	movdqu		0x20($inp),@MSG[2]
575	pshufb		$TMP,@MSG[0]
576	movdqu		0x30($inp),@MSG[3]
577
578	movdqa		0*32-0x80($Tbl),$Wi
579	paddd		@MSG[0],$Wi
580	pshufb		$TMP,@MSG[1]
581	movdqa		$CDGH,$CDGH_SAVE	# offload
582	sha256rnds2	$ABEF,$CDGH		# 0-3
583	pshufd		\$0x0e,$Wi,$Wi
584	nop
585	movdqa		$ABEF,$ABEF_SAVE	# offload
586	sha256rnds2	$CDGH,$ABEF
587
588	movdqa		1*32-0x80($Tbl),$Wi
589	paddd		@MSG[1],$Wi
590	pshufb		$TMP,@MSG[2]
591	sha256rnds2	$ABEF,$CDGH		# 4-7
592	pshufd		\$0x0e,$Wi,$Wi
593	lea		0x40($inp),$inp
594	sha256msg1	@MSG[1],@MSG[0]
595	sha256rnds2	$CDGH,$ABEF
596
597	movdqa		2*32-0x80($Tbl),$Wi
598	paddd		@MSG[2],$Wi
599	pshufb		$TMP,@MSG[3]
600	sha256rnds2	$ABEF,$CDGH		# 8-11
601	pshufd		\$0x0e,$Wi,$Wi
602	movdqa		@MSG[3],$TMP
603	palignr		\$4,@MSG[2],$TMP
604	nop
605	paddd		$TMP,@MSG[0]
606	sha256msg1	@MSG[2],@MSG[1]
607	sha256rnds2	$CDGH,$ABEF
608
609	movdqa		3*32-0x80($Tbl),$Wi
610	paddd		@MSG[3],$Wi
611	sha256msg2	@MSG[3],@MSG[0]
612	sha256rnds2	$ABEF,$CDGH		# 12-15
613	pshufd		\$0x0e,$Wi,$Wi
614	movdqa		@MSG[0],$TMP
615	palignr		\$4,@MSG[3],$TMP
616	nop
617	paddd		$TMP,@MSG[1]
618	sha256msg1	@MSG[3],@MSG[2]
619	sha256rnds2	$CDGH,$ABEF
620___
621for($i=4;$i<16-3;$i++) {
622$code.=<<___;
623	movdqa		$i*32-0x80($Tbl),$Wi
624	paddd		@MSG[0],$Wi
625	sha256msg2	@MSG[0],@MSG[1]
626	sha256rnds2	$ABEF,$CDGH		# 16-19...
627	pshufd		\$0x0e,$Wi,$Wi
628	movdqa		@MSG[1],$TMP
629	palignr		\$4,@MSG[0],$TMP
630	nop
631	paddd		$TMP,@MSG[2]
632	sha256msg1	@MSG[0],@MSG[3]
633	sha256rnds2	$CDGH,$ABEF
634___
635	push(@MSG,shift(@MSG));
636}
637$code.=<<___;
638	movdqa		13*32-0x80($Tbl),$Wi
639	paddd		@MSG[0],$Wi
640	sha256msg2	@MSG[0],@MSG[1]
641	sha256rnds2	$ABEF,$CDGH		# 52-55
642	pshufd		\$0x0e,$Wi,$Wi
643	movdqa		@MSG[1],$TMP
644	palignr		\$4,@MSG[0],$TMP
645	sha256rnds2	$CDGH,$ABEF
646	paddd		$TMP,@MSG[2]
647
648	movdqa		14*32-0x80($Tbl),$Wi
649	paddd		@MSG[1],$Wi
650	sha256rnds2	$ABEF,$CDGH		# 56-59
651	pshufd		\$0x0e,$Wi,$Wi
652	sha256msg2	@MSG[1],@MSG[2]
653	movdqa		$BSWAP,$TMP
654	sha256rnds2	$CDGH,$ABEF
655
656	movdqa		15*32-0x80($Tbl),$Wi
657	paddd		@MSG[2],$Wi
658	nop
659	sha256rnds2	$ABEF,$CDGH		# 60-63
660	pshufd		\$0x0e,$Wi,$Wi
661	dec		$num
662	nop
663	sha256rnds2	$CDGH,$ABEF
664
665	paddd		$CDGH_SAVE,$CDGH
666	paddd		$ABEF_SAVE,$ABEF
667	jnz		.Loop_shaext
668
669	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
670	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
671	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
672	punpckhqdq	$CDGH,$ABEF		# DCBA
673	palignr		\$8,$TMP,$CDGH		# HGFE
674
675	movdqu	$ABEF,($ctx)
676	movdqu	$CDGH,16($ctx)
677___
678$code.=<<___ if ($win64);
679	movaps	-8-5*16(%rax),%xmm6
680	movaps	-8-4*16(%rax),%xmm7
681	movaps	-8-3*16(%rax),%xmm8
682	movaps	-8-2*16(%rax),%xmm9
683	movaps	-8-1*16(%rax),%xmm10
684	mov	%rax,%rsp
685.Lepilogue_shaext:
686___
687$code.=<<___;
688	ret
689.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
690___
691}}}
692{{{
693
694my $a4=$T1;
695my ($a,$b,$c,$d,$e,$f,$g,$h);
696
697sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
698{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
699  my $arg = pop;
700    $arg = "\$$arg" if ($arg*1 eq $arg);
701    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
702}
703
704sub body_00_15 () {
705	(
706	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
707
708	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
709	'&mov	($a,$a1)',
710	'&mov	($a4,$f)',
711
712	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
713	'&xor	($a0,$e)',
714	'&xor	($a4,$g)',			# f^g
715
716	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
717	'&xor	($a1,$a)',
718	'&and	($a4,$e)',			# (f^g)&e
719
720	'&xor	($a0,$e)',
721	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
722	'&mov	($a2,$a)',
723
724	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
725	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
726	'&xor	($a2,$b)',			# a^b, b^c in next round
727
728	'&add	($h,$a4)',			# h+=Ch(e,f,g)
729	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
730	'&and	($a3,$a2)',			# (b^c)&(a^b)
731
732	'&xor	($a1,$a)',
733	'&add	($h,$a0)',			# h+=Sigma1(e)
734	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
735
736	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
737	'&add	($d,$h)',			# d+=h
738	'&add	($h,$a3)',			# h+=Maj(a,b,c)
739
740	'&mov	($a0,$d)',
741	'&add	($a1,$h);'.			# h+=Sigma0(a)
742	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
743	);
744}
745
746######################################################################
747# SSSE3 code path
748#
749if ($SZ==4) {	# SHA256 only
750my @X = map("%xmm$_",(0..3));
751my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
752
753$code.=<<___;
754.type	${func}_ssse3,\@function,3
755.align	64
756${func}_ssse3:
757.Lssse3_shortcut:
758	mov	%rsp,%rax		# copy %rsp
759	push	%rbx
760	push	%rbp
761	push	%r12
762	push	%r13
763	push	%r14
764	push	%r15
765	shl	\$4,%rdx		# num*16
766	sub	\$`$framesz+$win64*16*4`,%rsp
767	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
768	and	\$-64,%rsp		# align stack frame
769	mov	$ctx,$_ctx		# save ctx, 1st arg
770	mov	$inp,$_inp		# save inp, 2nd arh
771	mov	%rdx,$_end		# save end pointer, "3rd" arg
772	mov	%rax,$_rsp		# save copy of %rsp
773___
774$code.=<<___ if ($win64);
775	movaps	%xmm6,16*$SZ+32(%rsp)
776	movaps	%xmm7,16*$SZ+48(%rsp)
777	movaps	%xmm8,16*$SZ+64(%rsp)
778	movaps	%xmm9,16*$SZ+80(%rsp)
779___
780$code.=<<___;
781.Lprologue_ssse3:
782
783	mov	$SZ*0($ctx),$A
784	mov	$SZ*1($ctx),$B
785	mov	$SZ*2($ctx),$C
786	mov	$SZ*3($ctx),$D
787	mov	$SZ*4($ctx),$E
788	mov	$SZ*5($ctx),$F
789	mov	$SZ*6($ctx),$G
790	mov	$SZ*7($ctx),$H
791___
792
793$code.=<<___;
794	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
795	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
796	jmp	.Lloop_ssse3
797.align	16
798.Lloop_ssse3:
799	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
800	movdqu	0x00($inp),@X[0]
801	movdqu	0x10($inp),@X[1]
802	movdqu	0x20($inp),@X[2]
803	pshufb	$t3,@X[0]
804	movdqu	0x30($inp),@X[3]
805	lea	$TABLE(%rip),$Tbl
806	pshufb	$t3,@X[1]
807	movdqa	0x00($Tbl),$t0
808	movdqa	0x20($Tbl),$t1
809	pshufb	$t3,@X[2]
810	paddd	@X[0],$t0
811	movdqa	0x40($Tbl),$t2
812	pshufb	$t3,@X[3]
813	movdqa	0x60($Tbl),$t3
814	paddd	@X[1],$t1
815	paddd	@X[2],$t2
816	paddd	@X[3],$t3
817	movdqa	$t0,0x00(%rsp)
818	mov	$A,$a1
819	movdqa	$t1,0x10(%rsp)
820	mov	$B,$a3
821	movdqa	$t2,0x20(%rsp)
822	xor	$C,$a3			# magic
823	movdqa	$t3,0x30(%rsp)
824	mov	$E,$a0
825	jmp	.Lssse3_00_47
826
827.align	16
828.Lssse3_00_47:
829	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
830___
831sub Xupdate_256_SSSE3 () {
832	(
833	'&movdqa	($t0,@X[1]);',
834	'&movdqa	($t3,@X[3])',
835	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
836	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
837	'&movdqa	($t1,$t0)',
838	'&movdqa	($t2,$t0);',
839	'&psrld		($t0,$sigma0[2])',
840	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
841	'&psrld		($t2,$sigma0[0])',
842	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
843	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
844	'&pxor		($t0,$t2)',
845	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
846	'&pxor		($t0,$t1)',
847	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
848	'&pxor		($t0,$t2);',
849	 '&movdqa	($t2,$t3)',
850	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
851	 '&psrld	($t3,$sigma1[2])',
852	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
853	 '&psrlq	($t2,$sigma1[0])',
854	 '&pxor		($t3,$t2);',
855	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
856	 '&pxor		($t3,$t2)',
857	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
858	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
859	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
860	 '&movdqa	($t2,$t3);',
861	 '&psrld	($t3,$sigma1[2])',
862	 '&psrlq	($t2,$sigma1[0])',
863	 '&pxor		($t3,$t2);',
864	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
865	 '&pxor		($t3,$t2);',
866	'&movdqa	($t2,16*2*$j."($Tbl)")',
867	 '&pshufb	($t3,$t5)',
868	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
869	);
870}
871
872sub SSSE3_256_00_47 () {
873my $j = shift;
874my $body = shift;
875my @X = @_;
876my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
877
878    if (0) {
879	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
880	    eval;
881	    eval(shift(@insns));
882	    eval(shift(@insns));
883	    eval(shift(@insns));
884	}
885    } else {			# squeeze extra 4% on Westmere and 19% on Atom
886	  eval(shift(@insns));	#@
887	&movdqa		($t0,@X[1]);
888	  eval(shift(@insns));
889	  eval(shift(@insns));
890	&movdqa		($t3,@X[3]);
891	  eval(shift(@insns));	#@
892	  eval(shift(@insns));
893	  eval(shift(@insns));
894	  eval(shift(@insns));	#@
895	  eval(shift(@insns));
896	&palignr	($t0,@X[0],$SZ);	# X[1..4]
897	  eval(shift(@insns));
898	  eval(shift(@insns));
899	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
900	  eval(shift(@insns));
901	  eval(shift(@insns));
902	  eval(shift(@insns));
903	  eval(shift(@insns));	#@
904	&movdqa		($t1,$t0);
905	  eval(shift(@insns));
906	  eval(shift(@insns));
907	&movdqa		($t2,$t0);
908	  eval(shift(@insns));	#@
909	  eval(shift(@insns));
910	&psrld		($t0,$sigma0[2]);
911	  eval(shift(@insns));
912	  eval(shift(@insns));
913	  eval(shift(@insns));
914	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
915	  eval(shift(@insns));	#@
916	  eval(shift(@insns));
917	&psrld		($t2,$sigma0[0]);
918	  eval(shift(@insns));
919	  eval(shift(@insns));
920	 &pshufd	($t3,@X[3],0b11111010);	# X[4..15]
921	  eval(shift(@insns));
922	  eval(shift(@insns));	#@
923	&pslld		($t1,8*$SZ-$sigma0[1]);
924	  eval(shift(@insns));
925	  eval(shift(@insns));
926	&pxor		($t0,$t2);
927	  eval(shift(@insns));	#@
928	  eval(shift(@insns));
929	  eval(shift(@insns));
930	  eval(shift(@insns));	#@
931	&psrld		($t2,$sigma0[1]-$sigma0[0]);
932	  eval(shift(@insns));
933	&pxor		($t0,$t1);
934	  eval(shift(@insns));
935	  eval(shift(@insns));
936	&pslld		($t1,$sigma0[1]-$sigma0[0]);
937	  eval(shift(@insns));
938	  eval(shift(@insns));
939	&pxor		($t0,$t2);
940	  eval(shift(@insns));
941	  eval(shift(@insns));	#@
942	 &movdqa	($t2,$t3);
943	  eval(shift(@insns));
944	  eval(shift(@insns));
945	&pxor		($t0,$t1);		# sigma0(X[1..4])
946	  eval(shift(@insns));	#@
947	  eval(shift(@insns));
948	  eval(shift(@insns));
949	 &psrld		($t3,$sigma1[2]);
950	  eval(shift(@insns));
951	  eval(shift(@insns));
952	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
953	  eval(shift(@insns));	#@
954	  eval(shift(@insns));
955	 &psrlq		($t2,$sigma1[0]);
956	  eval(shift(@insns));
957	  eval(shift(@insns));
958	  eval(shift(@insns));
959	 &pxor		($t3,$t2);
960	  eval(shift(@insns));	#@
961	  eval(shift(@insns));
962	  eval(shift(@insns));
963	  eval(shift(@insns));	#@
964	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
965	  eval(shift(@insns));
966	  eval(shift(@insns));
967	 &pxor		($t3,$t2);
968	  eval(shift(@insns));	#@
969	  eval(shift(@insns));
970	  eval(shift(@insns));
971	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
972	 &pshufd	($t3,$t3,0b10000000);
973	  eval(shift(@insns));
974	  eval(shift(@insns));
975	  eval(shift(@insns));
976	 &psrldq	($t3,8);
977	  eval(shift(@insns));
978	  eval(shift(@insns));	#@
979	  eval(shift(@insns));
980	  eval(shift(@insns));
981	  eval(shift(@insns));	#@
982	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
983	  eval(shift(@insns));
984	  eval(shift(@insns));
985	  eval(shift(@insns));
986	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
987	  eval(shift(@insns));
988	  eval(shift(@insns));	#@
989	  eval(shift(@insns));
990	 &movdqa	($t2,$t3);
991	  eval(shift(@insns));
992	  eval(shift(@insns));
993	 &psrld		($t3,$sigma1[2]);
994	  eval(shift(@insns));
995	  eval(shift(@insns));	#@
996	 &psrlq		($t2,$sigma1[0]);
997	  eval(shift(@insns));
998	  eval(shift(@insns));
999	 &pxor		($t3,$t2);
1000	  eval(shift(@insns));	#@
1001	  eval(shift(@insns));
1002	  eval(shift(@insns));
1003	  eval(shift(@insns));	#@
1004	  eval(shift(@insns));
1005	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1006	  eval(shift(@insns));
1007	  eval(shift(@insns));
1008	  eval(shift(@insns));
1009	 &pxor		($t3,$t2);
1010	  eval(shift(@insns));
1011	  eval(shift(@insns));
1012	  eval(shift(@insns));	#@
1013	 #&pshufb	($t3,$t5);
1014	 &pshufd	($t3,$t3,0b00001000);
1015	  eval(shift(@insns));
1016	  eval(shift(@insns));
1017	&movdqa		($t2,16*2*$j."($Tbl)");
1018	  eval(shift(@insns));	#@
1019	  eval(shift(@insns));
1020	 &pslldq	($t3,8);
1021	  eval(shift(@insns));
1022	  eval(shift(@insns));
1023	  eval(shift(@insns));
1024	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
1025	  eval(shift(@insns));	#@
1026	  eval(shift(@insns));
1027	  eval(shift(@insns));
1028    }
1029	&paddd		($t2,@X[0]);
1030	  foreach (@insns) { eval; }		# remaining instructions
1031	&movdqa		(16*$j."(%rsp)",$t2);
1032}
1033
1034    for ($i=0,$j=0; $j<4; $j++) {
1035	&SSSE3_256_00_47($j,\&body_00_15,@X);
1036	push(@X,shift(@X));			# rotate(@X)
1037    }
1038	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1039	&jne	(".Lssse3_00_47");
1040
1041    for ($i=0; $i<16; ) {
1042	foreach(body_00_15()) { eval; }
1043    }
1044$code.=<<___;
1045	mov	$_ctx,$ctx
1046	mov	$a1,$A
1047
1048	add	$SZ*0($ctx),$A
1049	lea	16*$SZ($inp),$inp
1050	add	$SZ*1($ctx),$B
1051	add	$SZ*2($ctx),$C
1052	add	$SZ*3($ctx),$D
1053	add	$SZ*4($ctx),$E
1054	add	$SZ*5($ctx),$F
1055	add	$SZ*6($ctx),$G
1056	add	$SZ*7($ctx),$H
1057
1058	cmp	$_end,$inp
1059
1060	mov	$A,$SZ*0($ctx)
1061	mov	$B,$SZ*1($ctx)
1062	mov	$C,$SZ*2($ctx)
1063	mov	$D,$SZ*3($ctx)
1064	mov	$E,$SZ*4($ctx)
1065	mov	$F,$SZ*5($ctx)
1066	mov	$G,$SZ*6($ctx)
1067	mov	$H,$SZ*7($ctx)
1068	jb	.Lloop_ssse3
1069
1070	mov	$_rsp,%rsi
1071___
1072$code.=<<___ if ($win64);
1073	movaps	16*$SZ+32(%rsp),%xmm6
1074	movaps	16*$SZ+48(%rsp),%xmm7
1075	movaps	16*$SZ+64(%rsp),%xmm8
1076	movaps	16*$SZ+80(%rsp),%xmm9
1077___
1078$code.=<<___;
1079	mov	-48(%rsi),%r15
1080	mov	-40(%rsi),%r14
1081	mov	-32(%rsi),%r13
1082	mov	-24(%rsi),%r12
1083	mov	-16(%rsi),%rbp
1084	mov	-8(%rsi),%rbx
1085	lea	(%rsi),%rsp
1086.Lepilogue_ssse3:
1087	ret
1088.size	${func}_ssse3,.-${func}_ssse3
1089___
1090}
1091
1092if ($avx) {{
1093######################################################################
1094# XOP code path
1095#
1096if ($SZ==8) {	# SHA512 only
1097$code.=<<___;
1098.type	${func}_xop,\@function,3
1099.align	64
1100${func}_xop:
1101.Lxop_shortcut:
1102	mov	%rsp,%rax		# copy %rsp
1103	push	%rbx
1104	push	%rbp
1105	push	%r12
1106	push	%r13
1107	push	%r14
1108	push	%r15
1109	shl	\$4,%rdx		# num*16
1110	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1111	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1112	and	\$-64,%rsp		# align stack frame
1113	mov	$ctx,$_ctx		# save ctx, 1st arg
1114	mov	$inp,$_inp		# save inp, 2nd arh
1115	mov	%rdx,$_end		# save end pointer, "3rd" arg
1116	mov	%rax,$_rsp		# save copy of %rsp
1117___
1118$code.=<<___ if ($win64);
1119	movaps	%xmm6,16*$SZ+32(%rsp)
1120	movaps	%xmm7,16*$SZ+48(%rsp)
1121	movaps	%xmm8,16*$SZ+64(%rsp)
1122	movaps	%xmm9,16*$SZ+80(%rsp)
1123___
1124$code.=<<___ if ($win64 && $SZ>4);
1125	movaps	%xmm10,16*$SZ+96(%rsp)
1126	movaps	%xmm11,16*$SZ+112(%rsp)
1127___
1128$code.=<<___;
1129.Lprologue_xop:
1130
1131	vzeroupper
1132	mov	$SZ*0($ctx),$A
1133	mov	$SZ*1($ctx),$B
1134	mov	$SZ*2($ctx),$C
1135	mov	$SZ*3($ctx),$D
1136	mov	$SZ*4($ctx),$E
1137	mov	$SZ*5($ctx),$F
1138	mov	$SZ*6($ctx),$G
1139	mov	$SZ*7($ctx),$H
1140	jmp	.Lloop_xop
1141___
1142					if ($SZ==4) {	# SHA256
1143    my @X = map("%xmm$_",(0..3));
1144    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1145
1146$code.=<<___;
1147.align	16
1148.Lloop_xop:
1149	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1150	vmovdqu	0x00($inp),@X[0]
1151	vmovdqu	0x10($inp),@X[1]
1152	vmovdqu	0x20($inp),@X[2]
1153	vmovdqu	0x30($inp),@X[3]
1154	vpshufb	$t3,@X[0],@X[0]
1155	lea	$TABLE(%rip),$Tbl
1156	vpshufb	$t3,@X[1],@X[1]
1157	vpshufb	$t3,@X[2],@X[2]
1158	vpaddd	0x00($Tbl),@X[0],$t0
1159	vpshufb	$t3,@X[3],@X[3]
1160	vpaddd	0x20($Tbl),@X[1],$t1
1161	vpaddd	0x40($Tbl),@X[2],$t2
1162	vpaddd	0x60($Tbl),@X[3],$t3
1163	vmovdqa	$t0,0x00(%rsp)
1164	mov	$A,$a1
1165	vmovdqa	$t1,0x10(%rsp)
1166	mov	$B,$a3
1167	vmovdqa	$t2,0x20(%rsp)
1168	xor	$C,$a3			# magic
1169	vmovdqa	$t3,0x30(%rsp)
1170	mov	$E,$a0
1171	jmp	.Lxop_00_47
1172
1173.align	16
1174.Lxop_00_47:
1175	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1176___
1177sub XOP_256_00_47 () {
1178my $j = shift;
1179my $body = shift;
1180my @X = @_;
1181my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1182
1183	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
1184	  eval(shift(@insns));
1185	  eval(shift(@insns));
1186	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
1187	  eval(shift(@insns));
1188	  eval(shift(@insns));
1189	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
1190	  eval(shift(@insns));
1191	  eval(shift(@insns));
1192	&vpsrld		($t0,$t0,$sigma0[2]);
1193	  eval(shift(@insns));
1194	  eval(shift(@insns));
1195	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
1196	  eval(shift(@insns));
1197	  eval(shift(@insns));
1198	  eval(shift(@insns));
1199	  eval(shift(@insns));
1200	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
1201	  eval(shift(@insns));
1202	  eval(shift(@insns));
1203	&vpxor		($t0,$t0,$t1);
1204	  eval(shift(@insns));
1205	  eval(shift(@insns));
1206	  eval(shift(@insns));
1207	  eval(shift(@insns));
1208	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
1209	  eval(shift(@insns));
1210	  eval(shift(@insns));
1211	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
1212	  eval(shift(@insns));
1213	  eval(shift(@insns));
1214	 &vpsrld	($t2,@X[3],$sigma1[2]);
1215	  eval(shift(@insns));
1216	  eval(shift(@insns));
1217	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
1218	  eval(shift(@insns));
1219	  eval(shift(@insns));
1220	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1221	  eval(shift(@insns));
1222	  eval(shift(@insns));
1223	 &vpxor		($t3,$t3,$t2);
1224	  eval(shift(@insns));
1225	  eval(shift(@insns));
1226	  eval(shift(@insns));
1227	  eval(shift(@insns));
1228	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1229	  eval(shift(@insns));
1230	  eval(shift(@insns));
1231	  eval(shift(@insns));
1232	  eval(shift(@insns));
1233	&vpsrldq	($t3,$t3,8);
1234	  eval(shift(@insns));
1235	  eval(shift(@insns));
1236	  eval(shift(@insns));
1237	  eval(shift(@insns));
1238	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1239	  eval(shift(@insns));
1240	  eval(shift(@insns));
1241	  eval(shift(@insns));
1242	  eval(shift(@insns));
1243	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
1244	  eval(shift(@insns));
1245	  eval(shift(@insns));
1246	 &vpsrld	($t2,@X[0],$sigma1[2]);
1247	  eval(shift(@insns));
1248	  eval(shift(@insns));
1249	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1250	  eval(shift(@insns));
1251	  eval(shift(@insns));
1252	 &vpxor		($t3,$t3,$t2);
1253	  eval(shift(@insns));
1254	  eval(shift(@insns));
1255	  eval(shift(@insns));
1256	  eval(shift(@insns));
1257	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
1258	  eval(shift(@insns));
1259	  eval(shift(@insns));
1260	  eval(shift(@insns));
1261	  eval(shift(@insns));
1262	&vpslldq	($t3,$t3,8);		# 22 instructions
1263	  eval(shift(@insns));
1264	  eval(shift(@insns));
1265	  eval(shift(@insns));
1266	  eval(shift(@insns));
1267	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
1268	  eval(shift(@insns));
1269	  eval(shift(@insns));
1270	  eval(shift(@insns));
1271	  eval(shift(@insns));
1272	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1273	  foreach (@insns) { eval; }		# remaining instructions
1274	&vmovdqa	(16*$j."(%rsp)",$t2);
1275}
1276
1277    for ($i=0,$j=0; $j<4; $j++) {
1278	&XOP_256_00_47($j,\&body_00_15,@X);
1279	push(@X,shift(@X));			# rotate(@X)
1280    }
1281	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1282	&jne	(".Lxop_00_47");
1283
1284    for ($i=0; $i<16; ) {
1285	foreach(body_00_15()) { eval; }
1286    }
1287
1288					} else {	# SHA512
1289    my @X = map("%xmm$_",(0..7));
1290    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1291
1292$code.=<<___;
1293.align	16
1294.Lloop_xop:
1295	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1296	vmovdqu	0x00($inp),@X[0]
1297	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1298	vmovdqu	0x10($inp),@X[1]
1299	vmovdqu	0x20($inp),@X[2]
1300	vpshufb	$t3,@X[0],@X[0]
1301	vmovdqu	0x30($inp),@X[3]
1302	vpshufb	$t3,@X[1],@X[1]
1303	vmovdqu	0x40($inp),@X[4]
1304	vpshufb	$t3,@X[2],@X[2]
1305	vmovdqu	0x50($inp),@X[5]
1306	vpshufb	$t3,@X[3],@X[3]
1307	vmovdqu	0x60($inp),@X[6]
1308	vpshufb	$t3,@X[4],@X[4]
1309	vmovdqu	0x70($inp),@X[7]
1310	vpshufb	$t3,@X[5],@X[5]
1311	vpaddq	-0x80($Tbl),@X[0],$t0
1312	vpshufb	$t3,@X[6],@X[6]
1313	vpaddq	-0x60($Tbl),@X[1],$t1
1314	vpshufb	$t3,@X[7],@X[7]
1315	vpaddq	-0x40($Tbl),@X[2],$t2
1316	vpaddq	-0x20($Tbl),@X[3],$t3
1317	vmovdqa	$t0,0x00(%rsp)
1318	vpaddq	0x00($Tbl),@X[4],$t0
1319	vmovdqa	$t1,0x10(%rsp)
1320	vpaddq	0x20($Tbl),@X[5],$t1
1321	vmovdqa	$t2,0x20(%rsp)
1322	vpaddq	0x40($Tbl),@X[6],$t2
1323	vmovdqa	$t3,0x30(%rsp)
1324	vpaddq	0x60($Tbl),@X[7],$t3
1325	vmovdqa	$t0,0x40(%rsp)
1326	mov	$A,$a1
1327	vmovdqa	$t1,0x50(%rsp)
1328	mov	$B,$a3
1329	vmovdqa	$t2,0x60(%rsp)
1330	xor	$C,$a3			# magic
1331	vmovdqa	$t3,0x70(%rsp)
1332	mov	$E,$a0
1333	jmp	.Lxop_00_47
1334
1335.align	16
1336.Lxop_00_47:
1337	add	\$`16*2*$SZ`,$Tbl
1338___
1339sub XOP_512_00_47 () {
1340my $j = shift;
1341my $body = shift;
1342my @X = @_;
1343my @insns = (&$body,&$body);			# 52 instructions
1344
1345	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
1346	  eval(shift(@insns));
1347	  eval(shift(@insns));
1348	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
1349	  eval(shift(@insns));
1350	  eval(shift(@insns));
1351	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
1352	  eval(shift(@insns));
1353	  eval(shift(@insns));
1354	&vpsrlq		($t0,$t0,$sigma0[2]);
1355	  eval(shift(@insns));
1356	  eval(shift(@insns));
1357	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
1358	  eval(shift(@insns));
1359	  eval(shift(@insns));
1360	  eval(shift(@insns));
1361	  eval(shift(@insns));
1362	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
1363	  eval(shift(@insns));
1364	  eval(shift(@insns));
1365	&vpxor		($t0,$t0,$t1);
1366	  eval(shift(@insns));
1367	  eval(shift(@insns));
1368	  eval(shift(@insns));
1369	  eval(shift(@insns));
1370	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
1371	  eval(shift(@insns));
1372	  eval(shift(@insns));
1373	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
1374	  eval(shift(@insns));
1375	  eval(shift(@insns));
1376	 &vpsrlq	($t2,@X[7],$sigma1[2]);
1377	  eval(shift(@insns));
1378	  eval(shift(@insns));
1379	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
1380	  eval(shift(@insns));
1381	  eval(shift(@insns));
1382	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
1383	  eval(shift(@insns));
1384	  eval(shift(@insns));
1385	 &vpxor		($t3,$t3,$t2);
1386	  eval(shift(@insns));
1387	  eval(shift(@insns));
1388	  eval(shift(@insns));
1389	  eval(shift(@insns));
1390	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1391	  eval(shift(@insns));
1392	  eval(shift(@insns));
1393	  eval(shift(@insns));
1394	  eval(shift(@insns));
1395	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1396	  eval(shift(@insns));
1397	  eval(shift(@insns));
1398	  eval(shift(@insns));
1399	  eval(shift(@insns));
1400	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1401	  foreach (@insns) { eval; }		# remaining instructions
1402	&vmovdqa	(16*$j."(%rsp)",$t2);
1403}
1404
1405    for ($i=0,$j=0; $j<8; $j++) {
1406	&XOP_512_00_47($j,\&body_00_15,@X);
1407	push(@X,shift(@X));			# rotate(@X)
1408    }
1409	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1410	&jne	(".Lxop_00_47");
1411
1412    for ($i=0; $i<16; ) {
1413	foreach(body_00_15()) { eval; }
1414    }
1415}
1416$code.=<<___;
1417	mov	$_ctx,$ctx
1418	mov	$a1,$A
1419
1420	add	$SZ*0($ctx),$A
1421	lea	16*$SZ($inp),$inp
1422	add	$SZ*1($ctx),$B
1423	add	$SZ*2($ctx),$C
1424	add	$SZ*3($ctx),$D
1425	add	$SZ*4($ctx),$E
1426	add	$SZ*5($ctx),$F
1427	add	$SZ*6($ctx),$G
1428	add	$SZ*7($ctx),$H
1429
1430	cmp	$_end,$inp
1431
1432	mov	$A,$SZ*0($ctx)
1433	mov	$B,$SZ*1($ctx)
1434	mov	$C,$SZ*2($ctx)
1435	mov	$D,$SZ*3($ctx)
1436	mov	$E,$SZ*4($ctx)
1437	mov	$F,$SZ*5($ctx)
1438	mov	$G,$SZ*6($ctx)
1439	mov	$H,$SZ*7($ctx)
1440	jb	.Lloop_xop
1441
1442	mov	$_rsp,%rsi
1443	vzeroupper
1444___
1445$code.=<<___ if ($win64);
1446	movaps	16*$SZ+32(%rsp),%xmm6
1447	movaps	16*$SZ+48(%rsp),%xmm7
1448	movaps	16*$SZ+64(%rsp),%xmm8
1449	movaps	16*$SZ+80(%rsp),%xmm9
1450___
1451$code.=<<___ if ($win64 && $SZ>4);
1452	movaps	16*$SZ+96(%rsp),%xmm10
1453	movaps	16*$SZ+112(%rsp),%xmm11
1454___
1455$code.=<<___;
1456	mov	-48(%rsi),%r15
1457	mov	-40(%rsi),%r14
1458	mov	-32(%rsi),%r13
1459	mov	-24(%rsi),%r12
1460	mov	-16(%rsi),%rbp
1461	mov	-8(%rsi),%rbx
1462	lea	(%rsi),%rsp
1463.Lepilogue_xop:
1464	ret
1465.size	${func}_xop,.-${func}_xop
1466___
1467}
1468######################################################################
1469# AVX+shrd code path
1470#
1471local *ror = sub { &shrd(@_[0],@_) };
1472
1473$code.=<<___;
1474.type	${func}_avx,\@function,3
1475.align	64
1476${func}_avx:
1477.Lavx_shortcut:
1478	mov	%rsp,%rax		# copy %rsp
1479	push	%rbx
1480	push	%rbp
1481	push	%r12
1482	push	%r13
1483	push	%r14
1484	push	%r15
1485	shl	\$4,%rdx		# num*16
1486	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1487	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1488	and	\$-64,%rsp		# align stack frame
1489	mov	$ctx,$_ctx		# save ctx, 1st arg
1490	mov	$inp,$_inp		# save inp, 2nd arh
1491	mov	%rdx,$_end		# save end pointer, "3rd" arg
1492	mov	%rax,$_rsp		# save copy of %rsp
1493___
1494$code.=<<___ if ($win64);
1495	movaps	%xmm6,16*$SZ+32(%rsp)
1496	movaps	%xmm7,16*$SZ+48(%rsp)
1497	movaps	%xmm8,16*$SZ+64(%rsp)
1498	movaps	%xmm9,16*$SZ+80(%rsp)
1499___
1500$code.=<<___ if ($win64 && $SZ>4);
1501	movaps	%xmm10,16*$SZ+96(%rsp)
1502	movaps	%xmm11,16*$SZ+112(%rsp)
1503___
1504$code.=<<___;
1505.Lprologue_avx:
1506
1507	vzeroupper
1508	mov	$SZ*0($ctx),$A
1509	mov	$SZ*1($ctx),$B
1510	mov	$SZ*2($ctx),$C
1511	mov	$SZ*3($ctx),$D
1512	mov	$SZ*4($ctx),$E
1513	mov	$SZ*5($ctx),$F
1514	mov	$SZ*6($ctx),$G
1515	mov	$SZ*7($ctx),$H
1516___
1517					if ($SZ==4) {	# SHA256
1518    my @X = map("%xmm$_",(0..3));
1519    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1520
1521$code.=<<___;
1522	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1523	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1524	jmp	.Lloop_avx
1525.align	16
1526.Lloop_avx:
1527	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1528	vmovdqu	0x00($inp),@X[0]
1529	vmovdqu	0x10($inp),@X[1]
1530	vmovdqu	0x20($inp),@X[2]
1531	vmovdqu	0x30($inp),@X[3]
1532	vpshufb	$t3,@X[0],@X[0]
1533	lea	$TABLE(%rip),$Tbl
1534	vpshufb	$t3,@X[1],@X[1]
1535	vpshufb	$t3,@X[2],@X[2]
1536	vpaddd	0x00($Tbl),@X[0],$t0
1537	vpshufb	$t3,@X[3],@X[3]
1538	vpaddd	0x20($Tbl),@X[1],$t1
1539	vpaddd	0x40($Tbl),@X[2],$t2
1540	vpaddd	0x60($Tbl),@X[3],$t3
1541	vmovdqa	$t0,0x00(%rsp)
1542	mov	$A,$a1
1543	vmovdqa	$t1,0x10(%rsp)
1544	mov	$B,$a3
1545	vmovdqa	$t2,0x20(%rsp)
1546	xor	$C,$a3			# magic
1547	vmovdqa	$t3,0x30(%rsp)
1548	mov	$E,$a0
1549	jmp	.Lavx_00_47
1550
1551.align	16
1552.Lavx_00_47:
1553	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1554___
1555sub Xupdate_256_AVX () {
1556	(
1557	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
1558	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
1559	'&vpsrld	($t2,$t0,$sigma0[0]);',
1560	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
1561	'&vpsrld	($t3,$t0,$sigma0[2])',
1562	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
1563	'&vpxor		($t0,$t3,$t2)',
1564	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1565	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1566	'&vpxor		($t0,$t0,$t1)',
1567	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1568	'&vpxor		($t0,$t0,$t2)',
1569	 '&vpsrld	($t2,$t3,$sigma1[2]);',
1570	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
1571	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
1572	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
1573	 '&vpxor	($t2,$t2,$t3);',
1574	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1575	 '&vpxor	($t2,$t2,$t3)',
1576	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
1577	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
1578	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1579	 '&vpsrld	($t2,$t3,$sigma1[2])',
1580	 '&vpsrlq	($t3,$t3,$sigma1[0])',
1581	 '&vpxor	($t2,$t2,$t3);',
1582	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1583	 '&vpxor	($t2,$t2,$t3)',
1584	 '&vpshufb	($t2,$t2,$t5)',
1585	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
1586	);
1587}
1588
1589sub AVX_256_00_47 () {
1590my $j = shift;
1591my $body = shift;
1592my @X = @_;
1593my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1594
1595	foreach (Xupdate_256_AVX()) {		# 29 instructions
1596	    eval;
1597	    eval(shift(@insns));
1598	    eval(shift(@insns));
1599	    eval(shift(@insns));
1600	}
1601	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1602	  foreach (@insns) { eval; }		# remaining instructions
1603	&vmovdqa	(16*$j."(%rsp)",$t2);
1604}
1605
1606    for ($i=0,$j=0; $j<4; $j++) {
1607	&AVX_256_00_47($j,\&body_00_15,@X);
1608	push(@X,shift(@X));			# rotate(@X)
1609    }
1610	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1611	&jne	(".Lavx_00_47");
1612
1613    for ($i=0; $i<16; ) {
1614	foreach(body_00_15()) { eval; }
1615    }
1616
1617					} else {	# SHA512
1618    my @X = map("%xmm$_",(0..7));
1619    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1620
1621$code.=<<___;
1622	jmp	.Lloop_avx
1623.align	16
1624.Lloop_avx:
1625	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1626	vmovdqu	0x00($inp),@X[0]
1627	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1628	vmovdqu	0x10($inp),@X[1]
1629	vmovdqu	0x20($inp),@X[2]
1630	vpshufb	$t3,@X[0],@X[0]
1631	vmovdqu	0x30($inp),@X[3]
1632	vpshufb	$t3,@X[1],@X[1]
1633	vmovdqu	0x40($inp),@X[4]
1634	vpshufb	$t3,@X[2],@X[2]
1635	vmovdqu	0x50($inp),@X[5]
1636	vpshufb	$t3,@X[3],@X[3]
1637	vmovdqu	0x60($inp),@X[6]
1638	vpshufb	$t3,@X[4],@X[4]
1639	vmovdqu	0x70($inp),@X[7]
1640	vpshufb	$t3,@X[5],@X[5]
1641	vpaddq	-0x80($Tbl),@X[0],$t0
1642	vpshufb	$t3,@X[6],@X[6]
1643	vpaddq	-0x60($Tbl),@X[1],$t1
1644	vpshufb	$t3,@X[7],@X[7]
1645	vpaddq	-0x40($Tbl),@X[2],$t2
1646	vpaddq	-0x20($Tbl),@X[3],$t3
1647	vmovdqa	$t0,0x00(%rsp)
1648	vpaddq	0x00($Tbl),@X[4],$t0
1649	vmovdqa	$t1,0x10(%rsp)
1650	vpaddq	0x20($Tbl),@X[5],$t1
1651	vmovdqa	$t2,0x20(%rsp)
1652	vpaddq	0x40($Tbl),@X[6],$t2
1653	vmovdqa	$t3,0x30(%rsp)
1654	vpaddq	0x60($Tbl),@X[7],$t3
1655	vmovdqa	$t0,0x40(%rsp)
1656	mov	$A,$a1
1657	vmovdqa	$t1,0x50(%rsp)
1658	mov	$B,$a3
1659	vmovdqa	$t2,0x60(%rsp)
1660	xor	$C,$a3			# magic
1661	vmovdqa	$t3,0x70(%rsp)
1662	mov	$E,$a0
1663	jmp	.Lavx_00_47
1664
1665.align	16
1666.Lavx_00_47:
1667	add	\$`16*2*$SZ`,$Tbl
1668___
1669sub Xupdate_512_AVX () {
1670	(
1671	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
1672	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
1673	'&vpsrlq	($t2,$t0,$sigma0[0])',
1674	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
1675	'&vpsrlq	($t3,$t0,$sigma0[2])',
1676	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
1677	 '&vpxor	($t0,$t3,$t2)',
1678	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1679	 '&vpxor	($t0,$t0,$t1)',
1680	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1681	 '&vpxor	($t0,$t0,$t2)',
1682	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
1683	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
1684	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
1685	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
1686	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
1687	 '&vpxor	($t3,$t3,$t2)',
1688	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
1689	 '&vpxor	($t3,$t3,$t1)',
1690	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
1691	 '&vpxor	($t3,$t3,$t2)',
1692	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
1693	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
1694	);
1695}
1696
1697sub AVX_512_00_47 () {
1698my $j = shift;
1699my $body = shift;
1700my @X = @_;
1701my @insns = (&$body,&$body);			# 52 instructions
1702
1703	foreach (Xupdate_512_AVX()) {		# 23 instructions
1704	    eval;
1705	    eval(shift(@insns));
1706	    eval(shift(@insns));
1707	}
1708	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1709	  foreach (@insns) { eval; }		# remaining instructions
1710	&vmovdqa	(16*$j."(%rsp)",$t2);
1711}
1712
1713    for ($i=0,$j=0; $j<8; $j++) {
1714	&AVX_512_00_47($j,\&body_00_15,@X);
1715	push(@X,shift(@X));			# rotate(@X)
1716    }
1717	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1718	&jne	(".Lavx_00_47");
1719
1720    for ($i=0; $i<16; ) {
1721	foreach(body_00_15()) { eval; }
1722    }
1723}
1724$code.=<<___;
1725	mov	$_ctx,$ctx
1726	mov	$a1,$A
1727
1728	add	$SZ*0($ctx),$A
1729	lea	16*$SZ($inp),$inp
1730	add	$SZ*1($ctx),$B
1731	add	$SZ*2($ctx),$C
1732	add	$SZ*3($ctx),$D
1733	add	$SZ*4($ctx),$E
1734	add	$SZ*5($ctx),$F
1735	add	$SZ*6($ctx),$G
1736	add	$SZ*7($ctx),$H
1737
1738	cmp	$_end,$inp
1739
1740	mov	$A,$SZ*0($ctx)
1741	mov	$B,$SZ*1($ctx)
1742	mov	$C,$SZ*2($ctx)
1743	mov	$D,$SZ*3($ctx)
1744	mov	$E,$SZ*4($ctx)
1745	mov	$F,$SZ*5($ctx)
1746	mov	$G,$SZ*6($ctx)
1747	mov	$H,$SZ*7($ctx)
1748	jb	.Lloop_avx
1749
1750	mov	$_rsp,%rsi
1751	vzeroupper
1752___
1753$code.=<<___ if ($win64);
1754	movaps	16*$SZ+32(%rsp),%xmm6
1755	movaps	16*$SZ+48(%rsp),%xmm7
1756	movaps	16*$SZ+64(%rsp),%xmm8
1757	movaps	16*$SZ+80(%rsp),%xmm9
1758___
1759$code.=<<___ if ($win64 && $SZ>4);
1760	movaps	16*$SZ+96(%rsp),%xmm10
1761	movaps	16*$SZ+112(%rsp),%xmm11
1762___
1763$code.=<<___;
1764	mov	-48(%rsi),%r15
1765	mov	-40(%rsi),%r14
1766	mov	-32(%rsi),%r13
1767	mov	-24(%rsi),%r12
1768	mov	-16(%rsi),%rbp
1769	mov	-8(%rsi),%rbx
1770	lea	(%rsi),%rsp
1771.Lepilogue_avx:
1772	ret
1773.size	${func}_avx,.-${func}_avx
1774___
1775
1776if ($avx>1) {{
1777######################################################################
1778# AVX2+BMI code path
1779#
1780my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
1781my $PUSH8=8*2*$SZ;
1782use integer;
1783
1784sub bodyx_00_15 () {
1785	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
1786	(
1787	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1788
1789	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
1790	'&and	($a4,$e)',		# f&e
1791	'&rorx	($a0,$e,$Sigma1[2])',
1792	'&rorx	($a2,$e,$Sigma1[1])',
1793
1794	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
1795	'&lea	($h,"($h,$a4)")',
1796	'&andn	($a4,$e,$g)',		# ~e&g
1797	'&xor	($a0,$a2)',
1798
1799	'&rorx	($a1,$e,$Sigma1[0])',
1800	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
1801	'&xor	($a0,$a1)',		# Sigma1(e)
1802	'&mov	($a2,$a)',
1803
1804	'&rorx	($a4,$a,$Sigma0[2])',
1805	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
1806	'&xor	($a2,$b)',		# a^b, b^c in next round
1807	'&rorx	($a1,$a,$Sigma0[1])',
1808
1809	'&rorx	($a0,$a,$Sigma0[0])',
1810	'&lea	($d,"($d,$h)")',	# d+=h
1811	'&and	($a3,$a2)',		# (b^c)&(a^b)
1812	'&xor	($a1,$a4)',
1813
1814	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
1815	'&xor	($a1,$a0)',		# Sigma0(a)
1816	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
1817	'&mov	($a4,$e)',		# copy of f in future
1818
1819	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1820	);
1821	# and at the finish one has to $a+=$a1
1822}
1823
1824$code.=<<___;
1825.type	${func}_avx2,\@function,3
1826.align	64
1827${func}_avx2:
1828.Lavx2_shortcut:
1829	mov	%rsp,%rax		# copy %rsp
1830	push	%rbx
1831	push	%rbp
1832	push	%r12
1833	push	%r13
1834	push	%r14
1835	push	%r15
1836	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1837	shl	\$4,%rdx		# num*16
1838	and	\$-256*$SZ,%rsp		# align stack frame
1839	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1840	add	\$`2*$SZ*($rounds-8)`,%rsp
1841	mov	$ctx,$_ctx		# save ctx, 1st arg
1842	mov	$inp,$_inp		# save inp, 2nd arh
1843	mov	%rdx,$_end		# save end pointer, "3rd" arg
1844	mov	%rax,$_rsp		# save copy of %rsp
1845___
1846$code.=<<___ if ($win64);
1847	movaps	%xmm6,16*$SZ+32(%rsp)
1848	movaps	%xmm7,16*$SZ+48(%rsp)
1849	movaps	%xmm8,16*$SZ+64(%rsp)
1850	movaps	%xmm9,16*$SZ+80(%rsp)
1851___
1852$code.=<<___ if ($win64 && $SZ>4);
1853	movaps	%xmm10,16*$SZ+96(%rsp)
1854	movaps	%xmm11,16*$SZ+112(%rsp)
1855___
1856$code.=<<___;
1857.Lprologue_avx2:
1858
1859	vzeroupper
1860	sub	\$-16*$SZ,$inp		# inp++, size optimization
1861	mov	$SZ*0($ctx),$A
1862	mov	$inp,%r12		# borrow $T1
1863	mov	$SZ*1($ctx),$B
1864	cmp	%rdx,$inp		# $_end
1865	mov	$SZ*2($ctx),$C
1866	cmove	%rsp,%r12		# next block or random data
1867	mov	$SZ*3($ctx),$D
1868	mov	$SZ*4($ctx),$E
1869	mov	$SZ*5($ctx),$F
1870	mov	$SZ*6($ctx),$G
1871	mov	$SZ*7($ctx),$H
1872___
1873					if ($SZ==4) {	# SHA256
1874    my @X = map("%ymm$_",(0..3));
1875    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1876
1877$code.=<<___;
1878	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1879	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1880	jmp	.Loop_avx2
1881.align	16
1882.Loop_avx2:
1883	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1884	vmovdqu	-16*$SZ+0($inp),%xmm0
1885	vmovdqu	-16*$SZ+16($inp),%xmm1
1886	vmovdqu	-16*$SZ+32($inp),%xmm2
1887	vmovdqu	-16*$SZ+48($inp),%xmm3
1888	#mov		$inp,$_inp	# offload $inp
1889	vinserti128	\$1,(%r12),@X[0],@X[0]
1890	vinserti128	\$1,16(%r12),@X[1],@X[1]
1891	vpshufb		$t3,@X[0],@X[0]
1892	vinserti128	\$1,32(%r12),@X[2],@X[2]
1893	vpshufb		$t3,@X[1],@X[1]
1894	vinserti128	\$1,48(%r12),@X[3],@X[3]
1895
1896	lea	$TABLE(%rip),$Tbl
1897	vpshufb	$t3,@X[2],@X[2]
1898	vpaddd	0x00($Tbl),@X[0],$t0
1899	vpshufb	$t3,@X[3],@X[3]
1900	vpaddd	0x20($Tbl),@X[1],$t1
1901	vpaddd	0x40($Tbl),@X[2],$t2
1902	vpaddd	0x60($Tbl),@X[3],$t3
1903	vmovdqa	$t0,0x00(%rsp)
1904	xor	$a1,$a1
1905	vmovdqa	$t1,0x20(%rsp)
1906	lea	-$PUSH8(%rsp),%rsp
1907	mov	$B,$a3
1908	vmovdqa	$t2,0x00(%rsp)
1909	xor	$C,$a3			# magic
1910	vmovdqa	$t3,0x20(%rsp)
1911	mov	$F,$a4
1912	sub	\$-16*2*$SZ,$Tbl	# size optimization
1913	jmp	.Lavx2_00_47
1914
1915.align	16
1916.Lavx2_00_47:
1917___
1918
1919sub AVX2_256_00_47 () {
1920my $j = shift;
1921my $body = shift;
1922my @X = @_;
1923my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1924my $base = "+2*$PUSH8(%rsp)";
1925
1926	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
1927	foreach (Xupdate_256_AVX()) {		# 29 instructions
1928	    eval;
1929	    eval(shift(@insns));
1930	    eval(shift(@insns));
1931	    eval(shift(@insns));
1932	}
1933	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1934	  foreach (@insns) { eval; }		# remaining instructions
1935	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1936}
1937
1938    for ($i=0,$j=0; $j<4; $j++) {
1939	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1940	push(@X,shift(@X));			# rotate(@X)
1941    }
1942	&lea	($Tbl,16*2*$SZ."($Tbl)");
1943	&cmpb	(($SZ-1)."($Tbl)",0);
1944	&jne	(".Lavx2_00_47");
1945
1946    for ($i=0; $i<16; ) {
1947	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1948	foreach(bodyx_00_15()) { eval; }
1949    }
1950					} else {	# SHA512
1951    my @X = map("%ymm$_",(0..7));
1952    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
1953
1954$code.=<<___;
1955	jmp	.Loop_avx2
1956.align	16
1957.Loop_avx2:
1958	vmovdqu	-16*$SZ($inp),%xmm0
1959	vmovdqu	-16*$SZ+16($inp),%xmm1
1960	vmovdqu	-16*$SZ+32($inp),%xmm2
1961	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1962	vmovdqu	-16*$SZ+48($inp),%xmm3
1963	vmovdqu	-16*$SZ+64($inp),%xmm4
1964	vmovdqu	-16*$SZ+80($inp),%xmm5
1965	vmovdqu	-16*$SZ+96($inp),%xmm6
1966	vmovdqu	-16*$SZ+112($inp),%xmm7
1967	#mov	$inp,$_inp	# offload $inp
1968	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
1969	vinserti128	\$1,(%r12),@X[0],@X[0]
1970	vinserti128	\$1,16(%r12),@X[1],@X[1]
1971	 vpshufb	$t2,@X[0],@X[0]
1972	vinserti128	\$1,32(%r12),@X[2],@X[2]
1973	 vpshufb	$t2,@X[1],@X[1]
1974	vinserti128	\$1,48(%r12),@X[3],@X[3]
1975	 vpshufb	$t2,@X[2],@X[2]
1976	vinserti128	\$1,64(%r12),@X[4],@X[4]
1977	 vpshufb	$t2,@X[3],@X[3]
1978	vinserti128	\$1,80(%r12),@X[5],@X[5]
1979	 vpshufb	$t2,@X[4],@X[4]
1980	vinserti128	\$1,96(%r12),@X[6],@X[6]
1981	 vpshufb	$t2,@X[5],@X[5]
1982	vinserti128	\$1,112(%r12),@X[7],@X[7]
1983
1984	vpaddq	-0x80($Tbl),@X[0],$t0
1985	vpshufb	$t2,@X[6],@X[6]
1986	vpaddq	-0x60($Tbl),@X[1],$t1
1987	vpshufb	$t2,@X[7],@X[7]
1988	vpaddq	-0x40($Tbl),@X[2],$t2
1989	vpaddq	-0x20($Tbl),@X[3],$t3
1990	vmovdqa	$t0,0x00(%rsp)
1991	vpaddq	0x00($Tbl),@X[4],$t0
1992	vmovdqa	$t1,0x20(%rsp)
1993	vpaddq	0x20($Tbl),@X[5],$t1
1994	vmovdqa	$t2,0x40(%rsp)
1995	vpaddq	0x40($Tbl),@X[6],$t2
1996	vmovdqa	$t3,0x60(%rsp)
1997	lea	-$PUSH8(%rsp),%rsp
1998	vpaddq	0x60($Tbl),@X[7],$t3
1999	vmovdqa	$t0,0x00(%rsp)
2000	xor	$a1,$a1
2001	vmovdqa	$t1,0x20(%rsp)
2002	mov	$B,$a3
2003	vmovdqa	$t2,0x40(%rsp)
2004	xor	$C,$a3			# magic
2005	vmovdqa	$t3,0x60(%rsp)
2006	mov	$F,$a4
2007	add	\$16*2*$SZ,$Tbl
2008	jmp	.Lavx2_00_47
2009
2010.align	16
2011.Lavx2_00_47:
2012___
2013
2014sub AVX2_512_00_47 () {
2015my $j = shift;
2016my $body = shift;
2017my @X = @_;
2018my @insns = (&$body,&$body);			# 48 instructions
2019my $base = "+2*$PUSH8(%rsp)";
2020
2021	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
2022	foreach (Xupdate_512_AVX()) {		# 23 instructions
2023	    eval;
2024	    if ($_ !~ /\;$/) {
2025		eval(shift(@insns));
2026		eval(shift(@insns));
2027		eval(shift(@insns));
2028	    }
2029	}
2030	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
2031	  foreach (@insns) { eval; }		# remaining instructions
2032	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
2033}
2034
2035    for ($i=0,$j=0; $j<8; $j++) {
2036	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
2037	push(@X,shift(@X));			# rotate(@X)
2038    }
2039	&lea	($Tbl,16*2*$SZ."($Tbl)");
2040	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
2041	&jne	(".Lavx2_00_47");
2042
2043    for ($i=0; $i<16; ) {
2044	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2045	foreach(bodyx_00_15()) { eval; }
2046    }
2047}
2048$code.=<<___;
2049	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2050	add	$a1,$A
2051	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2052	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
2053
2054	add	$SZ*0($ctx),$A
2055	add	$SZ*1($ctx),$B
2056	add	$SZ*2($ctx),$C
2057	add	$SZ*3($ctx),$D
2058	add	$SZ*4($ctx),$E
2059	add	$SZ*5($ctx),$F
2060	add	$SZ*6($ctx),$G
2061	add	$SZ*7($ctx),$H
2062
2063	mov	$A,$SZ*0($ctx)
2064	mov	$B,$SZ*1($ctx)
2065	mov	$C,$SZ*2($ctx)
2066	mov	$D,$SZ*3($ctx)
2067	mov	$E,$SZ*4($ctx)
2068	mov	$F,$SZ*5($ctx)
2069	mov	$G,$SZ*6($ctx)
2070	mov	$H,$SZ*7($ctx)
2071
2072	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
2073	je	.Ldone_avx2
2074
2075	xor	$a1,$a1
2076	mov	$B,$a3
2077	xor	$C,$a3			# magic
2078	mov	$F,$a4
2079	jmp	.Lower_avx2
2080.align	16
2081.Lower_avx2:
2082___
2083    for ($i=0; $i<8; ) {
2084	my $base="+16($Tbl)";
2085	foreach(bodyx_00_15()) { eval; }
2086    }
2087$code.=<<___;
2088	lea	-$PUSH8($Tbl),$Tbl
2089	cmp	%rsp,$Tbl
2090	jae	.Lower_avx2
2091
2092	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2093	add	$a1,$A
2094	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2095	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
2096
2097	add	$SZ*0($ctx),$A
2098	add	$SZ*1($ctx),$B
2099	add	$SZ*2($ctx),$C
2100	add	$SZ*3($ctx),$D
2101	add	$SZ*4($ctx),$E
2102	add	$SZ*5($ctx),$F
2103	lea	`2*16*$SZ`($inp),$inp	# inp+=2
2104	add	$SZ*6($ctx),$G
2105	mov	$inp,%r12
2106	add	$SZ*7($ctx),$H
2107	cmp	$_end,$inp
2108
2109	mov	$A,$SZ*0($ctx)
2110	cmove	%rsp,%r12		# next block or stale data
2111	mov	$B,$SZ*1($ctx)
2112	mov	$C,$SZ*2($ctx)
2113	mov	$D,$SZ*3($ctx)
2114	mov	$E,$SZ*4($ctx)
2115	mov	$F,$SZ*5($ctx)
2116	mov	$G,$SZ*6($ctx)
2117	mov	$H,$SZ*7($ctx)
2118
2119	jbe	.Loop_avx2
2120	lea	(%rsp),$Tbl
2121
2122.Ldone_avx2:
2123	lea	($Tbl),%rsp
2124	mov	$_rsp,%rsi
2125	vzeroupper
2126___
2127$code.=<<___ if ($win64);
2128	movaps	16*$SZ+32(%rsp),%xmm6
2129	movaps	16*$SZ+48(%rsp),%xmm7
2130	movaps	16*$SZ+64(%rsp),%xmm8
2131	movaps	16*$SZ+80(%rsp),%xmm9
2132___
2133$code.=<<___ if ($win64 && $SZ>4);
2134	movaps	16*$SZ+96(%rsp),%xmm10
2135	movaps	16*$SZ+112(%rsp),%xmm11
2136___
2137$code.=<<___;
2138	mov	-48(%rsi),%r15
2139	mov	-40(%rsi),%r14
2140	mov	-32(%rsi),%r13
2141	mov	-24(%rsi),%r12
2142	mov	-16(%rsi),%rbp
2143	mov	-8(%rsi),%rbx
2144	lea	(%rsi),%rsp
2145.Lepilogue_avx2:
2146	ret
2147.size	${func}_avx2,.-${func}_avx2
2148___
2149}}
2150}}}}}
2151
2152# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2153#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2154if ($win64) {
2155$rec="%rcx";
2156$frame="%rdx";
2157$context="%r8";
2158$disp="%r9";
2159
2160$code.=<<___;
2161.extern	__imp_RtlVirtualUnwind
2162.type	se_handler,\@abi-omnipotent
2163.align	16
2164se_handler:
2165	push	%rsi
2166	push	%rdi
2167	push	%rbx
2168	push	%rbp
2169	push	%r12
2170	push	%r13
2171	push	%r14
2172	push	%r15
2173	pushfq
2174	sub	\$64,%rsp
2175
2176	mov	120($context),%rax	# pull context->Rax
2177	mov	248($context),%rbx	# pull context->Rip
2178
2179	mov	8($disp),%rsi		# disp->ImageBase
2180	mov	56($disp),%r11		# disp->HanderlData
2181
2182	mov	0(%r11),%r10d		# HandlerData[0]
2183	lea	(%rsi,%r10),%r10	# prologue label
2184	cmp	%r10,%rbx		# context->Rip<prologue label
2185	jb	.Lin_prologue
2186
2187	mov	152($context),%rax	# pull context->Rsp
2188
2189	mov	4(%r11),%r10d		# HandlerData[1]
2190	lea	(%rsi,%r10),%r10	# epilogue label
2191	cmp	%r10,%rbx		# context->Rip>=epilogue label
2192	jae	.Lin_prologue
2193___
2194$code.=<<___ if ($avx>1);
2195	lea	.Lavx2_shortcut(%rip),%r10
2196	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
2197	jb	.Lnot_in_avx2
2198
2199	and	\$-256*$SZ,%rax
2200	add	\$`2*$SZ*($rounds-8)`,%rax
2201.Lnot_in_avx2:
2202___
2203$code.=<<___;
2204	mov	%rax,%rsi		# put aside Rsp
2205	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
2206
2207	mov	-8(%rax),%rbx
2208	mov	-16(%rax),%rbp
2209	mov	-24(%rax),%r12
2210	mov	-32(%rax),%r13
2211	mov	-40(%rax),%r14
2212	mov	-48(%rax),%r15
2213	mov	%rbx,144($context)	# restore context->Rbx
2214	mov	%rbp,160($context)	# restore context->Rbp
2215	mov	%r12,216($context)	# restore context->R12
2216	mov	%r13,224($context)	# restore context->R13
2217	mov	%r14,232($context)	# restore context->R14
2218	mov	%r15,240($context)	# restore context->R15
2219
2220	lea	.Lepilogue(%rip),%r10
2221	cmp	%r10,%rbx
2222	jb	.Lin_prologue		# non-AVX code
2223
2224	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
2225	lea	512($context),%rdi	# &context.Xmm6
2226	mov	\$`$SZ==4?8:12`,%ecx
2227	.long	0xa548f3fc		# cld; rep movsq
2228
2229.Lin_prologue:
2230	mov	8(%rax),%rdi
2231	mov	16(%rax),%rsi
2232	mov	%rax,152($context)	# restore context->Rsp
2233	mov	%rsi,168($context)	# restore context->Rsi
2234	mov	%rdi,176($context)	# restore context->Rdi
2235
2236	mov	40($disp),%rdi		# disp->ContextRecord
2237	mov	$context,%rsi		# context
2238	mov	\$154,%ecx		# sizeof(CONTEXT)
2239	.long	0xa548f3fc		# cld; rep movsq
2240
2241	mov	$disp,%rsi
2242	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2243	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2244	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2245	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2246	mov	40(%rsi),%r10		# disp->ContextRecord
2247	lea	56(%rsi),%r11		# &disp->HandlerData
2248	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2249	mov	%r10,32(%rsp)		# arg5
2250	mov	%r11,40(%rsp)		# arg6
2251	mov	%r12,48(%rsp)		# arg7
2252	mov	%rcx,56(%rsp)		# arg8, (NULL)
2253	call	*__imp_RtlVirtualUnwind(%rip)
2254
2255	mov	\$1,%eax		# ExceptionContinueSearch
2256	add	\$64,%rsp
2257	popfq
2258	pop	%r15
2259	pop	%r14
2260	pop	%r13
2261	pop	%r12
2262	pop	%rbp
2263	pop	%rbx
2264	pop	%rdi
2265	pop	%rsi
2266	ret
2267.size	se_handler,.-se_handler
2268___
2269
2270$code.=<<___ if ($SZ==4 && $shaext);
2271.type	shaext_handler,\@abi-omnipotent
2272.align	16
2273shaext_handler:
2274	push	%rsi
2275	push	%rdi
2276	push	%rbx
2277	push	%rbp
2278	push	%r12
2279	push	%r13
2280	push	%r14
2281	push	%r15
2282	pushfq
2283	sub	\$64,%rsp
2284
2285	mov	120($context),%rax	# pull context->Rax
2286	mov	248($context),%rbx	# pull context->Rip
2287
2288	lea	.Lprologue_shaext(%rip),%r10
2289	cmp	%r10,%rbx		# context->Rip<.Lprologue
2290	jb	.Lin_prologue
2291
2292	lea	.Lepilogue_shaext(%rip),%r10
2293	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
2294	jae	.Lin_prologue
2295
2296	lea	-8-5*16(%rax),%rsi
2297	lea	512($context),%rdi	# &context.Xmm6
2298	mov	\$10,%ecx
2299	.long	0xa548f3fc		# cld; rep movsq
2300
2301	jmp	.Lin_prologue
2302.size	shaext_handler,.-shaext_handler
2303___
2304
2305$code.=<<___;
2306.section	.pdata
2307.align	4
2308	.rva	.LSEH_begin_$func
2309	.rva	.LSEH_end_$func
2310	.rva	.LSEH_info_$func
2311___
2312$code.=<<___ if ($SZ==4 && $shaext);
2313	.rva	.LSEH_begin_${func}_shaext
2314	.rva	.LSEH_end_${func}_shaext
2315	.rva	.LSEH_info_${func}_shaext
2316___
2317$code.=<<___ if ($SZ==4);
2318	.rva	.LSEH_begin_${func}_ssse3
2319	.rva	.LSEH_end_${func}_ssse3
2320	.rva	.LSEH_info_${func}_ssse3
2321___
2322$code.=<<___ if ($avx && $SZ==8);
2323	.rva	.LSEH_begin_${func}_xop
2324	.rva	.LSEH_end_${func}_xop
2325	.rva	.LSEH_info_${func}_xop
2326___
2327$code.=<<___ if ($avx);
2328	.rva	.LSEH_begin_${func}_avx
2329	.rva	.LSEH_end_${func}_avx
2330	.rva	.LSEH_info_${func}_avx
2331___
2332$code.=<<___ if ($avx>1);
2333	.rva	.LSEH_begin_${func}_avx2
2334	.rva	.LSEH_end_${func}_avx2
2335	.rva	.LSEH_info_${func}_avx2
2336___
2337$code.=<<___;
2338.section	.xdata
2339.align	8
2340.LSEH_info_$func:
2341	.byte	9,0,0,0
2342	.rva	se_handler
2343	.rva	.Lprologue,.Lepilogue			# HandlerData[]
2344___
2345$code.=<<___ if ($SZ==4 && $shaext);
2346.LSEH_info_${func}_shaext:
2347	.byte	9,0,0,0
2348	.rva	shaext_handler
2349___
2350$code.=<<___ if ($SZ==4);
2351.LSEH_info_${func}_ssse3:
2352	.byte	9,0,0,0
2353	.rva	se_handler
2354	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
2355___
2356$code.=<<___ if ($avx && $SZ==8);
2357.LSEH_info_${func}_xop:
2358	.byte	9,0,0,0
2359	.rva	se_handler
2360	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
2361___
2362$code.=<<___ if ($avx);
2363.LSEH_info_${func}_avx:
2364	.byte	9,0,0,0
2365	.rva	se_handler
2366	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2367___
2368$code.=<<___ if ($avx>1);
2369.LSEH_info_${func}_avx2:
2370	.byte	9,0,0,0
2371	.rva	se_handler
2372	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2373___
2374}
2375
2376sub sha256op38 {
2377    my $instr = shift;
2378    my %opcodelet = (
2379		"sha256rnds2" => 0xcb,
2380  		"sha256msg1"  => 0xcc,
2381		"sha256msg2"  => 0xcd	);
2382
2383    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2384      my @opcode=(0x0f,0x38);
2385	push @opcode,$opcodelet{$instr};
2386	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
2387	return ".byte\t".join(',',@opcode);
2388    } else {
2389	return $instr."\t".@_[0];
2390    }
2391}
2392
2393foreach (split("\n",$code)) {
2394	s/\`([^\`]*)\`/eval $1/geo;
2395
2396	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2397
2398	print $_,"\n";
2399}
2400close STDOUT;
2401