1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# January 2013
11#
12# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
14# that since AESNI-CBC encryption exhibits *very* low instruction-level
15# parallelism, interleaving it with another algorithm makes better use of
16# processor resources and achieves better performance.
17# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18# the AESNI code is woven into it. As SHA256 dominates execution time,
19# the stitch performance does not depend on AES key length. Below are
20# performance numbers in cycles per processed byte, lower is better,
21# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
22# subroutine:
23#
24#		 AES-128/-192/-256+SHA256	this(**)	gain
25# Sandy Bridge	    5.05/6.05/7.05+11.6		13.0	+28%/36%/43%
26# Ivy Bridge	    5.05/6.05/7.05+10.3		11.6	+32%/41%/50%
27# Haswell	    4.43/5.29/6.19+7.80		8.79	+39%/49%/59%
28# Bulldozer	    5.77/6.89/8.00+13.7		13.7	+42%/50%/58%
29#
30# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that
31#	Westmere is omitted from the loop; this is because the gain was
32#	not estimated to be high enough to justify the effort;
33# (**)	these are EVP-free results; results obtained with 'speed
34#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
35
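#
# For orientation: the gain column compares the sum of the standalone
# figures against the stitched time, e.g. Sandy Bridge with AES-128 is
# (5.05+11.6)/13.0 = ~1.28, i.e. +28%.
#
# A rough sketch of how this generator is usually driven (assuming the
# standard perlasm flow; flavour names are those understood by
# x86_64-xlate.pl and the output file names are only illustrative):
#
#	perl aesni-sha256-x86_64.pl elf  aesni-sha256-x86_64.S		# Linux/ELF
#	perl aesni-sha256-x86_64.pl nasm aesni-sha256-x86_64.asm	# Win64/NASM
#
# The two positional arguments become $flavour and $output below; a
# Win64-ish flavour (or a .asm output name) also enables the SEH and
# XMM register save/restore code.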
36$flavour = shift;
37$output  = shift;
38if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
39
40$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
41
42$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
44( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
45die "can't locate x86_64-xlate.pl";
46
47if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49	$avx = ($1>=2.19) + ($1>=2.22);
50}
51
52if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54	$avx = ($1>=2.09) + ($1>=2.10);
55}
56
57if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59	$avx = ($1>=10) + ($1>=12);
60}
61
62if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
63	$avx = ($2>=3.0) + ($2>3.0);
64}
65
66$shaext=$avx;	### set to zero if compiling for 1.0.1
67$avx=1		if (!$shaext && $avx);
68
69open OUT,"| \"$^X\" $xlate $flavour $output";
70*STDOUT=*OUT;
71
72$func="aesni_cbc_sha256_enc";
73$TABLE="K256";
74$SZ=4;
75@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
76				"%r8d","%r9d","%r10d","%r11d");
77($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
78@Sigma0=( 2,13,22);
79@Sigma1=( 6,11,25);
80@sigma0=( 7,18, 3);
81@sigma1=(17,19,10);
82$rounds=64;
83
84########################################################################
85# void aesni_cbc_sha256_enc(const void *inp,
86#			void *out,
87#			size_t length,
88#			const AES_KEY *key,
89#			unsigned char *iv,
90#			SHA256_CTX *ctx,
91#			const void *in0);
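#
# A couple of observations on the contract, judging from the code below
# rather than from any header: `length' is counted in 64-byte blocks, not
# bytes (it is shifted left by 6 before being added to `inp' to form the
# end-of-input pointer), and calling the routine with inp==NULL merely
# probes for availability -- it returns 1 when the SIMD code paths were
# compiled in ($avx above is non-zero) and 0 otherwise, without touching
# the remaining arguments.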
92($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
93("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
94
95$Tbl="%rbp";
96
97$_inp="16*$SZ+0*8(%rsp)";
98$_out="16*$SZ+1*8(%rsp)";
99$_end="16*$SZ+2*8(%rsp)";
100$_key="16*$SZ+3*8(%rsp)";
101$_ivp="16*$SZ+4*8(%rsp)";
102$_ctx="16*$SZ+5*8(%rsp)";
103$_in0="16*$SZ+6*8(%rsp)";
104$_rsp="16*$SZ+7*8(%rsp)";
105$framesz=16*$SZ+8*8;
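# With $SZ==4 (SHA256) this amounts to 64 bytes of X[i]+K[i] staging area
# plus eight 8-byte saved pointers, i.e. a 128-byte frame, on top of which
# the prologues reserve another 160 bytes for %xmm6-%xmm15 on Win64.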
106
107$code=<<___;
108.text
109
110.extern	OPENSSL_ia32cap_P
111.globl	$func
112.type	$func,\@abi-omnipotent
113.align	16
114$func:
115___
116						if ($avx) {
117$code.=<<___;
118	lea	OPENSSL_ia32cap_P(%rip),%r11
119	mov	\$1,%eax
120	cmp	\$0,`$win64?"%rcx":"%rdi"`
121	je	.Lprobe
122	mov	0(%r11),%eax
123	mov	4(%r11),%r10
124___
125$code.=<<___ if ($shaext);
126	bt	\$61,%r10			# check for SHA
127	jc	${func}_shaext
128___
129$code.=<<___;
130	mov	%r10,%r11
131	shr	\$32,%r11
132
133	test	\$`1<<11`,%r10d			# check for XOP
134	jnz	${func}_xop
135___
136$code.=<<___ if ($avx>1);
137	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
138	cmp	\$`1<<8|1<<5|1<<3`,%r11d
139	je	${func}_avx2
140___
141$code.=<<___;
142	and	\$`1<<28`,%r10d			# check for AVX
143	jnz	${func}_avx
144	ud2
145___
146						}
147$code.=<<___;
148	xor	%eax,%eax
149	cmp	\$0,`$win64?"%rcx":"%rdi"`
150	je	.Lprobe
151	ud2
152.Lprobe:
153	ret
154.size	$func,.-$func
155
156.align	64
157.type	$TABLE,\@object
158$TABLE:
159	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
160	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
161	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
162	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
163	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
164	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
165	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
166	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
167	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
168	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
169	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
170	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
171	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
172	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
173	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
174	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
175	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
176	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
177	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
178	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
179	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
180	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
181	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
182	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
183	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
184	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
185	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
186	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
187	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
188	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
189	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
190	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
191
192	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
193	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
194	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
195	.long	0,0,0,0,   0,0,0,0
196	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
197.align	64
198___
199
200######################################################################
201# SIMD code paths
202#
203{{{
204($iv,$inout,$roundkey,$temp,
205 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
206
207$aesni_cbc_idx=0;
208@aesni_cbc_block = (
209##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
210##	&vmovdqu	($inout,($inp));
211##	&mov		($_inp,$inp);
212
213	'&vpxor		($inout,$inout,$roundkey);'.
214	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
215
216	'&vpxor		($inout,$inout,$iv);',
217
218	'&vaesenc	($inout,$inout,$roundkey);'.
219	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
220
221	'&vaesenc	($inout,$inout,$roundkey);'.
222	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
223
224	'&vaesenc	($inout,$inout,$roundkey);'.
225	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
226
227	'&vaesenc	($inout,$inout,$roundkey);'.
228	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
229
230	'&vaesenc	($inout,$inout,$roundkey);'.
231	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
232
233	'&vaesenc	($inout,$inout,$roundkey);'.
234	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
235
236	'&vaesenc	($inout,$inout,$roundkey);'.
237	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
238
239	'&vaesenc	($inout,$inout,$roundkey);'.
240	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
241
242	'&vaesenc	($inout,$inout,$roundkey);'.
243	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
244
245	'&vaesenclast	($temp,$inout,$roundkey);'.
246	' &vaesenc	($inout,$inout,$roundkey);'.
247	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
248
249	'&vpand		($iv,$temp,$mask10);'.
250	' &vaesenc	($inout,$inout,$roundkey);'.
251	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
252
253	'&vaesenclast	($temp,$inout,$roundkey);'.
254	' &vaesenc	($inout,$inout,$roundkey);'.
255	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
256
257	'&vpand		($temp,$temp,$mask12);'.
258	' &vaesenc	($inout,$inout,$roundkey);'.
259	 '&vmovdqu	($roundkey,"0xe0-0x80($inp)");',
260
261	'&vpor		($iv,$iv,$temp);'.
262	' &vaesenclast	($temp,$inout,$roundkey);'.
263	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
264
265##	&mov		($inp,$_inp);
266##	&mov		($out,$_out);
267##	&vpand		($temp,$temp,$mask14);
268##	&vpor		($iv,$iv,$temp);
269##	&vmovdqu	("($out,$inp)",$iv);
270##	&lea		($inp,"16($inp)");
271);
272
273my $a4=$T1;
274my ($a,$b,$c,$d,$e,$f,$g,$h);
275
276sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
277{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
278  my $arg = pop;
279    $arg = "\$$arg" if ($arg*1 eq $arg);
280    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
281}
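# A short worked example of the thunk above: the round code below calls
# &ror($a0,$Sigma1[2]-$Sigma1[1]), i.e. &ror("%r13d",14), which appends
# the line "ror	$14,%r13d" to $code -- the last Perl argument becomes
# the first (immediate) operand in AT&T order, and bare numbers get the
# '$' prefix.  (This applies to the XOP path; the AVX path later remaps
# ror to shrd.)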
282
283sub body_00_15 () {
284	(
285	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
286
287	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
288	'&mov	($a,$a1)',
289	'&mov	($a4,$f)',
290
291	'&xor	($a0,$e)',
292	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
293	'&xor	($a4,$g)',			# f^g
294
295	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
296	'&xor	($a1,$a)',
297	'&and	($a4,$e)',			# (f^g)&e
298
299	@aesni_cbc_block[$aesni_cbc_idx++].
300	'&xor	($a0,$e)',
301	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
302	'&mov	($a2,$a)',
303
304	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
305	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
306	'&xor	($a2,$b)',			# a^b, b^c in next round
307
308	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
309	'&add	($h,$a4)',			# h+=Ch(e,f,g)
310	'&and	($a3,$a2)',			# (b^c)&(a^b)
311
312	'&xor	($a1,$a)',
313	'&add	($h,$a0)',			# h+=Sigma1(e)
314	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
315
316	'&add	($d,$h)',			# d+=h
317	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
318	'&add	($h,$a3)',			# h+=Maj(a,b,c)
319
320	'&mov	($a0,$d)',
321	'&add	($a1,$h);'.			# h+=Sigma0(a)
322	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
323	);
324}
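# body_00_15() returns one SHA256 round as a list of code strings which
# the driver loops below consume with "foreach(body_00_15()) { eval; }".
# Because every round splices in @aesni_cbc_block[$aesni_cbc_idx++], each
# group of 16 SHA256 rounds carries the full 16-entry AES-CBC schedule
# for one 16-byte block, so the four AES blocks covering a 64-byte chunk
# are spread across its 64 SHA256 rounds.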
325
326if ($avx) {{
327######################################################################
328# XOP code path
329#
330$code.=<<___;
331.type	${func}_xop,\@function,6
332.align	64
333${func}_xop:
334.Lxop_shortcut:
335	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
336	push	%rbx
337	push	%rbp
338	push	%r12
339	push	%r13
340	push	%r14
341	push	%r15
342	mov	%rsp,%r11		# copy %rsp
343	sub	\$`$framesz+$win64*16*10`,%rsp
344	and	\$-64,%rsp		# align stack frame
345
346	shl	\$6,$len
347	sub	$inp,$out		# re-bias
348	sub	$inp,$in0
349	add	$inp,$len		# end of input
350
351	#mov	$inp,$_inp		# saved later
352	mov	$out,$_out
353	mov	$len,$_end
354	#mov	$key,$_key		# remains resident in $inp register
355	mov	$ivp,$_ivp
356	mov	$ctx,$_ctx
357	mov	$in0,$_in0
358	mov	%r11,$_rsp
359___
360$code.=<<___ if ($win64);
361	movaps	%xmm6,`$framesz+16*0`(%rsp)
362	movaps	%xmm7,`$framesz+16*1`(%rsp)
363	movaps	%xmm8,`$framesz+16*2`(%rsp)
364	movaps	%xmm9,`$framesz+16*3`(%rsp)
365	movaps	%xmm10,`$framesz+16*4`(%rsp)
366	movaps	%xmm11,`$framesz+16*5`(%rsp)
367	movaps	%xmm12,`$framesz+16*6`(%rsp)
368	movaps	%xmm13,`$framesz+16*7`(%rsp)
369	movaps	%xmm14,`$framesz+16*8`(%rsp)
370	movaps	%xmm15,`$framesz+16*9`(%rsp)
371___
372$code.=<<___;
373.Lprologue_xop:
374	vzeroall
375
376	mov	$inp,%r12		# borrow $a4
377	lea	0x80($key),$inp		# size optimization, reassign
378	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
379	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
380	mov	$ctx,%r15		# borrow $a2
381	mov	$in0,%rsi		# borrow $a3
382	vmovdqu	($ivp),$iv		# load IV
383	sub	\$9,%r14
384
385	mov	$SZ*0(%r15),$A
386	mov	$SZ*1(%r15),$B
387	mov	$SZ*2(%r15),$C
388	mov	$SZ*3(%r15),$D
389	mov	$SZ*4(%r15),$E
390	mov	$SZ*5(%r15),$F
391	mov	$SZ*6(%r15),$G
392	mov	$SZ*7(%r15),$H
393
394	vmovdqa	0x00(%r13,%r14,8),$mask14
395	vmovdqa	0x10(%r13,%r14,8),$mask12
396	vmovdqa	0x20(%r13,%r14,8),$mask10
397	vmovdqu	0x00-0x80($inp),$roundkey
398	jmp	.Lloop_xop
399___
400					if ($SZ==4) {	# SHA256
401    my @X = map("%xmm$_",(0..3));
402    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
403
404$code.=<<___;
405.align	16
406.Lloop_xop:
407	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
408	vmovdqu	0x00(%rsi,%r12),@X[0]
409	vmovdqu	0x10(%rsi,%r12),@X[1]
410	vmovdqu	0x20(%rsi,%r12),@X[2]
411	vmovdqu	0x30(%rsi,%r12),@X[3]
412	vpshufb	$t3,@X[0],@X[0]
413	lea	$TABLE(%rip),$Tbl
414	vpshufb	$t3,@X[1],@X[1]
415	vpshufb	$t3,@X[2],@X[2]
416	vpaddd	0x00($Tbl),@X[0],$t0
417	vpshufb	$t3,@X[3],@X[3]
418	vpaddd	0x20($Tbl),@X[1],$t1
419	vpaddd	0x40($Tbl),@X[2],$t2
420	vpaddd	0x60($Tbl),@X[3],$t3
421	vmovdqa	$t0,0x00(%rsp)
422	mov	$A,$a1
423	vmovdqa	$t1,0x10(%rsp)
424	mov	$B,$a3
425	vmovdqa	$t2,0x20(%rsp)
426	xor	$C,$a3			# magic
427	vmovdqa	$t3,0x30(%rsp)
428	mov	$E,$a0
429	jmp	.Lxop_00_47
430
431.align	16
432.Lxop_00_47:
433	sub	\$-16*2*$SZ,$Tbl	# size optimization
434	vmovdqu	(%r12),$inout		# $a4
435	mov	%r12,$_inp		# $a4
436___
437sub XOP_256_00_47 () {
438my $j = shift;
439my $body = shift;
440my @X = @_;
441my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
442
443	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
444	  eval(shift(@insns));
445	  eval(shift(@insns));
446	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
447	  eval(shift(@insns));
448	  eval(shift(@insns));
449	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
450	  eval(shift(@insns));
451	  eval(shift(@insns));
452	&vpsrld		($t0,$t0,$sigma0[2]);
453	  eval(shift(@insns));
454	  eval(shift(@insns));
455	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
456	  eval(shift(@insns));
457	  eval(shift(@insns));
458	  eval(shift(@insns));
459	  eval(shift(@insns));
460	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
461	  eval(shift(@insns));
462	  eval(shift(@insns));
463	&vpxor		($t0,$t0,$t1);
464	  eval(shift(@insns));
465	  eval(shift(@insns));
466	  eval(shift(@insns));
467	  eval(shift(@insns));
468	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
469	  eval(shift(@insns));
470	  eval(shift(@insns));
471	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
472	  eval(shift(@insns));
473	  eval(shift(@insns));
474	 &vpsrld	($t2,@X[3],$sigma1[2]);
475	  eval(shift(@insns));
476	  eval(shift(@insns));
477	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
478	  eval(shift(@insns));
479	  eval(shift(@insns));
480	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
481	  eval(shift(@insns));
482	  eval(shift(@insns));
483	 &vpxor		($t3,$t3,$t2);
484	  eval(shift(@insns));
485	  eval(shift(@insns));
486	  eval(shift(@insns));
487	  eval(shift(@insns));
488	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
489	  eval(shift(@insns));
490	  eval(shift(@insns));
491	  eval(shift(@insns));
492	  eval(shift(@insns));
493	&vpsrldq	($t3,$t3,8);
494	  eval(shift(@insns));
495	  eval(shift(@insns));
496	  eval(shift(@insns));
497	  eval(shift(@insns));
498	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
499	  eval(shift(@insns));
500	  eval(shift(@insns));
501	  eval(shift(@insns));
502	  eval(shift(@insns));
503	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
504	  eval(shift(@insns));
505	  eval(shift(@insns));
506	 &vpsrld	($t2,@X[0],$sigma1[2]);
507	  eval(shift(@insns));
508	  eval(shift(@insns));
509	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
510	  eval(shift(@insns));
511	  eval(shift(@insns));
512	 &vpxor		($t3,$t3,$t2);
513	  eval(shift(@insns));
514	  eval(shift(@insns));
515	  eval(shift(@insns));
516	  eval(shift(@insns));
517	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
518	  eval(shift(@insns));
519	  eval(shift(@insns));
520	  eval(shift(@insns));
521	  eval(shift(@insns));
522	&vpslldq	($t3,$t3,8);		# 22 instructions
523	  eval(shift(@insns));
524	  eval(shift(@insns));
525	  eval(shift(@insns));
526	  eval(shift(@insns));
527	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
528	  eval(shift(@insns));
529	  eval(shift(@insns));
530	  eval(shift(@insns));
531	  eval(shift(@insns));
532	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
533	  foreach (@insns) { eval; }		# remaining instructions
534	&vmovdqa	(16*$j."(%rsp)",$t2);
535}
536
537    $aesni_cbc_idx=0;
538    for ($i=0,$j=0; $j<4; $j++) {
539	&XOP_256_00_47($j,\&body_00_15,@X);
540	push(@X,shift(@X));			# rotate(@X)
541    }
542    	&mov		("%r12",$_inp);		# borrow $a4
543	&vpand		($temp,$temp,$mask14);
544	&mov		("%r15",$_out);		# borrow $a2
545	&vpor		($iv,$iv,$temp);
546	&vmovdqu	("(%r15,%r12)",$iv);	# write output
547	&lea		("%r12","16(%r12)");	# inp++
548
549	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
550	&jne	(".Lxop_00_47");
551
552	&vmovdqu	($inout,"(%r12)");
553	&mov		($_inp,"%r12");
554
555    $aesni_cbc_idx=0;
556    for ($i=0; $i<16; ) {
557	foreach(body_00_15()) { eval; }
558    }
559					}
560$code.=<<___;
561	mov	$_inp,%r12		# borrow $a4
562	mov	$_out,%r13		# borrow $a0
563	mov	$_ctx,%r15		# borrow $a2
564	mov	$_in0,%rsi		# borrow $a3
565
566	vpand	$mask14,$temp,$temp
567	mov	$a1,$A
568	vpor	$temp,$iv,$iv
569	vmovdqu	$iv,(%r13,%r12)		# write output
570	lea	16(%r12),%r12		# inp++
571
572	add	$SZ*0(%r15),$A
573	add	$SZ*1(%r15),$B
574	add	$SZ*2(%r15),$C
575	add	$SZ*3(%r15),$D
576	add	$SZ*4(%r15),$E
577	add	$SZ*5(%r15),$F
578	add	$SZ*6(%r15),$G
579	add	$SZ*7(%r15),$H
580
581	cmp	$_end,%r12
582
583	mov	$A,$SZ*0(%r15)
584	mov	$B,$SZ*1(%r15)
585	mov	$C,$SZ*2(%r15)
586	mov	$D,$SZ*3(%r15)
587	mov	$E,$SZ*4(%r15)
588	mov	$F,$SZ*5(%r15)
589	mov	$G,$SZ*6(%r15)
590	mov	$H,$SZ*7(%r15)
591
592	jb	.Lloop_xop
593
594	mov	$_ivp,$ivp
595	mov	$_rsp,%rsi
596	vmovdqu	$iv,($ivp)		# output IV
597	vzeroall
598___
599$code.=<<___ if ($win64);
600	movaps	`$framesz+16*0`(%rsp),%xmm6
601	movaps	`$framesz+16*1`(%rsp),%xmm7
602	movaps	`$framesz+16*2`(%rsp),%xmm8
603	movaps	`$framesz+16*3`(%rsp),%xmm9
604	movaps	`$framesz+16*4`(%rsp),%xmm10
605	movaps	`$framesz+16*5`(%rsp),%xmm11
606	movaps	`$framesz+16*6`(%rsp),%xmm12
607	movaps	`$framesz+16*7`(%rsp),%xmm13
608	movaps	`$framesz+16*8`(%rsp),%xmm14
609	movaps	`$framesz+16*9`(%rsp),%xmm15
610___
611$code.=<<___;
612	mov	(%rsi),%r15
613	mov	8(%rsi),%r14
614	mov	16(%rsi),%r13
615	mov	24(%rsi),%r12
616	mov	32(%rsi),%rbp
617	mov	40(%rsi),%rbx
618	lea	48(%rsi),%rsp
619.Lepilogue_xop:
620	ret
621.size	${func}_xop,.-${func}_xop
622___
623######################################################################
624# AVX+shrd code path
625#
626local *ror = sub { &shrd(@_[0],@_) };
627
628$code.=<<___;
629.type	${func}_avx,\@function,6
630.align	64
631${func}_avx:
632.Lavx_shortcut:
633	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
634	push	%rbx
635	push	%rbp
636	push	%r12
637	push	%r13
638	push	%r14
639	push	%r15
640	mov	%rsp,%r11		# copy %rsp
641	sub	\$`$framesz+$win64*16*10`,%rsp
642	and	\$-64,%rsp		# align stack frame
643
644	shl	\$6,$len
645	sub	$inp,$out		# re-bias
646	sub	$inp,$in0
647	add	$inp,$len		# end of input
648
649	#mov	$inp,$_inp		# saved later
650	mov	$out,$_out
651	mov	$len,$_end
652	#mov	$key,$_key		# remains resident in $inp register
653	mov	$ivp,$_ivp
654	mov	$ctx,$_ctx
655	mov	$in0,$_in0
656	mov	%r11,$_rsp
657___
658$code.=<<___ if ($win64);
659	movaps	%xmm6,`$framesz+16*0`(%rsp)
660	movaps	%xmm7,`$framesz+16*1`(%rsp)
661	movaps	%xmm8,`$framesz+16*2`(%rsp)
662	movaps	%xmm9,`$framesz+16*3`(%rsp)
663	movaps	%xmm10,`$framesz+16*4`(%rsp)
664	movaps	%xmm11,`$framesz+16*5`(%rsp)
665	movaps	%xmm12,`$framesz+16*6`(%rsp)
666	movaps	%xmm13,`$framesz+16*7`(%rsp)
667	movaps	%xmm14,`$framesz+16*8`(%rsp)
668	movaps	%xmm15,`$framesz+16*9`(%rsp)
669___
670$code.=<<___;
671.Lprologue_avx:
672	vzeroall
673
674	mov	$inp,%r12		# borrow $a4
675	lea	0x80($key),$inp		# size optimization, reassign
676	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
677	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
678	mov	$ctx,%r15		# borrow $a2
679	mov	$in0,%rsi		# borrow $a3
680	vmovdqu	($ivp),$iv		# load IV
681	sub	\$9,%r14
682
683	mov	$SZ*0(%r15),$A
684	mov	$SZ*1(%r15),$B
685	mov	$SZ*2(%r15),$C
686	mov	$SZ*3(%r15),$D
687	mov	$SZ*4(%r15),$E
688	mov	$SZ*5(%r15),$F
689	mov	$SZ*6(%r15),$G
690	mov	$SZ*7(%r15),$H
691
692	vmovdqa	0x00(%r13,%r14,8),$mask14
693	vmovdqa	0x10(%r13,%r14,8),$mask12
694	vmovdqa	0x20(%r13,%r14,8),$mask10
695	vmovdqu	0x00-0x80($inp),$roundkey
696___
697					if ($SZ==4) {	# SHA256
698    my @X = map("%xmm$_",(0..3));
699    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
700
701$code.=<<___;
702	jmp	.Lloop_avx
703.align	16
704.Lloop_avx:
705	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
706	vmovdqu	0x00(%rsi,%r12),@X[0]
707	vmovdqu	0x10(%rsi,%r12),@X[1]
708	vmovdqu	0x20(%rsi,%r12),@X[2]
709	vmovdqu	0x30(%rsi,%r12),@X[3]
710	vpshufb	$t3,@X[0],@X[0]
711	lea	$TABLE(%rip),$Tbl
712	vpshufb	$t3,@X[1],@X[1]
713	vpshufb	$t3,@X[2],@X[2]
714	vpaddd	0x00($Tbl),@X[0],$t0
715	vpshufb	$t3,@X[3],@X[3]
716	vpaddd	0x20($Tbl),@X[1],$t1
717	vpaddd	0x40($Tbl),@X[2],$t2
718	vpaddd	0x60($Tbl),@X[3],$t3
719	vmovdqa	$t0,0x00(%rsp)
720	mov	$A,$a1
721	vmovdqa	$t1,0x10(%rsp)
722	mov	$B,$a3
723	vmovdqa	$t2,0x20(%rsp)
724	xor	$C,$a3			# magic
725	vmovdqa	$t3,0x30(%rsp)
726	mov	$E,$a0
727	jmp	.Lavx_00_47
728
729.align	16
730.Lavx_00_47:
731	sub	\$-16*2*$SZ,$Tbl	# size optimization
732	vmovdqu	(%r12),$inout		# $a4
733	mov	%r12,$_inp		# $a4
734___
735sub Xupdate_256_AVX () {
736	(
737	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
738	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
739	'&vpsrld	($t2,$t0,$sigma0[0]);',
740	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
741	'&vpsrld	($t3,$t0,$sigma0[2])',
742	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
743	'&vpxor		($t0,$t3,$t2)',
744	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
745	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
746	'&vpxor		($t0,$t0,$t1)',
747	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
748	'&vpxor		($t0,$t0,$t2)',
749	 '&vpsrld	($t2,$t3,$sigma1[2]);',
750	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
751	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
752	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
753	 '&vpxor	($t2,$t2,$t3);',
754	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
755	 '&vpxor	($t2,$t2,$t3)',		# sigma1(X[14..15])
756	 '&vpshufd	($t2,$t2,0b10000100)',
757	 '&vpsrldq	($t2,$t2,8)',
758	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
759	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
760	 '&vpsrld	($t2,$t3,$sigma1[2])',
761	 '&vpsrlq	($t3,$t3,$sigma1[0])',
762	 '&vpxor	($t2,$t2,$t3);',
763	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
764	 '&vpxor	($t2,$t2,$t3)',
765	 '&vpshufd	($t2,$t2,0b11101000)',
766	 '&vpslldq	($t2,$t2,8)',
767	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
768	);
769}
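# For reference, Xupdate_256_AVX() computes four new message-schedule
# words at a time:
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# with sigma0(x) = ROTR^7(x) ^ ROTR^18(x) ^ SHR^3(x) and
#      sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ SHR^10(x),
# i.e. @sigma0=(7,18,3) and @sigma1=(17,19,10) above; the rotates are
# synthesized from shift pairs because AVX1 has no vector rotate (the
# XOP path above uses vprotd instead).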
770
771sub AVX_256_00_47 () {
772my $j = shift;
773my $body = shift;
774my @X = @_;
775my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
776
777	foreach (Xupdate_256_AVX()) {		# 29 instructions
778	    eval;
779	    eval(shift(@insns));
780	    eval(shift(@insns));
781	    eval(shift(@insns));
782	}
783	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
784	  foreach (@insns) { eval; }		# remaining instructions
785	&vmovdqa	(16*$j."(%rsp)",$t2);
786}
787
788    $aesni_cbc_idx=0;
789    for ($i=0,$j=0; $j<4; $j++) {
790	&AVX_256_00_47($j,\&body_00_15,@X);
791	push(@X,shift(@X));			# rotate(@X)
792    }
793    	&mov		("%r12",$_inp);		# borrow $a4
794	&vpand		($temp,$temp,$mask14);
795	&mov		("%r15",$_out);		# borrow $a2
796	&vpor		($iv,$iv,$temp);
797	&vmovdqu	("(%r15,%r12)",$iv);	# write output
798	&lea		("%r12","16(%r12)");	# inp++
799
800	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
801	&jne	(".Lavx_00_47");
802
803	&vmovdqu	($inout,"(%r12)");
804	&mov		($_inp,"%r12");
805
806    $aesni_cbc_idx=0;
807    for ($i=0; $i<16; ) {
808	foreach(body_00_15()) { eval; }
809    }
810
811					}
812$code.=<<___;
813	mov	$_inp,%r12		# borrow $a4
814	mov	$_out,%r13		# borrow $a0
815	mov	$_ctx,%r15		# borrow $a2
816	mov	$_in0,%rsi		# borrow $a3
817
818	vpand	$mask14,$temp,$temp
819	mov	$a1,$A
820	vpor	$temp,$iv,$iv
821	vmovdqu	$iv,(%r13,%r12)		# write output
822	lea	16(%r12),%r12		# inp++
823
824	add	$SZ*0(%r15),$A
825	add	$SZ*1(%r15),$B
826	add	$SZ*2(%r15),$C
827	add	$SZ*3(%r15),$D
828	add	$SZ*4(%r15),$E
829	add	$SZ*5(%r15),$F
830	add	$SZ*6(%r15),$G
831	add	$SZ*7(%r15),$H
832
833	cmp	$_end,%r12
834
835	mov	$A,$SZ*0(%r15)
836	mov	$B,$SZ*1(%r15)
837	mov	$C,$SZ*2(%r15)
838	mov	$D,$SZ*3(%r15)
839	mov	$E,$SZ*4(%r15)
840	mov	$F,$SZ*5(%r15)
841	mov	$G,$SZ*6(%r15)
842	mov	$H,$SZ*7(%r15)
843	jb	.Lloop_avx
844
845	mov	$_ivp,$ivp
846	mov	$_rsp,%rsi
847	vmovdqu	$iv,($ivp)		# output IV
848	vzeroall
849___
850$code.=<<___ if ($win64);
851	movaps	`$framesz+16*0`(%rsp),%xmm6
852	movaps	`$framesz+16*1`(%rsp),%xmm7
853	movaps	`$framesz+16*2`(%rsp),%xmm8
854	movaps	`$framesz+16*3`(%rsp),%xmm9
855	movaps	`$framesz+16*4`(%rsp),%xmm10
856	movaps	`$framesz+16*5`(%rsp),%xmm11
857	movaps	`$framesz+16*6`(%rsp),%xmm12
858	movaps	`$framesz+16*7`(%rsp),%xmm13
859	movaps	`$framesz+16*8`(%rsp),%xmm14
860	movaps	`$framesz+16*9`(%rsp),%xmm15
861___
862$code.=<<___;
863	mov	(%rsi),%r15
864	mov	8(%rsi),%r14
865	mov	16(%rsi),%r13
866	mov	24(%rsi),%r12
867	mov	32(%rsi),%rbp
868	mov	40(%rsi),%rbx
869	lea	48(%rsi),%rsp
870.Lepilogue_avx:
871	ret
872.size	${func}_avx,.-${func}_avx
873___
874
875if ($avx>1) {{
876######################################################################
877# AVX2+BMI code path
878#
879my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
880my $PUSH8=8*2*$SZ;
881use integer;
882
883sub bodyx_00_15 () {
884	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
885	(
886	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
887
888	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
889	'&and	($a4,$e)',		# f&e
890	'&rorx	($a0,$e,$Sigma1[2])',
891	'&rorx	($a2,$e,$Sigma1[1])',
892
893	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
894	'&lea	($h,"($h,$a4)")',
895	'&andn	($a4,$e,$g)',		# ~e&g
896	'&xor	($a0,$a2)',
897
898	'&rorx	($a1,$e,$Sigma1[0])',
899	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
900	'&xor	($a0,$a1)',		# Sigma1(e)
901	'&mov	($a2,$a)',
902
903	'&rorx	($a4,$a,$Sigma0[2])',
904	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
905	'&xor	($a2,$b)',		# a^b, b^c in next round
906	'&rorx	($a1,$a,$Sigma0[1])',
907
908	'&rorx	($a0,$a,$Sigma0[0])',
909	'&lea	($d,"($d,$h)")',	# d+=h
910	'&and	($a3,$a2)',		# (b^c)&(a^b)
911	@aesni_cbc_block[$aesni_cbc_idx++].
912	'&xor	($a1,$a4)',
913
914	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
915	'&xor	($a1,$a0)',		# Sigma0(a)
916	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
917	'&mov	($a4,$e)',		# copy of f in future
918
919	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
920	);
921	# and at the finish one still has to do $a+=$a1
922}
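# Relative to body_00_15() above, this BMI2/AVX2 round uses rorx for the
# Sigma rotations and computes Ch(e,f,g) as (e&f)+(~e&g) via andn.  The
# h+=Sigma0(a) term is deferred and folded into the next round's
# "lea ($a,$a1)", which is why $a1 must be zero on entry and why the
# callers add $a1 into $A once more after the last round.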
923
924$code.=<<___;
925.type	${func}_avx2,\@function,6
926.align	64
927${func}_avx2:
928.Lavx2_shortcut:
929	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
930	push	%rbx
931	push	%rbp
932	push	%r12
933	push	%r13
934	push	%r14
935	push	%r15
936	mov	%rsp,%r11		# copy %rsp
937	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
938	and	\$-256*$SZ,%rsp		# align stack frame
939	add	\$`2*$SZ*($rounds-8)`,%rsp
940
941	shl	\$6,$len
942	sub	$inp,$out		# re-bias
943	sub	$inp,$in0
944	add	$inp,$len		# end of input
945
946	#mov	$inp,$_inp		# saved later
947	#mov	$out,$_out		# kept in $offload
948	mov	$len,$_end
949	#mov	$key,$_key		# remains resident in $inp register
950	mov	$ivp,$_ivp
951	mov	$ctx,$_ctx
952	mov	$in0,$_in0
953	mov	%r11,$_rsp
954___
955$code.=<<___ if ($win64);
956	movaps	%xmm6,`$framesz+16*0`(%rsp)
957	movaps	%xmm7,`$framesz+16*1`(%rsp)
958	movaps	%xmm8,`$framesz+16*2`(%rsp)
959	movaps	%xmm9,`$framesz+16*3`(%rsp)
960	movaps	%xmm10,`$framesz+16*4`(%rsp)
961	movaps	%xmm11,`$framesz+16*5`(%rsp)
962	movaps	%xmm12,`$framesz+16*6`(%rsp)
963	movaps	%xmm13,`$framesz+16*7`(%rsp)
964	movaps	%xmm14,`$framesz+16*8`(%rsp)
965	movaps	%xmm15,`$framesz+16*9`(%rsp)
966___
967$code.=<<___;
968.Lprologue_avx2:
969	vzeroall
970
971	mov	$inp,%r13		# borrow $a0
972	vpinsrq	\$1,$out,$offload,$offload
973	lea	0x80($key),$inp		# size optimization, reassign
974	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
975	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
976	mov	$ctx,%r15		# borrow $a2
977	mov	$in0,%rsi		# borrow $a3
978	vmovdqu	($ivp),$iv		# load IV
979	lea	-9(%r14),%r14
980
981	vmovdqa	0x00(%r12,%r14,8),$mask14
982	vmovdqa	0x10(%r12,%r14,8),$mask12
983	vmovdqa	0x20(%r12,%r14,8),$mask10
984
985	sub	\$-16*$SZ,%r13		# inp++, size optimization
986	mov	$SZ*0(%r15),$A
987	lea	(%rsi,%r13),%r12	# borrow $a0
988	mov	$SZ*1(%r15),$B
989	cmp	$len,%r13		# $_end
990	mov	$SZ*2(%r15),$C
991	cmove	%rsp,%r12		# next block or random data
992	mov	$SZ*3(%r15),$D
993	mov	$SZ*4(%r15),$E
994	mov	$SZ*5(%r15),$F
995	mov	$SZ*6(%r15),$G
996	mov	$SZ*7(%r15),$H
997	vmovdqu	0x00-0x80($inp),$roundkey
998___
999					if ($SZ==4) {	# SHA256
1000    my @X = map("%ymm$_",(0..3));
1001    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1002
1003$code.=<<___;
1004	jmp	.Loop_avx2
1005.align	16
1006.Loop_avx2:
1007	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1008	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
1009	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
1010	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
1011	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
1012
1013	vinserti128	\$1,(%r12),@X[0],@X[0]
1014	vinserti128	\$1,16(%r12),@X[1],@X[1]
1015	 vpshufb	$t3,@X[0],@X[0]
1016	vinserti128	\$1,32(%r12),@X[2],@X[2]
1017	 vpshufb	$t3,@X[1],@X[1]
1018	vinserti128	\$1,48(%r12),@X[3],@X[3]
1019
1020	lea	$TABLE(%rip),$Tbl
1021	vpshufb	$t3,@X[2],@X[2]
1022	lea	-16*$SZ(%r13),%r13
1023	vpaddd	0x00($Tbl),@X[0],$t0
1024	vpshufb	$t3,@X[3],@X[3]
1025	vpaddd	0x20($Tbl),@X[1],$t1
1026	vpaddd	0x40($Tbl),@X[2],$t2
1027	vpaddd	0x60($Tbl),@X[3],$t3
1028	vmovdqa	$t0,0x00(%rsp)
1029	xor	$a1,$a1
1030	vmovdqa	$t1,0x20(%rsp)
1031	lea	-$PUSH8(%rsp),%rsp
1032	mov	$B,$a3
1033	vmovdqa	$t2,0x00(%rsp)
1034	xor	$C,$a3			# magic
1035	vmovdqa	$t3,0x20(%rsp)
1036	mov	$F,$a4
1037	sub	\$-16*2*$SZ,$Tbl	# size optimization
1038	jmp	.Lavx2_00_47
1039
1040.align	16
1041.Lavx2_00_47:
1042	vmovdqu	(%r13),$inout
1043	vpinsrq	\$0,%r13,$offload,$offload
1044___
1045
1046sub AVX2_256_00_47 () {
1047my $j = shift;
1048my $body = shift;
1049my @X = @_;
1050my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1051my $base = "+2*$PUSH8(%rsp)";
1052
1053	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
1054	foreach (Xupdate_256_AVX()) {		# 29 instructions
1055	    eval;
1056	    eval(shift(@insns));
1057	    eval(shift(@insns));
1058	    eval(shift(@insns));
1059	}
1060	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1061	  foreach (@insns) { eval; }		# remaining instructions
1062	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1063}
1064    $aesni_cbc_idx=0;
1065    for ($i=0,$j=0; $j<4; $j++) {
1066	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1067	push(@X,shift(@X));			# rotate(@X)
1068    }
1069	&vmovq		("%r13",$offload);	# borrow $a0
1070	&vpextrq	("%r15",$offload,1);	# borrow $a2
1071	&vpand		($temp,$temp,$mask14);
1072	&vpor		($iv,$iv,$temp);
1073	&vmovdqu	("(%r15,%r13)",$iv);	# write output
1074	&lea		("%r13","16(%r13)");	# inp++
1075
1076	&lea	($Tbl,16*2*$SZ."($Tbl)");
1077	&cmpb	(($SZ-1)."($Tbl)",0);
1078	&jne	(".Lavx2_00_47");
1079
1080	&vmovdqu	($inout,"(%r13)");
1081	&vpinsrq	($offload,$offload,"%r13",0);
1082
1083    $aesni_cbc_idx=0;
1084    for ($i=0; $i<16; ) {
1085	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1086	foreach(bodyx_00_15()) { eval; }
1087    }
1088					}
1089$code.=<<___;
1090	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
1091	vmovq	$offload,%r13			# $_inp, borrow $a0
1092	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1093	add	$a1,$A
1094	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
1095
1096	vpand	$mask14,$temp,$temp
1097	vpor	$temp,$iv,$iv
1098	vmovdqu	$iv,(%r12,%r13)			# write output
1099	lea	16(%r13),%r13
1100
1101	add	$SZ*0(%r15),$A
1102	add	$SZ*1(%r15),$B
1103	add	$SZ*2(%r15),$C
1104	add	$SZ*3(%r15),$D
1105	add	$SZ*4(%r15),$E
1106	add	$SZ*5(%r15),$F
1107	add	$SZ*6(%r15),$G
1108	add	$SZ*7(%r15),$H
1109
1110	mov	$A,$SZ*0(%r15)
1111	mov	$B,$SZ*1(%r15)
1112	mov	$C,$SZ*2(%r15)
1113	mov	$D,$SZ*3(%r15)
1114	mov	$E,$SZ*4(%r15)
1115	mov	$F,$SZ*5(%r15)
1116	mov	$G,$SZ*6(%r15)
1117	mov	$H,$SZ*7(%r15)
1118
1119	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
1120	je	.Ldone_avx2
1121
1122	xor	$a1,$a1
1123	mov	$B,$a3
1124	mov	$F,$a4
1125	xor	$C,$a3			# magic
1126	jmp	.Lower_avx2
1127.align	16
1128.Lower_avx2:
1129	vmovdqu	(%r13),$inout
1130	vpinsrq	\$0,%r13,$offload,$offload
1131___
1132    $aesni_cbc_idx=0;
1133    for ($i=0; $i<16; ) {
1134	my $base="+16($Tbl)";
1135	foreach(bodyx_00_15()) { eval; }
1136	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
1137    }
1138$code.=<<___;
1139	vmovq	$offload,%r13			# borrow $a0
1140	vpextrq	\$1,$offload,%r15		# borrow $a2
1141	vpand	$mask14,$temp,$temp
1142	vpor	$temp,$iv,$iv
1143	lea	-$PUSH8($Tbl),$Tbl
1144	vmovdqu	$iv,(%r15,%r13)			# write output
1145	lea	16(%r13),%r13			# inp++
1146	cmp	%rsp,$Tbl
1147	jae	.Lower_avx2
1148
1149	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1150	lea	16*$SZ(%r13),%r13
1151	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
1152	add	$a1,$A
1153	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
1154
1155	add	$SZ*0(%r15),$A
1156	add	$SZ*1(%r15),$B
1157	add	$SZ*2(%r15),$C
1158	add	$SZ*3(%r15),$D
1159	add	$SZ*4(%r15),$E
1160	add	$SZ*5(%r15),$F
1161	add	$SZ*6(%r15),$G
1162	lea	(%rsi,%r13),%r12
1163	add	$SZ*7(%r15),$H
1164
1165	cmp	$_end,%r13
1166
1167	mov	$A,$SZ*0(%r15)
1168	cmove	%rsp,%r12		# next block or stale data
1169	mov	$B,$SZ*1(%r15)
1170	mov	$C,$SZ*2(%r15)
1171	mov	$D,$SZ*3(%r15)
1172	mov	$E,$SZ*4(%r15)
1173	mov	$F,$SZ*5(%r15)
1174	mov	$G,$SZ*6(%r15)
1175	mov	$H,$SZ*7(%r15)
1176
1177	jbe	.Loop_avx2
1178	lea	(%rsp),$Tbl
1179
1180.Ldone_avx2:
1181	lea	($Tbl),%rsp
1182	mov	$_ivp,$ivp
1183	mov	$_rsp,%rsi
1184	vmovdqu	$iv,($ivp)		# output IV
1185	vzeroall
1186___
1187$code.=<<___ if ($win64);
1188	movaps	`$framesz+16*0`(%rsp),%xmm6
1189	movaps	`$framesz+16*1`(%rsp),%xmm7
1190	movaps	`$framesz+16*2`(%rsp),%xmm8
1191	movaps	`$framesz+16*3`(%rsp),%xmm9
1192	movaps	`$framesz+16*4`(%rsp),%xmm10
1193	movaps	`$framesz+16*5`(%rsp),%xmm11
1194	movaps	`$framesz+16*6`(%rsp),%xmm12
1195	movaps	`$framesz+16*7`(%rsp),%xmm13
1196	movaps	`$framesz+16*8`(%rsp),%xmm14
1197	movaps	`$framesz+16*9`(%rsp),%xmm15
1198___
1199$code.=<<___;
1200	mov	(%rsi),%r15
1201	mov	8(%rsi),%r14
1202	mov	16(%rsi),%r13
1203	mov	24(%rsi),%r12
1204	mov	32(%rsi),%rbp
1205	mov	40(%rsi),%rbx
1206	lea	48(%rsi),%rsp
1207.Lepilogue_avx2:
1208	ret
1209.size	${func}_avx2,.-${func}_avx2
1210___
1211}}
1212}}
1213{{
1214my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1215
1216my ($rounds,$Tbl)=("%r11d","%rbx");
1217
1218my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1219my @rndkey=("%xmm4","%xmm5");
1220my $r=0;
1221my $sn=0;
1222
1223my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1224my @MSG=map("%xmm$_",(10..13));
1225
1226my $aesenc=sub {
1227  use integer;
1228  my ($n,$k)=($r/10,$r%10);
1229    if ($k==0) {
1230      $code.=<<___;
1231	movups		`16*$n`($in0),$in		# load input
1232	xorps		$rndkey0,$in
1233___
1234      $code.=<<___ if ($n);
1235	movups		$iv,`16*($n-1)`($out,$in0)	# write output
1236___
1237      $code.=<<___;
1238	xorps		$in,$iv
1239	movups		`32+16*$k-112`($key),$rndkey[1]
1240	aesenc		$rndkey[0],$iv
1241___
1242    } elsif ($k==9) {
1243      $sn++;
1244      $code.=<<___;
1245	cmp		\$11,$rounds
1246	jb		.Laesenclast$sn
1247	movups		`32+16*($k+0)-112`($key),$rndkey[1]
1248	aesenc		$rndkey[0],$iv
1249	movups		`32+16*($k+1)-112`($key),$rndkey[0]
1250	aesenc		$rndkey[1],$iv
1251	je		.Laesenclast$sn
1252	movups		`32+16*($k+2)-112`($key),$rndkey[1]
1253	aesenc		$rndkey[0],$iv
1254	movups		`32+16*($k+3)-112`($key),$rndkey[0]
1255	aesenc		$rndkey[1],$iv
1256.Laesenclast$sn:
1257	aesenclast	$rndkey[0],$iv
1258	movups		16-112($key),$rndkey[1]		# forward reference
1259	nop
1260___
1261    } else {
1262      $code.=<<___;
1263	movups		`32+16*$k-112`($key),$rndkey[1]
1264	aesenc		$rndkey[0],$iv
1265___
1266    }
1267    $r++;	unshift(@rndkey,pop(@rndkey));
1268};
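# Each $aesenc->() call above emits one "slot" of the AES-CBC schedule,
# ten slots per 16-byte block: slot 0 loads the next plaintext block,
# stores the previous ciphertext block (from block 1 on), xors in the IV
# and issues the first aesenc; slots 1-8 are plain aesenc rounds; slot 9
# adds the extra rounds for AES-192/-256 (the cmp/jb/je on $rounds) and
# ends with aesenclast.  Four blocks per 64-byte chunk means 40 slots,
# which is what the "while ($r<40)" drain loop below completes.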
1269
1270if ($shaext) {
1271my $Tbl="%rax";
1272
1273$code.=<<___;
1274.type	${func}_shaext,\@function,6
1275.align	32
1276${func}_shaext:
1277	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
1278___
1279$code.=<<___ if ($win64);
1280	lea	`-8-10*16`(%rsp),%rsp
1281	movaps	%xmm6,-8-10*16(%rax)
1282	movaps	%xmm7,-8-9*16(%rax)
1283	movaps	%xmm8,-8-8*16(%rax)
1284	movaps	%xmm9,-8-7*16(%rax)
1285	movaps	%xmm10,-8-6*16(%rax)
1286	movaps	%xmm11,-8-5*16(%rax)
1287	movaps	%xmm12,-8-4*16(%rax)
1288	movaps	%xmm13,-8-3*16(%rax)
1289	movaps	%xmm14,-8-2*16(%rax)
1290	movaps	%xmm15,-8-1*16(%rax)
1291.Lprologue_shaext:
1292___
1293$code.=<<___;
1294	lea		K256+0x80(%rip),$Tbl
1295	movdqu		($ctx),$ABEF		# DCBA
1296	movdqu		16($ctx),$CDGH		# HGFE
1297	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
1298
1299	mov		240($key),$rounds
1300	sub		$in0,$out
1301	movups		($key),$rndkey0		# $key[0]
1302	movups		($ivp),$iv		# load IV
1303	movups		16($key),$rndkey[0]	# forward reference
1304	lea		112($key),$key		# size optimization
1305
1306	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
1307	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
1308	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
1309	movdqa		$TMP,$BSWAP		# offload
1310	palignr		\$8,$CDGH,$ABEF		# ABEF
1311	punpcklqdq	$Wi,$CDGH		# CDGH
1312
1313	jmp	.Loop_shaext
1314
1315.align	16
1316.Loop_shaext:
1317	movdqu		($inp),@MSG[0]
1318	movdqu		0x10($inp),@MSG[1]
1319	movdqu		0x20($inp),@MSG[2]
1320	pshufb		$TMP,@MSG[0]
1321	movdqu		0x30($inp),@MSG[3]
1322
1323	movdqa		0*32-0x80($Tbl),$Wi
1324	paddd		@MSG[0],$Wi
1325	pshufb		$TMP,@MSG[1]
1326	movdqa		$CDGH,$CDGH_SAVE	# offload
1327	movdqa		$ABEF,$ABEF_SAVE	# offload
1328___
1329	&$aesenc();
1330$code.=<<___;
1331	sha256rnds2	$ABEF,$CDGH		# 0-3
1332	pshufd		\$0x0e,$Wi,$Wi
1333___
1334	&$aesenc();
1335$code.=<<___;
1336	sha256rnds2	$CDGH,$ABEF
1337
1338	movdqa		1*32-0x80($Tbl),$Wi
1339	paddd		@MSG[1],$Wi
1340	pshufb		$TMP,@MSG[2]
1341	lea		0x40($inp),$inp
1342___
1343	&$aesenc();
1344$code.=<<___;
1345	sha256rnds2	$ABEF,$CDGH		# 4-7
1346	pshufd		\$0x0e,$Wi,$Wi
1347___
1348	&$aesenc();
1349$code.=<<___;
1350	sha256rnds2	$CDGH,$ABEF
1351
1352	movdqa		2*32-0x80($Tbl),$Wi
1353	paddd		@MSG[2],$Wi
1354	pshufb		$TMP,@MSG[3]
1355	sha256msg1	@MSG[1],@MSG[0]
1356___
1357	&$aesenc();
1358$code.=<<___;
1359	sha256rnds2	$ABEF,$CDGH		# 8-11
1360	pshufd		\$0x0e,$Wi,$Wi
1361	movdqa		@MSG[3],$TMP
1362	palignr		\$4,@MSG[2],$TMP
1363	paddd		$TMP,@MSG[0]
1364___
1365	&$aesenc();
1366$code.=<<___;
1367	sha256rnds2	$CDGH,$ABEF
1368
1369	movdqa		3*32-0x80($Tbl),$Wi
1370	paddd		@MSG[3],$Wi
1371	sha256msg2	@MSG[3],@MSG[0]
1372	sha256msg1	@MSG[2],@MSG[1]
1373___
1374	&$aesenc();
1375$code.=<<___;
1376	sha256rnds2	$ABEF,$CDGH		# 12-15
1377	pshufd		\$0x0e,$Wi,$Wi
1378___
1379	&$aesenc();
1380$code.=<<___;
1381	movdqa		@MSG[0],$TMP
1382	palignr		\$4,@MSG[3],$TMP
1383	paddd		$TMP,@MSG[1]
1384	sha256rnds2	$CDGH,$ABEF
1385___
1386for($i=4;$i<16-3;$i++) {
1387	&$aesenc()	if (($r%10)==0);
1388$code.=<<___;
1389	movdqa		$i*32-0x80($Tbl),$Wi
1390	paddd		@MSG[0],$Wi
1391	sha256msg2	@MSG[0],@MSG[1]
1392	sha256msg1	@MSG[3],@MSG[2]
1393___
1394	&$aesenc();
1395$code.=<<___;
1396	sha256rnds2	$ABEF,$CDGH		# 16-19...
1397	pshufd		\$0x0e,$Wi,$Wi
1398	movdqa		@MSG[1],$TMP
1399	palignr		\$4,@MSG[0],$TMP
1400	paddd		$TMP,@MSG[2]
1401___
1402	&$aesenc();
1403	&$aesenc()	if ($r==19);
1404$code.=<<___;
1405	sha256rnds2	$CDGH,$ABEF
1406___
1407	push(@MSG,shift(@MSG));
1408}
1409$code.=<<___;
1410	movdqa		13*32-0x80($Tbl),$Wi
1411	paddd		@MSG[0],$Wi
1412	sha256msg2	@MSG[0],@MSG[1]
1413	sha256msg1	@MSG[3],@MSG[2]
1414___
1415	&$aesenc();
1416$code.=<<___;
1417	sha256rnds2	$ABEF,$CDGH		# 52-55
1418	pshufd		\$0x0e,$Wi,$Wi
1419	movdqa		@MSG[1],$TMP
1420	palignr		\$4,@MSG[0],$TMP
1421	paddd		$TMP,@MSG[2]
1422___
1423	&$aesenc();
1424	&$aesenc();
1425$code.=<<___;
1426	sha256rnds2	$CDGH,$ABEF
1427
1428	movdqa		14*32-0x80($Tbl),$Wi
1429	paddd		@MSG[1],$Wi
1430	sha256msg2	@MSG[1],@MSG[2]
1431	movdqa		$BSWAP,$TMP
1432___
1433	&$aesenc();
1434$code.=<<___;
1435	sha256rnds2	$ABEF,$CDGH		# 56-59
1436	pshufd		\$0x0e,$Wi,$Wi
1437___
1438	&$aesenc();
1439$code.=<<___;
1440	sha256rnds2	$CDGH,$ABEF
1441
1442	movdqa		15*32-0x80($Tbl),$Wi
1443	paddd		@MSG[2],$Wi
1444___
1445	&$aesenc();
1446	&$aesenc();
1447$code.=<<___;
1448	sha256rnds2	$ABEF,$CDGH		# 60-63
1449	pshufd		\$0x0e,$Wi,$Wi
1450___
1451	&$aesenc();
1452$code.=<<___;
1453	sha256rnds2	$CDGH,$ABEF
1454	#pxor		$CDGH,$rndkey0		# black magic
1455___
1456	while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
1457$code.=<<___;
1458	#xorps		$CDGH,$rndkey0		# black magic
1459	paddd		$CDGH_SAVE,$CDGH
1460	paddd		$ABEF_SAVE,$ABEF
1461
1462	dec		$len
1463	movups		$iv,48($out,$in0)	# write output
1464	lea		64($in0),$in0
1465	jnz		.Loop_shaext
1466
1467	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
1468	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
1469	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
1470	punpckhqdq	$CDGH,$ABEF		# DCBA
1471	palignr		\$8,$TMP,$CDGH		# HGFE
1472
1473	movups		$iv,($ivp)		# write IV
1474	movdqu		$ABEF,($ctx)
1475	movdqu		$CDGH,16($ctx)
1476___
1477$code.=<<___ if ($win64);
1478	movaps	0*16(%rsp),%xmm6
1479	movaps	1*16(%rsp),%xmm7
1480	movaps	2*16(%rsp),%xmm8
1481	movaps	3*16(%rsp),%xmm9
1482	movaps	4*16(%rsp),%xmm10
1483	movaps	5*16(%rsp),%xmm11
1484	movaps	6*16(%rsp),%xmm12
1485	movaps	7*16(%rsp),%xmm13
1486	movaps	8*16(%rsp),%xmm14
1487	movaps	9*16(%rsp),%xmm15
1488	lea	8+10*16(%rsp),%rsp
1489.Lepilogue_shaext:
1490___
1491$code.=<<___;
1492	ret
1493.size	${func}_shaext,.-${func}_shaext
1494___
1495}
1496}}}}}
1497
1498# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1499#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1500if ($win64 && $avx) {
1501$rec="%rcx";
1502$frame="%rdx";
1503$context="%r8";
1504$disp="%r9";
1505
1506$code.=<<___;
1507.extern	__imp_RtlVirtualUnwind
1508.type	se_handler,\@abi-omnipotent
1509.align	16
1510se_handler:
1511	push	%rsi
1512	push	%rdi
1513	push	%rbx
1514	push	%rbp
1515	push	%r12
1516	push	%r13
1517	push	%r14
1518	push	%r15
1519	pushfq
1520	sub	\$64,%rsp
1521
1522	mov	120($context),%rax	# pull context->Rax
1523	mov	248($context),%rbx	# pull context->Rip
1524
1525	mov	8($disp),%rsi		# disp->ImageBase
1526	mov	56($disp),%r11		# disp->HandlerData
1527
1528	mov	0(%r11),%r10d		# HandlerData[0]
1529	lea	(%rsi,%r10),%r10	# prologue label
1530	cmp	%r10,%rbx		# context->Rip<prologue label
1531	jb	.Lin_prologue
1532
1533	mov	152($context),%rax	# pull context->Rsp
1534
1535	mov	4(%r11),%r10d		# HandlerData[1]
1536	lea	(%rsi,%r10),%r10	# epilogue label
1537	cmp	%r10,%rbx		# context->Rip>=epilogue label
1538	jae	.Lin_prologue
1539___
1540$code.=<<___ if ($shaext);
1541	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
1542	cmp	%r10,%rbx
1543	jb	.Lnot_in_shaext
1544
1545	lea	(%rax),%rsi
1546	lea	512($context),%rdi	# &context.Xmm6
1547	mov	\$20,%ecx
1548	.long	0xa548f3fc		# cld; rep movsq
1549	lea	168(%rax),%rax		# adjust stack pointer
1550	jmp	.Lin_prologue
1551.Lnot_in_shaext:
1552___
1553$code.=<<___ if ($avx>1);
1554	lea	.Lavx2_shortcut(%rip),%r10
1555	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
1556	jb	.Lnot_in_avx2
1557
1558	and	\$-256*$SZ,%rax
1559	add	\$`2*$SZ*($rounds-8)`,%rax
1560.Lnot_in_avx2:
1561___
1562$code.=<<___;
1563	mov	%rax,%rsi		# put aside Rsp
1564	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp
1565	lea	48(%rax),%rax
1566
1567	mov	-8(%rax),%rbx
1568	mov	-16(%rax),%rbp
1569	mov	-24(%rax),%r12
1570	mov	-32(%rax),%r13
1571	mov	-40(%rax),%r14
1572	mov	-48(%rax),%r15
1573	mov	%rbx,144($context)	# restore context->Rbx
1574	mov	%rbp,160($context)	# restore context->Rbp
1575	mov	%r12,216($context)	# restore context->R12
1576	mov	%r13,224($context)	# restore context->R13
1577	mov	%r14,232($context)	# restore context->R14
1578	mov	%r15,240($context)	# restore context->R15
1579
1580	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6-Xmm15 save area
1581	lea	512($context),%rdi	# &context.Xmm6
1582	mov	\$20,%ecx
1583	.long	0xa548f3fc		# cld; rep movsq
1584
1585.Lin_prologue:
1586	mov	8(%rax),%rdi
1587	mov	16(%rax),%rsi
1588	mov	%rax,152($context)	# restore context->Rsp
1589	mov	%rsi,168($context)	# restore context->Rsi
1590	mov	%rdi,176($context)	# restore context->Rdi
1591
1592	mov	40($disp),%rdi		# disp->ContextRecord
1593	mov	$context,%rsi		# context
1594	mov	\$154,%ecx		# sizeof(CONTEXT)
1595	.long	0xa548f3fc		# cld; rep movsq
1596
1597	mov	$disp,%rsi
1598	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1599	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1600	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1601	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1602	mov	40(%rsi),%r10		# disp->ContextRecord
1603	lea	56(%rsi),%r11		# &disp->HandlerData
1604	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1605	mov	%r10,32(%rsp)		# arg5
1606	mov	%r11,40(%rsp)		# arg6
1607	mov	%r12,48(%rsp)		# arg7
1608	mov	%rcx,56(%rsp)		# arg8, (NULL)
1609	call	*__imp_RtlVirtualUnwind(%rip)
1610
1611	mov	\$1,%eax		# ExceptionContinueSearch
1612	add	\$64,%rsp
1613	popfq
1614	pop	%r15
1615	pop	%r14
1616	pop	%r13
1617	pop	%r12
1618	pop	%rbp
1619	pop	%rbx
1620	pop	%rdi
1621	pop	%rsi
1622	ret
1623.size	se_handler,.-se_handler
1624
1625.section	.pdata
1626	.rva	.LSEH_begin_${func}_xop
1627	.rva	.LSEH_end_${func}_xop
1628	.rva	.LSEH_info_${func}_xop
1629
1630	.rva	.LSEH_begin_${func}_avx
1631	.rva	.LSEH_end_${func}_avx
1632	.rva	.LSEH_info_${func}_avx
1633___
1634$code.=<<___ if ($avx>1);
1635	.rva	.LSEH_begin_${func}_avx2
1636	.rva	.LSEH_end_${func}_avx2
1637	.rva	.LSEH_info_${func}_avx2
1638___
1639$code.=<<___ if ($shaext);
1640	.rva	.LSEH_begin_${func}_shaext
1641	.rva	.LSEH_end_${func}_shaext
1642	.rva	.LSEH_info_${func}_shaext
1643___
1644$code.=<<___;
1645.section	.xdata
1646.align	8
1647.LSEH_info_${func}_xop:
1648	.byte	9,0,0,0
1649	.rva	se_handler
1650	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
1651
1652.LSEH_info_${func}_avx:
1653	.byte	9,0,0,0
1654	.rva	se_handler
1655	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
1656___
1657$code.=<<___ if ($avx>1);
1658.LSEH_info_${func}_avx2:
1659	.byte	9,0,0,0
1660	.rva	se_handler
1661	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
1662___
1663$code.=<<___ if ($shaext);
1664.LSEH_info_${func}_shaext:
1665	.byte	9,0,0,0
1666	.rva	se_handler
1667	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
1668___
1669}
1670
1671####################################################################
1672sub rex {
1673  local *opcode=shift;
1674  my ($dst,$src)=@_;
1675  my $rex=0;
1676
1677    $rex|=0x04			if($dst>=8);
1678    $rex|=0x01			if($src>=8);
1679    unshift @opcode,$rex|0x40	if($rex);
1680}
1681
1682{
1683  my %opcodelet = (
1684		"sha256rnds2" => 0xcb,
1685  		"sha256msg1"  => 0xcc,
1686		"sha256msg2"  => 0xcd	);
1687
1688  sub sha256op38 {
1689    my $instr = shift;
1690
1691    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1692      my @opcode=(0x0f,0x38);
1693	rex(\@opcode,$2,$1);
1694	push @opcode,$opcodelet{$instr};
1695	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
1696	return ".byte\t".join(',',@opcode);
1697    } else {
1698	return $instr."\t".@_[0];
1699    }
1700  }
1701}
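# A quick illustration of the encoder above: "sha256rnds2 %xmm1,%xmm2"
# (i.e. $ABEF,$CDGH) matches with $1=1 and $2=2, needs no REX prefix, and
# gets ModR/M 0xc0|1|(2<<3) = 0xd1, so the substitution below rewrites it
# as ".byte 0x0f,0x38,0xcb,0xd1" -- the NP 0F 38 CB /r encoding -- which
# keeps the module assembling on toolchains that predate the SHA
# extensions.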
1702
1703$code =~ s/\`([^\`]*)\`/eval $1/gem;
1704$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1705print $code;
1706close STDOUT;
1707