#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2011
#
# This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows processor
# resources to be utilized better, for better overall performance.
# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
# AESNI code is woven into them. Below are performance numbers in
# cycles per processed byte, less is better, for standalone AESNI-CBC
# encrypt, for the sum of the latter and standalone SHA1, and for the
# "stitched" subroutine:
#
#		AES-128-CBC	+SHA1		stitch      gain
# Westmere	3.77[+5.3]	9.07		6.55	    +38%
# Sandy Bridge	5.05[+5.0(6.1)]	10.06(11.15)	5.98(7.05)  +68%(+58%)
# Ivy Bridge	5.05[+4.6]	9.65		5.54        +74%
# Haswell	4.43[+3.6(4.2)]	8.00(8.58)	4.55(5.21)  +75%(+65%)
# Bulldozer	5.77[+6.0]	11.72		6.37        +84%
#
#		AES-192-CBC
# Westmere	4.51		9.81		6.80	    +44%
# Sandy Bridge	6.05		11.06(12.15)	6.11(7.19)  +81%(+69%)
# Ivy Bridge	6.05		10.65		6.07        +75%
# Haswell	5.29		8.86(9.44)	5.32(5.32)  +67%(+77%)
# Bulldozer	6.89		12.84		6.96        +84%
#
#		AES-256-CBC
# Westmere	5.25		10.55		7.21	    +46%
# Sandy Bridge	7.05		12.06(13.15)	7.12(7.72)  +69%(+70%)
# Ivy Bridge	7.05		11.65		7.12        +64%
# Haswell	6.19		9.76(10.34)	6.21(6.25)  +57%(+65%)
# Bulldozer	8.00		13.95		8.25        +69%
#
# (*)	There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
#	background information. The numbers in parentheses above are
#	SSSE3 results collected on an AVX-capable CPU, i.e. they apply
#	to OSes that don't support AVX.
#
# Needless to say, it makes no sense to implement a "stitched" *decrypt*
# subroutine: *both* AESNI-CBC decrypt and SHA1 already fully utilize
# instruction-level parallelism, so stitching would not give any gain
# anyway. Well, there might be some, e.g. because of better cache
# locality... For reference, here are performance results for
# standalone AESNI-CBC decrypt:
#
#		AES-128-CBC	AES-192-CBC	AES-256-CBC
# Westmere	1.25		1.50		1.75
# Sandy Bridge	0.74		0.91		1.09
# Ivy Bridge	0.74		0.90		1.11
# Haswell	0.63		0.76		0.88
# Bulldozer	0.70		0.85		0.99

# And indeed:
#
#		AES-256-CBC	+SHA1		stitch      gain
# Westmere	1.75		7.20		6.68        +7.8%
# Sandy Bridge	1.09		6.09(7.22)	5.82(6.95)  +4.6%(+3.9%)
# Ivy Bridge	1.11		5.70		5.45        +4.6%
# Haswell	0.88		4.45(5.00)	4.39(4.69)  +1.4%(*)(+6.6%)
# Bulldozer	0.99		6.95		5.95        +17%(**)
#
# (*)	The tiny improvement on Haswell comes from comparing the AVX1
#	stitch to a sum that includes AVX2 SHA1.
# (**)	Execution is fully dominated by the integer code sequence, and
#	SIMD still hardly shows [in single-process benchmark;-]

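# To make the "stitch" concrete, here is a rough scalar model of the
# generated loop (an illustration only, not the actual code; f(), K(),
# aes_slot() and aes_round() are hypothetical stand-ins for the SHA1
# round function/constant selection and for a single aesenc step):
#
#	for (i = 0; i < 80; i++) {
#		e += ROL32(a,5) + f(i,b,c,d) + K(i) + W[i];	/* SHA1 */
#		if (aes_slot(i))			/* ~half the rounds */
#			blk = aes_round(blk, rk[r++]);	/* one aesenc */
#		b = ROL32(b,30);
#		t = e; e = d; d = c; c = b; b = a; a = t;
#	}
#
# The two dependency chains are independent, so an out-of-order core can
# execute them in parallel instead of stalling on either one.
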
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);
$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);

$shaext=1;	### set to zero if compiling for 1.0.1

$stitched_decrypt=0;

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# void aesni_cbc_sha1_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA_CTX *ctx,
#			const void *in0);

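# A hedged usage sketch, not a prototype from OpenSSL headers: judging
# from the prologue below ("shl \$6,$len"), "length" is measured in
# 64-byte blocks, and, going by the register assignment, the first
# pointer feeds the AES-CBC pipe while the seventh feeds the SHA1 pipe,
# so a caller can hash at an offset into the very buffer being
# encrypted ("n", "enc_in" and "hash_in" are illustrative names):
#
#	/* encrypt n 64-byte blocks at enc_in, hash n blocks at hash_in */
#	aesni_cbc_sha1_enc(enc_in, enc_out, n, &aes_key, iv, &sha_ctx, hash_in);
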
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	aesni_cbc_sha1_enc
.type	aesni_cbc_sha1_enc,\@abi-omnipotent
.align	32
aesni_cbc_sha1_enc:
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11
___
$code.=<<___ if ($shaext);
	bt	\$61,%r11		# check SHA bit
	jc	aesni_cbc_sha1_enc_shaext
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni_cbc_sha1_enc_avx
___
$code.=<<___;
	jmp	aesni_cbc_sha1_enc_ssse3
	ret
.size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___

my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;
my $K_XX_XX="%r11";
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));			# for enc
my @rndkey=("%xmm14","%xmm15");					# for enc
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));	# for dec
if (1) {	# reassign for Atom Silvermont
    # The goal is to minimize the number of instructions with more than
    # 3 prefix bytes. Or, in more practical terms, to keep AES-NI *and*
    # SSSE3 instructions in the upper half of the register bank.
    @X=map("%xmm$_",(8..11,4..7));
    @Tx=map("%xmm$_",(12,13,3));
    ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
    @rndkey=("%xmm0","%xmm1");
}

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

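# For example, &ror("%ebx",7) is caught by AUTOLOAD above: the last
# argument is popped, prefixed with '$' when numeric, and the remaining
# operands follow in AT&T operand order.
#
#	&ror("%ebx",7);		# appends "\tror\t$7,%ebx\n" to $code
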
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

$code.=<<___;
.type	aesni_cbc_sha1_enc_ssse3,\@function,6
.align	32
aesni_cbc_sha1_enc_ssse3:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_ssse3		# debugging artefact
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	movdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240-112($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@Tx[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@Tx[2],@X[-4&7]		# byte swap
	pshufb	@Tx[2],@X[-3&7]
	pshufb	@Tx[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@Tx[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	movups	-112($key),$rndkey0	# $key[0]
	movups	16-112($key),$rndkey[0]	# forward reference
	jmp	.Loop_ssse3
___

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};
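
# Per 16-byte block the closure above emits the standard CBC-encrypt
# recurrence, spread over ten SHA1-round "slots". A plain-C sketch,
# with aes_enc_rounds() as a hypothetical stand-in for the
# aesenc/aesenclast ladder:
#
#	iv = aes_enc_rounds(iv ^ in[n], key, rounds);
#	out[n] = iv;
#
# The "cmp \$11,$rounds" tail in slot 9 runs 0, 2 or 4 extra aesenc for
# AES-128/192/256 respectively before the final aesenclast.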

sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
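
# The sub above is the SSSE3 vectorization of the SHA1 message schedule,
# producing four W[] elements per call. For reference, the scalar
# recurrence is
#
#	W[i] = ROL32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1);
#
# Within a 4-wide batch W[i-3] is not available for every lane, so the
# code first rotates with the missing dword zeroed and then patches it
# back in, which is what the "<<<=1" and "(X[0]>>96)<<<2" steps above do.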

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(shift);

	unshift(@Tx,pop(@Tx));

	&movdqa	(@Tx[2],"64($K_XX_XX)");	# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@Tx[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

my @body_00_19 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&xor	(@T[0],$d);',
	'&mov	(@T[1],$a);',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&xor	($b,$c);',	# $c^$d for next round

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&and	(@T[1],$b);',	# ($b&($c^$d)) for next round

	'&xor	($b,$c);',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

sub body_00_19 () {	# ((c^d)&b)^d
    # on start @T[0]=(c^d)&b
    return &body_20_39() if ($rx==19); $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_00_19;

	$n = scalar(@r);
	$k = (($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
	$jj++;

    return @r;
}

my @body_20_39 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19);',	# ($b^$d^$c)
	'&mov	(@T[1],$a);',	# $b for next round

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	(@T[1],$c)	if ($j< 79);',	# $b^$d for next round

	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

sub body_20_39 () {	# b^d^c
    # on entry @T[0]=b^d
    return &body_40_59() if ($rx==39); $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_20_39;

	$n = scalar(@r);
	$k = (($jj+1)*8/20)*20*$n/8;	# 8 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n && $rx!=20);
	$jj++;

    return @r;
}

my @body_40_59 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40);',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40);',	# restore $c

	'&$_ror	($b,7);',	# $b>>>2
	'&mov	(@T[1],$a);',	# $b for next round
	'&xor	(@T[0],$c);',

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59);',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59);',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

sub body_40_59 () {	# ((b^c)&(c^d))^c
    # on entry @T[0]=(b^c), (c^=d)
    $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_40_59;

	$n = scalar(@r);
	$k=(($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n && $rx!=40);
	$jj++;

    return @r;
}
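
# For reference, the three bodies compute the standard SHA1 round
# functions, in the register-thriftier equivalent forms noted in the
# comments (plain-C sketch):
#
#	/* 0..19  */ f = ((c ^ d) & b) ^ d;	   /* == (b&c)|(~b&d)      */
#	/* 20..39 */ f = b ^ c ^ d;
#	/* 40..59 */ f = ((b ^ c) & (c ^ d)) ^ c;  /* == (b&c)|(b&d)|(c&d) */
#	e += ROL32(a,5) + f + K + W[j]; b = ROL32(b,30);
#
# The aesenc budget is 12+8+12+8 = 40 slots over 80 rounds, i.e. exactly
# ten AES rounds for each of the four 16-byte blocks that one 64-byte
# SHA1 block covers.
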
$code.=<<___;
.align	32
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3");	# can jump to "done"

				$saved_j=$j; @saved_V=@V;
				$saved_r=$r; @saved_rndkey=@rndkey;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.Ldone_ssse3:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	movups	$iv,($ivp)			# write IV
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___

						if ($stitched_decrypt) {{{
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$j=$jj=$r=$rx=0;
$Xi=4;

# reassign for Atom Silvermont (see above)
($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
@X=map("%xmm$_",(8..13,6,7));
@Tx=map("%xmm$_",(14,15,5));

my @aes256_dec = (
	'&movdqu($inout0,"0x00($in0)");',
	'&movdqu($inout1,"0x10($in0)");	&pxor	($inout0,$rndkey0);',
	'&movdqu($inout2,"0x20($in0)");	&pxor	($inout1,$rndkey0);',
	'&movdqu($inout3,"0x30($in0)");	&pxor	($inout2,$rndkey0);',

	'&pxor	($inout3,$rndkey0);	&movups	($rndkey0,"16-112($key)");',
	'&movaps("64(%rsp)",@X[2]);',	# save IV, originally @X[3]
	undef,undef
	);
for ($i=0;$i<13;$i++) {
    push (@aes256_dec,(
	'&aesdec	($inout0,$rndkey0);',
	'&aesdec	($inout1,$rndkey0);',
	'&aesdec	($inout2,$rndkey0);',
	'&aesdec	($inout3,$rndkey0);	&movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
	));
    push (@aes256_dec,(undef,undef))	if (($i>=3 && $i<=5) || $i>=11);
    push (@aes256_dec,(undef,undef))	if ($i==5);
}
push(@aes256_dec,(
	'&aesdeclast	($inout0,$rndkey0);	&movups	(@X[0],"0x00($in0)");',
	'&aesdeclast	($inout1,$rndkey0);	&movups	(@X[1],"0x10($in0)");',
	'&aesdeclast	($inout2,$rndkey0);	&movups	(@X[2],"0x20($in0)");',
	'&aesdeclast	($inout3,$rndkey0);	&movups	(@X[3],"0x30($in0)");',

	'&xorps		($inout0,"64(%rsp)");	&movdqu	($rndkey0,"-112($key)");',
	'&xorps		($inout1,@X[0]);	&movups	("0x00($out,$in0)",$inout0);',
	'&xorps		($inout2,@X[1]);	&movups	("0x10($out,$in0)",$inout1);',
	'&xorps		($inout3,@X[2]);	&movups	("0x20($out,$in0)",$inout2);',

	'&movups	("0x30($out,$in0)",$inout3);'
	));
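
# Unlike encryption, CBC decryption has no inter-block dependency, which
# is why the schedule above processes four blocks in parallel. A plain-C
# sketch of the recurrence, with aes_dec_rounds() as a hypothetical
# stand-in for the aesdec/aesdeclast ladder:
#
#	out[i] = aes_dec_rounds(in[i], key) ^ in[i-1];	/* in[-1] is the IV */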

sub body_00_19_dec () {	# ((c^d)&b)^d
    # on start @T[0]=(c^d)&b
    return &body_20_39_dec() if ($rx==19);

    my @r=@body_00_19;

	unshift (@r,@aes256_dec[$rx])	if (@aes256_dec[$rx]);
	$rx++;

    return @r;
}

sub body_20_39_dec () {	# b^d^c
    # on entry @T[0]=b^d
    return &body_40_59_dec() if ($rx==39);

    my @r=@body_20_39;

	unshift (@r,@aes256_dec[$rx])	if (@aes256_dec[$rx]);
	$rx++;

    return @r;
}

sub body_40_59_dec () {	# ((b^c)&(c^d))^c
    # on entry @T[0]=(b^c), (c^=d)

    my @r=@body_40_59;

	unshift (@r,@aes256_dec[$rx])	if (@aes256_dec[$rx]);
	$rx++;

    return @r;
}

$code.=<<___;
.globl	aesni256_cbc_sha1_dec
.type	aesni256_cbc_sha1_dec,\@abi-omnipotent
.align	32
aesni256_cbc_sha1_dec:
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni256_cbc_sha1_dec_avx
___
$code.=<<___;
	jmp	aesni256_cbc_sha1_dec_ssse3
	ret
.size	aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec

.type	aesni256_cbc_sha1_dec_ssse3,\@function,6
.align	32
aesni256_cbc_sha1_dec_ssse3:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_dec_ssse3:
___
$code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	movdqu	($ivp),@X[3]			# load IV
	#mov	$ivp,88(%rsp)			# save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@Tx[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@Tx[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@Tx[2],@X[-3&7]
	pshufb	@Tx[2],@X[-2&7]
	pshufb	@Tx[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	movdqu	-112($key),$rndkey0	# $key[0]
	jmp	.Loop_dec_ssse3

.align	32
.Loop_dec_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_32_79(\&body_00_19_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3");	# can jump to "done"

				$saved_j=$j;   @saved_V=@V;
				$saved_rx=$rx;

	&Xloop_ssse3(\&body_20_39_dec);
	&Xloop_ssse3(\&body_20_39_dec);
	&Xloop_ssse3(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_dec_ssse3

.Ldone_dec_ssse3:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$rx=$saved_rx;

	&Xtail_ssse3(\&body_20_39_dec);
	&Xtail_ssse3(\&body_20_39_dec);
	&Xtail_ssse3(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	movups	@X[3],($ivp)			# write IV
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_dec_ssse3:
	ret
.size	aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
___
						}}}
$j=$jj=$r=$rx=0;

if ($avx) {
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
my @rndkey=("%xmm14","%xmm15");
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));	# for dec
my $Kx=@Tx[2];

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	aesni_cbc_sha1_enc_avx,\@function,6
.align	32
aesni_cbc_sha1_enc_avx:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_avx			# debugging artefact
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_avx:
___
$code.=<<___;
	vzeroall
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	vmovdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240-112($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),$Kx		# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	vmovups	-112($key),$rndkey[1]	# $key[0]
	vmovups	16-112($key),$rndkey[0]	# forward reference
	jmp	.Loop_avx
___

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	vmovdqu		`16*$n`($in0),$in		# load input
	vpxor		$rndkey[1],$in,$in
___
      $code.=<<___ if ($n);
	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	vpxor		$in,$iv,$iv
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
	je		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
.Lvaesenclast$sn:
	vaesenclast	$rndkey[0],$iv,$iv
	vmovups		-112($key),$rndkey[0]
	vmovups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[1],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpsrld	(@Tx[0],@Tx[1],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[1],@Tx[1],2);
	&vpxor	(@X[0],@X[0],@Tx[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(16*(($Xi)/5))."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(16*($Xi/5))."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(shift);

	&vmovdqa(@Tx[1],"64($K_XX_XX)");	# pbswap mask
	&vmovdqa($Kx,"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@Tx[1]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@Tx[0],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	32
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39,".Ldone_avx");	# can jump to "done"

				$saved_j=$j; @saved_V=@V;
				$saved_r=$r; @saved_rndkey=@rndkey;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.Ldone_avx:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	vmovups	$iv,($ivp)			# write IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
___

						if ($stitched_decrypt) {{{
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$j=$jj=$r=$rx=0;
$Xi=4;

@aes256_dec = (
	'&vpxor	($inout0,$rndkey0,"0x00($in0)");',
	'&vpxor	($inout1,$rndkey0,"0x10($in0)");',
	'&vpxor	($inout2,$rndkey0,"0x20($in0)");',
	'&vpxor	($inout3,$rndkey0,"0x30($in0)");',

	'&vmovups($rndkey0,"16-112($key)");',
	'&vmovups("64(%rsp)",@X[2]);',		# save IV, originally @X[3]
	undef,undef
	);
for ($i=0;$i<13;$i++) {
    push (@aes256_dec,(
	'&vaesdec	($inout0,$inout0,$rndkey0);',
	'&vaesdec	($inout1,$inout1,$rndkey0);',
	'&vaesdec	($inout2,$inout2,$rndkey0);',
	'&vaesdec	($inout3,$inout3,$rndkey0);	&vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
	));
    push (@aes256_dec,(undef,undef))	if (($i>=3 && $i<=5) || $i>=11);
    push (@aes256_dec,(undef,undef))	if ($i==5);
}
push(@aes256_dec,(
	'&vaesdeclast	($inout0,$inout0,$rndkey0);	&vmovups(@X[0],"0x00($in0)");',
	'&vaesdeclast	($inout1,$inout1,$rndkey0);	&vmovups(@X[1],"0x10($in0)");',
	'&vaesdeclast	($inout2,$inout2,$rndkey0);	&vmovups(@X[2],"0x20($in0)");',
	'&vaesdeclast	($inout3,$inout3,$rndkey0);	&vmovups(@X[3],"0x30($in0)");',

	'&vxorps	($inout0,$inout0,"64(%rsp)");	&vmovdqu($rndkey0,"-112($key)");',
	'&vxorps	($inout1,$inout1,@X[0]);	&vmovups("0x00($out,$in0)",$inout0);',
	'&vxorps	($inout2,$inout2,@X[1]);	&vmovups("0x10($out,$in0)",$inout1);',
	'&vxorps	($inout3,$inout3,@X[2]);	&vmovups("0x20($out,$in0)",$inout2);',

	'&vmovups	("0x30($out,$in0)",$inout3);'
	));

$code.=<<___;
.type	aesni256_cbc_sha1_dec_avx,\@function,6
.align	32
aesni256_cbc_sha1_dec_avx:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_dec_avx:
___
$code.=<<___;
	vzeroall
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	vmovdqu	($ivp),@X[3]			# load IV
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),$Kx		# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	vmovups	-112($key),$rndkey0	# $key[0]
	jmp	.Loop_dec_avx

.align	32
.Loop_dec_avx:
___
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_32_79(\&body_00_19_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx");	# can jump to "done"

				$saved_j=$j; @saved_V=@V;
				$saved_rx=$rx;

	&Xloop_avx(\&body_20_39_dec);
	&Xloop_avx(\&body_20_39_dec);
	&Xloop_avx(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_dec_avx

.Ldone_dec_avx:
___
				$jj=$j=$saved_j; @V=@saved_V;
				$rx=$saved_rx;

	&Xtail_avx(\&body_20_39_dec);
	&Xtail_avx(\&body_20_39_dec);
	&Xtail_avx(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	vmovups	@X[3],($ivp)			# write IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_dec_avx:
	ret
.size	aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
___
						}}}
}
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0

.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
						if ($shaext) {{{
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$rounds="%r11d";

($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
@rndkey=("%xmm0","%xmm1");
$r=0;

my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
my @MSG=map("%xmm$_",(3..6));

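# A rough intrinsics model of the loop below, assuming the usual
# immintrin.h names (illustration only): sha1rnds4 performs four rounds
# on the packed a..d state, sha1nexte folds the rotated e into the next
# message quadruple, and sha1msg1/sha1msg2 implement the two halves of
# the message-schedule recurrence:
#
#	E    = _mm_sha1nexte_epu32(E, MSG0);		/* e += W[i..i+3] */
#	ABCD = _mm_sha1rnds4_epu32(ABCD, E, 0);		/* f selector 0..3 */
#	MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
#	MSG0 = _mm_sha1msg2_epu32(_mm_xor_si128(MSG0, MSG2), MSG3);
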
$code.=<<___;
.type	aesni_cbc_sha1_enc_shaext,\@function,6
.align	32
aesni_cbc_sha1_enc_shaext:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0x50(%rip),$BSWAP	# byte-n-word swap

	mov	240($key),$rounds
	sub	$in0,$out
	movups	($key),$rndkey0			# $key[0]
	movups	($ivp),$iv			# load IV
	movups	16($key),$rndkey[0]		# forward reference
	lea	112($key),$key			# size optimization

	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	pshufd	\$0b00011011,$E,$E		# flip word order
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
___
	&$aesenc();
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$E,$E_SAVE		# offload $E
	pshufb		$BSWAP,@MSG[0]
	movdqu		0x10($inp),@MSG[1]
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
	&$aesenc();
$code.=<<___;
	pshufb		$BSWAP,@MSG[1]

	paddd		@MSG[0],$E
	movdqu		0x20($inp),@MSG[2]
	lea		0x40($inp),$inp
	pxor		$E_SAVE,@MSG[0]		# black magic
___
	&$aesenc();
$code.=<<___;
	pxor		$E_SAVE,@MSG[0]		# black magic
	movdqa		$ABCD,$E_
	pshufb		$BSWAP,@MSG[2]
	sha1rnds4	\$0,$E,$ABCD		# 0-3
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqu		-0x10($inp),@MSG[3]
	movdqa		$ABCD,$E
	pshufb		$BSWAP,@MSG[3]
___
	&$aesenc();
$code.=<<___;
	sha1rnds4	\$0,$E_,$ABCD		# 4-7
	sha1nexte	@MSG[2],$E
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
___
	&$aesenc();

for($i=2;$i<20-4;$i++) {
$code.=<<___;
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 8-11
	sha1nexte	@MSG[3],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg2	@MSG[3],@MSG[0]
	pxor		@MSG[3],@MSG[1]
	sha1msg1	@MSG[3],@MSG[2]
___
	($E,$E_)=($E_,$E);
	push(@MSG,shift(@MSG));

	&$aesenc();
}
$code.=<<___;
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[3],$E_
	sha1msg2	@MSG[3],@MSG[0]
	pxor		@MSG[3],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[0],$E
	sha1msg2	@MSG[0],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa		$E_SAVE,@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$MSG[0],$E
___
	while($r<40)	{ &$aesenc(); }		# remaining aesenc's
$code.=<<___;
	dec		$len

	paddd		$ABCD_SAVE,$ABCD
	movups		$iv,48($out,$in0)	# write output
	lea		64($in0),$in0
	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movups	$iv,($ivp)			# write IV
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-10*16(%rax),%xmm6
	movaps	-8-9*16(%rax),%xmm7
	movaps	-8-8*16(%rax),%xmm8
	movaps	-8-7*16(%rax),%xmm9
	movaps	-8-6*16(%rax),%xmm10
	movaps	-8-5*16(%rax),%xmm11
	movaps	-8-4*16(%rax),%xmm12
	movaps	-8-3*16(%rax),%xmm13
	movaps	-8-2*16(%rax),%xmm14
	movaps	-8-1*16(%rax),%xmm15
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
___
						}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha1_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lseh_no_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lcommon_seh_tail
.Lseh_no_shaext:
___
$code.=<<___;
	lea	96(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`104+10*16`(%rax),%rax	# adjust stack pointer

	mov	0(%rax),%r15
	mov	8(%rax),%r14
	mov	16(%rax),%r13
	mov	24(%rax),%r12
	mov	32(%rax),%rbp
	mov	40(%rax),%rbx
	lea	48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_end_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_info_aesni_cbc_sha1_enc_shaext
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_cbc_sha1_enc_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_cbc_sha1_enc_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_aesni_cbc_sha1_enc_shaext:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x3a,0xcc);
	rex(\@opcode,$3,$2);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}
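
# For example, "sha1rnds4	\$3,%xmm0,%xmm8" is emitted as
# ".byte	0x44,0x0f,0x3a,0xcc,0xc0,3": REX.R (0x44) for the %xmm8
# destination, the 0f3acc opcode, ModR/M 0xc0 (reg=%xmm8&7, rm=%xmm0),
# then the immediate. This keeps the generated file assembling on
# toolchains that predate the SHA extensions.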

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

sub aesni {
  my $line=shift;
  my @opcode=(0x0f,0x38);

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	unshift @opcode,0x66;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
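
# Likewise, "aesenc	%xmm15,%xmm2" is emitted as
# ".byte	0x66,0x41,0x0f,0x38,0xdc,0xd7": the mandatory 0x66 prefix,
# REX.B (0x41) for the %xmm15 source, the 0f38dc opcode and ModR/M 0xd7
# (reg=%xmm2, rm=%xmm15&7), so the file assembles even without AES-NI
# support in the assembler.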

foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo		or
	s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;

	print $_,"\n";
}
close STDOUT;
