bsaes-x86_64.pl revision 264331
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode]				###
5### bitsliced implementation for Intel Core 2 processors	###
6### requires support of SSE extensions up to SSSE3		###
7### Author: Emilia K��sper and Peter Schwabe			###
8### Date: 2009-03-19						###
9### Public domain						###
10###								###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
12### further information.					###
13###################################################################
14#
15# September 2011.
16#
17# Started as transliteration to "perlasm" the original code has
18# undergone following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop resulting in >5x size reduction
22#   from 12.5KB to 2.2KB;
23# - above was possibile thanks to mixcolumns() modification that
24#   allowed to feed its output back to aesenc[last], this was
25#   achieved at cost of two additional inter-registers moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement key setup subroutine, instead it
28#   relies on conversion of "conventional" key schedule as returned
29#   by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which allowed
31#   to skip one shiftrows(), reduce bit-sliced key schedule and
32#   speed-up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38#		Emilia's	this(*)		difference
39#
40# Core 2    	9.30		8.69		+7%
41# Nehalem(**) 	7.63		6.98		+9%
42# Atom	    	17.1		17.4		-2%(***)
43#
44# (*)	Comparison is not completely fair, because "this" is ECB,
45#	i.e. no extra processing such as counter values calculation
46#	and xor-ing input as in Emilia's CTR implementation is
47#	performed. However, the CTR calculations stand for not more
48#	than 1% of total time, so comparison is *rather* fair.
49#
50# (**)	Results were collected on Westmere, which is considered to
51#	be equivalent to Nehalem for this code.
52#
53# (***)	Slowdown on Atom is rather strange per se, because original
54#	implementation has a number of 9+-bytes instructions, which
55#	are bad for Atom front-end, and which I eliminated completely.
56#	In attempt to address deterioration sbox() was tested in FP
57#	SIMD "domain" (movaps instead of movdqa, xorps instead of
58#	pxor, etc.). While it resulted in nominal 4% improvement on
59#	Atom, it hurted Westmere by more than 2x factor.
60#
61# As for key schedule conversion subroutine. Interface to OpenSSL
62# relies on per-invocation on-the-fly conversion. This naturally
63# has impact on performance, especially for short inputs. Conversion
64# time in CPU cycles and its ratio to CPU cycles spent in 8x block
65# function is:
66#
67# 		conversion	conversion/8x block
68# Core 2	240		0.22
69# Nehalem	180		0.20
70# Atom		430		0.19
71#
72# The ratio values mean that 128-byte blocks will be processed
73# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74# etc. Then keep in mind that input sizes not divisible by 128 are
75# *effectively* slower, especially shortest ones, e.g. consecutive
76# 144-byte blocks are processed 44% slower than one would expect,
77# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78# it's still faster than ["hyper-threading-safe" code path in]
79# aes-x86_64.pl on all lengths above 64 bytes...
80#
81# October 2011.
82#
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of 4096-byte buffer with 128-bit key is:
85#
86# Core 2	9.83
87# Nehalem	7.74
88# Atom		19.0
89#
90# November 2011.
91#
92# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93# suboptimal, but XTS is meant to be used with larger blocks...
94#
95#						<appro@openssl.org>
96
97$flavour = shift;
98$output  = shift;
99if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100
101$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102
103$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106die "can't locate x86_64-xlate.pl";
107
108open OUT,"| \"$^X\" $xlate $flavour $output";
109*STDOUT=*OUT;
110
111my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
112my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
113my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
114
115{
116my ($key,$rounds,$const)=("%rax","%r10d","%r11");
117
118sub Sbox {
119# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
120# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
121my @b=@_[0..7];
122my @t=@_[8..11];
123my @s=@_[12..15];
124	&InBasisChange	(@b);
125	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
126	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
127}
128
129sub InBasisChange {
130# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
131# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
132my @b=@_[0..7];
133$code.=<<___;
134	pxor	@b[6], @b[5]
135	pxor	@b[1], @b[2]
136	pxor	@b[0], @b[3]
137	pxor	@b[2], @b[6]
138	pxor 	@b[0], @b[5]
139
140	pxor	@b[3], @b[6]
141	pxor	@b[7], @b[3]
142	pxor	@b[5], @b[7]
143	pxor	@b[4], @b[3]
144	pxor	@b[5], @b[4]
145	pxor	@b[1], @b[3]
146
147	pxor	@b[7], @b[2]
148	pxor	@b[5], @b[1]
149___
150}
151
152sub OutBasisChange {
153# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
154# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
155my @b=@_[0..7];
156$code.=<<___;
157	pxor	@b[6], @b[0]
158	pxor	@b[4], @b[1]
159	pxor	@b[0], @b[2]
160	pxor	@b[6], @b[4]
161	pxor	@b[1], @b[6]
162
163	pxor	@b[5], @b[1]
164	pxor	@b[3], @b[5]
165	pxor	@b[7], @b[3]
166	pxor	@b[5], @b[7]
167	pxor	@b[5], @b[2]
168
169	pxor	@b[7], @b[4]
170___
171}
172
173sub InvSbox {
174# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
175# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
176my @b=@_[0..7];
177my @t=@_[8..11];
178my @s=@_[12..15];
179	&InvInBasisChange	(@b);
180	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
181	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
182}
183
184sub InvInBasisChange {		# OutBasisChange in reverse
185my @b=@_[5,1,2,6,3,7,0,4];
186$code.=<<___
187	pxor	@b[7], @b[4]
188
189	pxor	@b[5], @b[7]
190	pxor	@b[5], @b[2]
191	pxor	@b[7], @b[3]
192	pxor	@b[3], @b[5]
193	pxor	@b[5], @b[1]
194
195	pxor	@b[1], @b[6]
196	pxor	@b[0], @b[2]
197	pxor	@b[6], @b[4]
198	pxor	@b[6], @b[0]
199	pxor	@b[4], @b[1]
200___
201}
202
203sub InvOutBasisChange {		# InBasisChange in reverse
204my @b=@_[2,5,7,3,6,1,0,4];
205$code.=<<___;
206	pxor	@b[5], @b[1]
207	pxor	@b[7], @b[2]
208
209	pxor	@b[1], @b[3]
210	pxor	@b[5], @b[4]
211	pxor	@b[5], @b[7]
212	pxor	@b[4], @b[3]
213	 pxor 	@b[0], @b[5]
214	pxor	@b[7], @b[3]
215	 pxor	@b[2], @b[6]
216	 pxor	@b[1], @b[2]
217	pxor	@b[3], @b[6]
218
219	pxor	@b[0], @b[3]
220	pxor	@b[6], @b[5]
221___
222}
223
224sub Mul_GF4 {
225#;*************************************************************
226#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
227#;*************************************************************
228my ($x0,$x1,$y0,$y1,$t0)=@_;
229$code.=<<___;
230	movdqa	$y0, $t0
231	pxor 	$y1, $t0
232	pand	$x0, $t0
233	pxor	$x1, $x0
234	pand	$y0, $x1
235	pand	$y1, $x0
236	pxor	$x1, $x0
237	pxor	$t0, $x1
238___
239}
240
241sub Mul_GF4_N {				# not used, see next subroutine
242# multiply and scale by N
243my ($x0,$x1,$y0,$y1,$t0)=@_;
244$code.=<<___;
245	movdqa	$y0, $t0
246	pxor	$y1, $t0
247	pand	$x0, $t0
248	pxor	$x1, $x0
249	pand	$y0, $x1
250	pand	$y1, $x0
251	pxor	$x0, $x1
252	pxor	$t0, $x0
253___
254}
255
256sub Mul_GF4_N_GF4 {
257# interleaved Mul_GF4_N and Mul_GF4
258my ($x0,$x1,$y0,$y1,$t0,
259    $x2,$x3,$y2,$y3,$t1)=@_;
260$code.=<<___;
261	movdqa	$y0, $t0
262	 movdqa	$y2, $t1
263	pxor	$y1, $t0
264	 pxor 	$y3, $t1
265	pand	$x0, $t0
266	 pand	$x2, $t1
267	pxor	$x1, $x0
268	 pxor	$x3, $x2
269	pand	$y0, $x1
270	 pand	$y2, $x3
271	pand	$y1, $x0
272	 pand	$y3, $x2
273	pxor	$x0, $x1
274	 pxor	$x3, $x2
275	pxor	$t0, $x0
276	 pxor	$t1, $x3
277___
278}
279sub Mul_GF16_2 {
280my @x=@_[0..7];
281my @y=@_[8..11];
282my @t=@_[12..15];
283$code.=<<___;
284	movdqa	@x[0], @t[0]
285	movdqa	@x[1], @t[1]
286___
287	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
288$code.=<<___;
289	pxor	@x[2], @t[0]
290	pxor	@x[3], @t[1]
291	pxor	@y[2], @y[0]
292	pxor	@y[3], @y[1]
293___
294	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
295			 @x[2], @x[3], @y[2], @y[3], @t[2]);
296$code.=<<___;
297	pxor	@t[0], @x[0]
298	pxor	@t[0], @x[2]
299	pxor	@t[1], @x[1]
300	pxor	@t[1], @x[3]
301
302	movdqa	@x[4], @t[0]
303	movdqa	@x[5], @t[1]
304	pxor	@x[6], @t[0]
305	pxor	@x[7], @t[1]
306___
307	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
308			 @x[6], @x[7], @y[2], @y[3], @t[2]);
309$code.=<<___;
310	pxor	@y[2], @y[0]
311	pxor	@y[3], @y[1]
312___
313	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
314$code.=<<___;
315	pxor	@t[0], @x[4]
316	pxor	@t[0], @x[6]
317	pxor	@t[1], @x[5]
318	pxor	@t[1], @x[7]
319___
320}
321sub Inv_GF256 {
322#;********************************************************************
323#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
324#;********************************************************************
325my @x=@_[0..7];
326my @t=@_[8..11];
327my @s=@_[12..15];
328# direct optimizations from hardware
329$code.=<<___;
330	movdqa	@x[4], @t[3]
331	movdqa	@x[5], @t[2]
332	movdqa	@x[1], @t[1]
333	movdqa	@x[7], @s[1]
334	movdqa	@x[0], @s[0]
335
336	pxor	@x[6], @t[3]
337	pxor	@x[7], @t[2]
338	pxor	@x[3], @t[1]
339	 movdqa	@t[3], @s[2]
340	pxor	@x[6], @s[1]
341	 movdqa	@t[2], @t[0]
342	pxor	@x[2], @s[0]
343	 movdqa	@t[3], @s[3]
344
345	por	@t[1], @t[2]
346	por	@s[0], @t[3]
347	pxor	@t[0], @s[3]
348	pand	@s[0], @s[2]
349	pxor	@t[1], @s[0]
350	pand	@t[1], @t[0]
351	pand	@s[0], @s[3]
352	movdqa	@x[3], @s[0]
353	pxor	@x[2], @s[0]
354	pand	@s[0], @s[1]
355	pxor	@s[1], @t[3]
356	pxor	@s[1], @t[2]
357	movdqa	@x[4], @s[1]
358	movdqa	@x[1], @s[0]
359	pxor	@x[5], @s[1]
360	pxor	@x[0], @s[0]
361	movdqa	@s[1], @t[1]
362	pand	@s[0], @s[1]
363	por	@s[0], @t[1]
364	pxor	@s[1], @t[0]
365	pxor	@s[3], @t[3]
366	pxor	@s[2], @t[2]
367	pxor	@s[3], @t[1]
368	movdqa	@x[7], @s[0]
369	pxor	@s[2], @t[0]
370	movdqa	@x[6], @s[1]
371	pxor	@s[2], @t[1]
372	movdqa	@x[5], @s[2]
373	pand	@x[3], @s[0]
374	movdqa	@x[4], @s[3]
375	pand	@x[2], @s[1]
376	pand	@x[1], @s[2]
377	por	@x[0], @s[3]
378	pxor	@s[0], @t[3]
379	pxor	@s[1], @t[2]
380	pxor	@s[2], @t[1]
381	pxor	@s[3], @t[0]
382
383	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
384
385	# new smaller inversion
386
387	movdqa	@t[3], @s[0]
388	pand	@t[1], @t[3]
389	pxor	@t[2], @s[0]
390
391	movdqa	@t[0], @s[2]
392	movdqa	@s[0], @s[3]
393	pxor	@t[3], @s[2]
394	pand	@s[2], @s[3]
395
396	movdqa	@t[1], @s[1]
397	pxor	@t[2], @s[3]
398	pxor	@t[0], @s[1]
399
400	pxor	@t[2], @t[3]
401
402	pand	@t[3], @s[1]
403
404	movdqa	@s[2], @t[2]
405	pxor	@t[0], @s[1]
406
407	pxor	@s[1], @t[2]
408	pxor	@s[1], @t[1]
409
410	pand	@t[0], @t[2]
411
412	pxor	@t[2], @s[2]
413	pxor	@t[2], @t[1]
414
415	pand	@s[3], @s[2]
416
417	pxor	@s[0], @s[2]
418___
419# output in s3, s2, s1, t1
420
421# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
422
423# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
424	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
425
426### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
427}
428
429# AES linear components
430
431sub ShiftRows {
432my @x=@_[0..7];
433my $mask=pop;
434$code.=<<___;
435	pxor	0x00($key),@x[0]
436	pxor	0x10($key),@x[1]
437	pshufb	$mask,@x[0]
438	pxor	0x20($key),@x[2]
439	pshufb	$mask,@x[1]
440	pxor	0x30($key),@x[3]
441	pshufb	$mask,@x[2]
442	pxor	0x40($key),@x[4]
443	pshufb	$mask,@x[3]
444	pxor	0x50($key),@x[5]
445	pshufb	$mask,@x[4]
446	pxor	0x60($key),@x[6]
447	pshufb	$mask,@x[5]
448	pxor	0x70($key),@x[7]
449	pshufb	$mask,@x[6]
450	lea	0x80($key),$key
451	pshufb	$mask,@x[7]
452___
453}
454
455sub MixColumns {
456# modified to emit output in order suitable for feeding back to aesenc[last]
457my @x=@_[0..7];
458my @t=@_[8..15];
459my $inv=@_[16];	# optional
460$code.=<<___;
461	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
462	pshufd	\$0x93, @x[1], @t[1]
463	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
464	pshufd	\$0x93, @x[2], @t[2]
465	 pxor	@t[1], @x[1]
466	pshufd	\$0x93, @x[3], @t[3]
467	 pxor	@t[2], @x[2]
468	pshufd	\$0x93, @x[4], @t[4]
469	 pxor	@t[3], @x[3]
470	pshufd	\$0x93, @x[5], @t[5]
471	 pxor	@t[4], @x[4]
472	pshufd	\$0x93, @x[6], @t[6]
473	 pxor	@t[5], @x[5]
474	pshufd	\$0x93, @x[7], @t[7]
475	 pxor	@t[6], @x[6]
476	 pxor	@t[7], @x[7]
477
478	pxor	@x[0], @t[1]
479	pxor	@x[7], @t[0]
480	pxor	@x[7], @t[1]
481	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
482	pxor	@x[1], @t[2]
483	 pshufd	\$0x4E, @x[1], @x[1]
484	pxor	@x[4], @t[5]
485	 pxor	@t[0], @x[0]
486	pxor	@x[5], @t[6]
487	 pxor	@t[1], @x[1]
488	pxor	@x[3], @t[4]
489	 pshufd	\$0x4E, @x[4], @t[0]
490	pxor	@x[6], @t[7]
491	 pshufd	\$0x4E, @x[5], @t[1]
492	pxor	@x[2], @t[3]
493	 pshufd	\$0x4E, @x[3], @x[4]
494	pxor	@x[7], @t[3]
495	 pshufd	\$0x4E, @x[7], @x[5]
496	pxor	@x[7], @t[4]
497	 pshufd	\$0x4E, @x[6], @x[3]
498	pxor	@t[4], @t[0]
499	 pshufd	\$0x4E, @x[2], @x[6]
500	pxor	@t[5], @t[1]
501___
502$code.=<<___ if (!$inv);
503	pxor	@t[3], @x[4]
504	pxor	@t[7], @x[5]
505	pxor	@t[6], @x[3]
506	 movdqa	@t[0], @x[2]
507	pxor	@t[2], @x[6]
508	 movdqa	@t[1], @x[7]
509___
510$code.=<<___ if ($inv);
511	pxor	@x[4], @t[3]
512	pxor	@t[7], @x[5]
513	pxor	@x[3], @t[6]
514	 movdqa	@t[0], @x[3]
515	pxor	@t[2], @x[6]
516	 movdqa	@t[6], @x[2]
517	 movdqa	@t[1], @x[7]
518	 movdqa	@x[6], @x[4]
519	 movdqa	@t[3], @x[6]
520___
521}
522
523sub InvMixColumns_orig {
524my @x=@_[0..7];
525my @t=@_[8..15];
526
527$code.=<<___;
528	# multiplication by 0x0e
529	pshufd	\$0x93, @x[7], @t[7]
530	movdqa	@x[2], @t[2]
531	pxor	@x[5], @x[7]		# 7 5
532	pxor	@x[5], @x[2]		# 2 5
533	pshufd	\$0x93, @x[0], @t[0]
534	movdqa	@x[5], @t[5]
535	pxor	@x[0], @x[5]		# 5 0		[1]
536	pxor	@x[1], @x[0]		# 0 1
537	pshufd	\$0x93, @x[1], @t[1]
538	pxor	@x[2], @x[1]		# 1 25
539	pxor	@x[6], @x[0]		# 01 6		[2]
540	pxor	@x[3], @x[1]		# 125 3		[4]
541	pshufd	\$0x93, @x[3], @t[3]
542	pxor	@x[0], @x[2]		# 25 016	[3]
543	pxor	@x[7], @x[3]		# 3 75
544	pxor	@x[6], @x[7]		# 75 6		[0]
545	pshufd	\$0x93, @x[6], @t[6]
546	movdqa	@x[4], @t[4]
547	pxor	@x[4], @x[6]		# 6 4
548	pxor	@x[3], @x[4]		# 4 375		[6]
549	pxor	@x[7], @x[3]		# 375 756=36
550	pxor	@t[5], @x[6]		# 64 5		[7]
551	pxor	@t[2], @x[3]		# 36 2
552	pxor	@t[4], @x[3]		# 362 4		[5]
553	pshufd	\$0x93, @t[5], @t[5]
554___
555					my @y = @x[7,5,0,2,1,3,4,6];
556$code.=<<___;
557	# multiplication by 0x0b
558	pxor	@y[0], @y[1]
559	pxor	@t[0], @y[0]
560	pxor	@t[1], @y[1]
561	pshufd	\$0x93, @t[2], @t[2]
562	pxor	@t[5], @y[0]
563	pxor	@t[6], @y[1]
564	pxor	@t[7], @y[0]
565	pshufd	\$0x93, @t[4], @t[4]
566	pxor	@t[6], @t[7]		# clobber t[7]
567	pxor	@y[0], @y[1]
568
569	pxor	@t[0], @y[3]
570	pshufd	\$0x93, @t[0], @t[0]
571	pxor	@t[1], @y[2]
572	pxor	@t[1], @y[4]
573	pxor	@t[2], @y[2]
574	pshufd	\$0x93, @t[1], @t[1]
575	pxor	@t[2], @y[3]
576	pxor	@t[2], @y[5]
577	pxor	@t[7], @y[2]
578	pshufd	\$0x93, @t[2], @t[2]
579	pxor	@t[3], @y[3]
580	pxor	@t[3], @y[6]
581	pxor	@t[3], @y[4]
582	pshufd	\$0x93, @t[3], @t[3]
583	pxor	@t[4], @y[7]
584	pxor	@t[4], @y[5]
585	pxor	@t[7], @y[7]
586	pxor	@t[5], @y[3]
587	pxor	@t[4], @y[4]
588	pxor	@t[5], @t[7]		# clobber t[7] even more
589
590	pxor	@t[7], @y[5]
591	pshufd	\$0x93, @t[4], @t[4]
592	pxor	@t[7], @y[6]
593	pxor	@t[7], @y[4]
594
595	pxor	@t[5], @t[7]
596	pshufd	\$0x93, @t[5], @t[5]
597	pxor	@t[6], @t[7]		# restore t[7]
598
599	# multiplication by 0x0d
600	pxor	@y[7], @y[4]
601	pxor	@t[4], @y[7]
602	pshufd	\$0x93, @t[6], @t[6]
603	pxor	@t[0], @y[2]
604	pxor	@t[5], @y[7]
605	pxor	@t[2], @y[2]
606	pshufd	\$0x93, @t[7], @t[7]
607
608	pxor	@y[1], @y[3]
609	pxor	@t[1], @y[1]
610	pxor	@t[0], @y[0]
611	pxor	@t[0], @y[3]
612	pxor	@t[5], @y[1]
613	pxor	@t[5], @y[0]
614	pxor	@t[7], @y[1]
615	pshufd	\$0x93, @t[0], @t[0]
616	pxor	@t[6], @y[0]
617	pxor	@y[1], @y[3]
618	pxor	@t[1], @y[4]
619	pshufd	\$0x93, @t[1], @t[1]
620
621	pxor	@t[7], @y[7]
622	pxor	@t[2], @y[4]
623	pxor	@t[2], @y[5]
624	pshufd	\$0x93, @t[2], @t[2]
625	pxor	@t[6], @y[2]
626	pxor	@t[3], @t[6]		# clobber t[6]
627	pxor	@y[7], @y[4]
628	pxor	@t[6], @y[3]
629
630	pxor	@t[6], @y[6]
631	pxor	@t[5], @y[5]
632	pxor	@t[4], @y[6]
633	pshufd	\$0x93, @t[4], @t[4]
634	pxor	@t[6], @y[5]
635	pxor	@t[7], @y[6]
636	pxor	@t[3], @t[6]		# restore t[6]
637
638	pshufd	\$0x93, @t[5], @t[5]
639	pshufd	\$0x93, @t[6], @t[6]
640	pshufd	\$0x93, @t[7], @t[7]
641	pshufd	\$0x93, @t[3], @t[3]
642
643	# multiplication by 0x09
644	pxor	@y[1], @y[4]
645	pxor	@y[1], @t[1]		# t[1]=y[1]
646	pxor	@t[5], @t[0]		# clobber t[0]
647	pxor	@t[5], @t[1]
648	pxor	@t[0], @y[3]
649	pxor	@y[0], @t[0]		# t[0]=y[0]
650	pxor	@t[6], @t[1]
651	pxor	@t[7], @t[6]		# clobber t[6]
652	pxor	@t[1], @y[4]
653	pxor	@t[4], @y[7]
654	pxor	@y[4], @t[4]		# t[4]=y[4]
655	pxor	@t[3], @y[6]
656	pxor	@y[3], @t[3]		# t[3]=y[3]
657	pxor	@t[2], @y[5]
658	pxor	@y[2], @t[2]		# t[2]=y[2]
659	pxor	@t[7], @t[3]
660	pxor	@y[5], @t[5]		# t[5]=y[5]
661	pxor	@t[6], @t[2]
662	pxor	@t[6], @t[5]
663	pxor	@y[6], @t[6]		# t[6]=y[6]
664	pxor	@y[7], @t[7]		# t[7]=y[7]
665
666	movdqa	@t[0],@XMM[0]
667	movdqa	@t[1],@XMM[1]
668	movdqa	@t[2],@XMM[2]
669	movdqa	@t[3],@XMM[3]
670	movdqa	@t[4],@XMM[4]
671	movdqa	@t[5],@XMM[5]
672	movdqa	@t[6],@XMM[6]
673	movdqa	@t[7],@XMM[7]
674___
675}
676
677sub InvMixColumns {
678my @x=@_[0..7];
679my @t=@_[8..15];
680
681# Thanks to Jussi Kivilinna for providing pointer to
682#
683# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
684# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
685# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
686# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
687
688$code.=<<___;
689	# multiplication by 0x05-0x00-0x04-0x00
690	pshufd	\$0x4E, @x[0], @t[0]
691	pshufd	\$0x4E, @x[6], @t[6]
692	pxor	@x[0], @t[0]
693	pshufd	\$0x4E, @x[7], @t[7]
694	pxor	@x[6], @t[6]
695	pshufd	\$0x4E, @x[1], @t[1]
696	pxor	@x[7], @t[7]
697	pshufd	\$0x4E, @x[2], @t[2]
698	pxor	@x[1], @t[1]
699	pshufd	\$0x4E, @x[3], @t[3]
700	pxor	@x[2], @t[2]
701	 pxor	@t[6], @x[0]
702	 pxor	@t[6], @x[1]
703	pshufd	\$0x4E, @x[4], @t[4]
704	pxor	@x[3], @t[3]
705	 pxor	@t[0], @x[2]
706	 pxor	@t[1], @x[3]
707	pshufd	\$0x4E, @x[5], @t[5]
708	pxor	@x[4], @t[4]
709	 pxor	@t[7], @x[1]
710	 pxor	@t[2], @x[4]
711	pxor	@x[5], @t[5]
712
713	 pxor	@t[7], @x[2]
714	 pxor	@t[6], @x[3]
715	 pxor	@t[6], @x[4]
716	 pxor	@t[3], @x[5]
717	 pxor	@t[4], @x[6]
718	 pxor	@t[7], @x[4]
719	 pxor	@t[7], @x[5]
720	 pxor	@t[5], @x[7]
721___
722	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
723}
724
725sub aesenc {				# not used
726my @b=@_[0..7];
727my @t=@_[8..15];
728$code.=<<___;
729	movdqa	0x30($const),@t[0]	# .LSR
730___
731	&ShiftRows	(@b,@t[0]);
732	&Sbox		(@b,@t);
733	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
734}
735
736sub aesenclast {			# not used
737my @b=@_[0..7];
738my @t=@_[8..15];
739$code.=<<___;
740	movdqa	0x40($const),@t[0]	# .LSRM0
741___
742	&ShiftRows	(@b,@t[0]);
743	&Sbox		(@b,@t);
744$code.=<<___
745	pxor	0x00($key),@b[0]
746	pxor	0x10($key),@b[1]
747	pxor	0x20($key),@b[4]
748	pxor	0x30($key),@b[6]
749	pxor	0x40($key),@b[3]
750	pxor	0x50($key),@b[7]
751	pxor	0x60($key),@b[2]
752	pxor	0x70($key),@b[5]
753___
754}
755
756sub swapmove {
757my ($a,$b,$n,$mask,$t)=@_;
758$code.=<<___;
759	movdqa	$b,$t
760	psrlq	\$$n,$b
761	pxor  	$a,$b
762	pand	$mask,$b
763	pxor	$b,$a
764	psllq	\$$n,$b
765	pxor	$t,$b
766___
767}
768sub swapmove2x {
769my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
770$code.=<<___;
771	movdqa	$b0,$t0
772	psrlq	\$$n,$b0
773	 movdqa	$b1,$t1
774	 psrlq	\$$n,$b1
775	pxor  	$a0,$b0
776	 pxor  	$a1,$b1
777	pand	$mask,$b0
778	 pand	$mask,$b1
779	pxor	$b0,$a0
780	psllq	\$$n,$b0
781	 pxor	$b1,$a1
782	 psllq	\$$n,$b1
783	pxor	$t0,$b0
784	 pxor	$t1,$b1
785___
786}
787
788sub bitslice {
789my @x=reverse(@_[0..7]);
790my ($t0,$t1,$t2,$t3)=@_[8..11];
791$code.=<<___;
792	movdqa	0x00($const),$t0	# .LBS0
793	movdqa	0x10($const),$t1	# .LBS1
794___
795	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
796	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
797$code.=<<___;
798	movdqa	0x20($const),$t0	# .LBS2
799___
800	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
801	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
802
803	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
804	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
805}
806
807$code.=<<___;
808.text
809
810.extern	asm_AES_encrypt
811.extern	asm_AES_decrypt
812
813.type	_bsaes_encrypt8,\@abi-omnipotent
814.align	64
815_bsaes_encrypt8:
816	lea	.LBS0(%rip), $const	# constants table
817
818	movdqa	($key), @XMM[9]		# round 0 key
819	lea	0x10($key), $key
820	movdqa	0x50($const), @XMM[8]	# .LM0SR
821	pxor	@XMM[9], @XMM[0]	# xor with round0 key
822	pxor	@XMM[9], @XMM[1]
823	 pshufb	@XMM[8], @XMM[0]
824	pxor	@XMM[9], @XMM[2]
825	 pshufb	@XMM[8], @XMM[1]
826	pxor	@XMM[9], @XMM[3]
827	 pshufb	@XMM[8], @XMM[2]
828	pxor	@XMM[9], @XMM[4]
829	 pshufb	@XMM[8], @XMM[3]
830	pxor	@XMM[9], @XMM[5]
831	 pshufb	@XMM[8], @XMM[4]
832	pxor	@XMM[9], @XMM[6]
833	 pshufb	@XMM[8], @XMM[5]
834	pxor	@XMM[9], @XMM[7]
835	 pshufb	@XMM[8], @XMM[6]
836	 pshufb	@XMM[8], @XMM[7]
837_bsaes_encrypt8_bitslice:
838___
839	&bitslice	(@XMM[0..7, 8..11]);
840$code.=<<___;
841	dec	$rounds
842	jmp	.Lenc_sbox
843.align	16
844.Lenc_loop:
845___
846	&ShiftRows	(@XMM[0..7, 8]);
847$code.=".Lenc_sbox:\n";
848	&Sbox		(@XMM[0..7, 8..15]);
849$code.=<<___;
850	dec	$rounds
851	jl	.Lenc_done
852___
853	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
854$code.=<<___;
855	movdqa	0x30($const), @XMM[8]	# .LSR
856	jnz	.Lenc_loop
857	movdqa	0x40($const), @XMM[8]	# .LSRM0
858	jmp	.Lenc_loop
859.align	16
860.Lenc_done:
861___
862	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
863	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
864$code.=<<___;
865	movdqa	($key), @XMM[8]		# last round key
866	pxor	@XMM[8], @XMM[4]
867	pxor	@XMM[8], @XMM[6]
868	pxor	@XMM[8], @XMM[3]
869	pxor	@XMM[8], @XMM[7]
870	pxor	@XMM[8], @XMM[2]
871	pxor	@XMM[8], @XMM[5]
872	pxor	@XMM[8], @XMM[0]
873	pxor	@XMM[8], @XMM[1]
874	ret
875.size	_bsaes_encrypt8,.-_bsaes_encrypt8
876
877.type	_bsaes_decrypt8,\@abi-omnipotent
878.align	64
879_bsaes_decrypt8:
880	lea	.LBS0(%rip), $const	# constants table
881
882	movdqa	($key), @XMM[9]		# round 0 key
883	lea	0x10($key), $key
884	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
885	pxor	@XMM[9], @XMM[0]	# xor with round0 key
886	pxor	@XMM[9], @XMM[1]
887	 pshufb	@XMM[8], @XMM[0]
888	pxor	@XMM[9], @XMM[2]
889	 pshufb	@XMM[8], @XMM[1]
890	pxor	@XMM[9], @XMM[3]
891	 pshufb	@XMM[8], @XMM[2]
892	pxor	@XMM[9], @XMM[4]
893	 pshufb	@XMM[8], @XMM[3]
894	pxor	@XMM[9], @XMM[5]
895	 pshufb	@XMM[8], @XMM[4]
896	pxor	@XMM[9], @XMM[6]
897	 pshufb	@XMM[8], @XMM[5]
898	pxor	@XMM[9], @XMM[7]
899	 pshufb	@XMM[8], @XMM[6]
900	 pshufb	@XMM[8], @XMM[7]
901___
902	&bitslice	(@XMM[0..7, 8..11]);
903$code.=<<___;
904	dec	$rounds
905	jmp	.Ldec_sbox
906.align	16
907.Ldec_loop:
908___
909	&ShiftRows	(@XMM[0..7, 8]);
910$code.=".Ldec_sbox:\n";
911	&InvSbox	(@XMM[0..7, 8..15]);
912$code.=<<___;
913	dec	$rounds
914	jl	.Ldec_done
915___
916	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
917$code.=<<___;
918	movdqa	-0x10($const), @XMM[8]	# .LISR
919	jnz	.Ldec_loop
920	movdqa	-0x20($const), @XMM[8]	# .LISRM0
921	jmp	.Ldec_loop
922.align	16
923.Ldec_done:
924___
925	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
926$code.=<<___;
927	movdqa	($key), @XMM[8]		# last round key
928	pxor	@XMM[8], @XMM[6]
929	pxor	@XMM[8], @XMM[4]
930	pxor	@XMM[8], @XMM[2]
931	pxor	@XMM[8], @XMM[7]
932	pxor	@XMM[8], @XMM[3]
933	pxor	@XMM[8], @XMM[5]
934	pxor	@XMM[8], @XMM[0]
935	pxor	@XMM[8], @XMM[1]
936	ret
937.size	_bsaes_decrypt8,.-_bsaes_decrypt8
938___
939}
940{
941my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
942
943sub bitslice_key {
944my @x=reverse(@_[0..7]);
945my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
946
947	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
948$code.=<<___;
949	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
950	movdqa	@x[0], @x[2]
951	movdqa	@x[1], @x[3]
952___
953	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
954
955	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
956$code.=<<___;
957	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
958	movdqa	@x[0], @x[4]
959	movdqa	@x[2], @x[6]
960	movdqa	@x[1], @x[5]
961	movdqa	@x[3], @x[7]
962___
963	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
964	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
965}
966
967$code.=<<___;
968.type	_bsaes_key_convert,\@abi-omnipotent
969.align	16
970_bsaes_key_convert:
971	lea	.Lmasks(%rip), $const
972	movdqu	($inp), %xmm7		# load round 0 key
973	lea	0x10($inp), $inp
974	movdqa	0x00($const), %xmm0	# 0x01...
975	movdqa	0x10($const), %xmm1	# 0x02...
976	movdqa	0x20($const), %xmm2	# 0x04...
977	movdqa	0x30($const), %xmm3	# 0x08...
978	movdqa	0x40($const), %xmm4	# .LM0
979	pcmpeqd	%xmm5, %xmm5		# .LNOT
980
981	movdqu	($inp), %xmm6		# load round 1 key
982	movdqa	%xmm7, ($out)		# save round 0 key
983	lea	0x10($out), $out
984	dec	$rounds
985	jmp	.Lkey_loop
986.align	16
987.Lkey_loop:
988	pshufb	%xmm4, %xmm6		# .LM0
989
990	movdqa	%xmm0,	%xmm8
991	movdqa	%xmm1,	%xmm9
992
993	pand	%xmm6,	%xmm8
994	pand	%xmm6,	%xmm9
995	movdqa	%xmm2,	%xmm10
996	pcmpeqb	%xmm0,	%xmm8
997	psllq	\$4,	%xmm0		# 0x10...
998	movdqa	%xmm3,	%xmm11
999	pcmpeqb	%xmm1,	%xmm9
1000	psllq	\$4,	%xmm1		# 0x20...
1001
1002	pand	%xmm6,	%xmm10
1003	pand	%xmm6,	%xmm11
1004	movdqa	%xmm0,	%xmm12
1005	pcmpeqb	%xmm2,	%xmm10
1006	psllq	\$4,	%xmm2		# 0x40...
1007	movdqa	%xmm1,	%xmm13
1008	pcmpeqb	%xmm3,	%xmm11
1009	psllq	\$4,	%xmm3		# 0x80...
1010
1011	movdqa	%xmm2,	%xmm14
1012	movdqa	%xmm3,	%xmm15
1013	 pxor	%xmm5,	%xmm8		# "pnot"
1014	 pxor	%xmm5,	%xmm9
1015
1016	pand	%xmm6,	%xmm12
1017	pand	%xmm6,	%xmm13
1018	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
1019	pcmpeqb	%xmm0,	%xmm12
1020	psrlq	\$4,	%xmm0		# 0x01...
1021	 movdqa	%xmm9, 0x10($out)
1022	pcmpeqb	%xmm1,	%xmm13
1023	psrlq	\$4,	%xmm1		# 0x02...
1024	 lea	0x10($inp), $inp
1025
1026	pand	%xmm6,	%xmm14
1027	pand	%xmm6,	%xmm15
1028	 movdqa	%xmm10, 0x20($out)
1029	pcmpeqb	%xmm2,	%xmm14
1030	psrlq	\$4,	%xmm2		# 0x04...
1031	 movdqa	%xmm11, 0x30($out)
1032	pcmpeqb	%xmm3,	%xmm15
1033	psrlq	\$4,	%xmm3		# 0x08...
1034	 movdqu	($inp), %xmm6		# load next round key
1035
1036	pxor	%xmm5, %xmm13		# "pnot"
1037	pxor	%xmm5, %xmm14
1038	movdqa	%xmm12, 0x40($out)
1039	movdqa	%xmm13, 0x50($out)
1040	movdqa	%xmm14, 0x60($out)
1041	movdqa	%xmm15, 0x70($out)
1042	lea	0x80($out),$out
1043	dec	$rounds
1044	jnz	.Lkey_loop
1045
1046	movdqa	0x50($const), %xmm7	# .L63
1047	#movdqa	%xmm6, ($out)		# don't save last round key
1048	ret
1049.size	_bsaes_key_convert,.-_bsaes_key_convert
1050___
1051}
1052
1053if (0 && !$win64) {	# following four functions are unsupported interface
1054			# used for benchmarking...
1055$code.=<<___;
1056.globl	bsaes_enc_key_convert
1057.type	bsaes_enc_key_convert,\@function,2
1058.align	16
1059bsaes_enc_key_convert:
1060	mov	240($inp),%r10d		# pass rounds
1061	mov	$inp,%rcx		# pass key
1062	mov	$out,%rax		# pass key schedule
1063	call	_bsaes_key_convert
1064	pxor	%xmm6,%xmm7		# fix up last round key
1065	movdqa	%xmm7,(%rax)		# save last round key
1066	ret
1067.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
1068
1069.globl	bsaes_encrypt_128
1070.type	bsaes_encrypt_128,\@function,4
1071.align	16
1072bsaes_encrypt_128:
1073.Lenc128_loop:
1074	movdqu	0x00($inp), @XMM[0]	# load input
1075	movdqu	0x10($inp), @XMM[1]
1076	movdqu	0x20($inp), @XMM[2]
1077	movdqu	0x30($inp), @XMM[3]
1078	movdqu	0x40($inp), @XMM[4]
1079	movdqu	0x50($inp), @XMM[5]
1080	movdqu	0x60($inp), @XMM[6]
1081	movdqu	0x70($inp), @XMM[7]
1082	mov	$key, %rax		# pass the $key
1083	lea	0x80($inp), $inp
1084	mov	\$10,%r10d
1085
1086	call	_bsaes_encrypt8
1087
1088	movdqu	@XMM[0], 0x00($out)	# write output
1089	movdqu	@XMM[1], 0x10($out)
1090	movdqu	@XMM[4], 0x20($out)
1091	movdqu	@XMM[6], 0x30($out)
1092	movdqu	@XMM[3], 0x40($out)
1093	movdqu	@XMM[7], 0x50($out)
1094	movdqu	@XMM[2], 0x60($out)
1095	movdqu	@XMM[5], 0x70($out)
1096	lea	0x80($out), $out
1097	sub	\$0x80,$len
1098	ja	.Lenc128_loop
1099	ret
1100.size	bsaes_encrypt_128,.-bsaes_encrypt_128
1101
1102.globl	bsaes_dec_key_convert
1103.type	bsaes_dec_key_convert,\@function,2
1104.align	16
1105bsaes_dec_key_convert:
1106	mov	240($inp),%r10d		# pass rounds
1107	mov	$inp,%rcx		# pass key
1108	mov	$out,%rax		# pass key schedule
1109	call	_bsaes_key_convert
1110	pxor	($out),%xmm7		# fix up round 0 key
1111	movdqa	%xmm6,(%rax)		# save last round key
1112	movdqa	%xmm7,($out)
1113	ret
1114.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
1115
1116.globl	bsaes_decrypt_128
1117.type	bsaes_decrypt_128,\@function,4
1118.align	16
1119bsaes_decrypt_128:
1120.Ldec128_loop:
1121	movdqu	0x00($inp), @XMM[0]	# load input
1122	movdqu	0x10($inp), @XMM[1]
1123	movdqu	0x20($inp), @XMM[2]
1124	movdqu	0x30($inp), @XMM[3]
1125	movdqu	0x40($inp), @XMM[4]
1126	movdqu	0x50($inp), @XMM[5]
1127	movdqu	0x60($inp), @XMM[6]
1128	movdqu	0x70($inp), @XMM[7]
1129	mov	$key, %rax		# pass the $key
1130	lea	0x80($inp), $inp
1131	mov	\$10,%r10d
1132
1133	call	_bsaes_decrypt8
1134
1135	movdqu	@XMM[0], 0x00($out)	# write output
1136	movdqu	@XMM[1], 0x10($out)
1137	movdqu	@XMM[6], 0x20($out)
1138	movdqu	@XMM[4], 0x30($out)
1139	movdqu	@XMM[2], 0x40($out)
1140	movdqu	@XMM[7], 0x50($out)
1141	movdqu	@XMM[3], 0x60($out)
1142	movdqu	@XMM[5], 0x70($out)
1143	lea	0x80($out), $out
1144	sub	\$0x80,$len
1145	ja	.Ldec128_loop
1146	ret
1147.size	bsaes_decrypt_128,.-bsaes_decrypt_128
1148___
1149}
1150{
1151######################################################################
1152#
1153# OpenSSL interface
1154#
1155my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1156						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1157my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1158
1159if ($ecb) {
1160$code.=<<___;
1161.globl	bsaes_ecb_encrypt_blocks
1162.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1163.align	16
1164bsaes_ecb_encrypt_blocks:
1165	mov	%rsp, %rax
1166.Lecb_enc_prologue:
1167	push	%rbp
1168	push	%rbx
1169	push	%r12
1170	push	%r13
1171	push	%r14
1172	push	%r15
1173	lea	-0x48(%rsp),%rsp
1174___
1175$code.=<<___ if ($win64);
1176	lea	-0xa0(%rsp), %rsp
1177	movaps	%xmm6, 0x40(%rsp)
1178	movaps	%xmm7, 0x50(%rsp)
1179	movaps	%xmm8, 0x60(%rsp)
1180	movaps	%xmm9, 0x70(%rsp)
1181	movaps	%xmm10, 0x80(%rsp)
1182	movaps	%xmm11, 0x90(%rsp)
1183	movaps	%xmm12, 0xa0(%rsp)
1184	movaps	%xmm13, 0xb0(%rsp)
1185	movaps	%xmm14, 0xc0(%rsp)
1186	movaps	%xmm15, 0xd0(%rsp)
1187.Lecb_enc_body:
1188___
1189$code.=<<___;
1190	mov	%rsp,%rbp		# backup %rsp
1191	mov	240($arg4),%eax		# rounds
1192	mov	$arg1,$inp		# backup arguments
1193	mov	$arg2,$out
1194	mov	$arg3,$len
1195	mov	$arg4,$key
1196	cmp	\$8,$arg3
1197	jb	.Lecb_enc_short
1198
1199	mov	%eax,%ebx		# backup rounds
1200	shl	\$7,%rax		# 128 bytes per inner round key
1201	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1202	sub	%rax,%rsp
1203	mov	%rsp,%rax		# pass key schedule
1204	mov	$key,%rcx		# pass key
1205	mov	%ebx,%r10d		# pass rounds
1206	call	_bsaes_key_convert
1207	pxor	%xmm6,%xmm7		# fix up last round key
1208	movdqa	%xmm7,(%rax)		# save last round key
1209
1210	sub	\$8,$len
1211.Lecb_enc_loop:
1212	movdqu	0x00($inp), @XMM[0]	# load input
1213	movdqu	0x10($inp), @XMM[1]
1214	movdqu	0x20($inp), @XMM[2]
1215	movdqu	0x30($inp), @XMM[3]
1216	movdqu	0x40($inp), @XMM[4]
1217	movdqu	0x50($inp), @XMM[5]
1218	mov	%rsp, %rax		# pass key schedule
1219	movdqu	0x60($inp), @XMM[6]
1220	mov	%ebx,%r10d		# pass rounds
1221	movdqu	0x70($inp), @XMM[7]
1222	lea	0x80($inp), $inp
1223
1224	call	_bsaes_encrypt8
1225
1226	movdqu	@XMM[0], 0x00($out)	# write output
1227	movdqu	@XMM[1], 0x10($out)
1228	movdqu	@XMM[4], 0x20($out)
1229	movdqu	@XMM[6], 0x30($out)
1230	movdqu	@XMM[3], 0x40($out)
1231	movdqu	@XMM[7], 0x50($out)
1232	movdqu	@XMM[2], 0x60($out)
1233	movdqu	@XMM[5], 0x70($out)
1234	lea	0x80($out), $out
1235	sub	\$8,$len
1236	jnc	.Lecb_enc_loop
1237
1238	add	\$8,$len
1239	jz	.Lecb_enc_done
1240
1241	movdqu	0x00($inp), @XMM[0]	# load input
1242	mov	%rsp, %rax		# pass key schedule
1243	mov	%ebx,%r10d		# pass rounds
1244	cmp	\$2,$len
1245	jb	.Lecb_enc_one
1246	movdqu	0x10($inp), @XMM[1]
1247	je	.Lecb_enc_two
1248	movdqu	0x20($inp), @XMM[2]
1249	cmp	\$4,$len
1250	jb	.Lecb_enc_three
1251	movdqu	0x30($inp), @XMM[3]
1252	je	.Lecb_enc_four
1253	movdqu	0x40($inp), @XMM[4]
1254	cmp	\$6,$len
1255	jb	.Lecb_enc_five
1256	movdqu	0x50($inp), @XMM[5]
1257	je	.Lecb_enc_six
1258	movdqu	0x60($inp), @XMM[6]
1259	call	_bsaes_encrypt8
1260	movdqu	@XMM[0], 0x00($out)	# write output
1261	movdqu	@XMM[1], 0x10($out)
1262	movdqu	@XMM[4], 0x20($out)
1263	movdqu	@XMM[6], 0x30($out)
1264	movdqu	@XMM[3], 0x40($out)
1265	movdqu	@XMM[7], 0x50($out)
1266	movdqu	@XMM[2], 0x60($out)
1267	jmp	.Lecb_enc_done
1268.align	16
1269.Lecb_enc_six:
1270	call	_bsaes_encrypt8
1271	movdqu	@XMM[0], 0x00($out)	# write output
1272	movdqu	@XMM[1], 0x10($out)
1273	movdqu	@XMM[4], 0x20($out)
1274	movdqu	@XMM[6], 0x30($out)
1275	movdqu	@XMM[3], 0x40($out)
1276	movdqu	@XMM[7], 0x50($out)
1277	jmp	.Lecb_enc_done
1278.align	16
1279.Lecb_enc_five:
1280	call	_bsaes_encrypt8
1281	movdqu	@XMM[0], 0x00($out)	# write output
1282	movdqu	@XMM[1], 0x10($out)
1283	movdqu	@XMM[4], 0x20($out)
1284	movdqu	@XMM[6], 0x30($out)
1285	movdqu	@XMM[3], 0x40($out)
1286	jmp	.Lecb_enc_done
1287.align	16
1288.Lecb_enc_four:
1289	call	_bsaes_encrypt8
1290	movdqu	@XMM[0], 0x00($out)	# write output
1291	movdqu	@XMM[1], 0x10($out)
1292	movdqu	@XMM[4], 0x20($out)
1293	movdqu	@XMM[6], 0x30($out)
1294	jmp	.Lecb_enc_done
1295.align	16
1296.Lecb_enc_three:
1297	call	_bsaes_encrypt8
1298	movdqu	@XMM[0], 0x00($out)	# write output
1299	movdqu	@XMM[1], 0x10($out)
1300	movdqu	@XMM[4], 0x20($out)
1301	jmp	.Lecb_enc_done
1302.align	16
1303.Lecb_enc_two:
1304	call	_bsaes_encrypt8
1305	movdqu	@XMM[0], 0x00($out)	# write output
1306	movdqu	@XMM[1], 0x10($out)
1307	jmp	.Lecb_enc_done
1308.align	16
1309.Lecb_enc_one:
1310	call	_bsaes_encrypt8
1311	movdqu	@XMM[0], 0x00($out)	# write output
1312	jmp	.Lecb_enc_done
1313.align	16
1314.Lecb_enc_short:
1315	lea	($inp), $arg1
1316	lea	($out), $arg2
1317	lea	($key), $arg3
1318	call	asm_AES_encrypt
1319	lea	16($inp), $inp
1320	lea	16($out), $out
1321	dec	$len
1322	jnz	.Lecb_enc_short
1323
1324.Lecb_enc_done:
1325	lea	(%rsp),%rax
1326	pxor	%xmm0, %xmm0
1327.Lecb_enc_bzero:			# wipe key schedule [if any]
1328	movdqa	%xmm0, 0x00(%rax)
1329	movdqa	%xmm0, 0x10(%rax)
1330	lea	0x20(%rax), %rax
1331	cmp	%rax, %rbp
1332	jb	.Lecb_enc_bzero
1333
1334	lea	(%rbp),%rsp		# restore %rsp
1335___
1336$code.=<<___ if ($win64);
1337	movaps	0x40(%rbp), %xmm6
1338	movaps	0x50(%rbp), %xmm7
1339	movaps	0x60(%rbp), %xmm8
1340	movaps	0x70(%rbp), %xmm9
1341	movaps	0x80(%rbp), %xmm10
1342	movaps	0x90(%rbp), %xmm11
1343	movaps	0xa0(%rbp), %xmm12
1344	movaps	0xb0(%rbp), %xmm13
1345	movaps	0xc0(%rbp), %xmm14
1346	movaps	0xd0(%rbp), %xmm15
1347	lea	0xa0(%rbp), %rsp
1348___
1349$code.=<<___;
1350	mov	0x48(%rsp), %r15
1351	mov	0x50(%rsp), %r14
1352	mov	0x58(%rsp), %r13
1353	mov	0x60(%rsp), %r12
1354	mov	0x68(%rsp), %rbx
1355	mov	0x70(%rsp), %rax
1356	lea	0x78(%rsp), %rsp
1357	mov	%rax, %rbp
1358.Lecb_enc_epilogue:
1359	ret
1360.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1361
1362.globl	bsaes_ecb_decrypt_blocks
1363.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1364.align	16
1365bsaes_ecb_decrypt_blocks:
1366	mov	%rsp, %rax
1367.Lecb_dec_prologue:
1368	push	%rbp
1369	push	%rbx
1370	push	%r12
1371	push	%r13
1372	push	%r14
1373	push	%r15
1374	lea	-0x48(%rsp),%rsp
1375___
1376$code.=<<___ if ($win64);
1377	lea	-0xa0(%rsp), %rsp
1378	movaps	%xmm6, 0x40(%rsp)
1379	movaps	%xmm7, 0x50(%rsp)
1380	movaps	%xmm8, 0x60(%rsp)
1381	movaps	%xmm9, 0x70(%rsp)
1382	movaps	%xmm10, 0x80(%rsp)
1383	movaps	%xmm11, 0x90(%rsp)
1384	movaps	%xmm12, 0xa0(%rsp)
1385	movaps	%xmm13, 0xb0(%rsp)
1386	movaps	%xmm14, 0xc0(%rsp)
1387	movaps	%xmm15, 0xd0(%rsp)
1388.Lecb_dec_body:
1389___
1390$code.=<<___;
1391	mov	%rsp,%rbp		# backup %rsp
1392	mov	240($arg4),%eax		# rounds
1393	mov	$arg1,$inp		# backup arguments
1394	mov	$arg2,$out
1395	mov	$arg3,$len
1396	mov	$arg4,$key
1397	cmp	\$8,$arg3
1398	jb	.Lecb_dec_short
1399
1400	mov	%eax,%ebx		# backup rounds
1401	shl	\$7,%rax		# 128 bytes per inner round key
1402	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1403	sub	%rax,%rsp
1404	mov	%rsp,%rax		# pass key schedule
1405	mov	$key,%rcx		# pass key
1406	mov	%ebx,%r10d		# pass rounds
1407	call	_bsaes_key_convert
1408	pxor	(%rsp),%xmm7		# fix up 0 round key
1409	movdqa	%xmm6,(%rax)		# save last round key
1410	movdqa	%xmm7,(%rsp)
1411
1412	sub	\$8,$len
1413.Lecb_dec_loop:
1414	movdqu	0x00($inp), @XMM[0]	# load input
1415	movdqu	0x10($inp), @XMM[1]
1416	movdqu	0x20($inp), @XMM[2]
1417	movdqu	0x30($inp), @XMM[3]
1418	movdqu	0x40($inp), @XMM[4]
1419	movdqu	0x50($inp), @XMM[5]
1420	mov	%rsp, %rax		# pass key schedule
1421	movdqu	0x60($inp), @XMM[6]
1422	mov	%ebx,%r10d		# pass rounds
1423	movdqu	0x70($inp), @XMM[7]
1424	lea	0x80($inp), $inp
1425
1426	call	_bsaes_decrypt8
1427
1428	movdqu	@XMM[0], 0x00($out)	# write output
1429	movdqu	@XMM[1], 0x10($out)
1430	movdqu	@XMM[6], 0x20($out)
1431	movdqu	@XMM[4], 0x30($out)
1432	movdqu	@XMM[2], 0x40($out)
1433	movdqu	@XMM[7], 0x50($out)
1434	movdqu	@XMM[3], 0x60($out)
1435	movdqu	@XMM[5], 0x70($out)
1436	lea	0x80($out), $out
1437	sub	\$8,$len
1438	jnc	.Lecb_dec_loop
1439
1440	add	\$8,$len
1441	jz	.Lecb_dec_done
1442
1443	movdqu	0x00($inp), @XMM[0]	# load input
1444	mov	%rsp, %rax		# pass key schedule
1445	mov	%ebx,%r10d		# pass rounds
1446	cmp	\$2,$len
1447	jb	.Lecb_dec_one
1448	movdqu	0x10($inp), @XMM[1]
1449	je	.Lecb_dec_two
1450	movdqu	0x20($inp), @XMM[2]
1451	cmp	\$4,$len
1452	jb	.Lecb_dec_three
1453	movdqu	0x30($inp), @XMM[3]
1454	je	.Lecb_dec_four
1455	movdqu	0x40($inp), @XMM[4]
1456	cmp	\$6,$len
1457	jb	.Lecb_dec_five
1458	movdqu	0x50($inp), @XMM[5]
1459	je	.Lecb_dec_six
1460	movdqu	0x60($inp), @XMM[6]
1461	call	_bsaes_decrypt8
1462	movdqu	@XMM[0], 0x00($out)	# write output
1463	movdqu	@XMM[1], 0x10($out)
1464	movdqu	@XMM[6], 0x20($out)
1465	movdqu	@XMM[4], 0x30($out)
1466	movdqu	@XMM[2], 0x40($out)
1467	movdqu	@XMM[7], 0x50($out)
1468	movdqu	@XMM[3], 0x60($out)
1469	jmp	.Lecb_dec_done
1470.align	16
1471.Lecb_dec_six:
1472	call	_bsaes_decrypt8
1473	movdqu	@XMM[0], 0x00($out)	# write output
1474	movdqu	@XMM[1], 0x10($out)
1475	movdqu	@XMM[6], 0x20($out)
1476	movdqu	@XMM[4], 0x30($out)
1477	movdqu	@XMM[2], 0x40($out)
1478	movdqu	@XMM[7], 0x50($out)
1479	jmp	.Lecb_dec_done
1480.align	16
1481.Lecb_dec_five:
1482	call	_bsaes_decrypt8
1483	movdqu	@XMM[0], 0x00($out)	# write output
1484	movdqu	@XMM[1], 0x10($out)
1485	movdqu	@XMM[6], 0x20($out)
1486	movdqu	@XMM[4], 0x30($out)
1487	movdqu	@XMM[2], 0x40($out)
1488	jmp	.Lecb_dec_done
1489.align	16
1490.Lecb_dec_four:
1491	call	_bsaes_decrypt8
1492	movdqu	@XMM[0], 0x00($out)	# write output
1493	movdqu	@XMM[1], 0x10($out)
1494	movdqu	@XMM[6], 0x20($out)
1495	movdqu	@XMM[4], 0x30($out)
1496	jmp	.Lecb_dec_done
1497.align	16
1498.Lecb_dec_three:
1499	call	_bsaes_decrypt8
1500	movdqu	@XMM[0], 0x00($out)	# write output
1501	movdqu	@XMM[1], 0x10($out)
1502	movdqu	@XMM[6], 0x20($out)
1503	jmp	.Lecb_dec_done
1504.align	16
1505.Lecb_dec_two:
1506	call	_bsaes_decrypt8
1507	movdqu	@XMM[0], 0x00($out)	# write output
1508	movdqu	@XMM[1], 0x10($out)
1509	jmp	.Lecb_dec_done
1510.align	16
1511.Lecb_dec_one:
1512	call	_bsaes_decrypt8
1513	movdqu	@XMM[0], 0x00($out)	# write output
1514	jmp	.Lecb_dec_done
1515.align	16
1516.Lecb_dec_short:
1517	lea	($inp), $arg1
1518	lea	($out), $arg2
1519	lea	($key), $arg3
1520	call	asm_AES_decrypt
1521	lea	16($inp), $inp
1522	lea	16($out), $out
1523	dec	$len
1524	jnz	.Lecb_dec_short
1525
1526.Lecb_dec_done:
1527	lea	(%rsp),%rax
1528	pxor	%xmm0, %xmm0
1529.Lecb_dec_bzero:			# wipe key schedule [if any]
1530	movdqa	%xmm0, 0x00(%rax)
1531	movdqa	%xmm0, 0x10(%rax)
1532	lea	0x20(%rax), %rax
1533	cmp	%rax, %rbp
1534	jb	.Lecb_dec_bzero
1535
1536	lea	(%rbp),%rsp		# restore %rsp
1537___
1538$code.=<<___ if ($win64);
1539	movaps	0x40(%rbp), %xmm6
1540	movaps	0x50(%rbp), %xmm7
1541	movaps	0x60(%rbp), %xmm8
1542	movaps	0x70(%rbp), %xmm9
1543	movaps	0x80(%rbp), %xmm10
1544	movaps	0x90(%rbp), %xmm11
1545	movaps	0xa0(%rbp), %xmm12
1546	movaps	0xb0(%rbp), %xmm13
1547	movaps	0xc0(%rbp), %xmm14
1548	movaps	0xd0(%rbp), %xmm15
1549	lea	0xa0(%rbp), %rsp
1550___
1551$code.=<<___;
1552	mov	0x48(%rsp), %r15
1553	mov	0x50(%rsp), %r14
1554	mov	0x58(%rsp), %r13
1555	mov	0x60(%rsp), %r12
1556	mov	0x68(%rsp), %rbx
1557	mov	0x70(%rsp), %rax
1558	lea	0x78(%rsp), %rsp
1559	mov	%rax, %rbp
1560.Lecb_dec_epilogue:
1561	ret
1562.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1563___
1564}
1565$code.=<<___;
1566.extern	asm_AES_cbc_encrypt
1567.globl	bsaes_cbc_encrypt
1568.type	bsaes_cbc_encrypt,\@abi-omnipotent
1569.align	16
1570bsaes_cbc_encrypt:
1571___
1572$code.=<<___ if ($win64);
1573	mov	48(%rsp),$arg6		# pull direction flag
1574___
1575$code.=<<___;
1576	cmp	\$0,$arg6
1577	jne	asm_AES_cbc_encrypt
1578	cmp	\$128,$arg3
1579	jb	asm_AES_cbc_encrypt
1580
1581	mov	%rsp, %rax
1582.Lcbc_dec_prologue:
1583	push	%rbp
1584	push	%rbx
1585	push	%r12
1586	push	%r13
1587	push	%r14
1588	push	%r15
1589	lea	-0x48(%rsp), %rsp
1590___
1591$code.=<<___ if ($win64);
1592	mov	0xa0(%rsp),$arg5	# pull ivp
1593	lea	-0xa0(%rsp), %rsp
1594	movaps	%xmm6, 0x40(%rsp)
1595	movaps	%xmm7, 0x50(%rsp)
1596	movaps	%xmm8, 0x60(%rsp)
1597	movaps	%xmm9, 0x70(%rsp)
1598	movaps	%xmm10, 0x80(%rsp)
1599	movaps	%xmm11, 0x90(%rsp)
1600	movaps	%xmm12, 0xa0(%rsp)
1601	movaps	%xmm13, 0xb0(%rsp)
1602	movaps	%xmm14, 0xc0(%rsp)
1603	movaps	%xmm15, 0xd0(%rsp)
1604.Lcbc_dec_body:
1605___
1606$code.=<<___;
1607	mov	%rsp, %rbp		# backup %rsp
1608	mov	240($arg4), %eax	# rounds
1609	mov	$arg1, $inp		# backup arguments
1610	mov	$arg2, $out
1611	mov	$arg3, $len
1612	mov	$arg4, $key
1613	mov	$arg5, %rbx
1614	shr	\$4, $len		# bytes to blocks
1615
1616	mov	%eax, %edx		# rounds
1617	shl	\$7, %rax		# 128 bytes per inner round key
1618	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1619	sub	%rax, %rsp
1620
1621	mov	%rsp, %rax		# pass key schedule
1622	mov	$key, %rcx		# pass key
1623	mov	%edx, %r10d		# pass rounds
1624	call	_bsaes_key_convert
1625	pxor	(%rsp),%xmm7		# fix up 0 round key
1626	movdqa	%xmm6,(%rax)		# save last round key
1627	movdqa	%xmm7,(%rsp)
1628
1629	movdqu	(%rbx), @XMM[15]	# load IV
1630	sub	\$8,$len
1631.Lcbc_dec_loop:
1632	movdqu	0x00($inp), @XMM[0]	# load input
1633	movdqu	0x10($inp), @XMM[1]
1634	movdqu	0x20($inp), @XMM[2]
1635	movdqu	0x30($inp), @XMM[3]
1636	movdqu	0x40($inp), @XMM[4]
1637	movdqu	0x50($inp), @XMM[5]
1638	mov	%rsp, %rax		# pass key schedule
1639	movdqu	0x60($inp), @XMM[6]
1640	mov	%edx,%r10d		# pass rounds
1641	movdqu	0x70($inp), @XMM[7]
1642	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1643
1644	call	_bsaes_decrypt8
1645
1646	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1647	movdqu	0x00($inp), @XMM[8]	# re-load input
1648	movdqu	0x10($inp), @XMM[9]
1649	pxor	@XMM[8], @XMM[1]
1650	movdqu	0x20($inp), @XMM[10]
1651	pxor	@XMM[9], @XMM[6]
1652	movdqu	0x30($inp), @XMM[11]
1653	pxor	@XMM[10], @XMM[4]
1654	movdqu	0x40($inp), @XMM[12]
1655	pxor	@XMM[11], @XMM[2]
1656	movdqu	0x50($inp), @XMM[13]
1657	pxor	@XMM[12], @XMM[7]
1658	movdqu	0x60($inp), @XMM[14]
1659	pxor	@XMM[13], @XMM[3]
1660	movdqu	0x70($inp), @XMM[15]	# IV
1661	pxor	@XMM[14], @XMM[5]
1662	movdqu	@XMM[0], 0x00($out)	# write output
1663	lea	0x80($inp), $inp
1664	movdqu	@XMM[1], 0x10($out)
1665	movdqu	@XMM[6], 0x20($out)
1666	movdqu	@XMM[4], 0x30($out)
1667	movdqu	@XMM[2], 0x40($out)
1668	movdqu	@XMM[7], 0x50($out)
1669	movdqu	@XMM[3], 0x60($out)
1670	movdqu	@XMM[5], 0x70($out)
1671	lea	0x80($out), $out
1672	sub	\$8,$len
1673	jnc	.Lcbc_dec_loop
1674
1675	add	\$8,$len
1676	jz	.Lcbc_dec_done
1677
1678	movdqu	0x00($inp), @XMM[0]	# load input
1679	mov	%rsp, %rax		# pass key schedule
1680	mov	%edx, %r10d		# pass rounds
1681	cmp	\$2,$len
1682	jb	.Lcbc_dec_one
1683	movdqu	0x10($inp), @XMM[1]
1684	je	.Lcbc_dec_two
1685	movdqu	0x20($inp), @XMM[2]
1686	cmp	\$4,$len
1687	jb	.Lcbc_dec_three
1688	movdqu	0x30($inp), @XMM[3]
1689	je	.Lcbc_dec_four
1690	movdqu	0x40($inp), @XMM[4]
1691	cmp	\$6,$len
1692	jb	.Lcbc_dec_five
1693	movdqu	0x50($inp), @XMM[5]
1694	je	.Lcbc_dec_six
1695	movdqu	0x60($inp), @XMM[6]
1696	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1697	call	_bsaes_decrypt8
1698	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1699	movdqu	0x00($inp), @XMM[8]	# re-load input
1700	movdqu	0x10($inp), @XMM[9]
1701	pxor	@XMM[8], @XMM[1]
1702	movdqu	0x20($inp), @XMM[10]
1703	pxor	@XMM[9], @XMM[6]
1704	movdqu	0x30($inp), @XMM[11]
1705	pxor	@XMM[10], @XMM[4]
1706	movdqu	0x40($inp), @XMM[12]
1707	pxor	@XMM[11], @XMM[2]
1708	movdqu	0x50($inp), @XMM[13]
1709	pxor	@XMM[12], @XMM[7]
1710	movdqu	0x60($inp), @XMM[15]	# IV
1711	pxor	@XMM[13], @XMM[3]
1712	movdqu	@XMM[0], 0x00($out)	# write output
1713	movdqu	@XMM[1], 0x10($out)
1714	movdqu	@XMM[6], 0x20($out)
1715	movdqu	@XMM[4], 0x30($out)
1716	movdqu	@XMM[2], 0x40($out)
1717	movdqu	@XMM[7], 0x50($out)
1718	movdqu	@XMM[3], 0x60($out)
1719	jmp	.Lcbc_dec_done
1720.align	16
1721.Lcbc_dec_six:
1722	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1723	call	_bsaes_decrypt8
1724	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1725	movdqu	0x00($inp), @XMM[8]	# re-load input
1726	movdqu	0x10($inp), @XMM[9]
1727	pxor	@XMM[8], @XMM[1]
1728	movdqu	0x20($inp), @XMM[10]
1729	pxor	@XMM[9], @XMM[6]
1730	movdqu	0x30($inp), @XMM[11]
1731	pxor	@XMM[10], @XMM[4]
1732	movdqu	0x40($inp), @XMM[12]
1733	pxor	@XMM[11], @XMM[2]
1734	movdqu	0x50($inp), @XMM[15]	# IV
1735	pxor	@XMM[12], @XMM[7]
1736	movdqu	@XMM[0], 0x00($out)	# write output
1737	movdqu	@XMM[1], 0x10($out)
1738	movdqu	@XMM[6], 0x20($out)
1739	movdqu	@XMM[4], 0x30($out)
1740	movdqu	@XMM[2], 0x40($out)
1741	movdqu	@XMM[7], 0x50($out)
1742	jmp	.Lcbc_dec_done
1743.align	16
1744.Lcbc_dec_five:
1745	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1746	call	_bsaes_decrypt8
1747	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1748	movdqu	0x00($inp), @XMM[8]	# re-load input
1749	movdqu	0x10($inp), @XMM[9]
1750	pxor	@XMM[8], @XMM[1]
1751	movdqu	0x20($inp), @XMM[10]
1752	pxor	@XMM[9], @XMM[6]
1753	movdqu	0x30($inp), @XMM[11]
1754	pxor	@XMM[10], @XMM[4]
1755	movdqu	0x40($inp), @XMM[15]	# IV
1756	pxor	@XMM[11], @XMM[2]
1757	movdqu	@XMM[0], 0x00($out)	# write output
1758	movdqu	@XMM[1], 0x10($out)
1759	movdqu	@XMM[6], 0x20($out)
1760	movdqu	@XMM[4], 0x30($out)
1761	movdqu	@XMM[2], 0x40($out)
1762	jmp	.Lcbc_dec_done
1763.align	16
1764.Lcbc_dec_four:
1765	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1766	call	_bsaes_decrypt8
1767	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1768	movdqu	0x00($inp), @XMM[8]	# re-load input
1769	movdqu	0x10($inp), @XMM[9]
1770	pxor	@XMM[8], @XMM[1]
1771	movdqu	0x20($inp), @XMM[10]
1772	pxor	@XMM[9], @XMM[6]
1773	movdqu	0x30($inp), @XMM[15]	# IV
1774	pxor	@XMM[10], @XMM[4]
1775	movdqu	@XMM[0], 0x00($out)	# write output
1776	movdqu	@XMM[1], 0x10($out)
1777	movdqu	@XMM[6], 0x20($out)
1778	movdqu	@XMM[4], 0x30($out)
1779	jmp	.Lcbc_dec_done
1780.align	16
1781.Lcbc_dec_three:
1782	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1783	call	_bsaes_decrypt8
1784	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1785	movdqu	0x00($inp), @XMM[8]	# re-load input
1786	movdqu	0x10($inp), @XMM[9]
1787	pxor	@XMM[8], @XMM[1]
1788	movdqu	0x20($inp), @XMM[15]	# IV
1789	pxor	@XMM[9], @XMM[6]
1790	movdqu	@XMM[0], 0x00($out)	# write output
1791	movdqu	@XMM[1], 0x10($out)
1792	movdqu	@XMM[6], 0x20($out)
1793	jmp	.Lcbc_dec_done
1794.align	16
1795.Lcbc_dec_two:
1796	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1797	call	_bsaes_decrypt8
1798	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1799	movdqu	0x00($inp), @XMM[8]	# re-load input
1800	movdqu	0x10($inp), @XMM[15]	# IV
1801	pxor	@XMM[8], @XMM[1]
1802	movdqu	@XMM[0], 0x00($out)	# write output
1803	movdqu	@XMM[1], 0x10($out)
1804	jmp	.Lcbc_dec_done
1805.align	16
1806.Lcbc_dec_one:
1807	lea	($inp), $arg1
1808	lea	0x20(%rbp), $arg2	# buffer output
1809	lea	($key), $arg3
1810	call	asm_AES_decrypt		# doesn't touch %xmm
1811	pxor	0x20(%rbp), @XMM[15]	# ^= IV
1812	movdqu	@XMM[15], ($out)	# write output
1813	movdqa	@XMM[0], @XMM[15]	# IV
1814
1815.Lcbc_dec_done:
1816	movdqu	@XMM[15], (%rbx)	# return IV
1817	lea	(%rsp), %rax
1818	pxor	%xmm0, %xmm0
1819.Lcbc_dec_bzero:			# wipe key schedule [if any]
1820	movdqa	%xmm0, 0x00(%rax)
1821	movdqa	%xmm0, 0x10(%rax)
1822	lea	0x20(%rax), %rax
1823	cmp	%rax, %rbp
1824	ja	.Lcbc_dec_bzero
1825
1826	lea	(%rbp),%rsp		# restore %rsp
1827___
1828$code.=<<___ if ($win64);
1829	movaps	0x40(%rbp), %xmm6
1830	movaps	0x50(%rbp), %xmm7
1831	movaps	0x60(%rbp), %xmm8
1832	movaps	0x70(%rbp), %xmm9
1833	movaps	0x80(%rbp), %xmm10
1834	movaps	0x90(%rbp), %xmm11
1835	movaps	0xa0(%rbp), %xmm12
1836	movaps	0xb0(%rbp), %xmm13
1837	movaps	0xc0(%rbp), %xmm14
1838	movaps	0xd0(%rbp), %xmm15
1839	lea	0xa0(%rbp), %rsp
1840___
1841$code.=<<___;
1842	mov	0x48(%rsp), %r15
1843	mov	0x50(%rsp), %r14
1844	mov	0x58(%rsp), %r13
1845	mov	0x60(%rsp), %r12
1846	mov	0x68(%rsp), %rbx
1847	mov	0x70(%rsp), %rax
1848	lea	0x78(%rsp), %rsp
1849	mov	%rax, %rbp
1850.Lcbc_dec_epilogue:
1851	ret
1852.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1853
1854.globl	bsaes_ctr32_encrypt_blocks
1855.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1856.align	16
1857bsaes_ctr32_encrypt_blocks:
1858	mov	%rsp, %rax
1859.Lctr_enc_prologue:
1860	push	%rbp
1861	push	%rbx
1862	push	%r12
1863	push	%r13
1864	push	%r14
1865	push	%r15
1866	lea	-0x48(%rsp), %rsp
1867___
1868$code.=<<___ if ($win64);
1869	mov	0xa0(%rsp),$arg5	# pull ivp
1870	lea	-0xa0(%rsp), %rsp
1871	movaps	%xmm6, 0x40(%rsp)
1872	movaps	%xmm7, 0x50(%rsp)
1873	movaps	%xmm8, 0x60(%rsp)
1874	movaps	%xmm9, 0x70(%rsp)
1875	movaps	%xmm10, 0x80(%rsp)
1876	movaps	%xmm11, 0x90(%rsp)
1877	movaps	%xmm12, 0xa0(%rsp)
1878	movaps	%xmm13, 0xb0(%rsp)
1879	movaps	%xmm14, 0xc0(%rsp)
1880	movaps	%xmm15, 0xd0(%rsp)
1881.Lctr_enc_body:
1882___
1883$code.=<<___;
1884	mov	%rsp, %rbp		# backup %rsp
1885	movdqu	($arg5), %xmm0		# load counter
1886	mov	240($arg4), %eax	# rounds
1887	mov	$arg1, $inp		# backup arguments
1888	mov	$arg2, $out
1889	mov	$arg3, $len
1890	mov	$arg4, $key
1891	movdqa	%xmm0, 0x20(%rbp)	# copy counter
1892	cmp	\$8, $arg3
1893	jb	.Lctr_enc_short
1894
1895	mov	%eax, %ebx		# rounds
1896	shl	\$7, %rax		# 128 bytes per inner round key
1897	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1898	sub	%rax, %rsp
1899
1900	mov	%rsp, %rax		# pass key schedule
1901	mov	$key, %rcx		# pass key
1902	mov	%ebx, %r10d		# pass rounds
1903	call	_bsaes_key_convert
1904	pxor	%xmm6,%xmm7		# fix up last round key
1905	movdqa	%xmm7,(%rax)		# save last round key
1906
1907	movdqa	(%rsp), @XMM[9]		# load round0 key
1908	lea	.LADD1(%rip), %r11
1909	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1910	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1911	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1912	pshufb	@XMM[8], @XMM[0]
1913	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
1914	jmp	.Lctr_enc_loop
1915.align	16
1916.Lctr_enc_loop:
1917	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1918	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1919	movdqa	@XMM[0], @XMM[2]
1920	paddd	0x00(%r11), @XMM[1]	# .LADD1
1921	movdqa	@XMM[0], @XMM[3]
1922	paddd	0x10(%r11), @XMM[2]	# .LADD2
1923	movdqa	@XMM[0], @XMM[4]
1924	paddd	0x20(%r11), @XMM[3]	# .LADD3
1925	movdqa	@XMM[0], @XMM[5]
1926	paddd	0x30(%r11), @XMM[4]	# .LADD4
1927	movdqa	@XMM[0], @XMM[6]
1928	paddd	0x40(%r11), @XMM[5]	# .LADD5
1929	movdqa	@XMM[0], @XMM[7]
1930	paddd	0x50(%r11), @XMM[6]	# .LADD6
1931	paddd	0x60(%r11), @XMM[7]	# .LADD7
1932
1933	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
1934	# to flip byte order in 32-bit counter
1935	movdqa	(%rsp), @XMM[9]		# round 0 key
1936	lea	0x10(%rsp), %rax	# pass key schedule
1937	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
1938	pxor	@XMM[9], @XMM[0]	# xor with round0 key
1939	pxor	@XMM[9], @XMM[1]
1940	 pshufb	@XMM[8], @XMM[0]
1941	pxor	@XMM[9], @XMM[2]
1942	 pshufb	@XMM[8], @XMM[1]
1943	pxor	@XMM[9], @XMM[3]
1944	 pshufb	@XMM[8], @XMM[2]
1945	pxor	@XMM[9], @XMM[4]
1946	 pshufb	@XMM[8], @XMM[3]
1947	pxor	@XMM[9], @XMM[5]
1948	 pshufb	@XMM[8], @XMM[4]
1949	pxor	@XMM[9], @XMM[6]
1950	 pshufb	@XMM[8], @XMM[5]
1951	pxor	@XMM[9], @XMM[7]
1952	 pshufb	@XMM[8], @XMM[6]
1953	lea	.LBS0(%rip), %r11	# constants table
1954	 pshufb	@XMM[8], @XMM[7]
1955	mov	%ebx,%r10d		# pass rounds
1956
1957	call	_bsaes_encrypt8_bitslice
1958
1959	sub	\$8,$len
1960	jc	.Lctr_enc_loop_done
1961
1962	movdqu	0x00($inp), @XMM[8]	# load input
1963	movdqu	0x10($inp), @XMM[9]
1964	movdqu	0x20($inp), @XMM[10]
1965	movdqu	0x30($inp), @XMM[11]
1966	movdqu	0x40($inp), @XMM[12]
1967	movdqu	0x50($inp), @XMM[13]
1968	movdqu	0x60($inp), @XMM[14]
1969	movdqu	0x70($inp), @XMM[15]
1970	lea	0x80($inp),$inp
1971	pxor	@XMM[0], @XMM[8]
1972	movdqa	0x20(%rbp), @XMM[0]	# load counter
1973	pxor	@XMM[9], @XMM[1]
1974	movdqu	@XMM[8], 0x00($out)	# write output
1975	pxor	@XMM[10], @XMM[4]
1976	movdqu	@XMM[1], 0x10($out)
1977	pxor	@XMM[11], @XMM[6]
1978	movdqu	@XMM[4], 0x20($out)
1979	pxor	@XMM[12], @XMM[3]
1980	movdqu	@XMM[6], 0x30($out)
1981	pxor	@XMM[13], @XMM[7]
1982	movdqu	@XMM[3], 0x40($out)
1983	pxor	@XMM[14], @XMM[2]
1984	movdqu	@XMM[7], 0x50($out)
1985	pxor	@XMM[15], @XMM[5]
1986	movdqu	@XMM[2], 0x60($out)
1987	lea	.LADD1(%rip), %r11
1988	movdqu	@XMM[5], 0x70($out)
1989	lea	0x80($out), $out
1990	paddd	0x70(%r11), @XMM[0]	# .LADD8
1991	jnz	.Lctr_enc_loop
1992
1993	jmp	.Lctr_enc_done
1994.align	16
1995.Lctr_enc_loop_done:
1996	add	\$8, $len
1997	movdqu	0x00($inp), @XMM[8]	# load input
1998	pxor	@XMM[8], @XMM[0]
1999	movdqu	@XMM[0], 0x00($out)	# write output
2000	cmp	\$2,$len
2001	jb	.Lctr_enc_done
2002	movdqu	0x10($inp), @XMM[9]
2003	pxor	@XMM[9], @XMM[1]
2004	movdqu	@XMM[1], 0x10($out)
2005	je	.Lctr_enc_done
2006	movdqu	0x20($inp), @XMM[10]
2007	pxor	@XMM[10], @XMM[4]
2008	movdqu	@XMM[4], 0x20($out)
2009	cmp	\$4,$len
2010	jb	.Lctr_enc_done
2011	movdqu	0x30($inp), @XMM[11]
2012	pxor	@XMM[11], @XMM[6]
2013	movdqu	@XMM[6], 0x30($out)
2014	je	.Lctr_enc_done
2015	movdqu	0x40($inp), @XMM[12]
2016	pxor	@XMM[12], @XMM[3]
2017	movdqu	@XMM[3], 0x40($out)
2018	cmp	\$6,$len
2019	jb	.Lctr_enc_done
2020	movdqu	0x50($inp), @XMM[13]
2021	pxor	@XMM[13], @XMM[7]
2022	movdqu	@XMM[7], 0x50($out)
2023	je	.Lctr_enc_done
2024	movdqu	0x60($inp), @XMM[14]
2025	pxor	@XMM[14], @XMM[2]
2026	movdqu	@XMM[2], 0x60($out)
2027	jmp	.Lctr_enc_done
2028
2029.align	16
2030.Lctr_enc_short:
2031	lea	0x20(%rbp), $arg1
2032	lea	0x30(%rbp), $arg2
2033	lea	($key), $arg3
2034	call	asm_AES_encrypt
2035	movdqu	($inp), @XMM[1]
2036	lea	16($inp), $inp
2037	mov	0x2c(%rbp), %eax	# load 32-bit counter
2038	bswap	%eax
2039	pxor	0x30(%rbp), @XMM[1]
2040	inc	%eax			# increment
2041	movdqu	@XMM[1], ($out)
2042	bswap	%eax
2043	lea	16($out), $out
2044	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
2045	dec	$len
2046	jnz	.Lctr_enc_short
2047
2048.Lctr_enc_done:
2049	lea	(%rsp), %rax
2050	pxor	%xmm0, %xmm0
2051.Lctr_enc_bzero:			# wipe key schedule [if any]
2052	movdqa	%xmm0, 0x00(%rax)
2053	movdqa	%xmm0, 0x10(%rax)
2054	lea	0x20(%rax), %rax
2055	cmp	%rax, %rbp
2056	ja	.Lctr_enc_bzero
2057
2058	lea	(%rbp),%rsp		# restore %rsp
2059___
2060$code.=<<___ if ($win64);
2061	movaps	0x40(%rbp), %xmm6
2062	movaps	0x50(%rbp), %xmm7
2063	movaps	0x60(%rbp), %xmm8
2064	movaps	0x70(%rbp), %xmm9
2065	movaps	0x80(%rbp), %xmm10
2066	movaps	0x90(%rbp), %xmm11
2067	movaps	0xa0(%rbp), %xmm12
2068	movaps	0xb0(%rbp), %xmm13
2069	movaps	0xc0(%rbp), %xmm14
2070	movaps	0xd0(%rbp), %xmm15
2071	lea	0xa0(%rbp), %rsp
2072___
2073$code.=<<___;
2074	mov	0x48(%rsp), %r15
2075	mov	0x50(%rsp), %r14
2076	mov	0x58(%rsp), %r13
2077	mov	0x60(%rsp), %r12
2078	mov	0x68(%rsp), %rbx
2079	mov	0x70(%rsp), %rax
2080	lea	0x78(%rsp), %rsp
2081	mov	%rax, %rbp
2082.Lctr_enc_epilogue:
2083	ret
2084.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2085___
2086######################################################################
2087# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2088#	const AES_KEY *key1, const AES_KEY *key2,
2089#	const unsigned char iv[16]);
2090#
2091my ($twmask,$twres,$twtmp)=@XMM[13..15];
2092$arg6=~s/d$//;
2093
2094$code.=<<___;
2095.globl	bsaes_xts_encrypt
2096.type	bsaes_xts_encrypt,\@abi-omnipotent
2097.align	16
2098bsaes_xts_encrypt:
2099	mov	%rsp, %rax
2100.Lxts_enc_prologue:
2101	push	%rbp
2102	push	%rbx
2103	push	%r12
2104	push	%r13
2105	push	%r14
2106	push	%r15
2107	lea	-0x48(%rsp), %rsp
2108___
2109$code.=<<___ if ($win64);
2110	mov	0xa0(%rsp),$arg5	# pull key2
2111	mov	0xa8(%rsp),$arg6	# pull ivp
2112	lea	-0xa0(%rsp), %rsp
2113	movaps	%xmm6, 0x40(%rsp)
2114	movaps	%xmm7, 0x50(%rsp)
2115	movaps	%xmm8, 0x60(%rsp)
2116	movaps	%xmm9, 0x70(%rsp)
2117	movaps	%xmm10, 0x80(%rsp)
2118	movaps	%xmm11, 0x90(%rsp)
2119	movaps	%xmm12, 0xa0(%rsp)
2120	movaps	%xmm13, 0xb0(%rsp)
2121	movaps	%xmm14, 0xc0(%rsp)
2122	movaps	%xmm15, 0xd0(%rsp)
2123.Lxts_enc_body:
2124___
2125$code.=<<___;
2126	mov	%rsp, %rbp		# backup %rsp
2127	mov	$arg1, $inp		# backup arguments
2128	mov	$arg2, $out
2129	mov	$arg3, $len
2130	mov	$arg4, $key
2131
2132	lea	($arg6), $arg1
2133	lea	0x20(%rbp), $arg2
2134	lea	($arg5), $arg3
2135	call	asm_AES_encrypt		# generate initial tweak
2136
2137	mov	240($key), %eax		# rounds
2138	mov	$len, %rbx		# backup $len
2139
2140	mov	%eax, %edx		# rounds
2141	shl	\$7, %rax		# 128 bytes per inner round key
2142	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2143	sub	%rax, %rsp
2144
2145	mov	%rsp, %rax		# pass key schedule
2146	mov	$key, %rcx		# pass key
2147	mov	%edx, %r10d		# pass rounds
2148	call	_bsaes_key_convert
2149	pxor	%xmm6, %xmm7		# fix up last round key
2150	movdqa	%xmm7, (%rax)		# save last round key
2151
2152	and	\$-16, $len
2153	sub	\$0x80, %rsp		# place for tweak[8]
2154	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2155
2156	pxor	$twtmp, $twtmp
2157	movdqa	.Lxts_magic(%rip), $twmask
2158	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2159
2160	sub	\$0x80, $len
2161	jc	.Lxts_enc_short
2162	jmp	.Lxts_enc_loop
2163
2164.align	16
2165.Lxts_enc_loop:
2166___
2167    for ($i=0;$i<7;$i++) {
2168    $code.=<<___;
2169	pshufd	\$0x13, $twtmp, $twres
2170	pxor	$twtmp, $twtmp
2171	movdqa	@XMM[7], @XMM[$i]
2172	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2173	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2174	pand	$twmask, $twres		# isolate carry and residue
2175	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2176	pxor	$twres, @XMM[7]
2177___
2178    $code.=<<___ if ($i>=1);
2179	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2180___
2181    $code.=<<___ if ($i>=2);
2182	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2183___
2184    }
2185$code.=<<___;
2186	movdqu	0x60($inp), @XMM[8+6]
2187	pxor	@XMM[8+5], @XMM[5]
2188	movdqu	0x70($inp), @XMM[8+7]
2189	lea	0x80($inp), $inp
2190	movdqa	@XMM[7], 0x70(%rsp)
2191	pxor	@XMM[8+6], @XMM[6]
2192	lea	0x80(%rsp), %rax	# pass key schedule
2193	pxor	@XMM[8+7], @XMM[7]
2194	mov	%edx, %r10d		# pass rounds
2195
2196	call	_bsaes_encrypt8
2197
2198	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2199	pxor	0x10(%rsp), @XMM[1]
2200	movdqu	@XMM[0], 0x00($out)	# write output
2201	pxor	0x20(%rsp), @XMM[4]
2202	movdqu	@XMM[1], 0x10($out)
2203	pxor	0x30(%rsp), @XMM[6]
2204	movdqu	@XMM[4], 0x20($out)
2205	pxor	0x40(%rsp), @XMM[3]
2206	movdqu	@XMM[6], 0x30($out)
2207	pxor	0x50(%rsp), @XMM[7]
2208	movdqu	@XMM[3], 0x40($out)
2209	pxor	0x60(%rsp), @XMM[2]
2210	movdqu	@XMM[7], 0x50($out)
2211	pxor	0x70(%rsp), @XMM[5]
2212	movdqu	@XMM[2], 0x60($out)
2213	movdqu	@XMM[5], 0x70($out)
2214	lea	0x80($out), $out
2215
2216	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2217	pxor	$twtmp, $twtmp
2218	movdqa	.Lxts_magic(%rip), $twmask
2219	pcmpgtd	@XMM[7], $twtmp
2220	pshufd	\$0x13, $twtmp, $twres
2221	pxor	$twtmp, $twtmp
2222	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2223	pand	$twmask, $twres		# isolate carry and residue
2224	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2225	pxor	$twres, @XMM[7]
2226
2227	sub	\$0x80,$len
2228	jnc	.Lxts_enc_loop
2229
2230.Lxts_enc_short:
2231	add	\$0x80, $len
2232	jz	.Lxts_enc_done
2233___
2234    for ($i=0;$i<7;$i++) {
2235    $code.=<<___;
2236	pshufd	\$0x13, $twtmp, $twres
2237	pxor	$twtmp, $twtmp
2238	movdqa	@XMM[7], @XMM[$i]
2239	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2240	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2241	pand	$twmask, $twres		# isolate carry and residue
2242	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2243	pxor	$twres, @XMM[7]
2244___
2245    $code.=<<___ if ($i>=1);
2246	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2247	cmp	\$`0x10*$i`,$len
2248	je	.Lxts_enc_$i
2249___
2250    $code.=<<___ if ($i>=2);
2251	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2252___
2253    }
2254$code.=<<___;
2255	movdqu	0x60($inp), @XMM[8+6]
2256	pxor	@XMM[8+5], @XMM[5]
2257	movdqa	@XMM[7], 0x70(%rsp)
2258	lea	0x70($inp), $inp
2259	pxor	@XMM[8+6], @XMM[6]
2260	lea	0x80(%rsp), %rax	# pass key schedule
2261	mov	%edx, %r10d		# pass rounds
2262
2263	call	_bsaes_encrypt8
2264
2265	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2266	pxor	0x10(%rsp), @XMM[1]
2267	movdqu	@XMM[0], 0x00($out)	# write output
2268	pxor	0x20(%rsp), @XMM[4]
2269	movdqu	@XMM[1], 0x10($out)
2270	pxor	0x30(%rsp), @XMM[6]
2271	movdqu	@XMM[4], 0x20($out)
2272	pxor	0x40(%rsp), @XMM[3]
2273	movdqu	@XMM[6], 0x30($out)
2274	pxor	0x50(%rsp), @XMM[7]
2275	movdqu	@XMM[3], 0x40($out)
2276	pxor	0x60(%rsp), @XMM[2]
2277	movdqu	@XMM[7], 0x50($out)
2278	movdqu	@XMM[2], 0x60($out)
2279	lea	0x70($out), $out
2280
2281	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2282	jmp	.Lxts_enc_done
2283.align	16
2284.Lxts_enc_6:
2285	pxor	@XMM[8+4], @XMM[4]
2286	lea	0x60($inp), $inp
2287	pxor	@XMM[8+5], @XMM[5]
2288	lea	0x80(%rsp), %rax	# pass key schedule
2289	mov	%edx, %r10d		# pass rounds
2290
2291	call	_bsaes_encrypt8
2292
2293	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2294	pxor	0x10(%rsp), @XMM[1]
2295	movdqu	@XMM[0], 0x00($out)	# write output
2296	pxor	0x20(%rsp), @XMM[4]
2297	movdqu	@XMM[1], 0x10($out)
2298	pxor	0x30(%rsp), @XMM[6]
2299	movdqu	@XMM[4], 0x20($out)
2300	pxor	0x40(%rsp), @XMM[3]
2301	movdqu	@XMM[6], 0x30($out)
2302	pxor	0x50(%rsp), @XMM[7]
2303	movdqu	@XMM[3], 0x40($out)
2304	movdqu	@XMM[7], 0x50($out)
2305	lea	0x60($out), $out
2306
2307	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2308	jmp	.Lxts_enc_done
2309.align	16
2310.Lxts_enc_5:
2311	pxor	@XMM[8+3], @XMM[3]
2312	lea	0x50($inp), $inp
2313	pxor	@XMM[8+4], @XMM[4]
2314	lea	0x80(%rsp), %rax	# pass key schedule
2315	mov	%edx, %r10d		# pass rounds
2316
2317	call	_bsaes_encrypt8
2318
2319	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2320	pxor	0x10(%rsp), @XMM[1]
2321	movdqu	@XMM[0], 0x00($out)	# write output
2322	pxor	0x20(%rsp), @XMM[4]
2323	movdqu	@XMM[1], 0x10($out)
2324	pxor	0x30(%rsp), @XMM[6]
2325	movdqu	@XMM[4], 0x20($out)
2326	pxor	0x40(%rsp), @XMM[3]
2327	movdqu	@XMM[6], 0x30($out)
2328	movdqu	@XMM[3], 0x40($out)
2329	lea	0x50($out), $out
2330
2331	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2332	jmp	.Lxts_enc_done
2333.align	16
2334.Lxts_enc_4:
2335	pxor	@XMM[8+2], @XMM[2]
2336	lea	0x40($inp), $inp
2337	pxor	@XMM[8+3], @XMM[3]
2338	lea	0x80(%rsp), %rax	# pass key schedule
2339	mov	%edx, %r10d		# pass rounds
2340
2341	call	_bsaes_encrypt8
2342
2343	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2344	pxor	0x10(%rsp), @XMM[1]
2345	movdqu	@XMM[0], 0x00($out)	# write output
2346	pxor	0x20(%rsp), @XMM[4]
2347	movdqu	@XMM[1], 0x10($out)
2348	pxor	0x30(%rsp), @XMM[6]
2349	movdqu	@XMM[4], 0x20($out)
2350	movdqu	@XMM[6], 0x30($out)
2351	lea	0x40($out), $out
2352
2353	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2354	jmp	.Lxts_enc_done
2355.align	16
2356.Lxts_enc_3:
2357	pxor	@XMM[8+1], @XMM[1]
2358	lea	0x30($inp), $inp
2359	pxor	@XMM[8+2], @XMM[2]
2360	lea	0x80(%rsp), %rax	# pass key schedule
2361	mov	%edx, %r10d		# pass rounds
2362
2363	call	_bsaes_encrypt8
2364
2365	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2366	pxor	0x10(%rsp), @XMM[1]
2367	movdqu	@XMM[0], 0x00($out)	# write output
2368	pxor	0x20(%rsp), @XMM[4]
2369	movdqu	@XMM[1], 0x10($out)
2370	movdqu	@XMM[4], 0x20($out)
2371	lea	0x30($out), $out
2372
2373	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2374	jmp	.Lxts_enc_done
2375.align	16
2376.Lxts_enc_2:
2377	pxor	@XMM[8+0], @XMM[0]
2378	lea	0x20($inp), $inp
2379	pxor	@XMM[8+1], @XMM[1]
2380	lea	0x80(%rsp), %rax	# pass key schedule
2381	mov	%edx, %r10d		# pass rounds
2382
2383	call	_bsaes_encrypt8
2384
2385	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2386	pxor	0x10(%rsp), @XMM[1]
2387	movdqu	@XMM[0], 0x00($out)	# write output
2388	movdqu	@XMM[1], 0x10($out)
2389	lea	0x20($out), $out
2390
2391	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2392	jmp	.Lxts_enc_done
2393.align	16
2394.Lxts_enc_1:
2395	pxor	@XMM[0], @XMM[8]
2396	lea	0x10($inp), $inp
2397	movdqa	@XMM[8], 0x20(%rbp)
2398	lea	0x20(%rbp), $arg1
2399	lea	0x20(%rbp), $arg2
2400	lea	($key), $arg3
2401	call	asm_AES_encrypt		# doesn't touch %xmm
2402	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2403	#pxor	@XMM[8], @XMM[0]
2404	#lea	0x80(%rsp), %rax	# pass key schedule
2405	#mov	%edx, %r10d		# pass rounds
2406	#call	_bsaes_encrypt8
2407	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2408	movdqu	@XMM[0], 0x00($out)	# write output
2409	lea	0x10($out), $out
2410
2411	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2412
2413.Lxts_enc_done:
2414	and	\$15, %ebx
2415	jz	.Lxts_enc_ret
2416	mov	$out, %rdx
2417
2418.Lxts_enc_steal:
2419	movzb	($inp), %eax
2420	movzb	-16(%rdx), %ecx
2421	lea	1($inp), $inp
2422	mov	%al, -16(%rdx)
2423	mov	%cl, 0(%rdx)
2424	lea	1(%rdx), %rdx
2425	sub	\$1,%ebx
2426	jnz	.Lxts_enc_steal
2427
2428	movdqu	-16($out), @XMM[0]
2429	lea	0x20(%rbp), $arg1
2430	pxor	@XMM[7], @XMM[0]
2431	lea	0x20(%rbp), $arg2
2432	movdqa	@XMM[0], 0x20(%rbp)
2433	lea	($key), $arg3
2434	call	asm_AES_encrypt		# doesn't touch %xmm
2435	pxor	0x20(%rbp), @XMM[7]
2436	movdqu	@XMM[7], -16($out)
2437
2438.Lxts_enc_ret:
2439	lea	(%rsp), %rax
2440	pxor	%xmm0, %xmm0
2441.Lxts_enc_bzero:			# wipe key schedule [if any]
2442	movdqa	%xmm0, 0x00(%rax)
2443	movdqa	%xmm0, 0x10(%rax)
2444	lea	0x20(%rax), %rax
2445	cmp	%rax, %rbp
2446	ja	.Lxts_enc_bzero
2447
2448	lea	(%rbp),%rsp		# restore %rsp
2449___
2450$code.=<<___ if ($win64);
2451	movaps	0x40(%rbp), %xmm6
2452	movaps	0x50(%rbp), %xmm7
2453	movaps	0x60(%rbp), %xmm8
2454	movaps	0x70(%rbp), %xmm9
2455	movaps	0x80(%rbp), %xmm10
2456	movaps	0x90(%rbp), %xmm11
2457	movaps	0xa0(%rbp), %xmm12
2458	movaps	0xb0(%rbp), %xmm13
2459	movaps	0xc0(%rbp), %xmm14
2460	movaps	0xd0(%rbp), %xmm15
2461	lea	0xa0(%rbp), %rsp
2462___
2463$code.=<<___;
2464	mov	0x48(%rsp), %r15
2465	mov	0x50(%rsp), %r14
2466	mov	0x58(%rsp), %r13
2467	mov	0x60(%rsp), %r12
2468	mov	0x68(%rsp), %rbx
2469	mov	0x70(%rsp), %rax
2470	lea	0x78(%rsp), %rsp
2471	mov	%rax, %rbp
2472.Lxts_enc_epilogue:
2473	ret
2474.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
2475
2476.globl	bsaes_xts_decrypt
2477.type	bsaes_xts_decrypt,\@abi-omnipotent
2478.align	16
2479bsaes_xts_decrypt:
2480	mov	%rsp, %rax
2481.Lxts_dec_prologue:
2482	push	%rbp
2483	push	%rbx
2484	push	%r12
2485	push	%r13
2486	push	%r14
2487	push	%r15
2488	lea	-0x48(%rsp), %rsp
2489___
2490$code.=<<___ if ($win64);
2491	mov	0xa0(%rsp),$arg5	# pull key2
2492	mov	0xa8(%rsp),$arg6	# pull ivp
2493	lea	-0xa0(%rsp), %rsp
2494	movaps	%xmm6, 0x40(%rsp)
2495	movaps	%xmm7, 0x50(%rsp)
2496	movaps	%xmm8, 0x60(%rsp)
2497	movaps	%xmm9, 0x70(%rsp)
2498	movaps	%xmm10, 0x80(%rsp)
2499	movaps	%xmm11, 0x90(%rsp)
2500	movaps	%xmm12, 0xa0(%rsp)
2501	movaps	%xmm13, 0xb0(%rsp)
2502	movaps	%xmm14, 0xc0(%rsp)
2503	movaps	%xmm15, 0xd0(%rsp)
2504.Lxts_dec_body:
2505___
2506$code.=<<___;
2507	mov	%rsp, %rbp		# backup %rsp
2508	mov	$arg1, $inp		# backup arguments
2509	mov	$arg2, $out
2510	mov	$arg3, $len
2511	mov	$arg4, $key
2512
2513	lea	($arg6), $arg1
2514	lea	0x20(%rbp), $arg2
2515	lea	($arg5), $arg3
2516	call	asm_AES_encrypt		# generate initial tweak
2517
2518	mov	240($key), %eax		# rounds
2519	mov	$len, %rbx		# backup $len
2520
2521	mov	%eax, %edx		# rounds
2522	shl	\$7, %rax		# 128 bytes per inner round key
2523	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2524	sub	%rax, %rsp
2525
2526	mov	%rsp, %rax		# pass key schedule
2527	mov	$key, %rcx		# pass key
2528	mov	%edx, %r10d		# pass rounds
2529	call	_bsaes_key_convert
2530	pxor	(%rsp), %xmm7		# fix up round 0 key
2531	movdqa	%xmm6, (%rax)		# save last round key
2532	movdqa	%xmm7, (%rsp)
2533
2534	xor	%eax, %eax		# if ($len%16) len-=16;
2535	and	\$-16, $len
2536	test	\$15, %ebx
2537	setnz	%al
2538	shl	\$4, %rax
2539	sub	%rax, $len
2540
2541	sub	\$0x80, %rsp		# place for tweak[8]
2542	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2543
2544	pxor	$twtmp, $twtmp
2545	movdqa	.Lxts_magic(%rip), $twmask
2546	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2547
2548	sub	\$0x80, $len
2549	jc	.Lxts_dec_short
2550	jmp	.Lxts_dec_loop
2551
2552.align	16
2553.Lxts_dec_loop:
2554___
2555    for ($i=0;$i<7;$i++) {
2556    $code.=<<___;
2557	pshufd	\$0x13, $twtmp, $twres
2558	pxor	$twtmp, $twtmp
2559	movdqa	@XMM[7], @XMM[$i]
2560	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2561	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2562	pand	$twmask, $twres		# isolate carry and residue
2563	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2564	pxor	$twres, @XMM[7]
2565___
2566    $code.=<<___ if ($i>=1);
2567	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2568___
2569    $code.=<<___ if ($i>=2);
2570	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2571___
2572    }
2573$code.=<<___;
2574	movdqu	0x60($inp), @XMM[8+6]
2575	pxor	@XMM[8+5], @XMM[5]
2576	movdqu	0x70($inp), @XMM[8+7]
2577	lea	0x80($inp), $inp
2578	movdqa	@XMM[7], 0x70(%rsp)
2579	pxor	@XMM[8+6], @XMM[6]
2580	lea	0x80(%rsp), %rax	# pass key schedule
2581	pxor	@XMM[8+7], @XMM[7]
2582	mov	%edx, %r10d		# pass rounds
2583
2584	call	_bsaes_decrypt8
2585
2586	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2587	pxor	0x10(%rsp), @XMM[1]
2588	movdqu	@XMM[0], 0x00($out)	# write output
2589	pxor	0x20(%rsp), @XMM[6]
2590	movdqu	@XMM[1], 0x10($out)
2591	pxor	0x30(%rsp), @XMM[4]
2592	movdqu	@XMM[6], 0x20($out)
2593	pxor	0x40(%rsp), @XMM[2]
2594	movdqu	@XMM[4], 0x30($out)
2595	pxor	0x50(%rsp), @XMM[7]
2596	movdqu	@XMM[2], 0x40($out)
2597	pxor	0x60(%rsp), @XMM[3]
2598	movdqu	@XMM[7], 0x50($out)
2599	pxor	0x70(%rsp), @XMM[5]
2600	movdqu	@XMM[3], 0x60($out)
2601	movdqu	@XMM[5], 0x70($out)
2602	lea	0x80($out), $out
2603
2604	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2605	pxor	$twtmp, $twtmp
2606	movdqa	.Lxts_magic(%rip), $twmask
2607	pcmpgtd	@XMM[7], $twtmp
2608	pshufd	\$0x13, $twtmp, $twres
2609	pxor	$twtmp, $twtmp
2610	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2611	pand	$twmask, $twres		# isolate carry and residue
2612	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2613	pxor	$twres, @XMM[7]
2614
2615	sub	\$0x80,$len
2616	jnc	.Lxts_dec_loop
2617
2618.Lxts_dec_short:
2619	add	\$0x80, $len
2620	jz	.Lxts_dec_done
2621___
2622    for ($i=0;$i<7;$i++) {
2623    $code.=<<___;
2624	pshufd	\$0x13, $twtmp, $twres
2625	pxor	$twtmp, $twtmp
2626	movdqa	@XMM[7], @XMM[$i]
2627	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2628	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2629	pand	$twmask, $twres		# isolate carry and residue
2630	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2631	pxor	$twres, @XMM[7]
2632___
2633    $code.=<<___ if ($i>=1);
2634	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2635	cmp	\$`0x10*$i`,$len
2636	je	.Lxts_dec_$i
2637___
2638    $code.=<<___ if ($i>=2);
2639	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2640___
2641    }
2642$code.=<<___;
2643	movdqu	0x60($inp), @XMM[8+6]
2644	pxor	@XMM[8+5], @XMM[5]
2645	movdqa	@XMM[7], 0x70(%rsp)
2646	lea	0x70($inp), $inp
2647	pxor	@XMM[8+6], @XMM[6]
2648	lea	0x80(%rsp), %rax	# pass key schedule
2649	mov	%edx, %r10d		# pass rounds
2650
2651	call	_bsaes_decrypt8
2652
2653	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2654	pxor	0x10(%rsp), @XMM[1]
2655	movdqu	@XMM[0], 0x00($out)	# write output
2656	pxor	0x20(%rsp), @XMM[6]
2657	movdqu	@XMM[1], 0x10($out)
2658	pxor	0x30(%rsp), @XMM[4]
2659	movdqu	@XMM[6], 0x20($out)
2660	pxor	0x40(%rsp), @XMM[2]
2661	movdqu	@XMM[4], 0x30($out)
2662	pxor	0x50(%rsp), @XMM[7]
2663	movdqu	@XMM[2], 0x40($out)
2664	pxor	0x60(%rsp), @XMM[3]
2665	movdqu	@XMM[7], 0x50($out)
2666	movdqu	@XMM[3], 0x60($out)
2667	lea	0x70($out), $out
2668
2669	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2670	jmp	.Lxts_dec_done
2671.align	16
2672.Lxts_dec_6:
2673	pxor	@XMM[8+4], @XMM[4]
2674	lea	0x60($inp), $inp
2675	pxor	@XMM[8+5], @XMM[5]
2676	lea	0x80(%rsp), %rax	# pass key schedule
2677	mov	%edx, %r10d		# pass rounds
2678
2679	call	_bsaes_decrypt8
2680
2681	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2682	pxor	0x10(%rsp), @XMM[1]
2683	movdqu	@XMM[0], 0x00($out)	# write output
2684	pxor	0x20(%rsp), @XMM[6]
2685	movdqu	@XMM[1], 0x10($out)
2686	pxor	0x30(%rsp), @XMM[4]
2687	movdqu	@XMM[6], 0x20($out)
2688	pxor	0x40(%rsp), @XMM[2]
2689	movdqu	@XMM[4], 0x30($out)
2690	pxor	0x50(%rsp), @XMM[7]
2691	movdqu	@XMM[2], 0x40($out)
2692	movdqu	@XMM[7], 0x50($out)
2693	lea	0x60($out), $out
2694
2695	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2696	jmp	.Lxts_dec_done
2697.align	16
2698.Lxts_dec_5:
2699	pxor	@XMM[8+3], @XMM[3]
2700	lea	0x50($inp), $inp
2701	pxor	@XMM[8+4], @XMM[4]
2702	lea	0x80(%rsp), %rax	# pass key schedule
2703	mov	%edx, %r10d		# pass rounds
2704
2705	call	_bsaes_decrypt8
2706
2707	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2708	pxor	0x10(%rsp), @XMM[1]
2709	movdqu	@XMM[0], 0x00($out)	# write output
2710	pxor	0x20(%rsp), @XMM[6]
2711	movdqu	@XMM[1], 0x10($out)
2712	pxor	0x30(%rsp), @XMM[4]
2713	movdqu	@XMM[6], 0x20($out)
2714	pxor	0x40(%rsp), @XMM[2]
2715	movdqu	@XMM[4], 0x30($out)
2716	movdqu	@XMM[2], 0x40($out)
2717	lea	0x50($out), $out
2718
2719	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2720	jmp	.Lxts_dec_done
2721.align	16
2722.Lxts_dec_4:
2723	pxor	@XMM[8+2], @XMM[2]
2724	lea	0x40($inp), $inp
2725	pxor	@XMM[8+3], @XMM[3]
2726	lea	0x80(%rsp), %rax	# pass key schedule
2727	mov	%edx, %r10d		# pass rounds
2728
2729	call	_bsaes_decrypt8
2730
2731	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2732	pxor	0x10(%rsp), @XMM[1]
2733	movdqu	@XMM[0], 0x00($out)	# write output
2734	pxor	0x20(%rsp), @XMM[6]
2735	movdqu	@XMM[1], 0x10($out)
2736	pxor	0x30(%rsp), @XMM[4]
2737	movdqu	@XMM[6], 0x20($out)
2738	movdqu	@XMM[4], 0x30($out)
2739	lea	0x40($out), $out
2740
2741	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2742	jmp	.Lxts_dec_done
2743.align	16
2744.Lxts_dec_3:
2745	pxor	@XMM[8+1], @XMM[1]
2746	lea	0x30($inp), $inp
2747	pxor	@XMM[8+2], @XMM[2]
2748	lea	0x80(%rsp), %rax	# pass key schedule
2749	mov	%edx, %r10d		# pass rounds
2750
2751	call	_bsaes_decrypt8
2752
2753	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2754	pxor	0x10(%rsp), @XMM[1]
2755	movdqu	@XMM[0], 0x00($out)	# write output
2756	pxor	0x20(%rsp), @XMM[6]
2757	movdqu	@XMM[1], 0x10($out)
2758	movdqu	@XMM[6], 0x20($out)
2759	lea	0x30($out), $out
2760
2761	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2762	jmp	.Lxts_dec_done
2763.align	16
2764.Lxts_dec_2:
2765	pxor	@XMM[8+0], @XMM[0]
2766	lea	0x20($inp), $inp
2767	pxor	@XMM[8+1], @XMM[1]
2768	lea	0x80(%rsp), %rax	# pass key schedule
2769	mov	%edx, %r10d		# pass rounds
2770
2771	call	_bsaes_decrypt8
2772
2773	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2774	pxor	0x10(%rsp), @XMM[1]
2775	movdqu	@XMM[0], 0x00($out)	# write output
2776	movdqu	@XMM[1], 0x10($out)
2777	lea	0x20($out), $out
2778
2779	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2780	jmp	.Lxts_dec_done
2781.align	16
2782.Lxts_dec_1:
2783	pxor	@XMM[0], @XMM[8]
2784	lea	0x10($inp), $inp
2785	movdqa	@XMM[8], 0x20(%rbp)
2786	lea	0x20(%rbp), $arg1
2787	lea	0x20(%rbp), $arg2
2788	lea	($key), $arg3
2789	call	asm_AES_decrypt		# doesn't touch %xmm
2790	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2791	#pxor	@XMM[8], @XMM[0]
2792	#lea	0x80(%rsp), %rax	# pass key schedule
2793	#mov	%edx, %r10d		# pass rounds
2794	#call	_bsaes_decrypt8
2795	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2796	movdqu	@XMM[0], 0x00($out)	# write output
2797	lea	0x10($out), $out
2798
2799	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2800
2801.Lxts_dec_done:
2802	and	\$15, %ebx
2803	jz	.Lxts_dec_ret
2804
2805	pxor	$twtmp, $twtmp
2806	movdqa	.Lxts_magic(%rip), $twmask
2807	pcmpgtd	@XMM[7], $twtmp
2808	pshufd	\$0x13, $twtmp, $twres
2809	movdqa	@XMM[7], @XMM[6]
2810	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
2811	pand	$twmask, $twres		# isolate carry and residue
2812	movdqu	($inp), @XMM[0]
2813	pxor	$twres, @XMM[7]
2814
2815	lea	0x20(%rbp), $arg1
2816	pxor	@XMM[7], @XMM[0]
2817	lea	0x20(%rbp), $arg2
2818	movdqa	@XMM[0], 0x20(%rbp)
2819	lea	($key), $arg3
2820	call	asm_AES_decrypt		# doesn't touch %xmm
2821	pxor	0x20(%rbp), @XMM[7]
2822	mov	$out, %rdx
2823	movdqu	@XMM[7], ($out)
2824
2825.Lxts_dec_steal:
2826	movzb	16($inp), %eax
2827	movzb	(%rdx), %ecx
2828	lea	1($inp), $inp
2829	mov	%al, (%rdx)
2830	mov	%cl, 16(%rdx)
2831	lea	1(%rdx), %rdx
2832	sub	\$1,%ebx
2833	jnz	.Lxts_dec_steal
2834
2835	movdqu	($out), @XMM[0]
2836	lea	0x20(%rbp), $arg1
2837	pxor	@XMM[6], @XMM[0]
2838	lea	0x20(%rbp), $arg2
2839	movdqa	@XMM[0], 0x20(%rbp)
2840	lea	($key), $arg3
2841	call	asm_AES_decrypt		# doesn't touch %xmm
2842	pxor	0x20(%rbp), @XMM[6]
2843	movdqu	@XMM[6], ($out)
2844
2845.Lxts_dec_ret:
2846	lea	(%rsp), %rax
2847	pxor	%xmm0, %xmm0
2848.Lxts_dec_bzero:			# wipe key schedule [if any]
2849	movdqa	%xmm0, 0x00(%rax)
2850	movdqa	%xmm0, 0x10(%rax)
2851	lea	0x20(%rax), %rax
2852	cmp	%rax, %rbp
2853	ja	.Lxts_dec_bzero
2854
2855	lea	(%rbp),%rsp		# restore %rsp
2856___
2857$code.=<<___ if ($win64);
2858	movaps	0x40(%rbp), %xmm6
2859	movaps	0x50(%rbp), %xmm7
2860	movaps	0x60(%rbp), %xmm8
2861	movaps	0x70(%rbp), %xmm9
2862	movaps	0x80(%rbp), %xmm10
2863	movaps	0x90(%rbp), %xmm11
2864	movaps	0xa0(%rbp), %xmm12
2865	movaps	0xb0(%rbp), %xmm13
2866	movaps	0xc0(%rbp), %xmm14
2867	movaps	0xd0(%rbp), %xmm15
2868	lea	0xa0(%rbp), %rsp
2869___
2870$code.=<<___;
2871	mov	0x48(%rsp), %r15
2872	mov	0x50(%rsp), %r14
2873	mov	0x58(%rsp), %r13
2874	mov	0x60(%rsp), %r12
2875	mov	0x68(%rsp), %rbx
2876	mov	0x70(%rsp), %rax
2877	lea	0x78(%rsp), %rsp
2878	mov	%rax, %rbp
2879.Lxts_dec_epilogue:
2880	ret
2881.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
2882___
2883}
2884$code.=<<___;
2885.type	_bsaes_const,\@object
2886.align	64
2887_bsaes_const:
2888.LM0ISR:	# InvShiftRows constants
2889	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
2890.LISRM0:
2891	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
2892.LISR:
2893	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
2894.LBS0:		# bit-slice constants
2895	.quad	0x5555555555555555, 0x5555555555555555
2896.LBS1:
2897	.quad	0x3333333333333333, 0x3333333333333333
2898.LBS2:
2899	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2900.LSR:		# shiftrows constants
2901	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
2902.LSRM0:
2903	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
2904.LM0SR:
2905	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
2906.LSWPUP:	# byte-swap upper dword
2907	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
2908.LSWPUPM0SR:
2909	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
2910.LADD1:		# counter increment constants
2911	.quad	0x0000000000000000, 0x0000000100000000
2912.LADD2:
2913	.quad	0x0000000000000000, 0x0000000200000000
2914.LADD3:
2915	.quad	0x0000000000000000, 0x0000000300000000
2916.LADD4:
2917	.quad	0x0000000000000000, 0x0000000400000000
2918.LADD5:
2919	.quad	0x0000000000000000, 0x0000000500000000
2920.LADD6:
2921	.quad	0x0000000000000000, 0x0000000600000000
2922.LADD7:
2923	.quad	0x0000000000000000, 0x0000000700000000
2924.LADD8:
2925	.quad	0x0000000000000000, 0x0000000800000000
2926.Lxts_magic:
2927	.long	0x87,0,1,0
2928.Lmasks:
2929	.quad	0x0101010101010101, 0x0101010101010101
2930	.quad	0x0202020202020202, 0x0202020202020202
2931	.quad	0x0404040404040404, 0x0404040404040404
2932	.quad	0x0808080808080808, 0x0808080808080808
2933.LM0:
2934	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
2935.L63:
2936	.quad	0x6363636363636363, 0x6363636363636363
2937.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia K��sper, Peter Schwabe, Andy Polyakov"
2938.align	64
2939.size	_bsaes_const,.-_bsaes_const
2940___
2941
2942# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2943#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2944if ($win64) {
2945$rec="%rcx";
2946$frame="%rdx";
2947$context="%r8";
2948$disp="%r9";
2949
2950$code.=<<___;
2951.extern	__imp_RtlVirtualUnwind
2952.type	se_handler,\@abi-omnipotent
2953.align	16
2954se_handler:
2955	push	%rsi
2956	push	%rdi
2957	push	%rbx
2958	push	%rbp
2959	push	%r12
2960	push	%r13
2961	push	%r14
2962	push	%r15
2963	pushfq
2964	sub	\$64,%rsp
2965
2966	mov	120($context),%rax	# pull context->Rax
2967	mov	248($context),%rbx	# pull context->Rip
2968
2969	mov	8($disp),%rsi		# disp->ImageBase
2970	mov	56($disp),%r11		# disp->HandlerData
2971
2972	mov	0(%r11),%r10d		# HandlerData[0]
2973	lea	(%rsi,%r10),%r10	# prologue label
2974	cmp	%r10,%rbx		# context->Rip<prologue label
2975	jb	.Lin_prologue
2976
2977	mov	152($context),%rax	# pull context->Rsp
2978
2979	mov	4(%r11),%r10d		# HandlerData[1]
2980	lea	(%rsi,%r10),%r10	# epilogue label
2981	cmp	%r10,%rbx		# context->Rip>=epilogue label
2982	jae	.Lin_prologue
2983
2984	mov	160($context),%rax	# pull context->Rbp
2985
2986	lea	0x40(%rax),%rsi		# %xmm save area
2987	lea	512($context),%rdi	# &context.Xmm6
2988	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
2989	.long	0xa548f3fc		# cld; rep movsq
2990	lea	0xa0(%rax),%rax		# adjust stack pointer
2991
2992	mov	0x70(%rax),%rbp
2993	mov	0x68(%rax),%rbx
2994	mov	0x60(%rax),%r12
2995	mov	0x58(%rax),%r13
2996	mov	0x50(%rax),%r14
2997	mov	0x48(%rax),%r15
2998	lea	0x78(%rax),%rax		# adjust stack pointer
2999	mov	%rbx,144($context)	# restore context->Rbx
3000	mov	%rbp,160($context)	# restore context->Rbp
3001	mov	%r12,216($context)	# restore context->R12
3002	mov	%r13,224($context)	# restore context->R13
3003	mov	%r14,232($context)	# restore context->R14
3004	mov	%r15,240($context)	# restore context->R15
3005
3006.Lin_prologue:
3007	mov	%rax,152($context)	# restore context->Rsp
3008
3009	mov	40($disp),%rdi		# disp->ContextRecord
3010	mov	$context,%rsi		# context
3011	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
3012	.long	0xa548f3fc		# cld; rep movsq
3013
3014	mov	$disp,%rsi
3015	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3016	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3017	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3018	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3019	mov	40(%rsi),%r10		# disp->ContextRecord
3020	lea	56(%rsi),%r11		# &disp->HandlerData
3021	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3022	mov	%r10,32(%rsp)		# arg5
3023	mov	%r11,40(%rsp)		# arg6
3024	mov	%r12,48(%rsp)		# arg7
3025	mov	%rcx,56(%rsp)		# arg8, (NULL)
3026	call	*__imp_RtlVirtualUnwind(%rip)
3027
3028	mov	\$1,%eax		# ExceptionContinueSearch
3029	add	\$64,%rsp
3030	popfq
3031	pop	%r15
3032	pop	%r14
3033	pop	%r13
3034	pop	%r12
3035	pop	%rbp
3036	pop	%rbx
3037	pop	%rdi
3038	pop	%rsi
3039	ret
3040.size	se_handler,.-se_handler
3041
3042.section	.pdata
3043.align	4
3044___
3045$code.=<<___ if ($ecb);
3046	.rva	.Lecb_enc_prologue
3047	.rva	.Lecb_enc_epilogue
3048	.rva	.Lecb_enc_info
3049
3050	.rva	.Lecb_dec_prologue
3051	.rva	.Lecb_dec_epilogue
3052	.rva	.Lecb_dec_info
3053___
3054$code.=<<___;
3055	.rva	.Lcbc_dec_prologue
3056	.rva	.Lcbc_dec_epilogue
3057	.rva	.Lcbc_dec_info
3058
3059	.rva	.Lctr_enc_prologue
3060	.rva	.Lctr_enc_epilogue
3061	.rva	.Lctr_enc_info
3062
3063	.rva	.Lxts_enc_prologue
3064	.rva	.Lxts_enc_epilogue
3065	.rva	.Lxts_enc_info
3066
3067	.rva	.Lxts_dec_prologue
3068	.rva	.Lxts_dec_epilogue
3069	.rva	.Lxts_dec_info
3070
3071.section	.xdata
3072.align	8
3073___
3074$code.=<<___ if ($ecb);
3075.Lecb_enc_info:
3076	.byte	9,0,0,0
3077	.rva	se_handler
3078	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
3079.Lecb_dec_info:
3080	.byte	9,0,0,0
3081	.rva	se_handler
3082	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
3083___
3084$code.=<<___;
3085.Lcbc_dec_info:
3086	.byte	9,0,0,0
3087	.rva	se_handler
3088	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
3089.Lctr_enc_info:
3090	.byte	9,0,0,0
3091	.rva	se_handler
3092	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
3093.Lxts_enc_info:
3094	.byte	9,0,0,0
3095	.rva	se_handler
3096	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
3097.Lxts_dec_info:
3098	.byte	9,0,0,0
3099	.rva	se_handler
3100	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
3101___
3102}
3103
3104$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3105
3106print $code;
3107
3108close STDOUT;
3109