#!/usr/bin/env perl
2238384Sjkim
3238384Sjkim###################################################################
4238384Sjkim### AES-128 [originally in CTR mode]				###
5238384Sjkim### bitsliced implementation for Intel Core 2 processors	###
6238384Sjkim### requires support of SSE extensions up to SSSE3		###
### Author: Emilia Käsper and Peter Schwabe			###
8238384Sjkim### Date: 2009-03-19						###
9238384Sjkim### Public domain						###
10238384Sjkim###								###
11238384Sjkim### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
12238384Sjkim### further information.					###
13238384Sjkim###################################################################
14238384Sjkim#
15238384Sjkim# September 2011.
16238384Sjkim#
17238384Sjkim# Started as transliteration to "perlasm" the original code has
18238384Sjkim# undergone following changes:
19238384Sjkim#
20238384Sjkim# - code was made position-independent;
21238384Sjkim# - rounds were folded into a loop resulting in >5x size reduction
22238384Sjkim#   from 12.5KB to 2.2KB;
# - above was possible thanks to mixcolumns() modification that
24238384Sjkim#   allowed to feed its output back to aesenc[last], this was
25238384Sjkim#   achieved at cost of two additional inter-registers moves;
26238384Sjkim# - some instruction reordering and interleaving;
27238384Sjkim# - this module doesn't implement key setup subroutine, instead it
28238384Sjkim#   relies on conversion of "conventional" key schedule as returned
29238384Sjkim#   by AES_set_encrypt_key (see discussion below);
30238384Sjkim# - first and last round keys are treated differently, which allowed
31238384Sjkim#   to skip one shiftrows(), reduce bit-sliced key schedule and
32238384Sjkim#   speed-up conversion by 22%;
33238384Sjkim# - support for 192- and 256-bit keys was added;
34238384Sjkim#
35238384Sjkim# Resulting performance in CPU cycles spent to encrypt one byte out
36238384Sjkim# of 4096-byte buffer with 128-bit key is:
37238384Sjkim#
38238384Sjkim#		Emilia's	this(*)		difference
39238384Sjkim#
40238384Sjkim# Core 2    	9.30		8.69		+7%
41238384Sjkim# Nehalem(**) 	7.63		6.98		+9%
42238384Sjkim# Atom	    	17.1		17.4		-2%(***)
43238384Sjkim#
44238384Sjkim# (*)	Comparison is not completely fair, because "this" is ECB,
45238384Sjkim#	i.e. no extra processing such as counter values calculation
46238384Sjkim#	and xor-ing input as in Emilia's CTR implementation is
47238384Sjkim#	performed. However, the CTR calculations stand for not more
48238384Sjkim#	than 1% of total time, so comparison is *rather* fair.
49238384Sjkim#
50238384Sjkim# (**)	Results were collected on Westmere, which is considered to
51238384Sjkim#	be equivalent to Nehalem for this code.
52238384Sjkim#
53238384Sjkim# (***)	Slowdown on Atom is rather strange per se, because original
54238384Sjkim#	implementation has a number of 9+-bytes instructions, which
55238384Sjkim#	are bad for Atom front-end, and which I eliminated completely.
56238384Sjkim#	In attempt to address deterioration sbox() was tested in FP
57238384Sjkim#	SIMD "domain" (movaps instead of movdqa, xorps instead of
58238384Sjkim#	pxor, etc.). While it resulted in nominal 4% improvement on
#	Atom, it hurt Westmere by more than a factor of 2.
60238384Sjkim#
61238384Sjkim# As for key schedule conversion subroutine. Interface to OpenSSL
62238384Sjkim# relies on per-invocation on-the-fly conversion. This naturally
63238384Sjkim# has impact on performance, especially for short inputs. Conversion
64238384Sjkim# time in CPU cycles and its ratio to CPU cycles spent in 8x block
65238384Sjkim# function is:
66238384Sjkim#
67238384Sjkim# 		conversion	conversion/8x block
68238384Sjkim# Core 2	240		0.22
69238384Sjkim# Nehalem	180		0.20
70238384Sjkim# Atom		430		0.19
71238384Sjkim#
72238384Sjkim# The ratio values mean that 128-byte blocks will be processed
73238384Sjkim# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74238384Sjkim# etc. Then keep in mind that input sizes not divisible by 128 are
75238384Sjkim# *effectively* slower, especially shortest ones, e.g. consecutive
76238384Sjkim# 144-byte blocks are processed 44% slower than one would expect,
77238384Sjkim# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78238384Sjkim# it's still faster than ["hyper-threading-safe" code path in]
79238384Sjkim# aes-x86_64.pl on all lengths above 64 bytes...
80238384Sjkim#
81238384Sjkim# October 2011.
82238384Sjkim#
83238384Sjkim# Add decryption procedure. Performance in CPU cycles spent to decrypt
84238384Sjkim# one byte out of 4096-byte buffer with 128-bit key is:
85238384Sjkim#
86264331Sjkim# Core 2	9.83
87264331Sjkim# Nehalem	7.74
88264331Sjkim# Atom		19.0
89238384Sjkim#
90238384Sjkim# November 2011.
91238384Sjkim#
92238384Sjkim# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93238384Sjkim# suboptimal, but XTS is meant to be used with larger blocks...
94238384Sjkim#
95238384Sjkim#						<appro@openssl.org>
96238384Sjkim
# Command line is "[flavour] [output]".  A first argument containing a dot
# is taken to be the output file name, leaving the flavour undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected by an nasm/masm/mingw64 flavour or an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate x86_64-xlate.pl either next to this script or under
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything written to STDOUT is piped through the translator.  Fail
# loudly if the pipe cannot be opened rather than silently losing output.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Only four register names are supplied for five variables, so $ivp is
# left undefined by this assignment.
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
114238384Sjkim
115238384Sjkim{
# Register aliases used by the code generators in this scope; this $key
# shadows the file-level $key for the remainder of the enclosing block.
my ($key,$rounds,$const)=("%rax","%r10d","%r11");
117238384Sjkim
sub Sbox {
# Emit the bit-sliced S-box: InBasisChange, then Inv_GF256, then
# OutBasisChange, each called with the register permutation the previous
# step leaves behind.
# Arguments: 8 state registers, 4 temporaries (@t), 4 temporaries (@s).
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}
128238384Sjkim
sub InBasisChange {
# Append the 13 pxor instructions of the S-box input basis change to $code.
# The eight registers passed in are updated in place; no temporaries.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor 	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}
151238384Sjkim
sub OutBasisChange {
# Append the 11 pxor instructions of the S-box output basis change to
# $code.  The eight registers passed in are updated in place.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}
172238384Sjkim
sub InvSbox {
# Emit the bit-sliced inverse S-box: InvInBasisChange, then Inv_GF256,
# then InvOutBasisChange, each with its own register permutation.
# Arguments: 8 state registers, 4 temporaries (@t), 4 temporaries (@s).
# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}
183238384Sjkim
sub InvInBasisChange {		# OutBasisChange in reverse
# Append the OutBasisChange pxor sequence in reverse order to $code.
# The caller's eight registers are first permuted into @b, so the indices
# below match those used in OutBasisChange.
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___;
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}
202238384Sjkim
sub InvOutBasisChange {		# InBasisChange in reverse
# Append the InBasisChange pxor sequence in reverse order to $code.
# The caller's eight registers are first permuted into @b, so the indices
# below match those used in InBasisChange.
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	 pxor 	@b[0], @b[5]
	pxor	@b[7], @b[3]
	 pxor	@b[2], @b[6]
	 pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}
223238384Sjkim
sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
# Appends 8 instructions to $code.  The product is left in ($x0,$x1);
# $y0/$y1 are only read and $t0 is overwritten.
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor 	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}
240238384Sjkim
sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
# Same operands as Mul_GF4; it differs only in the last two pxor
# instructions.  Result is left in ($x0,$x1), $t0 is overwritten.
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}
255238384Sjkim
sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
# The first five arguments drive the Mul_GF4_N stream (unindented
# instructions), the last five the Mul_GF4 stream (instructions indented
# by one space).  The two streams touch disjoint destination registers.
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	 movdqa	$y2, $t1
	pxor	$y1, $t0
	 pxor 	$y3, $t1
	pand	$x0, $t0
	 pand	$x2, $t1
	pxor	$x1, $x0
	 pxor	$x3, $x2
	pand	$y0, $x1
	 pand	$y2, $x3
	pand	$y1, $x0
	 pand	$y3, $x2
	pxor	$x0, $x1
	 pxor	$x3, $x2
	pxor	$t0, $x0
	 pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
# Emit two multiplications sharing the same multiplier @y: one on
# @x[0..3] and one on @x[4..7], each built from Mul_GF4 and the
# interleaved Mul_GF4_N_GF4 helper.
# Arguments: 8 registers @x (results left in place), 4 registers @y and
# 4 temporaries @t.  @y[0] and @y[1] are modified along the way.
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
# Arguments: 8 registers @x (inverted in place), 4 temporaries @t and
# 4 temporaries @s.  The heredoc below leaves an intermediate result in
# s3, s2, s1, t1, which is then handed to Mul_GF16_2 as its multiplier.
# The comment lines inside the heredoc are part of the emitted text.
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	 movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	 movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	 movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
428238384Sjkim
429238384Sjkim# AES linear components
430238384Sjkim
sub ShiftRows {
# Arguments: 8 state registers followed by the register holding the
# pshufb mask (taken as the last argument via pop).
# Despite the name, each register is first XORed with a 16-byte slice of
# key material read from 0x00..0x70($key) and then shuffled with the
# mask; the emitted code finally advances $key by 0x80.
# $key is the register alias from the enclosing scope, not an argument.
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pshufb	$mask,@x[0]
	pxor	0x20($key),@x[2]
	pshufb	$mask,@x[1]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[2]
	pxor	0x40($key),@x[4]
	pshufb	$mask,@x[3]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[4]
	pxor	0x60($key),@x[6]
	pshufb	$mask,@x[5]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[6]
	lea	0x80($key),$key
	pshufb	$mask,@x[7]
___
}
454238384Sjkim
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
# Arguments: 8 state registers @x, 8 temporaries @t and an optional 17th
# argument $inv.  The first heredoc is common to both variants; the tail
# differs: without $inv results land in @x[0..7] as for encryption, with
# $inv a different register assignment is emitted (used by InvMixColumns).
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=$_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	 pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	 pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	 pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	 pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	 pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	 pxor	@t[6], @x[6]
	 pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	 pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	 pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	 pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	 pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	 pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	 pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	 pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	 pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	 pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	 movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	 movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	 movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	 movdqa	@t[6], @x[2]
	 movdqa	@t[1], @x[7]
	 movdqa	@x[6], @x[4]
	 movdqa	@t[3], @x[6]
___
}
522238384Sjkim
sub InvMixColumns_orig {
# Earlier InvMixColumns generator, emitted as four stages labelled in the
# heredocs as multiplication by 0x0e, 0x0b, 0x0d and 0x09.
# Arguments: 8 state registers @x and 8 temporaries @t.
# NOTE: the final movdqa sequence copies @t[0..7] into the file-level
# @XMM[0..7], not into @x, so the destination registers are hard-wired.

my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
					my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}
676238384Sjkim
sub InvMixColumns {
# Emit InvMixColumns as a pre-multiplication (heredoc below) followed by
# the forward MixColumns generator called with its $inv flag set, using
# the matrix factorisation shown in the comment below.
# Arguments: 8 state registers @x and 8 temporaries @t.
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	 pxor	@t[6], @x[0]
	 pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	 pxor	@t[0], @x[2]
	 pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	 pxor	@t[7], @x[1]
	 pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	 pxor	@t[7], @x[2]
	 pxor	@t[6], @x[3]
	 pxor	@t[6], @x[4]
	 pxor	@t[3], @x[5]
	 pxor	@t[4], @x[6]
	 pxor	@t[7], @x[4]
	 pxor	@t[7], @x[5]
	 pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}
724264331Sjkim
sub aesenc {				# not used
# Emit one full round: load the .LSR mask from 0x30($const) into @t[0],
# then ShiftRows, Sbox and MixColumns.  MixColumns is called with the
# register order that Sbox leaves behind.
# Arguments: 8 state registers @b and 8 temporaries @t.
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}
735238384Sjkim
sub aesenclast {			# not used
# Emit the final round: load the .LSRM0 mask from 0x40($const) into
# @t[0], run ShiftRows and Sbox, then XOR eight 16-byte slices read from
# 0x00..0x70($key) into the state, addressing the registers in the order
# Sbox leaves them.  There is no MixColumns in this round.
# Arguments: 8 state registers @b and 8 temporaries @t.
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___;
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}
755238384Sjkim
sub swapmove {
# Emit a masked swap between registers $a and $b: the bits of $a selected
# by $mask are exchanged with the bits of $b that sit $n positions higher
# within each 64-bit lane.  $t is overwritten with a copy of $b.
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor  	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
sub swapmove2x {
# Two swapmove sequences interleaved: ($a0,$b0) with temporary $t0 and
# ($a1,$b1) with temporary $t1, sharing the shift count $n and $mask.
# Instructions of the second pair are indented by one extra space.
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	 movdqa	$b1,$t1
	 psrlq	\$$n,$b1
	pxor  	$a0,$b0
	 pxor  	$a1,$b1
	pand	$mask,$b0
	 pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	 pxor	$b1,$a1
	 psllq	\$$n,$b1
	pxor	$t0,$b0
	 pxor	$t1,$b1
___
}
787238384Sjkim
sub bitslice {
# Emit the bit-slicing transform on 8 registers as three rounds of
# swapmove2x with shift counts 1, 2 and 4 and the masks .LBS0, .LBS1 and
# .LBS2 loaded from $const.
# Arguments: 8 state registers (processed in reverse order) followed by
# 4 temporaries.  $t0 is reloaded with .LBS2 once the shift-by-1 round
# is done, so $t0/$t1 hold masks and $t2/$t3 serve as scratch.
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
806238384Sjkim
# _bsaes_encrypt8 prologue: XOR the round 0 key into the eight blocks in
# @XMM[0..7] and shuffle each with the .LM0SR mask.
$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
# Round loop.  The first pass jumps straight to .Lenc_sbox, skipping
# ShiftRows, because the prologue has already applied the round 0 key and
# the initial shuffle.
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
# Leave the loop after the last S-box; the final round has no MixColumns.
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
# Pick the mask for the next ShiftRows: .LSR normally, .LSRM0 when the
# round counter has just reached zero.  The jnz consumes the flags of the
# dec emitted before MixColumns.
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
# Tail of _bsaes_encrypt8 (XOR of the last round key into all eight
# registers, then ret), followed by the _bsaes_decrypt8 prologue, which
# mirrors the encrypt prologue but uses the .LM0ISR mask.
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	 pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	 pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
# Decrypt round loop, same shape as the encrypt loop: the first pass
# enters at .Ldec_sbox and the loop exits after the last inverse S-box.
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
# Pick the mask for the next pass: .LISR normally, .LISRM0 when the round
# counter has just reached zero.
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
926238384Sjkim$code.=<<___;
927238384Sjkim	movdqa	($key), @XMM[8]		# last round key
928238384Sjkim	pxor	@XMM[8], @XMM[6]
929238384Sjkim	pxor	@XMM[8], @XMM[4]
930238384Sjkim	pxor	@XMM[8], @XMM[2]
931238384Sjkim	pxor	@XMM[8], @XMM[7]
932238384Sjkim	pxor	@XMM[8], @XMM[3]
933238384Sjkim	pxor	@XMM[8], @XMM[5]
934238384Sjkim	pxor	@XMM[8], @XMM[0]
935238384Sjkim	pxor	@XMM[8], @XMM[1]
936238384Sjkim	ret
937238384Sjkim.size	_bsaes_decrypt8,.-_bsaes_decrypt8
938238384Sjkim___
939238384Sjkim}
940238384Sjkim{
# Register names interpolated into the key-conversion assembly in this scope:
# $out (%rax) is where the converted key schedule is written, $inp (%rcx) is
# the source key, $rounds (%r10d) the round counter and $const (%r11) the
# pointer to the mask constants.
941238384Sjkimmy ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
942238384Sjkim
# bitslice_key(x0..x7, bs0, bs1, bs2, t2, t3)
#
# Appends to $code a shortened variant of the bit-slicing sequence in which
# several swapmove steps are replaced by plain register copies (movdqa).
# Judging by the name it is meant for key material. No call to it appears in
# this part of the file.
#
#   @_[0..7]   eight register names; they are used in reversed order as @x
#   @_[8..10]  $bs0, $bs1, $bs2 -- passed to the swapmove helpers together
#              with the shift counts 1, 2 and 4 (presumably the bit masks)
#   @_[11..12] $t2, $t3 -- passed through to the helpers (presumably scratch)
#
# NOTE(review): the commented-out lines inside the heredocs are part of the
# emitted text and interpolate $t0/$t1, which this sub does not declare.
# They resolve to whatever is visible from the enclosing scope; confirm.
943238384Sjkimsub bitslice_key {
944238384Sjkimmy @x=reverse(@_[0..7]);
945238384Sjkimmy ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
946238384Sjkim
# Step with shift 1: only the pair x0/x1 goes through swapmove; x2/x3 are
# then filled by copying x0/x1 instead of running a second swapmove.
947238384Sjkim	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
948238384Sjkim$code.=<<___;
949238384Sjkim	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
950238384Sjkim	movdqa	@x[0], @x[2]
951238384Sjkim	movdqa	@x[1], @x[3]
952238384Sjkim___
953238384Sjkim	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
954238384Sjkim
# Step with shift 2 on x0..x3; x4..x7 are then filled with copies of x0..x3.
955238384Sjkim	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
956238384Sjkim$code.=<<___;
957238384Sjkim	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
958238384Sjkim	movdqa	@x[0], @x[4]
959238384Sjkim	movdqa	@x[2], @x[6]
960238384Sjkim	movdqa	@x[1], @x[5]
961238384Sjkim	movdqa	@x[3], @x[7]
962238384Sjkim___
# Step with shift 4 is carried out in full on all eight registers.
963238384Sjkim	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
964238384Sjkim	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
965238384Sjkim}
966238384Sjkim
# _bsaes_key_convert: expand a conventional AES key schedule into the layout
# read by the bit-sliced round code.
#
#   in:   $inp (%rcx)      source round keys, read with unaligned loads
#         $out (%rax)      destination, written with aligned stores (movdqa)
#         $rounds (%r10d)  round count
#   out:  %rax   advanced past everything written, i.e. pointing at the slot
#                the caller fills with the final round key
#         %xmm6  the last round key exactly as loaded from the source (the
#                store in this routine is commented out)
#         %xmm7  the constant at 0x50 from .Lmasks, labelled .L63
#
# Layout produced: round key 0 is copied unchanged (16 bytes). The loop then
# runs $rounds-1 times (one `dec` before the loop, `dec`/`jnz` inside) and
# writes 0x80 bytes per round key. Each key is first shuffled with the .LM0
# mask; then, for every bit position 0..7, `pand` with a one-bit-per-byte
# mask followed by `pcmpeqb` against that mask yields 0xff in each byte whose
# bit is set and 0x00 otherwise. The masks for bits 4..7 are derived from
# those for bits 0..3 by shifting left by 4 and are shifted back afterwards.
# The planes for bits 0, 1, 5 and 6 (the set bits of 0x63) are inverted with
# an all-ones XOR ("pnot") before being stored.
#
# The next source key is loaded near the end of each iteration, so after the
# final iteration %xmm6 holds the last round key, un-shuffled.
967238384Sjkim$code.=<<___;
968238384Sjkim.type	_bsaes_key_convert,\@abi-omnipotent
969238384Sjkim.align	16
970238384Sjkim_bsaes_key_convert:
971238384Sjkim	lea	.Lmasks(%rip), $const
972238384Sjkim	movdqu	($inp), %xmm7		# load round 0 key
973238384Sjkim	lea	0x10($inp), $inp
974238384Sjkim	movdqa	0x00($const), %xmm0	# 0x01...
975238384Sjkim	movdqa	0x10($const), %xmm1	# 0x02...
976238384Sjkim	movdqa	0x20($const), %xmm2	# 0x04...
977238384Sjkim	movdqa	0x30($const), %xmm3	# 0x08...
978238384Sjkim	movdqa	0x40($const), %xmm4	# .LM0
979238384Sjkim	pcmpeqd	%xmm5, %xmm5		# .LNOT
980238384Sjkim
981238384Sjkim	movdqu	($inp), %xmm6		# load round 1 key
982238384Sjkim	movdqa	%xmm7, ($out)		# save round 0 key
983238384Sjkim	lea	0x10($out), $out
984238384Sjkim	dec	$rounds
985238384Sjkim	jmp	.Lkey_loop
986238384Sjkim.align	16
987238384Sjkim.Lkey_loop:
988238384Sjkim	pshufb	%xmm4, %xmm6		# .LM0
989238384Sjkim
990238384Sjkim	movdqa	%xmm0,	%xmm8
991238384Sjkim	movdqa	%xmm1,	%xmm9
992238384Sjkim
993238384Sjkim	pand	%xmm6,	%xmm8
994238384Sjkim	pand	%xmm6,	%xmm9
995238384Sjkim	movdqa	%xmm2,	%xmm10
996238384Sjkim	pcmpeqb	%xmm0,	%xmm8
997238384Sjkim	psllq	\$4,	%xmm0		# 0x10...
998238384Sjkim	movdqa	%xmm3,	%xmm11
999238384Sjkim	pcmpeqb	%xmm1,	%xmm9
1000238384Sjkim	psllq	\$4,	%xmm1		# 0x20...
1001238384Sjkim
1002238384Sjkim	pand	%xmm6,	%xmm10
1003238384Sjkim	pand	%xmm6,	%xmm11
1004238384Sjkim	movdqa	%xmm0,	%xmm12
1005238384Sjkim	pcmpeqb	%xmm2,	%xmm10
1006238384Sjkim	psllq	\$4,	%xmm2		# 0x40...
1007238384Sjkim	movdqa	%xmm1,	%xmm13
1008238384Sjkim	pcmpeqb	%xmm3,	%xmm11
1009238384Sjkim	psllq	\$4,	%xmm3		# 0x80...
1010238384Sjkim
1011238384Sjkim	movdqa	%xmm2,	%xmm14
1012238384Sjkim	movdqa	%xmm3,	%xmm15
1013238384Sjkim	 pxor	%xmm5,	%xmm8		# "pnot"
1014238384Sjkim	 pxor	%xmm5,	%xmm9
1015238384Sjkim
1016238384Sjkim	pand	%xmm6,	%xmm12
1017238384Sjkim	pand	%xmm6,	%xmm13
1018238384Sjkim	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
1019238384Sjkim	pcmpeqb	%xmm0,	%xmm12
1020238384Sjkim	psrlq	\$4,	%xmm0		# 0x01...
1021238384Sjkim	 movdqa	%xmm9, 0x10($out)
1022238384Sjkim	pcmpeqb	%xmm1,	%xmm13
1023238384Sjkim	psrlq	\$4,	%xmm1		# 0x02...
1024238384Sjkim	 lea	0x10($inp), $inp
1025238384Sjkim
1026238384Sjkim	pand	%xmm6,	%xmm14
1027238384Sjkim	pand	%xmm6,	%xmm15
1028238384Sjkim	 movdqa	%xmm10, 0x20($out)
1029238384Sjkim	pcmpeqb	%xmm2,	%xmm14
1030238384Sjkim	psrlq	\$4,	%xmm2		# 0x04...
1031238384Sjkim	 movdqa	%xmm11, 0x30($out)
1032238384Sjkim	pcmpeqb	%xmm3,	%xmm15
1033238384Sjkim	psrlq	\$4,	%xmm3		# 0x08...
1034238384Sjkim	 movdqu	($inp), %xmm6		# load next round key
1035238384Sjkim
1036238384Sjkim	pxor	%xmm5, %xmm13		# "pnot"
1037238384Sjkim	pxor	%xmm5, %xmm14
1038238384Sjkim	movdqa	%xmm12, 0x40($out)
1039238384Sjkim	movdqa	%xmm13, 0x50($out)
1040238384Sjkim	movdqa	%xmm14, 0x60($out)
1041238384Sjkim	movdqa	%xmm15, 0x70($out)
1042238384Sjkim	lea	0x80($out),$out
1043238384Sjkim	dec	$rounds
1044238384Sjkim	jnz	.Lkey_loop
1045238384Sjkim
1046238384Sjkim	movdqa	0x50($const), %xmm7	# .L63
1047238384Sjkim	#movdqa	%xmm6, ($out)		# don't save last round key
1048238384Sjkim	ret
1049238384Sjkim.size	_bsaes_key_convert,.-_bsaes_key_convert
1050238384Sjkim___
1051238384Sjkim}
1052238384Sjkim
# Benchmark-only entry points. The condition begins with a literal 0, so the
# heredoc below is never appended to $code.
#
# $inp/$out/$len/$key used here are not the aliases declared for the
# key-conversion scope (that scope has already been closed); they come from
# an enclosing declaration that is not shown in this part of the file.
#
#   bsaes_enc_key_convert  reads the round count from offset 240 of $inp,
#       runs _bsaes_key_convert and stores %xmm6 ^ %xmm7 as the last round
#       key at the address left in %rax.
#   bsaes_dec_key_convert  same conversion, but stores %xmm6 unchanged as the
#       last round key and XORs %xmm7 into the round 0 key at ($out).
#   bsaes_encrypt_128 / bsaes_decrypt_128  loop over the input 0x80 bytes
#       (eight blocks) at a time with the round count hard-coded to 10,
#       storing results in the register order the 8-block routines leave
#       them in; the loop repeats while $len stays above zero after
#       subtracting 0x80.
1053238384Sjkimif (0 && !$win64) {	# following four functions are unsupported interface
1054238384Sjkim			# used for benchmarking...
1055238384Sjkim$code.=<<___;
1056238384Sjkim.globl	bsaes_enc_key_convert
1057238384Sjkim.type	bsaes_enc_key_convert,\@function,2
1058238384Sjkim.align	16
1059238384Sjkimbsaes_enc_key_convert:
1060238384Sjkim	mov	240($inp),%r10d		# pass rounds
1061238384Sjkim	mov	$inp,%rcx		# pass key
1062238384Sjkim	mov	$out,%rax		# pass key schedule
1063238384Sjkim	call	_bsaes_key_convert
1064238384Sjkim	pxor	%xmm6,%xmm7		# fix up last round key
1065238384Sjkim	movdqa	%xmm7,(%rax)		# save last round key
1066238384Sjkim	ret
1067238384Sjkim.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
1068238384Sjkim
1069238384Sjkim.globl	bsaes_encrypt_128
1070238384Sjkim.type	bsaes_encrypt_128,\@function,4
1071238384Sjkim.align	16
1072238384Sjkimbsaes_encrypt_128:
1073238384Sjkim.Lenc128_loop:
1074238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1075238384Sjkim	movdqu	0x10($inp), @XMM[1]
1076238384Sjkim	movdqu	0x20($inp), @XMM[2]
1077238384Sjkim	movdqu	0x30($inp), @XMM[3]
1078238384Sjkim	movdqu	0x40($inp), @XMM[4]
1079238384Sjkim	movdqu	0x50($inp), @XMM[5]
1080238384Sjkim	movdqu	0x60($inp), @XMM[6]
1081238384Sjkim	movdqu	0x70($inp), @XMM[7]
1082238384Sjkim	mov	$key, %rax		# pass the $key
1083238384Sjkim	lea	0x80($inp), $inp
1084238384Sjkim	mov	\$10,%r10d
1085238384Sjkim
1086238384Sjkim	call	_bsaes_encrypt8
1087238384Sjkim
1088238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1089238384Sjkim	movdqu	@XMM[1], 0x10($out)
1090238384Sjkim	movdqu	@XMM[4], 0x20($out)
1091238384Sjkim	movdqu	@XMM[6], 0x30($out)
1092238384Sjkim	movdqu	@XMM[3], 0x40($out)
1093238384Sjkim	movdqu	@XMM[7], 0x50($out)
1094238384Sjkim	movdqu	@XMM[2], 0x60($out)
1095238384Sjkim	movdqu	@XMM[5], 0x70($out)
1096238384Sjkim	lea	0x80($out), $out
1097238384Sjkim	sub	\$0x80,$len
1098238384Sjkim	ja	.Lenc128_loop
1099238384Sjkim	ret
1100238384Sjkim.size	bsaes_encrypt_128,.-bsaes_encrypt_128
1101238384Sjkim
1102238384Sjkim.globl	bsaes_dec_key_convert
1103238384Sjkim.type	bsaes_dec_key_convert,\@function,2
1104238384Sjkim.align	16
1105238384Sjkimbsaes_dec_key_convert:
1106238384Sjkim	mov	240($inp),%r10d		# pass rounds
1107238384Sjkim	mov	$inp,%rcx		# pass key
1108238384Sjkim	mov	$out,%rax		# pass key schedule
1109238384Sjkim	call	_bsaes_key_convert
1110238384Sjkim	pxor	($out),%xmm7		# fix up round 0 key
1111238384Sjkim	movdqa	%xmm6,(%rax)		# save last round key
1112238384Sjkim	movdqa	%xmm7,($out)
1113238384Sjkim	ret
1114238384Sjkim.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
1115238384Sjkim
1116238384Sjkim.globl	bsaes_decrypt_128
1117238384Sjkim.type	bsaes_decrypt_128,\@function,4
1118238384Sjkim.align	16
1119238384Sjkimbsaes_decrypt_128:
1120238384Sjkim.Ldec128_loop:
1121238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1122238384Sjkim	movdqu	0x10($inp), @XMM[1]
1123238384Sjkim	movdqu	0x20($inp), @XMM[2]
1124238384Sjkim	movdqu	0x30($inp), @XMM[3]
1125238384Sjkim	movdqu	0x40($inp), @XMM[4]
1126238384Sjkim	movdqu	0x50($inp), @XMM[5]
1127238384Sjkim	movdqu	0x60($inp), @XMM[6]
1128238384Sjkim	movdqu	0x70($inp), @XMM[7]
1129238384Sjkim	mov	$key, %rax		# pass the $key
1130238384Sjkim	lea	0x80($inp), $inp
1131238384Sjkim	mov	\$10,%r10d
1132238384Sjkim
1133238384Sjkim	call	_bsaes_decrypt8
1134238384Sjkim
1135238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1136238384Sjkim	movdqu	@XMM[1], 0x10($out)
1137238384Sjkim	movdqu	@XMM[6], 0x20($out)
1138238384Sjkim	movdqu	@XMM[4], 0x30($out)
1139238384Sjkim	movdqu	@XMM[2], 0x40($out)
1140238384Sjkim	movdqu	@XMM[7], 0x50($out)
1141238384Sjkim	movdqu	@XMM[3], 0x60($out)
1142238384Sjkim	movdqu	@XMM[5], 0x70($out)
1143238384Sjkim	lea	0x80($out), $out
1144238384Sjkim	sub	\$0x80,$len
1145238384Sjkim	ja	.Ldec128_loop
1146238384Sjkim	ret
1147238384Sjkim.size	bsaes_decrypt_128,.-bsaes_decrypt_128
1148238384Sjkim___
1149238384Sjkim}
1150238384Sjkim{
1151238384Sjkim######################################################################
1152238384Sjkim#
1153238384Sjkim# OpenSSL interface
1154238384Sjkim#
# $arg1..$arg6 name the registers in which the public entry points find
# their arguments, chosen per target ABI. On Win64 only the first four are
# register arguments; $arg5 (%r10) and $arg6 (%r11d) are registers that the
# CBC entry point fills from the stack itself.
1155238384Sjkimmy ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1156238384Sjkim						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
# Working copies of input pointer, output pointer, length and key. These are
# registers the entry points push on entry and restore before returning.
1157238384Sjkimmy ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1158238384Sjkim
1159238384Sjkimif ($ecb) {
# Entry of bsaes_ecb_encrypt_blocks: push %rbp, %rbx and %r12-%r15, then
# reserve 0x48 bytes of stack below them. The entry value of %rsp is copied
# to %rax first (presumably for the benefit of an unwind handler keyed on
# .Lecb_enc_prologue -- confirm).
1160238384Sjkim$code.=<<___;
1161238384Sjkim.globl	bsaes_ecb_encrypt_blocks
1162238384Sjkim.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1163238384Sjkim.align	16
1164238384Sjkimbsaes_ecb_encrypt_blocks:
1165238384Sjkim	mov	%rsp, %rax
1166238384Sjkim.Lecb_enc_prologue:
1167238384Sjkim	push	%rbp
1168238384Sjkim	push	%rbx
1169238384Sjkim	push	%r12
1170238384Sjkim	push	%r13
1171238384Sjkim	push	%r14
1172238384Sjkim	push	%r15
1173238384Sjkim	lea	-0x48(%rsp),%rsp
1174238384Sjkim___
# Win64 only: reserve a further 0xa0 bytes and save %xmm6-%xmm15 at offsets
# 0x40..0xd0 from the new %rsp; the Win64 calling convention requires a
# callee to preserve these registers.
1175238384Sjkim$code.=<<___ if ($win64);
1176238384Sjkim	lea	-0xa0(%rsp), %rsp
1177238384Sjkim	movaps	%xmm6, 0x40(%rsp)
1178238384Sjkim	movaps	%xmm7, 0x50(%rsp)
1179238384Sjkim	movaps	%xmm8, 0x60(%rsp)
1180238384Sjkim	movaps	%xmm9, 0x70(%rsp)
1181238384Sjkim	movaps	%xmm10, 0x80(%rsp)
1182238384Sjkim	movaps	%xmm11, 0x90(%rsp)
1183238384Sjkim	movaps	%xmm12, 0xa0(%rsp)
1184238384Sjkim	movaps	%xmm13, 0xb0(%rsp)
1185238384Sjkim	movaps	%xmm14, 0xc0(%rsp)
1186238384Sjkim	movaps	%xmm15, 0xd0(%rsp)
1187238384Sjkim.Lecb_enc_body:
1188238384Sjkim___
# Main body of bsaes_ecb_encrypt_blocks.
#
#   $arg1  input pointer          -> $inp
#   $arg2  output pointer         -> $out
#   $arg3  length in 16-byte blocks (each 8-block pass consumes 0x80 bytes
#          and subtracts 8)       -> $len
#   $arg4  key; the round count is read from offset 240 -> $key
#
# %rbp keeps the value of %rsp from before any key schedule is allocated.
#
# Fewer than 8 blocks: .Lecb_enc_short encrypts one block per iteration via
# asm_AES_encrypt and no key schedule is built.
# NOTE(review): a count of 0 also takes this path, which processes a block
# before testing the counter -- callers presumably never pass 0; confirm.
#
# Otherwise rounds*128-96 bytes are carved out of the stack (the backquoted
# expression is presumably evaluated when $code is post-processed), the key
# is converted into it and the last round key is stored as %xmm6 ^ %xmm7.
# The main loop encrypts eight blocks per call of _bsaes_encrypt8 and writes
# them out in the register order that routine leaves them in
# (0,1,4,6,3,7,2,5). A tail of 1..7 blocks loads only the blocks that exist,
# still runs the full 8-block routine and stores only the valid results.
# Each `cmp` in the tail dispatch feeds two branches; the movdqu between
# them does not alter the flags.
#
# .Lecb_enc_done zeroes the stack from %rsp upward in 32-byte steps to wipe
# the key schedule. The loop has to continue while %rax is still below %rbp,
# which in AT&T operand order is `cmp %rax,%rbp` / `ja`. The previous `jb`
# tested the opposite relation: with a key schedule present it stopped after
# the first 32 bytes, and on the short path (%rsp == %rbp) it never
# terminated and kept zeroing memory above the frame.
1189238384Sjkim$code.=<<___;
1190238384Sjkim	mov	%rsp,%rbp		# backup %rsp
1191238384Sjkim	mov	240($arg4),%eax		# rounds
1192238384Sjkim	mov	$arg1,$inp		# backup arguments
1193238384Sjkim	mov	$arg2,$out
1194238384Sjkim	mov	$arg3,$len
1195238384Sjkim	mov	$arg4,$key
1196238384Sjkim	cmp	\$8,$arg3
1197238384Sjkim	jb	.Lecb_enc_short
1198238384Sjkim
1199238384Sjkim	mov	%eax,%ebx		# backup rounds
1200238384Sjkim	shl	\$7,%rax		# 128 bytes per inner round key
1201238384Sjkim	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1202238384Sjkim	sub	%rax,%rsp
1203238384Sjkim	mov	%rsp,%rax		# pass key schedule
1204238384Sjkim	mov	$key,%rcx		# pass key
1205238384Sjkim	mov	%ebx,%r10d		# pass rounds
1206238384Sjkim	call	_bsaes_key_convert
1207238384Sjkim	pxor	%xmm6,%xmm7		# fix up last round key
1208238384Sjkim	movdqa	%xmm7,(%rax)		# save last round key
1209238384Sjkim
1210238384Sjkim	sub	\$8,$len
1211238384Sjkim.Lecb_enc_loop:
1212238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1213238384Sjkim	movdqu	0x10($inp), @XMM[1]
1214238384Sjkim	movdqu	0x20($inp), @XMM[2]
1215238384Sjkim	movdqu	0x30($inp), @XMM[3]
1216238384Sjkim	movdqu	0x40($inp), @XMM[4]
1217238384Sjkim	movdqu	0x50($inp), @XMM[5]
1218238384Sjkim	mov	%rsp, %rax		# pass key schedule
1219238384Sjkim	movdqu	0x60($inp), @XMM[6]
1220238384Sjkim	mov	%ebx,%r10d		# pass rounds
1221238384Sjkim	movdqu	0x70($inp), @XMM[7]
1222238384Sjkim	lea	0x80($inp), $inp
1223238384Sjkim
1224238384Sjkim	call	_bsaes_encrypt8
1225238384Sjkim
1226238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1227238384Sjkim	movdqu	@XMM[1], 0x10($out)
1228238384Sjkim	movdqu	@XMM[4], 0x20($out)
1229238384Sjkim	movdqu	@XMM[6], 0x30($out)
1230238384Sjkim	movdqu	@XMM[3], 0x40($out)
1231238384Sjkim	movdqu	@XMM[7], 0x50($out)
1232238384Sjkim	movdqu	@XMM[2], 0x60($out)
1233238384Sjkim	movdqu	@XMM[5], 0x70($out)
1234238384Sjkim	lea	0x80($out), $out
1235238384Sjkim	sub	\$8,$len
1236238384Sjkim	jnc	.Lecb_enc_loop
1237238384Sjkim
1238238384Sjkim	add	\$8,$len
1239238384Sjkim	jz	.Lecb_enc_done
1240238384Sjkim
1241238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1242238384Sjkim	mov	%rsp, %rax		# pass key schedule
1243238384Sjkim	mov	%ebx,%r10d		# pass rounds
1244238384Sjkim	cmp	\$2,$len
1245238384Sjkim	jb	.Lecb_enc_one
1246238384Sjkim	movdqu	0x10($inp), @XMM[1]
1247238384Sjkim	je	.Lecb_enc_two
1248238384Sjkim	movdqu	0x20($inp), @XMM[2]
1249238384Sjkim	cmp	\$4,$len
1250238384Sjkim	jb	.Lecb_enc_three
1251238384Sjkim	movdqu	0x30($inp), @XMM[3]
1252238384Sjkim	je	.Lecb_enc_four
1253238384Sjkim	movdqu	0x40($inp), @XMM[4]
1254238384Sjkim	cmp	\$6,$len
1255238384Sjkim	jb	.Lecb_enc_five
1256238384Sjkim	movdqu	0x50($inp), @XMM[5]
1257238384Sjkim	je	.Lecb_enc_six
1258238384Sjkim	movdqu	0x60($inp), @XMM[6]
1259238384Sjkim	call	_bsaes_encrypt8
1260238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1261238384Sjkim	movdqu	@XMM[1], 0x10($out)
1262238384Sjkim	movdqu	@XMM[4], 0x20($out)
1263238384Sjkim	movdqu	@XMM[6], 0x30($out)
1264238384Sjkim	movdqu	@XMM[3], 0x40($out)
1265238384Sjkim	movdqu	@XMM[7], 0x50($out)
1266238384Sjkim	movdqu	@XMM[2], 0x60($out)
1267238384Sjkim	jmp	.Lecb_enc_done
1268238384Sjkim.align	16
1269238384Sjkim.Lecb_enc_six:
1270238384Sjkim	call	_bsaes_encrypt8
1271238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1272238384Sjkim	movdqu	@XMM[1], 0x10($out)
1273238384Sjkim	movdqu	@XMM[4], 0x20($out)
1274238384Sjkim	movdqu	@XMM[6], 0x30($out)
1275238384Sjkim	movdqu	@XMM[3], 0x40($out)
1276238384Sjkim	movdqu	@XMM[7], 0x50($out)
1277238384Sjkim	jmp	.Lecb_enc_done
1278238384Sjkim.align	16
1279238384Sjkim.Lecb_enc_five:
1280238384Sjkim	call	_bsaes_encrypt8
1281238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1282238384Sjkim	movdqu	@XMM[1], 0x10($out)
1283238384Sjkim	movdqu	@XMM[4], 0x20($out)
1284238384Sjkim	movdqu	@XMM[6], 0x30($out)
1285238384Sjkim	movdqu	@XMM[3], 0x40($out)
1286238384Sjkim	jmp	.Lecb_enc_done
1287238384Sjkim.align	16
1288238384Sjkim.Lecb_enc_four:
1289238384Sjkim	call	_bsaes_encrypt8
1290238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1291238384Sjkim	movdqu	@XMM[1], 0x10($out)
1292238384Sjkim	movdqu	@XMM[4], 0x20($out)
1293238384Sjkim	movdqu	@XMM[6], 0x30($out)
1294238384Sjkim	jmp	.Lecb_enc_done
1295238384Sjkim.align	16
1296238384Sjkim.Lecb_enc_three:
1297238384Sjkim	call	_bsaes_encrypt8
1298238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1299238384Sjkim	movdqu	@XMM[1], 0x10($out)
1300238384Sjkim	movdqu	@XMM[4], 0x20($out)
1301238384Sjkim	jmp	.Lecb_enc_done
1302238384Sjkim.align	16
1303238384Sjkim.Lecb_enc_two:
1304238384Sjkim	call	_bsaes_encrypt8
1305238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1306238384Sjkim	movdqu	@XMM[1], 0x10($out)
1307238384Sjkim	jmp	.Lecb_enc_done
1308238384Sjkim.align	16
1309238384Sjkim.Lecb_enc_one:
1310238384Sjkim	call	_bsaes_encrypt8
1311238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1312238384Sjkim	jmp	.Lecb_enc_done
1313238384Sjkim.align	16
1314238384Sjkim.Lecb_enc_short:
1315238384Sjkim	lea	($inp), $arg1
1316238384Sjkim	lea	($out), $arg2
1317238384Sjkim	lea	($key), $arg3
1318238384Sjkim	call	asm_AES_encrypt
1319238384Sjkim	lea	16($inp), $inp
1320238384Sjkim	lea	16($out), $out
1321238384Sjkim	dec	$len
1322238384Sjkim	jnz	.Lecb_enc_short
1323238384Sjkim
1324238384Sjkim.Lecb_enc_done:
1325238384Sjkim	lea	(%rsp),%rax
1326238384Sjkim	pxor	%xmm0, %xmm0
1327238384Sjkim.Lecb_enc_bzero:			# wipe key schedule [if any]
1328238384Sjkim	movdqa	%xmm0, 0x00(%rax)
1329238384Sjkim	movdqa	%xmm0, 0x10(%rax)
1330238384Sjkim	lea	0x20(%rax), %rax
1331238384Sjkim	cmp	%rax, %rbp
1332238384Sjkim	ja	.Lecb_enc_bzero
1333238384Sjkim
1334238384Sjkim	lea	(%rbp),%rsp		# restore %rsp
1335238384Sjkim___
# Win64 only: reload %xmm6-%xmm15 from the offsets they were saved at
# (relative to %rbp, which equals the post-prologue %rsp) and release the
# extra 0xa0 bytes.
1336238384Sjkim$code.=<<___ if ($win64);
1337238384Sjkim	movaps	0x40(%rbp), %xmm6
1338238384Sjkim	movaps	0x50(%rbp), %xmm7
1339238384Sjkim	movaps	0x60(%rbp), %xmm8
1340238384Sjkim	movaps	0x70(%rbp), %xmm9
1341238384Sjkim	movaps	0x80(%rbp), %xmm10
1342238384Sjkim	movaps	0x90(%rbp), %xmm11
1343238384Sjkim	movaps	0xa0(%rbp), %xmm12
1344238384Sjkim	movaps	0xb0(%rbp), %xmm13
1345238384Sjkim	movaps	0xc0(%rbp), %xmm14
1346238384Sjkim	movaps	0xd0(%rbp), %xmm15
1347238384Sjkim	lea	0xa0(%rbp), %rsp
1348238384Sjkim___
# Epilogue of bsaes_ecb_encrypt_blocks: the pushed registers are reloaded by
# offset rather than popped. Above the 0x48-byte scratch area they sit in
# reverse push order (%r15 at 0x48 ... %rbx at 0x68, caller's %rbp at 0x70).
# The saved %rbp travels through %rax so that %rsp can be released first.
#
# The same heredoc then opens bsaes_ecb_decrypt_blocks with an identical
# prologue: six pushes followed by a 0x48-byte reservation.
1349238384Sjkim$code.=<<___;
1350238384Sjkim	mov	0x48(%rsp), %r15
1351238384Sjkim	mov	0x50(%rsp), %r14
1352238384Sjkim	mov	0x58(%rsp), %r13
1353238384Sjkim	mov	0x60(%rsp), %r12
1354238384Sjkim	mov	0x68(%rsp), %rbx
1355238384Sjkim	mov	0x70(%rsp), %rax
1356238384Sjkim	lea	0x78(%rsp), %rsp
1357238384Sjkim	mov	%rax, %rbp
1358238384Sjkim.Lecb_enc_epilogue:
1359238384Sjkim	ret
1360238384Sjkim.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1361238384Sjkim
1362238384Sjkim.globl	bsaes_ecb_decrypt_blocks
1363238384Sjkim.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1364238384Sjkim.align	16
1365238384Sjkimbsaes_ecb_decrypt_blocks:
1366238384Sjkim	mov	%rsp, %rax
1367238384Sjkim.Lecb_dec_prologue:
1368238384Sjkim	push	%rbp
1369238384Sjkim	push	%rbx
1370238384Sjkim	push	%r12
1371238384Sjkim	push	%r13
1372238384Sjkim	push	%r14
1373238384Sjkim	push	%r15
1374238384Sjkim	lea	-0x48(%rsp),%rsp
1375238384Sjkim___
# Win64 only: reserve a further 0xa0 bytes and save %xmm6-%xmm15, which the
# Win64 calling convention requires a callee to preserve.
1376238384Sjkim$code.=<<___ if ($win64);
1377238384Sjkim	lea	-0xa0(%rsp), %rsp
1378238384Sjkim	movaps	%xmm6, 0x40(%rsp)
1379238384Sjkim	movaps	%xmm7, 0x50(%rsp)
1380238384Sjkim	movaps	%xmm8, 0x60(%rsp)
1381238384Sjkim	movaps	%xmm9, 0x70(%rsp)
1382238384Sjkim	movaps	%xmm10, 0x80(%rsp)
1383238384Sjkim	movaps	%xmm11, 0x90(%rsp)
1384238384Sjkim	movaps	%xmm12, 0xa0(%rsp)
1385238384Sjkim	movaps	%xmm13, 0xb0(%rsp)
1386238384Sjkim	movaps	%xmm14, 0xc0(%rsp)
1387238384Sjkim	movaps	%xmm15, 0xd0(%rsp)
1388238384Sjkim.Lecb_dec_body:
1389238384Sjkim___
# Main body of bsaes_ecb_decrypt_blocks.
#
#   $arg1  input pointer          -> $inp
#   $arg2  output pointer         -> $out
#   $arg3  length in 16-byte blocks (each 8-block pass consumes 0x80 bytes
#          and subtracts 8)       -> $len
#   $arg4  key; the round count is read from offset 240 -> $key
#
# %rbp keeps the value of %rsp from before any key schedule is allocated.
#
# Fewer than 8 blocks: .Lecb_dec_short decrypts one block per iteration via
# asm_AES_decrypt and no key schedule is built.
# NOTE(review): a count of 0 also takes this path, which processes a block
# before testing the counter -- callers presumably never pass 0; confirm.
#
# Otherwise rounds*128-96 bytes are carved out of the stack (the backquoted
# expression is presumably evaluated when $code is post-processed) and the
# key is converted into it. For decryption the fix-up differs from the
# encrypt side: %xmm7 is XORed into the round 0 key at (%rsp) and the last
# round key (%xmm6) is stored unmodified at the address left in %rax.
# The main loop decrypts eight blocks per call of _bsaes_decrypt8 and writes
# them out in the register order that routine leaves them in
# (0,1,6,4,2,7,3,5). A tail of 1..7 blocks loads only the blocks that exist,
# still runs the full 8-block routine and stores only the valid results.
# Each `cmp` in the tail dispatch feeds two branches; the movdqu between
# them does not alter the flags.
#
# .Lecb_dec_done zeroes the stack from %rsp upward in 32-byte steps to wipe
# the key schedule. The loop has to continue while %rax is still below %rbp,
# which in AT&T operand order is `cmp %rax,%rbp` / `ja`. The previous `jb`
# tested the opposite relation: with a key schedule present it stopped after
# the first 32 bytes, and on the short path (%rsp == %rbp) it never
# terminated and kept zeroing memory above the frame.
1390238384Sjkim$code.=<<___;
1391238384Sjkim	mov	%rsp,%rbp		# backup %rsp
1392238384Sjkim	mov	240($arg4),%eax		# rounds
1393238384Sjkim	mov	$arg1,$inp		# backup arguments
1394238384Sjkim	mov	$arg2,$out
1395238384Sjkim	mov	$arg3,$len
1396238384Sjkim	mov	$arg4,$key
1397238384Sjkim	cmp	\$8,$arg3
1398238384Sjkim	jb	.Lecb_dec_short
1399238384Sjkim
1400238384Sjkim	mov	%eax,%ebx		# backup rounds
1401238384Sjkim	shl	\$7,%rax		# 128 bytes per inner round key
1402238384Sjkim	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1403238384Sjkim	sub	%rax,%rsp
1404238384Sjkim	mov	%rsp,%rax		# pass key schedule
1405238384Sjkim	mov	$key,%rcx		# pass key
1406238384Sjkim	mov	%ebx,%r10d		# pass rounds
1407238384Sjkim	call	_bsaes_key_convert
1408238384Sjkim	pxor	(%rsp),%xmm7		# fix up 0 round key
1409238384Sjkim	movdqa	%xmm6,(%rax)		# save last round key
1410238384Sjkim	movdqa	%xmm7,(%rsp)
1411238384Sjkim
1412238384Sjkim	sub	\$8,$len
1413238384Sjkim.Lecb_dec_loop:
1414238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1415238384Sjkim	movdqu	0x10($inp), @XMM[1]
1416238384Sjkim	movdqu	0x20($inp), @XMM[2]
1417238384Sjkim	movdqu	0x30($inp), @XMM[3]
1418238384Sjkim	movdqu	0x40($inp), @XMM[4]
1419238384Sjkim	movdqu	0x50($inp), @XMM[5]
1420238384Sjkim	mov	%rsp, %rax		# pass key schedule
1421238384Sjkim	movdqu	0x60($inp), @XMM[6]
1422238384Sjkim	mov	%ebx,%r10d		# pass rounds
1423238384Sjkim	movdqu	0x70($inp), @XMM[7]
1424238384Sjkim	lea	0x80($inp), $inp
1425238384Sjkim
1426238384Sjkim	call	_bsaes_decrypt8
1427238384Sjkim
1428238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1429238384Sjkim	movdqu	@XMM[1], 0x10($out)
1430238384Sjkim	movdqu	@XMM[6], 0x20($out)
1431238384Sjkim	movdqu	@XMM[4], 0x30($out)
1432238384Sjkim	movdqu	@XMM[2], 0x40($out)
1433238384Sjkim	movdqu	@XMM[7], 0x50($out)
1434238384Sjkim	movdqu	@XMM[3], 0x60($out)
1435238384Sjkim	movdqu	@XMM[5], 0x70($out)
1436238384Sjkim	lea	0x80($out), $out
1437238384Sjkim	sub	\$8,$len
1438238384Sjkim	jnc	.Lecb_dec_loop
1439238384Sjkim
1440238384Sjkim	add	\$8,$len
1441238384Sjkim	jz	.Lecb_dec_done
1442238384Sjkim
1443238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1444238384Sjkim	mov	%rsp, %rax		# pass key schedule
1445238384Sjkim	mov	%ebx,%r10d		# pass rounds
1446238384Sjkim	cmp	\$2,$len
1447238384Sjkim	jb	.Lecb_dec_one
1448238384Sjkim	movdqu	0x10($inp), @XMM[1]
1449238384Sjkim	je	.Lecb_dec_two
1450238384Sjkim	movdqu	0x20($inp), @XMM[2]
1451238384Sjkim	cmp	\$4,$len
1452238384Sjkim	jb	.Lecb_dec_three
1453238384Sjkim	movdqu	0x30($inp), @XMM[3]
1454238384Sjkim	je	.Lecb_dec_four
1455238384Sjkim	movdqu	0x40($inp), @XMM[4]
1456238384Sjkim	cmp	\$6,$len
1457238384Sjkim	jb	.Lecb_dec_five
1458238384Sjkim	movdqu	0x50($inp), @XMM[5]
1459238384Sjkim	je	.Lecb_dec_six
1460238384Sjkim	movdqu	0x60($inp), @XMM[6]
1461238384Sjkim	call	_bsaes_decrypt8
1462238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1463238384Sjkim	movdqu	@XMM[1], 0x10($out)
1464238384Sjkim	movdqu	@XMM[6], 0x20($out)
1465238384Sjkim	movdqu	@XMM[4], 0x30($out)
1466238384Sjkim	movdqu	@XMM[2], 0x40($out)
1467238384Sjkim	movdqu	@XMM[7], 0x50($out)
1468238384Sjkim	movdqu	@XMM[3], 0x60($out)
1469238384Sjkim	jmp	.Lecb_dec_done
1470238384Sjkim.align	16
1471238384Sjkim.Lecb_dec_six:
1472238384Sjkim	call	_bsaes_decrypt8
1473238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1474238384Sjkim	movdqu	@XMM[1], 0x10($out)
1475238384Sjkim	movdqu	@XMM[6], 0x20($out)
1476238384Sjkim	movdqu	@XMM[4], 0x30($out)
1477238384Sjkim	movdqu	@XMM[2], 0x40($out)
1478238384Sjkim	movdqu	@XMM[7], 0x50($out)
1479238384Sjkim	jmp	.Lecb_dec_done
1480238384Sjkim.align	16
1481238384Sjkim.Lecb_dec_five:
1482238384Sjkim	call	_bsaes_decrypt8
1483238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1484238384Sjkim	movdqu	@XMM[1], 0x10($out)
1485238384Sjkim	movdqu	@XMM[6], 0x20($out)
1486238384Sjkim	movdqu	@XMM[4], 0x30($out)
1487238384Sjkim	movdqu	@XMM[2], 0x40($out)
1488238384Sjkim	jmp	.Lecb_dec_done
1489238384Sjkim.align	16
1490238384Sjkim.Lecb_dec_four:
1491238384Sjkim	call	_bsaes_decrypt8
1492238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1493238384Sjkim	movdqu	@XMM[1], 0x10($out)
1494238384Sjkim	movdqu	@XMM[6], 0x20($out)
1495238384Sjkim	movdqu	@XMM[4], 0x30($out)
1496238384Sjkim	jmp	.Lecb_dec_done
1497238384Sjkim.align	16
1498238384Sjkim.Lecb_dec_three:
1499238384Sjkim	call	_bsaes_decrypt8
1500238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1501238384Sjkim	movdqu	@XMM[1], 0x10($out)
1502238384Sjkim	movdqu	@XMM[6], 0x20($out)
1503238384Sjkim	jmp	.Lecb_dec_done
1504238384Sjkim.align	16
1505238384Sjkim.Lecb_dec_two:
1506238384Sjkim	call	_bsaes_decrypt8
1507238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1508238384Sjkim	movdqu	@XMM[1], 0x10($out)
1509238384Sjkim	jmp	.Lecb_dec_done
1510238384Sjkim.align	16
1511238384Sjkim.Lecb_dec_one:
1512238384Sjkim	call	_bsaes_decrypt8
1513238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1514238384Sjkim	jmp	.Lecb_dec_done
1515238384Sjkim.align	16
1516238384Sjkim.Lecb_dec_short:
1517238384Sjkim	lea	($inp), $arg1
1518238384Sjkim	lea	($out), $arg2
1519238384Sjkim	lea	($key), $arg3
1520238384Sjkim	call	asm_AES_decrypt
1521238384Sjkim	lea	16($inp), $inp
1522238384Sjkim	lea	16($out), $out
1523238384Sjkim	dec	$len
1524238384Sjkim	jnz	.Lecb_dec_short
1525238384Sjkim
1526238384Sjkim.Lecb_dec_done:
1527238384Sjkim	lea	(%rsp),%rax
1528238384Sjkim	pxor	%xmm0, %xmm0
1529238384Sjkim.Lecb_dec_bzero:			# wipe key schedule [if any]
1530238384Sjkim	movdqa	%xmm0, 0x00(%rax)
1531238384Sjkim	movdqa	%xmm0, 0x10(%rax)
1532238384Sjkim	lea	0x20(%rax), %rax
1533238384Sjkim	cmp	%rax, %rbp
1534238384Sjkim	ja	.Lecb_dec_bzero
1535238384Sjkim
1536238384Sjkim	lea	(%rbp),%rsp		# restore %rsp
1537238384Sjkim___
# Win64 only: reload %xmm6-%xmm15 from the offsets they were saved at
# (relative to %rbp, which equals the post-prologue %rsp) and release the
# extra 0xa0 bytes.
1538238384Sjkim$code.=<<___ if ($win64);
1539238384Sjkim	movaps	0x40(%rbp), %xmm6
1540238384Sjkim	movaps	0x50(%rbp), %xmm7
1541238384Sjkim	movaps	0x60(%rbp), %xmm8
1542238384Sjkim	movaps	0x70(%rbp), %xmm9
1543238384Sjkim	movaps	0x80(%rbp), %xmm10
1544238384Sjkim	movaps	0x90(%rbp), %xmm11
1545238384Sjkim	movaps	0xa0(%rbp), %xmm12
1546238384Sjkim	movaps	0xb0(%rbp), %xmm13
1547238384Sjkim	movaps	0xc0(%rbp), %xmm14
1548238384Sjkim	movaps	0xd0(%rbp), %xmm15
1549238384Sjkim	lea	0xa0(%rbp), %rsp
1550238384Sjkim___
# Epilogue of bsaes_ecb_decrypt_blocks: the pushed registers are reloaded by
# offset rather than popped. Above the 0x48-byte scratch area they sit in
# reverse push order (%r15 at 0x48 ... %rbx at 0x68, caller's %rbp at 0x70).
# The saved %rbp travels through %rax so that %rsp can be released first.
1551238384Sjkim$code.=<<___;
1552238384Sjkim	mov	0x48(%rsp), %r15
1553238384Sjkim	mov	0x50(%rsp), %r14
1554238384Sjkim	mov	0x58(%rsp), %r13
1555238384Sjkim	mov	0x60(%rsp), %r12
1556238384Sjkim	mov	0x68(%rsp), %rbx
1557238384Sjkim	mov	0x70(%rsp), %rax
1558238384Sjkim	lea	0x78(%rsp), %rsp
1559238384Sjkim	mov	%rax, %rbp
1560238384Sjkim.Lecb_dec_epilogue:
1561238384Sjkim	ret
1562238384Sjkim.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1563238384Sjkim___
1564238384Sjkim}
1565238384Sjkim$code.=<<___;
1566238384Sjkim.extern	asm_AES_cbc_encrypt
1567238384Sjkim.globl	bsaes_cbc_encrypt
1568238384Sjkim.type	bsaes_cbc_encrypt,\@abi-omnipotent
1569238384Sjkim.align	16
1570238384Sjkimbsaes_cbc_encrypt:
1571238384Sjkim___
1572238384Sjkim$code.=<<___ if ($win64);
1573238384Sjkim	mov	48(%rsp),$arg6		# pull direction flag
1574238384Sjkim___
1575238384Sjkim$code.=<<___;
1576238384Sjkim	cmp	\$0,$arg6
1577238384Sjkim	jne	asm_AES_cbc_encrypt
1578238384Sjkim	cmp	\$128,$arg3
1579238384Sjkim	jb	asm_AES_cbc_encrypt
1580238384Sjkim
1581238384Sjkim	mov	%rsp, %rax
1582238384Sjkim.Lcbc_dec_prologue:
1583238384Sjkim	push	%rbp
1584238384Sjkim	push	%rbx
1585238384Sjkim	push	%r12
1586238384Sjkim	push	%r13
1587238384Sjkim	push	%r14
1588238384Sjkim	push	%r15
1589238384Sjkim	lea	-0x48(%rsp), %rsp
1590238384Sjkim___
1591238384Sjkim$code.=<<___ if ($win64);
1592238384Sjkim	mov	0xa0(%rsp),$arg5	# pull ivp
1593238384Sjkim	lea	-0xa0(%rsp), %rsp
1594238384Sjkim	movaps	%xmm6, 0x40(%rsp)
1595238384Sjkim	movaps	%xmm7, 0x50(%rsp)
1596238384Sjkim	movaps	%xmm8, 0x60(%rsp)
1597238384Sjkim	movaps	%xmm9, 0x70(%rsp)
1598238384Sjkim	movaps	%xmm10, 0x80(%rsp)
1599238384Sjkim	movaps	%xmm11, 0x90(%rsp)
1600238384Sjkim	movaps	%xmm12, 0xa0(%rsp)
1601238384Sjkim	movaps	%xmm13, 0xb0(%rsp)
1602238384Sjkim	movaps	%xmm14, 0xc0(%rsp)
1603238384Sjkim	movaps	%xmm15, 0xd0(%rsp)
1604238384Sjkim.Lcbc_dec_body:
1605238384Sjkim___
1606238384Sjkim$code.=<<___;
1607238384Sjkim	mov	%rsp, %rbp		# backup %rsp
1608238384Sjkim	mov	240($arg4), %eax	# rounds
1609238384Sjkim	mov	$arg1, $inp		# backup arguments
1610238384Sjkim	mov	$arg2, $out
1611238384Sjkim	mov	$arg3, $len
1612238384Sjkim	mov	$arg4, $key
1613238384Sjkim	mov	$arg5, %rbx
1614238384Sjkim	shr	\$4, $len		# bytes to blocks
1615238384Sjkim
1616238384Sjkim	mov	%eax, %edx		# rounds
1617238384Sjkim	shl	\$7, %rax		# 128 bytes per inner round key
1618238384Sjkim	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1619238384Sjkim	sub	%rax, %rsp
1620238384Sjkim
1621238384Sjkim	mov	%rsp, %rax		# pass key schedule
1622238384Sjkim	mov	$key, %rcx		# pass key
1623238384Sjkim	mov	%edx, %r10d		# pass rounds
1624238384Sjkim	call	_bsaes_key_convert
1625238384Sjkim	pxor	(%rsp),%xmm7		# fix up 0 round key
1626238384Sjkim	movdqa	%xmm6,(%rax)		# save last round key
1627238384Sjkim	movdqa	%xmm7,(%rsp)
1628238384Sjkim
1629238384Sjkim	movdqu	(%rbx), @XMM[15]	# load IV
1630238384Sjkim	sub	\$8,$len
1631238384Sjkim.Lcbc_dec_loop:
1632238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1633238384Sjkim	movdqu	0x10($inp), @XMM[1]
1634238384Sjkim	movdqu	0x20($inp), @XMM[2]
1635238384Sjkim	movdqu	0x30($inp), @XMM[3]
1636238384Sjkim	movdqu	0x40($inp), @XMM[4]
1637238384Sjkim	movdqu	0x50($inp), @XMM[5]
1638238384Sjkim	mov	%rsp, %rax		# pass key schedule
1639238384Sjkim	movdqu	0x60($inp), @XMM[6]
1640238384Sjkim	mov	%edx,%r10d		# pass rounds
1641238384Sjkim	movdqu	0x70($inp), @XMM[7]
1642238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1643238384Sjkim
1644238384Sjkim	call	_bsaes_decrypt8
1645238384Sjkim
1646238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1647238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1648238384Sjkim	movdqu	0x10($inp), @XMM[9]
1649238384Sjkim	pxor	@XMM[8], @XMM[1]
1650238384Sjkim	movdqu	0x20($inp), @XMM[10]
1651238384Sjkim	pxor	@XMM[9], @XMM[6]
1652238384Sjkim	movdqu	0x30($inp), @XMM[11]
1653238384Sjkim	pxor	@XMM[10], @XMM[4]
1654238384Sjkim	movdqu	0x40($inp), @XMM[12]
1655238384Sjkim	pxor	@XMM[11], @XMM[2]
1656238384Sjkim	movdqu	0x50($inp), @XMM[13]
1657238384Sjkim	pxor	@XMM[12], @XMM[7]
1658238384Sjkim	movdqu	0x60($inp), @XMM[14]
1659238384Sjkim	pxor	@XMM[13], @XMM[3]
1660238384Sjkim	movdqu	0x70($inp), @XMM[15]	# IV
1661238384Sjkim	pxor	@XMM[14], @XMM[5]
1662238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1663238384Sjkim	lea	0x80($inp), $inp
1664238384Sjkim	movdqu	@XMM[1], 0x10($out)
1665238384Sjkim	movdqu	@XMM[6], 0x20($out)
1666238384Sjkim	movdqu	@XMM[4], 0x30($out)
1667238384Sjkim	movdqu	@XMM[2], 0x40($out)
1668238384Sjkim	movdqu	@XMM[7], 0x50($out)
1669238384Sjkim	movdqu	@XMM[3], 0x60($out)
1670238384Sjkim	movdqu	@XMM[5], 0x70($out)
1671238384Sjkim	lea	0x80($out), $out
1672238384Sjkim	sub	\$8,$len
1673238384Sjkim	jnc	.Lcbc_dec_loop
1674238384Sjkim
1675238384Sjkim	add	\$8,$len
1676238384Sjkim	jz	.Lcbc_dec_done
1677238384Sjkim
1678238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1679238384Sjkim	mov	%rsp, %rax		# pass key schedule
1680238384Sjkim	mov	%edx, %r10d		# pass rounds
1681238384Sjkim	cmp	\$2,$len
1682238384Sjkim	jb	.Lcbc_dec_one
1683238384Sjkim	movdqu	0x10($inp), @XMM[1]
1684238384Sjkim	je	.Lcbc_dec_two
1685238384Sjkim	movdqu	0x20($inp), @XMM[2]
1686238384Sjkim	cmp	\$4,$len
1687238384Sjkim	jb	.Lcbc_dec_three
1688238384Sjkim	movdqu	0x30($inp), @XMM[3]
1689238384Sjkim	je	.Lcbc_dec_four
1690238384Sjkim	movdqu	0x40($inp), @XMM[4]
1691238384Sjkim	cmp	\$6,$len
1692238384Sjkim	jb	.Lcbc_dec_five
1693238384Sjkim	movdqu	0x50($inp), @XMM[5]
1694238384Sjkim	je	.Lcbc_dec_six
1695238384Sjkim	movdqu	0x60($inp), @XMM[6]
1696238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1697238384Sjkim	call	_bsaes_decrypt8
1698238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1699238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1700238384Sjkim	movdqu	0x10($inp), @XMM[9]
1701238384Sjkim	pxor	@XMM[8], @XMM[1]
1702238384Sjkim	movdqu	0x20($inp), @XMM[10]
1703238384Sjkim	pxor	@XMM[9], @XMM[6]
1704238384Sjkim	movdqu	0x30($inp), @XMM[11]
1705238384Sjkim	pxor	@XMM[10], @XMM[4]
1706238384Sjkim	movdqu	0x40($inp), @XMM[12]
1707238384Sjkim	pxor	@XMM[11], @XMM[2]
1708238384Sjkim	movdqu	0x50($inp), @XMM[13]
1709238384Sjkim	pxor	@XMM[12], @XMM[7]
1710238384Sjkim	movdqu	0x60($inp), @XMM[15]	# IV
1711238384Sjkim	pxor	@XMM[13], @XMM[3]
1712238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1713238384Sjkim	movdqu	@XMM[1], 0x10($out)
1714238384Sjkim	movdqu	@XMM[6], 0x20($out)
1715238384Sjkim	movdqu	@XMM[4], 0x30($out)
1716238384Sjkim	movdqu	@XMM[2], 0x40($out)
1717238384Sjkim	movdqu	@XMM[7], 0x50($out)
1718238384Sjkim	movdqu	@XMM[3], 0x60($out)
1719238384Sjkim	jmp	.Lcbc_dec_done
1720238384Sjkim.align	16
1721238384Sjkim.Lcbc_dec_six:
1722238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1723238384Sjkim	call	_bsaes_decrypt8
1724238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1725238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1726238384Sjkim	movdqu	0x10($inp), @XMM[9]
1727238384Sjkim	pxor	@XMM[8], @XMM[1]
1728238384Sjkim	movdqu	0x20($inp), @XMM[10]
1729238384Sjkim	pxor	@XMM[9], @XMM[6]
1730238384Sjkim	movdqu	0x30($inp), @XMM[11]
1731238384Sjkim	pxor	@XMM[10], @XMM[4]
1732238384Sjkim	movdqu	0x40($inp), @XMM[12]
1733238384Sjkim	pxor	@XMM[11], @XMM[2]
1734238384Sjkim	movdqu	0x50($inp), @XMM[15]	# IV
1735238384Sjkim	pxor	@XMM[12], @XMM[7]
1736238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1737238384Sjkim	movdqu	@XMM[1], 0x10($out)
1738238384Sjkim	movdqu	@XMM[6], 0x20($out)
1739238384Sjkim	movdqu	@XMM[4], 0x30($out)
1740238384Sjkim	movdqu	@XMM[2], 0x40($out)
1741238384Sjkim	movdqu	@XMM[7], 0x50($out)
1742238384Sjkim	jmp	.Lcbc_dec_done
1743238384Sjkim.align	16
1744238384Sjkim.Lcbc_dec_five:
1745238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1746238384Sjkim	call	_bsaes_decrypt8
1747238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1748238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1749238384Sjkim	movdqu	0x10($inp), @XMM[9]
1750238384Sjkim	pxor	@XMM[8], @XMM[1]
1751238384Sjkim	movdqu	0x20($inp), @XMM[10]
1752238384Sjkim	pxor	@XMM[9], @XMM[6]
1753238384Sjkim	movdqu	0x30($inp), @XMM[11]
1754238384Sjkim	pxor	@XMM[10], @XMM[4]
1755238384Sjkim	movdqu	0x40($inp), @XMM[15]	# IV
1756238384Sjkim	pxor	@XMM[11], @XMM[2]
1757238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1758238384Sjkim	movdqu	@XMM[1], 0x10($out)
1759238384Sjkim	movdqu	@XMM[6], 0x20($out)
1760238384Sjkim	movdqu	@XMM[4], 0x30($out)
1761238384Sjkim	movdqu	@XMM[2], 0x40($out)
1762238384Sjkim	jmp	.Lcbc_dec_done
1763238384Sjkim.align	16
1764238384Sjkim.Lcbc_dec_four:
1765238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1766238384Sjkim	call	_bsaes_decrypt8
1767238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1768238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1769238384Sjkim	movdqu	0x10($inp), @XMM[9]
1770238384Sjkim	pxor	@XMM[8], @XMM[1]
1771238384Sjkim	movdqu	0x20($inp), @XMM[10]
1772238384Sjkim	pxor	@XMM[9], @XMM[6]
1773238384Sjkim	movdqu	0x30($inp), @XMM[15]	# IV
1774238384Sjkim	pxor	@XMM[10], @XMM[4]
1775238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1776238384Sjkim	movdqu	@XMM[1], 0x10($out)
1777238384Sjkim	movdqu	@XMM[6], 0x20($out)
1778238384Sjkim	movdqu	@XMM[4], 0x30($out)
1779238384Sjkim	jmp	.Lcbc_dec_done
1780238384Sjkim.align	16
1781238384Sjkim.Lcbc_dec_three:
1782238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1783238384Sjkim	call	_bsaes_decrypt8
1784238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1785238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1786238384Sjkim	movdqu	0x10($inp), @XMM[9]
1787238384Sjkim	pxor	@XMM[8], @XMM[1]
1788238384Sjkim	movdqu	0x20($inp), @XMM[15]	# IV
1789238384Sjkim	pxor	@XMM[9], @XMM[6]
1790238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1791238384Sjkim	movdqu	@XMM[1], 0x10($out)
1792238384Sjkim	movdqu	@XMM[6], 0x20($out)
1793238384Sjkim	jmp	.Lcbc_dec_done
1794238384Sjkim.align	16
1795238384Sjkim.Lcbc_dec_two:
1796238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1797238384Sjkim	call	_bsaes_decrypt8
1798238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1799238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1800238384Sjkim	movdqu	0x10($inp), @XMM[15]	# IV
1801238384Sjkim	pxor	@XMM[8], @XMM[1]
1802238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1803238384Sjkim	movdqu	@XMM[1], 0x10($out)
1804238384Sjkim	jmp	.Lcbc_dec_done
1805238384Sjkim.align	16
1806238384Sjkim.Lcbc_dec_one:
1807238384Sjkim	lea	($inp), $arg1
1808238384Sjkim	lea	0x20(%rbp), $arg2	# buffer output
1809238384Sjkim	lea	($key), $arg3
1810238384Sjkim	call	asm_AES_decrypt		# doesn't touch %xmm
1811238384Sjkim	pxor	0x20(%rbp), @XMM[15]	# ^= IV
1812238384Sjkim	movdqu	@XMM[15], ($out)	# write output
1813238384Sjkim	movdqa	@XMM[0], @XMM[15]	# IV
1814238384Sjkim
1815238384Sjkim.Lcbc_dec_done:
1816238384Sjkim	movdqu	@XMM[15], (%rbx)	# return IV
1817238384Sjkim	lea	(%rsp), %rax
1818238384Sjkim	pxor	%xmm0, %xmm0
1819238384Sjkim.Lcbc_dec_bzero:			# wipe key schedule [if any]
1820238384Sjkim	movdqa	%xmm0, 0x00(%rax)
1821238384Sjkim	movdqa	%xmm0, 0x10(%rax)
1822238384Sjkim	lea	0x20(%rax), %rax
1823238384Sjkim	cmp	%rax, %rbp
1824238384Sjkim	ja	.Lcbc_dec_bzero
1825238384Sjkim
1826238384Sjkim	lea	(%rbp),%rsp		# restore %rsp
1827238384Sjkim___
1828238384Sjkim$code.=<<___ if ($win64);
1829238384Sjkim	movaps	0x40(%rbp), %xmm6
1830238384Sjkim	movaps	0x50(%rbp), %xmm7
1831238384Sjkim	movaps	0x60(%rbp), %xmm8
1832238384Sjkim	movaps	0x70(%rbp), %xmm9
1833238384Sjkim	movaps	0x80(%rbp), %xmm10
1834238384Sjkim	movaps	0x90(%rbp), %xmm11
1835238384Sjkim	movaps	0xa0(%rbp), %xmm12
1836238384Sjkim	movaps	0xb0(%rbp), %xmm13
1837238384Sjkim	movaps	0xc0(%rbp), %xmm14
1838238384Sjkim	movaps	0xd0(%rbp), %xmm15
1839238384Sjkim	lea	0xa0(%rbp), %rsp
1840238384Sjkim___
1841238384Sjkim$code.=<<___;
1842238384Sjkim	mov	0x48(%rsp), %r15
1843238384Sjkim	mov	0x50(%rsp), %r14
1844238384Sjkim	mov	0x58(%rsp), %r13
1845238384Sjkim	mov	0x60(%rsp), %r12
1846238384Sjkim	mov	0x68(%rsp), %rbx
1847238384Sjkim	mov	0x70(%rsp), %rax
1848238384Sjkim	lea	0x78(%rsp), %rsp
1849238384Sjkim	mov	%rax, %rbp
1850238384Sjkim.Lcbc_dec_epilogue:
1851238384Sjkim	ret
1852238384Sjkim.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1853238384Sjkim
#
# bsaes_ctr32_encrypt_blocks: CTR-mode encryption with a 32-bit
# big-endian counter kept in the last four bytes of the 16-byte
# counter block.  As used by the code below: arg1 = input,
# arg2 = output, arg3 = number of 16-byte blocks, arg4 = AES key
# (round count read from offset 240), arg5 = counter block.
# With fewer than 8 blocks the work is done one block at a time via
# asm_AES_encrypt; otherwise the key is converted to bit-sliced form
# on the stack and 8 counter blocks are encrypted per iteration.
# Nothing here stores the advanced counter back through arg5.
#
1854238384Sjkim.globl	bsaes_ctr32_encrypt_blocks
1855238384Sjkim.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1856238384Sjkim.align	16
1857238384Sjkimbsaes_ctr32_encrypt_blocks:
1858238384Sjkim	mov	%rsp, %rax
1859238384Sjkim.Lctr_enc_prologue:
1860238384Sjkim	push	%rbp
1861238384Sjkim	push	%rbx
1862238384Sjkim	push	%r12
1863238384Sjkim	push	%r13
1864238384Sjkim	push	%r14
1865238384Sjkim	push	%r15
1866238384Sjkim	lea	-0x48(%rsp), %rsp	# 0x48 bytes of local scratch
1867238384Sjkim___
1868238384Sjkim$code.=<<___ if ($win64);
1869238384Sjkim	mov	0xa0(%rsp),$arg5	# pull ivp
1870238384Sjkim	lea	-0xa0(%rsp), %rsp	# room to save xmm6-xmm15
1871238384Sjkim	movaps	%xmm6, 0x40(%rsp)
1872238384Sjkim	movaps	%xmm7, 0x50(%rsp)
1873238384Sjkim	movaps	%xmm8, 0x60(%rsp)
1874238384Sjkim	movaps	%xmm9, 0x70(%rsp)
1875238384Sjkim	movaps	%xmm10, 0x80(%rsp)
1876238384Sjkim	movaps	%xmm11, 0x90(%rsp)
1877238384Sjkim	movaps	%xmm12, 0xa0(%rsp)
1878238384Sjkim	movaps	%xmm13, 0xb0(%rsp)
1879238384Sjkim	movaps	%xmm14, 0xc0(%rsp)
1880238384Sjkim	movaps	%xmm15, 0xd0(%rsp)
1881238384Sjkim.Lctr_enc_body:
1882238384Sjkim___
1883238384Sjkim$code.=<<___;
1884238384Sjkim	mov	%rsp, %rbp		# backup %rsp
1885238384Sjkim	movdqu	($arg5), %xmm0		# load counter
1886238384Sjkim	mov	240($arg4), %eax	# rounds
1887238384Sjkim	mov	$arg1, $inp		# backup arguments
1888238384Sjkim	mov	$arg2, $out
1889238384Sjkim	mov	$arg3, $len
1890238384Sjkim	mov	$arg4, $key
1891238384Sjkim	movdqa	%xmm0, 0x20(%rbp)	# copy counter
1892238384Sjkim	cmp	\$8, $arg3
1893238384Sjkim	jb	.Lctr_enc_short	# fewer than 8 blocks: go one block at a time
1894238384Sjkim
1895238384Sjkim	mov	%eax, %ebx		# rounds
1896238384Sjkim	shl	\$7, %rax		# 128 bytes per inner round key
1897238384Sjkim	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1898238384Sjkim	sub	%rax, %rsp
1899238384Sjkim
1900238384Sjkim	mov	%rsp, %rax		# pass key schedule
1901238384Sjkim	mov	$key, %rcx		# pass key
1902238384Sjkim	mov	%ebx, %r10d		# pass rounds
1903238384Sjkim	call	_bsaes_key_convert
1904238384Sjkim	pxor	%xmm6,%xmm7		# fix up last round key
1905238384Sjkim	movdqa	%xmm7,(%rax)		# save last round key
1906238384Sjkim
1907238384Sjkim	movdqa	(%rsp), @XMM[9]		# load round0 key
1908238384Sjkim	lea	.LADD1(%rip), %r11
1909238384Sjkim	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1910238384Sjkim	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1911238384Sjkim	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1912238384Sjkim	pshufb	@XMM[8], @XMM[0]
1913238384Sjkim	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
1914238384Sjkim	jmp	.Lctr_enc_loop
1915238384Sjkim.align	16
1916238384Sjkim.Lctr_enc_loop:
1917238384Sjkim	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1918238384Sjkim	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1919238384Sjkim	movdqa	@XMM[0], @XMM[2]
1920238384Sjkim	paddd	0x00(%r11), @XMM[1]	# .LADD1
1921238384Sjkim	movdqa	@XMM[0], @XMM[3]
1922238384Sjkim	paddd	0x10(%r11), @XMM[2]	# .LADD2
1923238384Sjkim	movdqa	@XMM[0], @XMM[4]
1924238384Sjkim	paddd	0x20(%r11), @XMM[3]	# .LADD3
1925238384Sjkim	movdqa	@XMM[0], @XMM[5]
1926238384Sjkim	paddd	0x30(%r11), @XMM[4]	# .LADD4
1927238384Sjkim	movdqa	@XMM[0], @XMM[6]
1928238384Sjkim	paddd	0x40(%r11), @XMM[5]	# .LADD5
1929238384Sjkim	movdqa	@XMM[0], @XMM[7]
1930238384Sjkim	paddd	0x50(%r11), @XMM[6]	# .LADD6
1931238384Sjkim	paddd	0x60(%r11), @XMM[7]	# .LADD7
1932238384Sjkim
1933238384Sjkim	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
1934238384Sjkim	# to flip byte order in 32-bit counter
1935238384Sjkim	movdqa	(%rsp), @XMM[9]		# round 0 key
1936238384Sjkim	lea	0x10(%rsp), %rax	# pass key schedule
1937238384Sjkim	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
1938238384Sjkim	pxor	@XMM[9], @XMM[0]	# xor with round0 key
1939238384Sjkim	pxor	@XMM[9], @XMM[1]
1940238384Sjkim	 pshufb	@XMM[8], @XMM[0]
1941238384Sjkim	pxor	@XMM[9], @XMM[2]
1942238384Sjkim	 pshufb	@XMM[8], @XMM[1]
1943238384Sjkim	pxor	@XMM[9], @XMM[3]
1944238384Sjkim	 pshufb	@XMM[8], @XMM[2]
1945238384Sjkim	pxor	@XMM[9], @XMM[4]
1946238384Sjkim	 pshufb	@XMM[8], @XMM[3]
1947238384Sjkim	pxor	@XMM[9], @XMM[5]
1948238384Sjkim	 pshufb	@XMM[8], @XMM[4]
1949238384Sjkim	pxor	@XMM[9], @XMM[6]
1950238384Sjkim	 pshufb	@XMM[8], @XMM[5]
1951238384Sjkim	pxor	@XMM[9], @XMM[7]
1952238384Sjkim	 pshufb	@XMM[8], @XMM[6]
1953238384Sjkim	lea	.LBS0(%rip), %r11	# constants table
1954238384Sjkim	 pshufb	@XMM[8], @XMM[7]
1955238384Sjkim	mov	%ebx,%r10d		# pass rounds
1956238384Sjkim
1957238384Sjkim	call	_bsaes_encrypt8_bitslice	# key stream lands in XMM indices 0,1,4,6,3,7,2,5
1958238384Sjkim
1959238384Sjkim	sub	\$8,$len
1960238384Sjkim	jc	.Lctr_enc_loop_done	# fewer than 8 blocks were left: partial output
1961238384Sjkim
1962238384Sjkim	movdqu	0x00($inp), @XMM[8]	# load input
1963238384Sjkim	movdqu	0x10($inp), @XMM[9]
1964238384Sjkim	movdqu	0x20($inp), @XMM[10]
1965238384Sjkim	movdqu	0x30($inp), @XMM[11]
1966238384Sjkim	movdqu	0x40($inp), @XMM[12]
1967238384Sjkim	movdqu	0x50($inp), @XMM[13]
1968238384Sjkim	movdqu	0x60($inp), @XMM[14]
1969238384Sjkim	movdqu	0x70($inp), @XMM[15]
1970238384Sjkim	lea	0x80($inp),$inp
1971238384Sjkim	pxor	@XMM[0], @XMM[8]
1972238384Sjkim	movdqa	0x20(%rbp), @XMM[0]	# load counter
1973238384Sjkim	pxor	@XMM[9], @XMM[1]
1974238384Sjkim	movdqu	@XMM[8], 0x00($out)	# write output
1975238384Sjkim	pxor	@XMM[10], @XMM[4]
1976238384Sjkim	movdqu	@XMM[1], 0x10($out)
1977238384Sjkim	pxor	@XMM[11], @XMM[6]
1978238384Sjkim	movdqu	@XMM[4], 0x20($out)
1979238384Sjkim	pxor	@XMM[12], @XMM[3]
1980238384Sjkim	movdqu	@XMM[6], 0x30($out)
1981238384Sjkim	pxor	@XMM[13], @XMM[7]
1982238384Sjkim	movdqu	@XMM[3], 0x40($out)
1983238384Sjkim	pxor	@XMM[14], @XMM[2]
1984238384Sjkim	movdqu	@XMM[7], 0x50($out)
1985238384Sjkim	pxor	@XMM[15], @XMM[5]
1986238384Sjkim	movdqu	@XMM[2], 0x60($out)
1987238384Sjkim	lea	.LADD1(%rip), %r11
1988238384Sjkim	movdqu	@XMM[5], 0x70($out)
1989238384Sjkim	lea	0x80($out), $out
1990238384Sjkim	paddd	0x70(%r11), @XMM[0]	# .LADD8
1991238384Sjkim	jnz	.Lctr_enc_loop	# tests the flags of the sub above; nothing in between alters them
1992238384Sjkim
1993238384Sjkim	jmp	.Lctr_enc_done
1994238384Sjkim.align	16
1995238384Sjkim.Lctr_enc_loop_done:
1996238384Sjkim	add	\$8, $len	# undo the subtraction: 1 to 7 blocks remain
1997238384Sjkim	movdqu	0x00($inp), @XMM[8]	# load input
1998238384Sjkim	pxor	@XMM[8], @XMM[0]
1999238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2000238384Sjkim	cmp	\$2,$len
2001238384Sjkim	jb	.Lctr_enc_done
2002238384Sjkim	movdqu	0x10($inp), @XMM[9]
2003238384Sjkim	pxor	@XMM[9], @XMM[1]
2004238384Sjkim	movdqu	@XMM[1], 0x10($out)
2005238384Sjkim	je	.Lctr_enc_done	# same cmp result as the jb above
2006238384Sjkim	movdqu	0x20($inp), @XMM[10]
2007238384Sjkim	pxor	@XMM[10], @XMM[4]
2008238384Sjkim	movdqu	@XMM[4], 0x20($out)
2009238384Sjkim	cmp	\$4,$len
2010238384Sjkim	jb	.Lctr_enc_done
2011238384Sjkim	movdqu	0x30($inp), @XMM[11]
2012238384Sjkim	pxor	@XMM[11], @XMM[6]
2013238384Sjkim	movdqu	@XMM[6], 0x30($out)
2014238384Sjkim	je	.Lctr_enc_done
2015238384Sjkim	movdqu	0x40($inp), @XMM[12]
2016238384Sjkim	pxor	@XMM[12], @XMM[3]
2017238384Sjkim	movdqu	@XMM[3], 0x40($out)
2018238384Sjkim	cmp	\$6,$len
2019238384Sjkim	jb	.Lctr_enc_done
2020238384Sjkim	movdqu	0x50($inp), @XMM[13]
2021238384Sjkim	pxor	@XMM[13], @XMM[7]
2022238384Sjkim	movdqu	@XMM[7], 0x50($out)
2023238384Sjkim	je	.Lctr_enc_done
2024238384Sjkim	movdqu	0x60($inp), @XMM[14]
2025238384Sjkim	pxor	@XMM[14], @XMM[2]
2026238384Sjkim	movdqu	@XMM[2], 0x60($out)
2027238384Sjkim	jmp	.Lctr_enc_done
2028238384Sjkim
2029238384Sjkim.align	16
2030238384Sjkim.Lctr_enc_short:
2031238384Sjkim	lea	0x20(%rbp), $arg1	# in: counter block
2032238384Sjkim	lea	0x30(%rbp), $arg2	# out: key stream block
2033238384Sjkim	lea	($key), $arg3
2034238384Sjkim	call	asm_AES_encrypt
2035238384Sjkim	movdqu	($inp), @XMM[1]
2036238384Sjkim	lea	16($inp), $inp
2037238384Sjkim	mov	0x2c(%rbp), %eax	# load 32-bit counter
2038238384Sjkim	bswap	%eax
2039238384Sjkim	pxor	0x30(%rbp), @XMM[1]
2040238384Sjkim	inc	%eax			# increment
2041238384Sjkim	movdqu	@XMM[1], ($out)
2042238384Sjkim	bswap	%eax
2043238384Sjkim	lea	16($out), $out
2044238384Sjkim	mov	%eax, 0x2c(%rsp)	# save 32-bit counter (%rsp equals %rbp on this path)
2045238384Sjkim	dec	$len
2046238384Sjkim	jnz	.Lctr_enc_short
2047238384Sjkim
2048238384Sjkim.Lctr_enc_done:
2049238384Sjkim	lea	(%rsp), %rax
2050238384Sjkim	pxor	%xmm0, %xmm0
2051238384Sjkim.Lctr_enc_bzero:			# wipe key schedule [if any]
2052238384Sjkim	movdqa	%xmm0, 0x00(%rax)
2053238384Sjkim	movdqa	%xmm0, 0x10(%rax)
2054238384Sjkim	lea	0x20(%rax), %rax
2055238384Sjkim	cmp	%rax, %rbp
2056238384Sjkim	ja	.Lctr_enc_bzero	# loop body runs at least once, then until %rax reaches %rbp
2057238384Sjkim
2058238384Sjkim	lea	(%rbp),%rsp		# restore %rsp
2059238384Sjkim___
2060238384Sjkim$code.=<<___ if ($win64);
2061238384Sjkim	movaps	0x40(%rbp), %xmm6
2062238384Sjkim	movaps	0x50(%rbp), %xmm7
2063238384Sjkim	movaps	0x60(%rbp), %xmm8
2064238384Sjkim	movaps	0x70(%rbp), %xmm9
2065238384Sjkim	movaps	0x80(%rbp), %xmm10
2066238384Sjkim	movaps	0x90(%rbp), %xmm11
2067238384Sjkim	movaps	0xa0(%rbp), %xmm12
2068238384Sjkim	movaps	0xb0(%rbp), %xmm13
2069238384Sjkim	movaps	0xc0(%rbp), %xmm14
2070238384Sjkim	movaps	0xd0(%rbp), %xmm15
2071238384Sjkim	lea	0xa0(%rbp), %rsp	# drop the xmm save area
2072238384Sjkim___
2073238384Sjkim$code.=<<___;
2074238384Sjkim	mov	0x48(%rsp), %r15	# reload the registers pushed in the prologue
2075238384Sjkim	mov	0x50(%rsp), %r14
2076238384Sjkim	mov	0x58(%rsp), %r13
2077238384Sjkim	mov	0x60(%rsp), %r12
2078238384Sjkim	mov	0x68(%rsp), %rbx
2079238384Sjkim	mov	0x70(%rsp), %rax	# saved %rbp, moved into place below
2080238384Sjkim	lea	0x78(%rsp), %rsp
2081238384Sjkim	mov	%rax, %rbp
2082238384Sjkim.Lctr_enc_epilogue:
2083238384Sjkim	ret
2084238384Sjkim.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2085238384Sjkim___
2086238384Sjkim######################################################################
2087238384Sjkim# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2088238384Sjkim#	const AES_KEY *key1, const AES_KEY *key2,
2089238384Sjkim#	const unsigned char iv[16]);
2090238384Sjkim#
2091238384Sjkimmy ($twmask,$twres,$twtmp)=@XMM[13..15];	# XTS tweak helpers: magic mask, carry/residue, broadcast upper bits
2092264331Sjkim$arg6=~s/d$//;	# drop a trailing "d" from the register name; arg6 is used as a pointer below (presumably it was declared as a 32-bit name earlier - confirm)
2093264331Sjkim
2094238384Sjkim$code.=<<___;
#
# bsaes_xts_encrypt: XTS encryption of len bytes.  The initial tweak
# is produced by encrypting the 16 bytes at arg6 with the key at arg5.
# The data key (arg4) is converted to bit-sliced form on the stack and
# whole blocks are processed 8 at a time, the tweak being advanced for
# every block; 1 to 7 leftover blocks go through dedicated tails.
# If len is not a multiple of 16 the remaining bytes are handled by
# the byte-swapping "steal" loop followed by one more block encryption.
# The stack area is wiped before returning.
#
2095238384Sjkim.globl	bsaes_xts_encrypt
2096238384Sjkim.type	bsaes_xts_encrypt,\@abi-omnipotent
2097238384Sjkim.align	16
2098238384Sjkimbsaes_xts_encrypt:
2099238384Sjkim	mov	%rsp, %rax
2100238384Sjkim.Lxts_enc_prologue:
2101238384Sjkim	push	%rbp
2102238384Sjkim	push	%rbx
2103238384Sjkim	push	%r12
2104238384Sjkim	push	%r13
2105238384Sjkim	push	%r14
2106238384Sjkim	push	%r15
2107238384Sjkim	lea	-0x48(%rsp), %rsp	# 0x48 bytes of local scratch
2108238384Sjkim___
2109238384Sjkim$code.=<<___ if ($win64);
2110238384Sjkim	mov	0xa0(%rsp),$arg5	# pull key2
2111238384Sjkim	mov	0xa8(%rsp),$arg6	# pull ivp
2112238384Sjkim	lea	-0xa0(%rsp), %rsp	# room to save xmm6-xmm15
2113238384Sjkim	movaps	%xmm6, 0x40(%rsp)
2114238384Sjkim	movaps	%xmm7, 0x50(%rsp)
2115238384Sjkim	movaps	%xmm8, 0x60(%rsp)
2116238384Sjkim	movaps	%xmm9, 0x70(%rsp)
2117238384Sjkim	movaps	%xmm10, 0x80(%rsp)
2118238384Sjkim	movaps	%xmm11, 0x90(%rsp)
2119238384Sjkim	movaps	%xmm12, 0xa0(%rsp)
2120238384Sjkim	movaps	%xmm13, 0xb0(%rsp)
2121238384Sjkim	movaps	%xmm14, 0xc0(%rsp)
2122238384Sjkim	movaps	%xmm15, 0xd0(%rsp)
2123238384Sjkim.Lxts_enc_body:
2124238384Sjkim___
2125238384Sjkim$code.=<<___;
2126238384Sjkim	mov	%rsp, %rbp		# backup %rsp
2127238384Sjkim	mov	$arg1, $inp		# backup arguments
2128238384Sjkim	mov	$arg2, $out
2129238384Sjkim	mov	$arg3, $len
2130238384Sjkim	mov	$arg4, $key
2131238384Sjkim
2132238384Sjkim	lea	($arg6), $arg1
2133238384Sjkim	lea	0x20(%rbp), $arg2
2134238384Sjkim	lea	($arg5), $arg3
2135238384Sjkim	call	asm_AES_encrypt		# generate initial tweak
2136238384Sjkim
2137238384Sjkim	mov	240($key), %eax		# rounds
2138238384Sjkim	mov	$len, %rbx		# backup $len
2139238384Sjkim
2140238384Sjkim	mov	%eax, %edx		# rounds
2141238384Sjkim	shl	\$7, %rax		# 128 bytes per inner round key
2142238384Sjkim	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2143238384Sjkim	sub	%rax, %rsp
2144238384Sjkim
2145238384Sjkim	mov	%rsp, %rax		# pass key schedule
2146238384Sjkim	mov	$key, %rcx		# pass key
2147238384Sjkim	mov	%edx, %r10d		# pass rounds
2148238384Sjkim	call	_bsaes_key_convert
2149238384Sjkim	pxor	%xmm6, %xmm7		# fix up last round key
2150238384Sjkim	movdqa	%xmm7, (%rax)		# save last round key
2151238384Sjkim
2152238384Sjkim	and	\$-16, $len	# round down to whole 16-byte blocks
2153238384Sjkim	sub	\$0x80, %rsp		# place for tweak[8]
2154238384Sjkim	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2155238384Sjkim
2156238384Sjkim	pxor	$twtmp, $twtmp
2157238384Sjkim	movdqa	.Lxts_magic(%rip), $twmask
2158238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2159238384Sjkim
2160238384Sjkim	sub	\$0x80, $len
2161238384Sjkim	jc	.Lxts_enc_short	# fewer than 8 whole blocks in total
2162238384Sjkim	jmp	.Lxts_enc_loop
2163238384Sjkim
2164238384Sjkim.align	16
2165238384Sjkim.Lxts_enc_loop:
2166238384Sjkim___
2167238384Sjkim    for ($i=0;$i<7;$i++) {	# emit tweak[0..6]; interleaved with loading input i-1 and xoring input i-2
2168238384Sjkim    $code.=<<___;
2169238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2170238384Sjkim	pxor	$twtmp, $twtmp
2171238384Sjkim	movdqa	@XMM[7], @XMM[$i]
2172238384Sjkim	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2173238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2174238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2175238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2176238384Sjkim	pxor	$twres, @XMM[7]
2177238384Sjkim___
2178238384Sjkim    $code.=<<___ if ($i>=1);
2179238384Sjkim	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2180238384Sjkim___
2181238384Sjkim    $code.=<<___ if ($i>=2);
2182238384Sjkim	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2183238384Sjkim___
2184238384Sjkim    }
2185238384Sjkim$code.=<<___;
2186238384Sjkim	movdqu	0x60($inp), @XMM[8+6]
2187238384Sjkim	pxor	@XMM[8+5], @XMM[5]
2188238384Sjkim	movdqu	0x70($inp), @XMM[8+7]
2189238384Sjkim	lea	0x80($inp), $inp
2190238384Sjkim	movdqa	@XMM[7], 0x70(%rsp)	# save the eighth tweak
2191238384Sjkim	pxor	@XMM[8+6], @XMM[6]
2192238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2193238384Sjkim	pxor	@XMM[8+7], @XMM[7]
2194238384Sjkim	mov	%edx, %r10d		# pass rounds
2195238384Sjkim
2196238384Sjkim	call	_bsaes_encrypt8	# results land in XMM indices 0,1,4,6,3,7,2,5
2197238384Sjkim
2198238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2199238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2200238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2201238384Sjkim	pxor	0x20(%rsp), @XMM[4]
2202238384Sjkim	movdqu	@XMM[1], 0x10($out)
2203238384Sjkim	pxor	0x30(%rsp), @XMM[6]
2204238384Sjkim	movdqu	@XMM[4], 0x20($out)
2205238384Sjkim	pxor	0x40(%rsp), @XMM[3]
2206238384Sjkim	movdqu	@XMM[6], 0x30($out)
2207238384Sjkim	pxor	0x50(%rsp), @XMM[7]
2208238384Sjkim	movdqu	@XMM[3], 0x40($out)
2209238384Sjkim	pxor	0x60(%rsp), @XMM[2]
2210238384Sjkim	movdqu	@XMM[7], 0x50($out)
2211238384Sjkim	pxor	0x70(%rsp), @XMM[5]
2212238384Sjkim	movdqu	@XMM[2], 0x60($out)
2213238384Sjkim	movdqu	@XMM[5], 0x70($out)
2214238384Sjkim	lea	0x80($out), $out
2215238384Sjkim
2216238384Sjkim	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2217238384Sjkim	pxor	$twtmp, $twtmp
2218238384Sjkim	movdqa	.Lxts_magic(%rip), $twmask
2219238384Sjkim	pcmpgtd	@XMM[7], $twtmp
2220238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2221238384Sjkim	pxor	$twtmp, $twtmp
2222238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2223238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2224238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2225238384Sjkim	pxor	$twres, @XMM[7]
2226238384Sjkim
2227238384Sjkim	sub	\$0x80,$len
2228238384Sjkim	jnc	.Lxts_enc_loop
2229238384Sjkim
2230238384Sjkim.Lxts_enc_short:
2231238384Sjkim	add	\$0x80, $len	# undo the subtraction: 0 to 0x70 bytes of whole blocks remain
2232238384Sjkim	jz	.Lxts_enc_done
2233238384Sjkim___
2234238384Sjkim    for ($i=0;$i<7;$i++) {	# same tweak pipeline as the main loop, plus a dispatch on the number of blocks left
2235238384Sjkim    $code.=<<___;
2236238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2237238384Sjkim	pxor	$twtmp, $twtmp
2238238384Sjkim	movdqa	@XMM[7], @XMM[$i]
2239238384Sjkim	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2240238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2241238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2242238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2243238384Sjkim	pxor	$twres, @XMM[7]
2244238384Sjkim___
2245238384Sjkim    $code.=<<___ if ($i>=1);
2246238384Sjkim	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2247238384Sjkim	cmp	\$`0x10*$i`,$len
2248238384Sjkim	je	.Lxts_enc_$i
2249238384Sjkim___
2250238384Sjkim    $code.=<<___ if ($i>=2);
2251238384Sjkim	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2252238384Sjkim___
2253238384Sjkim    }
2254238384Sjkim$code.=<<___;
2255238384Sjkim	movdqu	0x60($inp), @XMM[8+6]
2256238384Sjkim	pxor	@XMM[8+5], @XMM[5]
2257238384Sjkim	movdqa	@XMM[7], 0x70(%rsp)
2258238384Sjkim	lea	0x70($inp), $inp
2259238384Sjkim	pxor	@XMM[8+6], @XMM[6]
2260238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2261238384Sjkim	mov	%edx, %r10d		# pass rounds
2262238384Sjkim
2263238384Sjkim	call	_bsaes_encrypt8	# only the first 7 results are stored below
2264238384Sjkim
2265238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2266238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2267238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2268238384Sjkim	pxor	0x20(%rsp), @XMM[4]
2269238384Sjkim	movdqu	@XMM[1], 0x10($out)
2270238384Sjkim	pxor	0x30(%rsp), @XMM[6]
2271238384Sjkim	movdqu	@XMM[4], 0x20($out)
2272238384Sjkim	pxor	0x40(%rsp), @XMM[3]
2273238384Sjkim	movdqu	@XMM[6], 0x30($out)
2274238384Sjkim	pxor	0x50(%rsp), @XMM[7]
2275238384Sjkim	movdqu	@XMM[3], 0x40($out)
2276238384Sjkim	pxor	0x60(%rsp), @XMM[2]
2277238384Sjkim	movdqu	@XMM[7], 0x50($out)
2278238384Sjkim	movdqu	@XMM[2], 0x60($out)
2279238384Sjkim	lea	0x70($out), $out
2280238384Sjkim
2281238384Sjkim	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2282238384Sjkim	jmp	.Lxts_enc_done
2283238384Sjkim.align	16
2284238384Sjkim.Lxts_enc_6:
2285238384Sjkim	pxor	@XMM[8+4], @XMM[4]
2286238384Sjkim	lea	0x60($inp), $inp
2287238384Sjkim	pxor	@XMM[8+5], @XMM[5]
2288238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2289238384Sjkim	mov	%edx, %r10d		# pass rounds
2290238384Sjkim
2291238384Sjkim	call	_bsaes_encrypt8	# only the first 6 results are stored below
2292238384Sjkim
2293238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2294238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2295238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2296238384Sjkim	pxor	0x20(%rsp), @XMM[4]
2297238384Sjkim	movdqu	@XMM[1], 0x10($out)
2298238384Sjkim	pxor	0x30(%rsp), @XMM[6]
2299238384Sjkim	movdqu	@XMM[4], 0x20($out)
2300238384Sjkim	pxor	0x40(%rsp), @XMM[3]
2301238384Sjkim	movdqu	@XMM[6], 0x30($out)
2302238384Sjkim	pxor	0x50(%rsp), @XMM[7]
2303238384Sjkim	movdqu	@XMM[3], 0x40($out)
2304238384Sjkim	movdqu	@XMM[7], 0x50($out)
2305238384Sjkim	lea	0x60($out), $out
2306238384Sjkim
2307238384Sjkim	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2308238384Sjkim	jmp	.Lxts_enc_done
2309238384Sjkim.align	16
2310238384Sjkim.Lxts_enc_5:
2311238384Sjkim	pxor	@XMM[8+3], @XMM[3]
2312238384Sjkim	lea	0x50($inp), $inp
2313238384Sjkim	pxor	@XMM[8+4], @XMM[4]
2314238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2315238384Sjkim	mov	%edx, %r10d		# pass rounds
2316238384Sjkim
2317238384Sjkim	call	_bsaes_encrypt8	# only the first 5 results are stored below
2318238384Sjkim
2319238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2320238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2321238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2322238384Sjkim	pxor	0x20(%rsp), @XMM[4]
2323238384Sjkim	movdqu	@XMM[1], 0x10($out)
2324238384Sjkim	pxor	0x30(%rsp), @XMM[6]
2325238384Sjkim	movdqu	@XMM[4], 0x20($out)
2326238384Sjkim	pxor	0x40(%rsp), @XMM[3]
2327238384Sjkim	movdqu	@XMM[6], 0x30($out)
2328238384Sjkim	movdqu	@XMM[3], 0x40($out)
2329238384Sjkim	lea	0x50($out), $out
2330238384Sjkim
2331238384Sjkim	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2332238384Sjkim	jmp	.Lxts_enc_done
2333238384Sjkim.align	16
2334238384Sjkim.Lxts_enc_4:
2335238384Sjkim	pxor	@XMM[8+2], @XMM[2]
2336238384Sjkim	lea	0x40($inp), $inp
2337238384Sjkim	pxor	@XMM[8+3], @XMM[3]
2338238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2339238384Sjkim	mov	%edx, %r10d		# pass rounds
2340238384Sjkim
2341238384Sjkim	call	_bsaes_encrypt8	# only the first 4 results are stored below
2342238384Sjkim
2343238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2344238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2345238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2346238384Sjkim	pxor	0x20(%rsp), @XMM[4]
2347238384Sjkim	movdqu	@XMM[1], 0x10($out)
2348238384Sjkim	pxor	0x30(%rsp), @XMM[6]
2349238384Sjkim	movdqu	@XMM[4], 0x20($out)
2350238384Sjkim	movdqu	@XMM[6], 0x30($out)
2351238384Sjkim	lea	0x40($out), $out
2352238384Sjkim
2353238384Sjkim	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2354238384Sjkim	jmp	.Lxts_enc_done
2355238384Sjkim.align	16
2356238384Sjkim.Lxts_enc_3:
2357238384Sjkim	pxor	@XMM[8+1], @XMM[1]
2358238384Sjkim	lea	0x30($inp), $inp
2359238384Sjkim	pxor	@XMM[8+2], @XMM[2]
2360238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2361238384Sjkim	mov	%edx, %r10d		# pass rounds
2362238384Sjkim
2363238384Sjkim	call	_bsaes_encrypt8	# only the first 3 results are stored below
2364238384Sjkim
2365238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2366238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2367238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2368238384Sjkim	pxor	0x20(%rsp), @XMM[4]
2369238384Sjkim	movdqu	@XMM[1], 0x10($out)
2370238384Sjkim	movdqu	@XMM[4], 0x20($out)
2371238384Sjkim	lea	0x30($out), $out
2372238384Sjkim
2373238384Sjkim	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2374238384Sjkim	jmp	.Lxts_enc_done
2375238384Sjkim.align	16
2376238384Sjkim.Lxts_enc_2:
2377238384Sjkim	pxor	@XMM[8+0], @XMM[0]
2378238384Sjkim	lea	0x20($inp), $inp
2379238384Sjkim	pxor	@XMM[8+1], @XMM[1]
2380238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2381238384Sjkim	mov	%edx, %r10d		# pass rounds
2382238384Sjkim
2383238384Sjkim	call	_bsaes_encrypt8	# only the first 2 results are stored below
2384238384Sjkim
2385238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2386238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2387238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2388238384Sjkim	movdqu	@XMM[1], 0x10($out)
2389238384Sjkim	lea	0x20($out), $out
2390238384Sjkim
2391238384Sjkim	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2392238384Sjkim	jmp	.Lxts_enc_done
2393238384Sjkim.align	16
2394238384Sjkim.Lxts_enc_1:
2395238384Sjkim	pxor	@XMM[0], @XMM[8]	# input ^ tweak[0]
2396238384Sjkim	lea	0x10($inp), $inp
2397238384Sjkim	movdqa	@XMM[8], 0x20(%rbp)
2398238384Sjkim	lea	0x20(%rbp), $arg1	# encrypt the scratch block in place
2399238384Sjkim	lea	0x20(%rbp), $arg2
2400238384Sjkim	lea	($key), $arg3
2401238384Sjkim	call	asm_AES_encrypt		# doesn't touch %xmm
2402238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2403238384Sjkim	#pxor	@XMM[8], @XMM[0]
2404238384Sjkim	#lea	0x80(%rsp), %rax	# pass key schedule
2405238384Sjkim	#mov	%edx, %r10d		# pass rounds
2406238384Sjkim	#call	_bsaes_encrypt8
2407238384Sjkim	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2408238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2409238384Sjkim	lea	0x10($out), $out
2410238384Sjkim
2411238384Sjkim	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2412238384Sjkim
2413238384Sjkim.Lxts_enc_done:
2414238384Sjkim	and	\$15, %ebx	# bytes in the trailing partial block (%rbx holds the original len)
2415238384Sjkim	jz	.Lxts_enc_ret
2416238384Sjkim	mov	$out, %rdx
2417238384Sjkim
2418238384Sjkim.Lxts_enc_steal:
2419238384Sjkim	movzb	($inp), %eax	# next byte of the partial input block
2420238384Sjkim	movzb	-16(%rdx), %ecx	# matching byte of the last full output block
2421238384Sjkim	lea	1($inp), $inp
2422238384Sjkim	mov	%al, -16(%rdx)	# input byte replaces it ...
2423238384Sjkim	mov	%cl, 0(%rdx)	# ... and the displaced byte becomes tail output
2424238384Sjkim	lea	1(%rdx), %rdx
2425238384Sjkim	sub	\$1,%ebx
2426238384Sjkim	jnz	.Lxts_enc_steal
2427238384Sjkim
2428238384Sjkim	movdqu	-16($out), @XMM[0]	# merged block built by the loop above
2429238384Sjkim	lea	0x20(%rbp), $arg1
2430238384Sjkim	pxor	@XMM[7], @XMM[0]
2431238384Sjkim	lea	0x20(%rbp), $arg2
2432238384Sjkim	movdqa	@XMM[0], 0x20(%rbp)
2433238384Sjkim	lea	($key), $arg3
2434238384Sjkim	call	asm_AES_encrypt		# doesn't touch %xmm
2435238384Sjkim	pxor	0x20(%rbp), @XMM[7]
2436238384Sjkim	movdqu	@XMM[7], -16($out)
2437238384Sjkim
2438238384Sjkim.Lxts_enc_ret:
2439238384Sjkim	lea	(%rsp), %rax
2440238384Sjkim	pxor	%xmm0, %xmm0
2441238384Sjkim.Lxts_enc_bzero:			# wipe key schedule [if any]
2442238384Sjkim	movdqa	%xmm0, 0x00(%rax)
2443238384Sjkim	movdqa	%xmm0, 0x10(%rax)
2444238384Sjkim	lea	0x20(%rax), %rax
2445238384Sjkim	cmp	%rax, %rbp
2446238384Sjkim	ja	.Lxts_enc_bzero	# clears from %rsp up to %rbp: tweak area and key schedule
2447238384Sjkim
2448238384Sjkim	lea	(%rbp),%rsp		# restore %rsp
2449238384Sjkim___
2450238384Sjkim$code.=<<___ if ($win64);
2451238384Sjkim	movaps	0x40(%rbp), %xmm6
2452238384Sjkim	movaps	0x50(%rbp), %xmm7
2453238384Sjkim	movaps	0x60(%rbp), %xmm8
2454238384Sjkim	movaps	0x70(%rbp), %xmm9
2455238384Sjkim	movaps	0x80(%rbp), %xmm10
2456238384Sjkim	movaps	0x90(%rbp), %xmm11
2457238384Sjkim	movaps	0xa0(%rbp), %xmm12
2458238384Sjkim	movaps	0xb0(%rbp), %xmm13
2459238384Sjkim	movaps	0xc0(%rbp), %xmm14
2460238384Sjkim	movaps	0xd0(%rbp), %xmm15
2461238384Sjkim	lea	0xa0(%rbp), %rsp	# drop the xmm save area
2462238384Sjkim___
2463238384Sjkim$code.=<<___;
2464238384Sjkim	mov	0x48(%rsp), %r15	# reload the registers pushed in the prologue
2465238384Sjkim	mov	0x50(%rsp), %r14
2466238384Sjkim	mov	0x58(%rsp), %r13
2467238384Sjkim	mov	0x60(%rsp), %r12
2468238384Sjkim	mov	0x68(%rsp), %rbx
2469238384Sjkim	mov	0x70(%rsp), %rax	# saved %rbp, moved into place below
2470238384Sjkim	lea	0x78(%rsp), %rsp
2471238384Sjkim	mov	%rax, %rbp
2472238384Sjkim.Lxts_enc_epilogue:
2473238384Sjkim	ret
2474238384Sjkim.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
2475238384Sjkim
2476238384Sjkim.globl	bsaes_xts_decrypt
2477238384Sjkim.type	bsaes_xts_decrypt,\@abi-omnipotent
2478238384Sjkim.align	16
2479238384Sjkimbsaes_xts_decrypt:
2480238384Sjkim	mov	%rsp, %rax
2481238384Sjkim.Lxts_dec_prologue:
2482238384Sjkim	push	%rbp
2483238384Sjkim	push	%rbx
2484238384Sjkim	push	%r12
2485238384Sjkim	push	%r13
2486238384Sjkim	push	%r14
2487238384Sjkim	push	%r15
2488238384Sjkim	lea	-0x48(%rsp), %rsp
2489238384Sjkim___
2490238384Sjkim$code.=<<___ if ($win64);
2491238384Sjkim	mov	0xa0(%rsp),$arg5	# pull key2
2492238384Sjkim	mov	0xa8(%rsp),$arg6	# pull ivp
2493238384Sjkim	lea	-0xa0(%rsp), %rsp
2494238384Sjkim	movaps	%xmm6, 0x40(%rsp)
2495238384Sjkim	movaps	%xmm7, 0x50(%rsp)
2496238384Sjkim	movaps	%xmm8, 0x60(%rsp)
2497238384Sjkim	movaps	%xmm9, 0x70(%rsp)
2498238384Sjkim	movaps	%xmm10, 0x80(%rsp)
2499238384Sjkim	movaps	%xmm11, 0x90(%rsp)
2500238384Sjkim	movaps	%xmm12, 0xa0(%rsp)
2501238384Sjkim	movaps	%xmm13, 0xb0(%rsp)
2502238384Sjkim	movaps	%xmm14, 0xc0(%rsp)
2503238384Sjkim	movaps	%xmm15, 0xd0(%rsp)
2504238384Sjkim.Lxts_dec_body:
2505238384Sjkim___
# Common entry sequence of bsaes_xts_decrypt (emitted for every ABI):
#  - %rbp keeps a copy of %rsp so the fixed part of the frame can still be
#    addressed after the variable-sized stack allocations below;
#  - the four register arguments are copied to $inp/$out/$len/$key;
#  - asm_AES_encrypt is called on the block at ($arg6) [ivp] with the key at
#    ($arg5) [key2], writing to 0x20(%rbp): this is the initial tweak;
#  - rounds*128-96 bytes are carved off the stack for the bit-sliced key
#    schedule, which _bsaes_key_convert fills in; the round 0 and last round
#    keys are then fixed up from %xmm7/%xmm6;
#  - $len is rounded down to a multiple of 16 and, when the original length
#    (kept in %rbx) is not a multiple of 16, reduced by one more block; that
#    block is processed by the .Lxts_dec_done tail;
#  - another 0x80 bytes are reserved for tweak[8], so from here on the key
#    schedule is found at 0x80(%rsp);
#  - with fewer than 0x80 bytes left control goes to .Lxts_dec_short,
#    otherwise to the 8-block main loop.
2506238384Sjkim$code.=<<___;
2507238384Sjkim	mov	%rsp, %rbp		# backup %rsp
2508238384Sjkim	mov	$arg1, $inp		# backup arguments
2509238384Sjkim	mov	$arg2, $out
2510238384Sjkim	mov	$arg3, $len
2511238384Sjkim	mov	$arg4, $key
2512238384Sjkim
2513238384Sjkim	lea	($arg6), $arg1
2514238384Sjkim	lea	0x20(%rbp), $arg2
2515238384Sjkim	lea	($arg5), $arg3
2516238384Sjkim	call	asm_AES_encrypt		# generate initial tweak
2517238384Sjkim
2518238384Sjkim	mov	240($key), %eax		# rounds
2519238384Sjkim	mov	$len, %rbx		# backup $len
2520238384Sjkim
2521238384Sjkim	mov	%eax, %edx		# rounds
2522238384Sjkim	shl	\$7, %rax		# 128 bytes per inner round key
2523238384Sjkim	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2524238384Sjkim	sub	%rax, %rsp
2525238384Sjkim
2526238384Sjkim	mov	%rsp, %rax		# pass key schedule
2527238384Sjkim	mov	$key, %rcx		# pass key
2528238384Sjkim	mov	%edx, %r10d		# pass rounds
2529238384Sjkim	call	_bsaes_key_convert
2530238384Sjkim	pxor	(%rsp), %xmm7		# fix up round 0 key
2531238384Sjkim	movdqa	%xmm6, (%rax)		# save last round key
2532238384Sjkim	movdqa	%xmm7, (%rsp)
2533238384Sjkim
2534238384Sjkim	xor	%eax, %eax		# if ($len%16) len-=16;
2535238384Sjkim	and	\$-16, $len
2536238384Sjkim	test	\$15, %ebx
2537238384Sjkim	setnz	%al
2538238384Sjkim	shl	\$4, %rax
2539238384Sjkim	sub	%rax, $len
2540238384Sjkim
2541238384Sjkim	sub	\$0x80, %rsp		# place for tweak[8]
2542238384Sjkim	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2543238384Sjkim
2544238384Sjkim	pxor	$twtmp, $twtmp
2545238384Sjkim	movdqa	.Lxts_magic(%rip), $twmask
2546238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2547238384Sjkim
2548238384Sjkim	sub	\$0x80, $len
2549238384Sjkim	jc	.Lxts_dec_short
2550238384Sjkim	jmp	.Lxts_dec_loop
2551238384Sjkim
2552238384Sjkim.align	16
2553238384Sjkim.Lxts_dec_loop:
2554238384Sjkim___
# Main loop, part 1: emit seven unrolled tweak steps ($i = 0..6).
# Each step copies the current tweak from @XMM[7] into @XMM[$i] and into the
# tweak[$i] stack slot, then advances @XMM[7] to the next tweak: paddq doubles
# both 64-bit halves, while $twres (sign mask from pcmpgtd, shuffled with
# pshufd 0x13 and masked with .Lxts_magic) is xor-ed in to supply the carry
# between the halves and the reduction residue.
2555238384Sjkim    for ($i=0;$i<7;$i++) {
2556238384Sjkim    $code.=<<___;
2557238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2558238384Sjkim	pxor	$twtmp, $twtmp
2559238384Sjkim	movdqa	@XMM[7], @XMM[$i]
2560238384Sjkim	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2561238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2562238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2563238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2564238384Sjkim	pxor	$twres, @XMM[7]
2565238384Sjkim___
    # From the second step on, interleave the load of input block $i-1.
2566238384Sjkim    $code.=<<___ if ($i>=1);
2567238384Sjkim	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2568238384Sjkim___
    # From the third step on, xor the block loaded one step earlier with its
    # tweak.
2569238384Sjkim    $code.=<<___ if ($i>=2);
2570238384Sjkim	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2571238384Sjkim___
2572238384Sjkim    }
# Main loop, part 2: load input blocks 6 and 7, finish xor-ing inputs with
# their tweaks (the eighth tweak is parked at 0x70(%rsp) because @XMM[7] is
# reused for data), and call _bsaes_decrypt8 with the key schedule at
# 0x80(%rsp) and the round count in %r10d.
# The results are xor-ed with tweak[0..7] and stored taking the registers in
# the order 0,1,6,4,2,7,3,5 (presumably the order in which _bsaes_decrypt8
# leaves the eight blocks; that routine is defined elsewhere in this file).
# Afterwards tweak[7] is reloaded and advanced once to seed the next
# iteration, and the loop repeats while another 0x80 bytes are available.
# .Lxts_dec_short undoes the last subtraction; a zero remainder goes straight
# to .Lxts_dec_done.
2573238384Sjkim$code.=<<___;
2574238384Sjkim	movdqu	0x60($inp), @XMM[8+6]
2575238384Sjkim	pxor	@XMM[8+5], @XMM[5]
2576238384Sjkim	movdqu	0x70($inp), @XMM[8+7]
2577238384Sjkim	lea	0x80($inp), $inp
2578238384Sjkim	movdqa	@XMM[7], 0x70(%rsp)
2579238384Sjkim	pxor	@XMM[8+6], @XMM[6]
2580238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2581238384Sjkim	pxor	@XMM[8+7], @XMM[7]
2582238384Sjkim	mov	%edx, %r10d		# pass rounds
2583238384Sjkim
2584238384Sjkim	call	_bsaes_decrypt8
2585238384Sjkim
2586238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2587238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2588238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2589238384Sjkim	pxor	0x20(%rsp), @XMM[6]
2590238384Sjkim	movdqu	@XMM[1], 0x10($out)
2591238384Sjkim	pxor	0x30(%rsp), @XMM[4]
2592238384Sjkim	movdqu	@XMM[6], 0x20($out)
2593238384Sjkim	pxor	0x40(%rsp), @XMM[2]
2594238384Sjkim	movdqu	@XMM[4], 0x30($out)
2595238384Sjkim	pxor	0x50(%rsp), @XMM[7]
2596238384Sjkim	movdqu	@XMM[2], 0x40($out)
2597238384Sjkim	pxor	0x60(%rsp), @XMM[3]
2598238384Sjkim	movdqu	@XMM[7], 0x50($out)
2599238384Sjkim	pxor	0x70(%rsp), @XMM[5]
2600238384Sjkim	movdqu	@XMM[3], 0x60($out)
2601238384Sjkim	movdqu	@XMM[5], 0x70($out)
2602238384Sjkim	lea	0x80($out), $out
2603238384Sjkim
2604238384Sjkim	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2605238384Sjkim	pxor	$twtmp, $twtmp
2606238384Sjkim	movdqa	.Lxts_magic(%rip), $twmask
2607238384Sjkim	pcmpgtd	@XMM[7], $twtmp
2608238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2609238384Sjkim	pxor	$twtmp, $twtmp
2610238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2611238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2612238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2613238384Sjkim	pxor	$twres, @XMM[7]
2614238384Sjkim
2615238384Sjkim	sub	\$0x80,$len
2616238384Sjkim	jnc	.Lxts_dec_loop
2617238384Sjkim
2618238384Sjkim.Lxts_dec_short:
2619238384Sjkim	add	\$0x80, $len
2620238384Sjkim	jz	.Lxts_dec_done
2621238384Sjkim___
# Short path (1..7 whole blocks left), part 1: the same seven unrolled tweak
# steps as in the main loop, with an early exit after every block load.
2622238384Sjkim    for ($i=0;$i<7;$i++) {
2623238384Sjkim    $code.=<<___;
2624238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2625238384Sjkim	pxor	$twtmp, $twtmp
2626238384Sjkim	movdqa	@XMM[7], @XMM[$i]
2627238384Sjkim	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2628238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2629238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2630238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2631238384Sjkim	pxor	$twres, @XMM[7]
2632238384Sjkim___
    # After loading input block $i-1, leave for the dedicated tail
    # .Lxts_dec_$i when exactly $i blocks (0x10*$i bytes) remain.
2633238384Sjkim    $code.=<<___ if ($i>=1);
2634238384Sjkim	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2635238384Sjkim	cmp	\$`0x10*$i`,$len
2636238384Sjkim	je	.Lxts_dec_$i
2637238384Sjkim___
    # Otherwise keep going: xor the block loaded one step earlier with its
    # tweak.
2638238384Sjkim    $code.=<<___ if ($i>=2);
2639238384Sjkim	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2640238384Sjkim___
2641238384Sjkim    }
# Short path, part 2, followed by the ciphertext-stealing tail and cleanup.
#
# Fall-through (no .Lxts_dec_N branch taken) handles 7 remaining blocks;
# .Lxts_dec_6 ... .Lxts_dec_2 handle 6..2 blocks.  Each variant finishes the
# pending input^tweak xors, advances $inp, calls _bsaes_decrypt8, xors the
# results with the saved tweaks, stores N blocks, advances $out and reloads
# the first unused tweak from the tweak[] area into @XMM[7].
#
# .Lxts_dec_1 handles a single block without the 8-way routine: the
# input^tweak value is staged at 0x20(%rbp) and decrypted in place with
# asm_AES_decrypt (the disabled _bsaes_decrypt8 alternative is kept as
# commented-out assembly).
#
# .Lxts_dec_done: when the original length (low bits of %ebx) is a multiple
# of 16 nothing is left to do.  Otherwise the current tweak is kept in
# @XMM[6] and the following tweak is computed in @XMM[7]; the last whole
# input block is decrypted with that following tweak and written to ($out).
# .Lxts_dec_steal then runs once per trailing byte: the input byte at
# 16($inp) replaces the byte at (%rdx) and the displaced byte is written to
# 16(%rdx), which produces the final partial block.  Finally the merged block
# at ($out) is decrypted with the tweak kept in @XMM[6].
#
# .Lxts_dec_ret zeroes the stack from %rsp up to %rbp in 32-byte steps (key
# schedule and tweak area) and restores %rsp from %rbp.
2642238384Sjkim$code.=<<___;
2643238384Sjkim	movdqu	0x60($inp), @XMM[8+6]
2644238384Sjkim	pxor	@XMM[8+5], @XMM[5]
2645238384Sjkim	movdqa	@XMM[7], 0x70(%rsp)
2646238384Sjkim	lea	0x70($inp), $inp
2647238384Sjkim	pxor	@XMM[8+6], @XMM[6]
2648238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2649238384Sjkim	mov	%edx, %r10d		# pass rounds
2650238384Sjkim
2651238384Sjkim	call	_bsaes_decrypt8
2652238384Sjkim
2653238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2654238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2655238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2656238384Sjkim	pxor	0x20(%rsp), @XMM[6]
2657238384Sjkim	movdqu	@XMM[1], 0x10($out)
2658238384Sjkim	pxor	0x30(%rsp), @XMM[4]
2659238384Sjkim	movdqu	@XMM[6], 0x20($out)
2660238384Sjkim	pxor	0x40(%rsp), @XMM[2]
2661238384Sjkim	movdqu	@XMM[4], 0x30($out)
2662238384Sjkim	pxor	0x50(%rsp), @XMM[7]
2663238384Sjkim	movdqu	@XMM[2], 0x40($out)
2664238384Sjkim	pxor	0x60(%rsp), @XMM[3]
2665238384Sjkim	movdqu	@XMM[7], 0x50($out)
2666238384Sjkim	movdqu	@XMM[3], 0x60($out)
2667238384Sjkim	lea	0x70($out), $out
2668238384Sjkim
2669238384Sjkim	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2670238384Sjkim	jmp	.Lxts_dec_done
2671238384Sjkim.align	16
2672238384Sjkim.Lxts_dec_6:
2673238384Sjkim	pxor	@XMM[8+4], @XMM[4]
2674238384Sjkim	lea	0x60($inp), $inp
2675238384Sjkim	pxor	@XMM[8+5], @XMM[5]
2676238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2677238384Sjkim	mov	%edx, %r10d		# pass rounds
2678238384Sjkim
2679238384Sjkim	call	_bsaes_decrypt8
2680238384Sjkim
2681238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2682238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2683238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2684238384Sjkim	pxor	0x20(%rsp), @XMM[6]
2685238384Sjkim	movdqu	@XMM[1], 0x10($out)
2686238384Sjkim	pxor	0x30(%rsp), @XMM[4]
2687238384Sjkim	movdqu	@XMM[6], 0x20($out)
2688238384Sjkim	pxor	0x40(%rsp), @XMM[2]
2689238384Sjkim	movdqu	@XMM[4], 0x30($out)
2690238384Sjkim	pxor	0x50(%rsp), @XMM[7]
2691238384Sjkim	movdqu	@XMM[2], 0x40($out)
2692238384Sjkim	movdqu	@XMM[7], 0x50($out)
2693238384Sjkim	lea	0x60($out), $out
2694238384Sjkim
2695238384Sjkim	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2696238384Sjkim	jmp	.Lxts_dec_done
2697238384Sjkim.align	16
2698238384Sjkim.Lxts_dec_5:
2699238384Sjkim	pxor	@XMM[8+3], @XMM[3]
2700238384Sjkim	lea	0x50($inp), $inp
2701238384Sjkim	pxor	@XMM[8+4], @XMM[4]
2702238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2703238384Sjkim	mov	%edx, %r10d		# pass rounds
2704238384Sjkim
2705238384Sjkim	call	_bsaes_decrypt8
2706238384Sjkim
2707238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2708238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2709238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2710238384Sjkim	pxor	0x20(%rsp), @XMM[6]
2711238384Sjkim	movdqu	@XMM[1], 0x10($out)
2712238384Sjkim	pxor	0x30(%rsp), @XMM[4]
2713238384Sjkim	movdqu	@XMM[6], 0x20($out)
2714238384Sjkim	pxor	0x40(%rsp), @XMM[2]
2715238384Sjkim	movdqu	@XMM[4], 0x30($out)
2716238384Sjkim	movdqu	@XMM[2], 0x40($out)
2717238384Sjkim	lea	0x50($out), $out
2718238384Sjkim
2719238384Sjkim	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2720238384Sjkim	jmp	.Lxts_dec_done
2721238384Sjkim.align	16
2722238384Sjkim.Lxts_dec_4:
2723238384Sjkim	pxor	@XMM[8+2], @XMM[2]
2724238384Sjkim	lea	0x40($inp), $inp
2725238384Sjkim	pxor	@XMM[8+3], @XMM[3]
2726238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2727238384Sjkim	mov	%edx, %r10d		# pass rounds
2728238384Sjkim
2729238384Sjkim	call	_bsaes_decrypt8
2730238384Sjkim
2731238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2732238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2733238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2734238384Sjkim	pxor	0x20(%rsp), @XMM[6]
2735238384Sjkim	movdqu	@XMM[1], 0x10($out)
2736238384Sjkim	pxor	0x30(%rsp), @XMM[4]
2737238384Sjkim	movdqu	@XMM[6], 0x20($out)
2738238384Sjkim	movdqu	@XMM[4], 0x30($out)
2739238384Sjkim	lea	0x40($out), $out
2740238384Sjkim
2741238384Sjkim	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2742238384Sjkim	jmp	.Lxts_dec_done
2743238384Sjkim.align	16
2744238384Sjkim.Lxts_dec_3:
2745238384Sjkim	pxor	@XMM[8+1], @XMM[1]
2746238384Sjkim	lea	0x30($inp), $inp
2747238384Sjkim	pxor	@XMM[8+2], @XMM[2]
2748238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2749238384Sjkim	mov	%edx, %r10d		# pass rounds
2750238384Sjkim
2751238384Sjkim	call	_bsaes_decrypt8
2752238384Sjkim
2753238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2754238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2755238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2756238384Sjkim	pxor	0x20(%rsp), @XMM[6]
2757238384Sjkim	movdqu	@XMM[1], 0x10($out)
2758238384Sjkim	movdqu	@XMM[6], 0x20($out)
2759238384Sjkim	lea	0x30($out), $out
2760238384Sjkim
2761238384Sjkim	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2762238384Sjkim	jmp	.Lxts_dec_done
2763238384Sjkim.align	16
2764238384Sjkim.Lxts_dec_2:
2765238384Sjkim	pxor	@XMM[8+0], @XMM[0]
2766238384Sjkim	lea	0x20($inp), $inp
2767238384Sjkim	pxor	@XMM[8+1], @XMM[1]
2768238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2769238384Sjkim	mov	%edx, %r10d		# pass rounds
2770238384Sjkim
2771238384Sjkim	call	_bsaes_decrypt8
2772238384Sjkim
2773238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2774238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2775238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2776238384Sjkim	movdqu	@XMM[1], 0x10($out)
2777238384Sjkim	lea	0x20($out), $out
2778238384Sjkim
2779238384Sjkim	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2780238384Sjkim	jmp	.Lxts_dec_done
2781238384Sjkim.align	16
2782238384Sjkim.Lxts_dec_1:
2783238384Sjkim	pxor	@XMM[0], @XMM[8]
2784238384Sjkim	lea	0x10($inp), $inp
2785238384Sjkim	movdqa	@XMM[8], 0x20(%rbp)
2786238384Sjkim	lea	0x20(%rbp), $arg1
2787238384Sjkim	lea	0x20(%rbp), $arg2
2788238384Sjkim	lea	($key), $arg3
2789238384Sjkim	call	asm_AES_decrypt		# doesn't touch %xmm
2790238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2791238384Sjkim	#pxor	@XMM[8], @XMM[0]
2792238384Sjkim	#lea	0x80(%rsp), %rax	# pass key schedule
2793238384Sjkim	#mov	%edx, %r10d		# pass rounds
2794238384Sjkim	#call	_bsaes_decrypt8
2795238384Sjkim	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2796238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2797238384Sjkim	lea	0x10($out), $out
2798238384Sjkim
2799238384Sjkim	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2800238384Sjkim
2801238384Sjkim.Lxts_dec_done:
2802238384Sjkim	and	\$15, %ebx
2803238384Sjkim	jz	.Lxts_dec_ret
2804238384Sjkim
2805238384Sjkim	pxor	$twtmp, $twtmp
2806238384Sjkim	movdqa	.Lxts_magic(%rip), $twmask
2807238384Sjkim	pcmpgtd	@XMM[7], $twtmp
2808238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2809238384Sjkim	movdqa	@XMM[7], @XMM[6]
2810238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
2811238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2812238384Sjkim	movdqu	($inp), @XMM[0]
2813238384Sjkim	pxor	$twres, @XMM[7]
2814238384Sjkim
2815238384Sjkim	lea	0x20(%rbp), $arg1
2816238384Sjkim	pxor	@XMM[7], @XMM[0]
2817238384Sjkim	lea	0x20(%rbp), $arg2
2818238384Sjkim	movdqa	@XMM[0], 0x20(%rbp)
2819238384Sjkim	lea	($key), $arg3
2820238384Sjkim	call	asm_AES_decrypt		# doesn't touch %xmm
2821238384Sjkim	pxor	0x20(%rbp), @XMM[7]
2822238384Sjkim	mov	$out, %rdx
2823238384Sjkim	movdqu	@XMM[7], ($out)
2824238384Sjkim
2825238384Sjkim.Lxts_dec_steal:
2826238384Sjkim	movzb	16($inp), %eax
2827238384Sjkim	movzb	(%rdx), %ecx
2828238384Sjkim	lea	1($inp), $inp
2829238384Sjkim	mov	%al, (%rdx)
2830238384Sjkim	mov	%cl, 16(%rdx)
2831238384Sjkim	lea	1(%rdx), %rdx
2832238384Sjkim	sub	\$1,%ebx
2833238384Sjkim	jnz	.Lxts_dec_steal
2834238384Sjkim
2835238384Sjkim	movdqu	($out), @XMM[0]
2836238384Sjkim	lea	0x20(%rbp), $arg1
2837238384Sjkim	pxor	@XMM[6], @XMM[0]
2838238384Sjkim	lea	0x20(%rbp), $arg2
2839238384Sjkim	movdqa	@XMM[0], 0x20(%rbp)
2840238384Sjkim	lea	($key), $arg3
2841238384Sjkim	call	asm_AES_decrypt		# doesn't touch %xmm
2842238384Sjkim	pxor	0x20(%rbp), @XMM[6]
2843238384Sjkim	movdqu	@XMM[6], ($out)
2844238384Sjkim
2845238384Sjkim.Lxts_dec_ret:
2846238384Sjkim	lea	(%rsp), %rax
2847238384Sjkim	pxor	%xmm0, %xmm0
2848238384Sjkim.Lxts_dec_bzero:			# wipe key schedule [if any]
2849238384Sjkim	movdqa	%xmm0, 0x00(%rax)
2850238384Sjkim	movdqa	%xmm0, 0x10(%rax)
2851238384Sjkim	lea	0x20(%rax), %rax
2852238384Sjkim	cmp	%rax, %rbp
2853238384Sjkim	ja	.Lxts_dec_bzero
2854238384Sjkim
2855238384Sjkim	lea	(%rbp),%rsp		# restore %rsp
2856238384Sjkim___
# Win64-only part of the bsaes_xts_decrypt epilogue: reload %xmm6-%xmm15 from
# the slots at 0x40..0xd0(%rbp) written by the prologue, then drop the extra
# 0xa0-byte save area so %rsp matches the frame layout of the common epilogue.
2857238384Sjkim$code.=<<___ if ($win64);
2858238384Sjkim	movaps	0x40(%rbp), %xmm6
2859238384Sjkim	movaps	0x50(%rbp), %xmm7
2860238384Sjkim	movaps	0x60(%rbp), %xmm8
2861238384Sjkim	movaps	0x70(%rbp), %xmm9
2862238384Sjkim	movaps	0x80(%rbp), %xmm10
2863238384Sjkim	movaps	0x90(%rbp), %xmm11
2864238384Sjkim	movaps	0xa0(%rbp), %xmm12
2865238384Sjkim	movaps	0xb0(%rbp), %xmm13
2866238384Sjkim	movaps	0xc0(%rbp), %xmm14
2867238384Sjkim	movaps	0xd0(%rbp), %xmm15
2868238384Sjkim	lea	0xa0(%rbp), %rsp
2869238384Sjkim___
# Common epilogue of bsaes_xts_decrypt: reload %r15, %r14, %r13, %r12 and
# %rbx from 0x48..0x68(%rsp) (above the 0x48-byte scratch area reserved by
# the prologue) and release the 0x78-byte frame.
# The saved %rbp is first loaded into %rax and only moved into %rbp as the
# very last instruction before .Lxts_dec_epilogue: se_handler locates the
# frame through context->Rbp for any Rip below that label, so %rbp has to
# stay valid until then.
2870238384Sjkim$code.=<<___;
2871238384Sjkim	mov	0x48(%rsp), %r15
2872238384Sjkim	mov	0x50(%rsp), %r14
2873238384Sjkim	mov	0x58(%rsp), %r13
2874238384Sjkim	mov	0x60(%rsp), %r12
2875238384Sjkim	mov	0x68(%rsp), %rbx
2876238384Sjkim	mov	0x70(%rsp), %rax
2877238384Sjkim	lea	0x78(%rsp), %rsp
2878238384Sjkim	mov	%rax, %rbp
2879238384Sjkim.Lxts_dec_epilogue:
2880238384Sjkim	ret
2881238384Sjkim.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
2882238384Sjkim___
2883238384Sjkim}
# Constant pool _bsaes_const, emitted as a 64-byte aligned object:
#  - .LM0ISR/.LISRM0/.LISR and .LSR/.LSRM0/.LM0SR/.LM0: byte-index tables
#    for (Inv)ShiftRows and the related byte reorderings;
#  - .LBS0/.LBS1/.LBS2: the 0x55.., 0x33.., 0x0f.. masks used for bit-slicing;
#  - .LSWPUP/.LSWPUPM0SR: byte-swap of the upper dword, alone and combined
#    with the .LM0SR reordering (going by the label names);
#  - .LADD1 ... .LADD8: counter increments of 1..8 placed in the top dword;
#  - .Lxts_magic: the mask loaded into $twmask by the XTS code to isolate
#    the carry and residue when a tweak is doubled;
#  - .Lmasks, .L63: further constants, presumably for the key conversion
#    code defined earlier in this file (not visible from here);
#  - an identification string.
2884238384Sjkim$code.=<<___;
2885238384Sjkim.type	_bsaes_const,\@object
2886238384Sjkim.align	64
2887238384Sjkim_bsaes_const:
2888238384Sjkim.LM0ISR:	# InvShiftRows constants
2889238384Sjkim	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
2890238384Sjkim.LISRM0:
2891238384Sjkim	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
2892238384Sjkim.LISR:
2893238384Sjkim	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
2894238384Sjkim.LBS0:		# bit-slice constants
2895238384Sjkim	.quad	0x5555555555555555, 0x5555555555555555
2896238384Sjkim.LBS1:
2897238384Sjkim	.quad	0x3333333333333333, 0x3333333333333333
2898238384Sjkim.LBS2:
2899238384Sjkim	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2900238384Sjkim.LSR:		# shiftrows constants
2901238384Sjkim	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
2902238384Sjkim.LSRM0:
2903238384Sjkim	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
2904238384Sjkim.LM0SR:
2905238384Sjkim	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
2906238384Sjkim.LSWPUP:	# byte-swap upper dword
2907238384Sjkim	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
2908238384Sjkim.LSWPUPM0SR:
2909238384Sjkim	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
2910238384Sjkim.LADD1:		# counter increment constants
2911238384Sjkim	.quad	0x0000000000000000, 0x0000000100000000
2912238384Sjkim.LADD2:
2913238384Sjkim	.quad	0x0000000000000000, 0x0000000200000000
2914238384Sjkim.LADD3:
2915238384Sjkim	.quad	0x0000000000000000, 0x0000000300000000
2916238384Sjkim.LADD4:
2917238384Sjkim	.quad	0x0000000000000000, 0x0000000400000000
2918238384Sjkim.LADD5:
2919238384Sjkim	.quad	0x0000000000000000, 0x0000000500000000
2920238384Sjkim.LADD6:
2921238384Sjkim	.quad	0x0000000000000000, 0x0000000600000000
2922238384Sjkim.LADD7:
2923238384Sjkim	.quad	0x0000000000000000, 0x0000000700000000
2924238384Sjkim.LADD8:
2925238384Sjkim	.quad	0x0000000000000000, 0x0000000800000000
2926238384Sjkim.Lxts_magic:
2927238384Sjkim	.long	0x87,0,1,0
2928238384Sjkim.Lmasks:
2929238384Sjkim	.quad	0x0101010101010101, 0x0101010101010101
2930238384Sjkim	.quad	0x0202020202020202, 0x0202020202020202
2931238384Sjkim	.quad	0x0404040404040404, 0x0404040404040404
2932238384Sjkim	.quad	0x0808080808080808, 0x0808080808080808
2933238384Sjkim.LM0:
2934238384Sjkim	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
2935238384Sjkim.L63:
2936238384Sjkim	.quad	0x6363636363636363, 0x6363636363636363
2937238384Sjkim.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia K��sper, Peter Schwabe, Andy Polyakov"
2938238384Sjkim.align	64
2939238384Sjkim.size	_bsaes_const,.-_bsaes_const
2940238384Sjkim___
2941238384Sjkim
# Win64 structured-exception-handling support: the language-specific handler
# se_handler plus the .pdata/.xdata tables that attach it to every public
# entry point of this module.  Nothing here is emitted for other targets.
2942238384Sjkim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2943238384Sjkim#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2944238384Sjkimif ($win64) {
2945238384Sjkim$rec="%rcx";
2946238384Sjkim$frame="%rdx";
2947238384Sjkim$context="%r8";
2948238384Sjkim$disp="%r9";
2949238384Sjkim
# se_handler recovers the caller's register state from a bsaes_* frame:
#  - Rip below HandlerData[0] (the .L*_body label supplied in .xdata, called
#    "prologue label" in the comments below): the frame is not complete yet,
#    so context->Rax is taken as the stack pointer (bsaes_xts_decrypt copies
#    %rsp to %rax as its first instruction);
#  - Rip at or past HandlerData[1] (the .L*_epilogue label): the frame is
#    already released, context->Rsp is kept as is;
#  - otherwise context->Rbp points at the frame: 20 quadwords (%xmm6-%xmm15)
#    are copied from 0x40(frame) to context+512, the 0xa0-byte save area is
#    skipped, %rbp/%rbx/%r12-%r15 are reloaded from 0x70..0x48 and the stack
#    pointer becomes frame+0xa0+0x78; all of these are written back to the
#    CONTEXT record.
# The updated CONTEXT (1232/8 quadwords) is then copied to
# disp->ContextRecord, RtlVirtualUnwind is called to continue unwinding, and
# the handler returns ExceptionContinueSearch (1).
2950238384Sjkim$code.=<<___;
2951238384Sjkim.extern	__imp_RtlVirtualUnwind
2952238384Sjkim.type	se_handler,\@abi-omnipotent
2953238384Sjkim.align	16
2954238384Sjkimse_handler:
2955238384Sjkim	push	%rsi
2956238384Sjkim	push	%rdi
2957238384Sjkim	push	%rbx
2958238384Sjkim	push	%rbp
2959238384Sjkim	push	%r12
2960238384Sjkim	push	%r13
2961238384Sjkim	push	%r14
2962238384Sjkim	push	%r15
2963238384Sjkim	pushfq
2964238384Sjkim	sub	\$64,%rsp
2965238384Sjkim
2966238384Sjkim	mov	120($context),%rax	# pull context->Rax
2967238384Sjkim	mov	248($context),%rbx	# pull context->Rip
2968238384Sjkim
2969238384Sjkim	mov	8($disp),%rsi		# disp->ImageBase
2970238384Sjkim	mov	56($disp),%r11		# disp->HandlerData
2971238384Sjkim
2972238384Sjkim	mov	0(%r11),%r10d		# HandlerData[0]
2973238384Sjkim	lea	(%rsi,%r10),%r10	# prologue label
2974238384Sjkim	cmp	%r10,%rbx		# context->Rip<prologue label
2975238384Sjkim	jb	.Lin_prologue
2976238384Sjkim
2977238384Sjkim	mov	152($context),%rax	# pull context->Rsp
2978238384Sjkim
2979238384Sjkim	mov	4(%r11),%r10d		# HandlerData[1]
2980238384Sjkim	lea	(%rsi,%r10),%r10	# epilogue label
2981238384Sjkim	cmp	%r10,%rbx		# context->Rip>=epilogue label
2982238384Sjkim	jae	.Lin_prologue
2983238384Sjkim
2984238384Sjkim	mov	160($context),%rax	# pull context->Rbp
2985238384Sjkim
2986238384Sjkim	lea	0x40(%rax),%rsi		# %xmm save area
2987238384Sjkim	lea	512($context),%rdi	# &context.Xmm6
2988238384Sjkim	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
2989238384Sjkim	.long	0xa548f3fc		# cld; rep movsq
2990238384Sjkim	lea	0xa0(%rax),%rax		# adjust stack pointer
2991238384Sjkim
2992238384Sjkim	mov	0x70(%rax),%rbp
2993238384Sjkim	mov	0x68(%rax),%rbx
2994238384Sjkim	mov	0x60(%rax),%r12
2995238384Sjkim	mov	0x58(%rax),%r13
2996238384Sjkim	mov	0x50(%rax),%r14
2997238384Sjkim	mov	0x48(%rax),%r15
2998238384Sjkim	lea	0x78(%rax),%rax		# adjust stack pointer
2999238384Sjkim	mov	%rbx,144($context)	# restore context->Rbx
3000238384Sjkim	mov	%rbp,160($context)	# restore context->Rbp
3001238384Sjkim	mov	%r12,216($context)	# restore context->R12
3002238384Sjkim	mov	%r13,224($context)	# restore context->R13
3003238384Sjkim	mov	%r14,232($context)	# restore context->R14
3004238384Sjkim	mov	%r15,240($context)	# restore context->R15
3005238384Sjkim
3006238384Sjkim.Lin_prologue:
3007238384Sjkim	mov	%rax,152($context)	# restore context->Rsp
3008238384Sjkim
3009238384Sjkim	mov	40($disp),%rdi		# disp->ContextRecord
3010238384Sjkim	mov	$context,%rsi		# context
3011238384Sjkim	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
3012238384Sjkim	.long	0xa548f3fc		# cld; rep movsq
3013238384Sjkim
3014238384Sjkim	mov	$disp,%rsi
3015238384Sjkim	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3016238384Sjkim	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3017238384Sjkim	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3018238384Sjkim	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3019238384Sjkim	mov	40(%rsi),%r10		# disp->ContextRecord
3020238384Sjkim	lea	56(%rsi),%r11		# &disp->HandlerData
3021238384Sjkim	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3022238384Sjkim	mov	%r10,32(%rsp)		# arg5
3023238384Sjkim	mov	%r11,40(%rsp)		# arg6
3024238384Sjkim	mov	%r12,48(%rsp)		# arg7
3025238384Sjkim	mov	%rcx,56(%rsp)		# arg8, (NULL)
3026238384Sjkim	call	*__imp_RtlVirtualUnwind(%rip)
3027238384Sjkim
3028238384Sjkim	mov	\$1,%eax		# ExceptionContinueSearch
3029238384Sjkim	add	\$64,%rsp
3030238384Sjkim	popfq
3031238384Sjkim	pop	%r15
3032238384Sjkim	pop	%r14
3033238384Sjkim	pop	%r13
3034238384Sjkim	pop	%r12
3035238384Sjkim	pop	%rbp
3036238384Sjkim	pop	%rbx
3037238384Sjkim	pop	%rdi
3038238384Sjkim	pop	%rsi
3039238384Sjkim	ret
3040238384Sjkim.size	se_handler,.-se_handler
3041238384Sjkim
3042238384Sjkim.section	.pdata
3043238384Sjkim.align	4
3044238384Sjkim___
# .pdata: one entry of three RVAs (first label, end label, unwind info) per
# function.  The ECB entries are emitted only when $ecb is set.
3045238384Sjkim$code.=<<___ if ($ecb);
3046238384Sjkim	.rva	.Lecb_enc_prologue
3047238384Sjkim	.rva	.Lecb_enc_epilogue
3048238384Sjkim	.rva	.Lecb_enc_info
3049238384Sjkim
3050238384Sjkim	.rva	.Lecb_dec_prologue
3051238384Sjkim	.rva	.Lecb_dec_epilogue
3052238384Sjkim	.rva	.Lecb_dec_info
3053238384Sjkim___
# Entries for the CBC, CTR and XTS routines, followed by the .xdata section
# header.
3054238384Sjkim$code.=<<___;
3055238384Sjkim	.rva	.Lcbc_dec_prologue
3056238384Sjkim	.rva	.Lcbc_dec_epilogue
3057238384Sjkim	.rva	.Lcbc_dec_info
3058238384Sjkim
3059238384Sjkim	.rva	.Lctr_enc_prologue
3060238384Sjkim	.rva	.Lctr_enc_epilogue
3061238384Sjkim	.rva	.Lctr_enc_info
3062238384Sjkim
3063238384Sjkim	.rva	.Lxts_enc_prologue
3064238384Sjkim	.rva	.Lxts_enc_epilogue
3065238384Sjkim	.rva	.Lxts_enc_info
3066238384Sjkim
3067238384Sjkim	.rva	.Lxts_dec_prologue
3068238384Sjkim	.rva	.Lxts_dec_epilogue
3069238384Sjkim	.rva	.Lxts_dec_info
3070238384Sjkim
3071238384Sjkim.section	.xdata
3072238384Sjkim.align	8
3073238384Sjkim___
# .xdata: one unwind-info record per function.  Each record is a 4-byte
# header (9,0,0,0: per the Win64 UNWIND_INFO layout, version 1 with the
# exception-handler flag and no unwind codes), the RVA of se_handler, and
# the two HandlerData[] RVAs that se_handler compares Rip against (body
# label, epilogue label).  ECB records are emitted only when $ecb is set.
3074238384Sjkim$code.=<<___ if ($ecb);
3075238384Sjkim.Lecb_enc_info:
3076238384Sjkim	.byte	9,0,0,0
3077238384Sjkim	.rva	se_handler
3078238384Sjkim	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
3079238384Sjkim.Lecb_dec_info:
3080238384Sjkim	.byte	9,0,0,0
3081238384Sjkim	.rva	se_handler
3082238384Sjkim	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
3083238384Sjkim___
# Unwind-info records for the CBC, CTR and XTS routines.
3084238384Sjkim$code.=<<___;
3085238384Sjkim.Lcbc_dec_info:
3086238384Sjkim	.byte	9,0,0,0
3087238384Sjkim	.rva	se_handler
3088238384Sjkim	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
3089238384Sjkim.Lctr_enc_info:
3090238384Sjkim	.byte	9,0,0,0
3091238384Sjkim	.rva	se_handler
3092238384Sjkim	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
3093238384Sjkim.Lxts_enc_info:
3094238384Sjkim	.byte	9,0,0,0
3095238384Sjkim	.rva	se_handler
3096238384Sjkim	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
3097238384Sjkim.Lxts_dec_info:
3098238384Sjkim	.byte	9,0,0,0
3099238384Sjkim	.rva	se_handler
3100238384Sjkim	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
3101238384Sjkim___
3102238384Sjkim}
3103238384Sjkim
# Expand every back-quoted expression left in the generated text (for example
# `128-32` or `1232/8`) by evaluating it as Perl and substituting the result.
3104238384Sjkim$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3105238384Sjkim
3106238384Sjkimprint $code;
3107238384Sjkim
# STDOUT is buffered, so a failed write (full disk, closed pipe) may only be
# reported by close.  Die in that case instead of exiting successfully with
# truncated assembly output.
3108238384Sjkimclose STDOUT or die "error closing STDOUT: $!";
3109