1238384Sjkim#!/usr/bin/env perl
2238384Sjkim
3238384Sjkim###################################################################
4238384Sjkim### AES-128 [originally in CTR mode]				###
5238384Sjkim### bitsliced implementation for Intel Core 2 processors	###
6238384Sjkim### requires support of SSE extensions up to SSSE3		###
7238384Sjkim### Author: Emilia Käsper and Peter Schwabe			###
8238384Sjkim### Date: 2009-03-19						###
9238384Sjkim### Public domain						###
10238384Sjkim###								###
11238384Sjkim### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
12238384Sjkim### further information.					###
13238384Sjkim###################################################################
14238384Sjkim#
15238384Sjkim# September 2011.
16238384Sjkim#
17238384Sjkim# Started as a transliteration to "perlasm", the original code has
18238384Sjkim# since undergone the following changes:
19238384Sjkim#
20238384Sjkim# - code was made position-independent;
21238384Sjkim# - rounds were folded into a loop resulting in >5x size reduction
22238384Sjkim#   from 12.5KB to 2.2KB;
23238384Sjkim# - above was possible thanks to mixcolumns() modification that
24238384Sjkim#   allowed to feed its output back to aesenc[last], this was
25238384Sjkim#   achieved at cost of two additional inter-registers moves;
26238384Sjkim# - some instruction reordering and interleaving;
27238384Sjkim# - this module doesn't implement key setup subroutine, instead it
28238384Sjkim#   relies on conversion of "conventional" key schedule as returned
29238384Sjkim#   by AES_set_encrypt_key (see discussion below);
30238384Sjkim# - first and last round keys are treated differently, which allowed
31238384Sjkim#   to skip one shiftrows(), reduce bit-sliced key schedule and
32238384Sjkim#   speed-up conversion by 22%;
33238384Sjkim# - support for 192- and 256-bit keys was added;
34238384Sjkim#
35238384Sjkim# Resulting performance in CPU cycles spent to encrypt one byte out
36238384Sjkim# of 4096-byte buffer with 128-bit key is:
37238384Sjkim#
38238384Sjkim#		Emilia's	this(*)		difference
39238384Sjkim#
40238384Sjkim# Core 2    	9.30		8.69		+7%
41290207Sjkim# Nehalem(**) 	7.63		6.88		+11%
42290207Sjkim# Atom	    	17.1		16.4		+4%
43290207Sjkim# Silvermont	-		12.9
44238384Sjkim#
45238384Sjkim# (*)	Comparison is not completely fair, because "this" is ECB,
46238384Sjkim#	i.e. no extra processing such as counter values calculation
47238384Sjkim#	and xor-ing input as in Emilia's CTR implementation is
48238384Sjkim#	performed. However, the CTR calculations stand for not more
49238384Sjkim#	than 1% of total time, so comparison is *rather* fair.
50238384Sjkim#
51238384Sjkim# (**)	Results were collected on Westmere, which is considered to
52238384Sjkim#	be equivalent to Nehalem for this code.
53238384Sjkim#
54238384Sjkim# As for key schedule conversion subroutine. Interface to OpenSSL
55238384Sjkim# relies on per-invocation on-the-fly conversion. This naturally
56238384Sjkim# has impact on performance, especially for short inputs. Conversion
57238384Sjkim# time in CPU cycles and its ratio to CPU cycles spent in 8x block
58238384Sjkim# function is:
59238384Sjkim#
60238384Sjkim# 		conversion	conversion/8x block
61238384Sjkim# Core 2	240		0.22
62238384Sjkim# Nehalem	180		0.20
63290207Sjkim# Atom		430		0.20
64238384Sjkim#
65238384Sjkim# The ratio values mean that 128-byte blocks will be processed
66238384Sjkim# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
67238384Sjkim# etc. Then keep in mind that input sizes not divisible by 128 are
68238384Sjkim# *effectively* slower, especially shortest ones, e.g. consecutive
69238384Sjkim# 144-byte blocks are processed 44% slower than one would expect,
70238384Sjkim# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
71238384Sjkim# it's still faster than ["hyper-threading-safe" code path in]
72238384Sjkim# aes-x86_64.pl on all lengths above 64 bytes...
73238384Sjkim#
74238384Sjkim# October 2011.
75238384Sjkim#
76238384Sjkim# Add decryption procedure. Performance in CPU cycles spent to decrypt
77238384Sjkim# one byte out of 4096-byte buffer with 128-bit key is:
78238384Sjkim#
79290207Sjkim# Core 2	9.98
80290207Sjkim# Nehalem	7.80
81290207Sjkim# Atom		17.9
82290207Sjkim# Silvermont	14.0
83238384Sjkim#
84238384Sjkim# November 2011.
85238384Sjkim#
86238384Sjkim# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
87238384Sjkim# suboptimal, but XTS is meant to be used with larger blocks...
88238384Sjkim#
89238384Sjkim#						<appro@openssl.org>
90238384Sjkim
# Command line: [flavour] [output].  A first argument that contains a dot is
# treated as the output file name, in which case no flavour is set.
91238384Sjkim$flavour = shift;
92238384Sjkim$output  = shift;
93238384Sjkimif ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
94238384Sjkim
# $win64 is set for nasm/masm/mingw64 flavours or when the output name ends
# in ".asm".
95238384Sjkim$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
96238384Sjkim
# Look for x86_64-xlate.pl in this script's own directory first, then in
# ../../perlasm/ relative to it; give up if neither file exists.
97238384Sjkim$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
98238384Sjkim( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
99238384Sjkim( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
100238384Sjkimdie "can't locate x86_64-xlate.pl";
101238384Sjkim
# Everything printed to STDOUT from here on is piped through the translator.
# Abort if the pipe cannot be set up: without the check a failure would be
# ignored and the generated code silently discarded.
102246772Sjkimopen OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
103246772Sjkim*STDOUT=*OUT;
104238384Sjkim
# Symbolic names for %rdi, %rsi, %rdx and %rcx.  Only four register names are
# listed, so $ivp receives no value from this assignment.
105238384Sjkimmy ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
# Register pool: @XMM[0] is %xmm15 and @XMM[1..15] are %xmm0..%xmm14.
106238384Sjkimmy @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
107238384Sjkimmy $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
108238384Sjkim
# Opens a lexical scope whose $key, $rounds and $const are the register names
# used by the code generators that follow; this $key shadows the one above.
109238384Sjkim{
110238384Sjkimmy ($key,$rounds,$const)=("%rax","%r10d","%r11");
111238384Sjkim
112238384Sjkimsub Sbox {
# Emit the bit-sliced S-box by chaining InBasisChange, Inv_GF256 and
# OutBasisChange.
# Arguments: 8 state registers, then 4 temporaries (@t) and 4 more
# temporaries (@s); both temporary groups are handed on to Inv_GF256.
# The register permutations passed to the second and third stage follow the
# output orders documented in InBasisChange and Inv_GF256.
113238384Sjkim# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
114238384Sjkim# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
115238384Sjkimmy @b=@_[0..7];
116238384Sjkimmy @t=@_[8..11];
117238384Sjkimmy @s=@_[12..15];
118238384Sjkim	&InBasisChange	(@b);
119238384Sjkim	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
120238384Sjkim	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
121238384Sjkim}
122238384Sjkim
123238384Sjkimsub InBasisChange {
# Append 13 pxor instructions to $code that transform the eight registers
# passed in, in place.  No temporaries are used.
124238384Sjkim# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
125238384Sjkim# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
126238384Sjkimmy @b=@_[0..7];
127238384Sjkim$code.=<<___;
128238384Sjkim	pxor	@b[6], @b[5]
129238384Sjkim	pxor	@b[1], @b[2]
130238384Sjkim	pxor	@b[0], @b[3]
131238384Sjkim	pxor	@b[2], @b[6]
132238384Sjkim	pxor 	@b[0], @b[5]
133238384Sjkim
134238384Sjkim	pxor	@b[3], @b[6]
135238384Sjkim	pxor	@b[7], @b[3]
136238384Sjkim	pxor	@b[5], @b[7]
137238384Sjkim	pxor	@b[4], @b[3]
138238384Sjkim	pxor	@b[5], @b[4]
139238384Sjkim	pxor	@b[1], @b[3]
140238384Sjkim
141238384Sjkim	pxor	@b[7], @b[2]
142238384Sjkim	pxor	@b[5], @b[1]
143238384Sjkim___
144238384Sjkim}
145238384Sjkim
146238384Sjkimsub OutBasisChange {
# Append 11 pxor instructions to $code that transform the eight registers
# passed in, in place.  No temporaries are used.
147238384Sjkim# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
148238384Sjkim# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
149238384Sjkimmy @b=@_[0..7];
150238384Sjkim$code.=<<___;
151238384Sjkim	pxor	@b[6], @b[0]
152238384Sjkim	pxor	@b[4], @b[1]
153238384Sjkim	pxor	@b[0], @b[2]
154238384Sjkim	pxor	@b[6], @b[4]
155238384Sjkim	pxor	@b[1], @b[6]
156238384Sjkim
157238384Sjkim	pxor	@b[5], @b[1]
158238384Sjkim	pxor	@b[3], @b[5]
159238384Sjkim	pxor	@b[7], @b[3]
160238384Sjkim	pxor	@b[5], @b[7]
161238384Sjkim	pxor	@b[5], @b[2]
162238384Sjkim
163238384Sjkim	pxor	@b[7], @b[4]
164238384Sjkim___
165238384Sjkim}
166238384Sjkim
167238384Sjkimsub InvSbox {
# Emit the bit-sliced inverse S-box by chaining InvInBasisChange, Inv_GF256
# and InvOutBasisChange.
# Arguments: 8 state registers, then 4 temporaries (@t) and 4 more
# temporaries (@s); both temporary groups are handed on to Inv_GF256.
168238384Sjkim# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
169238384Sjkim# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
170238384Sjkimmy @b=@_[0..7];
171238384Sjkimmy @t=@_[8..11];
172238384Sjkimmy @s=@_[12..15];
173238384Sjkim	&InvInBasisChange	(@b);
174238384Sjkim	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
175238384Sjkim	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
176238384Sjkim}
177238384Sjkim
178238384Sjkimsub InvInBasisChange {		# OutBasisChange in reverse
# Append 11 pxor instructions to $code, operating in place on the eight
# registers passed in.  The arguments are permuted into @b first, so the
# register indices in the template below refer to the permuted order.
# NOTE(review): the heredoc assignment below has no trailing semicolon; that
# is accepted only because it is the last statement of the sub.
179238384Sjkimmy @b=@_[5,1,2,6,3,7,0,4];
180238384Sjkim$code.=<<___
181238384Sjkim	pxor	@b[7], @b[4]
182238384Sjkim
183238384Sjkim	pxor	@b[5], @b[7]
184238384Sjkim	pxor	@b[5], @b[2]
185238384Sjkim	pxor	@b[7], @b[3]
186238384Sjkim	pxor	@b[3], @b[5]
187238384Sjkim	pxor	@b[5], @b[1]
188238384Sjkim
189238384Sjkim	pxor	@b[1], @b[6]
190238384Sjkim	pxor	@b[0], @b[2]
191238384Sjkim	pxor	@b[6], @b[4]
192238384Sjkim	pxor	@b[6], @b[0]
193238384Sjkim	pxor	@b[4], @b[1]
194238384Sjkim___
195238384Sjkim}
196238384Sjkim
197238384Sjkimsub InvOutBasisChange {		# InBasisChange in reverse
# Append 13 pxor instructions to $code, operating in place on the eight
# registers passed in.  The arguments are permuted into @b first, so the
# register indices in the template below refer to the permuted order.
# Lines indented by one extra space are interleaved with the main sequence.
198238384Sjkimmy @b=@_[2,5,7,3,6,1,0,4];
199238384Sjkim$code.=<<___;
200238384Sjkim	pxor	@b[5], @b[1]
201238384Sjkim	pxor	@b[7], @b[2]
202238384Sjkim
203238384Sjkim	pxor	@b[1], @b[3]
204238384Sjkim	pxor	@b[5], @b[4]
205238384Sjkim	pxor	@b[5], @b[7]
206238384Sjkim	pxor	@b[4], @b[3]
207238384Sjkim	 pxor 	@b[0], @b[5]
208238384Sjkim	pxor	@b[7], @b[3]
209238384Sjkim	 pxor	@b[2], @b[6]
210238384Sjkim	 pxor	@b[1], @b[2]
211238384Sjkim	pxor	@b[3], @b[6]
212238384Sjkim
213238384Sjkim	pxor	@b[0], @b[3]
214238384Sjkim	pxor	@b[6], @b[5]
215238384Sjkim___
216238384Sjkim}
217238384Sjkim
218238384Sjkimsub Mul_GF4 {
219238384Sjkim#;*************************************************************
220238384Sjkim#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
221238384Sjkim#;*************************************************************
# Append 8 instructions to $code.
#   $x0,$x1  first operand; overwritten with the result
#   $y0,$y1  second operand; only ever used as a source, so left unchanged
#   $t0      scratch register, clobbered
222238384Sjkimmy ($x0,$x1,$y0,$y1,$t0)=@_;
223238384Sjkim$code.=<<___;
224238384Sjkim	movdqa	$y0, $t0
225238384Sjkim	pxor 	$y1, $t0
226238384Sjkim	pand	$x0, $t0
227238384Sjkim	pxor	$x1, $x0
228238384Sjkim	pand	$y0, $x1
229238384Sjkim	pand	$y1, $x0
230238384Sjkim	pxor	$x1, $x0
231238384Sjkim	pxor	$t0, $x1
232238384Sjkim___
233238384Sjkim}
234238384Sjkim
235238384Sjkimsub Mul_GF4_N {				# not used, see next subroutine
236238384Sjkim# multiply and scale by N
# Same arguments and first six instructions as Mul_GF4; only the final two
# pxor instructions differ (the roles of $x0 and $x1 are exchanged there).
#   $x0,$x1  first operand; overwritten with the result
#   $y0,$y1  second operand; only ever used as a source
#   $t0      scratch register, clobbered
237238384Sjkimmy ($x0,$x1,$y0,$y1,$t0)=@_;
238238384Sjkim$code.=<<___;
239238384Sjkim	movdqa	$y0, $t0
240238384Sjkim	pxor	$y1, $t0
241238384Sjkim	pand	$x0, $t0
242238384Sjkim	pxor	$x1, $x0
243238384Sjkim	pand	$y0, $x1
244238384Sjkim	pand	$y1, $x0
245238384Sjkim	pxor	$x0, $x1
246238384Sjkim	pxor	$t0, $x0
247238384Sjkim___
248238384Sjkim}
249238384Sjkim
250238384Sjkimsub Mul_GF4_N_GF4 {
251238384Sjkim# interleaved Mul_GF4_N and Mul_GF4
# The first five arguments ($x0,$x1,$y0,$y1,$t0) go through the Mul_GF4_N
# instruction sequence; the last five ($x2,$x3,$y2,$y3,$t1) go through the
# Mul_GF4 sequence.  The second stream is the one indented by an extra space
# in the template.  $t0 and $t1 are scratch registers and are clobbered.
252238384Sjkimmy ($x0,$x1,$y0,$y1,$t0,
253238384Sjkim    $x2,$x3,$y2,$y3,$t1)=@_;
254238384Sjkim$code.=<<___;
255238384Sjkim	movdqa	$y0, $t0
256238384Sjkim	 movdqa	$y2, $t1
257238384Sjkim	pxor	$y1, $t0
258238384Sjkim	 pxor 	$y3, $t1
259238384Sjkim	pand	$x0, $t0
260238384Sjkim	 pand	$x2, $t1
261238384Sjkim	pxor	$x1, $x0
262238384Sjkim	 pxor	$x3, $x2
263238384Sjkim	pand	$y0, $x1
264238384Sjkim	 pand	$y2, $x3
265238384Sjkim	pand	$y1, $x0
266238384Sjkim	 pand	$y3, $x2
267238384Sjkim	pxor	$x0, $x1
268238384Sjkim	 pxor	$x3, $x2
269238384Sjkim	pxor	$t0, $x0
270238384Sjkim	 pxor	$t1, $x3
271238384Sjkim___
272238384Sjkim}
273238384Sjkimsub Mul_GF16_2 {
# Emit code that multiplies two 4-register values, @x[0..3] and @x[4..7],
# by the same 4-register value @y; the results replace @x.
#   @x  8 registers, updated in place
#   @y  4 registers; @y[0] and @y[1] are modified temporarily and restored
#   @t  4 scratch registers, clobbered
274238384Sjkimmy @x=@_[0..7];
275238384Sjkimmy @y=@_[8..11];
276238384Sjkimmy @t=@_[12..15];
# First half: operates on @x[0..3].
277238384Sjkim$code.=<<___;
278238384Sjkim	movdqa	@x[0], @t[0]
279238384Sjkim	movdqa	@x[1], @t[1]
280238384Sjkim___
281238384Sjkim	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
# Fold @x[2,3] into the copies in @t[0,1] and @y[2,3] into @y[0,1].
282238384Sjkim$code.=<<___;
283238384Sjkim	pxor	@x[2], @t[0]
284238384Sjkim	pxor	@x[3], @t[1]
285238384Sjkim	pxor	@y[2], @y[0]
286238384Sjkim	pxor	@y[3], @y[1]
287238384Sjkim___
# NOTE(review): this call omits the leading '&' used by the other calls here.
288238384Sjkim	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
289238384Sjkim			 @x[2], @x[3], @y[2], @y[3], @t[2]);
# Combine the partial products, then prepare @t[0,1] for the second half,
# which operates on @x[4..7].
290238384Sjkim$code.=<<___;
291238384Sjkim	pxor	@t[0], @x[0]
292238384Sjkim	pxor	@t[0], @x[2]
293238384Sjkim	pxor	@t[1], @x[1]
294238384Sjkim	pxor	@t[1], @x[3]
295238384Sjkim
296238384Sjkim	movdqa	@x[4], @t[0]
297238384Sjkim	movdqa	@x[5], @t[1]
298238384Sjkim	pxor	@x[6], @t[0]
299238384Sjkim	pxor	@x[7], @t[1]
300238384Sjkim___
301238384Sjkim	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
302238384Sjkim			 @x[6], @x[7], @y[2], @y[3], @t[2]);
# XOR @y[2,3] into @y[0,1] a second time, which undoes the earlier fold and
# restores their original values for the final Mul_GF4.
303238384Sjkim$code.=<<___;
304238384Sjkim	pxor	@y[2], @y[0]
305238384Sjkim	pxor	@y[3], @y[1]
306238384Sjkim___
307238384Sjkim	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
308238384Sjkim$code.=<<___;
309238384Sjkim	pxor	@t[0], @x[4]
310238384Sjkim	pxor	@t[0], @x[6]
311238384Sjkim	pxor	@t[1], @x[5]
312238384Sjkim	pxor	@t[1], @x[7]
313238384Sjkim___
314238384Sjkim}
315238384Sjkimsub Inv_GF256 {
316238384Sjkim#;********************************************************************
317238384Sjkim#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
318238384Sjkim#;********************************************************************
# Emit the inversion in two parts: a long straight-line template that leaves
# an intermediate 4-register value in @s[3], @s[2], @s[1], @t[1], followed by
# a call to Mul_GF16_2 that multiplies both halves of @x by that value.
#   @x  8 registers, updated in place
#   @t  4 scratch registers, clobbered
#   @s  4 scratch registers, clobbered
319238384Sjkimmy @x=@_[0..7];
320238384Sjkimmy @t=@_[8..11];
321238384Sjkimmy @s=@_[12..15];
322238384Sjkim# direct optimizations from hardware
323238384Sjkim$code.=<<___;
324238384Sjkim	movdqa	@x[4], @t[3]
325238384Sjkim	movdqa	@x[5], @t[2]
326238384Sjkim	movdqa	@x[1], @t[1]
327238384Sjkim	movdqa	@x[7], @s[1]
328238384Sjkim	movdqa	@x[0], @s[0]
329238384Sjkim
330238384Sjkim	pxor	@x[6], @t[3]
331238384Sjkim	pxor	@x[7], @t[2]
332238384Sjkim	pxor	@x[3], @t[1]
333238384Sjkim	 movdqa	@t[3], @s[2]
334238384Sjkim	pxor	@x[6], @s[1]
335238384Sjkim	 movdqa	@t[2], @t[0]
336238384Sjkim	pxor	@x[2], @s[0]
337238384Sjkim	 movdqa	@t[3], @s[3]
338238384Sjkim
339238384Sjkim	por	@t[1], @t[2]
340238384Sjkim	por	@s[0], @t[3]
341238384Sjkim	pxor	@t[0], @s[3]
342238384Sjkim	pand	@s[0], @s[2]
343238384Sjkim	pxor	@t[1], @s[0]
344238384Sjkim	pand	@t[1], @t[0]
345238384Sjkim	pand	@s[0], @s[3]
346238384Sjkim	movdqa	@x[3], @s[0]
347238384Sjkim	pxor	@x[2], @s[0]
348238384Sjkim	pand	@s[0], @s[1]
349238384Sjkim	pxor	@s[1], @t[3]
350238384Sjkim	pxor	@s[1], @t[2]
351238384Sjkim	movdqa	@x[4], @s[1]
352238384Sjkim	movdqa	@x[1], @s[0]
353238384Sjkim	pxor	@x[5], @s[1]
354238384Sjkim	pxor	@x[0], @s[0]
355238384Sjkim	movdqa	@s[1], @t[1]
356238384Sjkim	pand	@s[0], @s[1]
357238384Sjkim	por	@s[0], @t[1]
358238384Sjkim	pxor	@s[1], @t[0]
359238384Sjkim	pxor	@s[3], @t[3]
360238384Sjkim	pxor	@s[2], @t[2]
361238384Sjkim	pxor	@s[3], @t[1]
362238384Sjkim	movdqa	@x[7], @s[0]
363238384Sjkim	pxor	@s[2], @t[0]
364238384Sjkim	movdqa	@x[6], @s[1]
365238384Sjkim	pxor	@s[2], @t[1]
366238384Sjkim	movdqa	@x[5], @s[2]
367238384Sjkim	pand	@x[3], @s[0]
368238384Sjkim	movdqa	@x[4], @s[3]
369238384Sjkim	pand	@x[2], @s[1]
370238384Sjkim	pand	@x[1], @s[2]
371238384Sjkim	por	@x[0], @s[3]
372238384Sjkim	pxor	@s[0], @t[3]
373238384Sjkim	pxor	@s[1], @t[2]
374238384Sjkim	pxor	@s[2], @t[1]
375238384Sjkim	pxor	@s[3], @t[0]
376238384Sjkim
377238384Sjkim	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
378238384Sjkim
379238384Sjkim	# new smaller inversion
380238384Sjkim
381238384Sjkim	movdqa	@t[3], @s[0]
382238384Sjkim	pand	@t[1], @t[3]
383238384Sjkim	pxor	@t[2], @s[0]
384238384Sjkim
385238384Sjkim	movdqa	@t[0], @s[2]
386238384Sjkim	movdqa	@s[0], @s[3]
387238384Sjkim	pxor	@t[3], @s[2]
388238384Sjkim	pand	@s[2], @s[3]
389238384Sjkim
390238384Sjkim	movdqa	@t[1], @s[1]
391238384Sjkim	pxor	@t[2], @s[3]
392238384Sjkim	pxor	@t[0], @s[1]
393238384Sjkim
394238384Sjkim	pxor	@t[2], @t[3]
395238384Sjkim
396238384Sjkim	pand	@t[3], @s[1]
397238384Sjkim
398238384Sjkim	movdqa	@s[2], @t[2]
399238384Sjkim	pxor	@t[0], @s[1]
400238384Sjkim
401238384Sjkim	pxor	@s[1], @t[2]
402238384Sjkim	pxor	@s[1], @t[1]
403238384Sjkim
404238384Sjkim	pand	@t[0], @t[2]
405238384Sjkim
406238384Sjkim	pxor	@t[2], @s[2]
407238384Sjkim	pxor	@t[2], @t[1]
408238384Sjkim
409238384Sjkim	pand	@s[3], @s[2]
410238384Sjkim
411238384Sjkim	pxor	@s[0], @s[2]
412238384Sjkim___
413238384Sjkim# output in s3, s2, s1, t1
414238384Sjkim
415238384Sjkim# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
416238384Sjkim
417238384Sjkim# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
# The multiplier is (@s[3], @s[2], @s[1], @t[1]); the four remaining
# registers (@s[0], @t[0], @t[2], @t[3]) serve as Mul_GF16_2's scratch set.
418238384Sjkim	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
419238384Sjkim
420238384Sjkim### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
421238384Sjkim}
422238384Sjkim
423238384Sjkim# AES linear components
424238384Sjkim
425238384Sjkimsub ShiftRows {
# Arguments: 8 state registers followed by a register holding a pshufb mask
# (the mask is taken from the end of the argument list).
# Despite the name, the emitted code does three things:
#   - XORs the eight 16-byte values at 0x00..0x70($key) into @x[0..7],
#   - applies pshufb with $mask to each of the eight registers,
#   - advances $key by 0x80.
# The pxor and pshufb instructions are interleaved in the template.
426238384Sjkimmy @x=@_[0..7];
427238384Sjkimmy $mask=pop;
428238384Sjkim$code.=<<___;
429238384Sjkim	pxor	0x00($key),@x[0]
430238384Sjkim	pxor	0x10($key),@x[1]
431290207Sjkim	pxor	0x20($key),@x[2]
432290207Sjkim	pxor	0x30($key),@x[3]
433238384Sjkim	pshufb	$mask,@x[0]
434238384Sjkim	pshufb	$mask,@x[1]
435290207Sjkim	pxor	0x40($key),@x[4]
436290207Sjkim	pxor	0x50($key),@x[5]
437238384Sjkim	pshufb	$mask,@x[2]
438238384Sjkim	pshufb	$mask,@x[3]
439290207Sjkim	pxor	0x60($key),@x[6]
440290207Sjkim	pxor	0x70($key),@x[7]
441238384Sjkim	pshufb	$mask,@x[4]
442238384Sjkim	pshufb	$mask,@x[5]
443238384Sjkim	pshufb	$mask,@x[6]
444290207Sjkim	pshufb	$mask,@x[7]
445238384Sjkim	lea	0x80($key),$key
446238384Sjkim___
447238384Sjkim}
448238384Sjkim
449238384Sjkimsub MixColumns {
450238384Sjkim# modified to emit output in order suitable for feeding back to aesenc[last]
# Arguments: 8 state registers (@x), 8 scratch registers (@t) and an optional
# 17th argument $inv.  The common part of the template is always emitted;
# the tail is one of two variants selected by $inv (InvMixColumns passes 1).
# NOTE(review): @_[16] is a one-element slice; $_[16] is the usual scalar
# form and yields the same value here.
451238384Sjkimmy @x=@_[0..7];
452238384Sjkimmy @t=@_[8..15];
453261037Sjkimmy $inv=@_[16];	# optional
454238384Sjkim$code.=<<___;
455238384Sjkim	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
456238384Sjkim	pshufd	\$0x93, @x[1], @t[1]
457238384Sjkim	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
458238384Sjkim	pshufd	\$0x93, @x[2], @t[2]
459238384Sjkim	 pxor	@t[1], @x[1]
460238384Sjkim	pshufd	\$0x93, @x[3], @t[3]
461238384Sjkim	 pxor	@t[2], @x[2]
462238384Sjkim	pshufd	\$0x93, @x[4], @t[4]
463238384Sjkim	 pxor	@t[3], @x[3]
464238384Sjkim	pshufd	\$0x93, @x[5], @t[5]
465238384Sjkim	 pxor	@t[4], @x[4]
466238384Sjkim	pshufd	\$0x93, @x[6], @t[6]
467238384Sjkim	 pxor	@t[5], @x[5]
468238384Sjkim	pshufd	\$0x93, @x[7], @t[7]
469238384Sjkim	 pxor	@t[6], @x[6]
470238384Sjkim	 pxor	@t[7], @x[7]
471238384Sjkim
472238384Sjkim	pxor	@x[0], @t[1]
473238384Sjkim	pxor	@x[7], @t[0]
474238384Sjkim	pxor	@x[7], @t[1]
475238384Sjkim	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
476238384Sjkim	pxor	@x[1], @t[2]
477238384Sjkim	 pshufd	\$0x4E, @x[1], @x[1]
478238384Sjkim	pxor	@x[4], @t[5]
479238384Sjkim	 pxor	@t[0], @x[0]
480238384Sjkim	pxor	@x[5], @t[6]
481238384Sjkim	 pxor	@t[1], @x[1]
482238384Sjkim	pxor	@x[3], @t[4]
483238384Sjkim	 pshufd	\$0x4E, @x[4], @t[0]
484238384Sjkim	pxor	@x[6], @t[7]
485238384Sjkim	 pshufd	\$0x4E, @x[5], @t[1]
486238384Sjkim	pxor	@x[2], @t[3]
487238384Sjkim	 pshufd	\$0x4E, @x[3], @x[4]
488238384Sjkim	pxor	@x[7], @t[3]
489238384Sjkim	 pshufd	\$0x4E, @x[7], @x[5]
490238384Sjkim	pxor	@x[7], @t[4]
491238384Sjkim	 pshufd	\$0x4E, @x[6], @x[3]
492238384Sjkim	pxor	@t[4], @t[0]
493238384Sjkim	 pshufd	\$0x4E, @x[2], @x[6]
494238384Sjkim	pxor	@t[5], @t[1]
495261037Sjkim___
# Tail used when $inv is false: 6 instructions.
496261037Sjkim$code.=<<___ if (!$inv);
497238384Sjkim	pxor	@t[3], @x[4]
498238384Sjkim	pxor	@t[7], @x[5]
499238384Sjkim	pxor	@t[6], @x[3]
500238384Sjkim	 movdqa	@t[0], @x[2]
501238384Sjkim	pxor	@t[2], @x[6]
502238384Sjkim	 movdqa	@t[1], @x[7]
503238384Sjkim___
# Tail used when $inv is true: 9 instructions that place the results in
# different registers than the variant above.
504261037Sjkim$code.=<<___ if ($inv);
505261037Sjkim	pxor	@x[4], @t[3]
506261037Sjkim	pxor	@t[7], @x[5]
507261037Sjkim	pxor	@x[3], @t[6]
508261037Sjkim	 movdqa	@t[0], @x[3]
509261037Sjkim	pxor	@t[2], @x[6]
510261037Sjkim	 movdqa	@t[6], @x[2]
511261037Sjkim	 movdqa	@t[1], @x[7]
512261037Sjkim	 movdqa	@x[6], @x[4]
513261037Sjkim	 movdqa	@t[3], @x[6]
514261037Sjkim___
515238384Sjkim}
516238384Sjkim
517261037Sjkimsub InvMixColumns_orig {
# Earlier InvMixColumns generator, kept alongside the InvMixColumns sub that
# follows it; no call to this sub appears in the visible part of the file.
# Arguments: 8 state registers (@x) and 8 scratch registers (@t).
# The template is organised as four sections, labelled in the emitted
# comments as multiplication by 0x0e, 0x0b, 0x0d and 0x09.
# The final eight movdqa instructions copy @t[0..7] into the file-level
# registers @XMM[0..7], not into the @x registers passed in.
518238384Sjkimmy @x=@_[0..7];
519238384Sjkimmy @t=@_[8..15];
520238384Sjkim
521238384Sjkim$code.=<<___;
522238384Sjkim	# multiplication by 0x0e
523238384Sjkim	pshufd	\$0x93, @x[7], @t[7]
524238384Sjkim	movdqa	@x[2], @t[2]
525238384Sjkim	pxor	@x[5], @x[7]		# 7 5
526238384Sjkim	pxor	@x[5], @x[2]		# 2 5
527238384Sjkim	pshufd	\$0x93, @x[0], @t[0]
528238384Sjkim	movdqa	@x[5], @t[5]
529238384Sjkim	pxor	@x[0], @x[5]		# 5 0		[1]
530238384Sjkim	pxor	@x[1], @x[0]		# 0 1
531238384Sjkim	pshufd	\$0x93, @x[1], @t[1]
532238384Sjkim	pxor	@x[2], @x[1]		# 1 25
533238384Sjkim	pxor	@x[6], @x[0]		# 01 6		[2]
534238384Sjkim	pxor	@x[3], @x[1]		# 125 3		[4]
535238384Sjkim	pshufd	\$0x93, @x[3], @t[3]
536238384Sjkim	pxor	@x[0], @x[2]		# 25 016	[3]
537238384Sjkim	pxor	@x[7], @x[3]		# 3 75
538238384Sjkim	pxor	@x[6], @x[7]		# 75 6		[0]
539238384Sjkim	pshufd	\$0x93, @x[6], @t[6]
540238384Sjkim	movdqa	@x[4], @t[4]
541238384Sjkim	pxor	@x[4], @x[6]		# 6 4
542238384Sjkim	pxor	@x[3], @x[4]		# 4 375		[6]
543238384Sjkim	pxor	@x[7], @x[3]		# 375 756=36
544238384Sjkim	pxor	@t[5], @x[6]		# 64 5		[7]
545238384Sjkim	pxor	@t[2], @x[3]		# 36 2
546238384Sjkim	pxor	@t[4], @x[3]		# 362 4		[5]
547238384Sjkim	pshufd	\$0x93, @t[5], @t[5]
548238384Sjkim___
# @y renames the @x registers in the order given by the bracketed indices
# [0]..[7] in the comments of the section above.
549238384Sjkim					my @y = @x[7,5,0,2,1,3,4,6];
550238384Sjkim$code.=<<___;
551238384Sjkim	# multiplication by 0x0b
552238384Sjkim	pxor	@y[0], @y[1]
553238384Sjkim	pxor	@t[0], @y[0]
554238384Sjkim	pxor	@t[1], @y[1]
555238384Sjkim	pshufd	\$0x93, @t[2], @t[2]
556238384Sjkim	pxor	@t[5], @y[0]
557238384Sjkim	pxor	@t[6], @y[1]
558238384Sjkim	pxor	@t[7], @y[0]
559238384Sjkim	pshufd	\$0x93, @t[4], @t[4]
560238384Sjkim	pxor	@t[6], @t[7]		# clobber t[7]
561238384Sjkim	pxor	@y[0], @y[1]
562238384Sjkim
563238384Sjkim	pxor	@t[0], @y[3]
564238384Sjkim	pshufd	\$0x93, @t[0], @t[0]
565238384Sjkim	pxor	@t[1], @y[2]
566238384Sjkim	pxor	@t[1], @y[4]
567238384Sjkim	pxor	@t[2], @y[2]
568238384Sjkim	pshufd	\$0x93, @t[1], @t[1]
569238384Sjkim	pxor	@t[2], @y[3]
570238384Sjkim	pxor	@t[2], @y[5]
571238384Sjkim	pxor	@t[7], @y[2]
572238384Sjkim	pshufd	\$0x93, @t[2], @t[2]
573238384Sjkim	pxor	@t[3], @y[3]
574238384Sjkim	pxor	@t[3], @y[6]
575238384Sjkim	pxor	@t[3], @y[4]
576238384Sjkim	pshufd	\$0x93, @t[3], @t[3]
577238384Sjkim	pxor	@t[4], @y[7]
578238384Sjkim	pxor	@t[4], @y[5]
579238384Sjkim	pxor	@t[7], @y[7]
580238384Sjkim	pxor	@t[5], @y[3]
581238384Sjkim	pxor	@t[4], @y[4]
582238384Sjkim	pxor	@t[5], @t[7]		# clobber t[7] even more
583238384Sjkim
584238384Sjkim	pxor	@t[7], @y[5]
585238384Sjkim	pshufd	\$0x93, @t[4], @t[4]
586238384Sjkim	pxor	@t[7], @y[6]
587238384Sjkim	pxor	@t[7], @y[4]
588238384Sjkim
589238384Sjkim	pxor	@t[5], @t[7]
590238384Sjkim	pshufd	\$0x93, @t[5], @t[5]
591238384Sjkim	pxor	@t[6], @t[7]		# restore t[7]
592238384Sjkim
593238384Sjkim	# multiplication by 0x0d
594238384Sjkim	pxor	@y[7], @y[4]
595238384Sjkim	pxor	@t[4], @y[7]
596238384Sjkim	pshufd	\$0x93, @t[6], @t[6]
597238384Sjkim	pxor	@t[0], @y[2]
598238384Sjkim	pxor	@t[5], @y[7]
599238384Sjkim	pxor	@t[2], @y[2]
600238384Sjkim	pshufd	\$0x93, @t[7], @t[7]
601238384Sjkim
602238384Sjkim	pxor	@y[1], @y[3]
603238384Sjkim	pxor	@t[1], @y[1]
604238384Sjkim	pxor	@t[0], @y[0]
605238384Sjkim	pxor	@t[0], @y[3]
606238384Sjkim	pxor	@t[5], @y[1]
607238384Sjkim	pxor	@t[5], @y[0]
608238384Sjkim	pxor	@t[7], @y[1]
609238384Sjkim	pshufd	\$0x93, @t[0], @t[0]
610238384Sjkim	pxor	@t[6], @y[0]
611238384Sjkim	pxor	@y[1], @y[3]
612238384Sjkim	pxor	@t[1], @y[4]
613238384Sjkim	pshufd	\$0x93, @t[1], @t[1]
614238384Sjkim
615238384Sjkim	pxor	@t[7], @y[7]
616238384Sjkim	pxor	@t[2], @y[4]
617238384Sjkim	pxor	@t[2], @y[5]
618238384Sjkim	pshufd	\$0x93, @t[2], @t[2]
619238384Sjkim	pxor	@t[6], @y[2]
620238384Sjkim	pxor	@t[3], @t[6]		# clobber t[6]
621238384Sjkim	pxor	@y[7], @y[4]
622238384Sjkim	pxor	@t[6], @y[3]
623238384Sjkim
624238384Sjkim	pxor	@t[6], @y[6]
625238384Sjkim	pxor	@t[5], @y[5]
626238384Sjkim	pxor	@t[4], @y[6]
627238384Sjkim	pshufd	\$0x93, @t[4], @t[4]
628238384Sjkim	pxor	@t[6], @y[5]
629238384Sjkim	pxor	@t[7], @y[6]
630238384Sjkim	pxor	@t[3], @t[6]		# restore t[6]
631238384Sjkim
632238384Sjkim	pshufd	\$0x93, @t[5], @t[5]
633238384Sjkim	pshufd	\$0x93, @t[6], @t[6]
634238384Sjkim	pshufd	\$0x93, @t[7], @t[7]
635238384Sjkim	pshufd	\$0x93, @t[3], @t[3]
636238384Sjkim
637238384Sjkim	# multiplication by 0x09
638238384Sjkim	pxor	@y[1], @y[4]
639238384Sjkim	pxor	@y[1], @t[1]		# t[1]=y[1]
640238384Sjkim	pxor	@t[5], @t[0]		# clobber t[0]
641238384Sjkim	pxor	@t[5], @t[1]
642238384Sjkim	pxor	@t[0], @y[3]
643238384Sjkim	pxor	@y[0], @t[0]		# t[0]=y[0]
644238384Sjkim	pxor	@t[6], @t[1]
645238384Sjkim	pxor	@t[7], @t[6]		# clobber t[6]
646238384Sjkim	pxor	@t[1], @y[4]
647238384Sjkim	pxor	@t[4], @y[7]
648238384Sjkim	pxor	@y[4], @t[4]		# t[4]=y[4]
649238384Sjkim	pxor	@t[3], @y[6]
650238384Sjkim	pxor	@y[3], @t[3]		# t[3]=y[3]
651238384Sjkim	pxor	@t[2], @y[5]
652238384Sjkim	pxor	@y[2], @t[2]		# t[2]=y[2]
653238384Sjkim	pxor	@t[7], @t[3]
654238384Sjkim	pxor	@y[5], @t[5]		# t[5]=y[5]
655238384Sjkim	pxor	@t[6], @t[2]
656238384Sjkim	pxor	@t[6], @t[5]
657238384Sjkim	pxor	@y[6], @t[6]		# t[6]=y[6]
658238384Sjkim	pxor	@y[7], @t[7]		# t[7]=y[7]
659238384Sjkim
660238384Sjkim	movdqa	@t[0],@XMM[0]
661238384Sjkim	movdqa	@t[1],@XMM[1]
662238384Sjkim	movdqa	@t[2],@XMM[2]
663238384Sjkim	movdqa	@t[3],@XMM[3]
664238384Sjkim	movdqa	@t[4],@XMM[4]
665238384Sjkim	movdqa	@t[5],@XMM[5]
666238384Sjkim	movdqa	@t[6],@XMM[6]
667238384Sjkim	movdqa	@t[7],@XMM[7]
668238384Sjkim___
669238384Sjkim}
670238384Sjkim
671261037Sjkimsub InvMixColumns {
# Emit InvMixColumns in two steps, following the matrix factorisation shown
# below: first the template for the right-hand (05 00 04 00) factor, then
# MixColumns called with its optional $inv flag set to 1.
# Arguments: 8 state registers (@x) and 8 scratch registers (@t).
672261037Sjkimmy @x=@_[0..7];
673261037Sjkimmy @t=@_[8..15];
674261037Sjkim
675261037Sjkim# Thanks to Jussi Kivilinna for providing pointer to
676261037Sjkim#
677261037Sjkim# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
678261037Sjkim# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
679261037Sjkim# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
680261037Sjkim# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
681261037Sjkim
682261037Sjkim$code.=<<___;
683261037Sjkim	# multiplication by 0x05-0x00-0x04-0x00
684261037Sjkim	pshufd	\$0x4E, @x[0], @t[0]
685261037Sjkim	pshufd	\$0x4E, @x[6], @t[6]
686261037Sjkim	pxor	@x[0], @t[0]
687261037Sjkim	pshufd	\$0x4E, @x[7], @t[7]
688261037Sjkim	pxor	@x[6], @t[6]
689261037Sjkim	pshufd	\$0x4E, @x[1], @t[1]
690261037Sjkim	pxor	@x[7], @t[7]
691261037Sjkim	pshufd	\$0x4E, @x[2], @t[2]
692261037Sjkim	pxor	@x[1], @t[1]
693261037Sjkim	pshufd	\$0x4E, @x[3], @t[3]
694261037Sjkim	pxor	@x[2], @t[2]
695261037Sjkim	 pxor	@t[6], @x[0]
696261037Sjkim	 pxor	@t[6], @x[1]
697261037Sjkim	pshufd	\$0x4E, @x[4], @t[4]
698261037Sjkim	pxor	@x[3], @t[3]
699261037Sjkim	 pxor	@t[0], @x[2]
700261037Sjkim	 pxor	@t[1], @x[3]
701261037Sjkim	pshufd	\$0x4E, @x[5], @t[5]
702261037Sjkim	pxor	@x[4], @t[4]
703261037Sjkim	 pxor	@t[7], @x[1]
704261037Sjkim	 pxor	@t[2], @x[4]
705261037Sjkim	pxor	@x[5], @t[5]
706261037Sjkim
707261037Sjkim	 pxor	@t[7], @x[2]
708261037Sjkim	 pxor	@t[6], @x[3]
709261037Sjkim	 pxor	@t[6], @x[4]
710261037Sjkim	 pxor	@t[3], @x[5]
711261037Sjkim	 pxor	@t[4], @x[6]
712261037Sjkim	 pxor	@t[7], @x[4]
713261037Sjkim	 pxor	@t[7], @x[5]
714261037Sjkim	 pxor	@t[5], @x[7]
715261037Sjkim___
716261037Sjkim	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
717261037Sjkim}
718261037Sjkim
719238384Sjkimsub aesenc {				# not used
# Emit one full round: load the shuffle mask at 0x30($const) into @t[0],
# then ShiftRows, Sbox and MixColumns.
# Arguments: 8 state registers (@b) and 8 scratch registers (@t).
# MixColumns receives the state in the output order documented in Sbox.
720238384Sjkimmy @b=@_[0..7];
721238384Sjkimmy @t=@_[8..15];
722238384Sjkim$code.=<<___;
723238384Sjkim	movdqa	0x30($const),@t[0]	# .LSR
724238384Sjkim___
725238384Sjkim	&ShiftRows	(@b,@t[0]);
726238384Sjkim	&Sbox		(@b,@t);
727238384Sjkim	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
728238384Sjkim}
729238384Sjkim
730238384Sjkimsub aesenclast {			# not used
# Emit a final round: load the shuffle mask at 0x40($const) into @t[0],
# then ShiftRows and Sbox, and finally XOR the eight 16-byte values at
# 0x00..0x70($key) into the state.  No MixColumns step is emitted.
# Arguments: 8 state registers (@b) and 8 scratch registers (@t).
# The closing pxor sequence addresses @b in the output order documented in
# Sbox.
# NOTE(review): the last heredoc assignment has no trailing semicolon; that
# is accepted only because it is the last statement of the sub.
731238384Sjkimmy @b=@_[0..7];
732238384Sjkimmy @t=@_[8..15];
733238384Sjkim$code.=<<___;
734238384Sjkim	movdqa	0x40($const),@t[0]	# .LSRM0
735238384Sjkim___
736238384Sjkim	&ShiftRows	(@b,@t[0]);
737238384Sjkim	&Sbox		(@b,@t);
738238384Sjkim$code.=<<___
739238384Sjkim	pxor	0x00($key),@b[0]
740238384Sjkim	pxor	0x10($key),@b[1]
741238384Sjkim	pxor	0x20($key),@b[4]
742238384Sjkim	pxor	0x30($key),@b[6]
743238384Sjkim	pxor	0x40($key),@b[3]
744238384Sjkim	pxor	0x50($key),@b[7]
745238384Sjkim	pxor	0x60($key),@b[2]
746238384Sjkim	pxor	0x70($key),@b[5]
747238384Sjkim___
748238384Sjkim}
749238384Sjkim
750238384Sjkimsub swapmove {
# Emit a 7-instruction sequence that exchanges bit groups between two
# registers.  Reading the template: with tmp = (($b >> $n) ^ $a) & $mask,
# the result is $a ^= tmp and $b ^= (tmp << $n).
#   $a, $b  registers updated in place
#   $n      shift count (psrlq/psllq, i.e. applied per 64-bit lane)
#   $mask   register holding the bit mask; only read
#   $t      scratch register, clobbered
# NOTE(review): the lexicals $a and $b share names with Perl's sort
# variables; no sort call appears in this sub.
751238384Sjkimmy ($a,$b,$n,$mask,$t)=@_;
752238384Sjkim$code.=<<___;
753238384Sjkim	movdqa	$b,$t
754238384Sjkim	psrlq	\$$n,$b
755238384Sjkim	pxor  	$a,$b
756238384Sjkim	pand	$mask,$b
757238384Sjkim	pxor	$b,$a
758238384Sjkim	psllq	\$$n,$b
759238384Sjkim	pxor	$t,$b
760238384Sjkim___
761238384Sjkim}
762238384Sjkimsub swapmove2x {
# Two swapmove sequences interleaved: one on the pair ($a0,$b0) with scratch
# $t0, the other on ($a1,$b1) with scratch $t1.  Both use the same shift
# count $n and the same $mask register.  Lines belonging to the second pair
# are mostly indented by an extra space in the template.
763238384Sjkimmy ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
764238384Sjkim$code.=<<___;
765238384Sjkim	movdqa	$b0,$t0
766238384Sjkim	psrlq	\$$n,$b0
767238384Sjkim	 movdqa	$b1,$t1
768238384Sjkim	 psrlq	\$$n,$b1
769238384Sjkim	pxor  	$a0,$b0
770238384Sjkim	 pxor  	$a1,$b1
771238384Sjkim	pand	$mask,$b0
772238384Sjkim	 pand	$mask,$b1
773238384Sjkim	pxor	$b0,$a0
774238384Sjkim	psllq	\$$n,$b0
775238384Sjkim	 pxor	$b1,$a1
776238384Sjkim	 psllq	\$$n,$b1
777238384Sjkim	pxor	$t0,$b0
778238384Sjkim	 pxor	$t1,$b1
779238384Sjkim___
780238384Sjkim}
781238384Sjkim
782238384Sjkimsub bitslice {
# Emit the bit-slicing transform as three passes of swapmove2x with shift
# counts 1, 2 and 4.
# Arguments: 8 state registers (used in reversed order as @x) followed by
# 4 scratch registers.  $t0 and $t1 hold masks loaded from the table at
# $const; $t2 and $t3 are the scratch registers handed to swapmove2x.
783238384Sjkimmy @x=reverse(@_[0..7]);
784238384Sjkimmy ($t0,$t1,$t2,$t3)=@_[8..11];
785238384Sjkim$code.=<<___;
786238384Sjkim	movdqa	0x00($const),$t0	# .LBS0
787238384Sjkim	movdqa	0x10($const),$t1	# .LBS1
788238384Sjkim___
# Pass 1: shift 1 with the mask in $t0 (.LBS0).
789238384Sjkim	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
790238384Sjkim	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
# $t0 is no longer needed for pass 1, so it is reloaded with .LBS2 here,
# ahead of pass 3.
791238384Sjkim$code.=<<___;
792238384Sjkim	movdqa	0x20($const),$t0	# .LBS2
793238384Sjkim___
# Pass 2: shift 2 with the mask in $t1 (.LBS1).
794238384Sjkim	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
795238384Sjkim	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
796238384Sjkim
# Pass 3: shift 4 with the mask in $t0 (.LBS2).
797238384Sjkim	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
798238384Sjkim	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
799238384Sjkim}
800238384Sjkim
# Emit _bsaes_encrypt8.  Prologue: point $const at the .LBS0 table, load the
# round-0 key from ($key) and advance $key by 0x10, XOR the key into
# @XMM[0..7] and shuffle each register with the mask at 0x50($const).
# A second label, _bsaes_encrypt8_bitslice, is emitted just before the
# bit-slicing code (presumably an alternate entry point used elsewhere in
# the file - not visible here).
801238384Sjkim$code.=<<___;
802238384Sjkim.text
803238384Sjkim
804238384Sjkim.extern	asm_AES_encrypt
805238384Sjkim.extern	asm_AES_decrypt
806238384Sjkim
807238384Sjkim.type	_bsaes_encrypt8,\@abi-omnipotent
808238384Sjkim.align	64
809238384Sjkim_bsaes_encrypt8:
810238384Sjkim	lea	.LBS0(%rip), $const	# constants table
811238384Sjkim
812238384Sjkim	movdqa	($key), @XMM[9]		# round 0 key
813238384Sjkim	lea	0x10($key), $key
814238384Sjkim	movdqa	0x50($const), @XMM[8]	# .LM0SR
815238384Sjkim	pxor	@XMM[9], @XMM[0]	# xor with round0 key
816238384Sjkim	pxor	@XMM[9], @XMM[1]
817290207Sjkim	pxor	@XMM[9], @XMM[2]
818290207Sjkim	pxor	@XMM[9], @XMM[3]
819238384Sjkim	 pshufb	@XMM[8], @XMM[0]
820238384Sjkim	 pshufb	@XMM[8], @XMM[1]
821290207Sjkim	pxor	@XMM[9], @XMM[4]
822290207Sjkim	pxor	@XMM[9], @XMM[5]
823238384Sjkim	 pshufb	@XMM[8], @XMM[2]
824238384Sjkim	 pshufb	@XMM[8], @XMM[3]
825290207Sjkim	pxor	@XMM[9], @XMM[6]
826290207Sjkim	pxor	@XMM[9], @XMM[7]
827238384Sjkim	 pshufb	@XMM[8], @XMM[4]
828238384Sjkim	 pshufb	@XMM[8], @XMM[5]
829238384Sjkim	 pshufb	@XMM[8], @XMM[6]
830238384Sjkim	 pshufb	@XMM[8], @XMM[7]
831238384Sjkim_bsaes_encrypt8_bitslice:
832238384Sjkim___
833238384Sjkim	&bitslice	(@XMM[0..7, 8..11]);
# Round loop.  The first pass jumps straight to .Lenc_sbox, skipping the
# ShiftRows code at the top of the loop.
834238384Sjkim$code.=<<___;
835238384Sjkim	dec	$rounds
836238384Sjkim	jmp	.Lenc_sbox
837238384Sjkim.align	16
838238384Sjkim.Lenc_loop:
839238384Sjkim___
840238384Sjkim	&ShiftRows	(@XMM[0..7, 8]);
841238384Sjkim$code.=".Lenc_sbox:\n";
842238384Sjkim	&Sbox		(@XMM[0..7, 8..15]);
# Leave the loop once the round counter drops below zero.
843238384Sjkim$code.=<<___;
844238384Sjkim	dec	$rounds
845238384Sjkim	jl	.Lenc_done
846238384Sjkim___
# The state is passed in the output order documented in Sbox.
847238384Sjkim	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
# Pick the shuffle mask for the next pass: .LSR normally, .LSRM0 when the
# counter has just reached zero.  The jnz relies on the flags from the dec
# above still being intact after the MixColumns instructions.
848238384Sjkim$code.=<<___;
849238384Sjkim	movdqa	0x30($const), @XMM[8]	# .LSR
850238384Sjkim	jnz	.Lenc_loop
851238384Sjkim	movdqa	0x40($const), @XMM[8]	# .LSRM0
852238384Sjkim	jmp	.Lenc_loop
853238384Sjkim.align	16
854238384Sjkim.Lenc_done:
855238384Sjkim___
856238384Sjkim	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
857238384Sjkim	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
# Epilogue of _bsaes_encrypt8 (XOR the last round key into all eight
# registers and return), followed in the same template by the prologue of
# _bsaes_decrypt8.  The decrypt prologue mirrors the encrypt one but takes
# its shuffle mask from -0x30($const).
858238384Sjkim$code.=<<___;
859238384Sjkim	movdqa	($key), @XMM[8]		# last round key
860238384Sjkim	pxor	@XMM[8], @XMM[4]
861238384Sjkim	pxor	@XMM[8], @XMM[6]
862238384Sjkim	pxor	@XMM[8], @XMM[3]
863238384Sjkim	pxor	@XMM[8], @XMM[7]
864238384Sjkim	pxor	@XMM[8], @XMM[2]
865238384Sjkim	pxor	@XMM[8], @XMM[5]
866238384Sjkim	pxor	@XMM[8], @XMM[0]
867238384Sjkim	pxor	@XMM[8], @XMM[1]
868238384Sjkim	ret
869238384Sjkim.size	_bsaes_encrypt8,.-_bsaes_encrypt8
870238384Sjkim
871238384Sjkim.type	_bsaes_decrypt8,\@abi-omnipotent
872238384Sjkim.align	64
873238384Sjkim_bsaes_decrypt8:
874238384Sjkim	lea	.LBS0(%rip), $const	# constants table
875238384Sjkim
876238384Sjkim	movdqa	($key), @XMM[9]		# round 0 key
877238384Sjkim	lea	0x10($key), $key
878238384Sjkim	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
879238384Sjkim	pxor	@XMM[9], @XMM[0]	# xor with round0 key
880238384Sjkim	pxor	@XMM[9], @XMM[1]
881290207Sjkim	pxor	@XMM[9], @XMM[2]
882290207Sjkim	pxor	@XMM[9], @XMM[3]
883238384Sjkim	 pshufb	@XMM[8], @XMM[0]
884238384Sjkim	 pshufb	@XMM[8], @XMM[1]
885290207Sjkim	pxor	@XMM[9], @XMM[4]
886290207Sjkim	pxor	@XMM[9], @XMM[5]
887238384Sjkim	 pshufb	@XMM[8], @XMM[2]
888238384Sjkim	 pshufb	@XMM[8], @XMM[3]
889290207Sjkim	pxor	@XMM[9], @XMM[6]
890290207Sjkim	pxor	@XMM[9], @XMM[7]
891238384Sjkim	 pshufb	@XMM[8], @XMM[4]
892238384Sjkim	 pshufb	@XMM[8], @XMM[5]
893238384Sjkim	 pshufb	@XMM[8], @XMM[6]
894238384Sjkim	 pshufb	@XMM[8], @XMM[7]
895238384Sjkim___
896238384Sjkim	&bitslice	(@XMM[0..7, 8..11]);
# Decrypt round loop; same structure as the encrypt loop, with InvSbox and
# InvMixColumns in place of Sbox and MixColumns.
897238384Sjkim$code.=<<___;
898238384Sjkim	dec	$rounds
899238384Sjkim	jmp	.Ldec_sbox
900238384Sjkim.align	16
901238384Sjkim.Ldec_loop:
902238384Sjkim___
903238384Sjkim	&ShiftRows	(@XMM[0..7, 8]);
904238384Sjkim$code.=".Ldec_sbox:\n";
905238384Sjkim	&InvSbox	(@XMM[0..7, 8..15]);
# Leave the loop once the round counter drops below zero.
906238384Sjkim$code.=<<___;
907238384Sjkim	dec	$rounds
908238384Sjkim	jl	.Ldec_done
909238384Sjkim___
# The state is passed in the output order documented in InvSbox.
910238384Sjkim	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
# Pick the shuffle mask for the next pass: .LISR normally, .LISRM0 when the
# counter has just reached zero.
911238384Sjkim$code.=<<___;
912238384Sjkim	movdqa	-0x10($const), @XMM[8]	# .LISR
913238384Sjkim	jnz	.Ldec_loop
914238384Sjkim	movdqa	-0x20($const), @XMM[8]	# .LISRM0
915238384Sjkim	jmp	.Ldec_loop
916238384Sjkim.align	16
917238384Sjkim.Ldec_done:
918238384Sjkim___
919238384Sjkim	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
# Epilogue of _bsaes_decrypt8: XOR the last round key into all eight
# registers and return.
920238384Sjkim$code.=<<___;
921238384Sjkim	movdqa	($key), @XMM[8]		# last round key
922238384Sjkim	pxor	@XMM[8], @XMM[6]
923238384Sjkim	pxor	@XMM[8], @XMM[4]
924238384Sjkim	pxor	@XMM[8], @XMM[2]
925238384Sjkim	pxor	@XMM[8], @XMM[7]
926238384Sjkim	pxor	@XMM[8], @XMM[3]
927238384Sjkim	pxor	@XMM[8], @XMM[5]
928238384Sjkim	pxor	@XMM[8], @XMM[0]
929238384Sjkim	pxor	@XMM[8], @XMM[1]
930238384Sjkim	ret
931238384Sjkim.size	_bsaes_decrypt8,.-_bsaes_decrypt8
932238384Sjkim___
933238384Sjkim}
934238384Sjkim{
935238384Sjkimmy ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
936238384Sjkim
# bitslice_key: append to $code a bit-slicing sequence over eight registers.
# Arguments: eight register names (used in reversed order as @x), followed by
# $bs0,$bs1,$bs2 (handed to swapmove/swapmove2x as their mask-position
# argument; presumably mask registers) and $t2,$t3 (presumably scratch).
# Some of the swap steps are replaced by plain movdqa register copies; the
# swap calls they stand in for are kept as comments beside them.
# NOTE(review): the commented-out lines inside the heredocs mention $t0/$t1,
# which this sub does not declare; they interpolate whatever is in scope when
# the heredoc is built.
937238384Sjkimsub bitslice_key {
938238384Sjkimmy @x=reverse(@_[0..7]);	# first eight arguments, order reversed
939238384Sjkimmy ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
940238384Sjkim
941238384Sjkim	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
942238384Sjkim$code.=<<___;
943238384Sjkim	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
944238384Sjkim	movdqa	@x[0], @x[2]
945238384Sjkim	movdqa	@x[1], @x[3]
946238384Sjkim___
947238384Sjkim	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
948238384Sjkim
949238384Sjkim	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
950238384Sjkim$code.=<<___;
951238384Sjkim	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
952238384Sjkim	movdqa	@x[0], @x[4]
953238384Sjkim	movdqa	@x[2], @x[6]
954238384Sjkim	movdqa	@x[1], @x[5]
955238384Sjkim	movdqa	@x[3], @x[7]
956238384Sjkim___
957238384Sjkim	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);	# both distance-4 steps share $bs2
958238384Sjkim	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
959238384Sjkim}
960238384Sjkim
# _bsaes_key_convert: expand a conventional AES key schedule into the
# bit-sliced layout consumed by the 8-block routines.
#   in:  $inp (%rcx)     -> conventional round keys, 16 bytes each (read unaligned)
#        $out (%rax)     -> destination; written with movdqa, so it must be
#                           16-byte aligned
#        $rounds (%r10d) =  number of rounds
#   out: the round 0 key is copied unchanged to the first 16 bytes; each of
#        the next rounds-1 keys is written as eight 16-byte bit planes
#        (128 bytes per key).  The last round key is NOT stored: it is left
#        in %xmm6, the .L63 constant is left in %xmm7 and %rax points at the
#        slot where the caller is expected to store the final key.
# Bit planes 0, 1, 5 and 6 are stored inverted ("pnot"); those are the bits
# set in 0x63 -- presumably this folds the S-box constant into the key
# schedule (TODO confirm against the S-box code).
961238384Sjkim$code.=<<___;
962238384Sjkim.type	_bsaes_key_convert,\@abi-omnipotent
963238384Sjkim.align	16
964238384Sjkim_bsaes_key_convert:
965238384Sjkim	lea	.Lmasks(%rip), $const
966238384Sjkim	movdqu	($inp), %xmm7		# load round 0 key
967238384Sjkim	lea	0x10($inp), $inp
968238384Sjkim	movdqa	0x00($const), %xmm0	# 0x01...
969238384Sjkim	movdqa	0x10($const), %xmm1	# 0x02...
970238384Sjkim	movdqa	0x20($const), %xmm2	# 0x04...
971238384Sjkim	movdqa	0x30($const), %xmm3	# 0x08...
972238384Sjkim	movdqa	0x40($const), %xmm4	# .LM0
973238384Sjkim	pcmpeqd	%xmm5, %xmm5		# .LNOT
974238384Sjkim
975238384Sjkim	movdqu	($inp), %xmm6		# load round 1 key
976238384Sjkim	movdqa	%xmm7, ($out)		# save round 0 key
977238384Sjkim	lea	0x10($out), $out
978238384Sjkim	dec	$rounds			# loop below converts rounds-1 keys
979238384Sjkim	jmp	.Lkey_loop
980238384Sjkim.align	16
981238384Sjkim.Lkey_loop:
982238384Sjkim	pshufb	%xmm4, %xmm6		# .LM0
983238384Sjkim
984238384Sjkim	movdqa	%xmm0,	%xmm8
985238384Sjkim	movdqa	%xmm1,	%xmm9
986238384Sjkim
987238384Sjkim	pand	%xmm6,	%xmm8
988238384Sjkim	pand	%xmm6,	%xmm9
989238384Sjkim	movdqa	%xmm2,	%xmm10
990238384Sjkim	pcmpeqb	%xmm0,	%xmm8		# 0xff in each byte whose masked bit is set, else 0x00
991238384Sjkim	psllq	\$4,	%xmm0		# 0x10...
992238384Sjkim	movdqa	%xmm3,	%xmm11
993238384Sjkim	pcmpeqb	%xmm1,	%xmm9
994238384Sjkim	psllq	\$4,	%xmm1		# 0x20...
995238384Sjkim
996238384Sjkim	pand	%xmm6,	%xmm10
997238384Sjkim	pand	%xmm6,	%xmm11
998238384Sjkim	movdqa	%xmm0,	%xmm12
999238384Sjkim	pcmpeqb	%xmm2,	%xmm10
1000238384Sjkim	psllq	\$4,	%xmm2		# 0x40...
1001238384Sjkim	movdqa	%xmm1,	%xmm13
1002238384Sjkim	pcmpeqb	%xmm3,	%xmm11
1003238384Sjkim	psllq	\$4,	%xmm3		# 0x80...
1004238384Sjkim
1005238384Sjkim	movdqa	%xmm2,	%xmm14
1006238384Sjkim	movdqa	%xmm3,	%xmm15
1007238384Sjkim	 pxor	%xmm5,	%xmm8		# "pnot"
1008238384Sjkim	 pxor	%xmm5,	%xmm9
1009238384Sjkim
1010238384Sjkim	pand	%xmm6,	%xmm12
1011238384Sjkim	pand	%xmm6,	%xmm13
1012238384Sjkim	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
1013238384Sjkim	pcmpeqb	%xmm0,	%xmm12
1014238384Sjkim	psrlq	\$4,	%xmm0		# 0x01...
1015238384Sjkim	 movdqa	%xmm9, 0x10($out)
1016238384Sjkim	pcmpeqb	%xmm1,	%xmm13
1017238384Sjkim	psrlq	\$4,	%xmm1		# 0x02...
1018238384Sjkim	 lea	0x10($inp), $inp
1019238384Sjkim
1020238384Sjkim	pand	%xmm6,	%xmm14
1021238384Sjkim	pand	%xmm6,	%xmm15
1022238384Sjkim	 movdqa	%xmm10, 0x20($out)
1023238384Sjkim	pcmpeqb	%xmm2,	%xmm14
1024238384Sjkim	psrlq	\$4,	%xmm2		# 0x04...
1025238384Sjkim	 movdqa	%xmm11, 0x30($out)
1026238384Sjkim	pcmpeqb	%xmm3,	%xmm15
1027238384Sjkim	psrlq	\$4,	%xmm3		# 0x08...
1028238384Sjkim	 movdqu	($inp), %xmm6		# load next round key
1029238384Sjkim
1030238384Sjkim	pxor	%xmm5, %xmm13		# "pnot"
1031238384Sjkim	pxor	%xmm5, %xmm14
1032238384Sjkim	movdqa	%xmm12, 0x40($out)
1033238384Sjkim	movdqa	%xmm13, 0x50($out)
1034238384Sjkim	movdqa	%xmm14, 0x60($out)
1035238384Sjkim	movdqa	%xmm15, 0x70($out)
1036238384Sjkim	lea	0x80($out),$out		# eight 16-byte planes written per key
1037238384Sjkim	dec	$rounds
1038238384Sjkim	jnz	.Lkey_loop
1039238384Sjkim
1040238384Sjkim	movdqa	0x50($const), %xmm7	# .L63
1041238384Sjkim	#movdqa	%xmm6, ($out)		# don't save last round key
1042238384Sjkim	ret
1043238384Sjkim.size	_bsaes_key_convert,.-_bsaes_key_convert
1044238384Sjkim___
1045238384Sjkim}
1046238384Sjkim
1047238384Sjkimif (0 && !$win64) {	# following four functions are unsupported interface
1048238384Sjkim			# used for benchmarking...
1049238384Sjkim$code.=<<___;
1050238384Sjkim.globl	bsaes_enc_key_convert
1051238384Sjkim.type	bsaes_enc_key_convert,\@function,2
1052238384Sjkim.align	16
1053238384Sjkimbsaes_enc_key_convert:
1054238384Sjkim	mov	240($inp),%r10d		# pass rounds
1055238384Sjkim	mov	$inp,%rcx		# pass key
1056238384Sjkim	mov	$out,%rax		# pass key schedule
1057238384Sjkim	call	_bsaes_key_convert
1058238384Sjkim	pxor	%xmm6,%xmm7		# fix up last round key
1059238384Sjkim	movdqa	%xmm7,(%rax)		# save last round key
1060238384Sjkim	ret
1061238384Sjkim.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
1062238384Sjkim
1063238384Sjkim.globl	bsaes_encrypt_128
1064238384Sjkim.type	bsaes_encrypt_128,\@function,4
1065238384Sjkim.align	16
1066238384Sjkimbsaes_encrypt_128:
1067238384Sjkim.Lenc128_loop:
1068238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1069238384Sjkim	movdqu	0x10($inp), @XMM[1]
1070238384Sjkim	movdqu	0x20($inp), @XMM[2]
1071238384Sjkim	movdqu	0x30($inp), @XMM[3]
1072238384Sjkim	movdqu	0x40($inp), @XMM[4]
1073238384Sjkim	movdqu	0x50($inp), @XMM[5]
1074238384Sjkim	movdqu	0x60($inp), @XMM[6]
1075238384Sjkim	movdqu	0x70($inp), @XMM[7]
1076238384Sjkim	mov	$key, %rax		# pass the $key
1077238384Sjkim	lea	0x80($inp), $inp
1078238384Sjkim	mov	\$10,%r10d
1079238384Sjkim
1080238384Sjkim	call	_bsaes_encrypt8
1081238384Sjkim
1082238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1083238384Sjkim	movdqu	@XMM[1], 0x10($out)
1084238384Sjkim	movdqu	@XMM[4], 0x20($out)
1085238384Sjkim	movdqu	@XMM[6], 0x30($out)
1086238384Sjkim	movdqu	@XMM[3], 0x40($out)
1087238384Sjkim	movdqu	@XMM[7], 0x50($out)
1088238384Sjkim	movdqu	@XMM[2], 0x60($out)
1089238384Sjkim	movdqu	@XMM[5], 0x70($out)
1090238384Sjkim	lea	0x80($out), $out
1091238384Sjkim	sub	\$0x80,$len
1092238384Sjkim	ja	.Lenc128_loop
1093238384Sjkim	ret
1094238384Sjkim.size	bsaes_encrypt_128,.-bsaes_encrypt_128
1095238384Sjkim
1096238384Sjkim.globl	bsaes_dec_key_convert
1097238384Sjkim.type	bsaes_dec_key_convert,\@function,2
1098238384Sjkim.align	16
1099238384Sjkimbsaes_dec_key_convert:
1100238384Sjkim	mov	240($inp),%r10d		# pass rounds
1101238384Sjkim	mov	$inp,%rcx		# pass key
1102238384Sjkim	mov	$out,%rax		# pass key schedule
1103238384Sjkim	call	_bsaes_key_convert
1104238384Sjkim	pxor	($out),%xmm7		# fix up round 0 key
1105238384Sjkim	movdqa	%xmm6,(%rax)		# save last round key
1106238384Sjkim	movdqa	%xmm7,($out)
1107238384Sjkim	ret
1108238384Sjkim.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
1109238384Sjkim
1110238384Sjkim.globl	bsaes_decrypt_128
1111238384Sjkim.type	bsaes_decrypt_128,\@function,4
1112238384Sjkim.align	16
1113238384Sjkimbsaes_decrypt_128:
1114238384Sjkim.Ldec128_loop:
1115238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1116238384Sjkim	movdqu	0x10($inp), @XMM[1]
1117238384Sjkim	movdqu	0x20($inp), @XMM[2]
1118238384Sjkim	movdqu	0x30($inp), @XMM[3]
1119238384Sjkim	movdqu	0x40($inp), @XMM[4]
1120238384Sjkim	movdqu	0x50($inp), @XMM[5]
1121238384Sjkim	movdqu	0x60($inp), @XMM[6]
1122238384Sjkim	movdqu	0x70($inp), @XMM[7]
1123238384Sjkim	mov	$key, %rax		# pass the $key
1124238384Sjkim	lea	0x80($inp), $inp
1125238384Sjkim	mov	\$10,%r10d
1126238384Sjkim
1127238384Sjkim	call	_bsaes_decrypt8
1128238384Sjkim
1129238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1130238384Sjkim	movdqu	@XMM[1], 0x10($out)
1131238384Sjkim	movdqu	@XMM[6], 0x20($out)
1132238384Sjkim	movdqu	@XMM[4], 0x30($out)
1133238384Sjkim	movdqu	@XMM[2], 0x40($out)
1134238384Sjkim	movdqu	@XMM[7], 0x50($out)
1135238384Sjkim	movdqu	@XMM[3], 0x60($out)
1136238384Sjkim	movdqu	@XMM[5], 0x70($out)
1137238384Sjkim	lea	0x80($out), $out
1138238384Sjkim	sub	\$0x80,$len
1139238384Sjkim	ja	.Ldec128_loop
1140238384Sjkim	ret
1141238384Sjkim.size	bsaes_decrypt_128,.-bsaes_decrypt_128
1142238384Sjkim___
1143238384Sjkim}
1144238384Sjkim{
1145238384Sjkim######################################################################
1146238384Sjkim#
1147238384Sjkim# OpenSSL interface
1148238384Sjkim#
1149238384Sjkimmy ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1150238384Sjkim						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1151238384Sjkimmy ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1152238384Sjkim
1153238384Sjkimif ($ecb) {
1154238384Sjkim$code.=<<___;
1155238384Sjkim.globl	bsaes_ecb_encrypt_blocks
1156238384Sjkim.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1157238384Sjkim.align	16
1158238384Sjkimbsaes_ecb_encrypt_blocks:
1159238384Sjkim	mov	%rsp, %rax
1160238384Sjkim.Lecb_enc_prologue:
1161238384Sjkim	push	%rbp
1162238384Sjkim	push	%rbx
1163238384Sjkim	push	%r12
1164238384Sjkim	push	%r13
1165238384Sjkim	push	%r14
1166238384Sjkim	push	%r15
1167238384Sjkim	lea	-0x48(%rsp),%rsp
1168238384Sjkim___
1169238384Sjkim$code.=<<___ if ($win64);
1170238384Sjkim	lea	-0xa0(%rsp), %rsp
1171238384Sjkim	movaps	%xmm6, 0x40(%rsp)
1172238384Sjkim	movaps	%xmm7, 0x50(%rsp)
1173238384Sjkim	movaps	%xmm8, 0x60(%rsp)
1174238384Sjkim	movaps	%xmm9, 0x70(%rsp)
1175238384Sjkim	movaps	%xmm10, 0x80(%rsp)
1176238384Sjkim	movaps	%xmm11, 0x90(%rsp)
1177238384Sjkim	movaps	%xmm12, 0xa0(%rsp)
1178238384Sjkim	movaps	%xmm13, 0xb0(%rsp)
1179238384Sjkim	movaps	%xmm14, 0xc0(%rsp)
1180238384Sjkim	movaps	%xmm15, 0xd0(%rsp)
1181238384Sjkim.Lecb_enc_body:
1182238384Sjkim___
1183238384Sjkim$code.=<<___;
1184238384Sjkim	mov	%rsp,%rbp		# backup %rsp
1185238384Sjkim	mov	240($arg4),%eax		# rounds
1186238384Sjkim	mov	$arg1,$inp		# backup arguments
1187238384Sjkim	mov	$arg2,$out
1188238384Sjkim	mov	$arg3,$len
1189238384Sjkim	mov	$arg4,$key
1190238384Sjkim	cmp	\$8,$arg3
1191238384Sjkim	jb	.Lecb_enc_short
1192238384Sjkim
1193238384Sjkim	mov	%eax,%ebx		# backup rounds
1194238384Sjkim	shl	\$7,%rax		# 128 bytes per inner round key
1195238384Sjkim	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1196238384Sjkim	sub	%rax,%rsp
1197238384Sjkim	mov	%rsp,%rax		# pass key schedule
1198238384Sjkim	mov	$key,%rcx		# pass key
1199238384Sjkim	mov	%ebx,%r10d		# pass rounds
1200238384Sjkim	call	_bsaes_key_convert
1201238384Sjkim	pxor	%xmm6,%xmm7		# fix up last round key
1202238384Sjkim	movdqa	%xmm7,(%rax)		# save last round key
1203238384Sjkim
1204238384Sjkim	sub	\$8,$len
1205238384Sjkim.Lecb_enc_loop:
1206238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1207238384Sjkim	movdqu	0x10($inp), @XMM[1]
1208238384Sjkim	movdqu	0x20($inp), @XMM[2]
1209238384Sjkim	movdqu	0x30($inp), @XMM[3]
1210238384Sjkim	movdqu	0x40($inp), @XMM[4]
1211238384Sjkim	movdqu	0x50($inp), @XMM[5]
1212238384Sjkim	mov	%rsp, %rax		# pass key schedule
1213238384Sjkim	movdqu	0x60($inp), @XMM[6]
1214238384Sjkim	mov	%ebx,%r10d		# pass rounds
1215238384Sjkim	movdqu	0x70($inp), @XMM[7]
1216238384Sjkim	lea	0x80($inp), $inp
1217238384Sjkim
1218238384Sjkim	call	_bsaes_encrypt8
1219238384Sjkim
1220238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1221238384Sjkim	movdqu	@XMM[1], 0x10($out)
1222238384Sjkim	movdqu	@XMM[4], 0x20($out)
1223238384Sjkim	movdqu	@XMM[6], 0x30($out)
1224238384Sjkim	movdqu	@XMM[3], 0x40($out)
1225238384Sjkim	movdqu	@XMM[7], 0x50($out)
1226238384Sjkim	movdqu	@XMM[2], 0x60($out)
1227238384Sjkim	movdqu	@XMM[5], 0x70($out)
1228238384Sjkim	lea	0x80($out), $out
1229238384Sjkim	sub	\$8,$len
1230238384Sjkim	jnc	.Lecb_enc_loop
1231238384Sjkim
1232238384Sjkim	add	\$8,$len
1233238384Sjkim	jz	.Lecb_enc_done
1234238384Sjkim
1235238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1236238384Sjkim	mov	%rsp, %rax		# pass key schedule
1237238384Sjkim	mov	%ebx,%r10d		# pass rounds
1238238384Sjkim	cmp	\$2,$len
1239238384Sjkim	jb	.Lecb_enc_one
1240238384Sjkim	movdqu	0x10($inp), @XMM[1]
1241238384Sjkim	je	.Lecb_enc_two
1242238384Sjkim	movdqu	0x20($inp), @XMM[2]
1243238384Sjkim	cmp	\$4,$len
1244238384Sjkim	jb	.Lecb_enc_three
1245238384Sjkim	movdqu	0x30($inp), @XMM[3]
1246238384Sjkim	je	.Lecb_enc_four
1247238384Sjkim	movdqu	0x40($inp), @XMM[4]
1248238384Sjkim	cmp	\$6,$len
1249238384Sjkim	jb	.Lecb_enc_five
1250238384Sjkim	movdqu	0x50($inp), @XMM[5]
1251238384Sjkim	je	.Lecb_enc_six
1252238384Sjkim	movdqu	0x60($inp), @XMM[6]
1253238384Sjkim	call	_bsaes_encrypt8
1254238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1255238384Sjkim	movdqu	@XMM[1], 0x10($out)
1256238384Sjkim	movdqu	@XMM[4], 0x20($out)
1257238384Sjkim	movdqu	@XMM[6], 0x30($out)
1258238384Sjkim	movdqu	@XMM[3], 0x40($out)
1259238384Sjkim	movdqu	@XMM[7], 0x50($out)
1260238384Sjkim	movdqu	@XMM[2], 0x60($out)
1261238384Sjkim	jmp	.Lecb_enc_done
1262238384Sjkim.align	16
1263238384Sjkim.Lecb_enc_six:
1264238384Sjkim	call	_bsaes_encrypt8
1265238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1266238384Sjkim	movdqu	@XMM[1], 0x10($out)
1267238384Sjkim	movdqu	@XMM[4], 0x20($out)
1268238384Sjkim	movdqu	@XMM[6], 0x30($out)
1269238384Sjkim	movdqu	@XMM[3], 0x40($out)
1270238384Sjkim	movdqu	@XMM[7], 0x50($out)
1271238384Sjkim	jmp	.Lecb_enc_done
1272238384Sjkim.align	16
1273238384Sjkim.Lecb_enc_five:
1274238384Sjkim	call	_bsaes_encrypt8
1275238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1276238384Sjkim	movdqu	@XMM[1], 0x10($out)
1277238384Sjkim	movdqu	@XMM[4], 0x20($out)
1278238384Sjkim	movdqu	@XMM[6], 0x30($out)
1279238384Sjkim	movdqu	@XMM[3], 0x40($out)
1280238384Sjkim	jmp	.Lecb_enc_done
1281238384Sjkim.align	16
1282238384Sjkim.Lecb_enc_four:
1283238384Sjkim	call	_bsaes_encrypt8
1284238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1285238384Sjkim	movdqu	@XMM[1], 0x10($out)
1286238384Sjkim	movdqu	@XMM[4], 0x20($out)
1287238384Sjkim	movdqu	@XMM[6], 0x30($out)
1288238384Sjkim	jmp	.Lecb_enc_done
1289238384Sjkim.align	16
1290238384Sjkim.Lecb_enc_three:
1291238384Sjkim	call	_bsaes_encrypt8
1292238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1293238384Sjkim	movdqu	@XMM[1], 0x10($out)
1294238384Sjkim	movdqu	@XMM[4], 0x20($out)
1295238384Sjkim	jmp	.Lecb_enc_done
1296238384Sjkim.align	16
1297238384Sjkim.Lecb_enc_two:
1298238384Sjkim	call	_bsaes_encrypt8
1299238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1300238384Sjkim	movdqu	@XMM[1], 0x10($out)
1301238384Sjkim	jmp	.Lecb_enc_done
1302238384Sjkim.align	16
1303238384Sjkim.Lecb_enc_one:
1304238384Sjkim	call	_bsaes_encrypt8
1305238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1306238384Sjkim	jmp	.Lecb_enc_done
1307238384Sjkim.align	16
1308238384Sjkim.Lecb_enc_short:			# fewer than 8 blocks: one block per call to asm_AES_encrypt
1309238384Sjkim	lea	($inp), $arg1
1310238384Sjkim	lea	($out), $arg2
1311238384Sjkim	lea	($key), $arg3		# the caller's key, no bit-sliced schedule is built on this path
1312238384Sjkim	call	asm_AES_encrypt
1313238384Sjkim	lea	16($inp), $inp
1314238384Sjkim	lea	16($out), $out
1315238384Sjkim	dec	$len			# NOTE(review): count is tested only after the first block,
1316238384Sjkim	jnz	.Lecb_enc_short		# so a zero block count is not handled - confirm callers never pass 0
1317238384Sjkim
1318238384Sjkim.Lecb_enc_done:
1319238384Sjkim	lea	(%rsp),%rax		# wipe cursor starts at the lowest stack address
1320238384Sjkim	pxor	%xmm0, %xmm0		# 16 zero bytes to store
1321238384Sjkim.Lecb_enc_bzero:			# wipe key schedule [if any]
1322238384Sjkim	movdqa	%xmm0, 0x00(%rax)
1323238384Sjkim	movdqa	%xmm0, 0x10(%rax)
1324238384Sjkim	lea	0x20(%rax), %rax	# advance 32 bytes per pass
1325238384Sjkim	cmp	%rax, %rbp		# flags reflect %rbp - %rax (AT&T operand order)
1326238384Sjkim	ja	.Lecb_enc_bzero		# continue while the cursor is still below %rbp; stops once it reaches the saved frame
1327238384Sjkim
1328238384Sjkim	lea	(%rbp),%rsp		# restore %rsp
1329238384Sjkim___
1330238384Sjkim$code.=<<___ if ($win64);
1331238384Sjkim	movaps	0x40(%rbp), %xmm6
1332238384Sjkim	movaps	0x50(%rbp), %xmm7
1333238384Sjkim	movaps	0x60(%rbp), %xmm8
1334238384Sjkim	movaps	0x70(%rbp), %xmm9
1335238384Sjkim	movaps	0x80(%rbp), %xmm10
1336238384Sjkim	movaps	0x90(%rbp), %xmm11
1337238384Sjkim	movaps	0xa0(%rbp), %xmm12
1338238384Sjkim	movaps	0xb0(%rbp), %xmm13
1339238384Sjkim	movaps	0xc0(%rbp), %xmm14
1340238384Sjkim	movaps	0xd0(%rbp), %xmm15
1341238384Sjkim	lea	0xa0(%rbp), %rsp
1342238384Sjkim___
1343238384Sjkim$code.=<<___;
1344238384Sjkim	mov	0x48(%rsp), %r15
1345238384Sjkim	mov	0x50(%rsp), %r14
1346238384Sjkim	mov	0x58(%rsp), %r13
1347238384Sjkim	mov	0x60(%rsp), %r12
1348238384Sjkim	mov	0x68(%rsp), %rbx
1349238384Sjkim	mov	0x70(%rsp), %rax
1350238384Sjkim	lea	0x78(%rsp), %rsp
1351238384Sjkim	mov	%rax, %rbp
1352238384Sjkim.Lecb_enc_epilogue:
1353238384Sjkim	ret
1354238384Sjkim.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1355238384Sjkim
1356238384Sjkim.globl	bsaes_ecb_decrypt_blocks
1357238384Sjkim.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1358238384Sjkim.align	16
1359238384Sjkimbsaes_ecb_decrypt_blocks:
1360238384Sjkim	mov	%rsp, %rax
1361238384Sjkim.Lecb_dec_prologue:
1362238384Sjkim	push	%rbp
1363238384Sjkim	push	%rbx
1364238384Sjkim	push	%r12
1365238384Sjkim	push	%r13
1366238384Sjkim	push	%r14
1367238384Sjkim	push	%r15
1368238384Sjkim	lea	-0x48(%rsp),%rsp
1369238384Sjkim___
1370238384Sjkim$code.=<<___ if ($win64);
1371238384Sjkim	lea	-0xa0(%rsp), %rsp
1372238384Sjkim	movaps	%xmm6, 0x40(%rsp)
1373238384Sjkim	movaps	%xmm7, 0x50(%rsp)
1374238384Sjkim	movaps	%xmm8, 0x60(%rsp)
1375238384Sjkim	movaps	%xmm9, 0x70(%rsp)
1376238384Sjkim	movaps	%xmm10, 0x80(%rsp)
1377238384Sjkim	movaps	%xmm11, 0x90(%rsp)
1378238384Sjkim	movaps	%xmm12, 0xa0(%rsp)
1379238384Sjkim	movaps	%xmm13, 0xb0(%rsp)
1380238384Sjkim	movaps	%xmm14, 0xc0(%rsp)
1381238384Sjkim	movaps	%xmm15, 0xd0(%rsp)
1382238384Sjkim.Lecb_dec_body:
1383238384Sjkim___
1384238384Sjkim$code.=<<___;
1385238384Sjkim	mov	%rsp,%rbp		# backup %rsp
1386238384Sjkim	mov	240($arg4),%eax		# rounds
1387238384Sjkim	mov	$arg1,$inp		# backup arguments
1388238384Sjkim	mov	$arg2,$out
1389238384Sjkim	mov	$arg3,$len
1390238384Sjkim	mov	$arg4,$key
1391238384Sjkim	cmp	\$8,$arg3
1392238384Sjkim	jb	.Lecb_dec_short
1393238384Sjkim
1394238384Sjkim	mov	%eax,%ebx		# backup rounds
1395238384Sjkim	shl	\$7,%rax		# 128 bytes per inner round key
1396238384Sjkim	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1397238384Sjkim	sub	%rax,%rsp
1398238384Sjkim	mov	%rsp,%rax		# pass key schedule
1399238384Sjkim	mov	$key,%rcx		# pass key
1400238384Sjkim	mov	%ebx,%r10d		# pass rounds
1401238384Sjkim	call	_bsaes_key_convert
1402238384Sjkim	pxor	(%rsp),%xmm7		# fix up 0 round key
1403238384Sjkim	movdqa	%xmm6,(%rax)		# save last round key
1404238384Sjkim	movdqa	%xmm7,(%rsp)
1405238384Sjkim
1406238384Sjkim	sub	\$8,$len
1407238384Sjkim.Lecb_dec_loop:
1408238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1409238384Sjkim	movdqu	0x10($inp), @XMM[1]
1410238384Sjkim	movdqu	0x20($inp), @XMM[2]
1411238384Sjkim	movdqu	0x30($inp), @XMM[3]
1412238384Sjkim	movdqu	0x40($inp), @XMM[4]
1413238384Sjkim	movdqu	0x50($inp), @XMM[5]
1414238384Sjkim	mov	%rsp, %rax		# pass key schedule
1415238384Sjkim	movdqu	0x60($inp), @XMM[6]
1416238384Sjkim	mov	%ebx,%r10d		# pass rounds
1417238384Sjkim	movdqu	0x70($inp), @XMM[7]
1418238384Sjkim	lea	0x80($inp), $inp
1419238384Sjkim
1420238384Sjkim	call	_bsaes_decrypt8
1421238384Sjkim
1422238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1423238384Sjkim	movdqu	@XMM[1], 0x10($out)
1424238384Sjkim	movdqu	@XMM[6], 0x20($out)
1425238384Sjkim	movdqu	@XMM[4], 0x30($out)
1426238384Sjkim	movdqu	@XMM[2], 0x40($out)
1427238384Sjkim	movdqu	@XMM[7], 0x50($out)
1428238384Sjkim	movdqu	@XMM[3], 0x60($out)
1429238384Sjkim	movdqu	@XMM[5], 0x70($out)
1430238384Sjkim	lea	0x80($out), $out
1431238384Sjkim	sub	\$8,$len
1432238384Sjkim	jnc	.Lecb_dec_loop
1433238384Sjkim
1434238384Sjkim	add	\$8,$len
1435238384Sjkim	jz	.Lecb_dec_done
1436238384Sjkim
1437238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1438238384Sjkim	mov	%rsp, %rax		# pass key schedule
1439238384Sjkim	mov	%ebx,%r10d		# pass rounds
1440238384Sjkim	cmp	\$2,$len
1441238384Sjkim	jb	.Lecb_dec_one
1442238384Sjkim	movdqu	0x10($inp), @XMM[1]
1443238384Sjkim	je	.Lecb_dec_two
1444238384Sjkim	movdqu	0x20($inp), @XMM[2]
1445238384Sjkim	cmp	\$4,$len
1446238384Sjkim	jb	.Lecb_dec_three
1447238384Sjkim	movdqu	0x30($inp), @XMM[3]
1448238384Sjkim	je	.Lecb_dec_four
1449238384Sjkim	movdqu	0x40($inp), @XMM[4]
1450238384Sjkim	cmp	\$6,$len
1451238384Sjkim	jb	.Lecb_dec_five
1452238384Sjkim	movdqu	0x50($inp), @XMM[5]
1453238384Sjkim	je	.Lecb_dec_six
1454238384Sjkim	movdqu	0x60($inp), @XMM[6]
1455238384Sjkim	call	_bsaes_decrypt8
1456238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1457238384Sjkim	movdqu	@XMM[1], 0x10($out)
1458238384Sjkim	movdqu	@XMM[6], 0x20($out)
1459238384Sjkim	movdqu	@XMM[4], 0x30($out)
1460238384Sjkim	movdqu	@XMM[2], 0x40($out)
1461238384Sjkim	movdqu	@XMM[7], 0x50($out)
1462238384Sjkim	movdqu	@XMM[3], 0x60($out)
1463238384Sjkim	jmp	.Lecb_dec_done
1464238384Sjkim.align	16
1465238384Sjkim.Lecb_dec_six:
1466238384Sjkim	call	_bsaes_decrypt8
1467238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1468238384Sjkim	movdqu	@XMM[1], 0x10($out)
1469238384Sjkim	movdqu	@XMM[6], 0x20($out)
1470238384Sjkim	movdqu	@XMM[4], 0x30($out)
1471238384Sjkim	movdqu	@XMM[2], 0x40($out)
1472238384Sjkim	movdqu	@XMM[7], 0x50($out)
1473238384Sjkim	jmp	.Lecb_dec_done
1474238384Sjkim.align	16
1475238384Sjkim.Lecb_dec_five:
1476238384Sjkim	call	_bsaes_decrypt8
1477238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1478238384Sjkim	movdqu	@XMM[1], 0x10($out)
1479238384Sjkim	movdqu	@XMM[6], 0x20($out)
1480238384Sjkim	movdqu	@XMM[4], 0x30($out)
1481238384Sjkim	movdqu	@XMM[2], 0x40($out)
1482238384Sjkim	jmp	.Lecb_dec_done
1483238384Sjkim.align	16
1484238384Sjkim.Lecb_dec_four:
1485238384Sjkim	call	_bsaes_decrypt8
1486238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1487238384Sjkim	movdqu	@XMM[1], 0x10($out)
1488238384Sjkim	movdqu	@XMM[6], 0x20($out)
1489238384Sjkim	movdqu	@XMM[4], 0x30($out)
1490238384Sjkim	jmp	.Lecb_dec_done
1491238384Sjkim.align	16
1492238384Sjkim.Lecb_dec_three:
1493238384Sjkim	call	_bsaes_decrypt8
1494238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1495238384Sjkim	movdqu	@XMM[1], 0x10($out)
1496238384Sjkim	movdqu	@XMM[6], 0x20($out)
1497238384Sjkim	jmp	.Lecb_dec_done
1498238384Sjkim.align	16
1499238384Sjkim.Lecb_dec_two:
1500238384Sjkim	call	_bsaes_decrypt8
1501238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1502238384Sjkim	movdqu	@XMM[1], 0x10($out)
1503238384Sjkim	jmp	.Lecb_dec_done
1504238384Sjkim.align	16
1505238384Sjkim.Lecb_dec_one:
1506238384Sjkim	call	_bsaes_decrypt8
1507238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1508238384Sjkim	jmp	.Lecb_dec_done
1509238384Sjkim.align	16
1510238384Sjkim.Lecb_dec_short:			# fewer than 8 blocks: one block per call to asm_AES_decrypt
1511238384Sjkim	lea	($inp), $arg1
1512238384Sjkim	lea	($out), $arg2
1513238384Sjkim	lea	($key), $arg3		# the caller's key, no bit-sliced schedule is built on this path
1514238384Sjkim	call	asm_AES_decrypt
1515238384Sjkim	lea	16($inp), $inp
1516238384Sjkim	lea	16($out), $out
1517238384Sjkim	dec	$len			# NOTE(review): count is tested only after the first block,
1518238384Sjkim	jnz	.Lecb_dec_short		# so a zero block count is not handled - confirm callers never pass 0
1519238384Sjkim
1520238384Sjkim.Lecb_dec_done:
1521238384Sjkim	lea	(%rsp),%rax		# wipe cursor starts at the lowest stack address
1522238384Sjkim	pxor	%xmm0, %xmm0		# 16 zero bytes to store
1523238384Sjkim.Lecb_dec_bzero:			# wipe key schedule [if any]
1524238384Sjkim	movdqa	%xmm0, 0x00(%rax)
1525238384Sjkim	movdqa	%xmm0, 0x10(%rax)
1526238384Sjkim	lea	0x20(%rax), %rax	# advance 32 bytes per pass
1527238384Sjkim	cmp	%rax, %rbp		# flags reflect %rbp - %rax (AT&T operand order)
1528238384Sjkim	ja	.Lecb_dec_bzero		# continue while the cursor is still below %rbp; stops once it reaches the saved frame
1529238384Sjkim
1530238384Sjkim	lea	(%rbp),%rsp		# restore %rsp
1531238384Sjkim___
1532238384Sjkim$code.=<<___ if ($win64);
1533238384Sjkim	movaps	0x40(%rbp), %xmm6
1534238384Sjkim	movaps	0x50(%rbp), %xmm7
1535238384Sjkim	movaps	0x60(%rbp), %xmm8
1536238384Sjkim	movaps	0x70(%rbp), %xmm9
1537238384Sjkim	movaps	0x80(%rbp), %xmm10
1538238384Sjkim	movaps	0x90(%rbp), %xmm11
1539238384Sjkim	movaps	0xa0(%rbp), %xmm12
1540238384Sjkim	movaps	0xb0(%rbp), %xmm13
1541238384Sjkim	movaps	0xc0(%rbp), %xmm14
1542238384Sjkim	movaps	0xd0(%rbp), %xmm15
1543238384Sjkim	lea	0xa0(%rbp), %rsp
1544238384Sjkim___
1545238384Sjkim$code.=<<___;
1546238384Sjkim	mov	0x48(%rsp), %r15
1547238384Sjkim	mov	0x50(%rsp), %r14
1548238384Sjkim	mov	0x58(%rsp), %r13
1549238384Sjkim	mov	0x60(%rsp), %r12
1550238384Sjkim	mov	0x68(%rsp), %rbx
1551238384Sjkim	mov	0x70(%rsp), %rax
1552238384Sjkim	lea	0x78(%rsp), %rsp
1553238384Sjkim	mov	%rax, %rbp
1554238384Sjkim.Lecb_dec_epilogue:
1555238384Sjkim	ret
1556238384Sjkim.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1557238384Sjkim___
1558238384Sjkim}
1559238384Sjkim$code.=<<___;
1560238384Sjkim.extern	asm_AES_cbc_encrypt
1561238384Sjkim.globl	bsaes_cbc_encrypt
1562238384Sjkim.type	bsaes_cbc_encrypt,\@abi-omnipotent
1563238384Sjkim.align	16
1564238384Sjkimbsaes_cbc_encrypt:
1565238384Sjkim___
1566238384Sjkim$code.=<<___ if ($win64);
1567238384Sjkim	mov	48(%rsp),$arg6		# pull direction flag
1568238384Sjkim___
1569238384Sjkim$code.=<<___;
1570238384Sjkim	cmp	\$0,$arg6
1571238384Sjkim	jne	asm_AES_cbc_encrypt
1572238384Sjkim	cmp	\$128,$arg3
1573238384Sjkim	jb	asm_AES_cbc_encrypt
1574238384Sjkim
1575238384Sjkim	mov	%rsp, %rax
1576238384Sjkim.Lcbc_dec_prologue:
1577238384Sjkim	push	%rbp
1578238384Sjkim	push	%rbx
1579238384Sjkim	push	%r12
1580238384Sjkim	push	%r13
1581238384Sjkim	push	%r14
1582238384Sjkim	push	%r15
1583238384Sjkim	lea	-0x48(%rsp), %rsp
1584238384Sjkim___
1585238384Sjkim$code.=<<___ if ($win64);
1586238384Sjkim	mov	0xa0(%rsp),$arg5	# pull ivp
1587238384Sjkim	lea	-0xa0(%rsp), %rsp
1588238384Sjkim	movaps	%xmm6, 0x40(%rsp)
1589238384Sjkim	movaps	%xmm7, 0x50(%rsp)
1590238384Sjkim	movaps	%xmm8, 0x60(%rsp)
1591238384Sjkim	movaps	%xmm9, 0x70(%rsp)
1592238384Sjkim	movaps	%xmm10, 0x80(%rsp)
1593238384Sjkim	movaps	%xmm11, 0x90(%rsp)
1594238384Sjkim	movaps	%xmm12, 0xa0(%rsp)
1595238384Sjkim	movaps	%xmm13, 0xb0(%rsp)
1596238384Sjkim	movaps	%xmm14, 0xc0(%rsp)
1597238384Sjkim	movaps	%xmm15, 0xd0(%rsp)
1598238384Sjkim.Lcbc_dec_body:
1599238384Sjkim___
1600238384Sjkim$code.=<<___;
1601238384Sjkim	mov	%rsp, %rbp		# backup %rsp
1602238384Sjkim	mov	240($arg4), %eax	# rounds
1603238384Sjkim	mov	$arg1, $inp		# backup arguments
1604238384Sjkim	mov	$arg2, $out
1605238384Sjkim	mov	$arg3, $len
1606238384Sjkim	mov	$arg4, $key
1607238384Sjkim	mov	$arg5, %rbx
1608238384Sjkim	shr	\$4, $len		# bytes to blocks
1609238384Sjkim
1610238384Sjkim	mov	%eax, %edx		# rounds
1611238384Sjkim	shl	\$7, %rax		# 128 bytes per inner round key
1612238384Sjkim	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1613238384Sjkim	sub	%rax, %rsp
1614238384Sjkim
1615238384Sjkim	mov	%rsp, %rax		# pass key schedule
1616238384Sjkim	mov	$key, %rcx		# pass key
1617238384Sjkim	mov	%edx, %r10d		# pass rounds
1618238384Sjkim	call	_bsaes_key_convert
1619238384Sjkim	pxor	(%rsp),%xmm7		# fix up 0 round key
1620238384Sjkim	movdqa	%xmm6,(%rax)		# save last round key
1621238384Sjkim	movdqa	%xmm7,(%rsp)
1622238384Sjkim
1623238384Sjkim	movdqu	(%rbx), @XMM[15]	# load IV
1624238384Sjkim	sub	\$8,$len
1625238384Sjkim.Lcbc_dec_loop:
1626238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1627238384Sjkim	movdqu	0x10($inp), @XMM[1]
1628238384Sjkim	movdqu	0x20($inp), @XMM[2]
1629238384Sjkim	movdqu	0x30($inp), @XMM[3]
1630238384Sjkim	movdqu	0x40($inp), @XMM[4]
1631238384Sjkim	movdqu	0x50($inp), @XMM[5]
1632238384Sjkim	mov	%rsp, %rax		# pass key schedule
1633238384Sjkim	movdqu	0x60($inp), @XMM[6]
1634238384Sjkim	mov	%edx,%r10d		# pass rounds
1635238384Sjkim	movdqu	0x70($inp), @XMM[7]
1636238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1637238384Sjkim
1638238384Sjkim	call	_bsaes_decrypt8
1639238384Sjkim
1640238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1641238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1642238384Sjkim	movdqu	0x10($inp), @XMM[9]
1643238384Sjkim	pxor	@XMM[8], @XMM[1]
1644238384Sjkim	movdqu	0x20($inp), @XMM[10]
1645238384Sjkim	pxor	@XMM[9], @XMM[6]
1646238384Sjkim	movdqu	0x30($inp), @XMM[11]
1647238384Sjkim	pxor	@XMM[10], @XMM[4]
1648238384Sjkim	movdqu	0x40($inp), @XMM[12]
1649238384Sjkim	pxor	@XMM[11], @XMM[2]
1650238384Sjkim	movdqu	0x50($inp), @XMM[13]
1651238384Sjkim	pxor	@XMM[12], @XMM[7]
1652238384Sjkim	movdqu	0x60($inp), @XMM[14]
1653238384Sjkim	pxor	@XMM[13], @XMM[3]
1654238384Sjkim	movdqu	0x70($inp), @XMM[15]	# IV
1655238384Sjkim	pxor	@XMM[14], @XMM[5]
1656238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1657238384Sjkim	lea	0x80($inp), $inp
1658238384Sjkim	movdqu	@XMM[1], 0x10($out)
1659238384Sjkim	movdqu	@XMM[6], 0x20($out)
1660238384Sjkim	movdqu	@XMM[4], 0x30($out)
1661238384Sjkim	movdqu	@XMM[2], 0x40($out)
1662238384Sjkim	movdqu	@XMM[7], 0x50($out)
1663238384Sjkim	movdqu	@XMM[3], 0x60($out)
1664238384Sjkim	movdqu	@XMM[5], 0x70($out)
1665238384Sjkim	lea	0x80($out), $out
1666238384Sjkim	sub	\$8,$len
1667238384Sjkim	jnc	.Lcbc_dec_loop
1668238384Sjkim
1669238384Sjkim	add	\$8,$len
1670238384Sjkim	jz	.Lcbc_dec_done
1671238384Sjkim
1672238384Sjkim	movdqu	0x00($inp), @XMM[0]	# load input
1673238384Sjkim	mov	%rsp, %rax		# pass key schedule
1674238384Sjkim	mov	%edx, %r10d		# pass rounds
1675238384Sjkim	cmp	\$2,$len
1676238384Sjkim	jb	.Lcbc_dec_one
1677238384Sjkim	movdqu	0x10($inp), @XMM[1]
1678238384Sjkim	je	.Lcbc_dec_two
1679238384Sjkim	movdqu	0x20($inp), @XMM[2]
1680238384Sjkim	cmp	\$4,$len
1681238384Sjkim	jb	.Lcbc_dec_three
1682238384Sjkim	movdqu	0x30($inp), @XMM[3]
1683238384Sjkim	je	.Lcbc_dec_four
1684238384Sjkim	movdqu	0x40($inp), @XMM[4]
1685238384Sjkim	cmp	\$6,$len
1686238384Sjkim	jb	.Lcbc_dec_five
1687238384Sjkim	movdqu	0x50($inp), @XMM[5]
1688238384Sjkim	je	.Lcbc_dec_six
1689238384Sjkim	movdqu	0x60($inp), @XMM[6]
1690238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1691238384Sjkim	call	_bsaes_decrypt8
1692238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1693238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1694238384Sjkim	movdqu	0x10($inp), @XMM[9]
1695238384Sjkim	pxor	@XMM[8], @XMM[1]
1696238384Sjkim	movdqu	0x20($inp), @XMM[10]
1697238384Sjkim	pxor	@XMM[9], @XMM[6]
1698238384Sjkim	movdqu	0x30($inp), @XMM[11]
1699238384Sjkim	pxor	@XMM[10], @XMM[4]
1700238384Sjkim	movdqu	0x40($inp), @XMM[12]
1701238384Sjkim	pxor	@XMM[11], @XMM[2]
1702238384Sjkim	movdqu	0x50($inp), @XMM[13]
1703238384Sjkim	pxor	@XMM[12], @XMM[7]
1704238384Sjkim	movdqu	0x60($inp), @XMM[15]	# IV
1705238384Sjkim	pxor	@XMM[13], @XMM[3]
1706238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1707238384Sjkim	movdqu	@XMM[1], 0x10($out)
1708238384Sjkim	movdqu	@XMM[6], 0x20($out)
1709238384Sjkim	movdqu	@XMM[4], 0x30($out)
1710238384Sjkim	movdqu	@XMM[2], 0x40($out)
1711238384Sjkim	movdqu	@XMM[7], 0x50($out)
1712238384Sjkim	movdqu	@XMM[3], 0x60($out)
1713238384Sjkim	jmp	.Lcbc_dec_done
1714238384Sjkim.align	16
1715238384Sjkim.Lcbc_dec_six:
1716238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1717238384Sjkim	call	_bsaes_decrypt8
1718238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1719238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1720238384Sjkim	movdqu	0x10($inp), @XMM[9]
1721238384Sjkim	pxor	@XMM[8], @XMM[1]
1722238384Sjkim	movdqu	0x20($inp), @XMM[10]
1723238384Sjkim	pxor	@XMM[9], @XMM[6]
1724238384Sjkim	movdqu	0x30($inp), @XMM[11]
1725238384Sjkim	pxor	@XMM[10], @XMM[4]
1726238384Sjkim	movdqu	0x40($inp), @XMM[12]
1727238384Sjkim	pxor	@XMM[11], @XMM[2]
1728238384Sjkim	movdqu	0x50($inp), @XMM[15]	# IV
1729238384Sjkim	pxor	@XMM[12], @XMM[7]
1730238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1731238384Sjkim	movdqu	@XMM[1], 0x10($out)
1732238384Sjkim	movdqu	@XMM[6], 0x20($out)
1733238384Sjkim	movdqu	@XMM[4], 0x30($out)
1734238384Sjkim	movdqu	@XMM[2], 0x40($out)
1735238384Sjkim	movdqu	@XMM[7], 0x50($out)
1736238384Sjkim	jmp	.Lcbc_dec_done
1737238384Sjkim.align	16
1738238384Sjkim.Lcbc_dec_five:
1739238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1740238384Sjkim	call	_bsaes_decrypt8
1741238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1742238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1743238384Sjkim	movdqu	0x10($inp), @XMM[9]
1744238384Sjkim	pxor	@XMM[8], @XMM[1]
1745238384Sjkim	movdqu	0x20($inp), @XMM[10]
1746238384Sjkim	pxor	@XMM[9], @XMM[6]
1747238384Sjkim	movdqu	0x30($inp), @XMM[11]
1748238384Sjkim	pxor	@XMM[10], @XMM[4]
1749238384Sjkim	movdqu	0x40($inp), @XMM[15]	# IV
1750238384Sjkim	pxor	@XMM[11], @XMM[2]
1751238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1752238384Sjkim	movdqu	@XMM[1], 0x10($out)
1753238384Sjkim	movdqu	@XMM[6], 0x20($out)
1754238384Sjkim	movdqu	@XMM[4], 0x30($out)
1755238384Sjkim	movdqu	@XMM[2], 0x40($out)
1756238384Sjkim	jmp	.Lcbc_dec_done
1757238384Sjkim.align	16
1758238384Sjkim.Lcbc_dec_four:
1759238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1760238384Sjkim	call	_bsaes_decrypt8
1761238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1762238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1763238384Sjkim	movdqu	0x10($inp), @XMM[9]
1764238384Sjkim	pxor	@XMM[8], @XMM[1]
1765238384Sjkim	movdqu	0x20($inp), @XMM[10]
1766238384Sjkim	pxor	@XMM[9], @XMM[6]
1767238384Sjkim	movdqu	0x30($inp), @XMM[15]	# IV
1768238384Sjkim	pxor	@XMM[10], @XMM[4]
1769238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1770238384Sjkim	movdqu	@XMM[1], 0x10($out)
1771238384Sjkim	movdqu	@XMM[6], 0x20($out)
1772238384Sjkim	movdqu	@XMM[4], 0x30($out)
1773238384Sjkim	jmp	.Lcbc_dec_done
1774238384Sjkim.align	16
1775238384Sjkim.Lcbc_dec_three:
1776238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1777238384Sjkim	call	_bsaes_decrypt8
1778238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1779238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1780238384Sjkim	movdqu	0x10($inp), @XMM[9]
1781238384Sjkim	pxor	@XMM[8], @XMM[1]
1782238384Sjkim	movdqu	0x20($inp), @XMM[15]	# IV
1783238384Sjkim	pxor	@XMM[9], @XMM[6]
1784238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1785238384Sjkim	movdqu	@XMM[1], 0x10($out)
1786238384Sjkim	movdqu	@XMM[6], 0x20($out)
1787238384Sjkim	jmp	.Lcbc_dec_done
1788238384Sjkim.align	16
1789238384Sjkim.Lcbc_dec_two:
1790238384Sjkim	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1791238384Sjkim	call	_bsaes_decrypt8
1792238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1793238384Sjkim	movdqu	0x00($inp), @XMM[8]	# re-load input
1794238384Sjkim	movdqu	0x10($inp), @XMM[15]	# IV
1795238384Sjkim	pxor	@XMM[8], @XMM[1]
1796238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1797238384Sjkim	movdqu	@XMM[1], 0x10($out)
1798238384Sjkim	jmp	.Lcbc_dec_done
1799238384Sjkim.align	16
1800238384Sjkim.Lcbc_dec_one:
1801238384Sjkim	lea	($inp), $arg1
1802238384Sjkim	lea	0x20(%rbp), $arg2	# buffer output
1803238384Sjkim	lea	($key), $arg3
1804238384Sjkim	call	asm_AES_decrypt		# doesn't touch %xmm
1805238384Sjkim	pxor	0x20(%rbp), @XMM[15]	# ^= IV
1806238384Sjkim	movdqu	@XMM[15], ($out)	# write output
1807238384Sjkim	movdqa	@XMM[0], @XMM[15]	# IV
1808238384Sjkim
1809238384Sjkim.Lcbc_dec_done:
1810238384Sjkim	movdqu	@XMM[15], (%rbx)	# return IV
1811238384Sjkim	lea	(%rsp), %rax
1812238384Sjkim	pxor	%xmm0, %xmm0
1813238384Sjkim.Lcbc_dec_bzero:			# wipe key schedule [if any]
1814238384Sjkim	movdqa	%xmm0, 0x00(%rax)
1815238384Sjkim	movdqa	%xmm0, 0x10(%rax)
1816238384Sjkim	lea	0x20(%rax), %rax
1817238384Sjkim	cmp	%rax, %rbp
1818238384Sjkim	ja	.Lcbc_dec_bzero
1819238384Sjkim
1820238384Sjkim	lea	(%rbp),%rsp		# restore %rsp
1821238384Sjkim___
1822238384Sjkim$code.=<<___ if ($win64);
1823238384Sjkim	movaps	0x40(%rbp), %xmm6
1824238384Sjkim	movaps	0x50(%rbp), %xmm7
1825238384Sjkim	movaps	0x60(%rbp), %xmm8
1826238384Sjkim	movaps	0x70(%rbp), %xmm9
1827238384Sjkim	movaps	0x80(%rbp), %xmm10
1828238384Sjkim	movaps	0x90(%rbp), %xmm11
1829238384Sjkim	movaps	0xa0(%rbp), %xmm12
1830238384Sjkim	movaps	0xb0(%rbp), %xmm13
1831238384Sjkim	movaps	0xc0(%rbp), %xmm14
1832238384Sjkim	movaps	0xd0(%rbp), %xmm15
1833238384Sjkim	lea	0xa0(%rbp), %rsp
1834238384Sjkim___
1835238384Sjkim$code.=<<___;
1836238384Sjkim	mov	0x48(%rsp), %r15
1837238384Sjkim	mov	0x50(%rsp), %r14
1838238384Sjkim	mov	0x58(%rsp), %r13
1839238384Sjkim	mov	0x60(%rsp), %r12
1840238384Sjkim	mov	0x68(%rsp), %rbx
1841238384Sjkim	mov	0x70(%rsp), %rax
1842238384Sjkim	lea	0x78(%rsp), %rsp
1843238384Sjkim	mov	%rax, %rbp
1844238384Sjkim.Lcbc_dec_epilogue:
1845238384Sjkim	ret
1846238384Sjkim.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1847238384Sjkim
# CTR-mode encryption of whole 16-byte blocks with a 32-bit big-endian
# block counter kept in the last word of the counter block.
# Arguments as consumed below: 1st = input, 2nd = output, 3rd = number of
# blocks, 4th = key (round count is read from offset 240), 5th = pointer
# to the 16-byte counter block (pulled from the stack on Win64).
# Fewer than 8 blocks: one asm_AES_encrypt call per block (.Lctr_enc_short).
# Otherwise the key is converted to a bit-sliced schedule on the stack and
# 8 counter blocks are encrypted per iteration.  The stack area between
# rsp and rbp is zeroed before returning.
1848238384Sjkim.globl	bsaes_ctr32_encrypt_blocks
1849238384Sjkim.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1850238384Sjkim.align	16
1851238384Sjkimbsaes_ctr32_encrypt_blocks:
1852238384Sjkim	mov	%rsp, %rax
1853238384Sjkim.Lctr_enc_prologue:
1854238384Sjkim	push	%rbp
1855238384Sjkim	push	%rbx
1856238384Sjkim	push	%r12
1857238384Sjkim	push	%r13
1858238384Sjkim	push	%r14
1859238384Sjkim	push	%r15
1860238384Sjkim	lea	-0x48(%rsp), %rsp	# 0x48 bytes of local scratch
1861238384Sjkim___
1862238384Sjkim$code.=<<___ if ($win64);
1863238384Sjkim	mov	0xa0(%rsp),$arg5	# pull ivp
1864238384Sjkim	lea	-0xa0(%rsp), %rsp	# room for xmm6-xmm15
1865238384Sjkim	movaps	%xmm6, 0x40(%rsp)
1866238384Sjkim	movaps	%xmm7, 0x50(%rsp)
1867238384Sjkim	movaps	%xmm8, 0x60(%rsp)
1868238384Sjkim	movaps	%xmm9, 0x70(%rsp)
1869238384Sjkim	movaps	%xmm10, 0x80(%rsp)
1870238384Sjkim	movaps	%xmm11, 0x90(%rsp)
1871238384Sjkim	movaps	%xmm12, 0xa0(%rsp)
1872238384Sjkim	movaps	%xmm13, 0xb0(%rsp)
1873238384Sjkim	movaps	%xmm14, 0xc0(%rsp)
1874238384Sjkim	movaps	%xmm15, 0xd0(%rsp)
1875238384Sjkim.Lctr_enc_body:
1876238384Sjkim___
1877238384Sjkim$code.=<<___;
1878238384Sjkim	mov	%rsp, %rbp		# backup %rsp
1879238384Sjkim	movdqu	($arg5), %xmm0		# load counter
1880238384Sjkim	mov	240($arg4), %eax	# rounds
1881238384Sjkim	mov	$arg1, $inp		# backup arguments
1882238384Sjkim	mov	$arg2, $out
1883238384Sjkim	mov	$arg3, $len
1884238384Sjkim	mov	$arg4, $key
1885238384Sjkim	movdqa	%xmm0, 0x20(%rbp)	# copy counter
1886238384Sjkim	cmp	\$8, $arg3		# fewer than 8 blocks?
1887238384Sjkim	jb	.Lctr_enc_short		# yes: per-block path, no key conversion
1888238384Sjkim
1889238384Sjkim	mov	%eax, %ebx		# rounds
1890238384Sjkim	shl	\$7, %rax		# 128 bytes per inner round key
1891238384Sjkim	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1892238384Sjkim	sub	%rax, %rsp
1893238384Sjkim
1894238384Sjkim	mov	%rsp, %rax		# pass key schedule
1895238384Sjkim	mov	$key, %rcx		# pass key
1896238384Sjkim	mov	%ebx, %r10d		# pass rounds
1897238384Sjkim	call	_bsaes_key_convert
1898238384Sjkim	pxor	%xmm6,%xmm7		# fix up last round key
1899238384Sjkim	movdqa	%xmm7,(%rax)		# save last round key
1900238384Sjkim
1901238384Sjkim	movdqa	(%rsp), @XMM[9]		# load round0 key
1902238384Sjkim	lea	.LADD1(%rip), %r11
1903238384Sjkim	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1904238384Sjkim	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1905238384Sjkim	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1906238384Sjkim	pshufb	@XMM[8], @XMM[0]
1907238384Sjkim	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
1908238384Sjkim	jmp	.Lctr_enc_loop
1909238384Sjkim.align	16
1910238384Sjkim.Lctr_enc_loop:
1911238384Sjkim	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1912238384Sjkim	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1913238384Sjkim	movdqa	@XMM[0], @XMM[2]
1914238384Sjkim	paddd	0x00(%r11), @XMM[1]	# .LADD1
1915238384Sjkim	movdqa	@XMM[0], @XMM[3]
1916238384Sjkim	paddd	0x10(%r11), @XMM[2]	# .LADD2
1917238384Sjkim	movdqa	@XMM[0], @XMM[4]
1918238384Sjkim	paddd	0x20(%r11), @XMM[3]	# .LADD3
1919238384Sjkim	movdqa	@XMM[0], @XMM[5]
1920238384Sjkim	paddd	0x30(%r11), @XMM[4]	# .LADD4
1921238384Sjkim	movdqa	@XMM[0], @XMM[6]
1922238384Sjkim	paddd	0x40(%r11), @XMM[5]	# .LADD5
1923238384Sjkim	movdqa	@XMM[0], @XMM[7]
1924238384Sjkim	paddd	0x50(%r11), @XMM[6]	# .LADD6
1925238384Sjkim	paddd	0x60(%r11), @XMM[7]	# .LADD7
1926238384Sjkim
1927238384Sjkim	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
1928238384Sjkim	# to flip byte order in 32-bit counter
1929238384Sjkim	movdqa	(%rsp), @XMM[9]		# round 0 key
1930238384Sjkim	lea	0x10(%rsp), %rax	# pass key schedule
1931238384Sjkim	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
1932238384Sjkim	pxor	@XMM[9], @XMM[0]	# xor with round0 key
1933238384Sjkim	pxor	@XMM[9], @XMM[1]
1934290207Sjkim	pxor	@XMM[9], @XMM[2]
1935290207Sjkim	pxor	@XMM[9], @XMM[3]
1936238384Sjkim	 pshufb	@XMM[8], @XMM[0]
1937238384Sjkim	 pshufb	@XMM[8], @XMM[1]
1938290207Sjkim	pxor	@XMM[9], @XMM[4]
1939290207Sjkim	pxor	@XMM[9], @XMM[5]
1940238384Sjkim	 pshufb	@XMM[8], @XMM[2]
1941238384Sjkim	 pshufb	@XMM[8], @XMM[3]
1942290207Sjkim	pxor	@XMM[9], @XMM[6]
1943290207Sjkim	pxor	@XMM[9], @XMM[7]
1944238384Sjkim	 pshufb	@XMM[8], @XMM[4]
1945238384Sjkim	 pshufb	@XMM[8], @XMM[5]
1946238384Sjkim	 pshufb	@XMM[8], @XMM[6]
1947290207Sjkim	 pshufb	@XMM[8], @XMM[7]
1948238384Sjkim	lea	.LBS0(%rip), %r11	# constants table
1949238384Sjkim	mov	%ebx,%r10d		# pass rounds
1950238384Sjkim
1951238384Sjkim	call	_bsaes_encrypt8_bitslice
1952238384Sjkim
1953238384Sjkim	sub	\$8,$len		# borrow if fewer than 8 blocks were left
1954238384Sjkim	jc	.Lctr_enc_loop_done
1955238384Sjkim
1956238384Sjkim	movdqu	0x00($inp), @XMM[8]	# load input
1957238384Sjkim	movdqu	0x10($inp), @XMM[9]
1958238384Sjkim	movdqu	0x20($inp), @XMM[10]
1959238384Sjkim	movdqu	0x30($inp), @XMM[11]
1960238384Sjkim	movdqu	0x40($inp), @XMM[12]
1961238384Sjkim	movdqu	0x50($inp), @XMM[13]
1962238384Sjkim	movdqu	0x60($inp), @XMM[14]
1963238384Sjkim	movdqu	0x70($inp), @XMM[15]
1964238384Sjkim	lea	0x80($inp),$inp
1965238384Sjkim	pxor	@XMM[0], @XMM[8]
1966238384Sjkim	movdqa	0x20(%rbp), @XMM[0]	# load counter
1967238384Sjkim	pxor	@XMM[9], @XMM[1]
1968238384Sjkim	movdqu	@XMM[8], 0x00($out)	# write output
1969238384Sjkim	pxor	@XMM[10], @XMM[4]
1970238384Sjkim	movdqu	@XMM[1], 0x10($out)
1971238384Sjkim	pxor	@XMM[11], @XMM[6]
1972238384Sjkim	movdqu	@XMM[4], 0x20($out)
1973238384Sjkim	pxor	@XMM[12], @XMM[3]
1974238384Sjkim	movdqu	@XMM[6], 0x30($out)
1975238384Sjkim	pxor	@XMM[13], @XMM[7]
1976238384Sjkim	movdqu	@XMM[3], 0x40($out)
1977238384Sjkim	pxor	@XMM[14], @XMM[2]
1978238384Sjkim	movdqu	@XMM[7], 0x50($out)
1979238384Sjkim	pxor	@XMM[15], @XMM[5]
1980238384Sjkim	movdqu	@XMM[2], 0x60($out)
1981238384Sjkim	lea	.LADD1(%rip), %r11
1982238384Sjkim	movdqu	@XMM[5], 0x70($out)
1983238384Sjkim	lea	0x80($out), $out
1984238384Sjkim	paddd	0x70(%r11), @XMM[0]	# .LADD8
1985238384Sjkim	jnz	.Lctr_enc_loop		# flags are still those of the sub above
1986238384Sjkim
1987238384Sjkim	jmp	.Lctr_enc_done
1988238384Sjkim.align	16
1989238384Sjkim.Lctr_enc_loop_done:
1990238384Sjkim	add	\$8, $len		# undo the sub: 1 to 7 blocks remain
1991238384Sjkim	movdqu	0x00($inp), @XMM[8]	# load input
1992238384Sjkim	pxor	@XMM[8], @XMM[0]
1993238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
1994238384Sjkim	cmp	\$2,$len
1995238384Sjkim	jb	.Lctr_enc_done		# exactly 1 block
1996238384Sjkim	movdqu	0x10($inp), @XMM[9]
1997238384Sjkim	pxor	@XMM[9], @XMM[1]
1998238384Sjkim	movdqu	@XMM[1], 0x10($out)
1999238384Sjkim	je	.Lctr_enc_done		# exactly 2 blocks
2000238384Sjkim	movdqu	0x20($inp), @XMM[10]
2001238384Sjkim	pxor	@XMM[10], @XMM[4]
2002238384Sjkim	movdqu	@XMM[4], 0x20($out)
2003238384Sjkim	cmp	\$4,$len
2004238384Sjkim	jb	.Lctr_enc_done		# exactly 3 blocks
2005238384Sjkim	movdqu	0x30($inp), @XMM[11]
2006238384Sjkim	pxor	@XMM[11], @XMM[6]
2007238384Sjkim	movdqu	@XMM[6], 0x30($out)
2008238384Sjkim	je	.Lctr_enc_done		# exactly 4 blocks
2009238384Sjkim	movdqu	0x40($inp), @XMM[12]
2010238384Sjkim	pxor	@XMM[12], @XMM[3]
2011238384Sjkim	movdqu	@XMM[3], 0x40($out)
2012238384Sjkim	cmp	\$6,$len
2013238384Sjkim	jb	.Lctr_enc_done		# exactly 5 blocks
2014238384Sjkim	movdqu	0x50($inp), @XMM[13]
2015238384Sjkim	pxor	@XMM[13], @XMM[7]
2016238384Sjkim	movdqu	@XMM[7], 0x50($out)
2017238384Sjkim	je	.Lctr_enc_done		# exactly 6 blocks
2018238384Sjkim	movdqu	0x60($inp), @XMM[14]
2019238384Sjkim	pxor	@XMM[14], @XMM[2]
2020238384Sjkim	movdqu	@XMM[2], 0x60($out)
2021238384Sjkim	jmp	.Lctr_enc_done
2022238384Sjkim
2023238384Sjkim.align	16
2024238384Sjkim.Lctr_enc_short:
2025238384Sjkim	lea	0x20(%rbp), $arg1	# in: counter block
2026238384Sjkim	lea	0x30(%rbp), $arg2	# out: encrypted counter block
2027238384Sjkim	lea	($key), $arg3
2028238384Sjkim	call	asm_AES_encrypt
2029238384Sjkim	movdqu	($inp), @XMM[1]
2030238384Sjkim	lea	16($inp), $inp
2031238384Sjkim	mov	0x2c(%rbp), %eax	# load 32-bit counter
2032238384Sjkim	bswap	%eax
2033238384Sjkim	pxor	0x30(%rbp), @XMM[1]	# input ^ encrypted counter
2034238384Sjkim	inc	%eax			# increment
2035238384Sjkim	movdqu	@XMM[1], ($out)
2036238384Sjkim	bswap	%eax
2037238384Sjkim	lea	16($out), $out
2038238384Sjkim	mov	%eax, 0x2c(%rbp)	# save 32-bit counter (same rbp-based slot the load uses)
2039238384Sjkim	dec	$len
2040238384Sjkim	jnz	.Lctr_enc_short
2041238384Sjkim
2042238384Sjkim.Lctr_enc_done:
2043238384Sjkim	lea	(%rsp), %rax
2044238384Sjkim	pxor	%xmm0, %xmm0
2045238384Sjkim.Lctr_enc_bzero:			# wipe key schedule [if any]
2046238384Sjkim	movdqa	%xmm0, 0x00(%rax)
2047238384Sjkim	movdqa	%xmm0, 0x10(%rax)
2048238384Sjkim	lea	0x20(%rax), %rax
2049238384Sjkim	cmp	%rax, %rbp
2050238384Sjkim	ja	.Lctr_enc_bzero		# 32 bytes per pass until rax reaches rbp
2051238384Sjkim
2052238384Sjkim	lea	(%rbp),%rsp		# restore %rsp
2053238384Sjkim___
2054238384Sjkim$code.=<<___ if ($win64);
2055238384Sjkim	movaps	0x40(%rbp), %xmm6
2056238384Sjkim	movaps	0x50(%rbp), %xmm7
2057238384Sjkim	movaps	0x60(%rbp), %xmm8
2058238384Sjkim	movaps	0x70(%rbp), %xmm9
2059238384Sjkim	movaps	0x80(%rbp), %xmm10
2060238384Sjkim	movaps	0x90(%rbp), %xmm11
2061238384Sjkim	movaps	0xa0(%rbp), %xmm12
2062238384Sjkim	movaps	0xb0(%rbp), %xmm13
2063238384Sjkim	movaps	0xc0(%rbp), %xmm14
2064238384Sjkim	movaps	0xd0(%rbp), %xmm15
2065238384Sjkim	lea	0xa0(%rbp), %rsp
2066238384Sjkim___
2067238384Sjkim$code.=<<___;
2068238384Sjkim	mov	0x48(%rsp), %r15	# reload the registers pushed in the prologue
2069238384Sjkim	mov	0x50(%rsp), %r14
2070238384Sjkim	mov	0x58(%rsp), %r13
2071238384Sjkim	mov	0x60(%rsp), %r12
2072238384Sjkim	mov	0x68(%rsp), %rbx
2073238384Sjkim	mov	0x70(%rsp), %rax	# saved rbp
2074238384Sjkim	lea	0x78(%rsp), %rsp	# drop scratch and pushes; return address on top
2075238384Sjkim	mov	%rax, %rbp
2076238384Sjkim.Lctr_enc_epilogue:
2077238384Sjkim	ret
2078238384Sjkim.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2079238384Sjkim___
2080238384Sjkim######################################################################
2081238384Sjkim# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2082238384Sjkim#	const AES_KEY *key1, const AES_KEY *key2,
2083238384Sjkim#	const unsigned char iv[16]);
2084238384Sjkim#
2085238384Sjkimmy ($twmask,$twres,$twtmp)=@XMM[13..15];	# tweak-update helpers (mask, residue, temp) use the last three xmm slots
2086261037Sjkim$arg6=~s/d$//;	# drop a trailing "d" from the 6th-argument register name; it is dereferenced as the iv pointer below (presumably yields the 64-bit name - confirm where arg6 is defined)
2087261037Sjkim
2088238384Sjkim$code.=<<___;
# XTS encryption.  The initial tweak is produced by encrypting the 16 bytes
# at the 6th argument with the key given as 5th argument (asm_AES_encrypt).
# The data key (4th argument) is converted to a bit-sliced schedule on the
# stack.  Data is processed 0x80 bytes (8 blocks) per loop iteration, then
# a tail of 1 to 7 whole blocks, and finally ciphertext stealing when the
# byte length is not a multiple of 16.  The stack area between rsp and rbp
# (tweak array and key schedule) is zeroed before returning.
2089238384Sjkim.globl	bsaes_xts_encrypt
2090238384Sjkim.type	bsaes_xts_encrypt,\@abi-omnipotent
2091238384Sjkim.align	16
2092238384Sjkimbsaes_xts_encrypt:
2093238384Sjkim	mov	%rsp, %rax
2094238384Sjkim.Lxts_enc_prologue:
2095238384Sjkim	push	%rbp
2096238384Sjkim	push	%rbx
2097238384Sjkim	push	%r12
2098238384Sjkim	push	%r13
2099238384Sjkim	push	%r14
2100238384Sjkim	push	%r15
2101238384Sjkim	lea	-0x48(%rsp), %rsp	# 0x48 bytes of local scratch
2102238384Sjkim___
2103238384Sjkim$code.=<<___ if ($win64);
2104238384Sjkim	mov	0xa0(%rsp),$arg5	# pull key2
2105238384Sjkim	mov	0xa8(%rsp),$arg6	# pull ivp
2106238384Sjkim	lea	-0xa0(%rsp), %rsp	# room for xmm6-xmm15
2107238384Sjkim	movaps	%xmm6, 0x40(%rsp)
2108238384Sjkim	movaps	%xmm7, 0x50(%rsp)
2109238384Sjkim	movaps	%xmm8, 0x60(%rsp)
2110238384Sjkim	movaps	%xmm9, 0x70(%rsp)
2111238384Sjkim	movaps	%xmm10, 0x80(%rsp)
2112238384Sjkim	movaps	%xmm11, 0x90(%rsp)
2113238384Sjkim	movaps	%xmm12, 0xa0(%rsp)
2114238384Sjkim	movaps	%xmm13, 0xb0(%rsp)
2115238384Sjkim	movaps	%xmm14, 0xc0(%rsp)
2116238384Sjkim	movaps	%xmm15, 0xd0(%rsp)
2117238384Sjkim.Lxts_enc_body:
2118238384Sjkim___
2119238384Sjkim$code.=<<___;
2120238384Sjkim	mov	%rsp, %rbp		# backup %rsp
2121238384Sjkim	mov	$arg1, $inp		# backup arguments
2122238384Sjkim	mov	$arg2, $out
2123238384Sjkim	mov	$arg3, $len
2124238384Sjkim	mov	$arg4, $key
2125238384Sjkim
2126238384Sjkim	lea	($arg6), $arg1		# in: iv
2127238384Sjkim	lea	0x20(%rbp), $arg2	# out: tweak buffer in scratch
2128238384Sjkim	lea	($arg5), $arg3		# key2
2129238384Sjkim	call	asm_AES_encrypt		# generate initial tweak
2130238384Sjkim
2131238384Sjkim	mov	240($key), %eax		# rounds
2132238384Sjkim	mov	$len, %rbx		# backup $len
2133238384Sjkim
2134238384Sjkim	mov	%eax, %edx		# rounds
2135238384Sjkim	shl	\$7, %rax		# 128 bytes per inner round key
2136238384Sjkim	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2137238384Sjkim	sub	%rax, %rsp
2138238384Sjkim
2139238384Sjkim	mov	%rsp, %rax		# pass key schedule
2140238384Sjkim	mov	$key, %rcx		# pass key
2141238384Sjkim	mov	%edx, %r10d		# pass rounds
2142238384Sjkim	call	_bsaes_key_convert
2143238384Sjkim	pxor	%xmm6, %xmm7		# fix up last round key
2144238384Sjkim	movdqa	%xmm7, (%rax)		# save last round key
2145238384Sjkim
2146238384Sjkim	and	\$-16, $len		# whole blocks only; leftover bytes go through stealing
2147238384Sjkim	sub	\$0x80, %rsp		# place for tweak[8]
2148238384Sjkim	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2149238384Sjkim
2150238384Sjkim	pxor	$twtmp, $twtmp
2151238384Sjkim	movdqa	.Lxts_magic(%rip), $twmask
2152238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2153238384Sjkim
2154238384Sjkim	sub	\$0x80, $len		# borrow if fewer than 8 whole blocks
2155238384Sjkim	jc	.Lxts_enc_short
2156238384Sjkim	jmp	.Lxts_enc_loop
2157238384Sjkim
2158238384Sjkim.align	16
2159238384Sjkim.Lxts_enc_loop:
2160238384Sjkim___
2161238384Sjkim    for ($i=0;$i<7;$i++) {	# unrolled at generation time: tweak[0..6] setup interleaved with input loads
2162238384Sjkim    $code.=<<___;
2163238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2164238384Sjkim	pxor	$twtmp, $twtmp
2165238384Sjkim	movdqa	@XMM[7], @XMM[$i]
2166238384Sjkim	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2167238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2168238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2169238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2170238384Sjkim	pxor	$twres, @XMM[7]
2171238384Sjkim___
2172238384Sjkim    $code.=<<___ if ($i>=1);
2173238384Sjkim	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2174238384Sjkim___
2175238384Sjkim    $code.=<<___ if ($i>=2);
2176238384Sjkim	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2177238384Sjkim___
2178238384Sjkim    }
2179238384Sjkim$code.=<<___;
2180238384Sjkim	movdqu	0x60($inp), @XMM[8+6]
2181238384Sjkim	pxor	@XMM[8+5], @XMM[5]
2182238384Sjkim	movdqu	0x70($inp), @XMM[8+7]
2183238384Sjkim	lea	0x80($inp), $inp
2184238384Sjkim	movdqa	@XMM[7], 0x70(%rsp)	# save tweak[7]
2185238384Sjkim	pxor	@XMM[8+6], @XMM[6]
2186238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2187238384Sjkim	pxor	@XMM[8+7], @XMM[7]
2188238384Sjkim	mov	%edx, %r10d		# pass rounds
2189238384Sjkim
2190238384Sjkim	call	_bsaes_encrypt8
2191238384Sjkim
2192238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2193238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2194238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2195238384Sjkim	pxor	0x20(%rsp), @XMM[4]
2196238384Sjkim	movdqu	@XMM[1], 0x10($out)
2197238384Sjkim	pxor	0x30(%rsp), @XMM[6]
2198238384Sjkim	movdqu	@XMM[4], 0x20($out)
2199238384Sjkim	pxor	0x40(%rsp), @XMM[3]
2200238384Sjkim	movdqu	@XMM[6], 0x30($out)
2201238384Sjkim	pxor	0x50(%rsp), @XMM[7]
2202238384Sjkim	movdqu	@XMM[3], 0x40($out)
2203238384Sjkim	pxor	0x60(%rsp), @XMM[2]
2204238384Sjkim	movdqu	@XMM[7], 0x50($out)
2205238384Sjkim	pxor	0x70(%rsp), @XMM[5]
2206238384Sjkim	movdqu	@XMM[2], 0x60($out)
2207238384Sjkim	movdqu	@XMM[5], 0x70($out)
2208238384Sjkim	lea	0x80($out), $out
2209238384Sjkim
2210238384Sjkim	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2211238384Sjkim	pxor	$twtmp, $twtmp
2212238384Sjkim	movdqa	.Lxts_magic(%rip), $twmask
2213238384Sjkim	pcmpgtd	@XMM[7], $twtmp
2214238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2215238384Sjkim	pxor	$twtmp, $twtmp
2216238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2217238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2218238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2219238384Sjkim	pxor	$twres, @XMM[7]
2220238384Sjkim
2221238384Sjkim	sub	\$0x80,$len
2222238384Sjkim	jnc	.Lxts_enc_loop		# another full 8-block batch is available
2223238384Sjkim
2224238384Sjkim.Lxts_enc_short:
2225238384Sjkim	add	\$0x80, $len		# undo the sub: bytes left in whole blocks (0 to 0x70)
2226238384Sjkim	jz	.Lxts_enc_done
2227238384Sjkim___
2228238384Sjkim    for ($i=0;$i<7;$i++) {	# same unrolled setup as the main loop, plus an early exit per block count
2229238384Sjkim    $code.=<<___;
2230238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2231238384Sjkim	pxor	$twtmp, $twtmp
2232238384Sjkim	movdqa	@XMM[7], @XMM[$i]
2233238384Sjkim	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2234238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2235238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2236238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2237238384Sjkim	pxor	$twres, @XMM[7]
2238238384Sjkim___
2239238384Sjkim    $code.=<<___ if ($i>=1);
2240238384Sjkim	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2241238384Sjkim	cmp	\$`0x10*$i`,$len
2242238384Sjkim	je	.Lxts_enc_$i
2243238384Sjkim___
2244238384Sjkim    $code.=<<___ if ($i>=2);
2245238384Sjkim	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2246238384Sjkim___
2247238384Sjkim    }
2248238384Sjkim$code.=<<___;
2249238384Sjkim	movdqu	0x60($inp), @XMM[8+6]
2250238384Sjkim	pxor	@XMM[8+5], @XMM[5]
2251238384Sjkim	movdqa	@XMM[7], 0x70(%rsp)	# tweak following the 7 blocks handled here
2252238384Sjkim	lea	0x70($inp), $inp
2253238384Sjkim	pxor	@XMM[8+6], @XMM[6]
2254238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2255238384Sjkim	mov	%edx, %r10d		# pass rounds
2256238384Sjkim
2257238384Sjkim	call	_bsaes_encrypt8
2258238384Sjkim
2259238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2260238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2261238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2262238384Sjkim	pxor	0x20(%rsp), @XMM[4]
2263238384Sjkim	movdqu	@XMM[1], 0x10($out)
2264238384Sjkim	pxor	0x30(%rsp), @XMM[6]
2265238384Sjkim	movdqu	@XMM[4], 0x20($out)
2266238384Sjkim	pxor	0x40(%rsp), @XMM[3]
2267238384Sjkim	movdqu	@XMM[6], 0x30($out)
2268238384Sjkim	pxor	0x50(%rsp), @XMM[7]
2269238384Sjkim	movdqu	@XMM[3], 0x40($out)
2270238384Sjkim	pxor	0x60(%rsp), @XMM[2]
2271238384Sjkim	movdqu	@XMM[7], 0x50($out)
2272238384Sjkim	movdqu	@XMM[2], 0x60($out)
2273238384Sjkim	lea	0x70($out), $out
2274238384Sjkim
2275238384Sjkim	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2276238384Sjkim	jmp	.Lxts_enc_done
2277238384Sjkim.align	16
2278238384Sjkim.Lxts_enc_6:				# 6 whole blocks left
2279238384Sjkim	pxor	@XMM[8+4], @XMM[4]	# finish the two input ^ tweak xors the early exit skipped
2280238384Sjkim	lea	0x60($inp), $inp
2281238384Sjkim	pxor	@XMM[8+5], @XMM[5]
2282238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2283238384Sjkim	mov	%edx, %r10d		# pass rounds
2284238384Sjkim
2285238384Sjkim	call	_bsaes_encrypt8
2286238384Sjkim
2287238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2288238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2289238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2290238384Sjkim	pxor	0x20(%rsp), @XMM[4]
2291238384Sjkim	movdqu	@XMM[1], 0x10($out)
2292238384Sjkim	pxor	0x30(%rsp), @XMM[6]
2293238384Sjkim	movdqu	@XMM[4], 0x20($out)
2294238384Sjkim	pxor	0x40(%rsp), @XMM[3]
2295238384Sjkim	movdqu	@XMM[6], 0x30($out)
2296238384Sjkim	pxor	0x50(%rsp), @XMM[7]
2297238384Sjkim	movdqu	@XMM[3], 0x40($out)
2298238384Sjkim	movdqu	@XMM[7], 0x50($out)
2299238384Sjkim	lea	0x60($out), $out
2300238384Sjkim
2301238384Sjkim	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2302238384Sjkim	jmp	.Lxts_enc_done
2303238384Sjkim.align	16
2304238384Sjkim.Lxts_enc_5:				# 5 whole blocks left
2305238384Sjkim	pxor	@XMM[8+3], @XMM[3]	# finish the two pending input ^ tweak xors
2306238384Sjkim	lea	0x50($inp), $inp
2307238384Sjkim	pxor	@XMM[8+4], @XMM[4]
2308238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2309238384Sjkim	mov	%edx, %r10d		# pass rounds
2310238384Sjkim
2311238384Sjkim	call	_bsaes_encrypt8
2312238384Sjkim
2313238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2314238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2315238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2316238384Sjkim	pxor	0x20(%rsp), @XMM[4]
2317238384Sjkim	movdqu	@XMM[1], 0x10($out)
2318238384Sjkim	pxor	0x30(%rsp), @XMM[6]
2319238384Sjkim	movdqu	@XMM[4], 0x20($out)
2320238384Sjkim	pxor	0x40(%rsp), @XMM[3]
2321238384Sjkim	movdqu	@XMM[6], 0x30($out)
2322238384Sjkim	movdqu	@XMM[3], 0x40($out)
2323238384Sjkim	lea	0x50($out), $out
2324238384Sjkim
2325238384Sjkim	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2326238384Sjkim	jmp	.Lxts_enc_done
2327238384Sjkim.align	16
2328238384Sjkim.Lxts_enc_4:				# 4 whole blocks left
2329238384Sjkim	pxor	@XMM[8+2], @XMM[2]	# finish the two pending input ^ tweak xors
2330238384Sjkim	lea	0x40($inp), $inp
2331238384Sjkim	pxor	@XMM[8+3], @XMM[3]
2332238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2333238384Sjkim	mov	%edx, %r10d		# pass rounds
2334238384Sjkim
2335238384Sjkim	call	_bsaes_encrypt8
2336238384Sjkim
2337238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2338238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2339238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2340238384Sjkim	pxor	0x20(%rsp), @XMM[4]
2341238384Sjkim	movdqu	@XMM[1], 0x10($out)
2342238384Sjkim	pxor	0x30(%rsp), @XMM[6]
2343238384Sjkim	movdqu	@XMM[4], 0x20($out)
2344238384Sjkim	movdqu	@XMM[6], 0x30($out)
2345238384Sjkim	lea	0x40($out), $out
2346238384Sjkim
2347238384Sjkim	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2348238384Sjkim	jmp	.Lxts_enc_done
2349238384Sjkim.align	16
2350238384Sjkim.Lxts_enc_3:				# 3 whole blocks left
2351238384Sjkim	pxor	@XMM[8+1], @XMM[1]	# finish the two pending input ^ tweak xors
2352238384Sjkim	lea	0x30($inp), $inp
2353238384Sjkim	pxor	@XMM[8+2], @XMM[2]
2354238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2355238384Sjkim	mov	%edx, %r10d		# pass rounds
2356238384Sjkim
2357238384Sjkim	call	_bsaes_encrypt8
2358238384Sjkim
2359238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2360238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2361238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2362238384Sjkim	pxor	0x20(%rsp), @XMM[4]
2363238384Sjkim	movdqu	@XMM[1], 0x10($out)
2364238384Sjkim	movdqu	@XMM[4], 0x20($out)
2365238384Sjkim	lea	0x30($out), $out
2366238384Sjkim
2367238384Sjkim	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2368238384Sjkim	jmp	.Lxts_enc_done
2369238384Sjkim.align	16
2370238384Sjkim.Lxts_enc_2:				# 2 whole blocks left
2371238384Sjkim	pxor	@XMM[8+0], @XMM[0]	# finish the two pending input ^ tweak xors
2372238384Sjkim	lea	0x20($inp), $inp
2373238384Sjkim	pxor	@XMM[8+1], @XMM[1]
2374238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2375238384Sjkim	mov	%edx, %r10d		# pass rounds
2376238384Sjkim
2377238384Sjkim	call	_bsaes_encrypt8
2378238384Sjkim
2379238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2380238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2381238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2382238384Sjkim	movdqu	@XMM[1], 0x10($out)
2383238384Sjkim	lea	0x20($out), $out
2384238384Sjkim
2385238384Sjkim	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2386238384Sjkim	jmp	.Lxts_enc_done
2387238384Sjkim.align	16
2388238384Sjkim.Lxts_enc_1:				# 1 whole block left: use the plain single-block routine
2389238384Sjkim	pxor	@XMM[0], @XMM[8]	# input ^ tweak[0]
2390238384Sjkim	lea	0x10($inp), $inp
2391238384Sjkim	movdqa	@XMM[8], 0x20(%rbp)	# stage the block in scratch, encrypted in place
2392238384Sjkim	lea	0x20(%rbp), $arg1
2393238384Sjkim	lea	0x20(%rbp), $arg2
2394238384Sjkim	lea	($key), $arg3
2395238384Sjkim	call	asm_AES_encrypt		# doesn't touch %xmm
2396238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2397238384Sjkim	#pxor	@XMM[8], @XMM[0]
2398238384Sjkim	#lea	0x80(%rsp), %rax	# pass key schedule
2399238384Sjkim	#mov	%edx, %r10d		# pass rounds
2400238384Sjkim	#call	_bsaes_encrypt8
2401238384Sjkim	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2402238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2403238384Sjkim	lea	0x10($out), $out
2404238384Sjkim
2405238384Sjkim	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2406238384Sjkim
2407238384Sjkim.Lxts_enc_done:
2408238384Sjkim	and	\$15, %ebx		# leftover bytes beyond the last whole block
2409238384Sjkim	jz	.Lxts_enc_ret		# none: no stealing needed
2410238384Sjkim	mov	$out, %rdx		# moving pointer; the out register itself stays put
2411238384Sjkim
2412238384Sjkim.Lxts_enc_steal:
2413238384Sjkim	movzb	($inp), %eax		# next leftover input byte
2414238384Sjkim	movzb	-16(%rdx), %ecx		# matching byte of the last full output block
2415238384Sjkim	lea	1($inp), $inp
2416238384Sjkim	mov	%al, -16(%rdx)		# input byte replaces it ...
2417238384Sjkim	mov	%cl, 0(%rdx)		# ... and the old byte becomes part of the short final block
2418238384Sjkim	lea	1(%rdx), %rdx
2419238384Sjkim	sub	\$1,%ebx
2420238384Sjkim	jnz	.Lxts_enc_steal
2421238384Sjkim
2422238384Sjkim	movdqu	-16($out), @XMM[0]	# patched block is encrypted once more
2423238384Sjkim	lea	0x20(%rbp), $arg1
2424238384Sjkim	pxor	@XMM[7], @XMM[0]	# ^ tweak before encryption
2425238384Sjkim	lea	0x20(%rbp), $arg2
2426238384Sjkim	movdqa	@XMM[0], 0x20(%rbp)
2427238384Sjkim	lea	($key), $arg3
2428238384Sjkim	call	asm_AES_encrypt		# doesn't touch %xmm
2429238384Sjkim	pxor	0x20(%rbp), @XMM[7]	# ^ tweak after encryption
2430238384Sjkim	movdqu	@XMM[7], -16($out)
2431238384Sjkim
2432238384Sjkim.Lxts_enc_ret:
2433238384Sjkim	lea	(%rsp), %rax
2434238384Sjkim	pxor	%xmm0, %xmm0
2435238384Sjkim.Lxts_enc_bzero:			# wipe key schedule [if any]
2436238384Sjkim	movdqa	%xmm0, 0x00(%rax)
2437238384Sjkim	movdqa	%xmm0, 0x10(%rax)
2438238384Sjkim	lea	0x20(%rax), %rax
2439238384Sjkim	cmp	%rax, %rbp
2440238384Sjkim	ja	.Lxts_enc_bzero		# 32 bytes per pass until rax reaches rbp
2441238384Sjkim
2442238384Sjkim	lea	(%rbp),%rsp		# restore %rsp
2443238384Sjkim___
2444238384Sjkim$code.=<<___ if ($win64);
2445238384Sjkim	movaps	0x40(%rbp), %xmm6
2446238384Sjkim	movaps	0x50(%rbp), %xmm7
2447238384Sjkim	movaps	0x60(%rbp), %xmm8
2448238384Sjkim	movaps	0x70(%rbp), %xmm9
2449238384Sjkim	movaps	0x80(%rbp), %xmm10
2450238384Sjkim	movaps	0x90(%rbp), %xmm11
2451238384Sjkim	movaps	0xa0(%rbp), %xmm12
2452238384Sjkim	movaps	0xb0(%rbp), %xmm13
2453238384Sjkim	movaps	0xc0(%rbp), %xmm14
2454238384Sjkim	movaps	0xd0(%rbp), %xmm15
2455238384Sjkim	lea	0xa0(%rbp), %rsp
2456238384Sjkim___
2457238384Sjkim$code.=<<___;
2458238384Sjkim	mov	0x48(%rsp), %r15	# reload the registers pushed in the prologue
2459238384Sjkim	mov	0x50(%rsp), %r14
2460238384Sjkim	mov	0x58(%rsp), %r13
2461238384Sjkim	mov	0x60(%rsp), %r12
2462238384Sjkim	mov	0x68(%rsp), %rbx
2463238384Sjkim	mov	0x70(%rsp), %rax	# saved rbp
2464238384Sjkim	lea	0x78(%rsp), %rsp	# drop scratch and pushes; return address on top
2465238384Sjkim	mov	%rax, %rbp
2466238384Sjkim.Lxts_enc_epilogue:
2467238384Sjkim	ret
2468238384Sjkim.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
2469238384Sjkim
2470238384Sjkim.globl	bsaes_xts_decrypt
2471238384Sjkim.type	bsaes_xts_decrypt,\@abi-omnipotent
2472238384Sjkim.align	16
2473238384Sjkimbsaes_xts_decrypt:
2474238384Sjkim	mov	%rsp, %rax
2475238384Sjkim.Lxts_dec_prologue:
2476238384Sjkim	push	%rbp
2477238384Sjkim	push	%rbx
2478238384Sjkim	push	%r12
2479238384Sjkim	push	%r13
2480238384Sjkim	push	%r14
2481238384Sjkim	push	%r15
2482238384Sjkim	lea	-0x48(%rsp), %rsp
2483238384Sjkim___
2484238384Sjkim$code.=<<___ if ($win64);
2485238384Sjkim	mov	0xa0(%rsp),$arg5	# pull key2
2486238384Sjkim	mov	0xa8(%rsp),$arg6	# pull ivp
2487238384Sjkim	lea	-0xa0(%rsp), %rsp
2488238384Sjkim	movaps	%xmm6, 0x40(%rsp)
2489238384Sjkim	movaps	%xmm7, 0x50(%rsp)
2490238384Sjkim	movaps	%xmm8, 0x60(%rsp)
2491238384Sjkim	movaps	%xmm9, 0x70(%rsp)
2492238384Sjkim	movaps	%xmm10, 0x80(%rsp)
2493238384Sjkim	movaps	%xmm11, 0x90(%rsp)
2494238384Sjkim	movaps	%xmm12, 0xa0(%rsp)
2495238384Sjkim	movaps	%xmm13, 0xb0(%rsp)
2496238384Sjkim	movaps	%xmm14, 0xc0(%rsp)
2497238384Sjkim	movaps	%xmm15, 0xd0(%rsp)
2498238384Sjkim.Lxts_dec_body:
2499238384Sjkim___
2500238384Sjkim$code.=<<___;
2501238384Sjkim	mov	%rsp, %rbp		# backup %rsp
2502238384Sjkim	mov	$arg1, $inp		# backup arguments
2503238384Sjkim	mov	$arg2, $out
2504238384Sjkim	mov	$arg3, $len
2505238384Sjkim	mov	$arg4, $key
2506238384Sjkim
2507238384Sjkim	lea	($arg6), $arg1
2508238384Sjkim	lea	0x20(%rbp), $arg2
2509238384Sjkim	lea	($arg5), $arg3
2510238384Sjkim	call	asm_AES_encrypt		# generate initial tweak
2511238384Sjkim
2512238384Sjkim	mov	240($key), %eax		# rounds
2513238384Sjkim	mov	$len, %rbx		# backup $len
2514238384Sjkim
2515238384Sjkim	mov	%eax, %edx		# rounds
2516238384Sjkim	shl	\$7, %rax		# 128 bytes per inner round key
2517238384Sjkim	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2518238384Sjkim	sub	%rax, %rsp
2519238384Sjkim
2520238384Sjkim	mov	%rsp, %rax		# pass key schedule
2521238384Sjkim	mov	$key, %rcx		# pass key
2522238384Sjkim	mov	%edx, %r10d		# pass rounds
2523238384Sjkim	call	_bsaes_key_convert
2524238384Sjkim	pxor	(%rsp), %xmm7		# fix up round 0 key
2525238384Sjkim	movdqa	%xmm6, (%rax)		# save last round key
2526238384Sjkim	movdqa	%xmm7, (%rsp)
2527238384Sjkim
2528238384Sjkim	xor	%eax, %eax		# if ($len%16) len-=16;
2529238384Sjkim	and	\$-16, $len
2530238384Sjkim	test	\$15, %ebx
2531238384Sjkim	setnz	%al
2532238384Sjkim	shl	\$4, %rax
2533238384Sjkim	sub	%rax, $len
2534238384Sjkim
2535238384Sjkim	sub	\$0x80, %rsp		# place for tweak[8]
2536238384Sjkim	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2537238384Sjkim
2538238384Sjkim	pxor	$twtmp, $twtmp
2539238384Sjkim	movdqa	.Lxts_magic(%rip), $twmask
2540238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2541238384Sjkim
2542238384Sjkim	sub	\$0x80, $len
2543238384Sjkim	jc	.Lxts_dec_short
2544238384Sjkim	jmp	.Lxts_dec_loop
2545238384Sjkim
2546238384Sjkim.align	16
2547238384Sjkim.Lxts_dec_loop:
2548238384Sjkim___
2549238384Sjkim    for ($i=0;$i<7;$i++) {
2550238384Sjkim    $code.=<<___;
2551238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2552238384Sjkim	pxor	$twtmp, $twtmp
2553238384Sjkim	movdqa	@XMM[7], @XMM[$i]
2554238384Sjkim	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2555238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2556238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2557238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2558238384Sjkim	pxor	$twres, @XMM[7]
2559238384Sjkim___
2560238384Sjkim    $code.=<<___ if ($i>=1);
2561238384Sjkim	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2562238384Sjkim___
2563238384Sjkim    $code.=<<___ if ($i>=2);
2564238384Sjkim	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2565238384Sjkim___
2566238384Sjkim    }
2567238384Sjkim$code.=<<___;
2568238384Sjkim	movdqu	0x60($inp), @XMM[8+6]
2569238384Sjkim	pxor	@XMM[8+5], @XMM[5]
2570238384Sjkim	movdqu	0x70($inp), @XMM[8+7]
2571238384Sjkim	lea	0x80($inp), $inp
2572238384Sjkim	movdqa	@XMM[7], 0x70(%rsp)
2573238384Sjkim	pxor	@XMM[8+6], @XMM[6]
2574238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2575238384Sjkim	pxor	@XMM[8+7], @XMM[7]
2576238384Sjkim	mov	%edx, %r10d		# pass rounds
2577238384Sjkim
2578238384Sjkim	call	_bsaes_decrypt8
2579238384Sjkim
2580238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2581238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2582238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2583238384Sjkim	pxor	0x20(%rsp), @XMM[6]
2584238384Sjkim	movdqu	@XMM[1], 0x10($out)
2585238384Sjkim	pxor	0x30(%rsp), @XMM[4]
2586238384Sjkim	movdqu	@XMM[6], 0x20($out)
2587238384Sjkim	pxor	0x40(%rsp), @XMM[2]
2588238384Sjkim	movdqu	@XMM[4], 0x30($out)
2589238384Sjkim	pxor	0x50(%rsp), @XMM[7]
2590238384Sjkim	movdqu	@XMM[2], 0x40($out)
2591238384Sjkim	pxor	0x60(%rsp), @XMM[3]
2592238384Sjkim	movdqu	@XMM[7], 0x50($out)
2593238384Sjkim	pxor	0x70(%rsp), @XMM[5]
2594238384Sjkim	movdqu	@XMM[3], 0x60($out)
2595238384Sjkim	movdqu	@XMM[5], 0x70($out)
2596238384Sjkim	lea	0x80($out), $out
2597238384Sjkim
2598238384Sjkim	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2599238384Sjkim	pxor	$twtmp, $twtmp
2600238384Sjkim	movdqa	.Lxts_magic(%rip), $twmask
2601238384Sjkim	pcmpgtd	@XMM[7], $twtmp
2602238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2603238384Sjkim	pxor	$twtmp, $twtmp
2604238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2605238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2606238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2607238384Sjkim	pxor	$twres, @XMM[7]
2608238384Sjkim
2609238384Sjkim	sub	\$0x80,$len
2610238384Sjkim	jnc	.Lxts_dec_loop
2611238384Sjkim
2612238384Sjkim.Lxts_dec_short:
2613238384Sjkim	add	\$0x80, $len
2614238384Sjkim	jz	.Lxts_dec_done
2615238384Sjkim___
2616238384Sjkim    for ($i=0;$i<7;$i++) {
2617238384Sjkim    $code.=<<___;
2618238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2619238384Sjkim	pxor	$twtmp, $twtmp
2620238384Sjkim	movdqa	@XMM[7], @XMM[$i]
2621238384Sjkim	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2622238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2623238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2624238384Sjkim	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2625238384Sjkim	pxor	$twres, @XMM[7]
2626238384Sjkim___
2627238384Sjkim    $code.=<<___ if ($i>=1);
2628238384Sjkim	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2629238384Sjkim	cmp	\$`0x10*$i`,$len
2630238384Sjkim	je	.Lxts_dec_$i
2631238384Sjkim___
2632238384Sjkim    $code.=<<___ if ($i>=2);
2633238384Sjkim	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2634238384Sjkim___
2635238384Sjkim    }
2636238384Sjkim$code.=<<___;
2637238384Sjkim	movdqu	0x60($inp), @XMM[8+6]
2638238384Sjkim	pxor	@XMM[8+5], @XMM[5]
2639238384Sjkim	movdqa	@XMM[7], 0x70(%rsp)
2640238384Sjkim	lea	0x70($inp), $inp
2641238384Sjkim	pxor	@XMM[8+6], @XMM[6]
2642238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2643238384Sjkim	mov	%edx, %r10d		# pass rounds
2644238384Sjkim
2645238384Sjkim	call	_bsaes_decrypt8
2646238384Sjkim
2647238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2648238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2649238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2650238384Sjkim	pxor	0x20(%rsp), @XMM[6]
2651238384Sjkim	movdqu	@XMM[1], 0x10($out)
2652238384Sjkim	pxor	0x30(%rsp), @XMM[4]
2653238384Sjkim	movdqu	@XMM[6], 0x20($out)
2654238384Sjkim	pxor	0x40(%rsp), @XMM[2]
2655238384Sjkim	movdqu	@XMM[4], 0x30($out)
2656238384Sjkim	pxor	0x50(%rsp), @XMM[7]
2657238384Sjkim	movdqu	@XMM[2], 0x40($out)
2658238384Sjkim	pxor	0x60(%rsp), @XMM[3]
2659238384Sjkim	movdqu	@XMM[7], 0x50($out)
2660238384Sjkim	movdqu	@XMM[3], 0x60($out)
2661238384Sjkim	lea	0x70($out), $out
2662238384Sjkim
2663238384Sjkim	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2664238384Sjkim	jmp	.Lxts_dec_done
2665238384Sjkim.align	16
2666238384Sjkim.Lxts_dec_6:
2667238384Sjkim	pxor	@XMM[8+4], @XMM[4]
2668238384Sjkim	lea	0x60($inp), $inp
2669238384Sjkim	pxor	@XMM[8+5], @XMM[5]
2670238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2671238384Sjkim	mov	%edx, %r10d		# pass rounds
2672238384Sjkim
2673238384Sjkim	call	_bsaes_decrypt8
2674238384Sjkim
2675238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2676238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2677238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2678238384Sjkim	pxor	0x20(%rsp), @XMM[6]
2679238384Sjkim	movdqu	@XMM[1], 0x10($out)
2680238384Sjkim	pxor	0x30(%rsp), @XMM[4]
2681238384Sjkim	movdqu	@XMM[6], 0x20($out)
2682238384Sjkim	pxor	0x40(%rsp), @XMM[2]
2683238384Sjkim	movdqu	@XMM[4], 0x30($out)
2684238384Sjkim	pxor	0x50(%rsp), @XMM[7]
2685238384Sjkim	movdqu	@XMM[2], 0x40($out)
2686238384Sjkim	movdqu	@XMM[7], 0x50($out)
2687238384Sjkim	lea	0x60($out), $out
2688238384Sjkim
2689238384Sjkim	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2690238384Sjkim	jmp	.Lxts_dec_done
2691238384Sjkim.align	16
2692238384Sjkim.Lxts_dec_5:
2693238384Sjkim	pxor	@XMM[8+3], @XMM[3]
2694238384Sjkim	lea	0x50($inp), $inp
2695238384Sjkim	pxor	@XMM[8+4], @XMM[4]
2696238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2697238384Sjkim	mov	%edx, %r10d		# pass rounds
2698238384Sjkim
2699238384Sjkim	call	_bsaes_decrypt8
2700238384Sjkim
2701238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2702238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2703238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2704238384Sjkim	pxor	0x20(%rsp), @XMM[6]
2705238384Sjkim	movdqu	@XMM[1], 0x10($out)
2706238384Sjkim	pxor	0x30(%rsp), @XMM[4]
2707238384Sjkim	movdqu	@XMM[6], 0x20($out)
2708238384Sjkim	pxor	0x40(%rsp), @XMM[2]
2709238384Sjkim	movdqu	@XMM[4], 0x30($out)
2710238384Sjkim	movdqu	@XMM[2], 0x40($out)
2711238384Sjkim	lea	0x50($out), $out
2712238384Sjkim
2713238384Sjkim	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2714238384Sjkim	jmp	.Lxts_dec_done
2715238384Sjkim.align	16
2716238384Sjkim.Lxts_dec_4:
2717238384Sjkim	pxor	@XMM[8+2], @XMM[2]
2718238384Sjkim	lea	0x40($inp), $inp
2719238384Sjkim	pxor	@XMM[8+3], @XMM[3]
2720238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2721238384Sjkim	mov	%edx, %r10d		# pass rounds
2722238384Sjkim
2723238384Sjkim	call	_bsaes_decrypt8
2724238384Sjkim
2725238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2726238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2727238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2728238384Sjkim	pxor	0x20(%rsp), @XMM[6]
2729238384Sjkim	movdqu	@XMM[1], 0x10($out)
2730238384Sjkim	pxor	0x30(%rsp), @XMM[4]
2731238384Sjkim	movdqu	@XMM[6], 0x20($out)
2732238384Sjkim	movdqu	@XMM[4], 0x30($out)
2733238384Sjkim	lea	0x40($out), $out
2734238384Sjkim
2735238384Sjkim	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2736238384Sjkim	jmp	.Lxts_dec_done
2737238384Sjkim.align	16
2738238384Sjkim.Lxts_dec_3:
2739238384Sjkim	pxor	@XMM[8+1], @XMM[1]
2740238384Sjkim	lea	0x30($inp), $inp
2741238384Sjkim	pxor	@XMM[8+2], @XMM[2]
2742238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2743238384Sjkim	mov	%edx, %r10d		# pass rounds
2744238384Sjkim
2745238384Sjkim	call	_bsaes_decrypt8
2746238384Sjkim
2747238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2748238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2749238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2750238384Sjkim	pxor	0x20(%rsp), @XMM[6]
2751238384Sjkim	movdqu	@XMM[1], 0x10($out)
2752238384Sjkim	movdqu	@XMM[6], 0x20($out)
2753238384Sjkim	lea	0x30($out), $out
2754238384Sjkim
2755238384Sjkim	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2756238384Sjkim	jmp	.Lxts_dec_done
2757238384Sjkim.align	16
2758238384Sjkim.Lxts_dec_2:
2759238384Sjkim	pxor	@XMM[8+0], @XMM[0]
2760238384Sjkim	lea	0x20($inp), $inp
2761238384Sjkim	pxor	@XMM[8+1], @XMM[1]
2762238384Sjkim	lea	0x80(%rsp), %rax	# pass key schedule
2763238384Sjkim	mov	%edx, %r10d		# pass rounds
2764238384Sjkim
2765238384Sjkim	call	_bsaes_decrypt8
2766238384Sjkim
2767238384Sjkim	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2768238384Sjkim	pxor	0x10(%rsp), @XMM[1]
2769238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2770238384Sjkim	movdqu	@XMM[1], 0x10($out)
2771238384Sjkim	lea	0x20($out), $out
2772238384Sjkim
2773238384Sjkim	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2774238384Sjkim	jmp	.Lxts_dec_done
2775238384Sjkim.align	16
2776238384Sjkim.Lxts_dec_1:
2777238384Sjkim	pxor	@XMM[0], @XMM[8]
2778238384Sjkim	lea	0x10($inp), $inp
2779238384Sjkim	movdqa	@XMM[8], 0x20(%rbp)
2780238384Sjkim	lea	0x20(%rbp), $arg1
2781238384Sjkim	lea	0x20(%rbp), $arg2
2782238384Sjkim	lea	($key), $arg3
2783238384Sjkim	call	asm_AES_decrypt		# doesn't touch %xmm
2784238384Sjkim	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2785238384Sjkim	#pxor	@XMM[8], @XMM[0]
2786238384Sjkim	#lea	0x80(%rsp), %rax	# pass key schedule
2787238384Sjkim	#mov	%edx, %r10d		# pass rounds
2788238384Sjkim	#call	_bsaes_decrypt8
2789238384Sjkim	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2790238384Sjkim	movdqu	@XMM[0], 0x00($out)	# write output
2791238384Sjkim	lea	0x10($out), $out
2792238384Sjkim
2793238384Sjkim	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2794238384Sjkim
2795238384Sjkim.Lxts_dec_done:
2796238384Sjkim	and	\$15, %ebx
2797238384Sjkim	jz	.Lxts_dec_ret
2798238384Sjkim
2799238384Sjkim	pxor	$twtmp, $twtmp
2800238384Sjkim	movdqa	.Lxts_magic(%rip), $twmask
2801238384Sjkim	pcmpgtd	@XMM[7], $twtmp
2802238384Sjkim	pshufd	\$0x13, $twtmp, $twres
2803238384Sjkim	movdqa	@XMM[7], @XMM[6]
2804238384Sjkim	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
2805238384Sjkim	pand	$twmask, $twres		# isolate carry and residue
2806238384Sjkim	movdqu	($inp), @XMM[0]
2807238384Sjkim	pxor	$twres, @XMM[7]
2808238384Sjkim
2809238384Sjkim	lea	0x20(%rbp), $arg1
2810238384Sjkim	pxor	@XMM[7], @XMM[0]
2811238384Sjkim	lea	0x20(%rbp), $arg2
2812238384Sjkim	movdqa	@XMM[0], 0x20(%rbp)
2813238384Sjkim	lea	($key), $arg3
2814238384Sjkim	call	asm_AES_decrypt		# doesn't touch %xmm
2815238384Sjkim	pxor	0x20(%rbp), @XMM[7]
2816238384Sjkim	mov	$out, %rdx
2817238384Sjkim	movdqu	@XMM[7], ($out)
2818238384Sjkim
2819238384Sjkim.Lxts_dec_steal:
2820238384Sjkim	movzb	16($inp), %eax
2821238384Sjkim	movzb	(%rdx), %ecx
2822238384Sjkim	lea	1($inp), $inp
2823238384Sjkim	mov	%al, (%rdx)
2824238384Sjkim	mov	%cl, 16(%rdx)
2825238384Sjkim	lea	1(%rdx), %rdx
2826238384Sjkim	sub	\$1,%ebx
2827238384Sjkim	jnz	.Lxts_dec_steal
2828238384Sjkim
2829238384Sjkim	movdqu	($out), @XMM[0]
2830238384Sjkim	lea	0x20(%rbp), $arg1
2831238384Sjkim	pxor	@XMM[6], @XMM[0]
2832238384Sjkim	lea	0x20(%rbp), $arg2
2833238384Sjkim	movdqa	@XMM[0], 0x20(%rbp)
2834238384Sjkim	lea	($key), $arg3
2835238384Sjkim	call	asm_AES_decrypt		# doesn't touch %xmm
2836238384Sjkim	pxor	0x20(%rbp), @XMM[6]
2837238384Sjkim	movdqu	@XMM[6], ($out)
2838238384Sjkim
2839238384Sjkim.Lxts_dec_ret:
2840238384Sjkim	lea	(%rsp), %rax
2841238384Sjkim	pxor	%xmm0, %xmm0
2842238384Sjkim.Lxts_dec_bzero:			# wipe key schedule [if any]
2843238384Sjkim	movdqa	%xmm0, 0x00(%rax)
2844238384Sjkim	movdqa	%xmm0, 0x10(%rax)
2845238384Sjkim	lea	0x20(%rax), %rax
2846238384Sjkim	cmp	%rax, %rbp
2847238384Sjkim	ja	.Lxts_dec_bzero
2848238384Sjkim
2849238384Sjkim	lea	(%rbp),%rsp		# restore %rsp
2850238384Sjkim___
2851238384Sjkim$code.=<<___ if ($win64);
2852238384Sjkim	movaps	0x40(%rbp), %xmm6
2853238384Sjkim	movaps	0x50(%rbp), %xmm7
2854238384Sjkim	movaps	0x60(%rbp), %xmm8
2855238384Sjkim	movaps	0x70(%rbp), %xmm9
2856238384Sjkim	movaps	0x80(%rbp), %xmm10
2857238384Sjkim	movaps	0x90(%rbp), %xmm11
2858238384Sjkim	movaps	0xa0(%rbp), %xmm12
2859238384Sjkim	movaps	0xb0(%rbp), %xmm13
2860238384Sjkim	movaps	0xc0(%rbp), %xmm14
2861238384Sjkim	movaps	0xd0(%rbp), %xmm15
2862238384Sjkim	lea	0xa0(%rbp), %rsp
2863238384Sjkim___
2864238384Sjkim$code.=<<___;
2865238384Sjkim	mov	0x48(%rsp), %r15
2866238384Sjkim	mov	0x50(%rsp), %r14
2867238384Sjkim	mov	0x58(%rsp), %r13
2868238384Sjkim	mov	0x60(%rsp), %r12
2869238384Sjkim	mov	0x68(%rsp), %rbx
2870238384Sjkim	mov	0x70(%rsp), %rax
2871238384Sjkim	lea	0x78(%rsp), %rsp
2872238384Sjkim	mov	%rax, %rbp
2873238384Sjkim.Lxts_dec_epilogue:
2874238384Sjkim	ret
2875238384Sjkim.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
2876238384Sjkim___
2877238384Sjkim}
# Append the constant pool unconditionally (this heredoc has no flavour or
# feature condition).  Everything is emitted as one data object,
# _bsaes_const, aligned to 64 bytes at its start and padded to 64 bytes at
# its end.  The groups are the ones named by the assembler comments inside
# the table: InvShiftRows and ShiftRows permutations, bit-slice masks, the
# upper-dword byte swap, counter increments .LADD1-.LADD8, and .Lxts_magic,
# which the XTS code loads as its carry/residue mask when doubling the tweak.
# The .asciz line embeds an identification string in the object.
# NOTE(review): every value here is consumed by assembler outside this
# table; treat them as fixed data and do not reorder or "tidy" them.
2878238384Sjkim$code.=<<___;
2879238384Sjkim.type	_bsaes_const,\@object
2880238384Sjkim.align	64
2881238384Sjkim_bsaes_const:
2882238384Sjkim.LM0ISR:	# InvShiftRows constants
2883238384Sjkim	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
2884238384Sjkim.LISRM0:
2885238384Sjkim	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
2886238384Sjkim.LISR:
2887238384Sjkim	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
2888238384Sjkim.LBS0:		# bit-slice constants
2889238384Sjkim	.quad	0x5555555555555555, 0x5555555555555555
2890238384Sjkim.LBS1:
2891238384Sjkim	.quad	0x3333333333333333, 0x3333333333333333
2892238384Sjkim.LBS2:
2893238384Sjkim	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2894238384Sjkim.LSR:		# shiftrows constants
2895238384Sjkim	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
2896238384Sjkim.LSRM0:
2897238384Sjkim	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
2898238384Sjkim.LM0SR:
2899238384Sjkim	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
2900238384Sjkim.LSWPUP:	# byte-swap upper dword
2901238384Sjkim	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
2902238384Sjkim.LSWPUPM0SR:
2903238384Sjkim	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
2904238384Sjkim.LADD1:		# counter increment constants
2905238384Sjkim	.quad	0x0000000000000000, 0x0000000100000000
2906238384Sjkim.LADD2:
2907238384Sjkim	.quad	0x0000000000000000, 0x0000000200000000
2908238384Sjkim.LADD3:
2909238384Sjkim	.quad	0x0000000000000000, 0x0000000300000000
2910238384Sjkim.LADD4:
2911238384Sjkim	.quad	0x0000000000000000, 0x0000000400000000
2912238384Sjkim.LADD5:
2913238384Sjkim	.quad	0x0000000000000000, 0x0000000500000000
2914238384Sjkim.LADD6:
2915238384Sjkim	.quad	0x0000000000000000, 0x0000000600000000
2916238384Sjkim.LADD7:
2917238384Sjkim	.quad	0x0000000000000000, 0x0000000700000000
2918238384Sjkim.LADD8:
2919238384Sjkim	.quad	0x0000000000000000, 0x0000000800000000
2920238384Sjkim.Lxts_magic:
2921238384Sjkim	.long	0x87,0,1,0
2922238384Sjkim.Lmasks:
2923238384Sjkim	.quad	0x0101010101010101, 0x0101010101010101
2924238384Sjkim	.quad	0x0202020202020202, 0x0202020202020202
2925238384Sjkim	.quad	0x0404040404040404, 0x0404040404040404
2926238384Sjkim	.quad	0x0808080808080808, 0x0808080808080808
2927238384Sjkim.LM0:
2928238384Sjkim	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
2929238384Sjkim.L63:
2930238384Sjkim	.quad	0x6363636363636363, 0x6363636363636363
2931238384Sjkim.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia K��sper, Peter Schwabe, Andy Polyakov"
2932238384Sjkim.align	64
2933238384Sjkim.size	_bsaes_const,.-_bsaes_const
2934238384Sjkim___
2935238384Sjkim
# Win64 structured-exception-handling support.  Generated only when $win64
# is set: one shared handler routine (se_handler) followed by the .pdata and
# .xdata records that attach it to each routine listed there.  The handler's
# C-level prototype is:
#
2936238384Sjkim# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2937238384Sjkim#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2938238384Sjkimif ($win64) {
# Register names for the four handler arguments, in prototype order.
2939238384Sjkim$rec="%rcx";
2940238384Sjkim$frame="%rdx";
2941238384Sjkim$context="%r8";
2942238384Sjkim$disp="%r9";
2943238384Sjkim
# se_handler flow, following the inline assembler comments below:
#  1. Load context->Rax and context->Rip.  If Rip lies below HandlerData[0]
#     (the .L*_body label named in the routine's .xdata record), jump to
#     .Lin_prologue with the Rax value as the stack pointer to report
#     (bsaes_xts_decrypt, for example, copies %rsp into %rax on entry).
#  2. Otherwise load context->Rsp.  If Rip is at or past HandlerData[1]
#     (the .L*_epilogue label), jump to .Lin_prologue with that value.
#  3. Otherwise treat context->Rbp as the frame base: copy 20 quadwords
#     (ten xmm registers) from 0x40(frame) to context offset 512, skip 0xa0
#     bytes, reload %rbp/%rbx/%r12-%r15 from offsets 0x70..0x48, skip a
#     further 0x78 bytes, and write the reloaded registers into the context.
#  4. .Lin_prologue: store the computed value as context->Rsp, copy the
#     1232-byte context into disp->ContextRecord, call RtlVirtualUnwind and
#     return 1 (ExceptionContinueSearch).
# NOTE(review): the frame offsets in step 3 match the epilogue of
# bsaes_xts_decrypt; presumably every routine registered below shares that
# layout -- re-check this handler whenever a prologue or epilogue changes.
2944238384Sjkim$code.=<<___;
2945238384Sjkim.extern	__imp_RtlVirtualUnwind
2946238384Sjkim.type	se_handler,\@abi-omnipotent
2947238384Sjkim.align	16
2948238384Sjkimse_handler:
2949238384Sjkim	push	%rsi
2950238384Sjkim	push	%rdi
2951238384Sjkim	push	%rbx
2952238384Sjkim	push	%rbp
2953238384Sjkim	push	%r12
2954238384Sjkim	push	%r13
2955238384Sjkim	push	%r14
2956238384Sjkim	push	%r15
2957238384Sjkim	pushfq
2958238384Sjkim	sub	\$64,%rsp
2959238384Sjkim
2960238384Sjkim	mov	120($context),%rax	# pull context->Rax
2961238384Sjkim	mov	248($context),%rbx	# pull context->Rip
2962238384Sjkim
2963238384Sjkim	mov	8($disp),%rsi		# disp->ImageBase
2964238384Sjkim	mov	56($disp),%r11		# disp->HandlerData
2965238384Sjkim
2966238384Sjkim	mov	0(%r11),%r10d		# HandlerData[0]
2967238384Sjkim	lea	(%rsi,%r10),%r10	# prologue label
2968238384Sjkim	cmp	%r10,%rbx		# context->Rip<prologue label
2969238384Sjkim	jb	.Lin_prologue
2970238384Sjkim
2971238384Sjkim	mov	152($context),%rax	# pull context->Rsp
2972238384Sjkim
2973238384Sjkim	mov	4(%r11),%r10d		# HandlerData[1]
2974238384Sjkim	lea	(%rsi,%r10),%r10	# epilogue label
2975238384Sjkim	cmp	%r10,%rbx		# context->Rip>=epilogue label
2976238384Sjkim	jae	.Lin_prologue
2977238384Sjkim
2978238384Sjkim	mov	160($context),%rax	# pull context->Rbp
2979238384Sjkim
2980238384Sjkim	lea	0x40(%rax),%rsi		# %xmm save area
2981238384Sjkim	lea	512($context),%rdi	# &context.Xmm6
2982238384Sjkim	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
2983238384Sjkim	.long	0xa548f3fc		# cld; rep movsq
2984238384Sjkim	lea	0xa0(%rax),%rax		# adjust stack pointer
2985238384Sjkim
2986238384Sjkim	mov	0x70(%rax),%rbp
2987238384Sjkim	mov	0x68(%rax),%rbx
2988238384Sjkim	mov	0x60(%rax),%r12
2989238384Sjkim	mov	0x58(%rax),%r13
2990238384Sjkim	mov	0x50(%rax),%r14
2991238384Sjkim	mov	0x48(%rax),%r15
2992238384Sjkim	lea	0x78(%rax),%rax		# adjust stack pointer
2993238384Sjkim	mov	%rbx,144($context)	# restore context->Rbx
2994238384Sjkim	mov	%rbp,160($context)	# restore context->Rbp
2995238384Sjkim	mov	%r12,216($context)	# restore context->R12
2996238384Sjkim	mov	%r13,224($context)	# restore context->R13
2997238384Sjkim	mov	%r14,232($context)	# restore context->R14
2998238384Sjkim	mov	%r15,240($context)	# restore context->R15
2999238384Sjkim
3000238384Sjkim.Lin_prologue:
3001238384Sjkim	mov	%rax,152($context)	# restore context->Rsp
3002238384Sjkim
3003238384Sjkim	mov	40($disp),%rdi		# disp->ContextRecord
3004238384Sjkim	mov	$context,%rsi		# context
3005238384Sjkim	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
3006238384Sjkim	.long	0xa548f3fc		# cld; rep movsq
3007238384Sjkim
3008238384Sjkim	mov	$disp,%rsi
3009238384Sjkim	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3010238384Sjkim	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3011238384Sjkim	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3012238384Sjkim	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3013238384Sjkim	mov	40(%rsi),%r10		# disp->ContextRecord
3014238384Sjkim	lea	56(%rsi),%r11		# &disp->HandlerData
3015238384Sjkim	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3016238384Sjkim	mov	%r10,32(%rsp)		# arg5
3017238384Sjkim	mov	%r11,40(%rsp)		# arg6
3018238384Sjkim	mov	%r12,48(%rsp)		# arg7
3019238384Sjkim	mov	%rcx,56(%rsp)		# arg8, (NULL)
3020238384Sjkim	call	*__imp_RtlVirtualUnwind(%rip)
3021238384Sjkim
3022238384Sjkim	mov	\$1,%eax		# ExceptionContinueSearch
3023238384Sjkim	add	\$64,%rsp
3024238384Sjkim	popfq
3025238384Sjkim	pop	%r15
3026238384Sjkim	pop	%r14
3027238384Sjkim	pop	%r13
3028238384Sjkim	pop	%r12
3029238384Sjkim	pop	%rbp
3030238384Sjkim	pop	%rbx
3031238384Sjkim	pop	%rdi
3032238384Sjkim	pop	%rsi
3033238384Sjkim	ret
3034238384Sjkim.size	se_handler,.-se_handler
3035238384Sjkim
3036238384Sjkim.section	.pdata
3037238384Sjkim.align	4
3038238384Sjkim___
# .pdata entries: three RVAs per routine -- its prologue label, its epilogue
# label and its .xdata info record.  The two ECB entries are emitted only
# when $ecb is set; the CBC, CTR and XTS entries are always emitted.
3039238384Sjkim$code.=<<___ if ($ecb);
3040238384Sjkim	.rva	.Lecb_enc_prologue
3041238384Sjkim	.rva	.Lecb_enc_epilogue
3042238384Sjkim	.rva	.Lecb_enc_info
3043238384Sjkim
3044238384Sjkim	.rva	.Lecb_dec_prologue
3045238384Sjkim	.rva	.Lecb_dec_epilogue
3046238384Sjkim	.rva	.Lecb_dec_info
3047238384Sjkim___
3048238384Sjkim$code.=<<___;
3049238384Sjkim	.rva	.Lcbc_dec_prologue
3050238384Sjkim	.rva	.Lcbc_dec_epilogue
3051238384Sjkim	.rva	.Lcbc_dec_info
3052238384Sjkim
3053238384Sjkim	.rva	.Lctr_enc_prologue
3054238384Sjkim	.rva	.Lctr_enc_epilogue
3055238384Sjkim	.rva	.Lctr_enc_info
3056238384Sjkim
3057238384Sjkim	.rva	.Lxts_enc_prologue
3058238384Sjkim	.rva	.Lxts_enc_epilogue
3059238384Sjkim	.rva	.Lxts_enc_info
3060238384Sjkim
3061238384Sjkim	.rva	.Lxts_dec_prologue
3062238384Sjkim	.rva	.Lxts_dec_epilogue
3063238384Sjkim	.rva	.Lxts_dec_info
3064238384Sjkim
3065238384Sjkim.section	.xdata
3066238384Sjkim.align	8
3067238384Sjkim___
# .xdata info records, one per .pdata entry above.  Each names se_handler
# and then lists the two RVAs the handler reads back as HandlerData[0]
# (the routine's body label) and HandlerData[1] (its epilogue label).
# NOTE(review): the leading ".byte 9,0,0,0" is presumably the UNWIND_INFO
# header (version 1 with the exception-handler flag set, no unwind codes)
# -- confirm against the Windows x64 unwind-data documentation.
3068238384Sjkim$code.=<<___ if ($ecb);
3069238384Sjkim.Lecb_enc_info:
3070238384Sjkim	.byte	9,0,0,0
3071238384Sjkim	.rva	se_handler
3072238384Sjkim	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
3073238384Sjkim.Lecb_dec_info:
3074238384Sjkim	.byte	9,0,0,0
3075238384Sjkim	.rva	se_handler
3076238384Sjkim	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
3077238384Sjkim___
3078238384Sjkim$code.=<<___;
3079238384Sjkim.Lcbc_dec_info:
3080238384Sjkim	.byte	9,0,0,0
3081238384Sjkim	.rva	se_handler
3082238384Sjkim	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
3083238384Sjkim.Lctr_enc_info:
3084238384Sjkim	.byte	9,0,0,0
3085238384Sjkim	.rva	se_handler
3086238384Sjkim	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
3087238384Sjkim.Lxts_enc_info:
3088238384Sjkim	.byte	9,0,0,0
3089238384Sjkim	.rva	se_handler
3090238384Sjkim	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
3091238384Sjkim.Lxts_dec_info:
3092238384Sjkim	.byte	9,0,0,0
3093238384Sjkim	.rva	se_handler
3094238384Sjkim	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
3095238384Sjkim___
3096238384Sjkim}
3097238384Sjkim
# Final pass: replace every `...` span in the accumulated assembler text with
# the result of evaluating it as Perl (e.g. `128-32`, `0x10*$i`, `1232/8`),
# so the emitted assembler contains plain numeric literals.
3098238384Sjkim$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3099238384Sjkim
3100238384Sjkimprint $code;
3101238384Sjkim
# Buffered write errors only surface when the handle is closed, and closing a
# piped handle also reports a failing child process.  Fail loudly here so the
# build never silently consumes a truncated or untranslated assembler file.
3102238384Sjkimclose STDOUT or die "error closing STDOUT: $!";
3103