1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * AES-XTS for modern x86_64 CPUs
4 *
5 * Copyright 2024 Google LLC
6 *
7 * Author: Eric Biggers <ebiggers@google.com>
8 */
9
10/*
11 * This file implements AES-XTS for modern x86_64 CPUs.  To handle the
12 * complexities of coding for x86 SIMD, e.g. where every vector length needs
13 * different code, it uses a macro to generate several implementations that
14 * share similar source code but are targeted at different CPUs, listed below:
15 *
16 * AES-NI + AVX
17 *    - 128-bit vectors (1 AES block per vector)
18 *    - VEX-coded instructions
19 *    - xmm0-xmm15
20 *    - This is for older CPUs that lack VAES but do have AVX.
21 *
22 * VAES + VPCLMULQDQ + AVX2
23 *    - 256-bit vectors (2 AES blocks per vector)
24 *    - VEX-coded instructions
25 *    - ymm0-ymm15
26 *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
27 *      e.g. Intel's Alder Lake and AMD's Zen 3.
28 *
29 * VAES + VPCLMULQDQ + AVX10/256 + BMI2
30 *    - 256-bit vectors (2 AES blocks per vector)
31 *    - EVEX-coded instructions
32 *    - ymm0-ymm31
33 *    - This is for CPUs that have AVX512 but where using zmm registers causes
34 *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
35 *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
36 *      To avoid confusion with 512-bit, we just write AVX10/256.
37 *
38 * VAES + VPCLMULQDQ + AVX10/512 + BMI2
39 *    - Same as the previous one, but upgrades to 512-bit vectors
40 *      (4 AES blocks per vector) in zmm0-zmm31.
41 *    - This is for CPUs that have good AVX512 or AVX10/512 support.
42 *
43 * This file doesn't have an implementation for AES-NI alone (without AVX), as
44 * the lack of VEX would make all the assembly code different.
45 *
46 * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
47 * the XTS tweaks.  This avoids a bottleneck.  Currently there don't seem to be
48 * any CPUs that support VAES but not VPCLMULQDQ.  If that changes, we might
49 * need to start also providing an implementation using VAES alone.
50 *
51 * The AES-XTS implementations in this file support everything required by the
52 * crypto API, including support for arbitrary input lengths and multi-part
53 * processing.  However, they are most heavily optimized for the common case of
54 * power-of-2 length inputs that are processed in a single part (disk sectors).
55 */
56
57#include <linux/linkage.h>
58#include <linux/cfi_types.h>
59
60.section .rodata
61.p2align 4
62.Lgf_poly:
63	// The low 64 bits of this value represent the polynomial x^7 + x^2 + x
64	// + 1.  It is the value that must be XOR'd into the low 64 bits of the
65	// tweak each time a 1 is carried out of the high 64 bits.
66	//
67	// The high 64 bits of this value is just the internal carry bit that
68	// exists when there's a carry out of the low 64 bits of the tweak.
69	.quad	0x87, 1
70
71	// This table contains constants for vpshufb and vpblendvb, used to
72	// handle variable byte shifts and blending during ciphertext stealing
73	// on CPUs that don't support AVX10-style masking.
74.Lcts_permute_table:
75	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
76	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
77	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
78	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
79	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
80	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
81.text
82
83// Function parameters
84.set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
85				// advanced to point to 7th-from-last round key
86.set	SRC,		%rsi	// Pointer to next source data
87.set	DST,		%rdx	// Pointer to next destination data
88.set	LEN,		%ecx	// Remaining length in bytes
89.set	LEN8,		%cl
90.set	LEN64,		%rcx
91.set	TWEAK,		%r8	// Pointer to next tweak
92
93// %rax holds the AES key length in bytes.
94.set	KEYLEN,		%eax
95.set	KEYLEN64,	%rax
96
97// %r9-r11 are available as temporaries.
98
99.macro	_define_Vi	i
100.if VL == 16
101	.set	V\i,		%xmm\i
102.elseif VL == 32
103	.set	V\i,		%ymm\i
104.elseif VL == 64
105	.set	V\i,		%zmm\i
106.else
107	.error "Unsupported Vector Length (VL)"
108.endif
109.endm
110
111.macro _define_aliases
112	// Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
113	// are available, that map to the xmm, ymm, or zmm registers according
114	// to the selected Vector Length (VL).
115	_define_Vi	0
116	_define_Vi	1
117	_define_Vi	2
118	_define_Vi	3
119	_define_Vi	4
120	_define_Vi	5
121	_define_Vi	6
122	_define_Vi	7
123	_define_Vi	8
124	_define_Vi	9
125	_define_Vi	10
126	_define_Vi	11
127	_define_Vi	12
128	_define_Vi	13
129	_define_Vi	14
130	_define_Vi	15
131.if USE_AVX10
132	_define_Vi	16
133	_define_Vi	17
134	_define_Vi	18
135	_define_Vi	19
136	_define_Vi	20
137	_define_Vi	21
138	_define_Vi	22
139	_define_Vi	23
140	_define_Vi	24
141	_define_Vi	25
142	_define_Vi	26
143	_define_Vi	27
144	_define_Vi	28
145	_define_Vi	29
146	_define_Vi	30
147	_define_Vi	31
148.endif
149
150	// V0-V3 hold the data blocks during the main loop, or temporary values
151	// otherwise.  V4-V5 hold temporary values.
152
153	// V6-V9 hold XTS tweaks.  Each 128-bit lane holds one tweak.
154	.set	TWEAK0_XMM,	%xmm6
155	.set	TWEAK0,		V6
156	.set	TWEAK1_XMM,	%xmm7
157	.set	TWEAK1,		V7
158	.set	TWEAK2,		V8
159	.set	TWEAK3,		V9
160
161	// V10-V13 are used for computing the next values of TWEAK[0-3].
162	.set	NEXT_TWEAK0,	V10
163	.set	NEXT_TWEAK1,	V11
164	.set	NEXT_TWEAK2,	V12
165	.set	NEXT_TWEAK3,	V13
166
167	// V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
168	.set	GF_POLY_XMM,	%xmm14
169	.set	GF_POLY,	V14
170
171	// V15 holds the key for AES "round 0", copied to all 128-bit lanes.
172	.set	KEY0_XMM,	%xmm15
173	.set	KEY0,		V15
174
175	// If 32 SIMD registers are available, then V16-V29 hold the remaining
176	// AES round keys, copied to all 128-bit lanes.
177	//
178	// AES-128, AES-192, and AES-256 use different numbers of round keys.
179	// To allow handling all three variants efficiently, we align the round
180	// keys to the *end* of this register range.  I.e., AES-128 uses
181	// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
182	// (All also use KEY0 for the XOR-only "round" at the beginning.)
183.if USE_AVX10
184	.set	KEY1_XMM,	%xmm16
185	.set	KEY1,		V16
186	.set	KEY2_XMM,	%xmm17
187	.set	KEY2,		V17
188	.set	KEY3_XMM,	%xmm18
189	.set	KEY3,		V18
190	.set	KEY4_XMM,	%xmm19
191	.set	KEY4,		V19
192	.set	KEY5_XMM,	%xmm20
193	.set	KEY5,		V20
194	.set	KEY6_XMM,	%xmm21
195	.set	KEY6,		V21
196	.set	KEY7_XMM,	%xmm22
197	.set	KEY7,		V22
198	.set	KEY8_XMM,	%xmm23
199	.set	KEY8,		V23
200	.set	KEY9_XMM,	%xmm24
201	.set	KEY9,		V24
202	.set	KEY10_XMM,	%xmm25
203	.set	KEY10,		V25
204	.set	KEY11_XMM,	%xmm26
205	.set	KEY11,		V26
206	.set	KEY12_XMM,	%xmm27
207	.set	KEY12,		V27
208	.set	KEY13_XMM,	%xmm28
209	.set	KEY13,		V28
210	.set	KEY14_XMM,	%xmm29
211	.set	KEY14,		V29
212.endif
213	// V30-V31 are currently unused.
214.endm
215
216// Move a vector between memory and a register.
217.macro	_vmovdqu	src, dst
218.if VL < 64
219	vmovdqu		\src, \dst
220.else
221	vmovdqu8	\src, \dst
222.endif
223.endm
224
225// Broadcast a 128-bit value into a vector.
226.macro	_vbroadcast128	src, dst
227.if VL == 16 && !USE_AVX10
228	vmovdqu		\src, \dst
229.elseif VL == 32 && !USE_AVX10
230	vbroadcasti128	\src, \dst
231.else
232	vbroadcasti32x4	\src, \dst
233.endif
234.endm
235
236// XOR two vectors together.
237.macro	_vpxor	src1, src2, dst
238.if USE_AVX10
239	vpxord		\src1, \src2, \dst
240.else
241	vpxor		\src1, \src2, \dst
242.endif
243.endm
244
245// XOR three vectors together.
246.macro	_xor3	src1, src2, src3_and_dst
247.if USE_AVX10
248	// vpternlogd with immediate 0x96 is a three-argument XOR.
249	vpternlogd	$0x96, \src1, \src2, \src3_and_dst
250.else
251	vpxor		\src1, \src3_and_dst, \src3_and_dst
252	vpxor		\src2, \src3_and_dst, \src3_and_dst
253.endif
254.endm
255
256// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
257// (by multiplying by the polynomial 'x') and write it to \dst.
258.macro	_next_tweak	src, tmp, dst
259	vpshufd		$0x13, \src, \tmp
260	vpaddq		\src, \src, \dst
261	vpsrad		$31, \tmp, \tmp
262	vpand		GF_POLY_XMM, \tmp, \tmp
263	vpxor		\tmp, \dst, \dst
264.endm
265
266// Given the XTS tweak(s) in the vector \src, compute the next vector of
267// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
268//
269// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
270// all tweaks in the vector in parallel.  If VL=16, we just do the regular
271// computation without vpclmulqdq, as it's the faster method for a single tweak.
272.macro	_next_tweakvec	src, tmp1, tmp2, dst
273.if VL == 16
274	_next_tweak	\src, \tmp1, \dst
275.else
276	vpsrlq		$64 - VL/16, \src, \tmp1
277	vpclmulqdq	$0x01, GF_POLY, \tmp1, \tmp2
278	vpslldq		$8, \tmp1, \tmp1
279	vpsllq		$VL/16, \src, \dst
280	_xor3		\tmp1, \tmp2, \dst
281.endif
282.endm
283
284// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
285// store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
286.macro	_compute_first_set_of_tweaks
287	vmovdqu		(TWEAK), TWEAK0_XMM
288	_vbroadcast128	.Lgf_poly(%rip), GF_POLY
289.if VL == 16
290	// With VL=16, multiplying by x serially is fastest.
291	_next_tweak	TWEAK0, %xmm0, TWEAK1
292	_next_tweak	TWEAK1, %xmm0, TWEAK2
293	_next_tweak	TWEAK2, %xmm0, TWEAK3
294.else
295.if VL == 32
296	// Compute the second block of TWEAK0.
297	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
298	vinserti128	$1, %xmm1, TWEAK0, TWEAK0
299.elseif VL == 64
300	// Compute the remaining blocks of TWEAK0.
301	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
302	_next_tweak	%xmm1, %xmm0, %xmm2
303	_next_tweak	%xmm2, %xmm0, %xmm3
304	vinserti32x4	$1, %xmm1, TWEAK0, TWEAK0
305	vinserti32x4	$2, %xmm2, TWEAK0, TWEAK0
306	vinserti32x4	$3, %xmm3, TWEAK0, TWEAK0
307.endif
308	// Compute TWEAK[1-3] from TWEAK0.
309	vpsrlq		$64 - 1*VL/16, TWEAK0, V0
310	vpsrlq		$64 - 2*VL/16, TWEAK0, V2
311	vpsrlq		$64 - 3*VL/16, TWEAK0, V4
312	vpclmulqdq	$0x01, GF_POLY, V0, V1
313	vpclmulqdq	$0x01, GF_POLY, V2, V3
314	vpclmulqdq	$0x01, GF_POLY, V4, V5
315	vpslldq		$8, V0, V0
316	vpslldq		$8, V2, V2
317	vpslldq		$8, V4, V4
318	vpsllq		$1*VL/16, TWEAK0, TWEAK1
319	vpsllq		$2*VL/16, TWEAK0, TWEAK2
320	vpsllq		$3*VL/16, TWEAK0, TWEAK3
321.if USE_AVX10
322	vpternlogd	$0x96, V0, V1, TWEAK1
323	vpternlogd	$0x96, V2, V3, TWEAK2
324	vpternlogd	$0x96, V4, V5, TWEAK3
325.else
326	vpxor		V0, TWEAK1, TWEAK1
327	vpxor		V2, TWEAK2, TWEAK2
328	vpxor		V4, TWEAK3, TWEAK3
329	vpxor		V1, TWEAK1, TWEAK1
330	vpxor		V3, TWEAK2, TWEAK2
331	vpxor		V5, TWEAK3, TWEAK3
332.endif
333.endif
334.endm
335
336// Do one step in computing the next set of tweaks using the method of just
337// multiplying by x repeatedly (the same method _next_tweak uses).
338.macro	_tweak_step_mulx	i
339.if \i == 0
340	.set PREV_TWEAK, TWEAK3
341	.set NEXT_TWEAK, NEXT_TWEAK0
342.elseif \i == 5
343	.set PREV_TWEAK, NEXT_TWEAK0
344	.set NEXT_TWEAK, NEXT_TWEAK1
345.elseif \i == 10
346	.set PREV_TWEAK, NEXT_TWEAK1
347	.set NEXT_TWEAK, NEXT_TWEAK2
348.elseif \i == 15
349	.set PREV_TWEAK, NEXT_TWEAK2
350	.set NEXT_TWEAK, NEXT_TWEAK3
351.endif
352.if \i >= 0 && \i < 20 && \i % 5 == 0
353	vpshufd		$0x13, PREV_TWEAK, V5
354.elseif \i >= 0 && \i < 20 && \i % 5 == 1
355	vpaddq		PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
356.elseif \i >= 0 && \i < 20 && \i % 5 == 2
357	vpsrad		$31, V5, V5
358.elseif \i >= 0 && \i < 20 && \i % 5 == 3
359	vpand		GF_POLY, V5, V5
360.elseif \i >= 0 && \i < 20 && \i % 5 == 4
361	vpxor		V5, NEXT_TWEAK, NEXT_TWEAK
362.elseif \i == 1000
363	vmovdqa		NEXT_TWEAK0, TWEAK0
364	vmovdqa		NEXT_TWEAK1, TWEAK1
365	vmovdqa		NEXT_TWEAK2, TWEAK2
366	vmovdqa		NEXT_TWEAK3, TWEAK3
367.endif
368.endm
369
370// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
371// (the same method _next_tweakvec uses for VL > 16).  This means multiplying
372// each tweak by x^(4*VL/16) independently.  Since 4*VL/16 is a multiple of 8
373// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
374// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
375.macro	_tweak_step_pclmul	i
376.if \i == 0
377	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
378.elseif \i == 2
379	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
380.elseif \i == 4
381	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
382.elseif \i == 6
383	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
384.elseif \i == 8
385	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
386.elseif \i == 10
387	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
388.elseif \i == 12
389	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
390.elseif \i == 14
391	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
392.elseif \i == 1000
393	vpslldq		$(4*VL/16) / 8, TWEAK0, TWEAK0
394	vpslldq		$(4*VL/16) / 8, TWEAK1, TWEAK1
395	vpslldq		$(4*VL/16) / 8, TWEAK2, TWEAK2
396	vpslldq		$(4*VL/16) / 8, TWEAK3, TWEAK3
397	_vpxor		NEXT_TWEAK0, TWEAK0, TWEAK0
398	_vpxor		NEXT_TWEAK1, TWEAK1, TWEAK1
399	_vpxor		NEXT_TWEAK2, TWEAK2, TWEAK2
400	_vpxor		NEXT_TWEAK3, TWEAK3, TWEAK3
401.endif
402.endm
403
404// _tweak_step does one step of the computation of the next set of tweaks from
405// TWEAK[0-3].  To complete all steps, this is invoked with increasing values of
406// \i that include at least 0 through 19, then 1000 which signals the last step.
407//
408// This is used to interleave the computation of the next set of tweaks with the
409// AES en/decryptions, which increases performance in some cases.
410.macro	_tweak_step	i
411.if VL == 16
412	_tweak_step_mulx	\i
413.else
414	_tweak_step_pclmul	\i
415.endif
416.endm
417
418.macro	_setup_round_keys	enc
419
420	// Select either the encryption round keys or the decryption round keys.
421.if \enc
422	.set	OFFS, 0
423.else
424	.set	OFFS, 240
425.endif
426
427	// Load the round key for "round 0".
428	_vbroadcast128	OFFS(KEY), KEY0
429
430	// Increment KEY to make it so that 7*16(KEY) is the last round key.
431	// For AES-128, increment by 3*16, resulting in the 10 round keys (not
432	// counting the zero-th round key which was just loaded into KEY0) being
433	// -2*16(KEY) through 7*16(KEY).  For AES-192, increment by 5*16 and use
434	// 12 round keys -4*16(KEY) through 7*16(KEY).  For AES-256, increment
435	// by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
436	//
437	// This rebasing provides two benefits.  First, it makes the offset to
438	// any round key be in the range [-96, 112], fitting in a signed byte.
439	// This shortens VEX-encoded instructions that access the later round
440	// keys which otherwise would need 4-byte offsets.  Second, it makes it
441	// easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
442	// beginning.  Skipping rounds at the end doesn't work as well because
443	// the last round needs different instructions.
444	//
445	// An alternative approach would be to roll up all the round loops.  We
446	// don't do that because it isn't compatible with caching the round keys
447	// in registers which we do when possible (see below), and also because
448	// it seems unwise to rely *too* heavily on the CPU's branch predictor.
449	lea		OFFS-16(KEY, KEYLEN64, 4), KEY
450
451	// If all 32 SIMD registers are available, cache all the round keys.
452.if USE_AVX10
453	cmp		$24, KEYLEN
454	jl		.Laes128\@
455	je		.Laes192\@
456	_vbroadcast128	-6*16(KEY), KEY1
457	_vbroadcast128	-5*16(KEY), KEY2
458.Laes192\@:
459	_vbroadcast128	-4*16(KEY), KEY3
460	_vbroadcast128	-3*16(KEY), KEY4
461.Laes128\@:
462	_vbroadcast128	-2*16(KEY), KEY5
463	_vbroadcast128	-1*16(KEY), KEY6
464	_vbroadcast128	0*16(KEY), KEY7
465	_vbroadcast128	1*16(KEY), KEY8
466	_vbroadcast128	2*16(KEY), KEY9
467	_vbroadcast128	3*16(KEY), KEY10
468	_vbroadcast128	4*16(KEY), KEY11
469	_vbroadcast128	5*16(KEY), KEY12
470	_vbroadcast128	6*16(KEY), KEY13
471	_vbroadcast128	7*16(KEY), KEY14
472.endif
473.endm
474
475// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
476// on the block(s) in \data using the round key(s) in \key.  The register length
477// determines the number of AES blocks en/decrypted.
478.macro	_vaes	enc, last, key, data
479.if \enc
480.if \last
481	vaesenclast	\key, \data, \data
482.else
483	vaesenc		\key, \data, \data
484.endif
485.else
486.if \last
487	vaesdeclast	\key, \data, \data
488.else
489	vaesdec		\key, \data, \data
490.endif
491.endif
492.endm
493
494// Do a single round of AES en/decryption on the block(s) in \data, using the
495// same key for all block(s).  The round key is loaded from the appropriate
496// register or memory location for round \i.  May clobber V4.
497.macro _vaes_1x		enc, last, i, xmm_suffix, data
498.if USE_AVX10
499	_vaes		\enc, \last, KEY\i\xmm_suffix, \data
500.else
501.ifnb \xmm_suffix
502	_vaes		\enc, \last, (\i-7)*16(KEY), \data
503.else
504	_vbroadcast128	(\i-7)*16(KEY), V4
505	_vaes		\enc, \last, V4, \data
506.endif
507.endif
508.endm
509
510// Do a single round of AES en/decryption on the blocks in registers V0-V3,
511// using the same key for all blocks.  The round key is loaded from the
512// appropriate register or memory location for round \i.  In addition, does two
513// steps of the computation of the next set of tweaks.  May clobber V4.
514.macro	_vaes_4x	enc, last, i
515.if USE_AVX10
516	_tweak_step	(2*(\i-5))
517	_vaes		\enc, \last, KEY\i, V0
518	_vaes		\enc, \last, KEY\i, V1
519	_tweak_step	(2*(\i-5) + 1)
520	_vaes		\enc, \last, KEY\i, V2
521	_vaes		\enc, \last, KEY\i, V3
522.else
523	_vbroadcast128	(\i-7)*16(KEY), V4
524	_tweak_step	(2*(\i-5))
525	_vaes		\enc, \last, V4, V0
526	_vaes		\enc, \last, V4, V1
527	_tweak_step	(2*(\i-5) + 1)
528	_vaes		\enc, \last, V4, V2
529	_vaes		\enc, \last, V4, V3
530.endif
531.endm
532
533// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
534// then XOR with \tweak again) of the block(s) in \data.  To process a single
535// block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
536// length VL, use V* registers and leave \xmm_suffix empty.  May clobber V4.
537.macro	_aes_crypt	enc, xmm_suffix, tweak, data
538	_xor3		KEY0\xmm_suffix, \tweak, \data
539	cmp		$24, KEYLEN
540	jl		.Laes128\@
541	je		.Laes192\@
542	_vaes_1x	\enc, 0, 1, \xmm_suffix, \data
543	_vaes_1x	\enc, 0, 2, \xmm_suffix, \data
544.Laes192\@:
545	_vaes_1x	\enc, 0, 3, \xmm_suffix, \data
546	_vaes_1x	\enc, 0, 4, \xmm_suffix, \data
547.Laes128\@:
548	_vaes_1x	\enc, 0, 5, \xmm_suffix, \data
549	_vaes_1x	\enc, 0, 6, \xmm_suffix, \data
550	_vaes_1x	\enc, 0, 7, \xmm_suffix, \data
551	_vaes_1x	\enc, 0, 8, \xmm_suffix, \data
552	_vaes_1x	\enc, 0, 9, \xmm_suffix, \data
553	_vaes_1x	\enc, 0, 10, \xmm_suffix, \data
554	_vaes_1x	\enc, 0, 11, \xmm_suffix, \data
555	_vaes_1x	\enc, 0, 12, \xmm_suffix, \data
556	_vaes_1x	\enc, 0, 13, \xmm_suffix, \data
557	_vaes_1x	\enc, 1, 14, \xmm_suffix, \data
558	_vpxor		\tweak, \data, \data
559.endm
560
561.macro	_aes_xts_crypt	enc
562	_define_aliases
563
564.if !\enc
565	// When decrypting a message whose length isn't a multiple of the AES
566	// block length, exclude the last full block from the main loop by
567	// subtracting 16 from LEN.  This is needed because ciphertext stealing
568	// decryption uses the last two tweaks in reverse order.  We'll handle
569	// the last full block and the partial block specially at the end.
570	lea		-16(LEN), %eax
571	test		$15, LEN8
572	cmovnz		%eax, LEN
573.endif
574
575	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
576	movl		480(KEY), KEYLEN
577
578	// Setup the pointer to the round keys and cache as many as possible.
579	_setup_round_keys	\enc
580
581	// Compute the first set of tweaks TWEAK[0-3].
582	_compute_first_set_of_tweaks
583
584	sub		$4*VL, LEN
585	jl		.Lhandle_remainder\@
586
587.Lmain_loop\@:
588	// This is the main loop, en/decrypting 4*VL bytes per iteration.
589
590	// XOR each source block with its tweak and the zero-th round key.
591.if USE_AVX10
592	vmovdqu8	0*VL(SRC), V0
593	vmovdqu8	1*VL(SRC), V1
594	vmovdqu8	2*VL(SRC), V2
595	vmovdqu8	3*VL(SRC), V3
596	vpternlogd	$0x96, TWEAK0, KEY0, V0
597	vpternlogd	$0x96, TWEAK1, KEY0, V1
598	vpternlogd	$0x96, TWEAK2, KEY0, V2
599	vpternlogd	$0x96, TWEAK3, KEY0, V3
600.else
601	vpxor		0*VL(SRC), KEY0, V0
602	vpxor		1*VL(SRC), KEY0, V1
603	vpxor		2*VL(SRC), KEY0, V2
604	vpxor		3*VL(SRC), KEY0, V3
605	vpxor		TWEAK0, V0, V0
606	vpxor		TWEAK1, V1, V1
607	vpxor		TWEAK2, V2, V2
608	vpxor		TWEAK3, V3, V3
609.endif
610	cmp		$24, KEYLEN
611	jl		.Laes128\@
612	je		.Laes192\@
613	// Do all the AES rounds on the data blocks, interleaved with
614	// the computation of the next set of tweaks.
615	_vaes_4x	\enc, 0, 1
616	_vaes_4x	\enc, 0, 2
617.Laes192\@:
618	_vaes_4x	\enc, 0, 3
619	_vaes_4x	\enc, 0, 4
620.Laes128\@:
621	_vaes_4x	\enc, 0, 5
622	_vaes_4x	\enc, 0, 6
623	_vaes_4x	\enc, 0, 7
624	_vaes_4x	\enc, 0, 8
625	_vaes_4x	\enc, 0, 9
626	_vaes_4x	\enc, 0, 10
627	_vaes_4x	\enc, 0, 11
628	_vaes_4x	\enc, 0, 12
629	_vaes_4x	\enc, 0, 13
630	_vaes_4x	\enc, 1, 14
631
632	// XOR in the tweaks again.
633	_vpxor		TWEAK0, V0, V0
634	_vpxor		TWEAK1, V1, V1
635	_vpxor		TWEAK2, V2, V2
636	_vpxor		TWEAK3, V3, V3
637
638	// Store the destination blocks.
639	_vmovdqu	V0, 0*VL(DST)
640	_vmovdqu	V1, 1*VL(DST)
641	_vmovdqu	V2, 2*VL(DST)
642	_vmovdqu	V3, 3*VL(DST)
643
644	// Finish computing the next set of tweaks.
645	_tweak_step	1000
646
647	add		$4*VL, SRC
648	add		$4*VL, DST
649	sub		$4*VL, LEN
650	jge		.Lmain_loop\@
651
652	// Check for the uncommon case where the data length isn't a multiple of
653	// 4*VL.  Handle it out-of-line in order to optimize for the common
654	// case.  In the common case, just fall through to the ret.
655	test		$4*VL-1, LEN8
656	jnz		.Lhandle_remainder\@
657.Ldone\@:
658	// Store the next tweak back to *TWEAK to support continuation calls.
659	vmovdqu		TWEAK0_XMM, (TWEAK)
660.if VL > 16
661	vzeroupper
662.endif
663	RET
664
665.Lhandle_remainder\@:
666
667	// En/decrypt any remaining full blocks, one vector at a time.
668.if VL > 16
669	add		$3*VL, LEN	// Undo extra sub of 4*VL, then sub VL.
670	jl		.Lvec_at_a_time_done\@
671.Lvec_at_a_time\@:
672	_vmovdqu	(SRC), V0
673	_aes_crypt	\enc, , TWEAK0, V0
674	_vmovdqu	V0, (DST)
675	_next_tweakvec	TWEAK0, V0, V1, TWEAK0
676	add		$VL, SRC
677	add		$VL, DST
678	sub		$VL, LEN
679	jge		.Lvec_at_a_time\@
680.Lvec_at_a_time_done\@:
681	add		$VL-16, LEN	// Undo extra sub of VL, then sub 16.
682.else
683	add		$4*VL-16, LEN	// Undo extra sub of 4*VL, then sub 16.
684.endif
685
686	// En/decrypt any remaining full blocks, one at a time.
687	jl		.Lblock_at_a_time_done\@
688.Lblock_at_a_time\@:
689	vmovdqu		(SRC), %xmm0
690	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
691	vmovdqu		%xmm0, (DST)
692	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK0_XMM
693	add		$16, SRC
694	add		$16, DST
695	sub		$16, LEN
696	jge		.Lblock_at_a_time\@
697.Lblock_at_a_time_done\@:
698	add		$16, LEN	// Undo the extra sub of 16.
699	// Now 0 <= LEN <= 15.  If LEN is zero, we're done.
700	jz		.Ldone\@
701
702	// Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
703	// Do ciphertext stealing to process the last 16 + LEN bytes.
704
705.if \enc
706	// If encrypting, the main loop already encrypted the last full block to
707	// create the CTS intermediate ciphertext.  Prepare for the rest of CTS
708	// by rewinding the pointers and loading the intermediate ciphertext.
709	sub		$16, SRC
710	sub		$16, DST
711	vmovdqu		(DST), %xmm0
712.else
713	// If decrypting, the main loop didn't decrypt the last full block
714	// because CTS decryption uses the last two tweaks in reverse order.
715	// Do it now by advancing the tweak and decrypting the last full block.
716	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
717	vmovdqu		(SRC), %xmm0
718	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0
719.endif
720
721.if USE_AVX10
722	// Create a mask that has the first LEN bits set.
723	mov		$-1, %r9d
724	bzhi		LEN, %r9d, %r9d
725	kmovd		%r9d, %k1
726
727	// Swap the first LEN bytes of the en/decryption of the last full block
728	// with the partial block.  Note that to support in-place en/decryption,
729	// the load from the src partial block must happen before the store to
730	// the dst partial block.
731	vmovdqa		%xmm0, %xmm1
732	vmovdqu8	16(SRC), %xmm0{%k1}
733	vmovdqu8	%xmm1, 16(DST){%k1}
734.else
735	lea		.Lcts_permute_table(%rip), %r9
736
737	// Load the src partial block, left-aligned.  Note that to support
738	// in-place en/decryption, this must happen before the store to the dst
739	// partial block.
740	vmovdqu		(SRC, LEN64, 1), %xmm1
741
742	// Shift the first LEN bytes of the en/decryption of the last full block
743	// to the end of a register, then store it to DST+LEN.  This stores the
744	// dst partial block.  It also writes to the second part of the dst last
745	// full block, but that part is overwritten later.
746	vpshufb		(%r9, LEN64, 1), %xmm0, %xmm2
747	vmovdqu		%xmm2, (DST, LEN64, 1)
748
749	// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
750	sub		LEN64, %r9
751	vmovdqu		32(%r9), %xmm3
752
753	// Shift the src partial block to the beginning of its register.
754	vpshufb		%xmm3, %xmm1, %xmm1
755
756	// Do a blend to generate the src partial block followed by the second
757	// part of the en/decryption of the last full block.
758	vpblendvb	%xmm3, %xmm0, %xmm1, %xmm0
759.endif
760	// En/decrypt again and store the last full block.
761	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
762	vmovdqu		%xmm0, (DST)
763	jmp		.Ldone\@
764.endm
765
766// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
767//			   u8 iv[AES_BLOCK_SIZE]);
768SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
769	vmovdqu		(%rsi), %xmm0
770	vpxor		(%rdi), %xmm0, %xmm0
771	movl		480(%rdi), %eax		// AES key length
772	lea		-16(%rdi, %rax, 4), %rdi
773	cmp		$24, %eax
774	jl		.Lencrypt_iv_aes128
775	je		.Lencrypt_iv_aes192
776	vaesenc		-6*16(%rdi), %xmm0, %xmm0
777	vaesenc		-5*16(%rdi), %xmm0, %xmm0
778.Lencrypt_iv_aes192:
779	vaesenc		-4*16(%rdi), %xmm0, %xmm0
780	vaesenc		-3*16(%rdi), %xmm0, %xmm0
781.Lencrypt_iv_aes128:
782	vaesenc		-2*16(%rdi), %xmm0, %xmm0
783	vaesenc		-1*16(%rdi), %xmm0, %xmm0
784	vaesenc		0*16(%rdi), %xmm0, %xmm0
785	vaesenc		1*16(%rdi), %xmm0, %xmm0
786	vaesenc		2*16(%rdi), %xmm0, %xmm0
787	vaesenc		3*16(%rdi), %xmm0, %xmm0
788	vaesenc		4*16(%rdi), %xmm0, %xmm0
789	vaesenc		5*16(%rdi), %xmm0, %xmm0
790	vaesenc		6*16(%rdi), %xmm0, %xmm0
791	vaesenclast	7*16(%rdi), %xmm0, %xmm0
792	vmovdqu		%xmm0, (%rsi)
793	RET
794SYM_FUNC_END(aes_xts_encrypt_iv)
795
796// Below are the actual AES-XTS encryption and decryption functions,
797// instantiated from the above macro.  They all have the following prototype:
798//
799// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
800//			const u8 *src, u8 *dst, unsigned int len,
801//			u8 tweak[AES_BLOCK_SIZE]);
802//
803// |key| is the data key.  |tweak| contains the next tweak; the encryption of
804// the original IV with the tweak key was already done.  This function supports
805// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
806// |len| must be a multiple of 16 except on the last call.  If |len| is a
807// multiple of 16, then this function updates |tweak| to contain the next tweak.
808
809.set	VL, 16
810.set	USE_AVX10, 0
811SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
812	_aes_xts_crypt	1
813SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
814SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
815	_aes_xts_crypt	0
816SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
817
818#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
819.set	VL, 32
820.set	USE_AVX10, 0
821SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
822	_aes_xts_crypt	1
823SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
824SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
825	_aes_xts_crypt	0
826SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
827
828.set	VL, 32
829.set	USE_AVX10, 1
830SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
831	_aes_xts_crypt	1
832SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
833SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
834	_aes_xts_crypt	0
835SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)
836
837.set	VL, 64
838.set	USE_AVX10, 1
839SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
840	_aes_xts_crypt	1
841SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
842SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
843	_aes_xts_crypt	0
844SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
845#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
846