1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * aes-ce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions
4 *
5 * Copyright (C) 2013 - 2017 Linaro Ltd.
6 * Copyright (C) 2024 Google LLC
7 *
8 * Author: Ard Biesheuvel <ardb@kernel.org>
9 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
14	.text
15	.arch	armv8-a+crypto
16
	.macro	load_round_keys, rk, nr, tmp
	/*
	 * Load the AES round keys for \nr rounds (10, 12 or 14) from \rk.
	 *
	 * The first four keys always go to v10-v13, but the remaining
	 * loads are anchored at \rk + (\nr - 10) * 16 so that the LAST
	 * round keys end up in the same registers (…, v3, v4, v5) for
	 * every key size.  For \nr < 14, v14… merely alias earlier keys;
	 * aes_encrypt skips the redundant v10-v13 rounds accordingly.
	 */
	sub	w\tmp, \nr, #10
	add	\tmp, \rk, w\tmp, sxtw #4	/* \tmp = \rk + (\nr - 10) * 16 */
	ld1	{v10.4s-v13.4s}, [\rk]
	ld1	{v14.4s-v17.4s}, [\tmp], #64
	ld1	{v18.4s-v21.4s}, [\tmp], #64
	ld1	{v3.4s-v5.4s}, [\tmp]
	.endm
25
	.macro	dround, va, vb, vk
	/*
	 * One AES encryption round (AESE = AddRoundKey + SubBytes +
	 * ShiftRows, AESMC = MixColumns) with round key \vk, applied to
	 * the two independent blocks \va and \vb.  Interleaving the two
	 * blocks hides the latency of the AES instructions.
	 */
	aese	\va\().16b, \vk\().16b
	aesmc	\va\().16b, \va\().16b
	aese	\vb\().16b, \vk\().16b
	aesmc	\vb\().16b, \vb\().16b
	.endm
32
	.macro	aes_encrypt, va, vb, nr
	/*
	 * Encrypt the blocks in \va and \vb in parallel using \nr rounds
	 * (10, 12 or 14), with the round keys laid out by
	 * load_round_keys.  NOTE: the final AddRoundKey with v5 is
	 * deliberately OMITTED here - callers fold it into a later EOR,
	 * or it cancels out entirely (see ce_aes_ccm_final).
	 */
	tbz	\nr, #2, .L\@			/* \nr == 10: skip v10-v13 rounds */
	dround	\va, \vb, v10
	dround	\va, \vb, v11
	tbz	\nr, #1, .L\@			/* \nr == 12: skip v12/v13 rounds */
	dround	\va, \vb, v12
	dround	\va, \vb, v13
.L\@:	.irp	v, v14, v15, v16, v17, v18, v19, v20, v21, v3
	dround	\va, \vb, \v
	.endr
	aese	\va\().16b, v4.16b		/* last round: no MixColumns, */
	aese	\vb\().16b, v4.16b		/* final AddRoundKey left to caller */
	.endm
46
	.macro	aes_ccm_do_crypt,enc
	/*
	 * CCM en/decrypt whole 16-byte blocks while updating the
	 * CBC-MAC in parallel.  Register convention (see prototypes):
	 *
	 *   x0 : out          x1 : in            w2 : cbytes
	 *   x3 : round keys   w4 : # of rounds   x5 : mac
	 *   x6 : ctr          x7 : final_iv (may be NULL)
	 *
	 * The MAC (v0) and the counter block (v1) are encrypted as a
	 * pair by aes_encrypt, which omits the final AddRoundKey with
	 * v5 for both; the missing v5 terms are folded into the data
	 * EORs below, saving two vector EORs per block.
	 */
	load_round_keys	x3, w4, x10

	ld1	{v0.16b}, [x5]			/* load mac */
	cbz	x2, ce_aes_ccm_final		/* no data: just finalize mac */
	ldr	x8, [x6, #8]			/* load lower ctr */
CPU_LE(	rev	x8, x8			)	/* keep swabbed ctr in reg */
0:	/* outer loop */
	ld1	{v1.8b}, [x6]			/* load upper ctr */
	prfm	pldl1strm, [x1]
	add	x8, x8, #1
	rev	x9, x8
	ins	v1.d[1], x9			/* no carry in lower ctr */

	aes_encrypt	v0, v1, w4

	subs	w2, w2, #16
	bmi	ce_aes_ccm_crypt_tail		/* < 16 bytes left: partial block */
	ld1	{v2.16b}, [x1], #16		/* load next input block */
	.if	\enc == 1
	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
	eor	v6.16b, v1.16b, v2.16b		/* xor with crypted ctr */
	.else
	eor	v2.16b, v2.16b, v1.16b		/* xor with crypted ctr */
	eor	v6.16b, v2.16b, v5.16b		/* final round enc */
	.endif
	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
	st1	{v6.16b}, [x0], #16		/* write output block */
	bne	0b
CPU_LE(	rev	x8, x8			)
	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
	cbnz	x7, ce_aes_ccm_final		/* final_iv given: finalize mac */
	st1	{v0.16b}, [x5]			/* store mac */
	ret
	.endm
82
SYM_FUNC_START_LOCAL(ce_aes_ccm_crypt_tail)
	/*
	 * Handle the final, partial block of input.  On entry:
	 *   w2 : remaining bytes minus 16, i.e. a NEGATIVE value
	 *   v0 : mac, v1 : encrypted ctr - both still missing the
	 *        final AddRoundKey with v5 (deferred by aes_encrypt)
	 *   v6 : the previous (full) output block
	 *   v22: all ones for encryption, all zeroes for decryption
	 */
	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
	eor	v1.16b, v1.16b, v5.16b		/* final round enc */

	add	x1, x1, w2, sxtw		/* rewind the input pointer (w2 < 0) */
	add	x0, x0, w2, sxtw		/* rewind the output pointer */

	/*
	 * Slide a window over .Lpermute (which is surrounded by 0xff
	 * filler) to build TBL/TBX index vectors for the byte count:
	 * 0xff entries make TBL produce zero and TBX leave the
	 * destination byte unchanged.
	 */
	adr_l	x8, .Lpermute			/* load permute vectors */
	add	x9, x8, w2, sxtw		/* window for v7/v8 (w2 < 0) */
	sub	x8, x8, w2, sxtw		/* window for v9 */
	ld1	{v7.16b-v8.16b}, [x9]
	ld1	{v9.16b}, [x8]

	ld1	{v2.16b}, [x1]			/* load a full block of input */
	tbl	v1.16b, {v1.16b}, v7.16b	/* move keystream to end of register */
	eor	v7.16b, v2.16b, v1.16b		/* encrypt partial input block */
	bif	v2.16b, v7.16b, v22.16b		/* select plaintext */
	tbx	v7.16b, {v6.16b}, v8.16b	/* insert output from previous iteration */
	tbl	v2.16b, {v2.16b}, v9.16b	/* copy plaintext to start of v2 */
	eor	v0.16b, v0.16b, v2.16b		/* fold plaintext into mac */

	st1	{v7.16b}, [x0]			/* store output block */
	cbz	x7, 0f				/* no final_iv: store mac and return */

SYM_INNER_LABEL(ce_aes_ccm_final, SYM_L_LOCAL)
	/*
	 * Finalize: encrypt the counter block from final_iv (x7) and
	 * xor it into the mac.  aes_encrypt omitted the final
	 * AddRoundKey with v5 for BOTH v0 and v1, so the two missing
	 * v5 terms cancel in the EOR below.
	 */
	ld1	{v1.16b}, [x7]			/* load 1st ctriv */

	aes_encrypt	v0, v1, w4

	/* final round key cancels out */
	eor	v0.16b, v0.16b, v1.16b		/* en-/decrypt the mac */
0:	st1	{v0.16b}, [x5]			/* store result */
	ret
SYM_FUNC_END(ce_aes_ccm_crypt_tail)
117
118	/*
119	 * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
120	 * 			   u8 const rk[], u32 rounds, u8 mac[],
121	 * 			   u8 ctr[], u8 const final_iv[]);
122	 * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
123	 * 			   u8 const rk[], u32 rounds, u8 mac[],
124	 * 			   u8 ctr[], u8 const final_iv[]);
125	 */
SYM_FUNC_START(ce_aes_ccm_encrypt)
	/*
	 * v22 = all ones: in the partial-block tail, BIF then selects
	 * the raw INPUT (v2) as the plaintext to fold into the mac.
	 */
	movi	v22.16b, #255
	aes_ccm_do_crypt	1
SYM_FUNC_END(ce_aes_ccm_encrypt)
130
SYM_FUNC_START(ce_aes_ccm_decrypt)
	/*
	 * v22 = all zeroes: in the partial-block tail, BIF then selects
	 * the decrypted OUTPUT (v7) as the plaintext to fold into the mac.
	 */
	movi	v22.16b, #0
	aes_ccm_do_crypt	0
SYM_FUNC_END(ce_aes_ccm_decrypt)
135
	.section ".rodata", "a"
	.align	6
	/*
	 * Permute table for the partial-block tail, padded on both
	 * sides with 0xff filler:
	 *
	 *     15 x 0xff | 0x00 .. 0x0f | 15 x 0xff
	 *
	 * ce_aes_ccm_crypt_tail loads TBL/TBX index vectors from
	 * .Lpermute +/- (cbytes - 16); the out-of-range 0xff entries
	 * make TBL yield zero bytes and TBX leave destination bytes
	 * untouched, implementing the shift/zero-pad of the final
	 * partial block without any data-dependent branches.
	 */
	.fill	15, 1, 0xff
.Lpermute:
	.byte	0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte	0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.fill	15, 1, 0xff
143