/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128
// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')

#include <linux/linkage.h>

.text
.option arch, +zvkb

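// Register assignments. The function arguments arrive in a0-a4 (KEYP, INP,
// OUTP, LEN, IVP). The remaining scalar registers hold state words and
// temporaries; the key, counter, and nonce words live in callee-saved
// registers s0-s11, which is why the prologue below spills them to the stack.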
#define KEYP		a0
#define INP		a1
#define OUTP		a2
#define LEN		a3
#define IVP		a4

#define CONSTS0		a5
#define CONSTS1		a6
#define CONSTS2		a7
#define CONSTS3		t0
#define TMP		t1
#define VL		t2
#define STRIDE		t3
#define NROUNDS		t4
#define KEY0		s0
#define KEY1		s1
#define KEY2		s2
#define KEY3		s3
#define KEY4		s4
#define KEY5		s5
#define KEY6		s6
#define KEY7		s7
#define COUNTER		s8
#define NONCE0		s9
#define NONCE1		s10
#define NONCE2		s11

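// One ChaCha round applied to all blocks at once: each quarter-round
// (a, b, c, d) operates on four whole vector registers, where element j of
// each register belongs to block j. Zvkb provides a rotate-right-immediate
// (vror.vi) but no rotate-left-immediate, so rol(x, n) is expressed as a
// rotate right by (32 - n).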
.macro	chacha_round	a0, b0, c0, d0,  a1, b1, c1, d1, \
			a2, b2, c2, d2,  a3, b3, c3, d3
	// a += b; d ^= a; d = rol(d, 16);
	vadd.vv		\a0, \a0, \b0
	vadd.vv		\a1, \a1, \b1
	vadd.vv		\a2, \a2, \b2
	vadd.vv		\a3, \a3, \b3
	vxor.vv		\d0, \d0, \a0
	vxor.vv		\d1, \d1, \a1
	vxor.vv		\d2, \d2, \a2
	vxor.vv		\d3, \d3, \a3
	vror.vi		\d0, \d0, 32 - 16
	vror.vi		\d1, \d1, 32 - 16
	vror.vi		\d2, \d2, 32 - 16
	vror.vi		\d3, \d3, 32 - 16

	// c += d; b ^= c; b = rol(b, 12);
	vadd.vv		\c0, \c0, \d0
	vadd.vv		\c1, \c1, \d1
	vadd.vv		\c2, \c2, \d2
	vadd.vv		\c3, \c3, \d3
	vxor.vv		\b0, \b0, \c0
	vxor.vv		\b1, \b1, \c1
	vxor.vv		\b2, \b2, \c2
	vxor.vv		\b3, \b3, \c3
	vror.vi		\b0, \b0, 32 - 12
	vror.vi		\b1, \b1, 32 - 12
	vror.vi		\b2, \b2, 32 - 12
	vror.vi		\b3, \b3, 32 - 12

	// a += b; d ^= a; d = rol(d, 8);
	vadd.vv		\a0, \a0, \b0
	vadd.vv		\a1, \a1, \b1
	vadd.vv		\a2, \a2, \b2
	vadd.vv		\a3, \a3, \b3
	vxor.vv		\d0, \d0, \a0
	vxor.vv		\d1, \d1, \a1
	vxor.vv		\d2, \d2, \a2
	vxor.vv		\d3, \d3, \a3
	vror.vi		\d0, \d0, 32 - 8
	vror.vi		\d1, \d1, 32 - 8
	vror.vi		\d2, \d2, 32 - 8
	vror.vi		\d3, \d3, 32 - 8

	// c += d; b ^= c; b = rol(b, 7);
	vadd.vv		\c0, \c0, \d0
	vadd.vv		\c1, \c1, \d1
	vadd.vv		\c2, \c2, \d2
	vadd.vv		\c3, \c3, \d3
	vxor.vv		\b0, \b0, \c0
	vxor.vv		\b1, \b1, \c1
	vxor.vv		\b2, \b2, \c2
	vxor.vv		\b3, \b3, \c3
	vror.vi		\b0, \b0, 32 - 7
	vror.vi		\b1, \b1, 32 - 7
	vror.vi		\b2, \b2, 32 - 7
	vror.vi		\b3, \b3, 32 - 7
.endm

// void chacha20_zvkb(const u32 key[8], const u8 *in, u8 *out, size_t len,
//		      const u32 iv[4]);
//
// |len| must be nonzero and a multiple of 64 (CHACHA_BLOCK_SIZE).
// The counter is treated as 32-bit, following the RFC 7539 convention.
SYM_FUNC_START(chacha20_zvkb)
	srli		LEN, LEN, 6	// Bytes to blocks

	addi		sp, sp, -96
	sd		s0, 0(sp)
	sd		s1, 8(sp)
	sd		s2, 16(sp)
	sd		s3, 24(sp)
	sd		s4, 32(sp)
	sd		s5, 40(sp)
	sd		s6, 48(sp)
	sd		s7, 56(sp)
	sd		s8, 64(sp)
	sd		s9, 72(sp)
	sd		s10, 80(sp)
	sd		s11, 88(sp)

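	// Each ChaCha block is 64 bytes; STRIDE is the byte distance between
	// consecutive blocks, used by the strided segment loads/stores below.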
	li		STRIDE, 64

	// Set up the initial state matrix in scalar registers.
	li		CONSTS0, 0x61707865	// "expa" little endian
	li		CONSTS1, 0x3320646e	// "nd 3" little endian
	li		CONSTS2, 0x79622d32	// "2-by" little endian
	li		CONSTS3, 0x6b206574	// "te k" little endian
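	// Together the four constants spell "expand 32-byte k".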
	lw		KEY0, 0(KEYP)
	lw		KEY1, 4(KEYP)
	lw		KEY2, 8(KEYP)
	lw		KEY3, 12(KEYP)
	lw		KEY4, 16(KEYP)
	lw		KEY5, 20(KEYP)
	lw		KEY6, 24(KEYP)
	lw		KEY7, 28(KEYP)
	lw		COUNTER, 0(IVP)
	lw		NONCE0, 4(IVP)
	lw		NONCE1, 8(IVP)
	lw		NONCE2, 12(IVP)

.Lblock_loop:
	// Set vl to the number of blocks to process in this iteration.
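	// With e32 and LMUL=1, each vector register holds VLEN/32 32-bit
	// elements, so up to VLEN/32 blocks are processed per iteration.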
	vsetvli		VL, LEN, e32, m1, ta, ma

	// Set up the initial state matrix for the next VL blocks in v0-v15.
	// v{i} holds the i'th 32-bit word of the state matrix for all blocks.
	// Note that only the counter word, at index 12, differs across blocks.
	vmv.v.x		v0, CONSTS0
	vmv.v.x		v1, CONSTS1
	vmv.v.x		v2, CONSTS2
	vmv.v.x		v3, CONSTS3
	vmv.v.x		v4, KEY0
	vmv.v.x		v5, KEY1
	vmv.v.x		v6, KEY2
	vmv.v.x		v7, KEY3
	vmv.v.x		v8, KEY4
	vmv.v.x		v9, KEY5
	vmv.v.x		v10, KEY6
	vmv.v.x		v11, KEY7
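	// vid.v writes the element indices 0, 1, 2, ..., so block i gets the
	// counter value COUNTER + i.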
	vid.v		v12
	vadd.vx		v12, v12, COUNTER
	vmv.v.x		v13, NONCE0
	vmv.v.x		v14, NONCE1
	vmv.v.x		v15, NONCE2

	// Load the first half of the input data for each block into v16-v23.
	// v{16+i} holds the i'th 32-bit word for all blocks.
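	// The strided segment load reads 8 consecutive 32-bit words starting
	// at each block's base address (blocks are STRIDE = 64 bytes apart)
	// and places word i of every block into register v{16+i}, i.e. it
	// transposes the byte stream into the same word-sliced layout as the
	// state registers.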
	vlsseg8e32.v	v16, (INP), STRIDE

	li		NROUNDS, 20
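	// ChaCha20 performs its 20 rounds as 10 double-rounds (a column round
	// followed by a diagonal round); NROUNDS counts the remaining rounds.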
.Lnext_doubleround:
	addi		NROUNDS, NROUNDS, -2
	// column round
	chacha_round	v0, v4, v8, v12, v1, v5, v9, v13, \
			v2, v6, v10, v14, v3, v7, v11, v15
	// diagonal round
	chacha_round	v0, v5, v10, v15, v1, v6, v11, v12, \
			v2, v7, v8, v13, v3, v4, v9, v14
	bnez		NROUNDS, .Lnext_doubleround

	// Load the second half of the input data for each block into v24-v31.
	// v{24+i} holds the {8+i}'th 32-bit word for all blocks.
	addi		TMP, INP, 32
	vlsseg8e32.v	v24, (TMP), STRIDE

	// Finalize the first half of the keystream for each block.
	vadd.vx		v0, v0, CONSTS0
	vadd.vx		v1, v1, CONSTS1
	vadd.vx		v2, v2, CONSTS2
	vadd.vx		v3, v3, CONSTS3
	vadd.vx		v4, v4, KEY0
	vadd.vx		v5, v5, KEY1
	vadd.vx		v6, v6, KEY2
	vadd.vx		v7, v7, KEY3

	// Encrypt/decrypt the first half of the data for each block.
	vxor.vv		v16, v16, v0
	vxor.vv		v17, v17, v1
	vxor.vv		v18, v18, v2
	vxor.vv		v19, v19, v3
	vxor.vv		v20, v20, v4
	vxor.vv		v21, v21, v5
	vxor.vv		v22, v22, v6
	vxor.vv		v23, v23, v7

	// Store the first half of the output data for each block.
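	// vssseg8e32.v is the store counterpart of the load above: it writes
	// word i of each block from v{16+i} back to that block's position in
	// the output stream.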
	vssseg8e32.v	v16, (OUTP), STRIDE

	// Finalize the second half of the keystream for each block.
	vadd.vx		v8, v8, KEY4
	vadd.vx		v9, v9, KEY5
	vadd.vx		v10, v10, KEY6
	vadd.vx		v11, v11, KEY7
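	// Word 12 (the counter) differs per block: its initial value was
	// COUNTER + block index, so regenerate the indices with vid.v into v0,
	// which is free again now that the first half has been stored.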
	vid.v		v0
	vadd.vx		v12, v12, COUNTER
	vadd.vx		v13, v13, NONCE0
	vadd.vx		v14, v14, NONCE1
	vadd.vx		v15, v15, NONCE2
	vadd.vv		v12, v12, v0

	// Encrypt/decrypt the second half of the data for each block.
	vxor.vv		v24, v24, v8
	vxor.vv		v25, v25, v9
	vxor.vv		v26, v26, v10
	vxor.vv		v27, v27, v11
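	// v28 is XORed after v29: v28 depends on v12, whose final add was the
	// last of the additions above, so deferring it presumably hides a bit
	// of latency on in-order cores. The two XORs are independent, so the
	// order does not affect correctness.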
	vxor.vv		v29, v29, v13
	vxor.vv		v28, v28, v12
	vxor.vv		v30, v30, v14
	vxor.vv		v31, v31, v15

	// Store the second half of the output data for each block.
	addi		TMP, OUTP, 32
	vssseg8e32.v	v24, (TMP), STRIDE

	// Update the counter, the remaining number of blocks, and the input and
	// output pointers according to the number of blocks processed (VL).
	add		COUNTER, COUNTER, VL
	sub		LEN, LEN, VL
	slli		TMP, VL, 6
	add		OUTP, OUTP, TMP
	add		INP, INP, TMP
	bnez		LEN, .Lblock_loop

	ld		s0, 0(sp)
	ld		s1, 8(sp)
	ld		s2, 16(sp)
	ld		s3, 24(sp)
	ld		s4, 32(sp)
	ld		s5, 40(sp)
	ld		s6, 48(sp)
	ld		s7, 56(sp)
	ld		s8, 64(sp)
	ld		s9, 72(sp)
	ld		s10, 80(sp)
	ld		s11, 88(sp)
	addi		sp, sp, 96
	ret
SYM_FUNC_END(chacha20_zvkb)