/*-
 * Copyright (c) 2016 The FreeBSD Foundation
 * Copyright (c) 2020 Ampere Computing
 * All rights reserved.
 *
 * This software was developed by Andrew Turner under
 * sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file is derived from aesni_wrap.c:
 * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
 * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
 * Copyright (c) 2014 The FreeBSD Foundation
 */

/*
 * This code is built with floating-point enabled. Callers must enter a
 * floating-point context before calling any of these functions.
 */
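/*
 * Illustrative sketch, not taken from this file: a kernel caller would
 * typically bracket these routines with the FPU helpers, e.g.
 *
 *	fpu_kern_enter(curthread, NULL, FPU_KERN_NORMAL | FPU_KERN_NOCTX);
 *	armv8_aes_encrypt_cbc(key, len, &fromc, &toc, iv);
 *	fpu_kern_leave(curthread, NULL);
 *
 * The exact flags depend on the caller's context.
 */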

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>

#include <opencrypto/cryptodev.h>
#include <opencrypto/gmac.h>
#include <crypto/rijndael/rijndael.h>
#include <crypto/armv8/armv8_crypto.h>

#include <arm_neon.h>

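/*
 * Encrypt one 16-byte block with the ARMv8 AESE/AESMC instructions.  The
 * rounds argument is the AES round count minus one (callers pass
 * aes_rounds - 1), so keysched must hold rounds + 2 round keys.
 */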
static uint8x16_t
armv8_aes_enc(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaeseq_u8(tmp, keysched[i]);
		tmp = vaesmcq_u8(tmp);
		tmp = vaeseq_u8(tmp, keysched[i + 1]);
		tmp = vaesmcq_u8(tmp);
	}

	tmp = vaeseq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesmcq_u8(tmp);
	tmp = vaeseq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

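/*
 * Decrypt one 16-byte block with the ARMv8 AESD/AESIMC instructions, using
 * the same key schedule layout as armv8_aes_enc().
 */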
static uint8x16_t
armv8_aes_dec(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaesdq_u8(tmp, keysched[i]);
		tmp = vaesimcq_u8(tmp);
		tmp = vaesdq_u8(tmp, keysched[i + 1]);
		tmp = vaesimcq_u8(tmp);
	}

	tmp = vaesdq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesimcq_u8(tmp);
	tmp = vaesdq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

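/*
 * CBC encryption over a pair of crypto buffer cursors.  Whole blocks within
 * a contiguous segment are processed directly; a block that straddles a
 * segment boundary is bounced through a local buffer.
 */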
void
armv8_aes_encrypt_cbc(const AES_key_t *key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t tot, ivreg, tmp;
	uint8_t block[AES_BLOCK_LEN], *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_BLOCK_LEN == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	ivreg = vld1q_u8(iv);
	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
			tmp = vld1q_u8(block);
			tot = armv8_aes_enc(key->aes_rounds - 1,
			    (const void *)key->aes_key, veorq_u8(tmp, ivreg));
			ivreg = tot;
			vst1q_u8(block, tot);
			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
			seglen = AES_BLOCK_LEN;
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				tmp = vld1q_u8(from);
				tot = armv8_aes_enc(key->aes_rounds - 1,
				    (const void *)key->aes_key,
				    veorq_u8(tmp, ivreg));
				ivreg = tot;
				vst1q_u8(to, tot);
				from += AES_BLOCK_LEN;
				to += AES_BLOCK_LEN;
			}
			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	explicit_bzero(block, sizeof(block));
}

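/*
 * CBC decryption counterpart of armv8_aes_encrypt_cbc(); each ciphertext
 * block is remembered as the IV for the next block.
 */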
void
armv8_aes_decrypt_cbc(const AES_key_t *key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t ivreg, nextiv, tmp;
	uint8_t block[AES_BLOCK_LEN], *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_BLOCK_LEN == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	ivreg = vld1q_u8(iv);
	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
			nextiv = vld1q_u8(block);
			tmp = armv8_aes_dec(key->aes_rounds - 1,
			    (const void *)key->aes_key, nextiv);
			vst1q_u8(block, veorq_u8(tmp, ivreg));
			ivreg = nextiv;
			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
			seglen = AES_BLOCK_LEN;
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				nextiv = vld1q_u8(from);
				tmp = armv8_aes_dec(key->aes_rounds - 1,
				    (const void *)key->aes_key, nextiv);
				vst1q_u8(to, veorq_u8(tmp, ivreg));
				ivreg = nextiv;
				from += AES_BLOCK_LEN;
				to += AES_BLOCK_LEN;
			}
			crypto_cursor_advance(fromc, oseglen - seglen);
			crypto_cursor_advance(toc, oseglen - seglen);
			seglen = oseglen - seglen;
		}
	}

	explicit_bzero(block, sizeof(block));
}

#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */

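/*
 * Advance the XTS tweak: multiply it by alpha in GF(2^128) by shifting each
 * 32-bit lane left by one and folding the carries back in, reducing the
 * carry out of the top lane with AES_XTS_ALPHA.
 */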
static inline int32x4_t
xts_crank_lfsr(int32x4_t inp)
{
	const int32x4_t alphamask = {AES_XTS_ALPHA, 1, 1, 1};
	int32x4_t xtweak, ret;

	/* set up xor mask */
	xtweak = vextq_s32(inp, inp, 3);
	xtweak = vshrq_n_s32(xtweak, 31);
	xtweak &= alphamask;

	/* next term */
	ret = vshlq_n_s32(inp, 1);
	ret ^= xtweak;

	return (ret);
}

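/*
 * Encrypt or decrypt a single XTS block: XOR with the tweak, run the block
 * cipher, XOR with the tweak again, then advance the tweak.
 */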
static void
armv8_aes_crypt_xts_block(int rounds, const uint8x16_t *key_schedule,
    uint8x16_t *tweak, const uint8_t *from, uint8_t *to, int do_encrypt)
{
	uint8x16_t block;

	block = vld1q_u8(from) ^ *tweak;

	if (do_encrypt)
		block = armv8_aes_enc(rounds - 1, key_schedule, block);
	else
		block = armv8_aes_dec(rounds - 1, key_schedule, block);

	vst1q_u8(to, block ^ *tweak);

	*tweak = vreinterpretq_u8_s32(
	    xts_crank_lfsr(vreinterpretq_s32_u8(*tweak)));
}

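/*
 * Common XTS worker: derive the initial tweak by encrypting the IV with the
 * tweak key, then walk the buffer cursors block by block, bouncing any block
 * that straddles a segment boundary through a local buffer.
 */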
static void
armv8_aes_crypt_xts(int rounds, const uint8x16_t *data_schedule,
    const uint8x16_t *tweak_schedule, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
	uint8x16_t tweakreg;
	uint8_t block[AES_XTS_BLOCKSIZE] __aligned(16);
	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
	uint8_t *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_XTS_BLOCKSIZE == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	/*
	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
	 * of a 64-bit block number which we allow to be passed in directly.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	bcopy(iv, tweak, AES_XTS_IVSIZE);
	/* Last 64 bits of IV are always zero. */
	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = vld1q_u8(tweak);
	tweakreg = armv8_aes_enc(rounds - 1, tweak_schedule, tweakreg);

	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_XTS_BLOCKSIZE) {
			crypto_cursor_copydata(fromc, AES_XTS_BLOCKSIZE, block);
			armv8_aes_crypt_xts_block(rounds, data_schedule,
			    &tweakreg, block, block, do_encrypt);
			crypto_cursor_copyback(toc, AES_XTS_BLOCKSIZE, block);
			seglen = AES_XTS_BLOCKSIZE;
		} else {
			for (oseglen = seglen; seglen >= AES_XTS_BLOCKSIZE;
			    seglen -= AES_XTS_BLOCKSIZE) {
				armv8_aes_crypt_xts_block(rounds, data_schedule,
				    &tweakreg, from, to, do_encrypt);
				from += AES_XTS_BLOCKSIZE;
				to += AES_XTS_BLOCKSIZE;
			}
			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	explicit_bzero(block, sizeof(block));
}

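/* Exported XTS entry points; thin wrappers around armv8_aes_crypt_xts(). */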
void
armv8_aes_encrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len, struct crypto_buffer_cursor *fromc,
    struct crypto_buffer_cursor *toc, const uint8_t iv[static AES_BLOCK_LEN])
{
	armv8_aes_crypt_xts(data_schedule->aes_rounds,
	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
	    toc, iv, 1);
}

void
armv8_aes_decrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	armv8_aes_crypt_xts(data_schedule->aes_rounds,
	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
	    toc, iv, 0);
}

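/* Increment the 128-bit big-endian CTR block in place. */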
#define	AES_INC_COUNTER(counter)				\
	do {							\
		for (int pos = AES_BLOCK_LEN - 1;		\
		     pos >= 0; pos--)				\
			if (++(counter)[pos])			\
				break;				\
	} while (0)

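/*
 * Per-request GCM state: EK0 is the encrypted initial counter block used to
 * finalize the tag, EKi the current keystream block, Xi the GHASH
 * accumulator, lenblock the trailing lengths block, and aes_counter the CTR
 * block incremented for each block of payload.
 */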
struct armv8_gcm_state {
	__uint128_val_t EK0;
	__uint128_val_t EKi;
	__uint128_val_t Xi;
	__uint128_val_t lenblock;
	uint8_t aes_counter[AES_BLOCK_LEN];
};

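/*
 * Initialize the GCM state: build the counter block from the IV, compute
 * EK0 for tag finalization, and absorb the AAD into the GHASH accumulator.
 */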
static void
armv8_aes_gmac_setup(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint8_t *authdata, size_t authdatalen,
    const uint8_t iv[static AES_GCM_IV_LEN], const __uint128_val_t *Htable)
{
	uint8_t block[AES_BLOCK_LEN];
	size_t trailer;

	bzero(s->aes_counter, AES_BLOCK_LEN);
	memcpy(s->aes_counter, iv, AES_GCM_IV_LEN);

	/* Set up the counter */
	s->aes_counter[AES_BLOCK_LEN - 1] = 1;

	/* EK0 for a final GMAC round */
	aes_v8_encrypt(s->aes_counter, s->EK0.c, aes_key);

	/* GCM starts with 2 as counter, 1 is used for final xor of tag. */
	s->aes_counter[AES_BLOCK_LEN - 1] = 2;

	memset(s->Xi.c, 0, sizeof(s->Xi.c));
	trailer = authdatalen % AES_BLOCK_LEN;
	if (authdatalen - trailer > 0) {
		gcm_ghash_v8(s->Xi.u, Htable, authdata, authdatalen - trailer);
		authdata += authdatalen - trailer;
	}
	if (trailer > 0 || authdatalen == 0) {
		memset(block, 0, sizeof(block));
		memcpy(block, authdata, trailer);
		gcm_ghash_v8(s->Xi.u, Htable, block, AES_BLOCK_LEN);
	}
}

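/*
 * Finish GMAC: hash the AAD/payload lengths block and XOR in EK0 to leave
 * the final tag in s->Xi.
 */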
static void
armv8_aes_gmac_finish(struct armv8_gcm_state *s, size_t len,
    size_t authdatalen, const __uint128_val_t *Htable)
{
	/* Lengths block */
	s->lenblock.u[0] = s->lenblock.u[1] = 0;
	s->lenblock.d[1] = htobe32(authdatalen * 8);
	s->lenblock.d[3] = htobe32(len * 8);
	gcm_ghash_v8(s->Xi.u, Htable, s->lenblock.c, AES_BLOCK_LEN);

	s->Xi.u[0] ^= s->EK0.u[0];
	s->Xi.u[1] ^= s->EK0.u[1];
}

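/*
 * CTR-mode encryption of one block: generate the keystream block EKi from
 * the current counter, advance the counter, and XOR with the input.
 */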
static void
armv8_aes_encrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint64_t *from, uint64_t *to)
{
	aes_v8_encrypt(s->aes_counter, s->EKi.c, aes_key);
	AES_INC_COUNTER(s->aes_counter);
	to[0] = from[0] ^ s->EKi.u[0];
	to[1] = from[1] ^ s->EKi.u[1];
}

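/* CTR mode is symmetric, so decryption reuses the encryption path. */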
static void
armv8_aes_decrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint64_t *from, uint64_t *to)
{
	armv8_aes_encrypt_gcm_block(s, aes_key, from, to);
}

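/*
 * AES-GCM encryption: encrypt the payload in CTR mode while feeding the
 * ciphertext through GHASH, then emit the final tag.  Short or
 * segment-straddling blocks go through a zero-padded bounce buffer.
 */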
void
armv8_aes_encrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen, const uint8_t *authdata,
    uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN);
	uint64_t *from64, *to64;
	size_t fromseglen, i, olen, oseglen, seglen, toseglen;

	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

	for (olen = len; len > 0; len -= seglen) {
		from64 = crypto_cursor_segment(fromc, &fromseglen);
		to64 = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			seglen = ulmin(len, AES_BLOCK_LEN);

			memset(block, 0, sizeof(block));
			crypto_cursor_copydata(fromc, (int)seglen, block);

			if (seglen == AES_BLOCK_LEN) {
				armv8_aes_encrypt_gcm_block(&s, aes_key,
				    (uint64_t *)block, (uint64_t *)block);
			} else {
				aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
				AES_INC_COUNTER(s.aes_counter);
				for (i = 0; i < seglen; i++)
					block[i] ^= s.EKi.c[i];
			}
			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);

			crypto_cursor_copyback(toc, (int)seglen, block);
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				armv8_aes_encrypt_gcm_block(&s, aes_key, from64,
				    to64);
				gcm_ghash_v8(s.Xi.u, Htable, (uint8_t *)to64,
				    AES_BLOCK_LEN);

				from64 += 2;
				to64 += 2;
			}

			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);
	memcpy(tag, s.Xi.c, GMAC_DIGEST_LEN);

	explicit_bzero(block, sizeof(block));
	explicit_bzero(&s, sizeof(s));
}

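/*
 * AES-GCM decryption runs in two passes: first GHASH the ciphertext and
 * verify the tag, then decrypt in CTR mode only if verification succeeded.
 * Returns 0 on success or EBADMSG if the tag does not match.
 */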
int
armv8_aes_decrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen, const uint8_t *authdata,
    const uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	struct crypto_buffer_cursor fromcc;
	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN), *from;
	uint64_t *block64, *from64, *to64;
	size_t fromseglen, olen, oseglen, seglen, toseglen;
	int error;

	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

	crypto_cursor_copy(fromc, &fromcc);
	for (olen = len; len > 0; len -= seglen) {
		from = crypto_cursor_segment(&fromcc, &fromseglen);
		seglen = ulmin(len, fromseglen);
		seglen -= seglen % AES_BLOCK_LEN;
		if (seglen > 0) {
			gcm_ghash_v8(s.Xi.u, Htable, from, seglen);
			crypto_cursor_advance(&fromcc, seglen);
		} else {
			memset(block, 0, sizeof(block));
			seglen = ulmin(len, AES_BLOCK_LEN);
			crypto_cursor_copydata(&fromcc, seglen, block);
			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);
		}
	}

	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);

	if (timingsafe_bcmp(tag, s.Xi.c, GMAC_DIGEST_LEN) != 0) {
		error = EBADMSG;
		goto out;
	}

	block64 = (uint64_t *)block;
	for (len = olen; len > 0; len -= seglen) {
		from64 = crypto_cursor_segment(fromc, &fromseglen);
		to64 = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			seglen = ulmin(len, AES_BLOCK_LEN);

			memset(block, 0, sizeof(block));
			crypto_cursor_copydata(fromc, seglen, block);

			armv8_aes_decrypt_gcm_block(&s, aes_key, block64,
			    block64);

			crypto_cursor_copyback(toc, (int)seglen, block);
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				armv8_aes_decrypt_gcm_block(&s, aes_key, from64,
				    to64);

				from64 += 2;
				to64 += 2;
			}

			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	error = 0;
out:
	explicit_bzero(block, sizeof(block));
	explicit_bzero(&s, sizeof(s));
	return (error);
}