/*-
 * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
 * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by John-Mark Gurney
 * under sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/libkern.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <crypto/aesni/aesni.h>

#include <opencrypto/gmac.h>

#include "aesencdec.h"
#include <smmintrin.h>

MALLOC_DECLARE(M_AESNI);

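/*
 * Eight AES blocks bundled together; the 8-way aesni_enc8()/aesni_dec8()
 * helpers consume and produce blocks in these groups so the AES rounds
 * of independent blocks can be pipelined.
 */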
struct blocks8 {
	__m128i	blk[8];
} __packed;

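/*
 * CBC encryption is inherently serial: each block's input is chained
 * through the previous ciphertext block, so there is no eight-way
 * variant of this routine.
 *
 * A minimal usage sketch (the buffer names are illustrative, not part
 * of this file); len must be a multiple of AES_BLOCK_LEN:
 *
 *	uint8_t sched[AES_SCHED_LEN] __aligned(16);
 *
 *	aesni_set_enckey(key, sched, AES128_ROUNDS);
 *	aesni_encrypt_cbc(AES128_ROUNDS, sched, sizeof(plain), plain,
 *	    cipher, iv);
 */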
void
aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i tot, ivreg;
	size_t i;

	len /= AES_BLOCK_LEN;
	ivreg = _mm_loadu_si128((const __m128i *)iv);
	for (i = 0; i < len; i++) {
		tot = aesni_enc(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from) ^ ivreg);
		ivreg = tot;
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

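/*
 * CBC decryption, unlike encryption, parallelizes: every ciphertext
 * block can be decrypted independently and then XORed with the
 * preceding ciphertext block.  The work is done in place, so each
 * ciphertext block is saved in nextiv before it is overwritten.
 */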
void
aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len,
    uint8_t *buf, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i blocks[8];
	struct blocks8 *blks;
	__m128i ivreg, nextiv;
	size_t i, j, cnt;

	ivreg = _mm_loadu_si128((const __m128i *)iv);
	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (struct blocks8 *)buf;
		aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], &blocks[0]);
		for (j = 0; j < 8; j++) {
			nextiv = blks->blk[j];
			blks->blk[j] = blocks[j] ^ ivreg;
			ivreg = nextiv;
		}
		buf += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		nextiv = _mm_loadu_si128((void *)buf);
		_mm_storeu_si128((void *)buf,
		    aesni_dec(rounds - 1, key_schedule, nextiv) ^ ivreg);
		ivreg = nextiv;
		buf += AES_BLOCK_LEN;
	}
}

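/*
 * ECB encrypts each block independently, so the main loop runs eight
 * blocks at a time through aesni_enc8() and a scalar loop handles the
 * remaining (len / AES_BLOCK_LEN) % 8 blocks.
 */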
void
aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to)
{
	__m128i tot;
	__m128i tout[8];
	struct blocks8 *top;
	const struct blocks8 *blks;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_enc8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], tout);
		top->blk[0] = tout[0];
		top->blk[1] = tout[1];
		top->blk[2] = tout[2];
		top->blk[3] = tout[3];
		top->blk[4] = tout[4];
		top->blk[5] = tout[5];
		top->blk[6] = tout[6];
		top->blk[7] = tout[7];
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_enc(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

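/*
 * Same structure as aesni_encrypt_ecb(), but through the decryption
 * schedule via aesni_dec8()/aesni_dec().
 */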
void
aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN])
{
	__m128i tot;
	__m128i tout[8];
	const struct blocks8 *blks;
	struct blocks8 *top;
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1],
		    blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5],
		    blks->blk[6], blks->blk[7], tout);
		top->blk[0] = tout[0];
		top->blk[1] = tout[1];
		top->blk[2] = tout[2];
		top->blk[3] = tout[3];
		top->blk[4] = tout[4];
		top->blk[5] = tout[5];
		top->blk[6] = tout[6];
		top->blk[7] = tout[7];
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_dec(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

/*
 * Mixed-endian counter increment: the counter's low 64 bits are kept in
 * the high quadword so the layout matches _icm's BSWAP_EPI64 shuffle.
 * Add one to the high quadword and, if it wrapped to zero, carry into
 * the low quadword.
 */
static inline __m128i
nextc(__m128i x)
{
	const __m128i ONE = _mm_setr_epi32(0, 0, 1, 0);
	const __m128i ZERO = _mm_setzero_si128();

	x = _mm_add_epi64(x, ONE);
	/* t's high quadword is all-ones iff the add above wrapped to 0. */
	__m128i t = _mm_cmpeq_epi64(x, ZERO);
	t = _mm_unpackhi_epi64(t, ZERO);
	/* Subtracting all-ones (-1) adds the carry to the low quadword. */
	x = _mm_sub_epi64(x, t);

	return x;
}

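/*
 * ICM (integer counter mode, a.k.a. CTR): the keystream is the
 * encryption of successive counter values and the payload is simply
 * XORed with it.  Counter blocks are independent, so eight of them are
 * generated and encrypted per iteration of the main loop.  The counter
 * is kept in the mixed-endian layout expected by nextc() and shuffled
 * back with BSWAP_EPI64 before each encryption.
 */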
void
aesni_encrypt_icm(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
	__m128i tot;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i BSWAP_EPI64;
	__m128i tout[8];
	struct blocks8 *top;
	const struct blocks8 *blks;
	size_t i, cnt;

	BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);

	ctr1 = _mm_loadu_si128((const __m128i *)iv);
	ctr1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr2 = nextc(ctr1);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		ctr3 = nextc(ctr2);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		ctr4 = nextc(ctr3);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		ctr5 = nextc(ctr4);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		ctr6 = nextc(ctr5);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		ctr7 = nextc(ctr6);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		ctr8 = nextc(ctr7);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
		ctr1 = nextc(ctr8);

		blks = (const struct blocks8 *)from;
		top = (struct blocks8 *)to;
		aesni_enc8(rounds - 1, key_schedule, tmp1, tmp2, tmp3, tmp4,
		    tmp5, tmp6, tmp7, tmp8, tout);

		top->blk[0] = blks->blk[0] ^ tout[0];
		top->blk[1] = blks->blk[1] ^ tout[1];
		top->blk[2] = blks->blk[2] ^ tout[2];
		top->blk[3] = blks->blk[3] ^ tout[3];
		top->blk[4] = blks->blk[4] ^ tout[4];
		top->blk[5] = blks->blk[5] ^ tout[5];
		top->blk[6] = blks->blk[6] ^ tout[6];
		top->blk[7] = blks->blk[7] ^ tout[7];

		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = nextc(ctr1);

		tot = aesni_enc(rounds - 1, key_schedule, tmp1);

		tot = tot ^ _mm_loadu_si128((const __m128i *)from);
		_mm_storeu_si128((__m128i *)to, tot);

		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}

	/* Handle a trailing partial block. */
	if (len % AES_BLOCK_LEN != 0) {
		uint8_t block[AES_BLOCK_LEN];

		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tot = aesni_enc(rounds - 1, key_schedule, tmp1);
		/*
		 * Stage the tail through a local buffer so we do not
		 * read past the end of the input.
		 */
		memcpy(block, from, len % AES_BLOCK_LEN);
		tot = tot ^ _mm_loadu_si128((const __m128i *)block);
		memcpy(to, &tot, len % AES_BLOCK_LEN);
	}
}

#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* low byte of the GF(2^128) reduction polynomial */

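/*
 * Advance the XTS tweak: multiply it by alpha (the polynomial x) in
 * GF(2^128), i.e. shift the 128-bit value left by one and, when a bit
 * carries out of the top, fold it back in by XORing AES_XTS_ALPHA into
 * the low byte.  SSE has no 128-bit shift, so this works on 32-bit
 * lanes: _mm_srai_epi32(..., 31) broadcasts each lane's top bit, and
 * the 0x93 shuffle rotates those carries up into the lane that needs
 * them, the top lane's carry wrapping around to lane 0 as the alpha
 * term.
 */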
static inline __m128i
xts_crank_lfsr(__m128i inp)
{
	const __m128i alphamask = _mm_set_epi32(1, 1, 1, AES_XTS_ALPHA);
	__m128i xtweak, ret;

	/* Rotate each lane's carry-out bit into the lane above it. */
	xtweak = _mm_shuffle_epi32(inp, 0x93);
	xtweak = _mm_srai_epi32(xtweak, 31);
	xtweak &= alphamask;

	/* Next term: shift left by one and fold the carries back in. */
	ret = _mm_slli_epi32(inp, 1);
	ret ^= xtweak;

	return ret;
}

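/*
 * One XTS block: C = E_k1(P ^ T) ^ T for encryption (D_k1 for
 * decryption), after which the tweak T is advanced for the next block.
 */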
static void
aesni_crypt_xts_block(int rounds, const __m128i *key_schedule, __m128i *tweak,
    const uint8_t *from, uint8_t *to, int do_encrypt)
{
	__m128i block;

	block = _mm_loadu_si128((const __m128i *)from) ^ *tweak;

	if (do_encrypt)
		block = aesni_enc(rounds - 1, key_schedule, block);
	else
		block = aesni_dec(rounds - 1, key_schedule, block);

	_mm_storeu_si128((__m128i *)to, block ^ *tweak);

	*tweak = xts_crank_lfsr(*tweak);
}

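/*
 * Eight-block XTS variant: compute all eight tweaks up front, then run
 * the eight AES operations in parallel.
 */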
static void
aesni_crypt_xts_block8(int rounds, const __m128i *key_schedule, __m128i *tweak,
    const uint8_t *from, uint8_t *to, int do_encrypt)
{
	__m128i tmptweak;
	__m128i a, b, c, d, e, f, g, h;
	__m128i tweaks[8];
	__m128i tmp[8];
	__m128i *top;
	const __m128i *fromp;

	tmptweak = *tweak;

	/*
	 * Unroll the loop.  This lets the compiler keep the values in
	 * registers and saves memory accesses.
	 */
	fromp = (const __m128i *)from;
#define	PREPINP(v, pos)						\
		do {						\
			tweaks[(pos)] = tmptweak;		\
			(v) = _mm_loadu_si128(&fromp[pos]) ^	\
			    tmptweak;				\
			tmptweak = xts_crank_lfsr(tmptweak);	\
		} while (0)
	PREPINP(a, 0);
	PREPINP(b, 1);
	PREPINP(c, 2);
	PREPINP(d, 3);
	PREPINP(e, 4);
	PREPINP(f, 5);
	PREPINP(g, 6);
	PREPINP(h, 7);
	*tweak = tmptweak;

	if (do_encrypt)
		aesni_enc8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
		    tmp);
	else
		aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h,
		    tmp);

	top = (__m128i *)to;
	_mm_storeu_si128(&top[0], tmp[0] ^ tweaks[0]);
	_mm_storeu_si128(&top[1], tmp[1] ^ tweaks[1]);
	_mm_storeu_si128(&top[2], tmp[2] ^ tweaks[2]);
	_mm_storeu_si128(&top[3], tmp[3] ^ tweaks[3]);
	_mm_storeu_si128(&top[4], tmp[4] ^ tweaks[4]);
	_mm_storeu_si128(&top[5], tmp[5] ^ tweaks[5]);
	_mm_storeu_si128(&top[6], tmp[6] ^ tweaks[6]);
	_mm_storeu_si128(&top[7], tmp[7] ^ tweaks[7]);
}

static void
aesni_crypt_xts(int rounds, const __m128i *data_schedule,
    const __m128i *tweak_schedule, size_t len, const uint8_t *from,
    uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
	__m128i tweakreg;
	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
	size_t i, cnt;

	/*
	 * Prepare the tweak as E_k2(IV).  The IV is the little-endian
	 * representation of a 64-bit block number and may be passed in
	 * directly.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	bcopy(iv, tweak, AES_XTS_IVSIZE);
	/* The last 64 bits of the IV are always zero. */
	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = _mm_loadu_si128((__m128i *)&tweak[0]);
	tweakreg = aesni_enc(rounds - 1, tweak_schedule, tweakreg);

	cnt = len / AES_XTS_BLOCKSIZE / 8;
	for (i = 0; i < cnt; i++) {
		aesni_crypt_xts_block8(rounds, data_schedule, &tweakreg,
		    from, to, do_encrypt);
		from += AES_XTS_BLOCKSIZE * 8;
		to += AES_XTS_BLOCKSIZE * 8;
	}
	i *= 8;
	cnt = len / AES_XTS_BLOCKSIZE;
	for (; i < cnt; i++) {
		aesni_crypt_xts_block(rounds, data_schedule, &tweakreg,
		    from, to, do_encrypt);
		from += AES_XTS_BLOCKSIZE;
		to += AES_XTS_BLOCKSIZE;
	}
}

void
aesni_encrypt_xts(int rounds, const void *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
	    iv, 1);
}

void
aesni_decrypt_xts(int rounds, const void *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
	    iv, 0);
}

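/*
 * Expand the user key into the session's round-key schedules.  keylen
 * is given in bits; for XTS it covers both halves of the key, so 256
 * and 512 select AES-128 and AES-256 respectively.
 */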
int
aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
    int keylen)
{
	int decsched;

	decsched = 1;

	switch (ses->algo) {
	case CRYPTO_AES_ICM:
	case CRYPTO_AES_NIST_GCM_16:
	case CRYPTO_AES_CCM_16:
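		/*
		 * Counter-based modes run AES only in the forward
		 * direction, so no decryption schedule is needed.
		 */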
		decsched = 0;
		/* FALLTHROUGH */
	case CRYPTO_AES_CBC:
		switch (keylen) {
		case 128:
			ses->rounds = AES128_ROUNDS;
			break;
		case 192:
			ses->rounds = AES192_ROUNDS;
			break;
		case 256:
			ses->rounds = AES256_ROUNDS;
			break;
		default:
			CRYPTDEB("invalid CBC/ICM/GCM/CCM key length");
			return (EINVAL);
		}
		break;
	case CRYPTO_AES_XTS:
		switch (keylen) {
		case 256:
			ses->rounds = AES128_ROUNDS;
			break;
		case 512:
			ses->rounds = AES256_ROUNDS;
			break;
		default:
			CRYPTDEB("invalid XTS key length");
			return (EINVAL);
		}
		break;
	default:
		return (EINVAL);
	}

	aesni_set_enckey(key, ses->enc_schedule, ses->rounds);
	if (decsched)
		aesni_set_deckey(ses->enc_schedule, ses->dec_schedule,
		    ses->rounds);

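	/*
	 * The second half of the XTS key is the tweak key; keylen is in
	 * bits, so it starts keylen / 16 bytes into the key material.
	 */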
	if (ses->algo == CRYPTO_AES_XTS)
		aesni_set_enckey(key + keylen / 16, ses->xts_schedule,
		    ses->rounds);

	return (0);
}