/*	$NetBSD: aes_via.c,v 1.5 2020/07/25 22:31:32 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_via.c,v 1.5 2020/07/25 22:31:32 riastradh Exp $");

#ifdef _KERNEL
#include <sys/types.h>
#include <sys/evcnt.h>
#include <sys/systm.h>
#else
#include <assert.h>
#include <err.h>
#include <stdint.h>
#include <string.h>
#define	KASSERT			assert
#define	panic(fmt, args...)	err(1, fmt, args)
struct evcnt { uint64_t ev_count; };
#define	EVCNT_INITIALIZER(a,b,c,d) {0}
#define	EVCNT_ATTACH_STATIC(name)	static char name##_attach __unused = 0
#endif

#include <crypto/aes/aes.h>
#include <crypto/aes/aes_bear.h>
#include <crypto/aes/aes_impl.h>

#ifdef _KERNEL
#include <x86/cpufunc.h>
#include <x86/cpuvar.h>
#include <x86/fpu.h>
#include <x86/specialreg.h>
#include <x86/via_padlock.h>
#else
#include <cpuid.h>
#define	fpu_kern_enter()	((void)0)
#define	fpu_kern_leave()	((void)0)
#define C3_CRYPT_CWLO_ROUND_M		0x0000000f
#define C3_CRYPT_CWLO_ALG_M		0x00000070
#define C3_CRYPT_CWLO_ALG_AES		0x00000000
#define C3_CRYPT_CWLO_KEYGEN_M		0x00000080
#define C3_CRYPT_CWLO_KEYGEN_HW		0x00000000
#define C3_CRYPT_CWLO_KEYGEN_SW		0x00000080
#define C3_CRYPT_CWLO_NORMAL		0x00000000
#define C3_CRYPT_CWLO_INTERMEDIATE	0x00000100
#define C3_CRYPT_CWLO_ENCRYPT		0x00000000
#define C3_CRYPT_CWLO_DECRYPT		0x00000200
#define C3_CRYPT_CWLO_KEY128		0x0000000a      /* 128bit, 10 rds */
#define C3_CRYPT_CWLO_KEY192		0x0000040c      /* 192bit, 12 rds */
#define C3_CRYPT_CWLO_KEY256		0x0000080e      /* 256bit, 14 rds */
#endif

static void
aesvia_reload_keys(void)
{

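	/*
	 * The xcrypt instructions cache the loaded key schedule.  Any
	 * write to EFLAGS -- here a pushf/popf pair -- invalidates that
	 * cached state, so the next xcrypt reloads the key material
	 * pointed to by %ebx.
	 */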
	asm volatile("pushf; popf");
}

static uint32_t
aesvia_keylen_cw0(unsigned nrounds)
{

	/*
	 * Determine the control word bits for the key size / number of
	 * rounds.  For AES-128, the hardware can do key expansion on
	 * the fly; for AES-192 and AES-256, software must do it.
	 */
	switch (nrounds) {
	case AES_128_NROUNDS:
		return C3_CRYPT_CWLO_KEY128;
	case AES_192_NROUNDS:
		return C3_CRYPT_CWLO_KEY192 | C3_CRYPT_CWLO_KEYGEN_SW;
	case AES_256_NROUNDS:
		return C3_CRYPT_CWLO_KEY256 | C3_CRYPT_CWLO_KEYGEN_SW;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}
}

static void
aesvia_setenckey(struct aesenc *enc, const uint8_t *key, uint32_t nrounds)
{
	size_t key_len;

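	/*
	 * For AES-128 the PadLock engine expands the key itself
	 * (C3_CRYPT_CWLO_KEYGEN_HW), so we store only the raw key
	 * words; for AES-192/-256 we precompute the schedule in
	 * software with BearSSL's constant-time key schedule.
	 */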
	switch (nrounds) {
	case AES_128_NROUNDS:
		enc->aese_aes.aes_rk[0] = le32dec(key + 4*0);
		enc->aese_aes.aes_rk[1] = le32dec(key + 4*1);
		enc->aese_aes.aes_rk[2] = le32dec(key + 4*2);
		enc->aese_aes.aes_rk[3] = le32dec(key + 4*3);
		return;
	case AES_192_NROUNDS:
		key_len = 24;
		break;
	case AES_256_NROUNDS:
		key_len = 32;
		break;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}
	br_aes_ct_keysched_stdenc(enc->aese_aes.aes_rk, key, key_len);
}

static void
aesvia_setdeckey(struct aesdec *dec, const uint8_t *key, uint32_t nrounds)
{
	size_t key_len;

	switch (nrounds) {
	case AES_128_NROUNDS:
		dec->aesd_aes.aes_rk[0] = le32dec(key + 4*0);
		dec->aesd_aes.aes_rk[1] = le32dec(key + 4*1);
		dec->aesd_aes.aes_rk[2] = le32dec(key + 4*2);
		dec->aesd_aes.aes_rk[3] = le32dec(key + 4*3);
		return;
	case AES_192_NROUNDS:
		key_len = 24;
		break;
	case AES_256_NROUNDS:
		key_len = 32;
		break;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}
	br_aes_ct_keysched_stddec(dec->aesd_aes.aes_rk, key, key_len);
}

static inline void
aesvia_encN(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks, uint32_t cw0)
{
	const uint32_t cw[4] __aligned(16) = {
		[0] = (cw0
		    | C3_CRYPT_CWLO_ALG_AES
		    | C3_CRYPT_CWLO_ENCRYPT
		    | C3_CRYPT_CWLO_NORMAL),
	};

	KASSERT(((uintptr_t)enc & 0xf) == 0);
	KASSERT(((uintptr_t)in & 0xf) == 0);
	KASSERT(((uintptr_t)out & 0xf) == 0);

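	/*
	 * REP XCRYPTECB register convention: %ecx = block count, %esi =
	 * source, %edi = destination, %ebx = round keys, %edx = control
	 * word; all pointers must be 16-byte aligned.  The instruction
	 * counts %ecx down and advances %esi/%edi, hence the `+'
	 * constraints.
	 */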
	asm volatile("rep xcryptecb"
	    : "+c"(nblocks), "+S"(in), "+D"(out)
	    : "b"(enc), "d"(cw)
	    : "memory", "cc");
}

static inline void
aesvia_decN(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks, uint32_t cw0)
{
	const uint32_t cw[4] __aligned(16) = {
		[0] = (cw0
		    | C3_CRYPT_CWLO_ALG_AES
		    | C3_CRYPT_CWLO_DECRYPT
		    | C3_CRYPT_CWLO_NORMAL),
	};

	KASSERT(((uintptr_t)dec & 0xf) == 0);
	KASSERT(((uintptr_t)in & 0xf) == 0);
	KASSERT(((uintptr_t)out & 0xf) == 0);

	asm volatile("rep xcryptecb"
	    : "+c"(nblocks), "+S"(in), "+D"(out)
	    : "b"(dec), "d"(cw)
	    : "memory", "cc");
}

static struct evcnt enc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "enc aligned");
EVCNT_ATTACH_STATIC(enc_aligned_evcnt);
static struct evcnt enc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "enc unaligned");
EVCNT_ATTACH_STATIC(enc_unaligned_evcnt);

static void
aesvia_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0 &&
	    ((uintptr_t)in & 0xff0) != 0xff0) {
		enc_aligned_evcnt.ev_count++;
		aesvia_encN(enc, in, out, 1, cw0);
	} else {
		enc_unaligned_evcnt.ev_count++;
		/*
		 * VIA requires 16-byte/128-bit alignment, and
		 * xcrypt-ecb reads one block past the one we're
		 * working on -- which may go past the end of the page
		 * into unmapped territory.  Use a bounce buffer if
		 * either constraint is violated.
		 */
		uint8_t inbuf[16] __aligned(16);
		uint8_t outbuf[16] __aligned(16);

		memcpy(inbuf, in, 16);
		aesvia_encN(enc, inbuf, outbuf, 1, cw0);
		memcpy(out, outbuf, 16);

		explicit_memset(inbuf, 0, sizeof inbuf);
		explicit_memset(outbuf, 0, sizeof outbuf);
	}
	fpu_kern_leave();
}

static struct evcnt dec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "dec aligned");
EVCNT_ATTACH_STATIC(dec_aligned_evcnt);
static struct evcnt dec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "dec unaligned");
EVCNT_ATTACH_STATIC(dec_unaligned_evcnt);

static void
aesvia_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0 &&
	    ((uintptr_t)in & 0xff0) != 0xff0) {
		dec_aligned_evcnt.ev_count++;
		aesvia_decN(dec, in, out, 1, cw0);
	} else {
		dec_unaligned_evcnt.ev_count++;
		/*
		 * VIA requires 16-byte/128-bit alignment, and
		 * xcrypt-ecb reads one block past the one we're
		 * working on -- which may go past the end of the page
		 * into unmapped territory.  Use a bounce buffer if
		 * either constraint is violated.
		 */
		uint8_t inbuf[16] __aligned(16);
		uint8_t outbuf[16] __aligned(16);

		memcpy(inbuf, in, 16);
		aesvia_decN(dec, inbuf, outbuf, 1, cw0);
		memcpy(out, outbuf, 16);

		explicit_memset(inbuf, 0, sizeof inbuf);
		explicit_memset(outbuf, 0, sizeof outbuf);
	}
	fpu_kern_leave();
}

static inline void
aesvia_cbc_encN(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks, uint8_t **ivp, uint32_t cw0)
{
	const uint32_t cw[4] __aligned(16) = {
		[0] = (cw0
		    | C3_CRYPT_CWLO_ALG_AES
		    | C3_CRYPT_CWLO_ENCRYPT
		    | C3_CRYPT_CWLO_NORMAL),
	};

	KASSERT(((uintptr_t)enc & 0xf) == 0);
	KASSERT(((uintptr_t)in & 0xf) == 0);
	KASSERT(((uintptr_t)out & 0xf) == 0);
	KASSERT(((uintptr_t)*ivp & 0xf) == 0);

	/*
	 * Register effects:
	 * - Counts nblocks down to zero.
	 * - Advances in by nblocks (units of blocks).
	 * - Advances out by nblocks (units of blocks).
	 * - Updates *ivp to point at the last block of out.
	 */
	asm volatile("rep xcryptcbc"
	    : "+c"(nblocks), "+S"(in), "+D"(out), "+a"(*ivp)
	    : "b"(enc), "d"(cw)
	    : "memory", "cc");
}

static inline void
aesvia_cbc_decN(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nblocks, uint8_t iv[static 16],
    uint32_t cw0)
{
	const uint32_t cw[4] __aligned(16) = {
		[0] = (cw0
		    | C3_CRYPT_CWLO_ALG_AES
		    | C3_CRYPT_CWLO_DECRYPT
		    | C3_CRYPT_CWLO_NORMAL),
	};

	KASSERT(((uintptr_t)dec & 0xf) == 0);
	KASSERT(((uintptr_t)in & 0xf) == 0);
	KASSERT(((uintptr_t)out & 0xf) == 0);
	KASSERT(((uintptr_t)iv & 0xf) == 0);

	/*
	 * Register effects:
	 * - Counts nblocks down to zero.
	 * - Advances in by nblocks (units of blocks).
	 * - Advances out by nblocks (units of blocks).
	 * Memory side effects:
	 * - Writes what was the last block of in at the address iv.
	 */
	asm volatile("rep xcryptcbc"
	    : "+c"(nblocks), "+S"(in), "+D"(out)
	    : "a"(iv), "b"(dec), "d"(cw)
	    : "memory", "cc");
}

static inline void
xor128(void *x, const void *a, const void *b)
{
	uint32_t *x32 = x;
	const uint32_t *a32 = a;
	const uint32_t *b32 = b;

	x32[0] = a32[0] ^ b32[0];
	x32[1] = a32[1] ^ b32[1];
	x32[2] = a32[2] ^ b32[2];
	x32[3] = a32[3] ^ b32[3];
}

static struct evcnt cbcenc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcenc aligned");
EVCNT_ATTACH_STATIC(cbcenc_aligned_evcnt);
static struct evcnt cbcenc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcenc unaligned");
EVCNT_ATTACH_STATIC(cbcenc_unaligned_evcnt);

static void
aesvia_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);

	KASSERT(nbytes % 16 == 0);
	if (nbytes == 0)
		return;

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out | (uintptr_t)iv) & 0xf) == 0) {
		cbcenc_aligned_evcnt.ev_count++;
		uint8_t *ivp = iv;
		aesvia_cbc_encN(enc, in, out, nbytes/16, &ivp, cw0);
		memcpy(iv, ivp, 16);
	} else {
		cbcenc_unaligned_evcnt.ev_count++;
		uint8_t cv[16] __aligned(16);
		uint8_t tmp[16] __aligned(16);

		memcpy(cv, iv, 16);
		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
			memcpy(tmp, in, 16);
			xor128(tmp, tmp, cv);
			aesvia_encN(enc, tmp, cv, 1, cw0);
			memcpy(out, cv, 16);
		}
		memcpy(iv, cv, 16);
	}
	fpu_kern_leave();
}

static struct evcnt cbcdec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcdec aligned");
EVCNT_ATTACH_STATIC(cbcdec_aligned_evcnt);
static struct evcnt cbcdec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcdec unaligned");
EVCNT_ATTACH_STATIC(cbcdec_unaligned_evcnt);

static void
aesvia_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);

	KASSERT(nbytes % 16 == 0);
	if (nbytes == 0)
		return;

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out | (uintptr_t)iv) & 0xf) == 0) {
		cbcdec_aligned_evcnt.ev_count++;
		aesvia_cbc_decN(dec, in, out, nbytes/16, iv, cw0);
	} else {
		cbcdec_unaligned_evcnt.ev_count++;
		uint8_t iv0[16] __aligned(16);
		uint8_t cv[16] __aligned(16);
		uint8_t tmp[16] __aligned(16);

		memcpy(iv0, iv, 16);
		memcpy(cv, in + nbytes - 16, 16);
		memcpy(iv, cv, 16);

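		/*
		 * Walk backwards from the last block: each ciphertext
		 * block is copied into cv before the corresponding
		 * plaintext is stored, so this loop also works in
		 * place (out == in).
		 */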
		for (;;) {
			aesvia_decN(dec, cv, tmp, 1, cw0);
			if ((nbytes -= 16) == 0)
				break;
			memcpy(cv, in + nbytes - 16, 16);
			xor128(tmp, tmp, cv);
			memcpy(out + nbytes, tmp, 16);
		}

		xor128(tmp, tmp, iv0);
		memcpy(out, tmp, 16);
		explicit_memset(tmp, 0, sizeof tmp);
	}
	fpu_kern_leave();
}

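/*
 * aesvia_xts_update(t0, t1, t2, t3)
 *
 *	Multiply the 128-bit XTS tweak by x in GF(2^128) modulo
 *	x^128 + x^7 + x^2 + x + 1.  The tweak is held as four 32-bit
 *	little-endian words, t0 least significant; a carry out of the
 *	top bit of t3 folds back in as the constant 0x87.
 */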
static inline void
aesvia_xts_update(uint32_t *t0, uint32_t *t1, uint32_t *t2, uint32_t *t3)
{
	uint32_t s0, s1, s2, s3;

	s0 = *t0 >> 31;
	s1 = *t1 >> 31;
	s2 = *t2 >> 31;
	s3 = *t3 >> 31;
	*t0 = (*t0 << 1) ^ (-s3 & 0x87);
	*t1 = (*t1 << 1) ^ s0;
	*t2 = (*t2 << 1) ^ s1;
	*t3 = (*t3 << 1) ^ s2;
}

static int
aesvia_xts_update_selftest(void)
{
	static const struct {
		uint32_t in[4], out[4];
	} cases[] = {
		{ {1}, {2} },
		{ {0x80000000U,0,0,0}, {0,1,0,0} },
		{ {0,0x80000000U,0,0}, {0,0,1,0} },
		{ {0,0,0x80000000U,0}, {0,0,0,1} },
		{ {0,0,0,0x80000000U}, {0x87,0,0,0} },
		{ {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
	};
	unsigned i;
	uint32_t t0, t1, t2, t3;

	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
		t0 = cases[i].in[0];
		t1 = cases[i].in[1];
		t2 = cases[i].in[2];
		t3 = cases[i].in[3];
		aesvia_xts_update(&t0, &t1, &t2, &t3);
		if (t0 != cases[i].out[0] ||
		    t1 != cases[i].out[1] ||
		    t2 != cases[i].out[2] ||
		    t3 != cases[i].out[3])
			return -1;
	}

	/* Success!  */
	return 0;
}

static struct evcnt xtsenc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "xtsenc aligned");
EVCNT_ATTACH_STATIC(xtsenc_aligned_evcnt);
static struct evcnt xtsenc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "xtsenc unaligned");
EVCNT_ATTACH_STATIC(xtsenc_unaligned_evcnt);

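/*
 * XTS: C[i] = E_k(P[i] ^ T[i]) ^ T[i], where T[i+1] = T[i] * x in
 * GF(2^128).  The caller passes the initial tweak for this data unit
 * (already encrypted with the tweak key) and it is updated in place
 * so a subsequent call can continue where this one left off.
 */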
static void
aesvia_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint32_t t[4];

	KASSERT(nbytes % 16 == 0);

	memcpy(t, tweak, 16);

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0) {
		xtsenc_aligned_evcnt.ev_count++;
		unsigned lastblock = 0;
		uint32_t buf[8*4] __aligned(16);

		/*
		 * Make sure the last block is not the last block of a
		 * page.  (Note that we store the AES input in `out' as
		 * a temporary buffer, rather than reading it directly
		 * from `in', since we have to combine the tweak
		 * first.)
		 */
		lastblock = 16*(((uintptr_t)(out + nbytes) & 0xfff) == 0);
		nbytes -= lastblock;

		/*
		 * Handle an odd number of initial blocks so we can
		 * process the rest in eight-block (128-byte) chunks.
		 */
		if (nbytes % 128) {
			unsigned nbytes128 = nbytes % 128;

			nbytes -= nbytes128;
			for (; nbytes128; nbytes128 -= 16, in += 16, out += 16)
			{
				xor128(out, in, t);
				aesvia_encN(enc, out, out, 1, cw0);
				xor128(out, out, t);
				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
			}
		}

		/* Process eight blocks at a time.  */
		for (; nbytes; nbytes -= 128, in += 128, out += 128) {
			unsigned i;
			for (i = 0; i < 8; i++) {
				memcpy(buf + 4*i, t, 16);
				xor128(out + 16*i, in + 16*i, t);
				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
			}
			aesvia_encN(enc, out, out, 8, cw0);
			for (i = 0; i < 8; i++)
				xor128(out + 16*i, out + 16*i, buf + 4*i);
		}

		/* Handle the last block of a page, if necessary.  */
		if (lastblock) {
			xor128(buf, in, t);
			aesvia_encN(enc, (const void *)buf, out, 1, cw0);
			xor128(out, out, t);
			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
		}

		explicit_memset(buf, 0, sizeof buf);
	} else {
		xtsenc_unaligned_evcnt.ev_count++;
		uint8_t buf[16] __aligned(16);

		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
			memcpy(buf, in, 16);
			xor128(buf, buf, t);
			aesvia_encN(enc, buf, buf, 1, cw0);
			xor128(buf, buf, t);
			memcpy(out, buf, 16);
			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
		}

		explicit_memset(buf, 0, sizeof buf);
	}
	fpu_kern_leave();

	memcpy(tweak, t, 16);
	explicit_memset(t, 0, sizeof t);
}

static struct evcnt xtsdec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "xtsdec aligned");
EVCNT_ATTACH_STATIC(xtsdec_aligned_evcnt);
static struct evcnt xtsdec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "xtsdec unaligned");
EVCNT_ATTACH_STATIC(xtsdec_unaligned_evcnt);

static void
aesvia_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint32_t t[4];

	KASSERT(nbytes % 16 == 0);

	memcpy(t, tweak, 16);

	fpu_kern_enter();
	aesvia_reload_keys();
	if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0) {
		xtsdec_aligned_evcnt.ev_count++;
		unsigned lastblock = 0;
		uint32_t buf[8*4] __aligned(16);

		/*
		 * Make sure the last block is not the last block of a
		 * page.  (Note that we store the AES input in `out' as
		 * a temporary buffer, rather than reading it directly
		 * from `in', since we have to combine the tweak
		 * first.)
		 */
		lastblock = 16*(((uintptr_t)(out + nbytes) & 0xfff) == 0);
		nbytes -= lastblock;

		/*
		 * Handle an odd number of initial blocks so we can
		 * process the rest in eight-block (128-byte) chunks.
		 */
		if (nbytes % 128) {
			unsigned nbytes128 = nbytes % 128;

			nbytes -= nbytes128;
			for (; nbytes128; nbytes128 -= 16, in += 16, out += 16)
			{
				xor128(out, in, t);
				aesvia_decN(dec, out, out, 1, cw0);
				xor128(out, out, t);
				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
			}
		}

		/* Process eight blocks at a time.  */
		for (; nbytes; nbytes -= 128, in += 128, out += 128) {
			unsigned i;
			for (i = 0; i < 8; i++) {
				memcpy(buf + 4*i, t, 16);
				xor128(out + 16*i, in + 16*i, t);
				aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
			}
			aesvia_decN(dec, out, out, 8, cw0);
			for (i = 0; i < 8; i++)
				xor128(out + 16*i, out + 16*i, buf + 4*i);
		}

		/* Handle the last block of a page, if necessary.  */
		if (lastblock) {
			xor128(buf, in, t);
			aesvia_decN(dec, (const void *)buf, out, 1, cw0);
			xor128(out, out, t);
			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
		}

		explicit_memset(buf, 0, sizeof buf);
	} else {
		xtsdec_unaligned_evcnt.ev_count++;
		uint8_t buf[16] __aligned(16);

		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
			memcpy(buf, in, 16);
			xor128(buf, buf, t);
			aesvia_decN(dec, buf, buf, 1, cw0);
			xor128(buf, buf, t);
			memcpy(out, buf, 16);
			aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
		}

		explicit_memset(buf, 0, sizeof buf);
	}
	fpu_kern_leave();

	memcpy(tweak, t, 16);
	explicit_memset(t, 0, sizeof t);
}

static struct evcnt cbcmac_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcmac aligned");
EVCNT_ATTACH_STATIC(cbcmac_aligned_evcnt);
static struct evcnt cbcmac_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "cbcmac unaligned");
EVCNT_ATTACH_STATIC(cbcmac_unaligned_evcnt);

static void
aesvia_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
    size_t nbytes, uint8_t auth0[static 16], uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint8_t authbuf[16] __aligned(16);
	uint8_t *auth = auth0;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	if ((uintptr_t)auth0 & 0xf) {
		memcpy(authbuf, auth0, 16);
		auth = authbuf;
		cbcmac_unaligned_evcnt.ev_count++;
	} else {
		cbcmac_aligned_evcnt.ev_count++;
	}

	fpu_kern_enter();
	aesvia_reload_keys();
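	/* CBC-MAC: auth <- E_k(auth ^ M[i]) for each 16-byte block M[i].  */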
	for (; nbytes; nbytes -= 16, in += 16) {
		xor128(auth, auth, in);
		aesvia_encN(enc, auth, auth, 1, cw0);
	}
	fpu_kern_leave();

	if ((uintptr_t)auth0 & 0xf) {
		memcpy(auth0, authbuf, 16);
		explicit_memset(authbuf, 0, sizeof authbuf);
	}
}

static struct evcnt ccmenc_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "ccmenc aligned");
EVCNT_ATTACH_STATIC(ccmenc_aligned_evcnt);
static struct evcnt ccmenc_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "ccmenc unaligned");
EVCNT_ATTACH_STATIC(ccmenc_unaligned_evcnt);

static void
aesvia_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr0[static 32],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint8_t authctrbuf[32] __aligned(16);
	uint8_t *authctr;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	if ((uintptr_t)authctr0 & 0xf) {
		memcpy(authctrbuf, authctr0, 16);
		authctr = authctrbuf;
		ccmenc_unaligned_evcnt.ev_count++;
	} else {
		authctr = authctr0;
		ccmenc_aligned_evcnt.ev_count++;
	}
	c0 = le32dec(authctr0 + 16 + 4*0);
	c1 = le32dec(authctr0 + 16 + 4*1);
	c2 = le32dec(authctr0 + 16 + 4*2);
	c3 = be32dec(authctr0 + 16 + 4*3);

	/*
	 * In principle we could use REP XCRYPTCTR here, but that
	 * doesn't help to compute the CBC-MAC step, and certain VIA
	 * CPUs have some weird errata with REP XCRYPTCTR that make it
	 * kind of a pain to use.  So let's just use REP XCRYPTECB to
	 * simultaneously compute the CBC-MAC step and the CTR step.
	 * (Maybe some VIA CPUs will compute REP XCRYPTECB in parallel,
	 * who knows...)
	 */
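	/*
	 * Layout: authctr[0..15] is the running CBC-MAC state and
	 * authctr[16..31] is the CTR block, so one two-block REP
	 * XCRYPTECB advances the MAC and produces the next keystream
	 * block in a single go.
	 */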
	fpu_kern_enter();
	aesvia_reload_keys();
	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		xor128(authctr, authctr, in);
		le32enc(authctr + 16 + 4*0, c0);
		le32enc(authctr + 16 + 4*1, c1);
		le32enc(authctr + 16 + 4*2, c2);
		be32enc(authctr + 16 + 4*3, ++c3);
		aesvia_encN(enc, authctr, authctr, 2, cw0);
		xor128(out, in, authctr + 16);
	}
	fpu_kern_leave();

	if ((uintptr_t)authctr0 & 0xf) {
		memcpy(authctr0, authctrbuf, 16);
		explicit_memset(authctrbuf, 0, sizeof authctrbuf);
	}

	le32enc(authctr0 + 16 + 4*0, c0);
	le32enc(authctr0 + 16 + 4*1, c1);
	le32enc(authctr0 + 16 + 4*2, c2);
	be32enc(authctr0 + 16 + 4*3, c3);
}

static struct evcnt ccmdec_aligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "ccmdec aligned");
EVCNT_ATTACH_STATIC(ccmdec_aligned_evcnt);
static struct evcnt ccmdec_unaligned_evcnt = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "aesvia", "ccmdec unaligned");
EVCNT_ATTACH_STATIC(ccmdec_unaligned_evcnt);

static void
aesvia_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr0[static 32],
    uint32_t nrounds)
{
	const uint32_t cw0 = aesvia_keylen_cw0(nrounds);
	uint8_t authctrbuf[32] __aligned(16);
	uint8_t *authctr;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	c0 = le32dec(authctr0 + 16 + 4*0);
	c1 = le32dec(authctr0 + 16 + 4*1);
	c2 = le32dec(authctr0 + 16 + 4*2);
	c3 = be32dec(authctr0 + 16 + 4*3);

	if ((uintptr_t)authctr0 & 0xf) {
		memcpy(authctrbuf, authctr0, 16);
		authctr = authctrbuf;
		le32enc(authctr + 16 + 4*0, c0);
		le32enc(authctr + 16 + 4*1, c1);
		le32enc(authctr + 16 + 4*2, c2);
		ccmdec_unaligned_evcnt.ev_count++;
	} else {
		authctr = authctr0;
		ccmdec_aligned_evcnt.ev_count++;
	}

	fpu_kern_enter();
	aesvia_reload_keys();
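	/*
	 * For decryption the CBC-MAC must absorb the plaintext, so the
	 * first keystream block is generated up front; each iteration
	 * then decrypts a block, folds it into the MAC, and (if more
	 * data remains) encrypts the MAC state and the next counter
	 * block together.
	 */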
	be32enc(authctr + 16 + 4*3, ++c3);
	aesvia_encN(enc, authctr + 16, authctr + 16, 1, cw0);
	for (;; in += 16, out += 16) {
		xor128(out, authctr + 16, in);
		xor128(authctr, authctr, out);
		if ((nbytes -= 16) == 0)
			break;
		le32enc(authctr + 16 + 4*0, c0);
		le32enc(authctr + 16 + 4*1, c1);
		le32enc(authctr + 16 + 4*2, c2);
		be32enc(authctr + 16 + 4*3, ++c3);
		aesvia_encN(enc, authctr, authctr, 2, cw0);
	}
	aesvia_encN(enc, authctr, authctr, 1, cw0);
	fpu_kern_leave();

	if ((uintptr_t)authctr0 & 0xf) {
		memcpy(authctr0, authctrbuf, 16);
		explicit_memset(authctrbuf, 0, sizeof authctrbuf);
	}

	le32enc(authctr0 + 16 + 4*0, c0);
	le32enc(authctr0 + 16 + 4*1, c1);
	le32enc(authctr0 + 16 + 4*2, c2);
	be32enc(authctr0 + 16 + 4*3, c3);
}

static int
aesvia_probe(void)
{

	/* Verify that the CPU advertises VIA ACE support.  */
#ifdef _KERNEL
	if ((cpu_feature[4] & CPUID_VIA_HAS_ACE) == 0)
		return -1;
#else
	/*
	 * From the VIA PadLock Programming Guide:
	 * http://linux.via.com.tw/support/beginDownload.action?eleid=181&fid=261
	 */
	unsigned eax, ebx, ecx, edx;
	if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
		return -1;
	if (ebx != signature_CENTAUR_ebx ||
	    ecx != signature_CENTAUR_ecx ||
	    edx != signature_CENTAUR_edx)
		return -1;
	if (eax < 0xc0000000)
		return -1;
	if (!__get_cpuid(0xc0000000, &eax, &ebx, &ecx, &edx))
		return -1;
	if (eax < 0xc0000001)
		return -1;
	if (!__get_cpuid(0xc0000001, &eax, &ebx, &ecx, &edx))
		return -1;
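	/*
	 * Extended leaf 0xc0000001 %edx: bit 6 = ACE present, bit 7 =
	 * ACE enabled, bit 8 = ACE2 present, bit 9 = ACE2 enabled --
	 * hence the 0xc0 and 0x300 masks below.
	 */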
	/* Check whether ACE and ACE2 are both supported and enabled.  */
	if ((edx & 0x000000c0) != 0x000000c0 ||
	    (edx & 0x00000300) != 0x00000300)
		return -1;
#endif

	/* Verify that our XTS tweak update logic works.  */
	if (aesvia_xts_update_selftest())
		return -1;

	/* Success!  */
	return 0;
}

struct aes_impl aes_via_impl = {
	.ai_name = "VIA ACE",
	.ai_probe = aesvia_probe,
	.ai_setenckey = aesvia_setenckey,
	.ai_setdeckey = aesvia_setdeckey,
	.ai_enc = aesvia_enc,
	.ai_dec = aesvia_dec,
	.ai_cbc_enc = aesvia_cbc_enc,
	.ai_cbc_dec = aesvia_cbc_dec,
	.ai_xts_enc = aesvia_xts_enc,
	.ai_xts_dec = aesvia_xts_dec,
	.ai_cbcmac_update1 = aesvia_cbcmac_update1,
	.ai_ccm_enc1 = aesvia_ccm_enc1,
	.ai_ccm_dec1 = aesvia_ccm_dec1,
};