1/*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#define BR_ENABLE_INTRINSICS   1
26#include "inner.h"
27
28/*
29 * This code contains the AES key schedule implementation using the
30 * AES-NI opcodes.
31 */
32
33#if BR_AES_X86NI
34
/* see inner.h */
int
br_aes_x86ni_supported(void)
{
	/*
	 * CPU feature flags that must all be present; both are located
	 * in the ECX mask handed to br_cpuid():
	 *   bit 19   SSE4.1 (_mm_insert_epi32() is used by AES-CTR)
	 *   bit 25   AES-NI
	 */
	enum {
		ECX_SSE41 = 1 << 19,
		ECX_AESNI = 1 << 25
	};

	return br_cpuid(0, 0, ECX_SSE41 | ECX_AESNI, 0);
}
46
47BR_TARGETS_X86_UP
48
49BR_TARGET("sse2,aes")
50static inline __m128i
51expand_step128(__m128i k, __m128i k2)
52{
53	k = _mm_xor_si128(k, _mm_slli_si128(k, 4));
54	k = _mm_xor_si128(k, _mm_slli_si128(k, 4));
55	k = _mm_xor_si128(k, _mm_slli_si128(k, 4));
56	k2 = _mm_shuffle_epi32(k2, 0xFF);
57	return _mm_xor_si128(k, k2);
58}
59
60BR_TARGET("sse2,aes")
61static inline void
62expand_step192(__m128i *t1, __m128i *t2, __m128i *t3)
63{
64	__m128i t4;
65
66	*t2 = _mm_shuffle_epi32(*t2, 0x55);
67	t4 = _mm_slli_si128(*t1, 0x4);
68	*t1 = _mm_xor_si128(*t1, t4);
69	t4 = _mm_slli_si128(t4, 0x4);
70	*t1 = _mm_xor_si128(*t1, t4);
71	t4 = _mm_slli_si128(t4, 0x4);
72	*t1 = _mm_xor_si128(*t1, t4);
73	*t1 = _mm_xor_si128(*t1, *t2);
74	*t2 = _mm_shuffle_epi32(*t1, 0xFF);
75	t4 = _mm_slli_si128(*t3, 0x4);
76	*t3 = _mm_xor_si128(*t3, t4);
77	*t3 = _mm_xor_si128(*t3, *t2);
78}
79
80BR_TARGET("sse2,aes")
81static inline void
82expand_step256_1(__m128i *t1, __m128i *t2)
83{
84	__m128i t4;
85
86	*t2 = _mm_shuffle_epi32(*t2, 0xFF);
87	t4 = _mm_slli_si128(*t1, 0x4);
88	*t1 = _mm_xor_si128(*t1, t4);
89	t4 = _mm_slli_si128(t4, 0x4);
90	*t1 = _mm_xor_si128(*t1, t4);
91	t4 = _mm_slli_si128(t4, 0x4);
92	*t1 = _mm_xor_si128(*t1, t4);
93	*t1 = _mm_xor_si128(*t1, *t2);
94}
95
96BR_TARGET("sse2,aes")
97static inline void
98expand_step256_2(__m128i *t1, __m128i *t3)
99{
100	__m128i t2, t4;
101
102	t4 = _mm_aeskeygenassist_si128(*t1, 0x0);
103	t2 = _mm_shuffle_epi32(t4, 0xAA);
104	t4 = _mm_slli_si128(*t3, 0x4);
105	*t3 = _mm_xor_si128(*t3, t4);
106	t4 = _mm_slli_si128(t4, 0x4);
107	*t3 = _mm_xor_si128(*t3, t4);
108	t4 = _mm_slli_si128(t4, 0x4);
109	*t3 = _mm_xor_si128(*t3, t4);
110	*t3 = _mm_xor_si128(*t3, t2);
111}
112
113/*
114 * Perform key schedule for AES, encryption direction. Subkeys are written
115 * in sk[], and the number of rounds is returned. Key length MUST be 16,
116 * 24 or 32 bytes.
117 */
118BR_TARGET("sse2,aes")
119static unsigned
120x86ni_keysched(__m128i *sk, const void *key, size_t len)
121{
122	const unsigned char *kb;
123
124#define KEXP128(k, i, rcon)   do { \
125		k = expand_step128(k, _mm_aeskeygenassist_si128(k, rcon)); \
126		sk[i] = k; \
127	} while (0)
128
129#define KEXP192(i, rcon1, rcon2)   do { \
130		sk[(i) + 0] = t1; \
131		sk[(i) + 1] = t3; \
132		t2 = _mm_aeskeygenassist_si128(t3, rcon1); \
133		expand_step192(&t1, &t2, &t3); \
134		sk[(i) + 1] = _mm_castpd_si128(_mm_shuffle_pd( \
135			_mm_castsi128_pd(sk[(i) + 1]), \
136			_mm_castsi128_pd(t1), 0)); \
137		sk[(i) + 2] = _mm_castpd_si128(_mm_shuffle_pd( \
138			_mm_castsi128_pd(t1), \
139			_mm_castsi128_pd(t3), 1)); \
140		t2 = _mm_aeskeygenassist_si128(t3, rcon2); \
141		expand_step192(&t1, &t2, &t3); \
142	} while (0)
143
144#define KEXP256(i, rcon)   do { \
145		sk[(i) + 0] = t3; \
146		t2 = _mm_aeskeygenassist_si128(t3, rcon); \
147		expand_step256_1(&t1, &t2); \
148		sk[(i) + 1] = t1; \
149		expand_step256_2(&t1, &t3); \
150	} while (0)
151
152	kb = key;
153	switch (len) {
154		__m128i t1, t2, t3;
155
156	case 16:
157		t1 = _mm_loadu_si128((const void *)kb);
158		sk[0] = t1;
159		KEXP128(t1,  1, 0x01);
160		KEXP128(t1,  2, 0x02);
161		KEXP128(t1,  3, 0x04);
162		KEXP128(t1,  4, 0x08);
163		KEXP128(t1,  5, 0x10);
164		KEXP128(t1,  6, 0x20);
165		KEXP128(t1,  7, 0x40);
166		KEXP128(t1,  8, 0x80);
167		KEXP128(t1,  9, 0x1B);
168		KEXP128(t1, 10, 0x36);
169		return 10;
170
171	case 24:
172		t1 = _mm_loadu_si128((const void *)kb);
173		t3 = _mm_loadu_si128((const void *)(kb + 8));
174		t3 = _mm_shuffle_epi32(t3, 0x4E);
175		KEXP192(0, 0x01, 0x02);
176		KEXP192(3, 0x04, 0x08);
177		KEXP192(6, 0x10, 0x20);
178		KEXP192(9, 0x40, 0x80);
179		sk[12] = t1;
180		return 12;
181
182	case 32:
183		t1 = _mm_loadu_si128((const void *)kb);
184		t3 = _mm_loadu_si128((const void *)(kb + 16));
185		sk[0] = t1;
186		KEXP256( 1, 0x01);
187		KEXP256( 3, 0x02);
188		KEXP256( 5, 0x04);
189		KEXP256( 7, 0x08);
190		KEXP256( 9, 0x10);
191		KEXP256(11, 0x20);
192		sk[13] = t3;
193		t2 = _mm_aeskeygenassist_si128(t3, 0x40);
194		expand_step256_1(&t1, &t2);
195		sk[14] = t1;
196		return 14;
197
198	default:
199		return 0;
200	}
201
202#undef KEXP128
203#undef KEXP192
204#undef KEXP256
205}
206
207/* see inner.h */
208BR_TARGET("sse2,aes")
209unsigned
210br_aes_x86ni_keysched_enc(unsigned char *skni, const void *key, size_t len)
211{
212	__m128i sk[15];
213	unsigned num_rounds;
214
215	num_rounds = x86ni_keysched(sk, key, len);
216	memcpy(skni, sk, (num_rounds + 1) << 4);
217	return num_rounds;
218}
219
220/* see inner.h */
221BR_TARGET("sse2,aes")
222unsigned
223br_aes_x86ni_keysched_dec(unsigned char *skni, const void *key, size_t len)
224{
225	__m128i sk[15];
226	unsigned u, num_rounds;
227
228	num_rounds = x86ni_keysched(sk, key, len);
229	_mm_storeu_si128((void *)skni, sk[num_rounds]);
230	for (u = 1; u < num_rounds; u ++) {
231		_mm_storeu_si128((void *)(skni + (u << 4)),
232			_mm_aesimc_si128(sk[num_rounds - u]));
233	}
234	_mm_storeu_si128((void *)(skni + (num_rounds << 4)), sk[0]);
235	return num_rounds;
236}
237
238BR_TARGETS_X86_DOWN
239
240#endif
241