/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_POWER_ASM_MACROS   1
#include "inner.h"

/*
 * This code contains the AES key schedule implementation using the
 * POWER8 opcodes.
 */

#if BR_POWER8

static void
key_schedule_128(unsigned char *sk, const unsigned char *key)
{
	long cc;

	static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B };
#if BR_POWER8_LE
	static const uint32_t idx2be[] = {
		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
	};
#endif

	cc = 0;

	/*
	 * We use the VSX instructions for loading and storing the
	 * key/subkeys, since they support unaligned accesses. The rest
	 * of the computation is VMX only. VMX register 0 is VSX
	 * register 32.
	 */
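	/*
	 * Concretely, lxvw4x(34, 0, %[key]) below loads into VSX
	 * register 34, which is VMX register v2 (34 - 32), and
	 * stxvw4x(36, ...) stores VSX register 36, i.e. v4.
	 */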
	asm volatile (

		/*
		 * v0 = all-zero word
		 * v1 = constant -8 / +8, copied into four words
		 * v2 = current subkey
		 * v3 = Rcon (x4 words)
		 * v6 = constant 8, copied into four words
		 * v7 = constant 0x11B, copied into four words
		 * v8 = constant for byteswapping words
		 */
		vspltisw(0, 0)
#if BR_POWER8_LE
		vspltisw(1, -8)
#else
		vspltisw(1, 8)
#endif
		lxvw4x(34, 0, %[key])
		vspltisw(3, 1)
		vspltisw(6, 8)
		lxvw4x(39, 0, %[fmod])
#if BR_POWER8_LE
		lxvw4x(40, 0, %[idx2be])
#endif

		/*
		 * First subkey is a copy of the key itself.
		 */
#if BR_POWER8_LE
		vperm(4, 2, 2, 8)
		stxvw4x(36, 0, %[sk])
#else
		stxvw4x(34, 0, %[sk])
#endif

		/*
		 * Loop must run 10 times.
		 */
		li(%[cc], 10)
		mtctr(%[cc])
	label(loop)
		/* Increment subkey address */
		addi(%[sk], %[sk], 16)

		/* Compute SubWord(RotWord(temp)) xor Rcon  (into v4, splat) */
		vrlw(4, 2, 1)
		vsbox(4, 4)
#if BR_POWER8_LE
		vxor(4, 4, 3)
#else
		vsldoi(5, 3, 0, 3)
		vxor(4, 4, 5)
#endif
		vspltw(4, 4, 3)

		/* XOR words for next subkey */
		vsldoi(5, 0, 2, 12)
		vxor(2, 2, 5)
		vsldoi(5, 0, 2, 12)
		vxor(2, 2, 5)
		vsldoi(5, 0, 2, 12)
		vxor(2, 2, 5)
		vxor(2, 2, 4)

		/* Store next subkey */
#if BR_POWER8_LE
		vperm(4, 2, 2, 8)
		stxvw4x(36, 0, %[sk])
#else
		stxvw4x(34, 0, %[sk])
#endif

		/* Update Rcon */
		vadduwm(3, 3, 3)
		vsrw(4, 3, 6)
		vsubuwm(4, 0, 4)
		vand(4, 4, 7)
		vxor(3, 3, 4)

		bdnz(loop)

: [sk] "+b" (sk), [cc] "+b" (cc)
: [key] "b" (key), [fmod] "b" (fmod)
#if BR_POWER8_LE
	, [idx2be] "b" (idx2be)
#endif
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "ctr", "memory"
	);
}
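
/*
 * For reference, a plain C sketch of the AES-128 key expansion that
 * key_schedule_128() computes with vector opcodes, including the same
 * branchless doubling of Rcon modulo 0x11B. This is illustrative only
 * and kept out of the build; SubWord() is a hypothetical helper that
 * applies the AES S-box to each byte of a 32-bit word (the asm uses
 * the vsbox opcode instead), and the stored byte layout is meant to
 * match the vector code's output, up to the byteswapping it performs
 * on little-endian systems.
 */
#if 0
static void
key_schedule_128_ref(unsigned char *sk, const unsigned char *key)
{
	uint32_t w[44];   /* 11 subkeys, 4 words each */
	uint32_t rcon;
	int i;

	for (i = 0; i < 4; i ++) {
		w[i] = br_dec32be(key + (i << 2));
	}
	rcon = 1;
	for (i = 4; i < 44; i ++) {
		uint32_t t;

		t = w[i - 1];
		if ((i & 3) == 0) {
			/* RotWord, SubWord, then XOR with Rcon. */
			t = SubWord((t << 8) | (t >> 24)) ^ (rcon << 24);

			/*
			 * Double rcon in GF(2^8) modulo 0x11B, without a
			 * branch; this mirrors the vadduwm / vsrw /
			 * vsubuwm / vand / vxor sequence above.
			 */
			rcon <<= 1;
			rcon ^= (0U - (rcon >> 8)) & 0x11B;
		}
		w[i] = w[i - 4] ^ t;
	}
	for (i = 0; i < 44; i ++) {
		br_enc32be(sk + (i << 2), w[i]);
	}
}
#endif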

static void
key_schedule_192(unsigned char *sk, const unsigned char *key)
{
	long cc;

#if BR_POWER8_LE
	static const uint32_t idx2be[] = {
		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
	};
#endif

	cc = 0;

	/*
	 * We use the VSX instructions for loading and storing the
	 * key/subkeys, since they support unaligned accesses. The rest
	 * of the computation is VMX only. VMX register 0 is VSX
	 * register 32.
	 */
	asm volatile (

		/*
		 * v0 = all-zero word
		 * v1 = constant -8 / +8, copied into four words
		 * v2, v3 = current subkey
		 * v5 = Rcon (x4 words) (already shifted on big-endian)
		 * v6 = constant 8, copied into four words
		 * v8 = constant for byteswapping words
		 *
		 * The right two words of v3 are ignored.
		 */
		vspltisw(0, 0)
#if BR_POWER8_LE
		vspltisw(1, -8)
#else
		vspltisw(1, 8)
#endif
		li(%[cc], 8)
		lxvw4x(34, 0, %[key])
		lxvw4x(35, %[cc], %[key])
		vsldoi(3, 3, 0, 8)
		vspltisw(5, 1)
#if !BR_POWER8_LE
		vsldoi(5, 5, 0, 3)
#endif
		vspltisw(6, 8)
#if BR_POWER8_LE
		lxvw4x(40, 0, %[idx2be])
#endif

		/*
		 * Loop must run 8 times. Each iteration produces 256
		 * bits of subkeys, with a 64-bit overlap.
		 */
		li(%[cc], 8)
		mtctr(%[cc])
		li(%[cc], 16)
	label(loop)

		/*
		 * Last 6 words in v2:v3l. Compute next 6 words into
		 * v3r:v4.
		 */
		vrlw(10, 3, 1)
		vsbox(10, 10)
		vxor(10, 10, 5)
		vspltw(10, 10, 1)
		vsldoi(11, 0, 10, 8)

		vsldoi(12, 0, 2, 12)
		vxor(12, 2, 12)
		vsldoi(13, 0, 12, 12)
		vxor(12, 12, 13)
		vsldoi(13, 0, 12, 12)
		vxor(12, 12, 13)

		vspltw(13, 12, 3)
		vxor(13, 13, 3)
		vsldoi(14, 0, 3, 12)
		vxor(13, 13, 14)

		vsldoi(4, 12, 13, 8)
		vsldoi(14, 0, 3, 8)
		vsldoi(3, 14, 12, 8)

		vxor(3, 3, 11)
		vxor(4, 4, 10)

		/*
		 * Update Rcon. Since for a 192-bit key, we use only 8
		 * such constants, we will not hit the field modulus,
		 * so a simple shift (addition) works well.
		 */
		vadduwm(5, 5, 5)

		/*
		 * Write out the two left 128-bit words
		 */
#if BR_POWER8_LE
		vperm(10, 2, 2, 8)
		vperm(11, 3, 3, 8)
		stxvw4x(42, 0, %[sk])
		stxvw4x(43, %[cc], %[sk])
#else
		stxvw4x(34, 0, %[sk])
		stxvw4x(35, %[cc], %[sk])
#endif
		addi(%[sk], %[sk], 24)

		/*
		 * Shift words for next iteration.
		 */
		vsldoi(2, 3, 4, 8)
		vsldoi(3, 4, 0, 8)

		bdnz(loop)

		/*
		 * The loop wrote the first 50 subkey words, but we need
		 * to produce 52, so we must do one last write.
		 */
#if BR_POWER8_LE
		vperm(10, 2, 2, 8)
		stxvw4x(42, 0, %[sk])
#else
		stxvw4x(34, 0, %[sk])
#endif

: [sk] "+b" (sk), [cc] "+b" (cc)
: [key] "b" (key)
#if BR_POWER8_LE
	, [idx2be] "b" (idx2be)
#endif
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
	);
}
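
/*
 * Subkey count check for the 192-bit schedule above: the loop runs 8
 * times and advances sk by 24 bytes (6 words) per iteration, for 48
 * words of stride, while its two 16-byte stores cover 8 words from the
 * current position; hence the 64-bit overlap between consecutive
 * iterations, and the 50 words covered when the loop ends (the last
 * iteration stores words 42..49). The final 16-byte store then
 * completes the 4 * (12 + 1) = 52 words that AES-192 needs.
 */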

static void
key_schedule_256(unsigned char *sk, const unsigned char *key)
{
	long cc;

#if BR_POWER8_LE
	static const uint32_t idx2be[] = {
		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
	};
#endif

	cc = 0;

	/*
	 * We use the VSX instructions for loading and storing the
	 * key/subkeys, since they support unaligned accesses. The rest
	 * of the computation is VMX only. VMX register 0 is VSX
	 * register 32.
	 */
	asm volatile (

		/*
		 * v0 = all-zero word
		 * v1 = constant -8 / +8, copied into four words
		 * v2, v3 = current subkey
		 * v6 = Rcon (x4 words) (already shifted on big-endian)
		 * v7 = constant 8, copied into four words
		 * v8 = constant for byteswapping words
		 */
		vspltisw(0, 0)
#if BR_POWER8_LE
		vspltisw(1, -8)
#else
		vspltisw(1, 8)
#endif
		li(%[cc], 16)
		lxvw4x(34, 0, %[key])
		lxvw4x(35, %[cc], %[key])
		vspltisw(6, 1)
#if !BR_POWER8_LE
		vsldoi(6, 6, 0, 3)
#endif
		vspltisw(7, 8)
#if BR_POWER8_LE
		lxvw4x(40, 0, %[idx2be])
#endif

		/*
		 * Loop must run 7 times. Each iteration produces two
		 * subkeys.
		 */
		li(%[cc], 7)
		mtctr(%[cc])
		li(%[cc], 16)
	label(loop)

		/*
		 * Current words are in v2:v3. Compute next word in v4.
		 */
		vrlw(10, 3, 1)
		vsbox(10, 10)
		vxor(10, 10, 6)
		vspltw(10, 10, 3)

		vsldoi(4, 0, 2, 12)
		vxor(4, 2, 4)
		vsldoi(5, 0, 4, 12)
		vxor(4, 4, 5)
		vsldoi(5, 0, 4, 12)
		vxor(4, 4, 5)
		vxor(4, 4, 10)

		/*
		 * Then other word in v5.
		 */
		vsbox(10, 4)
		vspltw(10, 10, 3)

		vsldoi(5, 0, 3, 12)
		vxor(5, 3, 5)
		vsldoi(11, 0, 5, 12)
		vxor(5, 5, 11)
		vsldoi(11, 0, 5, 12)
		vxor(5, 5, 11)
		vxor(5, 5, 10)

		/*
		 * Update Rcon. Since for a 256-bit key, we use only 7
		 * such constants, we will not hit the field modulus,
		 * so a simple shift (addition) works well.
		 */
		vadduwm(6, 6, 6)

		/*
		 * Write out the two left 128-bit words
		 */
#if BR_POWER8_LE
		vperm(10, 2, 2, 8)
		vperm(11, 3, 3, 8)
		stxvw4x(42, 0, %[sk])
		stxvw4x(43, %[cc], %[sk])
#else
		stxvw4x(34, 0, %[sk])
		stxvw4x(35, %[cc], %[sk])
#endif
		addi(%[sk], %[sk], 32)

		/*
		 * Replace v2:v3 with v4:v5.
		 */
		vxor(2, 0, 4)
		vxor(3, 0, 5)

		bdnz(loop)

		/*
		 * The loop wrote the first 14 subkeys, but we need 15,
		 * so we must do an extra write.
		 */
#if BR_POWER8_LE
		vperm(10, 2, 2, 8)
		stxvw4x(42, 0, %[sk])
#else
		stxvw4x(34, 0, %[sk])
#endif

: [sk] "+b" (sk), [cc] "+b" (cc)
: [key] "b" (key)
#if BR_POWER8_LE
	, [idx2be] "b" (idx2be)
#endif
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
	);
}
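
/*
 * For reference (illustrative only, kept out of the build): a plain C
 * sketch of the AES-256 expansion steps that each loop iteration
 * above performs. The first new subkey applies RotWord, SubWord and
 * Rcon (the first vsbox); the second applies SubWord alone (the
 * vsbox(10, 4) with no rotation and no Rcon), as FIPS-197 specifies
 * for Nk = 8. SubWord() is the same hypothetical S-box helper as in
 * the sketch after key_schedule_128().
 */
#if 0
static void
key_schedule_256_ref(unsigned char *sk, const unsigned char *key)
{
	uint32_t w[60];   /* 15 subkeys, 4 words each */
	uint32_t rcon;
	int i;

	for (i = 0; i < 8; i ++) {
		w[i] = br_dec32be(key + (i << 2));
	}
	rcon = 1;
	for (i = 8; i < 60; i ++) {
		uint32_t t;

		t = w[i - 1];
		if ((i & 7) == 0) {
			/* RotWord + SubWord + Rcon. */
			t = SubWord((t << 8) | (t >> 24)) ^ (rcon << 24);
			/* Only 7 constants are used: no reduction needed. */
			rcon <<= 1;
		} else if ((i & 7) == 4) {
			/* SubWord only, without rotation or Rcon. */
			t = SubWord(t);
		}
		w[i] = w[i - 8] ^ t;
	}
	for (i = 0; i < 60; i ++) {
		br_enc32be(sk + (i << 2), w[i]);
	}
}
#endif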

/* see inner.h */
int
br_aes_pwr8_supported(void)
{
	return 1;
}

/* see inner.h */
unsigned
br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len)
{
	switch (len) {
	case 16:
		key_schedule_128(sk, key);
		return 10;
	case 24:
		key_schedule_192(sk, key);
		return 12;
	default:
		key_schedule_256(sk, key);
		return 14;
	}
}
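
/*
 * The returned round count follows the usual AES rule
 * (rounds = key length in bytes / 4 + 6): 16 -> 10, 24 -> 12,
 * 32 -> 14. In every case the sk[] buffer receives
 * (rounds + 1) * 16 bytes of subkeys.
 */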

#endif
