1/*
2 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#define BR_POWER_ASM_MACROS   1
26#include "inner.h"
27
28#if BR_POWER8
29
30/* see bearssl_block.h */
31const br_block_ctrcbc_class *
32br_aes_pwr8_ctrcbc_get_vtable(void)
33{
34	return br_aes_pwr8_supported() ? &br_aes_pwr8_ctrcbc_vtable : NULL;
35}
36
/* see bearssl_block.h */
void
br_aes_pwr8_ctrcbc_init(br_aes_pwr8_ctrcbc_keys *ctx,
	const void *key, size_t len)
{
	/*
	 * Set the vtable pointer, then expand the AES key schedule into
	 * the context. The key schedule routine returns the number of
	 * rounds (the dispatchers below switch on 10 / 12 / other).
	 */
	ctx->vtable = &br_aes_pwr8_ctrcbc_vtable;
	ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len);
}
45
46/*
47 * Register conventions for CTR + CBC-MAC:
48 *
 *   AES subkeys are in registers 0 to 10/12/14 (depending on key size)
50 *   Register v15 contains the byteswap index register (little-endian only)
51 *   Register v16 contains the CTR counter value
52 *   Register v17 contains the CBC-MAC current value
53 *   Registers v18 to v27 are scratch
54 *   Counter increment uses v28, v29 and v30
55 *
56 * For CTR alone:
57 *
 *   AES subkeys are in registers 0 to 10/12/14 (depending on key size)
59 *   Register v15 contains the byteswap index register (little-endian only)
60 *   Registers v16 to v19 contain the CTR counter values (four blocks)
61 *   Registers v20 to v27 are scratch
62 *   Counter increment uses v28, v29 and v30
63 */
64
/*
 * Load the 11 AES-128 subkeys (16 bytes each) from the key schedule at
 * %[sk] into VSX registers 32..42 (i.e. vector registers v0..v10).
 * %[cc] is used as a running byte offset into the key schedule; it is
 * left at the offset of the last subkey, so callers reset it afterwards
 * (with li(%[cc], 0)).
 */
#define LOAD_SUBKEYS_128 \
		lxvw4x(32, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(33, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(34, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(35, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(36, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(37, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(38, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(39, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(40, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(41, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(42, %[cc], %[sk])

/*
 * AES-192: the two extra subkeys go into VSX 43..44 (v11..v12).
 */
#define LOAD_SUBKEYS_192 \
		LOAD_SUBKEYS_128 \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(43, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(44, %[cc], %[sk])

/*
 * AES-256: two more subkeys into VSX 45..46 (v13..v14).
 */
#define LOAD_SUBKEYS_256 \
		LOAD_SUBKEYS_192 \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(45, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(46, %[cc], %[sk])
101
/*
 * Encrypt one AES block held in vector register x, in place, using the
 * AES-128 subkeys in v0..v10: initial whitening (vxor with v0), nine
 * middle rounds (vcipher), final round (vcipherlast).
 */
#define BLOCK_ENCRYPT_128(x) \
		vxor(x, x, 0) \
		vcipher(x, x, 1) \
		vcipher(x, x, 2) \
		vcipher(x, x, 3) \
		vcipher(x, x, 4) \
		vcipher(x, x, 5) \
		vcipher(x, x, 6) \
		vcipher(x, x, 7) \
		vcipher(x, x, 8) \
		vcipher(x, x, 9) \
		vcipherlast(x, x, 10)

/*
 * Same as BLOCK_ENCRYPT_128 but with 12 rounds (AES-192 subkeys in
 * v0..v12).
 */
#define BLOCK_ENCRYPT_192(x) \
		vxor(x, x, 0) \
		vcipher(x, x, 1) \
		vcipher(x, x, 2) \
		vcipher(x, x, 3) \
		vcipher(x, x, 4) \
		vcipher(x, x, 5) \
		vcipher(x, x, 6) \
		vcipher(x, x, 7) \
		vcipher(x, x, 8) \
		vcipher(x, x, 9) \
		vcipher(x, x, 10) \
		vcipher(x, x, 11) \
		vcipherlast(x, x, 12)

/*
 * Same as BLOCK_ENCRYPT_128 but with 14 rounds (AES-256 subkeys in
 * v0..v14).
 */
#define BLOCK_ENCRYPT_256(x) \
		vxor(x, x, 0) \
		vcipher(x, x, 1) \
		vcipher(x, x, 2) \
		vcipher(x, x, 3) \
		vcipher(x, x, 4) \
		vcipher(x, x, 5) \
		vcipher(x, x, 6) \
		vcipher(x, x, 7) \
		vcipher(x, x, 8) \
		vcipher(x, x, 9) \
		vcipher(x, x, 10) \
		vcipher(x, x, 11) \
		vcipher(x, x, 12) \
		vcipher(x, x, 13) \
		vcipherlast(x, x, 14)
146
/*
 * Encrypt two AES blocks (registers x and y) in parallel, AES-128.
 * Rounds are interleaved so the two independent vcipher chains can
 * overlap in the pipeline.
 */
#define BLOCK_ENCRYPT_X2_128(x, y) \
		vxor(x, x, 0) \
		vxor(y, y, 0) \
		vcipher(x, x, 1) \
		vcipher(y, y, 1) \
		vcipher(x, x, 2) \
		vcipher(y, y, 2) \
		vcipher(x, x, 3) \
		vcipher(y, y, 3) \
		vcipher(x, x, 4) \
		vcipher(y, y, 4) \
		vcipher(x, x, 5) \
		vcipher(y, y, 5) \
		vcipher(x, x, 6) \
		vcipher(y, y, 6) \
		vcipher(x, x, 7) \
		vcipher(y, y, 7) \
		vcipher(x, x, 8) \
		vcipher(y, y, 8) \
		vcipher(x, x, 9) \
		vcipher(y, y, 9) \
		vcipherlast(x, x, 10) \
		vcipherlast(y, y, 10)

/*
 * Two parallel blocks, AES-192 (12 rounds).
 */
#define BLOCK_ENCRYPT_X2_192(x, y) \
		vxor(x, x, 0) \
		vxor(y, y, 0) \
		vcipher(x, x, 1) \
		vcipher(y, y, 1) \
		vcipher(x, x, 2) \
		vcipher(y, y, 2) \
		vcipher(x, x, 3) \
		vcipher(y, y, 3) \
		vcipher(x, x, 4) \
		vcipher(y, y, 4) \
		vcipher(x, x, 5) \
		vcipher(y, y, 5) \
		vcipher(x, x, 6) \
		vcipher(y, y, 6) \
		vcipher(x, x, 7) \
		vcipher(y, y, 7) \
		vcipher(x, x, 8) \
		vcipher(y, y, 8) \
		vcipher(x, x, 9) \
		vcipher(y, y, 9) \
		vcipher(x, x, 10) \
		vcipher(y, y, 10) \
		vcipher(x, x, 11) \
		vcipher(y, y, 11) \
		vcipherlast(x, x, 12) \
		vcipherlast(y, y, 12)

/*
 * Two parallel blocks, AES-256 (14 rounds).
 */
#define BLOCK_ENCRYPT_X2_256(x, y) \
		vxor(x, x, 0) \
		vxor(y, y, 0) \
		vcipher(x, x, 1) \
		vcipher(y, y, 1) \
		vcipher(x, x, 2) \
		vcipher(y, y, 2) \
		vcipher(x, x, 3) \
		vcipher(y, y, 3) \
		vcipher(x, x, 4) \
		vcipher(y, y, 4) \
		vcipher(x, x, 5) \
		vcipher(y, y, 5) \
		vcipher(x, x, 6) \
		vcipher(y, y, 6) \
		vcipher(x, x, 7) \
		vcipher(y, y, 7) \
		vcipher(x, x, 8) \
		vcipher(y, y, 8) \
		vcipher(x, x, 9) \
		vcipher(y, y, 9) \
		vcipher(x, x, 10) \
		vcipher(y, y, 10) \
		vcipher(x, x, 11) \
		vcipher(y, y, 11) \
		vcipher(x, x, 12) \
		vcipher(y, y, 12) \
		vcipher(x, x, 13) \
		vcipher(y, y, 13) \
		vcipherlast(x, x, 14) \
		vcipherlast(y, y, 14)
230
/*
 * Encrypt four AES blocks (registers x0..x3) in parallel, AES-128;
 * used by the four-way CTR routine. Rounds are interleaved across the
 * four independent chains for pipeline throughput.
 */
#define BLOCK_ENCRYPT_X4_128(x0, x1, x2, x3) \
		vxor(x0, x0, 0) \
		vxor(x1, x1, 0) \
		vxor(x2, x2, 0) \
		vxor(x3, x3, 0) \
		vcipher(x0, x0, 1) \
		vcipher(x1, x1, 1) \
		vcipher(x2, x2, 1) \
		vcipher(x3, x3, 1) \
		vcipher(x0, x0, 2) \
		vcipher(x1, x1, 2) \
		vcipher(x2, x2, 2) \
		vcipher(x3, x3, 2) \
		vcipher(x0, x0, 3) \
		vcipher(x1, x1, 3) \
		vcipher(x2, x2, 3) \
		vcipher(x3, x3, 3) \
		vcipher(x0, x0, 4) \
		vcipher(x1, x1, 4) \
		vcipher(x2, x2, 4) \
		vcipher(x3, x3, 4) \
		vcipher(x0, x0, 5) \
		vcipher(x1, x1, 5) \
		vcipher(x2, x2, 5) \
		vcipher(x3, x3, 5) \
		vcipher(x0, x0, 6) \
		vcipher(x1, x1, 6) \
		vcipher(x2, x2, 6) \
		vcipher(x3, x3, 6) \
		vcipher(x0, x0, 7) \
		vcipher(x1, x1, 7) \
		vcipher(x2, x2, 7) \
		vcipher(x3, x3, 7) \
		vcipher(x0, x0, 8) \
		vcipher(x1, x1, 8) \
		vcipher(x2, x2, 8) \
		vcipher(x3, x3, 8) \
		vcipher(x0, x0, 9) \
		vcipher(x1, x1, 9) \
		vcipher(x2, x2, 9) \
		vcipher(x3, x3, 9) \
		vcipherlast(x0, x0, 10) \
		vcipherlast(x1, x1, 10) \
		vcipherlast(x2, x2, 10) \
		vcipherlast(x3, x3, 10)

/*
 * Four parallel blocks, AES-192 (12 rounds).
 */
#define BLOCK_ENCRYPT_X4_192(x0, x1, x2, x3) \
		vxor(x0, x0, 0) \
		vxor(x1, x1, 0) \
		vxor(x2, x2, 0) \
		vxor(x3, x3, 0) \
		vcipher(x0, x0, 1) \
		vcipher(x1, x1, 1) \
		vcipher(x2, x2, 1) \
		vcipher(x3, x3, 1) \
		vcipher(x0, x0, 2) \
		vcipher(x1, x1, 2) \
		vcipher(x2, x2, 2) \
		vcipher(x3, x3, 2) \
		vcipher(x0, x0, 3) \
		vcipher(x1, x1, 3) \
		vcipher(x2, x2, 3) \
		vcipher(x3, x3, 3) \
		vcipher(x0, x0, 4) \
		vcipher(x1, x1, 4) \
		vcipher(x2, x2, 4) \
		vcipher(x3, x3, 4) \
		vcipher(x0, x0, 5) \
		vcipher(x1, x1, 5) \
		vcipher(x2, x2, 5) \
		vcipher(x3, x3, 5) \
		vcipher(x0, x0, 6) \
		vcipher(x1, x1, 6) \
		vcipher(x2, x2, 6) \
		vcipher(x3, x3, 6) \
		vcipher(x0, x0, 7) \
		vcipher(x1, x1, 7) \
		vcipher(x2, x2, 7) \
		vcipher(x3, x3, 7) \
		vcipher(x0, x0, 8) \
		vcipher(x1, x1, 8) \
		vcipher(x2, x2, 8) \
		vcipher(x3, x3, 8) \
		vcipher(x0, x0, 9) \
		vcipher(x1, x1, 9) \
		vcipher(x2, x2, 9) \
		vcipher(x3, x3, 9) \
		vcipher(x0, x0, 10) \
		vcipher(x1, x1, 10) \
		vcipher(x2, x2, 10) \
		vcipher(x3, x3, 10) \
		vcipher(x0, x0, 11) \
		vcipher(x1, x1, 11) \
		vcipher(x2, x2, 11) \
		vcipher(x3, x3, 11) \
		vcipherlast(x0, x0, 12) \
		vcipherlast(x1, x1, 12) \
		vcipherlast(x2, x2, 12) \
		vcipherlast(x3, x3, 12)

/*
 * Four parallel blocks, AES-256 (14 rounds).
 */
#define BLOCK_ENCRYPT_X4_256(x0, x1, x2, x3) \
		vxor(x0, x0, 0) \
		vxor(x1, x1, 0) \
		vxor(x2, x2, 0) \
		vxor(x3, x3, 0) \
		vcipher(x0, x0, 1) \
		vcipher(x1, x1, 1) \
		vcipher(x2, x2, 1) \
		vcipher(x3, x3, 1) \
		vcipher(x0, x0, 2) \
		vcipher(x1, x1, 2) \
		vcipher(x2, x2, 2) \
		vcipher(x3, x3, 2) \
		vcipher(x0, x0, 3) \
		vcipher(x1, x1, 3) \
		vcipher(x2, x2, 3) \
		vcipher(x3, x3, 3) \
		vcipher(x0, x0, 4) \
		vcipher(x1, x1, 4) \
		vcipher(x2, x2, 4) \
		vcipher(x3, x3, 4) \
		vcipher(x0, x0, 5) \
		vcipher(x1, x1, 5) \
		vcipher(x2, x2, 5) \
		vcipher(x3, x3, 5) \
		vcipher(x0, x0, 6) \
		vcipher(x1, x1, 6) \
		vcipher(x2, x2, 6) \
		vcipher(x3, x3, 6) \
		vcipher(x0, x0, 7) \
		vcipher(x1, x1, 7) \
		vcipher(x2, x2, 7) \
		vcipher(x3, x3, 7) \
		vcipher(x0, x0, 8) \
		vcipher(x1, x1, 8) \
		vcipher(x2, x2, 8) \
		vcipher(x3, x3, 8) \
		vcipher(x0, x0, 9) \
		vcipher(x1, x1, 9) \
		vcipher(x2, x2, 9) \
		vcipher(x3, x3, 9) \
		vcipher(x0, x0, 10) \
		vcipher(x1, x1, 10) \
		vcipher(x2, x2, 10) \
		vcipher(x3, x3, 10) \
		vcipher(x0, x0, 11) \
		vcipher(x1, x1, 11) \
		vcipher(x2, x2, 11) \
		vcipher(x3, x3, 11) \
		vcipher(x0, x0, 12) \
		vcipher(x1, x1, 12) \
		vcipher(x2, x2, 12) \
		vcipher(x3, x3, 12) \
		vcipher(x0, x0, 13) \
		vcipher(x1, x1, 13) \
		vcipher(x2, x2, 13) \
		vcipher(x3, x3, 13) \
		vcipherlast(x0, x0, 14) \
		vcipherlast(x1, x1, 14) \
		vcipherlast(x2, x2, 14) \
		vcipherlast(x3, x3, 14)
392
#if BR_POWER8_LE
/*
 * vperm index pattern used on little-endian builds to convert loaded
 * vectors between the in-memory byte order and the lane order the AES
 * opcodes expect. Loaded once into VSX 47 (v15) by BYTESWAP_INIT.
 */
static const uint32_t idx2be[] = {
	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
};
#define BYTESWAP_INIT     lxvw4x(47, 0, %[idx2be])
#define BYTESWAP(x)       vperm(x, x, x, 15)
#define BYTESWAPX(d, s)   vperm(d, s, s, 15)
#define BYTESWAP_REG      , [idx2be] "b" (idx2be)
#else
/*
 * Big-endian: no byte swapping needed. BYTESWAPX degenerates to a
 * plain register copy (vand of a register with itself).
 */
#define BYTESWAP_INIT
#define BYTESWAP(x)
#define BYTESWAPX(d, s)   vand(d, s, s)
#define BYTESWAP_REG
#endif

/* 128-bit big-endian constant 1: increment for one block at a time. */
static const uint32_t ctrinc[] = {
	0, 0, 0, 1
};
/* 128-bit big-endian constant 4: increment for four blocks at a time. */
static const uint32_t ctrinc_x4[] = {
	0, 0, 0, 4
};
/* Load the relevant increment constant into VSX 60 (v28). */
#define INCR_128_INIT      lxvw4x(60, 0, %[ctrinc])
#define INCR_128_X4_INIT   lxvw4x(60, 0, %[ctrinc_x4])
/*
 * d <- s + v28 as a 128-bit integer: add the four 32-bit words
 * (vadduwm), then propagate the carries (vaddcuw computes carry-out,
 * vsldoi shifts it into the next word) across all four words. Uses
 * v29 and v30 as scratch; branch-free, hence constant-time.
 */
#define INCR_128(d, s) \
		vaddcuw(29, s, 28) \
		vadduwm(d, s, 28) \
		vsldoi(30, 29, 29, 4) \
		vaddcuw(29, d, 30) \
		vadduwm(d, d, 30) \
		vsldoi(30, 29, 29, 4) \
		vaddcuw(29, d, 30) \
		vadduwm(d, d, 30) \
		vsldoi(30, 29, 29, 4) \
		vadduwm(d, d, 30)
427
/*
 * Generate a CTR-only function for the given key size. The function
 * processes num_blocks_x4 groups of four 16-byte blocks from buf: the
 * four counter values held in ctrbuf[0..63] are AES-encrypted and
 * XORed into the data, then each counter is incremented by 4. The
 * final counter values are written back to ctrbuf. The caller must
 * supply whole groups of four blocks (partial tails are padded by the
 * dispatcher below).
 */
#define MKCTR(size) \
static void \
ctr_ ## size(const unsigned char *sk, \
	unsigned char *ctrbuf, unsigned char *buf, size_t num_blocks_x4) \
{ \
	long cc, cc0, cc1, cc2, cc3; \
 \
	cc = 0; \
	cc0 = 0; \
	cc1 = 16; \
	cc2 = 32; \
	cc3 = 48; \
	asm volatile ( \
 \
		/* \
		 * Load subkeys into v0..v10 \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
 \
		BYTESWAP_INIT \
		INCR_128_X4_INIT \
 \
		/* \
		 * Load current CTR counters into v16 to v19. \
		 */ \
		lxvw4x(48, %[cc0], %[ctrbuf]) \
		lxvw4x(49, %[cc1], %[ctrbuf]) \
		lxvw4x(50, %[cc2], %[ctrbuf]) \
		lxvw4x(51, %[cc3], %[ctrbuf]) \
		BYTESWAP(16) \
		BYTESWAP(17) \
		BYTESWAP(18) \
		BYTESWAP(19) \
 \
		mtctr(%[num_blocks_x4]) \
 \
	label(loop) \
		/* \
		 * Compute next counter values into v20..v23. \
		 */ \
		INCR_128(20, 16) \
		INCR_128(21, 17) \
		INCR_128(22, 18) \
		INCR_128(23, 19) \
 \
		/* \
		 * Encrypt counter values and XOR into next data blocks. \
		 */ \
		lxvw4x(56, %[cc0], %[buf]) \
		lxvw4x(57, %[cc1], %[buf]) \
		lxvw4x(58, %[cc2], %[buf]) \
		lxvw4x(59, %[cc3], %[buf]) \
		BYTESWAP(24) \
		BYTESWAP(25) \
		BYTESWAP(26) \
		BYTESWAP(27) \
		BLOCK_ENCRYPT_X4_ ## size(16, 17, 18, 19) \
		vxor(16, 16, 24) \
		vxor(17, 17, 25) \
		vxor(18, 18, 26) \
		vxor(19, 19, 27) \
		BYTESWAP(16) \
		BYTESWAP(17) \
		BYTESWAP(18) \
		BYTESWAP(19) \
		stxvw4x(48, %[cc0], %[buf]) \
		stxvw4x(49, %[cc1], %[buf]) \
		stxvw4x(50, %[cc2], %[buf]) \
		stxvw4x(51, %[cc3], %[buf]) \
 \
		/* \
		 * Update counters and data pointer. \
		 */ \
		vand(16, 20, 20) \
		vand(17, 21, 21) \
		vand(18, 22, 22) \
		vand(19, 23, 23) \
		addi(%[buf], %[buf], 64) \
 \
		bdnz(loop) \
 \
		/* \
		 * Write back new counter values. \
		 */ \
		BYTESWAP(16) \
		BYTESWAP(17) \
		BYTESWAP(18) \
		BYTESWAP(19) \
		stxvw4x(48, %[cc0], %[ctrbuf]) \
		stxvw4x(49, %[cc1], %[ctrbuf]) \
		stxvw4x(50, %[cc2], %[ctrbuf]) \
		stxvw4x(51, %[cc3], %[ctrbuf]) \
 \
: [cc] "+b" (cc), [buf] "+b" (buf), \
	[cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3) \
: [sk] "b" (sk), [ctrbuf] "b" (ctrbuf), \
	[num_blocks_x4] "b" (num_blocks_x4), [ctrinc_x4] "b" (ctrinc_x4) \
	BYTESWAP_REG \
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
  "v30", "ctr", "memory" \
	); \
}

/* Instantiate ctr_128, ctr_192 and ctr_256. */
MKCTR(128)
MKCTR(192)
MKCTR(256)
537
/*
 * Generate a CBC-MAC function for the given key size. The function
 * folds num_blocks 16-byte blocks from buf into the running CBC-MAC
 * value stored at cbcmac (XOR then encrypt, per block), and writes the
 * updated MAC value back to cbcmac. num_blocks must be nonzero (the
 * dispatcher below guarantees this).
 */
#define MKCBCMAC(size) \
static void \
cbcmac_ ## size(const unsigned char *sk, \
	unsigned char *cbcmac, const unsigned char *buf, size_t num_blocks) \
{ \
	long cc; \
 \
	cc = 0; \
	asm volatile ( \
 \
		/* \
		 * Load subkeys into v0..v10 \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
 \
		BYTESWAP_INIT \
 \
		/* \
		 * Load current CBC-MAC value into v16. \
		 */ \
		lxvw4x(48, %[cc], %[cbcmac]) \
		BYTESWAP(16) \
 \
		mtctr(%[num_blocks]) \
 \
	label(loop) \
		/* \
		 * Load next block, XOR into current CBC-MAC value, \
		 * and then encrypt it. \
		 */ \
		lxvw4x(49, %[cc], %[buf]) \
		BYTESWAP(17) \
		vxor(16, 16, 17) \
		BLOCK_ENCRYPT_ ## size(16) \
		addi(%[buf], %[buf], 16) \
 \
		bdnz(loop) \
 \
		/* \
		 * Write back new CBC-MAC value. \
		 */ \
		BYTESWAP(16) \
		stxvw4x(48, %[cc], %[cbcmac]) \
 \
: [cc] "+b" (cc), [buf] "+b" (buf) \
: [sk] "b" (sk), [cbcmac] "b" (cbcmac), [num_blocks] "b" (num_blocks) \
	BYTESWAP_REG \
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
  "v30", "ctr", "memory" \
	); \
}

/* Instantiate cbcmac_128, cbcmac_192 and cbcmac_256. */
MKCBCMAC(128)
MKCBCMAC(192)
MKCBCMAC(256)
596
/*
 * Generate the combined CTR encryption + CBC-MAC (over the produced
 * ciphertext) function for the given key size. Counter (ctr) and MAC
 * value (cbcmac) are updated in place; buf is encrypted in place,
 * num_blocks 16-byte blocks. The loop is software-pipelined: each
 * iteration runs the next counter encryption and the CBC-MAC of the
 * previous ciphertext block as two parallel AES instances (see the
 * comments inside the asm).
 */
#define MKENCRYPT(size) \
static void \
ctrcbc_ ## size ## _encrypt(const unsigned char *sk, \
	unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \
	size_t num_blocks) \
{ \
	long cc; \
 \
	cc = 0; \
	asm volatile ( \
 \
		/* \
		 * Load subkeys into v0..v10 \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
 \
		BYTESWAP_INIT \
		INCR_128_INIT \
 \
		/* \
		 * Load current CTR counter into v16, and current \
		 * CBC-MAC IV into v17. \
		 */ \
		lxvw4x(48, %[cc], %[ctr]) \
		lxvw4x(49, %[cc], %[cbcmac]) \
		BYTESWAP(16) \
		BYTESWAP(17) \
 \
		/* \
		 * At each iteration, we do two parallel encryption: \
		 *  - new counter value for encryption of the next block; \
		 *  - CBC-MAC over the previous encrypted block. \
		 * Thus, each plaintext block implies two AES instances, \
		 * over two successive iterations. This requires a single \
		 * counter encryption before the loop, and a single \
		 * CBC-MAC encryption after the loop. \
		 */ \
 \
		/* \
		 * Encrypt first block (into v20). \
		 */ \
		lxvw4x(52, %[cc], %[buf]) \
		BYTESWAP(20) \
		INCR_128(22, 16) \
		BLOCK_ENCRYPT_ ## size(16) \
		vxor(20, 20, 16) \
		BYTESWAPX(21, 20) \
		stxvw4x(53, %[cc], %[buf]) \
		vand(16, 22, 22) \
		addi(%[buf], %[buf], 16) \
 \
		/* \
		 * Load loop counter; skip the loop if there is only \
		 * one block in total (already handled by the boundary \
		 * conditions). \
		 */ \
		mtctr(%[num_blocks]) \
		bdz(fastexit) \
 \
	label(loop) \
		/* \
		 * Upon loop entry: \
		 *    v16   counter value for next block \
		 *    v17   current CBC-MAC value \
		 *    v20   encrypted previous block \
		 */ \
		vxor(17, 17, 20) \
		INCR_128(22, 16) \
		lxvw4x(52, %[cc], %[buf]) \
		BYTESWAP(20) \
		BLOCK_ENCRYPT_X2_ ## size(16, 17) \
		vxor(20, 20, 16) \
		BYTESWAPX(21, 20) \
		stxvw4x(53, %[cc], %[buf]) \
		addi(%[buf], %[buf], 16) \
		vand(16, 22, 22) \
 \
		bdnz(loop) \
 \
	label(fastexit) \
		vxor(17, 17, 20) \
		BLOCK_ENCRYPT_ ## size(17) \
		BYTESWAP(16) \
		BYTESWAP(17) \
		stxvw4x(48, %[cc], %[ctr]) \
		stxvw4x(49, %[cc], %[cbcmac]) \
 \
: [cc] "+b" (cc), [buf] "+b" (buf) \
: [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \
	[num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \
	BYTESWAP_REG \
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
  "v30", "ctr", "memory" \
	); \
}

/* Instantiate ctrcbc_128_encrypt, ctrcbc_192_encrypt, ctrcbc_256_encrypt. */
MKENCRYPT(128)
MKENCRYPT(192)
MKENCRYPT(256)
699
/*
 * Generate the combined CTR decryption + CBC-MAC (over the incoming
 * ciphertext) function for the given key size. Counter (ctr) and MAC
 * value (cbcmac) are updated in place; buf is decrypted in place,
 * num_blocks 16-byte blocks. Since the MAC covers the ciphertext as
 * it is read, each iteration runs both AES instances for the current
 * block; no pre-/post-loop work is needed (unlike encryption).
 */
#define MKDECRYPT(size) \
static void \
ctrcbc_ ## size ## _decrypt(const unsigned char *sk, \
	unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \
	size_t num_blocks) \
{ \
	long cc; \
 \
	cc = 0; \
	asm volatile ( \
 \
		/* \
		 * Load subkeys into v0..v10 \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
 \
		BYTESWAP_INIT \
		INCR_128_INIT \
 \
		/* \
		 * Load current CTR counter into v16, and current \
		 * CBC-MAC IV into v17. \
		 */ \
		lxvw4x(48, %[cc], %[ctr]) \
		lxvw4x(49, %[cc], %[cbcmac]) \
		BYTESWAP(16) \
		BYTESWAP(17) \
 \
		/* \
		 * At each iteration, we do two parallel encryption: \
		 *  - new counter value for decryption of the next block; \
		 *  - CBC-MAC over the next encrypted block. \
		 * Each iteration performs the two AES instances related \
		 * to the current block; there is thus no need for some \
		 * extra pre-loop and post-loop work as in encryption. \
		 */ \
 \
		mtctr(%[num_blocks]) \
 \
	label(loop) \
		/* \
		 * Upon loop entry: \
		 *    v16   counter value for next block \
		 *    v17   current CBC-MAC value \
		 */ \
		lxvw4x(52, %[cc], %[buf]) \
		BYTESWAP(20) \
		vxor(17, 17, 20) \
		INCR_128(22, 16) \
		BLOCK_ENCRYPT_X2_ ## size(16, 17) \
		vxor(20, 20, 16) \
		BYTESWAPX(21, 20) \
		stxvw4x(53, %[cc], %[buf]) \
		addi(%[buf], %[buf], 16) \
		vand(16, 22, 22) \
 \
		bdnz(loop) \
 \
		/* \
		 * Store back counter and CBC-MAC value. \
		 */ \
		BYTESWAP(16) \
		BYTESWAP(17) \
		stxvw4x(48, %[cc], %[ctr]) \
		stxvw4x(49, %[cc], %[cbcmac]) \
 \
: [cc] "+b" (cc), [buf] "+b" (buf) \
: [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \
	[num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \
	BYTESWAP_REG \
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
  "v30", "ctr", "memory" \
	); \
}

/* Instantiate ctrcbc_128_decrypt, ctrcbc_192_decrypt, ctrcbc_256_decrypt. */
MKDECRYPT(128)
MKDECRYPT(192)
MKDECRYPT(256)
781
782/* see bearssl_block.h */
783void
784br_aes_pwr8_ctrcbc_encrypt(const br_aes_pwr8_ctrcbc_keys *ctx,
785	void *ctr, void *cbcmac, void *data, size_t len)
786{
787	if (len == 0) {
788		return;
789	}
790	switch (ctx->num_rounds) {
791	case 10:
792		ctrcbc_128_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
793		break;
794	case 12:
795		ctrcbc_192_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
796		break;
797	default:
798		ctrcbc_256_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
799		break;
800	}
801}
802
803/* see bearssl_block.h */
804void
805br_aes_pwr8_ctrcbc_decrypt(const br_aes_pwr8_ctrcbc_keys *ctx,
806	void *ctr, void *cbcmac, void *data, size_t len)
807{
808	if (len == 0) {
809		return;
810	}
811	switch (ctx->num_rounds) {
812	case 10:
813		ctrcbc_128_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
814		break;
815	case 12:
816		ctrcbc_192_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
817		break;
818	default:
819		ctrcbc_256_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
820		break;
821	}
822}
823
/*
 * 128-bit big-endian increment: dst <- src + 1, where both are 16-byte
 * big-endian integers (dst and src may alias). The carry from the low
 * 64-bit word into the high word is computed without a branch:
 * ((lo | -lo) >> 63) is 1 when lo != 0 and 0 when lo == 0, so XORing
 * it with 1 adds the carry exactly when the low word wrapped to zero.
 */
static inline void
incr_ctr(void *dst, const void *src)
{
	uint64_t hi, lo;

	hi = br_dec64be(src);
	lo = br_dec64be((const unsigned char *)src + 8);
	lo ++;
	hi += ((lo | -lo) >> 63) ^ (uint64_t)1;
	br_enc64be(dst, hi);
	br_enc64be((unsigned char *)dst + 8, lo);
}
836
/* see bearssl_block.h */
void
br_aes_pwr8_ctrcbc_ctr(const br_aes_pwr8_ctrcbc_keys *ctx,
	void *ctr, void *data, size_t len)
{
	unsigned char ctrbuf[64];

	/*
	 * The core ctr_* routines process four blocks at a time, so we
	 * materialize four successive counter values in ctrbuf.
	 */
	memcpy(ctrbuf, ctr, 16);
	incr_ctr(ctrbuf + 16, ctrbuf);
	incr_ctr(ctrbuf + 32, ctrbuf + 16);
	incr_ctr(ctrbuf + 48, ctrbuf + 32);
	/* Process all whole 64-byte groups in place. */
	if (len >= 64) {
		switch (ctx->num_rounds) {
		case 10:
			ctr_128(ctx->skey.skni, ctrbuf, data, len >> 6);
			break;
		case 12:
			ctr_192(ctx->skey.skni, ctrbuf, data, len >> 6);
			break;
		default:
			ctr_256(ctx->skey.skni, ctrbuf, data, len >> 6);
			break;
		}
		data = (unsigned char *)data + (len & ~(size_t)63);
		len &= 63;
	}
	if (len > 0) {
		unsigned char tmp[64];

		/*
		 * Partial final group: pick the counter value to write
		 * back to ctr according to how many of the four blocks
		 * the remaining bytes span (ctrbuf holds the counters
		 * for blocks 0..3 of this group).
		 */
		if (len >= 32) {
			if (len >= 48) {
				memcpy(ctr, ctrbuf + 48, 16);
			} else {
				memcpy(ctr, ctrbuf + 32, 16);
			}
		} else {
			if (len >= 16) {
				memcpy(ctr, ctrbuf + 16, 16);
			}
		}
		/*
		 * Process the tail through a zero-padded 64-byte
		 * temporary (one group), then copy back only the bytes
		 * that belong to the caller's buffer.
		 */
		memcpy(tmp, data, len);
		memset(tmp + len, 0, (sizeof tmp) - len);
		switch (ctx->num_rounds) {
		case 10:
			ctr_128(ctx->skey.skni, ctrbuf, tmp, 1);
			break;
		case 12:
			ctr_192(ctx->skey.skni, ctrbuf, tmp, 1);
			break;
		default:
			ctr_256(ctx->skey.skni, ctrbuf, tmp, 1);
			break;
		}
		memcpy(data, tmp, len);
	} else {
		/* No tail: the next counter is the first of the group. */
		memcpy(ctr, ctrbuf, 16);
	}
}
895
896/* see bearssl_block.h */
897void
898br_aes_pwr8_ctrcbc_mac(const br_aes_pwr8_ctrcbc_keys *ctx,
899	void *cbcmac, const void *data, size_t len)
900{
901	if (len > 0) {
902		switch (ctx->num_rounds) {
903		case 10:
904			cbcmac_128(ctx->skey.skni, cbcmac, data, len >> 4);
905			break;
906		case 12:
907			cbcmac_192(ctx->skey.skni, cbcmac, data, len >> 4);
908			break;
909		default:
910			cbcmac_256(ctx->skey.skni, cbcmac, data, len >> 4);
911			break;
912		}
913	}
914}
915
/* see bearssl_block.h */
const br_block_ctrcbc_class br_aes_pwr8_ctrcbc_vtable = {
	sizeof(br_aes_pwr8_ctrcbc_keys),
	16, /* block size, in bytes */
	4,  /* presumably log2 of block size (16 == 1 << 4) — per vtable convention */
	/*
	 * The concrete functions take br_aes_pwr8_ctrcbc_keys contexts;
	 * the casts adapt them to the generic vtable signatures.
	 */
	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
		&br_aes_pwr8_ctrcbc_init,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_pwr8_ctrcbc_encrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_pwr8_ctrcbc_decrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, size_t))
		&br_aes_pwr8_ctrcbc_ctr,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, const void *, size_t))
		&br_aes_pwr8_ctrcbc_mac
};
936
937#else
938
/* see bearssl_block.h */
const br_block_ctrcbc_class *
br_aes_pwr8_ctrcbc_get_vtable(void)
{
	/* POWER8 crypto opcodes not compiled in: no vtable available. */
	return NULL;
}
945
946#endif
947