/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_POWER_ASM_MACROS   1
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the POWER8 opcodes.
 */
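
/*
 * GHASH computes y <- (y XOR b)*h in GF(2^128) for each 16-byte block b,
 * with multiplication done modulo the polynomial x^128 + x^7 + x^2 + x + 1.
 * The workhorse opcode is vpmsumd, which XORs together the two carryless
 * 64x64->127-bit products of the corresponding doublewords of its two
 * operands.
 */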

#if BR_POWER8

/*
 * Some symbolic names for registers.
 *   HB0 = 16 bytes of value 0
 *   HB1 = 16 bytes of value 1
 *   HB2 = 16 bytes of value 2
 *   HB6 = 16 bytes of value 6
 *   HB7 = 16 bytes of value 7
 *   TT0, TT1 and TT2 are temporaries
 *
 * BSW holds the pattern for byteswapping 32-bit words; this is set only
 * on little-endian systems. XBSW is the same register with the +32 offset
 * for access with the VSX opcodes.
 */
#define HB0     0
#define HB1     1
#define HB2     2
#define HB6     3
#define HB7     4
#define TT0     5
#define TT1     6
#define TT2     7

#define BSW     8
#define XBSW   40
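
/*
 * In the VSX opcodes used below (lxvw4x, stxvw4x, xxpermdi), VSX
 * registers 32 to 63 map onto vector registers 0 to 31; hence the +32
 * offsets (e.g. VSX register 41 is v9, and 60 is v28).
 */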

/*
 * Macro to initialise the constants.
 */
#define INIT \
		vxor(HB0, HB0, HB0) \
		vspltisb(HB1, 1) \
		vspltisb(HB2, 2) \
		vspltisb(HB6, 6) \
		vspltisb(HB7, 7) \
		INIT_BSW

/*
 * Fix endianness of a value after reading it or before writing it, if
 * necessary.
 */
#if BR_POWER8_LE
#define INIT_BSW         lxvw4x(XBSW, 0, %[idx2be])
#define FIX_ENDIAN(xx)   vperm(xx, xx, xx, BSW)
#else
#define INIT_BSW
#define FIX_ENDIAN(xx)
#endif
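
/*
 * The idx2be[] pattern (defined in br_ghash_pwr8() below) makes vperm
 * reverse the bytes within each 32-bit word, compensating for the
 * per-word byte order produced by lxvw4x on little-endian systems.
 */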

/*
 * Shift the 256-bit value x0:x1 left by one bit. This is a corrective
 * step, needed because GHASH is specified in a fully little-endian
 * convention while the multiply opcodes follow a big-endian convention,
 * so the 255-bit product ends up one bit to the right.
 */
#define SL_256(x0, x1) \
		vsldoi(TT0, HB0, x1, 1)   /* TT0 = x1 >> 120 (top byte of x1) */ \
		vsl(x0, x0, HB1)          /* x0 <<= 1 */ \
		vsr(TT0, TT0, HB7)        /* TT0 = x1 >> 127 (top bit of x1) */ \
		vsl(x1, x1, HB1)          /* x1 <<= 1 */ \
		vxor(x0, x0, TT0)         /* move x1's former top bit into x0 */

/*
 * Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same as
 * x0 or x1, or a different register). x0 and x1 are modified. The modulus
 * is the GHASH polynomial x^128 + x^7 + x^2 + x + 1; its low-degree terms
 * x, x^2 and x^7 account for the 1-, 2- and 7-bit shifts used below.
 */
#define REDUCE_F128(xd, x0, x1) \
		vxor(x0, x0, x1) \
		vsr(TT0, x1, HB1) \
		vsr(TT1, x1, HB2) \
		vsr(TT2, x1, HB7) \
		vxor(x0, x0, TT0) \
		vxor(TT1, TT1, TT2) \
		vxor(x0, x0, TT1) \
		vsldoi(x1, x1, HB0, 15) \
		vsl(TT1, x1, HB6) \
		vsl(TT2, x1, HB1) \
		vxor(x1, TT1, TT2) \
		vsr(TT0, x1, HB1) \
		vsr(TT1, x1, HB2) \
		vsr(TT2, x1, HB7) \
		vxor(x0, x0, x1) \
		vxor(x0, x0, TT0) \
		vxor(TT1, TT1, TT2) \
		vxor(xd, x0, TT1)

/* see bearssl_hash.h */
void
br_ghash_pwr8(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	size_t num4, num1;
	unsigned char tmp[64];
	long cc0, cc1, cc2, cc3;

#if BR_POWER8_LE
	static const uint32_t idx2be[] = {
		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
	};
#endif

	buf1 = data;

	/*
	 * The assembly code requires the data split into two chunks: the
	 * first chunk must contain a number of blocks which is a multiple
	 * of 4. Since processing of the first chunk is faster, we want to
	 * make it as big as possible.
	 *
	 * For the remainder, there are two possibilities:
	 *  -- if the remainder size is a multiple of 16, then use it
	 *     in place;
	 *  -- otherwise, copy it to the tmp[] array and pad it with
	 *     zeros.
	 */
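
	/*
	 * For example, with len = 100: num4 = 1 (one 64-byte group of four
	 * blocks), 36 bytes remain, and num1 = 3; since 36 is not a
	 * multiple of 16, the remainder is copied into tmp[] and padded
	 * with 12 zeros.
	 */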
	num4 = len >> 6;
	buf2 = buf1 + (num4 << 6);
	len &= 63;
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}

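	/*
	 * cc0..cc3 are byte offsets for the four lxvw4x loads in the main
	 * loop; lxvw4x uses indexed addressing (base register plus index
	 * register, no immediate offset), so these offsets must live in
	 * registers.
	 */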
	cc0 =  0;
	cc1 = 16;
	cc2 = 32;
	cc3 = 48;
	asm volatile (
		INIT

		/*
		 * Load current h (denoted hereafter h1) into v9.
		 */
		lxvw4x(41, 0, %[h])
		FIX_ENDIAN(9)

		/*
		 * Load current y into v28.
		 */
		lxvw4x(60, 0, %[y])
		FIX_ENDIAN(28)

		/*
		 * Split h1 into three registers:
		 *   v17 = h1_1:h1_0
		 *   v18 =    0:h1_0
		 *   v19 = h1_1:0
		 */
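		/*
		 * With one doubleword forced to zero, a vpmsumd against v18
		 * or v19 yields a single 127-bit product instead of the XOR
		 * of two; the swapped layout of v17 lets a single vpmsumd
		 * compute the sum of both cross products.
		 */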
		xxpermdi(49, 41, 41, 2)
		vsldoi(18, HB0, 9, 8)
		vsldoi(19, 9, HB0, 8)

		/*
		 * If num4 is 0, skip directly to the second chunk.
		 */
		cmpldi(%[num4], 0)
		beq(chunk1)

		/*
		 * Compute h2 = h*h in v10. Squaring in GF(2)[x] has no
		 * cross terms (they cancel in characteristic 2), so two
		 * vpmsumd on the zero-padded halves of h1 suffice.
		 */
		vpmsumd(10, 18, 18)
		vpmsumd(11, 19, 19)
		SL_256(10, 11)
		REDUCE_F128(10, 10, 11)

		/*
		 * Compute h3 = h*h*h in v11.
		 * v10 already holds h2 = h2_0:h2_1; we split it further:
		 *   v11 =    0:h2_0
		 *   v12 = h2_1:0
		 * Then we do the product with h1, and reduce into v11.
		 */
		vsldoi(11, HB0, 10, 8)
		vsldoi(12, 10, HB0, 8)
		vpmsumd(13, 10, 17)
		vpmsumd(11, 11, 18)
		vpmsumd(12, 12, 19)
		vsldoi(14, HB0, 13, 8)
		vsldoi(15, 13, HB0, 8)
		vxor(11, 11, 14)
		vxor(12, 12, 15)
		SL_256(11, 12)
		REDUCE_F128(11, 11, 12)

		/*
		 * Compute h4 = h*h*h*h in v12. This is done by squaring h2.
		 */
		vsldoi(12, HB0, 10, 8)
		vsldoi(13, 10, HB0, 8)
		vpmsumd(12, 12, 12)
		vpmsumd(13, 13, 13)
		SL_256(12, 13)
		REDUCE_F128(12, 12, 13)

		/*
		 * Repack h1, h2, h3 and h4:
		 *   v13 = h4_0:h3_0
		 *   v14 = h4_1:h3_1
		 *   v15 = h2_0:h1_0
		 *   v16 = h2_1:h1_1
		 */
		xxpermdi(45, 44, 43, 0)
		xxpermdi(46, 44, 43, 3)
		xxpermdi(47, 42, 41, 0)
		xxpermdi(48, 42, 41, 3)

		/*
		 * Loop for each group of four blocks.
		 */
		mtctr(%[num4])
	label(loop4)
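		/*
		 * One iteration computes
		 *   y <- ((y ^ a0)*h4) ^ (a1*h3) ^ (a2*h2) ^ (a3*h1)
		 * (reduced once at the end), which is equivalent to four
		 * sequential y <- (y ^ a)*h1 steps.
		 */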
		/*
		 * Read the next four blocks.
		 *   v20 = y + a0 = b0
		 *   v21 = a1     = b1
		 *   v22 = a2     = b2
		 *   v23 = a3     = b3
		 */
		lxvw4x(52, %[cc0], %[buf1])
		lxvw4x(53, %[cc1], %[buf1])
		lxvw4x(54, %[cc2], %[buf1])
		lxvw4x(55, %[cc3], %[buf1])
		FIX_ENDIAN(20)
		FIX_ENDIAN(21)
		FIX_ENDIAN(22)
		FIX_ENDIAN(23)
		addi(%[buf1], %[buf1], 64)
		vxor(20, 20, 28)

		/*
		 * Repack the blocks into v9, v10, v11 and v12.
		 *   v9  = b0_0:b1_0
		 *   v10 = b0_1:b1_1
		 *   v11 = b2_0:b3_0
		 *   v12 = b2_1:b3_1
		 */
		xxpermdi(41, 52, 53, 0)
		xxpermdi(42, 52, 53, 3)
		xxpermdi(43, 54, 55, 0)
		xxpermdi(44, 54, 55, 3)

		/*
		 * Compute the products.
		 *   v20 = b0_0*h4_0 + b1_0*h3_0
		 *   v21 = b0_1*h4_0 + b1_1*h3_0
		 *   v22 = b0_0*h4_1 + b1_0*h3_1
		 *   v23 = b0_1*h4_1 + b1_1*h3_1
		 *   v24 = b2_0*h2_0 + b3_0*h1_0
		 *   v25 = b2_1*h2_0 + b3_1*h1_0
		 *   v26 = b2_0*h2_1 + b3_0*h1_1
		 *   v27 = b2_1*h2_1 + b3_1*h1_1
		 */
		vpmsumd(20, 13,  9)
		vpmsumd(21, 13, 10)
		vpmsumd(22, 14,  9)
		vpmsumd(23, 14, 10)
		vpmsumd(24, 15, 11)
		vpmsumd(25, 15, 12)
		vpmsumd(26, 16, 11)
		vpmsumd(27, 16, 12)

		/*
		 * Sum the products into a single 256-bit result in v11:v12.
		 * The cross products (v21, v22, v25, v26) straddle the
		 * 128-bit boundary, so their sum is split with vsldoi and
		 * folded into both halves.
		 */
		vxor(11, 20, 24)
		vxor(12, 23, 27)
		vxor( 9, 21, 22)
		vxor(10, 25, 26)
		vxor(20,  9, 10)
		vsldoi( 9, HB0, 20, 8)
		vsldoi(10, 20, HB0, 8)
		vxor(11, 11, 9)
		vxor(12, 12, 10)

		/*
		 * Fix and reduce in GF(2^128); this is the new y (in v28).
		 */
		SL_256(11, 12)
		REDUCE_F128(28, 11, 12)

		/*
		 * Loop for next group of four blocks.
		 */
		bdnz(loop4)

		/*
		 * Process second chunk, one block at a time.
		 */
	label(chunk1)
		cmpldi(%[num1], 0)
		beq(done)

		mtctr(%[num1])
	label(loop1)
		/*
		 * Load next data block and XOR it into y.
		 */
		lxvw4x(41, 0, %[buf2])
#if BR_POWER8_LE
		FIX_ENDIAN(9)
#endif
		addi(%[buf2], %[buf2], 16)
		vxor(9, 28, 9)

		/*
		 * Split y into doublewords:
		 *   v9  = y_0:y_1
		 *   v10 =   0:y_0
		 *   v11 = y_1:0
		 */
		vsldoi(10, HB0, 9, 8)
		vsldoi(11, 9, HB0, 8)

		/*
		 * Compute products with h:
		 *   v12 = y_0 * h_0
		 *   v13 = y_1 * h_1
		 *   v14 = y_1 * h_0 + y_0 * h_1
		 */
		vpmsumd(14,  9, 17)
		vpmsumd(12, 10, 18)
		vpmsumd(13, 11, 19)

		/*
		 * Propagate v14 into v12:v13 to finalise the product.
		 */
		vsldoi(10, HB0, 14, 8)
		vsldoi(11, 14, HB0, 8)
		vxor(12, 12, 10)
		vxor(13, 13, 11)

		/*
		 * Fix result and reduce into v28 (next value for y).
		 */
		SL_256(12, 13)
		REDUCE_F128(28, 12, 13)
		bdnz(loop1)

	label(done)
		/*
		 * Write back the new y.
		 */
		FIX_ENDIAN(28)
		stxvw4x(60, 0, %[y])

: [buf1] "+b" (buf1), [buf2] "+b" (buf2)
: [y] "b" (y), [h] "b" (h), [num4] "b" (num4), [num1] "b" (num1),
  [cc0] "b" (cc0), [cc1] "b" (cc1), [cc2] "b" (cc2), [cc3] "b" (cc3)
#if BR_POWER8_LE
	, [idx2be] "b" (idx2be)
#endif
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
  "ctr", "memory"
	);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pwr8_get(void)
{
	return &br_ghash_pwr8;
}
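
/*
 * Usage sketch: a caller would typically probe for this implementation
 * and fall back to a portable one when it returns 0, e.g.:
 *
 *	br_ghash gh;
 *
 *	gh = br_ghash_pwr8_get();
 *	if (gh == 0) {
 *		gh = &br_ghash_ctmul;
 *	}
 *	gh(y, h, data, len);
 *
 * (br_ghash_ctmul is one of the portable GHASH implementations declared
 * in bearssl_hash.h.)
 */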

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pwr8_get(void)
{
	return 0;
}

#endif