/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

#if BR_AES_X86NI

/* see bearssl_block.h */
const br_block_ctrcbc_class *
br_aes_x86ni_ctrcbc_get_vtable(void)
{
	return br_aes_x86ni_supported() ? &br_aes_x86ni_ctrcbc_vtable : NULL;
}
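
/*
 * Illustrative sketch (not compiled in): runtime dispatch as a caller
 * might perform it. AES-NI availability is only known at runtime, so
 * callers query the function above and fall back to a portable
 * implementation when it returns NULL. The fallback vtable name
 * (br_aes_big_ctrcbc_vtable) is assumed from bearssl_block.h.
 */
#if 0
static const br_block_ctrcbc_class *
choose_aes_ctrcbc(void)
{
	const br_block_ctrcbc_class *vt;

	vt = br_aes_x86ni_ctrcbc_get_vtable();
	if (vt == NULL) {
		vt = &br_aes_big_ctrcbc_vtable;
	}
	return vt;
}
#endif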

/* see bearssl_block.h */
void
br_aes_x86ni_ctrcbc_init(br_aes_x86ni_ctrcbc_keys *ctx,
	const void *key, size_t len)
{
	ctx->vtable = &br_aes_x86ni_ctrcbc_vtable;
	ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);
}

BR_TARGETS_X86_UP

/* see bearssl_block.h */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_ctr(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *ctr, void *data, size_t len)
{
	unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15];
	__m128i ivx0, ivx1, ivx2, ivx3;
	__m128i erev, zero, one, four, notthree;
	unsigned u;

	buf = data;
	num_rounds = ctx->num_rounds;
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}
	/*
	 * Some SSE constants:
	 *   erev      byte-reversal mask for _mm_shuffle_epi8()
	 *   notthree  0 - 4, i.e. ~3 in each 64-bit lane, used to
	 *             mask out the two low bits of a counter lane.
	 */
	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15);
	zero = _mm_setzero_si128();
	one = _mm_set_epi64x(0, 1);
	four = _mm_set_epi64x(0, 4);
	notthree = _mm_sub_epi64(zero, four);

	/*
	 * Decode the counter in big-endian and pre-increment the other
	 * three counters.
	 */
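	/*
	 * Carry propagation works with a two-instruction trick:
	 * _mm_cmpeq_epi64() sets a 64-bit lane to all-ones (i.e. -1)
	 * when it is zero. If the low lane just wrapped around to
	 * zero, shifting that mask left by 8 bytes moves it into the
	 * high lane, and subtracting -1 there adds the carry into the
	 * upper 64 bits of the 128-bit counter.
	 */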
	ivx0 = _mm_shuffle_epi8(_mm_loadu_si128((void *)ctr), erev);
	ivx1 = _mm_add_epi64(ivx0, one);
	ivx1 = _mm_sub_epi64(ivx1,
		_mm_slli_si128(_mm_cmpeq_epi64(ivx1, zero), 8));
	ivx2 = _mm_add_epi64(ivx1, one);
	ivx2 = _mm_sub_epi64(ivx2,
		_mm_slli_si128(_mm_cmpeq_epi64(ivx2, zero), 8));
	ivx3 = _mm_add_epi64(ivx2, one);
	ivx3 = _mm_sub_epi64(ivx3,
		_mm_slli_si128(_mm_cmpeq_epi64(ivx3, zero), 8));
	while (len > 0) {
		__m128i x0, x1, x2, x3;

		/*
		 * Load counter values; we need to byteswap them because
		 * the specification says that they use big-endian.
		 */
		x0 = _mm_shuffle_epi8(ivx0, erev);
		x1 = _mm_shuffle_epi8(ivx1, erev);
		x2 = _mm_shuffle_epi8(ivx2, erev);
		x3 = _mm_shuffle_epi8(ivx3, erev);

		x0 = _mm_xor_si128(x0, sk[0]);
		x1 = _mm_xor_si128(x1, sk[0]);
		x2 = _mm_xor_si128(x2, sk[0]);
		x3 = _mm_xor_si128(x3, sk[0]);
		x0 = _mm_aesenc_si128(x0, sk[1]);
		x1 = _mm_aesenc_si128(x1, sk[1]);
		x2 = _mm_aesenc_si128(x2, sk[1]);
		x3 = _mm_aesenc_si128(x3, sk[1]);
		x0 = _mm_aesenc_si128(x0, sk[2]);
		x1 = _mm_aesenc_si128(x1, sk[2]);
		x2 = _mm_aesenc_si128(x2, sk[2]);
		x3 = _mm_aesenc_si128(x3, sk[2]);
		x0 = _mm_aesenc_si128(x0, sk[3]);
		x1 = _mm_aesenc_si128(x1, sk[3]);
		x2 = _mm_aesenc_si128(x2, sk[3]);
		x3 = _mm_aesenc_si128(x3, sk[3]);
		x0 = _mm_aesenc_si128(x0, sk[4]);
		x1 = _mm_aesenc_si128(x1, sk[4]);
		x2 = _mm_aesenc_si128(x2, sk[4]);
		x3 = _mm_aesenc_si128(x3, sk[4]);
		x0 = _mm_aesenc_si128(x0, sk[5]);
		x1 = _mm_aesenc_si128(x1, sk[5]);
		x2 = _mm_aesenc_si128(x2, sk[5]);
		x3 = _mm_aesenc_si128(x3, sk[5]);
		x0 = _mm_aesenc_si128(x0, sk[6]);
		x1 = _mm_aesenc_si128(x1, sk[6]);
		x2 = _mm_aesenc_si128(x2, sk[6]);
		x3 = _mm_aesenc_si128(x3, sk[6]);
		x0 = _mm_aesenc_si128(x0, sk[7]);
		x1 = _mm_aesenc_si128(x1, sk[7]);
		x2 = _mm_aesenc_si128(x2, sk[7]);
		x3 = _mm_aesenc_si128(x3, sk[7]);
		x0 = _mm_aesenc_si128(x0, sk[8]);
		x1 = _mm_aesenc_si128(x1, sk[8]);
		x2 = _mm_aesenc_si128(x2, sk[8]);
		x3 = _mm_aesenc_si128(x3, sk[8]);
		x0 = _mm_aesenc_si128(x0, sk[9]);
		x1 = _mm_aesenc_si128(x1, sk[9]);
		x2 = _mm_aesenc_si128(x2, sk[9]);
		x3 = _mm_aesenc_si128(x3, sk[9]);
		if (num_rounds == 10) {
			x0 = _mm_aesenclast_si128(x0, sk[10]);
			x1 = _mm_aesenclast_si128(x1, sk[10]);
			x2 = _mm_aesenclast_si128(x2, sk[10]);
			x3 = _mm_aesenclast_si128(x3, sk[10]);
		} else if (num_rounds == 12) {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x2 = _mm_aesenc_si128(x2, sk[10]);
			x3 = _mm_aesenc_si128(x3, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x2 = _mm_aesenc_si128(x2, sk[11]);
			x3 = _mm_aesenc_si128(x3, sk[11]);
			x0 = _mm_aesenclast_si128(x0, sk[12]);
			x1 = _mm_aesenclast_si128(x1, sk[12]);
			x2 = _mm_aesenclast_si128(x2, sk[12]);
			x3 = _mm_aesenclast_si128(x3, sk[12]);
		} else {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x2 = _mm_aesenc_si128(x2, sk[10]);
			x3 = _mm_aesenc_si128(x3, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x2 = _mm_aesenc_si128(x2, sk[11]);
			x3 = _mm_aesenc_si128(x3, sk[11]);
			x0 = _mm_aesenc_si128(x0, sk[12]);
			x1 = _mm_aesenc_si128(x1, sk[12]);
			x2 = _mm_aesenc_si128(x2, sk[12]);
			x3 = _mm_aesenc_si128(x3, sk[12]);
			x0 = _mm_aesenc_si128(x0, sk[13]);
			x1 = _mm_aesenc_si128(x1, sk[13]);
			x2 = _mm_aesenc_si128(x2, sk[13]);
			x3 = _mm_aesenc_si128(x3, sk[13]);
			x0 = _mm_aesenclast_si128(x0, sk[14]);
			x1 = _mm_aesenclast_si128(x1, sk[14]);
			x2 = _mm_aesenclast_si128(x2, sk[14]);
			x3 = _mm_aesenclast_si128(x3, sk[14]);
		}
		if (len >= 64) {
			x0 = _mm_xor_si128(x0,
				_mm_loadu_si128((void *)(buf +  0)));
			x1 = _mm_xor_si128(x1,
				_mm_loadu_si128((void *)(buf + 16)));
			x2 = _mm_xor_si128(x2,
				_mm_loadu_si128((void *)(buf + 32)));
			x3 = _mm_xor_si128(x3,
				_mm_loadu_si128((void *)(buf + 48)));
			_mm_storeu_si128((void *)(buf +  0), x0);
			_mm_storeu_si128((void *)(buf + 16), x1);
			_mm_storeu_si128((void *)(buf + 32), x2);
			_mm_storeu_si128((void *)(buf + 48), x3);
			buf += 64;
			len -= 64;
		} else {
			unsigned char tmp[64];

			_mm_storeu_si128((void *)(tmp +  0), x0);
			_mm_storeu_si128((void *)(tmp + 16), x1);
			_mm_storeu_si128((void *)(tmp + 32), x2);
			_mm_storeu_si128((void *)(tmp + 48), x3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			switch (len) {
			case 16:
				ivx0 = ivx1;
				break;
			case 32:
				ivx0 = ivx2;
				break;
			case 48:
				ivx0 = ivx3;
				break;
			}
			break;
		}

		/*
		 * Add 4 to each counter value. A carry into the upper
		 * 64-bit word is needed exactly when the low word
		 * wrapped around, i.e. when the addition left it with
		 * a value in 0..3. Detecting that would take an
		 * unsigned comparison with 4, but SSE2+ has only
		 * _signed_ comparisons. Instead, we mask out the low
		 * two bits and check whether the remaining bits are
		 * zero.
		 */
		ivx0 = _mm_add_epi64(ivx0, four);
		ivx1 = _mm_add_epi64(ivx1, four);
		ivx2 = _mm_add_epi64(ivx2, four);
		ivx3 = _mm_add_epi64(ivx3, four);
		ivx0 = _mm_sub_epi64(ivx0,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx0, notthree), zero), 8));
		ivx1 = _mm_sub_epi64(ivx1,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx1, notthree), zero), 8));
		ivx2 = _mm_sub_epi64(ivx2,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx2, notthree), zero), 8));
		ivx3 = _mm_sub_epi64(ivx3,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx3, notthree), zero), 8));
	}

	/*
	 * Write back new counter value. The loop took care to put the
	 * right counter value in ivx0.
	 */
	_mm_storeu_si128((void *)ctr, _mm_shuffle_epi8(ivx0, erev));
}
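
/*
 * Illustrative sketch (not compiled in): plain CTR processing with
 * this implementation. The 16-byte counter block is read, and written
 * back, as a big-endian value. Assumption: as with the other ctrcbc
 * methods, the data length is a multiple of 16 whenever further calls
 * will be made with the same counter state.
 */
#if 0
static void
demo_ctr(const void *key, size_t key_len,
	unsigned char ctr[16], unsigned char *data, size_t len)
{
	br_aes_x86ni_ctrcbc_keys kc;

	br_aes_x86ni_ctrcbc_init(&kc, key, key_len);
	br_aes_x86ni_ctrcbc_ctr(&kc, ctr, data, len);
}
#endif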

/* see bearssl_block.h */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_mac(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *cbcmac, const void *data, size_t len)
{
	const unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15], ivx;
	unsigned u;

	buf = data;
	ivx = _mm_loadu_si128(cbcmac);
	num_rounds = ctx->num_rounds;
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}
	while (len > 0) {
		__m128i x;

		x = _mm_xor_si128(_mm_loadu_si128((void *)buf), ivx);
		x = _mm_xor_si128(x, sk[0]);
		x = _mm_aesenc_si128(x, sk[1]);
		x = _mm_aesenc_si128(x, sk[2]);
		x = _mm_aesenc_si128(x, sk[3]);
		x = _mm_aesenc_si128(x, sk[4]);
		x = _mm_aesenc_si128(x, sk[5]);
		x = _mm_aesenc_si128(x, sk[6]);
		x = _mm_aesenc_si128(x, sk[7]);
		x = _mm_aesenc_si128(x, sk[8]);
		x = _mm_aesenc_si128(x, sk[9]);
		if (num_rounds == 10) {
			x = _mm_aesenclast_si128(x, sk[10]);
		} else if (num_rounds == 12) {
			x = _mm_aesenc_si128(x, sk[10]);
			x = _mm_aesenc_si128(x, sk[11]);
			x = _mm_aesenclast_si128(x, sk[12]);
		} else {
			x = _mm_aesenc_si128(x, sk[10]);
			x = _mm_aesenc_si128(x, sk[11]);
			x = _mm_aesenc_si128(x, sk[12]);
			x = _mm_aesenc_si128(x, sk[13]);
			x = _mm_aesenclast_si128(x, sk[14]);
		}
		ivx = x;
		buf += 16;
		len -= 16;
	}
	_mm_storeu_si128(cbcmac, ivx);
}

/* see bearssl_block.h */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_encrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *ctr, void *cbcmac, void *data, size_t len)
{
	unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15];
	__m128i ivx, cmx;
	__m128i erev, zero, one;
	unsigned u;
	int first_iter;

	num_rounds = ctx->num_rounds;
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}

	/*
	 * Some SSE2 constants.
	 */
	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15);
	zero = _mm_setzero_si128();
	one = _mm_set_epi64x(0, 1);

	/*
	 * Decode the counter in big-endian.
	 */
	ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
	cmx = _mm_loadu_si128(cbcmac);

	buf = data;
	first_iter = 1;
	while (len > 0) {
		__m128i dx, x0, x1;

		/*
		 * Load initial values:
		 *   dx   plaintext block of data
		 *   x0   counter (for CTR encryption)
		 *   x1   input for CBC-MAC (previous MAC value; see
		 *        the note after the rounds below)
		 */
		dx = _mm_loadu_si128((void *)buf);
		x0 = _mm_shuffle_epi8(ivx, erev);
		x1 = cmx;

		x0 = _mm_xor_si128(x0, sk[0]);
		x1 = _mm_xor_si128(x1, sk[0]);
		x0 = _mm_aesenc_si128(x0, sk[1]);
		x1 = _mm_aesenc_si128(x1, sk[1]);
		x0 = _mm_aesenc_si128(x0, sk[2]);
		x1 = _mm_aesenc_si128(x1, sk[2]);
		x0 = _mm_aesenc_si128(x0, sk[3]);
		x1 = _mm_aesenc_si128(x1, sk[3]);
		x0 = _mm_aesenc_si128(x0, sk[4]);
		x1 = _mm_aesenc_si128(x1, sk[4]);
		x0 = _mm_aesenc_si128(x0, sk[5]);
		x1 = _mm_aesenc_si128(x1, sk[5]);
		x0 = _mm_aesenc_si128(x0, sk[6]);
		x1 = _mm_aesenc_si128(x1, sk[6]);
		x0 = _mm_aesenc_si128(x0, sk[7]);
		x1 = _mm_aesenc_si128(x1, sk[7]);
		x0 = _mm_aesenc_si128(x0, sk[8]);
		x1 = _mm_aesenc_si128(x1, sk[8]);
		x0 = _mm_aesenc_si128(x0, sk[9]);
		x1 = _mm_aesenc_si128(x1, sk[9]);
		if (num_rounds == 10) {
			x0 = _mm_aesenclast_si128(x0, sk[10]);
			x1 = _mm_aesenclast_si128(x1, sk[10]);
		} else if (num_rounds == 12) {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenclast_si128(x0, sk[12]);
			x1 = _mm_aesenclast_si128(x1, sk[12]);
		} else {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenc_si128(x0, sk[12]);
			x1 = _mm_aesenc_si128(x1, sk[12]);
			x0 = _mm_aesenc_si128(x0, sk[13]);
			x1 = _mm_aesenc_si128(x1, sk[13]);
			x0 = _mm_aesenclast_si128(x0, sk[14]);
			x1 = _mm_aesenclast_si128(x1, sk[14]);
		}

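		/*
		 * The CBC-MAC runs one block behind the CTR stream:
		 * the ciphertext block that must enter the MAC only
		 * exists once the keystream block is computed, so x1
		 * above encrypted the *previous* MAC input. On the
		 * first iteration there is no previous input; the x1
		 * result is discarded and the initial cbcmac value is
		 * combined directly with the new ciphertext. The final
		 * block encryption that completes the MAC is done in
		 * the len == 0 branch below.
		 */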
		x0 = _mm_xor_si128(x0, dx);
		if (first_iter) {
			cmx = _mm_xor_si128(cmx, x0);
			first_iter = 0;
		} else {
			cmx = _mm_xor_si128(x1, x0);
		}
		_mm_storeu_si128((void *)buf, x0);

		buf += 16;
		len -= 16;

		/*
		 * Increment the counter value.
		 */
		ivx = _mm_add_epi64(ivx, one);
		ivx = _mm_sub_epi64(ivx,
			_mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));

		/*
		 * If this was the last iteration, then compute the
		 * extra block encryption to complete CBC-MAC.
		 */
		if (len == 0) {
			cmx = _mm_xor_si128(cmx, sk[0]);
			cmx = _mm_aesenc_si128(cmx, sk[1]);
			cmx = _mm_aesenc_si128(cmx, sk[2]);
			cmx = _mm_aesenc_si128(cmx, sk[3]);
			cmx = _mm_aesenc_si128(cmx, sk[4]);
			cmx = _mm_aesenc_si128(cmx, sk[5]);
			cmx = _mm_aesenc_si128(cmx, sk[6]);
			cmx = _mm_aesenc_si128(cmx, sk[7]);
			cmx = _mm_aesenc_si128(cmx, sk[8]);
			cmx = _mm_aesenc_si128(cmx, sk[9]);
			if (num_rounds == 10) {
				cmx = _mm_aesenclast_si128(cmx, sk[10]);
			} else if (num_rounds == 12) {
				cmx = _mm_aesenc_si128(cmx, sk[10]);
				cmx = _mm_aesenc_si128(cmx, sk[11]);
				cmx = _mm_aesenclast_si128(cmx, sk[12]);
			} else {
				cmx = _mm_aesenc_si128(cmx, sk[10]);
				cmx = _mm_aesenc_si128(cmx, sk[11]);
				cmx = _mm_aesenc_si128(cmx, sk[12]);
				cmx = _mm_aesenc_si128(cmx, sk[13]);
				cmx = _mm_aesenclast_si128(cmx, sk[14]);
			}
			break;
		}
	}

	/*
	 * Write back new counter value and CBC-MAC value.
	 */
	_mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
	_mm_storeu_si128(cbcmac, cmx);
}
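
/*
 * Illustrative fragment (not compiled in): the fused function above
 * is functionally equivalent to a CTR pass followed by a CBC-MAC pass
 * over the resulting ciphertext (lengths a multiple of 16 assumed),
 * but it interleaves the two AES pipelines and reads the data only
 * once.
 */
#if 0
	/* Fused: */
	br_aes_x86ni_ctrcbc_encrypt(&kc, ctr, cbcmac, data, len);
	/* Equivalent two-pass sequence: */
	br_aes_x86ni_ctrcbc_ctr(&kc, ctr, data, len);
	br_aes_x86ni_ctrcbc_mac(&kc, cbcmac, data, len);
#endif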

/* see bearssl_block.h */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_decrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *ctr, void *cbcmac, void *data, size_t len)
{
	unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15];
	__m128i ivx, cmx;
	__m128i erev, zero, one;
	unsigned u;

	num_rounds = ctx->num_rounds;
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}

	/*
	 * Some SSE2 constants.
	 */
	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15);
	zero = _mm_setzero_si128();
	one = _mm_set_epi64x(0, 1);

	/*
	 * Decode the counter in big-endian.
	 */
	ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
	cmx = _mm_loadu_si128(cbcmac);

	buf = data;
	while (len > 0) {
		__m128i dx, x0, x1;

		/*
		 * Load initial values:
		 *   dx   encrypted block of data
		 *   x0   counter (for CTR encryption)
		 *   x1   CBC-MAC input (current MAC value XOR the
		 *        ciphertext block); unlike encryption, the
		 *        ciphertext is known up front, so the MAC
		 *        needs no one-block delay here.
		 */
		dx = _mm_loadu_si128((void *)buf);
		x0 = _mm_shuffle_epi8(ivx, erev);
		x1 = _mm_xor_si128(cmx, dx);

		x0 = _mm_xor_si128(x0, sk[0]);
		x1 = _mm_xor_si128(x1, sk[0]);
		x0 = _mm_aesenc_si128(x0, sk[1]);
		x1 = _mm_aesenc_si128(x1, sk[1]);
		x0 = _mm_aesenc_si128(x0, sk[2]);
		x1 = _mm_aesenc_si128(x1, sk[2]);
		x0 = _mm_aesenc_si128(x0, sk[3]);
		x1 = _mm_aesenc_si128(x1, sk[3]);
		x0 = _mm_aesenc_si128(x0, sk[4]);
		x1 = _mm_aesenc_si128(x1, sk[4]);
		x0 = _mm_aesenc_si128(x0, sk[5]);
		x1 = _mm_aesenc_si128(x1, sk[5]);
		x0 = _mm_aesenc_si128(x0, sk[6]);
		x1 = _mm_aesenc_si128(x1, sk[6]);
		x0 = _mm_aesenc_si128(x0, sk[7]);
		x1 = _mm_aesenc_si128(x1, sk[7]);
		x0 = _mm_aesenc_si128(x0, sk[8]);
		x1 = _mm_aesenc_si128(x1, sk[8]);
		x0 = _mm_aesenc_si128(x0, sk[9]);
		x1 = _mm_aesenc_si128(x1, sk[9]);
		if (num_rounds == 10) {
			x0 = _mm_aesenclast_si128(x0, sk[10]);
			x1 = _mm_aesenclast_si128(x1, sk[10]);
		} else if (num_rounds == 12) {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenclast_si128(x0, sk[12]);
			x1 = _mm_aesenclast_si128(x1, sk[12]);
		} else {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenc_si128(x0, sk[12]);
			x1 = _mm_aesenc_si128(x1, sk[12]);
			x0 = _mm_aesenc_si128(x0, sk[13]);
			x1 = _mm_aesenc_si128(x1, sk[13]);
			x0 = _mm_aesenclast_si128(x0, sk[14]);
			x1 = _mm_aesenclast_si128(x1, sk[14]);
		}
		x0 = _mm_xor_si128(x0, dx);
		cmx = x1;
		_mm_storeu_si128((void *)buf, x0);

		buf += 16;
		len -= 16;

		/*
		 * Increment the counter value.
		 */
		ivx = _mm_add_epi64(ivx, one);
		ivx = _mm_sub_epi64(ivx,
			_mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));
	}

	/*
	 * Write back new counter value and CBC-MAC value.
	 */
	_mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
	_mm_storeu_si128(cbcmac, cmx);
}

BR_TARGETS_X86_DOWN

/* see bearssl_block.h */
const br_block_ctrcbc_class br_aes_x86ni_ctrcbc_vtable = {
	sizeof(br_aes_x86ni_ctrcbc_keys),
	16,
	4,
	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
		&br_aes_x86ni_ctrcbc_init,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_x86ni_ctrcbc_encrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_x86ni_ctrcbc_decrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, size_t))
		&br_aes_x86ni_ctrcbc_ctr,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, const void *, size_t))
		&br_aes_x86ni_ctrcbc_mac
};
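
/*
 * Illustrative fragment (not compiled in): generic use through the
 * vtable, as done by algorithm-agnostic callers. The context
 * structure's first field is the vtable pointer, which is what the
 * class methods receive; field names are assumed from
 * br_block_ctrcbc_class in bearssl_block.h.
 */
#if 0
	br_aes_x86ni_ctrcbc_keys kc;
	const br_block_ctrcbc_class *vt = &br_aes_x86ni_ctrcbc_vtable;

	vt->init(&kc.vtable, key, key_len);
	vt->encrypt(&kc.vtable, ctr, cbcmac, data, len);
#endif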

#else

/* see bearssl_block.h */
const br_block_ctrcbc_class *
br_aes_x86ni_ctrcbc_get_vtable(void)
{
	return NULL;
}

#endif