/*	$NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $");

#ifdef _KERNEL
#include <sys/systm.h>
#include <lib/libkern/libkern.h>
#else
#include <err.h>
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#define	KASSERT			assert
#define	panic(fmt, args...)	err(1, fmt, ##args)
#endif

#include <crypto/aes/aes.h>
#include <crypto/aes/arch/x86/aes_sse2.h>

#include "aes_sse2_impl.h"

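/*
 * Round-key layout (a sketch of the BearSSL-derived bitsliced format
 * this file drives): aes_sse2_keysched produces a compressed schedule
 * of two 64-bit words per round key, i.e. at most 2*(14 + 1) = 30
 * words for AES-256, which is why callers pass rk[static 30].  Each
 * operation below expands that on the fly with aes_sse2_skey_expand
 * into eight words per round key, hence the uint64_t sk_exp[120]
 * temporaries.
 */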
void
aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds)
{
	size_t key_len;

	switch (nrounds) {
	case 10:
		key_len = 16;
		break;
	case 12:
		key_len = 24;
		break;
	case 14:
		key_len = 32;
		break;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}

	aes_sse2_keysched(rk, key, key_len);
}

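/*
 * The bitsliced core always works on four blocks in parallel.  A
 * single-block encryption or decryption therefore fills only q[0] and
 * leaves the remaining three slots as zero "garbage" blocks; three
 * quarters of the work is wasted, but the result is still correct and
 * constant-time.
 */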
void
aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, encrypt, transform from bitslice.  */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block.  */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, decrypt, transform from bitslice.  */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block.  */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load the IV.  */
	cv = _mm_loadu_epi8(iv);

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* Load input block and apply CV.  */
		q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in));

		/* Transform to bitslice, encrypt, transform from bitslice.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Remember ciphertext as CV and store output block.  */
		cv = aes_sse2_interleave_out(q[0]);
		_mm_storeu_epi8(out, cv);
	}

	/* Store updated IV.  */
	_mm_storeu_epi8(iv, cv);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

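/*
 * CBC decryption, unlike CBC encryption, is parallelizable: each
 * plaintext block depends only on two ciphertext blocks.  So we walk
 * the buffer backwards in groups of four blocks to fill all four
 * bitslice slots, keeping the block that will serve as the next
 * group's chaining value in cv.  If the block count is not a multiple
 * of four, the trailing 1-3 blocks are peeled off first.
 */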
void
aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv, iv, w;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load the IV.  */
	iv = _mm_loadu_epi8(ivp);

	/* Load the last cipher block.  */
	cv = _mm_loadu_epi8(in + nbytes - 16);

	/* Store the updated IV.  */
	_mm_storeu_epi8(ivp, cv);

	/* Peel off the trailing blocks if not a multiple of four.  */
	if (nbytes % (4*16)) {
		unsigned n = (nbytes/16) % 4;

		KASSERT(n > 0);
		KASSERT(n < 4);

		q[1] = q[2] = q[3] = _mm_setzero_si128();
		q[n - 1] = aes_sse2_interleave_in(cv);
		switch (nbytes % 64) {
		case 48:
			w = _mm_loadu_epi8(in + nbytes - 32);
			q[1] = aes_sse2_interleave_in(w);
			w = _mm_loadu_epi8(in + nbytes - 48);
			q[0] = aes_sse2_interleave_in(w);
			break;
		case 32:
			w = _mm_loadu_epi8(in + nbytes - 32);
			q[0] = aes_sse2_interleave_in(w);
			break;
		case 16:
			break;
		}

		/* Decrypt.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		do {
			n--;
			w = aes_sse2_interleave_out(q[n]);
			if ((nbytes -= 16) == 0)
				goto out;
			cv = _mm_loadu_epi8(in + nbytes - 16);
			_mm_storeu_epi8(out + nbytes, w ^ cv);
		} while (n);
	}

	for (;;) {
		KASSERT(nbytes >= 64);
		nbytes -= 64;

		/*
		 * 1. Set up the last cipher block of this group from cv.
		 * 2. Load the three earlier cipher blocks and set them up.
		 * 3. Decrypt all four.
		 */
		q[3] = aes_sse2_interleave_in(cv);

		w = _mm_loadu_epi8(in + nbytes + 4*8);
		q[2] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*4);
		q[1] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*0);
		q[0] = aes_sse2_interleave_in(w);

		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the upper output block.  */
		w = aes_sse2_interleave_out(q[3]);
		cv = _mm_loadu_epi8(in + nbytes + 4*8);
		_mm_storeu_epi8(out + nbytes + 4*12, w ^ cv);

		/* Store the middle output blocks.  */
		w = aes_sse2_interleave_out(q[2]);
		cv = _mm_loadu_epi8(in + nbytes + 4*4);
		_mm_storeu_epi8(out + nbytes + 4*8, w ^ cv);

		w = aes_sse2_interleave_out(q[1]);
		cv = _mm_loadu_epi8(in + nbytes + 4*0);
		_mm_storeu_epi8(out + nbytes + 4*4, w ^ cv);

		/*
		 * Get the first output block, but don't load the CV
		 * yet -- it might be the previous ciphertext block, or
		 * it might be the IV.
		 */
		w = aes_sse2_interleave_out(q[0]);

		/* Stop if we've reached the first output block.  */
		if (nbytes == 0)
			goto out;

		/*
		 * Load the preceding cipher block, and apply it as the
		 * chaining value to this one.
		 */
		cv = _mm_loadu_epi8(in + nbytes - 16);
		_mm_storeu_epi8(out + nbytes, w ^ cv);
	}

out:	/* Store the first output block.  */
	_mm_storeu_epi8(out, w ^ iv);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

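/*
 * aes_sse2_xts_update(t)
 *
 *	Update an XTS tweak: multiply t, viewed as a 128-bit
 *	little-endian polynomial, by x, reducing modulo the XTS
 *	polynomial x^128 + x^7 + x^2 + x + 1 (hence the 0x87 feedback
 *	byte).  Done branch-free: each 64-bit lane is shifted left by
 *	one, and the bits shifted out of each lane are propagated by
 *	building a mask from the lanes' old sign bits, swapping its
 *	halves, and using it to conditionally XOR in 1 (carry from the
 *	low lane into the high lane) or 0x87 (reduction of the carry
 *	out of the high lane).
 */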
static inline __m128i
aes_sse2_xts_update(__m128i t)
{
	const __m128i one = _mm_set_epi64x(1, 1);
	__m128i s, m, c;

	s = _mm_srli_epi64(t, 63);	/* 1 if high bit set else 0 */
	m = _mm_sub_epi64(s, one);	/* 0 if high bit set else -1 */
	m = _mm_shuffle_epi32(m, 0x4e);	/* swap halves */
	c = _mm_set_epi64x(1, 0x87);	/* carry */

	return _mm_slli_epi64(t, 1) ^ (c & ~m);
}

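/*
 * Self-test for the tweak update.  Each vector gives the tweak as four
 * 32-bit little-endian words, least significant first; e.g., a 1 in
 * word 0 must double to 2, and a tweak with only bit 127 set must
 * reduce to 0x87.
 */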
static int
aes_sse2_xts_update_selftest(void)
{
	static const struct {
		uint32_t in[4], out[4];
	} cases[] = {
		[0] = { {1}, {2} },
		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
	};
	unsigned i;
	uint32_t t[4];
	int result = 0;

	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
		t[0] = cases[i].in[0];
		t[1] = cases[i].in[1];
		t[2] = cases[i].in[2];
		t[3] = cases[i].in[3];
		_mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t)));
		if (t[0] != cases[i].out[0] ||
		    t[1] != cases[i].out[1] ||
		    t[2] != cases[i].out[2] ||
		    t[3] != cases[i].out[3]) {
			printf("%s %u:"
			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
			    __func__, i, t[0], t[1], t[2], t[3]);
			result = -1;
		}
	}

	return result;
}

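/*
 * XTS encryption: t[0] holds the tweak for the current block; each
 * group computes the following tweaks t[1..4] with aes_sse2_xts_update
 * so that four consecutive blocks can be masked, encrypted in the four
 * bitslice slots, and masked again.  A leading group of 1-3 blocks is
 * peeled off first when the block count is not a multiple of four.
 */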
void
aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
	unsigned i;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load tweak.  */
	t[0] = _mm_loadu_epi8(tweak);

	/* Handle the leading 1-3 blocks first if not a multiple of four.  */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Encrypt up to four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs.  */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Encrypt four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak.  */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);
}

void
aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
	unsigned i;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load tweak.  */
	t[0] = _mm_loadu_epi8(tweak);

	/* Handle the leading 1-3 blocks first if not a multiple of four.  */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Decrypt up to four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs.  */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Decrypt four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak.  */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);
}

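/*
 * CBC-MAC is inherently sequential -- each block must be encrypted
 * before the next can be absorbed -- so only bitslice slot q[0] does
 * useful work here; the cost per block is that of a full four-block
 * batch.
 */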
void
aes_sse2_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
    size_t nbytes, uint8_t auth[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load initial authenticator.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(auth));

	for (; nbytes; nbytes -= 16, in += 16) {
		q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in));
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);
	}

	/* Store updated authenticator.  */
	_mm_storeu_epi8(auth, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

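/*
 * CCM encryption: the CBC-MAC authenticator rides in slot q[0] and the
 * CTR keystream block in slot q[1], so each pass through the bitsliced
 * core does both jobs for one block of data.  authctr[0..15] is the
 * authenticator and authctr[16..31] the counter block; the last four
 * bytes of the counter block are treated as a 32-bit big-endian
 * counter and incremented once per block.
 */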
void
aes_sse2_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i ctr;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Set first block to authenticator.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr));

	/*
	 * Load the initial counter block.  Only the last word is
	 * decoded big-endian so we can increment it; the other words
	 * pass through unchanged.
	 */
	c0 = le32dec(authctr + 16 + 4*0);
	c1 = le32dec(authctr + 16 + 4*1);
	c2 = le32dec(authctr + 16 + 4*2);
	c3 = be32dec(authctr + 16 + 4*3);

	/* Set the other blocks to garbage -- CCM is sequential, so we
	 * can't take advantage of the spare slots.  */
	q[2] = q[3] = _mm_setzero_si128();

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* Update authenticator.  */
		q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in));

		/* Increment 32-bit counter.  */
		ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
		q[1] = aes_sse2_interleave_in(ctr);

		/* Encrypt authenticator and counter.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Encrypt with CTR output.  */
		_mm_storeu_epi8(out,
		    _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[1]));
	}

	/* Update authenticator.  */
	_mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[0]));

	/* Update counter.  */
	be32enc(authctr + 16 + 4*3, c3);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

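/*
 * CCM decryption: a plaintext block can only be authenticated after it
 * has been decrypted, so the loop is skewed by one block -- each pass
 * through the bitsliced core encrypts the CTR block for the next
 * message block in q[0] while absorbing the just-decrypted plaintext
 * into the CBC-MAC in q[1], and a final pass after the loop absorbs
 * the last plaintext block.
 */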
void
aes_sse2_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i ctr, block;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/*
	 * Load the initial counter block.  Only the last word is
	 * decoded big-endian so we can increment it; the other words
	 * pass through unchanged.
	 */
	c0 = le32dec(authctr + 16 + 4*0);
	c1 = le32dec(authctr + 16 + 4*1);
	c2 = le32dec(authctr + 16 + 4*2);
	c3 = be32dec(authctr + 16 + 4*3);

	/* Increment 32-bit counter.  */
	ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
	q[0] = aes_sse2_interleave_in(ctr);

	/*
	 * Set the other blocks to garbage -- we don't have any
	 * plaintext to authenticate yet.
	 */
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Encrypt first CTR.  */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Load the initial authenticator.  */
	q[1] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr));

	for (;; in += 16, out += 16) {
		/* Decrypt the block.  */
		block = _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[0]);

		/* Update authenticator.  */
		q[1] ^= aes_sse2_interleave_in(block);

		/* Store plaintext.  */
		_mm_storeu_epi8(out, block);

		/* If this is the last block, stop.  */
		if ((nbytes -= 16) == 0)
			break;

		/* Increment 32-bit counter.  */
		ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
		q[0] = aes_sse2_interleave_in(ctr);

		/* Authenticate previous plaintext, encrypt next CTR.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);
	}

	/*
	 * Authenticate last plaintext.  We're only doing this for the
	 * authenticator, not for the counter, so don't bother to
	 * initialize q[0], q[2], q[3].  (Even for the sake of
	 * sanitizers, they're already initialized to something by
	 * now.)
	 */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Update authenticator.  */
	_mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[1]));

	/* Update counter.  */
	be32enc(authctr + 16 + 4*3, c3);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

int
aes_sse2_selftest(void)
{

	if (aes_sse2_xts_update_selftest())
		return -1;

	/* XXX test aes_sse2_bitslice_decrypt */
	/* XXX test aes_sse2_bitslice_encrypt */
	/* XXX test aes_sse2_keysched */
	/* XXX test aes_sse2_ortho */
	/* XXX test aes_sse2_skey_expand */

	return 0;
}