/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 *	$FreeBSD$
 *
 */

/*
 * Figure 5, 8 and 12 are copied from the Intel white paper:
 * Intel® Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode
 *
 * and as such are:
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

static inline int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	return _mm_movemask_epi8(cmp) == 0xffff;
}

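/*
 * Note: the PINSRQ form of _mm_insert_epi64 is only encodable in 64-bit
 * mode, so on i386 the 64-bit insert below is emulated with two 32-bit
 * inserts.
 */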
#ifdef __i386__
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif

/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
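/*
 * gfmul() computes *res = a * b in GF(2^128), reduced by the GCM polynomial
 * g(x) = x^128 + x^7 + x^2 + x + 1.  The operands are in the byte-swapped
 * (bit-reflected) form GHASH uses, which is why the 256-bit carry-less
 * product is shifted left by one bit before the reduction.
 */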
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	/* 256-bit carry-less product of a and b (schoolbook, four PCLMULQDQs) */
	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

	/* shift the 256-bit result [tmp6:tmp3] left by one bit */
	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	/* reduce modulo x^128 + x^7 + x^2 + x + 1 */
	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}

/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method
 */
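/*
 * reduce4() performs a four-block GHASH step in a single pass:
 *
 *	*res = H1*X1 ^ H2*X2 ^ H3*X3 ^ H4*X4	(in GF(2^128))
 *
 * The four carry-less products are accumulated first and only one modular
 * reduction is done at the end, which is what makes this cheaper than four
 * separate gfmul() calls.
 */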
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3;
	__m128i tmp4, tmp5, tmp6, tmp7;
	__m128i tmp8, tmp9;

	/* low 64x64 halves of the four Hi*Xi products */
	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
	lo = _mm_xor_si128(lo, H3_X3_lo);
	lo = _mm_xor_si128(lo, H4_X4_lo);

	/* high 64x64 halves */
	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
	hi = _mm_xor_si128(hi, H3_X3_hi);
	hi = _mm_xor_si128(hi, H4_X4_hi);

	/* Karatsuba-style middle terms */
	tmp0 = _mm_shuffle_epi32(H1, 78);
	tmp4 = _mm_shuffle_epi32(X1, 78);
	tmp0 = _mm_xor_si128(tmp0, H1);
	tmp4 = _mm_xor_si128(tmp4, X1);
	tmp1 = _mm_shuffle_epi32(H2, 78);
	tmp5 = _mm_shuffle_epi32(X2, 78);
	tmp1 = _mm_xor_si128(tmp1, H2);
	tmp5 = _mm_xor_si128(tmp5, X2);
	tmp2 = _mm_shuffle_epi32(H3, 78);
	tmp6 = _mm_shuffle_epi32(X3, 78);
	tmp2 = _mm_xor_si128(tmp2, H3);
	tmp6 = _mm_xor_si128(tmp6, X3);
	tmp3 = _mm_shuffle_epi32(H4, 78);
	tmp7 = _mm_shuffle_epi32(X4, 78);
	tmp3 = _mm_xor_si128(tmp3, H4);
	tmp7 = _mm_xor_si128(tmp7, X4);

	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

	tmp0 = _mm_xor_si128(tmp0, lo);
	tmp0 = _mm_xor_si128(tmp0, hi);
	tmp0 = _mm_xor_si128(tmp1, tmp0);
	tmp0 = _mm_xor_si128(tmp2, tmp0);
	tmp0 = _mm_xor_si128(tmp3, tmp0);

	tmp4 = _mm_slli_si128(tmp0, 8);
	tmp0 = _mm_srli_si128(tmp0, 8);

	lo = _mm_xor_si128(tmp4, lo);
	hi = _mm_xor_si128(tmp0, hi);

	/* shift [hi:lo] left by one bit, then reduce (same as gfmul()) */
	tmp3 = lo;
	tmp6 = hi;

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}

/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Reduction Every Four Blocks
 */
/*
 * per NIST SP 800-38D, 5.2.1.1, len(p) <= 2^39-256 bits, i.e.
 * 2^36-32 bytes.
 */
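/*
 * Illustrative sketch of that limit check (an assumption, not part of this
 * file's interface; the variable name and error value are hypothetical): a
 * caller holding a size_t length would enforce the bound before narrowing
 * it to the uint32_t nbytes argument, e.g.
 *
 *	if (len > ((uint64_t)1 << 36) - 32)
 *		return (EMSGSIZE);
 */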
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j <nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

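	/*
	 * Precompute H^2, H^3 and H^4 so reduce4() can fold four blocks
	 * into the hash per modular reduction.
	 */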
	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X =_mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

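	/*
	 * Derive eight consecutive counter blocks from Y0; the main loop
	 * below produces eight blocks of keystream per iteration.
	 */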
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
		tmp8 =_mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	//If one incomplete block remains
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
		for ((void)j; j<16; j++)
			((unsigned char*)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* final GHASH block: the bit lengths of the data and the AAD */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i*)tag, T);
}

/* My modification of _encrypt to be _decrypt */
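/*
 * The decrypt path first runs GHASH over the additional data and the
 * ciphertext, compares the computed tag against the one supplied by the
 * caller, and only performs the CTR decryption when they match.  Returns 1
 * on success, 0 on authentication failure.
 */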
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j <nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X =_mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	/* This is where we validate the ciphertext before decrypting */
	for (i = 0; i<nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i<nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<nbytes%16; j++)
			((unsigned char*)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
		return 0; //authentication failed

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
		tmp8 =_mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
	}
	//If one incomplete block remains
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
	}
	return 1; //success
}
