/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 *	$FreeBSD$
 *
 */

/*
 * Figure 5, 8 and 12 are copied from the Intel white paper:
 * Intel® Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode
 *
 * and as such are:
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

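/* Returns 1 when the two 128-bit values are bit-for-bit equal, 0 otherwise. */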
static inline int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	return _mm_movemask_epi8(cmp) == 0xffff;
}

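/*
 * _mm_insert_epi64() is not available when building for i386, so emulate
 * it with two 32-bit inserts.
 */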
#ifdef __i386__
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif

/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

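	/*
	 * tmp3..tmp6 hold the four 64x64 carry-less partial products of a
	 * and b; fold the two middle terms into the low (tmp3) and high
	 * (tmp6) halves of the 256-bit product.
	 */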
	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

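	/*
	 * Shift the 256-bit product [tmp6:tmp3] left by one bit to account
	 * for GCM's bit-reflected representation of field elements.
	 */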
	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

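	/* Reduce the 256-bit result modulo x^128 + x^7 + x^2 + x + 1. */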
	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}

/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method
 */
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/* Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3;
	__m128i tmp4, tmp5, tmp6, tmp7;
	__m128i tmp8, tmp9;

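	/* Low halves of the four carry-less products Hn*Xn, XORed together. */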
	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
	lo = _mm_xor_si128(lo, H3_X3_lo);
	lo = _mm_xor_si128(lo, H4_X4_lo);

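	/* High halves of the four products, likewise accumulated. */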
	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
	hi = _mm_xor_si128(hi, H3_X3_hi);
	hi = _mm_xor_si128(hi, H4_X4_hi);

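	/*
	 * Karatsuba middle terms: _mm_shuffle_epi32(v, 78) swaps the two
	 * 64-bit halves, so each tmp below is (v_hi ^ v_lo) of an Hn or Xn.
	 */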
	tmp0 = _mm_shuffle_epi32(H1, 78);
	tmp4 = _mm_shuffle_epi32(X1, 78);
	tmp0 = _mm_xor_si128(tmp0, H1);
	tmp4 = _mm_xor_si128(tmp4, X1);
	tmp1 = _mm_shuffle_epi32(H2, 78);
	tmp5 = _mm_shuffle_epi32(X2, 78);
	tmp1 = _mm_xor_si128(tmp1, H2);
	tmp5 = _mm_xor_si128(tmp5, X2);
	tmp2 = _mm_shuffle_epi32(H3, 78);
	tmp6 = _mm_shuffle_epi32(X3, 78);
	tmp2 = _mm_xor_si128(tmp2, H3);
	tmp6 = _mm_xor_si128(tmp6, X3);
	tmp3 = _mm_shuffle_epi32(H4, 78);
	tmp7 = _mm_shuffle_epi32(X4, 78);
	tmp3 = _mm_xor_si128(tmp3, H4);
	tmp7 = _mm_xor_si128(tmp7, X4);

	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

	tmp0 = _mm_xor_si128(tmp0, lo);
	tmp0 = _mm_xor_si128(tmp0, hi);
	tmp0 = _mm_xor_si128(tmp1, tmp0);
	tmp0 = _mm_xor_si128(tmp2, tmp0);
	tmp0 = _mm_xor_si128(tmp3, tmp0);

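	/*
	 * tmp0 now holds the aggregated Karatsuba middle term; fold it into
	 * lo/hi to form the 256-bit product, then shift left by one bit and
	 * reduce modulo x^128 + x^7 + x^2 + x + 1, as in gfmul().
	 */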
	tmp4 = _mm_slli_si128(tmp0, 8);
	tmp0 = _mm_srli_si128(tmp0, 8);

	lo = _mm_xor_si128(tmp4, lo);
	hi = _mm_xor_si128(tmp0, hi);

	tmp3 = lo;
	tmp6 = hi;

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}

/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Reduction Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(P) <= 2^39 - 256 bits, i.e.
 * 2^36 - 32 bytes of plaintext.
 */
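/*
 * in/out point to nbytes of plaintext/ciphertext, addt to abytes of
 * additional authenticated data, and ivec to an ibytes-long IV.  key is
 * the expanded AES key schedule (nr rounds, nr + 1 round keys); the
 * 16-byte authentication tag is written to tag.
 */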
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	__m128i *KEY = (__m128i*)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

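	/*
	 * With the recommended 96-bit IV, Y0 is IV || 0^31 || 1, and
	 * H = E(K, 0^128) and E(K, Y0) are computed in one interleaved pass
	 * over the key schedule.  For any other IV length, Y0 is the GHASH
	 * of the zero-padded IV followed by its bit length.
	 */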
	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((__m128i*)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E(K, 0) and E(K, Y0) together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

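	/* Precompute H^2, H^3 and H^4 for the aggregated 4-block reduction. */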
	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

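	/*
	 * GHASH the additional authenticated data: four blocks at a time
	 * where possible, then any remaining whole and partial blocks.
	 */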
	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((__m128i*)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((__m128i*)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((__m128i*)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

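	/*
	 * Prepare eight consecutive counter blocks.  Counters are kept
	 * byte-swapped (BSWAP_EPI64) so they can be advanced with plain
	 * 64-bit adds; they are swapped back before being encrypted.
	 */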
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

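	/*
	 * Main loop: CTR-encrypt eight blocks per iteration, then fold the
	 * eight ciphertext blocks into GHASH as two aggregated four-block
	 * reductions.
	 */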
	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
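	/* Any remaining whole blocks are handled one at a time. */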
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	//If one incomplete block remains
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
		for ((void)j; j<16; j++)
			((unsigned char*)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
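	/*
	 * Finish GHASH with the length block (bit lengths of the ciphertext
	 * and the AAD) and form the tag T = E(K, Y0) XOR GHASH.
	 */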
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i*)tag, T);
}

/* My modification of _encrypt to be _decrypt */
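/*
 * Same parameters as AES_GCM_encrypt(), except that tag holds the expected
 * 16-byte tag.  Returns 1 on success; returns 0 and writes no plaintext if
 * authentication fails.
 */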
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	__m128i *KEY = (__m128i*)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((__m128i*)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E(K, 0) and E(K, Y0) together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((__m128i*)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((__m128i*)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((__m128i*)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* GHASH the ciphertext so the tag can be verified before decrypting */
	for (i = 0; i<nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((__m128i*)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((__m128i*)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((__m128i*)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((__m128i*)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i<nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((__m128i*)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<nbytes%16; j++)
			((unsigned char*)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
		return 0; //authentication failed

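	/*
	 * The tag matched; CTR-decrypt the ciphertext.  GHASH was already
	 * taken over the ciphertext above, so the loops below only decrypt.
	 */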
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
	}
	//If one incomplete block remains
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
	}
	return 1; //returns 1 when successful
}