/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 * $FreeBSD$
 *
 */

/*
 * Figure 5, 8 and 12 are copied from the Intel white paper:
 * Intel(R) Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode
 *
 * and as such are:
 * Copyright (C) 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.
/*
 * IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

/*
 * Compare two 128-bit vectors for exact equality.
 *
 * Returns 1 when all 128 bits of a and b match, 0 otherwise.
 *
 * NOTE(review): this comparison is not constant-time; its timing depends
 * on nothing data-dependent here, but callers using it to verify
 * authentication tags should confirm that is acceptable for their threat
 * model.
 */
static inline int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	/* Each 32-bit lane becomes all-ones where a and b agree. */
	cmp = _mm_cmpeq_epi32(a, b);

	/* All 16 byte-mask bits set <=> every lane compared equal. */
	return _mm_movemask_epi8(cmp) == 0xffff;
}

#ifdef __i386__
/*
 * SSE4.1's _mm_insert_epi64 is only available when compiling for 64-bit
 * mode; emulate it on i386 with two 32-bit inserts.  ndx selects the low
 * (0) or high (non-zero) 64-bit half of the destination.
 */
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif

/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */
Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */ 109275732Sjmgstatic void 110275732Sjmggfmul(__m128i a, __m128i b, __m128i *res) 111275732Sjmg{ 112275732Sjmg __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; 113275732Sjmg 114275732Sjmg tmp3 = _mm_clmulepi64_si128(a, b, 0x00); 115275732Sjmg tmp4 = _mm_clmulepi64_si128(a, b, 0x10); 116275732Sjmg tmp5 = _mm_clmulepi64_si128(a, b, 0x01); 117275732Sjmg tmp6 = _mm_clmulepi64_si128(a, b, 0x11); 118275732Sjmg 119275732Sjmg tmp4 = _mm_xor_si128(tmp4, tmp5); 120275732Sjmg tmp5 = _mm_slli_si128(tmp4, 8); 121275732Sjmg tmp4 = _mm_srli_si128(tmp4, 8); 122275732Sjmg tmp3 = _mm_xor_si128(tmp3, tmp5); 123275732Sjmg tmp6 = _mm_xor_si128(tmp6, tmp4); 124275732Sjmg 125275732Sjmg tmp7 = _mm_srli_epi32(tmp3, 31); 126275732Sjmg tmp8 = _mm_srli_epi32(tmp6, 31); 127275732Sjmg tmp3 = _mm_slli_epi32(tmp3, 1); 128275732Sjmg tmp6 = _mm_slli_epi32(tmp6, 1); 129275732Sjmg 130275732Sjmg tmp9 = _mm_srli_si128(tmp7, 12); 131275732Sjmg tmp8 = _mm_slli_si128(tmp8, 4); 132275732Sjmg tmp7 = _mm_slli_si128(tmp7, 4); 133275732Sjmg tmp3 = _mm_or_si128(tmp3, tmp7); 134275732Sjmg tmp6 = _mm_or_si128(tmp6, tmp8); 135275732Sjmg tmp6 = _mm_or_si128(tmp6, tmp9); 136275732Sjmg 137275732Sjmg tmp7 = _mm_slli_epi32(tmp3, 31); 138275732Sjmg tmp8 = _mm_slli_epi32(tmp3, 30); 139275732Sjmg tmp9 = _mm_slli_epi32(tmp3, 25); 140275732Sjmg 141275732Sjmg tmp7 = _mm_xor_si128(tmp7, tmp8); 142275732Sjmg tmp7 = _mm_xor_si128(tmp7, tmp9); 143275732Sjmg tmp8 = _mm_srli_si128(tmp7, 4); 144275732Sjmg tmp7 = _mm_slli_si128(tmp7, 12); 145275732Sjmg tmp3 = _mm_xor_si128(tmp3, tmp7); 146275732Sjmg 147275732Sjmg tmp2 = _mm_srli_epi32(tmp3, 1); 148275732Sjmg tmp4 = _mm_srli_epi32(tmp3, 2); 149275732Sjmg tmp5 = _mm_srli_epi32(tmp3, 7); 150275732Sjmg tmp2 = _mm_xor_si128(tmp2, tmp4); 151275732Sjmg tmp2 = _mm_xor_si128(tmp2, tmp5); 152275732Sjmg tmp2 = _mm_xor_si128(tmp2, tmp8); 153275732Sjmg tmp3 = _mm_xor_si128(tmp3, tmp2); 154275732Sjmg tmp6 = _mm_xor_si128(tmp6, 
tmp3); 155275732Sjmg 156275732Sjmg *res = tmp6; 157275732Sjmg} 158275732Sjmg 159275732Sjmg/* 160275732Sjmg * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction 161275732Sjmg * Method */ 162275732Sjmgstatic void 163275732Sjmgreduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4, 164275732Sjmg __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res) 165275732Sjmg{ 166275732Sjmg /*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/ 167275732Sjmg __m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo, 168275732Sjmg H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi; 169275732Sjmg __m128i tmp0, tmp1, tmp2, tmp3; 170275732Sjmg __m128i tmp4, tmp5, tmp6, tmp7; 171275732Sjmg __m128i tmp8, tmp9; 172275732Sjmg 173275732Sjmg H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00); 174275732Sjmg H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00); 175275732Sjmg H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00); 176275732Sjmg H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00); 177275732Sjmg 178275732Sjmg lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo); 179275732Sjmg lo = _mm_xor_si128(lo, H3_X3_lo); 180275732Sjmg lo = _mm_xor_si128(lo, H4_X4_lo); 181275732Sjmg 182275732Sjmg H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11); 183275732Sjmg H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11); 184275732Sjmg H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11); 185275732Sjmg H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11); 186275732Sjmg 187275732Sjmg hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi); 188275732Sjmg hi = _mm_xor_si128(hi, H3_X3_hi); 189275732Sjmg hi = _mm_xor_si128(hi, H4_X4_hi); 190275732Sjmg 191275732Sjmg tmp0 = _mm_shuffle_epi32(H1, 78); 192275732Sjmg tmp4 = _mm_shuffle_epi32(X1, 78); 193275732Sjmg tmp0 = _mm_xor_si128(tmp0, H1); 194275732Sjmg tmp4 = _mm_xor_si128(tmp4, X1); 195275732Sjmg tmp1 = _mm_shuffle_epi32(H2, 78); 196275732Sjmg tmp5 = _mm_shuffle_epi32(X2, 78); 197275732Sjmg tmp1 = _mm_xor_si128(tmp1, H2); 198275732Sjmg tmp5 = _mm_xor_si128(tmp5, X2); 199275732Sjmg tmp2 = _mm_shuffle_epi32(H3, 78); 
200275732Sjmg tmp6 = _mm_shuffle_epi32(X3, 78); 201275732Sjmg tmp2 = _mm_xor_si128(tmp2, H3); 202275732Sjmg tmp6 = _mm_xor_si128(tmp6, X3); 203275732Sjmg tmp3 = _mm_shuffle_epi32(H4, 78); 204275732Sjmg tmp7 = _mm_shuffle_epi32(X4, 78); 205275732Sjmg tmp3 = _mm_xor_si128(tmp3, H4); 206275732Sjmg tmp7 = _mm_xor_si128(tmp7, X4); 207275732Sjmg 208275732Sjmg tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00); 209275732Sjmg tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00); 210275732Sjmg tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00); 211275732Sjmg tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00); 212275732Sjmg 213275732Sjmg tmp0 = _mm_xor_si128(tmp0, lo); 214275732Sjmg tmp0 = _mm_xor_si128(tmp0, hi); 215275732Sjmg tmp0 = _mm_xor_si128(tmp1, tmp0); 216275732Sjmg tmp0 = _mm_xor_si128(tmp2, tmp0); 217275732Sjmg tmp0 = _mm_xor_si128(tmp3, tmp0); 218275732Sjmg 219275732Sjmg tmp4 = _mm_slli_si128(tmp0, 8); 220275732Sjmg tmp0 = _mm_srli_si128(tmp0, 8); 221275732Sjmg 222275732Sjmg lo = _mm_xor_si128(tmp4, lo); 223275732Sjmg hi = _mm_xor_si128(tmp0, hi); 224275732Sjmg 225275732Sjmg tmp3 = lo; 226275732Sjmg tmp6 = hi; 227275732Sjmg 228275732Sjmg tmp7 = _mm_srli_epi32(tmp3, 31); 229275732Sjmg tmp8 = _mm_srli_epi32(tmp6, 31); 230275732Sjmg tmp3 = _mm_slli_epi32(tmp3, 1); 231275732Sjmg tmp6 = _mm_slli_epi32(tmp6, 1); 232275732Sjmg 233275732Sjmg tmp9 = _mm_srli_si128(tmp7, 12); 234275732Sjmg tmp8 = _mm_slli_si128(tmp8, 4); 235275732Sjmg tmp7 = _mm_slli_si128(tmp7, 4); 236275732Sjmg tmp3 = _mm_or_si128(tmp3, tmp7); 237275732Sjmg tmp6 = _mm_or_si128(tmp6, tmp8); 238275732Sjmg tmp6 = _mm_or_si128(tmp6, tmp9); 239275732Sjmg 240275732Sjmg tmp7 = _mm_slli_epi32(tmp3, 31); 241275732Sjmg tmp8 = _mm_slli_epi32(tmp3, 30); 242275732Sjmg tmp9 = _mm_slli_epi32(tmp3, 25); 243275732Sjmg 244275732Sjmg tmp7 = _mm_xor_si128(tmp7, tmp8); 245275732Sjmg tmp7 = _mm_xor_si128(tmp7, tmp9); 246275732Sjmg tmp8 = _mm_srli_si128(tmp7, 4); 247275732Sjmg tmp7 = _mm_slli_si128(tmp7, 12); 248275732Sjmg tmp3 = 
_mm_xor_si128(tmp3, tmp7); 249275732Sjmg 250275732Sjmg tmp2 = _mm_srli_epi32(tmp3, 1); 251275732Sjmg tmp4 = _mm_srli_epi32(tmp3, 2); 252275732Sjmg tmp5 = _mm_srli_epi32(tmp3, 7); 253275732Sjmg tmp2 = _mm_xor_si128(tmp2, tmp4); 254275732Sjmg tmp2 = _mm_xor_si128(tmp2, tmp5); 255275732Sjmg tmp2 = _mm_xor_si128(tmp2, tmp8); 256275732Sjmg tmp3 = _mm_xor_si128(tmp3, tmp2); 257275732Sjmg tmp6 = _mm_xor_si128(tmp6, tmp3); 258275732Sjmg 259275732Sjmg *res = tmp6; 260275732Sjmg} 261275732Sjmg 262275732Sjmg/* 263275732Sjmg * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated 264275732Sjmg * Every Four Blocks 265275732Sjmg */ 266275732Sjmg/* 267275732Sjmg * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or 268275732Sjmg * 2^32-256*8*16 bytes. 269275732Sjmg */ 270275732Sjmgvoid 271275732SjmgAES_GCM_encrypt(const unsigned char *in, unsigned char *out, 272275732Sjmg const unsigned char *addt, const unsigned char *ivec, 273275732Sjmg unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes, 274275732Sjmg const unsigned char *key, int nr) 275275732Sjmg{ 276275732Sjmg int i, j ,k; 277275732Sjmg __m128i tmp1, tmp2, tmp3, tmp4; 278275732Sjmg __m128i tmp5, tmp6, tmp7, tmp8; 279275732Sjmg __m128i H, H2, H3, H4, Y, T; 280275732Sjmg __m128i *KEY = (__m128i*)key; 281275732Sjmg __m128i ctr1, ctr2, ctr3, ctr4; 282275732Sjmg __m128i ctr5, ctr6, ctr7, ctr8; 283275732Sjmg __m128i last_block = _mm_setzero_si128(); 284275732Sjmg __m128i ONE = _mm_set_epi32(0, 1, 0, 0); 285275732Sjmg __m128i EIGHT = _mm_set_epi32(0, 8, 0, 0); 286275732Sjmg __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6, 287275732Sjmg 7); 288275732Sjmg __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14, 289275732Sjmg 15); 290275732Sjmg __m128i X = _mm_setzero_si128(); 291275732Sjmg 292275732Sjmg if (ibytes == 96/8) { 293275732Sjmg Y = _mm_loadu_si128((__m128i*)ivec); 294275732Sjmg Y = _mm_insert_epi32(Y, 0x1000000, 3); 295275732Sjmg /*(Compute 
E[ZERO, KS] and E[Y0, KS] together*/ 296275732Sjmg tmp1 = _mm_xor_si128(X, KEY[0]); 297275732Sjmg tmp2 = _mm_xor_si128(Y, KEY[0]); 298275732Sjmg for (j=1; j < nr-1; j+=2) { 299275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); 300275732Sjmg tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); 301275732Sjmg 302275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]); 303275732Sjmg tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]); 304275732Sjmg } 305275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); 306275732Sjmg tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]); 307275732Sjmg 308275732Sjmg H = _mm_aesenclast_si128(tmp1, KEY[nr]); 309275732Sjmg T = _mm_aesenclast_si128(tmp2, KEY[nr]); 310275732Sjmg 311275732Sjmg H = _mm_shuffle_epi8(H, BSWAP_MASK); 312275732Sjmg } else { 313275732Sjmg tmp1 = _mm_xor_si128(X, KEY[0]); 314275732Sjmg for (j=1; j <nr; j++) 315275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); 316275732Sjmg H = _mm_aesenclast_si128(tmp1, KEY[nr]); 317275732Sjmg 318275732Sjmg H = _mm_shuffle_epi8(H, BSWAP_MASK); 319275732Sjmg Y = _mm_setzero_si128(); 320275732Sjmg 321275732Sjmg for (i=0; i < ibytes/16; i++) { 322275732Sjmg tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); 323275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 324275732Sjmg Y = _mm_xor_si128(Y, tmp1); 325275732Sjmg gfmul(Y, H, &Y); 326275732Sjmg } 327275732Sjmg if (ibytes%16) { 328275732Sjmg for (j=0; j < ibytes%16; j++) 329275732Sjmg ((unsigned char*)&last_block)[j] = ivec[i*16+j]; 330275732Sjmg tmp1 = last_block; 331275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 332275732Sjmg Y = _mm_xor_si128(Y, tmp1); 333275732Sjmg gfmul(Y, H, &Y); 334275732Sjmg } 335275732Sjmg tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0); 336275732Sjmg tmp1 = _mm_insert_epi64(tmp1, 0, 1); 337275732Sjmg 338275732Sjmg Y = _mm_xor_si128(Y, tmp1); 339275732Sjmg gfmul(Y, H, &Y); 340275732Sjmg Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/ 341275732Sjmg tmp1 = _mm_xor_si128(Y, KEY[0]); 342275732Sjmg for (j=1; j < nr; j++) 343275732Sjmg 
tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); 344275732Sjmg T = _mm_aesenclast_si128(tmp1, KEY[nr]); 345275732Sjmg } 346275732Sjmg 347275732Sjmg gfmul(H,H,&H2); 348275732Sjmg gfmul(H,H2,&H3); 349275732Sjmg gfmul(H,H3,&H4); 350275732Sjmg 351275732Sjmg for (i=0; i<abytes/16/4; i++) { 352275732Sjmg tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i*4]); 353275732Sjmg tmp2 = _mm_loadu_si128(&((__m128i*)addt)[i*4+1]); 354275732Sjmg tmp3 = _mm_loadu_si128(&((__m128i*)addt)[i*4+2]); 355275732Sjmg tmp4 = _mm_loadu_si128(&((__m128i*)addt)[i*4+3]); 356275732Sjmg 357275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 358275732Sjmg tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); 359275732Sjmg tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); 360275732Sjmg tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); 361275732Sjmg tmp1 = _mm_xor_si128(X, tmp1); 362275732Sjmg 363275732Sjmg reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X); 364275732Sjmg } 365275732Sjmg for (i=i*4; i<abytes/16; i++) { 366275732Sjmg tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); 367275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 368275732Sjmg X = _mm_xor_si128(X,tmp1); 369275732Sjmg gfmul(X, H, &X); 370275732Sjmg } 371275732Sjmg if (abytes%16) { 372275732Sjmg last_block = _mm_setzero_si128(); 373275732Sjmg for (j=0; j<abytes%16; j++) 374275732Sjmg ((unsigned char*)&last_block)[j] = addt[i*16+j]; 375275732Sjmg tmp1 = last_block; 376275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 377275732Sjmg X =_mm_xor_si128(X,tmp1); 378275732Sjmg gfmul(X,H,&X); 379275732Sjmg } 380275732Sjmg 381275732Sjmg ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); 382275732Sjmg ctr1 = _mm_add_epi64(ctr1, ONE); 383275732Sjmg ctr2 = _mm_add_epi64(ctr1, ONE); 384275732Sjmg ctr3 = _mm_add_epi64(ctr2, ONE); 385275732Sjmg ctr4 = _mm_add_epi64(ctr3, ONE); 386275732Sjmg ctr5 = _mm_add_epi64(ctr4, ONE); 387275732Sjmg ctr6 = _mm_add_epi64(ctr5, ONE); 388275732Sjmg ctr7 = _mm_add_epi64(ctr6, ONE); 389275732Sjmg ctr8 = _mm_add_epi64(ctr7, ONE); 390275732Sjmg 391275732Sjmg for 
(i=0; i<nbytes/16/8; i++) { 392275732Sjmg tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); 393275732Sjmg tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64); 394275732Sjmg tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64); 395275732Sjmg tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64); 396275732Sjmg tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64); 397275732Sjmg tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64); 398275732Sjmg tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64); 399275732Sjmg tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64); 400275732Sjmg 401275732Sjmg ctr1 = _mm_add_epi64(ctr1, EIGHT); 402275732Sjmg ctr2 = _mm_add_epi64(ctr2, EIGHT); 403275732Sjmg ctr3 = _mm_add_epi64(ctr3, EIGHT); 404275732Sjmg ctr4 = _mm_add_epi64(ctr4, EIGHT); 405275732Sjmg ctr5 = _mm_add_epi64(ctr5, EIGHT); 406275732Sjmg ctr6 = _mm_add_epi64(ctr6, EIGHT); 407275732Sjmg ctr7 = _mm_add_epi64(ctr7, EIGHT); 408275732Sjmg ctr8 = _mm_add_epi64(ctr8, EIGHT); 409275732Sjmg 410275732Sjmg tmp1 =_mm_xor_si128(tmp1, KEY[0]); 411275732Sjmg tmp2 =_mm_xor_si128(tmp2, KEY[0]); 412275732Sjmg tmp3 =_mm_xor_si128(tmp3, KEY[0]); 413275732Sjmg tmp4 =_mm_xor_si128(tmp4, KEY[0]); 414275732Sjmg tmp5 =_mm_xor_si128(tmp5, KEY[0]); 415275732Sjmg tmp6 =_mm_xor_si128(tmp6, KEY[0]); 416275732Sjmg tmp7 =_mm_xor_si128(tmp7, KEY[0]); 417275732Sjmg tmp8 =_mm_xor_si128(tmp8, KEY[0]); 418275732Sjmg 419275732Sjmg for (j=1; j<nr; j++) { 420275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); 421275732Sjmg tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); 422275732Sjmg tmp3 = _mm_aesenc_si128(tmp3, KEY[j]); 423275732Sjmg tmp4 = _mm_aesenc_si128(tmp4, KEY[j]); 424275732Sjmg tmp5 = _mm_aesenc_si128(tmp5, KEY[j]); 425275732Sjmg tmp6 = _mm_aesenc_si128(tmp6, KEY[j]); 426275732Sjmg tmp7 = _mm_aesenc_si128(tmp7, KEY[j]); 427275732Sjmg tmp8 = _mm_aesenc_si128(tmp8, KEY[j]); 428275732Sjmg } 429275732Sjmg tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]); 430275732Sjmg tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]); 431275732Sjmg tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]); 432275732Sjmg tmp4 
=_mm_aesenclast_si128(tmp4, KEY[nr]); 433275732Sjmg tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]); 434275732Sjmg tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]); 435275732Sjmg tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]); 436275732Sjmg tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]); 437275732Sjmg 438275732Sjmg tmp1 = _mm_xor_si128(tmp1, 439275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+0])); 440275732Sjmg tmp2 = _mm_xor_si128(tmp2, 441275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+1])); 442275732Sjmg tmp3 = _mm_xor_si128(tmp3, 443275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+2])); 444275732Sjmg tmp4 = _mm_xor_si128(tmp4, 445275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+3])); 446275732Sjmg tmp5 = _mm_xor_si128(tmp5, 447275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+4])); 448275732Sjmg tmp6 = _mm_xor_si128(tmp6, 449275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+5])); 450275732Sjmg tmp7 = _mm_xor_si128(tmp7, 451275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+6])); 452275732Sjmg tmp8 = _mm_xor_si128(tmp8, 453275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+7])); 454275732Sjmg 455275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1); 456275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2); 457275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3); 458275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4); 459275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5); 460275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6); 461275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7); 462275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8); 463275732Sjmg 464275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 465275732Sjmg tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); 466275732Sjmg tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); 467275732Sjmg tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); 468275732Sjmg tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK); 469275732Sjmg tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK); 470275732Sjmg tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK); 
471275732Sjmg tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK); 472275732Sjmg 473275732Sjmg tmp1 = _mm_xor_si128(X, tmp1); 474275732Sjmg 475275732Sjmg reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X); 476275732Sjmg 477275732Sjmg tmp5 = _mm_xor_si128(X, tmp5); 478275732Sjmg reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X); 479275732Sjmg } 480275732Sjmg for (k=i*8; k<nbytes/16; k++) { 481275732Sjmg tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); 482275732Sjmg ctr1 = _mm_add_epi64(ctr1, ONE); 483275732Sjmg tmp1 = _mm_xor_si128(tmp1, KEY[0]); 484275732Sjmg for (j=1; j<nr-1; j+=2) { 485275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); 486275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]); 487275732Sjmg } 488275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); 489275732Sjmg tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]); 490275732Sjmg tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); 491275732Sjmg _mm_storeu_si128(&((__m128i*)out)[k], tmp1); 492275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 493275732Sjmg X = _mm_xor_si128(X, tmp1); 494275732Sjmg gfmul(X,H,&X); 495275732Sjmg } 496275732Sjmg //If remains one incomplete block 497275732Sjmg if (nbytes%16) { 498275732Sjmg tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); 499275732Sjmg tmp1 = _mm_xor_si128(tmp1, KEY[0]); 500275732Sjmg for (j=1; j<nr-1; j+=2) { 501275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); 502275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]); 503275732Sjmg } 504275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); 505275732Sjmg tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]); 506275732Sjmg tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); 507275732Sjmg last_block = tmp1; 508275732Sjmg for (j=0; j<nbytes%16; j++) 509275732Sjmg out[k*16+j] = ((unsigned char*)&last_block)[j]; 510275732Sjmg for ((void)j; j<16; j++) 511275732Sjmg ((unsigned char*)&last_block)[j] = 0; 512275732Sjmg tmp1 = last_block; 513275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 514275732Sjmg X = _mm_xor_si128(X, 
tmp1); 515275732Sjmg gfmul(X, H, &X); 516275732Sjmg } 517275732Sjmg tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0); 518275732Sjmg tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1); 519275732Sjmg 520275732Sjmg X = _mm_xor_si128(X, tmp1); 521275732Sjmg gfmul(X,H,&X); 522275732Sjmg X = _mm_shuffle_epi8(X, BSWAP_MASK); 523275732Sjmg T = _mm_xor_si128(X, T); 524275732Sjmg _mm_storeu_si128((__m128i*)tag, T); 525275732Sjmg} 526275732Sjmg 527275732Sjmg/* My modification of _encrypt to be _decrypt */ 528275732Sjmgint 529275732SjmgAES_GCM_decrypt(const unsigned char *in, unsigned char *out, 530275732Sjmg const unsigned char *addt, const unsigned char *ivec, 531286049Sjmg const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes, 532275732Sjmg const unsigned char *key, int nr) 533275732Sjmg{ 534275732Sjmg int i, j ,k; 535275732Sjmg __m128i tmp1, tmp2, tmp3, tmp4; 536275732Sjmg __m128i tmp5, tmp6, tmp7, tmp8; 537275732Sjmg __m128i H, H2, H3, H4, Y, T; 538275732Sjmg __m128i *KEY = (__m128i*)key; 539275732Sjmg __m128i ctr1, ctr2, ctr3, ctr4; 540275732Sjmg __m128i ctr5, ctr6, ctr7, ctr8; 541275732Sjmg __m128i last_block = _mm_setzero_si128(); 542275732Sjmg __m128i ONE = _mm_set_epi32(0, 1, 0, 0); 543275732Sjmg __m128i EIGHT = _mm_set_epi32(0, 8, 0, 0); 544275732Sjmg __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6, 545275732Sjmg 7); 546275732Sjmg __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14, 547275732Sjmg 15); 548275732Sjmg __m128i X = _mm_setzero_si128(); 549275732Sjmg 550275732Sjmg if (ibytes == 96/8) { 551275732Sjmg Y = _mm_loadu_si128((__m128i*)ivec); 552275732Sjmg Y = _mm_insert_epi32(Y, 0x1000000, 3); 553275732Sjmg /*(Compute E[ZERO, KS] and E[Y0, KS] together*/ 554275732Sjmg tmp1 = _mm_xor_si128(X, KEY[0]); 555275732Sjmg tmp2 = _mm_xor_si128(Y, KEY[0]); 556275732Sjmg for (j=1; j < nr-1; j+=2) { 557275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); 558275732Sjmg tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); 
559275732Sjmg 560275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]); 561275732Sjmg tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]); 562275732Sjmg } 563275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); 564275732Sjmg tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]); 565275732Sjmg 566275732Sjmg H = _mm_aesenclast_si128(tmp1, KEY[nr]); 567275732Sjmg T = _mm_aesenclast_si128(tmp2, KEY[nr]); 568275732Sjmg 569275732Sjmg H = _mm_shuffle_epi8(H, BSWAP_MASK); 570275732Sjmg } else { 571275732Sjmg tmp1 = _mm_xor_si128(X, KEY[0]); 572275732Sjmg for (j=1; j <nr; j++) 573275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); 574275732Sjmg H = _mm_aesenclast_si128(tmp1, KEY[nr]); 575275732Sjmg 576275732Sjmg H = _mm_shuffle_epi8(H, BSWAP_MASK); 577275732Sjmg Y = _mm_setzero_si128(); 578275732Sjmg 579275732Sjmg for (i=0; i < ibytes/16; i++) { 580275732Sjmg tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); 581275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 582275732Sjmg Y = _mm_xor_si128(Y, tmp1); 583275732Sjmg gfmul(Y, H, &Y); 584275732Sjmg } 585275732Sjmg if (ibytes%16) { 586275732Sjmg for (j=0; j < ibytes%16; j++) 587275732Sjmg ((unsigned char*)&last_block)[j] = ivec[i*16+j]; 588275732Sjmg tmp1 = last_block; 589275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 590275732Sjmg Y = _mm_xor_si128(Y, tmp1); 591275732Sjmg gfmul(Y, H, &Y); 592275732Sjmg } 593275732Sjmg tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0); 594275732Sjmg tmp1 = _mm_insert_epi64(tmp1, 0, 1); 595275732Sjmg 596275732Sjmg Y = _mm_xor_si128(Y, tmp1); 597275732Sjmg gfmul(Y, H, &Y); 598275732Sjmg Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/ 599275732Sjmg tmp1 = _mm_xor_si128(Y, KEY[0]); 600275732Sjmg for (j=1; j < nr; j++) 601275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); 602275732Sjmg T = _mm_aesenclast_si128(tmp1, KEY[nr]); 603275732Sjmg } 604275732Sjmg 605275732Sjmg gfmul(H,H,&H2); 606275732Sjmg gfmul(H,H2,&H3); 607275732Sjmg gfmul(H,H3,&H4); 608275732Sjmg 609275732Sjmg for (i=0; i<abytes/16/4; i++) { 
610275732Sjmg tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i*4]); 611275732Sjmg tmp2 = _mm_loadu_si128(&((__m128i*)addt)[i*4+1]); 612275732Sjmg tmp3 = _mm_loadu_si128(&((__m128i*)addt)[i*4+2]); 613275732Sjmg tmp4 = _mm_loadu_si128(&((__m128i*)addt)[i*4+3]); 614275732Sjmg 615275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 616275732Sjmg tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); 617275732Sjmg tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); 618275732Sjmg tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); 619275732Sjmg 620275732Sjmg tmp1 = _mm_xor_si128(X, tmp1); 621275732Sjmg 622275732Sjmg reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X); 623275732Sjmg } 624275732Sjmg for (i=i*4; i<abytes/16; i++) { 625275732Sjmg tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); 626275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 627275732Sjmg X = _mm_xor_si128(X,tmp1); 628275732Sjmg gfmul(X, H, &X); 629275732Sjmg } 630275732Sjmg if (abytes%16) { 631275732Sjmg last_block = _mm_setzero_si128(); 632275732Sjmg for (j=0; j<abytes%16; j++) 633275732Sjmg ((unsigned char*)&last_block)[j] = addt[i*16+j]; 634275732Sjmg tmp1 = last_block; 635275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 636275732Sjmg X =_mm_xor_si128(X,tmp1); 637275732Sjmg gfmul(X,H,&X); 638275732Sjmg } 639275732Sjmg 640275732Sjmg /* This is where we validate the cipher text before decrypt */ 641275732Sjmg for (i = 0; i<nbytes/16/4; i++) { 642275732Sjmg tmp1 = _mm_loadu_si128(&((__m128i*)in)[i*4]); 643275732Sjmg tmp2 = _mm_loadu_si128(&((__m128i*)in)[i*4+1]); 644275732Sjmg tmp3 = _mm_loadu_si128(&((__m128i*)in)[i*4+2]); 645275732Sjmg tmp4 = _mm_loadu_si128(&((__m128i*)in)[i*4+3]); 646275732Sjmg 647275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 648275732Sjmg tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); 649275732Sjmg tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); 650275732Sjmg tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); 651275732Sjmg 652275732Sjmg tmp1 = _mm_xor_si128(X, tmp1); 653275732Sjmg 654275732Sjmg reduce4(H, H2, H3, H4, 
tmp4, tmp3, tmp2, tmp1, &X); 655275732Sjmg } 656275732Sjmg for (i = i*4; i<nbytes/16; i++) { 657275732Sjmg tmp1 = _mm_loadu_si128(&((__m128i*)in)[i]); 658275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 659275732Sjmg X = _mm_xor_si128(X, tmp1); 660275732Sjmg gfmul(X,H,&X); 661275732Sjmg } 662275732Sjmg if (nbytes%16) { 663275732Sjmg last_block = _mm_setzero_si128(); 664275732Sjmg for (j=0; j<nbytes%16; j++) 665275732Sjmg ((unsigned char*)&last_block)[j] = in[i*16+j]; 666275732Sjmg tmp1 = last_block; 667275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 668275732Sjmg X = _mm_xor_si128(X, tmp1); 669275732Sjmg gfmul(X, H, &X); 670275732Sjmg } 671275732Sjmg 672275732Sjmg tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0); 673275732Sjmg tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1); 674275732Sjmg 675275732Sjmg X = _mm_xor_si128(X, tmp1); 676275732Sjmg gfmul(X,H,&X); 677275732Sjmg X = _mm_shuffle_epi8(X, BSWAP_MASK); 678275732Sjmg T = _mm_xor_si128(X, T); 679275732Sjmg 680286049Sjmg if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag))) 681275732Sjmg return 0; //in case the authentication failed 682275732Sjmg 683275732Sjmg ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); 684275732Sjmg ctr1 = _mm_add_epi64(ctr1, ONE); 685275732Sjmg ctr2 = _mm_add_epi64(ctr1, ONE); 686275732Sjmg ctr3 = _mm_add_epi64(ctr2, ONE); 687275732Sjmg ctr4 = _mm_add_epi64(ctr3, ONE); 688275732Sjmg ctr5 = _mm_add_epi64(ctr4, ONE); 689275732Sjmg ctr6 = _mm_add_epi64(ctr5, ONE); 690275732Sjmg ctr7 = _mm_add_epi64(ctr6, ONE); 691275732Sjmg ctr8 = _mm_add_epi64(ctr7, ONE); 692275732Sjmg 693275732Sjmg for (i=0; i<nbytes/16/8; i++) { 694275732Sjmg tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); 695275732Sjmg tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64); 696275732Sjmg tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64); 697275732Sjmg tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64); 698275732Sjmg tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64); 699275732Sjmg tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64); 700275732Sjmg tmp7 = 
_mm_shuffle_epi8(ctr7, BSWAP_EPI64); 701275732Sjmg tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64); 702275732Sjmg 703275732Sjmg ctr1 = _mm_add_epi64(ctr1, EIGHT); 704275732Sjmg ctr2 = _mm_add_epi64(ctr2, EIGHT); 705275732Sjmg ctr3 = _mm_add_epi64(ctr3, EIGHT); 706275732Sjmg ctr4 = _mm_add_epi64(ctr4, EIGHT); 707275732Sjmg ctr5 = _mm_add_epi64(ctr5, EIGHT); 708275732Sjmg ctr6 = _mm_add_epi64(ctr6, EIGHT); 709275732Sjmg ctr7 = _mm_add_epi64(ctr7, EIGHT); 710275732Sjmg ctr8 = _mm_add_epi64(ctr8, EIGHT); 711275732Sjmg 712275732Sjmg tmp1 =_mm_xor_si128(tmp1, KEY[0]); 713275732Sjmg tmp2 =_mm_xor_si128(tmp2, KEY[0]); 714275732Sjmg tmp3 =_mm_xor_si128(tmp3, KEY[0]); 715275732Sjmg tmp4 =_mm_xor_si128(tmp4, KEY[0]); 716275732Sjmg tmp5 =_mm_xor_si128(tmp5, KEY[0]); 717275732Sjmg tmp6 =_mm_xor_si128(tmp6, KEY[0]); 718275732Sjmg tmp7 =_mm_xor_si128(tmp7, KEY[0]); 719275732Sjmg tmp8 =_mm_xor_si128(tmp8, KEY[0]); 720275732Sjmg 721275732Sjmg for (j=1; j<nr; j++) { 722275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); 723275732Sjmg tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); 724275732Sjmg tmp3 = _mm_aesenc_si128(tmp3, KEY[j]); 725275732Sjmg tmp4 = _mm_aesenc_si128(tmp4, KEY[j]); 726275732Sjmg tmp5 = _mm_aesenc_si128(tmp5, KEY[j]); 727275732Sjmg tmp6 = _mm_aesenc_si128(tmp6, KEY[j]); 728275732Sjmg tmp7 = _mm_aesenc_si128(tmp7, KEY[j]); 729275732Sjmg tmp8 = _mm_aesenc_si128(tmp8, KEY[j]); 730275732Sjmg } 731275732Sjmg tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]); 732275732Sjmg tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]); 733275732Sjmg tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]); 734275732Sjmg tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]); 735275732Sjmg tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]); 736275732Sjmg tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]); 737275732Sjmg tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]); 738275732Sjmg tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]); 739275732Sjmg 740275732Sjmg tmp1 = _mm_xor_si128(tmp1, 741275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+0])); 742275732Sjmg tmp2 = 
_mm_xor_si128(tmp2, 743275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+1])); 744275732Sjmg tmp3 = _mm_xor_si128(tmp3, 745275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+2])); 746275732Sjmg tmp4 = _mm_xor_si128(tmp4, 747275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+3])); 748275732Sjmg tmp5 = _mm_xor_si128(tmp5, 749275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+4])); 750275732Sjmg tmp6 = _mm_xor_si128(tmp6, 751275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+5])); 752275732Sjmg tmp7 = _mm_xor_si128(tmp7, 753275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+6])); 754275732Sjmg tmp8 = _mm_xor_si128(tmp8, 755275732Sjmg _mm_loadu_si128(&((__m128i*)in)[i*8+7])); 756275732Sjmg 757275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1); 758275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2); 759275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3); 760275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4); 761275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5); 762275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6); 763275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7); 764275732Sjmg _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8); 765275732Sjmg 766275732Sjmg tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); 767275732Sjmg tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); 768275732Sjmg tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); 769275732Sjmg tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); 770275732Sjmg tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK); 771275732Sjmg tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK); 772275732Sjmg tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK); 773275732Sjmg tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK); 774275732Sjmg } 775275732Sjmg for (k=i*8; k<nbytes/16; k++) { 776275732Sjmg tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); 777275732Sjmg ctr1 = _mm_add_epi64(ctr1, ONE); 778275732Sjmg tmp1 = _mm_xor_si128(tmp1, KEY[0]); 779275732Sjmg for (j=1; j<nr-1; j+=2) { 780275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); 781275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, 
KEY[j+1]); 782275732Sjmg } 783275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); 784275732Sjmg tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]); 785275732Sjmg tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); 786275732Sjmg _mm_storeu_si128(&((__m128i*)out)[k], tmp1); 787275732Sjmg } 788275732Sjmg //If remains one incomplete block 789275732Sjmg if (nbytes%16) { 790275732Sjmg tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); 791275732Sjmg tmp1 = _mm_xor_si128(tmp1, KEY[0]); 792275732Sjmg for (j=1; j<nr-1; j+=2) { 793275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); 794275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]); 795275732Sjmg } 796275732Sjmg tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); 797275732Sjmg tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]); 798275732Sjmg tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); 799275732Sjmg last_block = tmp1; 800275732Sjmg for (j=0; j<nbytes%16; j++) 801275732Sjmg out[k*16+j] = ((unsigned char*)&last_block)[j]; 802275732Sjmg } 803275732Sjmg return 1; //when sucessfull returns 1 804275732Sjmg} 805