// vmac.cpp - written and placed in the public domain by Wei Dai
// based on Ted Krovetz's public domain vmac.c and draft-krovetz-vmac-01.txt

#include "pch.h"
#include "vmac.h"
#include "argnames.h"
#include "cpu.h"

NAMESPACE_BEGIN(CryptoPP)

#if defined(_MSC_VER) && !CRYPTOPP_BOOL_SLOW_WORD64
#include <intrin.h>
#endif

// NOTE(review): using defined() inside a macro replacement list is formally
// undefined behavior per the C++ standard (it happens to work on the
// compilers targeted here) -- consider rewriting as nested #if/#define.
#define VMAC_BOOL_WORD128 (defined(CRYPTOPP_WORD128_AVAILABLE) && !defined(CRYPTOPP_X64_ASM_AVAILABLE))
#ifdef __BORLANDC__
#define const // Turbo C++ 2006 workaround
#endif
static const word64 p64 = W64LIT(0xfffffffffffffeff);   /* 2^64 - 257 prime  */
static const word64 m62 = W64LIT(0x3fffffffffffffff);   /* 62-bit mask       */
static const word64 m63 = W64LIT(0x7fffffffffffffff);   /* 63-bit mask       */
static const word64 m64 = W64LIT(0xffffffffffffffff);   /* 64-bit mask       */
static const word64 mpoly = W64LIT(0x1fffffff1fffffff); /* Poly key mask     */
#ifdef __BORLANDC__
#undef const
#endif
#if VMAC_BOOL_WORD128
#ifdef __powerpc__
// workaround GCC Bug 31690: ICE with const __uint128_t and C++ front-end
#define m126 ((word128(m62)<<64)|m64)
#else
static const word128 m126 = (word128(m62)<<64)|m64;     /* 126-bit mask      */
#endif
#endif

// Derives the VMAC subkeys from the block-cipher key: the NH first-level key,
// the second-level polynomial key, and the third-level ("inner product") key.
// Each subkey is generated by encrypting counter blocks whose first byte is a
// distinct domain-separation constant (0x80 / 0xC0 / 0xE0), following
// draft-krovetz-vmac-01.  Finally resynchronizes with the IV from params.
// Throws InvalidArgument on an unsupported DigestSize or L1KeyLength.
void VMAC_Base::UncheckedSetKey(const byte *userKey, unsigned int keylength, const NameValuePairs &params)
{
	int digestLength = params.GetIntValueWithDefault(Name::DigestSize(), DefaultDigestSize());
	if (digestLength != 8 && digestLength != 16)
		throw InvalidArgument("VMAC: DigestSize must be 8 or 16");
	// 128-bit tags run two parallel hash instances; m_is128 selects that mode.
	m_is128 = digestLength == 16;

	m_L1KeyLength = params.GetIntValueWithDefault(Name::L1KeyLength(), 128);
	if (m_L1KeyLength <= 0 || m_L1KeyLength % 128 != 0)
		throw InvalidArgument("VMAC: L1KeyLength must be a positive multiple of 128");

	AllocateBlocks();

	BlockCipher &cipher = AccessCipher();
	cipher.SetKey(userKey, keylength, params);
	unsigned int blockSize = cipher.BlockSize();
	unsigned int blockSizeInWords = blockSize / sizeof(word64);
	SecBlock<word64> out(blockSizeInWords);
	SecByteBlock in;
	in.CleanNew(blockSize);
	size_t i;

	/* Fill nh key */
	// Counter-mode keystream (in[0]=0x80 tags the NH-key domain) fills the
	// whole NH key buffer in one call; then convert to native word order.
	in[0] = 0x80;
	cipher.AdvancedProcessBlocks(in, NULL, (byte *)m_nhKey(), m_nhKeySize()*sizeof(word64), cipher.BT_InBlockIsCounter);
	ConditionalByteReverse<word64>(BIG_ENDIAN_ORDER, m_nhKey(), m_nhKey(), m_nhKeySize()*sizeof(word64));

	/* Fill poly key */
	// One poly key pair per hash instance; mpoly clears bits of each 32-bit
	// half so the polynomial evaluation cannot overflow its modulus.
	in[0] = 0xC0;
	in[15] = 0;
	for (i = 0; i <= (size_t)m_is128; i++)
	{
		cipher.ProcessBlock(in, out.BytePtr());
		m_polyState()[i*4+2] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()) & mpoly;
		m_polyState()[i*4+3] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()+8) & mpoly;
		in[15]++;
	}

	/* Fill ip key */
	// Rejection-sample: keep generating until both words are canonical
	// residues modulo p64 = 2^64 - 257.
	in[0] = 0xE0;
	in[15] = 0;
	word64 *l3Key = m_l3Key();
	for (i = 0; i <= (size_t)m_is128; i++)
		do
		{
			cipher.ProcessBlock(in, out.BytePtr());
			l3Key[i*2+0] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr());
			l3Key[i*2+1] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()+8);
			in[15]++;
		} while ((l3Key[i*2+0] >= p64) || (l3Key[i*2+1] >= p64));

	m_padCached = false;
	size_t nonceLength;
	const byte *nonce = GetIVAndThrowIfInvalid(params, nonceLength);
	Resynchronize(nonce, (int)nonceLength);
}

// Produces a random IV with the most significant bit of the first byte
// cleared (IV[0] &= 0x7f) -- presumably to keep the nonce inside the range
// the VMAC spec permits; confirm against draft-krovetz-vmac-01.
void VMAC_Base::GetNextIV(RandomNumberGenerator &rng, byte *IV)
{
	SimpleKeyingInterface::GetNextIV(rng, IV);
	IV[0] &= 0x7f;
}

// Loads a new nonce and (re)computes the one-block "pad" that is added to the
// hash output in TruncatedFinal.  For 64-bit tags the pad depends only on the
// nonce with its last bit forced to zero, so it is cached and reused when
// successive nonces differ only in that bit (the common counter-nonce case).
void VMAC_Base::Resynchronize(const byte *nonce, int len)
{
	size_t length = ThrowIfInvalidIVLength(len);
	size_t s = IVSize();
	byte *storedNonce = m_nonce();

	if (m_is128)
	{
		// 128-bit tags: pad is the encryption of the zero-padded nonce; no caching.
		memset(storedNonce, 0, s-length);
		memcpy(storedNonce+s-length, nonce, length);
		AccessCipher().ProcessBlock(storedNonce, m_pad());
	}
	else
	{
		// Cache is valid only if the new nonce equals the stored one except
		// possibly in the final bit, and the stored zero padding is intact.
		if (m_padCached && (storedNonce[s-1] | 1) == (nonce[length-1] | 1))
		{
			m_padCached = VerifyBufsEqual(storedNonce+s-length, nonce, length-1);
			for (size_t i=0; m_padCached && i<s-length; i++)
				m_padCached = (storedNonce[i] == 0);
		}
		if (!m_padCached)
		{
			// Recompute the pad over the nonce with its last bit masked off;
			// that bit instead selects which pad half is used at finalization.
			memset(storedNonce, 0, s-length);
			memcpy(storedNonce+s-length, nonce, length-1);
			storedNonce[s-1] = nonce[length-1] & 0xfe;
			AccessCipher().ProcessBlock(storedNonce, m_pad());
			m_padCached = true;
		}
		// Remember the full last byte (including the selector bit) for TruncatedFinal.
		storedNonce[s-1] = nonce[length-1];
	}
	m_isFirstBlock = true;
	Restart();
}

// Not used by VMAC: all hashing goes through VHASH_Update instead.  This
// IteratedHashBase-style hook should be unreachable; assert and throw.
void VMAC_Base::HashEndianCorrectedBlock(const word64 *data)
{
	assert(false);
	throw 0;
}

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
#pragma warning(disable: 4731)	// frame pointer register 'ebp' modified by inline assembly code
// MMX/SSE2 x86-32 implementation of one VHASH pass: the NH first-level hash
// over L1KeyLength-byte chunks, each result folded into the second-level
// polynomial accumulator.  tagPart selects which of the two hash instances
// (used for 128-bit tags) to update.  Register use (MSVC path): ebx = chunk
// size in word64s, esi = data, edi = NH key, eax = poly state, ecx = words
// remaining, dl = isFirstBlock flag.  Instruction order is hand-scheduled;
// do not reorder.
void
#ifdef __GNUC__
__attribute__ ((noinline))	// Intel Compiler 9.1 workaround
#endif
VMAC_Base::VHASH_Update_SSE2(const word64 *data, size_t blocksRemainingInWord64, int tagPart)
{
	const word64 *nhK = m_nhKey();
	word64 *polyS = m_polyState();
	word32 L1KeyLength = m_L1KeyLength;

#ifdef __GNUC__
	word32 temp;
	__asm__ __volatile__
	(
	// Save ebx (PIC register) and load L1KeyLength into it.
	AS2(	mov %%ebx, %0)
	AS2(	mov %1, %%ebx)
	".intel_syntax noprefix;"
#else
	#if _MSC_VER < 1300 || defined(__INTEL_COMPILER)
	char isFirstBlock = m_isFirstBlock;
	AS2(	mov ebx, [L1KeyLength])
	AS2(	mov dl, [isFirstBlock])
	#else
	AS2(	mov ecx, this)
	AS2(	mov ebx, [ecx+m_L1KeyLength])
	AS2(	mov dl, [ecx+m_isFirstBlock])
	#endif
	// edi = nhK + tagPart*2 (byte offset 16*tagPart), eax = polyS + tagPart*4.
	AS2(	mov eax, tagPart)
	AS2(	shl eax, 4)
	AS2(	mov edi, nhK)
	AS2(	add edi, eax)
	AS2(	add eax, eax)
	AS2(	add eax, polyS)

	AS2(	mov esi, data)
	AS2(	mov ecx, blocksRemainingInWord64)
#endif

	AS2(	shr ebx, 3)
	AS1(	push ebp)
	AS2(	sub esp, 12)
	// Label 4: top of the per-chunk loop.  ebp = min(remaining, chunk size).
	ASL(4)
	AS2(	mov ebp, ebx)
	AS2(	cmp ecx, ebx)
	AS2(	cmovl ebp, ecx)
	AS2(	sub ecx, ebp)
	AS2(	lea ebp, [edi+8*ebp])	// end of nhK
	// First NH pair: (data+key) 64x64 multiply via four 32x32 pmuludq products.
	AS2(	movq mm6, [esi])
	AS2(	paddq mm6, [edi])
	AS2(	movq mm5, [esi+8])
	AS2(	paddq mm5, [edi+8])
	AS2(	add esi, 16)
	AS2(	add edi, 16)
	AS2(	movq mm4, mm6)
	ASS(	pshufw mm2, mm6, 1, 0, 3, 2)
	AS2(	pmuludq mm6, mm5)
	ASS(	pshufw mm3, mm5, 1, 0, 3, 2)
	AS2(	pmuludq mm5, mm2)
	AS2(	pmuludq mm2, mm3)
	AS2(	pmuludq mm3, mm4)
	AS2(	pxor mm7, mm7)
	AS2(	movd [esp], mm6)
	AS2(	psrlq mm6, 32)
	AS2(	movd [esp+4], mm5)
	AS2(	psrlq mm5, 32)
	AS2(	cmp edi, ebp)
	ASJ(	je, 1, f)
	// Label 0: steady-state NH loop; accumulates partial products in mm5-mm7
	// with 32-bit spill slots on the stack for carry handling.
	ASL(0)
	AS2(	movq mm0, [esi])
	AS2(	paddq mm0, [edi])
	AS2(	movq mm1, [esi+8])
	AS2(	paddq mm1, [edi+8])
	AS2(	add esi, 16)
	AS2(	add edi, 16)
	AS2(	movq mm4, mm0)
	AS2(	paddq mm5, mm2)
	ASS(	pshufw mm2, mm0, 1, 0, 3, 2)
	AS2(	pmuludq mm0, mm1)
	AS2(	movd [esp+8], mm3)
	AS2(	psrlq mm3, 32)
	AS2(	paddq mm5, mm3)
	ASS(	pshufw mm3, mm1, 1, 0, 3, 2)
	AS2(	pmuludq mm1, mm2)
	AS2(	pmuludq mm2, mm3)
	AS2(	pmuludq mm3, mm4)
	AS2(	movd mm4, [esp])
	AS2(	paddq mm7, mm4)
	AS2(	movd mm4, [esp+4])
	AS2(	paddq mm6, mm4)
	AS2(	movd mm4, [esp+8])
	AS2(	paddq mm6, mm4)
	AS2(	movd [esp], mm0)
	AS2(	psrlq mm0, 32)
	AS2(	paddq mm6, mm0)
	AS2(	movd [esp+4], mm1)
	AS2(	psrlq mm1, 32)
	AS2(	paddq mm5, mm1)
	AS2(	cmp edi, ebp)
	ASJ(	jne, 0, b)
	// Label 1: flush the final pending partial products.
	ASL(1)
	AS2(	paddq mm5, mm2)
	AS2(	movd [esp+8], mm3)
	AS2(	psrlq mm3, 32)
	AS2(	paddq mm5, mm3)
	AS2(	movd mm4, [esp])
	AS2(	paddq mm7, mm4)
	AS2(	movd mm4, [esp+4])
	AS2(	paddq mm6, mm4)
	AS2(	movd mm4, [esp+8])
	AS2(	paddq mm6, mm4)
	AS2(	lea ebp, [8*ebx])
	AS2(	sub edi, ebp)	// reset edi to start of nhK

	// Fold the (mm7,mm6,mm5) accumulator into a 126-bit NH result:
	// low 32 bits spilled to [esp], carries propagated up, top 2 bits cleared.
	AS2(	movd [esp], mm7)
	AS2(	psrlq mm7, 32)
	AS2(	paddq mm6, mm7)
	AS2(	movd [esp+4], mm6)
	AS2(	psrlq mm6, 32)
	AS2(	paddq mm5, mm6)
	AS2(	psllq mm5, 2)
	AS2(	psrlq mm5, 2)

// 32-bit views of the 128-bit poly accumulator (a0..a3) and key (k0..k3)
// at their in-memory word positions relative to eax.
#define a0 [eax+2*4]
#define a1 [eax+3*4]
#define a2 [eax+0*4]
#define a3 [eax+1*4]
#define k0 [eax+2*8+2*4]
#define k1 [eax+2*8+3*4]
#define k2 [eax+2*8+0*4]
#define k3 [eax+2*8+1*4]
	// First block (dl != 0): accumulator = NH result + key; no multiply yet.
	AS2(	test dl, dl)
	ASJ(	jz, 2, f)
	AS2(	movd mm1, k0)
	AS2(	movd mm0, [esp])
	AS2(	paddq mm0, mm1)
	AS2(	movd a0, mm0)
	AS2(	psrlq mm0, 32)
	AS2(	movd mm1, k1)
	AS2(	movd mm2, [esp+4])
	AS2(	paddq mm1, mm2)
	AS2(	paddq mm0, mm1)
	AS2(	movd a1, mm0)
	AS2(	psrlq mm0, 32)
	AS2(	paddq mm5, k2)
	AS2(	paddq mm0, mm5)
	AS2(	movq a2, mm0)
	AS2(	xor edx, edx)
	ASJ(	jmp, 3, f)
	// Label 2: poly step a = a*k + NH (mod 2^127-1), computed as sixteen
	// 32x32 partial products with the wraparound factor 2 applied via psllq.
	ASL(2)
	AS2(	movd mm0, a3)
	AS2(	movq mm4, mm0)
	AS2(	pmuludq mm0, k3)	// a3*k3
	AS2(	movd mm1, a0)
	AS2(	pmuludq mm1, k2)	// a0*k2
	AS2(	movd mm2, a1)
	AS2(	movd mm6, k1)
	AS2(	pmuludq mm2, mm6)	// a1*k1
	AS2(	movd mm3, a2)
	AS2(	psllq mm0, 1)
	AS2(	paddq mm0, mm5)
	AS2(	movq mm5, mm3)
	AS2(	movd mm7, k0)
	AS2(	pmuludq mm3, mm7)	// a2*k0
	AS2(	pmuludq mm4, mm7)	// a3*k0
	AS2(	pmuludq mm5, mm6)	// a2*k1
	AS2(	paddq mm0, mm1)
	AS2(	movd mm1, a1)
	AS2(	paddq mm4, mm5)
	AS2(	movq mm5, mm1)
	AS2(	pmuludq mm1, k2)	// a1*k2
	AS2(	paddq mm0, mm2)
	AS2(	movd mm2, a0)
	AS2(	paddq mm0, mm3)
	AS2(	movq mm3, mm2)
	AS2(	pmuludq mm2, k3)	// a0*k3
	AS2(	pmuludq mm3, mm7)	// a0*k0
	AS2(	movd [esp+8], mm0)
	AS2(	psrlq mm0, 32)
	AS2(	pmuludq mm7, mm5)	// a1*k0
	AS2(	pmuludq mm5, k3)	// a1*k3
	AS2(	paddq mm0, mm1)
	AS2(	movd mm1, a2)
	AS2(	pmuludq mm1, k2)	// a2*k2
	AS2(	paddq mm0, mm2)
	AS2(	paddq mm0, mm4)
	AS2(	movq mm4, mm0)
	AS2(	movd mm2, a3)
	AS2(	pmuludq mm2, mm6)	// a3*k1
	AS2(	pmuludq mm6, a0)	// a0*k1
	AS2(	psrlq mm0, 31)
	AS2(	paddq mm0, mm3)
	AS2(	movd mm3, [esp])
	AS2(	paddq mm0, mm3)
	AS2(	movd mm3, a2)
	AS2(	pmuludq mm3, k3)	// a2*k3
	AS2(	paddq mm5, mm1)
	AS2(	movd mm1, a3)
	AS2(	pmuludq mm1, k2)	// a3*k2
	AS2(	paddq mm5, mm2)
	AS2(	movd mm2, [esp+4])
	AS2(	psllq mm5, 1)
	AS2(	paddq mm0, mm5)
	AS2(	psllq mm4, 33)
	AS2(	movd a0, mm0)
	AS2(	psrlq mm0, 32)
	AS2(	paddq mm6, mm7)
	AS2(	movd mm7, [esp+8])
	AS2(	paddq mm0, mm6)
	AS2(	paddq mm0, mm2)
	AS2(	paddq mm3, mm1)
	AS2(	psllq mm3, 1)
	AS2(	paddq mm0, mm3)
	AS2(	psrlq mm4, 1)
	AS2(	movd a1, mm0)
	AS2(	psrlq mm0, 32)
	AS2(	por mm4, mm7)
	AS2(	paddq mm0, mm4)
	AS2(	movq a2, mm0)
#undef a0
#undef a1
#undef a2
#undef a3
#undef k0
#undef k1
#undef k2
#undef k3

	// Label 3: loop back to label 4 while word64s remain.
	ASL(3)
	AS2(	test ecx, ecx)
	ASJ(	jnz, 4, b)

	AS2(	add esp, 12)
	AS1(	pop ebp)
	AS1(	emms)
#ifdef __GNUC__
	".att_syntax prefix;"
	AS2(	mov %0, %%ebx)
	: "=m" (temp)
	: "m" (L1KeyLength), "c" (blocksRemainingInWord64), "S" (data), "D" (nhK+tagPart*2), "d" (m_isFirstBlock), "a" (polyS+tagPart*4)
	: "memory", "cc"
	);
#endif
}
#endif

// Platform-selected primitives for the portable VHASH below:
//   DeclareNH     - declares the NH accumulator (word128, 2x word64, or 3x word64)
//   MUL64         - full 64x64 -> 128-bit multiply into (rh, rl)
//   AccumulateNH  - accumulator += b * c (64x64 -> 128-bit, with carries)
//   ADD128        - 128-bit add with carry across the word64 pair
#if VMAC_BOOL_WORD128
	// Native 128-bit integer type available.
	#define DeclareNH(a) word128 a=0
	#define MUL64(rh,rl,i1,i2) {word128 p = word128(i1)*(i2); rh = word64(p>>64); rl = word64(p);}
	#define AccumulateNH(a, b, c) a += word128(b)*(c)
	#define Multiply128(r, i1, i2) r = word128(word64(i1)) * word64(i2)
#else
	#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
	// VC2005 and above: unsigned 32x32 -> 64 multiply intrinsic.
	#define MUL32(a, b) __emulu(word32(a), word32(b))
	#else
	#define MUL32(a, b) ((word64)((word32)(a)) * (word32)(b))
	#endif
	#if defined(CRYPTOPP_X64_ASM_AVAILABLE)
	// x86-64 GCC-style inline asm: mulq gives the full 128-bit product in rdx:rax.
	#define DeclareNH(a) word64 a##0=0, a##1=0
	#define MUL64(rh,rl,i1,i2) asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "g"(i2) : "cc");
	#define AccumulateNH(a, b, c) asm ("mulq %3; addq %%rax, %0; adcq %%rdx, %1" : "+r"(a##0), "+r"(a##1) : "a"(b), "g"(c) : "%rdx", "cc");
	#define ADD128(rh,rl,ih,il) asm ("addq %3, %1; adcq %2, %0" : "+r"(rh),"+r"(rl) : "r"(ih),"r"(il) : "cc");
	#elif defined(_MSC_VER) && !CRYPTOPP_BOOL_SLOW_WORD64
	// MSVC x64: _umul128 intrinsic; carry into the high word via comparison.
	#define DeclareNH(a) word64 a##0=0, a##1=0
	#define MUL64(rh,rl,i1,i2) (rl) = _umul128(i1,i2,&(rh));
	#define AccumulateNH(a, b, c) {\
		word64 ph, pl;\
		pl = _umul128(b,c,&ph);\
		a##0 += pl;\
		a##1 += ph + (a##0 < pl);}
	#else
	// Pure 32-bit fallback: NH accumulator kept as three word64 lanes so
	// 32-bit partial products can be added without immediate carry handling.
	#define VMAC_BOOL_32BIT 1
	#define DeclareNH(a) word64 a##0=0, a##1=0, a##2=0
	#define MUL64(rh,rl,i1,i2) \
		{ word64 _i1 = (i1), _i2 = (i2); \
		word64 m1= MUL32(_i1,_i2>>32); \
		word64 m2= MUL32(_i1>>32,_i2); \
		rh = MUL32(_i1>>32,_i2>>32); \
		rl = MUL32(_i1,_i2); \
		ADD128(rh,rl,(m1 >> 32),(m1 << 32)); \
		ADD128(rh,rl,(m2 >> 32),(m2 << 32)); \
		}
	#define AccumulateNH(a, b, c) {\
		word64 p = MUL32(b, c);\
		a##1 += word32((p)>>32);\
		a##0 += word32(p);\
		p = MUL32((b)>>32, c);\
		a##2 += word32((p)>>32);\
		a##1 += word32(p);\
		p = MUL32((b)>>32, (c)>>32);\
		a##2 += p;\
		p = MUL32(b, (c)>>32);\
		a##1 += word32(p);\
		a##2 += word32(p>>32);}
	#endif
#endif
#ifndef VMAC_BOOL_32BIT
	#define VMAC_BOOL_32BIT 0
#endif
#ifndef ADD128
	// Generic 128-bit add: carry detected by (rl) wrapping below the addend.
	#define ADD128(rh,rl,ih,il) \
		{ word64 _il = (il); \
		(rl) += (_il); \
		(rh) += (ih) + ((rl) < (_il)); \
		}
#endif

// Portable VHASH: NH first-level hash over m_L1KeyLength-byte chunks, each
// result folded into a second-level polynomial hash evaluated mod 2^127-1.
// T_128BitTag compiles in a second, independent hash instance for 128-bit
// tags (on VC6 and older it degrades to a runtime flag).  Poly state layout
// per instance i: polyS[i*4+0..1] = accumulator (hi,lo), [i*4+2..3] = key.
#if !(defined(_MSC_VER) && _MSC_VER < 1300)
template <bool T_128BitTag>
#endif
void VMAC_Base::VHASH_Update_Template(const word64 *data, size_t blocksRemainingInWord64)
{
	// One NH step over data word-pair j: accumulate (d0+k0)*(d1+k1); the
	// second instance reuses the same data against a key shifted by 2 words.
	#define INNER_LOOP_ITERATION(j) {\
		word64 d0 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+0]);\
		word64 d1 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+1]);\
		AccumulateNH(nhA, d0+nhK[i+2*j+0], d1+nhK[i+2*j+1]);\
		if (T_128BitTag)\
			AccumulateNH(nhB, d0+nhK[i+2*j+2], d1+nhK[i+2*j+3]);\
		}

#if (defined(_MSC_VER) && _MSC_VER < 1300)
	bool T_128BitTag = m_is128;
#endif
	size_t L1KeyLengthInWord64 = m_L1KeyLength / 8;
	size_t innerLoopEnd = L1KeyLengthInWord64;
	const word64 *nhK = m_nhKey();
	word64 *polyS = m_polyState();
	bool isFirstBlock = true;
	size_t i;

	#if !VMAC_BOOL_32BIT
		#if VMAC_BOOL_WORD128
			word128 a1, a2;
		#else
			word64 ah1, al1, ah2, al2;
		#endif
		// Load the polynomial keys once; they are loop-invariant.
		word64 kh1, kl1, kh2, kl2;
		kh1=(polyS+0*4+2)[0]; kl1=(polyS+0*4+2)[1];
		if (T_128BitTag)
		{
			kh2=(polyS+1*4+2)[0]; kl2=(polyS+1*4+2)[1];
		}
	#endif

	do
	{
		DeclareNH(nhA);
		DeclareNH(nhB);

		i = 0;
		if (blocksRemainingInWord64 < L1KeyLengthInWord64)
		{
			// Final partial chunk: peel off the words that don't fill a
			// whole 8-word unrolled pass, then shorten the main loop.
			if (blocksRemainingInWord64 % 8)
			{
				innerLoopEnd = blocksRemainingInWord64 % 8;
				for (; i<innerLoopEnd; i+=2)
					INNER_LOOP_ITERATION(0);
			}
			innerLoopEnd = blocksRemainingInWord64;
		}
		// Main NH loop, unrolled four word-pairs per iteration.
		for (; i<innerLoopEnd; i+=8)
		{
			INNER_LOOP_ITERATION(0);
			INNER_LOOP_ITERATION(1);
			INNER_LOOP_ITERATION(2);
			INNER_LOOP_ITERATION(3);
		}
		blocksRemainingInWord64 -= innerLoopEnd;
		data += innerLoopEnd;

	#if VMAC_BOOL_32BIT
		// Normalize the three-lane accumulators into 32/32/62-bit pieces.
		word32 nh0[2], nh1[2];
		word64 nh2[2];

		nh0[0] = word32(nhA0);
		nhA1 += (nhA0 >> 32);
		nh1[0] = word32(nhA1);
		nh2[0] = (nhA2 + (nhA1 >> 32)) & m62;

		if (T_128BitTag)
		{
			nh0[1] = word32(nhB0);
			nhB1 += (nhB0 >> 32);
			nh1[1] = word32(nhB1);
			nh2[1] = (nhB2 + (nhB1 >> 32)) & m62;
		}

	// 32-bit word views of poly accumulator/key in memory; the ToEnum()
	// offsets select the correct half regardless of host endianness.
	#define a0 (((word32 *)(polyS+i*4))[2+NativeByteOrder::ToEnum()])
	#define a1 (*(((word32 *)(polyS+i*4))+3-NativeByteOrder::ToEnum()))	// workaround for GCC 3.2
	#define a2 (((word32 *)(polyS+i*4))[0+NativeByteOrder::ToEnum()])
	#define a3 (*(((word32 *)(polyS+i*4))+1-NativeByteOrder::ToEnum()))
	#define aHi ((polyS+i*4)[0])
	#define k0 (((word32 *)(polyS+i*4+2))[2+NativeByteOrder::ToEnum()])
	#define k1 (*(((word32 *)(polyS+i*4+2))+3-NativeByteOrder::ToEnum()))
	#define k2 (((word32 *)(polyS+i*4+2))[0+NativeByteOrder::ToEnum()])
	#define k3 (*(((word32 *)(polyS+i*4+2))+1-NativeByteOrder::ToEnum()))
	#define kHi ((polyS+i*4+2)[0])

		if (isFirstBlock)
		{
			isFirstBlock = false;
			if (m_isFirstBlock)
			{
				// Very first chunk of the message: accumulator = NH + key.
				m_isFirstBlock = false;
				for (i=0; i<=(size_t)T_128BitTag; i++)
				{
					word64 t = (word64)nh0[i] + k0;
					a0 = (word32)t;
					t = (t >> 32) + nh1[i] + k1;
					a1 = (word32)t;
					aHi = (t >> 32) + nh2[i] + kHi;
				}
				continue;
			}
		}
		// Poly step a = a*k + NH (mod 2^127-1) via 32x32 partial products;
		// the factor 2 on k3/k2/k1 terms implements the 2^128 = 2 wraparound.
		for (i=0; i<=(size_t)T_128BitTag; i++)
		{
			word64 p, t;
			word32 t2;

			p = MUL32(a3, 2*k3);
			p += nh2[i];
			p += MUL32(a0, k2);
			p += MUL32(a1, k1);
			p += MUL32(a2, k0);
			t2 = (word32)p;
			p >>= 32;
			p += MUL32(a0, k3);
			p += MUL32(a1, k2);
			p += MUL32(a2, k1);
			p += MUL32(a3, k0);
			t = (word64(word32(p) & 0x7fffffff) << 32) | t2;
			p >>= 31;
			p += nh0[i];
			p += MUL32(a0, k0);
			p += MUL32(a1, 2*k3);
			p += MUL32(a2, 2*k2);
			p += MUL32(a3, 2*k1);
			t2 = (word32)p;
			p >>= 32;
			p += nh1[i];
			p += MUL32(a0, k1);
			p += MUL32(a1, k0);
			p += MUL32(a2, 2*k3);
			p += MUL32(a3, 2*k2);
			a0 = t2;
			a1 = (word32)p;
			aHi = (p >> 32) + t;
		}

	#undef a0
	#undef a1
	#undef a2
	#undef a3
	#undef aHi
	#undef k0
	#undef k1
	#undef k2
	#undef k3
	#undef kHi
	#else		// #if VMAC_BOOL_32BIT
		if (isFirstBlock)
		{
			isFirstBlock = false;
			if (m_isFirstBlock)
			{
				m_isFirstBlock = false;
				// First chunk: accumulator = (NH mod 2^126) + key.
			#if VMAC_BOOL_WORD128
				#define first_poly_step(a, kh, kl, m)	a = (m & m126) + ((word128(kh) << 64) | kl)

				first_poly_step(a1, kh1, kl1, nhA);
				if (T_128BitTag)
					first_poly_step(a2, kh2, kl2, nhB);
			#else
				#define first_poly_step(ah, al, kh, kl, mh, ml)		{\
					mh &= m62;\
					ADD128(mh, ml, kh, kl);	\
					ah = mh; al = ml;}

				first_poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
				if (T_128BitTag)
					first_poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
			#endif
				continue;
			}
			else
			{
				// Resuming mid-message: reload the saved accumulator.
			#if VMAC_BOOL_WORD128
				a1 = (word128((polyS+0*4)[0]) << 64) | (polyS+0*4)[1];
			#else
				ah1=(polyS+0*4)[0]; al1=(polyS+0*4)[1];
			#endif
				if (T_128BitTag)
				{
				#if VMAC_BOOL_WORD128
					a2 = (word128((polyS+1*4)[0]) << 64) | (polyS+1*4)[1];
				#else
					ah2=(polyS+1*4)[0]; al2=(polyS+1*4)[1];
				#endif
				}
			}
		}

	#if VMAC_BOOL_WORD128
		// Poly step with native 128-bit arithmetic: schoolbook 64x64
		// multiplies, with the 2^128 = 2 (mod 2^127-1) wraparound folded in.
		#define poly_step(a, kh, kl, m)	\
		{ word128 t1, t2, t3, t4;\
			Multiply128(t2, a>>64, kl);\
			Multiply128(t3, a, kh);\
			Multiply128(t1, a, kl);\
			Multiply128(t4, a>>64, 2*kh);\
			t2 += t3;\
			t4 += t1;\
			t2 += t4>>64;\
			a = (word128(word64(t2)&m63) << 64) | word64(t4);\
			t2 *= 2;\
			a += m & m126;\
			a += t2>>64;}

		poly_step(a1, kh1, kl1, nhA);
		if (T_128BitTag)
			poly_step(a2, kh2, kl2, nhB);
	#else
		#define poly_step(ah, al, kh, kl, mh, ml)	\
		{ word64 t1h, t1l, t2h, t2l, t3h, t3l, z=0; \
			/* compute ab*cd, put bd into result registers */ \
			MUL64(t2h,t2l,ah,kl); \
			MUL64(t3h,t3l,al,kh); \
			MUL64(t1h,t1l,ah,2*kh); \
			MUL64(ah,al,al,kl); \
			/* add together ad + bc */ \
			ADD128(t2h,t2l,t3h,t3l); \
			/* add 2 * ac to result */ \
			ADD128(ah,al,t1h,t1l); \
			/* now (ah,al), (t2l,2*t2h) need summing */ \
			/* first add the high registers, carrying into t2h */ \
			ADD128(t2h,ah,z,t2l); \
			/* double t2h and add top bit of ah */ \
			t2h += t2h + (ah >> 63); \
			ah &= m63; \
			/* now add the low registers */ \
			mh &= m62; \
			ADD128(ah,al,mh,ml); \
			ADD128(ah,al,z,t2h); \
		}

		poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
		if (T_128BitTag)
			poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
	#endif
	#endif		// #if VMAC_BOOL_32BIT
	} while (blocksRemainingInWord64);

	// Persist the accumulators back into m_polyState for the next call.
	#if VMAC_BOOL_WORD128
		(polyS+0*4)[0]=word64(a1>>64); (polyS+0*4)[1]=word64(a1);
		if (T_128BitTag)
		{
			(polyS+1*4)[0]=word64(a2>>64); (polyS+1*4)[1]=word64(a2);
		}
	#elif !VMAC_BOOL_32BIT
		(polyS+0*4)[0]=ah1; (polyS+0*4)[1]=al1;
		if (T_128BitTag)
		{
			(polyS+1*4)[0]=ah2; (polyS+1*4)[1]=al2;
		}
	#endif
}

// Dispatches a VHASH pass to the SSE2 assembly (when available at runtime)
// or to the portable template above, instantiated per tag width.
inline void VMAC_Base::VHASH_Update(const word64 *data, size_t blocksRemainingInWord64)
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
	if (HasSSE2())
	{
		// The asm version handles one hash instance per call; run it twice
		// for 128-bit tags, then clear the first-block flag it consumed.
		VHASH_Update_SSE2(data, blocksRemainingInWord64, 0);
		if (m_is128)
			VHASH_Update_SSE2(data, blocksRemainingInWord64, 1);
		m_isFirstBlock = false;
	}
	else
#endif
	{
#if defined(_MSC_VER) && _MSC_VER < 1300
		VHASH_Update_Template(data, blocksRemainingInWord64);
#else
		if (m_is128)
			VHASH_Update_Template<true>(data, blocksRemainingInWord64);
		else
			VHASH_Update_Template<false>(data, blocksRemainingInWord64);
#endif
	}
}

// Hashes all complete L1-key-length chunks in the buffer; returns the number
// of leftover bytes the caller must retain for the next call.
size_t VMAC_Base::HashMultipleBlocks(const word64 *data, size_t length)
{
	size_t remaining = ModPowerOf2(length, m_L1KeyLength);
	VHASH_Update(data, (length-remaining)/8);
	return remaining;
}

// Third-level hash: folds the 127-bit poly accumulator (plus the message bit
// length) fully mod 2^127-1, then combines with the L3 key pair via a
// multiply mod p64 = 2^64-257, yielding one 64-bit tag word.
static word64 L3Hash(const word64 *input, const word64 *l3Key, size_t len)
{
	word64 rh, rl, t, z=0;
	word64 p1 = input[0], p2 = input[1];
	word64 k1 = l3Key[0], k2 = l3Key[1];

	/* fully reduce (p1,p2)+(len,0) mod p127 */
	t = p1 >> 63;
	p1 &= m63;
	ADD128(p1, p2, len, t);
	/* At this point, (p1,p2) is at most 2^127+(len<<64) */
	t = (p1 > m63) + ((p1 == m63) & (p2 == m64));
	ADD128(p1, p2, z, t);
	p1 &= m63;

	/* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
	t = p1 + (p2 >> 32);
	t += (t >> 32);
	t += (word32)t > 0xfffffffeU;
	p1 += (t >> 32);
	p2 += (p1 << 32);

	/* compute (p1+k1)%p64 and (p2+k2)%p64 */
	// branch-free conditional subtraction of p64: adding 257 == subtracting
	// p64 mod 2^64, applied only when the addition wrapped.
	p1 += k1;
	p1 += (0 - (p1 < k1)) & 257;
	p2 += k2;
	p2 += (0 - (p2 < k2)) & 257;

	/* compute (p1+k1)*(p2+k2)%p64 */
	MUL64(rh, rl, p1, p2);
	t = rh >> 56;
	ADD128(t, rl, z, rh);
	rh <<= 8;
	ADD128(t, rl, z, rh);
	t += t << 8;
	rl += t;
	rl += (0 - (rl < t)) & 257;
	rl += (0 - (rl > p64-1)) & 257;
	return rl;
}

// Finalizes the MAC: hashes any buffered partial chunk, applies L3Hash per
// instance, adds the nonce-derived pad, and writes out the (possibly
// truncated) big-endian tag.
void VMAC_Base::TruncatedFinal(byte *mac, size_t size)
{
	size_t len = ModPowerOf2(GetBitCountLo()/8, m_L1KeyLength);

	if (len)
	{
		// Zero-pad the final partial chunk to a 16-byte boundary and hash it.
		memset(m_data()+len, 0, (0-len)%16);
		VHASH_Update(DataBuf(), ((len+15)/16)*2);
		len *= 8;	// convert to bits
	}
	else if (m_isFirstBlock)
	{
		// special case for empty string: use the poly key itself as the
		// accumulator (the state words [2],[3] hold the key; see SetKey).
		m_polyState()[0] = m_polyState()[2];
		m_polyState()[1] = m_polyState()[3];
		if (m_is128)
		{
			m_polyState()[4] = m_polyState()[6];
			m_polyState()[5] = m_polyState()[7];
		}
	}

	if (m_is128)
	{
		word64 t[2];
		t[0] = L3Hash(m_polyState(), m_l3Key(), len) + GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad());
		t[1] = L3Hash(m_polyState()+4, m_l3Key()+2, len) + GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad()+8);
		if (size == 16)
		{
			PutWord(false, BIG_ENDIAN_ORDER, mac, t[0]);
			PutWord(false, BIG_ENDIAN_ORDER, mac+8, t[1]);
		}
		else
		{
			// Truncated tag: serialize big-endian, then copy the prefix.
			t[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, t[0]);
			t[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, t[1]);
			memcpy(mac, t, size);
		}
	}
	else
	{
		word64 t = L3Hash(m_polyState(), m_l3Key(), len);
		// The low bit of the nonce's last byte selects which pad half to add
		// (the pad was computed with that bit masked off; see Resynchronize).
		t += GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad() + (m_nonce()[IVSize()-1]&1) * 8);
		if (size == 8)
			PutWord(false, BIG_ENDIAN_ORDER, mac, t);
		else
		{
			t = ConditionalByteReverse(BIG_ENDIAN_ORDER, t);
			memcpy(mac, &t, size);
		}
	}
}

NAMESPACE_END