1// vmac.cpp - written and placed in the public domain by Wei Dai
2// based on Ted Krovetz's public domain vmac.c and draft-krovetz-vmac-01.txt
3
4#include "pch.h"
5#include "vmac.h"
6#include "argnames.h"
7#include "cpu.h"
8
9NAMESPACE_BEGIN(CryptoPP)
10
11#if defined(_MSC_VER) && !CRYPTOPP_BOOL_SLOW_WORD64
12#include <intrin.h>
13#endif
14
// Using defined() inside a macro replacement list is undefined behavior
// (C++ [cpp.cond]; MSVC warns C5105), so evaluate the condition here and
// define a plain 0/1 flag that later `#if VMAC_BOOL_WORD128` tests can use.
#if defined(CRYPTOPP_WORD128_AVAILABLE) && !defined(CRYPTOPP_X64_ASM_AVAILABLE)
	#define VMAC_BOOL_WORD128 1
#else
	#define VMAC_BOOL_WORD128 0
#endif
#ifdef __BORLANDC__
#define const	// Turbo C++ 2006 workaround
#endif
// Constants from draft-krovetz-vmac-01 used by the NH/polynomial/L3 stages.
static const word64 p64   = W64LIT(0xfffffffffffffeff);  /* 2^64 - 257 prime  */
static const word64 m62   = W64LIT(0x3fffffffffffffff);  /* 62-bit mask       */
static const word64 m63   = W64LIT(0x7fffffffffffffff);  /* 63-bit mask       */
static const word64 m64   = W64LIT(0xffffffffffffffff);  /* 64-bit mask       */
static const word64 mpoly = W64LIT(0x1fffffff1fffffff);  /* Poly key mask     */
#ifdef __BORLANDC__
#undef const
#endif
#if VMAC_BOOL_WORD128
#ifdef __powerpc__
// workaround GCC Bug 31690: ICE with const __uint128_t and C++ front-end
#define m126				((word128(m62)<<64)|m64)
#else
static const word128 m126 = (word128(m62)<<64)|m64;		 /* 126-bit mask      */
#endif
#endif
35
// Derive all VMAC subkeys (NH key, polynomial key, L3 key) from the user key
// by encrypting tagged counter blocks, then process the initial nonce.
// Throws InvalidArgument for unsupported digest sizes or L1 key lengths.
void VMAC_Base::UncheckedSetKey(const byte *userKey, unsigned int keylength, const NameValuePairs &params)
{
	// Tag size: 8 bytes (VMAC-64) or 16 bytes (VMAC-128).
	int digestLength = params.GetIntValueWithDefault(Name::DigestSize(), DefaultDigestSize());
	if (digestLength != 8 && digestLength != 16)
		throw InvalidArgument("VMAC: DigestSize must be 8 or 16");
	m_is128 = digestLength == 16;

	// L1 key length (bytes) controls the NH block size; default 128.
	m_L1KeyLength = params.GetIntValueWithDefault(Name::L1KeyLength(), 128);
	if (m_L1KeyLength <= 0 || m_L1KeyLength % 128 != 0)
		throw InvalidArgument("VMAC: L1KeyLength must be a positive multiple of 128");

	AllocateBlocks();

	// All subkeys are generated by encrypting counter blocks: in[0] tags the
	// subkey being derived (0x80 = NH, 0xC0 = poly, 0xE0 = L3) and in[15]
	// is the per-subkey block counter.
	BlockCipher &cipher = AccessCipher();
	cipher.SetKey(userKey, keylength, params);
	unsigned int blockSize = cipher.BlockSize();
	unsigned int blockSizeInWords = blockSize / sizeof(word64);
	SecBlock<word64> out(blockSizeInWords);
	SecByteBlock in;
	in.CleanNew(blockSize);
	size_t i;

	/* Fill nh key */
	in[0] = 0x80;
	cipher.AdvancedProcessBlocks(in, NULL, (byte *)m_nhKey(), m_nhKeySize()*sizeof(word64), cipher.BT_InBlockIsCounter);
	// Convert the big-endian cipher output into native word64s in place.
	ConditionalByteReverse<word64>(BIG_ENDIAN_ORDER, m_nhKey(), m_nhKey(), m_nhKeySize()*sizeof(word64));

	/* Fill poly key */
	in[0] = 0xC0;
	in[15] = 0;
	// One (kh,kl) pair per tag half; slots [i*4+2], [i*4+3] of m_polyState
	// hold the key, masked with mpoly per the VMAC key derivation.
	for (i = 0; i <= (size_t)m_is128; i++)
	{
		cipher.ProcessBlock(in, out.BytePtr());
		m_polyState()[i*4+2] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()) & mpoly;
		m_polyState()[i*4+3]  = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()+8) & mpoly;
		in[15]++;
	}

	/* Fill ip key */
	in[0] = 0xE0;
	in[15] = 0;
	word64 *l3Key = m_l3Key();
	for (i = 0; i <= (size_t)m_is128; i++)
		// Rejection sampling: regenerate until both words are below the
		// prime p64 = 2^64-257, so the L3 key is uniform mod p64.
		do
		{
			cipher.ProcessBlock(in, out.BytePtr());
			l3Key[i*2+0] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr());
			l3Key[i*2+1] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()+8);
			in[15]++;
		} while ((l3Key[i*2+0] >= p64) || (l3Key[i*2+1] >= p64));

	// Re-keying invalidates the cached pad; then set up the initial nonce.
	m_padCached = false;
	size_t nonceLength;
	const byte *nonce = GetIVAndThrowIfInvalid(params, nonceLength);
	Resynchronize(nonce, (int)nonceLength);
}
92
93void VMAC_Base::GetNextIV(RandomNumberGenerator &rng, byte *IV)
94{
95	SimpleKeyingInterface::GetNextIV(rng, IV);
96	IV[0] &= 0x7f;
97}
98
// Install a new nonce and compute the cipher "pad" used in TruncatedFinal.
// For 64-bit tags the pad is derived from the nonce with its last bit
// cleared, so it is cached and reused when successive nonces differ only in
// that bit (e.g. an incrementing counter nonce).
void VMAC_Base::Resynchronize(const byte *nonce, int len)
{
	size_t length = ThrowIfInvalidIVLength(len);
	size_t s = IVSize();
	byte *storedNonce = m_nonce();

	if (m_is128)
	{
		// 128-bit tag: pad = E(zero-padded nonce); no caching.
		memset(storedNonce, 0, s-length);
		memcpy(storedNonce+s-length, nonce, length);
		AccessCipher().ProcessBlock(storedNonce, m_pad());
	}
	else
	{
		// The cached pad stays valid only if the new nonce equals the stored
		// one everywhere except the final bit, and the implicit zero padding
		// in front of a shorter nonce is still all zero.
		if (m_padCached && (storedNonce[s-1] | 1) == (nonce[length-1] | 1))
		{
			m_padCached = VerifyBufsEqual(storedNonce+s-length, nonce, length-1);
			for (size_t i=0; m_padCached && i<s-length; i++)
				m_padCached = (storedNonce[i] == 0);
		}
		if (!m_padCached)
		{
			// Encrypt the nonce with its last bit forced to zero;
			// TruncatedFinal selects which pad half to add using that bit.
			memset(storedNonce, 0, s-length);
			memcpy(storedNonce+s-length, nonce, length-1);
			storedNonce[s-1] = nonce[length-1] & 0xfe;
			AccessCipher().ProcessBlock(storedNonce, m_pad());
			m_padCached = true;
		}
		// Store the full nonce, including the pad-selection bit.
		storedNonce[s-1] = nonce[length-1];
	}
	m_isFirstBlock = true;
	Restart();
}
132
// Unused by VMAC: hashing goes through HashMultipleBlocks/VHASH_Update
// rather than the generic endian-corrected block path, so reaching this
// function indicates a logic error.
void VMAC_Base::HashEndianCorrectedBlock(const word64 *data)
{
	assert(false);
	throw 0;
}
138
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
#pragma warning(disable: 4731)	// frame pointer register 'ebp' modified by inline assembly code
// SSE2/MMX x86 implementation of one VHASH pass: for each L1 block of the
// input, an NH inner hash followed by a polynomial accumulation step on
// 32-bit limbs.  tagPart (0 or 1) selects which half of a 128-bit tag is
// being computed; it offsets into the NH key and the m_polyState entry.
void
#ifdef __GNUC__
__attribute__ ((noinline))		// Intel Compiler 9.1 workaround
#endif
VMAC_Base::VHASH_Update_SSE2(const word64 *data, size_t blocksRemainingInWord64, int tagPart)
{
	const word64 *nhK = m_nhKey();
	word64 *polyS = m_polyState();
	word32 L1KeyLength = m_L1KeyLength;

#ifdef __GNUC__
	// Save/restore ebx manually (it may be the PIC register); the remaining
	// register assignments are made by the constraint list at the bottom.
	word32 temp;
	__asm__ __volatile__
	(
	AS2(	mov		%%ebx, %0)
	AS2(	mov		%1, %%ebx)
	".intel_syntax noprefix;"
#else
	#if _MSC_VER < 1300 || defined(__INTEL_COMPILER)
	char isFirstBlock = m_isFirstBlock;
	AS2(	mov		ebx, [L1KeyLength])
	AS2(	mov		dl, [isFirstBlock])
	#else
	AS2(	mov		ecx, this)
	AS2(	mov		ebx, [ecx+m_L1KeyLength])
	AS2(	mov		dl, [ecx+m_isFirstBlock])
	#endif
	AS2(	mov		eax, tagPart)
	AS2(	shl		eax, 4)
	AS2(	mov		edi, nhK)
	AS2(	add		edi, eax)
	AS2(	add		eax, eax)
	AS2(	add		eax, polyS)

	AS2(	mov		esi, data)
	AS2(	mov		ecx, blocksRemainingInWord64)
#endif

	// Register roles from here on (both compiler paths): esi = data,
	// edi = NH key, ecx = word64s remaining, eax = polyState for this tag
	// part, ebx = L1KeyLength in bytes, dl = m_isFirstBlock flag.
	AS2(	shr		ebx, 3)
	AS1(	push	ebp)
	AS2(	sub		esp, 12)
	// Outer loop (label 4): one NH pass over up to one L1 block, then one
	// polynomial step.
	ASL(4)
	AS2(	mov		ebp, ebx)
	AS2(	cmp		ecx, ebx)
	AS2(	cmovl	ebp, ecx)
	AS2(	sub		ecx, ebp)
	AS2(	lea		ebp, [edi+8*ebp])	// end of nhK
	AS2(	movq	mm6, [esi])
	AS2(	paddq	mm6, [edi])
	AS2(	movq	mm5, [esi+8])
	AS2(	paddq	mm5, [edi+8])
	AS2(	add		esi, 16)
	AS2(	add		edi, 16)
	AS2(	movq	mm4, mm6)
	ASS(	pshufw	mm2, mm6, 1, 0, 3, 2)
	AS2(	pmuludq	mm6, mm5)
	ASS(	pshufw	mm3, mm5, 1, 0, 3, 2)
	AS2(	pmuludq	mm5, mm2)
	AS2(	pmuludq	mm2, mm3)
	AS2(	pmuludq	mm3, mm4)
	AS2(	pxor	mm7, mm7)
	AS2(	movd	[esp], mm6)
	AS2(	psrlq	mm6, 32)
	AS2(	movd	[esp+4], mm5)
	AS2(	psrlq	mm5, 32)
	AS2(	cmp		edi, ebp)
	ASJ(	je,		1, f)
	// NH inner loop (label 0): consumes 16 bytes of data + key per
	// iteration, accumulating the 32x32 partial products in mm registers
	// and the [esp] spill slots.
	ASL(0)
	AS2(	movq	mm0, [esi])
	AS2(	paddq	mm0, [edi])
	AS2(	movq	mm1, [esi+8])
	AS2(	paddq	mm1, [edi+8])
	AS2(	add		esi, 16)
	AS2(	add		edi, 16)
	AS2(	movq	mm4, mm0)
	AS2(	paddq	mm5, mm2)
	ASS(	pshufw	mm2, mm0, 1, 0, 3, 2)
	AS2(	pmuludq	mm0, mm1)
	AS2(	movd	[esp+8], mm3)
	AS2(	psrlq	mm3, 32)
	AS2(	paddq	mm5, mm3)
	ASS(	pshufw	mm3, mm1, 1, 0, 3, 2)
	AS2(	pmuludq	mm1, mm2)
	AS2(	pmuludq	mm2, mm3)
	AS2(	pmuludq	mm3, mm4)
	AS2(	movd	mm4, [esp])
	AS2(	paddq	mm7, mm4)
	AS2(	movd	mm4, [esp+4])
	AS2(	paddq	mm6, mm4)
	AS2(	movd	mm4, [esp+8])
	AS2(	paddq	mm6, mm4)
	AS2(	movd	[esp], mm0)
	AS2(	psrlq	mm0, 32)
	AS2(	paddq	mm6, mm0)
	AS2(	movd	[esp+4], mm1)
	AS2(	psrlq	mm1, 32)
	AS2(	paddq	mm5, mm1)
	AS2(	cmp		edi, ebp)
	ASJ(	jne,	0, b)
	// Label 1: drain the last pending partial products into mm5/mm6/mm7.
	ASL(1)
	AS2(	paddq	mm5, mm2)
	AS2(	movd	[esp+8], mm3)
	AS2(	psrlq	mm3, 32)
	AS2(	paddq	mm5, mm3)
	AS2(	movd	mm4, [esp])
	AS2(	paddq	mm7, mm4)
	AS2(	movd	mm4, [esp+4])
	AS2(	paddq	mm6, mm4)
	AS2(	movd	mm4, [esp+8])
	AS2(	paddq	mm6, mm4)
	AS2(	lea		ebp, [8*ebx])
	AS2(	sub		edi, ebp)		// reset edi to start of nhK

	// Carry-propagate the NH result into ([esp],[esp+4],mm5) and mask the
	// top two bits of the high word.
	AS2(	movd	[esp], mm7)
	AS2(	psrlq	mm7, 32)
	AS2(	paddq	mm6, mm7)
	AS2(	movd	[esp+4], mm6)
	AS2(	psrlq	mm6, 32)
	AS2(	paddq	mm5, mm6)
	AS2(	psllq	mm5, 2)
	AS2(	psrlq	mm5, 2)

// 32-bit limb layout of the polynomial accumulator (a0..a3) and key
// (k0..k3) within this tag part's polyState entry.
#define a0 [eax+2*4]
#define a1 [eax+3*4]
#define a2 [eax+0*4]
#define a3 [eax+1*4]
#define k0 [eax+2*8+2*4]
#define k1 [eax+2*8+3*4]
#define k2 [eax+2*8+0*4]
#define k3 [eax+2*8+1*4]
	AS2(	test	dl, dl)
	ASJ(	jz,		2, f)
	// First block (dl != 0): seed the accumulator with NH result + key;
	// no multiplication yet.
	AS2(	movd	mm1, k0)
	AS2(	movd	mm0, [esp])
	AS2(	paddq	mm0, mm1)
	AS2(	movd	a0, mm0)
	AS2(	psrlq	mm0, 32)
	AS2(	movd	mm1, k1)
	AS2(	movd	mm2, [esp+4])
	AS2(	paddq	mm1, mm2)
	AS2(	paddq	mm0, mm1)
	AS2(	movd	a1, mm0)
	AS2(	psrlq	mm0, 32)
	AS2(	paddq	mm5, k2)
	AS2(	paddq	mm0, mm5)
	AS2(	movq	a2, mm0)
	AS2(	xor		edx, edx)
	ASJ(	jmp,	3, f)
	// Subsequent blocks (label 2): accumulator = accumulator*key + NH
	// result, computed as a schoolbook product of 32-bit limbs.
	ASL(2)
	AS2(	movd	mm0, a3)
	AS2(	movq	mm4, mm0)
	AS2(	pmuludq	mm0, k3)		// a3*k3
	AS2(	movd	mm1, a0)
	AS2(	pmuludq	mm1, k2)		// a0*k2
	AS2(	movd	mm2, a1)
	AS2(	movd	mm6, k1)
	AS2(	pmuludq	mm2, mm6)		// a1*k1
	AS2(	movd	mm3, a2)
	AS2(	psllq	mm0, 1)
	AS2(	paddq	mm0, mm5)
	AS2(	movq	mm5, mm3)
	AS2(	movd	mm7, k0)
	AS2(	pmuludq	mm3, mm7)		// a2*k0
	AS2(	pmuludq	mm4, mm7)		// a3*k0
	AS2(	pmuludq	mm5, mm6)		// a2*k1
	AS2(	paddq	mm0, mm1)
	AS2(	movd	mm1, a1)
	AS2(	paddq	mm4, mm5)
	AS2(	movq	mm5, mm1)
	AS2(	pmuludq	mm1, k2)		// a1*k2
	AS2(	paddq	mm0, mm2)
	AS2(	movd	mm2, a0)
	AS2(	paddq	mm0, mm3)
	AS2(	movq	mm3, mm2)
	AS2(	pmuludq	mm2, k3)		// a0*k3
	AS2(	pmuludq	mm3, mm7)		// a0*k0
	AS2(	movd	[esp+8], mm0)
	AS2(	psrlq	mm0, 32)
	AS2(	pmuludq	mm7, mm5)		// a1*k0
	AS2(	pmuludq	mm5, k3)		// a1*k3
	AS2(	paddq	mm0, mm1)
	AS2(	movd	mm1, a2)
	AS2(	pmuludq	mm1, k2)		// a2*k2
	AS2(	paddq	mm0, mm2)
	AS2(	paddq	mm0, mm4)
	AS2(	movq	mm4, mm0)
	AS2(	movd	mm2, a3)
	AS2(	pmuludq	mm2, mm6)		// a3*k1
	AS2(	pmuludq	mm6, a0)		// a0*k1
	AS2(	psrlq	mm0, 31)
	AS2(	paddq	mm0, mm3)
	AS2(	movd	mm3, [esp])
	AS2(	paddq	mm0, mm3)
	AS2(	movd	mm3, a2)
	AS2(	pmuludq	mm3, k3)		// a2*k3
	AS2(	paddq	mm5, mm1)
	AS2(	movd	mm1, a3)
	AS2(	pmuludq	mm1, k2)		// a3*k2
	AS2(	paddq	mm5, mm2)
	AS2(	movd	mm2, [esp+4])
	AS2(	psllq	mm5, 1)
	AS2(	paddq	mm0, mm5)
	AS2(	psllq	mm4, 33)
	AS2(	movd	a0, mm0)
	AS2(	psrlq	mm0, 32)
	AS2(	paddq	mm6, mm7)
	AS2(	movd	mm7, [esp+8])
	AS2(	paddq	mm0, mm6)
	AS2(	paddq	mm0, mm2)
	AS2(	paddq	mm3, mm1)
	AS2(	psllq	mm3, 1)
	AS2(	paddq	mm0, mm3)
	AS2(	psrlq	mm4, 1)
	AS2(	movd	a1, mm0)
	AS2(	psrlq	mm0, 32)
	AS2(	por		mm4, mm7)
	AS2(	paddq	mm0, mm4)
	AS2(	movq	a2, mm0)
#undef a0
#undef a1
#undef a2
#undef a3
#undef k0
#undef k1
#undef k2
#undef k3

	// Label 3: loop back while input remains, then clean up the stack,
	// restore ebp, and leave MMX state with emms.
	ASL(3)
	AS2(	test	ecx, ecx)
	ASJ(	jnz,	4, b)

	AS2(	add		esp, 12)
	AS1(	pop		ebp)
	AS1(	emms)
#ifdef __GNUC__
	".att_syntax prefix;"
	AS2(	mov	%0, %%ebx)
		: "=m" (temp)
		: "m" (L1KeyLength), "c" (blocksRemainingInWord64), "S" (data), "D" (nhK+tagPart*2), "d" (m_isFirstBlock), "a" (polyS+tagPart*4)
		: "memory", "cc"
	);
#endif
}
#endif
385
// Portable 64x64->128 multiply/accumulate primitives used by
// VHASH_Update_Template, chosen by platform capability:  DeclareNH declares
// the NH accumulator limb(s), AccumulateNH folds a product of two 64-bit
// values into them, MUL64 is a full 64x64->128 multiply, and ADD128 adds
// 128-bit quantities represented as (high,low) word64 pairs.
#if VMAC_BOOL_WORD128
	#define DeclareNH(a) word128 a=0
	#define MUL64(rh,rl,i1,i2) {word128 p = word128(i1)*(i2); rh = word64(p>>64); rl = word64(p);}
	#define AccumulateNH(a, b, c) a += word128(b)*(c)
	#define Multiply128(r, i1, i2) r = word128(word64(i1)) * word64(i2)
#else
	#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
		#define MUL32(a, b) __emulu(word32(a), word32(b))
	#else
		#define MUL32(a, b) ((word64)((word32)(a)) * (word32)(b))
	#endif
	#if defined(CRYPTOPP_X64_ASM_AVAILABLE)
		#define DeclareNH(a)			word64 a##0=0, a##1=0
		#define MUL64(rh,rl,i1,i2)		asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "g"(i2) : "cc");
		#define AccumulateNH(a, b, c)	asm ("mulq %3; addq %%rax, %0; adcq %%rdx, %1" : "+r"(a##0), "+r"(a##1) : "a"(b), "g"(c) : "%rdx", "cc");
		#define ADD128(rh,rl,ih,il)     asm ("addq %3, %1; adcq %2, %0" : "+r"(rh),"+r"(rl) : "r"(ih),"r"(il) : "cc");
	#elif defined(_MSC_VER) && !CRYPTOPP_BOOL_SLOW_WORD64
		#define DeclareNH(a) word64 a##0=0, a##1=0
		#define MUL64(rh,rl,i1,i2)   (rl) = _umul128(i1,i2,&(rh));
		#define AccumulateNH(a, b, c)	{\
			word64 ph, pl;\
			pl = _umul128(b,c,&ph);\
			a##0 += pl;\
			a##1 += ph + (a##0 < pl);}
	#else
		// No fast 64-bit multiply: fall back to 32-bit limb arithmetic
		// (three-limb NH accumulator).
		#define VMAC_BOOL_32BIT 1
		#define DeclareNH(a) word64 a##0=0, a##1=0, a##2=0
		#define MUL64(rh,rl,i1,i2)                                               \
			{   word64 _i1 = (i1), _i2 = (i2);                                 \
				word64 m1= MUL32(_i1,_i2>>32);                                 \
				word64 m2= MUL32(_i1>>32,_i2);                                 \
				rh         = MUL32(_i1>>32,_i2>>32);                             \
				rl         = MUL32(_i1,_i2);                                     \
				ADD128(rh,rl,(m1 >> 32),(m1 << 32));                             \
				ADD128(rh,rl,(m2 >> 32),(m2 << 32));                             \
			}
		#define AccumulateNH(a, b, c)	{\
			word64 p = MUL32(b, c);\
			a##1 += word32((p)>>32);\
			a##0 += word32(p);\
			p = MUL32((b)>>32, c);\
			a##2 += word32((p)>>32);\
			a##1 += word32(p);\
			p = MUL32((b)>>32, (c)>>32);\
			a##2 += p;\
			p = MUL32(b, (c)>>32);\
			a##1 += word32(p);\
			a##2 += word32(p>>32);}
	#endif
#endif
#ifndef VMAC_BOOL_32BIT
	#define VMAC_BOOL_32BIT 0
#endif
#ifndef ADD128
	#define ADD128(rh,rl,ih,il)                                          \
		{   word64 _il = (il);                                         \
			(rl) += (_il);                                               \
			(rh) += (ih) + ((rl) < (_il));                               \
		}
#endif
446
// Portable VHASH: NH-hash the input one L1-key-sized block at a time and
// fold each block's result into the polynomial accumulator(s) stored in
// m_polyState.  T_128BitTag selects whether the second accumulator (for
// the high half of a 128-bit tag) is maintained; on VC6 the template
// parameter is replaced by the runtime flag m_is128.
#if !(defined(_MSC_VER) && _MSC_VER < 1300)
template <bool T_128BitTag>
#endif
void VMAC_Base::VHASH_Update_Template(const word64 *data, size_t blocksRemainingInWord64)
{
	// One NH step: add two key words to two endian-corrected data words and
	// accumulate their 64x64 product (and the shifted-key variant for the
	// second tag half when T_128BitTag).
	#define INNER_LOOP_ITERATION(j)	{\
		word64 d0 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+0]);\
		word64 d1 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+1]);\
		AccumulateNH(nhA, d0+nhK[i+2*j+0], d1+nhK[i+2*j+1]);\
		if (T_128BitTag)\
			AccumulateNH(nhB, d0+nhK[i+2*j+2], d1+nhK[i+2*j+3]);\
		}

#if (defined(_MSC_VER) && _MSC_VER < 1300)
	bool T_128BitTag = m_is128;
#endif
	size_t L1KeyLengthInWord64 = m_L1KeyLength / 8;
	size_t innerLoopEnd = L1KeyLengthInWord64;
	const word64 *nhK = m_nhKey();
	word64 *polyS = m_polyState();
	bool isFirstBlock = true;
	size_t i;

	// Load the polynomial key pair(s) (kh,kl) once per call.
	#if !VMAC_BOOL_32BIT
		#if VMAC_BOOL_WORD128
			word128 a1, a2;
		#else
			word64 ah1, al1, ah2, al2;
		#endif
		word64 kh1, kl1, kh2, kl2;
		kh1=(polyS+0*4+2)[0]; kl1=(polyS+0*4+2)[1];
		if (T_128BitTag)
		{
			kh2=(polyS+1*4+2)[0]; kl2=(polyS+1*4+2)[1];
		}
	#endif

	// Process the input one L1 block (or final partial block) at a time.
	do
	{
		DeclareNH(nhA);
		DeclareNH(nhB);

		i = 0;
		// Last (possibly partial) block: handle the non-multiple-of-8 tail
		// first, then trim the unrolled loop's bound.
		if (blocksRemainingInWord64 < L1KeyLengthInWord64)
		{
			if (blocksRemainingInWord64 % 8)
			{
				innerLoopEnd = blocksRemainingInWord64 % 8;
				for (; i<innerLoopEnd; i+=2)
					INNER_LOOP_ITERATION(0);
			}
			innerLoopEnd = blocksRemainingInWord64;
		}
		// Main NH accumulation, unrolled 8 word64s per iteration.
		for (; i<innerLoopEnd; i+=8)
		{
			INNER_LOOP_ITERATION(0);
			INNER_LOOP_ITERATION(1);
			INNER_LOOP_ITERATION(2);
			INNER_LOOP_ITERATION(3);
		}
		blocksRemainingInWord64 -= innerLoopEnd;
		data += innerLoopEnd;

		#if VMAC_BOOL_32BIT
			// Carry-propagate the three-limb NH accumulators into
			// (nh0, nh1, nh2) per tag half; nh2 is masked to 62 bits.
			word32 nh0[2],  nh1[2];
			word64 nh2[2];

			nh0[0] = word32(nhA0);
			nhA1 += (nhA0 >> 32);
			nh1[0] = word32(nhA1);
			nh2[0] = (nhA2 + (nhA1 >> 32)) & m62;

			if (T_128BitTag)
			{
				nh0[1] = word32(nhB0);
				nhB1 += (nhB0 >> 32);
				nh1[1] = word32(nhB1);
				nh2[1] = (nhB2 + (nhB1 >> 32)) & m62;
			}

			// Byte-order-aware 32-bit limb views of accumulator and key.
			#define a0 (((word32 *)(polyS+i*4))[2+NativeByteOrder::ToEnum()])
			#define a1 (*(((word32 *)(polyS+i*4))+3-NativeByteOrder::ToEnum()))		// workaround for GCC 3.2
			#define a2 (((word32 *)(polyS+i*4))[0+NativeByteOrder::ToEnum()])
			#define a3 (*(((word32 *)(polyS+i*4))+1-NativeByteOrder::ToEnum()))
			#define aHi ((polyS+i*4)[0])
			#define k0 (((word32 *)(polyS+i*4+2))[2+NativeByteOrder::ToEnum()])
			#define k1 (*(((word32 *)(polyS+i*4+2))+3-NativeByteOrder::ToEnum()))
			#define k2 (((word32 *)(polyS+i*4+2))[0+NativeByteOrder::ToEnum()])
			#define k3 (*(((word32 *)(polyS+i*4+2))+1-NativeByteOrder::ToEnum()))
			#define kHi ((polyS+i*4+2)[0])

			// Very first block of the message: seed accumulator with
			// NH result + key rather than multiplying.
			if (isFirstBlock)
			{
				isFirstBlock = false;
				if (m_isFirstBlock)
				{
					m_isFirstBlock = false;
					for (i=0; i<=(size_t)T_128BitTag; i++)
					{
						word64 t = (word64)nh0[i] + k0;
						a0 = (word32)t;
						t = (t >> 32) + nh1[i] + k1;
						a1 = (word32)t;
						aHi = (t >> 32) + nh2[i] + kHi;
					}
					continue;
				}
			}
			// Polynomial step in 32-bit limbs: a = a*k + nh.
			for (i=0; i<=(size_t)T_128BitTag; i++)
			{
				word64 p, t;
				word32 t2;

				p = MUL32(a3, 2*k3);
				p += nh2[i];
				p += MUL32(a0, k2);
				p += MUL32(a1, k1);
				p += MUL32(a2, k0);
				t2 = (word32)p;
				p >>= 32;
				p += MUL32(a0, k3);
				p += MUL32(a1, k2);
				p += MUL32(a2, k1);
				p += MUL32(a3, k0);
				t = (word64(word32(p) & 0x7fffffff) << 32) | t2;
				p >>= 31;
				p += nh0[i];
				p += MUL32(a0, k0);
				p += MUL32(a1, 2*k3);
				p += MUL32(a2, 2*k2);
				p += MUL32(a3, 2*k1);
				t2 = (word32)p;
				p >>= 32;
				p += nh1[i];
				p += MUL32(a0, k1);
				p += MUL32(a1, k0);
				p += MUL32(a2, 2*k3);
				p += MUL32(a3, 2*k2);
				a0 = t2;
				a1 = (word32)p;
				aHi = (p >> 32) + t;
			}

			#undef a0
			#undef a1
			#undef a2
			#undef a3
			#undef aHi
			#undef k0
			#undef k1
			#undef k2
			#undef k3
			#undef kHi
		#else		// #if VMAC_BOOL_32BIT
			if (isFirstBlock)
			{
				isFirstBlock = false;
				if (m_isFirstBlock)
				{
					// Very first block of the message: seed accumulator
					// with NH result + key rather than multiplying.
					m_isFirstBlock = false;
					#if VMAC_BOOL_WORD128
						#define first_poly_step(a, kh, kl, m)	a = (m & m126) + ((word128(kh) << 64) | kl)

						first_poly_step(a1, kh1, kl1, nhA);
						if (T_128BitTag)
							first_poly_step(a2, kh2, kl2, nhB);
					#else
						#define first_poly_step(ah, al, kh, kl, mh, ml)		{\
							mh &= m62;\
							ADD128(mh, ml, kh, kl);	\
							ah = mh; al = ml;}

						first_poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
						if (T_128BitTag)
							first_poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
					#endif
					continue;
				}
				else
				{
					// Resuming mid-message: load the accumulator saved in
					// m_polyState by the previous call.
					#if VMAC_BOOL_WORD128
						a1 = (word128((polyS+0*4)[0]) << 64) | (polyS+0*4)[1];
					#else
						ah1=(polyS+0*4)[0]; al1=(polyS+0*4)[1];
					#endif
					if (T_128BitTag)
					{
						#if VMAC_BOOL_WORD128
							a2 = (word128((polyS+1*4)[0]) << 64) | (polyS+1*4)[1];
						#else
							ah2=(polyS+1*4)[0]; al2=(polyS+1*4)[1];
						#endif
					}
				}
			}

			// Polynomial step: a = a*k + nh on 128-bit quantities.
			#if VMAC_BOOL_WORD128
				#define poly_step(a, kh, kl, m)	\
				{   word128 t1, t2, t3, t4;\
					Multiply128(t2, a>>64, kl);\
					Multiply128(t3, a, kh);\
					Multiply128(t1, a, kl);\
					Multiply128(t4, a>>64, 2*kh);\
					t2 += t3;\
					t4 += t1;\
					t2 += t4>>64;\
					a = (word128(word64(t2)&m63) << 64) | word64(t4);\
					t2 *= 2;\
					a += m & m126;\
					a += t2>>64;}

				poly_step(a1, kh1, kl1, nhA);
				if (T_128BitTag)
					poly_step(a2, kh2, kl2, nhB);
			#else
				#define poly_step(ah, al, kh, kl, mh, ml)					\
				{   word64 t1h, t1l, t2h, t2l, t3h, t3l, z=0;				\
					/* compute ab*cd, put bd into result registers */       \
					MUL64(t2h,t2l,ah,kl);                                   \
					MUL64(t3h,t3l,al,kh);                                   \
					MUL64(t1h,t1l,ah,2*kh);                                 \
					MUL64(ah,al,al,kl);                                     \
					/* add together ad + bc */                              \
					ADD128(t2h,t2l,t3h,t3l);                                \
					/* add 2 * ac to result */                              \
					ADD128(ah,al,t1h,t1l);                                  \
					/* now (ah,al), (t2l,2*t2h) need summing */             \
					/* first add the high registers, carrying into t2h */   \
					ADD128(t2h,ah,z,t2l);                                   \
					/* double t2h and add top bit of ah */                  \
					t2h += t2h + (ah >> 63);                                \
					ah &= m63;                                              \
					/* now add the low registers */                         \
					mh &= m62;												\
					ADD128(ah,al,mh,ml);                                    \
					ADD128(ah,al,z,t2h);                                    \
				}

				poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
				if (T_128BitTag)
					poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
			#endif
		#endif		// #if VMAC_BOOL_32BIT
	} while (blocksRemainingInWord64);

	// Persist the accumulator(s) back into m_polyState for the next call /
	// TruncatedFinal.  (The 32-bit path writes through the a*/aHi macros.)
	#if VMAC_BOOL_WORD128
		(polyS+0*4)[0]=word64(a1>>64); (polyS+0*4)[1]=word64(a1);
		if (T_128BitTag)
		{
			(polyS+1*4)[0]=word64(a2>>64); (polyS+1*4)[1]=word64(a2);
		}
	#elif !VMAC_BOOL_32BIT
		(polyS+0*4)[0]=ah1; (polyS+0*4)[1]=al1;
		if (T_128BitTag)
		{
			(polyS+1*4)[0]=ah2; (polyS+1*4)[1]=al2;
		}
	#endif
}
706
707inline void VMAC_Base::VHASH_Update(const word64 *data, size_t blocksRemainingInWord64)
708{
709#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
710	if (HasSSE2())
711	{
712		VHASH_Update_SSE2(data, blocksRemainingInWord64, 0);
713		if (m_is128)
714			VHASH_Update_SSE2(data, blocksRemainingInWord64, 1);
715		m_isFirstBlock = false;
716	}
717	else
718#endif
719	{
720#if defined(_MSC_VER) && _MSC_VER < 1300
721		VHASH_Update_Template(data, blocksRemainingInWord64);
722#else
723		if (m_is128)
724			VHASH_Update_Template<true>(data, blocksRemainingInWord64);
725		else
726			VHASH_Update_Template<false>(data, blocksRemainingInWord64);
727#endif
728	}
729}
730
731size_t VMAC_Base::HashMultipleBlocks(const word64 *data, size_t length)
732{
733	size_t remaining = ModPowerOf2(length, m_L1KeyLength);
734	VHASH_Update(data, (length-remaining)/8);
735	return remaining;
736}
737
// L3 inner-product hash: compresses one polynomial accumulator into a single
// word64.  'input' points at the (high,low) halves of the accumulator,
// 'l3Key' at the two rejection-sampled key words (each below p64 = 2^64-257;
// see UncheckedSetKey), and 'len' is the message length folded into the
// reduction.  Result is ((p1+k1)*(p2+k2)) mod p64.
static word64 L3Hash(const word64 *input, const word64 *l3Key, size_t len)
{
    word64 rh, rl, t, z=0;
	word64 p1 = input[0], p2 = input[1];
	word64 k1 = l3Key[0], k2 = l3Key[1];

    /* fully reduce (p1,p2)+(len,0) mod p127 */
    t = p1 >> 63;
    p1 &= m63;
    ADD128(p1, p2, len, t);
    /* At this point, (p1,p2) is at most 2^127+(len<<64) */
    t = (p1 > m63) + ((p1 == m63) & (p2 == m64));
    ADD128(p1, p2, z, t);
    p1 &= m63;

    /* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
    t = p1 + (p2 >> 32);
    t += (t >> 32);
    t += (word32)t > 0xfffffffeU;
    p1 += (t >> 32);
    p2 += (p1 << 32);

    /* compute (p1+k1)%p64 and (p2+k2)%p64 */
    // Adding 257 on wrap-around is equivalent to subtracting p64 mod 2^64.
    p1 += k1;
    p1 += (0 - (p1 < k1)) & 257;
    p2 += k2;
    p2 += (0 - (p2 < k2)) & 257;

    /* compute (p1+k1)*(p2+k2)%p64 */
    MUL64(rh, rl, p1, p2);
    t = rh >> 56;
    ADD128(t, rl, z, rh);
    rh <<= 8;
    ADD128(t, rl, z, rh);
    t += t << 8;
    rl += t;
    // Two conditional corrections bring the result into [0, p64).
    rl += (0 - (rl < t)) & 257;
    rl += (0 - (rl > p64-1)) & 257;
    return rl;
}
778
// Finalize the MAC: hash any buffered partial block, run the L3 hash on the
// polynomial accumulator(s), add the nonce-derived pad, and emit up to
// 'size' bytes of tag into 'mac'.
void VMAC_Base::TruncatedFinal(byte *mac, size_t size)
{
	// Bytes buffered since the last complete L1 chunk was hashed.
	size_t len = ModPowerOf2(GetBitCountLo()/8, m_L1KeyLength);

	if (len)
	{
		// Zero-pad the partial chunk to a 16-byte boundary, hash it, then
		// express len in bits for the L3 length contribution.
		memset(m_data()+len, 0, (0-len)%16);
		VHASH_Update(DataBuf(), ((len+15)/16)*2);
		len *= 8;	// convert to bits
	}
	else if (m_isFirstBlock)
	{
		// special case for empty string: seed the accumulator directly
		// from the polynomial key stored in slots [2],[3] (and [6],[7]).
		m_polyState()[0] = m_polyState()[2];
		m_polyState()[1] = m_polyState()[3];
		if (m_is128)
		{
			m_polyState()[4] = m_polyState()[6];
			m_polyState()[5] = m_polyState()[7];
		}
	}

	if (m_is128)
	{
		// 128-bit tag: one L3 hash per half, each offset by half of m_pad.
		word64 t[2];
		t[0] = L3Hash(m_polyState(), m_l3Key(), len) + GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad());
		t[1] = L3Hash(m_polyState()+4, m_l3Key()+2, len) + GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad()+8);
		if (size == 16)
		{
			PutWord(false, BIG_ENDIAN_ORDER, mac, t[0]);
			PutWord(false, BIG_ENDIAN_ORDER, mac+8, t[1]);
		}
		else
		{
			// Truncated output: normalize to big-endian bytes, copy prefix.
			t[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, t[0]);
			t[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, t[1]);
			memcpy(mac, t, size);
		}
	}
	else
	{
		// 64-bit tag: the nonce's last bit selects which half of the cached
		// pad is added (see Resynchronize, which encrypts with that bit
		// forced to zero).
		word64 t = L3Hash(m_polyState(), m_l3Key(), len);
		t += GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad() + (m_nonce()[IVSize()-1]&1) * 8);
		if (size == 8)
			PutWord(false, BIG_ENDIAN_ORDER, mac, t);
		else
		{
			t = ConditionalByteReverse(BIG_ENDIAN_ORDER, t);
			memcpy(mac, &t, size);
		}
	}
}
831
832NAMESPACE_END
833