amule/libcryptoxx-5.6.0/integer.cpp

// integer.cpp - written and placed in the public domain by Wei Dai
// contains public domain code contributed by Alister Lee and Leonard Janke

#include "pch.h"

#ifndef CRYPTOPP_IMPORTS

#include "integer.h"
#include "modarith.h"
#include "nbtheory.h"
#include "asn.h"
#include "oids.h"
#include "words.h"
#include "algparam.h"
#include "pubkey.h"		// for P1363_KDF2
#include "sha.h"
#include "cpu.h"

#include <iostream>

#if _MSC_VER >= 1400
	#include <intrin.h>
#endif

#ifdef __DECCXX
	#include <c_asm.h>
#endif

#ifdef CRYPTOPP_MSVC6_NO_PP
	#pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.")
#endif

#define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86)

NAMESPACE_BEGIN(CryptoPP)

bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)
{
	if (valueType != typeid(Integer))
		return false;
	*reinterpret_cast<Integer *>(pInteger) = *reinterpret_cast<const int *>(pInt);
	return true;
}

inline static int Compare(const word *A, const word *B, size_t N)
{
	while (N--)
		if (A[N] > B[N])
			return 1;
		else if (A[N] < B[N])
			return -1;

	return 0;
}

inline static int Increment(word *A, size_t N, word B=1)
{
	assert(N);
	word t = A[0];
	A[0] = t+B;
	if (A[0] >= t)
		return 0;
	for (unsigned i=1; i<N; i++)
		if (++A[i])
			return 0;
	return 1;
}

inline static int Decrement(word *A, size_t N, word B=1)
{
	assert(N);
	word t = A[0];
	A[0] = t-B;
	if (A[0] <= t)
		return 0;
	for (unsigned i=1; i<N; i++)
		if (A[i]--)
			return 0;
	return 1;
}

static void TwosComplement(word *A, size_t N)
{
	Decrement(A, N);
	for (unsigned i=0; i<N; i++)
		A[i] = ~A[i];
}

static word AtomicInverseModPower2(word A)
{
	assert(A%2==1);

	word R=A%8;

	for (unsigned i=3; i<WORD_BITS; i*=2)
		R = R*(2-R*A);

	assert(R*A==1);
	return R;
}

// ********************************************************

#if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || (defined(__x86_64__) && defined(CRYPTOPP_WORD128_AVAILABLE))
	#define Declare2Words(x)			word x##0, x##1;
	#define AssignWord(a, b)			a##0 = b; a##1 = 0;
	#define Add2WordsBy1(a, b, c)		a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
	#define LowWord(a)					a##0
	#define HighWord(a)					a##1
	#ifdef _MSC_VER
		#define MultiplyWordsLoHi(p0, p1, a, b)		p0 = _umul128(a, b, &p1);
		#ifndef __INTEL_COMPILER
			#define Double3Words(c, d)		d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2;
		#endif
	#elif defined(__DECCXX)
		#define MultiplyWordsLoHi(p0, p1, a, b)		p0 = a*b; p1 = asm("umulh %a0, %a1, %v0", a, b);
	#elif defined(__x86_64__)
		#ifdef __SUNPRO_CC
			// Sun Studio's gcc-style inline assembly is heavily bugged as of version 5.9 Patch 124864-09 2008/12/16, but this one works
			#define MultiplyWordsLoHi(p0, p1, a, b)		asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "r"(b) : "cc");
		#else
			#define MultiplyWordsLoHi(p0, p1, a, b)		asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
			#define MulAcc(c, d, a, b)		asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
			#define Double3Words(c, d)		asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
			#define Acc2WordsBy1(a, b)		asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc");
			#define Acc2WordsBy2(a, b)		asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
			#define Acc3WordsBy2(c, d, e)	asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc");
		#endif
	#endif
	#define MultiplyWords(p, a, b)		MultiplyWordsLoHi(p##0, p##1, a, b)
	#ifndef Double3Words
		#define Double3Words(c, d)		d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2;
	#endif
	#ifndef Acc2WordsBy2
		#define Acc2WordsBy2(a, b)		a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
	#endif
	#define AddWithCarry(u, a, b)		{word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
	#define SubtractWithBorrow(u, a, b)	{word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
	#define GetCarry(u)					u##1
	#define GetBorrow(u)				u##1
#else
	#define Declare2Words(x)			dword x;
	#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
		#define MultiplyWords(p, a, b)		p = __emulu(a, b);
	#else
		#define MultiplyWords(p, a, b)		p = (dword)a*b;
	#endif
	#define AssignWord(a, b)			a = b;
	#define Add2WordsBy1(a, b, c)		a = b + c;
	#define Acc2WordsBy2(a, b)			a += b;
	#define LowWord(a)					word(a)
	#define HighWord(a)					word(a>>WORD_BITS)
	#define Double3Words(c, d)			d = 2*d + (c>>(WORD_BITS-1)); c *= 2;
	#define AddWithCarry(u, a, b)		u = dword(a) + b + GetCarry(u);
	#define SubtractWithBorrow(u, a, b)	u = dword(a) - b - GetBorrow(u);
	#define GetCarry(u)					HighWord(u)
	#define GetBorrow(u)				word(u>>(WORD_BITS*2-1))
#endif
#ifndef MulAcc
	#define MulAcc(c, d, a, b)			MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p));
#endif
#ifndef Acc2WordsBy1
	#define Acc2WordsBy1(a, b)			Add2WordsBy1(a, a, b)
#endif
#ifndef Acc3WordsBy2
	#define Acc3WordsBy2(c, d, e)		Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e));
#endif

class DWord
{
public:
	DWord() {}

#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
	explicit DWord(word low)
	{
		m_whole = low;
	}
#else
	explicit DWord(word low)
	{
		m_halfs.low = low;
		m_halfs.high = 0;
	}
#endif

	DWord(word low, word high)
	{
		m_halfs.low = low;
		m_halfs.high = high;
	}

	static DWord Multiply(word a, word b)
	{
		DWord r;
		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
			r.m_whole = (dword)a * b;
		#elif defined(MultiplyWordsLoHi)
			MultiplyWordsLoHi(r.m_halfs.low, r.m_halfs.high, a, b);
		#endif
		return r;
	}

	static DWord MultiplyAndAdd(word a, word b, word c)
	{
		DWord r = Multiply(a, b);
		return r += c;
	}

	DWord & operator+=(word a)
	{
		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
			m_whole = m_whole + a;
		#else
			m_halfs.low += a;
			m_halfs.high += (m_halfs.low < a);
		#endif
		return *this;
	}

	DWord operator+(word a)
	{
		DWord r;
		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
			r.m_whole = m_whole + a;
		#else
			r.m_halfs.low = m_halfs.low + a;
			r.m_halfs.high = m_halfs.high + (r.m_halfs.low < a);
		#endif
		return r;
	}

	DWord operator-(DWord a)
	{
		DWord r;
		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
			r.m_whole = m_whole - a.m_whole;
		#else
			r.m_halfs.low = m_halfs.low - a.m_halfs.low;
			r.m_halfs.high = m_halfs.high - a.m_halfs.high - (r.m_halfs.low > m_halfs.low);
		#endif
		return r;
	}

	DWord operator-(word a)
	{
		DWord r;
		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
			r.m_whole = m_whole - a;
		#else
			r.m_halfs.low = m_halfs.low - a;
			r.m_halfs.high = m_halfs.high - (r.m_halfs.low > m_halfs.low);
		#endif
		return r;
	}

	// returns quotient, which must fit in a word
	word operator/(word divisor);

	word operator%(word a);

	bool operator!() const
	{
	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
		return !m_whole;
	#else
		return !m_halfs.high && !m_halfs.low;
	#endif
	}

	word GetLowHalf() const {return m_halfs.low;}
	word GetHighHalf() const {return m_halfs.high;}
	word GetHighHalfAsBorrow() const {return 0-m_halfs.high;}

private:
	union
	{
	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
		dword m_whole;
	#endif
		struct
		{
		#ifdef IS_LITTLE_ENDIAN
			word low;
			word high;
		#else
			word high;
			word low;
		#endif
		} m_halfs;
	};
};

class Word
{
public:
	Word() {}

	Word(word value)
	{
		m_whole = value;
	}

	Word(hword low, hword high)
	{
		m_whole = low | (word(high) << (WORD_BITS/2));
	}

	static Word Multiply(hword a, hword b)
	{
		Word r;
		r.m_whole = (word)a * b;
		return r;
	}

	Word operator-(Word a)
	{
		Word r;
		r.m_whole = m_whole - a.m_whole;
		return r;
	}

	Word operator-(hword a)
	{
		Word r;
		r.m_whole = m_whole - a;
		return r;
	}

	// returns quotient, which must fit in a word
	hword operator/(hword divisor)
	{
		return hword(m_whole / divisor);
	}

	bool operator!() const
	{
		return !m_whole;
	}

	word GetWhole() const {return m_whole;}
	hword GetLowHalf() const {return hword(m_whole);}
	hword GetHighHalf() const {return hword(m_whole>>(WORD_BITS/2));}
	hword GetHighHalfAsBorrow() const {return 0-hword(m_whole>>(WORD_BITS/2));}

private:
	word m_whole;
};

// do a 3 word by 2 word divide, returns quotient and leaves remainder in A
template <class S, class D>
S DivideThreeWordsByTwo(S *A, S B0, S B1, D *dummy=NULL)
{
	// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a S
	assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));

	// estimate the quotient: do a 2 S by 1 S divide
	S Q;
	if (S(B1+1) == 0)
		Q = A[2];
	else
		Q = D(A[1], A[2]) / S(B1+1);

	// now subtract Q*B from A
	D p = D::Multiply(B0, Q);
	D u = (D) A[0] - p.GetLowHalf();
	A[0] = u.GetLowHalf();
	u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - D::Multiply(B1, Q);
	A[1] = u.GetLowHalf();
	A[2] += u.GetHighHalf();

	// Q <= actual quotient, so fix it
	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
	{
		u = (D) A[0] - B0;
		A[0] = u.GetLowHalf();
		u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
		A[1] = u.GetLowHalf();
		A[2] += u.GetHighHalf();
		Q++;
		assert(Q);	// shouldn't overflow
	}

	return Q;
}

// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
template <class S, class D>
inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
{
	if (!B) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
		return D(Ah.GetLowHalf(), Ah.GetHighHalf());
	else
	{
		S Q[2];
		T[0] = Al.GetLowHalf();
		T[1] = Al.GetHighHalf();
		T[2] = Ah.GetLowHalf();
		T[3] = Ah.GetHighHalf();
		Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(), B.GetHighHalf());
		Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
		return D(Q[0], Q[1]);
	}
}

// returns quotient, which must fit in a word
inline word DWord::operator/(word a)
{
	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
		return word(m_whole / a);
	#else
		hword r[4];
		return DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a).GetWhole();
	#endif
}

inline word DWord::operator%(word a)
{
	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
		return word(m_whole % a);
	#else
		if (a < (word(1) << (WORD_BITS/2)))
		{
			hword h = hword(a);
			word r = m_halfs.high % h;
			r = ((m_halfs.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
			return hword((hword(m_halfs.low) + (r << (WORD_BITS/2))) % h);
		}
		else
		{
			hword r[4];
			DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a);
			return Word(r[0], r[1]).GetWhole();
		}
	#endif
}

// ********************************************************

// use some tricks to share assembly code between MSVC and GCC
#if defined(__GNUC__)
	#define AddPrologue \
		int result;	\
		__asm__ __volatile__ \
		( \
			".intel_syntax noprefix;"
	#define AddEpilogue \
			".att_syntax prefix;" \
					: "=a" (result)\
					: "d" (C), "a" (A), "D" (B), "c" (N) \
					: "%esi", "memory", "cc" \
		);\
		return result;
	#define MulPrologue \
		__asm__ __volatile__ \
		( \
			".intel_syntax noprefix;" \
			AS1(	push	ebx) \
			AS2(	mov		ebx, edx)
	#define MulEpilogue \
			AS1(	pop		ebx) \
			".att_syntax prefix;" \
			: \
			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \
			: "%esi", "memory", "cc" \
		);
	#define SquPrologue		MulPrologue
	#define SquEpilogue	\
			AS1(	pop		ebx) \
			".att_syntax prefix;" \
			: \
			: "d" (s_maskLow16), "c" (C), "a" (A) \
			: "%esi", "%edi", "memory", "cc" \
		);
	#define TopPrologue		MulPrologue
	#define TopEpilogue	\
			AS1(	pop		ebx) \
			".att_syntax prefix;" \
			: \
			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \
			: "memory", "cc" \
		);
#else
	#define AddPrologue \
		__asm	push edi \
		__asm	push esi \
		__asm	mov		eax, [esp+12] \
		__asm	mov		edi, [esp+16]
	#define AddEpilogue \
		__asm	pop esi \
		__asm	pop edi \
		__asm	ret 8
#if _MSC_VER < 1300
	#define SaveEBX		__asm push ebx
	#define RestoreEBX	__asm pop ebx
#else
	#define SaveEBX
	#define RestoreEBX
#endif
	#define SquPrologue					\
		AS2(	mov		eax, A)			\
		AS2(	mov		ecx, C)			\
		SaveEBX							\
		AS2(	lea		ebx, s_maskLow16)
	#define MulPrologue					\
		AS2(	mov		eax, A)			\
		AS2(	mov		edi, B)			\
		AS2(	mov		ecx, C)			\
		SaveEBX							\
		AS2(	lea		ebx, s_maskLow16)
	#define TopPrologue					\
		AS2(	mov		eax, A)			\
		AS2(	mov		edi, B)			\
		AS2(	mov		ecx, C)			\
		AS2(	mov		esi, L)			\
		SaveEBX							\
		AS2(	lea		ebx, s_maskLow16)
	#define SquEpilogue		RestoreEBX
	#define MulEpilogue		RestoreEBX
	#define TopEpilogue		RestoreEBX
#endif

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
int Baseline_Add(size_t N, word *C, const word *A, const word *B);
int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
}
#elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__) && defined(CRYPTOPP_WORD128_AVAILABLE)
int Baseline_Add(size_t N, word *C, const word *A, const word *B)
{
	word result;
	__asm__ __volatile__
	(
	".intel_syntax;"
	AS1(	neg		%1)
	ASJ(	jz,		1, f)
	AS2(	mov		%0,[%3+8*%1])
	AS2(	add		%0,[%4+8*%1])
	AS2(	mov		[%2+8*%1],%0)
	ASL(0)
	AS2(	mov		%0,[%3+8*%1+8])
	AS2(	adc		%0,[%4+8*%1+8])
	AS2(	mov		[%2+8*%1+8],%0)
	AS2(	lea		%1,[%1+2])
	ASJ(	jrcxz,	1, f)
	AS2(	mov		%0,[%3+8*%1])
	AS2(	adc		%0,[%4+8*%1])
	AS2(	mov		[%2+8*%1],%0)
	ASJ(	jmp,	0, b)
	ASL(1)
	AS2(	mov		%0, 0)
	AS2(	adc		%0, %0)
	".att_syntax;"
	: "=&r" (result), "+c" (N)
	: "r" (C+N), "r" (A+N), "r" (B+N)
	: "memory", "cc"
	);
	return (int)result;
}

int Baseline_Sub(size_t N, word *C, const word *A, const word *B)
{
	word result;
	__asm__ __volatile__
	(
	".intel_syntax;"
	AS1(	neg		%1)
	ASJ(	jz,		1, f)
	AS2(	mov		%0,[%3+8*%1])
	AS2(	sub		%0,[%4+8*%1])
	AS2(	mov		[%2+8*%1],%0)
	ASL(0)
	AS2(	mov		%0,[%3+8*%1+8])
	AS2(	sbb		%0,[%4+8*%1+8])
	AS2(	mov		[%2+8*%1+8],%0)
	AS2(	lea		%1,[%1+2])
	ASJ(	jrcxz,	1, f)
	AS2(	mov		%0,[%3+8*%1])
	AS2(	sbb		%0,[%4+8*%1])
	AS2(	mov		[%2+8*%1],%0)
	ASJ(	jmp,	0, b)
	ASL(1)
	AS2(	mov		%0, 0)
	AS2(	adc		%0, %0)
	".att_syntax;"
	: "=&r" (result), "+c" (N)
	: "r" (C+N), "r" (A+N), "r" (B+N)
	: "memory", "cc"
	);
	return (int)result;
}
#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
{
	AddPrologue

	// now: eax = A, edi = B, edx = C, ecx = N
	AS2(	lea		eax, [eax+4*ecx])
	AS2(	lea		edi, [edi+4*ecx])
	AS2(	lea		edx, [edx+4*ecx])

	AS1(	neg		ecx)				// ecx is negative index
	AS2(	test	ecx, 2)				// this clears carry flag
	ASJ(	jz,		0, f)
	AS2(	sub		ecx, 2)
	ASJ(	jmp,	1, f)

	ASL(0)
	ASJ(	jecxz,	2, f)				// loop until ecx overflows and becomes zero
	AS2(	mov		esi,[eax+4*ecx])
	AS2(	adc		esi,[edi+4*ecx])
	AS2(	mov		[edx+4*ecx],esi)
	AS2(	mov		esi,[eax+4*ecx+4])
	AS2(	adc		esi,[edi+4*ecx+4])
	AS2(	mov		[edx+4*ecx+4],esi)
	ASL(1)
	AS2(	mov		esi,[eax+4*ecx+8])
	AS2(	adc		esi,[edi+4*ecx+8])
	AS2(	mov		[edx+4*ecx+8],esi)
	AS2(	mov		esi,[eax+4*ecx+12])
	AS2(	adc		esi,[edi+4*ecx+12])
	AS2(	mov		[edx+4*ecx+12],esi)

	AS2(	lea		ecx,[ecx+4])		// advance index, avoid inc which causes slowdown on Intel Core 2
	ASJ(	jmp,	0, b)

	ASL(2)
	AS2(	mov		eax, 0)
	AS1(	setc	al)					// store carry into eax (return result register)

	AddEpilogue
}

CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
{
	AddPrologue

	// now: eax = A, edi = B, edx = C, ecx = N
	AS2(	lea		eax, [eax+4*ecx])
	AS2(	lea		edi, [edi+4*ecx])
	AS2(	lea		edx, [edx+4*ecx])

	AS1(	neg		ecx)				// ecx is negative index
	AS2(	test	ecx, 2)				// this clears carry flag
	ASJ(	jz,		0, f)
	AS2(	sub		ecx, 2)
	ASJ(	jmp,	1, f)

	ASL(0)
	ASJ(	jecxz,	2, f)				// loop until ecx overflows and becomes zero
	AS2(	mov		esi,[eax+4*ecx])
	AS2(	sbb		esi,[edi+4*ecx])
	AS2(	mov		[edx+4*ecx],esi)
	AS2(	mov		esi,[eax+4*ecx+4])
	AS2(	sbb		esi,[edi+4*ecx+4])
	AS2(	mov		[edx+4*ecx+4],esi)
	ASL(1)
	AS2(	mov		esi,[eax+4*ecx+8])
	AS2(	sbb		esi,[edi+4*ecx+8])
	AS2(	mov		[edx+4*ecx+8],esi)
	AS2(	mov		esi,[eax+4*ecx+12])
	AS2(	sbb		esi,[edi+4*ecx+12])
	AS2(	mov		[edx+4*ecx+12],esi)

	AS2(	lea		ecx,[ecx+4])		// advance index, avoid inc which causes slowdown on Intel Core 2
	ASJ(	jmp,	0, b)

	ASL(2)
	AS2(	mov		eax, 0)
	AS1(	setc	al)					// store carry into eax (return result register)

	AddEpilogue
}

#if CRYPTOPP_INTEGER_SSE2
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
{
	AddPrologue

	// now: eax = A, edi = B, edx = C, ecx = N
	AS2(	lea		eax, [eax+4*ecx])
	AS2(	lea		edi, [edi+4*ecx])
	AS2(	lea		edx, [edx+4*ecx])

	AS1(	neg		ecx)				// ecx is negative index
	AS2(	pxor    mm2, mm2)
	ASJ(	jz,		2, f)
	AS2(	test	ecx, 2)				// this clears carry flag
	ASJ(	jz,		0, f)
	AS2(	sub		ecx, 2)
	ASJ(	jmp,	1, f)

	ASL(0)
	AS2(	movd     mm0, DWORD PTR [eax+4*ecx])
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx])
	AS2(	paddq    mm0, mm1)
	AS2(	paddq	 mm2, mm0)
	AS2(	movd	 DWORD PTR [edx+4*ecx], mm2)
	AS2(	psrlq    mm2, 32)

	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+4])
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+4])
	AS2(	paddq    mm0, mm1)
	AS2(	paddq	 mm2, mm0)
	AS2(	movd	 DWORD PTR [edx+4*ecx+4], mm2)
	AS2(	psrlq    mm2, 32)

	ASL(1)
	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+8])
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+8])
	AS2(	paddq    mm0, mm1)
	AS2(	paddq	 mm2, mm0)
	AS2(	movd	 DWORD PTR [edx+4*ecx+8], mm2)
	AS2(	psrlq    mm2, 32)

	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+12])
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+12])
	AS2(	paddq    mm0, mm1)
	AS2(	paddq	 mm2, mm0)
	AS2(	movd	 DWORD PTR [edx+4*ecx+12], mm2)
	AS2(	psrlq    mm2, 32)

	AS2(	add		ecx, 4)
	ASJ(	jnz,	0, b)

	ASL(2)
	AS2(	movd	eax, mm2)
	AS1(	emms)

	AddEpilogue
}
CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
{
	AddPrologue

	// now: eax = A, edi = B, edx = C, ecx = N
	AS2(	lea		eax, [eax+4*ecx])
	AS2(	lea		edi, [edi+4*ecx])
	AS2(	lea		edx, [edx+4*ecx])

	AS1(	neg		ecx)				// ecx is negative index
	AS2(	pxor    mm2, mm2)
	ASJ(	jz,		2, f)
	AS2(	test	ecx, 2)				// this clears carry flag
	ASJ(	jz,		0, f)
	AS2(	sub		ecx, 2)
	ASJ(	jmp,	1, f)

	ASL(0)
	AS2(	movd     mm0, DWORD PTR [eax+4*ecx])
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx])
	AS2(	psubq    mm0, mm1)
	AS2(	psubq	 mm0, mm2)
	AS2(	movd	 DWORD PTR [edx+4*ecx], mm0)
	AS2(	psrlq    mm0, 63)

	AS2(	movd     mm2, DWORD PTR [eax+4*ecx+4])
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+4])
	AS2(	psubq    mm2, mm1)
	AS2(	psubq	 mm2, mm0)
	AS2(	movd	 DWORD PTR [edx+4*ecx+4], mm2)
	AS2(	psrlq    mm2, 63)

	ASL(1)
	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+8])
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+8])
	AS2(	psubq    mm0, mm1)
	AS2(	psubq	 mm0, mm2)
	AS2(	movd	 DWORD PTR [edx+4*ecx+8], mm0)
	AS2(	psrlq    mm0, 63)

	AS2(	movd     mm2, DWORD PTR [eax+4*ecx+12])
	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+12])
	AS2(	psubq    mm2, mm1)
	AS2(	psubq	 mm2, mm0)
	AS2(	movd	 DWORD PTR [edx+4*ecx+12], mm2)
	AS2(	psrlq    mm2, 63)

	AS2(	add		ecx, 4)
	ASJ(	jnz,	0, b)

	ASL(2)
	AS2(	movd	eax, mm2)
	AS1(	emms)

	AddEpilogue
}
#endif	// #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#else
int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
{
	assert (N%2 == 0);

	Declare2Words(u);
	AssignWord(u, 0);
	for (size_t i=0; i<N; i+=2)
	{
		AddWithCarry(u, A[i], B[i]);
		C[i] = LowWord(u);
		AddWithCarry(u, A[i+1], B[i+1]);
		C[i+1] = LowWord(u);
	}
	return int(GetCarry(u));
}

int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
{
	assert (N%2 == 0);

	Declare2Words(u);
	AssignWord(u, 0);
	for (size_t i=0; i<N; i+=2)
	{
		SubtractWithBorrow(u, A[i], B[i]);
		C[i] = LowWord(u);
		SubtractWithBorrow(u, A[i+1], B[i+1]);
		C[i+1] = LowWord(u);
	}
	return int(GetBorrow(u));
}
#endif

static word LinearMultiply(word *C, const word *A, word B, size_t N)
{
	word carry=0;
	for(unsigned i=0; i<N; i++)
	{
		Declare2Words(p);
		MultiplyWords(p, A[i], B);
		Acc2WordsBy1(p, carry);
		C[i] = LowWord(p);
		carry = HighWord(p);
	}
	return carry;
}

#ifndef CRYPTOPP_DOXYGEN_PROCESSING

#define Mul_2 \
	Mul_Begin(2) \
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
	Mul_End(1, 1)

#define Mul_4 \
	Mul_Begin(4) \
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
	Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
	Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
	Mul_End(5, 3)

#define Mul_8 \
	Mul_Begin(8) \
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
	Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
	Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
	Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
	Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
	Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
	Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
	Mul_End(13, 7)

#define Mul_16 \
	Mul_Begin(16) \
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
	Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
	Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
	Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
	Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
	Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
	Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
	Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
	Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
	Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
	Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
	Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
	Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
	Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
	Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
	Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
	Mul_End(29, 15)

#define Squ_2 \
	Squ_Begin(2) \
	Squ_End(2)

#define Squ_4 \
	Squ_Begin(4) \
	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
	Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \
	Squ_SaveAcc(4, 2, 3) Squ_NonDiag \
	Squ_End(4)

#define Squ_8 \
	Squ_Begin(8) \
	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
	Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
	Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5)  Squ_NonDiag \
	Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
	Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \
	Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \
	Squ_SaveAcc(12, 6, 7) Squ_NonDiag \
	Squ_End(8)

#define Squ_16 \
	Squ_Begin(16) \
	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
	Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
	Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
	Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
	Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \
	Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \
	Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \
	Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \
	Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \
	Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \
	Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \
	Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \
	Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \
	Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \
	Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \
	Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \
	Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \
	Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \
	Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \
	Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \
	Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \
	Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \
	Squ_SaveAcc(28, 14, 15) Squ_NonDiag \
	Squ_End(16)

#define Bot_2 \
	Mul_Begin(2) \
	Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \
	Bot_End(2)

#define Bot_4 \
	Mul_Begin(4) \
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
	Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2)  \
	Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0)  \
	Bot_End(4)

#define Bot_8 \
	Mul_Begin(8) \
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
	Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \
	Bot_End(8)

#define Bot_16 \
	Mul_Begin(16) \
	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
	Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
	Bot_End(16)

#endif

#if 0
#define Mul_Begin(n)				\
	Declare2Words(p)				\
	Declare2Words(c)				\
	Declare2Words(d)				\
	MultiplyWords(p, A[0], B[0])	\
	AssignWord(c, LowWord(p))		\
	AssignWord(d, HighWord(p))

#define Mul_Acc(i, j)				\
	MultiplyWords(p, A[i], B[j])	\
	Acc2WordsBy1(c, LowWord(p))		\
	Acc2WordsBy1(d, HighWord(p))

#define Mul_SaveAcc(k, i, j) 		\
	R[k] = LowWord(c);				\
	Add2WordsBy1(c, d, HighWord(c))	\
	MultiplyWords(p, A[i], B[j])	\
	AssignWord(d, HighWord(p))		\
	Acc2WordsBy1(c, LowWord(p))

#define Mul_End(n)					\
	R[2*n-3] = LowWord(c);			\
	Acc2WordsBy1(d, HighWord(c))	\
	MultiplyWords(p, A[n-1], B[n-1])\
	Acc2WordsBy2(d, p)				\
	R[2*n-2] = LowWord(d);			\
	R[2*n-1] = HighWord(d);

#define Bot_SaveAcc(k, i, j)		\
	R[k] = LowWord(c);				\
	word e = LowWord(d) + HighWord(c);	\
	e += A[i] * B[j];

#define Bot_Acc(i, j)	\
	e += A[i] * B[j];

#define Bot_End(n)		\
	R[n-1] = e;
#else
#define Mul_Begin(n)				\
	Declare2Words(p)				\
	word c;	\
	Declare2Words(d)				\
	MultiplyWords(p, A[0], B[0])	\
	c = LowWord(p);		\
	AssignWord(d, HighWord(p))

#define Mul_Acc(i, j)				\
	MulAcc(c, d, A[i], B[j])

#define Mul_SaveAcc(k, i, j) 		\
	R[k] = c;				\
	c = LowWord(d);	\
	AssignWord(d, HighWord(d))	\
	MulAcc(c, d, A[i], B[j])

#define Mul_End(k, i)					\
	R[k] = c;			\
	MultiplyWords(p, A[i], B[i])	\
	Acc2WordsBy2(p, d)				\
	R[k+1] = LowWord(p);			\
	R[k+2] = HighWord(p);

#define Bot_SaveAcc(k, i, j)		\
	R[k] = c;				\
	c = LowWord(d);	\
	c += A[i] * B[j];

#define Bot_Acc(i, j)	\
	c += A[i] * B[j];

#define Bot_End(n)		\
	R[n-1] = c;
#endif

#define Squ_Begin(n)				\
	Declare2Words(p)				\
	word c;				\
	Declare2Words(d)				\
	Declare2Words(e)				\
	MultiplyWords(p, A[0], A[0])	\
	R[0] = LowWord(p);				\
	AssignWord(e, HighWord(p))		\
	MultiplyWords(p, A[0], A[1])	\
	c = LowWord(p);		\
	AssignWord(d, HighWord(p))		\
	Squ_NonDiag						\

#define Squ_NonDiag				\
	Double3Words(c, d)

#define Squ_SaveAcc(k, i, j) 		\
	Acc3WordsBy2(c, d, e)			\
	R[k] = c;				\
	MultiplyWords(p, A[i], A[j])	\
	c = LowWord(p);		\
	AssignWord(d, HighWord(p))		\

#define Squ_Acc(i, j)				\
	MulAcc(c, d, A[i], A[j])

#define Squ_Diag(i)					\
	Squ_NonDiag						\
	MulAcc(c, d, A[i], A[i])

#define Squ_End(n)					\
	Acc3WordsBy2(c, d, e)			\
	R[2*n-3] = c;			\
	MultiplyWords(p, A[n-1], A[n-1])\
	Acc2WordsBy2(p, e)				\
	R[2*n-2] = LowWord(p);			\
	R[2*n-1] = HighWord(p);

void Baseline_Multiply2(word *R, const word *A, const word *B)
{
	Mul_2
}

void Baseline_Multiply4(word *R, const word *A, const word *B)
{
	Mul_4
}

void Baseline_Multiply8(word *R, const word *A, const word *B)
{
	Mul_8
}

void Baseline_Square2(word *R, const word *A)
{
	Squ_2
}

void Baseline_Square4(word *R, const word *A)
{
	Squ_4
}

void Baseline_Square8(word *R, const word *A)
{
	Squ_8
}

void Baseline_MultiplyBottom2(word *R, const word *A, const word *B)
{
	Bot_2
}

void Baseline_MultiplyBottom4(word *R, const word *A, const word *B)
{
	Bot_4
}

void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)
{
	Bot_8
}

#define Top_Begin(n)				\
	Declare2Words(p)				\
	word c;	\
	Declare2Words(d)				\
	MultiplyWords(p, A[0], B[n-2]);\
	AssignWord(d, HighWord(p));

#define Top_Acc(i, j)	\
	MultiplyWords(p, A[i], B[j]);\
	Acc2WordsBy1(d, HighWord(p));

#define Top_SaveAcc0(i, j) 		\
	c = LowWord(d);	\
	AssignWord(d, HighWord(d))	\
	MulAcc(c, d, A[i], B[j])

#define Top_SaveAcc1(i, j) 		\
	c = L<c; \
	Acc2WordsBy1(d, c);	\
	c = LowWord(d);	\
	AssignWord(d, HighWord(d))	\
	MulAcc(c, d, A[i], B[j])

void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)
{
	word T[4];
	Baseline_Multiply2(T, A, B);
	R[0] = T[2];
	R[1] = T[3];
}

void Baseline_MultiplyTop4(word *R, const word *A, const word *B, word L)
{
	Top_Begin(4)
	Top_Acc(1, 1) Top_Acc(2, 0)  \
	Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
	Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
	Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
	Mul_End(1, 3)
}

void Baseline_MultiplyTop8(word *R, const word *A, const word *B, word L)
{
	Top_Begin(8)
	Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
	Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
	Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
	Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
	Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
	Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
	Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
	Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
	Mul_End(5, 7)
}

#if !CRYPTOPP_INTEGER_SSE2	// save memory by not compiling these functions when SSE2 is available
void Baseline_Multiply16(word *R, const word *A, const word *B)
{
	Mul_16
}

void Baseline_Square16(word *R, const word *A)
{
	Squ_16
}

void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
{
	Bot_16
}

void Baseline_MultiplyTop16(word *R, const word *A, const word *B, word L)
{
	Top_Begin(16)
	Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
	Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
	Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
	Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
	Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
	Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
	Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
	Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
	Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
	Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
	Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
	Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
	Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
	Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
	Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
	Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
	Mul_End(13, 15)
}
#endif

// ********************************************************

#if CRYPTOPP_INTEGER_SSE2

CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};

#undef Mul_Begin
#undef Mul_Acc
#undef Top_Begin
#undef Top_Acc
#undef Squ_Acc
#undef Squ_NonDiag
#undef Squ_Diag
#undef Squ_SaveAcc
#undef Squ_Begin
#undef Mul_SaveAcc
#undef Bot_Acc
#undef Bot_SaveAcc
#undef Bot_End
#undef Squ_End
#undef Mul_End

#define SSE2_FinalSave(k)			\
	AS2(	psllq		xmm5, 16)	\
	AS2(	paddq		xmm4, xmm5)	\
	AS2(	movq		QWORD PTR [ecx+8*(k)], xmm4)

#define SSE2_SaveShift(k)			\
	AS2(	movq		xmm0, xmm6)	\
	AS2(	punpckhqdq	xmm6, xmm0)	\
	AS2(	movq		xmm1, xmm7)	\
	AS2(	punpckhqdq	xmm7, xmm1)	\
	AS2(	paddd		xmm6, xmm0)	\
	AS2(	pslldq		xmm6, 4)	\
	AS2(	paddd		xmm7, xmm1)	\
	AS2(	paddd		xmm4, xmm6)	\
	AS2(	pslldq		xmm7, 4)	\
	AS2(	movq		xmm6, xmm4)	\
	AS2(	paddd		xmm5, xmm7)	\
	AS2(	movq		xmm7, xmm5)	\
	AS2(	movd		DWORD PTR [ecx+8*(k)], xmm4)	\
	AS2(	psrlq		xmm6, 16)	\
	AS2(	paddq		xmm6, xmm7)	\
	AS2(	punpckhqdq	xmm4, xmm0)	\
	AS2(	punpckhqdq	xmm5, xmm0)	\
	AS2(	movq		QWORD PTR [ecx+8*(k)+2], xmm6)	\
	AS2(	psrlq		xmm6, 3*16)	\
	AS2(	paddd		xmm4, xmm6)	\

#define Squ_SSE2_SaveShift(k)			\
	AS2(	movq		xmm0, xmm6)	\
	AS2(	punpckhqdq	xmm6, xmm0)	\
	AS2(	movq		xmm1, xmm7)	\
	AS2(	punpckhqdq	xmm7, xmm1)	\
	AS2(	paddd		xmm6, xmm0)	\
	AS2(	pslldq		xmm6, 4)	\
	AS2(	paddd		xmm7, xmm1)	\
	AS2(	paddd		xmm4, xmm6)	\
	AS2(	pslldq		xmm7, 4)	\
	AS2(	movhlps		xmm6, xmm4)	\
	AS2(	movd		DWORD PTR [ecx+8*(k)], xmm4)	\
	AS2(	paddd		xmm5, xmm7)	\
	AS2(	movhps		QWORD PTR [esp+12], xmm5)\
	AS2(	psrlq		xmm4, 16)	\
	AS2(	paddq		xmm4, xmm5)	\
	AS2(	movq		QWORD PTR [ecx+8*(k)+2], xmm4)	\
	AS2(	psrlq		xmm4, 3*16)	\
	AS2(	paddd		xmm4, xmm6)	\
	AS2(	movq		QWORD PTR [esp+4], xmm4)\

#define SSE2_FirstMultiply(i)				\
	AS2(	movdqa		xmm7, [esi+(i)*16])\
	AS2(	movdqa		xmm5, [edi-(i)*16])\
	AS2(	pmuludq		xmm5, xmm7)		\
	AS2(	movdqa		xmm4, [ebx])\
	AS2(	movdqa		xmm6, xmm4)		\
	AS2(	pand		xmm4, xmm5)		\
	AS2(	psrld		xmm5, 16)		\
	AS2(	pmuludq		xmm7, [edx-(i)*16])\
	AS2(	pand		xmm6, xmm7)		\
	AS2(	psrld		xmm7, 16)

#define Squ_Begin(n)							\
	SquPrologue									\
	AS2(	mov		esi, esp)\
	AS2(	and		esp, 0xfffffff0)\
	AS2(	lea		edi, [esp-32*n])\
	AS2(	sub		esp, 32*n+16)\
	AS1(	push	esi)\
	AS2(	mov		esi, edi)					\
	AS2(	xor		edx, edx)					\
	ASL(1)										\
	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
	AS2(	movdqa	[edi+2*edx], xmm0)		\
	AS2(	psrlq	xmm0, 32)					\
	AS2(	movdqa	[edi+2*edx+16], xmm0)	\
	AS2(	movdqa	[edi+16*n+2*edx], xmm1)		\
	AS2(	psrlq	xmm1, 32)					\
	AS2(	movdqa	[edi+16*n+2*edx+16], xmm1)	\
	AS2(	add		edx, 16)					\
	AS2(	cmp		edx, 8*(n))					\
	ASJ(	jne,	1, b)						\
	AS2(	lea		edx, [edi+16*n])\
	SSE2_FirstMultiply(0)							\

#define Squ_Acc(i)								\
	ASL(LSqu##i)								\
	AS2(	movdqa		xmm1, [esi+(i)*16])	\
	AS2(	movdqa		xmm0, [edi-(i)*16])	\
	AS2(	movdqa		xmm2, [ebx])	\
	AS2(	pmuludq		xmm0, xmm1)				\
	AS2(	pmuludq		xmm1, [edx-(i)*16])	\
	AS2(	movdqa		xmm3, xmm2)			\
	AS2(	pand		xmm2, xmm0)			\
	AS2(	psrld		xmm0, 16)			\
	AS2(	paddd		xmm4, xmm2)			\
	AS2(	paddd		xmm5, xmm0)			\
	AS2(	pand		xmm3, xmm1)			\
	AS2(	psrld		xmm1, 16)			\
	AS2(	paddd		xmm6, xmm3)			\
	AS2(	paddd		xmm7, xmm1)		\

#define Squ_Acc1(i)
#define Squ_Acc2(i)		ASC(call, LSqu##i)
#define Squ_Acc3(i)		Squ_Acc2(i)
#define Squ_Acc4(i)		Squ_Acc2(i)
#define Squ_Acc5(i)		Squ_Acc2(i)
#define Squ_Acc6(i)		Squ_Acc2(i)
#define Squ_Acc7(i)		Squ_Acc2(i)
#define Squ_Acc8(i)		Squ_Acc2(i)

#define SSE2_End(E, n)					\
	SSE2_SaveShift(2*(n)-3)			\
	AS2(	movdqa		xmm7, [esi+16])	\
	AS2(	movdqa		xmm0, [edi])	\
	AS2(	pmuludq		xmm0, xmm7)				\
	AS2(	movdqa		xmm2, [ebx])		\
	AS2(	pmuludq		xmm7, [edx])	\
	AS2(	movdqa		xmm6, xmm2)				\
	AS2(	pand		xmm2, xmm0)				\
	AS2(	psrld		xmm0, 16)				\
	AS2(	paddd		xmm4, xmm2)				\
	AS2(	paddd		xmm5, xmm0)				\
	AS2(	pand		xmm6, xmm7)				\
	AS2(	psrld		xmm7, 16)	\
	SSE2_SaveShift(2*(n)-2)			\
	SSE2_FinalSave(2*(n)-1)			\
	AS1(	pop		esp)\
	E

#define Squ_End(n)		SSE2_End(SquEpilogue, n)
#define Mul_End(n)		SSE2_End(MulEpilogue, n)
#define Top_End(n)		SSE2_End(TopEpilogue, n)

#define Squ_Column1(k, i)	\
	Squ_SSE2_SaveShift(k)					\
	AS2(	add			esi, 16)	\
	SSE2_FirstMultiply(1)\
	Squ_Acc##i(i)	\
	AS2(	paddd		xmm4, xmm4)		\
	AS2(	paddd		xmm5, xmm5)		\
	AS2(	movdqa		xmm3, [esi])				\
	AS2(	movq		xmm1, QWORD PTR [esi+8])	\
	AS2(	pmuludq		xmm1, xmm3)		\
	AS2(	pmuludq		xmm3, xmm3)		\
	AS2(	movdqa		xmm0, [ebx])\
	AS2(	movdqa		xmm2, xmm0)		\
	AS2(	pand		xmm0, xmm1)		\
	AS2(	psrld		xmm1, 16)		\
	AS2(	paddd		xmm6, xmm0)		\
	AS2(	paddd		xmm7, xmm1)		\
	AS2(	pand		xmm2, xmm3)		\
	AS2(	psrld		xmm3, 16)		\
	AS2(	paddd		xmm6, xmm6)		\
	AS2(	paddd		xmm7, xmm7)		\
	AS2(	paddd		xmm4, xmm2)		\
	AS2(	paddd		xmm5, xmm3)		\
	AS2(	movq		xmm0, QWORD PTR [esp+4])\
	AS2(	movq		xmm1, QWORD PTR [esp+12])\
	AS2(	paddd		xmm4, xmm0)\
	AS2(	paddd		xmm5, xmm1)\

#define Squ_Column0(k, i)	\
	Squ_SSE2_SaveShift(k)					\
	AS2(	add			edi, 16)	\
	AS2(	add			edx, 16)	\
	SSE2_FirstMultiply(1)\
	Squ_Acc##i(i)	\
	AS2(	paddd		xmm6, xmm6)		\
	AS2(	paddd		xmm7, xmm7)		\
	AS2(	paddd		xmm4, xmm4)		\
	AS2(	paddd		xmm5, xmm5)		\
	AS2(	movq		xmm0, QWORD PTR [esp+4])\
	AS2(	movq		xmm1, QWORD PTR [esp+12])\
	AS2(	paddd		xmm4, xmm0)\
	AS2(	paddd		xmm5, xmm1)\

#define SSE2_MulAdd45						\
	AS2(	movdqa		xmm7, [esi])	\
	AS2(	movdqa		xmm0, [edi])	\
	AS2(	pmuludq		xmm0, xmm7)				\
	AS2(	movdqa		xmm2, [ebx])		\
	AS2(	pmuludq		xmm7, [edx])	\
	AS2(	movdqa		xmm6, xmm2)				\
	AS2(	pand		xmm2, xmm0)				\
	AS2(	psrld		xmm0, 16)				\
	AS2(	paddd		xmm4, xmm2)				\
	AS2(	paddd		xmm5, xmm0)				\
	AS2(	pand		xmm6, xmm7)				\
	AS2(	psrld		xmm7, 16)

#define Mul_Begin(n)							\
	MulPrologue									\
	AS2(	mov		esi, esp)\
	AS2(	and		esp, 0xfffffff0)\
	AS2(	sub		esp, 48*n+16)\
	AS1(	push	esi)\
	AS2(	xor		edx, edx)					\
	ASL(1)										\
	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
	ASS(	pshufd	xmm2, [edi+edx], 3,1,2,0)	\
	AS2(	movdqa	[esp+20+2*edx], xmm0)		\
	AS2(	psrlq	xmm0, 32)					\
	AS2(	movdqa	[esp+20+2*edx+16], xmm0)	\
	AS2(	movdqa	[esp+20+16*n+2*edx], xmm1)		\
	AS2(	psrlq	xmm1, 32)					\
	AS2(	movdqa	[esp+20+16*n+2*edx+16], xmm1)	\
	AS2(	movdqa	[esp+20+32*n+2*edx], xmm2)		\
	AS2(	psrlq	xmm2, 32)					\
	AS2(	movdqa	[esp+20+32*n+2*edx+16], xmm2)	\
	AS2(	add		edx, 16)					\
	AS2(	cmp		edx, 8*(n))					\
	ASJ(	jne,	1, b)						\
	AS2(	lea		edi, [esp+20])\
	AS2(	lea		edx, [esp+20+16*n])\
	AS2(	lea		esi, [esp+20+32*n])\
	SSE2_FirstMultiply(0)							\

#define Mul_Acc(i)								\
	ASL(LMul##i)										\
	AS2(	movdqa		xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])	\
	AS2(	movdqa		xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])	\
	AS2(	movdqa		xmm2, [ebx])	\
	AS2(	pmuludq		xmm0, xmm1)				\
	AS2(	pmuludq		xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])	\
	AS2(	movdqa		xmm3, xmm2)			\
	AS2(	pand		xmm2, xmm0)			\
	AS2(	psrld		xmm0, 16)			\
	AS2(	paddd		xmm4, xmm2)			\
	AS2(	paddd		xmm5, xmm0)			\
	AS2(	pand		xmm3, xmm1)			\
	AS2(	psrld		xmm1, 16)			\
	AS2(	paddd		xmm6, xmm3)			\
	AS2(	paddd		xmm7, xmm1)		\

#define Mul_Acc1(i)
#define Mul_Acc2(i)		ASC(call, LMul##i)
#define Mul_Acc3(i)		Mul_Acc2(i)
#define Mul_Acc4(i)		Mul_Acc2(i)
#define Mul_Acc5(i)		Mul_Acc2(i)
#define Mul_Acc6(i)		Mul_Acc2(i)
#define Mul_Acc7(i)		Mul_Acc2(i)
#define Mul_Acc8(i)		Mul_Acc2(i)
#define Mul_Acc9(i)		Mul_Acc2(i)
#define Mul_Acc10(i)	Mul_Acc2(i)
#define Mul_Acc11(i)	Mul_Acc2(i)
#define Mul_Acc12(i)	Mul_Acc2(i)
#define Mul_Acc13(i)	Mul_Acc2(i)
#define Mul_Acc14(i)	Mul_Acc2(i)
#define Mul_Acc15(i)	Mul_Acc2(i)
#define Mul_Acc16(i)	Mul_Acc2(i)

#define Mul_Column1(k, i)	\
	SSE2_SaveShift(k)					\
	AS2(	add			esi, 16)	\
	SSE2_MulAdd45\
	Mul_Acc##i(i)	\

#define Mul_Column0(k, i)	\
	SSE2_SaveShift(k)					\
	AS2(	add			edi, 16)	\
	AS2(	add			edx, 16)	\
	SSE2_MulAdd45\
	Mul_Acc##i(i)	\

#define Bot_Acc(i)							\
	AS2(	movdqa		xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])	\
	AS2(	movdqa		xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])	\
	AS2(	pmuludq		xmm0, xmm1)				\
	AS2(	pmuludq		xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])		\
	AS2(	paddq		xmm4, xmm0)				\
	AS2(	paddd		xmm6, xmm1)

#define Bot_SaveAcc(k)					\
	SSE2_SaveShift(k)							\
	AS2(	add			edi, 16)	\
	AS2(	add			edx, 16)	\
	AS2(	movdqa		xmm6, [esi])	\
	AS2(	movdqa		xmm0, [edi])	\
	AS2(	pmuludq		xmm0, xmm6)				\
	AS2(	paddq		xmm4, xmm0)				\
	AS2(	psllq		xmm5, 16)				\
	AS2(	paddq		xmm4, xmm5)				\
	AS2(	pmuludq		xmm6, [edx])

#define Bot_End(n)							\
	AS2(	movhlps		xmm7, xmm6)			\
	AS2(	paddd		xmm6, xmm7)			\
	AS2(	psllq		xmm6, 32)			\
	AS2(	paddd		xmm4, xmm6)			\
	AS2(	movq		QWORD PTR [ecx+8*((n)-1)], xmm4)	\
	AS1(	pop		esp)\
	MulEpilogue

#define Top_Begin(n)							\
	TopPrologue									\
	AS2(	mov		edx, esp)\
	AS2(	and		esp, 0xfffffff0)\
	AS2(	sub		esp, 48*n+16)\
	AS1(	push	edx)\
	AS2(	xor		edx, edx)					\
	ASL(1)										\
	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
	ASS(	pshufd	xmm2, [edi+edx], 3,1,2,0)	\
	AS2(	movdqa	[esp+20+2*edx], xmm0)		\
	AS2(	psrlq	xmm0, 32)					\
	AS2(	movdqa	[esp+20+2*edx+16], xmm0)	\
	AS2(	movdqa	[esp+20+16*n+2*edx], xmm1)		\
	AS2(	psrlq	xmm1, 32)					\
	AS2(	movdqa	[esp+20+16*n+2*edx+16], xmm1)	\
	AS2(	movdqa	[esp+20+32*n+2*edx], xmm2)		\
	AS2(	psrlq	xmm2, 32)					\
	AS2(	movdqa	[esp+20+32*n+2*edx+16], xmm2)	\
	AS2(	add		edx, 16)					\
	AS2(	cmp		edx, 8*(n))					\
	ASJ(	jne,	1, b)						\
	AS2(	mov		eax, esi)					\
	AS2(	lea		edi, [esp+20+00*n+16*(n/2-1)])\
	AS2(	lea		edx, [esp+20+16*n+16*(n/2-1)])\
	AS2(	lea		esi, [esp+20+32*n+16*(n/2-1)])\
	AS2(	pxor	xmm4, xmm4)\
	AS2(	pxor	xmm5, xmm5)

#define Top_Acc(i)							\
	AS2(	movq		xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8])	\
	AS2(	pmuludq		xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16])	\
	AS2(	psrlq		xmm0, 48)				\
	AS2(	paddd		xmm5, xmm0)\

#define Top_Column0(i)	\
	AS2(	psllq		xmm5, 32)				\
	AS2(	add			edi, 16)	\
	AS2(	add			edx, 16)	\
	SSE2_MulAdd45\
	Mul_Acc##i(i)	\

#define Top_Column1(i)	\
	SSE2_SaveShift(0)					\
	AS2(	add			esi, 16)	\
	SSE2_MulAdd45\
	Mul_Acc##i(i)	\
	AS2(	shr			eax, 16)	\
	AS2(	movd		xmm0, eax)\
	AS2(	movd		xmm1, [ecx+4])\
	AS2(	psrld		xmm1, 16)\
	AS2(	pcmpgtd		xmm1, xmm0)\
	AS2(	psrld		xmm1, 31)\
	AS2(	paddd		xmm4, xmm1)\

void SSE2_Square4(word *C, const word *A)
{
	Squ_Begin(2)
	Squ_Column0(0, 1)
	Squ_End(2)
}

void SSE2_Square8(word *C, const word *A)
{
	Squ_Begin(4)
#ifndef __GNUC__
	ASJ(	jmp,	0, f)
	Squ_Acc(2)
	AS1(	ret) ASL(0)
#endif
	Squ_Column0(0, 1)
	Squ_Column1(1, 1)
	Squ_Column0(2, 2)
	Squ_Column1(3, 1)
	Squ_Column0(4, 1)
	Squ_End(4)
}

void SSE2_Square16(word *C, const word *A)
{
	Squ_Begin(8)
#ifndef __GNUC__
	ASJ(	jmp,	0, f)
	Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
	AS1(	ret) ASL(0)
#endif
	Squ_Column0(0, 1)
	Squ_Column1(1, 1)
	Squ_Column0(2, 2)
	Squ_Column1(3, 2)
	Squ_Column0(4, 3)
	Squ_Column1(5, 3)
	Squ_Column0(6, 4)
	Squ_Column1(7, 3)
	Squ_Column0(8, 3)
	Squ_Column1(9, 2)
	Squ_Column0(10, 2)
	Squ_Column1(11, 1)
	Squ_Column0(12, 1)
	Squ_End(8)
}

void SSE2_Square32(word *C, const word *A)
{
	Squ_Begin(16)
	ASJ(	jmp,	0, f)
	Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
	AS1(	ret) ASL(0)
	Squ_Column0(0, 1)
	Squ_Column1(1, 1)
	Squ_Column0(2, 2)
	Squ_Column1(3, 2)
	Squ_Column0(4, 3)
	Squ_Column1(5, 3)
	Squ_Column0(6, 4)
	Squ_Column1(7, 4)
	Squ_Column0(8, 5)
	Squ_Column1(9, 5)
	Squ_Column0(10, 6)
	Squ_Column1(11, 6)
	Squ_Column0(12, 7)
	Squ_Column1(13, 7)
	Squ_Column0(14, 8)
	Squ_Column1(15, 7)
	Squ_Column0(16, 7)
	Squ_Column1(17, 6)
	Squ_Column0(18, 6)
	Squ_Column1(19, 5)
	Squ_Column0(20, 5)
	Squ_Column1(21, 4)
	Squ_Column0(22, 4)
	Squ_Column1(23, 3)
	Squ_Column0(24, 3)
	Squ_Column1(25, 2)
	Squ_Column0(26, 2)
	Squ_Column1(27, 1)
	Squ_Column0(28, 1)
	Squ_End(16)
}

void SSE2_Multiply4(word *C, const word *A, const word *B)
{
	Mul_Begin(2)
#ifndef __GNUC__
	ASJ(	jmp,	0, f)
	Mul_Acc(2)
	AS1(	ret) ASL(0)
#endif
	Mul_Column0(0, 2)
	Mul_End(2)
}

void SSE2_Multiply8(word *C, const word *A, const word *B)
{
	Mul_Begin(4)
#ifndef __GNUC__
	ASJ(	jmp,	0, f)
	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
	AS1(	ret) ASL(0)
#endif
	Mul_Column0(0, 2)
	Mul_Column1(1, 3)
	Mul_Column0(2, 4)
	Mul_Column1(3, 3)
	Mul_Column0(4, 2)
	Mul_End(4)
}

void SSE2_Multiply16(word *C, const word *A, const word *B)
{
	Mul_Begin(8)
#ifndef __GNUC__
	ASJ(	jmp,	0, f)
	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
	AS1(	ret) ASL(0)
#endif
	Mul_Column0(0, 2)
	Mul_Column1(1, 3)
	Mul_Column0(2, 4)
	Mul_Column1(3, 5)
	Mul_Column0(4, 6)
	Mul_Column1(5, 7)
	Mul_Column0(6, 8)
	Mul_Column1(7, 7)
	Mul_Column0(8, 6)
	Mul_Column1(9, 5)
	Mul_Column0(10, 4)
	Mul_Column1(11, 3)
	Mul_Column0(12, 2)
	Mul_End(8)
}

void SSE2_Multiply32(word *C, const word *A, const word *B)
{
	Mul_Begin(16)
	ASJ(	jmp,	0, f)
	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
	AS1(	ret) ASL(0)
	Mul_Column0(0, 2)
	Mul_Column1(1, 3)
	Mul_Column0(2, 4)
	Mul_Column1(3, 5)
	Mul_Column0(4, 6)
	Mul_Column1(5, 7)
	Mul_Column0(6, 8)
	Mul_Column1(7, 9)
	Mul_Column0(8, 10)
	Mul_Column1(9, 11)
	Mul_Column0(10, 12)
	Mul_Column1(11, 13)
	Mul_Column0(12, 14)
	Mul_Column1(13, 15)
	Mul_Column0(14, 16)
	Mul_Column1(15, 15)
	Mul_Column0(16, 14)
	Mul_Column1(17, 13)
	Mul_Column0(18, 12)
	Mul_Column1(19, 11)
	Mul_Column0(20, 10)
	Mul_Column1(21, 9)
	Mul_Column0(22, 8)
	Mul_Column1(23, 7)
	Mul_Column0(24, 6)
	Mul_Column1(25, 5)
	Mul_Column0(26, 4)
	Mul_Column1(27, 3)
	Mul_Column0(28, 2)
	Mul_End(16)
}

void SSE2_MultiplyBottom4(word *C, const word *A, const word *B)
{
	Mul_Begin(2)
	Bot_SaveAcc(0) Bot_Acc(2)
	Bot_End(2)
}

void SSE2_MultiplyBottom8(word *C, const word *A, const word *B)
{
	Mul_Begin(4)
#ifndef __GNUC__
	ASJ(	jmp,	0, f)
	Mul_Acc(3) Mul_Acc(2)
	AS1(	ret) ASL(0)
#endif
	Mul_Column0(0, 2)
	Mul_Column1(1, 3)
	Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
	Bot_End(4)
}

void SSE2_MultiplyBottom16(word *C, const word *A, const word *B)
{
	Mul_Begin(8)
#ifndef __GNUC__
	ASJ(	jmp,	0, f)
	Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
	AS1(	ret) ASL(0)
#endif
	Mul_Column0(0, 2)
	Mul_Column1(1, 3)
	Mul_Column0(2, 4)
	Mul_Column1(3, 5)
	Mul_Column0(4, 6)
	Mul_Column1(5, 7)
	Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
	Bot_End(8)
}

void SSE2_MultiplyBottom32(word *C, const word *A, const word *B)
{
	Mul_Begin(16)
#ifndef __GNUC__
	ASJ(	jmp,	0, f)
	Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
	AS1(	ret) ASL(0)
#endif
	Mul_Column0(0, 2)
	Mul_Column1(1, 3)
	Mul_Column0(2, 4)
	Mul_Column1(3, 5)
	Mul_Column0(4, 6)
	Mul_Column1(5, 7)
	Mul_Column0(6, 8)
	Mul_Column1(7, 9)
	Mul_Column0(8, 10)
	Mul_Column1(9, 11)
	Mul_Column0(10, 12)
	Mul_Column1(11, 13)
	Mul_Column0(12, 14)
	Mul_Column1(13, 15)
	Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
	Bot_End(16)
}

void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L)
{
	Top_Begin(4)
	Top_Acc(3) Top_Acc(2) Top_Acc(1)
#ifndef __GNUC__
	ASJ(	jmp,	0, f)
	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
	AS1(	ret) ASL(0)
#endif
	Top_Column0(4)
	Top_Column1(3)
	Mul_Column0(0, 2)
	Top_End(2)
}

void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L)
{
	Top_Begin(8)
	Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
#ifndef __GNUC__
	ASJ(	jmp,	0, f)
	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
	AS1(	ret) ASL(0)
#endif
	Top_Column0(8)
	Top_Column1(7)
	Mul_Column0(0, 6)
	Mul_Column1(1, 5)
	Mul_Column0(2, 4)
	Mul_Column1(3, 3)
	Mul_Column0(4, 2)
	Top_End(4)
}

void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
{
	Top_Begin(16)
	Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
#ifndef __GNUC__
	ASJ(	jmp,	0, f)
	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
	AS1(	ret) ASL(0)
#endif
	Top_Column0(16)
	Top_Column1(15)
	Mul_Column0(0, 14)
	Mul_Column1(1, 13)
	Mul_Column0(2, 12)
	Mul_Column1(3, 11)
	Mul_Column0(4, 10)
	Mul_Column1(5, 9)
	Mul_Column0(6, 8)
	Mul_Column1(7, 7)
	Mul_Column0(8, 6)
	Mul_Column1(9, 5)
	Mul_Column0(10, 4)
	Mul_Column1(11, 3)
	Mul_Column0(12, 2)
	Top_End(8)
}

#endif	// #if CRYPTOPP_INTEGER_SSE2

// ********************************************************

typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
typedef void (* PMul)(word *C, const word *A, const word *B);
typedef void (* PSqu)(word *C, const word *A);
typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);

#if CRYPTOPP_INTEGER_SSE2
static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
static size_t s_recursionLimit = 8;
#else
static const size_t s_recursionLimit = 16;
#endif

static PMul s_pMul[9], s_pBot[9];
static PSqu s_pSqu[9];
static PMulTop s_pTop[9];

static void SetFunctionPointers()
{
	s_pMul[0] = &Baseline_Multiply2;
	s_pBot[0] = &Baseline_MultiplyBottom2;
	s_pSqu[0] = &Baseline_Square2;
	s_pTop[0] = &Baseline_MultiplyTop2;
	s_pTop[1] = &Baseline_MultiplyTop4;

#if CRYPTOPP_INTEGER_SSE2
	if (HasSSE2())
	{
#if _MSC_VER != 1200 || defined(NDEBUG)
		if (IsP4())
		{
			s_pAdd = &SSE2_Add;
			s_pSub = &SSE2_Sub;
		}
#endif

		s_recursionLimit = 32;

		s_pMul[1] = &SSE2_Multiply4;
		s_pMul[2] = &SSE2_Multiply8;
		s_pMul[4] = &SSE2_Multiply16;
		s_pMul[8] = &SSE2_Multiply32;

		s_pBot[1] = &SSE2_MultiplyBottom4;
		s_pBot[2] = &SSE2_MultiplyBottom8;
		s_pBot[4] = &SSE2_MultiplyBottom16;
		s_pBot[8] = &SSE2_MultiplyBottom32;

		s_pSqu[1] = &SSE2_Square4;
		s_pSqu[2] = &SSE2_Square8;
		s_pSqu[4] = &SSE2_Square16;
		s_pSqu[8] = &SSE2_Square32;

		s_pTop[2] = &SSE2_MultiplyTop8;
		s_pTop[4] = &SSE2_MultiplyTop16;
		s_pTop[8] = &SSE2_MultiplyTop32;
	}
	else
#endif
	{
		s_pMul[1] = &Baseline_Multiply4;
		s_pMul[2] = &Baseline_Multiply8;

		s_pBot[1] = &Baseline_MultiplyBottom4;
		s_pBot[2] = &Baseline_MultiplyBottom8;

		s_pSqu[1] = &Baseline_Square4;
		s_pSqu[2] = &Baseline_Square8;

		s_pTop[2] = &Baseline_MultiplyTop8;

#if	!CRYPTOPP_INTEGER_SSE2
		s_pMul[4] = &Baseline_Multiply16;
		s_pBot[4] = &Baseline_MultiplyBottom16;
		s_pSqu[4] = &Baseline_Square16;
		s_pTop[4] = &Baseline_MultiplyTop16;
#endif
	}
}

inline int Add(word *C, const word *A, const word *B, size_t N)
{
#if CRYPTOPP_INTEGER_SSE2
	return s_pAdd(N, C, A, B);
#else
	return Baseline_Add(N, C, A, B);
#endif
}

inline int Subtract(word *C, const word *A, const word *B, size_t N)
{
#if CRYPTOPP_INTEGER_SSE2
	return s_pSub(N, C, A, B);
#else
	return Baseline_Sub(N, C, A, B);
#endif
}

// ********************************************************


#define A0		A
#define A1		(A+N2)
#define B0		B
#define B1		(B+N2)

#define T0		T
#define T1		(T+N2)
#define T2		(T+N)
#define T3		(T+N+N2)

#define R0		R
#define R1		(R+N2)
#define R2		(R+N)
#define R3		(R+N+N2)

// R[2*N] - result = A*B
// T[2*N] - temporary work space
// A[N] --- multiplier
// B[N] --- multiplicant

void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)
{
	assert(N>=2 && N%2==0);

	if (N <= s_recursionLimit)
		s_pMul[N/4](R, A, B);
	else
	{
		const size_t N2 = N/2;

		size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;
		Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);

		size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;
		Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);

		RecursiveMultiply(R2, T2, A1, B1, N2);
		RecursiveMultiply(T0, T2, R0, R1, N2);
		RecursiveMultiply(R0, T2, A0, B0, N2);

		// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1

		int c2 = Add(R2, R2, R1, N2);
		int c3 = c2;
		c2 += Add(R1, R2, R0, N2);
		c3 += Add(R2, R2, R3, N2);

		if (AN2 == BN2)
			c3 -= Subtract(R1, R1, T0, N);
		else
			c3 += Add(R1, R1, T0, N);

		c3 += Increment(R2, N2, c2);
		assert (c3 >= 0 && c3 <= 2);
		Increment(R3, N2, c3);
	}
}

// R[2*N] - result = A*A
// T[2*N] - temporary work space
// A[N] --- number to be squared

void RecursiveSquare(word *R, word *T, const word *A, size_t N)
{
	assert(N && N%2==0);

	if (N <= s_recursionLimit)
		s_pSqu[N/4](R, A);
	else
	{
		const size_t N2 = N/2;

		RecursiveSquare(R0, T2, A0, N2);
		RecursiveSquare(R2, T2, A1, N2);
		RecursiveMultiply(T0, T2, A0, A1, N2);

		int carry = Add(R1, R1, T0, N);
		carry += Add(R1, R1, T0, N);
		Increment(R3, N2, carry);
	}
}

// R[N] - bottom half of A*B
// T[3*N/2] - temporary work space
// A[N] - multiplier
// B[N] - multiplicant

void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
{
	assert(N>=2 && N%2==0);

	if (N <= s_recursionLimit)
		s_pBot[N/4](R, A, B);
	else
	{
		const size_t N2 = N/2;

		RecursiveMultiply(R, T, A0, B0, N2);
		RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
		Add(R1, R1, T0, N2);
		RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
		Add(R1, R1, T0, N2);
	}
}

// R[N] --- upper half of A*B
// T[2*N] - temporary work space
// L[N] --- lower half of A*B
// A[N] --- multiplier
// B[N] --- multiplicant

void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
{
	assert(N>=2 && N%2==0);

	if (N <= s_recursionLimit)
		s_pTop[N/4](R, A, B, L[N-1]);
	else
	{
		const size_t N2 = N/2;

		size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;
		Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);

		size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;
		Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);

		RecursiveMultiply(T0, T2, R0, R1, N2);
		RecursiveMultiply(R0, T2, A1, B1, N2);

		// now T[01] holds (A1-A0)*(B0-B1) = A1*B0+A0*B1-A1*B1-A0*B0, R[01] holds A1*B1

		int t, c3;
		int c2 = Subtract(T2, L+N2, L, N2);

		if (AN2 == BN2)
		{
			c2 -= Add(T2, T2, T0, N2);
			t = (Compare(T2, R0, N2) == -1);
			c3 = t - Subtract(T2, T2, T1, N2);
		}
		else
		{
			c2 += Subtract(T2, T2, T0, N2);
			t = (Compare(T2, R0, N2) == -1);
			c3 = t + Add(T2, T2, T1, N2);
		}

		c2 += t;
		if (c2 >= 0)
			c3 += Increment(T2, N2, c2);
		else
			c3 -= Decrement(T2, N2, -c2);
		c3 += Add(R0, T2, R1, N2);

		assert (c3 >= 0 && c3 <= 2);
		Increment(R1, N2, c3);
	}
}

inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N)
{
	RecursiveMultiply(R, T, A, B, N);
}

inline void Square(word *R, word *T, const word *A, size_t N)
{
	RecursiveSquare(R, T, A, N);
}

inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
{
	RecursiveMultiplyBottom(R, T, A, B, N);
}

// R[NA+NB] - result = A*B
// T[NA+NB] - temporary work space
// A[NA] ---- multiplier
// B[NB] ---- multiplicant

void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word *B, size_t NB)
{
	if (NA == NB)
	{
		if (A == B)
			Square(R, T, A, NA);
		else
			Multiply(R, T, A, B, NA);

		return;
	}

	if (NA > NB)
	{
		std::swap(A, B);
		std::swap(NA, NB);
	}

	assert(NB % NA == 0);

	if (NA==2 && !A[1])
	{
		switch (A[0])
		{
		case 0:
			SetWords(R, 0, NB+2);
			return;
		case 1:
			CopyWords(R, B, NB);
			R[NB] = R[NB+1] = 0;
			return;
		default:
			R[NB] = LinearMultiply(R, B, A[0], NB);
			R[NB+1] = 0;
			return;
		}
	}

	size_t i;
	if ((NB/NA)%2 == 0)
	{
		Multiply(R, T, A, B, NA);
		CopyWords(T+2*NA, R+NA, NA);

		for (i=2*NA; i<NB; i+=2*NA)
			Multiply(T+NA+i, T, A, B+i, NA);
		for (i=NA; i<NB; i+=2*NA)
			Multiply(R+i, T, A, B+i, NA);
	}
	else
	{
		for (i=0; i<NB; i+=2*NA)
			Multiply(R+i, T, A, B+i, NA);
		for (i=NA; i<NB; i+=2*NA)
			Multiply(T+NA+i, T, A, B+i, NA);
	}

	if (Add(R+NA, R+NA, T+2*NA, NB-NA))
		Increment(R+NB, NA);
}

// R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
// T[3*N/2] - temporary work space
// A[N] ----- an odd number as input

void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N)
{
	if (N==2)
	{
		T[0] = AtomicInverseModPower2(A[0]);
		T[1] = 0;
		s_pBot[0](T+2, T, A);
		TwosComplement(T+2, 2);
		Increment(T+2, 2, 2);
		s_pBot[0](R, T, T+2);
	}
	else
	{
		const size_t N2 = N/2;
		RecursiveInverseModPower2(R0, T0, A0, N2);
		T0[0] = 1;
		SetWords(T0+1, 0, N2-1);
		MultiplyTop(R1, T1, T0, R0, A0, N2);
		MultiplyBottom(T0, T1, R0, A1, N2);
		Add(T0, R1, T0, N2);
		TwosComplement(T0, N2);
		MultiplyBottom(R1, T1, R0, T0, N2);
	}
}

// R[N] --- result = X/(2**(WORD_BITS*N)) mod M
// T[3*N] - temporary work space
// X[2*N] - number to be reduced
// M[N] --- modulus
// U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)

void MontgomeryReduce(word *R, word *T, word *X, const word *M, const word *U, size_t N)
{
#if 1
	MultiplyBottom(R, T, X, U, N);
	MultiplyTop(T, T+N, X, R, M, N);
	word borrow = Subtract(T, X+N, T, N);
	// defend against timing attack by doing this Add even when not needed
	word carry = Add(T+N, T, M, N);
	assert(carry | !borrow);
	CopyWords(R, T + ((0-borrow) & N), N);
#elif 0
	const word u = 0-U[0];
	Declare2Words(p)
	for (size_t i=0; i<N; i++)
	{
		const word t = u * X[i];
		word c = 0;
		for (size_t j=0; j<N; j+=2)
		{
			MultiplyWords(p, t, M[j]);
			Acc2WordsBy1(p, X[i+j]);
			Acc2WordsBy1(p, c);
			X[i+j] = LowWord(p);
			c = HighWord(p);
			MultiplyWords(p, t, M[j+1]);
			Acc2WordsBy1(p, X[i+j+1]);
			Acc2WordsBy1(p, c);
			X[i+j+1] = LowWord(p);
			c = HighWord(p);
		}

		if (Increment(X+N+i, N-i, c))
			while (!Subtract(X+N, X+N, M, N)) {}
	}

	memcpy(R, X+N, N*WORD_SIZE);
#else
	__m64 u = _mm_cvtsi32_si64(0-U[0]), p;
	for (size_t i=0; i<N; i++)
	{
		__m64 t = _mm_cvtsi32_si64(X[i]);
		t = _mm_mul_su32(t, u);
		__m64 c = _mm_setzero_si64();
		for (size_t j=0; j<N; j+=2)
		{
			p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j]));
			p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j]));
			c = _mm_add_si64(c, p);
			X[i+j] = _mm_cvtsi64_si32(c);
			c = _mm_srli_si64(c, 32);
			p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j+1]));
			p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j+1]));
			c = _mm_add_si64(c, p);
			X[i+j+1] = _mm_cvtsi64_si32(c);
			c = _mm_srli_si64(c, 32);
		}

		if (Increment(X+N+i, N-i, _mm_cvtsi64_si32(c)))
			while (!Subtract(X+N, X+N, M, N)) {}
	}

	memcpy(R, X+N, N*WORD_SIZE);
	_mm_empty();
#endif
}

// R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M
// T[2*N] - temporary work space
// X[2*N] - number to be reduced
// M[N] --- modulus
// U[N/2] - multiplicative inverse of M mod 2**(WORD_BITS*N/2)
// V[N] --- 2**(WORD_BITS*3*N/2) mod M

void HalfMontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, const word *V, size_t N)
{
	assert(N%2==0 && N>=4);

#define M0		M
#define M1		(M+N2)
#define V0		V
#define V1		(V+N2)

#define X0		X
#define X1		(X+N2)
#define X2		(X+N)
#define X3		(X+N+N2)

	const size_t N2 = N/2;
	Multiply(T0, T2, V0, X3, N2);
	int c2 = Add(T0, T0, X0, N);
	MultiplyBottom(T3, T2, T0, U, N2);
	MultiplyTop(T2, R, T0, T3, M0, N2);
	c2 -= Subtract(T2, T1, T2, N2);
	Multiply(T0, R, T3, M1, N2);
	c2 -= Subtract(T0, T2, T0, N2);
	int c3 = -(int)Subtract(T1, X2, T1, N2);
	Multiply(R0, T2, V1, X3, N2);
	c3 += Add(R, R, T, N);

	if (c2>0)
		c3 += Increment(R1, N2);
	else if (c2<0)
		c3 -= Decrement(R1, N2, -c2);

	assert(c3>=-1 && c3<=1);
	if (c3>0)
		Subtract(R, R, M, N);
	else if (c3<0)
		Add(R, R, M, N);

#undef M0
#undef M1
#undef V0
#undef V1

#undef X0
#undef X1
#undef X2
#undef X3
}

#undef A0
#undef A1
#undef B0
#undef B1

#undef T0
#undef T1
#undef T2
#undef T3

#undef R0
#undef R1
#undef R2
#undef R3

/*
// do a 3 word by 2 word divide, returns quotient and leaves remainder in A
static word SubatomicDivide(word *A, word B0, word B1)
{
	// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
	assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));

	// estimate the quotient: do a 2 word by 1 word divide
	word Q;
	if (B1+1 == 0)
		Q = A[2];
	else
		Q = DWord(A[1], A[2]).DividedBy(B1+1);

	// now subtract Q*B from A
	DWord p = DWord::Multiply(B0, Q);
	DWord u = (DWord) A[0] - p.GetLowHalf();
	A[0] = u.GetLowHalf();
	u = (DWord) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - DWord::Multiply(B1, Q);
	A[1] = u.GetLowHalf();
	A[2] += u.GetHighHalf();

	// Q <= actual quotient, so fix it
	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
	{
		u = (DWord) A[0] - B0;
		A[0] = u.GetLowHalf();
		u = (DWord) A[1] - B1 - u.GetHighHalfAsBorrow();
		A[1] = u.GetLowHalf();
		A[2] += u.GetHighHalf();
		Q++;
		assert(Q);	// shouldn't overflow
	}

	return Q;
}

// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
static inline void AtomicDivide(word *Q, const word *A, const word *B)
{
	if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
	{
		Q[0] = A[2];
		Q[1] = A[3];
	}
	else
	{
		word T[4];
		T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
		Q[1] = SubatomicDivide(T+1, B[0], B[1]);
		Q[0] = SubatomicDivide(T, B[0], B[1]);

#ifndef NDEBUG
		// multiply quotient and divisor and add remainder, make sure it equals dividend
		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
		word P[4];
		LowLevel::Multiply2(P, Q, B);
		Add(P, P, T, 4);
		assert(memcmp(P, A, 4*WORD_SIZE)==0);
#endif
	}
}
*/

static inline void AtomicDivide(word *Q, const word *A, const word *B)
{
	word T[4];
	DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1]));
	Q[0] = q.GetLowHalf();
	Q[1] = q.GetHighHalf();

#ifndef NDEBUG
	if (B[0] || B[1])
	{
		// multiply quotient and divisor and add remainder, make sure it equals dividend
		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
		word P[4];
		s_pMul[0](P, Q, B);
		Add(P, P, T, 4);
		assert(memcmp(P, A, 4*WORD_SIZE)==0);
	}
#endif
}

// for use by Divide(), corrects the underestimated quotient {Q1,Q0}
static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, size_t N)
{
	assert(N && N%2==0);

	AsymmetricMultiply(T, T+N+2, Q, 2, B, N);

	word borrow = Subtract(R, R, T, N+2);
	assert(!borrow && !R[N+1]);

	while (R[N] || Compare(R, B, N) >= 0)
	{
		R[N] -= Subtract(R, R, B, N);
		Q[1] += (++Q[0]==0);
		assert(Q[0] || Q[1]); // no overflow
	}
}

// R[NB] -------- remainder = A%B
// Q[NA-NB+2] --- quotient	= A/B
// T[NA+3*(NB+2)] - temp work space
// A[NA] -------- dividend
// B[NB] -------- divisor

void Divide(word *R, word *Q, word *T, const word *A, size_t NA, const word *B, size_t NB)
{
	assert(NA && NB && NA%2==0 && NB%2==0);
	assert(B[NB-1] || B[NB-2]);
	assert(NB <= NA);

	// set up temporary work space
	word *const TA=T;
	word *const TB=T+NA+2;
	word *const TP=T+NA+2+NB;

	// copy B into TB and normalize it so that TB has highest bit set to 1
	unsigned shiftWords = (B[NB-1]==0);
	TB[0] = TB[NB-1] = 0;
	CopyWords(TB+shiftWords, B, NB-shiftWords);
	unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
	assert(shiftBits < WORD_BITS);
	ShiftWordsLeftByBits(TB, NB, shiftBits);

	// copy A into TA and normalize it
	TA[0] = TA[NA] = TA[NA+1] = 0;
	CopyWords(TA+shiftWords, A, NA);
	ShiftWordsLeftByBits(TA, NA+2, shiftBits);

	if (TA[NA+1]==0 && TA[NA] <= 1)
	{
		Q[NA-NB+1] = Q[NA-NB] = 0;
		while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
		{
			TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
			++Q[NA-NB];
		}
	}
	else
	{
		NA+=2;
		assert(Compare(TA+NA-NB, TB, NB) < 0);
	}

	word BT[2];
	BT[0] = TB[NB-2] + 1;
	BT[1] = TB[NB-1] + (BT[0]==0);

	// start reducing TA mod TB, 2 words at a time
	for (size_t i=NA-2; i>=NB; i-=2)
	{
		AtomicDivide(Q+i-NB, TA+i-2, BT);
		CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
	}

	// copy TA into R, and denormalize it
	CopyWords(R, TA+shiftWords, NB);
	ShiftWordsRightByBits(R, NB, shiftBits);
}

static inline size_t EvenWordCount(const word *X, size_t N)
{
	while (N && X[N-2]==0 && X[N-1]==0)
		N-=2;
	return N;
}

// return k
// R[N] --- result = A^(-1) * 2^k mod M
// T[4*N] - temporary work space
// A[NA] -- number to take inverse of
// M[N] --- modulus

unsigned int AlmostInverse(word *R, word *T, const word *A, size_t NA, const word *M, size_t N)
{
	assert(NA<=N && N && N%2==0);

	word *b = T;
	word *c = T+N;
	word *f = T+2*N;
	word *g = T+3*N;
	size_t bcLen=2, fgLen=EvenWordCount(M, N);
	unsigned int k=0, s=0;

	SetWords(T, 0, 3*N);
	b[0]=1;
	CopyWords(f, A, NA);
	CopyWords(g, M, N);

	while (1)
	{
		word t=f[0];
		while (!t)
		{
			if (EvenWordCount(f, fgLen)==0)
			{
				SetWords(R, 0, N);
				return 0;
			}

			ShiftWordsRightByWords(f, fgLen, 1);
			if (c[bcLen-1]) bcLen+=2;
			assert(bcLen <= N);
			ShiftWordsLeftByWords(c, bcLen, 1);
			k+=WORD_BITS;
			t=f[0];
		}

		unsigned int i=0;
		while (t%2 == 0)
		{
			t>>=1;
			i++;
		}
		k+=i;

		if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
		{
			if (s%2==0)
				CopyWords(R, b, N);
			else
				Subtract(R, M, b, N);
			return k;
		}

		ShiftWordsRightByBits(f, fgLen, i);
		t=ShiftWordsLeftByBits(c, bcLen, i);
		if (t)
		{
			c[bcLen] = t;
			bcLen+=2;
			assert(bcLen <= N);
		}

		if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
			fgLen-=2;

		if (Compare(f, g, fgLen)==-1)
		{
			std::swap(f, g);
			std::swap(b, c);
			s++;
		}

		Subtract(f, f, g, fgLen);

		if (Add(b, b, c, bcLen))
		{
			b[bcLen] = 1;
			bcLen+=2;
			assert(bcLen <= N);
		}
	}
}

// R[N] - result = A/(2^k) mod M
// A[N] - input
// M[N] - modulus

void DivideByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
{
	CopyWords(R, A, N);

	while (k--)
	{
		if (R[0]%2==0)
			ShiftWordsRightByBits(R, N, 1);
		else
		{
			word carry = Add(R, R, M, N);
			ShiftWordsRightByBits(R, N, 1);
			R[N-1] += carry<<(WORD_BITS-1);
		}
	}
}

// R[N] - result = A*(2^k) mod M
// A[N] - input
// M[N] - modulus

void MultiplyByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
{
	CopyWords(R, A, N);

	while (k--)
		if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
			Subtract(R, R, M, N);
}

// ******************************************************************

InitializeInteger::InitializeInteger()
{
	if (!g_pAssignIntToInteger)
	{
		SetFunctionPointers();
		g_pAssignIntToInteger = AssignIntToInteger;
	}
}

static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};

static inline size_t RoundupSize(size_t n)
{
	if (n<=8)
		return RoundupSizeTable[n];
	else if (n<=16)
		return 16;
	else if (n<=32)
		return 32;
	else if (n<=64)
		return 64;
	else return size_t(1) << BitPrecision(n-1);
}

Integer::Integer()
	: reg(2), sign(POSITIVE)
{
	reg[0] = reg[1] = 0;
}

Integer::Integer(const Integer& t)
	: reg(RoundupSize(t.WordCount())), sign(t.sign)
{
	CopyWords(reg, t.reg, reg.size());
}

Integer::Integer(Sign s, lword value)
	: reg(2), sign(s)
{
	reg[0] = word(value);
	reg[1] = word(SafeRightShift<WORD_BITS>(value));
}

Integer::Integer(signed long value)
	: reg(2)
{
	if (value >= 0)
		sign = POSITIVE;
	else
	{
		sign = NEGATIVE;
		value = -value;
	}
	reg[0] = word(value);
	reg[1] = word(SafeRightShift<WORD_BITS>((unsigned long)value));
}

Integer::Integer(Sign s, word high, word low)
	: reg(2), sign(s)
{
	reg[0] = low;
	reg[1] = high;
}

bool Integer::IsConvertableToLong() const
{
	if (ByteCount() > sizeof(long))
		return false;

	unsigned long value = (unsigned long)reg[0];
	value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);

	if (sign==POSITIVE)
		return (signed long)value >= 0;
	else
		return -(signed long)value < 0;
}

signed long Integer::ConvertToLong() const
{
	assert(IsConvertableToLong());

	unsigned long value = (unsigned long)reg[0];
	value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
	return sign==POSITIVE ? value : -(signed long)value;
}

Integer::Integer(BufferedTransformation &encodedInteger, size_t byteCount, Signedness s)
{
	Decode(encodedInteger, byteCount, s);
}

Integer::Integer(const byte *encodedInteger, size_t byteCount, Signedness s)
{
	Decode(encodedInteger, byteCount, s);
}

Integer::Integer(BufferedTransformation &bt)
{
	BERDecode(bt);
}

Integer::Integer(RandomNumberGenerator &rng, size_t bitcount)
{
	Randomize(rng, bitcount);
}

Integer::Integer(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
{
	if (!Randomize(rng, min, max, rnType, equiv, mod))
		throw Integer::RandomNumberNotFound();
}

Integer Integer::Power2(size_t e)
{
	Integer r((word)0, BitsToWords(e+1));
	r.SetBit(e);
	return r;
}

template <long i>
struct NewInteger
{
	Integer * operator()() const
	{
		return new Integer(i);
	}
};

const Integer &Integer::Zero()
{
	return Singleton<Integer>().Ref();
}

const Integer &Integer::One()
{
	return Singleton<Integer, NewInteger<1> >().Ref();
}

const Integer &Integer::Two()
{
	return Singleton<Integer, NewInteger<2> >().Ref();
}

bool Integer::operator!() const
{
	return IsNegative() ? false : (reg[0]==0 && WordCount()==0);
}

Integer& Integer::operator=(const Integer& t)
{
	if (this != &t)
	{
		if (reg.size() != t.reg.size() || t.reg[t.reg.size()/2] == 0)
			reg.New(RoundupSize(t.WordCount()));
		CopyWords(reg, t.reg, reg.size());
		sign = t.sign;
	}
	return *this;
}

bool Integer::GetBit(size_t n) const
{
	if (n/WORD_BITS >= reg.size())
		return 0;
	else
		return bool((reg[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
}

void Integer::SetBit(size_t n, bool value)
{
	if (value)
	{
		reg.CleanGrow(RoundupSize(BitsToWords(n+1)));
		reg[n/WORD_BITS] |= (word(1) << (n%WORD_BITS));
	}
	else
	{
		if (n/WORD_BITS < reg.size())
			reg[n/WORD_BITS] &= ~(word(1) << (n%WORD_BITS));
	}
}

byte Integer::GetByte(size_t n) const
{
	if (n/WORD_SIZE >= reg.size())
		return 0;
	else
		return byte(reg[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
}

void Integer::SetByte(size_t n, byte value)
{
	reg.CleanGrow(RoundupSize(BytesToWords(n+1)));
	reg[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
	reg[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
}

lword Integer::GetBits(size_t i, size_t n) const
{
	lword v = 0;
	assert(n <= sizeof(v)*8);
	for (unsigned int j=0; j<n; j++)
		v |= lword(GetBit(i+j)) << j;
	return v;
}

Integer Integer::operator-() const
{
	Integer result(*this);
	result.Negate();
	return result;
}

Integer Integer::AbsoluteValue() const
{
	Integer result(*this);
	result.sign = POSITIVE;
	return result;
}

void Integer::swap(Integer &a)
{
	reg.swap(a.reg);
	std::swap(sign, a.sign);
}

Integer::Integer(word value, size_t length)
	: reg(RoundupSize(length)), sign(POSITIVE)
{
	reg[0] = value;
	SetWords(reg+1, 0, reg.size()-1);
}

template <class T>
static Integer StringToInteger(const T *str)
{
	int radix;
	// GCC workaround
	// std::char_traits<wchar_t>::length() not defined in GCC 3.2 and STLport 4.5.3
	unsigned int length;
	for (length = 0; str[length] != 0; length++) {}

	Integer v;

	if (length == 0)
		return v;

	switch (str[length-1])
	{
	case 'h':
	case 'H':
		radix=16;
		break;
	case 'o':
	case 'O':
		radix=8;
		break;
	case 'b':
	case 'B':
		radix=2;
		break;
	default:
		radix=10;
	}

	if (length > 2 && str[0] == '0' && str[1] == 'x')
		radix = 16;

	for (unsigned i=0; i<length; i++)
	{
		int digit;

		if (str[i] >= '0' && str[i] <= '9')
			digit = str[i] - '0';
		else if (str[i] >= 'A' && str[i] <= 'F')
			digit = str[i] - 'A' + 10;
		else if (str[i] >= 'a' && str[i] <= 'f')
			digit = str[i] - 'a' + 10;
		else
			digit = radix;

		if (digit < radix)
		{
			v *= radix;
			v += digit;
		}
	}

	if (str[0] == '-')
		v.Negate();

	return v;
}

Integer::Integer(const char *str)
	: reg(2), sign(POSITIVE)
{
	*this = StringToInteger(str);
}

Integer::Integer(const wchar_t *str)
	: reg(2), sign(POSITIVE)
{
	*this = StringToInteger(str);
}

unsigned int Integer::WordCount() const
{
	return (unsigned int)CountWords(reg, reg.size());
}

unsigned int Integer::ByteCount() const
{
	unsigned wordCount = WordCount();
	if (wordCount)
		return (wordCount-1)*WORD_SIZE + BytePrecision(reg[wordCount-1]);
	else
		return 0;
}

unsigned int Integer::BitCount() const
{
	unsigned wordCount = WordCount();
	if (wordCount)
		return (wordCount-1)*WORD_BITS + BitPrecision(reg[wordCount-1]);
	else
		return 0;
}

void Integer::Decode(const byte *input, size_t inputLen, Signedness s)
{
	StringStore store(input, inputLen);
	Decode(store, inputLen, s);
}

void Integer::Decode(BufferedTransformation &bt, size_t inputLen, Signedness s)
{
	assert(bt.MaxRetrievable() >= inputLen);

	byte b;
	bt.Peek(b);
	sign = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;

	while (inputLen>0 && (sign==POSITIVE ? b==0 : b==0xff))
	{
		bt.Skip(1);
		inputLen--;
		bt.Peek(b);
	}

	reg.CleanNew(RoundupSize(BytesToWords(inputLen)));

	for (size_t i=inputLen; i > 0; i--)
	{
		bt.Get(b);
		reg[(i-1)/WORD_SIZE] |= word(b) << ((i-1)%WORD_SIZE)*8;
	}

	if (sign == NEGATIVE)
	{
		for (size_t i=inputLen; i<reg.size()*WORD_SIZE; i++)
			reg[i/WORD_SIZE] |= word(0xff) << (i%WORD_SIZE)*8;
		TwosComplement(reg, reg.size());
	}
}

size_t Integer::MinEncodedSize(Signedness signedness) const
{
	unsigned int outputLen = STDMAX(1U, ByteCount());
	if (signedness == UNSIGNED)
		return outputLen;
	if (NotNegative() && (GetByte(outputLen-1) & 0x80))
		outputLen++;
	if (IsNegative() && *this < -Power2(outputLen*8-1))
		outputLen++;
	return outputLen;
}

void Integer::Encode(byte *output, size_t outputLen, Signedness signedness) const
{
	ArraySink sink(output, outputLen);
	Encode(sink, outputLen, signedness);
}

void Integer::Encode(BufferedTransformation &bt, size_t outputLen, Signedness signedness) const
{
	if (signedness == UNSIGNED || NotNegative())
	{
		for (size_t i=outputLen; i > 0; i--)
			bt.Put(GetByte(i-1));
	}
	else
	{
		// take two's complement of *this
		Integer temp = Integer::Power2(8*STDMAX((size_t)ByteCount(), outputLen)) + *this;
		temp.Encode(bt, outputLen, UNSIGNED);
	}
}

void Integer::DEREncode(BufferedTransformation &bt) const
{
	DERGeneralEncoder enc(bt, INTEGER);
	Encode(enc, MinEncodedSize(SIGNED), SIGNED);
	enc.MessageEnd();
}

void Integer::BERDecode(const byte *input, size_t len)
{
	StringStore store(input, len);
	BERDecode(store);
}

void Integer::BERDecode(BufferedTransformation &bt)
{
	BERGeneralDecoder dec(bt, INTEGER);
	if (!dec.IsDefiniteLength() || dec.MaxRetrievable() < dec.RemainingLength())
		BERDecodeError();
	Decode(dec, (size_t)dec.RemainingLength(), SIGNED);
	dec.MessageEnd();
}

void Integer::DEREncodeAsOctetString(BufferedTransformation &bt, size_t length) const
{
	DERGeneralEncoder enc(bt, OCTET_STRING);
	Encode(enc, length);
	enc.MessageEnd();
}

void Integer::BERDecodeAsOctetString(BufferedTransformation &bt, size_t length)
{
	BERGeneralDecoder dec(bt, OCTET_STRING);
	if (!dec.IsDefiniteLength() || dec.RemainingLength() != length)
		BERDecodeError();
	Decode(dec, length);
	dec.MessageEnd();
}

size_t Integer::OpenPGPEncode(byte *output, size_t len) const
{
	ArraySink sink(output, len);
	return OpenPGPEncode(sink);
}

size_t Integer::OpenPGPEncode(BufferedTransformation &bt) const
{
	word16 bitCount = BitCount();
	bt.PutWord16(bitCount);
	size_t byteCount = BitsToBytes(bitCount);
	Encode(bt, byteCount);
	return 2 + byteCount;
}

void Integer::OpenPGPDecode(const byte *input, size_t len)
{
	StringStore store(input, len);
	OpenPGPDecode(store);
}

void Integer::OpenPGPDecode(BufferedTransformation &bt)
{
	word16 bitCount;
	if (bt.GetWord16(bitCount) != 2 || bt.MaxRetrievable() < BitsToBytes(bitCount))
		throw OpenPGPDecodeErr();
	Decode(bt, BitsToBytes(bitCount));
}

void Integer::Randomize(RandomNumberGenerator &rng, size_t nbits)
{
	const size_t nbytes = nbits/8 + 1;
	SecByteBlock buf(nbytes);
	rng.GenerateBlock(buf, nbytes);
	if (nbytes)
		buf[0] = (byte)Crop(buf[0], nbits % 8);
	Decode(buf, nbytes, UNSIGNED);
}

void Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max)
{
	if (min > max)
		throw InvalidArgument("Integer: Min must be no greater than Max");

	Integer range = max - min;
	const unsigned int nbits = range.BitCount();

	do
	{
		Randomize(rng, nbits);
	}
	while (*this > range);

	*this += min;
}

bool Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
{
	return GenerateRandomNoThrow(rng, MakeParameters("Min", min)("Max", max)("RandomNumberType", rnType)("EquivalentTo", equiv)("Mod", mod));
}

class KDF2_RNG : public RandomNumberGenerator
{
public:
	KDF2_RNG(const byte *seed, size_t seedSize)
		: m_counter(0), m_counterAndSeed(seedSize + 4)
	{
		memcpy(m_counterAndSeed + 4, seed, seedSize);
	}

	void GenerateBlock(byte *output, size_t size)
	{
		PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
		++m_counter;
		P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULL, 0);
	}

private:
	word32 m_counter;
	SecByteBlock m_counterAndSeed;
};

bool Integer::GenerateRandomNoThrow(RandomNumberGenerator &i_rng, const NameValuePairs &params)
{
	Integer min = params.GetValueWithDefault("Min", Integer::Zero());
	Integer max;
	if (!params.GetValue("Max", max))
	{
		int bitLength;
		if (params.GetIntValue("BitLength", bitLength))
			max = Integer::Power2(bitLength);
		else
			throw InvalidArgument("Integer: missing Max argument");
	}
	if (min > max)
		throw InvalidArgument("Integer: Min must be no greater than Max");

	Integer equiv = params.GetValueWithDefault("EquivalentTo", Integer::Zero());
	Integer mod = params.GetValueWithDefault("Mod", Integer::One());

	if (equiv.IsNegative() || equiv >= mod)
		throw InvalidArgument("Integer: invalid EquivalentTo and/or Mod argument");

	Integer::RandomNumberType rnType = params.GetValueWithDefault("RandomNumberType", Integer::ANY);

	member_ptr<KDF2_RNG> kdf2Rng;
	ConstByteArrayParameter seed;
	if (params.GetValue(Name::Seed(), seed))
	{
		ByteQueue bq;
		DERSequenceEncoder seq(bq);
		min.DEREncode(seq);
		max.DEREncode(seq);
		equiv.DEREncode(seq);
		mod.DEREncode(seq);
		DEREncodeUnsigned(seq, rnType);
		DEREncodeOctetString(seq, seed.begin(), seed.size());
		seq.MessageEnd();

		SecByteBlock finalSeed((size_t)bq.MaxRetrievable());
		bq.Get(finalSeed, finalSeed.size());
		kdf2Rng.reset(new KDF2_RNG(finalSeed.begin(), finalSeed.size()));
	}
	RandomNumberGenerator &rng = kdf2Rng.get() ? (RandomNumberGenerator &)*kdf2Rng : i_rng;

	switch (rnType)
	{
		case ANY:
			if (mod == One())
				Randomize(rng, min, max);
			else
			{
				Integer min1 = min + (equiv-min)%mod;
				if (max < min1)
					return false;
				Randomize(rng, Zero(), (max - min1) / mod);
				*this *= mod;
				*this += min1;
			}
			return true;

		case PRIME:
		{
			const PrimeSelector *pSelector = params.GetValueWithDefault(Name::PointerToPrimeSelector(), (const PrimeSelector *)NULL);

			int i;
			i = 0;
			while (1)
			{
				if (++i==16)
				{
					// check if there are any suitable primes in [min, max]
					Integer first = min;
					if (FirstPrime(first, max, equiv, mod, pSelector))
					{
						// if there is only one suitable prime, we're done
						*this = first;
						if (!FirstPrime(first, max, equiv, mod, pSelector))
							return true;
					}
					else
						return false;
				}

				Randomize(rng, min, max);
				if (FirstPrime(*this, STDMIN(*this+mod*PrimeSearchInterval(max), max), equiv, mod, pSelector))
					return true;
			}
		}

		default:
			throw InvalidArgument("Integer: invalid RandomNumberType argument");
	}
}

std::istream& operator>>(std::istream& in, Integer &a)
{
	char c;
	unsigned int length = 0;
	SecBlock<char> str(length + 16);

	std::ws(in);

	do
	{
		in.read(&c, 1);
		str[length++] = c;
		if (length >= str.size())
			str.Grow(length + 16);
	}
	while (in && (c=='-' || c=='x' || (c>='0' && c<='9') || (c>='a' && c<='f') || (c>='A' && c<='F') || c=='h' || c=='H' || c=='o' || c=='O' || c==',' || c=='.'));

	if (in.gcount())
		in.putback(c);
	str[length-1] = '\0';
	a = Integer(str);

	return in;
}

std::ostream& operator<<(std::ostream& out, const Integer &a)
{
	// Get relevant conversion specifications from ostream.
	long f = out.flags() & std::ios::basefield; // Get base digits.
	int base, block;
	char suffix;
	switch(f)
	{
	case std::ios::oct :
		base = 8;
		block = 8;
		suffix = 'o';
		break;
	case std::ios::hex :
		base = 16;
		block = 4;
		suffix = 'h';
		break;
	default :
		base = 10;
		block = 3;
		suffix = '.';
	}

	Integer temp1=a, temp2;

	if (a.IsNegative())
	{
		out << '-';
		temp1.Negate();
	}

	if (!a)
		out << '0';

	static const char upper[]="0123456789ABCDEF";
	static const char lower[]="0123456789abcdef";

	const char* vec = (out.flags() & std::ios::uppercase) ? upper : lower;
	unsigned i=0;
	SecBlock<char> s(a.BitCount() / (BitPrecision(base)-1) + 1);

	while (!!temp1)
	{
		word digit;
		Integer::Divide(digit, temp2, temp1, base);
		s[i++]=vec[digit];
		temp1.swap(temp2);
	}

	while (i--)
	{
		out << s[i];
//		if (i && !(i%block))
//			out << ",";
	}
	return out << suffix;
}

Integer& Integer::operator++()
{
	if (NotNegative())
	{
		if (Increment(reg, reg.size()))
		{
			reg.CleanGrow(2*reg.size());
			reg[reg.size()/2]=1;
		}
	}
	else
	{
		word borrow = Decrement(reg, reg.size());
		assert(!borrow);
		if (WordCount()==0)
			*this = Zero();
	}
	return *this;
}

Integer& Integer::operator--()
{
	if (IsNegative())
	{
		if (Increment(reg, reg.size()))
		{
			reg.CleanGrow(2*reg.size());
			reg[reg.size()/2]=1;
		}
	}
	else
	{
		if (Decrement(reg, reg.size()))
			*this = -One();
	}
	return *this;
}

void PositiveAdd(Integer &sum, const Integer &a, const Integer& b)
{
	int carry;
	if (a.reg.size() == b.reg.size())
		carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
	else if (a.reg.size() > b.reg.size())
	{
		carry = Add(sum.reg, a.reg, b.reg, b.reg.size());
		CopyWords(sum.reg+b.reg.size(), a.reg+b.reg.size(), a.reg.size()-b.reg.size());
		carry = Increment(sum.reg+b.reg.size(), a.reg.size()-b.reg.size(), carry);
	}
	else
	{
		carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
		CopyWords(sum.reg+a.reg.size(), b.reg+a.reg.size(), b.reg.size()-a.reg.size());
		carry = Increment(sum.reg+a.reg.size(), b.reg.size()-a.reg.size(), carry);
	}

	if (carry)
	{
		sum.reg.CleanGrow(2*sum.reg.size());
		sum.reg[sum.reg.size()/2] = 1;
	}
	sum.sign = Integer::POSITIVE;
}

void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
{
	unsigned aSize = a.WordCount();
	aSize += aSize%2;
	unsigned bSize = b.WordCount();
	bSize += bSize%2;

	if (aSize == bSize)
	{
		if (Compare(a.reg, b.reg, aSize) >= 0)
		{
			Subtract(diff.reg, a.reg, b.reg, aSize);
			diff.sign = Integer::POSITIVE;
		}
		else
		{
			Subtract(diff.reg, b.reg, a.reg, aSize);
			diff.sign = Integer::NEGATIVE;
		}
	}
	else if (aSize > bSize)
	{
		word borrow = Subtract(diff.reg, a.reg, b.reg, bSize);
		CopyWords(diff.reg+bSize, a.reg+bSize, aSize-bSize);
		borrow = Decrement(diff.reg+bSize, aSize-bSize, borrow);
		assert(!borrow);
		diff.sign = Integer::POSITIVE;
	}
	else
	{
		word borrow = Subtract(diff.reg, b.reg, a.reg, aSize);
		CopyWords(diff.reg+aSize, b.reg+aSize, bSize-aSize);
		borrow = Decrement(diff.reg+aSize, bSize-aSize, borrow);
		assert(!borrow);
		diff.sign = Integer::NEGATIVE;
	}
}

// MSVC .NET 2003 workaround
template <class T> inline const T& STDMAX2(const T& a, const T& b)
{
	return a < b ? b : a;
}

Integer Integer::Plus(const Integer& b) const
{
	Integer sum((word)0, STDMAX2(reg.size(), b.reg.size()));
	if (NotNegative())
	{
		if (b.NotNegative())
			PositiveAdd(sum, *this, b);
		else
			PositiveSubtract(sum, *this, b);
	}
	else
	{
		if (b.NotNegative())
			PositiveSubtract(sum, b, *this);
		else
		{
			PositiveAdd(sum, *this, b);
			sum.sign = Integer::NEGATIVE;
		}
	}
	return sum;
}

Integer& Integer::operator+=(const Integer& t)
{
	reg.CleanGrow(t.reg.size());
	if (NotNegative())
	{
		if (t.NotNegative())
			PositiveAdd(*this, *this, t);
		else
			PositiveSubtract(*this, *this, t);
	}
	else
	{
		if (t.NotNegative())
			PositiveSubtract(*this, t, *this);
		else
		{
			PositiveAdd(*this, *this, t);
			sign = Integer::NEGATIVE;
		}
	}
	return *this;
}

Integer Integer::Minus(const Integer& b) const
{
	Integer diff((word)0, STDMAX2(reg.size(), b.reg.size()));
	if (NotNegative())
	{
		if (b.NotNegative())
			PositiveSubtract(diff, *this, b);
		else
			PositiveAdd(diff, *this, b);
	}
	else
	{
		if (b.NotNegative())
		{
			PositiveAdd(diff, *this, b);
			diff.sign = Integer::NEGATIVE;
		}
		else
			PositiveSubtract(diff, b, *this);
	}
	return diff;
}

Integer& Integer::operator-=(const Integer& t)
{
	reg.CleanGrow(t.reg.size());
	if (NotNegative())
	{
		if (t.NotNegative())
			PositiveSubtract(*this, *this, t);
		else
			PositiveAdd(*this, *this, t);
	}
	else
	{
		if (t.NotNegative())
		{
			PositiveAdd(*this, *this, t);
			sign = Integer::NEGATIVE;
		}
		else
			PositiveSubtract(*this, t, *this);
	}
	return *this;
}

Integer& Integer::operator<<=(size_t n)
{
	const size_t wordCount = WordCount();
	const size_t shiftWords = n / WORD_BITS;
	const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);

	reg.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
	ShiftWordsLeftByWords(reg, wordCount + shiftWords, shiftWords);
	ShiftWordsLeftByBits(reg+shiftWords, wordCount+BitsToWords(shiftBits), shiftBits);
	return *this;
}

Integer& Integer::operator>>=(size_t n)
{
	const size_t wordCount = WordCount();
	const size_t shiftWords = n / WORD_BITS;
	const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);

	ShiftWordsRightByWords(reg, wordCount, shiftWords);
	if (wordCount > shiftWords)
		ShiftWordsRightByBits(reg, wordCount-shiftWords, shiftBits);
	if (IsNegative() && WordCount()==0)   // avoid -0
		*this = Zero();
	return *this;
}

void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)
{
	size_t aSize = RoundupSize(a.WordCount());
	size_t bSize = RoundupSize(b.WordCount());

	product.reg.CleanNew(RoundupSize(aSize+bSize));
	product.sign = Integer::POSITIVE;

	IntegerSecBlock workspace(aSize + bSize);
	AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize);
}

void Multiply(Integer &product, const Integer &a, const Integer &b)
{
	PositiveMultiply(product, a, b);

	if (a.NotNegative() != b.NotNegative())
		product.Negate();
}

Integer Integer::Times(const Integer &b) const
{
	Integer product;
	Multiply(product, *this, b);
	return product;
}

/*
void PositiveDivide(Integer &remainder, Integer &quotient,
				   const Integer &dividend, const Integer &divisor)
{
	remainder.reg.CleanNew(divisor.reg.size());
	remainder.sign = Integer::POSITIVE;
	quotient.reg.New(0);
	quotient.sign = Integer::POSITIVE;
	unsigned i=dividend.BitCount();
	while (i--)
	{
		word overflow = ShiftWordsLeftByBits(remainder.reg, remainder.reg.size(), 1);
		remainder.reg[0] |= dividend[i];
		if (overflow || remainder >= divisor)
		{
			Subtract(remainder.reg, remainder.reg, divisor.reg, remainder.reg.size());
			quotient.SetBit(i);
		}
	}
}
*/

void PositiveDivide(Integer &remainder, Integer &quotient,
				   const Integer &a, const Integer &b)
{
	unsigned aSize = a.WordCount();
	unsigned bSize = b.WordCount();

	if (!bSize)
		throw Integer::DivideByZero();

	if (aSize < bSize)
	{
		remainder = a;
		remainder.sign = Integer::POSITIVE;
		quotient = Integer::Zero();
		return;
	}

	aSize += aSize%2;	// round up to next even number
	bSize += bSize%2;

	remainder.reg.CleanNew(RoundupSize(bSize));
	remainder.sign = Integer::POSITIVE;
	quotient.reg.CleanNew(RoundupSize(aSize-bSize+2));
	quotient.sign = Integer::POSITIVE;

	IntegerSecBlock T(aSize+3*(bSize+2));
	Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize);
}

void Integer::Divide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor)
{
	PositiveDivide(remainder, quotient, dividend, divisor);

	if (dividend.IsNegative())
	{
		quotient.Negate();
		if (remainder.NotZero())
		{
			--quotient;
			remainder = divisor.AbsoluteValue() - remainder;
		}
	}

	if (divisor.IsNegative())
		quotient.Negate();
}

void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a, unsigned int n)
{
	q = a;
	q >>= n;

	const size_t wordCount = BitsToWords(n);
	if (wordCount <= a.WordCount())
	{
		r.reg.resize(RoundupSize(wordCount));
		CopyWords(r.reg, a.reg, wordCount);
		SetWords(r.reg+wordCount, 0, r.reg.size()-wordCount);
		if (n % WORD_BITS != 0)
			r.reg[wordCount-1] %= (word(1) << (n % WORD_BITS));
	}
	else
	{
		r.reg.resize(RoundupSize(a.WordCount()));
		CopyWords(r.reg, a.reg, r.reg.size());
	}
	r.sign = POSITIVE;

	if (a.IsNegative() && r.NotZero())
	{
		--q;
		r = Power2(n) - r;
	}
}

Integer Integer::DividedBy(const Integer &b) const
{
	Integer remainder, quotient;
	Integer::Divide(remainder, quotient, *this, b);
	return quotient;
}

Integer Integer::Modulo(const Integer &b) const
{
	Integer remainder, quotient;
	Integer::Divide(remainder, quotient, *this, b);
	return remainder;
}

void Integer::Divide(word &remainder, Integer &quotient, const Integer &dividend, word divisor)
{
	if (!divisor)
		throw Integer::DivideByZero();

	assert(divisor);

	if ((divisor & (divisor-1)) == 0)	// divisor is a power of 2
	{
		quotient = dividend >> (BitPrecision(divisor)-1);
		remainder = dividend.reg[0] & (divisor-1);
		return;
	}

	unsigned int i = dividend.WordCount();
	quotient.reg.CleanNew(RoundupSize(i));
	remainder = 0;
	while (i--)
	{
		quotient.reg[i] = DWord(dividend.reg[i], remainder) / divisor;
		remainder = DWord(dividend.reg[i], remainder) % divisor;
	}

	if (dividend.NotNegative())
		quotient.sign = POSITIVE;
	else
	{
		quotient.sign = NEGATIVE;
		if (remainder)
		{
			--quotient;
			remainder = divisor - remainder;
		}
	}
}

Integer Integer::DividedBy(word b) const
{
	word remainder;
	Integer quotient;
	Integer::Divide(remainder, quotient, *this, b);
	return quotient;
}

word Integer::Modulo(word divisor) const
{
	if (!divisor)
		throw Integer::DivideByZero();

	assert(divisor);

	word remainder;

	if ((divisor & (divisor-1)) == 0)	// divisor is a power of 2
		remainder = reg[0] & (divisor-1);
	else
	{
		unsigned int i = WordCount();

		if (divisor <= 5)
		{
			DWord sum(0, 0);
			while (i--)
				sum += reg[i];
			remainder = sum % divisor;
		}
		else
		{
			remainder = 0;
			while (i--)
				remainder = DWord(reg[i], remainder) % divisor;
		}
	}

	if (IsNegative() && remainder)
		remainder = divisor - remainder;

	return remainder;
}

void Integer::Negate()
{
	if (!!(*this))	// don't flip sign if *this==0
		sign = Sign(1-sign);
}

int Integer::PositiveCompare(const Integer& t) const
{
	unsigned size = WordCount(), tSize = t.WordCount();

	if (size == tSize)
		return CryptoPP::Compare(reg, t.reg, size);
	else
		return size > tSize ? 1 : -1;
}

int Integer::Compare(const Integer& t) const
{
	if (NotNegative())
	{
		if (t.NotNegative())
			return PositiveCompare(t);
		else
			return 1;
	}
	else
	{
		if (t.NotNegative())
			return -1;
		else
			return -PositiveCompare(t);
	}
}

Integer Integer::SquareRoot() const
{
	if (!IsPositive())
		return Zero();

	// overestimate square root
	Integer x, y = Power2((BitCount()+1)/2);
	assert(y*y >= *this);

	do
	{
		x = y;
		y = (x + *this/x) >> 1;
	} while (y<x);

	return x;
}

bool Integer::IsSquare() const
{
	Integer r = SquareRoot();
	return *this == r.Squared();
}

bool Integer::IsUnit() const
{
	return (WordCount() == 1) && (reg[0] == 1);
}

Integer Integer::MultiplicativeInverse() const
{
	return IsUnit() ? *this : Zero();
}

Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
{
	return x*y%m;
}

Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
{
	ModularArithmetic mr(m);
	return mr.Exponentiate(x, e);
}

Integer Integer::Gcd(const Integer &a, const Integer &b)
{
	return EuclideanDomainOf<Integer>().Gcd(a, b);
}

Integer Integer::InverseMod(const Integer &m) const
{
	assert(m.NotNegative());

	if (IsNegative())
		return Modulo(m).InverseMod(m);

	if (m.IsEven())
	{
		if (!m || IsEven())
			return Zero();	// no inverse
		if (*this == One())
			return One();

		Integer u = m.Modulo(*this).InverseMod(*this);
		return !u ? Zero() : (m*(*this-u)+1)/(*this);
	}

	SecBlock<word> T(m.reg.size() * 4);
	Integer r((word)0, m.reg.size());
	unsigned k = AlmostInverse(r.reg, T, reg, reg.size(), m.reg, m.reg.size());
	DivideByPower2Mod(r.reg, r.reg, k, m.reg, m.reg.size());
	return r;
}

word Integer::InverseMod(word mod) const
{
	word g0 = mod, g1 = *this % mod;
	word v0 = 0, v1 = 1;
	word y;

	while (g1)
	{
		if (g1 == 1)
			return v1;
		y = g0 / g1;
		g0 = g0 % g1;
		v0 += y * v1;

		if (!g0)
			break;
		if (g0 == 1)
			return mod-v0;
		y = g1 / g0;
		g1 = g1 % g0;
		v1 += y * v0;
	}
	return 0;
}

// ********************************************************

ModularArithmetic::ModularArithmetic(BufferedTransformation &bt)
{
	BERSequenceDecoder seq(bt);
	OID oid(seq);
	if (oid != ASN1::prime_field())
		BERDecodeError();
	m_modulus.BERDecode(seq);
	seq.MessageEnd();
	m_result.reg.resize(m_modulus.reg.size());
}

void ModularArithmetic::DEREncode(BufferedTransformation &bt) const
{
	DERSequenceEncoder seq(bt);
	ASN1::prime_field().DEREncode(seq);
	m_modulus.DEREncode(seq);
	seq.MessageEnd();
}

void ModularArithmetic::DEREncodeElement(BufferedTransformation &out, const Element &a) const
{
	a.DEREncodeAsOctetString(out, MaxElementByteLength());
}

void ModularArithmetic::BERDecodeElement(BufferedTransformation &in, Element &a) const
{
	a.BERDecodeAsOctetString(in, MaxElementByteLength());
}

const Integer& ModularArithmetic::Half(const Integer &a) const
{
	if (a.reg.size()==m_modulus.reg.size())
	{
		CryptoPP::DivideByPower2Mod(m_result.reg.begin(), a.reg, 1, m_modulus.reg, a.reg.size());
		return m_result;
	}
	else
		return m_result1 = (a.IsEven() ? (a >> 1) : ((a+m_modulus) >> 1));
}

const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
{
	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
	{
		if (CryptoPP::Add(m_result.reg.begin(), a.reg, b.reg, a.reg.size())
			|| Compare(m_result.reg, m_modulus.reg, a.reg.size()) >= 0)
		{
			CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
		}
		return m_result;
	}
	else
	{
		m_result1 = a+b;
		if (m_result1 >= m_modulus)
			m_result1 -= m_modulus;
		return m_result1;
	}
}

Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
{
	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
	{
		if (CryptoPP::Add(a.reg, a.reg, b.reg, a.reg.size())
			|| Compare(a.reg, m_modulus.reg, a.reg.size()) >= 0)
		{
			CryptoPP::Subtract(a.reg, a.reg, m_modulus.reg, a.reg.size());
		}
	}
	else
	{
		a+=b;
		if (a>=m_modulus)
			a-=m_modulus;
	}

	return a;
}

const Integer& ModularArithmetic::Subtract(const Integer &a, const Integer &b) const
{
	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
	{
		if (CryptoPP::Subtract(m_result.reg.begin(), a.reg, b.reg, a.reg.size()))
			CryptoPP::Add(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
		return m_result;
	}
	else
	{
		m_result1 = a-b;
		if (m_result1.IsNegative())
			m_result1 += m_modulus;
		return m_result1;
	}
}

Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
{
	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
	{
		if (CryptoPP::Subtract(a.reg, a.reg, b.reg, a.reg.size()))
			CryptoPP::Add(a.reg, a.reg, m_modulus.reg, a.reg.size());
	}
	else
	{
		a-=b;
		if (a.IsNegative())
			a+=m_modulus;
	}

	return a;
}

const Integer& ModularArithmetic::Inverse(const Integer &a) const
{
	if (!a)
		return a;

	CopyWords(m_result.reg.begin(), m_modulus.reg, m_modulus.reg.size());
	if (CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, a.reg, a.reg.size()))
		Decrement(m_result.reg.begin()+a.reg.size(), m_modulus.reg.size()-a.reg.size());

	return m_result;
}

Integer ModularArithmetic::CascadeExponentiate(const Integer &x, const Integer &e1, const Integer &y, const Integer &e2) const
{
	if (m_modulus.IsOdd())
	{
		MontgomeryRepresentation dr(m_modulus);
		return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1, dr.ConvertIn(y), e2));
	}
	else
		return AbstractRing<Integer>::CascadeExponentiate(x, e1, y, e2);
}

void ModularArithmetic::SimultaneousExponentiate(Integer *results, const Integer &base, const Integer *exponents, unsigned int exponentsCount) const
{
	if (m_modulus.IsOdd())
	{
		MontgomeryRepresentation dr(m_modulus);
		dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents, exponentsCount);
		for (unsigned int i=0; i<exponentsCount; i++)
			results[i] = dr.ConvertOut(results[i]);
	}
	else
		AbstractRing<Integer>::SimultaneousExponentiate(results, base, exponents, exponentsCount);
}

MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m)	// modulus must be odd
	: ModularArithmetic(m),
	  m_u((word)0, m_modulus.reg.size()),
	  m_workspace(5*m_modulus.reg.size())
{
	if (!m_modulus.IsOdd())
		throw InvalidArgument("MontgomeryRepresentation: Montgomery representation requires an odd modulus");

	RecursiveInverseModPower2(m_u.reg, m_workspace, m_modulus.reg, m_modulus.reg.size());
}

const Integer& MontgomeryRepresentation::Multiply(const Integer &a, const Integer &b) const
{
	word *const T = m_workspace.begin();
	word *const R = m_result.reg.begin();
	const size_t N = m_modulus.reg.size();
	assert(a.reg.size()<=N && b.reg.size()<=N);

	AsymmetricMultiply(T, T+2*N, a.reg, a.reg.size(), b.reg, b.reg.size());
	SetWords(T+a.reg.size()+b.reg.size(), 0, 2*N-a.reg.size()-b.reg.size());
	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
	return m_result;
}

const Integer& MontgomeryRepresentation::Square(const Integer &a) const
{
	word *const T = m_workspace.begin();
	word *const R = m_result.reg.begin();
	const size_t N = m_modulus.reg.size();
	assert(a.reg.size()<=N);

	CryptoPP::Square(T, T+2*N, a.reg, a.reg.size());
	SetWords(T+2*a.reg.size(), 0, 2*N-2*a.reg.size());
	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
	return m_result;
}

Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
{
	word *const T = m_workspace.begin();
	word *const R = m_result.reg.begin();
	const size_t N = m_modulus.reg.size();
	assert(a.reg.size()<=N);

	CopyWords(T, a.reg, a.reg.size());
	SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
	return m_result;
}

const Integer& MontgomeryRepresentation::MultiplicativeInverse(const Integer &a) const
{
//	  return (EuclideanMultiplicativeInverse(a, modulus)<<(2*WORD_BITS*modulus.reg.size()))%modulus;
	word *const T = m_workspace.begin();
	word *const R = m_result.reg.begin();
	const size_t N = m_modulus.reg.size();
	assert(a.reg.size()<=N);

	CopyWords(T, a.reg, a.reg.size());
	SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
	unsigned k = AlmostInverse(R, T, R, N, m_modulus.reg, N);

//	cout << "k=" << k << " N*32=" << 32*N << endl;

	if (k>N*WORD_BITS)
		DivideByPower2Mod(R, R, k-N*WORD_BITS, m_modulus.reg, N);
	else
		MultiplyByPower2Mod(R, R, N*WORD_BITS-k, m_modulus.reg, N);

	return m_result;
}

NAMESPACE_END

#endif