1// integer.cpp - written and placed in the public domain by Wei Dai
2// contains public domain code contributed by Alister Lee and Leonard Janke
3
4#include "pch.h"
5
6#ifndef CRYPTOPP_IMPORTS
7
8#include "integer.h"
9#include "modarith.h"
10#include "nbtheory.h"
11#include "asn.h"
12#include "oids.h"
13#include "words.h"
14#include "algparam.h"
15#include "pubkey.h"		// for P1363_KDF2
16#include "sha.h"
17#include "cpu.h"
18
19#include <iostream>
20
21#if _MSC_VER >= 1400
22	#include <intrin.h>
23#endif
24
25#ifdef __DECCXX
26	#include <c_asm.h>
27#endif
28
29#ifdef CRYPTOPP_MSVC6_NO_PP
30	#pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.")
31#endif
32
33#define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86)
34
35NAMESPACE_BEGIN(CryptoPP)
36
37bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)
38{
39	if (valueType != typeid(Integer))
40		return false;
41	*reinterpret_cast<Integer *>(pInteger) = *reinterpret_cast<const int *>(pInt);
42	return true;
43}
44
45inline static int Compare(const word *A, const word *B, size_t N)
46{
47	while (N--)
48		if (A[N] > B[N])
49			return 1;
50		else if (A[N] < B[N])
51			return -1;
52
53	return 0;
54}
55
56inline static int Increment(word *A, size_t N, word B=1)
57{
58	assert(N);
59	word t = A[0];
60	A[0] = t+B;
61	if (A[0] >= t)
62		return 0;
63	for (unsigned i=1; i<N; i++)
64		if (++A[i])
65			return 0;
66	return 1;
67}
68
69inline static int Decrement(word *A, size_t N, word B=1)
70{
71	assert(N);
72	word t = A[0];
73	A[0] = t-B;
74	if (A[0] <= t)
75		return 0;
76	for (unsigned i=1; i<N; i++)
77		if (A[i]--)
78			return 0;
79	return 1;
80}
81
82static void TwosComplement(word *A, size_t N)
83{
84	Decrement(A, N);
85	for (unsigned i=0; i<N; i++)
86		A[i] = ~A[i];
87}
88
89static word AtomicInverseModPower2(word A)
90{
91	assert(A%2==1);
92
93	word R=A%8;
94
95	for (unsigned i=3; i<WORD_BITS; i*=2)
96		R = R*(2-R*A);
97
98	assert(R*A==1);
99	return R;
100}
101
102// ********************************************************
103
104#if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || (defined(__x86_64__) && defined(CRYPTOPP_WORD128_AVAILABLE))
105	#define Declare2Words(x)			word x##0, x##1;
106	#define AssignWord(a, b)			a##0 = b; a##1 = 0;
107	#define Add2WordsBy1(a, b, c)		a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
108	#define LowWord(a)					a##0
109	#define HighWord(a)					a##1
110	#ifdef _MSC_VER
111		#define MultiplyWordsLoHi(p0, p1, a, b)		p0 = _umul128(a, b, &p1);
112		#ifndef __INTEL_COMPILER
113			#define Double3Words(c, d)		d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2;
114		#endif
115	#elif defined(__DECCXX)
116		#define MultiplyWordsLoHi(p0, p1, a, b)		p0 = a*b; p1 = asm("umulh %a0, %a1, %v0", a, b);
117	#elif defined(__x86_64__)
118		#ifdef __SUNPRO_CC
119			// Sun Studio's gcc-style inline assembly is heavily bugged as of version 5.9 Patch 124864-09 2008/12/16, but this one works
120			#define MultiplyWordsLoHi(p0, p1, a, b)		asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "r"(b) : "cc");
121		#else
122			#define MultiplyWordsLoHi(p0, p1, a, b)		asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
123			#define MulAcc(c, d, a, b)		asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
124			#define Double3Words(c, d)		asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
125			#define Acc2WordsBy1(a, b)		asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc");
126			#define Acc2WordsBy2(a, b)		asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
127			#define Acc3WordsBy2(c, d, e)	asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc");
128		#endif
129	#endif
130	#define MultiplyWords(p, a, b)		MultiplyWordsLoHi(p##0, p##1, a, b)
131	#ifndef Double3Words
132		#define Double3Words(c, d)		d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2;
133	#endif
134	#ifndef Acc2WordsBy2
135		#define Acc2WordsBy2(a, b)		a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
136	#endif
137	#define AddWithCarry(u, a, b)		{word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
138	#define SubtractWithBorrow(u, a, b)	{word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
139	#define GetCarry(u)					u##1
140	#define GetBorrow(u)				u##1
141#else
142	#define Declare2Words(x)			dword x;
143	#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
144		#define MultiplyWords(p, a, b)		p = __emulu(a, b);
145	#else
146		#define MultiplyWords(p, a, b)		p = (dword)a*b;
147	#endif
148	#define AssignWord(a, b)			a = b;
149	#define Add2WordsBy1(a, b, c)		a = b + c;
150	#define Acc2WordsBy2(a, b)			a += b;
151	#define LowWord(a)					word(a)
152	#define HighWord(a)					word(a>>WORD_BITS)
153	#define Double3Words(c, d)			d = 2*d + (c>>(WORD_BITS-1)); c *= 2;
154	#define AddWithCarry(u, a, b)		u = dword(a) + b + GetCarry(u);
155	#define SubtractWithBorrow(u, a, b)	u = dword(a) - b - GetBorrow(u);
156	#define GetCarry(u)					HighWord(u)
157	#define GetBorrow(u)				word(u>>(WORD_BITS*2-1))
158#endif
159#ifndef MulAcc
160	#define MulAcc(c, d, a, b)			MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p));
161#endif
162#ifndef Acc2WordsBy1
163	#define Acc2WordsBy1(a, b)			Add2WordsBy1(a, a, b)
164#endif
165#ifndef Acc3WordsBy2
166	#define Acc3WordsBy2(c, d, e)		Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e));
167#endif
168
169class DWord
170{
171public:
172	DWord() {}
173
174#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
175	explicit DWord(word low)
176	{
177		m_whole = low;
178	}
179#else
180	explicit DWord(word low)
181	{
182		m_halfs.low = low;
183		m_halfs.high = 0;
184	}
185#endif
186
187	DWord(word low, word high)
188	{
189		m_halfs.low = low;
190		m_halfs.high = high;
191	}
192
193	static DWord Multiply(word a, word b)
194	{
195		DWord r;
196		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
197			r.m_whole = (dword)a * b;
198		#elif defined(MultiplyWordsLoHi)
199			MultiplyWordsLoHi(r.m_halfs.low, r.m_halfs.high, a, b);
200		#endif
201		return r;
202	}
203
204	static DWord MultiplyAndAdd(word a, word b, word c)
205	{
206		DWord r = Multiply(a, b);
207		return r += c;
208	}
209
210	DWord & operator+=(word a)
211	{
212		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
213			m_whole = m_whole + a;
214		#else
215			m_halfs.low += a;
216			m_halfs.high += (m_halfs.low < a);
217		#endif
218		return *this;
219	}
220
221	DWord operator+(word a)
222	{
223		DWord r;
224		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
225			r.m_whole = m_whole + a;
226		#else
227			r.m_halfs.low = m_halfs.low + a;
228			r.m_halfs.high = m_halfs.high + (r.m_halfs.low < a);
229		#endif
230		return r;
231	}
232
233	DWord operator-(DWord a)
234	{
235		DWord r;
236		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
237			r.m_whole = m_whole - a.m_whole;
238		#else
239			r.m_halfs.low = m_halfs.low - a.m_halfs.low;
240			r.m_halfs.high = m_halfs.high - a.m_halfs.high - (r.m_halfs.low > m_halfs.low);
241		#endif
242		return r;
243	}
244
245	DWord operator-(word a)
246	{
247		DWord r;
248		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
249			r.m_whole = m_whole - a;
250		#else
251			r.m_halfs.low = m_halfs.low - a;
252			r.m_halfs.high = m_halfs.high - (r.m_halfs.low > m_halfs.low);
253		#endif
254		return r;
255	}
256
257	// returns quotient, which must fit in a word
258	word operator/(word divisor);
259
260	word operator%(word a);
261
262	bool operator!() const
263	{
264	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
265		return !m_whole;
266	#else
267		return !m_halfs.high && !m_halfs.low;
268	#endif
269	}
270
271	word GetLowHalf() const {return m_halfs.low;}
272	word GetHighHalf() const {return m_halfs.high;}
273	word GetHighHalfAsBorrow() const {return 0-m_halfs.high;}
274
275private:
276	union
277	{
278	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
279		dword m_whole;
280	#endif
281		struct
282		{
283		#ifdef IS_LITTLE_ENDIAN
284			word low;
285			word high;
286		#else
287			word high;
288			word low;
289		#endif
290		} m_halfs;
291	};
292};
293
294class Word
295{
296public:
297	Word() {}
298
299	Word(word value)
300	{
301		m_whole = value;
302	}
303
304	Word(hword low, hword high)
305	{
306		m_whole = low | (word(high) << (WORD_BITS/2));
307	}
308
309	static Word Multiply(hword a, hword b)
310	{
311		Word r;
312		r.m_whole = (word)a * b;
313		return r;
314	}
315
316	Word operator-(Word a)
317	{
318		Word r;
319		r.m_whole = m_whole - a.m_whole;
320		return r;
321	}
322
323	Word operator-(hword a)
324	{
325		Word r;
326		r.m_whole = m_whole - a;
327		return r;
328	}
329
330	// returns quotient, which must fit in a word
331	hword operator/(hword divisor)
332	{
333		return hword(m_whole / divisor);
334	}
335
336	bool operator!() const
337	{
338		return !m_whole;
339	}
340
341	word GetWhole() const {return m_whole;}
342	hword GetLowHalf() const {return hword(m_whole);}
343	hword GetHighHalf() const {return hword(m_whole>>(WORD_BITS/2));}
344	hword GetHighHalfAsBorrow() const {return 0-hword(m_whole>>(WORD_BITS/2));}
345
346private:
347	word m_whole;
348};
349
350// do a 3 word by 2 word divide, returns quotient and leaves remainder in A
351template <class S, class D>
352S DivideThreeWordsByTwo(S *A, S B0, S B1, D *dummy=NULL)
353{
354	// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a S
355	assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
356
357	// estimate the quotient: do a 2 S by 1 S divide
358	S Q;
359	if (S(B1+1) == 0)
360		Q = A[2];
361	else
362		Q = D(A[1], A[2]) / S(B1+1);
363
364	// now subtract Q*B from A
365	D p = D::Multiply(B0, Q);
366	D u = (D) A[0] - p.GetLowHalf();
367	A[0] = u.GetLowHalf();
368	u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - D::Multiply(B1, Q);
369	A[1] = u.GetLowHalf();
370	A[2] += u.GetHighHalf();
371
372	// Q <= actual quotient, so fix it
373	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
374	{
375		u = (D) A[0] - B0;
376		A[0] = u.GetLowHalf();
377		u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
378		A[1] = u.GetLowHalf();
379		A[2] += u.GetHighHalf();
380		Q++;
381		assert(Q);	// shouldn't overflow
382	}
383
384	return Q;
385}
386
387// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
388template <class S, class D>
389inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
390{
391	if (!B) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
392		return D(Ah.GetLowHalf(), Ah.GetHighHalf());
393	else
394	{
395		S Q[2];
396		T[0] = Al.GetLowHalf();
397		T[1] = Al.GetHighHalf();
398		T[2] = Ah.GetLowHalf();
399		T[3] = Ah.GetHighHalf();
400		Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(), B.GetHighHalf());
401		Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
402		return D(Q[0], Q[1]);
403	}
404}
405
406// returns quotient, which must fit in a word
407inline word DWord::operator/(word a)
408{
409	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
410		return word(m_whole / a);
411	#else
412		hword r[4];
413		return DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a).GetWhole();
414	#endif
415}
416
417inline word DWord::operator%(word a)
418{
419	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
420		return word(m_whole % a);
421	#else
422		if (a < (word(1) << (WORD_BITS/2)))
423		{
424			hword h = hword(a);
425			word r = m_halfs.high % h;
426			r = ((m_halfs.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
427			return hword((hword(m_halfs.low) + (r << (WORD_BITS/2))) % h);
428		}
429		else
430		{
431			hword r[4];
432			DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a);
433			return Word(r[0], r[1]).GetWhole();
434		}
435	#endif
436}
437
438// ********************************************************
439
440// use some tricks to share assembly code between MSVC and GCC
441#if defined(__GNUC__)
442	#define AddPrologue \
443		int result;	\
444		__asm__ __volatile__ \
445		( \
446			".intel_syntax noprefix;"
447	#define AddEpilogue \
448			".att_syntax prefix;" \
449					: "=a" (result)\
450					: "d" (C), "a" (A), "D" (B), "c" (N) \
451					: "%esi", "memory", "cc" \
452		);\
453		return result;
454	#define MulPrologue \
455		__asm__ __volatile__ \
456		( \
457			".intel_syntax noprefix;" \
458			AS1(	push	ebx) \
459			AS2(	mov		ebx, edx)
460	#define MulEpilogue \
461			AS1(	pop		ebx) \
462			".att_syntax prefix;" \
463			: \
464			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \
465			: "%esi", "memory", "cc" \
466		);
467	#define SquPrologue		MulPrologue
468	#define SquEpilogue	\
469			AS1(	pop		ebx) \
470			".att_syntax prefix;" \
471			: \
472			: "d" (s_maskLow16), "c" (C), "a" (A) \
473			: "%esi", "%edi", "memory", "cc" \
474		);
475	#define TopPrologue		MulPrologue
476	#define TopEpilogue	\
477			AS1(	pop		ebx) \
478			".att_syntax prefix;" \
479			: \
480			: "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \
481			: "memory", "cc" \
482		);
483#else
484	#define AddPrologue \
485		__asm	push edi \
486		__asm	push esi \
487		__asm	mov		eax, [esp+12] \
488		__asm	mov		edi, [esp+16]
489	#define AddEpilogue \
490		__asm	pop esi \
491		__asm	pop edi \
492		__asm	ret 8
493#if _MSC_VER < 1300
494	#define SaveEBX		__asm push ebx
495	#define RestoreEBX	__asm pop ebx
496#else
497	#define SaveEBX
498	#define RestoreEBX
499#endif
500	#define SquPrologue					\
501		AS2(	mov		eax, A)			\
502		AS2(	mov		ecx, C)			\
503		SaveEBX							\
504		AS2(	lea		ebx, s_maskLow16)
505	#define MulPrologue					\
506		AS2(	mov		eax, A)			\
507		AS2(	mov		edi, B)			\
508		AS2(	mov		ecx, C)			\
509		SaveEBX							\
510		AS2(	lea		ebx, s_maskLow16)
511	#define TopPrologue					\
512		AS2(	mov		eax, A)			\
513		AS2(	mov		edi, B)			\
514		AS2(	mov		ecx, C)			\
515		AS2(	mov		esi, L)			\
516		SaveEBX							\
517		AS2(	lea		ebx, s_maskLow16)
518	#define SquEpilogue		RestoreEBX
519	#define MulEpilogue		RestoreEBX
520	#define TopEpilogue		RestoreEBX
521#endif
522
523#ifdef CRYPTOPP_X64_MASM_AVAILABLE
524extern "C" {
525int Baseline_Add(size_t N, word *C, const word *A, const word *B);
526int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
527}
528#elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__) && defined(CRYPTOPP_WORD128_AVAILABLE)
529int Baseline_Add(size_t N, word *C, const word *A, const word *B)
530{
531	word result;
532	__asm__ __volatile__
533	(
534	".intel_syntax;"
535	AS1(	neg		%1)
536	ASJ(	jz,		1, f)
537	AS2(	mov		%0,[%3+8*%1])
538	AS2(	add		%0,[%4+8*%1])
539	AS2(	mov		[%2+8*%1],%0)
540	ASL(0)
541	AS2(	mov		%0,[%3+8*%1+8])
542	AS2(	adc		%0,[%4+8*%1+8])
543	AS2(	mov		[%2+8*%1+8],%0)
544	AS2(	lea		%1,[%1+2])
545	ASJ(	jrcxz,	1, f)
546	AS2(	mov		%0,[%3+8*%1])
547	AS2(	adc		%0,[%4+8*%1])
548	AS2(	mov		[%2+8*%1],%0)
549	ASJ(	jmp,	0, b)
550	ASL(1)
551	AS2(	mov		%0, 0)
552	AS2(	adc		%0, %0)
553	".att_syntax;"
554	: "=&r" (result), "+c" (N)
555	: "r" (C+N), "r" (A+N), "r" (B+N)
556	: "memory", "cc"
557	);
558	return (int)result;
559}
560
561int Baseline_Sub(size_t N, word *C, const word *A, const word *B)
562{
563	word result;
564	__asm__ __volatile__
565	(
566	".intel_syntax;"
567	AS1(	neg		%1)
568	ASJ(	jz,		1, f)
569	AS2(	mov		%0,[%3+8*%1])
570	AS2(	sub		%0,[%4+8*%1])
571	AS2(	mov		[%2+8*%1],%0)
572	ASL(0)
573	AS2(	mov		%0,[%3+8*%1+8])
574	AS2(	sbb		%0,[%4+8*%1+8])
575	AS2(	mov		[%2+8*%1+8],%0)
576	AS2(	lea		%1,[%1+2])
577	ASJ(	jrcxz,	1, f)
578	AS2(	mov		%0,[%3+8*%1])
579	AS2(	sbb		%0,[%4+8*%1])
580	AS2(	mov		[%2+8*%1],%0)
581	ASJ(	jmp,	0, b)
582	ASL(1)
583	AS2(	mov		%0, 0)
584	AS2(	adc		%0, %0)
585	".att_syntax;"
586	: "=&r" (result), "+c" (N)
587	: "r" (C+N), "r" (A+N), "r" (B+N)
588	: "memory", "cc"
589	);
590	return (int)result;
591}
592#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
593CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
594{
595	AddPrologue
596
597	// now: eax = A, edi = B, edx = C, ecx = N
598	AS2(	lea		eax, [eax+4*ecx])
599	AS2(	lea		edi, [edi+4*ecx])
600	AS2(	lea		edx, [edx+4*ecx])
601
602	AS1(	neg		ecx)				// ecx is negative index
603	AS2(	test	ecx, 2)				// this clears carry flag
604	ASJ(	jz,		0, f)
605	AS2(	sub		ecx, 2)
606	ASJ(	jmp,	1, f)
607
608	ASL(0)
609	ASJ(	jecxz,	2, f)				// loop until ecx overflows and becomes zero
610	AS2(	mov		esi,[eax+4*ecx])
611	AS2(	adc		esi,[edi+4*ecx])
612	AS2(	mov		[edx+4*ecx],esi)
613	AS2(	mov		esi,[eax+4*ecx+4])
614	AS2(	adc		esi,[edi+4*ecx+4])
615	AS2(	mov		[edx+4*ecx+4],esi)
616	ASL(1)
617	AS2(	mov		esi,[eax+4*ecx+8])
618	AS2(	adc		esi,[edi+4*ecx+8])
619	AS2(	mov		[edx+4*ecx+8],esi)
620	AS2(	mov		esi,[eax+4*ecx+12])
621	AS2(	adc		esi,[edi+4*ecx+12])
622	AS2(	mov		[edx+4*ecx+12],esi)
623
624	AS2(	lea		ecx,[ecx+4])		// advance index, avoid inc which causes slowdown on Intel Core 2
625	ASJ(	jmp,	0, b)
626
627	ASL(2)
628	AS2(	mov		eax, 0)
629	AS1(	setc	al)					// store carry into eax (return result register)
630
631	AddEpilogue
632}
633
634CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
635{
636	AddPrologue
637
638	// now: eax = A, edi = B, edx = C, ecx = N
639	AS2(	lea		eax, [eax+4*ecx])
640	AS2(	lea		edi, [edi+4*ecx])
641	AS2(	lea		edx, [edx+4*ecx])
642
643	AS1(	neg		ecx)				// ecx is negative index
644	AS2(	test	ecx, 2)				// this clears carry flag
645	ASJ(	jz,		0, f)
646	AS2(	sub		ecx, 2)
647	ASJ(	jmp,	1, f)
648
649	ASL(0)
650	ASJ(	jecxz,	2, f)				// loop until ecx overflows and becomes zero
651	AS2(	mov		esi,[eax+4*ecx])
652	AS2(	sbb		esi,[edi+4*ecx])
653	AS2(	mov		[edx+4*ecx],esi)
654	AS2(	mov		esi,[eax+4*ecx+4])
655	AS2(	sbb		esi,[edi+4*ecx+4])
656	AS2(	mov		[edx+4*ecx+4],esi)
657	ASL(1)
658	AS2(	mov		esi,[eax+4*ecx+8])
659	AS2(	sbb		esi,[edi+4*ecx+8])
660	AS2(	mov		[edx+4*ecx+8],esi)
661	AS2(	mov		esi,[eax+4*ecx+12])
662	AS2(	sbb		esi,[edi+4*ecx+12])
663	AS2(	mov		[edx+4*ecx+12],esi)
664
665	AS2(	lea		ecx,[ecx+4])		// advance index, avoid inc which causes slowdown on Intel Core 2
666	ASJ(	jmp,	0, b)
667
668	ASL(2)
669	AS2(	mov		eax, 0)
670	AS1(	setc	al)					// store carry into eax (return result register)
671
672	AddEpilogue
673}
674
675#if CRYPTOPP_INTEGER_SSE2
676CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
677{
678	AddPrologue
679
680	// now: eax = A, edi = B, edx = C, ecx = N
681	AS2(	lea		eax, [eax+4*ecx])
682	AS2(	lea		edi, [edi+4*ecx])
683	AS2(	lea		edx, [edx+4*ecx])
684
685	AS1(	neg		ecx)				// ecx is negative index
686	AS2(	pxor    mm2, mm2)
687	ASJ(	jz,		2, f)
688	AS2(	test	ecx, 2)				// this clears carry flag
689	ASJ(	jz,		0, f)
690	AS2(	sub		ecx, 2)
691	ASJ(	jmp,	1, f)
692
693	ASL(0)
694	AS2(	movd     mm0, DWORD PTR [eax+4*ecx])
695	AS2(	movd     mm1, DWORD PTR [edi+4*ecx])
696	AS2(	paddq    mm0, mm1)
697	AS2(	paddq	 mm2, mm0)
698	AS2(	movd	 DWORD PTR [edx+4*ecx], mm2)
699	AS2(	psrlq    mm2, 32)
700
701	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+4])
702	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+4])
703	AS2(	paddq    mm0, mm1)
704	AS2(	paddq	 mm2, mm0)
705	AS2(	movd	 DWORD PTR [edx+4*ecx+4], mm2)
706	AS2(	psrlq    mm2, 32)
707
708	ASL(1)
709	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+8])
710	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+8])
711	AS2(	paddq    mm0, mm1)
712	AS2(	paddq	 mm2, mm0)
713	AS2(	movd	 DWORD PTR [edx+4*ecx+8], mm2)
714	AS2(	psrlq    mm2, 32)
715
716	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+12])
717	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+12])
718	AS2(	paddq    mm0, mm1)
719	AS2(	paddq	 mm2, mm0)
720	AS2(	movd	 DWORD PTR [edx+4*ecx+12], mm2)
721	AS2(	psrlq    mm2, 32)
722
723	AS2(	add		ecx, 4)
724	ASJ(	jnz,	0, b)
725
726	ASL(2)
727	AS2(	movd	eax, mm2)
728	AS1(	emms)
729
730	AddEpilogue
731}
732CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
733{
734	AddPrologue
735
736	// now: eax = A, edi = B, edx = C, ecx = N
737	AS2(	lea		eax, [eax+4*ecx])
738	AS2(	lea		edi, [edi+4*ecx])
739	AS2(	lea		edx, [edx+4*ecx])
740
741	AS1(	neg		ecx)				// ecx is negative index
742	AS2(	pxor    mm2, mm2)
743	ASJ(	jz,		2, f)
744	AS2(	test	ecx, 2)				// this clears carry flag
745	ASJ(	jz,		0, f)
746	AS2(	sub		ecx, 2)
747	ASJ(	jmp,	1, f)
748
749	ASL(0)
750	AS2(	movd     mm0, DWORD PTR [eax+4*ecx])
751	AS2(	movd     mm1, DWORD PTR [edi+4*ecx])
752	AS2(	psubq    mm0, mm1)
753	AS2(	psubq	 mm0, mm2)
754	AS2(	movd	 DWORD PTR [edx+4*ecx], mm0)
755	AS2(	psrlq    mm0, 63)
756
757	AS2(	movd     mm2, DWORD PTR [eax+4*ecx+4])
758	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+4])
759	AS2(	psubq    mm2, mm1)
760	AS2(	psubq	 mm2, mm0)
761	AS2(	movd	 DWORD PTR [edx+4*ecx+4], mm2)
762	AS2(	psrlq    mm2, 63)
763
764	ASL(1)
765	AS2(	movd     mm0, DWORD PTR [eax+4*ecx+8])
766	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+8])
767	AS2(	psubq    mm0, mm1)
768	AS2(	psubq	 mm0, mm2)
769	AS2(	movd	 DWORD PTR [edx+4*ecx+8], mm0)
770	AS2(	psrlq    mm0, 63)
771
772	AS2(	movd     mm2, DWORD PTR [eax+4*ecx+12])
773	AS2(	movd     mm1, DWORD PTR [edi+4*ecx+12])
774	AS2(	psubq    mm2, mm1)
775	AS2(	psubq	 mm2, mm0)
776	AS2(	movd	 DWORD PTR [edx+4*ecx+12], mm2)
777	AS2(	psrlq    mm2, 63)
778
779	AS2(	add		ecx, 4)
780	ASJ(	jnz,	0, b)
781
782	ASL(2)
783	AS2(	movd	eax, mm2)
784	AS1(	emms)
785
786	AddEpilogue
787}
788#endif	// #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
789#else
790int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
791{
792	assert (N%2 == 0);
793
794	Declare2Words(u);
795	AssignWord(u, 0);
796	for (size_t i=0; i<N; i+=2)
797	{
798		AddWithCarry(u, A[i], B[i]);
799		C[i] = LowWord(u);
800		AddWithCarry(u, A[i+1], B[i+1]);
801		C[i+1] = LowWord(u);
802	}
803	return int(GetCarry(u));
804}
805
806int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
807{
808	assert (N%2 == 0);
809
810	Declare2Words(u);
811	AssignWord(u, 0);
812	for (size_t i=0; i<N; i+=2)
813	{
814		SubtractWithBorrow(u, A[i], B[i]);
815		C[i] = LowWord(u);
816		SubtractWithBorrow(u, A[i+1], B[i+1]);
817		C[i+1] = LowWord(u);
818	}
819	return int(GetBorrow(u));
820}
821#endif
822
823static word LinearMultiply(word *C, const word *A, word B, size_t N)
824{
825	word carry=0;
826	for(unsigned i=0; i<N; i++)
827	{
828		Declare2Words(p);
829		MultiplyWords(p, A[i], B);
830		Acc2WordsBy1(p, carry);
831		C[i] = LowWord(p);
832		carry = HighWord(p);
833	}
834	return carry;
835}
836
837#ifndef CRYPTOPP_DOXYGEN_PROCESSING
838
839#define Mul_2 \
840	Mul_Begin(2) \
841	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
842	Mul_End(1, 1)
843
844#define Mul_4 \
845	Mul_Begin(4) \
846	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
847	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
848	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
849	Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
850	Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
851	Mul_End(5, 3)
852
853#define Mul_8 \
854	Mul_Begin(8) \
855	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
856	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
857	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
858	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
859	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
860	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
861	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
862	Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
863	Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
864	Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
865	Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
866	Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
867	Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
868	Mul_End(13, 7)
869
870#define Mul_16 \
871	Mul_Begin(16) \
872	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
873	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
874	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
875	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
876	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
877	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
878	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
879	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
880	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
881	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
882	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
883	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
884	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
885	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
886	Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
887	Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
888	Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
889	Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
890	Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
891	Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
892	Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
893	Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
894	Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
895	Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
896	Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
897	Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
898	Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
899	Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
900	Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
901	Mul_End(29, 15)
902
903#define Squ_2 \
904	Squ_Begin(2) \
905	Squ_End(2)
906
907#define Squ_4 \
908	Squ_Begin(4) \
909	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
910	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
911	Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \
912	Squ_SaveAcc(4, 2, 3) Squ_NonDiag \
913	Squ_End(4)
914
915#define Squ_8 \
916	Squ_Begin(8) \
917	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
918	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
919	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
920	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
921	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
922	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
923	Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
924	Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5)  Squ_NonDiag \
925	Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
926	Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \
927	Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \
928	Squ_SaveAcc(12, 6, 7) Squ_NonDiag \
929	Squ_End(8)
930
931#define Squ_16 \
932	Squ_Begin(16) \
933	Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
934	Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
935	Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
936	Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
937	Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
938	Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
939	Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
940	Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
941	Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
942	Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \
943	Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \
944	Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \
945	Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \
946	Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \
947	Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \
948	Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \
949	Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \
950	Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \
951	Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \
952	Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \
953	Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \
954	Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \
955	Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \
956	Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \
957	Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \
958	Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \
959	Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \
960	Squ_SaveAcc(28, 14, 15) Squ_NonDiag \
961	Squ_End(16)
962
963#define Bot_2 \
964	Mul_Begin(2) \
965	Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \
966	Bot_End(2)
967
968#define Bot_4 \
969	Mul_Begin(4) \
970	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
971	Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2)  \
972	Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0)  \
973	Bot_End(4)
974
975#define Bot_8 \
976	Mul_Begin(8) \
977	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
978	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0)  \
979	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
980	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
981	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
982	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
983	Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \
984	Bot_End(8)
985
986#define Bot_16 \
987	Mul_Begin(16) \
988	Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
989	Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
990	Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
991	Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
992	Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
993	Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
994	Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
995	Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
996	Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
997	Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
998	Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
999	Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
1000	Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
1001	Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
1002	Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
1003	Bot_End(16)
1004
1005#endif
1006
1007#if 0
1008#define Mul_Begin(n)				\
1009	Declare2Words(p)				\
1010	Declare2Words(c)				\
1011	Declare2Words(d)				\
1012	MultiplyWords(p, A[0], B[0])	\
1013	AssignWord(c, LowWord(p))		\
1014	AssignWord(d, HighWord(p))
1015
1016#define Mul_Acc(i, j)				\
1017	MultiplyWords(p, A[i], B[j])	\
1018	Acc2WordsBy1(c, LowWord(p))		\
1019	Acc2WordsBy1(d, HighWord(p))
1020
1021#define Mul_SaveAcc(k, i, j) 		\
1022	R[k] = LowWord(c);				\
1023	Add2WordsBy1(c, d, HighWord(c))	\
1024	MultiplyWords(p, A[i], B[j])	\
1025	AssignWord(d, HighWord(p))		\
1026	Acc2WordsBy1(c, LowWord(p))
1027
1028#define Mul_End(n)					\
1029	R[2*n-3] = LowWord(c);			\
1030	Acc2WordsBy1(d, HighWord(c))	\
1031	MultiplyWords(p, A[n-1], B[n-1])\
1032	Acc2WordsBy2(d, p)				\
1033	R[2*n-2] = LowWord(d);			\
1034	R[2*n-1] = HighWord(d);
1035
1036#define Bot_SaveAcc(k, i, j)		\
1037	R[k] = LowWord(c);				\
1038	word e = LowWord(d) + HighWord(c);	\
1039	e += A[i] * B[j];
1040
1041#define Bot_Acc(i, j)	\
1042	e += A[i] * B[j];
1043
1044#define Bot_End(n)		\
1045	R[n-1] = e;
1046#else
1047#define Mul_Begin(n)				\
1048	Declare2Words(p)				\
1049	word c;	\
1050	Declare2Words(d)				\
1051	MultiplyWords(p, A[0], B[0])	\
1052	c = LowWord(p);		\
1053	AssignWord(d, HighWord(p))
1054
1055#define Mul_Acc(i, j)				\
1056	MulAcc(c, d, A[i], B[j])
1057
1058#define Mul_SaveAcc(k, i, j) 		\
1059	R[k] = c;				\
1060	c = LowWord(d);	\
1061	AssignWord(d, HighWord(d))	\
1062	MulAcc(c, d, A[i], B[j])
1063
1064#define Mul_End(k, i)					\
1065	R[k] = c;			\
1066	MultiplyWords(p, A[i], B[i])	\
1067	Acc2WordsBy2(p, d)				\
1068	R[k+1] = LowWord(p);			\
1069	R[k+2] = HighWord(p);
1070
1071#define Bot_SaveAcc(k, i, j)		\
1072	R[k] = c;				\
1073	c = LowWord(d);	\
1074	c += A[i] * B[j];
1075
1076#define Bot_Acc(i, j)	\
1077	c += A[i] * B[j];
1078
1079#define Bot_End(n)		\
1080	R[n-1] = c;
1081#endif
1082
1083#define Squ_Begin(n)				\
1084	Declare2Words(p)				\
1085	word c;				\
1086	Declare2Words(d)				\
1087	Declare2Words(e)				\
1088	MultiplyWords(p, A[0], A[0])	\
1089	R[0] = LowWord(p);				\
1090	AssignWord(e, HighWord(p))		\
1091	MultiplyWords(p, A[0], A[1])	\
1092	c = LowWord(p);		\
1093	AssignWord(d, HighWord(p))		\
1094	Squ_NonDiag						\
1095
1096#define Squ_NonDiag				\
1097	Double3Words(c, d)
1098
1099#define Squ_SaveAcc(k, i, j) 		\
1100	Acc3WordsBy2(c, d, e)			\
1101	R[k] = c;				\
1102	MultiplyWords(p, A[i], A[j])	\
1103	c = LowWord(p);		\
1104	AssignWord(d, HighWord(p))		\
1105
1106#define Squ_Acc(i, j)				\
1107	MulAcc(c, d, A[i], A[j])
1108
1109#define Squ_Diag(i)					\
1110	Squ_NonDiag						\
1111	MulAcc(c, d, A[i], A[i])
1112
1113#define Squ_End(n)					\
1114	Acc3WordsBy2(c, d, e)			\
1115	R[2*n-3] = c;			\
1116	MultiplyWords(p, A[n-1], A[n-1])\
1117	Acc2WordsBy2(p, e)				\
1118	R[2*n-2] = LowWord(p);			\
1119	R[2*n-1] = HighWord(p);
1120
1121void Baseline_Multiply2(word *R, const word *A, const word *B)
1122{
1123	Mul_2
1124}
1125
1126void Baseline_Multiply4(word *R, const word *A, const word *B)
1127{
1128	Mul_4
1129}
1130
1131void Baseline_Multiply8(word *R, const word *A, const word *B)
1132{
1133	Mul_8
1134}
1135
1136void Baseline_Square2(word *R, const word *A)
1137{
1138	Squ_2
1139}
1140
1141void Baseline_Square4(word *R, const word *A)
1142{
1143	Squ_4
1144}
1145
1146void Baseline_Square8(word *R, const word *A)
1147{
1148	Squ_8
1149}
1150
1151void Baseline_MultiplyBottom2(word *R, const word *A, const word *B)
1152{
1153	Bot_2
1154}
1155
1156void Baseline_MultiplyBottom4(word *R, const word *A, const word *B)
1157{
1158	Bot_4
1159}
1160
1161void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)
1162{
1163	Bot_8
1164}
1165
1166#define Top_Begin(n)				\
1167	Declare2Words(p)				\
1168	word c;	\
1169	Declare2Words(d)				\
1170	MultiplyWords(p, A[0], B[n-2]);\
1171	AssignWord(d, HighWord(p));
1172
1173#define Top_Acc(i, j)	\
1174	MultiplyWords(p, A[i], B[j]);\
1175	Acc2WordsBy1(d, HighWord(p));
1176
1177#define Top_SaveAcc0(i, j) 		\
1178	c = LowWord(d);	\
1179	AssignWord(d, HighWord(d))	\
1180	MulAcc(c, d, A[i], B[j])
1181
1182#define Top_SaveAcc1(i, j) 		\
1183	c = L<c; \
1184	Acc2WordsBy1(d, c);	\
1185	c = LowWord(d);	\
1186	AssignWord(d, HighWord(d))	\
1187	MulAcc(c, d, A[i], B[j])
1188
1189void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)
1190{
1191	word T[4];
1192	Baseline_Multiply2(T, A, B);
1193	R[0] = T[2];
1194	R[1] = T[3];
1195}
1196
1197void Baseline_MultiplyTop4(word *R, const word *A, const word *B, word L)
1198{
1199	Top_Begin(4)
1200	Top_Acc(1, 1) Top_Acc(2, 0)  \
1201	Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0)  \
1202	Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1)  \
1203	Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
1204	Mul_End(1, 3)
1205}
1206
1207void Baseline_MultiplyTop8(word *R, const word *A, const word *B, word L)
1208{
1209	Top_Begin(8)
1210	Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
1211	Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1212	Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
1213	Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
1214	Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
1215	Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
1216	Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
1217	Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
1218	Mul_End(5, 7)
1219}
1220
1221#if !CRYPTOPP_INTEGER_SSE2	// save memory by not compiling these functions when SSE2 is available
1222void Baseline_Multiply16(word *R, const word *A, const word *B)
1223{
1224	Mul_16
1225}
1226
1227void Baseline_Square16(word *R, const word *A)
1228{
1229	Squ_16
1230}
1231
1232void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
1233{
1234	Bot_16
1235}
1236
1237void Baseline_MultiplyTop16(word *R, const word *A, const word *B, word L)
1238{
1239	Top_Begin(16)
1240	Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
1241	Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
1242	Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
1243	Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
1244	Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
1245	Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
1246	Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
1247	Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
1248	Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
1249	Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
1250	Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
1251	Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
1252	Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
1253	Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
1254	Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
1255	Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
1256	Mul_End(13, 15)
1257}
1258#endif
1259
1260// ********************************************************
1261
1262#if CRYPTOPP_INTEGER_SSE2
1263
1264CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
1265
1266#undef Mul_Begin
1267#undef Mul_Acc
1268#undef Top_Begin
1269#undef Top_Acc
1270#undef Squ_Acc
1271#undef Squ_NonDiag
1272#undef Squ_Diag
1273#undef Squ_SaveAcc
1274#undef Squ_Begin
1275#undef Mul_SaveAcc
1276#undef Bot_Acc
1277#undef Bot_SaveAcc
1278#undef Bot_End
1279#undef Squ_End
1280#undef Mul_End
1281
1282#define SSE2_FinalSave(k)			\
1283	AS2(	psllq		xmm5, 16)	\
1284	AS2(	paddq		xmm4, xmm5)	\
1285	AS2(	movq		QWORD PTR [ecx+8*(k)], xmm4)
1286
1287#define SSE2_SaveShift(k)			\
1288	AS2(	movq		xmm0, xmm6)	\
1289	AS2(	punpckhqdq	xmm6, xmm0)	\
1290	AS2(	movq		xmm1, xmm7)	\
1291	AS2(	punpckhqdq	xmm7, xmm1)	\
1292	AS2(	paddd		xmm6, xmm0)	\
1293	AS2(	pslldq		xmm6, 4)	\
1294	AS2(	paddd		xmm7, xmm1)	\
1295	AS2(	paddd		xmm4, xmm6)	\
1296	AS2(	pslldq		xmm7, 4)	\
1297	AS2(	movq		xmm6, xmm4)	\
1298	AS2(	paddd		xmm5, xmm7)	\
1299	AS2(	movq		xmm7, xmm5)	\
1300	AS2(	movd		DWORD PTR [ecx+8*(k)], xmm4)	\
1301	AS2(	psrlq		xmm6, 16)	\
1302	AS2(	paddq		xmm6, xmm7)	\
1303	AS2(	punpckhqdq	xmm4, xmm0)	\
1304	AS2(	punpckhqdq	xmm5, xmm0)	\
1305	AS2(	movq		QWORD PTR [ecx+8*(k)+2], xmm6)	\
1306	AS2(	psrlq		xmm6, 3*16)	\
1307	AS2(	paddd		xmm4, xmm6)	\
1308
1309#define Squ_SSE2_SaveShift(k)			\
1310	AS2(	movq		xmm0, xmm6)	\
1311	AS2(	punpckhqdq	xmm6, xmm0)	\
1312	AS2(	movq		xmm1, xmm7)	\
1313	AS2(	punpckhqdq	xmm7, xmm1)	\
1314	AS2(	paddd		xmm6, xmm0)	\
1315	AS2(	pslldq		xmm6, 4)	\
1316	AS2(	paddd		xmm7, xmm1)	\
1317	AS2(	paddd		xmm4, xmm6)	\
1318	AS2(	pslldq		xmm7, 4)	\
1319	AS2(	movhlps		xmm6, xmm4)	\
1320	AS2(	movd		DWORD PTR [ecx+8*(k)], xmm4)	\
1321	AS2(	paddd		xmm5, xmm7)	\
1322	AS2(	movhps		QWORD PTR [esp+12], xmm5)\
1323	AS2(	psrlq		xmm4, 16)	\
1324	AS2(	paddq		xmm4, xmm5)	\
1325	AS2(	movq		QWORD PTR [ecx+8*(k)+2], xmm4)	\
1326	AS2(	psrlq		xmm4, 3*16)	\
1327	AS2(	paddd		xmm4, xmm6)	\
1328	AS2(	movq		QWORD PTR [esp+4], xmm4)\
1329
1330#define SSE2_FirstMultiply(i)				\
1331	AS2(	movdqa		xmm7, [esi+(i)*16])\
1332	AS2(	movdqa		xmm5, [edi-(i)*16])\
1333	AS2(	pmuludq		xmm5, xmm7)		\
1334	AS2(	movdqa		xmm4, [ebx])\
1335	AS2(	movdqa		xmm6, xmm4)		\
1336	AS2(	pand		xmm4, xmm5)		\
1337	AS2(	psrld		xmm5, 16)		\
1338	AS2(	pmuludq		xmm7, [edx-(i)*16])\
1339	AS2(	pand		xmm6, xmm7)		\
1340	AS2(	psrld		xmm7, 16)
1341
1342#define Squ_Begin(n)							\
1343	SquPrologue									\
1344	AS2(	mov		esi, esp)\
1345	AS2(	and		esp, 0xfffffff0)\
1346	AS2(	lea		edi, [esp-32*n])\
1347	AS2(	sub		esp, 32*n+16)\
1348	AS1(	push	esi)\
1349	AS2(	mov		esi, edi)					\
1350	AS2(	xor		edx, edx)					\
1351	ASL(1)										\
1352	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
1353	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
1354	AS2(	movdqa	[edi+2*edx], xmm0)		\
1355	AS2(	psrlq	xmm0, 32)					\
1356	AS2(	movdqa	[edi+2*edx+16], xmm0)	\
1357	AS2(	movdqa	[edi+16*n+2*edx], xmm1)		\
1358	AS2(	psrlq	xmm1, 32)					\
1359	AS2(	movdqa	[edi+16*n+2*edx+16], xmm1)	\
1360	AS2(	add		edx, 16)					\
1361	AS2(	cmp		edx, 8*(n))					\
1362	ASJ(	jne,	1, b)						\
1363	AS2(	lea		edx, [edi+16*n])\
1364	SSE2_FirstMultiply(0)							\
1365
1366#define Squ_Acc(i)								\
1367	ASL(LSqu##i)								\
1368	AS2(	movdqa		xmm1, [esi+(i)*16])	\
1369	AS2(	movdqa		xmm0, [edi-(i)*16])	\
1370	AS2(	movdqa		xmm2, [ebx])	\
1371	AS2(	pmuludq		xmm0, xmm1)				\
1372	AS2(	pmuludq		xmm1, [edx-(i)*16])	\
1373	AS2(	movdqa		xmm3, xmm2)			\
1374	AS2(	pand		xmm2, xmm0)			\
1375	AS2(	psrld		xmm0, 16)			\
1376	AS2(	paddd		xmm4, xmm2)			\
1377	AS2(	paddd		xmm5, xmm0)			\
1378	AS2(	pand		xmm3, xmm1)			\
1379	AS2(	psrld		xmm1, 16)			\
1380	AS2(	paddd		xmm6, xmm3)			\
1381	AS2(	paddd		xmm7, xmm1)		\
1382
1383#define Squ_Acc1(i)
1384#define Squ_Acc2(i)		ASC(call, LSqu##i)
1385#define Squ_Acc3(i)		Squ_Acc2(i)
1386#define Squ_Acc4(i)		Squ_Acc2(i)
1387#define Squ_Acc5(i)		Squ_Acc2(i)
1388#define Squ_Acc6(i)		Squ_Acc2(i)
1389#define Squ_Acc7(i)		Squ_Acc2(i)
1390#define Squ_Acc8(i)		Squ_Acc2(i)
1391
1392#define SSE2_End(E, n)					\
1393	SSE2_SaveShift(2*(n)-3)			\
1394	AS2(	movdqa		xmm7, [esi+16])	\
1395	AS2(	movdqa		xmm0, [edi])	\
1396	AS2(	pmuludq		xmm0, xmm7)				\
1397	AS2(	movdqa		xmm2, [ebx])		\
1398	AS2(	pmuludq		xmm7, [edx])	\
1399	AS2(	movdqa		xmm6, xmm2)				\
1400	AS2(	pand		xmm2, xmm0)				\
1401	AS2(	psrld		xmm0, 16)				\
1402	AS2(	paddd		xmm4, xmm2)				\
1403	AS2(	paddd		xmm5, xmm0)				\
1404	AS2(	pand		xmm6, xmm7)				\
1405	AS2(	psrld		xmm7, 16)	\
1406	SSE2_SaveShift(2*(n)-2)			\
1407	SSE2_FinalSave(2*(n)-1)			\
1408	AS1(	pop		esp)\
1409	E
1410
1411#define Squ_End(n)		SSE2_End(SquEpilogue, n)
1412#define Mul_End(n)		SSE2_End(MulEpilogue, n)
1413#define Top_End(n)		SSE2_End(TopEpilogue, n)
1414
1415#define Squ_Column1(k, i)	\
1416	Squ_SSE2_SaveShift(k)					\
1417	AS2(	add			esi, 16)	\
1418	SSE2_FirstMultiply(1)\
1419	Squ_Acc##i(i)	\
1420	AS2(	paddd		xmm4, xmm4)		\
1421	AS2(	paddd		xmm5, xmm5)		\
1422	AS2(	movdqa		xmm3, [esi])				\
1423	AS2(	movq		xmm1, QWORD PTR [esi+8])	\
1424	AS2(	pmuludq		xmm1, xmm3)		\
1425	AS2(	pmuludq		xmm3, xmm3)		\
1426	AS2(	movdqa		xmm0, [ebx])\
1427	AS2(	movdqa		xmm2, xmm0)		\
1428	AS2(	pand		xmm0, xmm1)		\
1429	AS2(	psrld		xmm1, 16)		\
1430	AS2(	paddd		xmm6, xmm0)		\
1431	AS2(	paddd		xmm7, xmm1)		\
1432	AS2(	pand		xmm2, xmm3)		\
1433	AS2(	psrld		xmm3, 16)		\
1434	AS2(	paddd		xmm6, xmm6)		\
1435	AS2(	paddd		xmm7, xmm7)		\
1436	AS2(	paddd		xmm4, xmm2)		\
1437	AS2(	paddd		xmm5, xmm3)		\
1438	AS2(	movq		xmm0, QWORD PTR [esp+4])\
1439	AS2(	movq		xmm1, QWORD PTR [esp+12])\
1440	AS2(	paddd		xmm4, xmm0)\
1441	AS2(	paddd		xmm5, xmm1)\
1442
1443#define Squ_Column0(k, i)	\
1444	Squ_SSE2_SaveShift(k)					\
1445	AS2(	add			edi, 16)	\
1446	AS2(	add			edx, 16)	\
1447	SSE2_FirstMultiply(1)\
1448	Squ_Acc##i(i)	\
1449	AS2(	paddd		xmm6, xmm6)		\
1450	AS2(	paddd		xmm7, xmm7)		\
1451	AS2(	paddd		xmm4, xmm4)		\
1452	AS2(	paddd		xmm5, xmm5)		\
1453	AS2(	movq		xmm0, QWORD PTR [esp+4])\
1454	AS2(	movq		xmm1, QWORD PTR [esp+12])\
1455	AS2(	paddd		xmm4, xmm0)\
1456	AS2(	paddd		xmm5, xmm1)\
1457
1458#define SSE2_MulAdd45						\
1459	AS2(	movdqa		xmm7, [esi])	\
1460	AS2(	movdqa		xmm0, [edi])	\
1461	AS2(	pmuludq		xmm0, xmm7)				\
1462	AS2(	movdqa		xmm2, [ebx])		\
1463	AS2(	pmuludq		xmm7, [edx])	\
1464	AS2(	movdqa		xmm6, xmm2)				\
1465	AS2(	pand		xmm2, xmm0)				\
1466	AS2(	psrld		xmm0, 16)				\
1467	AS2(	paddd		xmm4, xmm2)				\
1468	AS2(	paddd		xmm5, xmm0)				\
1469	AS2(	pand		xmm6, xmm7)				\
1470	AS2(	psrld		xmm7, 16)
1471
1472#define Mul_Begin(n)							\
1473	MulPrologue									\
1474	AS2(	mov		esi, esp)\
1475	AS2(	and		esp, 0xfffffff0)\
1476	AS2(	sub		esp, 48*n+16)\
1477	AS1(	push	esi)\
1478	AS2(	xor		edx, edx)					\
1479	ASL(1)										\
1480	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
1481	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
1482	ASS(	pshufd	xmm2, [edi+edx], 3,1,2,0)	\
1483	AS2(	movdqa	[esp+20+2*edx], xmm0)		\
1484	AS2(	psrlq	xmm0, 32)					\
1485	AS2(	movdqa	[esp+20+2*edx+16], xmm0)	\
1486	AS2(	movdqa	[esp+20+16*n+2*edx], xmm1)		\
1487	AS2(	psrlq	xmm1, 32)					\
1488	AS2(	movdqa	[esp+20+16*n+2*edx+16], xmm1)	\
1489	AS2(	movdqa	[esp+20+32*n+2*edx], xmm2)		\
1490	AS2(	psrlq	xmm2, 32)					\
1491	AS2(	movdqa	[esp+20+32*n+2*edx+16], xmm2)	\
1492	AS2(	add		edx, 16)					\
1493	AS2(	cmp		edx, 8*(n))					\
1494	ASJ(	jne,	1, b)						\
1495	AS2(	lea		edi, [esp+20])\
1496	AS2(	lea		edx, [esp+20+16*n])\
1497	AS2(	lea		esi, [esp+20+32*n])\
1498	SSE2_FirstMultiply(0)							\
1499
1500#define Mul_Acc(i)								\
1501	ASL(LMul##i)										\
1502	AS2(	movdqa		xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])	\
1503	AS2(	movdqa		xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])	\
1504	AS2(	movdqa		xmm2, [ebx])	\
1505	AS2(	pmuludq		xmm0, xmm1)				\
1506	AS2(	pmuludq		xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])	\
1507	AS2(	movdqa		xmm3, xmm2)			\
1508	AS2(	pand		xmm2, xmm0)			\
1509	AS2(	psrld		xmm0, 16)			\
1510	AS2(	paddd		xmm4, xmm2)			\
1511	AS2(	paddd		xmm5, xmm0)			\
1512	AS2(	pand		xmm3, xmm1)			\
1513	AS2(	psrld		xmm1, 16)			\
1514	AS2(	paddd		xmm6, xmm3)			\
1515	AS2(	paddd		xmm7, xmm1)		\
1516
1517#define Mul_Acc1(i)
1518#define Mul_Acc2(i)		ASC(call, LMul##i)
1519#define Mul_Acc3(i)		Mul_Acc2(i)
1520#define Mul_Acc4(i)		Mul_Acc2(i)
1521#define Mul_Acc5(i)		Mul_Acc2(i)
1522#define Mul_Acc6(i)		Mul_Acc2(i)
1523#define Mul_Acc7(i)		Mul_Acc2(i)
1524#define Mul_Acc8(i)		Mul_Acc2(i)
1525#define Mul_Acc9(i)		Mul_Acc2(i)
1526#define Mul_Acc10(i)	Mul_Acc2(i)
1527#define Mul_Acc11(i)	Mul_Acc2(i)
1528#define Mul_Acc12(i)	Mul_Acc2(i)
1529#define Mul_Acc13(i)	Mul_Acc2(i)
1530#define Mul_Acc14(i)	Mul_Acc2(i)
1531#define Mul_Acc15(i)	Mul_Acc2(i)
1532#define Mul_Acc16(i)	Mul_Acc2(i)
1533
1534#define Mul_Column1(k, i)	\
1535	SSE2_SaveShift(k)					\
1536	AS2(	add			esi, 16)	\
1537	SSE2_MulAdd45\
1538	Mul_Acc##i(i)	\
1539
1540#define Mul_Column0(k, i)	\
1541	SSE2_SaveShift(k)					\
1542	AS2(	add			edi, 16)	\
1543	AS2(	add			edx, 16)	\
1544	SSE2_MulAdd45\
1545	Mul_Acc##i(i)	\
1546
1547#define Bot_Acc(i)							\
1548	AS2(	movdqa		xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16])	\
1549	AS2(	movdqa		xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16])	\
1550	AS2(	pmuludq		xmm0, xmm1)				\
1551	AS2(	pmuludq		xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16])		\
1552	AS2(	paddq		xmm4, xmm0)				\
1553	AS2(	paddd		xmm6, xmm1)
1554
1555#define Bot_SaveAcc(k)					\
1556	SSE2_SaveShift(k)							\
1557	AS2(	add			edi, 16)	\
1558	AS2(	add			edx, 16)	\
1559	AS2(	movdqa		xmm6, [esi])	\
1560	AS2(	movdqa		xmm0, [edi])	\
1561	AS2(	pmuludq		xmm0, xmm6)				\
1562	AS2(	paddq		xmm4, xmm0)				\
1563	AS2(	psllq		xmm5, 16)				\
1564	AS2(	paddq		xmm4, xmm5)				\
1565	AS2(	pmuludq		xmm6, [edx])
1566
1567#define Bot_End(n)							\
1568	AS2(	movhlps		xmm7, xmm6)			\
1569	AS2(	paddd		xmm6, xmm7)			\
1570	AS2(	psllq		xmm6, 32)			\
1571	AS2(	paddd		xmm4, xmm6)			\
1572	AS2(	movq		QWORD PTR [ecx+8*((n)-1)], xmm4)	\
1573	AS1(	pop		esp)\
1574	MulEpilogue
1575
1576#define Top_Begin(n)							\
1577	TopPrologue									\
1578	AS2(	mov		edx, esp)\
1579	AS2(	and		esp, 0xfffffff0)\
1580	AS2(	sub		esp, 48*n+16)\
1581	AS1(	push	edx)\
1582	AS2(	xor		edx, edx)					\
1583	ASL(1)										\
1584	ASS(	pshufd	xmm0, [eax+edx], 3,1,2,0)	\
1585	ASS(	pshufd	xmm1, [eax+edx], 2,0,3,1)	\
1586	ASS(	pshufd	xmm2, [edi+edx], 3,1,2,0)	\
1587	AS2(	movdqa	[esp+20+2*edx], xmm0)		\
1588	AS2(	psrlq	xmm0, 32)					\
1589	AS2(	movdqa	[esp+20+2*edx+16], xmm0)	\
1590	AS2(	movdqa	[esp+20+16*n+2*edx], xmm1)		\
1591	AS2(	psrlq	xmm1, 32)					\
1592	AS2(	movdqa	[esp+20+16*n+2*edx+16], xmm1)	\
1593	AS2(	movdqa	[esp+20+32*n+2*edx], xmm2)		\
1594	AS2(	psrlq	xmm2, 32)					\
1595	AS2(	movdqa	[esp+20+32*n+2*edx+16], xmm2)	\
1596	AS2(	add		edx, 16)					\
1597	AS2(	cmp		edx, 8*(n))					\
1598	ASJ(	jne,	1, b)						\
1599	AS2(	mov		eax, esi)					\
1600	AS2(	lea		edi, [esp+20+00*n+16*(n/2-1)])\
1601	AS2(	lea		edx, [esp+20+16*n+16*(n/2-1)])\
1602	AS2(	lea		esi, [esp+20+32*n+16*(n/2-1)])\
1603	AS2(	pxor	xmm4, xmm4)\
1604	AS2(	pxor	xmm5, xmm5)
1605
1606#define Top_Acc(i)							\
1607	AS2(	movq		xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8])	\
1608	AS2(	pmuludq		xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16])	\
1609	AS2(	psrlq		xmm0, 48)				\
1610	AS2(	paddd		xmm5, xmm0)\
1611
1612#define Top_Column0(i)	\
1613	AS2(	psllq		xmm5, 32)				\
1614	AS2(	add			edi, 16)	\
1615	AS2(	add			edx, 16)	\
1616	SSE2_MulAdd45\
1617	Mul_Acc##i(i)	\
1618
1619#define Top_Column1(i)	\
1620	SSE2_SaveShift(0)					\
1621	AS2(	add			esi, 16)	\
1622	SSE2_MulAdd45\
1623	Mul_Acc##i(i)	\
1624	AS2(	shr			eax, 16)	\
1625	AS2(	movd		xmm0, eax)\
1626	AS2(	movd		xmm1, [ecx+4])\
1627	AS2(	psrld		xmm1, 16)\
1628	AS2(	pcmpgtd		xmm1, xmm0)\
1629	AS2(	psrld		xmm1, 31)\
1630	AS2(	paddd		xmm4, xmm1)\
1631
1632void SSE2_Square4(word *C, const word *A)
1633{
1634	Squ_Begin(2)
1635	Squ_Column0(0, 1)
1636	Squ_End(2)
1637}
1638
1639void SSE2_Square8(word *C, const word *A)
1640{
1641	Squ_Begin(4)
1642#ifndef __GNUC__
1643	ASJ(	jmp,	0, f)
1644	Squ_Acc(2)
1645	AS1(	ret) ASL(0)
1646#endif
1647	Squ_Column0(0, 1)
1648	Squ_Column1(1, 1)
1649	Squ_Column0(2, 2)
1650	Squ_Column1(3, 1)
1651	Squ_Column0(4, 1)
1652	Squ_End(4)
1653}
1654
1655void SSE2_Square16(word *C, const word *A)
1656{
1657	Squ_Begin(8)
1658#ifndef __GNUC__
1659	ASJ(	jmp,	0, f)
1660	Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1661	AS1(	ret) ASL(0)
1662#endif
1663	Squ_Column0(0, 1)
1664	Squ_Column1(1, 1)
1665	Squ_Column0(2, 2)
1666	Squ_Column1(3, 2)
1667	Squ_Column0(4, 3)
1668	Squ_Column1(5, 3)
1669	Squ_Column0(6, 4)
1670	Squ_Column1(7, 3)
1671	Squ_Column0(8, 3)
1672	Squ_Column1(9, 2)
1673	Squ_Column0(10, 2)
1674	Squ_Column1(11, 1)
1675	Squ_Column0(12, 1)
1676	Squ_End(8)
1677}
1678
1679void SSE2_Square32(word *C, const word *A)
1680{
1681	Squ_Begin(16)
1682	ASJ(	jmp,	0, f)
1683	Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1684	AS1(	ret) ASL(0)
1685	Squ_Column0(0, 1)
1686	Squ_Column1(1, 1)
1687	Squ_Column0(2, 2)
1688	Squ_Column1(3, 2)
1689	Squ_Column0(4, 3)
1690	Squ_Column1(5, 3)
1691	Squ_Column0(6, 4)
1692	Squ_Column1(7, 4)
1693	Squ_Column0(8, 5)
1694	Squ_Column1(9, 5)
1695	Squ_Column0(10, 6)
1696	Squ_Column1(11, 6)
1697	Squ_Column0(12, 7)
1698	Squ_Column1(13, 7)
1699	Squ_Column0(14, 8)
1700	Squ_Column1(15, 7)
1701	Squ_Column0(16, 7)
1702	Squ_Column1(17, 6)
1703	Squ_Column0(18, 6)
1704	Squ_Column1(19, 5)
1705	Squ_Column0(20, 5)
1706	Squ_Column1(21, 4)
1707	Squ_Column0(22, 4)
1708	Squ_Column1(23, 3)
1709	Squ_Column0(24, 3)
1710	Squ_Column1(25, 2)
1711	Squ_Column0(26, 2)
1712	Squ_Column1(27, 1)
1713	Squ_Column0(28, 1)
1714	Squ_End(16)
1715}
1716
1717void SSE2_Multiply4(word *C, const word *A, const word *B)
1718{
1719	Mul_Begin(2)
1720#ifndef __GNUC__
1721	ASJ(	jmp,	0, f)
1722	Mul_Acc(2)
1723	AS1(	ret) ASL(0)
1724#endif
1725	Mul_Column0(0, 2)
1726	Mul_End(2)
1727}
1728
1729void SSE2_Multiply8(word *C, const word *A, const word *B)
1730{
1731	Mul_Begin(4)
1732#ifndef __GNUC__
1733	ASJ(	jmp,	0, f)
1734	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1735	AS1(	ret) ASL(0)
1736#endif
1737	Mul_Column0(0, 2)
1738	Mul_Column1(1, 3)
1739	Mul_Column0(2, 4)
1740	Mul_Column1(3, 3)
1741	Mul_Column0(4, 2)
1742	Mul_End(4)
1743}
1744
1745void SSE2_Multiply16(word *C, const word *A, const word *B)
1746{
1747	Mul_Begin(8)
1748#ifndef __GNUC__
1749	ASJ(	jmp,	0, f)
1750	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1751	AS1(	ret) ASL(0)
1752#endif
1753	Mul_Column0(0, 2)
1754	Mul_Column1(1, 3)
1755	Mul_Column0(2, 4)
1756	Mul_Column1(3, 5)
1757	Mul_Column0(4, 6)
1758	Mul_Column1(5, 7)
1759	Mul_Column0(6, 8)
1760	Mul_Column1(7, 7)
1761	Mul_Column0(8, 6)
1762	Mul_Column1(9, 5)
1763	Mul_Column0(10, 4)
1764	Mul_Column1(11, 3)
1765	Mul_Column0(12, 2)
1766	Mul_End(8)
1767}
1768
1769void SSE2_Multiply32(word *C, const word *A, const word *B)
1770{
1771	Mul_Begin(16)
1772	ASJ(	jmp,	0, f)
1773	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1774	AS1(	ret) ASL(0)
1775	Mul_Column0(0, 2)
1776	Mul_Column1(1, 3)
1777	Mul_Column0(2, 4)
1778	Mul_Column1(3, 5)
1779	Mul_Column0(4, 6)
1780	Mul_Column1(5, 7)
1781	Mul_Column0(6, 8)
1782	Mul_Column1(7, 9)
1783	Mul_Column0(8, 10)
1784	Mul_Column1(9, 11)
1785	Mul_Column0(10, 12)
1786	Mul_Column1(11, 13)
1787	Mul_Column0(12, 14)
1788	Mul_Column1(13, 15)
1789	Mul_Column0(14, 16)
1790	Mul_Column1(15, 15)
1791	Mul_Column0(16, 14)
1792	Mul_Column1(17, 13)
1793	Mul_Column0(18, 12)
1794	Mul_Column1(19, 11)
1795	Mul_Column0(20, 10)
1796	Mul_Column1(21, 9)
1797	Mul_Column0(22, 8)
1798	Mul_Column1(23, 7)
1799	Mul_Column0(24, 6)
1800	Mul_Column1(25, 5)
1801	Mul_Column0(26, 4)
1802	Mul_Column1(27, 3)
1803	Mul_Column0(28, 2)
1804	Mul_End(16)
1805}
1806
1807void SSE2_MultiplyBottom4(word *C, const word *A, const word *B)
1808{
1809	Mul_Begin(2)
1810	Bot_SaveAcc(0) Bot_Acc(2)
1811	Bot_End(2)
1812}
1813
1814void SSE2_MultiplyBottom8(word *C, const word *A, const word *B)
1815{
1816	Mul_Begin(4)
1817#ifndef __GNUC__
1818	ASJ(	jmp,	0, f)
1819	Mul_Acc(3) Mul_Acc(2)
1820	AS1(	ret) ASL(0)
1821#endif
1822	Mul_Column0(0, 2)
1823	Mul_Column1(1, 3)
1824	Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1825	Bot_End(4)
1826}
1827
1828void SSE2_MultiplyBottom16(word *C, const word *A, const word *B)
1829{
1830	Mul_Begin(8)
1831#ifndef __GNUC__
1832	ASJ(	jmp,	0, f)
1833	Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1834	AS1(	ret) ASL(0)
1835#endif
1836	Mul_Column0(0, 2)
1837	Mul_Column1(1, 3)
1838	Mul_Column0(2, 4)
1839	Mul_Column1(3, 5)
1840	Mul_Column0(4, 6)
1841	Mul_Column1(5, 7)
1842	Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1843	Bot_End(8)
1844}
1845
1846void SSE2_MultiplyBottom32(word *C, const word *A, const word *B)
1847{
1848	Mul_Begin(16)
1849#ifndef __GNUC__
1850	ASJ(	jmp,	0, f)
1851	Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1852	AS1(	ret) ASL(0)
1853#endif
1854	Mul_Column0(0, 2)
1855	Mul_Column1(1, 3)
1856	Mul_Column0(2, 4)
1857	Mul_Column1(3, 5)
1858	Mul_Column0(4, 6)
1859	Mul_Column1(5, 7)
1860	Mul_Column0(6, 8)
1861	Mul_Column1(7, 9)
1862	Mul_Column0(8, 10)
1863	Mul_Column1(9, 11)
1864	Mul_Column0(10, 12)
1865	Mul_Column1(11, 13)
1866	Mul_Column0(12, 14)
1867	Mul_Column1(13, 15)
1868	Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1869	Bot_End(16)
1870}
1871
1872void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L)
1873{
1874	Top_Begin(4)
1875	Top_Acc(3) Top_Acc(2) Top_Acc(1)
1876#ifndef __GNUC__
1877	ASJ(	jmp,	0, f)
1878	Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1879	AS1(	ret) ASL(0)
1880#endif
1881	Top_Column0(4)
1882	Top_Column1(3)
1883	Mul_Column0(0, 2)
1884	Top_End(2)
1885}
1886
1887void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L)
1888{
1889	Top_Begin(8)
1890	Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1891#ifndef __GNUC__
1892	ASJ(	jmp,	0, f)
1893	Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1894	AS1(	ret) ASL(0)
1895#endif
1896	Top_Column0(8)
1897	Top_Column1(7)
1898	Mul_Column0(0, 6)
1899	Mul_Column1(1, 5)
1900	Mul_Column0(2, 4)
1901	Mul_Column1(3, 3)
1902	Mul_Column0(4, 2)
1903	Top_End(4)
1904}
1905
1906void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
1907{
1908	Top_Begin(16)
1909	Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1910#ifndef __GNUC__
1911	ASJ(	jmp,	0, f)
1912	Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1913	AS1(	ret) ASL(0)
1914#endif
1915	Top_Column0(16)
1916	Top_Column1(15)
1917	Mul_Column0(0, 14)
1918	Mul_Column1(1, 13)
1919	Mul_Column0(2, 12)
1920	Mul_Column1(3, 11)
1921	Mul_Column0(4, 10)
1922	Mul_Column1(5, 9)
1923	Mul_Column0(6, 8)
1924	Mul_Column1(7, 7)
1925	Mul_Column0(8, 6)
1926	Mul_Column1(9, 5)
1927	Mul_Column0(10, 4)
1928	Mul_Column1(11, 3)
1929	Mul_Column0(12, 2)
1930	Top_End(8)
1931}
1932
1933#endif	// #if CRYPTOPP_INTEGER_SSE2
1934
1935// ********************************************************
1936
1937typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
1938typedef void (* PMul)(word *C, const word *A, const word *B);
1939typedef void (* PSqu)(word *C, const word *A);
1940typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
1941
1942#if CRYPTOPP_INTEGER_SSE2
1943static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
1944static size_t s_recursionLimit = 8;
1945#else
1946static const size_t s_recursionLimit = 16;
1947#endif
1948
1949static PMul s_pMul[9], s_pBot[9];
1950static PSqu s_pSqu[9];
1951static PMulTop s_pTop[9];
1952
1953static void SetFunctionPointers()
1954{
1955	s_pMul[0] = &Baseline_Multiply2;
1956	s_pBot[0] = &Baseline_MultiplyBottom2;
1957	s_pSqu[0] = &Baseline_Square2;
1958	s_pTop[0] = &Baseline_MultiplyTop2;
1959	s_pTop[1] = &Baseline_MultiplyTop4;
1960
1961#if CRYPTOPP_INTEGER_SSE2
1962	if (HasSSE2())
1963	{
1964#if _MSC_VER != 1200 || defined(NDEBUG)
1965		if (IsP4())
1966		{
1967			s_pAdd = &SSE2_Add;
1968			s_pSub = &SSE2_Sub;
1969		}
1970#endif
1971
1972		s_recursionLimit = 32;
1973
1974		s_pMul[1] = &SSE2_Multiply4;
1975		s_pMul[2] = &SSE2_Multiply8;
1976		s_pMul[4] = &SSE2_Multiply16;
1977		s_pMul[8] = &SSE2_Multiply32;
1978
1979		s_pBot[1] = &SSE2_MultiplyBottom4;
1980		s_pBot[2] = &SSE2_MultiplyBottom8;
1981		s_pBot[4] = &SSE2_MultiplyBottom16;
1982		s_pBot[8] = &SSE2_MultiplyBottom32;
1983
1984		s_pSqu[1] = &SSE2_Square4;
1985		s_pSqu[2] = &SSE2_Square8;
1986		s_pSqu[4] = &SSE2_Square16;
1987		s_pSqu[8] = &SSE2_Square32;
1988
1989		s_pTop[2] = &SSE2_MultiplyTop8;
1990		s_pTop[4] = &SSE2_MultiplyTop16;
1991		s_pTop[8] = &SSE2_MultiplyTop32;
1992	}
1993	else
1994#endif
1995	{
1996		s_pMul[1] = &Baseline_Multiply4;
1997		s_pMul[2] = &Baseline_Multiply8;
1998
1999		s_pBot[1] = &Baseline_MultiplyBottom4;
2000		s_pBot[2] = &Baseline_MultiplyBottom8;
2001
2002		s_pSqu[1] = &Baseline_Square4;
2003		s_pSqu[2] = &Baseline_Square8;
2004
2005		s_pTop[2] = &Baseline_MultiplyTop8;
2006
2007#if	!CRYPTOPP_INTEGER_SSE2
2008		s_pMul[4] = &Baseline_Multiply16;
2009		s_pBot[4] = &Baseline_MultiplyBottom16;
2010		s_pSqu[4] = &Baseline_Square16;
2011		s_pTop[4] = &Baseline_MultiplyTop16;
2012#endif
2013	}
2014}
2015
2016inline int Add(word *C, const word *A, const word *B, size_t N)
2017{
2018#if CRYPTOPP_INTEGER_SSE2
2019	return s_pAdd(N, C, A, B);
2020#else
2021	return Baseline_Add(N, C, A, B);
2022#endif
2023}
2024
2025inline int Subtract(word *C, const word *A, const word *B, size_t N)
2026{
2027#if CRYPTOPP_INTEGER_SSE2
2028	return s_pSub(N, C, A, B);
2029#else
2030	return Baseline_Sub(N, C, A, B);
2031#endif
2032}
2033
2034// ********************************************************
2035
2036
2037#define A0		A
2038#define A1		(A+N2)
2039#define B0		B
2040#define B1		(B+N2)
2041
2042#define T0		T
2043#define T1		(T+N2)
2044#define T2		(T+N)
2045#define T3		(T+N+N2)
2046
2047#define R0		R
2048#define R1		(R+N2)
2049#define R2		(R+N)
2050#define R3		(R+N+N2)
2051
2052// R[2*N] - result = A*B
2053// T[2*N] - temporary work space
2054// A[N] --- multiplier
2055// B[N] --- multiplicant
2056
2057void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)
2058{
2059	assert(N>=2 && N%2==0);
2060
2061	if (N <= s_recursionLimit)
2062		s_pMul[N/4](R, A, B);
2063	else
2064	{
2065		const size_t N2 = N/2;
2066
2067		size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;
2068		Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
2069
2070		size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;
2071		Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
2072
2073		RecursiveMultiply(R2, T2, A1, B1, N2);
2074		RecursiveMultiply(T0, T2, R0, R1, N2);
2075		RecursiveMultiply(R0, T2, A0, B0, N2);
2076
2077		// now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
2078
2079		int c2 = Add(R2, R2, R1, N2);
2080		int c3 = c2;
2081		c2 += Add(R1, R2, R0, N2);
2082		c3 += Add(R2, R2, R3, N2);
2083
2084		if (AN2 == BN2)
2085			c3 -= Subtract(R1, R1, T0, N);
2086		else
2087			c3 += Add(R1, R1, T0, N);
2088
2089		c3 += Increment(R2, N2, c2);
2090		assert (c3 >= 0 && c3 <= 2);
2091		Increment(R3, N2, c3);
2092	}
2093}
2094
2095// R[2*N] - result = A*A
2096// T[2*N] - temporary work space
2097// A[N] --- number to be squared
2098
2099void RecursiveSquare(word *R, word *T, const word *A, size_t N)
2100{
2101	assert(N && N%2==0);
2102
2103	if (N <= s_recursionLimit)
2104		s_pSqu[N/4](R, A);
2105	else
2106	{
2107		const size_t N2 = N/2;
2108
2109		RecursiveSquare(R0, T2, A0, N2);
2110		RecursiveSquare(R2, T2, A1, N2);
2111		RecursiveMultiply(T0, T2, A0, A1, N2);
2112
2113		int carry = Add(R1, R1, T0, N);
2114		carry += Add(R1, R1, T0, N);
2115		Increment(R3, N2, carry);
2116	}
2117}
2118
2119// R[N] - bottom half of A*B
2120// T[3*N/2] - temporary work space
2121// A[N] - multiplier
2122// B[N] - multiplicant
2123
2124void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
2125{
2126	assert(N>=2 && N%2==0);
2127
2128	if (N <= s_recursionLimit)
2129		s_pBot[N/4](R, A, B);
2130	else
2131	{
2132		const size_t N2 = N/2;
2133
2134		RecursiveMultiply(R, T, A0, B0, N2);
2135		RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
2136		Add(R1, R1, T0, N2);
2137		RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
2138		Add(R1, R1, T0, N2);
2139	}
2140}
2141
2142// R[N] --- upper half of A*B
2143// T[2*N] - temporary work space
2144// L[N] --- lower half of A*B
2145// A[N] --- multiplier
2146// B[N] --- multiplicant
2147
2148void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
2149{
2150	assert(N>=2 && N%2==0);
2151
2152	if (N <= s_recursionLimit)
2153		s_pTop[N/4](R, A, B, L[N-1]);
2154	else
2155	{
2156		const size_t N2 = N/2;
2157
2158		size_t AN2 = Compare(A0, A1, N2) > 0 ?  0 : N2;
2159		Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
2160
2161		size_t BN2 = Compare(B0, B1, N2) > 0 ?  0 : N2;
2162		Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
2163
2164		RecursiveMultiply(T0, T2, R0, R1, N2);
2165		RecursiveMultiply(R0, T2, A1, B1, N2);
2166
2167		// now T[01] holds (A1-A0)*(B0-B1) = A1*B0+A0*B1-A1*B1-A0*B0, R[01] holds A1*B1
2168
2169		int t, c3;
2170		int c2 = Subtract(T2, L+N2, L, N2);
2171
2172		if (AN2 == BN2)
2173		{
2174			c2 -= Add(T2, T2, T0, N2);
2175			t = (Compare(T2, R0, N2) == -1);
2176			c3 = t - Subtract(T2, T2, T1, N2);
2177		}
2178		else
2179		{
2180			c2 += Subtract(T2, T2, T0, N2);
2181			t = (Compare(T2, R0, N2) == -1);
2182			c3 = t + Add(T2, T2, T1, N2);
2183		}
2184
2185		c2 += t;
2186		if (c2 >= 0)
2187			c3 += Increment(T2, N2, c2);
2188		else
2189			c3 -= Decrement(T2, N2, -c2);
2190		c3 += Add(R0, T2, R1, N2);
2191
2192		assert (c3 >= 0 && c3 <= 2);
2193		Increment(R1, N2, c3);
2194	}
2195}
2196
2197inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N)
2198{
2199	RecursiveMultiply(R, T, A, B, N);
2200}
2201
2202inline void Square(word *R, word *T, const word *A, size_t N)
2203{
2204	RecursiveSquare(R, T, A, N);
2205}
2206
2207inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
2208{
2209	RecursiveMultiplyBottom(R, T, A, B, N);
2210}
2211
2212// R[NA+NB] - result = A*B
2213// T[NA+NB] - temporary work space
2214// A[NA] ---- multiplier
2215// B[NB] ---- multiplicant
2216
2217void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word *B, size_t NB)
2218{
2219	if (NA == NB)
2220	{
2221		if (A == B)
2222			Square(R, T, A, NA);
2223		else
2224			Multiply(R, T, A, B, NA);
2225
2226		return;
2227	}
2228
2229	if (NA > NB)
2230	{
2231		std::swap(A, B);
2232		std::swap(NA, NB);
2233	}
2234
2235	assert(NB % NA == 0);
2236
2237	if (NA==2 && !A[1])
2238	{
2239		switch (A[0])
2240		{
2241		case 0:
2242			SetWords(R, 0, NB+2);
2243			return;
2244		case 1:
2245			CopyWords(R, B, NB);
2246			R[NB] = R[NB+1] = 0;
2247			return;
2248		default:
2249			R[NB] = LinearMultiply(R, B, A[0], NB);
2250			R[NB+1] = 0;
2251			return;
2252		}
2253	}
2254
2255	size_t i;
2256	if ((NB/NA)%2 == 0)
2257	{
2258		Multiply(R, T, A, B, NA);
2259		CopyWords(T+2*NA, R+NA, NA);
2260
2261		for (i=2*NA; i<NB; i+=2*NA)
2262			Multiply(T+NA+i, T, A, B+i, NA);
2263		for (i=NA; i<NB; i+=2*NA)
2264			Multiply(R+i, T, A, B+i, NA);
2265	}
2266	else
2267	{
2268		for (i=0; i<NB; i+=2*NA)
2269			Multiply(R+i, T, A, B+i, NA);
2270		for (i=NA; i<NB; i+=2*NA)
2271			Multiply(T+NA+i, T, A, B+i, NA);
2272	}
2273
2274	if (Add(R+NA, R+NA, T+2*NA, NB-NA))
2275		Increment(R+NB, NA);
2276}
2277
2278// R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
2279// T[3*N/2] - temporary work space
2280// A[N] ----- an odd number as input
2281
2282void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N)
2283{
2284	if (N==2)
2285	{
2286		T[0] = AtomicInverseModPower2(A[0]);
2287		T[1] = 0;
2288		s_pBot[0](T+2, T, A);
2289		TwosComplement(T+2, 2);
2290		Increment(T+2, 2, 2);
2291		s_pBot[0](R, T, T+2);
2292	}
2293	else
2294	{
2295		const size_t N2 = N/2;
2296		RecursiveInverseModPower2(R0, T0, A0, N2);
2297		T0[0] = 1;
2298		SetWords(T0+1, 0, N2-1);
2299		MultiplyTop(R1, T1, T0, R0, A0, N2);
2300		MultiplyBottom(T0, T1, R0, A1, N2);
2301		Add(T0, R1, T0, N2);
2302		TwosComplement(T0, N2);
2303		MultiplyBottom(R1, T1, R0, T0, N2);
2304	}
2305}
2306
2307// R[N] --- result = X/(2**(WORD_BITS*N)) mod M
2308// T[3*N] - temporary work space
2309// X[2*N] - number to be reduced
2310// M[N] --- modulus
2311// U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
2312
2313void MontgomeryReduce(word *R, word *T, word *X, const word *M, const word *U, size_t N)
2314{
2315#if 1
2316	MultiplyBottom(R, T, X, U, N);
2317	MultiplyTop(T, T+N, X, R, M, N);
2318	word borrow = Subtract(T, X+N, T, N);
2319	// defend against timing attack by doing this Add even when not needed
2320	word carry = Add(T+N, T, M, N);
2321	assert(carry | !borrow);
2322	CopyWords(R, T + ((0-borrow) & N), N);
2323#elif 0
2324	const word u = 0-U[0];
2325	Declare2Words(p)
2326	for (size_t i=0; i<N; i++)
2327	{
2328		const word t = u * X[i];
2329		word c = 0;
2330		for (size_t j=0; j<N; j+=2)
2331		{
2332			MultiplyWords(p, t, M[j]);
2333			Acc2WordsBy1(p, X[i+j]);
2334			Acc2WordsBy1(p, c);
2335			X[i+j] = LowWord(p);
2336			c = HighWord(p);
2337			MultiplyWords(p, t, M[j+1]);
2338			Acc2WordsBy1(p, X[i+j+1]);
2339			Acc2WordsBy1(p, c);
2340			X[i+j+1] = LowWord(p);
2341			c = HighWord(p);
2342		}
2343
2344		if (Increment(X+N+i, N-i, c))
2345			while (!Subtract(X+N, X+N, M, N)) {}
2346	}
2347
2348	memcpy(R, X+N, N*WORD_SIZE);
2349#else
2350	__m64 u = _mm_cvtsi32_si64(0-U[0]), p;
2351	for (size_t i=0; i<N; i++)
2352	{
2353		__m64 t = _mm_cvtsi32_si64(X[i]);
2354		t = _mm_mul_su32(t, u);
2355		__m64 c = _mm_setzero_si64();
2356		for (size_t j=0; j<N; j+=2)
2357		{
2358			p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j]));
2359			p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j]));
2360			c = _mm_add_si64(c, p);
2361			X[i+j] = _mm_cvtsi64_si32(c);
2362			c = _mm_srli_si64(c, 32);
2363			p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j+1]));
2364			p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j+1]));
2365			c = _mm_add_si64(c, p);
2366			X[i+j+1] = _mm_cvtsi64_si32(c);
2367			c = _mm_srli_si64(c, 32);
2368		}
2369
2370		if (Increment(X+N+i, N-i, _mm_cvtsi64_si32(c)))
2371			while (!Subtract(X+N, X+N, M, N)) {}
2372	}
2373
2374	memcpy(R, X+N, N*WORD_SIZE);
2375	_mm_empty();
2376#endif
2377}
2378
2379// R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M
2380// T[2*N] - temporary work space
2381// X[2*N] - number to be reduced
2382// M[N] --- modulus
2383// U[N/2] - multiplicative inverse of M mod 2**(WORD_BITS*N/2)
2384// V[N] --- 2**(WORD_BITS*3*N/2) mod M
2385
2386void HalfMontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, const word *V, size_t N)
2387{
2388	assert(N%2==0 && N>=4);
2389
2390#define M0		M
2391#define M1		(M+N2)
2392#define V0		V
2393#define V1		(V+N2)
2394
2395#define X0		X
2396#define X1		(X+N2)
2397#define X2		(X+N)
2398#define X3		(X+N+N2)
2399
2400	const size_t N2 = N/2;
2401	Multiply(T0, T2, V0, X3, N2);
2402	int c2 = Add(T0, T0, X0, N);
2403	MultiplyBottom(T3, T2, T0, U, N2);
2404	MultiplyTop(T2, R, T0, T3, M0, N2);
2405	c2 -= Subtract(T2, T1, T2, N2);
2406	Multiply(T0, R, T3, M1, N2);
2407	c2 -= Subtract(T0, T2, T0, N2);
2408	int c3 = -(int)Subtract(T1, X2, T1, N2);
2409	Multiply(R0, T2, V1, X3, N2);
2410	c3 += Add(R, R, T, N);
2411
2412	if (c2>0)
2413		c3 += Increment(R1, N2);
2414	else if (c2<0)
2415		c3 -= Decrement(R1, N2, -c2);
2416
2417	assert(c3>=-1 && c3<=1);
2418	if (c3>0)
2419		Subtract(R, R, M, N);
2420	else if (c3<0)
2421		Add(R, R, M, N);
2422
2423#undef M0
2424#undef M1
2425#undef V0
2426#undef V1
2427
2428#undef X0
2429#undef X1
2430#undef X2
2431#undef X3
2432}
2433
2434#undef A0
2435#undef A1
2436#undef B0
2437#undef B1
2438
2439#undef T0
2440#undef T1
2441#undef T2
2442#undef T3
2443
2444#undef R0
2445#undef R1
2446#undef R2
2447#undef R3
2448
2449/*
2450// do a 3 word by 2 word divide, returns quotient and leaves remainder in A
2451static word SubatomicDivide(word *A, word B0, word B1)
2452{
2453	// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
2454	assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
2455
2456	// estimate the quotient: do a 2 word by 1 word divide
2457	word Q;
2458	if (B1+1 == 0)
2459		Q = A[2];
2460	else
2461		Q = DWord(A[1], A[2]).DividedBy(B1+1);
2462
2463	// now subtract Q*B from A
2464	DWord p = DWord::Multiply(B0, Q);
2465	DWord u = (DWord) A[0] - p.GetLowHalf();
2466	A[0] = u.GetLowHalf();
2467	u = (DWord) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - DWord::Multiply(B1, Q);
2468	A[1] = u.GetLowHalf();
2469	A[2] += u.GetHighHalf();
2470
2471	// Q <= actual quotient, so fix it
2472	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
2473	{
2474		u = (DWord) A[0] - B0;
2475		A[0] = u.GetLowHalf();
2476		u = (DWord) A[1] - B1 - u.GetHighHalfAsBorrow();
2477		A[1] = u.GetLowHalf();
2478		A[2] += u.GetHighHalf();
2479		Q++;
2480		assert(Q);	// shouldn't overflow
2481	}
2482
2483	return Q;
2484}
2485
2486// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
2487static inline void AtomicDivide(word *Q, const word *A, const word *B)
2488{
2489	if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
2490	{
2491		Q[0] = A[2];
2492		Q[1] = A[3];
2493	}
2494	else
2495	{
2496		word T[4];
2497		T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
2498		Q[1] = SubatomicDivide(T+1, B[0], B[1]);
2499		Q[0] = SubatomicDivide(T, B[0], B[1]);
2500
2501#ifndef NDEBUG
2502		// multiply quotient and divisor and add remainder, make sure it equals dividend
2503		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
2504		word P[4];
2505		LowLevel::Multiply2(P, Q, B);
2506		Add(P, P, T, 4);
2507		assert(memcmp(P, A, 4*WORD_SIZE)==0);
2508#endif
2509	}
2510}
2511*/
2512
2513static inline void AtomicDivide(word *Q, const word *A, const word *B)
2514{
2515	word T[4];
2516	DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1]));
2517	Q[0] = q.GetLowHalf();
2518	Q[1] = q.GetHighHalf();
2519
2520#ifndef NDEBUG
2521	if (B[0] || B[1])
2522	{
2523		// multiply quotient and divisor and add remainder, make sure it equals dividend
2524		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
2525		word P[4];
2526		s_pMul[0](P, Q, B);
2527		Add(P, P, T, 4);
2528		assert(memcmp(P, A, 4*WORD_SIZE)==0);
2529	}
2530#endif
2531}
2532
2533// for use by Divide(), corrects the underestimated quotient {Q1,Q0}
2534static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, size_t N)
2535{
2536	assert(N && N%2==0);
2537
2538	AsymmetricMultiply(T, T+N+2, Q, 2, B, N);
2539
2540	word borrow = Subtract(R, R, T, N+2);
2541	assert(!borrow && !R[N+1]);
2542
2543	while (R[N] || Compare(R, B, N) >= 0)
2544	{
2545		R[N] -= Subtract(R, R, B, N);
2546		Q[1] += (++Q[0]==0);
2547		assert(Q[0] || Q[1]); // no overflow
2548	}
2549}
2550
2551// R[NB] -------- remainder = A%B
2552// Q[NA-NB+2] --- quotient	= A/B
2553// T[NA+3*(NB+2)] - temp work space
2554// A[NA] -------- dividend
2555// B[NB] -------- divisor
2556
2557void Divide(word *R, word *Q, word *T, const word *A, size_t NA, const word *B, size_t NB)
2558{
2559	assert(NA && NB && NA%2==0 && NB%2==0);
2560	assert(B[NB-1] || B[NB-2]);
2561	assert(NB <= NA);
2562
2563	// set up temporary work space
2564	word *const TA=T;
2565	word *const TB=T+NA+2;
2566	word *const TP=T+NA+2+NB;
2567
2568	// copy B into TB and normalize it so that TB has highest bit set to 1
2569	unsigned shiftWords = (B[NB-1]==0);
2570	TB[0] = TB[NB-1] = 0;
2571	CopyWords(TB+shiftWords, B, NB-shiftWords);
2572	unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
2573	assert(shiftBits < WORD_BITS);
2574	ShiftWordsLeftByBits(TB, NB, shiftBits);
2575
2576	// copy A into TA and normalize it
2577	TA[0] = TA[NA] = TA[NA+1] = 0;
2578	CopyWords(TA+shiftWords, A, NA);
2579	ShiftWordsLeftByBits(TA, NA+2, shiftBits);
2580
2581	if (TA[NA+1]==0 && TA[NA] <= 1)
2582	{
2583		Q[NA-NB+1] = Q[NA-NB] = 0;
2584		while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
2585		{
2586			TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
2587			++Q[NA-NB];
2588		}
2589	}
2590	else
2591	{
2592		NA+=2;
2593		assert(Compare(TA+NA-NB, TB, NB) < 0);
2594	}
2595
2596	word BT[2];
2597	BT[0] = TB[NB-2] + 1;
2598	BT[1] = TB[NB-1] + (BT[0]==0);
2599
2600	// start reducing TA mod TB, 2 words at a time
2601	for (size_t i=NA-2; i>=NB; i-=2)
2602	{
2603		AtomicDivide(Q+i-NB, TA+i-2, BT);
2604		CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
2605	}
2606
2607	// copy TA into R, and denormalize it
2608	CopyWords(R, TA+shiftWords, NB);
2609	ShiftWordsRightByBits(R, NB, shiftBits);
2610}
2611
2612static inline size_t EvenWordCount(const word *X, size_t N)
2613{
2614	while (N && X[N-2]==0 && X[N-1]==0)
2615		N-=2;
2616	return N;
2617}
2618
2619// return k
2620// R[N] --- result = A^(-1) * 2^k mod M
2621// T[4*N] - temporary work space
2622// A[NA] -- number to take inverse of
2623// M[N] --- modulus
2624
2625unsigned int AlmostInverse(word *R, word *T, const word *A, size_t NA, const word *M, size_t N)
2626{
2627	assert(NA<=N && N && N%2==0);
2628
2629	word *b = T;
2630	word *c = T+N;
2631	word *f = T+2*N;
2632	word *g = T+3*N;
2633	size_t bcLen=2, fgLen=EvenWordCount(M, N);
2634	unsigned int k=0, s=0;
2635
2636	SetWords(T, 0, 3*N);
2637	b[0]=1;
2638	CopyWords(f, A, NA);
2639	CopyWords(g, M, N);
2640
2641	while (1)
2642	{
2643		word t=f[0];
2644		while (!t)
2645		{
2646			if (EvenWordCount(f, fgLen)==0)
2647			{
2648				SetWords(R, 0, N);
2649				return 0;
2650			}
2651
2652			ShiftWordsRightByWords(f, fgLen, 1);
2653			if (c[bcLen-1]) bcLen+=2;
2654			assert(bcLen <= N);
2655			ShiftWordsLeftByWords(c, bcLen, 1);
2656			k+=WORD_BITS;
2657			t=f[0];
2658		}
2659
2660		unsigned int i=0;
2661		while (t%2 == 0)
2662		{
2663			t>>=1;
2664			i++;
2665		}
2666		k+=i;
2667
2668		if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
2669		{
2670			if (s%2==0)
2671				CopyWords(R, b, N);
2672			else
2673				Subtract(R, M, b, N);
2674			return k;
2675		}
2676
2677		ShiftWordsRightByBits(f, fgLen, i);
2678		t=ShiftWordsLeftByBits(c, bcLen, i);
2679		if (t)
2680		{
2681			c[bcLen] = t;
2682			bcLen+=2;
2683			assert(bcLen <= N);
2684		}
2685
2686		if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
2687			fgLen-=2;
2688
2689		if (Compare(f, g, fgLen)==-1)
2690		{
2691			std::swap(f, g);
2692			std::swap(b, c);
2693			s++;
2694		}
2695
2696		Subtract(f, f, g, fgLen);
2697
2698		if (Add(b, b, c, bcLen))
2699		{
2700			b[bcLen] = 1;
2701			bcLen+=2;
2702			assert(bcLen <= N);
2703		}
2704	}
2705}
2706
2707// R[N] - result = A/(2^k) mod M
2708// A[N] - input
2709// M[N] - modulus
2710
2711void DivideByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
2712{
2713	CopyWords(R, A, N);
2714
2715	while (k--)
2716	{
2717		if (R[0]%2==0)
2718			ShiftWordsRightByBits(R, N, 1);
2719		else
2720		{
2721			word carry = Add(R, R, M, N);
2722			ShiftWordsRightByBits(R, N, 1);
2723			R[N-1] += carry<<(WORD_BITS-1);
2724		}
2725	}
2726}
2727
2728// R[N] - result = A*(2^k) mod M
2729// A[N] - input
2730// M[N] - modulus
2731
2732void MultiplyByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
2733{
2734	CopyWords(R, A, N);
2735
2736	while (k--)
2737		if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
2738			Subtract(R, R, M, N);
2739}
2740
2741// ******************************************************************
2742
2743InitializeInteger::InitializeInteger()
2744{
2745	if (!g_pAssignIntToInteger)
2746	{
2747		SetFunctionPointers();
2748		g_pAssignIntToInteger = AssignIntToInteger;
2749	}
2750}
2751
2752static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
2753
2754static inline size_t RoundupSize(size_t n)
2755{
2756	if (n<=8)
2757		return RoundupSizeTable[n];
2758	else if (n<=16)
2759		return 16;
2760	else if (n<=32)
2761		return 32;
2762	else if (n<=64)
2763		return 64;
2764	else return size_t(1) << BitPrecision(n-1);
2765}
2766
2767Integer::Integer()
2768	: reg(2), sign(POSITIVE)
2769{
2770	reg[0] = reg[1] = 0;
2771}
2772
2773Integer::Integer(const Integer& t)
2774	: reg(RoundupSize(t.WordCount())), sign(t.sign)
2775{
2776	CopyWords(reg, t.reg, reg.size());
2777}
2778
2779Integer::Integer(Sign s, lword value)
2780	: reg(2), sign(s)
2781{
2782	reg[0] = word(value);
2783	reg[1] = word(SafeRightShift<WORD_BITS>(value));
2784}
2785
2786Integer::Integer(signed long value)
2787	: reg(2)
2788{
2789	if (value >= 0)
2790		sign = POSITIVE;
2791	else
2792	{
2793		sign = NEGATIVE;
2794		value = -value;
2795	}
2796	reg[0] = word(value);
2797	reg[1] = word(SafeRightShift<WORD_BITS>((unsigned long)value));
2798}
2799
2800Integer::Integer(Sign s, word high, word low)
2801	: reg(2), sign(s)
2802{
2803	reg[0] = low;
2804	reg[1] = high;
2805}
2806
2807bool Integer::IsConvertableToLong() const
2808{
2809	if (ByteCount() > sizeof(long))
2810		return false;
2811
2812	unsigned long value = (unsigned long)reg[0];
2813	value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
2814
2815	if (sign==POSITIVE)
2816		return (signed long)value >= 0;
2817	else
2818		return -(signed long)value < 0;
2819}
2820
2821signed long Integer::ConvertToLong() const
2822{
2823	assert(IsConvertableToLong());
2824
2825	unsigned long value = (unsigned long)reg[0];
2826	value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
2827	return sign==POSITIVE ? value : -(signed long)value;
2828}
2829
2830Integer::Integer(BufferedTransformation &encodedInteger, size_t byteCount, Signedness s)
2831{
2832	Decode(encodedInteger, byteCount, s);
2833}
2834
2835Integer::Integer(const byte *encodedInteger, size_t byteCount, Signedness s)
2836{
2837	Decode(encodedInteger, byteCount, s);
2838}
2839
2840Integer::Integer(BufferedTransformation &bt)
2841{
2842	BERDecode(bt);
2843}
2844
2845Integer::Integer(RandomNumberGenerator &rng, size_t bitcount)
2846{
2847	Randomize(rng, bitcount);
2848}
2849
2850Integer::Integer(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
2851{
2852	if (!Randomize(rng, min, max, rnType, equiv, mod))
2853		throw Integer::RandomNumberNotFound();
2854}
2855
2856Integer Integer::Power2(size_t e)
2857{
2858	Integer r((word)0, BitsToWords(e+1));
2859	r.SetBit(e);
2860	return r;
2861}
2862
2863template <long i>
2864struct NewInteger
2865{
2866	Integer * operator()() const
2867	{
2868		return new Integer(i);
2869	}
2870};
2871
2872const Integer &Integer::Zero()
2873{
2874	return Singleton<Integer>().Ref();
2875}
2876
2877const Integer &Integer::One()
2878{
2879	return Singleton<Integer, NewInteger<1> >().Ref();
2880}
2881
2882const Integer &Integer::Two()
2883{
2884	return Singleton<Integer, NewInteger<2> >().Ref();
2885}
2886
2887bool Integer::operator!() const
2888{
2889	return IsNegative() ? false : (reg[0]==0 && WordCount()==0);
2890}
2891
2892Integer& Integer::operator=(const Integer& t)
2893{
2894	if (this != &t)
2895	{
2896		if (reg.size() != t.reg.size() || t.reg[t.reg.size()/2] == 0)
2897			reg.New(RoundupSize(t.WordCount()));
2898		CopyWords(reg, t.reg, reg.size());
2899		sign = t.sign;
2900	}
2901	return *this;
2902}
2903
2904bool Integer::GetBit(size_t n) const
2905{
2906	if (n/WORD_BITS >= reg.size())
2907		return 0;
2908	else
2909		return bool((reg[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
2910}
2911
2912void Integer::SetBit(size_t n, bool value)
2913{
2914	if (value)
2915	{
2916		reg.CleanGrow(RoundupSize(BitsToWords(n+1)));
2917		reg[n/WORD_BITS] |= (word(1) << (n%WORD_BITS));
2918	}
2919	else
2920	{
2921		if (n/WORD_BITS < reg.size())
2922			reg[n/WORD_BITS] &= ~(word(1) << (n%WORD_BITS));
2923	}
2924}
2925
2926byte Integer::GetByte(size_t n) const
2927{
2928	if (n/WORD_SIZE >= reg.size())
2929		return 0;
2930	else
2931		return byte(reg[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
2932}
2933
2934void Integer::SetByte(size_t n, byte value)
2935{
2936	reg.CleanGrow(RoundupSize(BytesToWords(n+1)));
2937	reg[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
2938	reg[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
2939}
2940
2941lword Integer::GetBits(size_t i, size_t n) const
2942{
2943	lword v = 0;
2944	assert(n <= sizeof(v)*8);
2945	for (unsigned int j=0; j<n; j++)
2946		v |= lword(GetBit(i+j)) << j;
2947	return v;
2948}
2949
2950Integer Integer::operator-() const
2951{
2952	Integer result(*this);
2953	result.Negate();
2954	return result;
2955}
2956
2957Integer Integer::AbsoluteValue() const
2958{
2959	Integer result(*this);
2960	result.sign = POSITIVE;
2961	return result;
2962}
2963
2964void Integer::swap(Integer &a)
2965{
2966	reg.swap(a.reg);
2967	std::swap(sign, a.sign);
2968}
2969
2970Integer::Integer(word value, size_t length)
2971	: reg(RoundupSize(length)), sign(POSITIVE)
2972{
2973	reg[0] = value;
2974	SetWords(reg+1, 0, reg.size()-1);
2975}
2976
2977template <class T>
2978static Integer StringToInteger(const T *str)
2979{
2980	int radix;
2981	// GCC workaround
2982	// std::char_traits<wchar_t>::length() not defined in GCC 3.2 and STLport 4.5.3
2983	unsigned int length;
2984	for (length = 0; str[length] != 0; length++) {}
2985
2986	Integer v;
2987
2988	if (length == 0)
2989		return v;
2990
2991	switch (str[length-1])
2992	{
2993	case 'h':
2994	case 'H':
2995		radix=16;
2996		break;
2997	case 'o':
2998	case 'O':
2999		radix=8;
3000		break;
3001	case 'b':
3002	case 'B':
3003		radix=2;
3004		break;
3005	default:
3006		radix=10;
3007	}
3008
3009	if (length > 2 && str[0] == '0' && str[1] == 'x')
3010		radix = 16;
3011
3012	for (unsigned i=0; i<length; i++)
3013	{
3014		int digit;
3015
3016		if (str[i] >= '0' && str[i] <= '9')
3017			digit = str[i] - '0';
3018		else if (str[i] >= 'A' && str[i] <= 'F')
3019			digit = str[i] - 'A' + 10;
3020		else if (str[i] >= 'a' && str[i] <= 'f')
3021			digit = str[i] - 'a' + 10;
3022		else
3023			digit = radix;
3024
3025		if (digit < radix)
3026		{
3027			v *= radix;
3028			v += digit;
3029		}
3030	}
3031
3032	if (str[0] == '-')
3033		v.Negate();
3034
3035	return v;
3036}
3037
3038Integer::Integer(const char *str)
3039	: reg(2), sign(POSITIVE)
3040{
3041	*this = StringToInteger(str);
3042}
3043
3044Integer::Integer(const wchar_t *str)
3045	: reg(2), sign(POSITIVE)
3046{
3047	*this = StringToInteger(str);
3048}
3049
3050unsigned int Integer::WordCount() const
3051{
3052	return (unsigned int)CountWords(reg, reg.size());
3053}
3054
3055unsigned int Integer::ByteCount() const
3056{
3057	unsigned wordCount = WordCount();
3058	if (wordCount)
3059		return (wordCount-1)*WORD_SIZE + BytePrecision(reg[wordCount-1]);
3060	else
3061		return 0;
3062}
3063
3064unsigned int Integer::BitCount() const
3065{
3066	unsigned wordCount = WordCount();
3067	if (wordCount)
3068		return (wordCount-1)*WORD_BITS + BitPrecision(reg[wordCount-1]);
3069	else
3070		return 0;
3071}
3072
3073void Integer::Decode(const byte *input, size_t inputLen, Signedness s)
3074{
3075	StringStore store(input, inputLen);
3076	Decode(store, inputLen, s);
3077}
3078
3079void Integer::Decode(BufferedTransformation &bt, size_t inputLen, Signedness s)
3080{
3081	assert(bt.MaxRetrievable() >= inputLen);
3082
3083	byte b;
3084	bt.Peek(b);
3085	sign = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
3086
3087	while (inputLen>0 && (sign==POSITIVE ? b==0 : b==0xff))
3088	{
3089		bt.Skip(1);
3090		inputLen--;
3091		bt.Peek(b);
3092	}
3093
3094	reg.CleanNew(RoundupSize(BytesToWords(inputLen)));
3095
3096	for (size_t i=inputLen; i > 0; i--)
3097	{
3098		bt.Get(b);
3099		reg[(i-1)/WORD_SIZE] |= word(b) << ((i-1)%WORD_SIZE)*8;
3100	}
3101
3102	if (sign == NEGATIVE)
3103	{
3104		for (size_t i=inputLen; i<reg.size()*WORD_SIZE; i++)
3105			reg[i/WORD_SIZE] |= word(0xff) << (i%WORD_SIZE)*8;
3106		TwosComplement(reg, reg.size());
3107	}
3108}
3109
3110size_t Integer::MinEncodedSize(Signedness signedness) const
3111{
3112	unsigned int outputLen = STDMAX(1U, ByteCount());
3113	if (signedness == UNSIGNED)
3114		return outputLen;
3115	if (NotNegative() && (GetByte(outputLen-1) & 0x80))
3116		outputLen++;
3117	if (IsNegative() && *this < -Power2(outputLen*8-1))
3118		outputLen++;
3119	return outputLen;
3120}
3121
3122void Integer::Encode(byte *output, size_t outputLen, Signedness signedness) const
3123{
3124	ArraySink sink(output, outputLen);
3125	Encode(sink, outputLen, signedness);
3126}
3127
3128void Integer::Encode(BufferedTransformation &bt, size_t outputLen, Signedness signedness) const
3129{
3130	if (signedness == UNSIGNED || NotNegative())
3131	{
3132		for (size_t i=outputLen; i > 0; i--)
3133			bt.Put(GetByte(i-1));
3134	}
3135	else
3136	{
3137		// take two's complement of *this
3138		Integer temp = Integer::Power2(8*STDMAX((size_t)ByteCount(), outputLen)) + *this;
3139		temp.Encode(bt, outputLen, UNSIGNED);
3140	}
3141}
3142
3143void Integer::DEREncode(BufferedTransformation &bt) const
3144{
3145	DERGeneralEncoder enc(bt, INTEGER);
3146	Encode(enc, MinEncodedSize(SIGNED), SIGNED);
3147	enc.MessageEnd();
3148}
3149
3150void Integer::BERDecode(const byte *input, size_t len)
3151{
3152	StringStore store(input, len);
3153	BERDecode(store);
3154}
3155
3156void Integer::BERDecode(BufferedTransformation &bt)
3157{
3158	BERGeneralDecoder dec(bt, INTEGER);
3159	if (!dec.IsDefiniteLength() || dec.MaxRetrievable() < dec.RemainingLength())
3160		BERDecodeError();
3161	Decode(dec, (size_t)dec.RemainingLength(), SIGNED);
3162	dec.MessageEnd();
3163}
3164
3165void Integer::DEREncodeAsOctetString(BufferedTransformation &bt, size_t length) const
3166{
3167	DERGeneralEncoder enc(bt, OCTET_STRING);
3168	Encode(enc, length);
3169	enc.MessageEnd();
3170}
3171
3172void Integer::BERDecodeAsOctetString(BufferedTransformation &bt, size_t length)
3173{
3174	BERGeneralDecoder dec(bt, OCTET_STRING);
3175	if (!dec.IsDefiniteLength() || dec.RemainingLength() != length)
3176		BERDecodeError();
3177	Decode(dec, length);
3178	dec.MessageEnd();
3179}
3180
3181size_t Integer::OpenPGPEncode(byte *output, size_t len) const
3182{
3183	ArraySink sink(output, len);
3184	return OpenPGPEncode(sink);
3185}
3186
3187size_t Integer::OpenPGPEncode(BufferedTransformation &bt) const
3188{
3189	word16 bitCount = BitCount();
3190	bt.PutWord16(bitCount);
3191	size_t byteCount = BitsToBytes(bitCount);
3192	Encode(bt, byteCount);
3193	return 2 + byteCount;
3194}
3195
3196void Integer::OpenPGPDecode(const byte *input, size_t len)
3197{
3198	StringStore store(input, len);
3199	OpenPGPDecode(store);
3200}
3201
3202void Integer::OpenPGPDecode(BufferedTransformation &bt)
3203{
3204	word16 bitCount;
3205	if (bt.GetWord16(bitCount) != 2 || bt.MaxRetrievable() < BitsToBytes(bitCount))
3206		throw OpenPGPDecodeErr();
3207	Decode(bt, BitsToBytes(bitCount));
3208}
3209
3210void Integer::Randomize(RandomNumberGenerator &rng, size_t nbits)
3211{
3212	const size_t nbytes = nbits/8 + 1;
3213	SecByteBlock buf(nbytes);
3214	rng.GenerateBlock(buf, nbytes);
3215	if (nbytes)
3216		buf[0] = (byte)Crop(buf[0], nbits % 8);
3217	Decode(buf, nbytes, UNSIGNED);
3218}
3219
3220void Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max)
3221{
3222	if (min > max)
3223		throw InvalidArgument("Integer: Min must be no greater than Max");
3224
3225	Integer range = max - min;
3226	const unsigned int nbits = range.BitCount();
3227
3228	do
3229	{
3230		Randomize(rng, nbits);
3231	}
3232	while (*this > range);
3233
3234	*this += min;
3235}
3236
3237bool Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
3238{
3239	return GenerateRandomNoThrow(rng, MakeParameters("Min", min)("Max", max)("RandomNumberType", rnType)("EquivalentTo", equiv)("Mod", mod));
3240}
3241
3242class KDF2_RNG : public RandomNumberGenerator
3243{
3244public:
3245	KDF2_RNG(const byte *seed, size_t seedSize)
3246		: m_counter(0), m_counterAndSeed(seedSize + 4)
3247	{
3248		memcpy(m_counterAndSeed + 4, seed, seedSize);
3249	}
3250
3251	void GenerateBlock(byte *output, size_t size)
3252	{
3253		PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
3254		++m_counter;
3255		P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULL, 0);
3256	}
3257
3258private:
3259	word32 m_counter;
3260	SecByteBlock m_counterAndSeed;
3261};
3262
3263bool Integer::GenerateRandomNoThrow(RandomNumberGenerator &i_rng, const NameValuePairs &params)
3264{
3265	Integer min = params.GetValueWithDefault("Min", Integer::Zero());
3266	Integer max;
3267	if (!params.GetValue("Max", max))
3268	{
3269		int bitLength;
3270		if (params.GetIntValue("BitLength", bitLength))
3271			max = Integer::Power2(bitLength);
3272		else
3273			throw InvalidArgument("Integer: missing Max argument");
3274	}
3275	if (min > max)
3276		throw InvalidArgument("Integer: Min must be no greater than Max");
3277
3278	Integer equiv = params.GetValueWithDefault("EquivalentTo", Integer::Zero());
3279	Integer mod = params.GetValueWithDefault("Mod", Integer::One());
3280
3281	if (equiv.IsNegative() || equiv >= mod)
3282		throw InvalidArgument("Integer: invalid EquivalentTo and/or Mod argument");
3283
3284	Integer::RandomNumberType rnType = params.GetValueWithDefault("RandomNumberType", Integer::ANY);
3285
3286	member_ptr<KDF2_RNG> kdf2Rng;
3287	ConstByteArrayParameter seed;
3288	if (params.GetValue(Name::Seed(), seed))
3289	{
3290		ByteQueue bq;
3291		DERSequenceEncoder seq(bq);
3292		min.DEREncode(seq);
3293		max.DEREncode(seq);
3294		equiv.DEREncode(seq);
3295		mod.DEREncode(seq);
3296		DEREncodeUnsigned(seq, rnType);
3297		DEREncodeOctetString(seq, seed.begin(), seed.size());
3298		seq.MessageEnd();
3299
3300		SecByteBlock finalSeed((size_t)bq.MaxRetrievable());
3301		bq.Get(finalSeed, finalSeed.size());
3302		kdf2Rng.reset(new KDF2_RNG(finalSeed.begin(), finalSeed.size()));
3303	}
3304	RandomNumberGenerator &rng = kdf2Rng.get() ? (RandomNumberGenerator &)*kdf2Rng : i_rng;
3305
3306	switch (rnType)
3307	{
3308		case ANY:
3309			if (mod == One())
3310				Randomize(rng, min, max);
3311			else
3312			{
3313				Integer min1 = min + (equiv-min)%mod;
3314				if (max < min1)
3315					return false;
3316				Randomize(rng, Zero(), (max - min1) / mod);
3317				*this *= mod;
3318				*this += min1;
3319			}
3320			return true;
3321
3322		case PRIME:
3323		{
3324			const PrimeSelector *pSelector = params.GetValueWithDefault(Name::PointerToPrimeSelector(), (const PrimeSelector *)NULL);
3325
3326			int i;
3327			i = 0;
3328			while (1)
3329			{
3330				if (++i==16)
3331				{
3332					// check if there are any suitable primes in [min, max]
3333					Integer first = min;
3334					if (FirstPrime(first, max, equiv, mod, pSelector))
3335					{
3336						// if there is only one suitable prime, we're done
3337						*this = first;
3338						if (!FirstPrime(first, max, equiv, mod, pSelector))
3339							return true;
3340					}
3341					else
3342						return false;
3343				}
3344
3345				Randomize(rng, min, max);
3346				if (FirstPrime(*this, STDMIN(*this+mod*PrimeSearchInterval(max), max), equiv, mod, pSelector))
3347					return true;
3348			}
3349		}
3350
3351		default:
3352			throw InvalidArgument("Integer: invalid RandomNumberType argument");
3353	}
3354}
3355
3356std::istream& operator>>(std::istream& in, Integer &a)
3357{
3358	char c;
3359	unsigned int length = 0;
3360	SecBlock<char> str(length + 16);
3361
3362	std::ws(in);
3363
3364	do
3365	{
3366		in.read(&c, 1);
3367		str[length++] = c;
3368		if (length >= str.size())
3369			str.Grow(length + 16);
3370	}
3371	while (in && (c=='-' || c=='x' || (c>='0' && c<='9') || (c>='a' && c<='f') || (c>='A' && c<='F') || c=='h' || c=='H' || c=='o' || c=='O' || c==',' || c=='.'));
3372
3373	if (in.gcount())
3374		in.putback(c);
3375	str[length-1] = '\0';
3376	a = Integer(str);
3377
3378	return in;
3379}
3380
3381std::ostream& operator<<(std::ostream& out, const Integer &a)
3382{
3383	// Get relevant conversion specifications from ostream.
3384	long f = out.flags() & std::ios::basefield; // Get base digits.
3385	int base, block;
3386	char suffix;
3387	switch(f)
3388	{
3389	case std::ios::oct :
3390		base = 8;
3391		block = 8;
3392		suffix = 'o';
3393		break;
3394	case std::ios::hex :
3395		base = 16;
3396		block = 4;
3397		suffix = 'h';
3398		break;
3399	default :
3400		base = 10;
3401		block = 3;
3402		suffix = '.';
3403	}
3404
3405	Integer temp1=a, temp2;
3406
3407	if (a.IsNegative())
3408	{
3409		out << '-';
3410		temp1.Negate();
3411	}
3412
3413	if (!a)
3414		out << '0';
3415
3416	static const char upper[]="0123456789ABCDEF";
3417	static const char lower[]="0123456789abcdef";
3418
3419	const char* vec = (out.flags() & std::ios::uppercase) ? upper : lower;
3420	unsigned i=0;
3421	SecBlock<char> s(a.BitCount() / (BitPrecision(base)-1) + 1);
3422
3423	while (!!temp1)
3424	{
3425		word digit;
3426		Integer::Divide(digit, temp2, temp1, base);
3427		s[i++]=vec[digit];
3428		temp1.swap(temp2);
3429	}
3430
3431	while (i--)
3432	{
3433		out << s[i];
3434//		if (i && !(i%block))
3435//			out << ",";
3436	}
3437	return out << suffix;
3438}
3439
3440Integer& Integer::operator++()
3441{
3442	if (NotNegative())
3443	{
3444		if (Increment(reg, reg.size()))
3445		{
3446			reg.CleanGrow(2*reg.size());
3447			reg[reg.size()/2]=1;
3448		}
3449	}
3450	else
3451	{
3452		word borrow = Decrement(reg, reg.size());
3453		assert(!borrow);
3454		if (WordCount()==0)
3455			*this = Zero();
3456	}
3457	return *this;
3458}
3459
3460Integer& Integer::operator--()
3461{
3462	if (IsNegative())
3463	{
3464		if (Increment(reg, reg.size()))
3465		{
3466			reg.CleanGrow(2*reg.size());
3467			reg[reg.size()/2]=1;
3468		}
3469	}
3470	else
3471	{
3472		if (Decrement(reg, reg.size()))
3473			*this = -One();
3474	}
3475	return *this;
3476}
3477
3478void PositiveAdd(Integer &sum, const Integer &a, const Integer& b)
3479{
3480	int carry;
3481	if (a.reg.size() == b.reg.size())
3482		carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
3483	else if (a.reg.size() > b.reg.size())
3484	{
3485		carry = Add(sum.reg, a.reg, b.reg, b.reg.size());
3486		CopyWords(sum.reg+b.reg.size(), a.reg+b.reg.size(), a.reg.size()-b.reg.size());
3487		carry = Increment(sum.reg+b.reg.size(), a.reg.size()-b.reg.size(), carry);
3488	}
3489	else
3490	{
3491		carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
3492		CopyWords(sum.reg+a.reg.size(), b.reg+a.reg.size(), b.reg.size()-a.reg.size());
3493		carry = Increment(sum.reg+a.reg.size(), b.reg.size()-a.reg.size(), carry);
3494	}
3495
3496	if (carry)
3497	{
3498		sum.reg.CleanGrow(2*sum.reg.size());
3499		sum.reg[sum.reg.size()/2] = 1;
3500	}
3501	sum.sign = Integer::POSITIVE;
3502}
3503
3504void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
3505{
3506	unsigned aSize = a.WordCount();
3507	aSize += aSize%2;
3508	unsigned bSize = b.WordCount();
3509	bSize += bSize%2;
3510
3511	if (aSize == bSize)
3512	{
3513		if (Compare(a.reg, b.reg, aSize) >= 0)
3514		{
3515			Subtract(diff.reg, a.reg, b.reg, aSize);
3516			diff.sign = Integer::POSITIVE;
3517		}
3518		else
3519		{
3520			Subtract(diff.reg, b.reg, a.reg, aSize);
3521			diff.sign = Integer::NEGATIVE;
3522		}
3523	}
3524	else if (aSize > bSize)
3525	{
3526		word borrow = Subtract(diff.reg, a.reg, b.reg, bSize);
3527		CopyWords(diff.reg+bSize, a.reg+bSize, aSize-bSize);
3528		borrow = Decrement(diff.reg+bSize, aSize-bSize, borrow);
3529		assert(!borrow);
3530		diff.sign = Integer::POSITIVE;
3531	}
3532	else
3533	{
3534		word borrow = Subtract(diff.reg, b.reg, a.reg, aSize);
3535		CopyWords(diff.reg+aSize, b.reg+aSize, bSize-aSize);
3536		borrow = Decrement(diff.reg+aSize, bSize-aSize, borrow);
3537		assert(!borrow);
3538		diff.sign = Integer::NEGATIVE;
3539	}
3540}
3541
3542// MSVC .NET 2003 workaround
3543template <class T> inline const T& STDMAX2(const T& a, const T& b)
3544{
3545	return a < b ? b : a;
3546}
3547
3548Integer Integer::Plus(const Integer& b) const
3549{
3550	Integer sum((word)0, STDMAX2(reg.size(), b.reg.size()));
3551	if (NotNegative())
3552	{
3553		if (b.NotNegative())
3554			PositiveAdd(sum, *this, b);
3555		else
3556			PositiveSubtract(sum, *this, b);
3557	}
3558	else
3559	{
3560		if (b.NotNegative())
3561			PositiveSubtract(sum, b, *this);
3562		else
3563		{
3564			PositiveAdd(sum, *this, b);
3565			sum.sign = Integer::NEGATIVE;
3566		}
3567	}
3568	return sum;
3569}
3570
3571Integer& Integer::operator+=(const Integer& t)
3572{
3573	reg.CleanGrow(t.reg.size());
3574	if (NotNegative())
3575	{
3576		if (t.NotNegative())
3577			PositiveAdd(*this, *this, t);
3578		else
3579			PositiveSubtract(*this, *this, t);
3580	}
3581	else
3582	{
3583		if (t.NotNegative())
3584			PositiveSubtract(*this, t, *this);
3585		else
3586		{
3587			PositiveAdd(*this, *this, t);
3588			sign = Integer::NEGATIVE;
3589		}
3590	}
3591	return *this;
3592}
3593
3594Integer Integer::Minus(const Integer& b) const
3595{
3596	Integer diff((word)0, STDMAX2(reg.size(), b.reg.size()));
3597	if (NotNegative())
3598	{
3599		if (b.NotNegative())
3600			PositiveSubtract(diff, *this, b);
3601		else
3602			PositiveAdd(diff, *this, b);
3603	}
3604	else
3605	{
3606		if (b.NotNegative())
3607		{
3608			PositiveAdd(diff, *this, b);
3609			diff.sign = Integer::NEGATIVE;
3610		}
3611		else
3612			PositiveSubtract(diff, b, *this);
3613	}
3614	return diff;
3615}
3616
3617Integer& Integer::operator-=(const Integer& t)
3618{
3619	reg.CleanGrow(t.reg.size());
3620	if (NotNegative())
3621	{
3622		if (t.NotNegative())
3623			PositiveSubtract(*this, *this, t);
3624		else
3625			PositiveAdd(*this, *this, t);
3626	}
3627	else
3628	{
3629		if (t.NotNegative())
3630		{
3631			PositiveAdd(*this, *this, t);
3632			sign = Integer::NEGATIVE;
3633		}
3634		else
3635			PositiveSubtract(*this, t, *this);
3636	}
3637	return *this;
3638}
3639
3640Integer& Integer::operator<<=(size_t n)
3641{
3642	const size_t wordCount = WordCount();
3643	const size_t shiftWords = n / WORD_BITS;
3644	const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
3645
3646	reg.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
3647	ShiftWordsLeftByWords(reg, wordCount + shiftWords, shiftWords);
3648	ShiftWordsLeftByBits(reg+shiftWords, wordCount+BitsToWords(shiftBits), shiftBits);
3649	return *this;
3650}
3651
3652Integer& Integer::operator>>=(size_t n)
3653{
3654	const size_t wordCount = WordCount();
3655	const size_t shiftWords = n / WORD_BITS;
3656	const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
3657
3658	ShiftWordsRightByWords(reg, wordCount, shiftWords);
3659	if (wordCount > shiftWords)
3660		ShiftWordsRightByBits(reg, wordCount-shiftWords, shiftBits);
3661	if (IsNegative() && WordCount()==0)   // avoid -0
3662		*this = Zero();
3663	return *this;
3664}
3665
3666void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)
3667{
3668	size_t aSize = RoundupSize(a.WordCount());
3669	size_t bSize = RoundupSize(b.WordCount());
3670
3671	product.reg.CleanNew(RoundupSize(aSize+bSize));
3672	product.sign = Integer::POSITIVE;
3673
3674	IntegerSecBlock workspace(aSize + bSize);
3675	AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize);
3676}
3677
3678void Multiply(Integer &product, const Integer &a, const Integer &b)
3679{
3680	PositiveMultiply(product, a, b);
3681
3682	if (a.NotNegative() != b.NotNegative())
3683		product.Negate();
3684}
3685
3686Integer Integer::Times(const Integer &b) const
3687{
3688	Integer product;
3689	Multiply(product, *this, b);
3690	return product;
3691}
3692
3693/*
3694void PositiveDivide(Integer &remainder, Integer &quotient,
3695				   const Integer &dividend, const Integer &divisor)
3696{
3697	remainder.reg.CleanNew(divisor.reg.size());
3698	remainder.sign = Integer::POSITIVE;
3699	quotient.reg.New(0);
3700	quotient.sign = Integer::POSITIVE;
3701	unsigned i=dividend.BitCount();
3702	while (i--)
3703	{
3704		word overflow = ShiftWordsLeftByBits(remainder.reg, remainder.reg.size(), 1);
3705		remainder.reg[0] |= dividend[i];
3706		if (overflow || remainder >= divisor)
3707		{
3708			Subtract(remainder.reg, remainder.reg, divisor.reg, remainder.reg.size());
3709			quotient.SetBit(i);
3710		}
3711	}
3712}
3713*/
3714
3715void PositiveDivide(Integer &remainder, Integer &quotient,
3716				   const Integer &a, const Integer &b)
3717{
3718	unsigned aSize = a.WordCount();
3719	unsigned bSize = b.WordCount();
3720
3721	if (!bSize)
3722		throw Integer::DivideByZero();
3723
3724	if (aSize < bSize)
3725	{
3726		remainder = a;
3727		remainder.sign = Integer::POSITIVE;
3728		quotient = Integer::Zero();
3729		return;
3730	}
3731
3732	aSize += aSize%2;	// round up to next even number
3733	bSize += bSize%2;
3734
3735	remainder.reg.CleanNew(RoundupSize(bSize));
3736	remainder.sign = Integer::POSITIVE;
3737	quotient.reg.CleanNew(RoundupSize(aSize-bSize+2));
3738	quotient.sign = Integer::POSITIVE;
3739
3740	IntegerSecBlock T(aSize+3*(bSize+2));
3741	Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize);
3742}
3743
3744void Integer::Divide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor)
3745{
3746	PositiveDivide(remainder, quotient, dividend, divisor);
3747
3748	if (dividend.IsNegative())
3749	{
3750		quotient.Negate();
3751		if (remainder.NotZero())
3752		{
3753			--quotient;
3754			remainder = divisor.AbsoluteValue() - remainder;
3755		}
3756	}
3757
3758	if (divisor.IsNegative())
3759		quotient.Negate();
3760}
3761
3762void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a, unsigned int n)
3763{
3764	q = a;
3765	q >>= n;
3766
3767	const size_t wordCount = BitsToWords(n);
3768	if (wordCount <= a.WordCount())
3769	{
3770		r.reg.resize(RoundupSize(wordCount));
3771		CopyWords(r.reg, a.reg, wordCount);
3772		SetWords(r.reg+wordCount, 0, r.reg.size()-wordCount);
3773		if (n % WORD_BITS != 0)
3774			r.reg[wordCount-1] %= (word(1) << (n % WORD_BITS));
3775	}
3776	else
3777	{
3778		r.reg.resize(RoundupSize(a.WordCount()));
3779		CopyWords(r.reg, a.reg, r.reg.size());
3780	}
3781	r.sign = POSITIVE;
3782
3783	if (a.IsNegative() && r.NotZero())
3784	{
3785		--q;
3786		r = Power2(n) - r;
3787	}
3788}
3789
3790Integer Integer::DividedBy(const Integer &b) const
3791{
3792	Integer remainder, quotient;
3793	Integer::Divide(remainder, quotient, *this, b);
3794	return quotient;
3795}
3796
3797Integer Integer::Modulo(const Integer &b) const
3798{
3799	Integer remainder, quotient;
3800	Integer::Divide(remainder, quotient, *this, b);
3801	return remainder;
3802}
3803
3804void Integer::Divide(word &remainder, Integer &quotient, const Integer &dividend, word divisor)
3805{
3806	if (!divisor)
3807		throw Integer::DivideByZero();
3808
3809	assert(divisor);
3810
3811	if ((divisor & (divisor-1)) == 0)	// divisor is a power of 2
3812	{
3813		quotient = dividend >> (BitPrecision(divisor)-1);
3814		remainder = dividend.reg[0] & (divisor-1);
3815		return;
3816	}
3817
3818	unsigned int i = dividend.WordCount();
3819	quotient.reg.CleanNew(RoundupSize(i));
3820	remainder = 0;
3821	while (i--)
3822	{
3823		quotient.reg[i] = DWord(dividend.reg[i], remainder) / divisor;
3824		remainder = DWord(dividend.reg[i], remainder) % divisor;
3825	}
3826
3827	if (dividend.NotNegative())
3828		quotient.sign = POSITIVE;
3829	else
3830	{
3831		quotient.sign = NEGATIVE;
3832		if (remainder)
3833		{
3834			--quotient;
3835			remainder = divisor - remainder;
3836		}
3837	}
3838}
3839
3840Integer Integer::DividedBy(word b) const
3841{
3842	word remainder;
3843	Integer quotient;
3844	Integer::Divide(remainder, quotient, *this, b);
3845	return quotient;
3846}
3847
3848word Integer::Modulo(word divisor) const
3849{
3850	if (!divisor)
3851		throw Integer::DivideByZero();
3852
3853	assert(divisor);
3854
3855	word remainder;
3856
3857	if ((divisor & (divisor-1)) == 0)	// divisor is a power of 2
3858		remainder = reg[0] & (divisor-1);
3859	else
3860	{
3861		unsigned int i = WordCount();
3862
3863		if (divisor <= 5)
3864		{
3865			DWord sum(0, 0);
3866			while (i--)
3867				sum += reg[i];
3868			remainder = sum % divisor;
3869		}
3870		else
3871		{
3872			remainder = 0;
3873			while (i--)
3874				remainder = DWord(reg[i], remainder) % divisor;
3875		}
3876	}
3877
3878	if (IsNegative() && remainder)
3879		remainder = divisor - remainder;
3880
3881	return remainder;
3882}
3883
3884void Integer::Negate()
3885{
3886	if (!!(*this))	// don't flip sign if *this==0
3887		sign = Sign(1-sign);
3888}
3889
3890int Integer::PositiveCompare(const Integer& t) const
3891{
3892	unsigned size = WordCount(), tSize = t.WordCount();
3893
3894	if (size == tSize)
3895		return CryptoPP::Compare(reg, t.reg, size);
3896	else
3897		return size > tSize ? 1 : -1;
3898}
3899
3900int Integer::Compare(const Integer& t) const
3901{
3902	if (NotNegative())
3903	{
3904		if (t.NotNegative())
3905			return PositiveCompare(t);
3906		else
3907			return 1;
3908	}
3909	else
3910	{
3911		if (t.NotNegative())
3912			return -1;
3913		else
3914			return -PositiveCompare(t);
3915	}
3916}
3917
3918Integer Integer::SquareRoot() const
3919{
3920	if (!IsPositive())
3921		return Zero();
3922
3923	// overestimate square root
3924	Integer x, y = Power2((BitCount()+1)/2);
3925	assert(y*y >= *this);
3926
3927	do
3928	{
3929		x = y;
3930		y = (x + *this/x) >> 1;
3931	} while (y<x);
3932
3933	return x;
3934}
3935
3936bool Integer::IsSquare() const
3937{
3938	Integer r = SquareRoot();
3939	return *this == r.Squared();
3940}
3941
3942bool Integer::IsUnit() const
3943{
3944	return (WordCount() == 1) && (reg[0] == 1);
3945}
3946
3947Integer Integer::MultiplicativeInverse() const
3948{
3949	return IsUnit() ? *this : Zero();
3950}
3951
3952Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
3953{
3954	return x*y%m;
3955}
3956
3957Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
3958{
3959	ModularArithmetic mr(m);
3960	return mr.Exponentiate(x, e);
3961}
3962
3963Integer Integer::Gcd(const Integer &a, const Integer &b)
3964{
3965	return EuclideanDomainOf<Integer>().Gcd(a, b);
3966}
3967
3968Integer Integer::InverseMod(const Integer &m) const
3969{
3970	assert(m.NotNegative());
3971
3972	if (IsNegative())
3973		return Modulo(m).InverseMod(m);
3974
3975	if (m.IsEven())
3976	{
3977		if (!m || IsEven())
3978			return Zero();	// no inverse
3979		if (*this == One())
3980			return One();
3981
3982		Integer u = m.Modulo(*this).InverseMod(*this);
3983		return !u ? Zero() : (m*(*this-u)+1)/(*this);
3984	}
3985
3986	SecBlock<word> T(m.reg.size() * 4);
3987	Integer r((word)0, m.reg.size());
3988	unsigned k = AlmostInverse(r.reg, T, reg, reg.size(), m.reg, m.reg.size());
3989	DivideByPower2Mod(r.reg, r.reg, k, m.reg, m.reg.size());
3990	return r;
3991}
3992
3993word Integer::InverseMod(word mod) const
3994{
3995	word g0 = mod, g1 = *this % mod;
3996	word v0 = 0, v1 = 1;
3997	word y;
3998
3999	while (g1)
4000	{
4001		if (g1 == 1)
4002			return v1;
4003		y = g0 / g1;
4004		g0 = g0 % g1;
4005		v0 += y * v1;
4006
4007		if (!g0)
4008			break;
4009		if (g0 == 1)
4010			return mod-v0;
4011		y = g1 / g0;
4012		g1 = g1 % g0;
4013		v1 += y * v0;
4014	}
4015	return 0;
4016}
4017
4018// ********************************************************
4019
4020ModularArithmetic::ModularArithmetic(BufferedTransformation &bt)
4021{
4022	BERSequenceDecoder seq(bt);
4023	OID oid(seq);
4024	if (oid != ASN1::prime_field())
4025		BERDecodeError();
4026	m_modulus.BERDecode(seq);
4027	seq.MessageEnd();
4028	m_result.reg.resize(m_modulus.reg.size());
4029}
4030
4031void ModularArithmetic::DEREncode(BufferedTransformation &bt) const
4032{
4033	DERSequenceEncoder seq(bt);
4034	ASN1::prime_field().DEREncode(seq);
4035	m_modulus.DEREncode(seq);
4036	seq.MessageEnd();
4037}
4038
4039void ModularArithmetic::DEREncodeElement(BufferedTransformation &out, const Element &a) const
4040{
4041	a.DEREncodeAsOctetString(out, MaxElementByteLength());
4042}
4043
4044void ModularArithmetic::BERDecodeElement(BufferedTransformation &in, Element &a) const
4045{
4046	a.BERDecodeAsOctetString(in, MaxElementByteLength());
4047}
4048
4049const Integer& ModularArithmetic::Half(const Integer &a) const
4050{
4051	if (a.reg.size()==m_modulus.reg.size())
4052	{
4053		CryptoPP::DivideByPower2Mod(m_result.reg.begin(), a.reg, 1, m_modulus.reg, a.reg.size());
4054		return m_result;
4055	}
4056	else
4057		return m_result1 = (a.IsEven() ? (a >> 1) : ((a+m_modulus) >> 1));
4058}
4059
4060const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
4061{
4062	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4063	{
4064		if (CryptoPP::Add(m_result.reg.begin(), a.reg, b.reg, a.reg.size())
4065			|| Compare(m_result.reg, m_modulus.reg, a.reg.size()) >= 0)
4066		{
4067			CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
4068		}
4069		return m_result;
4070	}
4071	else
4072	{
4073		m_result1 = a+b;
4074		if (m_result1 >= m_modulus)
4075			m_result1 -= m_modulus;
4076		return m_result1;
4077	}
4078}
4079
4080Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
4081{
4082	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4083	{
4084		if (CryptoPP::Add(a.reg, a.reg, b.reg, a.reg.size())
4085			|| Compare(a.reg, m_modulus.reg, a.reg.size()) >= 0)
4086		{
4087			CryptoPP::Subtract(a.reg, a.reg, m_modulus.reg, a.reg.size());
4088		}
4089	}
4090	else
4091	{
4092		a+=b;
4093		if (a>=m_modulus)
4094			a-=m_modulus;
4095	}
4096
4097	return a;
4098}
4099
4100const Integer& ModularArithmetic::Subtract(const Integer &a, const Integer &b) const
4101{
4102	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4103	{
4104		if (CryptoPP::Subtract(m_result.reg.begin(), a.reg, b.reg, a.reg.size()))
4105			CryptoPP::Add(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
4106		return m_result;
4107	}
4108	else
4109	{
4110		m_result1 = a-b;
4111		if (m_result1.IsNegative())
4112			m_result1 += m_modulus;
4113		return m_result1;
4114	}
4115}
4116
4117Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
4118{
4119	if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4120	{
4121		if (CryptoPP::Subtract(a.reg, a.reg, b.reg, a.reg.size()))
4122			CryptoPP::Add(a.reg, a.reg, m_modulus.reg, a.reg.size());
4123	}
4124	else
4125	{
4126		a-=b;
4127		if (a.IsNegative())
4128			a+=m_modulus;
4129	}
4130
4131	return a;
4132}
4133
4134const Integer& ModularArithmetic::Inverse(const Integer &a) const
4135{
4136	if (!a)
4137		return a;
4138
4139	CopyWords(m_result.reg.begin(), m_modulus.reg, m_modulus.reg.size());
4140	if (CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, a.reg, a.reg.size()))
4141		Decrement(m_result.reg.begin()+a.reg.size(), m_modulus.reg.size()-a.reg.size());
4142
4143	return m_result;
4144}
4145
4146Integer ModularArithmetic::CascadeExponentiate(const Integer &x, const Integer &e1, const Integer &y, const Integer &e2) const
4147{
4148	if (m_modulus.IsOdd())
4149	{
4150		MontgomeryRepresentation dr(m_modulus);
4151		return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1, dr.ConvertIn(y), e2));
4152	}
4153	else
4154		return AbstractRing<Integer>::CascadeExponentiate(x, e1, y, e2);
4155}
4156
4157void ModularArithmetic::SimultaneousExponentiate(Integer *results, const Integer &base, const Integer *exponents, unsigned int exponentsCount) const
4158{
4159	if (m_modulus.IsOdd())
4160	{
4161		MontgomeryRepresentation dr(m_modulus);
4162		dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents, exponentsCount);
4163		for (unsigned int i=0; i<exponentsCount; i++)
4164			results[i] = dr.ConvertOut(results[i]);
4165	}
4166	else
4167		AbstractRing<Integer>::SimultaneousExponentiate(results, base, exponents, exponentsCount);
4168}
4169
4170MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m)	// modulus must be odd
4171	: ModularArithmetic(m),
4172	  m_u((word)0, m_modulus.reg.size()),
4173	  m_workspace(5*m_modulus.reg.size())
4174{
4175	if (!m_modulus.IsOdd())
4176		throw InvalidArgument("MontgomeryRepresentation: Montgomery representation requires an odd modulus");
4177
4178	RecursiveInverseModPower2(m_u.reg, m_workspace, m_modulus.reg, m_modulus.reg.size());
4179}
4180
4181const Integer& MontgomeryRepresentation::Multiply(const Integer &a, const Integer &b) const
4182{
4183	word *const T = m_workspace.begin();
4184	word *const R = m_result.reg.begin();
4185	const size_t N = m_modulus.reg.size();
4186	assert(a.reg.size()<=N && b.reg.size()<=N);
4187
4188	AsymmetricMultiply(T, T+2*N, a.reg, a.reg.size(), b.reg, b.reg.size());
4189	SetWords(T+a.reg.size()+b.reg.size(), 0, 2*N-a.reg.size()-b.reg.size());
4190	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4191	return m_result;
4192}
4193
4194const Integer& MontgomeryRepresentation::Square(const Integer &a) const
4195{
4196	word *const T = m_workspace.begin();
4197	word *const R = m_result.reg.begin();
4198	const size_t N = m_modulus.reg.size();
4199	assert(a.reg.size()<=N);
4200
4201	CryptoPP::Square(T, T+2*N, a.reg, a.reg.size());
4202	SetWords(T+2*a.reg.size(), 0, 2*N-2*a.reg.size());
4203	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4204	return m_result;
4205}
4206
4207Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
4208{
4209	word *const T = m_workspace.begin();
4210	word *const R = m_result.reg.begin();
4211	const size_t N = m_modulus.reg.size();
4212	assert(a.reg.size()<=N);
4213
4214	CopyWords(T, a.reg, a.reg.size());
4215	SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
4216	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4217	return m_result;
4218}
4219
4220const Integer& MontgomeryRepresentation::MultiplicativeInverse(const Integer &a) const
4221{
4222//	  return (EuclideanMultiplicativeInverse(a, modulus)<<(2*WORD_BITS*modulus.reg.size()))%modulus;
4223	word *const T = m_workspace.begin();
4224	word *const R = m_result.reg.begin();
4225	const size_t N = m_modulus.reg.size();
4226	assert(a.reg.size()<=N);
4227
4228	CopyWords(T, a.reg, a.reg.size());
4229	SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
4230	MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4231	unsigned k = AlmostInverse(R, T, R, N, m_modulus.reg, N);
4232
4233//	cout << "k=" << k << " N*32=" << 32*N << endl;
4234
4235	if (k>N*WORD_BITS)
4236		DivideByPower2Mod(R, R, k-N*WORD_BITS, m_modulus.reg, N);
4237	else
4238		MultiplyByPower2Mod(R, R, N*WORD_BITS-k, m_modulus.reg, N);
4239
4240	return m_result;
4241}
4242
4243NAMESPACE_END
4244
4245#endif
4246