1// integer.cpp - written and placed in the public domain by Wei Dai 2// contains public domain code contributed by Alister Lee and Leonard Janke 3 4#include "pch.h" 5 6#ifndef CRYPTOPP_IMPORTS 7 8#include "integer.h" 9#include "modarith.h" 10#include "nbtheory.h" 11#include "asn.h" 12#include "oids.h" 13#include "words.h" 14#include "algparam.h" 15#include "pubkey.h" // for P1363_KDF2 16#include "sha.h" 17#include "cpu.h" 18 19#include <iostream> 20 21#if _MSC_VER >= 1400 22 #include <intrin.h> 23#endif 24 25#ifdef __DECCXX 26 #include <c_asm.h> 27#endif 28 29#ifdef CRYPTOPP_MSVC6_NO_PP 30 #pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.") 31#endif 32 33#define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86) 34 35NAMESPACE_BEGIN(CryptoPP) 36 37bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt) 38{ 39 if (valueType != typeid(Integer)) 40 return false; 41 *reinterpret_cast<Integer *>(pInteger) = *reinterpret_cast<const int *>(pInt); 42 return true; 43} 44 45inline static int Compare(const word *A, const word *B, size_t N) 46{ 47 while (N--) 48 if (A[N] > B[N]) 49 return 1; 50 else if (A[N] < B[N]) 51 return -1; 52 53 return 0; 54} 55 56inline static int Increment(word *A, size_t N, word B=1) 57{ 58 assert(N); 59 word t = A[0]; 60 A[0] = t+B; 61 if (A[0] >= t) 62 return 0; 63 for (unsigned i=1; i<N; i++) 64 if (++A[i]) 65 return 0; 66 return 1; 67} 68 69inline static int Decrement(word *A, size_t N, word B=1) 70{ 71 assert(N); 72 word t = A[0]; 73 A[0] = t-B; 74 if (A[0] <= t) 75 return 0; 76 for (unsigned i=1; i<N; i++) 77 if (A[i]--) 78 return 0; 79 return 1; 80} 81 82static void TwosComplement(word *A, size_t N) 83{ 84 Decrement(A, N); 85 for (unsigned i=0; i<N; i++) 86 A[i] = ~A[i]; 87} 88 89static word AtomicInverseModPower2(word A) 90{ 91 assert(A%2==1); 92 93 word R=A%8; 94 95 for (unsigned i=3; i<WORD_BITS; i*=2) 96 R = R*(2-R*A); 97 98 assert(R*A==1); 99 return R; 100} 101 102// ******************************************************** 103 104#if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || (defined(__x86_64__) && defined(CRYPTOPP_WORD128_AVAILABLE)) 105 #define Declare2Words(x) word x##0, x##1; 106 #define AssignWord(a, b) a##0 = b; a##1 = 0; 107 #define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c); 108 #define LowWord(a) a##0 109 #define HighWord(a) a##1 110 #ifdef _MSC_VER 111 #define MultiplyWordsLoHi(p0, p1, a, b) p0 = _umul128(a, b, &p1); 112 #ifndef __INTEL_COMPILER 113 #define Double3Words(c, d) d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2; 114 #endif 115 #elif defined(__DECCXX) 116 #define MultiplyWordsLoHi(p0, p1, a, b) p0 = a*b; p1 = asm("umulh %a0, %a1, %v0", a, b); 117 #elif defined(__x86_64__) 118 #ifdef __SUNPRO_CC 119 // Sun Studio's gcc-style inline assembly is heavily bugged as of version 5.9 Patch 124864-09 2008/12/16, but this one works 120 #define MultiplyWordsLoHi(p0, p1, a, b) asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "r"(b) : "cc"); 121 #else 122 #define MultiplyWordsLoHi(p0, p1, a, b) asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc"); 123 #define MulAcc(c, d, a, b) asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc"); 124 #define Double3Words(c, d) asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc"); 125 #define Acc2WordsBy1(a, b) asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc"); 126 #define Acc2WordsBy2(a, b) asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc"); 127 #define Acc3WordsBy2(c, d, e) asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc"); 128 #endif 129 #endif 130 #define MultiplyWords(p, a, b) MultiplyWordsLoHi(p##0, p##1, a, b) 131 #ifndef Double3Words 132 #define Double3Words(c, d) d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2; 133 #endif 134 #ifndef Acc2WordsBy2 135 #define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1; 136 #endif 137 #define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);} 138 #define SubtractWithBorrow(u, a, b) {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);} 139 #define GetCarry(u) u##1 140 #define GetBorrow(u) u##1 141#else 142 #define Declare2Words(x) dword x; 143 #if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER) 144 #define MultiplyWords(p, a, b) p = __emulu(a, b); 145 #else 146 #define MultiplyWords(p, a, b) p = (dword)a*b; 147 #endif 148 #define AssignWord(a, b) a = b; 149 #define Add2WordsBy1(a, b, c) a = b + c; 150 #define Acc2WordsBy2(a, b) a += b; 151 #define LowWord(a) word(a) 152 #define HighWord(a) word(a>>WORD_BITS) 153 #define Double3Words(c, d) d = 2*d + (c>>(WORD_BITS-1)); c *= 2; 154 #define AddWithCarry(u, a, b) u = dword(a) + b + GetCarry(u); 155 #define SubtractWithBorrow(u, a, b) u = dword(a) - b - GetBorrow(u); 156 #define GetCarry(u) HighWord(u) 157 #define GetBorrow(u) word(u>>(WORD_BITS*2-1)) 158#endif 159#ifndef MulAcc 160 #define MulAcc(c, d, a, b) MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p)); 161#endif 162#ifndef Acc2WordsBy1 163 #define Acc2WordsBy1(a, b) Add2WordsBy1(a, a, b) 164#endif 165#ifndef Acc3WordsBy2 166 #define Acc3WordsBy2(c, d, e) Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e)); 167#endif 168 169class DWord 170{ 171public: 172 DWord() {} 173 174#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE 175 explicit DWord(word low) 176 { 177 m_whole = low; 178 } 179#else 180 explicit DWord(word low) 181 { 182 m_halfs.low = low; 183 m_halfs.high = 0; 184 } 185#endif 186 187 DWord(word low, word high) 188 { 189 m_halfs.low = low; 190 m_halfs.high = high; 191 } 192 193 static DWord Multiply(word a, word b) 194 { 195 DWord r; 196 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE 197 r.m_whole = (dword)a * b; 198 #elif defined(MultiplyWordsLoHi) 199 MultiplyWordsLoHi(r.m_halfs.low, r.m_halfs.high, a, b); 200 #endif 201 return r; 202 } 203 204 static DWord MultiplyAndAdd(word a, word b, word c) 205 { 206 DWord r = Multiply(a, b); 207 return r += c; 208 } 209 210 DWord & operator+=(word a) 211 { 212 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE 213 m_whole = m_whole + a; 214 #else 215 m_halfs.low += a; 216 m_halfs.high += (m_halfs.low < a); 217 #endif 218 return *this; 219 } 220 221 DWord operator+(word a) 222 { 223 DWord r; 224 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE 225 r.m_whole = m_whole + a; 226 #else 227 r.m_halfs.low = m_halfs.low + a; 228 r.m_halfs.high = m_halfs.high + (r.m_halfs.low < a); 229 #endif 230 return r; 231 } 232 233 DWord operator-(DWord a) 234 { 235 DWord r; 236 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE 237 r.m_whole = m_whole - a.m_whole; 238 #else 239 r.m_halfs.low = m_halfs.low - a.m_halfs.low; 240 r.m_halfs.high = m_halfs.high - a.m_halfs.high - (r.m_halfs.low > m_halfs.low); 241 #endif 242 return r; 243 } 244 245 DWord operator-(word a) 246 { 247 DWord r; 248 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE 249 r.m_whole = m_whole - a; 250 #else 251 r.m_halfs.low = m_halfs.low - a; 252 r.m_halfs.high = m_halfs.high - (r.m_halfs.low > m_halfs.low); 253 #endif 254 return r; 255 } 256 257 // returns quotient, which must fit in a word 258 word operator/(word divisor); 259 260 word operator%(word a); 261 262 bool operator!() const 263 { 264 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE 265 return !m_whole; 266 #else 267 return !m_halfs.high && !m_halfs.low; 268 #endif 269 } 270 271 word GetLowHalf() const {return m_halfs.low;} 272 word GetHighHalf() const {return m_halfs.high;} 273 word GetHighHalfAsBorrow() const {return 0-m_halfs.high;} 274 275private: 276 union 277 { 278 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE 279 dword m_whole; 280 #endif 281 struct 282 { 283 #ifdef IS_LITTLE_ENDIAN 284 word low; 285 word high; 286 #else 287 word high; 288 word low; 289 #endif 290 } m_halfs; 291 }; 292}; 293 294class Word 295{ 296public: 297 Word() {} 298 299 Word(word value) 300 { 301 m_whole = value; 302 } 303 304 Word(hword low, hword high) 305 { 306 m_whole = low | (word(high) << (WORD_BITS/2)); 307 } 308 309 static Word Multiply(hword a, hword b) 310 { 311 Word r; 312 r.m_whole = (word)a * b; 313 return r; 314 } 315 316 Word operator-(Word a) 317 { 318 Word r; 319 r.m_whole = m_whole - a.m_whole; 320 return r; 321 } 322 323 Word operator-(hword a) 324 { 325 Word r; 326 r.m_whole = m_whole - a; 327 return r; 328 } 329 330 // returns quotient, which must fit in a word 331 hword operator/(hword divisor) 332 { 333 return hword(m_whole / divisor); 334 } 335 336 bool operator!() const 337 { 338 return !m_whole; 339 } 340 341 word GetWhole() const {return m_whole;} 342 hword GetLowHalf() const {return hword(m_whole);} 343 hword GetHighHalf() const {return hword(m_whole>>(WORD_BITS/2));} 344 hword GetHighHalfAsBorrow() const {return 0-hword(m_whole>>(WORD_BITS/2));} 345 346private: 347 word m_whole; 348}; 349 350// do a 3 word by 2 word divide, returns quotient and leaves remainder in A 351template <class S, class D> 352S DivideThreeWordsByTwo(S *A, S B0, S B1, D *dummy=NULL) 353{ 354 // assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a S 355 assert(A[2] < B1 || (A[2]==B1 && A[1] < B0)); 356 357 // estimate the quotient: do a 2 S by 1 S divide 358 S Q; 359 if (S(B1+1) == 0) 360 Q = A[2]; 361 else 362 Q = D(A[1], A[2]) / S(B1+1); 363 364 // now subtract Q*B from A 365 D p = D::Multiply(B0, Q); 366 D u = (D) A[0] - p.GetLowHalf(); 367 A[0] = u.GetLowHalf(); 368 u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - D::Multiply(B1, Q); 369 A[1] = u.GetLowHalf(); 370 A[2] += u.GetHighHalf(); 371 372 // Q <= actual quotient, so fix it 373 while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0)) 374 { 375 u = (D) A[0] - B0; 376 A[0] = u.GetLowHalf(); 377 u = (D) A[1] - B1 - u.GetHighHalfAsBorrow(); 378 A[1] = u.GetLowHalf(); 379 A[2] += u.GetHighHalf(); 380 Q++; 381 assert(Q); // shouldn't overflow 382 } 383 384 return Q; 385} 386 387// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1 388template <class S, class D> 389inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B) 390{ 391 if (!B) // if divisor is 0, we assume divisor==2**(2*WORD_BITS) 392 return D(Ah.GetLowHalf(), Ah.GetHighHalf()); 393 else 394 { 395 S Q[2]; 396 T[0] = Al.GetLowHalf(); 397 T[1] = Al.GetHighHalf(); 398 T[2] = Ah.GetLowHalf(); 399 T[3] = Ah.GetHighHalf(); 400 Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(), B.GetHighHalf()); 401 Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf()); 402 return D(Q[0], Q[1]); 403 } 404} 405 406// returns quotient, which must fit in a word 407inline word DWord::operator/(word a) 408{ 409 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE 410 return word(m_whole / a); 411 #else 412 hword r[4]; 413 return DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a).GetWhole(); 414 #endif 415} 416 417inline word DWord::operator%(word a) 418{ 419 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE 420 return word(m_whole % a); 421 #else 422 if (a < (word(1) << (WORD_BITS/2))) 423 { 424 hword h = hword(a); 425 word r = m_halfs.high % h; 426 r = ((m_halfs.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h; 427 return hword((hword(m_halfs.low) + (r << (WORD_BITS/2))) % h); 428 } 429 else 430 { 431 hword r[4]; 432 DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a); 433 return Word(r[0], r[1]).GetWhole(); 434 } 435 #endif 436} 437 438// ******************************************************** 439 440// use some tricks to share assembly code between MSVC and GCC 441#if defined(__GNUC__) 442 #define AddPrologue \ 443 int result; \ 444 __asm__ __volatile__ \ 445 ( \ 446 ".intel_syntax noprefix;" 447 #define AddEpilogue \ 448 ".att_syntax prefix;" \ 449 : "=a" (result)\ 450 : "d" (C), "a" (A), "D" (B), "c" (N) \ 451 : "%esi", "memory", "cc" \ 452 );\ 453 return result; 454 #define MulPrologue \ 455 __asm__ __volatile__ \ 456 ( \ 457 ".intel_syntax noprefix;" \ 458 AS1( push ebx) \ 459 AS2( mov ebx, edx) 460 #define MulEpilogue \ 461 AS1( pop ebx) \ 462 ".att_syntax prefix;" \ 463 : \ 464 : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \ 465 : "%esi", "memory", "cc" \ 466 ); 467 #define SquPrologue MulPrologue 468 #define SquEpilogue \ 469 AS1( pop ebx) \ 470 ".att_syntax prefix;" \ 471 : \ 472 : "d" (s_maskLow16), "c" (C), "a" (A) \ 473 : "%esi", "%edi", "memory", "cc" \ 474 ); 475 #define TopPrologue MulPrologue 476 #define TopEpilogue \ 477 AS1( pop ebx) \ 478 ".att_syntax prefix;" \ 479 : \ 480 : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \ 481 : "memory", "cc" \ 482 ); 483#else 484 #define AddPrologue \ 485 __asm push edi \ 486 __asm push esi \ 487 __asm mov eax, [esp+12] \ 488 __asm mov edi, [esp+16] 489 #define AddEpilogue \ 490 __asm pop esi \ 491 __asm pop edi \ 492 __asm ret 8 493#if _MSC_VER < 1300 494 #define SaveEBX __asm push ebx 495 #define RestoreEBX __asm pop ebx 496#else 497 #define SaveEBX 498 #define RestoreEBX 499#endif 500 #define SquPrologue \ 501 AS2( mov eax, A) \ 502 AS2( mov ecx, C) \ 503 SaveEBX \ 504 AS2( lea ebx, s_maskLow16) 505 #define MulPrologue \ 506 AS2( mov eax, A) \ 507 AS2( mov edi, B) \ 508 AS2( mov ecx, C) \ 509 SaveEBX \ 510 AS2( lea ebx, s_maskLow16) 511 #define TopPrologue \ 512 AS2( mov eax, A) \ 513 AS2( mov edi, B) \ 514 AS2( mov ecx, C) \ 515 AS2( mov esi, L) \ 516 SaveEBX \ 517 AS2( lea ebx, s_maskLow16) 518 #define SquEpilogue RestoreEBX 519 #define MulEpilogue RestoreEBX 520 #define TopEpilogue RestoreEBX 521#endif 522 523#ifdef CRYPTOPP_X64_MASM_AVAILABLE 524extern "C" { 525int Baseline_Add(size_t N, word *C, const word *A, const word *B); 526int Baseline_Sub(size_t N, word *C, const word *A, const word *B); 527} 528#elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__) && defined(CRYPTOPP_WORD128_AVAILABLE) 529int Baseline_Add(size_t N, word *C, const word *A, const word *B) 530{ 531 word result; 532 __asm__ __volatile__ 533 ( 534 ".intel_syntax;" 535 AS1( neg %1) 536 ASJ( jz, 1, f) 537 AS2( mov %0,[%3+8*%1]) 538 AS2( add %0,[%4+8*%1]) 539 AS2( mov [%2+8*%1],%0) 540 ASL(0) 541 AS2( mov %0,[%3+8*%1+8]) 542 AS2( adc %0,[%4+8*%1+8]) 543 AS2( mov [%2+8*%1+8],%0) 544 AS2( lea %1,[%1+2]) 545 ASJ( jrcxz, 1, f) 546 AS2( mov %0,[%3+8*%1]) 547 AS2( adc %0,[%4+8*%1]) 548 AS2( mov [%2+8*%1],%0) 549 ASJ( jmp, 0, b) 550 ASL(1) 551 AS2( mov %0, 0) 552 AS2( adc %0, %0) 553 ".att_syntax;" 554 : "=&r" (result), "+c" (N) 555 : "r" (C+N), "r" (A+N), "r" (B+N) 556 : "memory", "cc" 557 ); 558 return (int)result; 559} 560 561int Baseline_Sub(size_t N, word *C, const word *A, const word *B) 562{ 563 word result; 564 __asm__ __volatile__ 565 ( 566 ".intel_syntax;" 567 AS1( neg %1) 568 ASJ( jz, 1, f) 569 AS2( mov %0,[%3+8*%1]) 570 AS2( sub %0,[%4+8*%1]) 571 AS2( mov [%2+8*%1],%0) 572 ASL(0) 573 AS2( mov %0,[%3+8*%1+8]) 574 AS2( sbb %0,[%4+8*%1+8]) 575 AS2( mov [%2+8*%1+8],%0) 576 AS2( lea %1,[%1+2]) 577 ASJ( jrcxz, 1, f) 578 AS2( mov %0,[%3+8*%1]) 579 AS2( sbb %0,[%4+8*%1]) 580 AS2( mov [%2+8*%1],%0) 581 ASJ( jmp, 0, b) 582 ASL(1) 583 AS2( mov %0, 0) 584 AS2( adc %0, %0) 585 ".att_syntax;" 586 : "=&r" (result), "+c" (N) 587 : "r" (C+N), "r" (A+N), "r" (B+N) 588 : "memory", "cc" 589 ); 590 return (int)result; 591} 592#elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86 593CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B) 594{ 595 AddPrologue 596 597 // now: eax = A, edi = B, edx = C, ecx = N 598 AS2( lea eax, [eax+4*ecx]) 599 AS2( lea edi, [edi+4*ecx]) 600 AS2( lea edx, [edx+4*ecx]) 601 602 AS1( neg ecx) // ecx is negative index 603 AS2( test ecx, 2) // this clears carry flag 604 ASJ( jz, 0, f) 605 AS2( sub ecx, 2) 606 ASJ( jmp, 1, f) 607 608 ASL(0) 609 ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero 610 AS2( mov esi,[eax+4*ecx]) 611 AS2( adc esi,[edi+4*ecx]) 612 AS2( mov [edx+4*ecx],esi) 613 AS2( mov esi,[eax+4*ecx+4]) 614 AS2( adc esi,[edi+4*ecx+4]) 615 AS2( mov [edx+4*ecx+4],esi) 616 ASL(1) 617 AS2( mov esi,[eax+4*ecx+8]) 618 AS2( adc esi,[edi+4*ecx+8]) 619 AS2( mov [edx+4*ecx+8],esi) 620 AS2( mov esi,[eax+4*ecx+12]) 621 AS2( adc esi,[edi+4*ecx+12]) 622 AS2( mov [edx+4*ecx+12],esi) 623 624 AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2 625 ASJ( jmp, 0, b) 626 627 ASL(2) 628 AS2( mov eax, 0) 629 AS1( setc al) // store carry into eax (return result register) 630 631 AddEpilogue 632} 633 634CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B) 635{ 636 AddPrologue 637 638 // now: eax = A, edi = B, edx = C, ecx = N 639 AS2( lea eax, [eax+4*ecx]) 640 AS2( lea edi, [edi+4*ecx]) 641 AS2( lea edx, [edx+4*ecx]) 642 643 AS1( neg ecx) // ecx is negative index 644 AS2( test ecx, 2) // this clears carry flag 645 ASJ( jz, 0, f) 646 AS2( sub ecx, 2) 647 ASJ( jmp, 1, f) 648 649 ASL(0) 650 ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero 651 AS2( mov esi,[eax+4*ecx]) 652 AS2( sbb esi,[edi+4*ecx]) 653 AS2( mov [edx+4*ecx],esi) 654 AS2( mov esi,[eax+4*ecx+4]) 655 AS2( sbb esi,[edi+4*ecx+4]) 656 AS2( mov [edx+4*ecx+4],esi) 657 ASL(1) 658 AS2( mov esi,[eax+4*ecx+8]) 659 AS2( sbb esi,[edi+4*ecx+8]) 660 AS2( mov [edx+4*ecx+8],esi) 661 AS2( mov esi,[eax+4*ecx+12]) 662 AS2( sbb esi,[edi+4*ecx+12]) 663 AS2( mov [edx+4*ecx+12],esi) 664 665 AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2 666 ASJ( jmp, 0, b) 667 668 ASL(2) 669 AS2( mov eax, 0) 670 AS1( setc al) // store carry into eax (return result register) 671 672 AddEpilogue 673} 674 675#if CRYPTOPP_INTEGER_SSE2 676CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B) 677{ 678 AddPrologue 679 680 // now: eax = A, edi = B, edx = C, ecx = N 681 AS2( lea eax, [eax+4*ecx]) 682 AS2( lea edi, [edi+4*ecx]) 683 AS2( lea edx, [edx+4*ecx]) 684 685 AS1( neg ecx) // ecx is negative index 686 AS2( pxor mm2, mm2) 687 ASJ( jz, 2, f) 688 AS2( test ecx, 2) // this clears carry flag 689 ASJ( jz, 0, f) 690 AS2( sub ecx, 2) 691 ASJ( jmp, 1, f) 692 693 ASL(0) 694 AS2( movd mm0, DWORD PTR [eax+4*ecx]) 695 AS2( movd mm1, DWORD PTR [edi+4*ecx]) 696 AS2( paddq mm0, mm1) 697 AS2( paddq mm2, mm0) 698 AS2( movd DWORD PTR [edx+4*ecx], mm2) 699 AS2( psrlq mm2, 32) 700 701 AS2( movd mm0, DWORD PTR [eax+4*ecx+4]) 702 AS2( movd mm1, DWORD PTR [edi+4*ecx+4]) 703 AS2( paddq mm0, mm1) 704 AS2( paddq mm2, mm0) 705 AS2( movd DWORD PTR [edx+4*ecx+4], mm2) 706 AS2( psrlq mm2, 32) 707 708 ASL(1) 709 AS2( movd mm0, DWORD PTR [eax+4*ecx+8]) 710 AS2( movd mm1, DWORD PTR [edi+4*ecx+8]) 711 AS2( paddq mm0, mm1) 712 AS2( paddq mm2, mm0) 713 AS2( movd DWORD PTR [edx+4*ecx+8], mm2) 714 AS2( psrlq mm2, 32) 715 716 AS2( movd mm0, DWORD PTR [eax+4*ecx+12]) 717 AS2( movd mm1, DWORD PTR [edi+4*ecx+12]) 718 AS2( paddq mm0, mm1) 719 AS2( paddq mm2, mm0) 720 AS2( movd DWORD PTR [edx+4*ecx+12], mm2) 721 AS2( psrlq mm2, 32) 722 723 AS2( add ecx, 4) 724 ASJ( jnz, 0, b) 725 726 ASL(2) 727 AS2( movd eax, mm2) 728 AS1( emms) 729 730 AddEpilogue 731} 732CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B) 733{ 734 AddPrologue 735 736 // now: eax = A, edi = B, edx = C, ecx = N 737 AS2( lea eax, [eax+4*ecx]) 738 AS2( lea edi, [edi+4*ecx]) 739 AS2( lea edx, [edx+4*ecx]) 740 741 AS1( neg ecx) // ecx is negative index 742 AS2( pxor mm2, mm2) 743 ASJ( jz, 2, f) 744 AS2( test ecx, 2) // this clears carry flag 745 ASJ( jz, 0, f) 746 AS2( sub ecx, 2) 747 ASJ( jmp, 1, f) 748 749 ASL(0) 750 AS2( movd mm0, DWORD PTR [eax+4*ecx]) 751 AS2( movd mm1, DWORD PTR [edi+4*ecx]) 752 AS2( psubq mm0, mm1) 753 AS2( psubq mm0, mm2) 754 AS2( movd DWORD PTR [edx+4*ecx], mm0) 755 AS2( psrlq mm0, 63) 756 757 AS2( movd mm2, DWORD PTR [eax+4*ecx+4]) 758 AS2( movd mm1, DWORD PTR [edi+4*ecx+4]) 759 AS2( psubq mm2, mm1) 760 AS2( psubq mm2, mm0) 761 AS2( movd DWORD PTR [edx+4*ecx+4], mm2) 762 AS2( psrlq mm2, 63) 763 764 ASL(1) 765 AS2( movd mm0, DWORD PTR [eax+4*ecx+8]) 766 AS2( movd mm1, DWORD PTR [edi+4*ecx+8]) 767 AS2( psubq mm0, mm1) 768 AS2( psubq mm0, mm2) 769 AS2( movd DWORD PTR [edx+4*ecx+8], mm0) 770 AS2( psrlq mm0, 63) 771 772 AS2( movd mm2, DWORD PTR [eax+4*ecx+12]) 773 AS2( movd mm1, DWORD PTR [edi+4*ecx+12]) 774 AS2( psubq mm2, mm1) 775 AS2( psubq mm2, mm0) 776 AS2( movd DWORD PTR [edx+4*ecx+12], mm2) 777 AS2( psrlq mm2, 63) 778 779 AS2( add ecx, 4) 780 ASJ( jnz, 0, b) 781 782 ASL(2) 783 AS2( movd eax, mm2) 784 AS1( emms) 785 786 AddEpilogue 787} 788#endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 789#else 790int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B) 791{ 792 assert (N%2 == 0); 793 794 Declare2Words(u); 795 AssignWord(u, 0); 796 for (size_t i=0; i<N; i+=2) 797 { 798 AddWithCarry(u, A[i], B[i]); 799 C[i] = LowWord(u); 800 AddWithCarry(u, A[i+1], B[i+1]); 801 C[i+1] = LowWord(u); 802 } 803 return int(GetCarry(u)); 804} 805 806int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B) 807{ 808 assert (N%2 == 0); 809 810 Declare2Words(u); 811 AssignWord(u, 0); 812 for (size_t i=0; i<N; i+=2) 813 { 814 SubtractWithBorrow(u, A[i], B[i]); 815 C[i] = LowWord(u); 816 SubtractWithBorrow(u, A[i+1], B[i+1]); 817 C[i+1] = LowWord(u); 818 } 819 return int(GetBorrow(u)); 820} 821#endif 822 823static word LinearMultiply(word *C, const word *A, word B, size_t N) 824{ 825 word carry=0; 826 for(unsigned i=0; i<N; i++) 827 { 828 Declare2Words(p); 829 MultiplyWords(p, A[i], B); 830 Acc2WordsBy1(p, carry); 831 C[i] = LowWord(p); 832 carry = HighWord(p); 833 } 834 return carry; 835} 836 837#ifndef CRYPTOPP_DOXYGEN_PROCESSING 838 839#define Mul_2 \ 840 Mul_Begin(2) \ 841 Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ 842 Mul_End(1, 1) 843 844#define Mul_4 \ 845 Mul_Begin(4) \ 846 Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ 847 Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ 848 Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ 849 Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \ 850 Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \ 851 Mul_End(5, 3) 852 853#define Mul_8 \ 854 Mul_Begin(8) \ 855 Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ 856 Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ 857 Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ 858 Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \ 859 Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \ 860 Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \ 861 Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \ 862 Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \ 863 Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \ 864 Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \ 865 Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \ 866 Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \ 867 Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \ 868 Mul_End(13, 7) 869 870#define Mul_16 \ 871 Mul_Begin(16) \ 872 Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ 873 Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ 874 Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ 875 Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \ 876 Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \ 877 Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \ 878 Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \ 879 Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \ 880 Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \ 881 Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \ 882 Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \ 883 Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \ 884 Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \ 885 Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \ 886 Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \ 887 Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \ 888 Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \ 889 Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \ 890 Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \ 891 Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \ 892 Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \ 893 Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \ 894 Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \ 895 Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \ 896 Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \ 897 Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \ 898 Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \ 899 Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \ 900 Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \ 901 Mul_End(29, 15) 902 903#define Squ_2 \ 904 Squ_Begin(2) \ 905 Squ_End(2) 906 907#define Squ_4 \ 908 Squ_Begin(4) \ 909 Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \ 910 Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \ 911 Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \ 912 Squ_SaveAcc(4, 2, 3) Squ_NonDiag \ 913 Squ_End(4) 914 915#define Squ_8 \ 916 Squ_Begin(8) \ 917 Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \ 918 Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \ 919 Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \ 920 Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \ 921 Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \ 922 Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \ 923 Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \ 924 Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \ 925 Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \ 926 Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \ 927 Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \ 928 Squ_SaveAcc(12, 6, 7) Squ_NonDiag \ 929 Squ_End(8) 930 931#define Squ_16 \ 932 Squ_Begin(16) \ 933 Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \ 934 Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \ 935 Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \ 936 Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \ 937 Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \ 938 Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \ 939 Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \ 940 Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \ 941 Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \ 942 Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \ 943 Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \ 944 Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \ 945 Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \ 946 Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \ 947 Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \ 948 Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \ 949 Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \ 950 Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \ 951 Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \ 952 Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \ 953 Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \ 954 Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \ 955 Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \ 956 Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \ 957 Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \ 958 Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \ 959 Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \ 960 Squ_SaveAcc(28, 14, 15) Squ_NonDiag \ 961 Squ_End(16) 962 963#define Bot_2 \ 964 Mul_Begin(2) \ 965 Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \ 966 Bot_End(2) 967 968#define Bot_4 \ 969 Mul_Begin(4) \ 970 Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ 971 Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2) \ 972 Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0) \ 973 Bot_End(4) 974 975#define Bot_8 \ 976 Mul_Begin(8) \ 977 Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ 978 Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ 979 Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ 980 Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \ 981 Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \ 982 Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \ 983 Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \ 984 Bot_End(8) 985 986#define Bot_16 \ 987 Mul_Begin(16) \ 988 Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \ 989 Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \ 990 Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ 991 Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \ 992 Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \ 993 Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \ 994 Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \ 995 Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \ 996 Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \ 997 Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \ 998 Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \ 999 Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \ 1000 Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \ 1001 Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \ 1002 Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \ 1003 Bot_End(16) 1004 1005#endif 1006 1007#if 0 1008#define Mul_Begin(n) \ 1009 Declare2Words(p) \ 1010 Declare2Words(c) \ 1011 Declare2Words(d) \ 1012 MultiplyWords(p, A[0], B[0]) \ 1013 AssignWord(c, LowWord(p)) \ 1014 AssignWord(d, HighWord(p)) 1015 1016#define Mul_Acc(i, j) \ 1017 MultiplyWords(p, A[i], B[j]) \ 1018 Acc2WordsBy1(c, LowWord(p)) \ 1019 Acc2WordsBy1(d, HighWord(p)) 1020 1021#define Mul_SaveAcc(k, i, j) \ 1022 R[k] = LowWord(c); \ 1023 Add2WordsBy1(c, d, HighWord(c)) \ 1024 MultiplyWords(p, A[i], B[j]) \ 1025 AssignWord(d, HighWord(p)) \ 1026 Acc2WordsBy1(c, LowWord(p)) 1027 1028#define Mul_End(n) \ 1029 R[2*n-3] = LowWord(c); \ 1030 Acc2WordsBy1(d, HighWord(c)) \ 1031 MultiplyWords(p, A[n-1], B[n-1])\ 1032 Acc2WordsBy2(d, p) \ 1033 R[2*n-2] = LowWord(d); \ 1034 R[2*n-1] = HighWord(d); 1035 1036#define Bot_SaveAcc(k, i, j) \ 1037 R[k] = LowWord(c); \ 1038 word e = LowWord(d) + HighWord(c); \ 1039 e += A[i] * B[j]; 1040 1041#define Bot_Acc(i, j) \ 1042 e += A[i] * B[j]; 1043 1044#define Bot_End(n) \ 1045 R[n-1] = e; 1046#else 1047#define Mul_Begin(n) \ 1048 Declare2Words(p) \ 1049 word c; \ 1050 Declare2Words(d) \ 1051 MultiplyWords(p, A[0], B[0]) \ 1052 c = LowWord(p); \ 1053 AssignWord(d, HighWord(p)) 1054 1055#define Mul_Acc(i, j) \ 1056 MulAcc(c, d, A[i], B[j]) 1057 1058#define Mul_SaveAcc(k, i, j) \ 1059 R[k] = c; \ 1060 c = LowWord(d); \ 1061 AssignWord(d, HighWord(d)) \ 1062 MulAcc(c, d, A[i], B[j]) 1063 1064#define Mul_End(k, i) \ 1065 R[k] = c; \ 1066 MultiplyWords(p, A[i], B[i]) \ 1067 Acc2WordsBy2(p, d) \ 1068 R[k+1] = LowWord(p); \ 1069 R[k+2] = HighWord(p); 1070 1071#define Bot_SaveAcc(k, i, j) \ 1072 R[k] = c; \ 1073 c = LowWord(d); \ 1074 c += A[i] * B[j]; 1075 1076#define Bot_Acc(i, j) \ 1077 c += A[i] * B[j]; 1078 1079#define Bot_End(n) \ 1080 R[n-1] = c; 1081#endif 1082 1083#define Squ_Begin(n) \ 1084 Declare2Words(p) \ 1085 word c; \ 1086 Declare2Words(d) \ 1087 Declare2Words(e) \ 1088 MultiplyWords(p, A[0], A[0]) \ 1089 R[0] = LowWord(p); \ 1090 AssignWord(e, HighWord(p)) \ 1091 MultiplyWords(p, A[0], A[1]) \ 1092 c = LowWord(p); \ 1093 AssignWord(d, HighWord(p)) \ 1094 Squ_NonDiag \ 1095 1096#define Squ_NonDiag \ 1097 Double3Words(c, d) 1098 1099#define Squ_SaveAcc(k, i, j) \ 1100 Acc3WordsBy2(c, d, e) \ 1101 R[k] = c; \ 1102 MultiplyWords(p, A[i], A[j]) \ 1103 c = LowWord(p); \ 1104 AssignWord(d, HighWord(p)) \ 1105 1106#define Squ_Acc(i, j) \ 1107 MulAcc(c, d, A[i], A[j]) 1108 1109#define Squ_Diag(i) \ 1110 Squ_NonDiag \ 1111 MulAcc(c, d, A[i], A[i]) 1112 1113#define Squ_End(n) \ 1114 Acc3WordsBy2(c, d, e) \ 1115 R[2*n-3] = c; \ 1116 MultiplyWords(p, A[n-1], A[n-1])\ 1117 Acc2WordsBy2(p, e) \ 1118 R[2*n-2] = LowWord(p); \ 1119 R[2*n-1] = HighWord(p); 1120 1121void Baseline_Multiply2(word *R, const word *A, const word *B) 1122{ 1123 Mul_2 1124} 1125 1126void Baseline_Multiply4(word *R, const word *A, const word *B) 1127{ 1128 Mul_4 1129} 1130 1131void Baseline_Multiply8(word *R, const word *A, const word *B) 1132{ 1133 Mul_8 1134} 1135 1136void Baseline_Square2(word *R, const word *A) 1137{ 1138 Squ_2 1139} 1140 1141void Baseline_Square4(word *R, const word *A) 1142{ 1143 Squ_4 1144} 1145 1146void Baseline_Square8(word *R, const word *A) 1147{ 1148 Squ_8 1149} 1150 1151void Baseline_MultiplyBottom2(word *R, const word *A, const word *B) 1152{ 1153 Bot_2 1154} 1155 1156void Baseline_MultiplyBottom4(word *R, const word *A, const word *B) 1157{ 1158 Bot_4 1159} 1160 1161void Baseline_MultiplyBottom8(word *R, const word *A, const word *B) 1162{ 1163 Bot_8 1164} 1165 1166#define Top_Begin(n) \ 1167 Declare2Words(p) \ 1168 word c; \ 1169 Declare2Words(d) \ 1170 MultiplyWords(p, A[0], B[n-2]);\ 1171 AssignWord(d, HighWord(p)); 1172 1173#define Top_Acc(i, j) \ 1174 MultiplyWords(p, A[i], B[j]);\ 1175 Acc2WordsBy1(d, HighWord(p)); 1176 1177#define Top_SaveAcc0(i, j) \ 1178 c = LowWord(d); \ 1179 AssignWord(d, HighWord(d)) \ 1180 MulAcc(c, d, A[i], B[j]) 1181 1182#define Top_SaveAcc1(i, j) \ 1183 c = L<c; \ 1184 Acc2WordsBy1(d, c); \ 1185 c = LowWord(d); \ 1186 AssignWord(d, HighWord(d)) \ 1187 MulAcc(c, d, A[i], B[j]) 1188 1189void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L) 1190{ 1191 word T[4]; 1192 Baseline_Multiply2(T, A, B); 1193 R[0] = T[2]; 1194 R[1] = T[3]; 1195} 1196 1197void Baseline_MultiplyTop4(word *R, const word *A, const word *B, word L) 1198{ 1199 Top_Begin(4) 1200 Top_Acc(1, 1) Top_Acc(2, 0) \ 1201 Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \ 1202 Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \ 1203 Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \ 1204 Mul_End(1, 3) 1205} 1206 1207void Baseline_MultiplyTop8(word *R, const word *A, const word *B, word L) 1208{ 1209 Top_Begin(8) 1210 Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \ 1211 Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \ 1212 Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \ 1213 Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \ 1214 Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \ 1215 Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \ 1216 Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \ 1217 Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \ 1218 Mul_End(5, 7) 1219} 1220 1221#if !CRYPTOPP_INTEGER_SSE2 // save memory by not compiling these functions when SSE2 is available 1222void Baseline_Multiply16(word *R, const word *A, const word *B) 1223{ 1224 Mul_16 1225} 1226 1227void Baseline_Square16(word *R, const word *A) 1228{ 1229 Squ_16 1230} 1231 1232void Baseline_MultiplyBottom16(word *R, const word *A, const word *B) 1233{ 1234 Bot_16 1235} 1236 1237void Baseline_MultiplyTop16(word *R, const word *A, const word *B, word L) 1238{ 1239 Top_Begin(16) 1240 Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \ 1241 Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \ 1242 Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \ 1243 Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \ 1244 Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \ 1245 Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \ 1246 Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \ 1247 Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \ 1248 Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \ 1249 Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \ 1250 Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \ 1251 Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \ 1252 Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \ 1253 Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \ 1254 Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \ 1255 Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \ 1256 Mul_End(13, 15) 1257} 1258#endif 1259 1260// ******************************************************** 1261 1262#if CRYPTOPP_INTEGER_SSE2 1263 1264CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff}; 1265 1266#undef Mul_Begin 1267#undef Mul_Acc 1268#undef Top_Begin 1269#undef Top_Acc 1270#undef Squ_Acc 1271#undef Squ_NonDiag 1272#undef Squ_Diag 1273#undef Squ_SaveAcc 1274#undef Squ_Begin 1275#undef Mul_SaveAcc 1276#undef Bot_Acc 1277#undef Bot_SaveAcc 1278#undef Bot_End 1279#undef Squ_End 1280#undef Mul_End 1281 1282#define SSE2_FinalSave(k) \ 1283 AS2( psllq xmm5, 16) \ 1284 AS2( paddq xmm4, xmm5) \ 1285 AS2( movq QWORD PTR [ecx+8*(k)], xmm4) 1286 1287#define SSE2_SaveShift(k) \ 1288 AS2( movq xmm0, xmm6) \ 1289 AS2( punpckhqdq xmm6, xmm0) \ 1290 AS2( movq xmm1, xmm7) \ 1291 AS2( punpckhqdq xmm7, xmm1) \ 1292 AS2( paddd xmm6, xmm0) \ 1293 AS2( pslldq xmm6, 4) \ 1294 AS2( paddd xmm7, xmm1) \ 1295 AS2( paddd xmm4, xmm6) \ 1296 AS2( pslldq xmm7, 4) \ 1297 AS2( movq xmm6, xmm4) \ 1298 AS2( paddd xmm5, xmm7) \ 1299 AS2( movq xmm7, xmm5) \ 1300 AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \ 1301 AS2( psrlq xmm6, 16) \ 1302 AS2( paddq xmm6, xmm7) \ 1303 AS2( punpckhqdq xmm4, xmm0) \ 1304 AS2( punpckhqdq xmm5, xmm0) \ 1305 AS2( movq QWORD PTR [ecx+8*(k)+2], xmm6) \ 1306 AS2( psrlq xmm6, 3*16) \ 1307 AS2( paddd xmm4, xmm6) \ 1308 1309#define Squ_SSE2_SaveShift(k) \ 1310 AS2( movq xmm0, xmm6) \ 1311 AS2( punpckhqdq xmm6, xmm0) \ 1312 AS2( movq xmm1, xmm7) \ 1313 AS2( punpckhqdq xmm7, xmm1) \ 1314 AS2( paddd xmm6, xmm0) \ 1315 AS2( pslldq xmm6, 4) \ 1316 AS2( paddd xmm7, xmm1) \ 1317 AS2( paddd xmm4, xmm6) \ 1318 AS2( pslldq xmm7, 4) \ 1319 AS2( movhlps xmm6, xmm4) \ 1320 AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \ 1321 AS2( paddd xmm5, xmm7) \ 1322 AS2( movhps QWORD PTR [esp+12], xmm5)\ 1323 AS2( psrlq xmm4, 16) \ 1324 AS2( paddq xmm4, xmm5) \ 1325 AS2( movq QWORD PTR [ecx+8*(k)+2], xmm4) \ 1326 AS2( psrlq xmm4, 3*16) \ 1327 AS2( paddd xmm4, xmm6) \ 1328 AS2( movq QWORD PTR [esp+4], xmm4)\ 1329 1330#define SSE2_FirstMultiply(i) \ 1331 AS2( movdqa xmm7, [esi+(i)*16])\ 1332 AS2( movdqa xmm5, [edi-(i)*16])\ 1333 AS2( pmuludq xmm5, xmm7) \ 1334 AS2( movdqa xmm4, [ebx])\ 1335 AS2( movdqa xmm6, xmm4) \ 1336 AS2( pand xmm4, xmm5) \ 1337 AS2( psrld xmm5, 16) \ 1338 AS2( pmuludq xmm7, [edx-(i)*16])\ 1339 AS2( pand xmm6, xmm7) \ 1340 AS2( psrld xmm7, 16) 1341 1342#define Squ_Begin(n) \ 1343 SquPrologue \ 1344 AS2( mov esi, esp)\ 1345 AS2( and esp, 0xfffffff0)\ 1346 AS2( lea edi, [esp-32*n])\ 1347 AS2( sub esp, 32*n+16)\ 1348 AS1( push esi)\ 1349 AS2( mov esi, edi) \ 1350 AS2( xor edx, edx) \ 1351 ASL(1) \ 1352 ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \ 1353 ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \ 1354 AS2( movdqa [edi+2*edx], xmm0) \ 1355 AS2( psrlq xmm0, 32) \ 1356 AS2( movdqa [edi+2*edx+16], xmm0) \ 1357 AS2( movdqa [edi+16*n+2*edx], xmm1) \ 1358 AS2( psrlq xmm1, 32) \ 1359 AS2( movdqa [edi+16*n+2*edx+16], xmm1) \ 1360 AS2( add edx, 16) \ 1361 AS2( cmp edx, 8*(n)) \ 1362 ASJ( jne, 1, b) \ 1363 AS2( lea edx, [edi+16*n])\ 1364 SSE2_FirstMultiply(0) \ 1365 1366#define Squ_Acc(i) \ 1367 ASL(LSqu##i) \ 1368 AS2( movdqa xmm1, [esi+(i)*16]) \ 1369 AS2( movdqa xmm0, [edi-(i)*16]) \ 1370 AS2( movdqa xmm2, [ebx]) \ 1371 AS2( pmuludq xmm0, xmm1) \ 1372 AS2( pmuludq xmm1, [edx-(i)*16]) \ 1373 AS2( movdqa xmm3, xmm2) \ 1374 AS2( pand xmm2, xmm0) \ 1375 AS2( psrld xmm0, 16) \ 1376 AS2( paddd xmm4, xmm2) \ 1377 AS2( paddd xmm5, xmm0) \ 1378 AS2( pand xmm3, xmm1) \ 1379 AS2( psrld xmm1, 16) \ 1380 AS2( paddd xmm6, xmm3) \ 1381 AS2( paddd xmm7, xmm1) \ 1382 1383#define Squ_Acc1(i) 1384#define Squ_Acc2(i) ASC(call, LSqu##i) 1385#define Squ_Acc3(i) Squ_Acc2(i) 1386#define Squ_Acc4(i) Squ_Acc2(i) 1387#define Squ_Acc5(i) Squ_Acc2(i) 1388#define Squ_Acc6(i) Squ_Acc2(i) 1389#define Squ_Acc7(i) Squ_Acc2(i) 1390#define Squ_Acc8(i) Squ_Acc2(i) 1391 1392#define SSE2_End(E, n) \ 1393 SSE2_SaveShift(2*(n)-3) \ 1394 AS2( movdqa xmm7, [esi+16]) \ 1395 AS2( movdqa xmm0, [edi]) \ 1396 AS2( pmuludq xmm0, xmm7) \ 1397 AS2( movdqa xmm2, [ebx]) \ 1398 AS2( pmuludq xmm7, [edx]) \ 1399 AS2( movdqa xmm6, xmm2) \ 1400 AS2( pand xmm2, xmm0) \ 1401 AS2( psrld xmm0, 16) \ 1402 AS2( paddd xmm4, xmm2) \ 1403 AS2( paddd xmm5, xmm0) \ 1404 AS2( pand xmm6, xmm7) \ 1405 AS2( psrld xmm7, 16) \ 1406 SSE2_SaveShift(2*(n)-2) \ 1407 SSE2_FinalSave(2*(n)-1) \ 1408 AS1( pop esp)\ 1409 E 1410 1411#define Squ_End(n) SSE2_End(SquEpilogue, n) 1412#define Mul_End(n) SSE2_End(MulEpilogue, n) 1413#define Top_End(n) SSE2_End(TopEpilogue, n) 1414 1415#define Squ_Column1(k, i) \ 1416 Squ_SSE2_SaveShift(k) \ 1417 AS2( add esi, 16) \ 1418 SSE2_FirstMultiply(1)\ 1419 Squ_Acc##i(i) \ 1420 AS2( paddd xmm4, xmm4) \ 1421 AS2( paddd xmm5, xmm5) \ 1422 AS2( movdqa xmm3, [esi]) \ 1423 AS2( movq xmm1, QWORD PTR [esi+8]) \ 1424 AS2( pmuludq xmm1, xmm3) \ 1425 AS2( pmuludq xmm3, xmm3) \ 1426 AS2( movdqa xmm0, [ebx])\ 1427 AS2( movdqa xmm2, xmm0) \ 1428 AS2( pand xmm0, xmm1) \ 1429 AS2( psrld xmm1, 16) \ 1430 AS2( paddd xmm6, xmm0) \ 1431 AS2( paddd xmm7, xmm1) \ 1432 AS2( pand xmm2, xmm3) \ 1433 AS2( psrld xmm3, 16) \ 1434 AS2( paddd xmm6, xmm6) \ 1435 AS2( paddd xmm7, xmm7) \ 1436 AS2( paddd xmm4, xmm2) \ 1437 AS2( paddd xmm5, xmm3) \ 1438 AS2( movq xmm0, QWORD PTR [esp+4])\ 1439 AS2( movq xmm1, QWORD PTR [esp+12])\ 1440 AS2( paddd xmm4, xmm0)\ 1441 AS2( paddd xmm5, xmm1)\ 1442 1443#define Squ_Column0(k, i) \ 1444 Squ_SSE2_SaveShift(k) \ 1445 AS2( add edi, 16) \ 1446 AS2( add edx, 16) \ 1447 SSE2_FirstMultiply(1)\ 1448 Squ_Acc##i(i) \ 1449 AS2( paddd xmm6, xmm6) \ 1450 AS2( paddd xmm7, xmm7) \ 1451 AS2( paddd xmm4, xmm4) \ 1452 AS2( paddd xmm5, xmm5) \ 1453 AS2( movq xmm0, QWORD PTR [esp+4])\ 1454 AS2( movq xmm1, QWORD PTR [esp+12])\ 1455 AS2( paddd xmm4, xmm0)\ 1456 AS2( paddd xmm5, xmm1)\ 1457 1458#define SSE2_MulAdd45 \ 1459 AS2( movdqa xmm7, [esi]) \ 1460 AS2( movdqa xmm0, [edi]) \ 1461 AS2( pmuludq xmm0, xmm7) \ 1462 AS2( movdqa xmm2, [ebx]) \ 1463 AS2( pmuludq xmm7, [edx]) \ 1464 AS2( movdqa xmm6, xmm2) \ 1465 AS2( pand xmm2, xmm0) \ 1466 AS2( psrld xmm0, 16) \ 1467 AS2( paddd xmm4, xmm2) \ 1468 AS2( paddd xmm5, xmm0) \ 1469 AS2( pand xmm6, xmm7) \ 1470 AS2( psrld xmm7, 16) 1471 1472#define Mul_Begin(n) \ 1473 MulPrologue \ 1474 AS2( mov esi, esp)\ 1475 AS2( and esp, 0xfffffff0)\ 1476 AS2( sub esp, 48*n+16)\ 1477 AS1( push esi)\ 1478 AS2( xor edx, edx) \ 1479 ASL(1) \ 1480 ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \ 1481 ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \ 1482 ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \ 1483 AS2( movdqa [esp+20+2*edx], xmm0) \ 1484 AS2( psrlq xmm0, 32) \ 1485 AS2( movdqa [esp+20+2*edx+16], xmm0) \ 1486 AS2( movdqa [esp+20+16*n+2*edx], xmm1) \ 1487 AS2( psrlq xmm1, 32) \ 1488 AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \ 1489 AS2( movdqa [esp+20+32*n+2*edx], xmm2) \ 1490 AS2( psrlq xmm2, 32) \ 1491 AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \ 1492 AS2( add edx, 16) \ 1493 AS2( cmp edx, 8*(n)) \ 1494 ASJ( jne, 1, b) \ 1495 AS2( lea edi, [esp+20])\ 1496 AS2( lea edx, [esp+20+16*n])\ 1497 AS2( lea esi, [esp+20+32*n])\ 1498 SSE2_FirstMultiply(0) \ 1499 1500#define Mul_Acc(i) \ 1501 ASL(LMul##i) \ 1502 AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \ 1503 AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \ 1504 AS2( movdqa xmm2, [ebx]) \ 1505 AS2( pmuludq xmm0, xmm1) \ 1506 AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \ 1507 AS2( movdqa xmm3, xmm2) \ 1508 AS2( pand xmm2, xmm0) \ 1509 AS2( psrld xmm0, 16) \ 1510 AS2( paddd xmm4, xmm2) \ 1511 AS2( paddd xmm5, xmm0) \ 1512 AS2( pand xmm3, xmm1) \ 1513 AS2( psrld xmm1, 16) \ 1514 AS2( paddd xmm6, xmm3) \ 1515 AS2( paddd xmm7, xmm1) \ 1516 1517#define Mul_Acc1(i) 1518#define Mul_Acc2(i) ASC(call, LMul##i) 1519#define Mul_Acc3(i) Mul_Acc2(i) 1520#define Mul_Acc4(i) Mul_Acc2(i) 1521#define Mul_Acc5(i) Mul_Acc2(i) 1522#define Mul_Acc6(i) Mul_Acc2(i) 1523#define Mul_Acc7(i) Mul_Acc2(i) 1524#define Mul_Acc8(i) Mul_Acc2(i) 1525#define Mul_Acc9(i) Mul_Acc2(i) 1526#define Mul_Acc10(i) Mul_Acc2(i) 1527#define Mul_Acc11(i) Mul_Acc2(i) 1528#define Mul_Acc12(i) Mul_Acc2(i) 1529#define Mul_Acc13(i) Mul_Acc2(i) 1530#define Mul_Acc14(i) Mul_Acc2(i) 1531#define Mul_Acc15(i) Mul_Acc2(i) 1532#define Mul_Acc16(i) Mul_Acc2(i) 1533 1534#define Mul_Column1(k, i) \ 1535 SSE2_SaveShift(k) \ 1536 AS2( add esi, 16) \ 1537 SSE2_MulAdd45\ 1538 Mul_Acc##i(i) \ 1539 1540#define Mul_Column0(k, i) \ 1541 SSE2_SaveShift(k) \ 1542 AS2( add edi, 16) \ 1543 AS2( add edx, 16) \ 1544 SSE2_MulAdd45\ 1545 Mul_Acc##i(i) \ 1546 1547#define Bot_Acc(i) \ 1548 AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \ 1549 AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \ 1550 AS2( pmuludq xmm0, xmm1) \ 1551 AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \ 1552 AS2( paddq xmm4, xmm0) \ 1553 AS2( paddd xmm6, xmm1) 1554 1555#define Bot_SaveAcc(k) \ 1556 SSE2_SaveShift(k) \ 1557 AS2( add edi, 16) \ 1558 AS2( add edx, 16) \ 1559 AS2( movdqa xmm6, [esi]) \ 1560 AS2( movdqa xmm0, [edi]) \ 1561 AS2( pmuludq xmm0, xmm6) \ 1562 AS2( paddq xmm4, xmm0) \ 1563 AS2( psllq xmm5, 16) \ 1564 AS2( paddq xmm4, xmm5) \ 1565 AS2( pmuludq xmm6, [edx]) 1566 1567#define Bot_End(n) \ 1568 AS2( movhlps xmm7, xmm6) \ 1569 AS2( paddd xmm6, xmm7) \ 1570 AS2( psllq xmm6, 32) \ 1571 AS2( paddd xmm4, xmm6) \ 1572 AS2( movq QWORD PTR [ecx+8*((n)-1)], xmm4) \ 1573 AS1( pop esp)\ 1574 MulEpilogue 1575 1576#define Top_Begin(n) \ 1577 TopPrologue \ 1578 AS2( mov edx, esp)\ 1579 AS2( and esp, 0xfffffff0)\ 1580 AS2( sub esp, 48*n+16)\ 1581 AS1( push edx)\ 1582 AS2( xor edx, edx) \ 1583 ASL(1) \ 1584 ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \ 1585 ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \ 1586 ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \ 1587 AS2( movdqa [esp+20+2*edx], xmm0) \ 1588 AS2( psrlq xmm0, 32) \ 1589 AS2( movdqa [esp+20+2*edx+16], xmm0) \ 1590 AS2( movdqa [esp+20+16*n+2*edx], xmm1) \ 1591 AS2( psrlq xmm1, 32) \ 1592 AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \ 1593 AS2( movdqa [esp+20+32*n+2*edx], xmm2) \ 1594 AS2( psrlq xmm2, 32) \ 1595 AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \ 1596 AS2( add edx, 16) \ 1597 AS2( cmp edx, 8*(n)) \ 1598 ASJ( jne, 1, b) \ 1599 AS2( mov eax, esi) \ 1600 AS2( lea edi, [esp+20+00*n+16*(n/2-1)])\ 1601 AS2( lea edx, [esp+20+16*n+16*(n/2-1)])\ 1602 AS2( lea esi, [esp+20+32*n+16*(n/2-1)])\ 1603 AS2( pxor xmm4, xmm4)\ 1604 AS2( pxor xmm5, xmm5) 1605 1606#define Top_Acc(i) \ 1607 AS2( movq xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8]) \ 1608 AS2( pmuludq xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \ 1609 AS2( psrlq xmm0, 48) \ 1610 AS2( paddd xmm5, xmm0)\ 1611 1612#define Top_Column0(i) \ 1613 AS2( psllq xmm5, 32) \ 1614 AS2( add edi, 16) \ 1615 AS2( add edx, 16) \ 1616 SSE2_MulAdd45\ 1617 Mul_Acc##i(i) \ 1618 1619#define Top_Column1(i) \ 1620 SSE2_SaveShift(0) \ 1621 AS2( add esi, 16) \ 1622 SSE2_MulAdd45\ 1623 Mul_Acc##i(i) \ 1624 AS2( shr eax, 16) \ 1625 AS2( movd xmm0, eax)\ 1626 AS2( movd xmm1, [ecx+4])\ 1627 AS2( psrld xmm1, 16)\ 1628 AS2( pcmpgtd xmm1, xmm0)\ 1629 AS2( psrld xmm1, 31)\ 1630 AS2( paddd xmm4, xmm1)\ 1631 1632void SSE2_Square4(word *C, const word *A) 1633{ 1634 Squ_Begin(2) 1635 Squ_Column0(0, 1) 1636 Squ_End(2) 1637} 1638 1639void SSE2_Square8(word *C, const word *A) 1640{ 1641 Squ_Begin(4) 1642#ifndef __GNUC__ 1643 ASJ( jmp, 0, f) 1644 Squ_Acc(2) 1645 AS1( ret) ASL(0) 1646#endif 1647 Squ_Column0(0, 1) 1648 Squ_Column1(1, 1) 1649 Squ_Column0(2, 2) 1650 Squ_Column1(3, 1) 1651 Squ_Column0(4, 1) 1652 Squ_End(4) 1653} 1654 1655void SSE2_Square16(word *C, const word *A) 1656{ 1657 Squ_Begin(8) 1658#ifndef __GNUC__ 1659 ASJ( jmp, 0, f) 1660 Squ_Acc(4) Squ_Acc(3) Squ_Acc(2) 1661 AS1( ret) ASL(0) 1662#endif 1663 Squ_Column0(0, 1) 1664 Squ_Column1(1, 1) 1665 Squ_Column0(2, 2) 1666 Squ_Column1(3, 2) 1667 Squ_Column0(4, 3) 1668 Squ_Column1(5, 3) 1669 Squ_Column0(6, 4) 1670 Squ_Column1(7, 3) 1671 Squ_Column0(8, 3) 1672 Squ_Column1(9, 2) 1673 Squ_Column0(10, 2) 1674 Squ_Column1(11, 1) 1675 Squ_Column0(12, 1) 1676 Squ_End(8) 1677} 1678 1679void SSE2_Square32(word *C, const word *A) 1680{ 1681 Squ_Begin(16) 1682 ASJ( jmp, 0, f) 1683 Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2) 1684 AS1( ret) ASL(0) 1685 Squ_Column0(0, 1) 1686 Squ_Column1(1, 1) 1687 Squ_Column0(2, 2) 1688 Squ_Column1(3, 2) 1689 Squ_Column0(4, 3) 1690 Squ_Column1(5, 3) 1691 Squ_Column0(6, 4) 1692 Squ_Column1(7, 4) 1693 Squ_Column0(8, 5) 1694 Squ_Column1(9, 5) 1695 Squ_Column0(10, 6) 1696 Squ_Column1(11, 6) 1697 Squ_Column0(12, 7) 1698 Squ_Column1(13, 7) 1699 Squ_Column0(14, 8) 1700 Squ_Column1(15, 7) 1701 Squ_Column0(16, 7) 1702 Squ_Column1(17, 6) 1703 Squ_Column0(18, 6) 1704 Squ_Column1(19, 5) 1705 Squ_Column0(20, 5) 1706 Squ_Column1(21, 4) 1707 Squ_Column0(22, 4) 1708 Squ_Column1(23, 3) 1709 Squ_Column0(24, 3) 1710 Squ_Column1(25, 2) 1711 Squ_Column0(26, 2) 1712 Squ_Column1(27, 1) 1713 Squ_Column0(28, 1) 1714 Squ_End(16) 1715} 1716 1717void SSE2_Multiply4(word *C, const word *A, const word *B) 1718{ 1719 Mul_Begin(2) 1720#ifndef __GNUC__ 1721 ASJ( jmp, 0, f) 1722 Mul_Acc(2) 1723 AS1( ret) ASL(0) 1724#endif 1725 Mul_Column0(0, 2) 1726 Mul_End(2) 1727} 1728 1729void SSE2_Multiply8(word *C, const word *A, const word *B) 1730{ 1731 Mul_Begin(4) 1732#ifndef __GNUC__ 1733 ASJ( jmp, 0, f) 1734 Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) 1735 AS1( ret) ASL(0) 1736#endif 1737 Mul_Column0(0, 2) 1738 Mul_Column1(1, 3) 1739 Mul_Column0(2, 4) 1740 Mul_Column1(3, 3) 1741 Mul_Column0(4, 2) 1742 Mul_End(4) 1743} 1744 1745void SSE2_Multiply16(word *C, const word *A, const word *B) 1746{ 1747 Mul_Begin(8) 1748#ifndef __GNUC__ 1749 ASJ( jmp, 0, f) 1750 Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) 1751 AS1( ret) ASL(0) 1752#endif 1753 Mul_Column0(0, 2) 1754 Mul_Column1(1, 3) 1755 Mul_Column0(2, 4) 1756 Mul_Column1(3, 5) 1757 Mul_Column0(4, 6) 1758 Mul_Column1(5, 7) 1759 Mul_Column0(6, 8) 1760 Mul_Column1(7, 7) 1761 Mul_Column0(8, 6) 1762 Mul_Column1(9, 5) 1763 Mul_Column0(10, 4) 1764 Mul_Column1(11, 3) 1765 Mul_Column0(12, 2) 1766 Mul_End(8) 1767} 1768 1769void SSE2_Multiply32(word *C, const word *A, const word *B) 1770{ 1771 Mul_Begin(16) 1772 ASJ( jmp, 0, f) 1773 Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) 1774 AS1( ret) ASL(0) 1775 Mul_Column0(0, 2) 1776 Mul_Column1(1, 3) 1777 Mul_Column0(2, 4) 1778 Mul_Column1(3, 5) 1779 Mul_Column0(4, 6) 1780 Mul_Column1(5, 7) 1781 Mul_Column0(6, 8) 1782 Mul_Column1(7, 9) 1783 Mul_Column0(8, 10) 1784 Mul_Column1(9, 11) 1785 Mul_Column0(10, 12) 1786 Mul_Column1(11, 13) 1787 Mul_Column0(12, 14) 1788 Mul_Column1(13, 15) 1789 Mul_Column0(14, 16) 1790 Mul_Column1(15, 15) 1791 Mul_Column0(16, 14) 1792 Mul_Column1(17, 13) 1793 Mul_Column0(18, 12) 1794 Mul_Column1(19, 11) 1795 Mul_Column0(20, 10) 1796 Mul_Column1(21, 9) 1797 Mul_Column0(22, 8) 1798 Mul_Column1(23, 7) 1799 Mul_Column0(24, 6) 1800 Mul_Column1(25, 5) 1801 Mul_Column0(26, 4) 1802 Mul_Column1(27, 3) 1803 Mul_Column0(28, 2) 1804 Mul_End(16) 1805} 1806 1807void SSE2_MultiplyBottom4(word *C, const word *A, const word *B) 1808{ 1809 Mul_Begin(2) 1810 Bot_SaveAcc(0) Bot_Acc(2) 1811 Bot_End(2) 1812} 1813 1814void SSE2_MultiplyBottom8(word *C, const word *A, const word *B) 1815{ 1816 Mul_Begin(4) 1817#ifndef __GNUC__ 1818 ASJ( jmp, 0, f) 1819 Mul_Acc(3) Mul_Acc(2) 1820 AS1( ret) ASL(0) 1821#endif 1822 Mul_Column0(0, 2) 1823 Mul_Column1(1, 3) 1824 Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2) 1825 Bot_End(4) 1826} 1827 1828void SSE2_MultiplyBottom16(word *C, const word *A, const word *B) 1829{ 1830 Mul_Begin(8) 1831#ifndef __GNUC__ 1832 ASJ( jmp, 0, f) 1833 Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) 1834 AS1( ret) ASL(0) 1835#endif 1836 Mul_Column0(0, 2) 1837 Mul_Column1(1, 3) 1838 Mul_Column0(2, 4) 1839 Mul_Column1(3, 5) 1840 Mul_Column0(4, 6) 1841 Mul_Column1(5, 7) 1842 Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2) 1843 Bot_End(8) 1844} 1845 1846void SSE2_MultiplyBottom32(word *C, const word *A, const word *B) 1847{ 1848 Mul_Begin(16) 1849#ifndef __GNUC__ 1850 ASJ( jmp, 0, f) 1851 Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) 1852 AS1( ret) ASL(0) 1853#endif 1854 Mul_Column0(0, 2) 1855 Mul_Column1(1, 3) 1856 Mul_Column0(2, 4) 1857 Mul_Column1(3, 5) 1858 Mul_Column0(4, 6) 1859 Mul_Column1(5, 7) 1860 Mul_Column0(6, 8) 1861 Mul_Column1(7, 9) 1862 Mul_Column0(8, 10) 1863 Mul_Column1(9, 11) 1864 Mul_Column0(10, 12) 1865 Mul_Column1(11, 13) 1866 Mul_Column0(12, 14) 1867 Mul_Column1(13, 15) 1868 Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2) 1869 Bot_End(16) 1870} 1871 1872void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L) 1873{ 1874 Top_Begin(4) 1875 Top_Acc(3) Top_Acc(2) Top_Acc(1) 1876#ifndef __GNUC__ 1877 ASJ( jmp, 0, f) 1878 Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) 1879 AS1( ret) ASL(0) 1880#endif 1881 Top_Column0(4) 1882 Top_Column1(3) 1883 Mul_Column0(0, 2) 1884 Top_End(2) 1885} 1886 1887void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L) 1888{ 1889 Top_Begin(8) 1890 Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1) 1891#ifndef __GNUC__ 1892 ASJ( jmp, 0, f) 1893 Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) 1894 AS1( ret) ASL(0) 1895#endif 1896 Top_Column0(8) 1897 Top_Column1(7) 1898 Mul_Column0(0, 6) 1899 Mul_Column1(1, 5) 1900 Mul_Column0(2, 4) 1901 Mul_Column1(3, 3) 1902 Mul_Column0(4, 2) 1903 Top_End(4) 1904} 1905 1906void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L) 1907{ 1908 Top_Begin(16) 1909 Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1) 1910#ifndef __GNUC__ 1911 ASJ( jmp, 0, f) 1912 Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) 1913 AS1( ret) ASL(0) 1914#endif 1915 Top_Column0(16) 1916 Top_Column1(15) 1917 Mul_Column0(0, 14) 1918 Mul_Column1(1, 13) 1919 Mul_Column0(2, 12) 1920 Mul_Column1(3, 11) 1921 Mul_Column0(4, 10) 1922 Mul_Column1(5, 9) 1923 Mul_Column0(6, 8) 1924 Mul_Column1(7, 7) 1925 Mul_Column0(8, 6) 1926 Mul_Column1(9, 5) 1927 Mul_Column0(10, 4) 1928 Mul_Column1(11, 3) 1929 Mul_Column0(12, 2) 1930 Top_End(8) 1931} 1932 1933#endif // #if CRYPTOPP_INTEGER_SSE2 1934 1935// ******************************************************** 1936 1937typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B); 1938typedef void (* PMul)(word *C, const word *A, const word *B); 1939typedef void (* PSqu)(word *C, const word *A); 1940typedef void (* PMulTop)(word *C, const word *A, const word *B, word L); 1941 1942#if CRYPTOPP_INTEGER_SSE2 1943static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub; 1944static size_t s_recursionLimit = 8; 1945#else 1946static const size_t s_recursionLimit = 16; 1947#endif 1948 1949static PMul s_pMul[9], s_pBot[9]; 1950static PSqu s_pSqu[9]; 1951static PMulTop s_pTop[9]; 1952 1953static void SetFunctionPointers() 1954{ 1955 s_pMul[0] = &Baseline_Multiply2; 1956 s_pBot[0] = &Baseline_MultiplyBottom2; 1957 s_pSqu[0] = &Baseline_Square2; 1958 s_pTop[0] = &Baseline_MultiplyTop2; 1959 s_pTop[1] = &Baseline_MultiplyTop4; 1960 1961#if CRYPTOPP_INTEGER_SSE2 1962 if (HasSSE2()) 1963 { 1964#if _MSC_VER != 1200 || defined(NDEBUG) 1965 if (IsP4()) 1966 { 1967 s_pAdd = &SSE2_Add; 1968 s_pSub = &SSE2_Sub; 1969 } 1970#endif 1971 1972 s_recursionLimit = 32; 1973 1974 s_pMul[1] = &SSE2_Multiply4; 1975 s_pMul[2] = &SSE2_Multiply8; 1976 s_pMul[4] = &SSE2_Multiply16; 1977 s_pMul[8] = &SSE2_Multiply32; 1978 1979 s_pBot[1] = &SSE2_MultiplyBottom4; 1980 s_pBot[2] = &SSE2_MultiplyBottom8; 1981 s_pBot[4] = &SSE2_MultiplyBottom16; 1982 s_pBot[8] = &SSE2_MultiplyBottom32; 1983 1984 s_pSqu[1] = &SSE2_Square4; 1985 s_pSqu[2] = &SSE2_Square8; 1986 s_pSqu[4] = &SSE2_Square16; 1987 s_pSqu[8] = &SSE2_Square32; 1988 1989 s_pTop[2] = &SSE2_MultiplyTop8; 1990 s_pTop[4] = &SSE2_MultiplyTop16; 1991 s_pTop[8] = &SSE2_MultiplyTop32; 1992 } 1993 else 1994#endif 1995 { 1996 s_pMul[1] = &Baseline_Multiply4; 1997 s_pMul[2] = &Baseline_Multiply8; 1998 1999 s_pBot[1] = &Baseline_MultiplyBottom4; 2000 s_pBot[2] = &Baseline_MultiplyBottom8; 2001 2002 s_pSqu[1] = &Baseline_Square4; 2003 s_pSqu[2] = &Baseline_Square8; 2004 2005 s_pTop[2] = &Baseline_MultiplyTop8; 2006 2007#if !CRYPTOPP_INTEGER_SSE2 2008 s_pMul[4] = &Baseline_Multiply16; 2009 s_pBot[4] = &Baseline_MultiplyBottom16; 2010 s_pSqu[4] = &Baseline_Square16; 2011 s_pTop[4] = &Baseline_MultiplyTop16; 2012#endif 2013 } 2014} 2015 2016inline int Add(word *C, const word *A, const word *B, size_t N) 2017{ 2018#if CRYPTOPP_INTEGER_SSE2 2019 return s_pAdd(N, C, A, B); 2020#else 2021 return Baseline_Add(N, C, A, B); 2022#endif 2023} 2024 2025inline int Subtract(word *C, const word *A, const word *B, size_t N) 2026{ 2027#if CRYPTOPP_INTEGER_SSE2 2028 return s_pSub(N, C, A, B); 2029#else 2030 return Baseline_Sub(N, C, A, B); 2031#endif 2032} 2033 2034// ******************************************************** 2035 2036 2037#define A0 A 2038#define A1 (A+N2) 2039#define B0 B 2040#define B1 (B+N2) 2041 2042#define T0 T 2043#define T1 (T+N2) 2044#define T2 (T+N) 2045#define T3 (T+N+N2) 2046 2047#define R0 R 2048#define R1 (R+N2) 2049#define R2 (R+N) 2050#define R3 (R+N+N2) 2051 2052// R[2*N] - result = A*B 2053// T[2*N] - temporary work space 2054// A[N] --- multiplier 2055// B[N] --- multiplicant 2056 2057void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N) 2058{ 2059 assert(N>=2 && N%2==0); 2060 2061 if (N <= s_recursionLimit) 2062 s_pMul[N/4](R, A, B); 2063 else 2064 { 2065 const size_t N2 = N/2; 2066 2067 size_t AN2 = Compare(A0, A1, N2) > 0 ? 0 : N2; 2068 Subtract(R0, A + AN2, A + (N2 ^ AN2), N2); 2069 2070 size_t BN2 = Compare(B0, B1, N2) > 0 ? 0 : N2; 2071 Subtract(R1, B + BN2, B + (N2 ^ BN2), N2); 2072 2073 RecursiveMultiply(R2, T2, A1, B1, N2); 2074 RecursiveMultiply(T0, T2, R0, R1, N2); 2075 RecursiveMultiply(R0, T2, A0, B0, N2); 2076 2077 // now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1 2078 2079 int c2 = Add(R2, R2, R1, N2); 2080 int c3 = c2; 2081 c2 += Add(R1, R2, R0, N2); 2082 c3 += Add(R2, R2, R3, N2); 2083 2084 if (AN2 == BN2) 2085 c3 -= Subtract(R1, R1, T0, N); 2086 else 2087 c3 += Add(R1, R1, T0, N); 2088 2089 c3 += Increment(R2, N2, c2); 2090 assert (c3 >= 0 && c3 <= 2); 2091 Increment(R3, N2, c3); 2092 } 2093} 2094 2095// R[2*N] - result = A*A 2096// T[2*N] - temporary work space 2097// A[N] --- number to be squared 2098 2099void RecursiveSquare(word *R, word *T, const word *A, size_t N) 2100{ 2101 assert(N && N%2==0); 2102 2103 if (N <= s_recursionLimit) 2104 s_pSqu[N/4](R, A); 2105 else 2106 { 2107 const size_t N2 = N/2; 2108 2109 RecursiveSquare(R0, T2, A0, N2); 2110 RecursiveSquare(R2, T2, A1, N2); 2111 RecursiveMultiply(T0, T2, A0, A1, N2); 2112 2113 int carry = Add(R1, R1, T0, N); 2114 carry += Add(R1, R1, T0, N); 2115 Increment(R3, N2, carry); 2116 } 2117} 2118 2119// R[N] - bottom half of A*B 2120// T[3*N/2] - temporary work space 2121// A[N] - multiplier 2122// B[N] - multiplicant 2123 2124void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N) 2125{ 2126 assert(N>=2 && N%2==0); 2127 2128 if (N <= s_recursionLimit) 2129 s_pBot[N/4](R, A, B); 2130 else 2131 { 2132 const size_t N2 = N/2; 2133 2134 RecursiveMultiply(R, T, A0, B0, N2); 2135 RecursiveMultiplyBottom(T0, T1, A1, B0, N2); 2136 Add(R1, R1, T0, N2); 2137 RecursiveMultiplyBottom(T0, T1, A0, B1, N2); 2138 Add(R1, R1, T0, N2); 2139 } 2140} 2141 2142// R[N] --- upper half of A*B 2143// T[2*N] - temporary work space 2144// L[N] --- lower half of A*B 2145// A[N] --- multiplier 2146// B[N] --- multiplicant 2147 2148void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N) 2149{ 2150 assert(N>=2 && N%2==0); 2151 2152 if (N <= s_recursionLimit) 2153 s_pTop[N/4](R, A, B, L[N-1]); 2154 else 2155 { 2156 const size_t N2 = N/2; 2157 2158 size_t AN2 = Compare(A0, A1, N2) > 0 ? 0 : N2; 2159 Subtract(R0, A + AN2, A + (N2 ^ AN2), N2); 2160 2161 size_t BN2 = Compare(B0, B1, N2) > 0 ? 0 : N2; 2162 Subtract(R1, B + BN2, B + (N2 ^ BN2), N2); 2163 2164 RecursiveMultiply(T0, T2, R0, R1, N2); 2165 RecursiveMultiply(R0, T2, A1, B1, N2); 2166 2167 // now T[01] holds (A1-A0)*(B0-B1) = A1*B0+A0*B1-A1*B1-A0*B0, R[01] holds A1*B1 2168 2169 int t, c3; 2170 int c2 = Subtract(T2, L+N2, L, N2); 2171 2172 if (AN2 == BN2) 2173 { 2174 c2 -= Add(T2, T2, T0, N2); 2175 t = (Compare(T2, R0, N2) == -1); 2176 c3 = t - Subtract(T2, T2, T1, N2); 2177 } 2178 else 2179 { 2180 c2 += Subtract(T2, T2, T0, N2); 2181 t = (Compare(T2, R0, N2) == -1); 2182 c3 = t + Add(T2, T2, T1, N2); 2183 } 2184 2185 c2 += t; 2186 if (c2 >= 0) 2187 c3 += Increment(T2, N2, c2); 2188 else 2189 c3 -= Decrement(T2, N2, -c2); 2190 c3 += Add(R0, T2, R1, N2); 2191 2192 assert (c3 >= 0 && c3 <= 2); 2193 Increment(R1, N2, c3); 2194 } 2195} 2196 2197inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N) 2198{ 2199 RecursiveMultiply(R, T, A, B, N); 2200} 2201 2202inline void Square(word *R, word *T, const word *A, size_t N) 2203{ 2204 RecursiveSquare(R, T, A, N); 2205} 2206 2207inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N) 2208{ 2209 RecursiveMultiplyBottom(R, T, A, B, N); 2210} 2211 2212// R[NA+NB] - result = A*B 2213// T[NA+NB] - temporary work space 2214// A[NA] ---- multiplier 2215// B[NB] ---- multiplicant 2216 2217void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word *B, size_t NB) 2218{ 2219 if (NA == NB) 2220 { 2221 if (A == B) 2222 Square(R, T, A, NA); 2223 else 2224 Multiply(R, T, A, B, NA); 2225 2226 return; 2227 } 2228 2229 if (NA > NB) 2230 { 2231 std::swap(A, B); 2232 std::swap(NA, NB); 2233 } 2234 2235 assert(NB % NA == 0); 2236 2237 if (NA==2 && !A[1]) 2238 { 2239 switch (A[0]) 2240 { 2241 case 0: 2242 SetWords(R, 0, NB+2); 2243 return; 2244 case 1: 2245 CopyWords(R, B, NB); 2246 R[NB] = R[NB+1] = 0; 2247 return; 2248 default: 2249 R[NB] = LinearMultiply(R, B, A[0], NB); 2250 R[NB+1] = 0; 2251 return; 2252 } 2253 } 2254 2255 size_t i; 2256 if ((NB/NA)%2 == 0) 2257 { 2258 Multiply(R, T, A, B, NA); 2259 CopyWords(T+2*NA, R+NA, NA); 2260 2261 for (i=2*NA; i<NB; i+=2*NA) 2262 Multiply(T+NA+i, T, A, B+i, NA); 2263 for (i=NA; i<NB; i+=2*NA) 2264 Multiply(R+i, T, A, B+i, NA); 2265 } 2266 else 2267 { 2268 for (i=0; i<NB; i+=2*NA) 2269 Multiply(R+i, T, A, B+i, NA); 2270 for (i=NA; i<NB; i+=2*NA) 2271 Multiply(T+NA+i, T, A, B+i, NA); 2272 } 2273 2274 if (Add(R+NA, R+NA, T+2*NA, NB-NA)) 2275 Increment(R+NB, NA); 2276} 2277 2278// R[N] ----- result = A inverse mod 2**(WORD_BITS*N) 2279// T[3*N/2] - temporary work space 2280// A[N] ----- an odd number as input 2281 2282void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N) 2283{ 2284 if (N==2) 2285 { 2286 T[0] = AtomicInverseModPower2(A[0]); 2287 T[1] = 0; 2288 s_pBot[0](T+2, T, A); 2289 TwosComplement(T+2, 2); 2290 Increment(T+2, 2, 2); 2291 s_pBot[0](R, T, T+2); 2292 } 2293 else 2294 { 2295 const size_t N2 = N/2; 2296 RecursiveInverseModPower2(R0, T0, A0, N2); 2297 T0[0] = 1; 2298 SetWords(T0+1, 0, N2-1); 2299 MultiplyTop(R1, T1, T0, R0, A0, N2); 2300 MultiplyBottom(T0, T1, R0, A1, N2); 2301 Add(T0, R1, T0, N2); 2302 TwosComplement(T0, N2); 2303 MultiplyBottom(R1, T1, R0, T0, N2); 2304 } 2305} 2306 2307// R[N] --- result = X/(2**(WORD_BITS*N)) mod M 2308// T[3*N] - temporary work space 2309// X[2*N] - number to be reduced 2310// M[N] --- modulus 2311// U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N) 2312 2313void MontgomeryReduce(word *R, word *T, word *X, const word *M, const word *U, size_t N) 2314{ 2315#if 1 2316 MultiplyBottom(R, T, X, U, N); 2317 MultiplyTop(T, T+N, X, R, M, N); 2318 word borrow = Subtract(T, X+N, T, N); 2319 // defend against timing attack by doing this Add even when not needed 2320 word carry = Add(T+N, T, M, N); 2321 assert(carry | !borrow); 2322 CopyWords(R, T + ((0-borrow) & N), N); 2323#elif 0 2324 const word u = 0-U[0]; 2325 Declare2Words(p) 2326 for (size_t i=0; i<N; i++) 2327 { 2328 const word t = u * X[i]; 2329 word c = 0; 2330 for (size_t j=0; j<N; j+=2) 2331 { 2332 MultiplyWords(p, t, M[j]); 2333 Acc2WordsBy1(p, X[i+j]); 2334 Acc2WordsBy1(p, c); 2335 X[i+j] = LowWord(p); 2336 c = HighWord(p); 2337 MultiplyWords(p, t, M[j+1]); 2338 Acc2WordsBy1(p, X[i+j+1]); 2339 Acc2WordsBy1(p, c); 2340 X[i+j+1] = LowWord(p); 2341 c = HighWord(p); 2342 } 2343 2344 if (Increment(X+N+i, N-i, c)) 2345 while (!Subtract(X+N, X+N, M, N)) {} 2346 } 2347 2348 memcpy(R, X+N, N*WORD_SIZE); 2349#else 2350 __m64 u = _mm_cvtsi32_si64(0-U[0]), p; 2351 for (size_t i=0; i<N; i++) 2352 { 2353 __m64 t = _mm_cvtsi32_si64(X[i]); 2354 t = _mm_mul_su32(t, u); 2355 __m64 c = _mm_setzero_si64(); 2356 for (size_t j=0; j<N; j+=2) 2357 { 2358 p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j])); 2359 p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j])); 2360 c = _mm_add_si64(c, p); 2361 X[i+j] = _mm_cvtsi64_si32(c); 2362 c = _mm_srli_si64(c, 32); 2363 p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j+1])); 2364 p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j+1])); 2365 c = _mm_add_si64(c, p); 2366 X[i+j+1] = _mm_cvtsi64_si32(c); 2367 c = _mm_srli_si64(c, 32); 2368 } 2369 2370 if (Increment(X+N+i, N-i, _mm_cvtsi64_si32(c))) 2371 while (!Subtract(X+N, X+N, M, N)) {} 2372 } 2373 2374 memcpy(R, X+N, N*WORD_SIZE); 2375 _mm_empty(); 2376#endif 2377} 2378 2379// R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M 2380// T[2*N] - temporary work space 2381// X[2*N] - number to be reduced 2382// M[N] --- modulus 2383// U[N/2] - multiplicative inverse of M mod 2**(WORD_BITS*N/2) 2384// V[N] --- 2**(WORD_BITS*3*N/2) mod M 2385 2386void HalfMontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, const word *V, size_t N) 2387{ 2388 assert(N%2==0 && N>=4); 2389 2390#define M0 M 2391#define M1 (M+N2) 2392#define V0 V 2393#define V1 (V+N2) 2394 2395#define X0 X 2396#define X1 (X+N2) 2397#define X2 (X+N) 2398#define X3 (X+N+N2) 2399 2400 const size_t N2 = N/2; 2401 Multiply(T0, T2, V0, X3, N2); 2402 int c2 = Add(T0, T0, X0, N); 2403 MultiplyBottom(T3, T2, T0, U, N2); 2404 MultiplyTop(T2, R, T0, T3, M0, N2); 2405 c2 -= Subtract(T2, T1, T2, N2); 2406 Multiply(T0, R, T3, M1, N2); 2407 c2 -= Subtract(T0, T2, T0, N2); 2408 int c3 = -(int)Subtract(T1, X2, T1, N2); 2409 Multiply(R0, T2, V1, X3, N2); 2410 c3 += Add(R, R, T, N); 2411 2412 if (c2>0) 2413 c3 += Increment(R1, N2); 2414 else if (c2<0) 2415 c3 -= Decrement(R1, N2, -c2); 2416 2417 assert(c3>=-1 && c3<=1); 2418 if (c3>0) 2419 Subtract(R, R, M, N); 2420 else if (c3<0) 2421 Add(R, R, M, N); 2422 2423#undef M0 2424#undef M1 2425#undef V0 2426#undef V1 2427 2428#undef X0 2429#undef X1 2430#undef X2 2431#undef X3 2432} 2433 2434#undef A0 2435#undef A1 2436#undef B0 2437#undef B1 2438 2439#undef T0 2440#undef T1 2441#undef T2 2442#undef T3 2443 2444#undef R0 2445#undef R1 2446#undef R2 2447#undef R3 2448 2449/* 2450// do a 3 word by 2 word divide, returns quotient and leaves remainder in A 2451static word SubatomicDivide(word *A, word B0, word B1) 2452{ 2453 // assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word 2454 assert(A[2] < B1 || (A[2]==B1 && A[1] < B0)); 2455 2456 // estimate the quotient: do a 2 word by 1 word divide 2457 word Q; 2458 if (B1+1 == 0) 2459 Q = A[2]; 2460 else 2461 Q = DWord(A[1], A[2]).DividedBy(B1+1); 2462 2463 // now subtract Q*B from A 2464 DWord p = DWord::Multiply(B0, Q); 2465 DWord u = (DWord) A[0] - p.GetLowHalf(); 2466 A[0] = u.GetLowHalf(); 2467 u = (DWord) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - DWord::Multiply(B1, Q); 2468 A[1] = u.GetLowHalf(); 2469 A[2] += u.GetHighHalf(); 2470 2471 // Q <= actual quotient, so fix it 2472 while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0)) 2473 { 2474 u = (DWord) A[0] - B0; 2475 A[0] = u.GetLowHalf(); 2476 u = (DWord) A[1] - B1 - u.GetHighHalfAsBorrow(); 2477 A[1] = u.GetLowHalf(); 2478 A[2] += u.GetHighHalf(); 2479 Q++; 2480 assert(Q); // shouldn't overflow 2481 } 2482 2483 return Q; 2484} 2485 2486// do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1 2487static inline void AtomicDivide(word *Q, const word *A, const word *B) 2488{ 2489 if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS) 2490 { 2491 Q[0] = A[2]; 2492 Q[1] = A[3]; 2493 } 2494 else 2495 { 2496 word T[4]; 2497 T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3]; 2498 Q[1] = SubatomicDivide(T+1, B[0], B[1]); 2499 Q[0] = SubatomicDivide(T, B[0], B[1]); 2500 2501#ifndef NDEBUG 2502 // multiply quotient and divisor and add remainder, make sure it equals dividend 2503 assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0]))); 2504 word P[4]; 2505 LowLevel::Multiply2(P, Q, B); 2506 Add(P, P, T, 4); 2507 assert(memcmp(P, A, 4*WORD_SIZE)==0); 2508#endif 2509 } 2510} 2511*/ 2512 2513static inline void AtomicDivide(word *Q, const word *A, const word *B) 2514{ 2515 word T[4]; 2516 DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1])); 2517 Q[0] = q.GetLowHalf(); 2518 Q[1] = q.GetHighHalf(); 2519 2520#ifndef NDEBUG 2521 if (B[0] || B[1]) 2522 { 2523 // multiply quotient and divisor and add remainder, make sure it equals dividend 2524 assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0]))); 2525 word P[4]; 2526 s_pMul[0](P, Q, B); 2527 Add(P, P, T, 4); 2528 assert(memcmp(P, A, 4*WORD_SIZE)==0); 2529 } 2530#endif 2531} 2532 2533// for use by Divide(), corrects the underestimated quotient {Q1,Q0} 2534static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, size_t N) 2535{ 2536 assert(N && N%2==0); 2537 2538 AsymmetricMultiply(T, T+N+2, Q, 2, B, N); 2539 2540 word borrow = Subtract(R, R, T, N+2); 2541 assert(!borrow && !R[N+1]); 2542 2543 while (R[N] || Compare(R, B, N) >= 0) 2544 { 2545 R[N] -= Subtract(R, R, B, N); 2546 Q[1] += (++Q[0]==0); 2547 assert(Q[0] || Q[1]); // no overflow 2548 } 2549} 2550 2551// R[NB] -------- remainder = A%B 2552// Q[NA-NB+2] --- quotient = A/B 2553// T[NA+3*(NB+2)] - temp work space 2554// A[NA] -------- dividend 2555// B[NB] -------- divisor 2556 2557void Divide(word *R, word *Q, word *T, const word *A, size_t NA, const word *B, size_t NB) 2558{ 2559 assert(NA && NB && NA%2==0 && NB%2==0); 2560 assert(B[NB-1] || B[NB-2]); 2561 assert(NB <= NA); 2562 2563 // set up temporary work space 2564 word *const TA=T; 2565 word *const TB=T+NA+2; 2566 word *const TP=T+NA+2+NB; 2567 2568 // copy B into TB and normalize it so that TB has highest bit set to 1 2569 unsigned shiftWords = (B[NB-1]==0); 2570 TB[0] = TB[NB-1] = 0; 2571 CopyWords(TB+shiftWords, B, NB-shiftWords); 2572 unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]); 2573 assert(shiftBits < WORD_BITS); 2574 ShiftWordsLeftByBits(TB, NB, shiftBits); 2575 2576 // copy A into TA and normalize it 2577 TA[0] = TA[NA] = TA[NA+1] = 0; 2578 CopyWords(TA+shiftWords, A, NA); 2579 ShiftWordsLeftByBits(TA, NA+2, shiftBits); 2580 2581 if (TA[NA+1]==0 && TA[NA] <= 1) 2582 { 2583 Q[NA-NB+1] = Q[NA-NB] = 0; 2584 while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0) 2585 { 2586 TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB); 2587 ++Q[NA-NB]; 2588 } 2589 } 2590 else 2591 { 2592 NA+=2; 2593 assert(Compare(TA+NA-NB, TB, NB) < 0); 2594 } 2595 2596 word BT[2]; 2597 BT[0] = TB[NB-2] + 1; 2598 BT[1] = TB[NB-1] + (BT[0]==0); 2599 2600 // start reducing TA mod TB, 2 words at a time 2601 for (size_t i=NA-2; i>=NB; i-=2) 2602 { 2603 AtomicDivide(Q+i-NB, TA+i-2, BT); 2604 CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB); 2605 } 2606 2607 // copy TA into R, and denormalize it 2608 CopyWords(R, TA+shiftWords, NB); 2609 ShiftWordsRightByBits(R, NB, shiftBits); 2610} 2611 2612static inline size_t EvenWordCount(const word *X, size_t N) 2613{ 2614 while (N && X[N-2]==0 && X[N-1]==0) 2615 N-=2; 2616 return N; 2617} 2618 2619// return k 2620// R[N] --- result = A^(-1) * 2^k mod M 2621// T[4*N] - temporary work space 2622// A[NA] -- number to take inverse of 2623// M[N] --- modulus 2624 2625unsigned int AlmostInverse(word *R, word *T, const word *A, size_t NA, const word *M, size_t N) 2626{ 2627 assert(NA<=N && N && N%2==0); 2628 2629 word *b = T; 2630 word *c = T+N; 2631 word *f = T+2*N; 2632 word *g = T+3*N; 2633 size_t bcLen=2, fgLen=EvenWordCount(M, N); 2634 unsigned int k=0, s=0; 2635 2636 SetWords(T, 0, 3*N); 2637 b[0]=1; 2638 CopyWords(f, A, NA); 2639 CopyWords(g, M, N); 2640 2641 while (1) 2642 { 2643 word t=f[0]; 2644 while (!t) 2645 { 2646 if (EvenWordCount(f, fgLen)==0) 2647 { 2648 SetWords(R, 0, N); 2649 return 0; 2650 } 2651 2652 ShiftWordsRightByWords(f, fgLen, 1); 2653 if (c[bcLen-1]) bcLen+=2; 2654 assert(bcLen <= N); 2655 ShiftWordsLeftByWords(c, bcLen, 1); 2656 k+=WORD_BITS; 2657 t=f[0]; 2658 } 2659 2660 unsigned int i=0; 2661 while (t%2 == 0) 2662 { 2663 t>>=1; 2664 i++; 2665 } 2666 k+=i; 2667 2668 if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2) 2669 { 2670 if (s%2==0) 2671 CopyWords(R, b, N); 2672 else 2673 Subtract(R, M, b, N); 2674 return k; 2675 } 2676 2677 ShiftWordsRightByBits(f, fgLen, i); 2678 t=ShiftWordsLeftByBits(c, bcLen, i); 2679 if (t) 2680 { 2681 c[bcLen] = t; 2682 bcLen+=2; 2683 assert(bcLen <= N); 2684 } 2685 2686 if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0) 2687 fgLen-=2; 2688 2689 if (Compare(f, g, fgLen)==-1) 2690 { 2691 std::swap(f, g); 2692 std::swap(b, c); 2693 s++; 2694 } 2695 2696 Subtract(f, f, g, fgLen); 2697 2698 if (Add(b, b, c, bcLen)) 2699 { 2700 b[bcLen] = 1; 2701 bcLen+=2; 2702 assert(bcLen <= N); 2703 } 2704 } 2705} 2706 2707// R[N] - result = A/(2^k) mod M 2708// A[N] - input 2709// M[N] - modulus 2710 2711void DivideByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N) 2712{ 2713 CopyWords(R, A, N); 2714 2715 while (k--) 2716 { 2717 if (R[0]%2==0) 2718 ShiftWordsRightByBits(R, N, 1); 2719 else 2720 { 2721 word carry = Add(R, R, M, N); 2722 ShiftWordsRightByBits(R, N, 1); 2723 R[N-1] += carry<<(WORD_BITS-1); 2724 } 2725 } 2726} 2727 2728// R[N] - result = A*(2^k) mod M 2729// A[N] - input 2730// M[N] - modulus 2731 2732void MultiplyByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N) 2733{ 2734 CopyWords(R, A, N); 2735 2736 while (k--) 2737 if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0) 2738 Subtract(R, R, M, N); 2739} 2740 2741// ****************************************************************** 2742 2743InitializeInteger::InitializeInteger() 2744{ 2745 if (!g_pAssignIntToInteger) 2746 { 2747 SetFunctionPointers(); 2748 g_pAssignIntToInteger = AssignIntToInteger; 2749 } 2750} 2751 2752static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8}; 2753 2754static inline size_t RoundupSize(size_t n) 2755{ 2756 if (n<=8) 2757 return RoundupSizeTable[n]; 2758 else if (n<=16) 2759 return 16; 2760 else if (n<=32) 2761 return 32; 2762 else if (n<=64) 2763 return 64; 2764 else return size_t(1) << BitPrecision(n-1); 2765} 2766 2767Integer::Integer() 2768 : reg(2), sign(POSITIVE) 2769{ 2770 reg[0] = reg[1] = 0; 2771} 2772 2773Integer::Integer(const Integer& t) 2774 : reg(RoundupSize(t.WordCount())), sign(t.sign) 2775{ 2776 CopyWords(reg, t.reg, reg.size()); 2777} 2778 2779Integer::Integer(Sign s, lword value) 2780 : reg(2), sign(s) 2781{ 2782 reg[0] = word(value); 2783 reg[1] = word(SafeRightShift<WORD_BITS>(value)); 2784} 2785 2786Integer::Integer(signed long value) 2787 : reg(2) 2788{ 2789 if (value >= 0) 2790 sign = POSITIVE; 2791 else 2792 { 2793 sign = NEGATIVE; 2794 value = -value; 2795 } 2796 reg[0] = word(value); 2797 reg[1] = word(SafeRightShift<WORD_BITS>((unsigned long)value)); 2798} 2799 2800Integer::Integer(Sign s, word high, word low) 2801 : reg(2), sign(s) 2802{ 2803 reg[0] = low; 2804 reg[1] = high; 2805} 2806 2807bool Integer::IsConvertableToLong() const 2808{ 2809 if (ByteCount() > sizeof(long)) 2810 return false; 2811 2812 unsigned long value = (unsigned long)reg[0]; 2813 value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]); 2814 2815 if (sign==POSITIVE) 2816 return (signed long)value >= 0; 2817 else 2818 return -(signed long)value < 0; 2819} 2820 2821signed long Integer::ConvertToLong() const 2822{ 2823 assert(IsConvertableToLong()); 2824 2825 unsigned long value = (unsigned long)reg[0]; 2826 value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]); 2827 return sign==POSITIVE ? value : -(signed long)value; 2828} 2829 2830Integer::Integer(BufferedTransformation &encodedInteger, size_t byteCount, Signedness s) 2831{ 2832 Decode(encodedInteger, byteCount, s); 2833} 2834 2835Integer::Integer(const byte *encodedInteger, size_t byteCount, Signedness s) 2836{ 2837 Decode(encodedInteger, byteCount, s); 2838} 2839 2840Integer::Integer(BufferedTransformation &bt) 2841{ 2842 BERDecode(bt); 2843} 2844 2845Integer::Integer(RandomNumberGenerator &rng, size_t bitcount) 2846{ 2847 Randomize(rng, bitcount); 2848} 2849 2850Integer::Integer(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod) 2851{ 2852 if (!Randomize(rng, min, max, rnType, equiv, mod)) 2853 throw Integer::RandomNumberNotFound(); 2854} 2855 2856Integer Integer::Power2(size_t e) 2857{ 2858 Integer r((word)0, BitsToWords(e+1)); 2859 r.SetBit(e); 2860 return r; 2861} 2862 2863template <long i> 2864struct NewInteger 2865{ 2866 Integer * operator()() const 2867 { 2868 return new Integer(i); 2869 } 2870}; 2871 2872const Integer &Integer::Zero() 2873{ 2874 return Singleton<Integer>().Ref(); 2875} 2876 2877const Integer &Integer::One() 2878{ 2879 return Singleton<Integer, NewInteger<1> >().Ref(); 2880} 2881 2882const Integer &Integer::Two() 2883{ 2884 return Singleton<Integer, NewInteger<2> >().Ref(); 2885} 2886 2887bool Integer::operator!() const 2888{ 2889 return IsNegative() ? false : (reg[0]==0 && WordCount()==0); 2890} 2891 2892Integer& Integer::operator=(const Integer& t) 2893{ 2894 if (this != &t) 2895 { 2896 if (reg.size() != t.reg.size() || t.reg[t.reg.size()/2] == 0) 2897 reg.New(RoundupSize(t.WordCount())); 2898 CopyWords(reg, t.reg, reg.size()); 2899 sign = t.sign; 2900 } 2901 return *this; 2902} 2903 2904bool Integer::GetBit(size_t n) const 2905{ 2906 if (n/WORD_BITS >= reg.size()) 2907 return 0; 2908 else 2909 return bool((reg[n/WORD_BITS] >> (n % WORD_BITS)) & 1); 2910} 2911 2912void Integer::SetBit(size_t n, bool value) 2913{ 2914 if (value) 2915 { 2916 reg.CleanGrow(RoundupSize(BitsToWords(n+1))); 2917 reg[n/WORD_BITS] |= (word(1) << (n%WORD_BITS)); 2918 } 2919 else 2920 { 2921 if (n/WORD_BITS < reg.size()) 2922 reg[n/WORD_BITS] &= ~(word(1) << (n%WORD_BITS)); 2923 } 2924} 2925 2926byte Integer::GetByte(size_t n) const 2927{ 2928 if (n/WORD_SIZE >= reg.size()) 2929 return 0; 2930 else 2931 return byte(reg[n/WORD_SIZE] >> ((n%WORD_SIZE)*8)); 2932} 2933 2934void Integer::SetByte(size_t n, byte value) 2935{ 2936 reg.CleanGrow(RoundupSize(BytesToWords(n+1))); 2937 reg[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE)); 2938 reg[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE)); 2939} 2940 2941lword Integer::GetBits(size_t i, size_t n) const 2942{ 2943 lword v = 0; 2944 assert(n <= sizeof(v)*8); 2945 for (unsigned int j=0; j<n; j++) 2946 v |= lword(GetBit(i+j)) << j; 2947 return v; 2948} 2949 2950Integer Integer::operator-() const 2951{ 2952 Integer result(*this); 2953 result.Negate(); 2954 return result; 2955} 2956 2957Integer Integer::AbsoluteValue() const 2958{ 2959 Integer result(*this); 2960 result.sign = POSITIVE; 2961 return result; 2962} 2963 2964void Integer::swap(Integer &a) 2965{ 2966 reg.swap(a.reg); 2967 std::swap(sign, a.sign); 2968} 2969 2970Integer::Integer(word value, size_t length) 2971 : reg(RoundupSize(length)), sign(POSITIVE) 2972{ 2973 reg[0] = value; 2974 SetWords(reg+1, 0, reg.size()-1); 2975} 2976 2977template <class T> 2978static Integer StringToInteger(const T *str) 2979{ 2980 int radix; 2981 // GCC workaround 2982 // std::char_traits<wchar_t>::length() not defined in GCC 3.2 and STLport 4.5.3 2983 unsigned int length; 2984 for (length = 0; str[length] != 0; length++) {} 2985 2986 Integer v; 2987 2988 if (length == 0) 2989 return v; 2990 2991 switch (str[length-1]) 2992 { 2993 case 'h': 2994 case 'H': 2995 radix=16; 2996 break; 2997 case 'o': 2998 case 'O': 2999 radix=8; 3000 break; 3001 case 'b': 3002 case 'B': 3003 radix=2; 3004 break; 3005 default: 3006 radix=10; 3007 } 3008 3009 if (length > 2 && str[0] == '0' && str[1] == 'x') 3010 radix = 16; 3011 3012 for (unsigned i=0; i<length; i++) 3013 { 3014 int digit; 3015 3016 if (str[i] >= '0' && str[i] <= '9') 3017 digit = str[i] - '0'; 3018 else if (str[i] >= 'A' && str[i] <= 'F') 3019 digit = str[i] - 'A' + 10; 3020 else if (str[i] >= 'a' && str[i] <= 'f') 3021 digit = str[i] - 'a' + 10; 3022 else 3023 digit = radix; 3024 3025 if (digit < radix) 3026 { 3027 v *= radix; 3028 v += digit; 3029 } 3030 } 3031 3032 if (str[0] == '-') 3033 v.Negate(); 3034 3035 return v; 3036} 3037 3038Integer::Integer(const char *str) 3039 : reg(2), sign(POSITIVE) 3040{ 3041 *this = StringToInteger(str); 3042} 3043 3044Integer::Integer(const wchar_t *str) 3045 : reg(2), sign(POSITIVE) 3046{ 3047 *this = StringToInteger(str); 3048} 3049 3050unsigned int Integer::WordCount() const 3051{ 3052 return (unsigned int)CountWords(reg, reg.size()); 3053} 3054 3055unsigned int Integer::ByteCount() const 3056{ 3057 unsigned wordCount = WordCount(); 3058 if (wordCount) 3059 return (wordCount-1)*WORD_SIZE + BytePrecision(reg[wordCount-1]); 3060 else 3061 return 0; 3062} 3063 3064unsigned int Integer::BitCount() const 3065{ 3066 unsigned wordCount = WordCount(); 3067 if (wordCount) 3068 return (wordCount-1)*WORD_BITS + BitPrecision(reg[wordCount-1]); 3069 else 3070 return 0; 3071} 3072 3073void Integer::Decode(const byte *input, size_t inputLen, Signedness s) 3074{ 3075 StringStore store(input, inputLen); 3076 Decode(store, inputLen, s); 3077} 3078 3079void Integer::Decode(BufferedTransformation &bt, size_t inputLen, Signedness s) 3080{ 3081 assert(bt.MaxRetrievable() >= inputLen); 3082 3083 byte b; 3084 bt.Peek(b); 3085 sign = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE; 3086 3087 while (inputLen>0 && (sign==POSITIVE ? b==0 : b==0xff)) 3088 { 3089 bt.Skip(1); 3090 inputLen--; 3091 bt.Peek(b); 3092 } 3093 3094 reg.CleanNew(RoundupSize(BytesToWords(inputLen))); 3095 3096 for (size_t i=inputLen; i > 0; i--) 3097 { 3098 bt.Get(b); 3099 reg[(i-1)/WORD_SIZE] |= word(b) << ((i-1)%WORD_SIZE)*8; 3100 } 3101 3102 if (sign == NEGATIVE) 3103 { 3104 for (size_t i=inputLen; i<reg.size()*WORD_SIZE; i++) 3105 reg[i/WORD_SIZE] |= word(0xff) << (i%WORD_SIZE)*8; 3106 TwosComplement(reg, reg.size()); 3107 } 3108} 3109 3110size_t Integer::MinEncodedSize(Signedness signedness) const 3111{ 3112 unsigned int outputLen = STDMAX(1U, ByteCount()); 3113 if (signedness == UNSIGNED) 3114 return outputLen; 3115 if (NotNegative() && (GetByte(outputLen-1) & 0x80)) 3116 outputLen++; 3117 if (IsNegative() && *this < -Power2(outputLen*8-1)) 3118 outputLen++; 3119 return outputLen; 3120} 3121 3122void Integer::Encode(byte *output, size_t outputLen, Signedness signedness) const 3123{ 3124 ArraySink sink(output, outputLen); 3125 Encode(sink, outputLen, signedness); 3126} 3127 3128void Integer::Encode(BufferedTransformation &bt, size_t outputLen, Signedness signedness) const 3129{ 3130 if (signedness == UNSIGNED || NotNegative()) 3131 { 3132 for (size_t i=outputLen; i > 0; i--) 3133 bt.Put(GetByte(i-1)); 3134 } 3135 else 3136 { 3137 // take two's complement of *this 3138 Integer temp = Integer::Power2(8*STDMAX((size_t)ByteCount(), outputLen)) + *this; 3139 temp.Encode(bt, outputLen, UNSIGNED); 3140 } 3141} 3142 3143void Integer::DEREncode(BufferedTransformation &bt) const 3144{ 3145 DERGeneralEncoder enc(bt, INTEGER); 3146 Encode(enc, MinEncodedSize(SIGNED), SIGNED); 3147 enc.MessageEnd(); 3148} 3149 3150void Integer::BERDecode(const byte *input, size_t len) 3151{ 3152 StringStore store(input, len); 3153 BERDecode(store); 3154} 3155 3156void Integer::BERDecode(BufferedTransformation &bt) 3157{ 3158 BERGeneralDecoder dec(bt, INTEGER); 3159 if (!dec.IsDefiniteLength() || dec.MaxRetrievable() < dec.RemainingLength()) 3160 BERDecodeError(); 3161 Decode(dec, (size_t)dec.RemainingLength(), SIGNED); 3162 dec.MessageEnd(); 3163} 3164 3165void Integer::DEREncodeAsOctetString(BufferedTransformation &bt, size_t length) const 3166{ 3167 DERGeneralEncoder enc(bt, OCTET_STRING); 3168 Encode(enc, length); 3169 enc.MessageEnd(); 3170} 3171 3172void Integer::BERDecodeAsOctetString(BufferedTransformation &bt, size_t length) 3173{ 3174 BERGeneralDecoder dec(bt, OCTET_STRING); 3175 if (!dec.IsDefiniteLength() || dec.RemainingLength() != length) 3176 BERDecodeError(); 3177 Decode(dec, length); 3178 dec.MessageEnd(); 3179} 3180 3181size_t Integer::OpenPGPEncode(byte *output, size_t len) const 3182{ 3183 ArraySink sink(output, len); 3184 return OpenPGPEncode(sink); 3185} 3186 3187size_t Integer::OpenPGPEncode(BufferedTransformation &bt) const 3188{ 3189 word16 bitCount = BitCount(); 3190 bt.PutWord16(bitCount); 3191 size_t byteCount = BitsToBytes(bitCount); 3192 Encode(bt, byteCount); 3193 return 2 + byteCount; 3194} 3195 3196void Integer::OpenPGPDecode(const byte *input, size_t len) 3197{ 3198 StringStore store(input, len); 3199 OpenPGPDecode(store); 3200} 3201 3202void Integer::OpenPGPDecode(BufferedTransformation &bt) 3203{ 3204 word16 bitCount; 3205 if (bt.GetWord16(bitCount) != 2 || bt.MaxRetrievable() < BitsToBytes(bitCount)) 3206 throw OpenPGPDecodeErr(); 3207 Decode(bt, BitsToBytes(bitCount)); 3208} 3209 3210void Integer::Randomize(RandomNumberGenerator &rng, size_t nbits) 3211{ 3212 const size_t nbytes = nbits/8 + 1; 3213 SecByteBlock buf(nbytes); 3214 rng.GenerateBlock(buf, nbytes); 3215 if (nbytes) 3216 buf[0] = (byte)Crop(buf[0], nbits % 8); 3217 Decode(buf, nbytes, UNSIGNED); 3218} 3219 3220void Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max) 3221{ 3222 if (min > max) 3223 throw InvalidArgument("Integer: Min must be no greater than Max"); 3224 3225 Integer range = max - min; 3226 const unsigned int nbits = range.BitCount(); 3227 3228 do 3229 { 3230 Randomize(rng, nbits); 3231 } 3232 while (*this > range); 3233 3234 *this += min; 3235} 3236 3237bool Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod) 3238{ 3239 return GenerateRandomNoThrow(rng, MakeParameters("Min", min)("Max", max)("RandomNumberType", rnType)("EquivalentTo", equiv)("Mod", mod)); 3240} 3241 3242class KDF2_RNG : public RandomNumberGenerator 3243{ 3244public: 3245 KDF2_RNG(const byte *seed, size_t seedSize) 3246 : m_counter(0), m_counterAndSeed(seedSize + 4) 3247 { 3248 memcpy(m_counterAndSeed + 4, seed, seedSize); 3249 } 3250 3251 void GenerateBlock(byte *output, size_t size) 3252 { 3253 PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter); 3254 ++m_counter; 3255 P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULL, 0); 3256 } 3257 3258private: 3259 word32 m_counter; 3260 SecByteBlock m_counterAndSeed; 3261}; 3262 3263bool Integer::GenerateRandomNoThrow(RandomNumberGenerator &i_rng, const NameValuePairs ¶ms) 3264{ 3265 Integer min = params.GetValueWithDefault("Min", Integer::Zero()); 3266 Integer max; 3267 if (!params.GetValue("Max", max)) 3268 { 3269 int bitLength; 3270 if (params.GetIntValue("BitLength", bitLength)) 3271 max = Integer::Power2(bitLength); 3272 else 3273 throw InvalidArgument("Integer: missing Max argument"); 3274 } 3275 if (min > max) 3276 throw InvalidArgument("Integer: Min must be no greater than Max"); 3277 3278 Integer equiv = params.GetValueWithDefault("EquivalentTo", Integer::Zero()); 3279 Integer mod = params.GetValueWithDefault("Mod", Integer::One()); 3280 3281 if (equiv.IsNegative() || equiv >= mod) 3282 throw InvalidArgument("Integer: invalid EquivalentTo and/or Mod argument"); 3283 3284 Integer::RandomNumberType rnType = params.GetValueWithDefault("RandomNumberType", Integer::ANY); 3285 3286 member_ptr<KDF2_RNG> kdf2Rng; 3287 ConstByteArrayParameter seed; 3288 if (params.GetValue(Name::Seed(), seed)) 3289 { 3290 ByteQueue bq; 3291 DERSequenceEncoder seq(bq); 3292 min.DEREncode(seq); 3293 max.DEREncode(seq); 3294 equiv.DEREncode(seq); 3295 mod.DEREncode(seq); 3296 DEREncodeUnsigned(seq, rnType); 3297 DEREncodeOctetString(seq, seed.begin(), seed.size()); 3298 seq.MessageEnd(); 3299 3300 SecByteBlock finalSeed((size_t)bq.MaxRetrievable()); 3301 bq.Get(finalSeed, finalSeed.size()); 3302 kdf2Rng.reset(new KDF2_RNG(finalSeed.begin(), finalSeed.size())); 3303 } 3304 RandomNumberGenerator &rng = kdf2Rng.get() ? (RandomNumberGenerator &)*kdf2Rng : i_rng; 3305 3306 switch (rnType) 3307 { 3308 case ANY: 3309 if (mod == One()) 3310 Randomize(rng, min, max); 3311 else 3312 { 3313 Integer min1 = min + (equiv-min)%mod; 3314 if (max < min1) 3315 return false; 3316 Randomize(rng, Zero(), (max - min1) / mod); 3317 *this *= mod; 3318 *this += min1; 3319 } 3320 return true; 3321 3322 case PRIME: 3323 { 3324 const PrimeSelector *pSelector = params.GetValueWithDefault(Name::PointerToPrimeSelector(), (const PrimeSelector *)NULL); 3325 3326 int i; 3327 i = 0; 3328 while (1) 3329 { 3330 if (++i==16) 3331 { 3332 // check if there are any suitable primes in [min, max] 3333 Integer first = min; 3334 if (FirstPrime(first, max, equiv, mod, pSelector)) 3335 { 3336 // if there is only one suitable prime, we're done 3337 *this = first; 3338 if (!FirstPrime(first, max, equiv, mod, pSelector)) 3339 return true; 3340 } 3341 else 3342 return false; 3343 } 3344 3345 Randomize(rng, min, max); 3346 if (FirstPrime(*this, STDMIN(*this+mod*PrimeSearchInterval(max), max), equiv, mod, pSelector)) 3347 return true; 3348 } 3349 } 3350 3351 default: 3352 throw InvalidArgument("Integer: invalid RandomNumberType argument"); 3353 } 3354} 3355 3356std::istream& operator>>(std::istream& in, Integer &a) 3357{ 3358 char c; 3359 unsigned int length = 0; 3360 SecBlock<char> str(length + 16); 3361 3362 std::ws(in); 3363 3364 do 3365 { 3366 in.read(&c, 1); 3367 str[length++] = c; 3368 if (length >= str.size()) 3369 str.Grow(length + 16); 3370 } 3371 while (in && (c=='-' || c=='x' || (c>='0' && c<='9') || (c>='a' && c<='f') || (c>='A' && c<='F') || c=='h' || c=='H' || c=='o' || c=='O' || c==',' || c=='.')); 3372 3373 if (in.gcount()) 3374 in.putback(c); 3375 str[length-1] = '\0'; 3376 a = Integer(str); 3377 3378 return in; 3379} 3380 3381std::ostream& operator<<(std::ostream& out, const Integer &a) 3382{ 3383 // Get relevant conversion specifications from ostream. 3384 long f = out.flags() & std::ios::basefield; // Get base digits. 3385 int base, block; 3386 char suffix; 3387 switch(f) 3388 { 3389 case std::ios::oct : 3390 base = 8; 3391 block = 8; 3392 suffix = 'o'; 3393 break; 3394 case std::ios::hex : 3395 base = 16; 3396 block = 4; 3397 suffix = 'h'; 3398 break; 3399 default : 3400 base = 10; 3401 block = 3; 3402 suffix = '.'; 3403 } 3404 3405 Integer temp1=a, temp2; 3406 3407 if (a.IsNegative()) 3408 { 3409 out << '-'; 3410 temp1.Negate(); 3411 } 3412 3413 if (!a) 3414 out << '0'; 3415 3416 static const char upper[]="0123456789ABCDEF"; 3417 static const char lower[]="0123456789abcdef"; 3418 3419 const char* vec = (out.flags() & std::ios::uppercase) ? upper : lower; 3420 unsigned i=0; 3421 SecBlock<char> s(a.BitCount() / (BitPrecision(base)-1) + 1); 3422 3423 while (!!temp1) 3424 { 3425 word digit; 3426 Integer::Divide(digit, temp2, temp1, base); 3427 s[i++]=vec[digit]; 3428 temp1.swap(temp2); 3429 } 3430 3431 while (i--) 3432 { 3433 out << s[i]; 3434// if (i && !(i%block)) 3435// out << ","; 3436 } 3437 return out << suffix; 3438} 3439 3440Integer& Integer::operator++() 3441{ 3442 if (NotNegative()) 3443 { 3444 if (Increment(reg, reg.size())) 3445 { 3446 reg.CleanGrow(2*reg.size()); 3447 reg[reg.size()/2]=1; 3448 } 3449 } 3450 else 3451 { 3452 word borrow = Decrement(reg, reg.size()); 3453 assert(!borrow); 3454 if (WordCount()==0) 3455 *this = Zero(); 3456 } 3457 return *this; 3458} 3459 3460Integer& Integer::operator--() 3461{ 3462 if (IsNegative()) 3463 { 3464 if (Increment(reg, reg.size())) 3465 { 3466 reg.CleanGrow(2*reg.size()); 3467 reg[reg.size()/2]=1; 3468 } 3469 } 3470 else 3471 { 3472 if (Decrement(reg, reg.size())) 3473 *this = -One(); 3474 } 3475 return *this; 3476} 3477 3478void PositiveAdd(Integer &sum, const Integer &a, const Integer& b) 3479{ 3480 int carry; 3481 if (a.reg.size() == b.reg.size()) 3482 carry = Add(sum.reg, a.reg, b.reg, a.reg.size()); 3483 else if (a.reg.size() > b.reg.size()) 3484 { 3485 carry = Add(sum.reg, a.reg, b.reg, b.reg.size()); 3486 CopyWords(sum.reg+b.reg.size(), a.reg+b.reg.size(), a.reg.size()-b.reg.size()); 3487 carry = Increment(sum.reg+b.reg.size(), a.reg.size()-b.reg.size(), carry); 3488 } 3489 else 3490 { 3491 carry = Add(sum.reg, a.reg, b.reg, a.reg.size()); 3492 CopyWords(sum.reg+a.reg.size(), b.reg+a.reg.size(), b.reg.size()-a.reg.size()); 3493 carry = Increment(sum.reg+a.reg.size(), b.reg.size()-a.reg.size(), carry); 3494 } 3495 3496 if (carry) 3497 { 3498 sum.reg.CleanGrow(2*sum.reg.size()); 3499 sum.reg[sum.reg.size()/2] = 1; 3500 } 3501 sum.sign = Integer::POSITIVE; 3502} 3503 3504void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b) 3505{ 3506 unsigned aSize = a.WordCount(); 3507 aSize += aSize%2; 3508 unsigned bSize = b.WordCount(); 3509 bSize += bSize%2; 3510 3511 if (aSize == bSize) 3512 { 3513 if (Compare(a.reg, b.reg, aSize) >= 0) 3514 { 3515 Subtract(diff.reg, a.reg, b.reg, aSize); 3516 diff.sign = Integer::POSITIVE; 3517 } 3518 else 3519 { 3520 Subtract(diff.reg, b.reg, a.reg, aSize); 3521 diff.sign = Integer::NEGATIVE; 3522 } 3523 } 3524 else if (aSize > bSize) 3525 { 3526 word borrow = Subtract(diff.reg, a.reg, b.reg, bSize); 3527 CopyWords(diff.reg+bSize, a.reg+bSize, aSize-bSize); 3528 borrow = Decrement(diff.reg+bSize, aSize-bSize, borrow); 3529 assert(!borrow); 3530 diff.sign = Integer::POSITIVE; 3531 } 3532 else 3533 { 3534 word borrow = Subtract(diff.reg, b.reg, a.reg, aSize); 3535 CopyWords(diff.reg+aSize, b.reg+aSize, bSize-aSize); 3536 borrow = Decrement(diff.reg+aSize, bSize-aSize, borrow); 3537 assert(!borrow); 3538 diff.sign = Integer::NEGATIVE; 3539 } 3540} 3541 3542// MSVC .NET 2003 workaround 3543template <class T> inline const T& STDMAX2(const T& a, const T& b) 3544{ 3545 return a < b ? b : a; 3546} 3547 3548Integer Integer::Plus(const Integer& b) const 3549{ 3550 Integer sum((word)0, STDMAX2(reg.size(), b.reg.size())); 3551 if (NotNegative()) 3552 { 3553 if (b.NotNegative()) 3554 PositiveAdd(sum, *this, b); 3555 else 3556 PositiveSubtract(sum, *this, b); 3557 } 3558 else 3559 { 3560 if (b.NotNegative()) 3561 PositiveSubtract(sum, b, *this); 3562 else 3563 { 3564 PositiveAdd(sum, *this, b); 3565 sum.sign = Integer::NEGATIVE; 3566 } 3567 } 3568 return sum; 3569} 3570 3571Integer& Integer::operator+=(const Integer& t) 3572{ 3573 reg.CleanGrow(t.reg.size()); 3574 if (NotNegative()) 3575 { 3576 if (t.NotNegative()) 3577 PositiveAdd(*this, *this, t); 3578 else 3579 PositiveSubtract(*this, *this, t); 3580 } 3581 else 3582 { 3583 if (t.NotNegative()) 3584 PositiveSubtract(*this, t, *this); 3585 else 3586 { 3587 PositiveAdd(*this, *this, t); 3588 sign = Integer::NEGATIVE; 3589 } 3590 } 3591 return *this; 3592} 3593 3594Integer Integer::Minus(const Integer& b) const 3595{ 3596 Integer diff((word)0, STDMAX2(reg.size(), b.reg.size())); 3597 if (NotNegative()) 3598 { 3599 if (b.NotNegative()) 3600 PositiveSubtract(diff, *this, b); 3601 else 3602 PositiveAdd(diff, *this, b); 3603 } 3604 else 3605 { 3606 if (b.NotNegative()) 3607 { 3608 PositiveAdd(diff, *this, b); 3609 diff.sign = Integer::NEGATIVE; 3610 } 3611 else 3612 PositiveSubtract(diff, b, *this); 3613 } 3614 return diff; 3615} 3616 3617Integer& Integer::operator-=(const Integer& t) 3618{ 3619 reg.CleanGrow(t.reg.size()); 3620 if (NotNegative()) 3621 { 3622 if (t.NotNegative()) 3623 PositiveSubtract(*this, *this, t); 3624 else 3625 PositiveAdd(*this, *this, t); 3626 } 3627 else 3628 { 3629 if (t.NotNegative()) 3630 { 3631 PositiveAdd(*this, *this, t); 3632 sign = Integer::NEGATIVE; 3633 } 3634 else 3635 PositiveSubtract(*this, t, *this); 3636 } 3637 return *this; 3638} 3639 3640Integer& Integer::operator<<=(size_t n) 3641{ 3642 const size_t wordCount = WordCount(); 3643 const size_t shiftWords = n / WORD_BITS; 3644 const unsigned int shiftBits = (unsigned int)(n % WORD_BITS); 3645 3646 reg.CleanGrow(RoundupSize(wordCount+BitsToWords(n))); 3647 ShiftWordsLeftByWords(reg, wordCount + shiftWords, shiftWords); 3648 ShiftWordsLeftByBits(reg+shiftWords, wordCount+BitsToWords(shiftBits), shiftBits); 3649 return *this; 3650} 3651 3652Integer& Integer::operator>>=(size_t n) 3653{ 3654 const size_t wordCount = WordCount(); 3655 const size_t shiftWords = n / WORD_BITS; 3656 const unsigned int shiftBits = (unsigned int)(n % WORD_BITS); 3657 3658 ShiftWordsRightByWords(reg, wordCount, shiftWords); 3659 if (wordCount > shiftWords) 3660 ShiftWordsRightByBits(reg, wordCount-shiftWords, shiftBits); 3661 if (IsNegative() && WordCount()==0) // avoid -0 3662 *this = Zero(); 3663 return *this; 3664} 3665 3666void PositiveMultiply(Integer &product, const Integer &a, const Integer &b) 3667{ 3668 size_t aSize = RoundupSize(a.WordCount()); 3669 size_t bSize = RoundupSize(b.WordCount()); 3670 3671 product.reg.CleanNew(RoundupSize(aSize+bSize)); 3672 product.sign = Integer::POSITIVE; 3673 3674 IntegerSecBlock workspace(aSize + bSize); 3675 AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize); 3676} 3677 3678void Multiply(Integer &product, const Integer &a, const Integer &b) 3679{ 3680 PositiveMultiply(product, a, b); 3681 3682 if (a.NotNegative() != b.NotNegative()) 3683 product.Negate(); 3684} 3685 3686Integer Integer::Times(const Integer &b) const 3687{ 3688 Integer product; 3689 Multiply(product, *this, b); 3690 return product; 3691} 3692 3693/* 3694void PositiveDivide(Integer &remainder, Integer "ient, 3695 const Integer ÷nd, const Integer &divisor) 3696{ 3697 remainder.reg.CleanNew(divisor.reg.size()); 3698 remainder.sign = Integer::POSITIVE; 3699 quotient.reg.New(0); 3700 quotient.sign = Integer::POSITIVE; 3701 unsigned i=dividend.BitCount(); 3702 while (i--) 3703 { 3704 word overflow = ShiftWordsLeftByBits(remainder.reg, remainder.reg.size(), 1); 3705 remainder.reg[0] |= dividend[i]; 3706 if (overflow || remainder >= divisor) 3707 { 3708 Subtract(remainder.reg, remainder.reg, divisor.reg, remainder.reg.size()); 3709 quotient.SetBit(i); 3710 } 3711 } 3712} 3713*/ 3714 3715void PositiveDivide(Integer &remainder, Integer "ient, 3716 const Integer &a, const Integer &b) 3717{ 3718 unsigned aSize = a.WordCount(); 3719 unsigned bSize = b.WordCount(); 3720 3721 if (!bSize) 3722 throw Integer::DivideByZero(); 3723 3724 if (aSize < bSize) 3725 { 3726 remainder = a; 3727 remainder.sign = Integer::POSITIVE; 3728 quotient = Integer::Zero(); 3729 return; 3730 } 3731 3732 aSize += aSize%2; // round up to next even number 3733 bSize += bSize%2; 3734 3735 remainder.reg.CleanNew(RoundupSize(bSize)); 3736 remainder.sign = Integer::POSITIVE; 3737 quotient.reg.CleanNew(RoundupSize(aSize-bSize+2)); 3738 quotient.sign = Integer::POSITIVE; 3739 3740 IntegerSecBlock T(aSize+3*(bSize+2)); 3741 Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize); 3742} 3743 3744void Integer::Divide(Integer &remainder, Integer "ient, const Integer ÷nd, const Integer &divisor) 3745{ 3746 PositiveDivide(remainder, quotient, dividend, divisor); 3747 3748 if (dividend.IsNegative()) 3749 { 3750 quotient.Negate(); 3751 if (remainder.NotZero()) 3752 { 3753 --quotient; 3754 remainder = divisor.AbsoluteValue() - remainder; 3755 } 3756 } 3757 3758 if (divisor.IsNegative()) 3759 quotient.Negate(); 3760} 3761 3762void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a, unsigned int n) 3763{ 3764 q = a; 3765 q >>= n; 3766 3767 const size_t wordCount = BitsToWords(n); 3768 if (wordCount <= a.WordCount()) 3769 { 3770 r.reg.resize(RoundupSize(wordCount)); 3771 CopyWords(r.reg, a.reg, wordCount); 3772 SetWords(r.reg+wordCount, 0, r.reg.size()-wordCount); 3773 if (n % WORD_BITS != 0) 3774 r.reg[wordCount-1] %= (word(1) << (n % WORD_BITS)); 3775 } 3776 else 3777 { 3778 r.reg.resize(RoundupSize(a.WordCount())); 3779 CopyWords(r.reg, a.reg, r.reg.size()); 3780 } 3781 r.sign = POSITIVE; 3782 3783 if (a.IsNegative() && r.NotZero()) 3784 { 3785 --q; 3786 r = Power2(n) - r; 3787 } 3788} 3789 3790Integer Integer::DividedBy(const Integer &b) const 3791{ 3792 Integer remainder, quotient; 3793 Integer::Divide(remainder, quotient, *this, b); 3794 return quotient; 3795} 3796 3797Integer Integer::Modulo(const Integer &b) const 3798{ 3799 Integer remainder, quotient; 3800 Integer::Divide(remainder, quotient, *this, b); 3801 return remainder; 3802} 3803 3804void Integer::Divide(word &remainder, Integer "ient, const Integer ÷nd, word divisor) 3805{ 3806 if (!divisor) 3807 throw Integer::DivideByZero(); 3808 3809 assert(divisor); 3810 3811 if ((divisor & (divisor-1)) == 0) // divisor is a power of 2 3812 { 3813 quotient = dividend >> (BitPrecision(divisor)-1); 3814 remainder = dividend.reg[0] & (divisor-1); 3815 return; 3816 } 3817 3818 unsigned int i = dividend.WordCount(); 3819 quotient.reg.CleanNew(RoundupSize(i)); 3820 remainder = 0; 3821 while (i--) 3822 { 3823 quotient.reg[i] = DWord(dividend.reg[i], remainder) / divisor; 3824 remainder = DWord(dividend.reg[i], remainder) % divisor; 3825 } 3826 3827 if (dividend.NotNegative()) 3828 quotient.sign = POSITIVE; 3829 else 3830 { 3831 quotient.sign = NEGATIVE; 3832 if (remainder) 3833 { 3834 --quotient; 3835 remainder = divisor - remainder; 3836 } 3837 } 3838} 3839 3840Integer Integer::DividedBy(word b) const 3841{ 3842 word remainder; 3843 Integer quotient; 3844 Integer::Divide(remainder, quotient, *this, b); 3845 return quotient; 3846} 3847 3848word Integer::Modulo(word divisor) const 3849{ 3850 if (!divisor) 3851 throw Integer::DivideByZero(); 3852 3853 assert(divisor); 3854 3855 word remainder; 3856 3857 if ((divisor & (divisor-1)) == 0) // divisor is a power of 2 3858 remainder = reg[0] & (divisor-1); 3859 else 3860 { 3861 unsigned int i = WordCount(); 3862 3863 if (divisor <= 5) 3864 { 3865 DWord sum(0, 0); 3866 while (i--) 3867 sum += reg[i]; 3868 remainder = sum % divisor; 3869 } 3870 else 3871 { 3872 remainder = 0; 3873 while (i--) 3874 remainder = DWord(reg[i], remainder) % divisor; 3875 } 3876 } 3877 3878 if (IsNegative() && remainder) 3879 remainder = divisor - remainder; 3880 3881 return remainder; 3882} 3883 3884void Integer::Negate() 3885{ 3886 if (!!(*this)) // don't flip sign if *this==0 3887 sign = Sign(1-sign); 3888} 3889 3890int Integer::PositiveCompare(const Integer& t) const 3891{ 3892 unsigned size = WordCount(), tSize = t.WordCount(); 3893 3894 if (size == tSize) 3895 return CryptoPP::Compare(reg, t.reg, size); 3896 else 3897 return size > tSize ? 1 : -1; 3898} 3899 3900int Integer::Compare(const Integer& t) const 3901{ 3902 if (NotNegative()) 3903 { 3904 if (t.NotNegative()) 3905 return PositiveCompare(t); 3906 else 3907 return 1; 3908 } 3909 else 3910 { 3911 if (t.NotNegative()) 3912 return -1; 3913 else 3914 return -PositiveCompare(t); 3915 } 3916} 3917 3918Integer Integer::SquareRoot() const 3919{ 3920 if (!IsPositive()) 3921 return Zero(); 3922 3923 // overestimate square root 3924 Integer x, y = Power2((BitCount()+1)/2); 3925 assert(y*y >= *this); 3926 3927 do 3928 { 3929 x = y; 3930 y = (x + *this/x) >> 1; 3931 } while (y<x); 3932 3933 return x; 3934} 3935 3936bool Integer::IsSquare() const 3937{ 3938 Integer r = SquareRoot(); 3939 return *this == r.Squared(); 3940} 3941 3942bool Integer::IsUnit() const 3943{ 3944 return (WordCount() == 1) && (reg[0] == 1); 3945} 3946 3947Integer Integer::MultiplicativeInverse() const 3948{ 3949 return IsUnit() ? *this : Zero(); 3950} 3951 3952Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m) 3953{ 3954 return x*y%m; 3955} 3956 3957Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m) 3958{ 3959 ModularArithmetic mr(m); 3960 return mr.Exponentiate(x, e); 3961} 3962 3963Integer Integer::Gcd(const Integer &a, const Integer &b) 3964{ 3965 return EuclideanDomainOf<Integer>().Gcd(a, b); 3966} 3967 3968Integer Integer::InverseMod(const Integer &m) const 3969{ 3970 assert(m.NotNegative()); 3971 3972 if (IsNegative()) 3973 return Modulo(m).InverseMod(m); 3974 3975 if (m.IsEven()) 3976 { 3977 if (!m || IsEven()) 3978 return Zero(); // no inverse 3979 if (*this == One()) 3980 return One(); 3981 3982 Integer u = m.Modulo(*this).InverseMod(*this); 3983 return !u ? Zero() : (m*(*this-u)+1)/(*this); 3984 } 3985 3986 SecBlock<word> T(m.reg.size() * 4); 3987 Integer r((word)0, m.reg.size()); 3988 unsigned k = AlmostInverse(r.reg, T, reg, reg.size(), m.reg, m.reg.size()); 3989 DivideByPower2Mod(r.reg, r.reg, k, m.reg, m.reg.size()); 3990 return r; 3991} 3992 3993word Integer::InverseMod(word mod) const 3994{ 3995 word g0 = mod, g1 = *this % mod; 3996 word v0 = 0, v1 = 1; 3997 word y; 3998 3999 while (g1) 4000 { 4001 if (g1 == 1) 4002 return v1; 4003 y = g0 / g1; 4004 g0 = g0 % g1; 4005 v0 += y * v1; 4006 4007 if (!g0) 4008 break; 4009 if (g0 == 1) 4010 return mod-v0; 4011 y = g1 / g0; 4012 g1 = g1 % g0; 4013 v1 += y * v0; 4014 } 4015 return 0; 4016} 4017 4018// ******************************************************** 4019 4020ModularArithmetic::ModularArithmetic(BufferedTransformation &bt) 4021{ 4022 BERSequenceDecoder seq(bt); 4023 OID oid(seq); 4024 if (oid != ASN1::prime_field()) 4025 BERDecodeError(); 4026 m_modulus.BERDecode(seq); 4027 seq.MessageEnd(); 4028 m_result.reg.resize(m_modulus.reg.size()); 4029} 4030 4031void ModularArithmetic::DEREncode(BufferedTransformation &bt) const 4032{ 4033 DERSequenceEncoder seq(bt); 4034 ASN1::prime_field().DEREncode(seq); 4035 m_modulus.DEREncode(seq); 4036 seq.MessageEnd(); 4037} 4038 4039void ModularArithmetic::DEREncodeElement(BufferedTransformation &out, const Element &a) const 4040{ 4041 a.DEREncodeAsOctetString(out, MaxElementByteLength()); 4042} 4043 4044void ModularArithmetic::BERDecodeElement(BufferedTransformation &in, Element &a) const 4045{ 4046 a.BERDecodeAsOctetString(in, MaxElementByteLength()); 4047} 4048 4049const Integer& ModularArithmetic::Half(const Integer &a) const 4050{ 4051 if (a.reg.size()==m_modulus.reg.size()) 4052 { 4053 CryptoPP::DivideByPower2Mod(m_result.reg.begin(), a.reg, 1, m_modulus.reg, a.reg.size()); 4054 return m_result; 4055 } 4056 else 4057 return m_result1 = (a.IsEven() ? (a >> 1) : ((a+m_modulus) >> 1)); 4058} 4059 4060const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const 4061{ 4062 if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size()) 4063 { 4064 if (CryptoPP::Add(m_result.reg.begin(), a.reg, b.reg, a.reg.size()) 4065 || Compare(m_result.reg, m_modulus.reg, a.reg.size()) >= 0) 4066 { 4067 CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size()); 4068 } 4069 return m_result; 4070 } 4071 else 4072 { 4073 m_result1 = a+b; 4074 if (m_result1 >= m_modulus) 4075 m_result1 -= m_modulus; 4076 return m_result1; 4077 } 4078} 4079 4080Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const 4081{ 4082 if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size()) 4083 { 4084 if (CryptoPP::Add(a.reg, a.reg, b.reg, a.reg.size()) 4085 || Compare(a.reg, m_modulus.reg, a.reg.size()) >= 0) 4086 { 4087 CryptoPP::Subtract(a.reg, a.reg, m_modulus.reg, a.reg.size()); 4088 } 4089 } 4090 else 4091 { 4092 a+=b; 4093 if (a>=m_modulus) 4094 a-=m_modulus; 4095 } 4096 4097 return a; 4098} 4099 4100const Integer& ModularArithmetic::Subtract(const Integer &a, const Integer &b) const 4101{ 4102 if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size()) 4103 { 4104 if (CryptoPP::Subtract(m_result.reg.begin(), a.reg, b.reg, a.reg.size())) 4105 CryptoPP::Add(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size()); 4106 return m_result; 4107 } 4108 else 4109 { 4110 m_result1 = a-b; 4111 if (m_result1.IsNegative()) 4112 m_result1 += m_modulus; 4113 return m_result1; 4114 } 4115} 4116 4117Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const 4118{ 4119 if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size()) 4120 { 4121 if (CryptoPP::Subtract(a.reg, a.reg, b.reg, a.reg.size())) 4122 CryptoPP::Add(a.reg, a.reg, m_modulus.reg, a.reg.size()); 4123 } 4124 else 4125 { 4126 a-=b; 4127 if (a.IsNegative()) 4128 a+=m_modulus; 4129 } 4130 4131 return a; 4132} 4133 4134const Integer& ModularArithmetic::Inverse(const Integer &a) const 4135{ 4136 if (!a) 4137 return a; 4138 4139 CopyWords(m_result.reg.begin(), m_modulus.reg, m_modulus.reg.size()); 4140 if (CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, a.reg, a.reg.size())) 4141 Decrement(m_result.reg.begin()+a.reg.size(), m_modulus.reg.size()-a.reg.size()); 4142 4143 return m_result; 4144} 4145 4146Integer ModularArithmetic::CascadeExponentiate(const Integer &x, const Integer &e1, const Integer &y, const Integer &e2) const 4147{ 4148 if (m_modulus.IsOdd()) 4149 { 4150 MontgomeryRepresentation dr(m_modulus); 4151 return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1, dr.ConvertIn(y), e2)); 4152 } 4153 else 4154 return AbstractRing<Integer>::CascadeExponentiate(x, e1, y, e2); 4155} 4156 4157void ModularArithmetic::SimultaneousExponentiate(Integer *results, const Integer &base, const Integer *exponents, unsigned int exponentsCount) const 4158{ 4159 if (m_modulus.IsOdd()) 4160 { 4161 MontgomeryRepresentation dr(m_modulus); 4162 dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents, exponentsCount); 4163 for (unsigned int i=0; i<exponentsCount; i++) 4164 results[i] = dr.ConvertOut(results[i]); 4165 } 4166 else 4167 AbstractRing<Integer>::SimultaneousExponentiate(results, base, exponents, exponentsCount); 4168} 4169 4170MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m) // modulus must be odd 4171 : ModularArithmetic(m), 4172 m_u((word)0, m_modulus.reg.size()), 4173 m_workspace(5*m_modulus.reg.size()) 4174{ 4175 if (!m_modulus.IsOdd()) 4176 throw InvalidArgument("MontgomeryRepresentation: Montgomery representation requires an odd modulus"); 4177 4178 RecursiveInverseModPower2(m_u.reg, m_workspace, m_modulus.reg, m_modulus.reg.size()); 4179} 4180 4181const Integer& MontgomeryRepresentation::Multiply(const Integer &a, const Integer &b) const 4182{ 4183 word *const T = m_workspace.begin(); 4184 word *const R = m_result.reg.begin(); 4185 const size_t N = m_modulus.reg.size(); 4186 assert(a.reg.size()<=N && b.reg.size()<=N); 4187 4188 AsymmetricMultiply(T, T+2*N, a.reg, a.reg.size(), b.reg, b.reg.size()); 4189 SetWords(T+a.reg.size()+b.reg.size(), 0, 2*N-a.reg.size()-b.reg.size()); 4190 MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N); 4191 return m_result; 4192} 4193 4194const Integer& MontgomeryRepresentation::Square(const Integer &a) const 4195{ 4196 word *const T = m_workspace.begin(); 4197 word *const R = m_result.reg.begin(); 4198 const size_t N = m_modulus.reg.size(); 4199 assert(a.reg.size()<=N); 4200 4201 CryptoPP::Square(T, T+2*N, a.reg, a.reg.size()); 4202 SetWords(T+2*a.reg.size(), 0, 2*N-2*a.reg.size()); 4203 MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N); 4204 return m_result; 4205} 4206 4207Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const 4208{ 4209 word *const T = m_workspace.begin(); 4210 word *const R = m_result.reg.begin(); 4211 const size_t N = m_modulus.reg.size(); 4212 assert(a.reg.size()<=N); 4213 4214 CopyWords(T, a.reg, a.reg.size()); 4215 SetWords(T+a.reg.size(), 0, 2*N-a.reg.size()); 4216 MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N); 4217 return m_result; 4218} 4219 4220const Integer& MontgomeryRepresentation::MultiplicativeInverse(const Integer &a) const 4221{ 4222// return (EuclideanMultiplicativeInverse(a, modulus)<<(2*WORD_BITS*modulus.reg.size()))%modulus; 4223 word *const T = m_workspace.begin(); 4224 word *const R = m_result.reg.begin(); 4225 const size_t N = m_modulus.reg.size(); 4226 assert(a.reg.size()<=N); 4227 4228 CopyWords(T, a.reg, a.reg.size()); 4229 SetWords(T+a.reg.size(), 0, 2*N-a.reg.size()); 4230 MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N); 4231 unsigned k = AlmostInverse(R, T, R, N, m_modulus.reg, N); 4232 4233// cout << "k=" << k << " N*32=" << 32*N << endl; 4234 4235 if (k>N*WORD_BITS) 4236 DivideByPower2Mod(R, R, k-N*WORD_BITS, m_modulus.reg, N); 4237 else 4238 MultiplyByPower2Mod(R, R, N*WORD_BITS-k, m_modulus.reg, N); 4239 4240 return m_result; 4241} 4242 4243NAMESPACE_END 4244 4245#endif 4246