1/** Fundamental operations for arbitrary-precision arithmetic
2 *
3 * These functions are for internal use only.
4 */
5/*          Copyright Don Clugston 2008 - 2010.
6 * Distributed under the Boost Software License, Version 1.0.
7 *    (See accompanying file LICENSE_1_0.txt or copy at
8 *          http://www.boost.org/LICENSE_1_0.txt)
9 */
10/* References:
11   "Modern Computer Arithmetic" (MCA) is the primary reference for all
12    algorithms used in this library.
13  - R.P. Brent and P. Zimmermann, "Modern Computer Arithmetic",
14    Version 0.5.9, (Oct 2010).
15  - C. Burkinel and J. Ziegler, "Fast Recursive Division", MPI-I-98-1-022,
16    Max-Planck Institute fuer Informatik, (Oct 1998).
17  - G. Hanrot, M. Quercia, and P. Zimmermann, "The Middle Product Algorithm, I.",
18    INRIA 4664, (Dec 2002).
19  - M. Bodrato and A. Zanoni, "What about Toom-Cook Matrices Optimality?",
20    http://bodrato.it/papers (2006).
21  - A. Fog, "Optimizing subroutines in assembly language",
22    www.agner.org/optimize (2008).
23  - A. Fog, "The microarchitecture of Intel and AMD CPU's",
24    www.agner.org/optimize (2008).
25  - A. Fog, "Instruction tables: Lists of instruction latencies, throughputs
26    and micro-operation breakdowns for Intel and AMD CPU's.", www.agner.org/optimize (2008).
27
28Idioms:
29  Many functions in this module use
30  'func(Tulong)(Tulong x) if (is(Tulong == ulong))' rather than 'func(ulong x)'
31  in order to disable implicit conversion.
32
33*/
34module std.internal.math.biguintcore;
35
36version (D_InlineAsm_X86)
37{
38    import std.internal.math.biguintx86;
39}
40else
41{
42    import std.internal.math.biguintnoasm;
43}
44
45alias multibyteAdd = multibyteAddSub!('+');
46alias multibyteSub = multibyteAddSub!('-');
47
48
49import core.cpuid;
50public import std.ascii : LetterCase;
51import std.range.primitives;
52import std.traits;
53
54shared static this()
55{
56    CACHELIMIT = core.cpuid.datacache[0].size*1024/2;
57}
58
59private:
60// Limits for when to switch between algorithms.
61immutable size_t CACHELIMIT;   // Half the size of the data cache.
62enum size_t FASTDIVLIMIT = 100; // crossover to recursive division
63
64
65// These constants are used by shift operations
66static if (BigDigit.sizeof == int.sizeof)
67{
68    enum { LG2BIGDIGITBITS = 5, BIGDIGITSHIFTMASK = 31 };
69    alias BIGHALFDIGIT = ushort;
70}
71else static if (BigDigit.sizeof == long.sizeof)
72{
73    alias BIGHALFDIGIT = uint;
74    enum { LG2BIGDIGITBITS = 6, BIGDIGITSHIFTMASK = 63 };
75}
76else static assert(0, "Unsupported BigDigit size");
77
78import std.exception : assumeUnique;
79import std.traits : isIntegral;
80enum BigDigitBits = BigDigit.sizeof*8;
81template maxBigDigits(T)
82if (isIntegral!T)
83{
84    enum maxBigDigits = (T.sizeof+BigDigit.sizeof-1)/BigDigit.sizeof;
85}
86
87static immutable BigDigit[] ZERO = [0];
88static immutable BigDigit[] ONE = [1];
89static immutable BigDigit[] TWO = [2];
90static immutable BigDigit[] TEN = [10];
91
92
93public:
94
95/// BigUint performs memory management and wraps the low-level calls.
96struct BigUint
97{
98private:
99    pure invariant()
100    {
101        assert( data.length >= 1 && (data.length == 1 || data[$-1] != 0 ));
102    }
103
104    immutable(BigDigit) [] data = ZERO;
105
106    this(immutable(BigDigit) [] x) pure nothrow @nogc @safe
107    {
108       data = x;
109    }
110  package(std)  // used from: std.bigint
111    this(T)(T x) pure nothrow @safe if (isIntegral!T)
112    {
113        opAssign(x);
114    }
115
116    enum trustedAssumeUnique = function(BigDigit[] input) pure @trusted @nogc {
117        return assumeUnique(input);
118    };
119public:
120    // Length in uints
121    @property size_t uintLength() pure nothrow const @safe @nogc
122    {
123        static if (BigDigit.sizeof == uint.sizeof)
124        {
125            return data.length;
126        }
127        else static if (BigDigit.sizeof == ulong.sizeof)
128        {
129            return data.length * 2 -
130            ((data[$-1] & 0xFFFF_FFFF_0000_0000L) ? 1 : 0);
131        }
132    }
133    @property size_t ulongLength() pure nothrow const @safe @nogc
134    {
135        static if (BigDigit.sizeof == uint.sizeof)
136        {
137            return (data.length + 1) >> 1;
138        }
139        else static if (BigDigit.sizeof == ulong.sizeof)
140        {
141            return data.length;
142        }
143    }
144
145    // The value at (cast(ulong[]) data)[n]
146    ulong peekUlong(int n) pure nothrow const @safe @nogc
147    {
148        static if (BigDigit.sizeof == int.sizeof)
149        {
150            if (data.length == n*2 + 1) return data[n*2];
151            return data[n*2] + ((cast(ulong) data[n*2 + 1]) << 32 );
152        }
153        else static if (BigDigit.sizeof == long.sizeof)
154        {
155            return data[n];
156        }
157    }
158    uint peekUint(int n) pure nothrow const @safe @nogc
159    {
160        static if (BigDigit.sizeof == int.sizeof)
161        {
162            return data[n];
163        }
164        else
165        {
166            immutable x = data[n >> 1];
167            return (n & 1) ? cast(uint)(x >> 32) : cast(uint) x;
168        }
169    }
170public:
171    ///
172    void opAssign(Tulong)(Tulong u) pure nothrow @safe if (is (Tulong == ulong))
173    {
174        if (u == 0) data = ZERO;
175        else if (u == 1) data = ONE;
176        else if (u == 2) data = TWO;
177        else if (u == 10) data = TEN;
178        else
179        {
180            static if (BigDigit.sizeof == int.sizeof)
181            {
182                uint ulo = cast(uint)(u & 0xFFFF_FFFF);
183                uint uhi = cast(uint)(u >> 32);
184                if (uhi == 0)
185                {
186                    data = [ulo];
187                }
188                else
189                {
190                    data = [ulo, uhi];
191                }
192            }
193            else static if (BigDigit.sizeof == long.sizeof)
194            {
195                data = [u];
196            }
197        }
198    }
199    void opAssign(Tdummy = void)(BigUint y) pure nothrow @nogc @safe
200    {
201        this.data = y.data;
202    }
203
204    ///
205    int opCmp(Tdummy = void)(const BigUint y) pure nothrow @nogc const @safe
206    {
207        if (data.length != y.data.length)
208            return (data.length > y.data.length) ?  1 : -1;
209        size_t k = highestDifferentDigit(data, y.data);
210        if (data[k] == y.data[k])
211            return 0;
212        return data[k] > y.data[k] ? 1 : -1;
213    }
214
215    ///
216    int opCmp(Tulong)(Tulong y) pure nothrow @nogc const @safe if (is (Tulong == ulong))
217    {
218        if (data.length > maxBigDigits!Tulong)
219            return 1;
220
221        foreach_reverse (i; 0 .. maxBigDigits!Tulong)
222        {
223            BigDigit tmp = cast(BigDigit)(y>>(i*BigDigitBits));
224            if (tmp == 0)
225                if (data.length >= i+1)
226                {
227                    // Since ZERO is [0], so we cannot simply return 1 here, as
228                    // data[i] would be 0 for i == 0 in that case.
229                    return (data[i] > 0) ? 1 : 0;
230                }
231                else
232                    continue;
233            else
234                if (i+1 > data.length)
235                    return -1;
236                else if (tmp != data[i])
237                    return data[i] > tmp ? 1 : -1;
238        }
239        return 0;
240    }
241
242    bool opEquals(Tdummy = void)(ref const BigUint y) pure nothrow @nogc const @safe
243    {
244           return y.data[] == data[];
245    }
246
247    bool opEquals(Tdummy = void)(ulong y) pure nothrow @nogc const @safe
248    {
249        if (data.length > 2)
250            return false;
251        uint ylo = cast(uint)(y & 0xFFFF_FFFF);
252        uint yhi = cast(uint)(y >> 32);
253        if (data.length == 2 && data[1]!=yhi)
254            return false;
255        if (data.length == 1 && yhi != 0)
256            return false;
257        return (data[0] == ylo);
258    }
259
260    bool isZero() pure const nothrow @safe @nogc
261    {
262        return data.length == 1 && data[0] == 0;
263    }
264
265    size_t numBytes() pure nothrow const @safe @nogc
266    {
267        return data.length * BigDigit.sizeof;
268    }
269
270    // the extra bytes are added to the start of the string
271    char [] toDecimalString(int frontExtraBytes) const pure nothrow
272    {
273        immutable predictlength = 20+20*(data.length/2); // just over 19
274        char [] buff = new char[frontExtraBytes + predictlength];
275        ptrdiff_t sofar = biguintToDecimal(buff, data.dup);
276        return buff[sofar-frontExtraBytes..$];
277    }
278
279    /** Convert to a hex string, printing a minimum number of digits 'minPadding',
280     *  allocating an additional 'frontExtraBytes' at the start of the string.
281     *  Padding is done with padChar, which may be '0' or ' '.
282     *  'separator' is a digit separation character. If non-zero, it is inserted
283     *  between every 8 digits.
284     *  Separator characters do not contribute to the minPadding.
285     */
286    char [] toHexString(int frontExtraBytes, char separator = 0,
287            int minPadding=0, char padChar = '0',
288            LetterCase letterCase = LetterCase.upper) const pure nothrow @safe
289    {
290        // Calculate number of extra padding bytes
291        size_t extraPad = (minPadding > data.length * 2 * BigDigit.sizeof)
292            ? minPadding - data.length * 2 * BigDigit.sizeof : 0;
293
294        // Length not including separator bytes
295        size_t lenBytes = data.length * 2 * BigDigit.sizeof;
296
297        // Calculate number of separator bytes
298        size_t mainSeparatorBytes = separator ? (lenBytes  / 8) - 1 : 0;
299        immutable totalSeparatorBytes = separator ? ((extraPad + lenBytes + 7) / 8) - 1: 0;
300
301        char [] buff = new char[lenBytes + extraPad + totalSeparatorBytes + frontExtraBytes];
302        biguintToHex(buff[$ - lenBytes - mainSeparatorBytes .. $], data, separator, letterCase);
303        if (extraPad > 0)
304        {
305            if (separator)
306            {
307                size_t start = frontExtraBytes; // first index to pad
308                if (extraPad &7)
309                {
310                    // Do 1 to 7 extra zeros.
311                    buff[frontExtraBytes .. frontExtraBytes + (extraPad & 7)] = padChar;
312                    buff[frontExtraBytes + (extraPad & 7)] = (padChar == ' ' ? ' ' : separator);
313                    start += (extraPad & 7) + 1;
314                }
315                for (int i=0; i< (extraPad >> 3); ++i)
316                {
317                    buff[start .. start + 8] = padChar;
318                    buff[start + 8] = (padChar == ' ' ? ' ' : separator);
319                    start += 9;
320                }
321            }
322            else
323            {
324                buff[frontExtraBytes .. frontExtraBytes + extraPad]=padChar;
325            }
326        }
327        int z = frontExtraBytes;
328        if (lenBytes > minPadding)
329        {
330            // Strip leading zeros.
331            ptrdiff_t maxStrip = lenBytes - minPadding;
332            while (z< buff.length-1 && (buff[z]=='0' || buff[z]==padChar) && maxStrip>0)
333            {
334                ++z;
335                --maxStrip;
336            }
337        }
338        if (padChar!='0')
339        {
340            // Convert leading zeros into padChars.
341            for (size_t k= z; k< buff.length-1 && (buff[k]=='0' || buff[k]==padChar); ++k)
342            {
343                if (buff[k]=='0') buff[k]=padChar;
344            }
345        }
346        return buff[z-frontExtraBytes..$];
347    }
348
349    /**
350     * Convert to an octal string.
351     */
352    char[] toOctalString() const
353    {
354        auto predictLength = 1 + data.length*BigDigitBits / 3;
355        char[] buff = new char[predictLength];
356        size_t firstNonZero = biguintToOctal(buff, data);
357        return buff[firstNonZero .. $];
358    }
359
360    // return false if invalid character found
361    bool fromHexString(Range)(Range s) if (
362        isBidirectionalRange!Range && isSomeChar!(ElementType!Range))
363    {
364        import std.range : walkLength;
365
366        //Strip leading zeros
367        while (!s.empty && s.front == '0')
368            s.popFront;
369
370        if (s.empty)
371        {
372            data = ZERO;
373            return true;
374        }
375
376        immutable len = (s.save.walkLength + 15) / 4;
377        auto tmp = new BigDigit[len + 1];
378        uint part, sofar, partcount;
379
380        foreach_reverse (character; s)
381        {
382            if (character == '_')
383                continue;
384
385            uint x;
386            if (character >= '0' && character <= '9')
387            {
388                x = character - '0';
389            }
390            else if (character >= 'A' && character <= 'F')
391            {
392                x = character - 'A' + 10;
393            }
394            else if (character >= 'a' && character <= 'f')
395            {
396                x = character - 'a' + 10;
397            }
398            else
399            {
400                return false;
401            }
402
403            part >>= 4;
404            part |= (x << (32 - 4));
405            ++partcount;
406
407            if (partcount == 8)
408            {
409                tmp[sofar] = part;
410                ++sofar;
411                partcount = 0;
412                part = 0;
413            }
414        }
415        if (part)
416        {
417            for ( ; partcount != 8; ++partcount) part >>= 4;
418            tmp[sofar] = part;
419            ++sofar;
420        }
421        if (sofar == 0)
422            data = ZERO;
423        else
424            data = trustedAssumeUnique(tmp[0 .. sofar]);
425
426        return true;
427    }
428
429    // return true if OK; false if erroneous characters found
430    bool fromDecimalString(Range)(Range s) if (
431        isForwardRange!Range && isSomeChar!(ElementType!Range))
432    {
433        import std.range : walkLength;
434
435        while (!s.empty && s.front == '0')
436        {
437            s.popFront;
438        }
439
440        if (s.empty)
441        {
442            data = ZERO;
443            return true;
444        }
445
446        auto predict_length = (18 * 2 + 2 * s.save.walkLength) / 19;
447        auto tmp = new BigDigit[predict_length];
448
449        tmp.length = biguintFromDecimal(tmp, s);
450
451        data = trustedAssumeUnique(tmp);
452        return true;
453    }
454
455    ////////////////////////
456    //
457    // All of these member functions create a new BigUint.
458
459    // return x >> y
460    BigUint opShr(Tulong)(Tulong y) pure nothrow const if (is (Tulong == ulong))
461    {
462        assert(y>0);
463        uint bits = cast(uint) y & BIGDIGITSHIFTMASK;
464        if ((y >> LG2BIGDIGITBITS) >= data.length) return BigUint(ZERO);
465        uint words = cast(uint)(y >> LG2BIGDIGITBITS);
466        if (bits == 0)
467        {
468            return BigUint(data[words..$]);
469        }
470        else
471        {
472            uint [] result = new BigDigit[data.length - words];
473            multibyteShr(result, data[words..$], bits);
474
475            if (result.length > 1 && result[$-1] == 0)
476                return BigUint(trustedAssumeUnique(result[0 .. $-1]));
477            else
478                return BigUint(trustedAssumeUnique(result));
479        }
480    }
481
482    // return x << y
483    BigUint opShl(Tulong)(Tulong y) pure nothrow const if (is (Tulong == ulong))
484    {
485        assert(y>0);
486        if (isZero()) return this;
487        uint bits = cast(uint) y & BIGDIGITSHIFTMASK;
488        assert((y >> LG2BIGDIGITBITS) < cast(ulong)(uint.max));
489        uint words = cast(uint)(y >> LG2BIGDIGITBITS);
490        BigDigit [] result = new BigDigit[data.length + words+1];
491        result[0 .. words] = 0;
492        if (bits == 0)
493        {
494            result[words .. words+data.length] = data[];
495            return BigUint(trustedAssumeUnique(result[0 .. words+data.length]));
496        }
497        else
498        {
499            immutable c = multibyteShl(result[words .. words+data.length], data, bits);
500            if (c == 0) return BigUint(trustedAssumeUnique(result[0 .. words+data.length]));
501            result[$-1] = c;
502            return BigUint(trustedAssumeUnique(result));
503        }
504    }
505
506    // If wantSub is false, return x + y, leaving sign unchanged
507    // If wantSub is true, return abs(x - y), negating sign if x < y
508    static BigUint addOrSubInt(Tulong)(const BigUint x, Tulong y,
509            bool wantSub, ref bool sign) pure nothrow if (is(Tulong == ulong))
510    {
511        BigUint r;
512        if (wantSub)
513        {   // perform a subtraction
514            if (x.data.length > 2)
515            {
516                r.data = subInt(x.data, y);
517            }
518            else
519            {   // could change sign!
520                ulong xx = x.data[0];
521                if (x.data.length > 1)
522                    xx += (cast(ulong) x.data[1]) << 32;
523                ulong d;
524                if (xx <= y)
525                {
526                    d = y - xx;
527                    sign = !sign;
528                }
529                else
530                {
531                    d = xx - y;
532                }
533                if (d == 0)
534                {
535                    r = 0UL;
536                    sign = false;
537                    return r;
538                }
539                if (d > uint.max)
540                {
541                    r.data = [cast(uint)(d & 0xFFFF_FFFF), cast(uint)(d >> 32)];
542                }
543                else
544                {
545                    r.data = [cast(uint)(d & 0xFFFF_FFFF)];
546                }
547            }
548        }
549        else
550        {
551            r.data = addInt(x.data, y);
552        }
553        return r;
554    }
555
556    // If wantSub is false, return x + y, leaving sign unchanged.
557    // If wantSub is true, return abs(x - y), negating sign if x<y
558    static BigUint addOrSub(BigUint x, BigUint y, bool wantSub, bool *sign)
559        pure nothrow
560    {
561        BigUint r;
562        if (wantSub)
563        {   // perform a subtraction
564            bool negative;
565            r.data = sub(x.data, y.data, &negative);
566            *sign ^= negative;
567            if (r.isZero())
568            {
569                *sign = false;
570            }
571        }
572        else
573        {
574            r.data = add(x.data, y.data);
575        }
576        return r;
577    }
578
579
580    //  return x*y.
581    //  y must not be zero.
582    static BigUint mulInt(T = ulong)(BigUint x, T y) pure nothrow
583    {
584        if (y == 0 || x == 0) return BigUint(ZERO);
585        uint hi = cast(uint)(y >>> 32);
586        uint lo = cast(uint)(y & 0xFFFF_FFFF);
587        uint [] result = new BigDigit[x.data.length+1+(hi != 0)];
588        result[x.data.length] = multibyteMul(result[0 .. x.data.length], x.data, lo, 0);
589        if (hi != 0)
590        {
591            result[x.data.length+1] = multibyteMulAdd!('+')(result[1 .. x.data.length+1],
592                x.data, hi, 0);
593        }
594        return BigUint(removeLeadingZeros(trustedAssumeUnique(result)));
595    }
596
597    /*  return x * y.
598     */
599    static BigUint mul(BigUint x, BigUint y) pure nothrow
600    {
601        if (y == 0 || x == 0)
602            return BigUint(ZERO);
603        auto len = x.data.length + y.data.length;
604        BigDigit [] result = new BigDigit[len];
605        if (y.data.length > x.data.length)
606        {
607            mulInternal(result, y.data, x.data);
608        }
609        else
610        {
611            if (x.data[]==y.data[]) squareInternal(result, x.data);
612            else mulInternal(result, x.data, y.data);
613        }
614        // the highest element could be zero,
615        // in which case we need to reduce the length
616        return BigUint(removeLeadingZeros(trustedAssumeUnique(result)));
617    }
618
619    // return x / y
620    static BigUint divInt(T)(BigUint x, T y_) pure nothrow
621    if ( is(Unqual!T == uint) )
622    {
623        uint y = y_;
624        if (y == 1)
625            return x;
626        uint [] result = new BigDigit[x.data.length];
627        if ((y&(-y))==y)
628        {
629            assert(y != 0, "BigUint division by zero");
630            // perfect power of 2
631            uint b = 0;
632            for (;y != 1; y>>=1)
633            {
634                ++b;
635            }
636            multibyteShr(result, x.data, b);
637        }
638        else
639        {
640            result[] = x.data[];
641            cast(void) multibyteDivAssign(result, y, 0);
642        }
643        return BigUint(removeLeadingZeros(trustedAssumeUnique(result)));
644    }
645
646    static BigUint divInt(T)(BigUint x, T y) pure nothrow
647    if ( is(Unqual!T == ulong) )
648    {
649        if (y <= uint.max)
650            return divInt!uint(x, cast(uint) y);
651        if (x.data.length < 2)
652            return BigUint(ZERO);
653        uint hi = cast(uint)(y >>> 32);
654        uint lo = cast(uint)(y & 0xFFFF_FFFF);
655        immutable uint[2] z = [lo, hi];
656        BigDigit[] result = new BigDigit[x.data.length - z.length + 1];
657        divModInternal(result, null, x.data, z[]);
658        return BigUint(removeLeadingZeros(trustedAssumeUnique(result)));
659    }
660
661    // return x % y
662    static uint modInt(T)(BigUint x, T y_) pure if ( is(Unqual!T == uint) )
663    {
664        import core.memory : GC;
665        uint y = y_;
666        assert(y != 0);
667        if ((y&(-y)) == y)
668        {   // perfect power of 2
669            return x.data[0] & (y-1);
670        }
671        else
672        {
673            // horribly inefficient - malloc, copy, & store are unnecessary.
674            uint [] wasteful = new BigDigit[x.data.length];
675            wasteful[] = x.data[];
676            immutable rem = multibyteDivAssign(wasteful, y, 0);
677            () @trusted { GC.free(wasteful.ptr); } ();
678            return rem;
679        }
680    }
681
682    // return x / y
683    static BigUint div(BigUint x, BigUint y) pure nothrow
684    {
685        if (y.data.length > x.data.length)
686            return BigUint(ZERO);
687        if (y.data.length == 1)
688            return divInt(x, y.data[0]);
689        BigDigit [] result = new BigDigit[x.data.length - y.data.length + 1];
690           divModInternal(result, null, x.data, y.data);
691        return BigUint(removeLeadingZeros(trustedAssumeUnique(result)));
692    }
693
694    // return x % y
695    static BigUint mod(BigUint x, BigUint y) pure nothrow
696    {
697        if (y.data.length > x.data.length) return x;
698        if (y.data.length == 1)
699        {
700            return BigUint([modInt(x, y.data[0])]);
701        }
702        BigDigit [] result = new BigDigit[x.data.length - y.data.length + 1];
703        BigDigit [] rem = new BigDigit[y.data.length];
704        divModInternal(result, rem, x.data, y.data);
705        return BigUint(removeLeadingZeros(trustedAssumeUnique(rem)));
706    }
707
708    // return x op y
709    static BigUint bitwiseOp(string op)(BigUint x, BigUint y, bool xSign, bool ySign, ref bool resultSign)
710    pure nothrow @safe if (op == "|" || op == "^" || op == "&")
711    {
712        auto d1 = includeSign(x.data, y.uintLength, xSign);
713        auto d2 = includeSign(y.data, x.uintLength, ySign);
714
715        foreach (i; 0 .. d1.length)
716        {
717            mixin("d1[i] " ~ op ~ "= d2[i];");
718        }
719
720        mixin("resultSign = xSign " ~ op ~ " ySign;");
721
722        if (resultSign)
723        {
724            twosComplement(d1, d1);
725        }
726
727        return BigUint(removeLeadingZeros(trustedAssumeUnique(d1)));
728    }
729
730    /**
731     * Return a BigUint which is x raised to the power of y.
732     * Method: Powers of 2 are removed from x, then left-to-right binary
733     * exponentiation is used.
734     * Memory allocation is minimized: at most one temporary BigUint is used.
735     */
736    static BigUint pow(BigUint x, ulong y) pure nothrow
737    {
738        // Deal with the degenerate cases first.
739        if (y == 0) return BigUint(ONE);
740        if (y == 1) return x;
741        if (x == 0 || x == 1) return x;
742
743        BigUint result;
744
745        // Simplify, step 1: Remove all powers of 2.
746        uint firstnonzero = firstNonZeroDigit(x.data);
747        // Now we know x = x[firstnonzero..$] * (2^^(firstnonzero*BigDigitBits))
748        // where BigDigitBits = BigDigit.sizeof * 8
749
750        // See if x[firstnonzero..$] can now fit into a single digit.
751        bool singledigit = ((x.data.length - firstnonzero) == 1);
752        // If true, then x0 is that digit
753        // and the result will be (x0 ^^ y) * (2^^(firstnonzero*y*BigDigitBits))
754        BigDigit x0 = x.data[firstnonzero];
755        assert(x0 != 0);
756        // Length of the non-zero portion
757        size_t nonzerolength = x.data.length - firstnonzero;
758        ulong y0;
759        uint evenbits = 0; // number of even bits in the bottom of x
760        while (!(x0 & 1))
761        {
762            x0 >>= 1;
763            ++evenbits;
764        }
765
766        if (x.data.length- firstnonzero == 2)
767        {
768            // Check for a single digit straddling a digit boundary
769            const BigDigit x1 = x.data[firstnonzero+1];
770            if ((x1 >> evenbits) == 0)
771            {
772                x0 |= (x1 << (BigDigit.sizeof * 8 - evenbits));
773                singledigit = true;
774            }
775        }
776        // Now if (singledigit), x^^y  = (x0 ^^ y) * 2^^(evenbits * y) * 2^^(firstnonzero*y*BigDigitBits))
777
778        uint evenshiftbits = 0; // Total powers of 2 to shift by, at the end
779
780        // Simplify, step 2: For singledigits, see if we can trivially reduce y
781
782        BigDigit finalMultiplier = 1UL;
783
784        if (singledigit)
785        {
786            // x fits into a single digit. Raise it to the highest power we can
787            // that still fits into a single digit, then reduce the exponent accordingly.
788            // We're quite likely to have a residual multiply at the end.
789            // For example, 10^^100 = (((5^^13)^^7) * 5^^9) * 2^^100.
790            // and 5^^13 still fits into a uint.
791            evenshiftbits  = cast(uint)( (evenbits * y) & BIGDIGITSHIFTMASK);
792            if (x0 == 1)
793            {   // Perfect power of 2
794                result = 1UL;
795                return result << (evenbits + firstnonzero * 8 * BigDigit.sizeof) * y;
796            }
797            immutable p = highestPowerBelowUintMax(x0);
798            if (y <= p)
799            {   // Just do it with pow
800                result = cast(ulong) intpow(x0, y);
801                if (evenbits + firstnonzero == 0)
802                    return result;
803                return result << (evenbits + firstnonzero * 8 * BigDigit.sizeof) * y;
804            }
805            y0 = y / p;
806            finalMultiplier = intpow(x0, y - y0*p);
807            x0 = intpow(x0, p);
808            // Result is x0
809            nonzerolength = 1;
810        }
811        // Now if (singledigit), x^^y  = finalMultiplier * (x0 ^^ y0) * 2^^(evenbits * y) * 2^^(firstnonzero*y*BigDigitBits))
812
813        // Perform a crude check for overflow and allocate result buffer.
814        // The length required is y * lg2(x) bits.
815        // which will always fit into y*x.length digits. But this is
816        // a gross overestimate if x is small (length 1 or 2) and the highest
817        // digit is nearly empty.
818        // A better estimate is:
819        //   y * lg2(x[$-1]/BigDigit.max) + y * (x.length - 1) digits,
820        //  and the first term is always between
821        //  y * (bsr(x.data[$-1]) + 1) / BIGDIGITBITS and
822        //  y * (bsr(x.data[$-1]) + 2) / BIGDIGITBITS
823        // For single digit payloads, we already have
824        //   x^^y  = finalMultiplier * (x0 ^^ y0) * 2^^(evenbits * y) * 2^^(firstnonzero*y*BigDigitBits))
825        // and x0 is almost a full digit, so it's a tight estimate.
826        // Number of digits is therefore 1 + x0.length*y0 + (evenbits*y)/BIGDIGIT + firstnonzero*y
827        // Note that the divisions must be rounded up.
828
829        // Estimated length in BigDigits
830        immutable estimatelength = singledigit
831            ? 1 + y0 + ((evenbits*y  + BigDigit.sizeof * 8 - 1) / (BigDigit.sizeof *8)) + firstnonzero*y
832            :  x.data.length * y;
833        // Imprecise check for overflow. Makes the extreme cases easier to debug
834        // (less extreme overflow will result in an out of memory error).
835        if (estimatelength > uint.max/(4*BigDigit.sizeof))
836            assert(0, "Overflow in BigInt.pow");
837
838        // The result buffer includes space for all the trailing zeros
839        BigDigit [] resultBuffer = new BigDigit[cast(size_t) estimatelength];
840
841        // Do all the powers of 2!
842        size_t result_start = cast(size_t)( firstnonzero * y
843            + (singledigit ? ((evenbits * y) >> LG2BIGDIGITBITS) : 0));
844
845        resultBuffer[0 .. result_start] = 0;
846        BigDigit [] t1 = resultBuffer[result_start..$];
847        BigDigit [] r1;
848
849        if (singledigit)
850        {
851            r1 = t1[0 .. 1];
852            r1[0] = x0;
853            y = y0;
854        }
855        else
856        {
857            // It's not worth right shifting by evenbits unless we also shrink the length after each
858            // multiply or squaring operation. That might still be worthwhile for large y.
859            r1 = t1[0 .. x.data.length - firstnonzero];
860            r1[0..$] = x.data[firstnonzero..$];
861        }
862
863        if (y>1)
864        {   // Set r1 = r1 ^^ y.
865            // The secondary buffer only needs space for the multiplication results
866            BigDigit [] t2 = new BigDigit[resultBuffer.length - result_start];
867            BigDigit [] r2;
868
869            int shifts = 63; // num bits in a long
870            while (!(y & 0x8000_0000_0000_0000L))
871            {
872                y <<= 1;
873                --shifts;
874            }
875            y <<=1;
876
877            while (y != 0)
878            {
879                // For each bit of y: Set r1 =  r1 * r1
880                // If the bit is 1, set r1 = r1 * x
881                // Eg, if y is 0b101, result = ((x^^2)^^2)*x == x^^5.
882                // Optimization opportunity: if more than 2 bits in y are set,
883                // it's usually possible to reduce the number of multiplies
884                // by caching odd powers of x. eg for y = 54,
885                // (0b110110), set u = x^^3, and result is ((u^^8)*u)^^2
886                r2 = t2[0 .. r1.length*2];
887                squareInternal(r2, r1);
888                if (y & 0x8000_0000_0000_0000L)
889                {
890                    r1 = t1[0 .. r2.length + nonzerolength];
891                    if (singledigit)
892                    {
893                        r1[$-1] = multibyteMul(r1[0 .. $-1], r2, x0, 0);
894                    }
895                    else
896                    {
897                        mulInternal(r1, r2, x.data[firstnonzero..$]);
898                    }
899                }
900                else
901                {
902                    r1 = t1[0 .. r2.length];
903                    r1[] = r2[];
904                }
905                y <<=1;
906                shifts--;
907            }
908            while (shifts>0)
909            {
910                r2 = t2[0 .. r1.length * 2];
911                squareInternal(r2, r1);
912                r1 = t1[0 .. r2.length];
913                r1[] = r2[];
914                --shifts;
915            }
916        }
917
918        if (finalMultiplier != 1)
919        {
920            const BigDigit carry = multibyteMul(r1, r1, finalMultiplier, 0);
921            if (carry)
922            {
923                r1 = t1[0 .. r1.length + 1];
924                r1[$-1] = carry;
925            }
926        }
927        if (evenshiftbits)
928        {
929            const BigDigit carry = multibyteShl(r1, r1, evenshiftbits);
930            if (carry != 0)
931            {
932                r1 = t1[0 .. r1.length + 1];
933                r1[$ - 1] = carry;
934            }
935        }
936        while (r1[$ - 1]==0)
937        {
938            r1=r1[0 .. $ - 1];
939        }
940        return BigUint(trustedAssumeUnique(resultBuffer[0 .. result_start + r1.length]));
941    }
942
943    // Implement toHash so that BigUint works properly as an AA key.
944    size_t toHash() const @trusted nothrow
945    {
946        return typeid(data).getHash(&data);
947    }
948
949} // end BigUint
950
951@safe pure nothrow unittest
952{
953    // ulong comparison test
954    BigUint a = [1];
955    assert(a == 1);
956    assert(a < 0x8000_0000_0000_0000UL); // bug 9548
957
958    // bug 12234
959    BigUint z = [0];
960    assert(z == 0UL);
961    assert(!(z > 0UL));
962    assert(!(z < 0UL));
963}
964
965// Remove leading zeros from x, to restore the BigUint invariant
966inout(BigDigit) [] removeLeadingZeros(inout(BigDigit) [] x) pure nothrow @safe
967{
968    size_t k = x.length;
969    while (k>1 && x[k - 1]==0) --k;
970    return x[0 .. k];
971}
972
973pure @system unittest
974{
975   BigUint r = BigUint([5]);
976   BigUint t = BigUint([7]);
977   BigUint s = BigUint.mod(r, t);
978   assert(s == 5);
979}
980
981
982@safe pure unittest
983{
984    BigUint r;
985    r = 5UL;
986    assert(r.peekUlong(0) == 5UL);
987    assert(r.peekUint(0) == 5U);
988    r = 0x1234_5678_9ABC_DEF0UL;
989    assert(r.peekUlong(0) == 0x1234_5678_9ABC_DEF0UL);
990    assert(r.peekUint(0) == 0x9ABC_DEF0U);
991}
992
993
994// Pow tests
995pure @system unittest
996{
997    BigUint r, s;
998    r.fromHexString("80000000_00000001");
999    s = BigUint.pow(r, 5);
1000    r.fromHexString("08000000_00000000_50000000_00000001_40000000_00000002_80000000"
1001      ~ "_00000002_80000000_00000001");
1002    assert(s == r);
1003    s = 10UL;
1004    s = BigUint.pow(s, 39);
1005    r.fromDecimalString("1000000000000000000000000000000000000000");
1006    assert(s == r);
1007    r.fromHexString("1_E1178E81_00000000");
1008    s = BigUint.pow(r, 15); // Regression test: this used to overflow array bounds
1009
1010    r.fromDecimalString("000_000_00");
1011    assert(r == 0);
1012    r.fromDecimalString("0007");
1013    assert(r == 7);
1014    r.fromDecimalString("0");
1015    assert(r == 0);
1016}
1017
1018// Radix conversion tests
1019@safe pure unittest
1020{
1021    BigUint r;
1022    r.fromHexString("1_E1178E81_00000000");
1023    assert(r.toHexString(0, '_', 0) == "1_E1178E81_00000000");
1024    assert(r.toHexString(0, '_', 20) == "0001_E1178E81_00000000");
1025    assert(r.toHexString(0, '_', 16+8) == "00000001_E1178E81_00000000");
1026    assert(r.toHexString(0, '_', 16+9) == "0_00000001_E1178E81_00000000");
1027    assert(r.toHexString(0, '_', 16+8+8) ==   "00000000_00000001_E1178E81_00000000");
1028    assert(r.toHexString(0, '_', 16+8+8+1) ==      "0_00000000_00000001_E1178E81_00000000");
1029    assert(r.toHexString(0, '_', 16+8+8+1, ' ') == "                  1_E1178E81_00000000");
1030    assert(r.toHexString(0, 0, 16+8+8+1) == "00000000000000001E1178E8100000000");
1031    r = 0UL;
1032    assert(r.toHexString(0, '_', 0) == "0");
1033    assert(r.toHexString(0, '_', 7) == "0000000");
1034    assert(r.toHexString(0, '_', 7, ' ') == "      0");
1035    assert(r.toHexString(0, '#', 9) == "0#00000000");
1036    assert(r.toHexString(0, 0, 9) == "000000000");
1037}
1038
1039//
1040@safe pure unittest
1041{
1042    BigUint r;
1043    r.fromHexString("1_E1178E81_00000000");
1044    assert(r.toHexString(0, '_', 0, '0', LetterCase.lower) == "1_e1178e81_00000000");
1045    assert(r.toHexString(0, '_', 20, '0', LetterCase.lower) == "0001_e1178e81_00000000");
1046    assert(r.toHexString(0, '_', 16+8, '0', LetterCase.lower) == "00000001_e1178e81_00000000");
1047    assert(r.toHexString(0, '_', 16+9, '0', LetterCase.lower) == "0_00000001_e1178e81_00000000");
1048    assert(r.toHexString(0, '_', 16+8+8, '0', LetterCase.lower) ==   "00000000_00000001_e1178e81_00000000");
1049    assert(r.toHexString(0, '_', 16+8+8+1, '0', LetterCase.lower) == "0_00000000_00000001_e1178e81_00000000");
1050    assert(r.toHexString(0, '_', 16+8+8+1, ' ', LetterCase.lower) == "                  1_e1178e81_00000000");
1051    assert(r.toHexString(0, 0, 16+8+8+1, '0', LetterCase.lower) == "00000000000000001e1178e8100000000");
1052    r = 0UL;
1053    assert(r.toHexString(0, '_', 0, '0', LetterCase.lower) == "0");
1054    assert(r.toHexString(0, '_', 7, '0', LetterCase.lower) == "0000000");
1055    assert(r.toHexString(0, '_', 7, ' ', LetterCase.lower) == "      0");
1056    assert(r.toHexString(0, '#', 9, '0', LetterCase.lower) == "0#00000000");
1057    assert(r.toHexString(0, 'Z', 9, '0', LetterCase.lower) == "0Z00000000");
1058    assert(r.toHexString(0, 0, 9, '0', LetterCase.lower) == "000000000");
1059}
1060
1061
1062private:
1063void twosComplement(const(BigDigit) [] x, BigDigit[] result)
1064pure nothrow @safe
1065{
1066    foreach (i; 0 .. x.length)
1067    {
1068        result[i] = ~x[i];
1069    }
1070    result[x.length..$] = BigDigit.max;
1071
1072    foreach (i; 0 .. result.length)
1073    {
1074        if (result[i] == BigDigit.max)
1075        {
1076            result[i] = 0;
1077        }
1078        else
1079        {
1080            result[i] += 1;
1081            break;
1082        }
1083    }
1084}
1085
1086// Encode BigInt as BigDigit array (sign and 2's complement)
1087BigDigit[] includeSign(const(BigDigit) [] x, size_t minSize, bool sign)
1088pure nothrow @safe
1089{
1090    size_t length = (x.length > minSize) ? x.length : minSize;
1091    BigDigit [] result = new BigDigit[length];
1092    if (sign)
1093    {
1094        twosComplement(x, result);
1095    }
1096    else
1097    {
1098        result[0 .. x.length] = x;
1099    }
1100    return result;
1101}
1102
1103// works for any type
1104T intpow(T)(T x, ulong n) pure nothrow @safe
1105{
1106    T p;
1107
1108    switch (n)
1109    {
1110    case 0:
1111        p = 1;
1112        break;
1113
1114    case 1:
1115        p = x;
1116        break;
1117
1118    case 2:
1119        p = x * x;
1120        break;
1121
1122    default:
1123        p = 1;
1124        while (1)
1125        {
1126            if (n & 1)
1127                p *= x;
1128            n >>= 1;
1129            if (!n)
1130                break;
1131            x *= x;
1132        }
1133        break;
1134    }
1135    return p;
1136}
1137
1138
1139//  returns the maximum power of x that will fit in a uint.
1140int highestPowerBelowUintMax(uint x) pure nothrow @safe
1141{
1142     assert(x>1);
1143     static immutable ubyte [22] maxpwr = [ 31, 20, 15, 13, 12, 11, 10, 10, 9, 9,
1144                                          8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7];
1145     if (x<24) return maxpwr[x-2];
1146     if (x<41) return 6;
1147     if (x<85) return 5;
1148     if (x<256) return 4;
1149     if (x<1626) return 3;
1150     if (x<65_536) return 2;
1151     return 1;
1152}
1153
1154//  returns the maximum power of x that will fit in a ulong.
1155int highestPowerBelowUlongMax(uint x) pure nothrow @safe
1156{
1157     assert(x>1);
1158     static immutable ubyte [39] maxpwr = [ 63, 40, 31, 27, 24, 22, 21, 20, 19, 18,
1159                                         17, 17, 16, 16, 15, 15, 15, 15, 14, 14,
1160                                         14, 14, 13, 13, 13, 13, 13, 13, 13, 12,
1161                                         12, 12, 12, 12, 12, 12, 12, 12, 12];
1162     if (x<41) return maxpwr[x-2];
1163     if (x<57) return 11;
1164     if (x<85) return 10;
1165     if (x<139) return 9;
1166     if (x<256) return 8;
1167     if (x<566) return 7;
1168     if (x<1626) return 6;
1169     if (x<7132) return 5;
1170     if (x<65_536) return 4;
1171     if (x<2_642_246) return 3;
1172     return 2;
1173}
1174
1175version (unittest)
1176{
1177
1178int slowHighestPowerBelowUintMax(uint x) pure nothrow @safe
1179{
1180     int pwr = 1;
1181     for (ulong q = x;x*q < cast(ulong) uint.max; )
1182     {
1183         q*=x; ++pwr;
1184     }
1185     return pwr;
1186}
1187
1188@safe pure unittest
1189{
1190    assert(highestPowerBelowUintMax(10)==9);
1191    for (int k=82; k<88; ++k)
1192    {
1193        assert(highestPowerBelowUintMax(k)== slowHighestPowerBelowUintMax(k));
1194    }
1195}
1196
1197}
1198
1199
1200/*  General unsigned subtraction routine for bigints.
1201 *  Sets result = x - y. If the result is negative, negative will be true.
1202 */
1203BigDigit [] sub(const BigDigit [] x, const BigDigit [] y, bool *negative)
1204pure nothrow
1205{
1206    if (x.length == y.length)
1207    {
1208        // There's a possibility of cancellation, if x and y are almost equal.
1209        ptrdiff_t last = highestDifferentDigit(x, y);
1210        BigDigit [] result = new BigDigit[last+1];
1211        if (x[last] < y[last])
1212        {   // we know result is negative
1213            multibyteSub(result[0 .. last+1], y[0 .. last+1], x[0 .. last+1], 0);
1214            *negative = true;
1215        }
1216        else
1217        {   // positive or zero result
1218            multibyteSub(result[0 .. last+1], x[0 .. last+1], y[0 .. last+1], 0);
1219            *negative = false;
1220        }
1221        while (result.length > 1 && result[$-1] == 0)
1222        {
1223            result = result[0..$-1];
1224        }
1225//        if (result.length >1 && result[$-1]==0) return result[0..$-1];
1226        return result;
1227    }
1228    // Lengths are different
1229    const(BigDigit) [] large, small;
1230    if (x.length < y.length)
1231    {
1232        *negative = true;
1233        large = y; small = x;
1234    }
1235    else
1236    {
1237        *negative = false;
1238        large = x; small = y;
1239    }
1240    // result.length will be equal to larger length, or could decrease by 1.
1241
1242
1243    BigDigit [] result = new BigDigit[large.length];
1244    BigDigit carry = multibyteSub(result[0 .. small.length], large[0 .. small.length], small, 0);
1245    result[small.length..$] = large[small.length..$];
1246    if (carry)
1247    {
1248        multibyteIncrementAssign!('-')(result[small.length..$], carry);
1249    }
1250    while (result.length > 1 && result[$-1] == 0)
1251    {
1252        result = result[0..$-1];
1253    }
1254    return result;
1255}
1256
1257
1258// return a + b
1259BigDigit [] add(const BigDigit [] a, const BigDigit [] b) pure nothrow
1260{
1261    const(BigDigit) [] x, y;
1262    if (a.length < b.length)
1263    {
1264        x = b; y = a;
1265    }
1266    else
1267    {
1268        x = a; y = b;
1269    }
1270    // now we know x.length > y.length
1271    // create result. add 1 in case it overflows
1272    BigDigit [] result = new BigDigit[x.length + 1];
1273
1274    BigDigit carry = multibyteAdd(result[0 .. y.length], x[0 .. y.length], y, 0);
1275    if (x.length != y.length)
1276    {
1277        result[y.length..$-1]= x[y.length..$];
1278        carry  = multibyteIncrementAssign!('+')(result[y.length..$-1], carry);
1279    }
1280    if (carry)
1281    {
1282        result[$-1] = carry;
1283        return result;
1284    }
1285    else
1286        return result[0..$-1];
1287}
1288
1289/**  return x + y
1290 */
1291BigDigit [] addInt(const BigDigit[] x, ulong y) pure nothrow
1292{
1293    uint hi = cast(uint)(y >>> 32);
1294    uint lo = cast(uint)(y& 0xFFFF_FFFF);
1295    auto len = x.length;
1296    if (x.length < 2 && hi != 0) ++len;
1297    BigDigit [] result = new BigDigit[len+1];
1298    result[0 .. x.length] = x[];
1299    if (x.length < 2 && hi != 0)
1300    {
1301        result[1]=hi;
1302        hi=0;
1303    }
1304    uint carry = multibyteIncrementAssign!('+')(result[0..$-1], lo);
1305    if (hi != 0) carry += multibyteIncrementAssign!('+')(result[1..$-1], hi);
1306    if (carry)
1307    {
1308        result[$-1] = carry;
1309        return result;
1310    }
1311    else
1312        return result[0..$-1];
1313}
1314
1315/** Return x - y.
1316 *  x must be greater than y.
1317 */
1318BigDigit [] subInt(const BigDigit[] x, ulong y) pure nothrow
1319{
1320    uint hi = cast(uint)(y >>> 32);
1321    uint lo = cast(uint)(y & 0xFFFF_FFFF);
1322    BigDigit [] result = new BigDigit[x.length];
1323    result[] = x[];
1324    multibyteIncrementAssign!('-')(result[], lo);
1325    if (hi)
1326        multibyteIncrementAssign!('-')(result[1..$], hi);
1327    if (result[$-1] == 0)
1328        return result[0..$-1];
1329    else
1330        return result;
1331}
1332
1333/**  General unsigned multiply routine for bigints.
1334 *  Sets result = x * y.
1335 *
1336 *  The length of y must not be larger than the length of x.
1337 *  Different algorithms are used, depending on the lengths of x and y.
1338 *  TODO: "Modern Computer Arithmetic" suggests the OddEvenKaratsuba algorithm for the
1339 *  unbalanced case. (But I doubt it would be faster in practice).
1340 *
1341 */
1342void mulInternal(BigDigit[] result, const(BigDigit)[] x, const(BigDigit)[] y)
1343    pure nothrow
1344{
1345    import core.memory : GC;
1346    assert( result.length == x.length + y.length );
1347    assert( y.length > 0 );
1348    assert( x.length >= y.length);
1349    if (y.length <= KARATSUBALIMIT)
1350    {
1351        // Small multiplier, we'll just use the asm classic multiply.
1352        if (y.length == 1)
1353        {   // Trivial case, no cache effects to worry about
1354            result[x.length] = multibyteMul(result[0 .. x.length], x, y[0], 0);
1355            return;
1356        }
1357
1358        if (x.length + y.length < CACHELIMIT)
1359            return mulSimple(result, x, y);
1360
1361        // If x is so big that it won't fit into the cache, we divide it into chunks
1362        // Every chunk must be greater than y.length.
1363        // We make the first chunk shorter, if necessary, to ensure this.
1364
1365        auto chunksize = CACHELIMIT / y.length;
1366        immutable residual  =  x.length % chunksize;
1367        if (residual < y.length)
1368        {
1369            chunksize -= y.length;
1370        }
1371
1372        // Use schoolbook multiply.
1373        mulSimple(result[0 .. chunksize + y.length], x[0 .. chunksize], y);
1374        auto done = chunksize;
1375
1376        while (done < x.length)
1377        {
1378            // result[done .. done+ylength] already has a value.
1379            chunksize = (done + (CACHELIMIT / y.length) < x.length) ? (CACHELIMIT / y.length) :  x.length - done;
1380            BigDigit [KARATSUBALIMIT] partial;
1381            partial[0 .. y.length] = result[done .. done+y.length];
1382            mulSimple(result[done .. done+chunksize+y.length], x[done .. done+chunksize], y);
1383            addAssignSimple(result[done .. done+chunksize + y.length], partial[0 .. y.length]);
1384            done += chunksize;
1385        }
1386        return;
1387    }
1388
1389    immutable half = (x.length >> 1) + (x.length & 1);
1390    if (2*y.length*y.length <= x.length*x.length)
1391    {
1392        // UNBALANCED MULTIPLY
1393        // Use school multiply to cut into quasi-squares of Karatsuba-size
1394        // or larger. The ratio of the two sides of the 'square' must be
1395        // between 1.414:1 and 1:1. Use Karatsuba on each chunk.
1396        //
1397        // For maximum performance, we want the ratio to be as close to
1398        // 1:1 as possible. To achieve this, we can either pad x or y.
1399        // The best choice depends on the modulus x%y.
1400        auto numchunks = x.length / y.length;
1401        auto chunksize = y.length;
1402        auto extra =  x.length % y.length;
1403        auto maxchunk = chunksize + extra;
1404        bool paddingY; // true = we're padding Y, false = we're padding X.
1405        if (extra * extra * 2 < y.length*y.length)
1406        {
1407            // The leftover bit is small enough that it should be incorporated
1408            // in the existing chunks.
1409            // Make all the chunks a tiny bit bigger
1410            // (We're padding y with zeros)
1411            chunksize += extra / numchunks;
1412            extra = x.length - chunksize*numchunks;
1413            // there will probably be a few left over.
1414            // Every chunk will either have size chunksize, or chunksize+1.
1415            maxchunk = chunksize + 1;
1416            paddingY = true;
1417            assert(chunksize + extra + chunksize *(numchunks-1) == x.length );
1418        }
1419        else
1420        {
1421            // the extra bit is large enough that it's worth making a new chunk.
1422            // (This means we're padding x with zeros, when doing the first one).
1423            maxchunk = chunksize;
1424            ++numchunks;
1425            paddingY = false;
1426            assert(extra + chunksize *(numchunks-1) == x.length );
1427        }
1428        // We make the buffer a bit bigger so we have space for the partial sums.
1429        BigDigit [] scratchbuff = new BigDigit[karatsubaRequiredBuffSize(maxchunk) + y.length];
1430        BigDigit [] partial = scratchbuff[$ - y.length .. $];
1431        size_t done; // how much of X have we done so far?
1432        if (paddingY)
1433        {
1434            // If the first chunk is bigger, do it first. We're padding y.
1435            mulKaratsuba(result[0 .. y.length + chunksize + (extra > 0 ? 1 : 0 )],
1436                x[0 .. chunksize + (extra>0?1:0)], y, scratchbuff);
1437            done = chunksize + (extra > 0 ? 1 : 0);
1438            if (extra) --extra;
1439        }
1440        else
1441        {   // We're padding X. Begin with the extra bit.
1442            mulKaratsuba(result[0 .. y.length + extra], y, x[0 .. extra], scratchbuff);
1443            done = extra;
1444            extra = 0;
1445        }
1446        immutable basechunksize = chunksize;
1447        while (done < x.length)
1448        {
1449            chunksize = basechunksize + (extra > 0 ? 1 : 0);
1450            if (extra) --extra;
1451            partial[] = result[done .. done+y.length];
1452            mulKaratsuba(result[done .. done + y.length + chunksize],
1453                       x[done .. done+chunksize], y, scratchbuff);
1454            addAssignSimple(result[done .. done + y.length + chunksize], partial);
1455            done += chunksize;
1456        }
1457        () @trusted { GC.free(scratchbuff.ptr); } ();
1458    }
1459    else
1460    {
1461        // Balanced. Use Karatsuba directly.
1462        BigDigit [] scratchbuff = new BigDigit[karatsubaRequiredBuffSize(x.length)];
1463        mulKaratsuba(result, x, y, scratchbuff);
1464        () @trusted { GC.free(scratchbuff.ptr); } ();
1465    }
1466}
1467
1468/**  General unsigned squaring routine for BigInts.
1469 *   Sets result = x*x.
1470 *   NOTE: If the highest half-digit of x is zero, the highest digit of result will
1471 *   also be zero.
1472 */
1473void squareInternal(BigDigit[] result, const BigDigit[] x) pure nothrow
1474{
1475  import core.memory : GC;
1476  // Squaring is potentially half a multiply, plus add the squares of
1477  // the diagonal elements.
1478  assert(result.length == 2*x.length);
1479  if (x.length <= KARATSUBASQUARELIMIT)
1480  {
1481      if (x.length == 1)
1482      {
1483         result[1] = multibyteMul(result[0 .. 1], x, x[0], 0);
1484         return;
1485      }
1486      return squareSimple(result, x);
1487  }
1488  // The nice thing about squaring is that it always stays balanced
1489  BigDigit [] scratchbuff = new BigDigit[karatsubaRequiredBuffSize(x.length)];
1490  squareKaratsuba(result, x, scratchbuff);
1491  () @trusted { GC.free(scratchbuff.ptr); } ();
1492}
1493
1494
1495import core.bitop : bsr;
1496
1497/// if remainder is null, only calculate quotient.
1498void divModInternal(BigDigit [] quotient, BigDigit[] remainder, const BigDigit [] u,
1499        const BigDigit [] v) pure nothrow
1500{
1501    import core.memory : GC;
1502    assert(quotient.length == u.length - v.length + 1);
1503    assert(remainder == null || remainder.length == v.length);
1504    assert(v.length > 1);
1505    assert(u.length >= v.length);
1506
1507    // Normalize by shifting v left just enough so that
1508    // its high-order bit is on, and shift u left the
1509    // same amount. The highest bit of u will never be set.
1510
1511    BigDigit [] vn = new BigDigit[v.length];
1512    BigDigit [] un = new BigDigit[u.length + 1];
1513    // How much to left shift v, so that its MSB is set.
1514    uint s = BIGDIGITSHIFTMASK - bsr(v[$-1]);
1515    if (s != 0)
1516    {
1517        multibyteShl(vn, v, s);
1518        un[$-1] = multibyteShl(un[0..$-1], u, s);
1519    }
1520    else
1521    {
1522        vn[] = v[];
1523        un[0..$-1] = u[];
1524        un[$-1] = 0;
1525    }
1526    if (quotient.length<FASTDIVLIMIT)
1527    {
1528        schoolbookDivMod(quotient, un, vn);
1529    }
1530    else
1531    {
1532        blockDivMod(quotient, un, vn);
1533    }
1534
1535    // Unnormalize remainder, if required.
1536    if (remainder != null)
1537    {
1538        if (s == 0) remainder[] = un[0 .. vn.length];
1539        else multibyteShr(remainder, un[0 .. vn.length+1], s);
1540    }
1541    () @trusted { GC.free(un.ptr); GC.free(vn.ptr); } ();
1542}
1543
1544pure @system unittest
1545{
1546    immutable(uint) [] u = [0, 0xFFFF_FFFE, 0x8000_0000];
1547    immutable(uint) [] v = [0xFFFF_FFFF, 0x8000_0000];
1548    uint [] q = new uint[u.length - v.length + 1];
1549    uint [] r = new uint[2];
1550    divModInternal(q, r, u, v);
1551    assert(q[]==[0xFFFF_FFFFu, 0]);
1552    assert(r[]==[0xFFFF_FFFFu, 0x7FFF_FFFF]);
1553    u = [0, 0xFFFF_FFFE, 0x8000_0001];
1554    v = [0xFFFF_FFFF, 0x8000_0000];
1555    divModInternal(q, r, u, v);
1556}
1557
1558
1559private:
1560// Converts a big uint to a hexadecimal string.
1561//
1562// Optionally, a separator character (eg, an underscore) may be added between
1563// every 8 digits.
1564// buff.length must be data.length*8 if separator is zero,
1565// or data.length*9 if separator is non-zero. It will be completely filled.
1566char [] biguintToHex(char [] buff, const BigDigit [] data, char separator=0,
1567        LetterCase letterCase = LetterCase.upper) pure nothrow @safe
1568{
1569    int x=0;
1570    for (ptrdiff_t i=data.length - 1; i >= 0; --i)
1571    {
1572        toHexZeroPadded(buff[x .. x+8], data[i], letterCase);
1573        x+=8;
1574        if (separator)
1575        {
1576            if (i>0) buff[x] = separator;
1577            ++x;
1578        }
1579    }
1580    return buff;
1581}
1582
1583/**
1584 * Convert a big uint into an octal string.
1585 *
1586 * Params:
1587 *  buff = The destination buffer for the octal string. Must be large enough to
1588 *      store the result, including leading zeroes, which is
1589 *      ceil(data.length * BigDigitBits / 3) characters.
1590 *      The buffer is filled from back to front, starting from `buff[$-1]`.
1591 *  data = The biguint to be converted.
1592 *
1593 * Returns: The index of the leading non-zero digit in `buff`. Will be
1594 * `buff.length - 1` if the entire big uint is zero.
1595 */
1596size_t biguintToOctal(char[] buff, const(BigDigit)[] data)
1597    pure nothrow @safe @nogc
1598{
1599    ubyte carry = 0;
1600    int shift = 0;
1601    size_t penPos = buff.length - 1;
1602    size_t lastNonZero = buff.length - 1;
1603
1604    pragma(inline) void output(uint digit) @nogc nothrow
1605    {
1606        if (digit != 0)
1607            lastNonZero = penPos;
1608        buff[penPos--] = cast(char)('0' + digit);
1609    }
1610
1611    foreach (bigdigit; data)
1612    {
1613        if (shift < 0)
1614        {
1615            // Some bits were carried over from previous word.
1616            assert(shift > -3);
1617            output(((bigdigit << -shift) | carry) & 0b111);
1618            shift += 3;
1619            assert(shift > 0);
1620        }
1621
1622        while (shift <= BigDigitBits - 3)
1623        {
1624            output((bigdigit >>> shift) & 0b111);
1625            shift += 3;
1626        }
1627
1628        if (shift < BigDigitBits)
1629        {
1630            // Some bits need to be carried forward.
1631            carry = (bigdigit >>> shift) & 0b11;
1632        }
1633        shift -= BigDigitBits;
1634        assert(shift >= -2 && shift <= 0);
1635    }
1636
1637    if (shift < 0)
1638    {
1639        // Last word had bits that haven't been output yet.
1640        assert(shift > -3);
1641        output(carry);
1642    }
1643
1644    return lastNonZero;
1645}
1646
1647/** Convert a big uint into a decimal string.
1648 *
1649 * Params:
1650 *  data    The biguint to be converted. Will be destroyed.
1651 *  buff    The destination buffer for the decimal string. Must be
1652 *          large enough to store the result, including leading zeros.
1653 *          Will be filled backwards, starting from buff[$-1].
1654 *
1655 * buff.length must be >= (data.length*32)/log2(10) = 9.63296 * data.length.
1656 * Returns:
1657 *    the lowest index of buff which was used.
1658 */
1659size_t biguintToDecimal(char [] buff, BigDigit [] data) pure nothrow
1660{
1661    ptrdiff_t sofar = buff.length;
1662    // Might be better to divide by (10^38/2^32) since that gives 38 digits for
1663    // the price of 3 divisions and a shr; this version only gives 27 digits
1664    // for 3 divisions.
1665    while (data.length>1)
1666    {
1667        uint rem = multibyteDivAssign(data, 10_0000_0000, 0);
1668        itoaZeroPadded(buff[sofar-9 .. sofar], rem);
1669        sofar -= 9;
1670        if (data[$-1] == 0 && data.length > 1)
1671        {
1672            data.length = data.length - 1;
1673        }
1674    }
1675    itoaZeroPadded(buff[sofar-10 .. sofar], data[0]);
1676    sofar -= 10;
1677    // and strip off the leading zeros
1678    while (sofar != buff.length-1 && buff[sofar] == '0')
1679        sofar++;
1680    return sofar;
1681}
1682
1683/** Convert a decimal string into a big uint.
1684 *
1685 * Params:
1686 *  data    The biguint to be receive the result. Must be large enough to
1687 *          store the result.
1688 *  s       The decimal string. May contain _ or 0 .. 9
1689 *
1690 * The required length for the destination buffer is slightly less than
1691 *  1 + s.length/log2(10) = 1 + s.length/3.3219.
1692 *
1693 * Returns:
1694 *    the highest index of data which was used.
1695 */
1696int biguintFromDecimal(Range)(BigDigit[] data, Range s)
1697if (
1698    isInputRange!Range &&
1699    isSomeChar!(ElementType!Range) &&
1700    !isInfinite!Range)
1701in
1702{
1703    static if (hasLength!Range)
1704        assert((data.length >= 2) || (data.length == 1 && s.length == 1));
1705}
1706body
1707{
1708    import std.conv : ConvException;
1709
1710    // Convert to base 1e19 = 10_000_000_000_000_000_000.
1711    // (this is the largest power of 10 that will fit into a long).
1712    // The length will be less than 1 + s.length/log2(10) = 1 + s.length/3.3219.
1713    // 485 bits will only just fit into 146 decimal digits.
1714    // As we convert the string, we record the number of digits we've seen in base 19:
1715    // hi is the number of digits/19, lo is the extra digits (0 to 18).
1716    // TODO: This is inefficient for very large strings (it is O(n^^2)).
1717    // We should take advantage of fast multiplication once the numbers exceed
1718    // Karatsuba size.
1719    uint lo = 0; // number of powers of digits, 0 .. 18
1720    uint x = 0;
1721    ulong y = 0;
1722    uint hi = 0; // number of base 1e19 digits
1723    data[0] = 0; // initially number is 0.
1724    if (data.length > 1)
1725        data[1] = 0;
1726
1727    foreach (character; s)
1728    {
1729        if (character == '_')
1730            continue;
1731
1732        if (character < '0' || character > '9')
1733            throw new ConvException("invalid digit");
1734        x *= 10;
1735        x += character - '0';
1736        ++lo;
1737        if (lo == 9)
1738        {
1739            y = x;
1740            x = 0;
1741        }
1742        if (lo == 18)
1743        {
1744            y *= 10_0000_0000;
1745            y += x;
1746            x = 0;
1747        }
1748        if (lo == 19)
1749        {
1750            y *= 10;
1751            y += x;
1752            x = 0;
1753            // Multiply existing number by 10^19, then add y1.
1754            if (hi>0)
1755            {
1756                data[hi] = multibyteMul(data[0 .. hi], data[0 .. hi], 1_220_703_125*2u, 0); // 5^13*2 = 0x9184_E72A
1757                ++hi;
1758                data[hi] = multibyteMul(data[0 .. hi], data[0 .. hi], 15_625*262_144u, 0); // 5^6*2^18 = 0xF424_0000
1759                ++hi;
1760            }
1761            else
1762                hi = 2;
1763            uint c = multibyteIncrementAssign!('+')(data[0 .. hi], cast(uint)(y&0xFFFF_FFFF));
1764            c += multibyteIncrementAssign!('+')(data[1 .. hi], cast(uint)(y >> 32));
1765            if (c != 0)
1766            {
1767                data[hi]=c;
1768                ++hi;
1769            }
1770            y = 0;
1771            lo = 0;
1772        }
1773    }
1774    // Now set y = all remaining digits.
1775    if (lo >= 18)
1776    {
1777    }
1778    else if (lo >= 9)
1779    {
1780        for (int k=9; k<lo; ++k) y*=10;
1781        y+=x;
1782    }
1783    else
1784    {
1785        for (int k=0; k<lo; ++k) y*=10;
1786        y+=x;
1787    }
1788    if (lo != 0)
1789    {
1790        if (hi == 0)
1791        {
1792            data[0] = cast(uint) y;
1793            if (data.length == 1)
1794            {
1795                hi = 1;
1796            }
1797            else
1798            {
1799                data[1] = cast(uint)(y >>> 32);
1800                hi=2;
1801            }
1802        }
1803        else
1804        {
1805            while (lo>0)
1806            {
1807                immutable c = multibyteMul(data[0 .. hi], data[0 .. hi], 10, 0);
1808                if (c != 0)
1809                {
1810                    data[hi]=c;
1811                    ++hi;
1812                }
1813                --lo;
1814            }
1815            uint c = multibyteIncrementAssign!('+')(data[0 .. hi], cast(uint)(y&0xFFFF_FFFF));
1816            if (y > 0xFFFF_FFFFL)
1817            {
1818                c += multibyteIncrementAssign!('+')(data[1 .. hi], cast(uint)(y >> 32));
1819            }
1820            if (c != 0)
1821            {
1822                data[hi]=c;
1823                ++hi;
1824            }
1825        }
1826    }
1827    while (hi>1 && data[hi-1]==0)
1828        --hi;
1829    return hi;
1830}
1831
1832
1833private:
1834// ------------------------
1835// These in-place functions are only for internal use; they are incompatible
1836// with COW.
1837
1838// Classic 'schoolbook' multiplication.
1839void mulSimple(BigDigit[] result, const(BigDigit) [] left,
1840        const(BigDigit)[] right) pure nothrow
1841in
1842{
1843    assert(result.length == left.length + right.length);
1844    assert(right.length>1);
1845}
1846body
1847{
1848    result[left.length] = multibyteMul(result[0 .. left.length], left, right[0], 0);
1849    multibyteMultiplyAccumulate(result[1..$], left, right[1..$]);
1850}
1851
1852// Classic 'schoolbook' squaring
1853void squareSimple(BigDigit[] result, const(BigDigit) [] x) pure nothrow
1854in
1855{
1856    assert(result.length == 2*x.length);
1857    assert(x.length>1);
1858}
1859body
1860{
1861    multibyteSquare(result, x);
1862}
1863
1864
1865// add two uints of possibly different lengths. Result must be as long
1866// as the larger length.
1867// Returns carry (0 or 1).
1868uint addSimple(BigDigit[] result, const BigDigit [] left, const BigDigit [] right)
1869pure nothrow
1870in
1871{
1872    assert(result.length == left.length);
1873    assert(left.length >= right.length);
1874    assert(right.length>0);
1875}
1876body
1877{
1878    uint carry = multibyteAdd(result[0 .. right.length],
1879            left[0 .. right.length], right, 0);
1880    if (right.length < left.length)
1881    {
1882        result[right.length .. left.length] = left[right.length .. $];
1883        carry = multibyteIncrementAssign!('+')(result[right.length..$], carry);
1884    }
1885    return carry;
1886}
1887
1888//  result = left - right
1889// returns carry (0 or 1)
1890BigDigit subSimple(BigDigit [] result,const(BigDigit) [] left,
1891        const(BigDigit) [] right) pure nothrow
1892in
1893{
1894    assert(result.length == left.length);
1895    assert(left.length >= right.length);
1896    assert(right.length>0);
1897}
1898body
1899{
1900    BigDigit carry = multibyteSub(result[0 .. right.length],
1901            left[0 .. right.length], right, 0);
1902    if (right.length < left.length)
1903    {
1904        result[right.length .. left.length] = left[right.length .. $];
1905        carry = multibyteIncrementAssign!('-')(result[right.length..$], carry);
1906    } //else if (result.length == left.length+1) { result[$-1] = carry; carry=0; }
1907    return carry;
1908}
1909
1910
1911/* result = result - right
1912 * Returns carry = 1 if result was less than right.
1913*/
1914BigDigit subAssignSimple(BigDigit [] result, const(BigDigit) [] right)
1915pure nothrow
1916{
1917    assert(result.length >= right.length);
1918    uint c = multibyteSub(result[0 .. right.length], result[0 .. right.length], right, 0);
1919    if (c && result.length > right.length)
1920        c = multibyteIncrementAssign!('-')(result[right.length .. $], c);
1921    return c;
1922}
1923
1924/* result = result + right
1925*/
1926BigDigit addAssignSimple(BigDigit [] result, const(BigDigit) [] right)
1927pure nothrow
1928{
1929    assert(result.length >= right.length);
1930    uint c = multibyteAdd(result[0 .. right.length], result[0 .. right.length], right, 0);
1931    if (c && result.length > right.length)
1932       c = multibyteIncrementAssign!('+')(result[right.length .. $], c);
1933    return c;
1934}
1935
1936/* performs result += wantSub? - right : right;
1937*/
1938BigDigit addOrSubAssignSimple(BigDigit [] result, const(BigDigit) [] right,
1939        bool wantSub) pure nothrow
1940{
1941    if (wantSub)
1942        return subAssignSimple(result, right);
1943    else
1944        return addAssignSimple(result, right);
1945}
1946
1947
1948// return true if x<y, considering leading zeros
1949bool less(const(BigDigit)[] x, const(BigDigit)[] y) pure nothrow
1950{
1951    assert(x.length >= y.length);
1952    auto k = x.length-1;
1953    while (x[k]==0 && k >= y.length)
1954        --k;
1955    if (k >= y.length)
1956        return false;
1957    while (k>0 && x[k]==y[k])
1958        --k;
1959    return x[k] < y[k];
1960}
1961
1962// Set result = abs(x-y), return true if result is negative(x<y), false if x <= y.
1963bool inplaceSub(BigDigit[] result, const(BigDigit)[] x, const(BigDigit)[] y)
1964    pure nothrow
1965{
1966    assert(result.length == (x.length >= y.length) ? x.length : y.length);
1967
1968    size_t minlen;
1969    bool negative;
1970    if (x.length >= y.length)
1971    {
1972        minlen = y.length;
1973        negative = less(x, y);
1974    }
1975    else
1976    {
1977       minlen = x.length;
1978       negative = !less(y, x);
1979    }
1980    const (BigDigit)[] large, small;
1981    if (negative)
1982    {
1983        large = y; small = x;
1984    }
1985    else
1986    {
1987        large = x; small = y;
1988    }
1989
1990    BigDigit carry = multibyteSub(result[0 .. minlen], large[0 .. minlen], small[0 .. minlen], 0);
1991    if (x.length != y.length)
1992    {
1993        result[minlen .. large.length]= large[minlen..$];
1994        result[large.length..$] = 0;
1995        if (carry)
1996            multibyteIncrementAssign!('-')(result[minlen..$], carry);
1997    }
1998    return negative;
1999}
2000
2001/* Determine how much space is required for the temporaries
2002 * when performing a Karatsuba multiplication.
2003 */
2004size_t karatsubaRequiredBuffSize(size_t xlen) pure nothrow @safe
2005{
2006    return xlen <= KARATSUBALIMIT ? 0 : 2*xlen; // - KARATSUBALIMIT+2;
2007}
2008
2009/* Sets result = x*y, using Karatsuba multiplication.
2010* x must be longer or equal to y.
2011* Valid only for balanced multiplies, where x is not shorter than y.
2012* It is superior to schoolbook multiplication if and only if
2013*    sqrt(2)*y.length > x.length > y.length.
2014* Karatsuba multiplication is O(n^1.59), whereas schoolbook is O(n^2)
2015* The maximum allowable length of x and y is uint.max; but better algorithms
2016* should be used far before that length is reached.
2017* Params:
2018* scratchbuff      An array long enough to store all the temporaries. Will be destroyed.
2019*/
2020void mulKaratsuba(BigDigit [] result, const(BigDigit) [] x,
2021        const(BigDigit)[] y, BigDigit [] scratchbuff) pure nothrow
2022{
2023    assert(x.length >= y.length);
2024          assert(result.length < uint.max, "Operands too large");
2025    assert(result.length == x.length + y.length);
2026    if (x.length <= KARATSUBALIMIT)
2027    {
2028        return mulSimple(result, x, y);
2029    }
2030    // Must be almost square (otherwise, a schoolbook iteration is better)
2031    assert(2L * y.length * y.length > (x.length-1) * (x.length-1),
2032        "Bigint Internal Error: Asymmetric Karatsuba");
2033
2034    // The subtractive version of Karatsuba multiply uses the following result:
2035    // (Nx1 + x0)*(Ny1 + y0) = (N*N)*x1y1 + x0y0 + N * (x0y0 + x1y1 - mid)
2036    // where mid = (x0-x1)*(y0-y1)
2037    // requiring 3 multiplies of length N, instead of 4.
2038    // The advantage of the subtractive over the additive version is that
2039    // the mid multiply cannot exceed length N. But there are subtleties:
2040    // (x0-x1),(y0-y1) may be negative or zero. To keep it simple, we
2041    // retain all of the leading zeros in the subtractions
2042
2043    // half length, round up.
2044    auto half = (x.length >> 1) + (x.length & 1);
2045
2046    const(BigDigit) [] x0 = x[0 .. half];
2047    const(BigDigit) [] x1 = x[half .. $];
2048    const(BigDigit) [] y0 = y[0 .. half];
2049    const(BigDigit) [] y1 = y[half .. $];
2050    BigDigit [] mid = scratchbuff[0 .. half*2];
2051    BigDigit [] newscratchbuff = scratchbuff[half*2 .. $];
2052    BigDigit [] resultLow = result[0 .. 2*half];
2053    BigDigit [] resultHigh = result[2*half .. $];
2054     // initially use result to store temporaries
2055    BigDigit [] xdiff= result[0 .. half];
2056    BigDigit [] ydiff = result[half .. half*2];
2057
2058    // First, we calculate mid, and sign of mid
2059    immutable bool midNegative = inplaceSub(xdiff, x0, x1)
2060                      ^ inplaceSub(ydiff, y0, y1);
2061    mulKaratsuba(mid, xdiff, ydiff, newscratchbuff);
2062
2063    // Low half of result gets x0 * y0. High half gets x1 * y1
2064
2065    mulKaratsuba(resultLow, x0, y0, newscratchbuff);
2066
2067    if (2L * y1.length * y1.length < x1.length * x1.length)
2068    {
2069        // an asymmetric situation has been created.
2070        // Worst case is if x:y = 1.414 : 1, then x1:y1 = 2.41 : 1.
2071        // Applying one schoolbook multiply gives us two pieces each 1.2:1
2072        if (y1.length <= KARATSUBALIMIT)
2073            mulSimple(resultHigh, x1, y1);
2074        else
2075        {
2076            // divide x1 in two, then use schoolbook multiply on the two pieces.
2077            auto quarter = (x1.length >> 1) + (x1.length & 1);
2078            immutable ysmaller = (quarter >= y1.length);
2079            mulKaratsuba(resultHigh[0 .. quarter+y1.length], ysmaller ? x1[0 .. quarter] : y1,
2080                ysmaller ? y1 : x1[0 .. quarter], newscratchbuff);
2081            // Save the part which will be overwritten.
2082            immutable ysmaller2 = ((x1.length - quarter) >= y1.length);
2083            newscratchbuff[0 .. y1.length] = resultHigh[quarter .. quarter + y1.length];
2084            mulKaratsuba(resultHigh[quarter..$], ysmaller2 ? x1[quarter..$] : y1,
2085                ysmaller2 ? y1 : x1[quarter..$], newscratchbuff[y1.length..$]);
2086
2087            resultHigh[quarter..$].addAssignSimple(newscratchbuff[0 .. y1.length]);
2088        }
2089    }
2090    else
2091        mulKaratsuba(resultHigh, x1, y1, newscratchbuff);
2092
2093    /* We now have result = x0y0 + (N*N)*x1y1
2094       Before adding or subtracting mid, we must calculate
2095       result += N * (x0y0 + x1y1)
2096       We can do this with three half-length additions. With a = x0y0, b = x1y1:
2097                      aHI aLO
2098        +       aHI   aLO
2099        +       bHI   bLO
2100        +  bHI  bLO
2101        =  R3   R2    R1   R0
2102        R1 = aHI + bLO + aLO
2103        R2 = aHI + bLO + aHI + carry_from_R1
2104        R3 = bHi + carry_from_R2
2105
2106     It might actually be quicker to do it in two full-length additions:
2107     newscratchbuff[2*half] = addSimple(newscratchbuff[0 .. 2*half], result[0 .. 2*half], result[2*half..$]);
2108     addAssignSimple(result[half..$], newscratchbuff[0 .. 2*half+1]);
2109   */
2110    BigDigit[] R1 = result[half .. half*2];
2111    BigDigit[] R2 = result[half*2 .. half*3];
2112    BigDigit[] R3 = result[half*3..$];
2113    BigDigit c1 = multibyteAdd(R2, R2, R1, 0); // c1:R2 = R2 + R1
2114    BigDigit c2 = multibyteAdd(R1, R2, result[0 .. half], 0); // c2:R1 = R2 + R1 + R0
2115    BigDigit c3 = addAssignSimple(R2, R3); // R2 = R2 + R1 + R3
2116    if (c1+c2)
2117        multibyteIncrementAssign!('+')(result[half*2..$], c1+c2);
2118    if (c1+c3)
2119        multibyteIncrementAssign!('+')(R3, c1+c3);
2120
2121    // And finally we subtract mid
2122    addOrSubAssignSimple(result[half..$], mid, !midNegative);
2123}
2124
2125void squareKaratsuba(BigDigit [] result, const BigDigit [] x,
2126        BigDigit [] scratchbuff) pure nothrow
2127{
2128    // See mulKaratsuba for implementation comments.
2129    // Squaring is simpler, since it never gets asymmetric.
2130    assert(result.length < uint.max, "Operands too large");
2131    assert(result.length == 2*x.length);
2132    if (x.length <= KARATSUBASQUARELIMIT)
2133    {
2134        return squareSimple(result, x);
2135    }
2136    // half length, round up.
2137    auto half = (x.length >> 1) + (x.length & 1);
2138
2139    const(BigDigit)[] x0 = x[0 .. half];
2140    const(BigDigit)[] x1 = x[half .. $];
2141    BigDigit [] mid = scratchbuff[0 .. half*2];
2142    BigDigit [] newscratchbuff = scratchbuff[half*2 .. $];
2143     // initially use result to store temporaries
2144    BigDigit [] xdiff= result[0 .. half];
2145    const BigDigit [] ydiff = result[half .. half*2];
2146
2147    // First, we calculate mid. We don't need its sign
2148    inplaceSub(xdiff, x0, x1);
2149    squareKaratsuba(mid, xdiff, newscratchbuff);
2150
2151    // Set result = x0x0 + (N*N)*x1x1
2152    squareKaratsuba(result[0 .. 2*half], x0, newscratchbuff);
2153    squareKaratsuba(result[2*half .. $], x1, newscratchbuff);
2154
2155    /* result += N * (x0x0 + x1x1)
2156       Do this with three half-length additions. With a = x0x0, b = x1x1:
2157        R1 = aHI + bLO + aLO
2158        R2 = aHI + bLO + aHI + carry_from_R1
2159        R3 = bHi + carry_from_R2
2160    */
2161    BigDigit[] R1 = result[half .. half*2];
2162    BigDigit[] R2 = result[half*2 .. half*3];
2163    BigDigit[] R3 = result[half*3..$];
2164    BigDigit c1 = multibyteAdd(R2, R2, R1, 0); // c1:R2 = R2 + R1
2165    BigDigit c2 = multibyteAdd(R1, R2, result[0 .. half], 0); // c2:R1 = R2 + R1 + R0
2166    BigDigit c3 = addAssignSimple(R2, R3); // R2 = R2 + R1 + R3
2167    if (c1+c2) multibyteIncrementAssign!('+')(result[half*2..$], c1+c2);
2168    if (c1+c3) multibyteIncrementAssign!('+')(R3, c1+c3);
2169
2170    // And finally we subtract mid, which is always positive
2171    subAssignSimple(result[half..$], mid);
2172}
2173
2174/* Knuth's Algorithm D, as presented in
2175 * H.S. Warren, "Hacker's Delight", Addison-Wesley Professional (2002).
2176 * Also described in "Modern Computer Arithmetic" 0.2, Exercise 1.8.18.
2177 * Given u and v, calculates  quotient  = u / v, u = u % v.
2178 * v must be normalized (ie, the MSB of v must be 1).
2179 * The most significant words of quotient and u may be zero.
2180 * u[0 .. v.length] holds the remainder.
2181 */
2182void schoolbookDivMod(BigDigit [] quotient, BigDigit [] u, in BigDigit [] v)
2183    pure nothrow
2184{
2185    assert(quotient.length == u.length - v.length);
2186    assert(v.length > 1);
2187    assert(u.length >= v.length);
2188    assert((v[$-1]&0x8000_0000)!=0);
2189    assert(u[$-1] < v[$-1]);
2190    // BUG: This code only works if BigDigit is uint.
2191    uint vhi = v[$-1];
2192    uint vlo = v[$-2];
2193
2194    for (ptrdiff_t j = u.length - v.length - 1; j >= 0; j--)
2195    {
2196        // Compute estimate of quotient[j],
2197        // qhat = (three most significant words of u)/(two most sig words of v).
2198        uint qhat;
2199        if (u[j + v.length] == vhi)
2200        {
2201            // uu/vhi could exceed uint.max (it will be 0x8000_0000 or 0x8000_0001)
2202            qhat = uint.max;
2203        }
2204        else
2205        {
2206            uint ulo = u[j + v.length - 2];
2207            version (D_InlineAsm_X86)
2208            {
2209                // Note: On DMD, this is only ~10% faster than the non-asm code.
2210                uint *p = &u[j + v.length - 1];
2211                asm pure nothrow
2212                {
2213                    mov EAX, p;
2214                    mov EDX, [EAX+4];
2215                    mov EAX, [EAX];
2216                    div dword ptr [vhi];
2217                    mov qhat, EAX;
2218                    mov ECX, EDX;
2219div3by2correction:
2220                    mul dword ptr [vlo]; // EDX:EAX = qhat * vlo
2221                    sub EAX, ulo;
2222                    sbb EDX, ECX;
2223                    jbe div3by2done;
2224                    mov EAX, qhat;
2225                    dec EAX;
2226                    mov qhat, EAX;
2227                    add ECX, dword ptr [vhi];
2228                    jnc div3by2correction;
2229div3by2done:    ;
2230                }
2231            }
2232            else
2233            { // version (InlineAsm)
2234                ulong uu = (cast(ulong)(u[j + v.length]) << 32) | u[j + v.length - 1];
2235                immutable bigqhat = uu / vhi;
2236                ulong rhat =  uu - bigqhat * vhi;
2237                qhat = cast(uint) bigqhat;
2238again:
2239                if (cast(ulong) qhat * vlo > ((rhat << 32) + ulo))
2240                {
2241                    --qhat;
2242                    rhat += vhi;
2243                    if (!(rhat & 0xFFFF_FFFF_0000_0000L))
2244                        goto again;
2245                }
2246            } // version (InlineAsm)
2247        }
2248        // Multiply and subtract.
2249        uint carry = multibyteMulAdd!('-')(u[j .. j + v.length], v, qhat, 0);
2250
2251        if (u[j+v.length] < carry)
2252        {
2253            // If we subtracted too much, add back
2254            --qhat;
2255            carry -= multibyteAdd(u[j .. j + v.length],u[j .. j + v.length], v, 0);
2256        }
2257        quotient[j] = qhat;
2258        u[j + v.length] = u[j + v.length] - carry;
2259    }
2260}
2261
2262private:
2263
2264// TODO: Replace with a library call
2265void itoaZeroPadded(char[] output, uint value)
2266    pure nothrow @safe @nogc
2267{
2268    for (auto i = output.length; i--;)
2269    {
2270        if (value < 10)
2271        {
2272            output[i] = cast(char)(value + '0');
2273            value = 0;
2274        }
2275        else
2276        {
2277            output[i] = cast(char)(value % 10 + '0');
2278            value /= 10;
2279        }
2280    }
2281}
2282
2283void toHexZeroPadded(char[] output, uint value,
2284        LetterCase letterCase = LetterCase.upper) pure nothrow @safe
2285{
2286    ptrdiff_t x = output.length - 1;
2287    static immutable string upperHexDigits = "0123456789ABCDEF";
2288    static immutable string lowerHexDigits = "0123456789abcdef";
2289    for ( ; x >= 0; --x)
2290    {
2291        if (letterCase == LetterCase.upper)
2292        {
2293            output[x] = upperHexDigits[value & 0xF];
2294        }
2295        else
2296        {
2297            output[x] = lowerHexDigits[value & 0xF];
2298        }
2299        value >>= 4;
2300    }
2301}
2302
2303private:
2304
2305// Returns the highest value of i for which left[i]!=right[i],
2306// or 0 if left[] == right[]
2307size_t highestDifferentDigit(const BigDigit [] left, const BigDigit [] right)
2308pure nothrow @nogc @safe
2309{
2310    assert(left.length == right.length);
2311    for (ptrdiff_t i = left.length - 1; i>0; --i)
2312    {
2313        if (left[i] != right[i])
2314            return i;
2315    }
2316    return 0;
2317}
2318
2319// Returns the lowest value of i for which x[i]!=0.
2320int firstNonZeroDigit(const BigDigit [] x) pure nothrow @nogc @safe
2321{
2322    int k = 0;
2323    while (x[k]==0)
2324    {
2325        ++k;
2326        assert(k<x.length);
2327    }
2328    return k;
2329}
2330
2331/*
2332    Calculate quotient and remainder of u / v using fast recursive division.
2333    v must be normalised, and must be at least half as long as u.
2334    Given u and v, v normalised, calculates  quotient  = u/v, u = u%v.
2335    scratch is temporary storage space, length must be >= quotient + 1.
2336
2337Returns:
2338    u[0 .. v.length] is the remainder. u[v.length..$] is corrupted.
2339
2340    Implements algorithm 1.8 from MCA.
2341    This algorithm has an annoying special case. After the first recursion, the
2342    highest bit of the quotient may be set. This means that in the second
2343    recursive call, the 'in' contract would be violated. (This happens only
2344    when the top quarter of u is equal to the top half of v. A base 10
2345    equivalent example of this situation is 5517/56; the first step gives
2346    55/5 = 11). To maintain the in contract, we pad a zero to the top of both
2347    u and the quotient. 'mayOverflow' indicates that that the special case
2348    has occurred.
2349    (In MCA, a different strategy is used: the in contract is weakened, and
2350    schoolbookDivMod is more general: it allows the high bit of u to be set).
2351    See also:
2352    - C. Burkinel and J. Ziegler, "Fast Recursive Division", MPI-I-98-1-022,
2353      Max-Planck Institute fuer Informatik, (Oct 1998).
2354*/
2355void recursiveDivMod(BigDigit[] quotient, BigDigit[] u, const(BigDigit)[] v,
2356                     BigDigit[] scratch, bool mayOverflow = false)
2357                     pure nothrow
2358in
2359{
2360    // v must be normalized
2361    assert(v.length > 1);
2362    assert((v[$ - 1] & 0x8000_0000) != 0);
2363    assert(!(u[$ - 1] & 0x8000_0000));
2364    assert(quotient.length == u.length - v.length);
2365    if (mayOverflow)
2366    {
2367        assert(u[$-1] == 0);
2368        assert(u[$-2] & 0x8000_0000);
2369    }
2370
2371    // Must be symmetric. Use block schoolbook division if not.
2372    assert((mayOverflow ? u.length-1 : u.length) <= 2 * v.length);
2373    assert((mayOverflow ? u.length-1 : u.length) >= v.length);
2374    assert(scratch.length >= quotient.length + (mayOverflow ? 0 : 1));
2375}
2376body
2377{
2378    if (quotient.length < FASTDIVLIMIT)
2379    {
2380        return schoolbookDivMod(quotient, u, v);
2381    }
2382
2383    // Split quotient into two halves, but keep padding in the top half
2384    auto k = (mayOverflow ?  quotient.length - 1 : quotient.length) >> 1;
2385
2386    // RECURSION 1: Calculate the high half of the quotient
2387
2388    // Note that if u and quotient were padded, they remain padded during
2389    // this call, so in contract is satisfied.
2390    recursiveDivMod(quotient[k .. $], u[2 * k .. $], v[k .. $],
2391        scratch, mayOverflow);
2392
2393    // quotient[k..$] is our guess at the high quotient.
2394    // u[2*k .. 2.*k + v.length - k = k + v.length] is the high part of the
2395    // first remainder. u[0 .. 2*k] is the low part.
2396
2397    // Calculate the full first remainder to be
2398    //    remainder - highQuotient * lowDivisor
2399    // reducing highQuotient until the remainder is positive.
2400    // The low part of the remainder, u[0 .. k], cannot be altered by this.
2401
2402    adjustRemainder(quotient[k .. $], u[k .. k + v.length], v, k,
2403            scratch[0 .. quotient.length], mayOverflow);
2404
2405    // RECURSION 2: Calculate the low half of the quotient
2406    // The full first remainder is now in u[0 .. k + v.length].
2407
2408    if (u[k + v.length - 1] & 0x8000_0000)
2409    {
2410        // Special case. The high quotient is 0x1_00...000 or 0x1_00...001.
2411        // This means we need an extra quotient word for the next recursion.
2412        // We need to restore the invariant for the recursive calls.
2413        // We do this by padding both u and quotient. Extending u is trivial,
2414        // because the higher words will not be used again. But for the
2415        // quotient, we're clobbering the low word of the high quotient,
2416        // so we need save it, and add it back in after the recursive call.
2417
2418        auto clobberedQuotient = quotient[k];
2419        u[k+v.length] = 0;
2420
2421        recursiveDivMod(quotient[0 .. k+1], u[k .. k + v.length+1],
2422            v[k .. $], scratch, true);
2423        adjustRemainder(quotient[0 .. k+1], u[0 .. v.length], v, k,
2424            scratch[0 .. 2 * k+1], true);
2425
2426        // Now add the quotient word that got clobbered earlier.
2427        multibyteIncrementAssign!('+')(quotient[k..$], clobberedQuotient);
2428    }
2429    else
2430    {
2431        // The special case has NOT happened.
2432        recursiveDivMod(quotient[0 .. k], u[k .. k + v.length], v[k .. $],
2433            scratch, false);
2434
2435        // high remainder is in u[k .. k+(v.length-k)] == u[k .. v.length]
2436
2437        adjustRemainder(quotient[0 .. k], u[0 .. v.length], v, k,
2438            scratch[0 .. 2 * k]);
2439    }
2440}
2441
2442// rem -= quot * v[0 .. k].
2443// If would make rem negative, decrease quot until rem is >= 0.
2444// Needs (quot.length * k) scratch space to store the result of the multiply.
2445void adjustRemainder(BigDigit[] quot, BigDigit[] rem, const(BigDigit)[] v,
2446        ptrdiff_t k,
2447        BigDigit[] scratch, bool mayOverflow = false) pure nothrow
2448{
2449    assert(rem.length == v.length);
2450    mulInternal(scratch, quot, v[0 .. k]);
2451    uint carry = 0;
2452    if (mayOverflow)
2453        carry = scratch[$-1] + subAssignSimple(rem, scratch[0..$-1]);
2454    else
2455        carry = subAssignSimple(rem, scratch);
2456    while (carry)
2457    {
2458        multibyteIncrementAssign!('-')(quot, 1); // quot--
2459        carry -= multibyteAdd(rem, rem, v, 0);
2460    }
2461}
2462
2463// Cope with unbalanced division by performing block schoolbook division.
2464void blockDivMod(BigDigit [] quotient, BigDigit [] u, in BigDigit [] v)
2465pure nothrow
2466{
2467    import core.memory : GC;
2468    assert(quotient.length == u.length - v.length);
2469    assert(v.length > 1);
2470    assert(u.length >= v.length);
2471    assert((v[$-1] & 0x8000_0000)!=0);
2472    assert((u[$-1] & 0x8000_0000)==0);
2473    BigDigit [] scratch = new BigDigit[v.length + 1];
2474
2475    // Perform block schoolbook division, with 'v.length' blocks.
2476    auto m = u.length - v.length;
2477    while (m > v.length)
2478    {
2479        immutable mayOverflow = (u[m + v.length -1 ] & 0x8000_0000)!=0;
2480        BigDigit saveq;
2481        if (mayOverflow)
2482        {
2483            u[m + v.length] = 0;
2484            saveq = quotient[m];
2485        }
2486        recursiveDivMod(quotient[m-v.length .. m + (mayOverflow? 1: 0)],
2487            u[m - v.length .. m + v.length + (mayOverflow? 1: 0)], v, scratch, mayOverflow);
2488        if (mayOverflow)
2489        {
2490            assert(quotient[m] == 0);
2491            quotient[m] = saveq;
2492        }
2493        m -= v.length;
2494    }
2495    recursiveDivMod(quotient[0 .. m], u[0 .. m + v.length], v, scratch);
2496    () @trusted { GC.free(scratch.ptr); } ();
2497}
2498
2499@system unittest
2500{
2501    import core.stdc.stdio;
2502
2503    void printBiguint(const uint [] data)
2504    {
2505        char [] buff = biguintToHex(new char[data.length*9], data, '_');
2506        printf("%.*s\n", buff.length, buff.ptr);
2507    }
2508
2509    void printDecimalBigUint(BigUint data)
2510    {
2511        auto str = data.toDecimalString(0);
2512        printf("%.*s\n", str.length, str.ptr);
2513    }
2514
2515    uint [] a, b;
2516    a = new uint[43];
2517    b = new uint[179];
2518    for (int i=0; i<a.length; ++i) a[i] = 0x1234_B6E9 + i;
2519    for (int i=0; i<b.length; ++i) b[i] = 0x1BCD_8763 - i*546;
2520
2521    a[$-1] |= 0x8000_0000;
2522    uint [] r = new uint[a.length];
2523    uint [] q = new uint[b.length-a.length+1];
2524
2525    divModInternal(q, r, b, a);
2526    q = q[0..$-1];
2527    uint [] r1 = r.dup;
2528    uint [] q1 = q.dup;
2529    blockDivMod(q, b, a);
2530    r = b[0 .. a.length];
2531    assert(r[] == r1[]);
2532    assert(q[] == q1[]);
2533}
2534
2535// biguintToOctal
2536@safe unittest
2537{
2538    enum bufSize = 5 * BigDigitBits / 3 + 1;
2539    auto buf = new char[bufSize];
2540    size_t i;
2541    BigDigit[] data = [ 342391 ];
2542
2543    // Basic functionality with single word
2544    i = biguintToOctal(buf, data);
2545    assert(i == bufSize - 7 && buf[i .. $] == "1234567");
2546
2547    // Test carrying bits between words
2548    data = [ 0x77053977, 0x39770539, 0x00000005 ];
2549    i = biguintToOctal(buf, data);
2550    assert(i == bufSize - 23 && buf[i .. $] == "12345670123456701234567");
2551
2552    // Test carried bits in the last word
2553    data = [ 0x80000000 ];
2554    i = biguintToOctal(buf, data);
2555    assert(buf[i .. $] == "20000000000");
2556
2557    // Test boundary between 3rd and 4th word where the number of bits is
2558    // divisible by 3 and no bits should be carried.
2559    //
2560    // The 0xC0000000's are for "poisoning" the carry to be non-zero when the
2561    // rollover happens, so that if any bugs happen in wrongly adding the carry
2562    // to the next word, non-zero bits will show up in the output.
2563    data = [ 0xC0000000, 0xC0000000, 0xC0000000, 0x00000010 ];
2564    i = biguintToOctal(buf, data);
2565    assert(buf[i .. $] == "2060000000001400000000030000000000");
2566
2567    // Boundary case: 0
2568    data = [ 0 ];
2569    i = biguintToOctal(buf, data);
2570    assert(buf[i .. $] == "0");
2571}
2572