1// Written in the D programming language.
2
3/**
4Classes and functions for handling and transcoding between various encodings.
5
6For cases where the encoding is known at compile-time, functions are provided
7for arbitrary encoding and decoding of characters, arbitrary transcoding
8between strings of different type, as well as validation and sanitization.
9
10Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
11(also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250, WINDOWS-1251
12and WINDOWS-1252.
13
14$(SCRIPT inhibitQuickIndex = 1;)
15$(DIVC quickindex,
16$(BOOKTABLE,
17$(TR $(TH Category) $(TH Functions))
18$(TR $(TD Decode) $(TD
19    $(LREF codePoints)
20    $(LREF decode)
21    $(LREF decodeReverse)
22    $(LREF safeDecode)
23))
24$(TR $(TD Conversion) $(TD
25    $(LREF codeUnits)
26    $(LREF sanitize)
27    $(LREF transcode)
28))
29$(TR $(TD Classification) $(TD
30    $(LREF canEncode)
31    $(LREF isValid)
32    $(LREF isValidCodePoint)
33    $(LREF isValidCodeUnit)
34))
35$(TR $(TD BOM) $(TD
36    $(LREF BOM)
37    $(LREF BOMSeq)
38    $(LREF getBOM)
39    $(LREF utfBOM)
40))
41$(TR $(TD Length & Index) $(TD
42    $(LREF firstSequence)
43    $(LREF encodedLength)
44    $(LREF index)
45    $(LREF lastSequence)
46    $(LREF validLength)
47))
48$(TR $(TD Encoding schemes) $(TD
49    $(LREF encodingName)
50    $(LREF EncodingScheme)
51    $(LREF EncodingSchemeASCII)
52    $(LREF EncodingSchemeLatin1)
53    $(LREF EncodingSchemeLatin2)
54    $(LREF EncodingSchemeUtf16Native)
55    $(LREF EncodingSchemeUtf32Native)
56    $(LREF EncodingSchemeUtf8)
57    $(LREF EncodingSchemeWindows1250)
58    $(LREF EncodingSchemeWindows1251)
59    $(LREF EncodingSchemeWindows1252)
60))
61$(TR $(TD Representation) $(TD
62    $(LREF AsciiChar)
63    $(LREF AsciiString)
64    $(LREF Latin1Char)
65    $(LREF Latin1String)
66    $(LREF Latin2Char)
67    $(LREF Latin2String)
68    $(LREF Windows1250Char)
69    $(LREF Windows1250String)
70    $(LREF Windows1251Char)
71    $(LREF Windows1251String)
72    $(LREF Windows1252Char)
73    $(LREF Windows1252String)
74))
75$(TR $(TD Exceptions) $(TD
76    $(LREF INVALID_SEQUENCE)
77    $(LREF EncodingException)
78))
79))
80
81For cases where the encoding is not known at compile-time, but is
82known at run-time, the abstract class $(LREF EncodingScheme)
and its subclasses are provided.  To construct a run-time encoder/decoder,
84one does e.g.
85
86----------------------------------------------------
87auto e = EncodingScheme.create("utf-8");
88----------------------------------------------------
89
90This library supplies $(LREF EncodingScheme) subclasses for ASCII,
91ISO-8859-1 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250,
92WINDOWS-1251, WINDOWS-1252, UTF-8, and (on little-endian architectures)
93UTF-16LE and UTF-32LE; or (on big-endian architectures) UTF-16BE and UTF-32BE.
94
95This library provides a mechanism whereby other modules may add $(LREF
96EncodingScheme) subclasses for any other encoding.
97
98Copyright: Copyright Janice Caron 2008 - 2009.
99License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
100Authors:   Janice Caron
101Source:    $(PHOBOSSRC std/encoding.d)
102*/
103/*
104         Copyright Janice Caron 2008 - 2009.
105Distributed under the Boost Software License, Version 1.0.
106   (See accompanying file LICENSE_1_0.txt or copy at
107         http://www.boost.org/LICENSE_1_0.txt)
108*/
109module std.encoding;
110
111import std.range.primitives;
112import std.traits;
113import std.typecons;
114
// Module-level self test: validates UTF-8 acceptance/rejection tables,
// sanitization of every rejected input, round-trip transcoding between the
// three UTF forms (forward and reverse iteration), the single-byte
// encodings, encodedLength and encoding into a fixed buffer.
@system unittest
{
    static ubyte[][] validStrings =
    [
        // Plain ASCII
        cast(ubyte[])"hello",

        // First possible sequence of a certain length
        [ 0x00 ],                       // U+00000000   one byte
        [ 0xC2, 0x80 ],                 // U+00000080   two bytes
        [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
        [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   four bytes

        // Last possible sequence of a certain length
        [ 0x7F ],                       // U+0000007F   one byte
        [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes

        // Other boundary conditions
        [ 0xED, 0x9F, 0xBF ],
        // U+0000D7FF   Last character before surrogates
        [ 0xEE, 0x80, 0x80 ],
        // U+0000E000   First character after surrogates
        [ 0xEF, 0xBF, 0xBD ],
        // U+0000FFFD   Unicode replacement character
        [ 0xF4, 0x8F, 0xBF, 0xBF ],
        // U+0010FFFF   Very last character

        // Non-character code points
        /*  NOTE: These are legal in UTF, and may be converted from
            one UTF to another, however they do not represent Unicode
            characters. These code points have been reserved by
            Unicode as non-character code points. They are permissible
            for data exchange within an application, but they are
            not permitted to be used as characters. Since this module
            deals with UTF, and not with Unicode per se, we choose to
            accept them here. */
        // The three-byte forms below are the actual UTF-8 encodings of
        // these code points (DF BE / DF BF would be U+07FE / U+07FF).
        [ 0xEF, 0xBF, 0xBE ],           // U+0000FFFE
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF
    ];

    static ubyte[][] invalidStrings =
    [
        // First possible sequence of a certain length, but greater
        // than U+10FFFF
        [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
        [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes

        // Last possible sequence of a certain length, but greater than U+10FFFF
        [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
        [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
        [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes

        // Other boundary conditions
        [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
                                                    // First code
                                                    // point after
                                                    // last character

        // Unexpected continuation bytes
        [ 0x80 ],
        [ 0xBF ],
        [ 0x20, 0x80, 0x20 ],
        [ 0x20, 0xBF, 0x20 ],
        [ 0x80, 0x9F, 0xA0 ],

        // Lonely start bytes
        [ 0xC0 ],
        [ 0xCF ],
        [ 0x20, 0xC0, 0x20 ],
        [ 0x20, 0xCF, 0x20 ],
        [ 0xD0 ],
        [ 0xDF ],
        [ 0x20, 0xD0, 0x20 ],
        [ 0x20, 0xDF, 0x20 ],
        [ 0xE0 ],
        [ 0xEF ],
        [ 0x20, 0xE0, 0x20 ],
        [ 0x20, 0xEF, 0x20 ],
        [ 0xF0 ],
        [ 0xF1 ],
        [ 0xF2 ],
        [ 0xF3 ],
        [ 0xF4 ],
        [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF

        [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
        [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
        [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above

        // Impossible bytes
        [ 0xF8 ],
        [ 0xF9 ],
        [ 0xFA ],
        [ 0xFB ],
        [ 0xFC ],
        [ 0xFD ],
        [ 0xFE ],
        [ 0xFF ],
        [ 0x20, 0xF8, 0x20 ],
        [ 0x20, 0xF9, 0x20 ],
        [ 0x20, 0xFA, 0x20 ],
        [ 0x20, 0xFB, 0x20 ],
        [ 0x20, 0xFC, 0x20 ],
        [ 0x20, 0xFD, 0x20 ],
        [ 0x20, 0xFE, 0x20 ],
        [ 0x20, 0xFF, 0x20 ],

        // Overlong sequences, all representing U+002F
        /*  With a safe UTF-8 decoder, all of the following five overlong
            representations of the ASCII character slash ("/") should be
            rejected like a malformed UTF-8 sequence */
        [ 0xC0, 0xAF ],
        [ 0xE0, 0x80, 0xAF ],
        [ 0xF0, 0x80, 0x80, 0xAF ],
        [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],

        // Maximum overlong sequences
        /*  Below you see the highest Unicode value that is still resulting in
            an overlong sequence if represented with the given number of bytes.
            This is a boundary test for safe UTF-8 decoders. All five
            characters should be rejected like malformed UTF-8 sequences. */
        [ 0xC1, 0xBF ],                             // U+0000007F
        [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
        [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
        [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
        [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF

        // Overlong representation of the NUL character
        /*  The following five sequences should also be rejected like malformed
            UTF-8 sequences and should not be treated like the ASCII NUL
            character. */
        [ 0xC0, 0x80 ],
        [ 0xE0, 0x80, 0x80 ],
        [ 0xF0, 0x80, 0x80, 0x80 ],
        [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],

        // Illegal code positions
        /*  The following UTF-8 sequences should be rejected like malformed
            sequences, because they never represent valid ISO 10646 characters
            and a UTF-8 decoder that accepts them might introduce security
            problems comparable to overlong UTF-8 sequences. */
        [ 0xED, 0xA0, 0x80 ],       // U+D800
        [ 0xED, 0xAD, 0xBF ],       // U+DB7F
        [ 0xED, 0xAE, 0x80 ],       // U+DB80
        [ 0xED, 0xAF, 0xBF ],       // U+DBFF
        [ 0xED, 0xB0, 0x80 ],       // U+DC00
        [ 0xED, 0xBE, 0x80 ],       // U+DF80
        [ 0xED, 0xBF, 0xBF ],       // U+DFFF
    ];

    // Expected result of sanitize() for each entry of invalidStrings, in the
    // same order (the lengths are asserted equal below).
    static string[] sanitizedStrings =
    [
        "\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
        "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
    ];

    // HELPER FUNCTIONS
    // we can probably do this better...

    // Upper-case hex digit for the low four bits of n.
    static char toHexDigit(int n)
    {
        return "0123456789ABCDEF"[n & 0xF];
    }

    // Quotes s, replacing every byte outside printable ASCII with \xHH so
    // that assertion messages stay readable.
    static string makeReadable(string s)
    {
        string r = "\"";
        foreach (char c;s)
        {
            if (c >= 0x20 && c < 0x80)
            {
                r ~= c;
            }
            else
            {
                r ~= "\\x";
                r ~= toHexDigit(c >> 4);
                r ~= toHexDigit(c);
            }
        }
        r ~= "\"";
        return r;
    }

    // Transcodes s into r by walking both the code points and the code
    // units back to front, prepending to r; the result must equal what a
    // forward transcode produces.
    void transcodeReverse(Src,Dst)(immutable(Src)[] s, out immutable(Dst)[] r)
    {
        static if (is(Src == Dst))
        {
            // Same encoding: the result is the input itself.  (This is a
            // void function, so the value has to go through the out
            // parameter rather than a return statement.)
            r = s;
        }
        else static if (is(Src == AsciiChar))
        {
            transcodeReverse!(char,Dst)(cast(string) s,r);
        }
        else
        {
            foreach_reverse (d;codePoints(s))
            {
                foreach_reverse (c;codeUnits!(Dst)(d))
                {
                    r = c ~ r;
                }
            }
        }
    }

    // Make sure everything that should be valid, is
    foreach (a;validStrings)
    {
        string s = cast(string) a;
        assert(isValid(s),"Failed to validate: "~makeReadable(s));
    }

    // Make sure everything that shouldn't be valid, isn't
    foreach (a;invalidStrings)
    {
        string s = cast(string) a;
        assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
    }

    // Make sure we can sanitize everything bad
    assert(invalidStrings.length == sanitizedStrings.length);
    foreach (i, bad; invalidStrings)
    {
        string s = cast(string) bad;
        string t = sanitize(s);
        assert(isValid(t));
        assert(t == sanitizedStrings[i]);
        // Every sanitized string also takes part in the round-trip tests.
        ubyte[] u = cast(ubyte[]) t;
        validStrings ~= u;
    }

    // Make sure all transcodings work in both directions, using both forward
    // and reverse iteration
    foreach (a; validStrings)
    {
        string s = cast(string) a;
        string s2;
        wstring ws, ws2;
        dstring ds, ds2;

        transcode(s,ws);
        assert(isValid(ws));
        transcode(ws,s2);
        assert(s == s2);

        transcode(s,ds);
        assert(isValid(ds));
        transcode(ds,s2);
        assert(s == s2);

        transcode(ws,s);
        assert(isValid(s));
        transcode(s,ws2);
        assert(ws == ws2);

        transcode(ws,ds);
        assert(isValid(ds));
        transcode(ds,ws2);
        assert(ws == ws2);

        transcode(ds,s);
        assert(isValid(s));
        transcode(s,ds2);
        assert(ds == ds2);

        transcode(ds,ws);
        assert(isValid(ws));
        transcode(ws,ds2);
        assert(ds == ds2);

        transcodeReverse(s,ws);
        assert(isValid(ws));
        transcodeReverse(ws,s2);
        assert(s == s2);

        transcodeReverse(s,ds);
        assert(isValid(ds));
        transcodeReverse(ds,s2);
        assert(s == s2);

        transcodeReverse(ws,s);
        assert(isValid(s));
        transcodeReverse(s,ws2);
        assert(ws == ws2);

        transcodeReverse(ws,ds);
        assert(isValid(ds));
        transcodeReverse(ds,ws2);
        assert(ws == ws2);

        transcodeReverse(ds,s);
        assert(isValid(s));
        transcodeReverse(s,ds2);
        assert(ds == ds2);

        transcodeReverse(ds,ws);
        assert(isValid(ws));
        transcodeReverse(ws,ds2);
        assert(ds == ds2);
    }

    // Make sure the non-UTF encodings work too
    {
        auto s = "\u20AC100";
        Windows1252String t;
        transcode(s,t);
        assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
        string u;
        transcode(s,u);
        assert(s == u);
        Latin1String v;
        transcode(s,v);
        assert(cast(string) v == "?100");
        AsciiString w;
        transcode(v,w);
        assert(cast(string) w == "?100");
        s = "\u017Dlu\u0165ou\u010Dk\u00FD k\u016F\u0148";
        Latin2String x;
        transcode(s,x);
        assert(x == cast(Latin2Char[])[0xae, 'l', 'u', 0xbb, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
        Windows1250String y;
        transcode(s,y);
        assert(y == cast(Windows1250Char[])[0x8e, 'l', 'u', 0x9d, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
        s = "\u0402lu\u0403ou\u201D\u045C k\u0414\u044F";
        Windows1251String s51;
        transcode(s,s51);
        assert(s51 == cast(Windows1251Char[])[0x80, 'l', 'u', 0x81, 'o', 'u', 0x94, 0x9d, ' ', 'k', 0xc4, 0xff]);
    }

    // Make sure we can count properly
    {
        assert(encodedLength!(char)('A') == 1);
        assert(encodedLength!(char)('\u00E3') == 2);
        assert(encodedLength!(char)('\u2028') == 3);
        assert(encodedLength!(char)('\U0010FFF0') == 4);
        assert(encodedLength!(wchar)('A') == 1);
        assert(encodedLength!(wchar)('\U0010FFF0') == 2);
    }

    // Make sure we can write into mutable arrays
    {
        char[4] buffer;
        auto n = encode(cast(dchar)'\u00E3',buffer);
        assert(n == 2);
        assert(buffer[0] == 0xC3);
        assert(buffer[1] == 0xA3);
    }
}
477
478//=============================================================================
479
/**
Special value returned by `safeDecode`.

It has all 32 bits set, which lies outside the Unicode code point range.
*/
enum dchar INVALID_SEQUENCE = cast(dchar) uint.max;
482
// Boilerplate mixed into every encoder.  The scope that mixes this in must
// provide the code unit type `E` together with the templates
// `encodeViaWrite`, `skipViaRead`, `decodeViaRead`, `safeDecodeViaRead` and
// `decodeReverseViaRead`, which are written against abstract `read`,
// `peek`, `canRead` and `write` primitives.  The small templates below bind
// those primitives to a concrete source or sink named `s`, `array` or `dg`
// in the function that mixes them in.
template EncoderFunctions()
{
    // ---- Readers ----------------------------------------------------------

    // Consume code units from the front of the slice `s`.
    template ReadFromString()
    {
        @property bool canRead() { return s.length > 0; }

        E peek() @safe pure @nogc nothrow { return s[0]; }

        E read() @safe pure @nogc nothrow
        {
            E head = s[0];
            s = s[1 .. $];
            return head;
        }
    }

    // Consume code units from the back of the slice `s`.
    template ReverseReadFromString()
    {
        @property bool canRead() { return s.length > 0; }

        E peek() @safe pure @nogc nothrow { return s[$ - 1]; }

        E read() @safe pure @nogc nothrow
        {
            E tail = s[$ - 1];
            s = s[0 .. $ - 1];
            return tail;
        }
    }

    // ---- Writers ----------------------------------------------------------

    // Append code units to a freshly declared array `s`.
    template WriteToString()
    {
        E[] s;

        void write(E c) @safe pure nothrow { s ~= c; }
    }

    // Store into the first slot of `array`, then advance `array` past it.
    template WriteToArray()
    {
        void write(E c) @safe pure @nogc nothrow
        {
            array[0] = c;
            array = array[1 .. $];
        }
    }

    // Forward every code unit to the delegate `dg`.
    template WriteToDelegate()
    {
        void write(E c) { dg(c); }
    }

    // ---- Adapters giving the primitives their public names ----------------

    template EncodeViaWrite()
    {
        mixin encodeViaWrite;

        void encode(dchar c) { encodeViaWrite(c); }
    }

    template SkipViaRead()
    {
        mixin skipViaRead;

        void skip() @safe pure @nogc nothrow { skipViaRead(); }
    }

    template DecodeViaRead()
    {
        mixin decodeViaRead;

        dchar decode() @safe pure @nogc nothrow { return decodeViaRead(); }
    }

    template SafeDecodeViaRead()
    {
        mixin safeDecodeViaRead;

        dchar safeDecode() @safe pure @nogc nothrow { return safeDecodeViaRead(); }
    }

    template DecodeReverseViaRead()
    {
        mixin decodeReverseViaRead;

        dchar decodeReverse() @safe pure @nogc nothrow { return decodeReverseViaRead(); }
    }

    // ---- Encoder = writer + encode ----------------------------------------

    template EncodeToString()
    {
        mixin WriteToString;
        mixin EncodeViaWrite;
    }

    template EncodeToArray()
    {
        mixin WriteToArray;
        mixin EncodeViaWrite;
    }

    template EncodeToDelegate()
    {
        mixin WriteToDelegate;
        mixin EncodeViaWrite;
    }

    // ---- Decoder = reader + decode/skip -----------------------------------

    template SkipFromString()
    {
        mixin ReadFromString;
        mixin SkipViaRead;
    }

    template DecodeFromString()
    {
        mixin ReadFromString;
        mixin DecodeViaRead;
    }

    template SafeDecodeFromString()
    {
        mixin ReadFromString;
        mixin SafeDecodeViaRead;
    }

    template DecodeReverseFromString()
    {
        mixin ReverseReadFromString;
        mixin DecodeReverseViaRead;
    }

    //=========================================================================

    // The functions ultimately exposed to the user.

    // Encodes c and returns the code units as a new array.
    E[] encode(dchar c) @safe pure nothrow
    {
        mixin EncodeToString sink;
        sink.encode(c);
        return sink.s;
    }

    // Encodes c into the front of array; array is advanced past what was
    // written.
    void encode(dchar c, ref E[] array) @safe pure nothrow
    {
        mixin EncodeToArray sink;
        sink.encode(c);
    }

    // Encodes c, handing each code unit to dg.
    void encode(dchar c, void delegate(E) dg)
    {
        mixin EncodeToDelegate sink;
        sink.encode(c);
    }

    // Advances s past one encoded sequence without decoding it.
    void skip(ref const(E)[] s) @safe pure nothrow
    {
        mixin SkipFromString source;
        source.skip();
    }

    // Decodes one code point from the front of s and advances s.
    dchar decode(S)(ref S s)
    {
        mixin DecodeFromString source;
        return source.decode();
    }

    // As decode, but goes through the encoder's safeDecodeViaRead.
    dchar safeDecode(S)(ref S s)
    {
        mixin SafeDecodeFromString source;
        return source.safeDecode();
    }

    // Decodes one code point from the back of s and shortens s.
    dchar decodeReverse(ref const(E)[] s) @safe pure nothrow
    {
        mixin DecodeReverseFromString source;
        return source.decodeReverse();
    }
}
644
645//=========================================================================
646
// Foreach-able view that decodes a string of code units `E` one code point
// at a time.  Iterating consumes the stored slice `s`.
struct CodePoints(E)
{
    const(E)[] s;

    // The input must be a valid encoded string (checked by the in-contract).
    this(const(E)[] s)
    in
    {
        assert(isValid(s));
    }
    do
    {
        this.s = s;
    }

    // foreach (dchar c; ...)
    int opApply(scope int delegate(ref dchar) dg)
    {
        while (s.length > 0)
        {
            dchar codePoint = decode(s);
            immutable status = dg(codePoint);
            if (status != 0)
                return status;
        }
        return 0;
    }

    // foreach (size_t i, dchar c; ...) -- i is the code unit offset at
    // which c starts.
    int opApply(scope int delegate(ref size_t, ref dchar) dg)
    {
        size_t offset = 0;
        while (s.length > 0)
        {
            immutable remainingBefore = s.length;
            dchar codePoint = decode(s);
            // Hand out a copy so the loop body cannot corrupt our offset.
            size_t reported = offset;
            immutable status = dg(reported, codePoint);
            if (status != 0)
                return status;
            offset += remainingBefore - s.length;
        }
        return 0;
    }

    // foreach_reverse (dchar c; ...)
    int opApplyReverse(scope int delegate(ref dchar) dg)
    {
        while (s.length > 0)
        {
            dchar codePoint = decodeReverse(s);
            immutable status = dg(codePoint);
            if (status != 0)
                return status;
        }
        return 0;
    }

    // foreach_reverse (size_t i, dchar c; ...) -- i is the length left
    // after removing c from the end, i.e. the offset at which c starts.
    int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
    {
        while (s.length > 0)
        {
            dchar codePoint = decodeReverse(s);
            size_t offset = s.length;
            immutable status = dg(offset, codePoint);
            if (status != 0)
                return status;
        }
        return 0;
    }
}
714
// Foreach-able view over the code units `E` that encode one code point.
struct CodeUnits(E)
{
    E[] s;

    // Encodes d up front; d must be a valid code point (in-contract).
    this(dchar d)
    in
    {
        assert(isValidCodePoint(d));
    }
    do
    {
        s = encode!(E)(d);
    }

    // foreach (E c; ...) -- the body receives a copy of each unit.
    int opApply(scope int delegate(ref E) dg)
    {
        foreach (E unit; s)
        {
            immutable status = dg(unit);
            if (status != 0)
                return status;
        }
        return 0;
    }

    // foreach_reverse (E c; ...) -- the body receives a copy of each unit.
    int opApplyReverse(scope int delegate(ref E) dg)
    {
        foreach_reverse (E unit; s)
        {
            immutable status = dg(unit);
            if (status != 0)
                return status;
        }
        return 0;
    }
}
751
752//=============================================================================
753
// Unspecialized fallback: instantiating it always fails to compile with a
// message naming the offending type.
template EncoderInstance(E)
{
    static assert(0, "Cannot instantiate EncoderInstance for type " ~ E.stringof);
}
759
// Shared implementation for table-driven single-byte encodings.  The scope
// mixing this in must provide:
//   E, EString               - code unit and string types
//   m_charMapStart/End       - inclusive code unit range covered by charMap
//   charMap                  - code point for each code unit in that range
//                              (0xFFFD marks an unassigned code unit)
//   bstMap                   - (code point, code unit) pairs laid out as an
//                              implicit binary search tree: the children of
//                              index i are at 2*i+1 and 2*i+2
private template GenericEncoder()
{
    bool canEncode(dchar c) @safe pure @nogc nothrow
    {
        // Code points outside the table range but below 0x100 map to
        // themselves.
        immutable bool identity =
            c < m_charMapStart || (c > m_charMapEnd && c < 0x100);
        if (identity) return true;
        if (c >= 0xFFFD) return false;

        for (auto node = 0; node < bstMap.length; )
        {
            immutable key = bstMap[node][0];
            if (key == c) return true;
            node = 2 * node + (key > c ? 1 : 2); // descend left or right
        }

        return false;
    }

    bool isValidCodeUnit(E c) @safe pure @nogc nothrow
    {
        immutable bool inTable = c >= m_charMapStart && c <= m_charMapEnd;
        return !inTable || charMap[c-m_charMapStart] != 0xFFFD;
    }

    // Every encodable code point occupies exactly one code unit.
    size_t encodedLength(dchar c) @safe pure @nogc nothrow
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    // Writes the code unit for c, or '?' when c has no representation.
    void encodeViaWrite()(dchar c)
    {
        enum dchar fallback = '?';

        immutable bool identity =
            c < m_charMapStart || (c > m_charMapEnd && c < 0x100);
        if (identity)
        {
            write(cast(E) c);
            return;
        }

        if (c < 0xFFFD)
        {
            for (auto node = 0; node < bstMap.length; )
            {
                immutable key = bstMap[node][0];
                if (key == c)
                {
                    write(cast(E) bstMap[node][1]);
                    return;
                }
                node = 2 * node + (key > c ? 1 : 2); // descend left or right
            }
        }

        write(cast(E) fallback);
    }

    void skipViaRead()()
    {
        read();
    }

    // Code units inside the table range are looked up; all others decode
    // to themselves.
    dchar decodeViaRead()()
    {
        immutable E unit = read();
        if (unit < m_charMapStart || unit > m_charMapEnd)
            return unit;
        return charMap[unit-m_charMapStart];
    }

    // As decodeViaRead, but an unassigned table slot (0xFFFD) is reported
    // as INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        immutable E unit = read();
        dchar decoded = unit;
        if (unit >= m_charMapStart && unit <= m_charMapEnd)
            decoded = charMap[unit-m_charMapStart];
        if (decoded == 0xFFFD)
            return INVALID_SEQUENCE;
        return decoded;
    }

    dchar decodeReverseViaRead()()
    {
        immutable E unit = read();
        if (unit < m_charMapStart || unit > m_charMapEnd)
            return unit;
        return charMap[unit-m_charMapStart];
    }

    @property EString replacementSequence() @safe pure @nogc nothrow
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
845
846//=============================================================================
847//          ASCII
848//=============================================================================
849
850/** Defines various character sets. */
851enum AsciiChar : ubyte { _init }
852/// Ditto
853alias AsciiString = immutable(AsciiChar)[];
854
// Encoder for 7-bit ASCII: code points below 0x80 map to themselves, and
// anything else is written as '?'.
template EncoderInstance(CharType : AsciiChar)
{
    alias E = AsciiChar;
    alias EString = AsciiString;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ASCII";
    }

    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    bool isValidCodeUnit(AsciiChar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    // Every encodable code point occupies exactly one code unit.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    // Writes c, or '?' if c is not ASCII, via r.write.
    void encodeX(Range)(dchar c, Range r)
    {
        immutable dchar encodable = canEncode(c) ? c : '?';
        r.write(cast(AsciiChar) encodable);
    }

    // Writes c, or '?' if c is not ASCII, via the mixed-in write.
    void encodeViaWrite()(dchar c)
    {
        immutable dchar encodable = canEncode(c) ? c : '?';
        write(cast(AsciiChar) encodable);
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        return read();
    }

    // Code units of 0x80 and above are reported as INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        immutable unit = read();
        if (!canEncode(unit))
            return INVALID_SEQUENCE;
        return unit;
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
925
926//=============================================================================
927//          ISO-8859-1
928//=============================================================================
929
/** Defines a Latin1-encoded character. */
931enum Latin1Char : ubyte { _init }
932/**
Defines a Latin1-encoded string (as an array of $(D
934immutable(Latin1Char))).
935 */
936alias Latin1String = immutable(Latin1Char)[];
937
// Encoder for ISO-8859-1: code points below 0x100 map to themselves, and
// anything else is written as '?'.
template EncoderInstance(CharType : Latin1Char)
{
    alias E = Latin1Char;
    alias EString = Latin1String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0xFF;
    }

    // Every code unit value is accepted.
    bool isValidCodeUnit(Latin1Char c) @safe pure nothrow @nogc
    {
        return true;
    }

    // Every encodable code point occupies exactly one code unit.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    // Writes c, or '?' if c is 0x100 or above.
    void encodeViaWrite()(dchar c)
    {
        immutable dchar encodable = canEncode(c) ? c : '?';
        write(cast(Latin1Char) encodable);
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        return read();
    }

    // Returns the code unit read, exactly like decodeViaRead.
    dchar safeDecodeViaRead()()
    {
        return read();
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
1001
1002//=============================================================================
1003//          ISO-8859-2
1004//=============================================================================
1005
1006/// Defines a Latin2-encoded character.
1007enum Latin2Char : ubyte { _init }
1008
1009/**
 * Defines a Latin2-encoded string (as an array of $(D
1011 * immutable(Latin2Char))).
1012 */
1013alias Latin2String = immutable(Latin2Char)[];
1014
// Encoder for ISO-8859-2 (Latin-2), selected when EncoderInstance is
// instantiated with Latin2Char. This template only supplies the encoding's
// name and its lookup tables; the encode/decode routines come from the
// GenericEncoder mixin at the bottom (defined elsewhere in this module).
private template EncoderInstance(CharType : Latin2Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Latin2Char;
    alias EString = Latin2String;

    // Canonical name reported for this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-2";
    }

    // Inclusive range of code units that charMap describes.
    // NOTE(review): code units below m_charMapStart are presumably mapped to
    // the identical code point by GenericEncoder -- confirm there.
    private static immutable dchar m_charMapStart = 0xa1;
    private static immutable dchar m_charMapEnd = 0xff;

    // Decoding table: entry i holds the Unicode code point for code unit
    // (m_charMapStart + i). 95 entries, one per code unit in 0xA1 .. 0xFF.
    private immutable wstring charMap =
        "\u0104\u02D8\u0141\u00A4\u013D\u015A\u00A7\u00A8"~
        "\u0160\u015E\u0164\u0179\u00AD\u017D\u017B\u00B0"~
        "\u0105\u02DB\u0142\u00B4\u013E\u015B\u02C7\u00B8"~
        "\u0161\u015F\u0165\u017A\u02DD\u017E\u017C\u0154"~
        "\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7\u010C"~
        "\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E\u0110"~
        "\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7\u0158"~
        "\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF\u0155"~
        "\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7\u010D"~
        "\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F\u0111"~
        "\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7\u0159"~
        "\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Encoding table: (Unicode code point, code unit) pairs, i.e. the inverse
    // of charMap. The entries are deliberately NOT in ascending order: they
    // appear to be laid out as an array-embedded binary search tree keyed on
    // the code point (for the entry at index i, the entry at 2*i+1 has a
    // smaller key and the entry at 2*i+2 a larger one).
    // NOTE(review): the lookup lives in GenericEncoder -- confirm the
    // traversal there before reordering or editing entries.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u0148','\xF2'), tuple('\u00F3','\xF3'), tuple('\u0165','\xBB'),
        tuple('\u00D3','\xD3'), tuple('\u010F','\xEF'), tuple('\u015B','\xB6'),
        tuple('\u017C','\xBF'), tuple('\u00C1','\xC1'), tuple('\u00E1','\xE1'),
        tuple('\u0103','\xE3'), tuple('\u013A','\xE5'), tuple('\u0155','\xE0'),
        tuple('\u0161','\xB9'), tuple('\u0171','\xFB'), tuple('\u02D8','\xA2'),
        tuple('\u00AD','\xAD'), tuple('\u00C9','\xC9'), tuple('\u00DA','\xDA'),
        tuple('\u00E9','\xE9'), tuple('\u00FA','\xFA'), tuple('\u0107','\xE6'),
        tuple('\u0119','\xEA'), tuple('\u0142','\xB3'), tuple('\u0151','\xF5'),
        tuple('\u0159','\xF8'), tuple('\u015F','\xBA'), tuple('\u0163','\xFE'),
        tuple('\u016F','\xF9'), tuple('\u017A','\xBC'), tuple('\u017E','\xBE'),
        tuple('\u02DB','\xB2'), tuple('\u00A7','\xA7'), tuple('\u00B4','\xB4'),
        tuple('\u00C4','\xC4'), tuple('\u00CD','\xCD'), tuple('\u00D6','\xD6'),
        tuple('\u00DD','\xDD'), tuple('\u00E4','\xE4'), tuple('\u00ED','\xED'),
        tuple('\u00F6','\xF6'), tuple('\u00FD','\xFD'), tuple('\u0105','\xB1'),
        tuple('\u010D','\xE8'), tuple('\u0111','\xF0'), tuple('\u011B','\xEC'),
        tuple('\u013E','\xB5'), tuple('\u0144','\xF1'), tuple('\u0150','\xD5'),
        tuple('\u0154','\xC0'), tuple('\u0158','\xD8'), tuple('\u015A','\xA6'),
        tuple('\u015E','\xAA'), tuple('\u0160','\xA9'), tuple('\u0162','\xDE'),
        tuple('\u0164','\xAB'), tuple('\u016E','\xD9'), tuple('\u0170','\xDB'),
        tuple('\u0179','\xAC'), tuple('\u017B','\xAF'), tuple('\u017D','\xAE'),
        tuple('\u02C7','\xB7'), tuple('\u02D9','\xFF'), tuple('\u02DD','\xBD'),
        tuple('\u00A4','\xA4'), tuple('\u00A8','\xA8'), tuple('\u00B0','\xB0'),
        tuple('\u00B8','\xB8'), tuple('\u00C2','\xC2'), tuple('\u00C7','\xC7'),
        tuple('\u00CB','\xCB'), tuple('\u00CE','\xCE'), tuple('\u00D4','\xD4'),
        tuple('\u00D7','\xD7'), tuple('\u00DC','\xDC'), tuple('\u00DF','\xDF'),
        tuple('\u00E2','\xE2'), tuple('\u00E7','\xE7'), tuple('\u00EB','\xEB'),
        tuple('\u00EE','\xEE'), tuple('\u00F4','\xF4'), tuple('\u00F7','\xF7'),
        tuple('\u00FC','\xFC'), tuple('\u0102','\xC3'), tuple('\u0104','\xA1'),
        tuple('\u0106','\xC6'), tuple('\u010C','\xC8'), tuple('\u010E','\xCF'),
        tuple('\u0110','\xD0'), tuple('\u0118','\xCA'), tuple('\u011A','\xCC'),
        tuple('\u0139','\xC5'), tuple('\u013D','\xA5'), tuple('\u0141','\xA3'),
        tuple('\u0143','\xD1'), tuple('\u0147','\xD2')
    ];

    mixin GenericEncoder!();
}
1081
1082//=============================================================================
1083//          WINDOWS-1250
1084//=============================================================================
1085
// Windows1250Char is a distinct enum type over ubyte (not a plain alias),
// which is what allows EncoderInstance below to be specialised on it.
// Its only named member, _init, has the value 0.
/// Defines a Windows1250-encoded character.
enum Windows1250Char : ubyte { _init }

/**
 * Defines a Windows1250-encoded string (as an array of $(D
 * immutable(Windows1250Char))).
 */
alias Windows1250String = immutable(Windows1250Char)[];
1094
// Encoder for WINDOWS-1250, selected when EncoderInstance is instantiated
// with Windows1250Char. This template only supplies the encoding's name and
// its lookup tables; the encode/decode routines come from the GenericEncoder
// mixin at the bottom (defined elsewhere in this module).
private template EncoderInstance(CharType : Windows1250Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1250Char;
    alias EString = Windows1250String;

    // Canonical name reported for this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1250";
    }

    // Inclusive range of code units that charMap describes.
    // NOTE(review): code units below m_charMapStart are presumably mapped to
    // the identical code point by GenericEncoder -- confirm there.
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0xff;

    // Decoding table: entry i holds the Unicode code point for code unit
    // (m_charMapStart + i). 128 entries, one per code unit in 0x80 .. 0xFF.
    // \uFFFD is a placeholder for code units that have no character assigned
    // (0x81, 0x83, 0x88, 0x90 and 0x98); these have no counterpart in bstMap,
    // and the module's unittests expect e.g. 0x81 to be an invalid code unit.
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\uFFFD\u201E\u2026\u2020\u2021"~
        "\uFFFD\u2030\u0160\u2039\u015A\u0164\u017D\u0179"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\uFFFD\u2122\u0161\u203A\u015B\u0165\u017E\u017A"~
        "\u00A0\u02C7\u02D8\u0141\u00A4\u0104\u00A6\u00A7"~
        "\u00A8\u00A9\u015E\u00AB\u00AC\u00AD\u00AE\u017B"~
        "\u00B0\u00B1\u02DB\u0142\u00B4\u00B5\u00B6\u00B7"~
        "\u00B8\u0105\u015F\u00BB\u013D\u02DD\u013E\u017C"~
        "\u0154\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7"~
        "\u010C\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E"~
        "\u0110\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7"~
        "\u0158\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF"~
        "\u0155\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7"~
        "\u010D\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F"~
        "\u0111\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7"~
        "\u0159\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Encoding table: (Unicode code point, code unit) pairs for the 123
    // assigned code units, i.e. the inverse of charMap. The entries are
    // deliberately NOT in ascending order: they appear to be laid out as an
    // array-embedded binary search tree keyed on the code point (for the
    // entry at index i, the entry at 2*i+1 has a smaller key and the entry at
    // 2*i+2 a larger one).
    // NOTE(review): the lookup lives in GenericEncoder -- confirm the
    // traversal there before reordering or editing entries.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u011A','\xCC'), tuple('\u00DC','\xDC'), tuple('\u0179','\x8F'),
        tuple('\u00B7','\xB7'), tuple('\u00FC','\xFC'), tuple('\u0158','\xD8'),
        tuple('\u201C','\x93'), tuple('\u00AC','\xAC'), tuple('\u00CB','\xCB'),
        tuple('\u00EB','\xEB'), tuple('\u010C','\xC8'), tuple('\u0143','\xD1'),
        tuple('\u0162','\xDE'), tuple('\u02D9','\xFF'), tuple('\u2039','\x8B'),
        tuple('\u00A7','\xA7'), tuple('\u00B1','\xB1'), tuple('\u00C2','\xC2'),
        tuple('\u00D4','\xD4'), tuple('\u00E2','\xE2'), tuple('\u00F4','\xF4'),
        tuple('\u0104','\xA5'), tuple('\u0110','\xD0'), tuple('\u013D','\xBC'),
        tuple('\u0150','\xD5'), tuple('\u015E','\xAA'), tuple('\u016E','\xD9'),
        tuple('\u017D','\x8E'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u00A4','\xA4'), tuple('\u00A9','\xA9'),
        tuple('\u00AE','\xAE'), tuple('\u00B5','\xB5'), tuple('\u00BB','\xBB'),
        tuple('\u00C7','\xC7'), tuple('\u00CE','\xCE'), tuple('\u00D7','\xD7'),
        tuple('\u00DF','\xDF'), tuple('\u00E7','\xE7'), tuple('\u00EE','\xEE'),
        tuple('\u00F7','\xF7'), tuple('\u0102','\xC3'), tuple('\u0106','\xC6'),
        tuple('\u010E','\xCF'), tuple('\u0118','\xCA'), tuple('\u0139','\xC5'),
        tuple('\u0141','\xA3'), tuple('\u0147','\xD2'), tuple('\u0154','\xC0'),
        tuple('\u015A','\x8C'), tuple('\u0160','\x8A'), tuple('\u0164','\x8D'),
        tuple('\u0170','\xDB'), tuple('\u017B','\xAF'), tuple('\u02C7','\xA1'),
        tuple('\u02DD','\xBD'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u00A0','\xA0'), tuple('\u00A6','\xA6'), tuple('\u00A8','\xA8'),
        tuple('\u00AB','\xAB'), tuple('\u00AD','\xAD'), tuple('\u00B0','\xB0'),
        tuple('\u00B4','\xB4'), tuple('\u00B6','\xB6'), tuple('\u00B8','\xB8'),
        tuple('\u00C1','\xC1'), tuple('\u00C4','\xC4'), tuple('\u00C9','\xC9'),
        tuple('\u00CD','\xCD'), tuple('\u00D3','\xD3'), tuple('\u00D6','\xD6'),
        tuple('\u00DA','\xDA'), tuple('\u00DD','\xDD'), tuple('\u00E1','\xE1'),
        tuple('\u00E4','\xE4'), tuple('\u00E9','\xE9'), tuple('\u00ED','\xED'),
        tuple('\u00F3','\xF3'), tuple('\u00F6','\xF6'), tuple('\u00FA','\xFA'),
        tuple('\u00FD','\xFD'), tuple('\u0103','\xE3'), tuple('\u0105','\xB9'),
        tuple('\u0107','\xE6'), tuple('\u010D','\xE8'), tuple('\u010F','\xEF'),
        tuple('\u0111','\xF0'), tuple('\u0119','\xEA'), tuple('\u011B','\xEC'),
        tuple('\u013A','\xE5'), tuple('\u013E','\xBE'), tuple('\u0142','\xB3'),
        tuple('\u0144','\xF1'), tuple('\u0148','\xF2'), tuple('\u0151','\xF5'),
        tuple('\u0155','\xE0'), tuple('\u0159','\xF8'), tuple('\u015B','\x9C'),
        tuple('\u015F','\xBA'), tuple('\u0161','\x9A'), tuple('\u0163','\xFE'),
        tuple('\u0165','\x9D'), tuple('\u016F','\xF9'), tuple('\u0171','\xFB'),
        tuple('\u017A','\x9F'), tuple('\u017C','\xBF'), tuple('\u017E','\x9E'),
        tuple('\u02D8','\xA2'), tuple('\u02DB','\xB2'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    mixin GenericEncoder!();
}
1174
1175//=============================================================================
1176//          WINDOWS-1251
1177//=============================================================================
1178
// Windows1251Char is a distinct enum type over ubyte (not a plain alias),
// which is what allows EncoderInstance below to be specialised on it.
// Its only named member, _init, has the value 0.
/// Defines a Windows1251-encoded character.
enum Windows1251Char : ubyte { _init }

/**
 * Defines a Windows1251-encoded string (as an array of $(D
 * immutable(Windows1251Char))).
 */
alias Windows1251String = immutable(Windows1251Char)[];
1187
// Encoder for WINDOWS-1251, selected when EncoderInstance is instantiated
// with Windows1251Char. This template only supplies the encoding's name and
// its lookup tables; the encode/decode routines come from the GenericEncoder
// mixin at the bottom (defined elsewhere in this module).
private template EncoderInstance(CharType : Windows1251Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1251Char;
    alias EString = Windows1251String;

    // Canonical name reported for this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1251";
    }

    // Inclusive range of code units that charMap describes.
    // NOTE(review): code units below m_charMapStart are presumably mapped to
    // the identical code point by GenericEncoder -- confirm there.
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0xff;

    // Decoding table: entry i holds the Unicode code point for code unit
    // (m_charMapStart + i). 128 entries, one per code unit in 0x80 .. 0xFF.
    // \uFFFD is a placeholder for the single code unit with no character
    // assigned (0x98); it has no counterpart in bstMap, and the module's
    // unittests expect 0x98 to be an invalid code unit.
    private immutable wstring charMap =
        "\u0402\u0403\u201A\u0453\u201E\u2026\u2020\u2021"~
        "\u20AC\u2030\u0409\u2039\u040A\u040C\u040B\u040F"~
        "\u0452\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\uFFFD\u2122\u0459\u203A\u045A\u045C\u045B\u045F"~
        "\u00A0\u040E\u045E\u0408\u00A4\u0490\u00A6\u00A7"~
        "\u0401\u00A9\u0404\u00AB\u00AC\u00AD\u00AE\u0407"~
        "\u00B0\u00B1\u0406\u0456\u0491\u00B5\u00B6\u00B7"~
        "\u0451\u2116\u0454\u00BB\u0458\u0405\u0455\u0457"~
        "\u0410\u0411\u0412\u0413\u0414\u0415\u0416\u0417"~
        "\u0418\u0419\u041A\u041B\u041C\u041D\u041E\u041F"~
        "\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427"~
        "\u0428\u0429\u042A\u042B\u042C\u042D\u042E\u042F"~
        "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437"~
        "\u0438\u0439\u043A\u043B\u043C\u043D\u043E\u043F"~
        "\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447"~
        "\u0448\u0449\u044A\u044B\u044C\u044D\u044E\u044F";

    // Encoding table: (Unicode code point, code unit) pairs for the 127
    // assigned code units, i.e. the inverse of charMap. The entries are
    // deliberately NOT in ascending order: they appear to be laid out as an
    // array-embedded binary search tree keyed on the code point (for the
    // entry at index i, the entry at 2*i+1 has a smaller key and the entry at
    // 2*i+2 a larger one).
    // NOTE(review): the lookup lives in GenericEncoder -- confirm the
    // traversal there before reordering or editing entries.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u0432','\xE2'),tuple('\u0412','\xC2'),tuple('\u0453','\x83'),
        tuple('\u0401','\xA8'),tuple('\u0422','\xD2'),tuple('\u0442','\xF2'),
        tuple('\u2018','\x91'),tuple('\u00AD','\xAD'),tuple('\u0409','\x8A'),
        tuple('\u041A','\xCA'),tuple('\u042A','\xDA'),tuple('\u043A','\xEA'),
        tuple('\u044A','\xFA'),tuple('\u045B','\x9E'),tuple('\u2022','\x95'),
        tuple('\u00A7','\xA7'),tuple('\u00B5','\xB5'),tuple('\u0405','\xBD'),
        tuple('\u040E','\xA1'),tuple('\u0416','\xC6'),tuple('\u041E','\xCE'),
        tuple('\u0426','\xD6'),tuple('\u042E','\xDE'),tuple('\u0436','\xE6'),
        tuple('\u043E','\xEE'),tuple('\u0446','\xF6'),tuple('\u044E','\xFE'),
        tuple('\u0457','\xBF'),tuple('\u0490','\xA5'),tuple('\u201D','\x94'),
        tuple('\u203A','\x9B'),tuple('\u00A4','\xA4'),tuple('\u00AB','\xAB'),
        tuple('\u00B0','\xB0'),tuple('\u00B7','\xB7'),tuple('\u0403','\x81'),
        tuple('\u0407','\xAF'),tuple('\u040B','\x8E'),tuple('\u0410','\xC0'),
        tuple('\u0414','\xC4'),tuple('\u0418','\xC8'),tuple('\u041C','\xCC'),
        tuple('\u0420','\xD0'),tuple('\u0424','\xD4'),tuple('\u0428','\xD8'),
        tuple('\u042C','\xDC'),tuple('\u0430','\xE0'),tuple('\u0434','\xE4'),
        tuple('\u0438','\xE8'),tuple('\u043C','\xEC'),tuple('\u0440','\xF0'),
        tuple('\u0444','\xF4'),tuple('\u0448','\xF8'),tuple('\u044C','\xFC'),
        tuple('\u0451','\xB8'),tuple('\u0455','\xBE'),tuple('\u0459','\x9A'),
        tuple('\u045E','\xA2'),tuple('\u2013','\x96'),tuple('\u201A','\x82'),
        tuple('\u2020','\x86'),tuple('\u2030','\x89'),tuple('\u2116','\xB9'),
        tuple('\u00A0','\xA0'),tuple('\u00A6','\xA6'),tuple('\u00A9','\xA9'),
        tuple('\u00AC','\xAC'),tuple('\u00AE','\xAE'),tuple('\u00B1','\xB1'),
        tuple('\u00B6','\xB6'),tuple('\u00BB','\xBB'),tuple('\u0402','\x80'),
        tuple('\u0404','\xAA'),tuple('\u0406','\xB2'),tuple('\u0408','\xA3'),
        tuple('\u040A','\x8C'),tuple('\u040C','\x8D'),tuple('\u040F','\x8F'),
        tuple('\u0411','\xC1'),tuple('\u0413','\xC3'),tuple('\u0415','\xC5'),
        tuple('\u0417','\xC7'),tuple('\u0419','\xC9'),tuple('\u041B','\xCB'),
        tuple('\u041D','\xCD'),tuple('\u041F','\xCF'),tuple('\u0421','\xD1'),
        tuple('\u0423','\xD3'),tuple('\u0425','\xD5'),tuple('\u0427','\xD7'),
        tuple('\u0429','\xD9'),tuple('\u042B','\xDB'),tuple('\u042D','\xDD'),
        tuple('\u042F','\xDF'),tuple('\u0431','\xE1'),tuple('\u0433','\xE3'),
        tuple('\u0435','\xE5'),tuple('\u0437','\xE7'),tuple('\u0439','\xE9'),
        tuple('\u043B','\xEB'),tuple('\u043D','\xED'),tuple('\u043F','\xEF'),
        tuple('\u0441','\xF1'),tuple('\u0443','\xF3'),tuple('\u0445','\xF5'),
        tuple('\u0447','\xF7'),tuple('\u0449','\xF9'),tuple('\u044B','\xFB'),
        tuple('\u044D','\xFD'),tuple('\u044F','\xFF'),tuple('\u0452','\x90'),
        tuple('\u0454','\xBA'),tuple('\u0456','\xB3'),tuple('\u0458','\xBC'),
        tuple('\u045A','\x9C'),tuple('\u045C','\x9D'),tuple('\u045F','\x9F'),
        tuple('\u0491','\xB4'),tuple('\u2014','\x97'),tuple('\u2019','\x92'),
        tuple('\u201C','\x93'),tuple('\u201E','\x84'),tuple('\u2021','\x87'),
        tuple('\u2026','\x85'),tuple('\u2039','\x8B'),tuple('\u20AC','\x88'),
        tuple('\u2122','\x99')
    ];

    mixin GenericEncoder!();
}
1269
1270//=============================================================================
1271//          WINDOWS-1252
1272//=============================================================================
1273
// Windows1252Char is a distinct enum type over ubyte (not a plain alias),
// which is what allows EncoderInstance below to be specialised on it.
// Its only named member, _init, has the value 0.
/// Defines a Windows1252-encoded character.
enum Windows1252Char : ubyte { _init }

/**
 * Defines a Windows1252-encoded string (as an array of $(D
 * immutable(Windows1252Char))).
 */
alias Windows1252String = immutable(Windows1252Char)[];
1282
// Encoder for WINDOWS-1252, selected when EncoderInstance is instantiated
// with Windows1252Char. This template only supplies the encoding's name and
// its lookup tables; the encode/decode routines come from the GenericEncoder
// mixin at the bottom (defined elsewhere in this module).
template EncoderInstance(CharType : Windows1252Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1252Char;
    alias EString = Windows1252String;

    // Canonical name reported for this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1252";
    }

    // Inclusive range of code units that charMap describes. Unlike the other
    // Windows code pages in this module, only 0x80 .. 0x9F is tabulated.
    // NOTE(review): code units outside this range (both below 0x80 and
    // 0xA0 .. 0xFF) are presumably mapped to the identical code point by
    // GenericEncoder -- confirm there.
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0x9f;

    // Decoding table: entry i holds the Unicode code point for code unit
    // (m_charMapStart + i). 32 entries, one per code unit in 0x80 .. 0x9F.
    // \uFFFD is a placeholder for code units that have no character assigned
    // (0x81, 0x8D, 0x8F, 0x90 and 0x9D); these have no counterpart in bstMap,
    // and the module's unittests expect e.g. 0x81 to be an invalid code unit.
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"~
        "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178";

    // Encoding table: (Unicode code point, code unit) pairs for the 27
    // assigned code units, i.e. the inverse of charMap. The entries are
    // deliberately NOT in ascending order: they appear to be laid out as an
    // array-embedded binary search tree keyed on the code point (for the
    // entry at index i, the entry at 2*i+1 has a smaller key and the entry at
    // 2*i+2 a larger one).
    // NOTE(review): the lookup lives in GenericEncoder -- confirm the
    // traversal there before reordering or editing entries.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u201C','\x93'), tuple('\u0192','\x83'), tuple('\u2039','\x8B'),
        tuple('\u0161','\x9A'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u0153','\x9C'), tuple('\u017D','\x8E'),
        tuple('\u02DC','\x98'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u0152','\x8C'), tuple('\u0160','\x8A'), tuple('\u0178','\x9F'),
        tuple('\u017E','\x9E'), tuple('\u02C6','\x88'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    mixin GenericEncoder!();
}
1318
1319//=============================================================================
1320//          UTF-8
1321//=============================================================================
1322
// Encoder for UTF-8, selected when EncoderInstance is instantiated with char.
//
// The *ViaRead / *ViaWrite function templates below are written against the
// primitives read(), peek(), canRead and write(), none of which is declared
// in this template.
// NOTE(review): those primitives are presumably supplied by the
// EncoderFunctions mixin at the bottom -- confirm there.
template EncoderInstance(CharType : char)
{
    alias E = char;
    alias EString = immutable(char)[];

    // Canonical name reported for this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-8";
    }

    // UTF-8 can represent exactly the valid code points.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // Rejects the bytes 0xC0, 0xC1 and 0xF5 .. 0xFF; every other byte value
    // is accepted.
    bool isValidCodeUnit(char c) @safe pure nothrow @nogc
    {
        return (c < 0xC0 || (c >= 0xC2 && c < 0xF5));
    }

    // Number of trailing bytes announced by a byte >= 0x80, indexed by
    // (byte - 0x80):
    //   0x80 .. 0xBF -> 0   (continuation bytes, never a lead byte)
    //   0xC0 .. 0xDF -> 1
    //   0xE0 .. 0xEF -> 2
    //   0xF0 .. 0xF7 -> 3
    //   0xF8 .. 0xFB -> 4,  0xFC .. 0xFD -> 5,  0xFE -> 6,  0xFF -> 0
    immutable ubyte[128] tailTable =
    [
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,
    ];

    // Looks up the trailing-byte count for c in tailTable.
    // The in-contract requires c >= 0x80; smaller values would index below
    // the start of the table.
    private int tails(char c) @safe pure nothrow @nogc
    in
    {
        assert(c >= 0x80);
    }
    do
    {
        return tailTable[c-0x80];
    }

    // Number of code units (1 to 4) needed to encode c.
    // The in-contract requires c to be encodable.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        if (c < 0x80) return 1;
        if (c < 0x800) return 2;
        if (c < 0x10000) return 3;
        return 4;
    }

    // Emits the 1- to 4-byte encoding of c through write(), lead byte first.
    // Each continuation byte carries 6 payload bits on top of 0x80; the lead
    // byte carries the remaining high bits on top of 0xC0 / 0xE0 / 0xF0.
    // No validity check is made on c here.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x80)
        {
            write(cast(char) c);
        }
        else if (c < 0x800)
        {
            write(cast(char)((c >> 6) + 0xC0));
            write(cast(char)((c & 0x3F) + 0x80));
        }
        else if (c < 0x10000)
        {
            write(cast(char)((c >> 12) + 0xE0));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
        }
        else
        {
            write(cast(char)((c >> 18) + 0xF0));
            write(cast(char)(((c >> 12) & 0x3F) + 0x80));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
        }
    }

    // Consumes one encoded sequence without decoding it: one read() for the
    // first byte plus one read() per trailing byte announced by tailTable.
    // Bytes below 0xC0 (ASCII and continuation bytes) count as a sequence of
    // length one. No validation is performed.
    void skipViaRead()()
    {
        auto c = read();
        if (c < 0xC0) return;
        int n = tails(cast(char) c);
        for (size_t i=0; i<n; ++i)
        {
            read();
        }
    }

    // Decodes one sequence without any validation.
    // A first byte below 0xC0 is returned unchanged. Otherwise the low
    // (6 - n) bits of the lead byte are kept and 6 bits are appended from
    // each of the n trailing bytes.
    dchar decodeViaRead()()
    {
        dchar c = read();
        if (c < 0xC0) return c;
        int n = tails(cast(char) c);
        c &= (1 << (6 - n)) - 1;
        for (size_t i=0; i<n; ++i)
        {
            c = (c << 6) + (read() & 0x3F);
        }
        return c;
    }

    // Decodes one sequence with validation; returns INVALID_SEQUENCE for
    // malformed input instead of a code point.
    dchar safeDecodeViaRead()()
    {
        dchar c = read();
        if (c < 0x80) return c;
        int n = tails(cast(char) c);
        // n == 0 here means c is a continuation byte or 0xFF: not a lead byte.
        if (n == 0) return INVALID_SEQUENCE;

        if (!canRead) return INVALID_SEQUENCE;
        // Inspect (without consuming) the second byte: together with the lead
        // byte it is enough to recognise every ill-formed case listed below.
        size_t d = peek();
        immutable err =
        (
            (c < 0xC2)                              // fail overlong 2-byte sequences
        ||  (c > 0xF4)                              // fail overlong 4-6-byte sequences
        ||  (c == 0xE0 && ((d & 0xE0) == 0x80))     // fail overlong 3-byte sequences
        ||  (c == 0xED && ((d & 0xE0) == 0xA0))     // fail surrogates
        ||  (c == 0xF0 && ((d & 0xF0) == 0x80))     // fail overlong 4-byte sequences
        ||  (c == 0xF4 && ((d & 0xF0) >= 0x90))     // fail code points > 0x10FFFF
        );

        // The error flag is only reported after the loop, so the trailing
        // bytes of an ill-formed sequence are still consumed.
        c &= (1 << (6 - n)) - 1;
        for (size_t i=0; i<n; ++i)
        {
            if (!canRead) return INVALID_SEQUENCE;
            // Peek first: a byte that is not a continuation byte is left
            // unread so that it can start the next sequence.
            d = peek();
            if ((d & 0xC0) != 0x80) return INVALID_SEQUENCE;
            c = (c << 6) + (read() & 0x3F);
        }

        return err ? INVALID_SEQUENCE : c;
    }

    // Decodes one sequence whose bytes are delivered by read() in reverse
    // order (last byte of the sequence first), without validation.
    // NOTE(review): assumes read() pops from the end of the input in this
    // context -- confirm in the reverse-reading mixin.
    dchar decodeReverseViaRead()()
    {
        dchar c = read();
        if (c < 0x80) return c;
        size_t shift = 0;
        // c is the final continuation byte: keep its 6 payload bits.
        c &= 0x3F;
        for (size_t i=0; i<4; ++i)
        {
            shift += 6;
            auto d = read();
            // tails() asserts d >= 0x80, so an ASCII byte here (malformed
            // input) trips its in-contract.
            size_t n = tails(cast(char) d);
            // n == 0: another continuation byte (6 payload bits);
            // otherwise d is the lead byte and carries (6 - n) payload bits.
            immutable mask = n == 0 ? 0x3F : (1 << (6 - n)) - 1;
            c += ((d & mask) << shift);
            if (n != 0) break;
        }
        return c;
    }

    // Code units written in place of a malformed sequence: U+FFFD.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD";
    }

    mixin EncoderFunctions;
}
1484
1485//=============================================================================
1486//          UTF-16
1487//=============================================================================
1488
// Encoder for UTF-16, selected when EncoderInstance is instantiated with
// wchar.
//
// The *ViaRead / *ViaWrite function templates use the primitives read(),
// peek(), canRead and write(), which are not declared in this template.
// NOTE(review): they are presumably supplied by the EncoderFunctions mixin at
// the bottom -- confirm there.
template EncoderInstance(CharType : wchar)
{
    alias E = wchar;
    alias EString = immutable(wchar)[];

    // Canonical name reported for this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-16";
    }

    // UTF-16 can represent exactly the valid code points.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // Every 16-bit value is accepted as a code unit, surrogates included.
    bool isValidCodeUnit(wchar c) @safe pure nothrow @nogc
    {
        return true;
    }

    // Number of code units needed for c: one below U+10000, otherwise two
    // (a surrogate pair). The in-contract requires c to be encodable.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        if (c < 0x10000)
            return 1;
        return 2;
    }

    // Emits c through write(): a single unit below U+10000, otherwise the
    // high surrogate followed by the low surrogate.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x10000)
        {
            write(cast(wchar) c);
            return;
        }

        // 20-bit offset above the BMP: top 10 bits go into the high
        // surrogate, bottom 10 bits into the low surrogate.
        size_t offset = c - 0x10000;
        wchar highSurrogate = cast(wchar)(0xD800 + (offset >> 10));
        wchar lowSurrogate = cast(wchar)(0xDC00 + (offset & 0x3FF));
        write(highSurrogate);
        write(lowSurrogate);
    }

    // Consumes one encoded sequence: one unit, plus a second one when the
    // first is in the surrogate range. No validation is performed.
    void skipViaRead()()
    {
        immutable unit = read();
        immutable isSurrogate = unit >= 0xD800 && unit < 0xE000;
        if (isSurrogate)
            read();
    }

    // Decodes one sequence without validation. A unit in the surrogate range
    // is combined with the unit that follows it.
    dchar decodeViaRead()()
    {
        wchar first = read();
        immutable isSurrogate = first >= 0xD800 && first < 0xE000;
        if (!isSurrogate)
            return cast(dchar) first;

        wchar second = read();
        return 0x10000 + ((first & 0x3FF) << 10) + (second & 0x3FF);
    }

    // Decodes one sequence with validation; returns INVALID_SEQUENCE for a
    // lone low surrogate, or for a high surrogate that is not followed by a
    // low surrogate.
    dchar safeDecodeViaRead()()
    {
        wchar lead = read();
        immutable isSurrogate = lead >= 0xD800 && lead < 0xE000;
        if (!isSurrogate)
            return cast(dchar) lead;

        // A low surrogate cannot start a pair; a high surrogate needs a
        // partner to be available.
        if (lead >= 0xDC00 || !canRead)
            return INVALID_SEQUENCE;

        // Peek first so that a non-matching unit is left unread.
        wchar trail = peek();
        immutable isLowSurrogate = trail >= 0xDC00 && trail < 0xE000;
        if (!isLowSurrogate)
            return INVALID_SEQUENCE;

        trail = read();
        return 0x10000 + ((lead & 0x3FF) << 10) + (trail & 0x3FF);
    }

    // Decodes one sequence whose units are delivered by read() in reverse
    // order: for a pair, the first unit read supplies the low 10 bits and the
    // second the high 10 bits. No validation is performed.
    dchar decodeReverseViaRead()()
    {
        wchar last = read();
        immutable isSurrogate = last >= 0xD800 && last < 0xE000;
        if (!isSurrogate)
            return cast(dchar) last;

        wchar previous = read();
        return 0x10000 + ((previous & 0x3FF) << 10) + (last & 0x3FF);
    }

    // Code units written in place of a malformed sequence: U+FFFD.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD"w;
    }

    mixin EncoderFunctions;
}
1581
1582//=============================================================================
1583//          UTF-32
1584//=============================================================================
1585
// Encoder for UTF-32, selected when EncoderInstance is instantiated with
// dchar. One code unit is one code point, so most routines are trivial.
//
// The *ViaRead / *ViaWrite function templates use the primitives read() and
// write(), which are not declared in this template.
// NOTE(review): they are presumably supplied by the EncoderFunctions mixin at
// the bottom -- confirm there.
template EncoderInstance(CharType : dchar)
{
    alias E = dchar;
    alias EString = immutable(dchar)[];

    // Canonical name reported for this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-32";
    }

    // UTF-32 can represent exactly the valid code points.
    bool canEncode(dchar c) @safe pure @nogc nothrow
    {
        return isValidCodePoint(c);
    }

    // A code unit is valid exactly when it is a valid code point.
    bool isValidCodeUnit(dchar c) @safe pure @nogc nothrow
    {
        return isValidCodePoint(c);
    }

    // Always one code unit. The in-contract requires c to be encodable.
    size_t encodedLength(dchar c) @safe pure @nogc nothrow
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    // Emits c unchanged as a single code unit.
    void encodeViaWrite()(dchar c)
    {
        write(c);
    }

    // Consumes exactly one code unit.
    void skipViaRead()()
    {
        read();
    }

    // Returns the next code unit as is, without validation.
    dchar decodeViaRead()()
    {
        auto unit = read();
        return cast(dchar) unit;
    }

    // Returns the next code unit, or INVALID_SEQUENCE when it is not a valid
    // code point.
    dchar safeDecodeViaRead()()
    {
        immutable unit = read();
        if (!isValidCodePoint(unit))
            return INVALID_SEQUENCE;
        return unit;
    }

    // Same as decodeViaRead: a single unit reads the same in either direction.
    dchar decodeReverseViaRead()()
    {
        auto unit = read();
        return cast(dchar) unit;
    }

    // Code units written in place of a malformed sequence: U+FFFD.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD"d;
    }

    mixin EncoderFunctions;
}
1649
1650//=============================================================================
1651// Below are forwarding functions which expose the function to the user
1652
/**
 Tests whether `c` is a valid code point, i.e. lies below U+110000 and
 outside the surrogate range U+D800 .. U+DFFF.

 The non-character code points U+FFFE and U+FFFF are accepted: they are
 valid code points even though they are not valid characters.

 Supersedes:
 This function supersedes `std.utf.startsValidDchar()`.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code point to be tested

 Returns:
    `true` if `c` is a valid code point, `false` otherwise
 */
bool isValidCodePoint(dchar c) @safe pure nothrow @nogc
{
    immutable isSurrogate = c >= 0xD800 && c < 0xE000;
    immutable inUnicodeRange = c < 0x110000;
    return inUnicodeRange && !isSurrogate;
}
1673
/**
 Returns the name of the encoding identified by the code unit type `T`.

 There is no argument from which the encoding could be deduced, so the
 encoding type must always be specified explicitly.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252
 */
@property string encodingName(T)()
{
    alias Encoder = EncoderInstance!T;
    return Encoder.encodingName;
}

///
@safe unittest
{
    // Unicode transformation formats
    assert(encodingName!char == "UTF-8");
    assert(encodingName!wchar == "UTF-16");
    assert(encodingName!dchar == "UTF-32");

    // Single-byte encodings
    assert(encodingName!AsciiChar == "ASCII");
    assert(encodingName!Latin1Char == "ISO-8859-1");
    assert(encodingName!Latin2Char == "ISO-8859-2");
    assert(encodingName!Windows1250Char == "windows-1250");
    assert(encodingName!Windows1251Char == "windows-1251");
    assert(encodingName!Windows1252Char == "windows-1252");
}
1701
/**
 Returns true iff the code point `c` can be represented in the encoding
 identified by the code unit type `E`.

 The encoding cannot be deduced from the argument, so the encoding type
 must always be specified explicitly.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code point to be tested
 */
bool canEncode(E)(dchar c)
{
    alias Encoder = EncoderInstance!E;
    return Encoder.canEncode(c);
}

///
@safe pure unittest
{
    // Plain ASCII letters
    assert( canEncode!Latin1Char('A'));
    assert( canEncode!Latin2Char('A'));

    // U+00A0 (no-break space)
    assert(!canEncode!AsciiChar('\u00A0'));
    assert( canEncode!Latin1Char('\u00A0'));
    assert( canEncode!Latin2Char('\u00A0'));

    // Windows code pages
    assert( canEncode!Windows1250Char('\u20AC'));
    assert(!canEncode!Windows1250Char('\u20AD'));
    assert(!canEncode!Windows1250Char('\uFFFD'));
    assert( canEncode!Windows1251Char('\u0402'));
    assert(!canEncode!Windows1251Char('\u20AD'));
    assert(!canEncode!Windows1251Char('\uFFFD'));
    assert( canEncode!Windows1252Char('\u20AC'));
    assert(!canEncode!Windows1252Char('\u20AD'));
    assert(!canEncode!Windows1252Char('\uFFFD'));

    // Beyond the last valid code point
    assert(!canEncode!char(cast(dchar) 0x110000));
}

/// How to check an entire string
@safe pure unittest
{
    import std.algorithm.searching : find;
    import std.utf : byDchar;

    // Search for the first code point that ASCII cannot represent;
    // an empty result means there is none.
    auto offending = "The quick brown fox"
        .byDchar
        .find!(ch => !canEncode!AsciiChar(ch));
    assert(offending.empty);
}
1748
/**
 Returns true if `c` is a legal code unit in its encoding. The byte 0x80,
 for example, is not legal in ASCII, because ASCII code units must always
 lie in the range 0x00 to 0x7F.

 The encoding is selected by the type of `c`.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code unit to be tested
 */
bool isValidCodeUnit(E)(E c)
{
    alias Encoder = EncoderInstance!E;
    return Encoder.isValidCodeUnit(c);
}

///
@system pure unittest
{
    // UTF-8, UTF-16, UTF-32
    assert(!isValidCodeUnit(cast(char) 0xC0));
    assert(!isValidCodeUnit(cast(char) 0xFF));
    assert( isValidCodeUnit(cast(wchar) 0xD800));
    assert(!isValidCodeUnit(cast(dchar) 0xD800));

    // Single-byte encodings
    assert(!isValidCodeUnit(cast(AsciiChar) 0xA0));
    assert( isValidCodeUnit(cast(Windows1250Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1250Char) 0x81));
    assert( isValidCodeUnit(cast(Windows1251Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1251Char) 0x98));
    assert( isValidCodeUnit(cast(Windows1252Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1252Char) 0x81));
}
1780
/**
 Returns true if the whole of `s` is correctly encoded, i.e. if its longest
 valid prefix is the string itself.

 Supersedes:
 This function supersedes std.utf.validate(), however note that this
 function returns a bool indicating whether the input was valid or not,
 whereas the older function would throw an exception.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be tested
 */
bool isValid(E)(const(E)[] s)
{
    immutable validPrefix = validLength(s);
    return validPrefix == s.length;
}

///
@system pure unittest
{
    // Well-formed UTF-8
    assert( isValid("\u20AC100"));
    // Starts with a continuation byte, so it is malformed UTF-8
    assert(!isValid(cast(char[3])[167, 133, 175]));
}
1806
/**
 Returns the length, in code units, of the longest prefix of `s` that is
 validly encoded.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be tested
 */
size_t validLength(E)(const(E)[] s)
{
    alias Encoder = EncoderInstance!E;

    size_t validSoFar = 0;
    while (s.length != 0)
    {
        // safeDecode consumes code units from the front of s, so the
        // difference in length is the size of the sequence just decoded.
        immutable remainingBefore = s.length;
        if (Encoder.safeDecode(s) == INVALID_SEQUENCE)
            return validSoFar;
        validSoFar += remainingBefore - s.length;
    }
    return validSoFar;
}
1828
/**
 Sanitizes a string by replacing malformed code unit sequences with valid
 code unit sequences. The result is guaranteed to be valid for this encoding.

 If the input string is already valid, this function returns the original.
 Otherwise it constructs a new string in which every illegal code unit
 sequence is replaced with the encoding's replacement sequence. Invalid
 sequences will be replaced with the Unicode replacement character (U+FFFD)
 if the character repertoire contains it, otherwise invalid sequences will be
 replaced with '?'.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be sanitized
 */
immutable(E)[] sanitize(E)(immutable(E)[] s)
{
    immutable validPrefix = validLength(s);
    if (validPrefix == s.length)
        return s;

    auto replacement = EncoderInstance!(E).replacementSequence;

    // First pass: compute an upper bound for the output length. Every invalid
    // sequence costs one replacement sequence; the removed code units are not
    // subtracted, so the bound may be larger than what is finally needed.
    size_t capacity = s.length;
    for (const(E)[] rest = s[validPrefix .. $]; rest.length != 0; )
    {
        // `rest` always starts at an invalid sequence here.
        immutable bad = EncoderInstance!(E).safeDecode(rest);
        assert(bad == INVALID_SEQUENCE);
        capacity += replacement.length;
        rest = rest[validLength(rest) .. $];
    }

    // Second pass: copy the valid runs and splice in the replacements.
    auto output = new E[capacity];
    output[0 .. validPrefix] = s[0 .. validPrefix];
    size_t written = validPrefix;

    for (const(E)[] rest = s[validPrefix .. $]; rest.length != 0; )
    {
        immutable bad = EncoderInstance!(E).safeDecode(rest);
        assert(bad == INVALID_SEQUENCE);

        immutable afterReplacement = written + replacement.length;
        output[written .. afterReplacement] = replacement[];

        // Copy the valid run that follows the sequence just dropped.
        immutable run = validLength(rest);
        output[afterReplacement .. afterReplacement + run] = rest[0 .. run];

        written = afterReplacement + run;
        rest = rest[run .. $];
    }
    return cast(immutable(E)[]) output[0 .. written];
}
1884
///
@system pure unittest
{
    // The malformed "\xF0\x80" is replaced by U+FFFD (EF BF BD in UTF-8).
    string dirty = "hello \xF0\x80world";
    string clean = "hello \xEF\xBF\xBDworld";
    assert(sanitize(dirty) == clean);
}
1890
/**
 Returns the length, in code units, of the first encoded sequence.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be sliced
 */
size_t firstSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    // Probe a copy so the contract does not consume the caller's slice.
    const(E)[] probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
do
{
    // Skip one sequence on a local slice and measure how far it advanced.
    const(E)[] rest = s;
    EncoderInstance!(E).skip(rest);
    return s.length - rest.length;
}
1916
///
@system pure unittest
{
    // A multi-unit first character is reported with its full length.
    string euroFirst = "\u20AC1000";
    assert(firstSequence(euroFirst) == "\u20AC".length);

    // An ASCII first character is a single code unit.
    string ascii = "hel";
    assert(firstSequence(ascii) == "h".length);
}
1923
/**
 Returns the length, in code units, of the last encoded sequence.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be sliced
 */
size_t lastSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
do
{
    // decodeReverse shortens `s` from the back; the difference in length is
    // the size of the final sequence.
    immutable lengthBefore = s.length;
    EncoderInstance!(E).decodeReverse(s);
    return lengthBefore - s.length;
}
1948
///
@system pure unittest
{
    assert(lastSequence("1000\u20AC") == "\u20AC".length);
    // U+00F6 (o with diaeresis) is written as an escape so the literal cannot
    // be damaged by a source-file re-encoding.
    assert(lastSequence("hell\u00F6") == "\u00F6".length);
}
1955
/**
 Returns the array index at which the (n+1)th code point begins.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.toUTFindex().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be counted
    n = the current code point index
 */
ptrdiff_t index(E)(const(E)[] s,int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
do
{
    immutable lengthBefore = s.length;
    // Skip n code points; whatever was consumed is the offset of the next one.
    immutable size_t steps = n;
    foreach (_; 0 .. steps)
        EncoderInstance!(E).skip(s);
    return lengthBefore - s.length;
}
1984
///
@system pure unittest
{
    assert(index("\u20AC100",1) == 3);
    // 'h' is one code unit and U+00E4 is two, so the third code point starts
    // at offset 3. The escape keeps the literal safe from re-encoding damage.
    assert(index("h\u00E4llo",2) == 3);
}
1991
/**
 Decodes a single code point.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.decode(), however, note that the
 function codePoints() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded
 */
dchar decode(S)(ref S s)
in
{
    assert(s.length != 0);
    // Validate on a copy so the contract leaves the caller's slice untouched.
    auto probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
do
{
    // The encoder is selected by the element type of the string.
    alias CodeUnit = typeof(s[0]);
    return EncoderInstance!(CodeUnit).decode(s);
}
2022
/**
 Decodes a single code point from the end of a string.

 This function removes one or more code units from the end of a string,
 and returns the decoded code point which those code units represent.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string whose last code point is to be decoded
 */
dchar decodeReverse(E)(ref const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
do
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.decodeReverse(s);
}
2048
/**
 Decodes a single code point. The input does not have to be valid.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 This function will accept an invalidly encoded string as input.
 If an invalid sequence is found at the start of the string, this
 function will remove it, and return the value INVALID_SEQUENCE.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded
 */
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
do
{
    // The encoder is selected by the element type of the string.
    alias CodeUnit = typeof(s[0]);
    return EncoderInstance!(CodeUnit).safeDecode(s);
}
2074
/**
 Returns the number of code units required to encode a single code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
size_t encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.encodedLength(c);
}
2099
/**
 Encodes a single code point.

 This function encodes a single code point into one or more code units.
 It returns a string containing those code units.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.encode(c);
}
2131
/**
 Encodes a single code point into an array.

 This function encodes a single code point into one or more code units.
 The code units are stored in a user-supplied fixed-size array,
 which must be passed by reference.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c     = the code point to be encoded
    array = the destination array

 Returns:
          the number of code units written to the array
 */
size_t encode(E)(dchar c, E[] array)
in
{
    assert(isValidCodePoint(c));
}
do
{
    // The encoder advances the slice it is given, so the number of units
    // written is how much shorter the working slice has become.
    E[] unwritten = array;
    EncoderInstance!(E).encode(c, unwritten);
    return array.length - unwritten.length;
}
2170
/*
Encodes `c` in units of type `E` and writes the result to the
output range `R`. Returns the number of `E`s written.

Only the UTF code unit types `char`, `wchar` and `dchar` are supported for
`E`; any other type is rejected at compile time.
 */
size_t encode(E, R)(dchar c, auto ref R range)
if (isNativeOutputRange!(R, E))
{
    static if (is(immutable E == immutable char))
    {
        // Number of UTF-8 code units needed. Zero marks a value above
        // U+10FFFF, which is rejected below before anything is written.
        immutable size_t units = c <= 0x7F     ? 1
                               : c <= 0x7FF    ? 2
                               : c <= 0xFFFF   ? 3
                               : c <= 0x10FFFF ? 4
                               : 0;
        switch (units)
        {
            case 1:
                put(range, cast(char) c);
                break;
            case 2:
                put(range, cast(char)(0xC0 | (c >> 6)));
                put(range, cast(char)(0x80 | (c & 0x3F)));
                break;
            case 3:
                put(range, cast(char)(0xE0 | (c >> 12)));
                put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
                put(range, cast(char)(0x80 | (c & 0x3F)));
                break;
            case 4:
                put(range, cast(char)(0xF0 | (c >> 18)));
                put(range, cast(char)(0x80 | ((c >> 12) & 0x3F)));
                put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
                put(range, cast(char)(0x80 | (c & 0x3F)));
                break;
            default:
                assert(0);
        }
        return units;
    }
    else static if (is(immutable E == immutable wchar))
    {
        if (c > 0xFFFF)
        {
            // Outside the BMP: emit a high/low surrogate pair carrying the
            // upper and lower ten bits of the offset from 0x10000.
            immutable offset = c - 0x10000;
            range.put(cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800));
            range.put(cast(wchar) ((offset & 0x3FF) + 0xDC00));
            return 2;
        }
        range.put(cast(wchar) c);
        return 1;
    }
    else static if (is(immutable E == immutable dchar))
    {
        // UTF-32: the code point is its own single code unit.
        range.put(c);
        return 1;
    }
    else
    {
        static assert(0);
    }
}
2232
@safe pure unittest
{
    import std.array;

    // An ASCII code point is a single code unit in UTF-8, UTF-16 and UTF-32.
    Appender!(char[]) sink;
    immutable dchar probe = 'T';
    assert(encode!(char)(probe, sink) == 1);
    assert(encode!(wchar)(probe, sink) == 1);
    assert(encode!(dchar)(probe, sink) == 1);
}
2241
/**
 Encodes a single code point to a delegate.

 This function encodes a single code point into one or more code units.
 The code units are passed one at a time to the supplied delegate.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c  = the code point to be encoded
    dg = the delegate to invoke for each code unit
 */
void encode(E)(dchar c, void delegate(E) dg)
in
{
    assert(isValidCodePoint(c));
}
do
{
    alias Encoder = EncoderInstance!(E);
    Encoder.encode(c, dg);
}
2274
/**
Encodes the contents of `s` in units of type `Tgt`, writing the result to an
output range.

The input is decoded one code point at a time, so source characters that
occupy several code units (for example non-ASCII UTF-8, or UTF-16 surrogate
pairs) are re-encoded as a single code point. `s` must be validly encoded;
this is enforced by the in-contract of `decode`.

Returns: The number of `Tgt` elements written.
Params:
Tgt = Element type of `range`.
s = Input array.
range = Output range.
 */
size_t encode(Tgt, Src, R)(in Src[] s, R range)
{
    size_t result;
    // Work on a local slice: decode() consumes code units from the front.
    const(Src)[] rest = s;
    while (rest.length != 0)
    {
        // Iterating `s` element by element would hand raw code units to the
        // encoder as if they were code points; decode whole sequences instead.
        result += encode!(Tgt)(decode(rest), range);
    }
    return result;
}
2294
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code points in a string.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 You can foreach either
 with or without an index. If an index is specified, it will be initialized
 at each iteration with the offset into the string at which the code point
 begins.

 Supersedes:
 This function supersedes std.utf.decode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be decoded

 Example:
 --------------------------------------------------------
 string s = "hello world";
 foreach (c;codePoints(s))
 {
     // do something with c (which will always be a dchar)
 }
 --------------------------------------------------------

 Note that, currently, foreach (c;codePoints(s)) is superior to foreach (c;s)
 in that the latter will fall over on encountering U+FFFF.
 */
CodePoints!(E) codePoints(E)(immutable(E)[] s)
in
{
    assert(isValid(s));
}
do
{
    alias Iterable = CodePoints!(E);
    return Iterable(s);
}
2337
///
@system unittest
{
    // Rebuilding an ASCII string from its code points yields the same string.
    string original = "hello";
    string rebuilt;
    foreach (cp; codePoints(original))
        rebuilt ~= cast(char) cp;
    assert(original == rebuilt);
}
2349
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code units in a code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding type in the template parameter.

 Supersedes:
 This function supersedes std.utf.encode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    alias Iterable = CodeUnits!(E);
    return Iterable(c);
}
2378
///
@system unittest
{
    // The euro sign U+20AC is the three UTF-8 code units E2 82 AC.
    char[] units;
    foreach (unit; codeUnits!(char)(cast(dchar)'\u20AC'))
        units ~= unit;
    assert(units.length == 3);
    assert(units == "\xE2\x82\xAC");
}
2392
/**
 Convert a string from one encoding to another.

 Supersedes:
 This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
 std.utf.toUTF32()
 (but note that to!() supersedes it more conveniently).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = Source string. $(B Must) be validly encoded.
        This is enforced by the function's in-contract.
    r = Destination string

 See_Also:
    $(REF to, std,conv)
 */
void transcode(Src, Dst)(Src[] s, out Dst[] r)
in
{
    assert(isValid(s));
}
do
{
    static if (is(Src == Dst) && is(Src == immutable))
    {
        // Same immutable element type: the source can be shared as is.
        r = s;
    }
    else static if (is(immutable Src == immutable AsciiChar))
    {
        // ASCII input is re-read as UTF-8 code units.
        transcode(cast(const(char)[])s, r);
    }
    else
    {
        alias Unit = Unqual!Dst;

        // Free space that must be available before encoding one code point.
        static if (is(immutable Dst == immutable wchar))
            enum size_t headroom = 2;
        else static if (is(immutable Dst == immutable dchar))
            enum size_t headroom = 1;
        else
            enum size_t headroom = 6;

        // `unwritten` is the not-yet-filled tail of `storage`; the encoder
        // advances it as it writes.
        auto storage = new Unit[s.length];
        auto unwritten = storage;

        while (s.length != 0)
        {
            if (unwritten.length < headroom)
            {
                // Grow the buffer and re-derive the tail slice, because
                // resizing may have moved the array.
                immutable used = storage.length - unwritten.length;
                storage.length += s.length + headroom;
                unwritten = storage[used .. $];
            }
            EncoderInstance!(Unit).encode(decode(s), unwritten);
        }

        r = cast(Dst[]) storage[0 .. storage.length - unwritten.length];
    }
}
2459
///
@system pure unittest
{
    // transcode from UTF-8 to UTF-16
    wstring utf16;
    transcode("hello world", utf16);
    assert(utf16 == "hello world"w);

    // transcode from UTF-16 to ISO-8859-1
    Latin1String latin1;
    transcode(utf16, latin1);
    assert(latin1 == "hello world");
}
2473
@system pure unittest
{
    import std.meta;
    import std.range;
    {
        import std.conv : to;

        string asciiCharString = to!string(iota(0, 128, 1));

        // Round-trip through every pair of supported string types.
        alias Types = AliasSeq!(string, Latin1String, Latin2String, AsciiString,
            Windows1250String, Windows1251String, Windows1252String, dstring, wstring);
        foreach (S; Types)
            foreach (D; Types)
            {
                string str;
                S sStr;
                D dStr;
                transcode(asciiCharString, sStr);
                transcode(sStr, dStr);
                transcode(dStr, str);
                assert(asciiCharString == str);
            }
    }
    {
        // Czech pangram with many accented letters, written with escapes so
        // the test data survives any re-encoding of the source file.
        string czechChars = "P\u0159\u00EDli\u0161 \u017Elu\u0165ou\u010Dk\u00FD "
            ~ "k\u016F\u0148 \u00FAp\u011Bl \u010F\u00E1belsk\u00E9 \u00F3dy.";
        alias Types = AliasSeq!(string, dstring, wstring);
        foreach (S; Types)
            foreach (D; Types)
            {
                string str;
                S sStr;
                D dStr;
                transcode(czechChars, sStr);
                transcode(sStr, dStr);
                transcode(dStr, str);
                assert(czechChars == str);
            }
    }
}
2513
@system unittest // mutable/const input/output
{
    import std.meta : AliasSeq;

    // The non-ASCII first letters are written as escapes (U+00E4, U+00F6,
    // U+00FC) so they match the Latin-1 code units 0xE4, 0xF6 and 0xFC
    // asserted below regardless of how the source file is stored.
    static foreach (O; AliasSeq!(Latin1Char, const Latin1Char, immutable Latin1Char))
    {{
        O[] output;

        char[] mutableInput = "\u00E4bc".dup;
        transcode(mutableInput, output);
        assert(output == [0xE4, 'b', 'c']);

        const char[] constInput = "\u00F6bc";
        transcode(constInput, output);
        assert(output == [0xF6, 'b', 'c']);

        immutable char[] immutInput = "\u00FCbc";
        transcode(immutInput, output);
        assert(output == [0xFC, 'b', 'c']);
    }}

    // Make sure that const/mutable input is copied.
    static foreach (C; AliasSeq!(char, const char))
    {{
        C[] input = "foo".dup;
        C[] output;
        transcode(input, output);
        assert(input == output);
        assert(input !is output);
    }}

    // But immutable input should not be copied.
    string input = "foo";
    string output;
    transcode(input, output);
    assert(input is output);
}
2551
2552//=============================================================================
2553
/** The base class for exceptions thrown by this module */
class EncodingException : Exception
{
    // Forwards the message to the Exception base class.
    this(string msg) @safe pure
    {
        super(msg);
    }
}
2556
// EncodingException subtype whose constructor is private, so it can only be
// instantiated from inside this module.
class UnrecognizedEncodingException : EncodingException
{
    private this(string msg) @safe pure
    {
        super(msg);
    }
}
2561
/** Abstract base class of all encoding schemes */
abstract class EncodingScheme
{
    import std.uni : toLower;

    /**
     * Registers a subclass of EncodingScheme.
     *
     * This function allows user-defined subclasses of EncodingScheme to
     * be declared in other modules.
     *
     * Params:
     *     Klass = The subclass of EncodingScheme to register.
     *
     * Example:
     * ----------------------------------------------
     * class Amiga1251 : EncodingScheme
     * {
     *     shared static this()
     *     {
     *         EncodingScheme.register!Amiga1251;
     *     }
     * }
     * ----------------------------------------------
     */
    static void register(Klass:EncodingScheme)()
    {
        // A throwaway instance is only needed to ask for the scheme's names;
        // each name (lower-cased) maps to a factory producing fresh instances.
        scope scheme = new Klass();
        foreach (encodingName;scheme.names())
        {
            supported[toLower(encodingName)] = () => new Klass();
        }
    }

    deprecated("Please pass the EncodingScheme subclass as template argument instead.")
    static void register(string className)
    {
        // Throws EncodingException if the class is unknown or unsuitable.
        auto scheme = instantiate(className);
        foreach (encodingName;scheme.names())
        {
            supportedFactories[toLower(encodingName)] = className;
        }
    }

    /**
     * Obtains a subclass of EncodingScheme which is capable of encoding
     * and decoding the named encoding scheme.
     *
     * This function is only aware of EncodingSchemes which have been
     * registered with the register() function.
     *
     * The lookup is case-insensitive. An EncodingException is thrown if the
     * name is not registered or the registered class cannot be instantiated.
     *
     * Example:
     * ---------------------------------------------------
     * auto scheme = EncodingScheme.create("Amiga-1251");
     * ---------------------------------------------------
     */
    static EncodingScheme create(string encodingName)
    {
        static bool registerDefaultEncodings()
        {
            EncodingScheme.register!EncodingSchemeASCII;
            EncodingScheme.register!EncodingSchemeLatin1;
            EncodingScheme.register!EncodingSchemeLatin2;
            EncodingScheme.register!EncodingSchemeWindows1250;
            EncodingScheme.register!EncodingSchemeWindows1251;
            EncodingScheme.register!EncodingSchemeWindows1252;
            EncodingScheme.register!EncodingSchemeUtf8;
            EncodingScheme.register!EncodingSchemeUtf16Native;
            EncodingScheme.register!EncodingSchemeUtf32Native;
            return true;
        }

        // The built-in schemes are registered lazily, on the first call.
        static shared bool initialized;
        import std.concurrency : initOnce;
        initOnce!initialized(registerDefaultEncodings());
        encodingName = toLower(encodingName);

        // Schemes registered through the template overload come with a factory.
        if (auto p = encodingName in supported)
            return (*p)();

        // Otherwise fall back to schemes registered by class name.
        auto p = encodingName in supportedFactories;
        if (p is null)
            throw new EncodingException("Unrecognized Encoding: "~encodingName);
        return instantiate(*p);
    }

    // Creates an instance of the EncodingScheme subclass called `className`
    // through run-time class lookup. Throws EncodingException when the class
    // cannot be found, cannot be created, or is not an EncodingScheme.
    private static EncodingScheme instantiate(string className)
    {
        // ClassInfo.find yields null for an unknown name; calling create() on
        // that result would dereference null instead of reporting the error.
        auto info = ClassInfo.find(className);
        auto scheme = info is null ? null : cast(EncodingScheme) info.create();
        if (scheme is null)
            throw new EncodingException("Unable to create class "~className);
        return scheme;
    }

    const
    {
        /**
         * Returns the standard name of the encoding scheme
         */
        abstract override string toString();

        /**
         * Returns an array of all known names for this encoding scheme
         */
        abstract string[] names();

        /**
         * Returns true if the character c can be represented
         * in this encoding scheme.
         */
        abstract bool canEncode(dchar c);

        /**
         * Returns the number of ubytes required to encode this code point.
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c = the code point to be encoded
         *
         * Returns:
         *    the number of ubytes required.
         */
        abstract size_t encodedLength(dchar c);

        /**
         * Encodes a single code point into a user-supplied, fixed-size buffer.
         *
         * This function encodes a single code point into one or more ubytes.
         * The supplied buffer must be code unit aligned.
         * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
         * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c      = the code point to be encoded
         *    buffer = the destination array
         *
         * Returns:
         *    the number of ubytes written.
         */
        abstract size_t encode(dchar c, ubyte[] buffer);

        /**
         * Decodes a single code point.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * The input to this function MUST be validly encoded.
         *
         * Params:
         *    s = the array whose first code point is to be decoded
         */
        abstract dchar decode(ref const(ubyte)[] s);

        /**
         * Decodes a single code point. The input does not have to be valid.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * This function will accept an invalidly encoded array as input.
         * If an invalid sequence is found at the start of the string, this
         * function will remove it, and return the value INVALID_SEQUENCE.
         *
         * Params:
         *    s = the array whose first code point is to be decoded
         */
        abstract dchar safeDecode(ref const(ubyte)[] s);

        /**
         * Returns the sequence of ubytes to be used to represent
         * any character which cannot be represented in the encoding scheme.
         *
         * Normally this will be a representation of some substitution
         * character, such as U+FFFD or '?'.
         */
        abstract @property immutable(ubyte)[] replacementSequence();
    }

    /**
     * Returns true if the array is encoded correctly
     *
     * Params:
     *    s = the array to be tested
     */
    bool isValid(const(ubyte)[] s)
    {
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE)
                return false;
        }
        return true;
    }

    /**
     * Returns the length of the longest possible substring, starting from
     * the first element, which is validly encoded.
     *
     * Params:
     *    s = the array to be tested
     */
    size_t validLength()(const(ubyte)[] s)
    {
        const(ubyte)[] r = s;
        // `t` trails `s`: it is only advanced past sequences that decoded
        // cleanly, so it marks the end of the valid prefix.
        const(ubyte)[] t = s;
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE) break;
            t = s;
        }
        return r.length - t.length;
    }

    /**
     * Sanitizes an array by replacing malformed ubyte sequences with valid
     * ubyte sequences. The result is guaranteed to be valid for this
     * encoding scheme.
     *
     * If the input array is already valid, this function returns the
     * original, otherwise it constructs a new array by replacing all illegal
     * sequences with the encoding scheme's replacement sequence.
     *
     * Params:
     *    s = the string to be sanitized
     */
    immutable(ubyte)[] sanitize()(immutable(ubyte)[] s)
    {
        auto n = validLength(s);
        if (n == s.length) return s;

        auto repSeq = replacementSequence;

        // Count how long the string needs to be.
        // Overestimating is not a problem
        auto len = s.length;
        const(ubyte)[] t = s[n..$];
        while (t.length != 0)
        {
            immutable c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            len += repSeq.length;
            t = t[validLength(t)..$];
        }

        // Now do the write
        ubyte[] array = new ubyte[len];
        array[0 .. n] = s[0 .. n];
        auto offset = n;

        t = s[n..$];
        while (t.length != 0)
        {
            immutable c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            array[offset .. offset+repSeq.length] = repSeq[];
            offset += repSeq.length;
            // Copy the valid run that follows the dropped sequence.
            n = validLength(t);
            array[offset .. offset+n] = t[0 .. n];
            offset += n;
            t = t[n..$];
        }
        return cast(immutable(ubyte)[])array[0 .. offset];
    }

    /**
     * Returns the length of the first encoded sequence.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the array to be sliced
     */
    size_t firstSequence()(const(ubyte)[] s)
    in
    {
        assert(s.length != 0);
        const(ubyte)[] u = s;
        assert(safeDecode(u) != INVALID_SEQUENCE);
    }
    do
    {
        const(ubyte)[] t = s;
        decode(s);
        return t.length - s.length;
    }

    /**
     * Returns the total number of code points encoded in a ubyte array.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the string to be counted
     */
    size_t count()(const(ubyte)[] s)
    in
    {
        assert(isValid(s));
    }
    do
    {
        size_t n = 0;
        while (s.length != 0)
        {
            decode(s);
            ++n;
        }
        return n;
    }

    /**
     * Returns the array index at which the (n+1)th code point begins.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the string to be counted
     *    n = the current code point index
     */
    ptrdiff_t index()(const(ubyte)[] s, size_t n)
    in
    {
        // `n` is unsigned, so only the encoding needs checking.
        assert(isValid(s));
    }
    do
    {
        const(ubyte)[] t = s;
        for (size_t i=0; i<n; ++i) decode(s);
        return t.length - s.length;
    }

    // Factories for schemes registered via register!Klass, keyed by
    // lower-cased encoding name.
    __gshared EncodingScheme function()[string] supported;
    // Class names for schemes registered via the deprecated register(string),
    // keyed by lower-cased encoding name.
    __gshared string[string] supportedFactories;
}
2901
/**
 EncodingScheme to handle ASCII

 This scheme recognises the following names:
                 "ANSI_X3.4-1968",
                 "ANSI_X3.4-1986",
                 "ASCII",
                 "IBM367",
                 "ISO646-US",
                 "ISO_646.irv:1991",
                 "US-ASCII",
                 "cp367",
                 "csASCII",
                 "iso-ir-6",
                 "us"
 */
class EncodingSchemeASCII : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeASCII");
    }*/

    const
    {
        // Every name under which this scheme can be looked up.
        override string[] names() @safe pure nothrow
        {
            return
            [
                "ANSI_X3.4-1968",
                "ANSI_X3.4-1986",
                "ASCII",
                "IBM367",
                "ISO646-US",
                "ISO_646.irv:1991",
                "US-ASCII",
                "cp367",
                "csASCII",
                "iso-ir-6",
                "us"
            ];
        }

        // The standard name of the scheme.
        override string toString() @safe pure nothrow @nogc
        {
            return "ASCII";
        }

        // The methods below forward to the module-level templates,
        // viewing the raw ubytes as AsciiChar code units.

        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!(AsciiChar)(c);
        }

        override size_t encodedLength(dchar c)  @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!(AsciiChar)(c);
        }

        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            return std.encoding.encode(c, cast(AsciiChar[]) buffer);
        }

        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(AsciiChar)[]) s;
            immutable dchar decoded = std.encoding.decode(units);
            // Drop from `s` exactly what the typed view consumed.
            s = s[s.length - units.length .. s.length];
            return decoded;
        }

        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto units = cast(const(AsciiChar)[]) s;
            immutable dchar decoded = std.encoding.safeDecode(units);
            // Drop from `s` exactly what the typed view consumed.
            s = s[s.length - units.length .. s.length];
            return decoded;
        }

        // Unrepresentable characters are replaced with '?'.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[])"?";
        }
    }
}
2989
2990/**
2991 EncodingScheme to handle Latin-1
2992
2993 This scheme recognises the following names:
2994                 "CP819",
2995                 "IBM819",
2996                 "ISO-8859-1",
2997                 "ISO_8859-1",
2998                 "ISO_8859-1:1987",
2999                 "csISOLatin1",
3000                 "iso-ir-100",
3001                 "l1",
3002                 "latin1"
3003 */
class EncodingSchemeLatin1 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeLatin1");
    }*/

    // All members declared below are const.
    const:

    // Every name this scheme answers to.
    override string[] names() @safe pure nothrow
    {
        return
        [
            "CP819", "IBM819", "ISO-8859-1", "ISO_8859-1", "ISO_8859-1:1987",
            "csISOLatin1", "iso-ir-100", "l1", "latin1"
        ];
    }

    // The canonical name of the scheme.
    override string toString() @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    // Forwards to the compile-time-typed canEncode for Latin1Char.
    override bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(Latin1Char)(c);
    }

    // Forwards to the compile-time-typed encodedLength for Latin1Char.
    override size_t encodedLength(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(Latin1Char)(c);
    }

    // Encodes `c` into `buffer`, viewed as Latin1Char code units, and returns
    // what the typed encode reports.
    override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
    {
        auto dest = cast(Latin1Char[]) buffer;
        return std.encoding.encode(c, dest);
    }

    // Decodes one code point from the front of `s` and advances `s`.
    override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    {
        auto rest = cast(const(Latin1Char)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        // Keep only as many trailing bytes as the typed view still holds.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Same as decode, but goes through the typed safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    {
        auto rest = cast(const(Latin1Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        // Keep only as many trailing bytes as the typed view still holds.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Bytes to substitute for something that cannot be represented.
    override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
3073
3074/**
3075 EncodingScheme to handle Latin-2
3076
3077 This scheme recognises the following names:
3078                 "Latin 2",
3079                 "ISO-8859-2",
3080                 "ISO_8859-2",
3081                 "ISO_8859-2:1999",
                 "windows-28592"
3083 */
class EncodingSchemeLatin2 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeLatin2");
    }*/

    // All members declared below are const.
    const:

    // Every name this scheme answers to.
    override string[] names() @safe pure nothrow
    {
        return
        [
            "Latin 2", "ISO-8859-2", "ISO_8859-2", "ISO_8859-2:1999",
            "windows-28592"
        ];
    }

    // The canonical name of the scheme.
    override string toString() @safe pure nothrow @nogc
    {
        return "ISO-8859-2";
    }

    // Forwards to the compile-time-typed canEncode for Latin2Char.
    override bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(Latin2Char)(c);
    }

    // Forwards to the compile-time-typed encodedLength for Latin2Char.
    override size_t encodedLength(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(Latin2Char)(c);
    }

    // Encodes `c` into `buffer`, viewed as Latin2Char code units, and returns
    // what the typed encode reports.
    override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
    {
        auto dest = cast(Latin2Char[]) buffer;
        return std.encoding.encode(c, dest);
    }

    // Decodes one code point from the front of `s` and advances `s`.
    override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    {
        auto rest = cast(const(Latin2Char)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        // Keep only as many trailing bytes as the typed view still holds.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Same as decode, but goes through the typed safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    {
        auto rest = cast(const(Latin2Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        // Keep only as many trailing bytes as the typed view still holds.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Bytes to substitute for something that cannot be represented.
    override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
3149
3150/**
3151 EncodingScheme to handle Windows-1250
3152
3153 This scheme recognises the following names:
3154                 "windows-1250"
3155 */
class EncodingSchemeWindows1250 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeWindows1250");
    }*/

    // All members declared below are const.
    const:

    // Every name this scheme answers to.
    override string[] names() @safe pure nothrow
    {
        return [ "windows-1250" ];
    }

    // The canonical name of the scheme.
    override string toString() @safe pure nothrow @nogc
    {
        return "windows-1250";
    }

    // Forwards to the compile-time-typed canEncode for Windows1250Char.
    override bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(Windows1250Char)(c);
    }

    // Forwards to the compile-time-typed encodedLength for Windows1250Char.
    override size_t encodedLength(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(Windows1250Char)(c);
    }

    // Encodes `c` into `buffer`, viewed as Windows1250Char code units, and
    // returns what the typed encode reports.
    override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
    {
        auto dest = cast(Windows1250Char[]) buffer;
        return std.encoding.encode(c, dest);
    }

    // Decodes one code point from the front of `s` and advances `s`.
    override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1250Char)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        // Keep only as many trailing bytes as the typed view still holds.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Same as decode, but goes through the typed safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1250Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        // Keep only as many trailing bytes as the typed view still holds.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Bytes to substitute for something that cannot be represented.
    override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
3217
3218/**
3219 EncodingScheme to handle Windows-1251
3220
3221 This scheme recognises the following names:
3222                 "windows-1251"
3223 */
class EncodingSchemeWindows1251 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeWindows1251");
    }*/

    // All members declared below are const.
    const:

    // Every name this scheme answers to.
    override string[] names() @safe pure nothrow
    {
        return [ "windows-1251" ];
    }

    // The canonical name of the scheme.
    override string toString() @safe pure nothrow @nogc
    {
        return "windows-1251";
    }

    // Forwards to the compile-time-typed canEncode for Windows1251Char.
    override bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(Windows1251Char)(c);
    }

    // Forwards to the compile-time-typed encodedLength for Windows1251Char.
    override size_t encodedLength(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(Windows1251Char)(c);
    }

    // Encodes `c` into `buffer`, viewed as Windows1251Char code units, and
    // returns what the typed encode reports.
    override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
    {
        auto dest = cast(Windows1251Char[]) buffer;
        return std.encoding.encode(c, dest);
    }

    // Decodes one code point from the front of `s` and advances `s`.
    override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1251Char)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        // Keep only as many trailing bytes as the typed view still holds.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Same as decode, but goes through the typed safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1251Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        // Keep only as many trailing bytes as the typed view still holds.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Bytes to substitute for something that cannot be represented.
    override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
3285
3286/**
3287 EncodingScheme to handle Windows-1252
3288
3289 This scheme recognises the following names:
3290                 "windows-1252"
3291 */
class EncodingSchemeWindows1252 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeWindows1252");
    }*/

    // All members declared below are const.
    const:

    // Every name this scheme answers to.
    override string[] names() @safe pure nothrow
    {
        return [ "windows-1252" ];
    }

    // The canonical name of the scheme.
    override string toString() @safe pure nothrow @nogc
    {
        return "windows-1252";
    }

    // Forwards to the compile-time-typed canEncode for Windows1252Char.
    override bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(Windows1252Char)(c);
    }

    // Forwards to the compile-time-typed encodedLength for Windows1252Char.
    override size_t encodedLength(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(Windows1252Char)(c);
    }

    // Encodes `c` into `buffer`, viewed as Windows1252Char code units, and
    // returns what the typed encode reports.
    override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
    {
        auto dest = cast(Windows1252Char[]) buffer;
        return std.encoding.encode(c, dest);
    }

    // Decodes one code point from the front of `s` and advances `s`.
    override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1252Char)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        // Keep only as many trailing bytes as the typed view still holds.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Same as decode, but goes through the typed safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1252Char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        // Keep only as many trailing bytes as the typed view still holds.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Bytes to substitute for something that cannot be represented.
    override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"?";
    }
}
3353
// Exercises the six single-byte schemes through the run-time EncodingScheme
// interface: naming, encode/decode round trips, validation and safeDecode.
@system unittest
{
    // Canonical names; index i here pairs with valid[i] and schemes[i] below.
    static string[] schemeNames =
    [
        "ASCII",
        "ISO-8859-1",
        "ISO-8859-2",
        "windows-1250",
        "windows-1251",
        "windows-1252"
    ];

    EncodingScheme[] schemes;

    foreach (name;schemeNames)
    {
       schemes ~= EncodingScheme.create(name);
    }

    // One byte is enough: every encode call below is asserted to return 1.
    ubyte[1] buffer;
    // Per-scheme sample code points that must round-trip through encode/decode.
    static dchar[][] valid =
    [
        //Valid ASCII
        ['\u0001','\u0020','\u0040','\u0060','\u007F'],
        //Valid 8859-1
        ['\u0001','\u0020','\u0070','\u00DA','\u00FF'],
        //Valid 8859-2
        ['\u0020','\u00D7','\u00DF','\u010F','\u02D9'],
        //Valid 1250
        ['\u0020','\u20AC','\u201E','\u2021','\u2039'],
        //Valid 1251
        ['\u0402','\u00A4','\u0415','\u0439','\u044F'],
        //Valid 1252
        ['\u20AC','\u0160','\u2019','\u2122','\u0178'],
    ];

    // Shared across all loop iterations: safeDecode takes the slice by ref and
    // advances it, so each scheme is fed the next byte. The order of the bytes
    // therefore has to match the order of schemeNames.
    static const(ubyte)[] invalid = [0xA0,0xFF,0xFF,0x81,0x98,0x81];

    foreach (i,scheme;schemes)
    {
        assert(scheme.toString() == schemeNames[i],"Error in the name of encoding scheme"~schemeNames[i]);
        assert(!scheme.canEncode('\uFFFD'));
        assert(scheme.encodedLength('A') == 1);
        const(ubyte)[] encodeStr;
        dchar[] decStr;
        // Encode each sample into the one-byte buffer, collect the byte, and
        // decode it straight back.
        foreach (chr;valid[i])
        {
            assert(scheme.encode(chr,buffer) == 1);
            encodeStr ~= buffer;
            const(ubyte)[] buf = buffer;
            decStr ~= scheme.decode(buf);
        }

        assert(scheme.isValid(encodeStr),"Not correctly encoded UTF => " ~ schemeNames[i]);
        assert(valid[i] == decStr,"Error encode/decode UTF8 <=> " ~ schemeNames[i]);

        // The two ISO-8859 schemes are expected to accept their byte from
        // `invalid`; every other scheme is expected to reject its byte.
        if (schemeNames[i] == "ISO-8859-1" || schemeNames[i] == "ISO-8859-2")
        {
            assert(scheme.safeDecode(invalid) != INVALID_SEQUENCE);
        }
        else
        {
            assert(scheme.safeDecode(invalid) == INVALID_SEQUENCE);
        }
        assert(scheme.replacementSequence() == cast(immutable(ubyte)[])"?");
    }
    // All six bytes must have been consumed by the six safeDecode calls.
    assert(invalid.length == 0);
}
3422
3423/**
3424 EncodingScheme to handle UTF-8
3425
3426 This scheme recognises the following names:
3427                 "UTF-8"
3428 */
class EncodingSchemeUtf8 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf8");
    }*/

    // All members declared below are const.
    const:

    // Every name this scheme answers to.
    override string[] names() @safe pure nothrow
    {
        return [ "UTF-8" ];
    }

    // The canonical name of the scheme.
    override string toString() @safe pure nothrow @nogc
    {
        return "UTF-8";
    }

    // Forwards to the compile-time-typed canEncode for char.
    override bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(char)(c);
    }

    // Forwards to the compile-time-typed encodedLength for char.
    override size_t encodedLength(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(char)(c);
    }

    // Encodes `c` into `buffer`, viewed as char code units, and returns what
    // the typed encode reports.
    override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
    {
        auto dest = cast(char[]) buffer;
        return std.encoding.encode(c, dest);
    }

    // Decodes one code point from the front of `s` and advances `s`.
    override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    {
        auto rest = cast(const(char)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        // Keep only as many trailing bytes as the typed view still holds.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Same as decode, but goes through the typed safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    {
        auto rest = cast(const(char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        // Keep only as many trailing bytes as the typed view still holds.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    // Bytes to substitute for something that cannot be represented:
    // the UTF-8 string literal for U+FFFD, reinterpreted as bytes.
    override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"\uFFFD";
    }
}
3490
3491/**
3492 EncodingScheme to handle UTF-16 in native byte order
3493
3494 This scheme recognises the following names:
3495                 "UTF-16LE" (little-endian architecture only)
3496                 "UTF-16BE" (big-endian architecture only)
3497 */
class EncodingSchemeUtf16Native : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf16Native");
    }*/

    // All members declared below are const.
    const:

    // The scheme's only name; which one is compiled in depends on the
    // byte order of the target.
    version (LittleEndian)
        enum string NAME = "UTF-16LE";
    version (BigEndian)
        enum string NAME = "UTF-16BE";

    // Every name this scheme answers to.
    override string[] names() @safe pure nothrow
    {
        return [ NAME ];
    }

    // The canonical name of the scheme.
    override string toString() @safe pure nothrow @nogc
    {
        return NAME;
    }

    // Forwards to the compile-time-typed canEncode for wchar.
    override bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(wchar)(c);
    }

    // Forwards to the compile-time-typed encodedLength for wchar.
    override size_t encodedLength(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(wchar)(c);
    }

    // Encodes `c` into `buffer`, viewed as wchar code units. The typed
    // encode's result is scaled by wchar.sizeof before being returned.
    override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
    {
        auto dest = cast(wchar[]) buffer;
        immutable units = std.encoding.encode(c, dest);
        return wchar.sizeof * units;
    }

    // Decodes one code point from the front of `s` and advances `s`.
    // The contract requires an even number of bytes.
    override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    in
    {
        assert((s.length & 1) == 0);
    }
    do
    {
        auto rest = cast(const(wchar)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        // Convert the remaining code units back into a byte count.
        immutable bytesLeft = rest.length * wchar.sizeof;
        s = s[$ - bytesLeft .. $];
        return decoded;
    }

    // Same as decode, but goes through the typed safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    in
    {
        assert((s.length & 1) == 0);
    }
    do
    {
        auto rest = cast(const(wchar)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        // Convert the remaining code units back into a byte count.
        immutable bytesLeft = rest.length * wchar.sizeof;
        s = s[$ - bytesLeft .. $];
        return decoded;
    }

    // Bytes to substitute for something that cannot be represented:
    // the UTF-16 string literal for U+FFFD, cast to bytes.
    override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"\uFFFD"w;
    }
}
@system unittest
{
    // Three UTF-16 code units (0x019A, 0x019B, 0x019C) laid out in the
    // byte order of the target, plus the matching scheme name.
    version (LittleEndian)
    {
        enum schemeName = "utf-16le";
        ubyte[6] raw = [154,1, 155,1, 156,1];
    }
    version (BigEndian)
    {
        enum schemeName = "utf-16be";
        ubyte[6] raw = [1,154, 1,155, 1,156];
    }

    auto scheme = EncodingScheme.create(schemeName);
    const(ubyte)[] remaining = raw[];

    // One code point comes out, and exactly one code unit (two bytes) is used up.
    immutable dchar first = scheme.safeDecode(remaining);
    assert(first == 410);
    assert(remaining.length == 4);
}
3586
3587/**
3588 EncodingScheme to handle UTF-32 in native byte order
3589
3590 This scheme recognises the following names:
3591                 "UTF-32LE" (little-endian architecture only)
3592                 "UTF-32BE" (big-endian architecture only)
3593 */
class EncodingSchemeUtf32Native : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf32Native");
    }*/

    // All members declared below are const.
    const:

    // The scheme's only name; which one is compiled in depends on the
    // byte order of the target.
    version (LittleEndian)
        enum string NAME = "UTF-32LE";
    version (BigEndian)
        enum string NAME = "UTF-32BE";

    // Every name this scheme answers to.
    override string[] names() @safe pure nothrow
    {
        return [ NAME ];
    }

    // The canonical name of the scheme.
    override string toString() @safe pure nothrow @nogc
    {
        return NAME;
    }

    // Forwards to the compile-time-typed canEncode for dchar.
    override bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!(dchar)(c);
    }

    // Forwards to the compile-time-typed encodedLength for dchar.
    override size_t encodedLength(dchar c) @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!(dchar)(c);
    }

    // Encodes `c` into `buffer`, viewed as dchar code units. The typed
    // encode's result is scaled by dchar.sizeof before being returned.
    override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
    {
        auto dest = cast(dchar[]) buffer;
        immutable units = std.encoding.encode(c, dest);
        return dchar.sizeof * units;
    }

    // Decodes one code point from the front of `s` and advances `s`.
    // The contract requires the byte count to be a multiple of four.
    override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    in
    {
        assert((s.length & 3) == 0);
    }
    do
    {
        auto rest = cast(const(dchar)[]) s;
        immutable dchar decoded = std.encoding.decode(rest);
        // Convert the remaining code units back into a byte count.
        immutable bytesLeft = rest.length * dchar.sizeof;
        s = s[$ - bytesLeft .. $];
        return decoded;
    }

    // Same as decode, but goes through the typed safeDecode.
    override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
    in
    {
        assert((s.length & 3) == 0);
    }
    do
    {
        auto rest = cast(const(dchar)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(rest);
        // Convert the remaining code units back into a byte count.
        immutable bytesLeft = rest.length * dchar.sizeof;
        s = s[$ - bytesLeft .. $];
        return decoded;
    }

    // Bytes to substitute for something that cannot be represented:
    // the UTF-32 string literal for U+FFFD, cast to bytes.
    override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"\uFFFD"d;
    }
}
@system unittest
{
    // Three UTF-32 code units (0x019A, 0x019B, 0x019C) laid out in the
    // byte order of the target, plus the matching scheme name.
    version (LittleEndian)
    {
        enum schemeName = "utf-32le";
        ubyte[12] raw = [154,1,0,0, 155,1,0,0, 156,1,0,0];
    }
    version (BigEndian)
    {
        enum schemeName = "utf-32be";
        ubyte[12] raw = [0,0,1,154, 0,0,1,155, 0,0,1,156];
    }

    auto scheme = EncodingScheme.create(schemeName);
    const(ubyte)[] remaining = raw[];

    // One code point comes out, and exactly one code unit (four bytes) is used up.
    immutable dchar first = scheme.safeDecode(remaining);
    assert(first == 410);
    assert(remaining.length == 8);
}
3682
3683//=============================================================================
3684
3685
3686/** Definitions of common Byte Order Marks.
The elements of the `enum` can be used as indices into `bomTable` to get
3688matching `BOMSeq`.
3689*/
enum BOM
{
    none      = 0,  /// no BOM was found
    utf32be   = 1,  /// [0x00, 0x00, 0xFE, 0xFF]
    utf32le   = 2,  /// [0xFF, 0xFE, 0x00, 0x00]
    utf7      = 3,  /** [0x2B, 0x2F, 0x76, 0x38]
                        [0x2B, 0x2F, 0x76, 0x39],
                        [0x2B, 0x2F, 0x76, 0x2B],
                        [0x2B, 0x2F, 0x76, 0x2F],
                        [0x2B, 0x2F, 0x76, 0x38, 0x2D]
                    */
    // Values 4 .. 7 are unused: `bomTable` stores five UTF-7 sequences at
    // indices 3 .. 7, so the members below line up with their table index.
    utf1      = 8,  /// [0xF7, 0x64, 0x4C]
    utfebcdic = 9,  /// [0xDD, 0x73, 0x66, 0x73]
    scsu      = 10, /// [0x0E, 0xFE, 0xFF]
    bocu1     = 11, /// [0xFB, 0xEE, 0x28]
    gb18030   = 12, /// [0x84, 0x31, 0x95, 0x33]
    utf8      = 13, /// [0xEF, 0xBB, 0xBF]
    utf16be   = 14, /// [0xFE, 0xFF]
    utf16le   = 15  /// [0xFF, 0xFE]
}
3710
/// The type stored inside `bomTable`: a `BOM` identifier (`schema`) paired
/// with the byte sequence that marks it (`sequence`).
alias BOMSeq = Tuple!(BOM, "schema", ubyte[], "sequence");
3713
3714/** Mapping of a byte sequence to $(B Byte Order Mark (BOM))
3715*/
immutable bomTable = [
    // Index 0: the entry `getBOM` returns when nothing matches.
    BOMSeq(BOM.none, null),
    // `getBOM` walks the remaining entries in order and returns the first one
    // the input starts with, so a longer sequence has to come before any
    // shorter sequence that is a prefix of it (utf32le before utf16le).
    BOMSeq(BOM.utf32be, cast(ubyte[])([0x00, 0x00, 0xFE, 0xFF])),
    BOMSeq(BOM.utf32le, cast(ubyte[])([0xFF, 0xFE, 0x00, 0x00])),
    // Five UTF-7 variants occupy indices 3 .. 7; the five-byte form is listed
    // ahead of the four-byte form it extends.
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x39])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2B])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2F])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38, 0x2D])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38])),
    BOMSeq(BOM.utf1, cast(ubyte[])([0xF7, 0x64, 0x4C])),
    BOMSeq(BOM.utfebcdic, cast(ubyte[])([0xDD, 0x73, 0x66, 0x73])),
    BOMSeq(BOM.scsu, cast(ubyte[])([0x0E, 0xFE, 0xFF])),
    BOMSeq(BOM.bocu1, cast(ubyte[])([0xFB, 0xEE, 0x28])),
    BOMSeq(BOM.gb18030, cast(ubyte[])([0x84, 0x31, 0x95, 0x33])),
    BOMSeq(BOM.utf8, cast(ubyte[])([0xEF, 0xBB, 0xBF])),
    BOMSeq(BOM.utf16be, cast(ubyte[])([0xFE, 0xFF])),
    BOMSeq(BOM.utf16le, cast(ubyte[])([0xFF, 0xFE]))
];
3734
3735/** Returns a `BOMSeq` for a given `input`.
3736If no `BOM` is present the `BOMSeq` for `BOM.none` is
3737returned. The `BOM` sequence at the beginning of the range will
not be consumed from the passed range. If you pass a reference type
3739range make sure that `save` creates a deep copy.
3740
3741Params:
3742    input = The sequence to check for the `BOM`
3743
3744Returns:
3745    the found `BOMSeq` corresponding to the passed `input`.
3746*/
immutable(BOMSeq) getBOM(Range)(Range input)
if (isForwardRange!Range && is(immutable ElementType!Range == immutable ubyte))
{
    import std.algorithm.searching : startsWith;

    // Entry 0 is the BOM.none fallback, so matching starts at entry 1.
    // Each candidate is tested against a fresh `save` of the input, which
    // leaves `input` itself untouched.
    foreach (candidate; bomTable[1 .. $])
    {
        if (input.save.startsWith(candidate.sequence))
            return candidate;
    }

    // Nothing matched.
    return bomTable[0];
}
3761
3762///
@system unittest
{
    import std.format : format;

    // UTF-32 text that begins with U+FEFF, looked at as raw bytes.
    auto text = dchar(0x0000FEFF) ~ "Hello World"d;
    auto found = getBOM(cast(ubyte[]) text);

    // Which UTF-32 BOM the bytes spell out depends on the target's byte order.
    version (BigEndian)
        enum expected = BOM.utf32be;
    else
        enum expected = BOM.utf32le;

    assert(found.schema == expected, format("%s", found.schema));
}
3779
@system unittest
{
    // Prefix some text with every table entry's sequence and check that
    // getBOM finds that entry again.
    foreach (idx, entry; bomTable)
    {
        auto payload = entry.sequence ~ cast(ubyte[])"hello world";
        auto detected = getBOM(payload);
        assert(detected.schema == bomTable[idx].schema);

        // Indices 4 .. 7 are the additional UTF-7 variants: their table index
        // does not equal the enum value, and the four-byte 0x38 form is
        // matched by the longer entry listed before it.
        if (idx >= 4 && idx <= 7)
            continue;

        assert(detected.schema == BOM.init + idx);
        assert(detected.sequence == entry.sequence);
    }
}
3797
@safe pure unittest
{
    // A minimal byte range that is not an array, used to drive the
    // non-array path through getBOM.
    struct BOMInputRange
    {
        ubyte[] arr;

        @property ubyte front() { return arr.front; }

        @property bool empty() { return arr.empty; }

        void popFront() { arr = arr[1 .. $]; }

        @property typeof(this) save() { return this; }
    }

    static assert( isInputRange!BOMInputRange);
    static assert(!isArray!BOMInputRange);

    ubyte[] dummyEnd = [0,0,0,0];

    foreach (entry; bomTable[1 .. $])
    {
        // A range holding exactly the BOM is recognised and left unconsumed.
        {
            auto wrapped = BOMInputRange(entry.sequence.dup);

            auto hit = getBOM(wrapped);
            assert(hit.schema == entry.schema);
            assert(wrapped.arr == entry.sequence);
        }

        // Only the first BOM byte followed by zeros must not be recognised.
        {
            auto noBom = entry.sequence[0 .. 1].dup ~ dummyEnd;
            immutable size_t oldLen = noBom.length;
            assert(oldLen - 4 < entry.sequence.length);

            auto wrapped = BOMInputRange(noBom.dup);
            auto miss = getBOM(wrapped);
            assert(miss.schema == BOM.none);
            assert(noBom.length == oldLen);
        }
    }
}
3852
/** Constant defining a fully decoded BOM, i.e. the code point U+FEFF */
enum dchar utfBOM = 0xfeff;
3855