1// Written in the D programming language.
2
3/**
4Classes and functions for handling and transcoding between various encodings.
5
6For cases where the _encoding is known at compile-time, functions are provided
7for arbitrary _encoding and decoding of characters, arbitrary transcoding
8between strings of different type, as well as validation and sanitization.
9
10Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
11(also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250 and WINDOWS-1252.
12
13$(SCRIPT inhibitQuickIndex = 1;)
14$(BOOKTABLE,
15$(TR $(TH Category) $(TH Functions))
16$(TR $(TD Decode) $(TD
17    $(LREF codePoints)
18    $(LREF decode)
19    $(LREF decodeReverse)
20    $(LREF safeDecode)
21))
22$(TR $(TD Conversion) $(TD
23    $(LREF codeUnits)
24    $(LREF sanitize)
25    $(LREF transcode)
26))
27$(TR $(TD Classification) $(TD
28    $(LREF canEncode)
29    $(LREF isValid)
30    $(LREF isValidCodePoint)
31    $(LREF isValidCodeUnit)
32))
33$(TR $(TD BOM) $(TD
34    $(LREF BOM)
35    $(LREF BOMSeq)
36    $(LREF getBOM)
37    $(LREF utfBOM)
38))
39$(TR $(TD Length & Index) $(TD
40    $(LREF firstSequence)
41    $(LREF encodedLength)
42    $(LREF index)
43    $(LREF lastSequence)
44    $(LREF validLength)
45))
46$(TR $(TD Encoding schemes) $(TD
47    $(LREF encodingName)
48    $(LREF EncodingScheme)
49    $(LREF EncodingSchemeASCII)
50    $(LREF EncodingSchemeLatin1)
51    $(LREF EncodingSchemeLatin2)
52    $(LREF EncodingSchemeUtf16Native)
53    $(LREF EncodingSchemeUtf32Native)
54    $(LREF EncodingSchemeUtf8)
55    $(LREF EncodingSchemeWindows1250)
56    $(LREF EncodingSchemeWindows1252)
57))
58$(TR $(TD Representation) $(TD
59    $(LREF AsciiChar)
60    $(LREF AsciiString)
61    $(LREF Latin1Char)
62    $(LREF Latin1String)
63    $(LREF Latin2Char)
64    $(LREF Latin2String)
65    $(LREF Windows1250Char)
66    $(LREF Windows1250String)
67    $(LREF Windows1252Char)
68    $(LREF Windows1252String)
69))
70$(TR $(TD Exceptions) $(TD
71    $(LREF INVALID_SEQUENCE)
72    $(LREF EncodingException)
73))
74)
75
For cases where the _encoding is not known at compile-time, but is
known at run-time, the abstract class $(LREF EncodingScheme)
and its subclasses are provided.  To construct a run-time encoder/decoder,
one does e.g.
80
81----------------------------------------------------
82auto e = EncodingScheme.create("utf-8");
83----------------------------------------------------
84
85This library supplies $(LREF EncodingScheme) subclasses for ASCII,
86ISO-8859-1 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250,
87WINDOWS-1252, UTF-8, and (on little-endian architectures) UTF-16LE and
88UTF-32LE; or (on big-endian architectures) UTF-16BE and UTF-32BE.
89
90This library provides a mechanism whereby other modules may add $(LREF
91EncodingScheme) subclasses for any other _encoding.
92
93Copyright: Copyright Janice Caron 2008 - 2009.
94License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
95Authors:   Janice Caron
96Source:    $(PHOBOSSRC std/_encoding.d)
97*/
98/*
99         Copyright Janice Caron 2008 - 2009.
100Distributed under the Boost Software License, Version 1.0.
101   (See accompanying file LICENSE_1_0.txt or copy at
102         http://www.boost.org/LICENSE_1_0.txt)
103*/
104module std.encoding;
105
106import std.range.primitives;
107import std.traits;
108import std.typecons;
109
// Exercises validation, sanitization and transcoding against tables of
// well-formed and malformed UTF-8 byte sequences, then spot-checks the
// single-byte encodings, encodedLength and encoding into a fixed buffer.
@system unittest
{
    static ubyte[][] validStrings =
    [
        // Plain ASCII
        cast(ubyte[])"hello",

        // First possible sequence of a certain length
        [ 0x00 ],                       // U+00000000   one byte
        [ 0xC2, 0x80 ],                 // U+00000080   two bytes
        [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
        [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   four bytes

        // Last possible sequence of a certain length
        [ 0x7F ],                       // U+0000007F   one byte
        [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes

        // Other boundary conditions
        [ 0xED, 0x9F, 0xBF ],
        // U+0000D7FF   Last character before surrogates
        [ 0xEE, 0x80, 0x80 ],
        // U+0000E000   First character after surrogates
        [ 0xEF, 0xBF, 0xBD ],
        // U+0000FFFD   Unicode replacement character
        [ 0xF4, 0x8F, 0xBF, 0xBF ],
        // U+0010FFFF   Very last character

        // Non-character code points
        /*  NOTE: These are legal in UTF, and may be converted from
            one UTF to another, however they do not represent Unicode
            characters. These code points have been reserved by
            Unicode as non-character code points. They are permissible
            for data exchange within an application, but they are
            not permitted to be used as characters. Since this module
            deals with UTF, and not with Unicode per se, we choose to
            accept them here. */
        // The three-byte forms below are the real encodings of U+FFFE and
        // U+FFFF; the two-byte forms DF BE / DF BF decode to U+07FE / U+07FF.
        [ 0xEF, 0xBF, 0xBE ],           // U+0000FFFE
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF
    ];

    static ubyte[][] invalidStrings =
    [
        // First possible sequence of a certain length, but greater
        // than U+10FFFF
        [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
        [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes

        // Last possible sequence of a certain length, but greater than U+10FFFF
        [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
        [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
        [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes

        // Other boundary conditions
        [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
                                                    // First code
                                                    // point after
                                                    // last character

        // Unexpected continuation bytes
        [ 0x80 ],
        [ 0xBF ],
        [ 0x20, 0x80, 0x20 ],
        [ 0x20, 0xBF, 0x20 ],
        [ 0x80, 0x9F, 0xA0 ],

        // Lonely start bytes
        [ 0xC0 ],
        [ 0xCF ],
        [ 0x20, 0xC0, 0x20 ],
        [ 0x20, 0xCF, 0x20 ],
        [ 0xD0 ],
        [ 0xDF ],
        [ 0x20, 0xD0, 0x20 ],
        [ 0x20, 0xDF, 0x20 ],
        [ 0xE0 ],
        [ 0xEF ],
        [ 0x20, 0xE0, 0x20 ],
        [ 0x20, 0xEF, 0x20 ],
        [ 0xF0 ],
        [ 0xF1 ],
        [ 0xF2 ],
        [ 0xF3 ],
        [ 0xF4 ],
        [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF

        [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
        [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
        [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above

        // Impossible bytes
        [ 0xF8 ],
        [ 0xF9 ],
        [ 0xFA ],
        [ 0xFB ],
        [ 0xFC ],
        [ 0xFD ],
        [ 0xFE ],
        [ 0xFF ],
        [ 0x20, 0xF8, 0x20 ],
        [ 0x20, 0xF9, 0x20 ],
        [ 0x20, 0xFA, 0x20 ],
        [ 0x20, 0xFB, 0x20 ],
        [ 0x20, 0xFC, 0x20 ],
        [ 0x20, 0xFD, 0x20 ],
        [ 0x20, 0xFE, 0x20 ],
        [ 0x20, 0xFF, 0x20 ],

        // Overlong sequences, all representing U+002F
        /*  With a safe UTF-8 decoder, all of the following five overlong
            representations of the ASCII character slash ("/") should be
            rejected like a malformed UTF-8 sequence */
        [ 0xC0, 0xAF ],
        [ 0xE0, 0x80, 0xAF ],
        [ 0xF0, 0x80, 0x80, 0xAF ],
        [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],

        // Maximum overlong sequences
        /*  Below you see the highest Unicode value that is still resulting in
            an overlong sequence if represented with the given number of bytes.
            This is a boundary test for safe UTF-8 decoders. All five
            characters should be rejected like malformed UTF-8 sequences. */
        [ 0xC1, 0xBF ],                             // U+0000007F
        [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
        [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
        [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
        [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF

        // Overlong representation of the NUL character
        /*  The following five sequences should also be rejected like malformed
            UTF-8 sequences and should not be treated like the ASCII NUL
            character. */
        [ 0xC0, 0x80 ],
        [ 0xE0, 0x80, 0x80 ],
        [ 0xF0, 0x80, 0x80, 0x80 ],
        [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],

        // Illegal code positions
        /*  The following UTF-8 sequences should be rejected like malformed
            sequences, because they never represent valid ISO 10646 characters
            and a UTF-8 decoder that accepts them might introduce security
            problems comparable to overlong UTF-8 sequences. */
        [ 0xED, 0xA0, 0x80 ],       // U+D800
        [ 0xED, 0xAD, 0xBF ],       // U+DB7F
        [ 0xED, 0xAE, 0x80 ],       // U+DB80
        [ 0xED, 0xAF, 0xBF ],       // U+DBFF
        [ 0xED, 0xB0, 0x80 ],       // U+DC00
        [ 0xED, 0xBE, 0x80 ],       // U+DF80
        [ 0xED, 0xBF, 0xBF ],       // U+DFFF
    ];

    // Expected sanitize() output, index-aligned with invalidStrings above.
    static string[] sanitizedStrings =
    [
        "\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
        "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
    ];

    // Make sure everything that should be valid, is
    foreach (a;validStrings)
    {
        string s = cast(string) a;
        assert(isValid(s),"Failed to validate: "~makeReadable(s));
    }

    // Make sure everything that shouldn't be valid, isn't
    foreach (a;invalidStrings)
    {
        string s = cast(string) a;
        assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
    }

    // Make sure we can sanitize everything bad
    assert(invalidStrings.length == sanitizedStrings.length);
    foreach (i, bad; invalidStrings)
    {
        string s = cast(string) bad;
        string t = sanitize(s);
        assert(isValid(t));
        assert(t == sanitizedStrings[i]);
        // Sanitized output is valid by construction, so feed it into the
        // transcoding round-trips below as well.
        ubyte[] u = cast(ubyte[]) t;
        validStrings ~= u;
    }

    // Make sure all transcodings work in both directions, using both forward
    // and reverse iteration
    foreach (a; validStrings)
    {
        string s = cast(string) a;
        string s2;
        wstring ws, ws2;
        dstring ds, ds2;

        transcode(s,ws);
        assert(isValid(ws));
        transcode(ws,s2);
        assert(s == s2);

        transcode(s,ds);
        assert(isValid(ds));
        transcode(ds,s2);
        assert(s == s2);

        transcode(ws,s);
        assert(isValid(s));
        transcode(s,ws2);
        assert(ws == ws2);

        transcode(ws,ds);
        assert(isValid(ds));
        transcode(ds,ws2);
        assert(ws == ws2);

        transcode(ds,s);
        assert(isValid(s));
        transcode(s,ds2);
        assert(ds == ds2);

        transcode(ds,ws);
        assert(isValid(ws));
        transcode(ws,ds2);
        assert(ds == ds2);

        transcodeReverse(s,ws);
        assert(isValid(ws));
        transcodeReverse(ws,s2);
        assert(s == s2);

        transcodeReverse(s,ds);
        assert(isValid(ds));
        transcodeReverse(ds,s2);
        assert(s == s2);

        transcodeReverse(ws,s);
        assert(isValid(s));
        transcodeReverse(s,ws2);
        assert(ws == ws2);

        transcodeReverse(ws,ds);
        assert(isValid(ds));
        transcodeReverse(ds,ws2);
        assert(ws == ws2);

        transcodeReverse(ds,s);
        assert(isValid(s));
        transcodeReverse(s,ds2);
        assert(ds == ds2);

        transcodeReverse(ds,ws);
        assert(isValid(ws));
        transcodeReverse(ws,ds2);
        assert(ds == ds2);
    }

    // Make sure the non-UTF encodings work too
    {
        auto s = "\u20AC100";
        Windows1252String t;
        transcode(s,t);
        assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
        string u;
        transcode(s,u);
        assert(s == u);
        Latin1String v;
        transcode(s,v);
        assert(cast(string) v == "?100");
        AsciiString w;
        transcode(v,w);
        assert(cast(string) w == "?100");
        s = "\u017Dlu\u0165ou\u010Dk\u00FD k\u016F\u0148";
        Latin2String x;
        transcode(s,x);
        assert(x == cast(Latin2Char[])[0xae, 'l', 'u', 0xbb, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
        Windows1250String y;
        transcode(s,y);
        assert(y == cast(Windows1250Char[])[0x8e, 'l', 'u', 0x9d, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
    }

    // Make sure we can count properly
    {
        assert(encodedLength!(char)('A') == 1);
        assert(encodedLength!(char)('\u00E3') == 2);
        assert(encodedLength!(char)('\u2028') == 3);
        assert(encodedLength!(char)('\U0010FFF0') == 4);
        assert(encodedLength!(wchar)('A') == 1);
        assert(encodedLength!(wchar)('\U0010FFF0') == 2);
    }

    // Make sure we can write into mutable arrays
    {
        char[4] buffer;
        auto n = encode(cast(dchar)'\u00E3',buffer);
        assert(n == 2);
        assert(buffer[0] == 0xC3);
        assert(buffer[1] == 0xA3);
    }
}
419
420//=============================================================================
421
/**
Special value returned by $(D safeDecode) in place of a decoded code point
when the input could not be decoded. Its numeric value, 0xFFFFFFFF, lies
outside the Unicode code point range.
*/
enum dchar INVALID_SEQUENCE = cast(dchar) 0xFFFFFFFF;
424
/*
Mixin template that assembles the encode / skip / decode entry points for one
encoding.

The scope it is mixed into has to supply the code unit type `E` together with
the templates `encodeViaWrite`, `skipViaRead`, `decodeViaRead`,
`safeDecodeViaRead` and `decodeReverseViaRead`. Those are written against
abstract `read`/`peek`/`canRead`/`write` primitives; the small templates below
bind the primitives to a concrete source or sink (`s`, `array` or `dg`, taken
from whichever function mixes them in).
*/
template EncoderFunctions()
{
    // ---- read primitives -------------------------------------------------

    // Consume code units from the front of `s`.
    template ReadFromString()
    {
        @property bool canRead()
        {
            return s.length != 0;
        }

        E peek() @safe pure @nogc nothrow
        {
            return s[0];
        }

        E read() @safe pure @nogc nothrow
        {
            E front = s[0];
            s = s[1..$];
            return front;
        }
    }

    // Consume code units from the back of `s`.
    template ReverseReadFromString()
    {
        @property bool canRead()
        {
            return s.length != 0;
        }

        E peek() @safe pure @nogc nothrow
        {
            return s[$-1];
        }

        E read() @safe pure @nogc nothrow
        {
            E back = s[$-1];
            s = s[0..$-1];
            return back;
        }
    }

    // ---- write primitives ------------------------------------------------

    // Append to a freshly declared buffer `s`.
    template WriteToString()
    {
        E[] s;

        void write(E c) @safe pure nothrow
        {
            s ~= c;
        }
    }

    // Store into the first slot of `array`, then advance the slice.
    template WriteToArray()
    {
        void write(E c) @safe pure @nogc nothrow
        {
            array[0] = c;
            array = array[1..$];
        }
    }

    // Hand each code unit to the delegate `dg`.
    template WriteToDelegate()
    {
        void write(E c)
        {
            dg(c);
        }
    }

    // ---- adapters from the *Via* templates to plain names -----------------

    template EncodeViaWrite()
    {
        mixin encodeViaWrite;

        void encode(dchar c)
        {
            encodeViaWrite(c);
        }
    }

    template SkipViaRead()
    {
        mixin skipViaRead;

        void skip() @safe pure @nogc nothrow
        {
            skipViaRead();
        }
    }

    template DecodeViaRead()
    {
        mixin decodeViaRead;

        dchar decode() @safe pure @nogc nothrow
        {
            return decodeViaRead();
        }
    }

    template SafeDecodeViaRead()
    {
        mixin safeDecodeViaRead;

        dchar safeDecode() @safe pure @nogc nothrow
        {
            return safeDecodeViaRead();
        }
    }

    template DecodeReverseViaRead()
    {
        mixin decodeReverseViaRead;

        dchar decodeReverse() @safe pure @nogc nothrow
        {
            return decodeReverseViaRead();
        }
    }

    // ---- encoders bound to a destination ----------------------------------

    template EncodeToString()
    {
        mixin WriteToString;
        mixin EncodeViaWrite;
    }

    template EncodeToArray()
    {
        mixin WriteToArray;
        mixin EncodeViaWrite;
    }

    template EncodeToDelegate()
    {
        mixin WriteToDelegate;
        mixin EncodeViaWrite;
    }

    // ---- decoders bound to a string source --------------------------------

    template SkipFromString()
    {
        mixin ReadFromString;
        mixin SkipViaRead;
    }

    template DecodeFromString()
    {
        mixin ReadFromString;
        mixin DecodeViaRead;
    }

    template SafeDecodeFromString()
    {
        mixin ReadFromString;
        mixin SafeDecodeViaRead;
    }

    template DecodeReverseFromString()
    {
        mixin ReverseReadFromString;
        mixin DecodeReverseViaRead;
    }

    //=========================================================================

    // Entry points that end up visible to users of the encoder.

    // Encode `c` and return the code units as a new array.
    E[] encode(dchar c) @safe pure nothrow
    {
        mixin EncodeToString sink;
        sink.encode(c);
        return sink.s;
    }

    // Encode `c` into the front of `array`; `array` is advanced past the
    // code units written.
    void encode(dchar c, ref E[] array) @safe pure nothrow
    {
        mixin EncodeToArray sink;
        sink.encode(c);
    }

    // Encode `c`, passing each code unit to `dg`.
    void encode(dchar c, void delegate(E) dg)
    {
        mixin EncodeToDelegate sink;
        sink.encode(c);
    }

    // Advance `s` past one encoded character without decoding it.
    void skip(ref const(E)[] s) @safe pure nothrow
    {
        mixin SkipFromString source;
        source.skip();
    }

    // Decode one character from the front of `s`, advancing `s`.
    dchar decode(S)(ref S s)
    {
        mixin DecodeFromString source;
        return source.decode();
    }

    // Like decode, but routed through the encoding's safeDecodeViaRead.
    dchar safeDecode(S)(ref S s)
    {
        mixin SafeDecodeFromString source;
        return source.safeDecode();
    }

    // Decode one character from the back of `s`, shrinking `s` from the end.
    dchar decodeReverse(ref const(E)[] s) @safe pure nothrow
    {
        mixin DecodeReverseFromString source;
        return source.decodeReverse();
    }
}
586
587//=========================================================================
588
/*
Foreach-able view that yields the code points of an encoded string.

Iteration consumes the stored slice `s`: each step decodes one character and
advances (or, for reverse iteration, shrinks) `s`.
*/
struct CodePoints(E)
{
    const(E)[] s;

    // The input must already be valid for the encoding (checked by contract).
    this(const(E)[] s)
    in
    {
        assert(isValid(s));
    }
    body
    {
        this.s = s;
    }

    // foreach (dchar c; ...)
    int opApply(scope int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar point = decode(s);
            if (auto rc = dg(point))
                return rc;
        }
        return 0;
    }

    // foreach (size_t i, dchar c; ...) -- i is the code unit offset at which
    // the character starts.
    int opApply(scope int delegate(ref size_t, ref dchar) dg)
    {
        size_t offset = 0;
        while (s.length != 0)
        {
            immutable before = s.length;
            dchar point = decode(s);

            // Pass a copy so that the delegate cannot disturb our offset.
            size_t reported = offset;
            if (auto rc = dg(reported, point))
                return rc;

            offset += before - s.length;
        }
        return 0;
    }

    // foreach_reverse (dchar c; ...)
    int opApplyReverse(scope int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar point = decodeReverse(s);
            if (auto rc = dg(point))
                return rc;
        }
        return 0;
    }

    // foreach_reverse (size_t i, dchar c; ...) -- i is the length of what
    // remains in front of the character just decoded.
    int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar point = decodeReverse(s);
            size_t remaining = s.length;
            if (auto rc = dg(remaining, point))
                return rc;
        }
        return 0;
    }
}
656
/*
Foreach-able view that yields the code units encoding a single code point.
The code point is encoded once, in the constructor, and stored in `s`.
*/
struct CodeUnits(E)
{
    E[] s;

    // `d` must be a valid code point (checked by contract).
    this(dchar d)
    in
    {
        assert(isValidCodePoint(d));
    }
    body
    {
        s = encode!(E)(d);
    }

    // Forward iteration; each code unit is handed to `dg` as a copy.
    int opApply(scope int delegate(ref E) dg)
    {
        for (size_t k = 0; k < s.length; ++k)
        {
            E unit = s[k];
            if (auto rc = dg(unit))
                return rc;
        }
        return 0;
    }

    // Backward iteration; each code unit is handed to `dg` as a copy.
    int opApplyReverse(scope int delegate(ref E) dg)
    {
        for (size_t k = s.length; k-- > 0; )
        {
            E unit = s[k];
            if (auto rc = dg(unit))
                return rc;
        }
        return 0;
    }
}
693
694//=============================================================================
695
// Catch-all declaration: instantiating it for a type that has no dedicated
// EncoderInstance specialization is a compile-time error.
template EncoderInstance(E)
{
    static assert(false, "Cannot instantiate EncoderInstance for type " ~ E.stringof);
}
701
/*
Shared implementation for table-driven single-byte encodings.

The scope mixing this in has to supply:
  E, EString                    - code unit type and its string type
  m_charMapStart, m_charMapEnd  - inclusive range of code units that go
                                  through charMap
  charMap                       - charMap[u - m_charMapStart] is the code
                                  point for code unit u; 0xFFFD marks an
                                  unassigned code unit
  bstMap                        - (code point, code unit) pairs laid out as
                                  an implicit binary search tree: the children
                                  of index i sit at 2*i+1 and 2*i+2
*/
private template GenericEncoder()
{
    // True if `c` can be represented in this encoding.
    bool canEncode(dchar c) @safe pure @nogc nothrow
    {
        // Code points outside the mapped range but below 0x100 are accepted
        // directly.
        immutable bool direct =
            c < m_charMapStart || (c > m_charMapEnd && c < 0x100);
        if (direct) return true;
        if (c >= 0xFFFD) return false;

        for (auto node = 0; node < bstMap.length; )
        {
            immutable key = bstMap[node][0];
            if (key == c) return true;
            node = key > c ? 2 * node + 1 : 2 * node + 2; // descend left / right
        }

        return false;
    }

    // True unless `c` lies in the mapped range and its table entry is 0xFFFD.
    bool isValidCodeUnit(E c) @safe pure @nogc nothrow
    {
        if (c >= m_charMapStart && c <= m_charMapEnd)
            return charMap[c-m_charMapStart] != 0xFFFD;
        return true;
    }

    // Every encodable code point occupies exactly one code unit.
    size_t encodedLength(dchar c) @safe pure @nogc nothrow
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes the code unit for `c`, or '?' if `c` cannot be encoded.
    void encodeViaWrite()(dchar c)
    {
        immutable bool direct =
            c < m_charMapStart || (c > m_charMapEnd && c < 0x100);

        if (!direct)
        {
            if (c < 0xFFFD)
            {
                for (auto node = 0; node < bstMap.length; )
                {
                    if (bstMap[node][0] == c)
                    {
                        write(cast(E) bstMap[node][1]);
                        return;
                    }
                    node = bstMap[node][0] > c ? 2 * node + 1 : 2 * node + 2;
                }
            }
            // Not found in the tree, or 0xFFFD and above: substitute.
            c = '?';
        }

        write(cast(E) c);
    }

    // One character is always one code unit, so skipping is a single read.
    void skipViaRead()()
    {
        read();
    }

    // Reads one code unit and maps it through charMap when it is in range.
    dchar decodeViaRead()()
    {
        E unit = read();
        immutable bool mapped = unit >= m_charMapStart && unit <= m_charMapEnd;
        return mapped ? charMap[unit-m_charMapStart] : unit;
    }

    // As decodeViaRead, but a table entry of 0xFFFD yields INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        immutable E unit = read();
        immutable bool mapped = unit >= m_charMapStart && unit <= m_charMapEnd;
        immutable point = mapped ? charMap[unit-m_charMapStart] : unit;
        return point == 0xFFFD ? INVALID_SEQUENCE : point;
    }

    // Same mapping as decodeViaRead; the direction comes from whichever
    // `read` is in scope.
    dchar decodeReverseViaRead()()
    {
        E unit = read();
        immutable bool mapped = unit >= m_charMapStart && unit <= m_charMapEnd;
        return mapped ? charMap[unit-m_charMapStart] : unit;
    }

    // The sequence substituted for unencodable input.
    @property EString replacementSequence() @safe pure @nogc nothrow
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
787
788//=============================================================================
789//          ASCII
790//=============================================================================
791
/**
Defines various character sets.

$(D AsciiChar) is a distinct type whose base type is $(D ubyte); it is the
code unit type for which the ASCII $(D EncoderInstance) is selected.
*/
enum AsciiChar : ubyte { init }
/// Ditto
alias AsciiString = immutable(AsciiChar)[];
796
// Encoder for ASCII: code points below 0x80 map to themselves, everything
// else is replaced by '?' on encoding.
template EncoderInstance(CharType : AsciiChar)
{
    alias E = AsciiChar;
    alias EString = AsciiString;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ASCII";
    }

    // Only the 7-bit range is representable.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    // A code unit is valid when its high bit is clear.
    bool isValidCodeUnit(AsciiChar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    // Every encodable code point occupies exactly one code unit.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes `c` (or '?' if it is not encodable) to `r` via its write method.
    void encodeX(Range)(dchar c, Range r)
    {
        r.write(cast(AsciiChar)(canEncode(c) ? c : '?'));
    }

    // Writes `c` (or '?' if it is not encodable) through the `write` in scope.
    void encodeViaWrite()(dchar c)
    {
        write(cast(AsciiChar)(canEncode(c) ? c : '?'));
    }

    // One character is one code unit.
    void skipViaRead()()
    {
        read();
    }

    // The code unit value is the code point; no validation is performed.
    dchar decodeViaRead()()
    {
        return read();
    }

    // As decodeViaRead, but code units outside ASCII give INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        immutable unit = read();
        if (!canEncode(unit))
            return INVALID_SEQUENCE;
        return unit;
    }

    // Same as decodeViaRead; the direction comes from the `read` in scope.
    dchar decodeReverseViaRead()()
    {
        return read();
    }

    // The sequence substituted for unencodable input.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
867
868//=============================================================================
869//          ISO-8859-1
870//=============================================================================
871
/** Defines a Latin1-encoded character. */
enum Latin1Char : ubyte { init }
/**
Defines a Latin1-encoded string (as an array of $(D
immutable(Latin1Char))).
 */
alias Latin1String = immutable(Latin1Char)[];
879
// Encoder for ISO-8859-1: code points below 0x100 map to themselves,
// everything else is replaced by '?' on encoding.
template EncoderInstance(CharType : Latin1Char)
{
    alias E = Latin1Char;
    alias EString = Latin1String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    // Only code points that fit in one byte are representable.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0xFF;
    }

    // Every code unit value is accepted.
    bool isValidCodeUnit(Latin1Char c) @safe pure nothrow @nogc
    {
        return true;
    }

    // Every encodable code point occupies exactly one code unit.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes `c` (or '?' if it is not encodable) through the `write` in scope.
    void encodeViaWrite()(dchar c)
    {
        write(cast(Latin1Char)(canEncode(c) ? c : '?'));
    }

    // One character is one code unit.
    void skipViaRead()()
    {
        read();
    }

    // The code unit value is the code point.
    dchar decodeViaRead()()
    {
        return read();
    }

    // Identical to decodeViaRead: no code unit is rejected.
    dchar safeDecodeViaRead()()
    {
        return read();
    }

    // Same as decodeViaRead; the direction comes from the `read` in scope.
    dchar decodeReverseViaRead()()
    {
        return read();
    }

    // The sequence substituted for unencodable input.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
943
944//=============================================================================
945//          ISO-8859-2
946//=============================================================================
947
/// Defines a Latin2-encoded character.
enum Latin2Char : ubyte { init }

/**
 * Defines a Latin2-encoded string (as an array of $(D
 * immutable(Latin2Char))).
 */
alias Latin2String = immutable(Latin2Char)[];
956
// Encoder for ISO-8859-2. Only the name and the lookup tables live here; the
// encode/decode logic comes from the GenericEncoder mixin at the bottom.
private template EncoderInstance(CharType : Latin2Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Latin2Char;
    alias EString = Latin2String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-2";
    }

    // Inclusive range of code units that are translated through charMap.
    // GenericEncoder treats code units outside this range as mapping to
    // themselves.
    private static immutable dchar m_charMapStart = 0xa1;
    private static immutable dchar m_charMapEnd = 0xff;

    // Decoding table: charMap[u - m_charMapStart] is the code point for code
    // unit u (95 entries, covering 0xA1 .. 0xFF).
    private immutable wstring charMap =
        "\u0104\u02D8\u0141\u00A4\u013D\u015A\u00A7\u00A8"~
        "\u0160\u015E\u0164\u0179\u00AD\u017D\u017B\u00B0"~
        "\u0105\u02DB\u0142\u00B4\u013E\u015B\u02C7\u00B8"~
        "\u0161\u015F\u0165\u017A\u02DD\u017E\u017C\u0154"~
        "\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7\u010C"~
        "\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E\u0110"~
        "\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7\u0158"~
        "\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF\u0155"~
        "\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7\u010D"~
        "\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F\u0111"~
        "\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7\u0159"~
        "\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Encoding table: (code point, code unit) pairs. GenericEncoder walks it
    // as an implicit binary search tree rooted at index 0, moving from index
    // i to 2*i+1 when the stored code point is greater than the one sought
    // and to 2*i+2 otherwise. The element order is therefore significant and
    // must not be changed.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u0148','\xF2'), tuple('\u00F3','\xF3'), tuple('\u0165','\xBB'),
        tuple('\u00D3','\xD3'), tuple('\u010F','\xEF'), tuple('\u015B','\xB6'),
        tuple('\u017C','\xBF'), tuple('\u00C1','\xC1'), tuple('\u00E1','\xE1'),
        tuple('\u0103','\xE3'), tuple('\u013A','\xE5'), tuple('\u0155','\xE0'),
        tuple('\u0161','\xB9'), tuple('\u0171','\xFB'), tuple('\u02D8','\xA2'),
        tuple('\u00AD','\xAD'), tuple('\u00C9','\xC9'), tuple('\u00DA','\xDA'),
        tuple('\u00E9','\xE9'), tuple('\u00FA','\xFA'), tuple('\u0107','\xE6'),
        tuple('\u0119','\xEA'), tuple('\u0142','\xB3'), tuple('\u0151','\xF5'),
        tuple('\u0159','\xF8'), tuple('\u015F','\xBA'), tuple('\u0163','\xFE'),
        tuple('\u016F','\xF9'), tuple('\u017A','\xBC'), tuple('\u017E','\xBE'),
        tuple('\u02DB','\xB2'), tuple('\u00A7','\xA7'), tuple('\u00B4','\xB4'),
        tuple('\u00C4','\xC4'), tuple('\u00CD','\xCD'), tuple('\u00D6','\xD6'),
        tuple('\u00DD','\xDD'), tuple('\u00E4','\xE4'), tuple('\u00ED','\xED'),
        tuple('\u00F6','\xF6'), tuple('\u00FD','\xFD'), tuple('\u0105','\xB1'),
        tuple('\u010D','\xE8'), tuple('\u0111','\xF0'), tuple('\u011B','\xEC'),
        tuple('\u013E','\xB5'), tuple('\u0144','\xF1'), tuple('\u0150','\xD5'),
        tuple('\u0154','\xC0'), tuple('\u0158','\xD8'), tuple('\u015A','\xA6'),
        tuple('\u015E','\xAA'), tuple('\u0160','\xA9'), tuple('\u0162','\xDE'),
        tuple('\u0164','\xAB'), tuple('\u016E','\xD9'), tuple('\u0170','\xDB'),
        tuple('\u0179','\xAC'), tuple('\u017B','\xAF'), tuple('\u017D','\xAE'),
        tuple('\u02C7','\xB7'), tuple('\u02D9','\xFF'), tuple('\u02DD','\xBD'),
        tuple('\u00A4','\xA4'), tuple('\u00A8','\xA8'), tuple('\u00B0','\xB0'),
        tuple('\u00B8','\xB8'), tuple('\u00C2','\xC2'), tuple('\u00C7','\xC7'),
        tuple('\u00CB','\xCB'), tuple('\u00CE','\xCE'), tuple('\u00D4','\xD4'),
        tuple('\u00D7','\xD7'), tuple('\u00DC','\xDC'), tuple('\u00DF','\xDF'),
        tuple('\u00E2','\xE2'), tuple('\u00E7','\xE7'), tuple('\u00EB','\xEB'),
        tuple('\u00EE','\xEE'), tuple('\u00F4','\xF4'), tuple('\u00F7','\xF7'),
        tuple('\u00FC','\xFC'), tuple('\u0102','\xC3'), tuple('\u0104','\xA1'),
        tuple('\u0106','\xC6'), tuple('\u010C','\xC8'), tuple('\u010E','\xCF'),
        tuple('\u0110','\xD0'), tuple('\u0118','\xCA'), tuple('\u011A','\xCC'),
        tuple('\u0139','\xC5'), tuple('\u013D','\xA5'), tuple('\u0141','\xA3'),
        tuple('\u0143','\xD1'), tuple('\u0147','\xD2')
    ];

    mixin GenericEncoder!();
}
1023
1024//=============================================================================
1025//          WINDOWS-1250
1026//=============================================================================
1027
/**
 * Defines a Windows1250-encoded character.
 *
 * This is a distinct $(D ubyte)-based type; it is the type on which the
 * WINDOWS-1250 specialization of $(D EncoderInstance) is selected.
 */
enum Windows1250Char : ubyte { init }

/**
 * Defines a Windows1250-encoded string (as an array of $(D
 * immutable(Windows1250Char))).
 */
alias Windows1250String = immutable(Windows1250Char)[];
1036
// Encoder specialization for WINDOWS-1250.
//
// It only supplies the code-unit types, the encoding name and two lookup
// tables; the actual encode/decode routines come from the GenericEncoder
// mixin at the bottom (defined elsewhere in this file).
private template EncoderInstance(CharType : Windows1250Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1250Char;
    alias EString = Windows1250String;

    // Name under which this encoding is reported.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1250";
    }

    // Inclusive range of code units described by charMap: 0x80 .. 0xFF,
    // i.e. 128 entries.
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0xff;

    // Code unit -> code point table, one entry per code unit starting at
    // m_charMapStart (eight entries per source line). U+FFFD marks the code
    // units that have no character assigned (0x81, 0x83, 0x88, 0x90, 0x98).
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\uFFFD\u201E\u2026\u2020\u2021"~
        "\uFFFD\u2030\u0160\u2039\u015A\u0164\u017D\u0179"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\uFFFD\u2122\u0161\u203A\u015B\u0165\u017E\u017A"~
        "\u00A0\u02C7\u02D8\u0141\u00A4\u0104\u00A6\u00A7"~
        "\u00A8\u00A9\u015E\u00AB\u00AC\u00AD\u00AE\u017B"~
        "\u00B0\u00B1\u02DB\u0142\u00B4\u00B5\u00B6\u00B7"~
        "\u00B8\u0105\u015F\u00BB\u013D\u02DD\u013E\u017C"~
        "\u0154\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7"~
        "\u010C\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E"~
        "\u0110\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7"~
        "\u0158\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF"~
        "\u0155\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7"~
        "\u010D\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F"~
        "\u0111\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7"~
        "\u0159\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Code point -> code unit table (the reverse of charMap, without the
    // unassigned slots).
    // NOTE(review): the name suggests the entries are laid out as an implicit
    // binary search tree consumed by GenericEncoder; do not reorder them
    // without confirming against that mixin.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u011A','\xCC'), tuple('\u00DC','\xDC'), tuple('\u0179','\x8F'),
        tuple('\u00B7','\xB7'), tuple('\u00FC','\xFC'), tuple('\u0158','\xD8'),
        tuple('\u201C','\x93'), tuple('\u00AC','\xAC'), tuple('\u00CB','\xCB'),
        tuple('\u00EB','\xEB'), tuple('\u010C','\xC8'), tuple('\u0143','\xD1'),
        tuple('\u0162','\xDE'), tuple('\u02D9','\xFF'), tuple('\u2039','\x8B'),
        tuple('\u00A7','\xA7'), tuple('\u00B1','\xB1'), tuple('\u00C2','\xC2'),
        tuple('\u00D4','\xD4'), tuple('\u00E2','\xE2'), tuple('\u00F4','\xF4'),
        tuple('\u0104','\xA5'), tuple('\u0110','\xD0'), tuple('\u013D','\xBC'),
        tuple('\u0150','\xD5'), tuple('\u015E','\xAA'), tuple('\u016E','\xD9'),
        tuple('\u017D','\x8E'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u00A4','\xA4'), tuple('\u00A9','\xA9'),
        tuple('\u00AE','\xAE'), tuple('\u00B5','\xB5'), tuple('\u00BB','\xBB'),
        tuple('\u00C7','\xC7'), tuple('\u00CE','\xCE'), tuple('\u00D7','\xD7'),
        tuple('\u00DF','\xDF'), tuple('\u00E7','\xE7'), tuple('\u00EE','\xEE'),
        tuple('\u00F7','\xF7'), tuple('\u0102','\xC3'), tuple('\u0106','\xC6'),
        tuple('\u010E','\xCF'), tuple('\u0118','\xCA'), tuple('\u0139','\xC5'),
        tuple('\u0141','\xA3'), tuple('\u0147','\xD2'), tuple('\u0154','\xC0'),
        tuple('\u015A','\x8C'), tuple('\u0160','\x8A'), tuple('\u0164','\x8D'),
        tuple('\u0170','\xDB'), tuple('\u017B','\xAF'), tuple('\u02C7','\xA1'),
        tuple('\u02DD','\xBD'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u00A0','\xA0'), tuple('\u00A6','\xA6'), tuple('\u00A8','\xA8'),
        tuple('\u00AB','\xAB'), tuple('\u00AD','\xAD'), tuple('\u00B0','\xB0'),
        tuple('\u00B4','\xB4'), tuple('\u00B6','\xB6'), tuple('\u00B8','\xB8'),
        tuple('\u00C1','\xC1'), tuple('\u00C4','\xC4'), tuple('\u00C9','\xC9'),
        tuple('\u00CD','\xCD'), tuple('\u00D3','\xD3'), tuple('\u00D6','\xD6'),
        tuple('\u00DA','\xDA'), tuple('\u00DD','\xDD'), tuple('\u00E1','\xE1'),
        tuple('\u00E4','\xE4'), tuple('\u00E9','\xE9'), tuple('\u00ED','\xED'),
        tuple('\u00F3','\xF3'), tuple('\u00F6','\xF6'), tuple('\u00FA','\xFA'),
        tuple('\u00FD','\xFD'), tuple('\u0103','\xE3'), tuple('\u0105','\xB9'),
        tuple('\u0107','\xE6'), tuple('\u010D','\xE8'), tuple('\u010F','\xEF'),
        tuple('\u0111','\xF0'), tuple('\u0119','\xEA'), tuple('\u011B','\xEC'),
        tuple('\u013A','\xE5'), tuple('\u013E','\xBE'), tuple('\u0142','\xB3'),
        tuple('\u0144','\xF1'), tuple('\u0148','\xF2'), tuple('\u0151','\xF5'),
        tuple('\u0155','\xE0'), tuple('\u0159','\xF8'), tuple('\u015B','\x9C'),
        tuple('\u015F','\xBA'), tuple('\u0161','\x9A'), tuple('\u0163','\xFE'),
        tuple('\u0165','\x9D'), tuple('\u016F','\xF9'), tuple('\u0171','\xFB'),
        tuple('\u017A','\x9F'), tuple('\u017C','\xBF'), tuple('\u017E','\x9E'),
        tuple('\u02D8','\xA2'), tuple('\u02DB','\xB2'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    mixin GenericEncoder!();
}
1116
1117//=============================================================================
1118//          WINDOWS-1252
1119//=============================================================================
1120
/**
 * Defines a Windows1252-encoded character.
 *
 * This is a distinct $(D ubyte)-based type; it is the type on which the
 * WINDOWS-1252 specialization of $(D EncoderInstance) is selected.
 */
enum Windows1252Char : ubyte { init }

/**
 * Defines a Windows1252-encoded string (as an array of $(D
 * immutable(Windows1252Char))).
 */
alias Windows1252String = immutable(Windows1252Char)[];
1129
// Encoder specialization for WINDOWS-1252.
//
// It only supplies the code-unit types, the encoding name and two lookup
// tables; the actual encode/decode routines come from the GenericEncoder
// mixin at the bottom (defined elsewhere in this file).
template EncoderInstance(CharType : Windows1252Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1252Char;
    alias EString = Windows1252String;

    // Name under which this encoding is reported.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1252";
    }

    // Inclusive range of code units described by charMap: 0x80 .. 0x9F,
    // i.e. 32 entries.
    // NOTE(review): code units outside this range are presumably treated as
    // mapping to the code point of the same value by GenericEncoder - confirm.
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0x9f;

    // Code unit -> code point table, one entry per code unit starting at
    // m_charMapStart (eight entries per source line). U+FFFD marks the code
    // units that have no character assigned (0x81, 0x8D, 0x8F, 0x90, 0x9D).
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"~
        "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178";

    // Code point -> code unit table (the reverse of charMap, without the
    // unassigned slots).
    // NOTE(review): the name suggests the entries are laid out as an implicit
    // binary search tree consumed by GenericEncoder; do not reorder them
    // without confirming against that mixin.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u201C','\x93'), tuple('\u0192','\x83'), tuple('\u2039','\x8B'),
        tuple('\u0161','\x9A'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u0153','\x9C'), tuple('\u017D','\x8E'),
        tuple('\u02DC','\x98'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u0152','\x8C'), tuple('\u0160','\x8A'), tuple('\u0178','\x9F'),
        tuple('\u017E','\x9E'), tuple('\u02C6','\x88'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    mixin GenericEncoder!();
}
1165
1166//=============================================================================
1167//          UTF-8
1168//=============================================================================
1169
// Encoder specialization for UTF-8 (code unit type char).
//
// The *ViaRead / *ViaWrite members are templates that call read(), peek(),
// canRead and write(); none of those are declared here, so they are
// presumably supplied by the EncoderFunctions mixin at the bottom (defined
// elsewhere in this file) - confirm there.
template EncoderInstance(CharType : char)
{
    alias E = char;
    alias EString = immutable(char)[];

    // Name under which this encoding is reported.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-8";
    }

    // UTF-8 can represent exactly the valid code points.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // A byte is a legal UTF-8 code unit unless it is 0xC0, 0xC1 or in the
    // range 0xF5 .. 0xFF.
    bool isValidCodeUnit(char c) @safe pure nothrow @nogc
    {
        return (c < 0xC0 || (c >= 0xC2 && c < 0xF5));
    }

    // Number of trailing bytes announced by a byte, indexed by (byte - 0x80):
    //   0x80 .. 0xBF -> 0   (these are themselves trailing bytes)
    //   0xC0 .. 0xDF -> 1
    //   0xE0 .. 0xEF -> 2
    //   0xF0 .. 0xF7 -> 3
    //   0xF8 .. 0xFB -> 4,  0xFC .. 0xFD -> 5,  0xFE -> 6,  0xFF -> 0
    immutable ubyte[128] tailTable =
    [
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,
    ];

    // Looks up tailTable for a non-ASCII byte; the in-contract requires
    // c >= 0x80 because the table has no entries for ASCII.
    private int tails(char c) @safe pure nothrow @nogc
    in
    {
        assert(c >= 0x80);
    }
    body
    {
        return tailTable[c-0x80];
    }

    // Number of UTF-8 code units (1 to 4) needed for code point c.
    // The in-contract requires c to be encodable.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        if (c < 0x80) return 1;
        if (c < 0x800) return 2;
        if (c < 0x10000) return 3;
        return 4;
    }

    // Emits the UTF-8 bytes for c through write(), lead byte first.
    // No validity check is made on c here.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x80)
        {
            write(cast(char) c);
        }
        else if (c < 0x800)
        {
            // 110xxxxx 10xxxxxx
            write(cast(char)((c >> 6) + 0xC0));
            write(cast(char)((c & 0x3F) + 0x80));
        }
        else if (c < 0x10000)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            write(cast(char)((c >> 12) + 0xE0));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
        }
        else
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            write(cast(char)((c >> 18) + 0xF0));
            write(cast(char)(((c >> 12) & 0x3F) + 0x80));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
        }
    }

    // Consumes one encoded sequence via read() without decoding it.
    // Performs no validation.
    void skipViaRead()()
    {
        auto c = read();
        // ASCII or a stray trailing byte: a single unit is consumed.
        if (c < 0xC0) return;
        int n = tails(cast(char) c);
        for (size_t i=0; i<n; ++i)
        {
            read();
        }
    }

    // Decodes one sequence via read(). Performs no validation: the trailing
    // bytes are consumed and combined without being checked.
    dchar decodeViaRead()()
    {
        dchar c = read();
        // ASCII (or a stray trailing byte) is returned unchanged.
        if (c < 0xC0) return c;
        int n = tails(cast(char) c);
        // Keep only the payload bits of the lead byte
        // (0x1F for n == 1, 0x0F for n == 2, 0x07 for n == 3).
        c &= (1 << (6 - n)) - 1;
        for (size_t i=0; i<n; ++i)
        {
            // Each trailing byte contributes its low six bits.
            c = (c << 6) + (read() & 0x3F);
        }
        return c;
    }

    // Decodes one sequence via read()/peek(), returning INVALID_SEQUENCE for
    // malformed input instead of producing a bogus code point.
    dchar safeDecodeViaRead()()
    {
        dchar c = read();
        if (c < 0x80) return c;
        int n = tails(cast(char) c);
        // A trailing byte (or 0xFF) in lead position.
        if (n == 0) return INVALID_SEQUENCE;

        if (!canRead) return INVALID_SEQUENCE;
        // The first trailing byte is only peeked here: several of the range
        // checks below depend on the lead byte and first trailing byte
        // together.
        size_t d = peek();
        immutable err =
        (
            (c < 0xC2)                              // fail overlong 2-byte sequences
        ||  (c > 0xF4)                              // fail overlong 4-6-byte sequences
        ||  (c == 0xE0 && ((d & 0xE0) == 0x80))     // fail overlong 3-byte sequences
        ||  (c == 0xED && ((d & 0xE0) == 0xA0))     // fail surrogates
        ||  (c == 0xF0 && ((d & 0xF0) == 0x80))     // fail overlong 4-byte sequences
        ||  (c == 0xF4 && ((d & 0xF0) >= 0x90))     // fail code points > 0x10FFFF
        );

        c &= (1 << (6 - n)) - 1;
        for (size_t i=0; i<n; ++i)
        {
            // Truncated input, or a byte that is not of the form 10xxxxxx:
            // stop without consuming the offending byte.
            if (!canRead) return INVALID_SEQUENCE;
            d = peek();
            if ((d & 0xC0) != 0x80) return INVALID_SEQUENCE;
            c = (c << 6) + (read() & 0x3F);
        }

        // err is reported only now, after the trailing bytes of the bad
        // sequence have been consumed.
        return err ? INVALID_SEQUENCE : c;
    }

    // Decodes one sequence whose bytes arrive from read() in reverse order
    // (last byte first). Performs no validation.
    // NOTE(review): that read() walks backwards here is inferred from the
    // function name - confirm against the mixin that supplies read().
    dchar decodeReverseViaRead()()
    {
        dchar c = read();
        if (c < 0x80) return c;
        size_t shift = 0;
        // Payload of the final trailing byte.
        c &= 0x3F;
        for (size_t i=0; i<4; ++i)
        {
            shift += 6;
            auto d = read();
            // n != 0 identifies a lead byte, which ends the sequence.
            size_t n = tails(cast(char) d);
            immutable mask = n == 0 ? 0x3F : (1 << (6 - n)) - 1;
            c += ((d & mask) << shift);
            if (n != 0) break;
        }
        return c;
    }

    // Code units substituted for invalid input: U+FFFD encoded as UTF-8.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD";
    }

    mixin EncoderFunctions;
}
1331
1332//=============================================================================
1333//          UTF-16
1334//=============================================================================
1335
// Encoder specialization for UTF-16 (code unit type wchar).
//
// The *ViaRead / *ViaWrite members are templates that rely on read(), peek(),
// canRead and write(), none of which are declared in this template.
template EncoderInstance(CharType : wchar)
{
    alias E = wchar;
    alias EString = immutable(wchar)[];

    // Name under which this encoding is reported.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-16";
    }

    // UTF-16 can represent exactly the valid code points.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // Every 16-bit value is accepted as a code unit (surrogates included).
    bool isValidCodeUnit(wchar c) @safe pure nothrow @nogc
    {
        return true;
    }

    // Number of code units needed for c: two for code points at or above
    // U+10000, one otherwise. The in-contract requires c to be encodable.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        if (c >= 0x10000) return 2;
        return 1;
    }

    // Emits c through write(), as a surrogate pair when it does not fit in
    // a single code unit.
    void encodeViaWrite()(dchar c)
    {
        if (c >= 0x10000)
        {
            size_t offset = c - 0x10000;
            write(cast(wchar)(0xD800 + (offset >> 10)));     // high surrogate
            write(cast(wchar)(0xDC00 + (offset & 0x3FF)));   // low surrogate
        }
        else
        {
            write(cast(wchar) c);
        }
    }

    // Consumes one encoded sequence via read(); a surrogate is assumed to be
    // followed by its partner. Performs no validation.
    void skipViaRead()()
    {
        immutable unit = read();
        if (unit >= 0xD800 && unit < 0xE000)
            read();
    }

    // Decodes one sequence via read(). Performs no validation.
    dchar decodeViaRead()()
    {
        wchar hi = read();
        if (hi < 0xD800 || hi >= 0xE000) return cast(dchar) hi;

        wchar lo = read();
        hi &= 0x3FF;
        lo &= 0x3FF;
        return 0x10000 + (hi << 10) + lo;
    }

    // Decodes one sequence via read()/peek(), returning INVALID_SEQUENCE for
    // an unpaired or misordered surrogate. The second code unit is consumed
    // only when it really is a low surrogate.
    dchar safeDecodeViaRead()()
    {
        wchar hi = read();
        if (hi < 0xD800 || hi >= 0xE000) return cast(dchar) hi;

        // A low surrogate in leading position, or nothing left to pair with.
        if (hi >= 0xDC00 || !canRead) return INVALID_SEQUENCE;

        wchar lo = peek();
        if (lo < 0xDC00 || lo >= 0xE000) return INVALID_SEQUENCE;

        lo = read();
        hi &= 0x3FF;
        lo &= 0x3FF;
        return 0x10000 + (hi << 10) + lo;
    }

    // Decodes one sequence whose code units arrive from read() last unit
    // first, so the first unit read is the low half of a surrogate pair.
    // Performs no validation.
    dchar decodeReverseViaRead()()
    {
        wchar lo = read();
        if (lo < 0xD800 || lo >= 0xE000) return cast(dchar) lo;

        wchar hi = read();
        lo &= 0x3FF;
        hi &= 0x3FF;
        return 0x10000 + (hi << 10) + lo;
    }

    // Code units substituted for invalid input: U+FFFD as UTF-16.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD"w;
    }

    mixin EncoderFunctions;
}
1428
1429//=============================================================================
1430//          UTF-32
1431//=============================================================================
1432
// Encoder specialization for UTF-32 (code unit type dchar).
//
// Every code point occupies exactly one code unit, so the members below are
// mostly pass-throughs around read() and write(), which are not declared in
// this template.
template EncoderInstance(CharType : dchar)
{
    alias E = dchar;
    alias EString = immutable(dchar)[];

    // Name under which this encoding is reported.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-32";
    }

    // UTF-32 can represent exactly the valid code points.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // A code unit is legal exactly when it is a valid code point.
    bool isValidCodeUnit(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // Always one code unit. The in-contract requires c to be encodable.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Emits c unchanged through write().
    void encodeViaWrite()(dchar c)
    {
        write(c);
    }

    // Consumes a single code unit.
    void skipViaRead()()
    {
        read();
    }

    // Returns the next code unit as a code point. Performs no validation.
    dchar decodeViaRead()()
    {
        immutable unit = read();
        return cast(dchar) unit;
    }

    // Returns the next code unit, or INVALID_SEQUENCE if it is not a valid
    // code point.
    dchar safeDecodeViaRead()()
    {
        immutable unit = read();
        if (!isValidCodePoint(unit))
            return INVALID_SEQUENCE;
        return unit;
    }

    // Same as decodeViaRead: one unit per code point in either direction.
    dchar decodeReverseViaRead()()
    {
        immutable unit = read();
        return cast(dchar) unit;
    }

    // Code units substituted for invalid input: U+FFFD as UTF-32.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD"d;
    }

    mixin EncoderFunctions;
}
1496
1497//=============================================================================
1498// Below are forwarding functions which expose the function to the user
1499
/**
 Tests whether $(D c) lies in the range of valid Unicode code points.

 A code point is valid when it is below U+110000 and is not a UTF-16
 surrogate (U+D800 through U+DFFF). The non-character code points U+FFFE and
 U+FFFF are therefore accepted: they are valid code points even though they
 are not valid characters.

 Supersedes:
 This function supersedes $(D std.utf.startsValidDchar()).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be tested

 Returns:
    $(D true) if $(D c) is a valid code point, $(D false) otherwise
 */
bool isValidCodePoint(dchar c) @safe pure nothrow @nogc
{
    // Anything beyond the last Unicode plane is rejected outright.
    if (c >= 0x110000) return false;

    // What remains is valid unless it falls inside the surrogate block.
    immutable isSurrogate = c >= 0xD800 && c < 0xE000;
    return !isSurrogate;
}
1520
/**
 Returns the name of the encoding whose code unit type is $(D T).

 There is no argument from which the encoding could be deduced, so the
 code unit type must be given explicitly as a template argument.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252
 */
@property string encodingName(T)()
{
    alias Encoder = EncoderInstance!(T);
    return Encoder.encodingName;
}

///
@safe unittest
{
    // Unicode encodings
    assert(encodingName!char == "UTF-8");
    assert(encodingName!wchar == "UTF-16");
    assert(encodingName!dchar == "UTF-32");

    // Single-byte encodings
    assert(encodingName!AsciiChar == "ASCII");
    assert(encodingName!Latin1Char == "ISO-8859-1");
    assert(encodingName!Latin2Char == "ISO-8859-2");
    assert(encodingName!Windows1250Char == "windows-1250");
    assert(encodingName!Windows1252Char == "windows-1252");
}
1547
/**
 Returns true iff the code point $(D c) can be represented in the encoding
 whose code unit type is $(D E).

 The encoding cannot be deduced from the argument, so the code unit type
 must be given explicitly as a template argument.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be tested
 */
bool canEncode(E)(dchar c)
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.canEncode(c);
}

///
@safe pure unittest
{
    assert(canEncode!Latin1Char('A'));
    assert(canEncode!Latin2Char('A'));

    // U+00A0 is outside ASCII but present in the Latin encodings.
    assert(!canEncode!AsciiChar('\u00A0'));
    assert(canEncode!Latin1Char('\u00A0'));
    assert(canEncode!Latin2Char('\u00A0'));

    assert(canEncode!Windows1250Char('\u20AC'));
    assert(!canEncode!Windows1250Char('\u20AD'));
    assert(!canEncode!Windows1250Char('\uFFFD'));

    assert(canEncode!Windows1252Char('\u20AC'));
    assert(!canEncode!Windows1252Char('\u20AD'));
    assert(!canEncode!Windows1252Char('\uFFFD'));

    assert(!canEncode!char(cast(dchar) 0x110000));
}

/// How to check an entire string
@safe pure unittest
{
    import std.algorithm.searching : find;
    import std.utf : byDchar;

    // Look for the first code point that ASCII cannot represent.
    auto unencodable = "The quick brown fox"
        .byDchar
        .find!(x => !canEncode!AsciiChar(x));
    assert(unencodable.empty);
}
1591
/**
 Returns true if $(D c) is a legal code unit of its encoding. For example,
 the byte 0x80 is not legal in ASCII, because ASCII code units must always
 be in the range 0x00 to 0x7F.

 The encoding is selected by the type of $(D c).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code unit to be tested
 */
bool isValidCodeUnit(E)(E c)
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.isValidCodeUnit(c);
}

///
@system pure unittest
{
    // UTF-8: 0xC0 and 0xFF never occur.
    assert(!isValidCodeUnit(cast(char) 0xC0));
    assert(!isValidCodeUnit(cast(char) 0xFF));

    // A surrogate is a legal UTF-16 code unit but not a legal UTF-32 one.
    assert(isValidCodeUnit(cast(wchar) 0xD800));
    assert(!isValidCodeUnit(cast(dchar) 0xD800));

    assert(!isValidCodeUnit(cast(AsciiChar) 0xA0));

    // 0x80 is assigned in both Windows code pages, 0x81 in neither.
    assert(isValidCodeUnit(cast(Windows1250Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1250Char) 0x81));
    assert(isValidCodeUnit(cast(Windows1252Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1252Char) 0x81));
}
1621
/**
 Returns true if the string is encoded correctly, i.e. if its longest valid
 prefix is the whole string.

 Supersedes:
 This function supersedes std.utf.validate(), however note that this
 function returns a bool indicating whether the input was valid or not,
 whereas the older function would throw an exception.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be tested
 */
bool isValid(E)(const(E)[] s)
{
    immutable validPrefix = validLength(s);
    return validPrefix == s.length;
}

///
@system pure unittest
{
    assert(isValid("\u20AC100"));
    assert(!isValid(cast(char[3])[167, 133, 175]));
}
1647
/**
 Returns the length, in code units, of the longest substring which starts
 at the first code unit and is validly encoded.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be tested
 */
size_t validLength(E)(const(E)[] s)
{
    size_t consumed = 0;
    while (s.length != 0)
    {
        // safeDecode advances s; remember where this sequence started so
        // that only successfully decoded sequences are counted.
        immutable remainingBefore = s.length;
        if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE)
            break;
        consumed += remainingBefore - s.length;
    }
    return consumed;
}
1669
/**
 Sanitizes a string by replacing malformed code unit sequences with valid
 code unit sequences. The result is guaranteed to be valid for this encoding.

 If the input string is already valid, this function returns the original.
 Otherwise it constructs a new string in which every illegal code unit
 sequence is replaced with the encoding's replacement sequence: the Unicode
 replacement character (U+FFFD) if the character repertoire contains it,
 otherwise '?'.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be sanitized
 */
immutable(E)[] sanitize(E)(immutable(E)[] s)
{
    alias Encoder = EncoderInstance!(E);

    immutable validPrefix = validLength(s);
    if (validPrefix == s.length)
        return s;

    auto repSeq = Encoder.replacementSequence;

    // First pass: compute an upper bound for the result length. Every
    // invalid sequence adds one replacement sequence; the code units it
    // occupied are not subtracted, so this may overestimate.
    size_t capacity = s.length;
    for (const(E)[] rest = s[validPrefix .. $]; rest.length != 0; )
    {
        immutable c = Encoder.safeDecode(rest);
        assert(c == INVALID_SEQUENCE);
        capacity += repSeq.length;
        rest = rest[validLength(rest) .. $];
    }

    // Second pass: copy the valid runs and substitute the invalid sequences.
    E[] buffer = new E[capacity];
    buffer[0 .. validPrefix] = s[0 .. validPrefix];
    size_t pos = validPrefix;

    for (const(E)[] rest = s[validPrefix .. $]; rest.length != 0; )
    {
        // rest always starts at an invalid sequence here; drop it ...
        immutable c = Encoder.safeDecode(rest);
        assert(c == INVALID_SEQUENCE);
        buffer[pos .. pos + repSeq.length] = repSeq[];
        pos += repSeq.length;

        // ... then copy the valid run that follows it.
        immutable run = validLength(rest);
        buffer[pos .. pos + run] = rest[0 .. run];
        pos += run;
        rest = rest[run .. $];
    }
    return cast(immutable(E)[]) buffer[0 .. pos];
}

///
@system pure unittest
{
    assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld");
}
1731
/**
 Returns the length, in code units, of the first encoded sequence.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract, which checks that the
 string is non-empty and that its first sequence decodes.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be sliced
 */
size_t firstSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    const(E)[] probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
body
{
    // skip advances s past one sequence; the difference is its length.
    immutable lengthBefore = s.length;
    EncoderInstance!(E).skip(s);
    return lengthBefore - s.length;
}

///
@system pure unittest
{
    assert(firstSequence("\u20AC1000") == "\u20AC".length);
    assert(firstSequence("hel") == "h".length);
}
1764
/**
 Returns the length, in code units, of the last encoded sequence.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract, which checks that the
 string is non-empty and valid.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be sliced
 */
size_t lastSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    // decodeReverse removes one sequence from the end of s; the difference
    // in length is the size of that sequence.
    immutable lengthBefore = s.length;
    EncoderInstance!(E).decodeReverse(s);
    return lengthBefore - s.length;
}

///
@system pure unittest
{
    assert(lastSequence("1000\u20AC") == "\u20AC".length);
    // U+00F6 (o with diaeresis) is written as an escape so that the example
    // does not depend on how the source file itself is encoded.
    assert(lastSequence("hell\u00F6") == "\u00F6".length);
}
1796
/**
 Returns the array index at which the (n+1)th code point begins.

 The input to this function MUST be validly encoded, and $(D n) must not be
 negative. This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.toUTFindex().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be counted
    n = the current code point index
 */
ptrdiff_t index(E)(const(E)[] s,int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
body
{
    immutable lengthBefore = s.length;
    // Iterate over a signed range rather than comparing an unsigned counter
    // with n: when contracts are compiled out, a negative n then skips
    // nothing instead of being reinterpreted as a huge unsigned count.
    foreach (_; 0 .. n)
        EncoderInstance!(E).skip(s);
    return lengthBefore - s.length;
}

///
@system pure unittest
{
    assert(index("\u20AC100",1) == 3);
    // U+00E4 (a with diaeresis) occupies two UTF-8 code units; it is written
    // as an escape so that the example does not depend on how the source
    // file itself is encoded.
    assert(index("h\u00E4llo",2) == 3);
}
1832
/**
 Decodes a single code point.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract, which checks that the
 string is non-empty and that its first sequence decodes.

 Supersedes:
 This function supersedes std.utf.decode(), however, note that the
 function codePoints() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded
 */
dchar decode(S)(ref S s)
in
{
    assert(s.length != 0);
    // Validate on a copy so that s itself is left untouched by the check.
    auto probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
body
{
    // The encoding is selected by the element type of the string.
    alias CodeUnit = typeof(s[0]);
    return EncoderInstance!(CodeUnit).decode(s);
}
1863
/**
 Decodes a single code point from the end of a string.

 This function removes one or more code units from the end of a string,
 and returns the decoded code point which those code units represent.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract, which checks that the
 string is non-empty and valid.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string whose last code point is to be decoded
 */
dchar decodeReverse(E)(ref const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.decodeReverse(s);
}
1889
/**
 Decodes a single code point. The input does not have to be valid.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 This function will accept an invalidly encoded string as input.
 If an invalid sequence is found at the start of the string, this
 function will remove it, and return the value INVALID_SEQUENCE.

 The string must not be empty; this is enforced by the function's
 in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded
 */
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
body
{
    // The encoding is selected by the element type of the string.
    alias CodeUnit = typeof(s[0]);
    return EncoderInstance!(CodeUnit).safeDecode(s);
}
1915
/**
 Returns the number of code units required to encode a single code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
size_t encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.encodedLength(c);
}
1940
/**
 Encodes a single code point.

 This function encodes a single code point into one or more code units.
 It returns an array containing those code units.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.encode(c);
}
1972
/**
 Encodes a single code point into an array.

 This function encodes a single code point into one or more code units.
 The code units are stored in a user-supplied array.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c     = the code point to be encoded
    array = the destination array

 Returns:
          the number of code units written to the array
 */
size_t encode(E)(dchar c, E[] array)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // The encoder advances its slice past what it wrote, so the amount by
    // which the slice shrank is the number of code units written.
    E[] remaining = array;
    EncoderInstance!(E).encode(c, remaining);
    return array.length - remaining.length;
}
2011
/*
Encodes $(D c) in units of type $(D E) and writes the result to the
output range $(D range). Returns the number of $(D E)s written.

Only the native UTF code unit types are handled: $(D char) (UTF-8),
$(D wchar) (UTF-16) and $(D dchar) (UTF-32). Any other $(D E) is rejected
at compile time.

Unlike the other overloads, this one has no in-contract. The $(D char)
branch stops with $(D assert(0)) for values above U+10FFFF; the $(D wchar)
and $(D dchar) branches do not check $(D c).

Every code unit is emitted through the free function $(D put(range, x))
rather than UFCS-style $(D range.put(x)), so that all branches go through
the same output range primitive.
 */
size_t encode(E, R)(dchar c, auto ref R range)
if (isNativeOutputRange!(R, E))
{
    static if (is(Unqual!E == char))
    {
        // UTF-8: one to four code units, depending on the magnitude of c.
        if (c <= 0x7F)
        {
            put(range, cast(char) c);
            return 1;
        }
        if (c <= 0x7FF)
        {
            put(range, cast(char)(0xC0 | (c >> 6)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 2;
        }
        if (c <= 0xFFFF)
        {
            put(range, cast(char)(0xE0 | (c >> 12)));
            put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 3;
        }
        if (c <= 0x10FFFF)
        {
            put(range, cast(char)(0xF0 | (c >> 18)));
            put(range, cast(char)(0x80 | ((c >> 12) & 0x3F)));
            put(range, cast(char)(0x80 | ((c >> 6) & 0x3F)));
            put(range, cast(char)(0x80 | (c & 0x3F)));
            return 4;
        }
        else
        {
            // Values above U+10FFFF cannot be encoded.
            assert(0);
        }
    }
    else static if (is(Unqual!E == wchar))
    {
        // UTF-16: a single unit up to U+FFFF, otherwise a surrogate pair.
        if (c <= 0xFFFF)
        {
            put(range, cast(wchar) c);
            return 1;
        }
        put(range, cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800));
        put(range, cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00));
        return 2;
    }
    else static if (is(Unqual!E == dchar))
    {
        // UTF-32: the code point is its own single code unit.
        put(range, c);
        return 1;
    }
    else
    {
        static assert(0);
    }
}
2073
@safe pure unittest
{
    import std.array;

    // An ASCII character needs exactly one code unit, whichever of the three
    // native code unit types is requested.
    Appender!(char[]) sink;
    immutable asChar = encode!(char)('T', sink);
    immutable asWchar = encode!(wchar)('T', sink);
    immutable asDchar = encode!(dchar)('T', sink);
    assert(asChar == 1);
    assert(asWchar == 1);
    assert(asDchar == 1);
}
2082
/**
 Encodes a single code point to a delegate.

 This function encodes a single code point into one or more code units
 and hands them, one at a time, to the supplied delegate.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c  = the code point to be encoded
    dg = the delegate to invoke for each code unit
 */
void encode(E)(dchar c, void delegate(E) dg)
in
{
    assert(c.isValidCodePoint);
}
body
{
    // Delegate to the encoder selected by the code unit type.
    alias Encoder = EncoderInstance!E;
    Encoder.encode(c, dg);
}
2115
/**
Encodes the contents of $(D s) in units of type $(D Tgt), writing the result to an
output range.

The input is decoded one code point at a time, so a code point which spans
several code units of type $(D Src) is re-encoded as a single code point
instead of each of its code units being treated as a code point of its own.
$(D s) is handed to $(LREF decode) and therefore has to be validly encoded.

Returns: The number of $(D Tgt) elements written.
Params:
Tgt = Element type of $(D range).
s = Input array.
range = Output range.
 */
size_t encode(Tgt, Src, R)(in Src[] s, R range)
{
    size_t result;
    // decode() shortens the slice it is given, so iterate over a rebindable
    // view of the input rather than over the parameter itself.
    const(Src)[] rest = s;
    while (rest.length != 0)
    {
        result += encode!(Tgt)(decode(rest), range);
    }
    return result;
}
2135
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code points in a string.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 You can foreach either
 with or without an index. If an index is specified, it will be initialized
 at each iteration with the offset into the string at which the code point
 begins.

 Supersedes:
 This function supersedes std.utf.decode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = the string to be decoded

 Example:
 --------------------------------------------------------
 string s = "hello world";
 foreach (c;codePoints(s))
 {
     // do something with c (which will always be a dchar)
 }
 --------------------------------------------------------

 Note that, currently, foreach (c;codePoints(s)) is superior to foreach (c;s)
 in that the latter will fall over on encountering U+FFFF.
 */
CodePoints!(E) codePoints(E)(immutable(E)[] s)
in
{
    assert(s.isValid);
}
body
{
    // Wrap the string in the iterable struct; nothing is decoded here.
    alias Iterable = CodePoints!E;
    return Iterable(s);
}
2178
///
@system unittest
{
    string original = "hello";
    string rebuilt;
    // Every code point of an ASCII string fits into a single char.
    foreach (codePoint; codePoints(original))
        rebuilt ~= cast(char) codePoint;
    assert(original == rebuilt);
}
2190
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code units in a code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding type in the template parameter.

 Supersedes:
 This function supersedes std.utf.encode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    assert(c.isValidCodePoint);
}
body
{
    // Wrap the code point in the iterable struct for the requested encoding.
    alias Iterable = CodeUnits!E;
    return Iterable(c);
}
2219
///
@system unittest
{
    char[] units;
    foreach (unit; codeUnits!char(cast(dchar) '\u20AC'))
        units ~= unit;

    // U+20AC (EURO SIGN) takes three UTF-8 code units: E2 82 AC.
    assert(units.length == 3);
    assert(units == "\xE2\x82\xAC");
}
2233
/**
 Convert a string from one encoding to another.

 If source and destination have the same immutable element type, the
 destination simply refers to the source. Otherwise the source is decoded
 code point by code point and re-encoded into a newly allocated array.

 Supersedes:
 This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
 std.utf.toUTF32()
 (but note that to!() supersedes it more conveniently).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1252

 Params:
    s = Source string. $(B Must) be validly encoded.
        This is enforced by the function's in-contract.
    r = Destination string

 See_Also:
    $(REF to, std,conv)
 */
void transcode(Src, Dst)(Src[] s, out Dst[] r)
in
{
    assert(s.isValid);
}
body
{
    static if (is(Src == Dst) && is(Src == immutable))
    {
        // Identical immutable element types: share the data, no copy needed.
        r = s;
    }
    else static if (is(Unqual!Src == AsciiChar))
    {
        // ASCII input is re-read as char data and sent through the generic path.
        transcode(cast(const(char)[])s, r);
    }
    else
    {
        alias Unit = Unqual!Dst;

        // Free space required in the output before the next code point is
        // encoded.
        static if (is(Unit == wchar))
            enum size_t headroom = 2;
        else static if (is(Unit == dchar))
            enum size_t headroom = 1;
        else
            enum size_t headroom = 6;

        auto storage = new Unit[s.length];
        auto spare = storage;   // the part of storage not yet written to

        while (s.length != 0)
        {
            if (spare.length < headroom)
            {
                // Grow by the remaining input length plus the headroom, then
                // re-derive the unwritten tail, since growing may reallocate.
                immutable used = storage.length - spare.length;
                storage.length += s.length + headroom;
                spare = storage[used .. $];
            }
            // decode() consumes the front of s, encode() the front of spare.
            EncoderInstance!Unit.encode(decode(s), spare);
        }

        r = cast(Dst[]) storage[0 .. storage.length - spare.length];
    }
}
2300
///
@system pure unittest
{
    wstring ws;
    // transcode from UTF-8 to UTF-16
    transcode("hello world",ws);
    assert(ws == "hello world"w);

    Latin1String ls;
    // transcode from UTF-16 to ISO-8859-1
    transcode(ws, ls);
    // check the Latin-1 result itself: the text is pure ASCII, so every
    // Latin-1 code unit has the same value as its UTF-16 counterpart
    assert(ls.length == ws.length);
    foreach (i, unit; ls)
        assert(unit == ws[i]);
}
2314
@system pure unittest
{
    import std.meta;
    import std.range;
    {
        import std.conv : to;

        // Text consisting of ASCII characters only, so it can be represented
        // in every supported encoding.
        string asciiCharString = to!string(iota(0, 128, 1));

        alias Types = AliasSeq!(string, Latin1String, Latin2String, AsciiString,
            Windows1250String, Windows1252String, dstring, wstring);
        // Round trip through every pair of encodings: UTF-8 -> S -> D -> UTF-8.
        foreach (S; Types)
            foreach (D; Types)
            {
                string str;
                S sStr;
                D dStr;
                transcode(asciiCharString, sStr);
                transcode(sStr, dStr);
                transcode(dStr, str);
                assert(asciiCharString == str);
            }
    }
    {
        // Czech pangram with many non-ASCII letters; only the UTF encodings
        // are exercised with it.
        string czechChars = "Příliš žluťoučký kůň úpěl ďábelské ódy.";
        alias Types = AliasSeq!(string, dstring, wstring);
        foreach (S; Types)
            foreach (D; Types)
            {
                string str;
                S sStr;
                D dStr;
                transcode(czechChars, sStr);
                transcode(sStr, dStr);
                transcode(dStr, str);
                assert(czechChars == str);
            }
    }
}
2354
@system unittest // mutable/const input/output
{
    import std.meta : AliasSeq;

    foreach (O; AliasSeq!(Latin1Char, const Latin1Char, immutable Latin1Char))
    {
        O[] output;

        // U+00E4 (a with diaeresis) is 0xE4 in Latin-1.
        char[] mutableInput = "äbc".dup;
        transcode(mutableInput, output);
        assert(output == [0xE4, 'b', 'c']);

        // U+00F6 (o with diaeresis) is 0xF6 in Latin-1.
        const char[] constInput = "öbc";
        transcode(constInput, output);
        assert(output == [0xF6, 'b', 'c']);

        // U+00FC (u with diaeresis) is 0xFC in Latin-1.
        immutable char[] immutInput = "übc";
        transcode(immutInput, output);
        assert(output == [0xFC, 'b', 'c']);
    }

    // Make sure that const/mutable input is copied.
    foreach (C; AliasSeq!(char, const char))
    {
        C[] input = "foo".dup;
        C[] output;
        transcode(input, output);
        assert(input == output);
        assert(input !is output);
    }

    // But immutable input should not be copied.
    string input = "foo";
    string output;
    transcode(input, output);
    assert(input is output);
}
2392
2393//=============================================================================
2394
/** The base class for exceptions thrown by this module */
class EncodingException : Exception
{
    // Passes the message on to Exception unchanged.
    this(string msg) @safe pure
    {
        super(msg);
    }
}
2397
class UnrecognizedEncodingException : EncodingException
{
    // The constructor is private and just passes the message on to
    // EncodingException.
    private this(string msg) @safe pure
    {
        super(msg);
    }
}
2402
/** Abstract base class of all encoding schemes */
abstract class EncodingScheme
{
    import std.uni : toLower;

    /**
     * Registers a subclass of EncodingScheme.
     *
     * This function allows user-defined subclasses of EncodingScheme to
     * be declared in other modules.
     *
     * Params:
     *     Klass = The subclass of EncodingScheme to register.
     *
     * Example:
     * ----------------------------------------------
     * class Amiga1251 : EncodingScheme
     * {
     *     shared static this()
     *     {
     *         EncodingScheme.register!Amiga1251;
     *     }
     * }
     * ----------------------------------------------
     */
    static void register(Klass:EncodingScheme)()
    {
        // A temporary instance is only needed to ask for the scheme's names;
        // each name (lower-cased) is mapped to a factory for fresh instances.
        scope scheme = new Klass();
        foreach (encodingName;scheme.names())
        {
            supported[toLower(encodingName)] = () => new Klass();
        }
    }

    deprecated("Please pass the EncodingScheme subclass as template argument instead.")
    static void register(string className)
    {
        auto scheme = createByClassName(className);
        if (scheme is null)
            throw new EncodingException("Unable to create class "~className);
        foreach (encodingName;scheme.names())
        {
            supportedFactories[toLower(encodingName)] = className;
        }
    }

    // Looks up the class called className at run time and default-constructs
    // it. Yields null if no such class is found, if it cannot be created, or
    // if it is not an EncodingScheme; callers turn that into an exception.
    private static EncodingScheme createByClassName(string className)
    {
        auto info = ClassInfo.find(className);
        if (info is null)
            return null;
        return cast(EncodingScheme) info.create();
    }

    /**
     * Obtains a subclass of EncodingScheme which is capable of encoding
     * and decoding the named encoding scheme.
     *
     * This function is only aware of EncodingSchemes which have been
     * registered with the register() function.
     *
     * The lookup is case-insensitive: the name is lower-cased before it is
     * looked up.
     *
     * Throws:
     *     EncodingException if the name is not registered or the registered
     *     class cannot be instantiated.
     *
     * Example:
     * ---------------------------------------------------
     * auto scheme = EncodingScheme.create("Amiga-1251");
     * ---------------------------------------------------
     */
    static EncodingScheme create(string encodingName)
    {
        static bool registerDefaultEncodings()
        {
            EncodingScheme.register!EncodingSchemeASCII;
            EncodingScheme.register!EncodingSchemeLatin1;
            EncodingScheme.register!EncodingSchemeLatin2;
            EncodingScheme.register!EncodingSchemeWindows1250;
            EncodingScheme.register!EncodingSchemeWindows1252;
            EncodingScheme.register!EncodingSchemeUtf8;
            EncodingScheme.register!EncodingSchemeUtf16Native;
            EncodingScheme.register!EncodingSchemeUtf32Native;
            return true;
        }

        // The built-in schemes are registered on the first call only.
        static shared bool initialized;
        import std.concurrency : initOnce;
        initOnce!initialized(registerDefaultEncodings());
        encodingName = toLower(encodingName);

        // Schemes registered through register!Klass come with a factory.
        if (auto p = encodingName in supported)
            return (*p)();

        // Schemes registered by class name are instantiated by name.
        auto p = encodingName in supportedFactories;
        if (p is null)
            throw new EncodingException("Unrecognized Encoding: "~encodingName);
        string className = *p;
        auto scheme = createByClassName(className);
        if (scheme is null) throw new EncodingException("Unable to create class "~className);
        return scheme;
    }

    const
    {
        /**
         * Returns the standard name of the encoding scheme
         */
        abstract override string toString();

        /**
         * Returns an array of all known names for this encoding scheme
         */
        abstract string[] names();

        /**
         * Returns true if the character c can be represented
         * in this encoding scheme.
         */
        abstract bool canEncode(dchar c);

        /**
         * Returns the number of ubytes required to encode this code point.
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c = the code point to be encoded
         *
         * Returns:
         *    the number of ubytes required.
         */
        abstract size_t encodedLength(dchar c);

        /**
         * Encodes a single code point into a user-supplied, fixed-size buffer.
         *
         * This function encodes a single code point into one or more ubytes.
         * The supplied buffer must be code unit aligned.
         * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
         * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c      = the code point to be encoded
         *    buffer = the destination array
         *
         * Returns:
         *    the number of ubytes written.
         */
        abstract size_t encode(dchar c, ubyte[] buffer);

        /**
         * Decodes a single code point.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * The input to this function MUST be validly encoded.
         *
         * Params:
         *    s = the array whose first code point is to be decoded
         */
        abstract dchar decode(ref const(ubyte)[] s);

        /**
         * Decodes a single code point. The input does not have to be valid.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * This function will accept an invalidly encoded array as input.
         * If an invalid sequence is found at the start of the string, this
         * function will remove it, and return the value INVALID_SEQUENCE.
         *
         * Params:
         *    s = the array whose first code point is to be decoded
         */
        abstract dchar safeDecode(ref const(ubyte)[] s);

        /**
         * Returns the sequence of ubytes to be used to represent
         * any character which cannot be represented in the encoding scheme.
         *
         * Normally this will be a representation of some substitution
         * character, such as U+FFFD or '?'.
         */
        abstract @property immutable(ubyte)[] replacementSequence();
    }

    /**
     * Returns true if the array is encoded correctly
     *
     * Params:
     *    s = the array to be tested
     */
    bool isValid(const(ubyte)[] s)
    {
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE)
                return false;
        }
        return true;
    }

    /**
     * Returns the length of the longest possible substring, starting from
     * the first element, which is validly encoded.
     *
     * Params:
     *    s = the array to be tested
     */
    size_t validLength()(const(ubyte)[] s)
    {
        const(ubyte)[] r = s;
        const(ubyte)[] t = s;
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE) break;
            // t trails s: it marks the end of the last sequence that was valid.
            t = s;
        }
        return r.length - t.length;
    }

    /**
     * Sanitizes an array by replacing malformed ubyte sequences with valid
     * ubyte sequences. The result is guaranteed to be valid for this
     * encoding scheme.
     *
     * If the input array is already valid, this function returns the
     * original, otherwise it constructs a new array by replacing all illegal
     * sequences with the encoding scheme's replacement sequence.
     *
     * Params:
     *    s = the string to be sanitized
     */
    immutable(ubyte)[] sanitize()(immutable(ubyte)[] s)
    {
        auto n = validLength(s);
        if (n == s.length) return s;

        auto repSeq = replacementSequence;

        // Count how long the string needs to be.
        // Overestimating is not a problem
        auto len = s.length;
        const(ubyte)[] t = s[n..$];
        while (t.length != 0)
        {
            immutable c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            len += repSeq.length;
            t = t[validLength(t)..$];
        }

        // Now do the write
        ubyte[] array = new ubyte[len];
        array[0 .. n] = s[0 .. n];
        auto offset = n;

        t = s[n..$];
        while (t.length != 0)
        {
            // t starts at an invalid sequence here: drop it, emit the
            // replacement, then copy the valid run which follows it.
            immutable c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            array[offset .. offset+repSeq.length] = repSeq[];
            offset += repSeq.length;
            n = validLength(t);
            array[offset .. offset+n] = t[0 .. n];
            offset += n;
            t = t[n..$];
        }
        return cast(immutable(ubyte)[])array[0 .. offset];
    }

    /**
     * Returns the length of the first encoded sequence.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the array to be sliced
     */
    size_t firstSequence()(const(ubyte)[] s)
    in
    {
        assert(s.length != 0);
        const(ubyte)[] u = s;
        assert(safeDecode(u) != INVALID_SEQUENCE);
    }
    body
    {
        const(ubyte)[] t = s;
        decode(s);
        return t.length - s.length;
    }

    /**
     * Returns the total number of code points encoded in a ubyte array.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the string to be counted
     */
    size_t count()(const(ubyte)[] s)
    in
    {
        assert(isValid(s));
    }
    body
    {
        size_t n = 0;
        while (s.length != 0)
        {
            decode(s);
            ++n;
        }
        return n;
    }

    /**
     * Returns the array index at which the (n+1)th code point begins.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the array to be indexed
     *    n = the number of code points to skip from the start of s
     */
    ptrdiff_t index()(const(ubyte)[] s, size_t n)
    in
    {
        // n is unsigned, so there is no lower bound to check.
        assert(isValid(s));
    }
    body
    {
        const(ubyte)[] t = s;
        for (size_t i=0; i<n; ++i) decode(s);
        return t.length - s.length;
    }

    // Factories keyed by lower-cased encoding name; filled by register!Klass.
    __gshared EncodingScheme function()[string] supported;
    // Class names keyed by lower-cased encoding name; filled by the
    // deprecated register(string).
    __gshared string[string] supportedFactories;
}
2741
/**
 EncodingScheme to handle ASCII

 This scheme recognises the following names:
                 "ANSI_X3.4-1968",
                 "ANSI_X3.4-1986",
                 "ASCII",
                 "IBM367",
                 "ISO646-US",
                 "ISO_646.irv:1991",
                 "US-ASCII",
                 "cp367",
                 "csASCII",
                 "iso-ir-6",
                 "us"
 */
class EncodingSchemeASCII : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeASCII");
    }*/

    const
    {
        // All names under which this scheme can be looked up.
        override string[] names() @safe pure nothrow
        {
            string[] aliases =
            [
                "ANSI_X3.4-1968",
                "ANSI_X3.4-1986",
                "ASCII",
                "IBM367",
                "ISO646-US",
                "ISO_646.irv:1991",
                "US-ASCII",
                "cp367",
                "csASCII",
                "iso-ir-6",
                "us"
            ];
            return aliases;
        }

        // The standard name of the scheme.
        override string toString() @safe pure nothrow @nogc
        {
            return "ASCII";
        }

        // The methods below forward to the module-level templates of the same
        // name, instantiated for (or called with) AsciiChar code units.

        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!AsciiChar(c);
        }

        override size_t encodedLength(dchar c)  @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!AsciiChar(c);
        }

        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            // View the byte buffer as ASCII code units and encode into it.
            return std.encoding.encode(c, cast(AsciiChar[]) buffer);
        }

        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto rest = cast(const(AsciiChar)[]) s;
            immutable dchar decoded = std.encoding.decode(rest);
            // Drop from s exactly what the decoder consumed from its typed view.
            s = s[s.length - rest.length .. s.length];
            return decoded;
        }

        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto rest = cast(const(AsciiChar)[]) s;
            immutable dchar decoded = std.encoding.safeDecode(rest);
            // Drop from s exactly what the decoder consumed from its typed view.
            s = s[s.length - rest.length .. s.length];
            return decoded;
        }

        // Characters which cannot be represented are replaced by '?'.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            immutable(ubyte)[] questionMark = cast(immutable(ubyte)[])"?";
            return questionMark;
        }
    }
}
2829
/**
 EncodingScheme to handle Latin-1

 This scheme recognises the following names:
                 "CP819",
                 "IBM819",
                 "ISO-8859-1",
                 "ISO_8859-1",
                 "ISO_8859-1:1987",
                 "csISOLatin1",
                 "iso-ir-100",
                 "l1",
                 "latin1"
 */
class EncodingSchemeLatin1 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeLatin1");
    }*/

    const
    {
        // All names under which this scheme can be looked up.
        override string[] names() @safe pure nothrow
        {
            string[] aliases =
            [
                "CP819",
                "IBM819",
                "ISO-8859-1",
                "ISO_8859-1",
                "ISO_8859-1:1987",
                "csISOLatin1",
                "iso-ir-100",
                "l1",
                "latin1"
            ];
            return aliases;
        }

        // The standard name of the scheme.
        override string toString() @safe pure nothrow @nogc
        {
            return "ISO-8859-1";
        }

        // The methods below forward to the module-level templates of the same
        // name, instantiated for (or called with) Latin1Char code units.

        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!Latin1Char(c);
        }

        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!Latin1Char(c);
        }

        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            // View the byte buffer as Latin-1 code units and encode into it.
            return std.encoding.encode(c, cast(Latin1Char[]) buffer);
        }

        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto rest = cast(const(Latin1Char)[]) s;
            immutable dchar decoded = std.encoding.decode(rest);
            // Drop from s exactly what the decoder consumed from its typed view.
            s = s[s.length - rest.length .. s.length];
            return decoded;
        }

        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto rest = cast(const(Latin1Char)[]) s;
            immutable dchar decoded = std.encoding.safeDecode(rest);
            // Drop from s exactly what the decoder consumed from its typed view.
            s = s[s.length - rest.length .. s.length];
            return decoded;
        }

        // Characters which cannot be represented are replaced by '?'.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            immutable(ubyte)[] questionMark = cast(immutable(ubyte)[])"?";
            return questionMark;
        }
    }
}
2913
/**
 EncodingScheme to handle Latin-2

 This scheme recognises the following names:
                 "Latin 2",
                 "ISO-8859-2",
                 "ISO_8859-2",
                 "ISO_8859-2:1999",
                 "windows-28592"
 */
class EncodingSchemeLatin2 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeLatin2");
    }*/

    const
    {
        // All names under which this scheme can be looked up.
        override string[] names() @safe pure nothrow
        {
            string[] aliases =
            [
                "Latin 2",
                "ISO-8859-2",
                "ISO_8859-2",
                "ISO_8859-2:1999",
                "windows-28592"
            ];
            return aliases;
        }

        // The standard name of the scheme.
        override string toString() @safe pure nothrow @nogc
        {
            return "ISO-8859-2";
        }

        // The methods below forward to the module-level templates of the same
        // name, instantiated for (or called with) Latin2Char code units.

        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!Latin2Char(c);
        }

        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!Latin2Char(c);
        }

        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            // View the byte buffer as Latin-2 code units and encode into it.
            return std.encoding.encode(c, cast(Latin2Char[]) buffer);
        }

        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto rest = cast(const(Latin2Char)[]) s;
            immutable dchar decoded = std.encoding.decode(rest);
            // Drop from s exactly what the decoder consumed from its typed view.
            s = s[s.length - rest.length .. s.length];
            return decoded;
        }

        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto rest = cast(const(Latin2Char)[]) s;
            immutable dchar decoded = std.encoding.safeDecode(rest);
            // Drop from s exactly what the decoder consumed from its typed view.
            s = s[s.length - rest.length .. s.length];
            return decoded;
        }

        // Characters which cannot be represented are replaced by '?'.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            immutable(ubyte)[] questionMark = cast(immutable(ubyte)[])"?";
            return questionMark;
        }
    }
}
2989
/**
 EncodingScheme to handle Windows-1250

 This scheme recognises the following names:
                 "windows-1250"
 */
class EncodingSchemeWindows1250 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeWindows1250");
    }*/

    const
    {
        // All names under which this scheme can be looked up.
        override string[] names() @safe pure nothrow
        {
            string[] aliases =
            [
                "windows-1250"
            ];
            return aliases;
        }

        // The standard name of the scheme.
        override string toString() @safe pure nothrow @nogc
        {
            return "windows-1250";
        }

        // The methods below forward to the module-level templates of the same
        // name, instantiated for (or called with) Windows1250Char code units.

        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!Windows1250Char(c);
        }

        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.encodedLength!Windows1250Char(c);
        }

        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            // View the byte buffer as Windows-1250 code units and encode into it.
            return std.encoding.encode(c, cast(Windows1250Char[]) buffer);
        }

        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto rest = cast(const(Windows1250Char)[]) s;
            immutable dchar decoded = std.encoding.decode(rest);
            // Drop from s exactly what the decoder consumed from its typed view.
            s = s[s.length - rest.length .. s.length];
            return decoded;
        }

        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        {
            auto rest = cast(const(Windows1250Char)[]) s;
            immutable dchar decoded = std.encoding.safeDecode(rest);
            // Drop from s exactly what the decoder consumed from its typed view.
            s = s[s.length - rest.length .. s.length];
            return decoded;
        }

        // Characters which cannot be represented are replaced by '?'.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            immutable(ubyte)[] questionMark = cast(immutable(ubyte)[])"?";
            return questionMark;
        }
    }
}
3057
3058/**
3059 EncodingScheme to handle Windows-1252
3060
3061 This scheme recognises the following names:
3062                 "windows-1252"
3063 */
3064class EncodingSchemeWindows1252 : EncodingScheme
3065{
3066    /* // moved to std.internal.phobosinit
3067    shared static this()
3068    {
3069        EncodingScheme.register("std.encoding.EncodingSchemeWindows1252");
3070    }*/
3071
3072    const
3073    {
3074        override string[] names() @safe pure nothrow
3075        {
3076            return
3077            [
3078                "windows-1252"
3079            ];
3080        }
3081
3082        override string toString() @safe pure nothrow @nogc
3083        {
3084            return "windows-1252";
3085        }
3086
3087        override bool canEncode(dchar c) @safe pure nothrow @nogc
3088        {
3089            return std.encoding.canEncode!(Windows1252Char)(c);
3090        }
3091
3092        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
3093        {
3094            return std.encoding.encodedLength!(Windows1252Char)(c);
3095        }
3096
3097        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
3098        {
3099            auto r = cast(Windows1252Char[]) buffer;
3100            return std.encoding.encode(c,r);
3101        }
3102
3103        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
3104        {
3105            auto t = cast(const(Windows1252Char)[]) s;
3106            dchar c = std.encoding.decode(t);
3107            s = s[$-t.length..$];
3108            return c;
3109        }
3110
3111        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
3112        {
3113            auto t = cast(const(Windows1252Char)[]) s;
3114            dchar c = std.encoding.safeDecode(t);
3115            s = s[$-t.length..$];
3116            return c;
3117        }
3118
3119        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
3120        {
3121            return cast(immutable(ubyte)[])"?";
3122        }
3123    }
3124}
3125
3126/**
3127 EncodingScheme to handle UTF-8
3128
3129 This scheme recognises the following names:
3130                 "UTF-8"
3131 */
3132class EncodingSchemeUtf8 : EncodingScheme
3133{
3134    /* // moved to std.internal.phobosinit
3135    shared static this()
3136    {
3137        EncodingScheme.register("std.encoding.EncodingSchemeUtf8");
3138    }*/
3139
3140    const
3141    {
3142        override string[] names() @safe pure nothrow
3143        {
3144            return
3145            [
3146                "UTF-8"
3147            ];
3148        }
3149
3150        override string toString() @safe pure nothrow @nogc
3151        {
3152            return "UTF-8";
3153        }
3154
3155        override bool canEncode(dchar c) @safe pure nothrow @nogc
3156        {
3157            return std.encoding.canEncode!(char)(c);
3158        }
3159
3160        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
3161        {
3162            return std.encoding.encodedLength!(char)(c);
3163        }
3164
3165        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
3166        {
3167            auto r = cast(char[]) buffer;
3168            return std.encoding.encode(c,r);
3169        }
3170
3171        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
3172        {
3173            auto t = cast(const(char)[]) s;
3174            dchar c = std.encoding.decode(t);
3175            s = s[$-t.length..$];
3176            return c;
3177        }
3178
3179        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
3180        {
3181            auto t = cast(const(char)[]) s;
3182            dchar c = std.encoding.safeDecode(t);
3183            s = s[$-t.length..$];
3184            return c;
3185        }
3186
3187        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
3188        {
3189            return cast(immutable(ubyte)[])"\uFFFD";
3190        }
3191    }
3192}
3193
3194/**
3195 EncodingScheme to handle UTF-16 in native byte order
3196
3197 This scheme recognises the following names:
3198                 "UTF-16LE" (little-endian architecture only)
3199                 "UTF-16BE" (big-endian architecture only)
3200 */
3201class EncodingSchemeUtf16Native : EncodingScheme
3202{
3203    /* // moved to std.internal.phobosinit
3204    shared static this()
3205    {
3206        EncodingScheme.register("std.encoding.EncodingSchemeUtf16Native");
3207    }*/
3208
3209    const
3210    {
3211        version (LittleEndian) { enum string NAME = "UTF-16LE"; }
3212        version (BigEndian)    { enum string NAME = "UTF-16BE"; }
3213
3214        override string[] names() @safe pure nothrow
3215        {
3216            return [ NAME ];
3217        }
3218
3219        override string toString() @safe pure nothrow @nogc
3220        {
3221            return NAME;
3222        }
3223
3224        override bool canEncode(dchar c) @safe pure nothrow @nogc
3225        {
3226            return std.encoding.canEncode!(wchar)(c);
3227        }
3228
3229        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
3230        {
3231            return std.encoding.encodedLength!(wchar)(c);
3232        }
3233
3234        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
3235        {
3236            auto r = cast(wchar[]) buffer;
3237            return wchar.sizeof * std.encoding.encode(c,r);
3238        }
3239
3240        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
3241        in
3242        {
3243            assert((s.length & 1) == 0);
3244        }
3245        body
3246        {
3247            auto t = cast(const(wchar)[]) s;
3248            dchar c = std.encoding.decode(t);
3249            s = s[$-t.length * wchar.sizeof..$];
3250            return c;
3251        }
3252
3253        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
3254        in
3255        {
3256            assert((s.length & 1) == 0);
3257        }
3258        body
3259        {
3260            auto t = cast(const(wchar)[]) s;
3261            dchar c = std.encoding.safeDecode(t);
3262            s = s[$-t.length * wchar.sizeof..$];
3263            return c;
3264        }
3265
3266        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
3267        {
3268            return cast(immutable(ubyte)[])"\uFFFD"w;
3269        }
3270    }
3271}
@system unittest
{
    // Three UTF-16 code units (0x019A, 0x019B, 0x019C) laid out in the
    // byte order of the target.
    version (LittleEndian)
    {
        auto scheme = EncodingScheme.create("utf-16le");
        ubyte[6] encoded = [0x9A,0x01, 0x9B,0x01, 0x9C,0x01];
    }
    version (BigEndian)
    {
        auto scheme = EncodingScheme.create("utf-16be");
        ubyte[6] encoded = [0x01,0x9A, 0x01,0x9B, 0x01,0x9C];
    }

    const(ubyte)[] pending = cast(const(ubyte)[])encoded;
    immutable dchar first = scheme.safeDecode(pending);

    // The first code unit is decoded and exactly two bytes are consumed.
    assert(first == 0x19A);
    assert(pending.length == 4);
}
3289
3290/**
3291 EncodingScheme to handle UTF-32 in native byte order
3292
3293 This scheme recognises the following names:
3294                 "UTF-32LE" (little-endian architecture only)
3295                 "UTF-32BE" (big-endian architecture only)
3296 */
3297class EncodingSchemeUtf32Native : EncodingScheme
3298{
3299    /* // moved to std.internal.phobosinit
3300    shared static this()
3301    {
3302        EncodingScheme.register("std.encoding.EncodingSchemeUtf32Native");
3303    }*/
3304
3305    const
3306    {
3307        version (LittleEndian) { enum string NAME = "UTF-32LE"; }
3308        version (BigEndian)    { enum string NAME = "UTF-32BE"; }
3309
3310        override string[] names() @safe pure nothrow
3311        {
3312            return [ NAME ];
3313        }
3314
3315        override string toString() @safe pure nothrow @nogc
3316        {
3317            return NAME;
3318        }
3319
3320        override bool canEncode(dchar c) @safe pure nothrow @nogc
3321        {
3322            return std.encoding.canEncode!(dchar)(c);
3323        }
3324
3325        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
3326        {
3327            return std.encoding.encodedLength!(dchar)(c);
3328        }
3329
3330        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
3331        {
3332            auto r = cast(dchar[]) buffer;
3333            return dchar.sizeof * std.encoding.encode(c,r);
3334        }
3335
3336        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
3337        in
3338        {
3339            assert((s.length & 3) == 0);
3340        }
3341        body
3342        {
3343            auto t = cast(const(dchar)[]) s;
3344            dchar c = std.encoding.decode(t);
3345            s = s[$-t.length * dchar.sizeof..$];
3346            return c;
3347        }
3348
3349        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
3350        in
3351        {
3352            assert((s.length & 3) == 0);
3353        }
3354        body
3355        {
3356            auto t = cast(const(dchar)[]) s;
3357            dchar c = std.encoding.safeDecode(t);
3358            s = s[$-t.length * dchar.sizeof..$];
3359            return c;
3360        }
3361
3362        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
3363        {
3364            return cast(immutable(ubyte)[])"\uFFFD"d;
3365        }
3366    }
3367}
@system unittest
{
    // Three UTF-32 code units (0x019A, 0x019B, 0x019C) laid out in the
    // byte order of the target.
    version (LittleEndian)
    {
        auto scheme = EncodingScheme.create("utf-32le");
        ubyte[12] encoded = [0x9A,0x01,0x00,0x00, 0x9B,0x01,0x00,0x00, 0x9C,0x01,0x00,0x00];
    }
    version (BigEndian)
    {
        auto scheme = EncodingScheme.create("utf-32be");
        ubyte[12] encoded = [0x00,0x00,0x01,0x9A, 0x00,0x00,0x01,0x9B, 0x00,0x00,0x01,0x9C];
    }

    const(ubyte)[] pending = cast(const(ubyte)[])encoded;
    immutable dchar first = scheme.safeDecode(pending);

    // The first code unit is decoded and exactly four bytes are consumed.
    assert(first == 0x19A);
    assert(pending.length == 8);
}
3385
3386//=============================================================================
3387
3388
3389// Helper functions
3390version (unittest)
3391{
3392    void transcodeReverse(Src,Dst)(immutable(Src)[] s, out immutable(Dst)[] r)
3393    {
3394        static if (is(Src == Dst))
3395        {
3396            return s;
3397        }
3398        else static if (is(Src == AsciiChar))
3399        {
3400            transcodeReverse!(char,Dst)(cast(string) s,r);
3401        }
3402        else
3403        {
3404            foreach_reverse (d;codePoints(s))
3405            {
3406                foreach_reverse (c;codeUnits!(Dst)(d))
3407                {
3408                    r = c ~ r;
3409                }
3410            }
3411        }
3412    }
3413
3414    string makeReadable(string s)
3415    {
3416        string r = "\"";
3417        foreach (char c;s)
3418        {
3419            if (c >= 0x20 && c < 0x80)
3420            {
3421                r ~= c;
3422            }
3423            else
3424            {
3425                r ~= "\\x";
3426                r ~= toHexDigit(c >> 4);
3427                r ~= toHexDigit(c);
3428            }
3429        }
3430        r ~= "\"";
3431        return r;
3432    }
3433
3434    string makeReadable(wstring s)
3435    {
3436        string r = "\"";
3437        foreach (wchar c;s)
3438        {
3439            if (c >= 0x20 && c < 0x80)
3440            {
3441                r ~= cast(char) c;
3442            }
3443            else
3444            {
3445                r ~= "\\u";
3446                r ~= toHexDigit(c >> 12);
3447                r ~= toHexDigit(c >> 8);
3448                r ~= toHexDigit(c >> 4);
3449                r ~= toHexDigit(c);
3450            }
3451        }
3452        r ~= "\"w";
3453        return r;
3454    }
3455
3456    string makeReadable(dstring s)
3457    {
3458        string r = "\"";
3459        foreach (dchar c; s)
3460        {
3461            if (c >= 0x20 && c < 0x80)
3462            {
3463                r ~= cast(char) c;
3464            }
3465            else if (c < 0x10000)
3466            {
3467                r ~= "\\u";
3468                r ~= toHexDigit(c >> 12);
3469                r ~= toHexDigit(c >> 8);
3470                r ~= toHexDigit(c >> 4);
3471                r ~= toHexDigit(c);
3472            }
3473            else
3474            {
3475                r ~= "\\U00";
3476                r ~= toHexDigit(c >> 20);
3477                r ~= toHexDigit(c >> 16);
3478                r ~= toHexDigit(c >> 12);
3479                r ~= toHexDigit(c >> 8);
3480                r ~= toHexDigit(c >> 4);
3481                r ~= toHexDigit(c);
3482            }
3483        }
3484        r ~= "\"d";
3485        return r;
3486    }
3487
3488    char toHexDigit(int n)
3489    {
3490        return "0123456789ABCDEF"[n & 0xF];
3491    }
3492}
3493
/** Definitions of common Byte Order Marks.
The elements of the $(D enum) can be used as indices into $(D bomTable) to get
the matching $(D BOMSeq).
*/
enum BOM
{
    none      = 0,  /// no BOM was found
    utf32be   = 1,  /// [0x00, 0x00, 0xFE, 0xFF]
    utf32le   = 2,  /// [0xFF, 0xFE, 0x00, 0x00]
    utf7      = 3,  /*  [0x2B, 0x2F, 0x76, 0x38]
                        [0x2B, 0x2F, 0x76, 0x39],
                        [0x2B, 0x2F, 0x76, 0x2B],
                        [0x2B, 0x2F, 0x76, 0x2F],
                        [0x2B, 0x2F, 0x76, 0x38, 0x2D]

                        bomTable holds one entry per UTF-7 sequence
                        (indices 3 to 7), which is why the next member
                        continues at 8.
                    */
    utf1      = 8,  /// [0xF7, 0x64, 0x4C]
    utfebcdic = 9,  /// [0xDD, 0x73, 0x66, 0x73]
    scsu      = 10, /// [0x0E, 0xFE, 0xFF]
    bocu1     = 11, /// [0xFB, 0xEE, 0x28]
    gb18030   = 12, /// [0x84, 0x31, 0x95, 0x33]
    utf8      = 13, /// [0xEF, 0xBB, 0xBF]
    utf16be   = 14, /// [0xFE, 0xFF]
    utf16le   = 15  /// [0xFF, 0xFE]
}
3518
/// The type stored inside $(D bomTable): a named tuple pairing a $(D BOM)
/// value ($(D schema)) with the bytes of that mark ($(D sequence)).
alias BOMSeq = Tuple!(BOM, "schema", ubyte[], "sequence");
3521
/** Mapping of a byte sequence to $(B Byte Order Mark (BOM))
*/
// The order of the entries matters in two ways:
//  - getBOM scans entries 1 .. $ in order and returns the first match, so a
//    sequence that extends another one has to come first (utf32le before
//    utf16le, and the five-byte UTF-7 form before its four-byte prefix).
//  - Apart from the extra UTF-7 entries, each entry sits at the index equal
//    to the value of its BOM member.
immutable bomTable = [
    BOMSeq(BOM.none, null),
    BOMSeq(BOM.utf32be, cast(ubyte[])([0x00, 0x00, 0xFE, 0xFF])),
    BOMSeq(BOM.utf32le, cast(ubyte[])([0xFF, 0xFE, 0x00, 0x00])),
    // Indices 3 to 7: the UTF-7 variants.
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x39])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2B])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2F])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38, 0x2D])),
    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38])),
    BOMSeq(BOM.utf1, cast(ubyte[])([0xF7, 0x64, 0x4C])),
    BOMSeq(BOM.utfebcdic, cast(ubyte[])([0xDD, 0x73, 0x66, 0x73])),
    BOMSeq(BOM.scsu, cast(ubyte[])([0x0E, 0xFE, 0xFF])),
    BOMSeq(BOM.bocu1, cast(ubyte[])([0xFB, 0xEE, 0x28])),
    BOMSeq(BOM.gb18030, cast(ubyte[])([0x84, 0x31, 0x95, 0x33])),
    BOMSeq(BOM.utf8, cast(ubyte[])([0xEF, 0xBB, 0xBF])),
    BOMSeq(BOM.utf16be, cast(ubyte[])([0xFE, 0xFF])),
    BOMSeq(BOM.utf16le, cast(ubyte[])([0xFF, 0xFE]))
];
3542
/** Looks up the $(D BOMSeq) whose byte sequence starts the given $(D input).
If $(D input) does not start with any known $(D BOM), the $(D BOMSeq) for
$(D BOM.none) is returned. The $(D BOM) sequence at the beginning of the
range will not be consumed from the passed range. If you pass a reference
type range make sure that $(D save) creates a deep copy.

Params:
    input = The sequence to check for the $(D BOM)

Returns:
    the found $(D BOMSeq) corresponding to the passed $(D input).
*/
immutable(BOMSeq) getBOM(Range)(Range input)
if (isForwardRange!Range && is(Unqual!(ElementType!Range) == ubyte))
{
    import std.algorithm.searching : startsWith;

    // Entry 0 is the BOM.none placeholder, so the scan starts at 1. Each
    // probe works on a saved copy of the input, and the first entry whose
    // sequence is a prefix wins.
    foreach (i; 1 .. bomTable.length)
    {
        if (startsWith(input.save, bomTable[i].sequence))
        {
            return bomTable[i];
        }
    }

    // Nothing matched.
    return bomTable[0];
}
3569
3570///
3571@system unittest
3572{
3573    import std.format : format;
3574
3575    auto ts = dchar(0x0000FEFF) ~ "Hello World"d;
3576
3577    auto entry = getBOM(cast(ubyte[]) ts);
3578    version (BigEndian)
3579    {
3580        assert(entry.schema == BOM.utf32be, format("%s", entry.schema));
3581    }
3582    else
3583    {
3584        assert(entry.schema == BOM.utf32le, format("%s", entry.schema));
3585    }
3586}
3587
@system unittest
{
    import std.format : format;

    // Prefix a payload with every table entry in turn and check that getBOM
    // reports the schema of that entry.
    foreach (idx, entry; bomTable)
    {
        auto data = entry.sequence ~ cast(ubyte[])"hello world";
        auto found = getBOM(data);
        assert(found.schema == bomTable[idx].schema);

        // get around the multiple utf7 bom's
        immutable bool extraUtf7 = idx >= 4 && idx <= 7;
        if (extraUtf7)
        {
            continue;
        }

        // For all other entries the table index equals the enum value and
        // the exact sequence is reported back.
        assert(found.schema == BOM.init + idx);
        assert(found.sequence == entry.sequence);
    }
}
3605
@safe pure unittest
{
    // Minimal non-array range over a ubyte[]; `save` returns a struct copy.
    struct BOMInputRange
    {
        ubyte[] arr;

        @property ubyte front()
        {
            return this.arr.front;
        }

        @property bool empty()
        {
            return this.arr.empty;
        }

        void popFront()
        {
            this.arr = this.arr[1 .. $];
        }

        @property typeof(this) save()
        {
            return this;
        }
    }

    static assert( isInputRange!BOMInputRange);
    static assert(!isArray!BOMInputRange);

    ubyte[] dummyEnd = [0,0,0,0];

    foreach (entry; bomTable[1 .. $])
    {
        // A range holding exactly the BOM bytes is recognised, and the
        // range passed in still holds all of its bytes afterwards.
        {
            auto range = BOMInputRange(entry.sequence.dup);

            auto found = getBOM(range);
            assert(found.schema == entry.schema);
            assert(range.arr == entry.sequence);
        }

        // Only the first BOM byte followed by four zero bytes must not be
        // recognised as any BOM.
        {
            auto noBom = entry.sequence[0 .. 1].dup ~ dummyEnd;
            immutable size_t oldLen = noBom.length;
            assert(oldLen - 4 < entry.sequence.length);

            auto range = BOMInputRange(noBom.dup);
            auto found = getBOM(range);
            assert(found.schema == BOM.none);
            assert(noBom.length == oldLen);
        }
    }
}
3660
/** Constant defining a fully decoded BOM, i.e. the code point U+FEFF */
enum dchar utfBOM = 0xfeff;
3663