1/********************************************
2 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
3 *
4 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
5 * wchar type.
6 * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
7 * the D utf.dchar type.
8 *
9 * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
10 *
11 * See_Also:
12 *      $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
13 *      $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
14 *      $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
15 *
16 * Copyright: Copyright Digital Mars 2003 - 2016.
17 * License:   $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
18 * Authors:   Walter Bright, Sean Kelly
19 * Source:    $(DRUNTIMESRC src/rt/util/_utf.d)
20 */
21
22module rt.util.utf;
23
// Unicode error hook, declared here with C linkage and no body.
// The routines in this module call it with a message and the index of the
// offending code unit; file and line default to the call site.
// NOTE(review): presumably throws and never returns (call sites follow it
// with a "dummy return") -- confirm against the definition.
extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ ) @safe pure;
25
/*******************************
 * Check whether c is a valid UTF-32 code point.
 *
 * A value is accepted when it is at most \U0010FFFF and does not lie in
 * the surrogate range \uD800 .. \uDFFF.
 *
 * \uFFFE and \uFFFF are accepted: the Unicode standard permits them for
 * application-internal use even though they are not allowed for interchange.
 *
 * Returns: true if c is valid, false otherwise.
 */

@safe @nogc pure nothrow
bool isValidDchar(dchar c)
{
    // Surrogate code units never stand for a character on their own.
    if (c >= 0xD800 && c <= 0xDFFF)
        return false;

    // Everything else is fine up to the top of the Unicode range
    // (0xFFFE and 0xFFFF are deliberately not excluded).
    return c <= 0x10FFFF;
}
48
// Smoke test for isValidDchar: one ASCII character is accepted and one
// value above 0x10FFFF is rejected.
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");
    assert(isValidDchar(cast(dchar)'a') == true);
    assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
}
55
56
57
// Lookup table indexed by the first byte of a UTF-8 sequence (256 entries,
// 16 per row). Each entry is the sequence length implied by that byte:
//   0x00 .. 0x7F -> 1
//   0x80 .. 0xBF -> 0xFF (continuation byte, cannot start a sequence)
//   0xC0 .. 0xDF -> 2
//   0xE0 .. 0xEF -> 3
//   0xF0 .. 0xF7 -> 4
//   0xF8 .. 0xFB -> 5
//   0xFC .. 0xFD -> 6
//   0xFE, 0xFF   -> 0xFF (never a valid lead byte)
// The 5- and 6-byte lengths are listed here even though decode() rejects
// such sequences.
// NOTE(review): the leading cast(ubyte) is presumably there to make this a
// ubyte table -- confirm.
static immutable UTF8stride =
[
    cast(ubyte)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
78
/**
 * stride() returns the length of the UTF-8 sequence starting at index i
 * in string s, as given by the UTF8stride table for the byte s[i].
 * Returns:
 *      The number of bytes in the UTF-8 sequence, or
 *      0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
@safe @nogc pure nothrow
uint stride(in char[] s, size_t i)
{
    // The first byte alone determines the length of the sequence.
    immutable lead = s[i];
    return UTF8stride[lead];
}
91
/**
 * stride() returns the length of the UTF-16 sequence starting at index i
 * in string s.
 * Returns:
 *      2 if s[i] is a high surrogate (0xD800 .. 0xDBFF), otherwise 1.
 */
@safe @nogc pure nothrow
uint stride(in wchar[] s, size_t i)
{
    immutable uint unit = s[i];

    // A high surrogate announces a two-unit surrogate pair.
    if (unit >= 0xD800 && unit <= 0xDBFF)
        return 2;
    return 1;
}
101
/**
 * stride() returns the length of a UTF-32 sequence starting at index i
 * in string s.
 * Neither s nor i is inspected; the parameters exist so the signature
 * parallels the char[] and wchar[] overloads.
 * Returns: The return value will always be 1.
 */
@safe @nogc pure nothrow
uint stride(in dchar[] s, size_t i)
{
    // Every UTF-32 code unit is a complete character.
    return 1;
}
112
/*******************************************
 * Convert a code-unit index into a character (UCS) index.
 *
 * Given an index i into an array of characters s[], and assuming that
 * index i is at the start of a UTF character, count how many UCS
 * characters precede index i.
 *
 * If stepping through s by stride() overshoots i, the error is reported
 * through onUnicodeError with the position that was reached.
 */
@safe pure
size_t toUCSindex(in char[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    // Hop from sequence start to sequence start until i is reached or passed.
    while (pos < i)
    {
        pos += stride(s, pos);
        ++count;
    }

    // Landing beyond i means i was not on a sequence boundary.
    if (pos > i)
        onUnicodeError("invalid UTF-8 sequence", pos);

    return count;
}
135
136/** ditto */
@safe pure
size_t toUCSindex(in wchar[] s, size_t i)
{
    // UTF-16 variant: count the characters that start before code unit i.
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        pos += stride(s, pos);
        ++count;
    }

    // Landing beyond i means i pointed into the middle of a surrogate pair.
    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);

    return count;
}
154
155/** ditto */
@safe @nogc pure nothrow
size_t toUCSindex(in dchar[] s, size_t i)
{
    // UTF-32 variant: the index is returned unchanged; s is not inspected.
    return i;
}
161
/******************************************
 * Convert a character (UCS) index into a code-unit (UTF) index.
 *
 * Given a UCS index n into an array of characters s[], return the index
 * of the code unit at which character number n starts.
 *
 * A byte that cannot start a UTF-8 sequence (table value 0xFF) is
 * reported through onUnicodeError.
 */
@safe pure
size_t toUTFindex(in char[] s, size_t n)
{
    size_t pos = 0;

    // Skip n complete sequences, using the lead byte of each for its length.
    for (size_t skipped = 0; skipped < n; ++skipped)
    {
        immutable uint step = UTF8stride[s[pos]];
        if (step == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);
        pos += step;
    }
    return pos;
}
179
180/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(in wchar[] s, size_t n)
{
    // UTF-16 variant: skip n characters, counting a high surrogate as the
    // start of a two-unit pair. No validation is performed.
    size_t pos = 0;

    for (size_t skipped = 0; skipped < n; ++skipped)
    {
        immutable wchar unit = s[pos];
        immutable bool highSurrogate = unit >= 0xD800 && unit <= 0xDBFF;

        pos += highSurrogate ? 2 : 1;
    }
    return pos;
}
193
194/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(in dchar[] s, size_t n)
{
    // UTF-32 variant: the index is returned unchanged; s is not inspected.
    return n;
}
200
201/* =================== Decode ======================= */
202
/***************
 * Decodes and returns character starting at s[idx]. idx is advanced past the
 * decoded character. If the character is not well formed, the error is
 * reported through onUnicodeError (documented as throwing a UtfException)
 * and idx remains unchanged.
 *
 * Params:
 *      s   = UTF-8 text
 *      idx = index of the first byte of the sequence to decode; must be
 *            less than s.length
 * Returns: the decoded code point, which satisfies isValidDchar.
 */
@safe pure
dchar decode(in char[] s, ref size_t idx)
    in
    {
        // idx is a size_t, so the idx >= 0 half is always true.
        assert(idx >= 0 && idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    body
    {
        size_t len = s.length;
        dchar V;
        size_t i = idx;     // idx itself is only written on success
        char u = s[i];

        if (u & 0x80)
        {   uint n;
            char u2;

            /* The following encodings are valid, except for the 5 and 6 byte
             * combinations:
             *  0xxxxxxx
             *  110xxxxx 10xxxxxx
             *  1110xxxx 10xxxxxx 10xxxxxx
             *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
             *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
             */
            // Count the leading 1 bits of the first byte: n ends up as the
            // sequence length (2 .. 4). n == 1 means a lone continuation byte.
            for (n = 1; ; n++)
            {
                if (n > 4)
                    goto Lerr;          // only do the first 4 of 6 encodings
                if (((u << n) & 0x80) == 0)
                {
                    if (n == 1)
                        goto Lerr;
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            V = cast(dchar)(u & ((1 << (7 - n)) - 1));

            if (i + (n - 1) >= len)
                goto Lerr;                      // off end of string

            /* The following combinations are overlong, and illegal:
             *  1100000x (10xxxxxx)
             *  11100000 100xxxxx (10xxxxxx)
             *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
             *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
             *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
             */
            // s[i + 1] is in bounds here: n >= 2 and the length check passed.
            // The 0xF8 and 0xFC tests cannot match, because lead bytes with
            // five or more leading 1 bits were already rejected by n > 4.
            u2 = s[i + 1];
            if ((u & 0xFE) == 0xC0 ||
                (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
                (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
                (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
                (u == 0xFC && (u2 & 0xFC) == 0x80))
                goto Lerr;                      // overlong combination

            // Fold in 6 payload bits from each continuation byte.
            for (uint j = 1; j != n; j++)
            {
                u = s[i + j];
                if ((u & 0xC0) != 0x80)
                    goto Lerr;                  // trailing bytes are 10xxxxxx
                V = (V << 6) | (u & 0x3F);
            }
            // Rejects surrogates and values above 0x10FFFF.
            if (!isValidDchar(V))
                goto Lerr;
            i += n;
        }
        else
        {
            // Single-byte (ASCII) fast path.
            V = cast(dchar) u;
            i++;
        }

        idx = i;
        return V;

      Lerr:
      // i still holds the index of the first byte of the bad sequence.
      onUnicodeError("invalid UTF-8 sequence", i);
    return V; // dummy return
    }
295
// Tests for decode(char[]): 1-, 2- and 3-byte sequences decode to the
// expected code point and advance the index; truncated and overlong
// sequences must raise an error.
unittest
{   size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    // Plain ASCII: one byte per character.
    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    // Two-byte sequence.
    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    // Three-byte sequence.
    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    // Malformed inputs: one truncated sequence followed by overlong forms
    // of increasing length.
    static s4 =
    [   "\xE2\x89"c[],          // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    for (int j = 0; j < s4.length; j++)
    {
        try
        {
            i = 0;
            c = decode(s4[j], i);
            assert(0);          // decode must not return normally
        }
        catch (Throwable o)
        {
            // Sentinel value proving the catch block ran. Note that this
            // also catches the AssertError raised by assert(0) above.
            i = 23;
        }
        assert(i == 23);
    }
}
347
348/** ditto */
@safe pure
dchar decode(in wchar[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    body
    {
        // UTF-16 variant. idx is written only on success; every failure
        // path falls through to the single onUnicodeError call at the end.
        immutable size_t start = idx;
        immutable uint lead = s[start];
        string problem;

        // Fast path: a 7-bit value is always a complete character.
        if (lead < 0x80)
        {
            idx = start + 1;
            return cast(dchar)lead;
        }

        if (lead >= 0xD800 && lead <= 0xDBFF)
        {
            // High surrogate: a low surrogate must follow immediately.
            if (start + 1 == s.length)
            {
                problem = "surrogate UTF-16 high value past end of string";
            }
            else
            {
                immutable uint trail = s[start + 1];

                if (trail < 0xDC00 || trail > 0xDFFF)
                {
                    problem = "surrogate UTF-16 low value out of range";
                }
                else
                {
                    // 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the
                    // 0x10000 offset into the high half.
                    idx = start + 2;
                    return cast(dchar)(((lead - 0xD7C0) << 10) + (trail - 0xDC00));
                }
            }
        }
        else if (lead >= 0xDC00 && lead <= 0xDFFF)
        {
            problem = "unpaired surrogate UTF-16 value";
        }
        else if (lead == 0xFFFE || lead == 0xFFFF)
        {
            problem = "illegal UTF-16 value";
        }
        else
        {
            // Any other BMP value is a single-unit character.
            idx = start + 1;
            return cast(dchar)lead;
        }

        onUnicodeError(problem, start);
        return cast(dchar)lead; // dummy return
    }
405
406/** ditto */
@safe pure
dchar decode(in dchar[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    body
    {
        // UTF-32 variant: nothing to decode, only the value is checked.
        immutable size_t pos = idx;
        immutable dchar c = s[pos];

        if (isValidDchar(c))
        {
            idx = pos + 1;
            return c;
        }

        // idx is left untouched on failure.
        onUnicodeError("invalid UTF-32 value", pos);
        return c; // dummy return
    }
427
428
429/* =================== Encode ======================= */
430
/*******************************
 * Encodes character c as UTF-8 and appends the resulting 1 to 4 bytes
 * to array s[].
 *
 * c must satisfy isValidDchar (checked by the in contract).
 */
@safe pure nothrow
void encode(ref char[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        // ASCII needs no staging buffer.
        if (c <= 0x7F)
        {
            s ~= cast(char) c;
            return;
        }

        char[4] units;
        size_t count;

        if (c <= 0x7FF)
        {
            // 110xxxxx 10xxxxxx
            units[0] = cast(char)(0xC0 | (c >> 6));
            units[1] = cast(char)(0x80 | (c & 0x3F));
            count = 2;
        }
        else if (c <= 0xFFFF)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            units[0] = cast(char)(0xE0 | (c >> 12));
            units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[2] = cast(char)(0x80 | (c & 0x3F));
            count = 3;
        }
        else if (c <= 0x10FFFF)
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            units[0] = cast(char)(0xF0 | (c >> 18));
            units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[3] = cast(char)(0x80 | (c & 0x3F));
            count = 4;
        }
        else
        {
            assert(0);
        }

        s ~= units[0 .. count];
    }
482
// Tests for encode(char[]): appending 1-, 2- and 3-byte characters grows
// the array by exactly that many bytes and produces the expected encoding.
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    // One-byte (ASCII) character.
    char[] s = "abcd".dup;
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    // Two-byte character.
    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");       // BUG: fix compiler

    // Three-byte character.
    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
}
501
502/** ditto */
@safe pure nothrow
void encode(ref wchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        // UTF-16 variant: BMP characters are one unit, everything above
        // becomes a surrogate pair.
        if (c <= 0xFFFF)
        {
            s ~= cast(wchar) c;
            return;
        }

        // Split the 20-bit offset above 0x10000 into two 10-bit halves.
        immutable uint offset = c - 0x10000;
        wchar[2] pair;

        pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
        s ~= pair;
    }
527
528/** ditto */
@safe pure nothrow
void encode(ref dchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        // UTF-32 variant: the code point is its own encoding, so it is
        // appended as-is.
        s ~= c;
    }
539
/**
Returns the number of code units needed to encode $(D c) when $(D C) is
the code unit type (selected by $(D C.sizeof): 1 for UTF-8, 2 for UTF-16,
4 for UTF-32). The length is returned in code units, not in bytes.

For a one-byte $(D C), a value of $(D c) above 0x10FFFF hits
$(D assert(false)).
 */
@safe pure nothrow @nogc
ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        // UTF-8: test the thresholds from the top down.
        if (c > 0x10FFFF)
            assert(false);
        if (c > 0xFFFF)
            return 4;
        if (c > 0x7FF)
            return 3;
        if (c > 0x7F)
            return 2;
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: anything above the BMP needs a surrogate pair.
        if (c > 0xFFFF)
            return 2;
        return 1;
    }
    else
    {
        // UTF-32: always a single unit.
        static assert(C.sizeof == 4);
        return 1;
    }
}
565
566/* =================== Validation ======================= */
567
/***********************************
Checks whether a string is well formed by decoding it from start to end
 with decode(). $(D S) can be an array of $(D char), $(D wchar), or
 $(D dchar). A malformed sequence is reported by decode() (documented as
 throwing a $(D UtfException)). Use to check all untrusted input for
 correctness.
 */
@safe pure
void validate(S)(in S s)
{
    immutable total = s.length;
    size_t pos = 0;

    // decode() advances pos past each character; the values are discarded.
    while (pos < total)
        decode(s, pos);
}
582
583/* =================== Conversion to UTF8 ======================= */
584
/*******************
 * Encodes character c as UTF-8 into the caller-supplied buffer buf.
 *
 * c must satisfy isValidDchar (checked by the in contract). buf must hold
 * at least as many bytes as the encoding needs (up to 4).
 *
 * Returns: the slice of buf that holds the encoded bytes.
 */
@safe pure nothrow @nogc
char[] toUTF8(char[] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        if (c <= 0x7F)
        {
            buf[0] = cast(char) c;
            return buf[0 .. 1];
        }

        // Final continuation byte: the lowest six bits of c.
        immutable char last = cast(char)(0x80 | (c & 0x3F));

        if (c <= 0x7FF)
        {
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = last;
            return buf[0 .. 2];
        }

        // Second-to-last continuation byte: bits 6 .. 11 of c.
        immutable char penultimate = cast(char)(0x80 | ((c >> 6) & 0x3F));

        if (c <= 0xFFFF)
        {
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = penultimate;
            buf[2] = last;
            return buf[0 .. 3];
        }

        if (c <= 0x10FFFF)
        {
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = penultimate;
            buf[3] = last;
            return buf[0 .. 4];
        }

        assert(0);
    }
621
/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * For a string argument the input is already UTF-8, so s is returned
 * unchanged. The in contract runs validate() on s.
 */
@safe pure nothrow
string toUTF8(string s)
    in
    {
        validate(s);
    }
    body
    {
        // Identity conversion: no copy is made.
        return s;
    }
635
636/** ditto */
@trusted pure
string toUTF8(in wchar[] s)
{
    // UTF-16 to UTF-8. The leading run of ASCII units is copied directly;
    // from the first non-ASCII unit onward the rest is transcoded with
    // encode().
    char[] r;
    size_t i;
    size_t slen = s.length;

    // Exact size if the input is pure ASCII.
    r.length = slen;

    for (i = 0; i < slen; i++)
    {   wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;         // fast path for ascii
        else
        {
            // Keep only the ASCII prefix written so far, then append the
            // remainder. The dchar loop variable makes foreach decode the
            // UTF-16 slice. It is named d (as in the dchar[] overload)
            // so that it does not shadow the outer c.
            r.length = i;
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return cast(string)r;
}
663
664/** ditto */
@trusted pure
string toUTF8(in dchar[] s)
{
    // UTF-32 to UTF-8. The leading run of ASCII values is copied directly;
    // from the first non-ASCII value onward the rest goes through encode().
    immutable size_t total = s.length;
    char[] result;
    size_t pos = 0;

    // Exact size if the input is pure ASCII.
    result.length = total;

    while (pos < total && s[pos] <= 0x7F)
    {
        result[pos] = cast(char)s[pos];     // fast path for ascii
        ++pos;
    }

    if (pos < total)
    {
        // Keep the ASCII prefix and append the encoded remainder.
        result.length = pos;
        foreach (dchar d; s[pos .. total])
        {
            encode(result, d);
        }
    }
    return cast(string)result;
}
691
692/* =================== Conversion to UTF16 ======================= */
693
/*******************
 * Encodes character c as UTF-16 into the caller-supplied buffer buf.
 *
 * c must satisfy isValidDchar (checked by the in contract). buf must hold
 * at least one unit for values up to 0xFFFF and two units otherwise.
 *
 * Returns: the slice of buf that holds the encoded units.
 */
@safe pure nothrow @nogc
wchar[] toUTF16(wchar[] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        if (c > 0xFFFF)
        {
            // Surrogate pair: split the offset above 0x10000 into two
            // 10-bit halves.
            immutable uint offset = c - 0x10000;

            buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
            buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
            return buf[0 .. 2];
        }

        buf[0] = cast(wchar) c;
        return buf[0 .. 1];
    }
714
/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() additionally appends a terminating zero and returns a pointer
 * to the first unit, which makes it suitable for calling the 'W' functions
 * in the Win32 API that take an LPWSTR or LPCWSTR argument.
 */
@trusted pure
wstring toUTF16(in char[] s)
{
    immutable size_t total = s.length;
    wchar[] result;

    // Grow to the input length, then reset to empty before appending.
    // NOTE(review): presumably meant to pre-allocate capacity -- confirm.
    result.length = total;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        immutable char unit = s[pos];

        if (unit <= 0x7F)
        {
            // ASCII maps one-to-one.
            ++pos;
            result ~= cast(wchar)unit;
        }
        else
        {
            // decode() advances pos past the whole multi-byte sequence.
            immutable dchar c = decode(s, pos);
            encode(result, c);
        }
    }
    return cast(wstring)result;
}
744
// Pointer to read-only UTF-16 code units; the return type of toUTF16z().
alias const(wchar)* wptr;
746/** ditto */
@safe pure
wptr toUTF16z(in char[] s)
{
    // Same conversion as toUTF16(char[]), plus a terminating zero unit.
    immutable size_t total = s.length;
    wchar[] result;

    // Grow to the input length plus terminator, then reset to empty.
    // NOTE(review): presumably meant to pre-allocate capacity -- confirm.
    result.length = total + 1;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        immutable char unit = s[pos];

        if (unit <= 0x7F)
        {
            // ASCII maps one-to-one.
            ++pos;
            result ~= cast(wchar)unit;
        }
        else
        {
            // decode() advances pos past the whole multi-byte sequence.
            immutable dchar c = decode(s, pos);
            encode(result, c);
        }
    }

    result ~= '\000';
    return &result[0];
}
772
773/** ditto */
@safe pure nothrow
wstring toUTF16(wstring s)
    in
    {
        // The in contract validates the input.
        validate(s);
    }
    body
    {
        // Already UTF-16: returned unchanged, no copy is made.
        return s;
    }
784
785/** ditto */
@trusted pure nothrow
wstring toUTF16(in dchar[] s)
{
    // UTF-32 to UTF-16: every code point goes through encode(), which
    // appends one unit or a surrogate pair.
    wchar[] result;

    // Grow to the input length, then reset to empty before appending.
    // NOTE(review): presumably meant to pre-allocate capacity -- confirm.
    result.length = s.length;
    result.length = 0;

    foreach (immutable dchar c; s)
    {
        encode(result, c);
    }
    return cast(wstring)result;
}
800
801/* =================== Conversion to UTF32 ======================= */
802
/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 *
 * Bytes below 0x80 are copied directly; everything else is decoded with
 * decode(), which reports malformed input.
 */
@trusted pure
dstring toUTF32(in char[] s)
{
    immutable size_t total = s.length;
    dchar[] result;
    size_t written = 0;

    result.length = total;      // result[] will never be longer than s[]

    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];

        if (c < 0x80)
            ++pos;                  // c is ascii, no need for decode
        else
            c = decode(s, pos);     // advances pos past the sequence

        result[written] = c;
        ++written;
    }

    // Trim the unused tail of the buffer.
    return cast(dstring)result[0 .. written];
}
825
826/** ditto */
@trusted pure
dstring toUTF32(in wchar[] s)
{
    // UTF-16 to UTF-32: units below 0x80 are copied directly, everything
    // else is decoded with decode(), which reports malformed input.
    immutable size_t total = s.length;
    dchar[] result;
    size_t written = 0;

    result.length = total;      // result[] will never be longer than s[]

    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];

        if (c < 0x80)
            ++pos;                  // c is ascii, no need for decode
        else
            c = decode(s, pos);     // advances pos past the unit(s) consumed

        result[written] = c;
        ++written;
    }

    // Trim the unused tail of the buffer.
    return cast(dstring)result[0 .. written];
}
846
847/** ditto */
@safe pure nothrow
dstring toUTF32(dstring s)
    in
    {
        // The in contract validates the input.
        validate(s);
    }
    body
    {
        // Already UTF-32: returned unchanged, no copy is made.
        return s;
    }
858
859/* ================================ tests ================================== */
860
// Round-trip tests for the toUTF8 / toUTF16 / toUTF32 conversions. The
// same sequence of conversions is run on three strings: pure ASCII, one
// containing a BMP character, and one containing a character above the
// BMP. A final check covers the buffer-based toUTF16.
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Pure ASCII.
    auto c = "hello"c[];
    auto w = toUTF16(c);
    assert(w == "hello");
    auto d = toUTF32(c);
    assert(d == "hello");

    c = toUTF8(w);
    assert(c == "hello");
    d = toUTF32(w);
    assert(d == "hello");

    c = toUTF8(d);
    assert(c == "hello");
    w = toUTF16(d);
    assert(w == "hello");


    // BMP character: three bytes in UTF-8, one unit in UTF-16.
    c = "hel\u1234o";
    w = toUTF16(c);
    assert(w == "hel\u1234o");
    d = toUTF32(c);
    assert(d == "hel\u1234o");

    c = toUTF8(w);
    assert(c == "hel\u1234o");
    d = toUTF32(w);
    assert(d == "hel\u1234o");

    c = toUTF8(d);
    assert(c == "hel\u1234o");
    w = toUTF16(d);
    assert(w == "hel\u1234o");


    // Character above the BMP: four bytes in UTF-8, a surrogate pair in
    // UTF-16.
    c = "he\U000BAAAAllo";
    w = toUTF16(c);
    //foreach (wchar c; w) printf("c = x%x\n", c);
    //foreach (wchar c; cast(wstring)"he\U000BAAAAllo") printf("c = x%x\n", c);
    assert(w == "he\U000BAAAAllo");
    d = toUTF32(c);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(w);
    assert(c == "he\U000BAAAAllo");
    d = toUTF32(w);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(d);
    assert(c == "he\U000BAAAAllo");
    w = toUTF16(d);
    assert(w == "he\U000BAAAAllo");

    // Buffer-based encoder: a two-unit buffer holds a surrogate pair.
    wchar[2] buf;
    auto ret = toUTF16(buf, '\U000BAAAA');
    assert(ret == "\U000BAAAA");
}
921