1// Copyright 2018 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <endian.h>
6#include <stdio.h>
7#include <unittest/unittest.h>
8#include <utf_conversion/utf_conversion.h>
9
10#include <fbl/algorithm.h>
11
12#if (BYTE_ORDER == BIG_ENDIAN)
13static constexpr uint32_t HOST_ENDIAN_FLAG   = UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN;
14static constexpr uint32_t INVERT_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN;
15#else
16static constexpr uint32_t HOST_ENDIAN_FLAG   = UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN;
17static constexpr uint32_t INVERT_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN;
18#endif
19
// Asserts that a conversion produced the wanted UTF-8 sequence.  In order:
//   1) the output buffer (got_capacity bytes) is at least want_len bytes long,
//      so the byte comparison below stays inside the buffer,
//   2) the length reported by the converter (got_len) equals want_len,
//   3) the first want_len bytes of the output match the wanted bytes.
// Must be used inside a test function, since it expands to ASSERT_* macros.
#define ASSERT_UTF8_EQ(want, want_len, got, got_capacity, got_len, msg) \
    do {                                                                \
        ASSERT_GE(got_capacity, want_len, msg);                         \
        ASSERT_EQ(want_len, got_len, msg);                              \
        ASSERT_BYTES_EQ(want, got, want_len, msg);                      \
    } while (false)
26
27static bool utf16to8_bad_args(void) {
28    BEGIN_TEST;
29
30    uint16_t src;
31    uint8_t dst = 0xFE;
32    size_t dst_len;
33    zx_status_t res;
34
35    // Bad destination buffer with non-zero destination length
36    dst_len = 1;
37    res = utf16_to_utf8(&src, 1, nullptr, &dst_len);
38    ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "null dst should fail with INVALID_ARGS");
39    ASSERT_EQ(1, dst_len, "dst_len modified after conversion with invalid args");
40
41    // Bad dest len pointer
42    res = utf16_to_utf8(&src, 1, &dst, nullptr);
43    ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "null dst_len should fail with INVALID_ARGS");
44    ASSERT_EQ(0xFE, dst, "dst modified after conversion with invalid args");
45
46    // Bad (undefined) flags
47    res = utf16_to_utf8(&src, 1, &dst, &dst_len, 0x80000000);
48    ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "undefined flags should fail with INVALID_ARGS");
49    ASSERT_EQ(1, dst_len, "dst_len modified after conversion with invalid args");
50    ASSERT_EQ(0xFE, dst, "dst modified after conversion with invalid args");
51
52    // A null dest buffer is allowed if (and only if) the dst_len is zero.
53    // Practical use cases include using the converter to determine the length
54    // needed to hold a converted string.
55    dst_len = 0;
56    src = 0xAB;
57    res = utf16_to_utf8(&src, 1, nullptr, &dst_len);
58    ASSERT_EQ(ZX_OK, res, "null dst with zero dst_len should succeed");
59    ASSERT_EQ(2, dst_len, "encoded size of 0xAB should be 2!");
60
61    END_TEST;
62}
63
64static bool utf16to8_empty_source(void) {
65    BEGIN_TEST;
66
67    uint16_t src;
68    static const uint8_t expected[] = { 0xA1, 0xB2, 0xC3, 0xD4 };
69    uint8_t actual[sizeof(expected)];
70    size_t dst_len;
71    zx_status_t res;
72
73    // Check to make sure that attempting to encode a zero length source results
74    // in a length of zero and no changes to the destination buffer.
75    memcpy(actual, expected, sizeof(actual));
76    dst_len = sizeof(actual);
77    res = utf16_to_utf8(&src, 0,actual, &dst_len);
78    ASSERT_EQ(ZX_OK, res, "zero length string conversion failed");
79    ASSERT_EQ(0, dst_len, "dst_len should be zero after zero length string conversion");
80    ASSERT_BYTES_EQ(expected, actual, sizeof(actual),
81                    "dst buffer modified after zero length string conversion");
82
83    dst_len = sizeof(actual);
84    res = utf16_to_utf8(nullptr, 1,actual, &dst_len);
85    ASSERT_EQ(ZX_OK, res, "null source string conversion failed");
86    ASSERT_EQ(0, dst_len, "dst_len should be zero after null source string conversion");
87    ASSERT_BYTES_EQ(expected, actual, sizeof(actual),
88                    "dst buffer modified after null source string conversion");
89
90    END_TEST;
91}
92
93static bool utf16to8_simple_codepoints(void) {
94    BEGIN_TEST;
95
96    static const struct {
97        uint16_t src;
98        uint8_t expected[3];
99        size_t  expected_len;
100    } TEST_VECTORS[] = {
101        // 1 byte UTF-8 codepoints (U+0000, U+007F)
102        { 0x0000, { 0x00 }, 1 },
103        { 0x0001, { 0x01 }, 1 },
104        { 0x007f, { 0x7f }, 1 },
105
106        // 2 byte UTF-8 codepoints (U+0080, U+07FF)
107        { 0x0080, { 0xC2, 0x80 }, 2 },
108        { 0x0456, { 0xD1, 0x96 }, 2 },
109        { 0x07FF, { 0xDF, 0xBF }, 2 },
110
111        // 3 byte UTF-8 codepoints (U+0800, U+07FF)
112        // Note: we are skipping the (theoretically illegal) unpaired surrogate
113        // range (U+D800, U+DFFF) here.  There is a separate test for support of
114        // unpaired surrogates.
115        { 0x0800, { 0xE0, 0xA0, 0x80 }, 3 },
116        { 0x4567, { 0xE4, 0x95, 0xA7 }, 3 },
117        { 0xD7FF, { 0xED, 0x9F, 0xBF }, 3 },
118        { 0xE000, { 0xEE, 0x80, 0x80 }, 3 },
119        { 0xE456, { 0xEE, 0x91, 0x96 }, 3 },
120        { 0xFFFF, { 0xEF, 0xBF, 0xBF }, 3 },
121    };
122
123    uint8_t actual[3];
124    for (const auto& v : TEST_VECTORS) {
125        char case_id[64];
126        size_t encoded_len = sizeof(actual);
127        zx_status_t res;
128
129        snprintf(case_id, sizeof(case_id), "case id [0x%04hx]", v.src);
130        ::memset(actual, 0xAB, sizeof(actual));
131
132        res = utf16_to_utf8(&v.src, 1, actual, &encoded_len);
133        ASSERT_EQ(ZX_OK, res, case_id);
134        ASSERT_LE(v.expected_len, sizeof(v.expected), case_id);
135        ASSERT_UTF8_EQ(v.expected, v.expected_len,
136                       actual, sizeof(actual),
137                       encoded_len, case_id);
138    }
139
140    END_TEST;
141}
142
143static bool utf16to8_paired_surrogates(void) {
144    BEGIN_TEST;
145
146    // All paired surrogate encodings are going to be 4 byte UTF-8 codepoints (U+010000, U+10FFFF)
147    static const struct {
148        uint16_t src[2];
149        uint8_t expected[4];
150    } TEST_VECTORS[] = {
151        { { 0xD800, 0xDC00 }, { 0xF0, 0x90, 0x80, 0x80 } }, // U+10000
152        { { 0xD811, 0xDD67 }, { 0xF0, 0x94, 0x95, 0xA7 } }, // U+14567
153        { { 0xDA6F, 0xDCDE }, { 0xF2, 0xAB, 0xB3, 0x9E } }, // U+ABCDE
154        { { 0xDBBF, 0xDFFF }, { 0xF3, 0xBF, 0xBF, 0xBF } }, // U+FFFFF
155        { { 0xDBC0, 0xDC00 }, { 0xF4, 0x80, 0x80, 0x80 } }, // U+100000
156        { { 0xDBD1, 0xDD67 }, { 0xF4, 0x84, 0x95, 0xA7 } }, // U+104567
157        { { 0xDBFF, 0xDFFF }, { 0xF4, 0x8F, 0xBF, 0xBF } }, // U+10FFFF
158    };
159
160    uint8_t actual[4];
161    for (const auto& v : TEST_VECTORS) {
162        char case_id[64];
163        size_t encoded_len = sizeof(actual);
164        zx_status_t res;
165
166        snprintf(case_id, sizeof(case_id), "case id [0x%04hx : 0x%04hx]", v.src[0], v.src[1]);
167        ::memset(actual, 0xAB, sizeof(actual));
168
169        res = utf16_to_utf8(v.src, fbl::count_of(v.src), actual, &encoded_len);
170        ASSERT_EQ(ZX_OK, res, case_id);
171        ASSERT_UTF8_EQ(v.expected, sizeof(v.expected),
172                       actual, sizeof(actual),
173                       encoded_len, case_id);
174    }
175
176    END_TEST;
177}
178
179static bool utf16to8_unpaired_surrogates(void) {
180    BEGIN_TEST;
181
182    static const struct {
183        uint16_t src;
184        uint8_t expected[3];
185    } TEST_VECTORS[] = {
186        // All unpaired surrogates are technically supposed to be illegal, but
187        // apparently there are systems out there who use them any (Wikipedia
188        // claims that Windows allows unpaired surrogates in file names encoded
189        // using UTF-16)
190        //
191        // Unpaired surrogates are 16 bits wide, so they will require a 3-byte
192        // UTF-8 encoding.
193        { 0xD800, { 0xED, 0xA0, 0x80 } },
194        { 0xD945, { 0xED, 0xA5, 0x85 } },
195        { 0xDBFF, { 0xED, 0xAF, 0xBF } },
196        { 0xDC00, { 0xED, 0xB0, 0x80 } },
197        { 0xDD45, { 0xED, 0xB5, 0x85 } },
198        { 0xDFFF, { 0xED, 0xBF, 0xBF } },
199    };
200    uint8_t replace[3] = { 0xEF, 0xBF, 0xBD };
201    uint8_t actual[3];
202    for (const auto& v : TEST_VECTORS) {
203        char case_id[64];
204        size_t encoded_len = sizeof(actual);
205        zx_status_t res;
206
207        // Attempt to encode the unpaired surrogate, but do not specify that we
208        // want to preserve it.  We should end up with the encoded form of the
209        // replacement character (U+FFFD) instead.
210        snprintf(case_id, sizeof(case_id), "case id [0x%04hx, replace]", v.src);
211        ::memset(actual, 0xAB, sizeof(actual));
212
213        encoded_len = sizeof(actual);
214        res = utf16_to_utf8(&v.src, 1, actual, &encoded_len);
215        ASSERT_EQ(ZX_OK, res, case_id);
216        ASSERT_UTF8_EQ(replace, sizeof(replace), actual, sizeof(actual),
217                       encoded_len, case_id);
218
219        // Do it again, but this time tell the converter to preserve the
220        // unpaired surrogate instead.
221        snprintf(case_id, sizeof(case_id), "case id [0x%04hx, preserve]", v.src);
222        ::memset(actual, 0xAB, sizeof(actual));
223
224        encoded_len = sizeof(actual);
225        res = utf16_to_utf8(&v.src, 1, actual, &encoded_len,
226                            UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES);
227        ASSERT_EQ(ZX_OK, res, case_id);
228        ASSERT_UTF8_EQ(v.expected, sizeof(v.expected), actual, sizeof(actual),
229                       encoded_len, case_id);
230    }
231
232    END_TEST;
233}
234
235static bool utf16to8_dst_buffer_lengths(void) {
236    BEGIN_TEST;
237
238    const uint16_t src[] = { 'T', 'e', 's', 't' };
239    const uint8_t expected[] = { 'T', 'e', 's', 't' };
240    uint8_t actual[16];
241
242    // Perform a conversion, but test three cases.
243    //
244    // 1) The destination buffer size is exactly what is required.
245    // 2) The destination buffer size is more than what is required.
246    // 3) The destination buffer size is less than what is required.
247    static const size_t DST_LENGTHS[] = { sizeof(expected), sizeof(actual), sizeof(expected) >> 1 };
248    for (const auto& d : DST_LENGTHS) {
249        char case_id[64];
250        size_t encoded_len = d;
251        zx_status_t res;
252
253        snprintf(case_id, sizeof(case_id), "case id [needed %zu, provided %zu]",
254                 sizeof(expected), d);
255        ::memset(actual, 0xAB, sizeof(actual));
256
257        ASSERT_LE(encoded_len, sizeof(actual), case_id);
258        res = utf16_to_utf8(src, fbl::count_of(src), actual, &encoded_len);
259
260        ASSERT_EQ(ZX_OK, res, case_id);
261        ASSERT_EQ(sizeof(expected), encoded_len, case_id);
262        static_assert(sizeof(expected) <= sizeof(actual),
263                      "'actual' buffer must be large enough to hold 'expected' result");
264        ASSERT_BYTES_EQ(expected, actual, d < encoded_len ? d : encoded_len, case_id);
265
266        if (d < sizeof(actual)) {
267            uint8_t pattern[sizeof(actual)];
268            ::memset(pattern, 0xAB, sizeof(pattern));
269            ASSERT_BYTES_EQ(actual + d, pattern, sizeof(actual) - d, case_id);
270        }
271    }
272
273    END_TEST;
274}
275
276static bool utf16to8_endianness_and_bom(void) {
277    BEGIN_TEST;
278
279    static const struct {
280        uint16_t src[5];
281        bool host_order;
282    } SOURCES[] = {
283        { { 0xFEFF, 'T', 'e', 's', 't' }, true },
284        { { __bswap16(0xFEFF),
285            __bswap16('T'),
286            __bswap16('e'),
287            __bswap16('s'),
288            __bswap16('t'),
289            }, false }
290    };
291
292    const uint8_t bom_removed[] = { 'T', 'e', 's', 't' };
293    const uint8_t bom_removed_inverted[] = {
294        0xE5, 0x90, 0x80, 0xE6, 0x94, 0x80, 0xE7,
295        0x8C, 0x80, 0xE7, 0x90, 0x80 };
296    const uint8_t bom_encoded[] = { 0xEF, 0xBB, 0xBF, 'T', 'e', 's', 't' };
297    const uint8_t bom_encoded_inverted[] = {
298        0xEF, 0xBF, 0xBE, 0xE5, 0x90, 0x80, 0xE6,
299        0x94, 0x80, 0xE7, 0x8C, 0x80, 0xE7, 0x90,
300        0x80 };
301    uint8_t actual[fbl::count_of(bom_encoded_inverted)];
302
303#define EXPECT(e) { e, sizeof(e) }
304    static const struct {
305        uint32_t flags;
306        struct {
307            const uint8_t* exp;
308            size_t len;
309        } host;
310        struct {
311            const uint8_t* exp;
312            size_t len;
313        } inv;
314    } EXPECTED[] {
315        { 0,
316          EXPECT(bom_encoded), EXPECT(bom_encoded) },
317        { UTF_CONVERT_FLAG_DISCARD_BOM,
318          EXPECT(bom_removed), EXPECT(bom_removed) },
319        { HOST_ENDIAN_FLAG,
320          EXPECT(bom_encoded), EXPECT(bom_encoded_inverted) },
321        { HOST_ENDIAN_FLAG | UTF_CONVERT_FLAG_DISCARD_BOM,
322          EXPECT(bom_removed), EXPECT(bom_removed_inverted) },
323        { INVERT_ENDIAN_FLAG,
324          EXPECT(bom_encoded_inverted), EXPECT(bom_encoded) },
325        { INVERT_ENDIAN_FLAG | UTF_CONVERT_FLAG_DISCARD_BOM,
326          EXPECT(bom_removed_inverted), EXPECT(bom_removed) },
327    };
328#undef EXPECT
329
330    for (const auto& s : SOURCES) {
331        for (const auto& e : EXPECTED) {
332            char case_id[64];
333            zx_status_t res;
334            size_t enc_len = sizeof(actual);
335
336            ::memset(actual, 0xAB, sizeof(actual));
337            snprintf(case_id, sizeof(case_id), "case id [%s BOM, %s endian]",
338                     (e.flags & UTF_CONVERT_FLAG_DISCARD_BOM) ? "discard" : "encode",
339                     (e.flags & HOST_ENDIAN_FLAG) ? "host" :
340                     (e.flags & INVERT_ENDIAN_FLAG) ? "invert" : "detect");
341
342            res = utf16_to_utf8(s.src, fbl::count_of(s.src), actual, &enc_len, e.flags);
343            ASSERT_EQ(ZX_OK, res, case_id);
344
345            if (s.host_order) {
346                ASSERT_UTF8_EQ(e.host.exp, e.host.len, actual, sizeof(actual), enc_len, case_id);
347            } else {
348                ASSERT_UTF8_EQ(e.inv.exp, e.inv.len, actual, sizeof(actual), enc_len, case_id);
349            }
350        }
351    }
352
353    END_TEST;
354}
355
356BEGIN_TEST_CASE(utf_conversion_tests)
357RUN_TEST(utf16to8_bad_args);
358RUN_TEST(utf16to8_empty_source);
359RUN_TEST(utf16to8_simple_codepoints);
360RUN_TEST(utf16to8_paired_surrogates);
361RUN_TEST(utf16to8_unpaired_surrogates);
362RUN_TEST(utf16to8_dst_buffer_lengths);
363RUN_TEST(utf16to8_endianness_and_bom);
364END_TEST_CASE(utf_conversion_tests)
365
366int main(int argc, char** argv) {
367    return unittest_run_all_tests(argc, argv) ? 0 : -1;
368}
369