// utest/utf_conversion/main.cpp

// Copyright 2018 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <endian.h>
#include <stdio.h>
#include <unittest/unittest.h>
#include <utf_conversion/utf_conversion.h>

#include <fbl/algorithm.h>

// Converter flags expressed relative to the machine running the tests:
// HOST_ENDIAN_FLAG forces the byte order which matches BYTE_ORDER, while
// INVERT_ENDIAN_FLAG forces the opposite byte order.
#if (BYTE_ORDER != BIG_ENDIAN)
static constexpr uint32_t HOST_ENDIAN_FLAG   = UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN;
static constexpr uint32_t INVERT_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN;
#else
static constexpr uint32_t HOST_ENDIAN_FLAG   = UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN;
static constexpr uint32_t INVERT_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN;
#endif

// Checks one UTF-8 conversion result against its expectation.  Three
// assertions are made, each tagged with |msg|:
//
// 1) The output buffer (|got_capacity| bytes) is at least |want_len| bytes long.
// 2) The encoded length reported (|got_len|) is exactly |want_len|.
// 3) The first |want_len| bytes of |got| are identical to those of |want|.
#define ASSERT_UTF8_EQ(want, want_len, got, got_capacity, got_len, msg)             \
    do {                                                                            \
        ASSERT_GE(got_capacity, want_len, msg);                                     \
        ASSERT_EQ(want_len, got_len, msg);                                          \
        ASSERT_BYTES_EQ(want, got, want_len, msg);                                  \
    } while (false)

// Exercises argument validation in utf16_to_utf8.  A null destination with a
// non-zero length, a null length pointer, and undefined flag bits are each
// expected to be rejected with ZX_ERR_INVALID_ARGS while leaving the caller's
// outputs untouched.  A null destination paired with a zero length is expected
// to succeed and report the size needed to hold the encoded string.
static bool utf16to8_bad_args(void) {
    BEGIN_TEST;

    // Start from a defined value so that nothing indeterminate is ever handed
    // to the converter, even if it were to (incorrectly) look at the source
    // before rejecting the call.
    uint16_t src = 0;
    uint8_t dst = 0xFE;
    size_t dst_len;
    zx_status_t res;

    // Bad destination buffer with non-zero destination length
    dst_len = 1;
    res = utf16_to_utf8(&src, 1, nullptr, &dst_len);
    ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "null dst should fail with INVALID_ARGS");
    ASSERT_EQ(1, dst_len, "dst_len modified after conversion with invalid args");

    // Bad dest len pointer
    res = utf16_to_utf8(&src, 1, &dst, nullptr);
    ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "null dst_len should fail with INVALID_ARGS");
    ASSERT_EQ(0xFE, dst, "dst modified after conversion with invalid args");

    // Bad (undefined) flags.  Re-arm dst_len explicitly so that this check does
    // not depend on the state left behind by the cases above.
    dst_len = 1;
    res = utf16_to_utf8(&src, 1, &dst, &dst_len, 0x80000000);
    ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "undefined flags should fail with INVALID_ARGS");
    ASSERT_EQ(1, dst_len, "dst_len modified after conversion with invalid args");
    ASSERT_EQ(0xFE, dst, "dst modified after conversion with invalid args");

    // A null dest buffer is allowed if (and only if) the dst_len is zero.
    // Practical use cases include using the converter to determine the length
    // needed to hold a converted string.
    dst_len = 0;
    src = 0xAB;
    res = utf16_to_utf8(&src, 1, nullptr, &dst_len);
    ASSERT_EQ(ZX_OK, res, "null dst with zero dst_len should succeed");
    ASSERT_EQ(2, dst_len, "encoded size of 0xAB should be 2!");

    END_TEST;
}

// Verifies that converting "nothing" succeeds, reports an encoded length of
// zero, and leaves the destination buffer untouched.  Two flavors of "nothing"
// are covered: a valid source pointer with a length of zero, and a null source
// pointer with a non-zero length.
static bool utf16to8_empty_source(void) {
    BEGIN_TEST;

    // The value is never meant to be consumed, but keep it defined so that no
    // indeterminate data is reachable through the pointer given to the
    // converter.
    const uint16_t src = 0;
    static const uint8_t expected[] = { 0xA1, 0xB2, 0xC3, 0xD4 };
    uint8_t actual[sizeof(expected)];
    size_t dst_len;
    zx_status_t res;

    // Check to make sure that attempting to encode a zero length source results
    // in a length of zero and no changes to the destination buffer.
    memcpy(actual, expected, sizeof(actual));
    dst_len = sizeof(actual);
    res = utf16_to_utf8(&src, 0, actual, &dst_len);
    ASSERT_EQ(ZX_OK, res, "zero length string conversion failed");
    ASSERT_EQ(0, dst_len, "dst_len should be zero after zero length string conversion");
    ASSERT_BYTES_EQ(expected, actual, sizeof(actual),
                    "dst buffer modified after zero length string conversion");

    // A null source pointer is expected to be treated the same way, even when
    // the source length claims that there is something to convert.
    dst_len = sizeof(actual);
    res = utf16_to_utf8(nullptr, 1, actual, &dst_len);
    ASSERT_EQ(ZX_OK, res, "null source string conversion failed");
    ASSERT_EQ(0, dst_len, "dst_len should be zero after null source string conversion");
    ASSERT_BYTES_EQ(expected, actual, sizeof(actual),
                    "dst buffer modified after null source string conversion");

    END_TEST;
}

// Converts a selection of single UTF-16 code units, one at a time, and checks
// both the UTF-8 bytes produced and the encoded length reported for each.
static bool utf16to8_simple_codepoints(void) {
    BEGIN_TEST;

    // A single UTF-16 code unit along with the UTF-8 bytes it should become.
    struct Vector {
        uint16_t src;
        uint8_t expected[3];
        size_t  expected_len;
    };

    static const Vector CASES[] = {
        // Codepoints which encode to 1 byte of UTF-8 (U+0000 through U+007F)
        { 0x0000, { 0x00 }, 1 },
        { 0x0001, { 0x01 }, 1 },
        { 0x007f, { 0x7f }, 1 },

        // Codepoints which encode to 2 bytes of UTF-8 (U+0080 through U+07FF)
        { 0x0080, { 0xC2, 0x80 }, 2 },
        { 0x0456, { 0xD1, 0x96 }, 2 },
        { 0x07FF, { 0xDF, 0xBF }, 2 },

        // Codepoints which encode to 3 bytes of UTF-8 (U+0800 through U+FFFF)
        //
        // The surrogate range (U+D800 through U+DFFF) is deliberately left out
        // of this table.  Unpaired surrogates are (theoretically) illegal, and
        // support for them is covered by a separate test.
        { 0x0800, { 0xE0, 0xA0, 0x80 }, 3 },
        { 0x4567, { 0xE4, 0x95, 0xA7 }, 3 },
        { 0xD7FF, { 0xED, 0x9F, 0xBF }, 3 },
        { 0xE000, { 0xEE, 0x80, 0x80 }, 3 },
        { 0xE456, { 0xEE, 0x91, 0x96 }, 3 },
        { 0xFFFF, { 0xEF, 0xBF, 0xBF }, 3 },
    };

    uint8_t actual[3];
    for (const auto& v : CASES) {
        // Tag any failure with the code unit being converted.
        char case_id[64];
        snprintf(case_id, sizeof(case_id), "case id [0x%04hx]", v.src);

        // Fill the output with a known pattern before each conversion, and
        // offer the converter the whole buffer.
        ::memset(actual, 0xAB, sizeof(actual));
        size_t encoded_len = sizeof(actual);

        zx_status_t res = utf16_to_utf8(&v.src, 1, actual, &encoded_len);
        ASSERT_EQ(ZX_OK, res, case_id);

        // Sanity check the table entry itself before comparing against it.
        ASSERT_LE(v.expected_len, sizeof(v.expected), case_id);
        ASSERT_UTF8_EQ(v.expected, v.expected_len,
                       actual, sizeof(actual),
                       encoded_len, case_id);
    }

    END_TEST;
}

// Converts well-formed surrogate pairs, one pair at a time, and checks that
// each pair becomes the expected 4 byte UTF-8 sequence.
static bool utf16to8_paired_surrogates(void) {
    BEGIN_TEST;

    // A high/low surrogate pair along with the UTF-8 bytes it should become.
    // Every paired surrogate represents a codepoint in the range U+010000
    // through U+10FFFF, so the expected encoding is always 4 bytes long.
    struct Vector {
        uint16_t src[2];
        uint8_t expected[4];
    };

    static const Vector PAIRS[] = {
        { { 0xD800, 0xDC00 }, { 0xF0, 0x90, 0x80, 0x80 } }, // U+10000
        { { 0xD811, 0xDD67 }, { 0xF0, 0x94, 0x95, 0xA7 } }, // U+14567
        { { 0xDA6F, 0xDCDE }, { 0xF2, 0xAB, 0xB3, 0x9E } }, // U+ABCDE
        { { 0xDBBF, 0xDFFF }, { 0xF3, 0xBF, 0xBF, 0xBF } }, // U+FFFFF
        { { 0xDBC0, 0xDC00 }, { 0xF4, 0x80, 0x80, 0x80 } }, // U+100000
        { { 0xDBD1, 0xDD67 }, { 0xF4, 0x84, 0x95, 0xA7 } }, // U+104567
        { { 0xDBFF, 0xDFFF }, { 0xF4, 0x8F, 0xBF, 0xBF } }, // U+10FFFF
    };

    uint8_t actual[4];
    for (const auto& v : PAIRS) {
        // Tag any failure with the two code units being converted.
        char case_id[64];
        snprintf(case_id, sizeof(case_id), "case id [0x%04hx : 0x%04hx]", v.src[0], v.src[1]);

        // Fill the output with a known pattern before each conversion, and
        // offer the converter the whole buffer.
        ::memset(actual, 0xAB, sizeof(actual));
        size_t encoded_len = sizeof(actual);

        zx_status_t res = utf16_to_utf8(v.src, fbl::count_of(v.src), actual, &encoded_len);
        ASSERT_EQ(ZX_OK, res, case_id);
        ASSERT_UTF8_EQ(v.expected, sizeof(v.expected),
                       actual, sizeof(actual),
                       encoded_len, case_id);
    }

    END_TEST;
}

// Converts lone surrogates, one at a time, in two modes.  Without any flags
// the result is expected to be the encoded replacement character (U+FFFD).
// With UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES the result is expected to
// be the 3 byte encoding of the surrogate value itself.
static bool utf16to8_unpaired_surrogates(void) {
    BEGIN_TEST;

    // A lone surrogate along with its UTF-8 encoding when preserved.
    struct Vector {
        uint16_t src;
        uint8_t expected[3];
    };

    // Unpaired surrogates are technically supposed to be illegal, but
    // apparently there are systems out there which use them anyway (Wikipedia
    // claims that Windows allows unpaired surrogates in file names encoded
    // using UTF-16).
    //
    // An unpaired surrogate is 16 bits wide, so preserving one requires a
    // 3-byte UTF-8 encoding.
    static const Vector SURROGATES[] = {
        { 0xD800, { 0xED, 0xA0, 0x80 } },
        { 0xD945, { 0xED, 0xA5, 0x85 } },
        { 0xDBFF, { 0xED, 0xAF, 0xBF } },
        { 0xDC00, { 0xED, 0xB0, 0x80 } },
        { 0xDD45, { 0xED, 0xB5, 0x85 } },
        { 0xDFFF, { 0xED, 0xBF, 0xBF } },
    };

    // UTF-8 encoding of the replacement character (U+FFFD).
    const uint8_t replace[3] = { 0xEF, 0xBF, 0xBD };
    uint8_t actual[3];

    for (const auto& v : SURROGATES) {
        // First pass: no flags.  The converter has not been asked to preserve
        // the unpaired surrogate, so the replacement character should come
        // back in its place.
        {
            char case_id[64];
            snprintf(case_id, sizeof(case_id), "case id [0x%04hx, replace]", v.src);

            ::memset(actual, 0xAB, sizeof(actual));
            size_t encoded_len = sizeof(actual);

            zx_status_t res = utf16_to_utf8(&v.src, 1, actual, &encoded_len);
            ASSERT_EQ(ZX_OK, res, case_id);
            ASSERT_UTF8_EQ(replace, sizeof(replace), actual, sizeof(actual),
                           encoded_len, case_id);
        }

        // Second pass: same input, but with the converter told to preserve the
        // unpaired surrogate instead of replacing it.
        {
            char case_id[64];
            snprintf(case_id, sizeof(case_id), "case id [0x%04hx, preserve]", v.src);

            ::memset(actual, 0xAB, sizeof(actual));
            size_t encoded_len = sizeof(actual);

            zx_status_t res = utf16_to_utf8(&v.src, 1, actual, &encoded_len,
                                            UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES);
            ASSERT_EQ(ZX_OK, res, case_id);
            ASSERT_UTF8_EQ(v.expected, sizeof(v.expected), actual, sizeof(actual),
                           encoded_len, case_id);
        }
    }

    END_TEST;
}

// Converts the same short string into destination buffers of different sizes.
// In every case the conversion is expected to succeed and to report the full
// encoded length, to produce correct bytes for as much of the result as fits,
// and to leave every byte past the advertised destination size untouched.
static bool utf16to8_dst_buffer_lengths(void) {
    BEGIN_TEST;

    const uint16_t src[] = { 'T', 'e', 's', 't' };
    const uint8_t expected[] = { 'T', 'e', 's', 't' };
    uint8_t actual[16];
    static_assert(sizeof(expected) <= sizeof(actual),
                  "'actual' buffer must be large enough to hold 'expected' result");

    // Reference copy of the fill pattern, used to detect writes past the end
    // of the advertised destination size.  It never changes, so build it once
    // instead of once per case.
    uint8_t pattern[sizeof(actual)];
    ::memset(pattern, 0xAB, sizeof(pattern));

    // Perform a conversion, but test three cases.
    //
    // 1) The destination buffer size is exactly what is required.
    // 2) The destination buffer size is more than what is required.
    // 3) The destination buffer size is less than what is required.
    static const size_t DST_LENGTHS[] = { sizeof(expected), sizeof(actual), sizeof(expected) >> 1 };
    for (const auto& d : DST_LENGTHS) {
        char case_id[64];
        size_t encoded_len = d;
        zx_status_t res;

        snprintf(case_id, sizeof(case_id), "case id [needed %zu, provided %zu]",
                 sizeof(expected), d);
        ::memset(actual, 0xAB, sizeof(actual));

        // Never advertise more space than the real buffer has.
        ASSERT_LE(encoded_len, sizeof(actual), case_id);
        res = utf16_to_utf8(src, fbl::count_of(src), actual, &encoded_len);

        // Even when the destination is too short, the call should succeed and
        // report the length needed to hold the whole encoded string.
        ASSERT_EQ(ZX_OK, res, case_id);
        ASSERT_EQ(sizeof(expected), encoded_len, case_id);

        // Only the bytes which actually fit in the advertised space can be
        // compared against the expected encoding.
        ASSERT_BYTES_EQ(expected, actual, d < encoded_len ? d : encoded_len, case_id);

        // Everything past the advertised size must still hold the fill
        // pattern.  The pattern is the expected value here, and the tail of
        // the destination buffer is the value being checked.
        if (d < sizeof(actual)) {
            ASSERT_BYTES_EQ(pattern, actual + d, sizeof(actual) - d, case_id);
        }
    }

    END_TEST;
}

// Converts a BOM-prefixed string, supplied once in host byte order and once
// byte-swapped, under every combination of the "force endianness" and "discard
// BOM" flags, and checks the result against the encoding expected for that
// combination of source and flags.
static bool utf16to8_endianness_and_bom(void) {
    BEGIN_TEST;

    // The same text ("Test" preceded by a byte order marker), stored in host
    // byte order and in the inverse of host byte order.
    static const struct {
        uint16_t src[5];
        bool host_order;
    } SOURCES[] = {
        { { 0xFEFF, 'T', 'e', 's', 't' }, true },
        { { __bswap16(0xFEFF),
            __bswap16('T'),
            __bswap16('e'),
            __bswap16('s'),
            __bswap16('t'),
            }, false }
    };

    // The expected encodings.  These have static storage duration because the
    // EXPECTED table below is itself static and holds pointers to them.  A
    // static table is initialized only once, so pointing it at automatic
    // arrays would leave it referring to a dead stack frame if this function
    // were ever entered a second time.
    //
    // "Test", correctly interpreted, with the BOM dropped.
    static const uint8_t bom_removed[] = { 'T', 'e', 's', 't' };
    // The text interpreted using the wrong byte order (code units 0x5400,
    // 0x6500, 0x7300 and 0x7400), with the leading marker dropped.
    static const uint8_t bom_removed_inverted[] = {
        0xE5, 0x90, 0x80, 0xE6, 0x94, 0x80, 0xE7,
        0x8C, 0x80, 0xE7, 0x90, 0x80 };
    // "Test", correctly interpreted, preceded by the encoded BOM (U+FEFF).
    static const uint8_t bom_encoded[] = { 0xEF, 0xBB, 0xBF, 'T', 'e', 's', 't' };
    // The text interpreted using the wrong byte order, preceded by the
    // encoding of the byte-swapped marker (0xFFFE).
    static const uint8_t bom_encoded_inverted[] = {
        0xEF, 0xBF, 0xBE, 0xE5, 0x90, 0x80, 0xE6,
        0x94, 0x80, 0xE7, 0x8C, 0x80, 0xE7, 0x90,
        0x80 };

    // Sized to hold the longest of the expected encodings.
    uint8_t actual[fbl::count_of(bom_encoded_inverted)];

    // For each set of flags: the encoding expected when the source is in host
    // byte order (host), and when the source is byte-swapped (inv).
#define EXPECT(e) { e, sizeof(e) }
    static const struct {
        uint32_t flags;
        struct {
            const uint8_t* exp;
            size_t len;
        } host;
        struct {
            const uint8_t* exp;
            size_t len;
        } inv;
    } EXPECTED[] {
        // No forced byte order: both sources should decode to the same text.
        { 0,
          EXPECT(bom_encoded), EXPECT(bom_encoded) },
        { UTF_CONVERT_FLAG_DISCARD_BOM,
          EXPECT(bom_removed), EXPECT(bom_removed) },
        // Forced host byte order: the byte-swapped source is misinterpreted.
        { HOST_ENDIAN_FLAG,
          EXPECT(bom_encoded), EXPECT(bom_encoded_inverted) },
        { HOST_ENDIAN_FLAG | UTF_CONVERT_FLAG_DISCARD_BOM,
          EXPECT(bom_removed), EXPECT(bom_removed_inverted) },
        // Forced inverse byte order: the host order source is misinterpreted.
        { INVERT_ENDIAN_FLAG,
          EXPECT(bom_encoded_inverted), EXPECT(bom_encoded) },
        { INVERT_ENDIAN_FLAG | UTF_CONVERT_FLAG_DISCARD_BOM,
          EXPECT(bom_removed_inverted), EXPECT(bom_removed) },
    };
#undef EXPECT

    for (const auto& s : SOURCES) {
        for (const auto& e : EXPECTED) {
            char case_id[64];
            zx_status_t res;
            size_t enc_len = sizeof(actual);

            ::memset(actual, 0xAB, sizeof(actual));
            snprintf(case_id, sizeof(case_id), "case id [%s BOM, %s endian]",
                     (e.flags & UTF_CONVERT_FLAG_DISCARD_BOM) ? "discard" : "encode",
                     (e.flags & HOST_ENDIAN_FLAG) ? "host" :
                     (e.flags & INVERT_ENDIAN_FLAG) ? "invert" : "detect");

            res = utf16_to_utf8(s.src, fbl::count_of(s.src), actual, &enc_len, e.flags);
            ASSERT_EQ(ZX_OK, res, case_id);

            // Pick the expectation which matches the byte order of the source.
            if (s.host_order) {
                ASSERT_UTF8_EQ(e.host.exp, e.host.len, actual, sizeof(actual), enc_len, case_id);
            } else {
                ASSERT_UTF8_EQ(e.inv.exp, e.inv.len, actual, sizeof(actual), enc_len, case_id);
            }
        }
    }

    END_TEST;
}

// Groups all of the UTF-16 to UTF-8 conversion tests in this file under the
// utf_conversion_tests test case.  The entries are listed in the same order in
// which the test functions are defined above.
BEGIN_TEST_CASE(utf_conversion_tests)
RUN_TEST(utf16to8_bad_args);
RUN_TEST(utf16to8_empty_source);
RUN_TEST(utf16to8_simple_codepoints);
RUN_TEST(utf16to8_paired_surrogates);
RUN_TEST(utf16to8_unpaired_surrogates);
RUN_TEST(utf16to8_dst_buffer_lengths);
RUN_TEST(utf16to8_endianness_and_bom);
END_TEST_CASE(utf_conversion_tests)

// Program entry point.  Hands the command line to the unittest runner and maps
// its result onto the process exit status: 0 when the runner reports success,
// -1 otherwise.
int main(int argc, char** argv) {
    if (unittest_run_all_tests(argc, argv)) {
        return 0;
    }
    return -1;
}