1// Copyright 2018 The Fuchsia Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include <endian.h> 6#include <stdio.h> 7#include <unittest/unittest.h> 8#include <utf_conversion/utf_conversion.h> 9 10#include <fbl/algorithm.h> 11 12#if (BYTE_ORDER == BIG_ENDIAN) 13static constexpr uint32_t HOST_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN; 14static constexpr uint32_t INVERT_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN; 15#else 16static constexpr uint32_t HOST_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN; 17static constexpr uint32_t INVERT_ENDIAN_FLAG = UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN; 18#endif 19 20#define ASSERT_UTF8_EQ(expected, expected_len, actual, actual_bytes, enc_len, msg) \ 21 do { \ 22 ASSERT_GE(actual_bytes, expected_len, msg); \ 23 ASSERT_EQ(expected_len, enc_len, msg); \ 24 ASSERT_BYTES_EQ(expected, actual, expected_len, msg); \ 25 } while(false) 26 27static bool utf16to8_bad_args(void) { 28 BEGIN_TEST; 29 30 uint16_t src; 31 uint8_t dst = 0xFE; 32 size_t dst_len; 33 zx_status_t res; 34 35 // Bad destination buffer with non-zero destination length 36 dst_len = 1; 37 res = utf16_to_utf8(&src, 1, nullptr, &dst_len); 38 ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "null dst should fail with INVALID_ARGS"); 39 ASSERT_EQ(1, dst_len, "dst_len modified after conversion with invalid args"); 40 41 // Bad dest len pointer 42 res = utf16_to_utf8(&src, 1, &dst, nullptr); 43 ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "null dst_len should fail with INVALID_ARGS"); 44 ASSERT_EQ(0xFE, dst, "dst modified after conversion with invalid args"); 45 46 // Bad (undefined) flags 47 res = utf16_to_utf8(&src, 1, &dst, &dst_len, 0x80000000); 48 ASSERT_EQ(ZX_ERR_INVALID_ARGS, res, "undefined flags should fail with INVALID_ARGS"); 49 ASSERT_EQ(1, dst_len, "dst_len modified after conversion with invalid args"); 50 ASSERT_EQ(0xFE, dst, "dst modified after conversion with invalid args"); 51 52 // A null dest buffer is allowed if (and only if) the dst_len is zero. 53 // Practical use cases include using the converter to determine the length 54 // needed to hold a converted string. 55 dst_len = 0; 56 src = 0xAB; 57 res = utf16_to_utf8(&src, 1, nullptr, &dst_len); 58 ASSERT_EQ(ZX_OK, res, "null dst with zero dst_len should succeed"); 59 ASSERT_EQ(2, dst_len, "encoded size of 0xAB should be 2!"); 60 61 END_TEST; 62} 63 64static bool utf16to8_empty_source(void) { 65 BEGIN_TEST; 66 67 uint16_t src; 68 static const uint8_t expected[] = { 0xA1, 0xB2, 0xC3, 0xD4 }; 69 uint8_t actual[sizeof(expected)]; 70 size_t dst_len; 71 zx_status_t res; 72 73 // Check to make sure that attempting to encode a zero length source results 74 // in a length of zero and no changes to the destination buffer. 75 memcpy(actual, expected, sizeof(actual)); 76 dst_len = sizeof(actual); 77 res = utf16_to_utf8(&src, 0,actual, &dst_len); 78 ASSERT_EQ(ZX_OK, res, "zero length string conversion failed"); 79 ASSERT_EQ(0, dst_len, "dst_len should be zero after zero length string conversion"); 80 ASSERT_BYTES_EQ(expected, actual, sizeof(actual), 81 "dst buffer modified after zero length string conversion"); 82 83 dst_len = sizeof(actual); 84 res = utf16_to_utf8(nullptr, 1,actual, &dst_len); 85 ASSERT_EQ(ZX_OK, res, "null source string conversion failed"); 86 ASSERT_EQ(0, dst_len, "dst_len should be zero after null source string conversion"); 87 ASSERT_BYTES_EQ(expected, actual, sizeof(actual), 88 "dst buffer modified after null source string conversion"); 89 90 END_TEST; 91} 92 93static bool utf16to8_simple_codepoints(void) { 94 BEGIN_TEST; 95 96 static const struct { 97 uint16_t src; 98 uint8_t expected[3]; 99 size_t expected_len; 100 } TEST_VECTORS[] = { 101 // 1 byte UTF-8 codepoints (U+0000, U+007F) 102 { 0x0000, { 0x00 }, 1 }, 103 { 0x0001, { 0x01 }, 1 }, 104 { 0x007f, { 0x7f }, 1 }, 105 106 // 2 byte UTF-8 codepoints (U+0080, U+07FF) 107 { 0x0080, { 0xC2, 0x80 }, 2 }, 108 { 0x0456, { 0xD1, 0x96 }, 2 }, 109 { 0x07FF, { 0xDF, 0xBF }, 2 }, 110 111 // 3 byte UTF-8 codepoints (U+0800, U+07FF) 112 // Note: we are skipping the (theoretically illegal) unpaired surrogate 113 // range (U+D800, U+DFFF) here. There is a separate test for support of 114 // unpaired surrogates. 115 { 0x0800, { 0xE0, 0xA0, 0x80 }, 3 }, 116 { 0x4567, { 0xE4, 0x95, 0xA7 }, 3 }, 117 { 0xD7FF, { 0xED, 0x9F, 0xBF }, 3 }, 118 { 0xE000, { 0xEE, 0x80, 0x80 }, 3 }, 119 { 0xE456, { 0xEE, 0x91, 0x96 }, 3 }, 120 { 0xFFFF, { 0xEF, 0xBF, 0xBF }, 3 }, 121 }; 122 123 uint8_t actual[3]; 124 for (const auto& v : TEST_VECTORS) { 125 char case_id[64]; 126 size_t encoded_len = sizeof(actual); 127 zx_status_t res; 128 129 snprintf(case_id, sizeof(case_id), "case id [0x%04hx]", v.src); 130 ::memset(actual, 0xAB, sizeof(actual)); 131 132 res = utf16_to_utf8(&v.src, 1, actual, &encoded_len); 133 ASSERT_EQ(ZX_OK, res, case_id); 134 ASSERT_LE(v.expected_len, sizeof(v.expected), case_id); 135 ASSERT_UTF8_EQ(v.expected, v.expected_len, 136 actual, sizeof(actual), 137 encoded_len, case_id); 138 } 139 140 END_TEST; 141} 142 143static bool utf16to8_paired_surrogates(void) { 144 BEGIN_TEST; 145 146 // All paired surrogate encodings are going to be 4 byte UTF-8 codepoints (U+010000, U+10FFFF) 147 static const struct { 148 uint16_t src[2]; 149 uint8_t expected[4]; 150 } TEST_VECTORS[] = { 151 { { 0xD800, 0xDC00 }, { 0xF0, 0x90, 0x80, 0x80 } }, // U+10000 152 { { 0xD811, 0xDD67 }, { 0xF0, 0x94, 0x95, 0xA7 } }, // U+14567 153 { { 0xDA6F, 0xDCDE }, { 0xF2, 0xAB, 0xB3, 0x9E } }, // U+ABCDE 154 { { 0xDBBF, 0xDFFF }, { 0xF3, 0xBF, 0xBF, 0xBF } }, // U+FFFFF 155 { { 0xDBC0, 0xDC00 }, { 0xF4, 0x80, 0x80, 0x80 } }, // U+100000 156 { { 0xDBD1, 0xDD67 }, { 0xF4, 0x84, 0x95, 0xA7 } }, // U+104567 157 { { 0xDBFF, 0xDFFF }, { 0xF4, 0x8F, 0xBF, 0xBF } }, // U+10FFFF 158 }; 159 160 uint8_t actual[4]; 161 for (const auto& v : TEST_VECTORS) { 162 char case_id[64]; 163 size_t encoded_len = sizeof(actual); 164 zx_status_t res; 165 166 snprintf(case_id, sizeof(case_id), "case id [0x%04hx : 0x%04hx]", v.src[0], v.src[1]); 167 ::memset(actual, 0xAB, sizeof(actual)); 168 169 res = utf16_to_utf8(v.src, fbl::count_of(v.src), actual, &encoded_len); 170 ASSERT_EQ(ZX_OK, res, case_id); 171 ASSERT_UTF8_EQ(v.expected, sizeof(v.expected), 172 actual, sizeof(actual), 173 encoded_len, case_id); 174 } 175 176 END_TEST; 177} 178 179static bool utf16to8_unpaired_surrogates(void) { 180 BEGIN_TEST; 181 182 static const struct { 183 uint16_t src; 184 uint8_t expected[3]; 185 } TEST_VECTORS[] = { 186 // All unpaired surrogates are technically supposed to be illegal, but 187 // apparently there are systems out there who use them any (Wikipedia 188 // claims that Windows allows unpaired surrogates in file names encoded 189 // using UTF-16) 190 // 191 // Unpaired surrogates are 16 bits wide, so they will require a 3-byte 192 // UTF-8 encoding. 193 { 0xD800, { 0xED, 0xA0, 0x80 } }, 194 { 0xD945, { 0xED, 0xA5, 0x85 } }, 195 { 0xDBFF, { 0xED, 0xAF, 0xBF } }, 196 { 0xDC00, { 0xED, 0xB0, 0x80 } }, 197 { 0xDD45, { 0xED, 0xB5, 0x85 } }, 198 { 0xDFFF, { 0xED, 0xBF, 0xBF } }, 199 }; 200 uint8_t replace[3] = { 0xEF, 0xBF, 0xBD }; 201 uint8_t actual[3]; 202 for (const auto& v : TEST_VECTORS) { 203 char case_id[64]; 204 size_t encoded_len = sizeof(actual); 205 zx_status_t res; 206 207 // Attempt to encode the unpaired surrogate, but do not specify that we 208 // want to preserve it. We should end up with the encoded form of the 209 // replacement character (U+FFFD) instead. 210 snprintf(case_id, sizeof(case_id), "case id [0x%04hx, replace]", v.src); 211 ::memset(actual, 0xAB, sizeof(actual)); 212 213 encoded_len = sizeof(actual); 214 res = utf16_to_utf8(&v.src, 1, actual, &encoded_len); 215 ASSERT_EQ(ZX_OK, res, case_id); 216 ASSERT_UTF8_EQ(replace, sizeof(replace), actual, sizeof(actual), 217 encoded_len, case_id); 218 219 // Do it again, but this time tell the converter to preserve the 220 // unpaired surrogate instead. 221 snprintf(case_id, sizeof(case_id), "case id [0x%04hx, preserve]", v.src); 222 ::memset(actual, 0xAB, sizeof(actual)); 223 224 encoded_len = sizeof(actual); 225 res = utf16_to_utf8(&v.src, 1, actual, &encoded_len, 226 UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES); 227 ASSERT_EQ(ZX_OK, res, case_id); 228 ASSERT_UTF8_EQ(v.expected, sizeof(v.expected), actual, sizeof(actual), 229 encoded_len, case_id); 230 } 231 232 END_TEST; 233} 234 235static bool utf16to8_dst_buffer_lengths(void) { 236 BEGIN_TEST; 237 238 const uint16_t src[] = { 'T', 'e', 's', 't' }; 239 const uint8_t expected[] = { 'T', 'e', 's', 't' }; 240 uint8_t actual[16]; 241 242 // Perform a conversion, but test three cases. 243 // 244 // 1) The destination buffer size is exactly what is required. 245 // 2) The destination buffer size is more than what is required. 246 // 3) The destination buffer size is less than what is required. 247 static const size_t DST_LENGTHS[] = { sizeof(expected), sizeof(actual), sizeof(expected) >> 1 }; 248 for (const auto& d : DST_LENGTHS) { 249 char case_id[64]; 250 size_t encoded_len = d; 251 zx_status_t res; 252 253 snprintf(case_id, sizeof(case_id), "case id [needed %zu, provided %zu]", 254 sizeof(expected), d); 255 ::memset(actual, 0xAB, sizeof(actual)); 256 257 ASSERT_LE(encoded_len, sizeof(actual), case_id); 258 res = utf16_to_utf8(src, fbl::count_of(src), actual, &encoded_len); 259 260 ASSERT_EQ(ZX_OK, res, case_id); 261 ASSERT_EQ(sizeof(expected), encoded_len, case_id); 262 static_assert(sizeof(expected) <= sizeof(actual), 263 "'actual' buffer must be large enough to hold 'expected' result"); 264 ASSERT_BYTES_EQ(expected, actual, d < encoded_len ? d : encoded_len, case_id); 265 266 if (d < sizeof(actual)) { 267 uint8_t pattern[sizeof(actual)]; 268 ::memset(pattern, 0xAB, sizeof(pattern)); 269 ASSERT_BYTES_EQ(actual + d, pattern, sizeof(actual) - d, case_id); 270 } 271 } 272 273 END_TEST; 274} 275 276static bool utf16to8_endianness_and_bom(void) { 277 BEGIN_TEST; 278 279 static const struct { 280 uint16_t src[5]; 281 bool host_order; 282 } SOURCES[] = { 283 { { 0xFEFF, 'T', 'e', 's', 't' }, true }, 284 { { __bswap16(0xFEFF), 285 __bswap16('T'), 286 __bswap16('e'), 287 __bswap16('s'), 288 __bswap16('t'), 289 }, false } 290 }; 291 292 const uint8_t bom_removed[] = { 'T', 'e', 's', 't' }; 293 const uint8_t bom_removed_inverted[] = { 294 0xE5, 0x90, 0x80, 0xE6, 0x94, 0x80, 0xE7, 295 0x8C, 0x80, 0xE7, 0x90, 0x80 }; 296 const uint8_t bom_encoded[] = { 0xEF, 0xBB, 0xBF, 'T', 'e', 's', 't' }; 297 const uint8_t bom_encoded_inverted[] = { 298 0xEF, 0xBF, 0xBE, 0xE5, 0x90, 0x80, 0xE6, 299 0x94, 0x80, 0xE7, 0x8C, 0x80, 0xE7, 0x90, 300 0x80 }; 301 uint8_t actual[fbl::count_of(bom_encoded_inverted)]; 302 303#define EXPECT(e) { e, sizeof(e) } 304 static const struct { 305 uint32_t flags; 306 struct { 307 const uint8_t* exp; 308 size_t len; 309 } host; 310 struct { 311 const uint8_t* exp; 312 size_t len; 313 } inv; 314 } EXPECTED[] { 315 { 0, 316 EXPECT(bom_encoded), EXPECT(bom_encoded) }, 317 { UTF_CONVERT_FLAG_DISCARD_BOM, 318 EXPECT(bom_removed), EXPECT(bom_removed) }, 319 { HOST_ENDIAN_FLAG, 320 EXPECT(bom_encoded), EXPECT(bom_encoded_inverted) }, 321 { HOST_ENDIAN_FLAG | UTF_CONVERT_FLAG_DISCARD_BOM, 322 EXPECT(bom_removed), EXPECT(bom_removed_inverted) }, 323 { INVERT_ENDIAN_FLAG, 324 EXPECT(bom_encoded_inverted), EXPECT(bom_encoded) }, 325 { INVERT_ENDIAN_FLAG | UTF_CONVERT_FLAG_DISCARD_BOM, 326 EXPECT(bom_removed_inverted), EXPECT(bom_removed) }, 327 }; 328#undef EXPECT 329 330 for (const auto& s : SOURCES) { 331 for (const auto& e : EXPECTED) { 332 char case_id[64]; 333 zx_status_t res; 334 size_t enc_len = sizeof(actual); 335 336 ::memset(actual, 0xAB, sizeof(actual)); 337 snprintf(case_id, sizeof(case_id), "case id [%s BOM, %s endian]", 338 (e.flags & UTF_CONVERT_FLAG_DISCARD_BOM) ? "discard" : "encode", 339 (e.flags & HOST_ENDIAN_FLAG) ? "host" : 340 (e.flags & INVERT_ENDIAN_FLAG) ? "invert" : "detect"); 341 342 res = utf16_to_utf8(s.src, fbl::count_of(s.src), actual, &enc_len, e.flags); 343 ASSERT_EQ(ZX_OK, res, case_id); 344 345 if (s.host_order) { 346 ASSERT_UTF8_EQ(e.host.exp, e.host.len, actual, sizeof(actual), enc_len, case_id); 347 } else { 348 ASSERT_UTF8_EQ(e.inv.exp, e.inv.len, actual, sizeof(actual), enc_len, case_id); 349 } 350 } 351 } 352 353 END_TEST; 354} 355 356BEGIN_TEST_CASE(utf_conversion_tests) 357RUN_TEST(utf16to8_bad_args); 358RUN_TEST(utf16to8_empty_source); 359RUN_TEST(utf16to8_simple_codepoints); 360RUN_TEST(utf16to8_paired_surrogates); 361RUN_TEST(utf16to8_unpaired_surrogates); 362RUN_TEST(utf16to8_dst_buffer_lengths); 363RUN_TEST(utf16to8_endianness_and_bom); 364END_TEST_CASE(utf_conversion_tests) 365 366int main(int argc, char** argv) { 367 return unittest_run_all_tests(argc, argv) ? 0 : -1; 368} 369