/* test_archive_string_conversion.c revision 232153 */
1/*- 2 * Copyright (c) 2011 Michihiro NAKAJIMA 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25#include "test.h" 26__FBSDID("$FreeBSD$"); 27 28#include <locale.h> 29 30#define __LIBARCHIVE_TEST 31#include "archive_string.h" 32 33/* 34Execute the following to rebuild the data for this program: 35 tail -n +36 test_archive_string_conversion.c | /bin/sh 36# 37# This requires http://unicode.org/Public/UNIDATA/NormalizationTest.txt 38# 39if="NormalizationTest.txt" 40if [ ! 
-f ${if} ]; then 41 echo "Not found: \"${if}\"" 42 exit 0 43fi 44of=test_archive_string_conversion.txt.Z 45echo "\$FreeBSD\$" > ${of}.uu 46awk -F ';' '$0 ~/^[0-9A-F]+/ {printf "%s;%s\n", $2, $3}' ${if} | compress | uuencode ${of} >> ${of}.uu 47exit 1 48*/ 49 50static int 51unicode_to_utf8(char *p, uint32_t uc) 52{ 53 char *_p = p; 54 55 /* Translate code point to UTF8 */ 56 if (uc <= 0x7f) { 57 *p++ = (char)uc; 58 } else if (uc <= 0x7ff) { 59 *p++ = 0xc0 | ((uc >> 6) & 0x1f); 60 *p++ = 0x80 | (uc & 0x3f); 61 } else if (uc <= 0xffff) { 62 *p++ = 0xe0 | ((uc >> 12) & 0x0f); 63 *p++ = 0x80 | ((uc >> 6) & 0x3f); 64 *p++ = 0x80 | (uc & 0x3f); 65 } else { 66 *p++ = 0xf0 | ((uc >> 18) & 0x07); 67 *p++ = 0x80 | ((uc >> 12) & 0x3f); 68 *p++ = 0x80 | ((uc >> 6) & 0x3f); 69 *p++ = 0x80 | (uc & 0x3f); 70 } 71 return ((int)(p - _p)); 72} 73 74static void 75archive_be16enc(void *pp, uint16_t u) 76{ 77 unsigned char *p = (unsigned char *)pp; 78 79 p[0] = (u >> 8) & 0xff; 80 p[1] = u & 0xff; 81} 82 83static int 84unicode_to_utf16be(char *p, uint32_t uc) 85{ 86 char *utf16 = p; 87 88 if (uc > 0xffff) { 89 /* We have a code point that won't fit into a 90 * wchar_t; convert it to a surrogate pair. */ 91 uc -= 0x10000; 92 archive_be16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 93 archive_be16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 94 return (4); 95 } else { 96 archive_be16enc(utf16, uc); 97 return (2); 98 } 99} 100 101static void 102archive_le16enc(void *pp, uint16_t u) 103{ 104 unsigned char *p = (unsigned char *)pp; 105 106 p[0] = u & 0xff; 107 p[1] = (u >> 8) & 0xff; 108} 109 110static size_t 111unicode_to_utf16le(char *p, uint32_t uc) 112{ 113 char *utf16 = p; 114 115 if (uc > 0xffff) { 116 /* We have a code point that won't fit into a 117 * wchar_t; convert it to a surrogate pair. 
*/ 118 uc -= 0x10000; 119 archive_le16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 120 archive_le16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 121 return (4); 122 } else { 123 archive_le16enc(utf16, uc); 124 return (2); 125 } 126} 127 128static int 129wc_size(void) 130{ 131 return (sizeof(wchar_t)); 132} 133 134static int 135unicode_to_wc(wchar_t *wp, uint32_t uc) 136{ 137 if (wc_size() == 4) { 138 *wp = (wchar_t)uc; 139 return (1); 140 } 141 if (uc > 0xffff) { 142 /* We have a code point that won't fit into a 143 * wchar_t; convert it to a surrogate pair. */ 144 uc -= 0x10000; 145 *wp++ = (wchar_t)(((uc >> 10) & 0x3ff) + 0xD800); 146 *wp = (wchar_t)((uc & 0x3ff) + 0xDC00); 147 return (2); 148 } else { 149 *wp = (wchar_t)uc; 150 return (1); 151 } 152} 153 154/* 155 * Note: U+2000 - U+2FFF, U+F900 - U+FAFF and U+2F800 - U+2FAFF are not 156 * converted to NFD on Mac OS. 157 * see also http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html 158 */ 159static int 160scan_unicode_pattern(char *out, wchar_t *wout, char *u16be, char *u16le, 161 const char *pattern, int exclude_mac_nfd) 162{ 163 unsigned uc = 0; 164 const char *p = pattern; 165 char *op = out; 166 wchar_t *owp = wout; 167 char *op16be = u16be; 168 char *op16le = u16le; 169 170 for (;;) { 171 if (*p >= '0' && *p <= '9') 172 uc = (uc << 4) + (*p - '0'); 173 else if (*p >= 'A' && *p <= 'F') 174 uc = (uc << 4) + (*p - 'A' + 0x0a); 175 else { 176 if (exclude_mac_nfd) { 177 /* 178 * These are not converted to NFD on Mac OS. 179 */ 180 if ((uc >= 0x2000 && uc <= 0x2FFF) || 181 (uc >= 0xF900 && uc <= 0xFAFF) || 182 (uc >= 0x2F800 && uc <= 0x2FAFF)) 183 return (-1); 184 /* 185 * Those code points are not converted to 186 * NFD on Mac OS. I do not know the reason 187 * because it is undocumented. 
188 * NFC NFD 189 * 1109A ==> 11099 110BA 190 * 1109C ==> 1109B 110BA 191 * 110AB ==> 110A5 110BA 192 */ 193 if (uc == 0x1109A || uc == 0x1109C || 194 uc == 0x110AB) 195 return (-1); 196 } 197 op16be += unicode_to_utf16be(op16be, uc); 198 op16le += unicode_to_utf16le(op16le, uc); 199 owp += unicode_to_wc(owp, uc); 200 op += unicode_to_utf8(op, uc); 201 if (!*p) { 202 *op16be++ = 0; 203 *op16be = 0; 204 *op16le++ = 0; 205 *op16le = 0; 206 *owp = L'\0'; 207 *op = '\0'; 208 break; 209 } 210 uc = 0; 211 } 212 p++; 213 } 214 return (0); 215} 216 217static int 218is_wc_unicode(void) 219{ 220#if defined(_WIN32) && !defined(__CYGWIN__) 221 return (1); 222#else 223 return (0); 224#endif 225} 226 227/* 228 * A conversion test that we correctly normalize UTF-8 and UTF-16BE characters. 229 * On Mac OS, the characters to be Form D. 230 * On other platforms, the characters to be Form C. 231 */ 232static void 233test_archive_string_normalization(void) 234{ 235 struct archive *a, *a2; 236 struct archive_entry *ae; 237 struct archive_string utf8; 238 struct archive_mstring mstr; 239 struct archive_string_conv *f_sconv8, *t_sconv8; 240 struct archive_string_conv *f_sconv16be, *f_sconv16le; 241 FILE *fp; 242 char buff[512]; 243 static const char reffile[] = "test_archive_string_conversion.txt.Z"; 244 ssize_t size; 245 int line = 0; 246 int locale_is_utf8, wc_is_unicode; 247 248 locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8")); 249 wc_is_unicode = is_wc_unicode(); 250 /* If it doesn't exist, just warn and return. */ 251 if (!locale_is_utf8 && !wc_is_unicode) { 252 skipping("invalid encoding tests require a suitable locale;" 253 " en_US.UTF-8 not available on this system"); 254 return; 255 } 256 257 archive_string_init(&utf8); 258 memset(&mstr, 0, sizeof(mstr)); 259 260 /* 261 * Extract a test pattern file. 
262 */ 263 extract_reference_file(reffile); 264 assert((a = archive_read_new()) != NULL); 265 assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a)); 266 assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_raw(a)); 267 assertEqualIntA(a, ARCHIVE_OK, 268 archive_read_open_filename(a, reffile, 512)); 269 270 assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae)); 271 assert((fp = fopen("testdata.txt", "w")) != NULL); 272 while ((size = archive_read_data(a, buff, 512)) > 0) 273 fwrite(buff, 1, size, fp); 274 fclose(fp); 275 276 /* Open a test pattern file. */ 277 assert((fp = fopen("testdata.txt", "r")) != NULL); 278 279 /* 280 * Create string conversion objects. 281 */ 282 assertA(NULL != (f_sconv8 = 283 archive_string_conversion_from_charset(a, "UTF-8", 0))); 284 assertA(NULL != (f_sconv16be = 285 archive_string_conversion_from_charset(a, "UTF-16BE", 0))); 286 assertA(NULL != (f_sconv16le = 287 archive_string_conversion_from_charset(a, "UTF-16LE", 0))); 288 assert((a2 = archive_write_new()) != NULL); 289 assertA(NULL != (t_sconv8 = 290 archive_string_conversion_to_charset(a2, "UTF-8", 0))); 291 if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL || 292 t_sconv8 == NULL || fp == NULL) { 293 /* We cannot continue this test. */ 294 if (fp != NULL) 295 fclose(fp); 296 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 297 return; 298 } 299 300 /* 301 * Read test data. 
302 * Test data format: 303 * <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n' 304 * Unicode pattern format: 305 * [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,} 306 */ 307 while (fgets(buff, sizeof(buff), fp) != NULL) { 308 char nfc[80], nfd[80]; 309 char utf8_nfc[80], utf8_nfd[80]; 310 char utf16be_nfc[80], utf16be_nfd[80]; 311 char utf16le_nfc[80], utf16le_nfd[80]; 312 wchar_t wc_nfc[40], wc_nfd[40]; 313 char *e, *p; 314 315 line++; 316 if (buff[0] == '#') 317 continue; 318 p = strchr(buff, ';'); 319 if (p == NULL) 320 continue; 321 *p++ = '\0'; 322 /* Copy an NFC pattern */ 323 strncpy(nfc, buff, sizeof(nfc)-1); 324 nfc[sizeof(nfc)-1] = '\0'; 325 e = p; 326 p = strchr(p, '\n'); 327 if (p == NULL) 328 continue; 329 *p = '\0'; 330 /* Copy an NFD pattern */ 331 strncpy(nfd, e, sizeof(nfd)-1); 332 nfd[sizeof(nfd)-1] = '\0'; 333 334 /* 335 * Convert an NFC pattern to UTF-8 bytes. 336 */ 337#if defined(__APPLE__) 338 if (scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc, 339 nfc, 1) != 0) 340 continue; 341#else 342 scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc, 343 nfc, 0); 344#endif 345 346 /* 347 * Convert an NFD pattern to UTF-8 bytes. 348 */ 349 scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd, 350 nfd, 0); 351 352 if (locale_is_utf8) { 353#if defined(__APPLE__) 354 /* 355 * Normalize an NFC string for import. 356 */ 357 assertEqualInt(0, archive_strcpy_in_locale( 358 &utf8, utf8_nfc, f_sconv8)); 359 failure("NFC(%s) should be converted to NFD(%s):%d", 360 nfc, nfd, line); 361 assertEqualUTF8String(utf8_nfd, utf8.s); 362 363 /* 364 * Normalize an NFD string for import. 365 */ 366 assertEqualInt(0, archive_strcpy_in_locale( 367 &utf8, utf8_nfd, f_sconv8)); 368 failure("NFD(%s) should not be any changed:%d", 369 nfd, line); 370 assertEqualUTF8String(utf8_nfd, utf8.s); 371 372 /* 373 * Copy an NFD string for export. 
374 */ 375 assertEqualInt(0, archive_strcpy_in_locale( 376 &utf8, utf8_nfd, t_sconv8)); 377 failure("NFD(%s) should not be any changed:%d", 378 nfd, line); 379 assertEqualUTF8String(utf8_nfd, utf8.s); 380 381 /* 382 * Normalize an NFC string in UTF-16BE for import. 383 */ 384 assertEqualInt(0, archive_strncpy_in_locale( 385 &utf8, utf16be_nfc, 100000, f_sconv16be)); 386 failure("NFC(%s) should be converted to NFD(%s):%d", 387 nfc, nfd, line); 388 assertEqualUTF8String(utf8_nfd, utf8.s); 389 390 /* 391 * Normalize an NFC string in UTF-16LE for import. 392 */ 393 assertEqualInt(0, archive_strncpy_in_locale( 394 &utf8, utf16le_nfc, 100000, f_sconv16le)); 395 failure("NFC(%s) should be converted to NFD(%s):%d", 396 nfc, nfd, line); 397 assertEqualUTF8String(utf8_nfd, utf8.s); 398#else 399 /* 400 * Normalize an NFD string for import. 401 */ 402 assertEqualInt(0, archive_strcpy_in_locale( 403 &utf8, utf8_nfd, f_sconv8)); 404 failure("NFD(%s) should be converted to NFC(%s):%d", 405 nfd, nfc, line); 406 assertEqualUTF8String(utf8_nfc, utf8.s); 407 408 /* 409 * Normalize an NFC string for import. 410 */ 411 assertEqualInt(0, archive_strcpy_in_locale( 412 &utf8, utf8_nfc, f_sconv8)); 413 failure("NFC(%s) should not be any changed:%d", 414 nfc, line); 415 assertEqualUTF8String(utf8_nfc, utf8.s); 416 417 /* 418 * Copy an NFC string for export. 419 */ 420 assertEqualInt(0, archive_strcpy_in_locale( 421 &utf8, utf8_nfc, t_sconv8)); 422 failure("NFC(%s) should not be any changed:%d", 423 nfc, line); 424 assertEqualUTF8String(utf8_nfc, utf8.s); 425 426 /* 427 * Normalize an NFD string in UTF-16BE for import. 428 */ 429 assertEqualInt(0, archive_strncpy_in_locale( 430 &utf8, utf16be_nfd, 100000, f_sconv16be)); 431 failure("NFD(%s) should be converted to NFC(%s):%d", 432 nfd, nfc, line); 433 assertEqualUTF8String(utf8_nfc, utf8.s); 434 435 /* 436 * Normalize an NFD string in UTF-16LE for import. 
437 */ 438 assertEqualInt(0, archive_strncpy_in_locale( 439 &utf8, utf16le_nfd, 100000, f_sconv16le)); 440 failure("NFD(%s) should be converted to NFC(%s):%d", 441 nfd, nfc, line); 442 assertEqualUTF8String(utf8_nfc, utf8.s); 443#endif 444 } 445 446 /* 447 * Test for archive_mstring interface. 448 * In specific, Windows platform UTF-16BE is directly 449 * converted to/from wide-character to avoid the effect of 450 * current locale since windows platform cannot make 451 * locale UTF-8. 452 */ 453 if (locale_is_utf8 || wc_is_unicode) { 454 const wchar_t *wp; 455 const char *mp; 456 size_t mplen; 457 458#if defined(__APPLE__) 459 /* 460 * Normalize an NFD string in UTF-8 for import. 461 */ 462 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 463 &mstr, utf8_nfc, 100000, f_sconv8)); 464 assertEqualInt(0, 465 archive_mstring_get_wcs(a, &mstr, &wp)); 466 failure("UTF-8 NFC(%s) should be converted " 467 "to WCS NFD(%s):%d", nfc, nfd, line); 468 assertEqualWString(wc_nfd, wp); 469 470 /* 471 * Normalize an NFD string in UTF-16BE for import. 472 */ 473 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 474 &mstr, utf16be_nfc, 100000, f_sconv16be)); 475 assertEqualInt(0, 476 archive_mstring_get_wcs(a, &mstr, &wp)); 477 failure("UTF-16BE NFC(%s) should be converted " 478 "to WCS NFD(%s):%d", nfc, nfd, line); 479 assertEqualWString(wc_nfd, wp); 480 481 /* 482 * Normalize an NFD string in UTF-16LE for import. 483 */ 484 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 485 &mstr, utf16le_nfc, 100000, f_sconv16le)); 486 assertEqualInt(0, 487 archive_mstring_get_wcs(a, &mstr, &wp)); 488 failure("UTF-16LE NFC(%s) should be converted " 489 "to WCS NFD(%s):%d", nfc, nfd, line); 490 assertEqualWString(wc_nfd, wp); 491 492 /* 493 * Copy an NFD wide-string for export. 
494 */ 495 assertEqualInt(0, archive_mstring_copy_wcs( 496 &mstr, wc_nfd)); 497 assertEqualInt(0, archive_mstring_get_mbs_l( 498 &mstr, &mp, &mplen, t_sconv8)); 499 failure("WCS NFD(%s) should be UTF-8 NFD:%d" 500 ,nfd, line); 501 assertEqualUTF8String(utf8_nfd, mp); 502#else 503 /* 504 * Normalize an NFD string in UTF-8 for import. 505 */ 506 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 507 &mstr, utf8_nfd, 100000, f_sconv8)); 508 assertEqualInt(0, 509 archive_mstring_get_wcs(a, &mstr, &wp)); 510 failure("UTF-8 NFD(%s) should be converted " 511 "to WCS NFC(%s):%d", nfd, nfc, line); 512 assertEqualWString(wc_nfc, wp); 513 514 /* 515 * Normalize an NFD string in UTF-16BE for import. 516 */ 517 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 518 &mstr, utf16be_nfd, 100000, f_sconv16be)); 519 assertEqualInt(0, 520 archive_mstring_get_wcs(a, &mstr, &wp)); 521 failure("UTF-8 NFD(%s) should be converted " 522 "to WCS NFC(%s):%d", nfd, nfc, line); 523 assertEqualWString(wc_nfc, wp); 524 525 /* 526 * Normalize an NFD string in UTF-16LE for import. 527 */ 528 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 529 &mstr, utf16le_nfd, 100000, f_sconv16le)); 530 assertEqualInt(0, 531 archive_mstring_get_wcs(a, &mstr, &wp)); 532 failure("UTF-8 NFD(%s) should be converted " 533 "to WCS NFC(%s):%d", nfd, nfc, line); 534 assertEqualWString(wc_nfc, wp); 535 536 /* 537 * Copy an NFC wide-string for export. 
538 */ 539 assertEqualInt(0, archive_mstring_copy_wcs( 540 &mstr, wc_nfc)); 541 assertEqualInt(0, archive_mstring_get_mbs_l( 542 &mstr, &mp, &mplen, t_sconv8)); 543 failure("WCS NFC(%s) should be UTF-8 NFC:%d" 544 ,nfc, line); 545 assertEqualUTF8String(utf8_nfc, mp); 546#endif 547 } 548 } 549 550 archive_string_free(&utf8); 551 archive_mstring_clean(&mstr); 552 fclose(fp); 553 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 554 assertEqualInt(ARCHIVE_OK, archive_write_free(a2)); 555} 556 557static void 558test_archive_string_canonicalization(void) 559{ 560 struct archive *a; 561 struct archive_string_conv *sconv; 562 563 setlocale(LC_ALL, "en_US.UTF-8"); 564 565 assert((a = archive_read_new()) != NULL); 566 567 assertA(NULL != (sconv = 568 archive_string_conversion_to_charset(a, "UTF-8", 1))); 569 failure("Charset name should be UTF-8"); 570 assertEqualString("UTF-8", 571 archive_string_conversion_charset_name(sconv)); 572 573 assertA(NULL != (sconv = 574 archive_string_conversion_to_charset(a, "UTF8", 1))); 575 failure("Charset name should be UTF-8"); 576 assertEqualString("UTF-8", 577 archive_string_conversion_charset_name(sconv)); 578 579 assertA(NULL != (sconv = 580 archive_string_conversion_to_charset(a, "utf8", 1))); 581 failure("Charset name should be UTF-8"); 582 assertEqualString("UTF-8", 583 archive_string_conversion_charset_name(sconv)); 584 585 assertA(NULL != (sconv = 586 archive_string_conversion_to_charset(a, "UTF-16BE", 1))); 587 failure("Charset name should be UTF-16BE"); 588 assertEqualString("UTF-16BE", 589 archive_string_conversion_charset_name(sconv)); 590 591 assertA(NULL != (sconv = 592 archive_string_conversion_to_charset(a, "UTF16BE", 1))); 593 failure("Charset name should be UTF-16BE"); 594 assertEqualString("UTF-16BE", 595 archive_string_conversion_charset_name(sconv)); 596 597 assertA(NULL != (sconv = 598 archive_string_conversion_to_charset(a, "utf16be", 1))); 599 failure("Charset name should be UTF-16BE"); 600 
assertEqualString("UTF-16BE", 601 archive_string_conversion_charset_name(sconv)); 602 603 assertA(NULL != (sconv = 604 archive_string_conversion_to_charset(a, "UTF-16LE", 1))); 605 failure("Charset name should be UTF-16LE"); 606 assertEqualString("UTF-16LE", 607 archive_string_conversion_charset_name(sconv)); 608 609 assertA(NULL != (sconv = 610 archive_string_conversion_to_charset(a, "UTF16LE", 1))); 611 failure("Charset name should be UTF-16LE"); 612 assertEqualString("UTF-16LE", 613 archive_string_conversion_charset_name(sconv)); 614 615 assertA(NULL != (sconv = 616 archive_string_conversion_to_charset(a, "utf16le", 1))); 617 failure("Charset name should be UTF-16LE"); 618 assertEqualString("UTF-16LE", 619 archive_string_conversion_charset_name(sconv)); 620 621 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 622 623} 624 625DEFINE_TEST(test_archive_string_conversion) 626{ 627 test_archive_string_normalization(); 628 test_archive_string_canonicalization(); 629} 630