1231200Smm/*- 2238856Smm * Copyright (c) 2011-2012 Michihiro NAKAJIMA 3231200Smm * All rights reserved. 4231200Smm * 5231200Smm * Redistribution and use in source and binary forms, with or without 6231200Smm * modification, are permitted provided that the following conditions 7231200Smm * are met: 8231200Smm * 1. Redistributions of source code must retain the above copyright 9231200Smm * notice, this list of conditions and the following disclaimer. 10231200Smm * 2. Redistributions in binary form must reproduce the above copyright 11231200Smm * notice, this list of conditions and the following disclaimer in the 12231200Smm * documentation and/or other materials provided with the distribution. 13231200Smm * 14231200Smm * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 15231200Smm * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16231200Smm * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17231200Smm * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 18231200Smm * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19231200Smm * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20231200Smm * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21231200Smm * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22231200Smm * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23231200Smm * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24231200Smm */ 25231200Smm#include "test.h" 26231200Smm__FBSDID("$FreeBSD$"); 27231200Smm 28231200Smm#include <locale.h> 29231200Smm 30231200Smm#define __LIBARCHIVE_TEST 31231200Smm#include "archive_string.h" 32231200Smm 33231200Smm/* 34231200SmmExecute the following to rebuild the data for this program: 35231200Smm tail -n +36 test_archive_string_conversion.c | /bin/sh 36231200Smm# 37238856Smm# This requires http://unicode.org/Public/6.0.0/ucd/NormalizationTest.txt 38231200Smm# 39231200Smmif="NormalizationTest.txt" 40231200Smmif [ ! -f ${if} ]; then 41231200Smm echo "Not found: \"${if}\"" 42231200Smm exit 0 43231200Smmfi 44231200Smmof=test_archive_string_conversion.txt.Z 45231200Smmecho "\$FreeBSD\$" > ${of}.uu 46231200Smmawk -F ';' '$0 ~/^[0-9A-F]+/ {printf "%s;%s\n", $2, $3}' ${if} | compress | uuencode ${of} >> ${of}.uu 47231200Smmexit 1 48231200Smm*/ 49231200Smm 50231200Smmstatic int 51231200Smmunicode_to_utf8(char *p, uint32_t uc) 52231200Smm{ 53231200Smm char *_p = p; 54231200Smm 55231200Smm /* Translate code point to UTF8 */ 56231200Smm if (uc <= 0x7f) { 57231200Smm *p++ = (char)uc; 58231200Smm } else if (uc <= 0x7ff) { 59231200Smm *p++ = 0xc0 | ((uc >> 6) & 0x1f); 60231200Smm *p++ = 0x80 | (uc & 0x3f); 61231200Smm } else if (uc <= 0xffff) { 62231200Smm *p++ = 0xe0 | ((uc >> 12) & 0x0f); 63231200Smm *p++ = 0x80 | ((uc >> 6) & 0x3f); 64231200Smm *p++ = 0x80 | (uc & 0x3f); 65231200Smm } else { 66231200Smm *p++ = 0xf0 | ((uc >> 18) & 0x07); 67231200Smm *p++ = 0x80 | ((uc >> 12) & 0x3f); 68231200Smm *p++ = 0x80 | ((uc >> 6) & 0x3f); 69231200Smm *p++ = 0x80 | (uc & 0x3f); 70231200Smm } 71231200Smm return ((int)(p - _p)); 72231200Smm} 73231200Smm 74231200Smmstatic void 75231200Smmarchive_be16enc(void *pp, uint16_t u) 76231200Smm{ 77231200Smm unsigned char *p = (unsigned char *)pp; 78231200Smm 79231200Smm p[0] = (u >> 8) & 0xff; 80231200Smm p[1] = u & 0xff; 81231200Smm} 82231200Smm 83231200Smmstatic int 84231200Smmunicode_to_utf16be(char *p, uint32_t uc) 85231200Smm{ 86231200Smm char *utf16 = p; 87231200Smm 88231200Smm if (uc > 0xffff) { 89231200Smm /* We have a code point that won't fit into a 90231200Smm * wchar_t; convert it to a surrogate pair. */ 91231200Smm uc -= 0x10000; 92231200Smm archive_be16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 93231200Smm archive_be16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 94231200Smm return (4); 95231200Smm } else { 96231200Smm archive_be16enc(utf16, uc); 97231200Smm return (2); 98231200Smm } 99231200Smm} 100231200Smm 101231200Smmstatic void 102231200Smmarchive_le16enc(void *pp, uint16_t u) 103231200Smm{ 104231200Smm unsigned char *p = (unsigned char *)pp; 105231200Smm 106231200Smm p[0] = u & 0xff; 107231200Smm p[1] = (u >> 8) & 0xff; 108231200Smm} 109231200Smm 110231200Smmstatic size_t 111231200Smmunicode_to_utf16le(char *p, uint32_t uc) 112231200Smm{ 113231200Smm char *utf16 = p; 114231200Smm 115231200Smm if (uc > 0xffff) { 116231200Smm /* We have a code point that won't fit into a 117231200Smm * wchar_t; convert it to a surrogate pair. */ 118231200Smm uc -= 0x10000; 119231200Smm archive_le16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 120231200Smm archive_le16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 121231200Smm return (4); 122231200Smm } else { 123231200Smm archive_le16enc(utf16, uc); 124231200Smm return (2); 125231200Smm } 126231200Smm} 127231200Smm 128231200Smmstatic int 129231200Smmwc_size(void) 130231200Smm{ 131231200Smm return (sizeof(wchar_t)); 132231200Smm} 133231200Smm 134231200Smmstatic int 135231200Smmunicode_to_wc(wchar_t *wp, uint32_t uc) 136231200Smm{ 137231200Smm if (wc_size() == 4) { 138231200Smm *wp = (wchar_t)uc; 139231200Smm return (1); 140231200Smm } 141231200Smm if (uc > 0xffff) { 142231200Smm /* We have a code point that won't fit into a 143231200Smm * wchar_t; convert it to a surrogate pair. */ 144231200Smm uc -= 0x10000; 145231200Smm *wp++ = (wchar_t)(((uc >> 10) & 0x3ff) + 0xD800); 146231200Smm *wp = (wchar_t)((uc & 0x3ff) + 0xDC00); 147231200Smm return (2); 148231200Smm } else { 149231200Smm *wp = (wchar_t)uc; 150231200Smm return (1); 151231200Smm } 152231200Smm} 153231200Smm 154231200Smm/* 155231200Smm * Note: U+2000 - U+2FFF, U+F900 - U+FAFF and U+2F800 - U+2FAFF are not 156231200Smm * converted to NFD on Mac OS. 157231200Smm * see also http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html 158231200Smm */ 159231200Smmstatic int 160231200Smmscan_unicode_pattern(char *out, wchar_t *wout, char *u16be, char *u16le, 161238856Smm const char *pattern, int mac_nfd) 162231200Smm{ 163231200Smm unsigned uc = 0; 164231200Smm const char *p = pattern; 165231200Smm char *op = out; 166231200Smm wchar_t *owp = wout; 167231200Smm char *op16be = u16be; 168231200Smm char *op16le = u16le; 169238856Smm int ret = 0; 170231200Smm 171231200Smm for (;;) { 172231200Smm if (*p >= '0' && *p <= '9') 173231200Smm uc = (uc << 4) + (*p - '0'); 174231200Smm else if (*p >= 'A' && *p <= 'F') 175231200Smm uc = (uc << 4) + (*p - 'A' + 0x0a); 176231200Smm else { 177238856Smm if (mac_nfd && op == out) { 178231200Smm /* 179231200Smm * These are not converted to NFD on Mac OS. 180238856Smm * U+2000 - U+2FFF 181238856Smm * U+F900 - U+FAFF 182238856Smm * U+2F800 - U+2FAFF 183231200Smm */ 184238856Smm switch (uc) { 185238856Smm case 0x2194: case 0x219A: case 0x219B: 186238856Smm case 0x21AE: case 0x21CD: case 0x21CE: 187238856Smm case 0x21CF: case 0x2204: case 0x2209: 188238856Smm case 0x220C: case 0x2224: case 0x2226: 189238856Smm case 0x2241: case 0x2244: case 0x2247: 190238856Smm case 0x2249: case 0x2260: case 0x2262: 191238856Smm case 0x226D: case 0x226E: case 0x226F: 192238856Smm case 0x2270: case 0x2271: case 0x2274: 193238856Smm case 0x2275: case 0x2276: case 0x2278: 194238856Smm case 0x2279: case 0x227A: case 0x227B: 195238856Smm case 0x2280: case 0x2281: case 0x2284: 196238856Smm case 0x2285: case 0x2288: case 0x2289: 197238856Smm case 0x22AC: case 0x22AD: case 0x22AE: 198238856Smm case 0x22AF: case 0x22E0: case 0x22E1: 199238856Smm case 0x22E2: case 0x22E3: case 0x22EA: 200238856Smm case 0x22EB: case 0x22EC: case 0x22ED: 201238856Smm 202231200Smm /* 203231200Smm * Those code points are not converted to 204231200Smm * NFD on Mac OS. I do not know the reason 205231200Smm * because it is undocumented. 206231200Smm * NFC NFD 207231200Smm * 1109A ==> 11099 110BA 208231200Smm * 1109C ==> 1109B 110BA 209231200Smm * 110AB ==> 110A5 110BA 210231200Smm */ 211238856Smm case 0x1109A: case 0x1109C: case 0x110AB: 212238856Smm ret = 1; 213238856Smm break; 214238856Smm } 215231200Smm } 216231200Smm op16be += unicode_to_utf16be(op16be, uc); 217231200Smm op16le += unicode_to_utf16le(op16le, uc); 218231200Smm owp += unicode_to_wc(owp, uc); 219231200Smm op += unicode_to_utf8(op, uc); 220231200Smm if (!*p) { 221231200Smm *op16be++ = 0; 222231200Smm *op16be = 0; 223231200Smm *op16le++ = 0; 224231200Smm *op16le = 0; 225231200Smm *owp = L'\0'; 226231200Smm *op = '\0'; 227231200Smm break; 228231200Smm } 229231200Smm uc = 0; 230231200Smm } 231231200Smm p++; 232231200Smm } 233238856Smm return (ret); 234231200Smm} 235231200Smm 236231200Smmstatic int 237231200Smmis_wc_unicode(void) 238231200Smm{ 239231200Smm#if defined(_WIN32) && !defined(__CYGWIN__) 240231200Smm return (1); 241231200Smm#else 242231200Smm return (0); 243231200Smm#endif 244231200Smm} 245231200Smm 246231200Smm/* 247231200Smm * A conversion test that we correctly normalize UTF-8 and UTF-16BE characters. 248231200Smm * On Mac OS, the characters to be Form D. 249231200Smm * On other platforms, the characters to be Form C. 250231200Smm */ 251231200Smmstatic void 252238856Smmtest_archive_string_normalization_nfc(const char *testdata) 253231200Smm{ 254231200Smm struct archive *a, *a2; 255231200Smm struct archive_string utf8; 256231200Smm struct archive_mstring mstr; 257231200Smm struct archive_string_conv *f_sconv8, *t_sconv8; 258231200Smm struct archive_string_conv *f_sconv16be, *f_sconv16le; 259231200Smm FILE *fp; 260231200Smm char buff[512]; 261231200Smm int line = 0; 262231200Smm int locale_is_utf8, wc_is_unicode; 263238856Smm int sconv_opt = SCONV_SET_OPT_NORMALIZATION_C; 264231200Smm 265231200Smm locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8")); 266231200Smm wc_is_unicode = is_wc_unicode(); 267231200Smm /* If it doesn't exist, just warn and return. */ 268231200Smm if (!locale_is_utf8 && !wc_is_unicode) { 269238856Smm skipping("A test of string normalization for NFC requires " 270238856Smm "a suitable locale; en_US.UTF-8 not available on this " 271238856Smm "system"); 272231200Smm return; 273231200Smm } 274231200Smm 275231200Smm archive_string_init(&utf8); 276231200Smm memset(&mstr, 0, sizeof(mstr)); 277231200Smm 278231200Smm /* 279238856Smm * Create string conversion objects. 280231200Smm */ 281231200Smm assert((a = archive_read_new()) != NULL); 282231200Smm assertA(NULL != (f_sconv8 = 283231200Smm archive_string_conversion_from_charset(a, "UTF-8", 0))); 284231200Smm assertA(NULL != (f_sconv16be = 285231200Smm archive_string_conversion_from_charset(a, "UTF-16BE", 0))); 286231200Smm assertA(NULL != (f_sconv16le = 287231200Smm archive_string_conversion_from_charset(a, "UTF-16LE", 0))); 288231200Smm assert((a2 = archive_write_new()) != NULL); 289231200Smm assertA(NULL != (t_sconv8 = 290231200Smm archive_string_conversion_to_charset(a2, "UTF-8", 0))); 291231200Smm if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL || 292238856Smm t_sconv8 == NULL) { 293231200Smm /* We cannot continue this test. */ 294231200Smm assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 295231200Smm return; 296231200Smm } 297238856Smm archive_string_conversion_set_opt(f_sconv8, sconv_opt); 298238856Smm archive_string_conversion_set_opt(f_sconv16be, sconv_opt); 299238856Smm archive_string_conversion_set_opt(f_sconv16le, sconv_opt); 300238856Smm archive_string_conversion_set_opt(t_sconv8, sconv_opt); 301231200Smm 302238856Smm /* Open a test pattern file. */ 303238856Smm assert((fp = fopen(testdata, "r")) != NULL); 304238856Smm 305231200Smm /* 306231200Smm * Read test data. 307231200Smm * Test data format: 308231200Smm * <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n' 309231200Smm * Unicode pattern format: 310231200Smm * [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,} 311231200Smm */ 312231200Smm while (fgets(buff, sizeof(buff), fp) != NULL) { 313231200Smm char nfc[80], nfd[80]; 314231200Smm char utf8_nfc[80], utf8_nfd[80]; 315231200Smm char utf16be_nfc[80], utf16be_nfd[80]; 316231200Smm char utf16le_nfc[80], utf16le_nfd[80]; 317231200Smm wchar_t wc_nfc[40], wc_nfd[40]; 318231200Smm char *e, *p; 319238856Smm const wchar_t *wp; 320238856Smm const char *mp; 321238856Smm size_t mplen; 322231200Smm 323231200Smm line++; 324231200Smm if (buff[0] == '#') 325231200Smm continue; 326231200Smm p = strchr(buff, ';'); 327231200Smm if (p == NULL) 328231200Smm continue; 329231200Smm *p++ = '\0'; 330231200Smm /* Copy an NFC pattern */ 331231200Smm strncpy(nfc, buff, sizeof(nfc)-1); 332231200Smm nfc[sizeof(nfc)-1] = '\0'; 333231200Smm e = p; 334231200Smm p = strchr(p, '\n'); 335231200Smm if (p == NULL) 336231200Smm continue; 337231200Smm *p = '\0'; 338231200Smm /* Copy an NFD pattern */ 339231200Smm strncpy(nfd, e, sizeof(nfd)-1); 340231200Smm nfd[sizeof(nfd)-1] = '\0'; 341231200Smm 342231200Smm /* 343238856Smm * Get an NFC patterns. 344231200Smm */ 345231200Smm scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc, 346231200Smm nfc, 0); 347231200Smm 348231200Smm /* 349238856Smm * Get an NFD patterns. 350231200Smm */ 351231200Smm scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd, 352231200Smm nfd, 0); 353231200Smm 354231200Smm if (locale_is_utf8) { 355231200Smm /* 356231200Smm * Normalize an NFD string for import. 357231200Smm */ 358238856Smm assertEqualInt(0, archive_strcpy_l( 359231200Smm &utf8, utf8_nfd, f_sconv8)); 360231200Smm failure("NFD(%s) should be converted to NFC(%s):%d", 361231200Smm nfd, nfc, line); 362231200Smm assertEqualUTF8String(utf8_nfc, utf8.s); 363231200Smm 364231200Smm /* 365231200Smm * Normalize an NFC string for import. 366231200Smm */ 367238856Smm assertEqualInt(0, archive_strcpy_l( 368231200Smm &utf8, utf8_nfc, f_sconv8)); 369231200Smm failure("NFC(%s) should not be any changed:%d", 370231200Smm nfc, line); 371231200Smm assertEqualUTF8String(utf8_nfc, utf8.s); 372231200Smm 373231200Smm /* 374231200Smm * Copy an NFC string for export. 375231200Smm */ 376238856Smm assertEqualInt(0, archive_strcpy_l( 377231200Smm &utf8, utf8_nfc, t_sconv8)); 378231200Smm failure("NFC(%s) should not be any changed:%d", 379231200Smm nfc, line); 380231200Smm assertEqualUTF8String(utf8_nfc, utf8.s); 381231200Smm 382231200Smm /* 383231200Smm * Normalize an NFD string in UTF-16BE for import. 384231200Smm */ 385238856Smm assertEqualInt(0, archive_strncpy_l( 386231200Smm &utf8, utf16be_nfd, 100000, f_sconv16be)); 387231200Smm failure("NFD(%s) should be converted to NFC(%s):%d", 388231200Smm nfd, nfc, line); 389231200Smm assertEqualUTF8String(utf8_nfc, utf8.s); 390231200Smm 391231200Smm /* 392231200Smm * Normalize an NFD string in UTF-16LE for import. 393231200Smm */ 394238856Smm assertEqualInt(0, archive_strncpy_l( 395231200Smm &utf8, utf16le_nfd, 100000, f_sconv16le)); 396231200Smm failure("NFD(%s) should be converted to NFC(%s):%d", 397231200Smm nfd, nfc, line); 398231200Smm assertEqualUTF8String(utf8_nfc, utf8.s); 399231200Smm } 400231200Smm 401231200Smm /* 402231200Smm * Test for archive_mstring interface. 403231200Smm * In specific, Windows platform UTF-16BE is directly 404231200Smm * converted to/from wide-character to avoid the effect of 405231200Smm * current locale since windows platform cannot make 406231200Smm * locale UTF-8. 407231200Smm */ 408231200Smm if (locale_is_utf8 || wc_is_unicode) { 409231200Smm /* 410231200Smm * Normalize an NFD string in UTF-8 for import. 411231200Smm */ 412231200Smm assertEqualInt(0, archive_mstring_copy_mbs_len_l( 413238856Smm &mstr, utf8_nfd, 100000, f_sconv8)); 414231200Smm assertEqualInt(0, 415231200Smm archive_mstring_get_wcs(a, &mstr, &wp)); 416238856Smm failure("UTF-8 NFD(%s) should be converted " 417238856Smm "to WCS NFC(%s):%d", nfd, nfc, line); 418238856Smm assertEqualWString(wc_nfc, wp); 419231200Smm 420231200Smm /* 421231200Smm * Normalize an NFD string in UTF-16BE for import. 422231200Smm */ 423231200Smm assertEqualInt(0, archive_mstring_copy_mbs_len_l( 424238856Smm &mstr, utf16be_nfd, 100000, f_sconv16be)); 425231200Smm assertEqualInt(0, 426231200Smm archive_mstring_get_wcs(a, &mstr, &wp)); 427238856Smm failure("UTF-8 NFD(%s) should be converted " 428238856Smm "to WCS NFC(%s):%d", nfd, nfc, line); 429238856Smm assertEqualWString(wc_nfc, wp); 430231200Smm 431231200Smm /* 432231200Smm * Normalize an NFD string in UTF-16LE for import. 433231200Smm */ 434231200Smm assertEqualInt(0, archive_mstring_copy_mbs_len_l( 435238856Smm &mstr, utf16le_nfd, 100000, f_sconv16le)); 436231200Smm assertEqualInt(0, 437231200Smm archive_mstring_get_wcs(a, &mstr, &wp)); 438238856Smm failure("UTF-8 NFD(%s) should be converted " 439238856Smm "to WCS NFC(%s):%d", nfd, nfc, line); 440238856Smm assertEqualWString(wc_nfc, wp); 441231200Smm 442231200Smm /* 443238856Smm * Copy an NFC wide-string for export. 444231200Smm */ 445238856Smm assertEqualInt(0, 446238856Smm archive_mstring_copy_wcs(&mstr, wc_nfc)); 447231200Smm assertEqualInt(0, archive_mstring_get_mbs_l( 448368707Smm a, &mstr, &mp, &mplen, t_sconv8)); 449238856Smm failure("WCS NFC(%s) should be UTF-8 NFC:%d" 450238856Smm ,nfc, line); 451238856Smm assertEqualUTF8String(utf8_nfc, mp); 452238856Smm } 453238856Smm } 454238856Smm 455238856Smm archive_string_free(&utf8); 456238856Smm archive_mstring_clean(&mstr); 457238856Smm fclose(fp); 458238856Smm assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 459238856Smm assertEqualInt(ARCHIVE_OK, archive_write_free(a2)); 460238856Smm} 461238856Smm 462238856Smmstatic void 463238856Smmtest_archive_string_normalization_mac_nfd(const char *testdata) 464238856Smm{ 465238856Smm struct archive *a, *a2; 466238856Smm struct archive_string utf8; 467238856Smm struct archive_mstring mstr; 468238856Smm struct archive_string_conv *f_sconv8, *t_sconv8; 469238856Smm struct archive_string_conv *f_sconv16be, *f_sconv16le; 470238856Smm FILE *fp; 471238856Smm char buff[512]; 472238856Smm int line = 0; 473238856Smm int locale_is_utf8, wc_is_unicode; 474238856Smm int sconv_opt = SCONV_SET_OPT_NORMALIZATION_D; 475238856Smm 476238856Smm locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8")); 477238856Smm wc_is_unicode = is_wc_unicode(); 478238856Smm /* If it doesn't exist, just warn and return. */ 479238856Smm if (!locale_is_utf8 && !wc_is_unicode) { 480238856Smm skipping("A test of string normalization for NFD requires " 481238856Smm "a suitable locale; en_US.UTF-8 not available on this " 482238856Smm "system"); 483238856Smm return; 484238856Smm } 485238856Smm 486238856Smm archive_string_init(&utf8); 487238856Smm memset(&mstr, 0, sizeof(mstr)); 488238856Smm 489238856Smm /* 490238856Smm * Create string conversion objects. 491238856Smm */ 492238856Smm assert((a = archive_read_new()) != NULL); 493238856Smm assertA(NULL != (f_sconv8 = 494238856Smm archive_string_conversion_from_charset(a, "UTF-8", 0))); 495238856Smm assertA(NULL != (f_sconv16be = 496238856Smm archive_string_conversion_from_charset(a, "UTF-16BE", 0))); 497238856Smm assertA(NULL != (f_sconv16le = 498238856Smm archive_string_conversion_from_charset(a, "UTF-16LE", 0))); 499238856Smm assert((a2 = archive_write_new()) != NULL); 500238856Smm assertA(NULL != (t_sconv8 = 501238856Smm archive_string_conversion_to_charset(a2, "UTF-8", 0))); 502238856Smm if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL || 503238856Smm t_sconv8 == NULL) { 504238856Smm /* We cannot continue this test. */ 505238856Smm assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 506238856Smm return; 507238856Smm } 508238856Smm archive_string_conversion_set_opt(f_sconv8, sconv_opt); 509238856Smm archive_string_conversion_set_opt(f_sconv16be, sconv_opt); 510238856Smm archive_string_conversion_set_opt(f_sconv16le, sconv_opt); 511238856Smm archive_string_conversion_set_opt(t_sconv8, sconv_opt); 512238856Smm 513238856Smm /* Open a test pattern file. */ 514238856Smm assert((fp = fopen(testdata, "r")) != NULL); 515238856Smm 516238856Smm /* 517238856Smm * Read test data. 518238856Smm * Test data format: 519238856Smm * <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n' 520238856Smm * Unicode pattern format: 521238856Smm * [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,} 522238856Smm */ 523238856Smm while (fgets(buff, sizeof(buff), fp) != NULL) { 524238856Smm char nfc[80], nfd[80]; 525238856Smm char utf8_nfc[80], utf8_nfd[80]; 526238856Smm char utf16be_nfc[80], utf16be_nfd[80]; 527238856Smm char utf16le_nfc[80], utf16le_nfd[80]; 528238856Smm wchar_t wc_nfc[40], wc_nfd[40]; 529238856Smm char *e, *p; 530238856Smm const wchar_t *wp; 531238856Smm const char *mp; 532238856Smm size_t mplen; 533238856Smm int should_be_nfc; 534238856Smm 535238856Smm line++; 536238856Smm if (buff[0] == '#') 537238856Smm continue; 538238856Smm p = strchr(buff, ';'); 539238856Smm if (p == NULL) 540238856Smm continue; 541238856Smm *p++ = '\0'; 542238856Smm /* Copy an NFC pattern */ 543238856Smm strncpy(nfc, buff, sizeof(nfc)-1); 544238856Smm nfc[sizeof(nfc)-1] = '\0'; 545238856Smm e = p; 546238856Smm p = strchr(p, '\n'); 547238856Smm if (p == NULL) 548238856Smm continue; 549238856Smm *p = '\0'; 550238856Smm /* Copy an NFD pattern */ 551238856Smm strncpy(nfd, e, sizeof(nfd)-1); 552238856Smm nfd[sizeof(nfd)-1] = '\0'; 553238856Smm 554238856Smm /* 555238856Smm * Get an NFC patterns. 556238856Smm */ 557238856Smm should_be_nfc = scan_unicode_pattern(utf8_nfc, wc_nfc, 558238856Smm utf16be_nfc, utf16le_nfc, nfc, 1); 559238856Smm 560238856Smm /* 561238856Smm * Get an NFD patterns. 562238856Smm */ 563238856Smm scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd, 564238856Smm nfd, 0); 565238856Smm 566238856Smm if (locale_is_utf8) { 567231200Smm /* 568238856Smm * Normalize an NFC string for import. 569238856Smm */ 570238856Smm assertEqualInt(0, archive_strcpy_l( 571238856Smm &utf8, utf8_nfc, f_sconv8)); 572238856Smm if (should_be_nfc) { 573238856Smm failure("NFC(%s) should not be converted to" 574238856Smm " NFD(%s):%d", nfc, nfd, line); 575238856Smm assertEqualUTF8String(utf8_nfc, utf8.s); 576238856Smm } else { 577238856Smm failure("NFC(%s) should be converted to" 578238856Smm " NFD(%s):%d", nfc, nfd, line); 579238856Smm assertEqualUTF8String(utf8_nfd, utf8.s); 580238856Smm } 581238856Smm 582238856Smm /* 583238856Smm * Normalize an NFD string for import. 584238856Smm */ 585238856Smm assertEqualInt(0, archive_strcpy_l( 586238856Smm &utf8, utf8_nfd, f_sconv8)); 587238856Smm failure("NFD(%s) should not be any changed:%d", 588238856Smm nfd, line); 589238856Smm assertEqualUTF8String(utf8_nfd, utf8.s); 590238856Smm 591238856Smm /* 592238856Smm * Copy an NFD string for export. 593238856Smm */ 594238856Smm assertEqualInt(0, archive_strcpy_l( 595238856Smm &utf8, utf8_nfd, t_sconv8)); 596238856Smm failure("NFD(%s) should not be any changed:%d", 597238856Smm nfd, line); 598238856Smm assertEqualUTF8String(utf8_nfd, utf8.s); 599238856Smm 600238856Smm /* 601238856Smm * Normalize an NFC string in UTF-16BE for import. 602238856Smm */ 603238856Smm assertEqualInt(0, archive_strncpy_l( 604238856Smm &utf8, utf16be_nfc, 100000, f_sconv16be)); 605238856Smm if (should_be_nfc) { 606238856Smm failure("NFC(%s) should not be converted to" 607238856Smm " NFD(%s):%d", nfc, nfd, line); 608238856Smm assertEqualUTF8String(utf8_nfc, utf8.s); 609238856Smm } else { 610238856Smm failure("NFC(%s) should be converted to" 611238856Smm " NFD(%s):%d", nfc, nfd, line); 612238856Smm assertEqualUTF8String(utf8_nfd, utf8.s); 613238856Smm } 614238856Smm 615238856Smm /* 616238856Smm * Normalize an NFC string in UTF-16LE for import. 617238856Smm */ 618238856Smm assertEqualInt(0, archive_strncpy_l( 619238856Smm &utf8, utf16le_nfc, 100000, f_sconv16le)); 620238856Smm if (should_be_nfc) { 621238856Smm failure("NFC(%s) should not be converted to" 622238856Smm " NFD(%s):%d", nfc, nfd, line); 623238856Smm assertEqualUTF8String(utf8_nfc, utf8.s); 624238856Smm } else { 625238856Smm failure("NFC(%s) should be converted to" 626238856Smm " NFD(%s):%d", nfc, nfd, line); 627238856Smm assertEqualUTF8String(utf8_nfd, utf8.s); 628238856Smm } 629238856Smm } 630238856Smm 631238856Smm /* 632238856Smm * Test for archive_mstring interface. 633238856Smm * In specific, Windows platform UTF-16BE is directly 634238856Smm * converted to/from wide-character to avoid the effect of 635238856Smm * current locale since windows platform cannot make 636238856Smm * locale UTF-8. 637238856Smm */ 638238856Smm if (locale_is_utf8 || wc_is_unicode) { 639238856Smm /* 640231200Smm * Normalize an NFD string in UTF-8 for import. 641231200Smm */ 642231200Smm assertEqualInt(0, archive_mstring_copy_mbs_len_l( 643238856Smm &mstr, utf8_nfc, 100000, f_sconv8)); 644231200Smm assertEqualInt(0, 645231200Smm archive_mstring_get_wcs(a, &mstr, &wp)); 646238856Smm if (should_be_nfc) { 647238856Smm failure("UTF-8 NFC(%s) should not be converted " 648238856Smm "to WCS NFD(%s):%d", nfc, nfd, line); 649238856Smm assertEqualWString(wc_nfc, wp); 650238856Smm } else { 651238856Smm failure("UTF-8 NFC(%s) should be converted " 652238856Smm "to WCS NFD(%s):%d", nfc, nfd, line); 653238856Smm assertEqualWString(wc_nfd, wp); 654238856Smm } 655231200Smm 656231200Smm /* 657231200Smm * Normalize an NFD string in UTF-16BE for import. 658231200Smm */ 659231200Smm assertEqualInt(0, archive_mstring_copy_mbs_len_l( 660238856Smm &mstr, utf16be_nfc, 100000, f_sconv16be)); 661231200Smm assertEqualInt(0, 662231200Smm archive_mstring_get_wcs(a, &mstr, &wp)); 663238856Smm if (should_be_nfc) { 664238856Smm failure("UTF-16BE NFC(%s) should not be " 665238856Smm "converted to WCS NFD(%s):%d", 666238856Smm nfc, nfd, line); 667238856Smm assertEqualWString(wc_nfc, wp); 668238856Smm } else { 669238856Smm failure("UTF-16BE NFC(%s) should be converted " 670238856Smm "to WCS NFD(%s):%d", nfc, nfd, line); 671238856Smm assertEqualWString(wc_nfd, wp); 672238856Smm } 673231200Smm 674231200Smm /* 675231200Smm * Normalize an NFD string in UTF-16LE for import. 676231200Smm */ 677231200Smm assertEqualInt(0, archive_mstring_copy_mbs_len_l( 678238856Smm &mstr, utf16le_nfc, 100000, f_sconv16le)); 679231200Smm assertEqualInt(0, 680231200Smm archive_mstring_get_wcs(a, &mstr, &wp)); 681238856Smm if (should_be_nfc) { 682238856Smm failure("UTF-16LE NFC(%s) should not be " 683238856Smm "converted to WCS NFD(%s):%d", 684238856Smm nfc, nfd, line); 685238856Smm assertEqualWString(wc_nfc, wp); 686238856Smm } else { 687238856Smm failure("UTF-16LE NFC(%s) should be converted " 688238856Smm "to WCS NFD(%s):%d", nfc, nfd, line); 689238856Smm assertEqualWString(wc_nfd, wp); 690238856Smm } 691231200Smm 692231200Smm /* 693238856Smm * Copy an NFD wide-string for export. 694231200Smm */ 695231200Smm assertEqualInt(0, archive_mstring_copy_wcs( 696238856Smm &mstr, wc_nfd)); 697231200Smm assertEqualInt(0, archive_mstring_get_mbs_l( 698368707Smm a, &mstr, &mp, &mplen, t_sconv8)); 699238856Smm failure("WCS NFD(%s) should be UTF-8 NFD:%d" 700238856Smm ,nfd, line); 701238856Smm assertEqualUTF8String(utf8_nfd, mp); 702231200Smm } 703231200Smm } 704231200Smm 705231200Smm archive_string_free(&utf8); 706231200Smm archive_mstring_clean(&mstr); 707231200Smm fclose(fp); 708231200Smm assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 709231200Smm assertEqualInt(ARCHIVE_OK, archive_write_free(a2)); 710231200Smm} 711231200Smm 712231200Smmstatic void 713231200Smmtest_archive_string_canonicalization(void) 714231200Smm{ 715231200Smm struct archive *a; 716231200Smm struct archive_string_conv *sconv; 717231200Smm 718231200Smm setlocale(LC_ALL, "en_US.UTF-8"); 719231200Smm 720231200Smm assert((a = archive_read_new()) != NULL); 721231200Smm 722231200Smm assertA(NULL != (sconv = 723231200Smm archive_string_conversion_to_charset(a, "UTF-8", 1))); 724231200Smm failure("Charset name should be UTF-8"); 725231200Smm assertEqualString("UTF-8", 726231200Smm archive_string_conversion_charset_name(sconv)); 727231200Smm 728231200Smm assertA(NULL != (sconv = 729231200Smm archive_string_conversion_to_charset(a, "UTF8", 1))); 730231200Smm failure("Charset name should be UTF-8"); 731231200Smm assertEqualString("UTF-8", 732231200Smm archive_string_conversion_charset_name(sconv)); 733231200Smm 734231200Smm assertA(NULL != (sconv = 735231200Smm archive_string_conversion_to_charset(a, "utf8", 1))); 736231200Smm failure("Charset name should be UTF-8"); 737231200Smm assertEqualString("UTF-8", 738231200Smm archive_string_conversion_charset_name(sconv)); 739231200Smm 740231200Smm assertA(NULL != (sconv = 741231200Smm archive_string_conversion_to_charset(a, "UTF-16BE", 1))); 742231200Smm failure("Charset name should be UTF-16BE"); 743231200Smm assertEqualString("UTF-16BE", 744231200Smm archive_string_conversion_charset_name(sconv)); 745231200Smm 746231200Smm assertA(NULL != (sconv = 747231200Smm archive_string_conversion_to_charset(a, "UTF16BE", 1))); 748231200Smm failure("Charset name should be UTF-16BE"); 749231200Smm assertEqualString("UTF-16BE", 750231200Smm archive_string_conversion_charset_name(sconv)); 751231200Smm 752231200Smm assertA(NULL != (sconv = 753231200Smm archive_string_conversion_to_charset(a, "utf16be", 1))); 754231200Smm failure("Charset name should be UTF-16BE"); 755231200Smm assertEqualString("UTF-16BE", 756231200Smm archive_string_conversion_charset_name(sconv)); 757231200Smm 758231200Smm assertA(NULL != (sconv = 759231200Smm archive_string_conversion_to_charset(a, "UTF-16LE", 1))); 760231200Smm failure("Charset name should be UTF-16LE"); 761231200Smm assertEqualString("UTF-16LE", 762231200Smm archive_string_conversion_charset_name(sconv)); 763231200Smm 764231200Smm assertA(NULL != (sconv = 765231200Smm archive_string_conversion_to_charset(a, "UTF16LE", 1))); 766231200Smm failure("Charset name should be UTF-16LE"); 767231200Smm assertEqualString("UTF-16LE", 768231200Smm archive_string_conversion_charset_name(sconv)); 769231200Smm 770231200Smm assertA(NULL != (sconv = 771231200Smm archive_string_conversion_to_charset(a, "utf16le", 1))); 772231200Smm failure("Charset name should be UTF-16LE"); 773231200Smm assertEqualString("UTF-16LE", 774231200Smm archive_string_conversion_charset_name(sconv)); 775231200Smm 776231200Smm assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 777231200Smm 778231200Smm} 779231200Smm 780368707Smmstatic void 781368707Smmcheck_string(struct archive *a, struct archive_mstring *mstr, struct archive_string_conv *sc, 782368707Smm const char *exp, const wchar_t *wexp) 783368707Smm{ 784368707Smm /* Do all the tests on a copy so that we can have a clear initial state every time */ 785368707Smm struct archive_mstring mstr2; 786368707Smm const char *p = NULL; 787368707Smm const wchar_t *wp = NULL; 788368707Smm size_t len = 0; 789368707Smm 790368707Smm memset(&mstr2, 0, sizeof(mstr2)); 791368707Smm 792368707Smm archive_mstring_copy(&mstr2, mstr); 793368707Smm assertEqualInt(0, archive_mstring_get_mbs(a, &mstr2, &p)); 794368707Smm assertEqualString(exp, p); 795368707Smm p = NULL; 796368707Smm 797368707Smm archive_mstring_copy(&mstr2, mstr); 798368707Smm assertEqualInt(0, archive_mstring_get_utf8(a, &mstr2, &p)); 799368707Smm assertEqualString(exp, p); 800368707Smm p = NULL; 801368707Smm 802368707Smm archive_mstring_copy(&mstr2, mstr); 803368707Smm assertEqualInt(0, archive_mstring_get_wcs(a, &mstr2, &wp)); 804368707Smm assertEqualWString(wexp, wp); 805368707Smm wp = NULL; 806368707Smm 807368707Smm archive_mstring_copy(&mstr2, mstr); 808368707Smm assertEqualInt(0, archive_mstring_get_mbs_l(a, &mstr2, &p, &len, sc)); 809368707Smm assertEqualString(exp, p); 810368707Smm assertEqualInt(len, strlen(exp)); 811368707Smm p = NULL; 812368707Smm len = 0; 813368707Smm 814368707Smm archive_mstring_clean(&mstr2); 815368707Smm} 816368707Smm 817368707Smm/* 818368707Smm * Make sure no matter what the input encoding is, the string can be 819368707Smm * converted too all the output encodings. 820368707Smm */ 821368707Smmstatic void 822368707Smmtest_archive_string_set_get(void) 823368707Smm{ 824368707Smm struct archive *a; 825368707Smm struct archive_mstring mstr; 826368707Smm struct archive_string_conv *sc; 827368707Smm 828368707Smm setlocale(LC_ALL, "en_US.UTF-8"); 829368707Smm 830368707Smm assert((a = archive_read_new()) != NULL); 831368707Smm memset(&mstr, 0, sizeof(mstr)); 832368707Smm 833368707Smm assertA(NULL != (sc = 834368707Smm archive_string_conversion_to_charset(a, "UTF-8", 1))); 835368707Smm failure("Charset name should be UTF-8"); 836368707Smm assertEqualString("UTF-8", 837368707Smm archive_string_conversion_charset_name(sc)); 838368707Smm 839368707Smm assertEqualInt(0, archive_mstring_copy_mbs(&mstr, "AAA")); 840368707Smm check_string(a, &mstr, sc, "AAA", L"AAA"); 841368707Smm assertEqualInt(4, archive_mstring_copy_utf8(&mstr, "BBBB")); 842368707Smm check_string(a, &mstr, sc, "BBBB", L"BBBB"); 843368707Smm assertEqualInt(0, archive_mstring_copy_wcs(&mstr, L"CCC12")); 844368707Smm check_string(a, &mstr, sc, "CCC12", L"CCC12"); 845368707Smm assertEqualInt(0, archive_mstring_copy_mbs_len_l(&mstr, "DDDD-l", 6, sc)); 846368707Smm check_string(a, &mstr, sc, "DDDD-l", L"DDDD-l"); 847368707Smm assertEqualInt(0, archive_mstring_update_utf8(a, &mstr, "EEEEE---H")); 848368707Smm check_string(a, &mstr, sc, "EEEEE---H", L"EEEEE---H"); 849368707Smm 850368707Smm assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 851368707Smm 852368707Smm} 853368707Smm 854231200SmmDEFINE_TEST(test_archive_string_conversion) 855231200Smm{ 856238856Smm static const char reffile[] = "test_archive_string_conversion.txt.Z"; 857238856Smm static const char testdata[] = "testdata.txt"; 858238856Smm struct archive *a; 859238856Smm struct archive_entry *ae; 860238856Smm char buff[512]; 861238856Smm ssize_t size; 862238856Smm FILE *fp; 863238856Smm 864238856Smm /* 865238856Smm * Extract a test pattern file. 866238856Smm */ 867238856Smm extract_reference_file(reffile); 868238856Smm assert((a = archive_read_new()) != NULL); 869238856Smm assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a)); 870238856Smm assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_raw(a)); 871238856Smm assertEqualIntA(a, ARCHIVE_OK, 872238856Smm archive_read_open_filename(a, reffile, 512)); 873238856Smm 874238856Smm assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae)); 875238856Smm assert((fp = fopen(testdata, "w")) != NULL); 876238856Smm while ((size = archive_read_data(a, buff, 512)) > 0) 877305188Smm assertEqualInt(size, fwrite(buff, 1, size, fp)); 878305188Smm assertEqualInt(0, fclose(fp)); 879238856Smm assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 880238856Smm 881238856Smm test_archive_string_normalization_nfc(testdata); 882238856Smm test_archive_string_normalization_mac_nfd(testdata); 883231200Smm test_archive_string_canonicalization(); 884368707Smm test_archive_string_set_get(); 885231200Smm} 886