1231200Smm/*-
2238856Smm * Copyright (c) 2011-2012 Michihiro NAKAJIMA
3231200Smm * All rights reserved.
4231200Smm *
5231200Smm * Redistribution and use in source and binary forms, with or without
6231200Smm * modification, are permitted provided that the following conditions
7231200Smm * are met:
8231200Smm * 1. Redistributions of source code must retain the above copyright
9231200Smm *    notice, this list of conditions and the following disclaimer.
10231200Smm * 2. Redistributions in binary form must reproduce the above copyright
11231200Smm *    notice, this list of conditions and the following disclaimer in the
12231200Smm *    documentation and/or other materials provided with the distribution.
13231200Smm *
14231200Smm * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15231200Smm * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16231200Smm * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17231200Smm * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18231200Smm * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19231200Smm * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20231200Smm * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21231200Smm * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22231200Smm * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23231200Smm * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24231200Smm */
25231200Smm#include "test.h"
26231200Smm__FBSDID("$FreeBSD$");
27231200Smm
28231200Smm#include <locale.h>
29231200Smm
30231200Smm#define __LIBARCHIVE_TEST
31231200Smm#include "archive_string.h"
32231200Smm
33231200Smm/*
34231200SmmExecute the following to rebuild the data for this program:
35231200Smm   tail -n +36 test_archive_string_conversion.c | /bin/sh
36231200Smm#
37238856Smm# This requires http://unicode.org/Public/6.0.0/ucd/NormalizationTest.txt
38231200Smm#
39231200Smmif="NormalizationTest.txt"
40231200Smmif [ ! -f ${if} ]; then
41231200Smm  echo "Not found: \"${if}\""
42231200Smm  exit 0
43231200Smmfi
44231200Smmof=test_archive_string_conversion.txt.Z
45231200Smmecho "\$FreeBSD\$" > ${of}.uu
46231200Smmawk -F ';'  '$0 ~/^[0-9A-F]+/ {printf "%s;%s\n", $2, $3}' ${if} | compress | uuencode ${of} >> ${of}.uu
47231200Smmexit 1
48231200Smm*/
49231200Smm
/*
 * Encode the code point `uc` as UTF-8 into the buffer at `p`.
 *
 * Writes between one and four bytes (no terminator) and returns the
 * number of bytes written.  Any value above 0xffff is emitted in the
 * four-byte form; no range validation is performed.
 */
static int
unicode_to_utf8(char *p, uint32_t uc)
{
	/* One byte: plain ASCII. */
	if (uc <= 0x7f) {
		p[0] = (char)uc;
		return (1);
	}
	/* Two bytes: 110xxxxx 10xxxxxx */
	if (uc <= 0x7ff) {
		p[0] = (char)(0xc0 | ((uc >> 6) & 0x1f));
		p[1] = (char)(0x80 | (uc & 0x3f));
		return (2);
	}
	/* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
	if (uc <= 0xffff) {
		p[0] = (char)(0xe0 | ((uc >> 12) & 0x0f));
		p[1] = (char)(0x80 | ((uc >> 6) & 0x3f));
		p[2] = (char)(0x80 | (uc & 0x3f));
		return (3);
	}
	/* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	p[0] = (char)(0xf0 | ((uc >> 18) & 0x07));
	p[1] = (char)(0x80 | ((uc >> 12) & 0x3f));
	p[2] = (char)(0x80 | ((uc >> 6) & 0x3f));
	p[3] = (char)(0x80 | (uc & 0x3f));
	return (4);
}
73231200Smm
/*
 * Store the 16-bit value `u` at `pp` in big-endian byte order
 * (most significant byte first).  Exactly two bytes are written.
 */
static void
archive_be16enc(void *pp, uint16_t u)
{
	unsigned char *dst = pp;

	dst[1] = (unsigned char)(u & 0xff);
	dst[0] = (unsigned char)((u >> 8) & 0xff);
}
82231200Smm
/*
 * Encode the code point `uc` as UTF-16BE into the buffer at `p`.
 *
 * Code points up to 0xffff occupy one 16-bit unit; anything larger is
 * split into a high/low surrogate pair.  Returns the number of bytes
 * written (2 or 4).  No terminator is appended.
 */
static int
unicode_to_utf16be(char *p, uint32_t uc)
{
	uint32_t v;

	if (uc <= 0xffff) {
		archive_be16enc(p, uc);
		return (2);
	}

	/* Does not fit in a single 16-bit unit: emit a surrogate pair. */
	v = uc - 0x10000;
	archive_be16enc(p, ((v >> 10) & 0x3ff) + 0xD800);
	archive_be16enc(p + 2, (v & 0x3ff) + 0xDC00);
	return (4);
}
100231200Smm
/*
 * Store the 16-bit value `u` at `pp` in little-endian byte order
 * (least significant byte first).  Exactly two bytes are written.
 */
static void
archive_le16enc(void *pp, uint16_t u)
{
	unsigned char *dst = pp;

	dst[1] = (unsigned char)((u >> 8) & 0xff);
	dst[0] = (unsigned char)(u & 0xff);
}
109231200Smm
/*
 * Encode the code point `uc` as UTF-16LE into the buffer at `p`.
 *
 * Code points up to 0xffff occupy one 16-bit unit; anything larger is
 * split into a high/low surrogate pair.  Returns the number of bytes
 * written (2 or 4).  No terminator is appended.
 */
static size_t
unicode_to_utf16le(char *p, uint32_t uc)
{
	uint32_t v;

	if (uc <= 0xffff) {
		archive_le16enc(p, uc);
		return (2);
	}

	/* Does not fit in a single 16-bit unit: emit a surrogate pair. */
	v = uc - 0x10000;
	archive_le16enc(p, ((v >> 10) & 0x3ff) + 0xD800);
	archive_le16enc(p + 2, (v & 0x3ff) + 0xDC00);
	return (4);
}
127231200Smm
/* Report the size of wchar_t, in bytes, on this platform. */
static int
wc_size(void)
{
	const int width = (int)sizeof(wchar_t);

	return (width);
}
133231200Smm
/*
 * Store the code point `uc` as wide characters at `wp`.
 *
 * When wchar_t is four bytes wide, or the code point fits in 16 bits,
 * a single wchar_t is written.  Otherwise the code point is split into
 * a surrogate pair.  Returns the number of wchar_t elements written
 * (1 or 2).  No terminator is appended.
 */
static int
unicode_to_wc(wchar_t *wp, uint32_t uc)
{
	uint32_t v;

	if (wc_size() == 4 || uc <= 0xffff) {
		*wp = (wchar_t)uc;
		return (1);
	}

	/* Narrow wchar_t and a code point beyond 0xffff: surrogate pair. */
	v = uc - 0x10000;
	wp[0] = (wchar_t)(((v >> 10) & 0x3ff) + 0xD800);
	wp[1] = (wchar_t)((v & 0x3ff) + 0xDC00);
	return (2);
}
153231200Smm
/*
 * Convert a textual pattern of upper-case hexadecimal code points into
 * four encodings at once.
 *
 * `pattern` is read up to its terminating NUL.  Hex digits ([0-9A-F])
 * accumulate into the current code point; any other character (and the
 * end of the string) finishes the current code point, which is then
 * appended to:
 *   out    - UTF-8, terminated with '\0'
 *   wout   - wide characters, terminated with L'\0'
 *   u16be  - UTF-16BE, terminated with two zero bytes
 *   u16le  - UTF-16LE, terminated with two zero bytes
 * The caller must supply buffers large enough; no bounds are checked.
 *
 * When `mac_nfd` is non-zero, the FIRST code point of the pattern is
 * compared against a list of code points that are not converted to NFD
 * on Mac OS (see also
 * http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html, which
 * covers U+2000 - U+2FFF, U+F900 - U+FAFF and U+2F800 - U+2FAFF).
 * Returns 1 if it is on that list, otherwise 0.
 */
static int
scan_unicode_pattern(char *out, wchar_t *wout, char *u16be, char *u16le,
    const char *pattern, int mac_nfd)
{
	const char *cursor;
	char *u8_end = out;
	wchar_t *wc_end = wout;
	char *be_end = u16be;
	char *le_end = u16le;
	unsigned uc = 0;
	int keeps_nfc = 0;

	for (cursor = pattern; ; cursor++) {
		const char c = *cursor;

		/* Accumulate hex digits into the current code point. */
		if (c >= '0' && c <= '9') {
			uc = (uc << 4) + (c - '0');
			continue;
		}
		if (c >= 'A' && c <= 'F') {
			uc = (uc << 4) + (c - 'A' + 0x0a);
			continue;
		}

		/*
		 * A separator or the end of the pattern: the code point is
		 * complete.  Nothing has been written to `out` yet only for
		 * the first code point, which is the one checked below.
		 */
		if (mac_nfd && u8_end == out) {
			switch (uc) {
			/* Listed code points inside U+2000 - U+2FFF. */
			case 0x2194: case 0x219A: case 0x219B:
			case 0x21AE: case 0x21CD: case 0x21CE:
			case 0x21CF: case 0x2204: case 0x2209:
			case 0x220C: case 0x2224: case 0x2226:
			case 0x2241: case 0x2244: case 0x2247:
			case 0x2249: case 0x2260: case 0x2262:
			case 0x226D: case 0x226E: case 0x226F:
			case 0x2270: case 0x2271: case 0x2274:
			case 0x2275: case 0x2276: case 0x2278:
			case 0x2279: case 0x227A: case 0x227B:
			case 0x2280: case 0x2281: case 0x2284:
			case 0x2285: case 0x2288: case 0x2289:
			case 0x22AC: case 0x22AD: case 0x22AE:
			case 0x22AF: case 0x22E0: case 0x22E1:
			case 0x22E2: case 0x22E3: case 0x22EA:
			case 0x22EB: case 0x22EC: case 0x22ED:
			/*
			 * Also left alone on Mac OS, for an undocumented
			 * reason:
			 *   NFC        NFD
			 *   1109A  ==> 11099 110BA
			 *   1109C  ==> 1109B 110BA
			 *   110AB  ==> 110A5 110BA
			 */
			case 0x1109A: case 0x1109C: case 0x110AB:
				keeps_nfc = 1;
				break;
			default:
				break;
			}
		}

		/* Append the finished code point in every encoding. */
		be_end += unicode_to_utf16be(be_end, uc);
		le_end += unicode_to_utf16le(le_end, uc);
		wc_end += unicode_to_wc(wc_end, uc);
		u8_end += unicode_to_utf8(u8_end, uc);

		if (c == '\0')
			break;
		uc = 0;
	}

	/* Terminate all four outputs. */
	be_end[0] = 0;
	be_end[1] = 0;
	le_end[0] = 0;
	le_end[1] = 0;
	*wc_end = L'\0';
	*u8_end = '\0';

	return (keeps_nfc);
}
235231200Smm
/*
 * Return 1 when building for native Windows (_WIN32 defined and
 * __CYGWIN__ not defined), otherwise 0.  The tests use this to decide
 * whether the wide-character checks can run without a UTF-8 locale.
 */
static int
is_wc_unicode(void)
{
	int native_unicode = 0;

#if defined(_WIN32) && !defined(__CYGWIN__)
	native_unicode = 1;
#endif
	return (native_unicode);
}
245231200Smm
246231200Smm/*
247231200Smm * A conversion test that we correctly normalize UTF-8 and UTF-16BE characters.
248231200Smm * On Mac OS, the characters to be Form D.
249231200Smm * On other platforms, the characters to be Form C.
250231200Smm */
251231200Smmstatic void
252238856Smmtest_archive_string_normalization_nfc(const char *testdata)
253231200Smm{
254231200Smm	struct archive *a, *a2;
255231200Smm	struct archive_string utf8;
256231200Smm	struct archive_mstring mstr;
257231200Smm	struct archive_string_conv *f_sconv8, *t_sconv8;
258231200Smm	struct archive_string_conv *f_sconv16be, *f_sconv16le;
259231200Smm	FILE *fp;
260231200Smm	char buff[512];
261231200Smm	int line = 0;
262231200Smm	int locale_is_utf8, wc_is_unicode;
263238856Smm	int sconv_opt = SCONV_SET_OPT_NORMALIZATION_C;
264231200Smm
265231200Smm	locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
266231200Smm	wc_is_unicode = is_wc_unicode();
267231200Smm	/* If it doesn't exist, just warn and return. */
268231200Smm	if (!locale_is_utf8 && !wc_is_unicode) {
269238856Smm		skipping("A test of string normalization for NFC requires "
270238856Smm		    "a suitable locale; en_US.UTF-8 not available on this "
271238856Smm		    "system");
272231200Smm		return;
273231200Smm	}
274231200Smm
275231200Smm	archive_string_init(&utf8);
276231200Smm	memset(&mstr, 0, sizeof(mstr));
277231200Smm
278231200Smm	/*
279238856Smm	 * Create string conversion objects.
280231200Smm	 */
281231200Smm	assert((a = archive_read_new()) != NULL);
282231200Smm	assertA(NULL != (f_sconv8 =
283231200Smm	    archive_string_conversion_from_charset(a, "UTF-8", 0)));
284231200Smm	assertA(NULL != (f_sconv16be =
285231200Smm	    archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
286231200Smm	assertA(NULL != (f_sconv16le =
287231200Smm	    archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
288231200Smm	assert((a2 = archive_write_new()) != NULL);
289231200Smm	assertA(NULL != (t_sconv8 =
290231200Smm	    archive_string_conversion_to_charset(a2, "UTF-8", 0)));
291231200Smm	if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
292238856Smm	    t_sconv8 == NULL) {
293231200Smm		/* We cannot continue this test. */
294231200Smm		assertEqualInt(ARCHIVE_OK, archive_read_free(a));
295231200Smm		return;
296231200Smm	}
297238856Smm	archive_string_conversion_set_opt(f_sconv8, sconv_opt);
298238856Smm	archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
299238856Smm	archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
300238856Smm	archive_string_conversion_set_opt(t_sconv8, sconv_opt);
301231200Smm
302238856Smm	/* Open a test pattern file. */
303238856Smm	assert((fp = fopen(testdata, "r")) != NULL);
304238856Smm
305231200Smm	/*
306231200Smm	 * Read test data.
307231200Smm	 *  Test data format:
308231200Smm	 *     <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
309231200Smm	 *  Unicode pattern format:
310231200Smm	 *     [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
311231200Smm	 */
312231200Smm	while (fgets(buff, sizeof(buff), fp) != NULL) {
313231200Smm		char nfc[80], nfd[80];
314231200Smm		char utf8_nfc[80], utf8_nfd[80];
315231200Smm		char utf16be_nfc[80], utf16be_nfd[80];
316231200Smm		char utf16le_nfc[80], utf16le_nfd[80];
317231200Smm		wchar_t wc_nfc[40], wc_nfd[40];
318231200Smm		char *e, *p;
319238856Smm		const wchar_t *wp;
320238856Smm		const char *mp;
321238856Smm		size_t mplen;
322231200Smm
323231200Smm		line++;
324231200Smm		if (buff[0] == '#')
325231200Smm			continue;
326231200Smm		p = strchr(buff, ';');
327231200Smm		if (p == NULL)
328231200Smm			continue;
329231200Smm		*p++ = '\0';
330231200Smm		/* Copy an NFC pattern */
331231200Smm		strncpy(nfc, buff, sizeof(nfc)-1);
332231200Smm		nfc[sizeof(nfc)-1] = '\0';
333231200Smm		e = p;
334231200Smm		p = strchr(p, '\n');
335231200Smm		if (p == NULL)
336231200Smm			continue;
337231200Smm		*p = '\0';
338231200Smm		/* Copy an NFD pattern */
339231200Smm		strncpy(nfd, e, sizeof(nfd)-1);
340231200Smm		nfd[sizeof(nfd)-1] = '\0';
341231200Smm
342231200Smm		/*
343238856Smm		 * Get an NFC patterns.
344231200Smm		 */
345231200Smm		scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc,
346231200Smm		    nfc, 0);
347231200Smm
348231200Smm		/*
349238856Smm		 * Get an NFD patterns.
350231200Smm		 */
351231200Smm		scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
352231200Smm		    nfd, 0);
353231200Smm
354231200Smm		if (locale_is_utf8) {
355231200Smm			/*
356231200Smm			 * Normalize an NFD string for import.
357231200Smm			 */
358238856Smm			assertEqualInt(0, archive_strcpy_l(
359231200Smm			    &utf8, utf8_nfd, f_sconv8));
360231200Smm			failure("NFD(%s) should be converted to NFC(%s):%d",
361231200Smm			    nfd, nfc, line);
362231200Smm			assertEqualUTF8String(utf8_nfc, utf8.s);
363231200Smm
364231200Smm			/*
365231200Smm			 * Normalize an NFC string for import.
366231200Smm			 */
367238856Smm			assertEqualInt(0, archive_strcpy_l(
368231200Smm			    &utf8, utf8_nfc, f_sconv8));
369231200Smm			failure("NFC(%s) should not be any changed:%d",
370231200Smm			    nfc, line);
371231200Smm			assertEqualUTF8String(utf8_nfc, utf8.s);
372231200Smm
373231200Smm			/*
374231200Smm			 * Copy an NFC string for export.
375231200Smm			 */
376238856Smm			assertEqualInt(0, archive_strcpy_l(
377231200Smm			    &utf8, utf8_nfc, t_sconv8));
378231200Smm			failure("NFC(%s) should not be any changed:%d",
379231200Smm			    nfc, line);
380231200Smm			assertEqualUTF8String(utf8_nfc, utf8.s);
381231200Smm
382231200Smm			/*
383231200Smm			 * Normalize an NFD string in UTF-16BE for import.
384231200Smm			 */
385238856Smm			assertEqualInt(0, archive_strncpy_l(
386231200Smm			    &utf8, utf16be_nfd, 100000, f_sconv16be));
387231200Smm			failure("NFD(%s) should be converted to NFC(%s):%d",
388231200Smm			    nfd, nfc, line);
389231200Smm			assertEqualUTF8String(utf8_nfc, utf8.s);
390231200Smm
391231200Smm			/*
392231200Smm			 * Normalize an NFD string in UTF-16LE for import.
393231200Smm			 */
394238856Smm			assertEqualInt(0, archive_strncpy_l(
395231200Smm			    &utf8, utf16le_nfd, 100000, f_sconv16le));
396231200Smm			failure("NFD(%s) should be converted to NFC(%s):%d",
397231200Smm			    nfd, nfc, line);
398231200Smm			assertEqualUTF8String(utf8_nfc, utf8.s);
399231200Smm		}
400231200Smm
401231200Smm		/*
402231200Smm		 * Test for archive_mstring interface.
403231200Smm		 * In specific, Windows platform UTF-16BE is directly
404231200Smm		 * converted to/from wide-character to avoid the effect of
405231200Smm		 * current locale since windows platform cannot make
406231200Smm		 * locale UTF-8.
407231200Smm		 */
408231200Smm		if (locale_is_utf8 || wc_is_unicode) {
409231200Smm			/*
410231200Smm			 * Normalize an NFD string in UTF-8 for import.
411231200Smm			 */
412231200Smm			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
413238856Smm			    &mstr, utf8_nfd, 100000, f_sconv8));
414231200Smm			assertEqualInt(0,
415231200Smm			    archive_mstring_get_wcs(a, &mstr, &wp));
416238856Smm			failure("UTF-8 NFD(%s) should be converted "
417238856Smm			    "to WCS NFC(%s):%d", nfd, nfc, line);
418238856Smm			assertEqualWString(wc_nfc, wp);
419231200Smm
420231200Smm			/*
421231200Smm			 * Normalize an NFD string in UTF-16BE for import.
422231200Smm			 */
423231200Smm			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
424238856Smm			    &mstr, utf16be_nfd, 100000, f_sconv16be));
425231200Smm			assertEqualInt(0,
426231200Smm			    archive_mstring_get_wcs(a, &mstr, &wp));
427238856Smm			failure("UTF-8 NFD(%s) should be converted "
428238856Smm			    "to WCS NFC(%s):%d", nfd, nfc, line);
429238856Smm			assertEqualWString(wc_nfc, wp);
430231200Smm
431231200Smm			/*
432231200Smm			 * Normalize an NFD string in UTF-16LE for import.
433231200Smm			 */
434231200Smm			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
435238856Smm			    &mstr, utf16le_nfd, 100000, f_sconv16le));
436231200Smm			assertEqualInt(0,
437231200Smm			    archive_mstring_get_wcs(a, &mstr, &wp));
438238856Smm			failure("UTF-8 NFD(%s) should be converted "
439238856Smm			    "to WCS NFC(%s):%d", nfd, nfc, line);
440238856Smm			assertEqualWString(wc_nfc, wp);
441231200Smm
442231200Smm			/*
443238856Smm			 * Copy an NFC wide-string for export.
444231200Smm			 */
445238856Smm			assertEqualInt(0,
446238856Smm			    archive_mstring_copy_wcs(&mstr, wc_nfc));
447231200Smm			assertEqualInt(0, archive_mstring_get_mbs_l(
448368707Smm			    a, &mstr, &mp, &mplen, t_sconv8));
449238856Smm			failure("WCS NFC(%s) should be UTF-8 NFC:%d"
450238856Smm			    ,nfc, line);
451238856Smm			assertEqualUTF8String(utf8_nfc, mp);
452238856Smm		}
453238856Smm	}
454238856Smm
455238856Smm	archive_string_free(&utf8);
456238856Smm	archive_mstring_clean(&mstr);
457238856Smm	fclose(fp);
458238856Smm	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
459238856Smm	assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
460238856Smm}
461238856Smm
462238856Smmstatic void
463238856Smmtest_archive_string_normalization_mac_nfd(const char *testdata)
464238856Smm{
465238856Smm	struct archive *a, *a2;
466238856Smm	struct archive_string utf8;
467238856Smm	struct archive_mstring mstr;
468238856Smm	struct archive_string_conv *f_sconv8, *t_sconv8;
469238856Smm	struct archive_string_conv *f_sconv16be, *f_sconv16le;
470238856Smm	FILE *fp;
471238856Smm	char buff[512];
472238856Smm	int line = 0;
473238856Smm	int locale_is_utf8, wc_is_unicode;
474238856Smm	int sconv_opt = SCONV_SET_OPT_NORMALIZATION_D;
475238856Smm
476238856Smm	locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
477238856Smm	wc_is_unicode = is_wc_unicode();
478238856Smm	/* If it doesn't exist, just warn and return. */
479238856Smm	if (!locale_is_utf8 && !wc_is_unicode) {
480238856Smm		skipping("A test of string normalization for NFD requires "
481238856Smm		    "a suitable locale; en_US.UTF-8 not available on this "
482238856Smm		    "system");
483238856Smm		return;
484238856Smm	}
485238856Smm
486238856Smm	archive_string_init(&utf8);
487238856Smm	memset(&mstr, 0, sizeof(mstr));
488238856Smm
489238856Smm	/*
490238856Smm	 * Create string conversion objects.
491238856Smm	 */
492238856Smm	assert((a = archive_read_new()) != NULL);
493238856Smm	assertA(NULL != (f_sconv8 =
494238856Smm	    archive_string_conversion_from_charset(a, "UTF-8", 0)));
495238856Smm	assertA(NULL != (f_sconv16be =
496238856Smm	    archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
497238856Smm	assertA(NULL != (f_sconv16le =
498238856Smm	    archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
499238856Smm	assert((a2 = archive_write_new()) != NULL);
500238856Smm	assertA(NULL != (t_sconv8 =
501238856Smm	    archive_string_conversion_to_charset(a2, "UTF-8", 0)));
502238856Smm	if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
503238856Smm	    t_sconv8 == NULL) {
504238856Smm		/* We cannot continue this test. */
505238856Smm		assertEqualInt(ARCHIVE_OK, archive_read_free(a));
506238856Smm		return;
507238856Smm	}
508238856Smm	archive_string_conversion_set_opt(f_sconv8, sconv_opt);
509238856Smm	archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
510238856Smm	archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
511238856Smm	archive_string_conversion_set_opt(t_sconv8, sconv_opt);
512238856Smm
513238856Smm	/* Open a test pattern file. */
514238856Smm	assert((fp = fopen(testdata, "r")) != NULL);
515238856Smm
516238856Smm	/*
517238856Smm	 * Read test data.
518238856Smm	 *  Test data format:
519238856Smm	 *     <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
520238856Smm	 *  Unicode pattern format:
521238856Smm	 *     [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
522238856Smm	 */
523238856Smm	while (fgets(buff, sizeof(buff), fp) != NULL) {
524238856Smm		char nfc[80], nfd[80];
525238856Smm		char utf8_nfc[80], utf8_nfd[80];
526238856Smm		char utf16be_nfc[80], utf16be_nfd[80];
527238856Smm		char utf16le_nfc[80], utf16le_nfd[80];
528238856Smm		wchar_t wc_nfc[40], wc_nfd[40];
529238856Smm		char *e, *p;
530238856Smm		const wchar_t *wp;
531238856Smm		const char *mp;
532238856Smm		size_t mplen;
533238856Smm		int should_be_nfc;
534238856Smm
535238856Smm		line++;
536238856Smm		if (buff[0] == '#')
537238856Smm			continue;
538238856Smm		p = strchr(buff, ';');
539238856Smm		if (p == NULL)
540238856Smm			continue;
541238856Smm		*p++ = '\0';
542238856Smm		/* Copy an NFC pattern */
543238856Smm		strncpy(nfc, buff, sizeof(nfc)-1);
544238856Smm		nfc[sizeof(nfc)-1] = '\0';
545238856Smm		e = p;
546238856Smm		p = strchr(p, '\n');
547238856Smm		if (p == NULL)
548238856Smm			continue;
549238856Smm		*p = '\0';
550238856Smm		/* Copy an NFD pattern */
551238856Smm		strncpy(nfd, e, sizeof(nfd)-1);
552238856Smm		nfd[sizeof(nfd)-1] = '\0';
553238856Smm
554238856Smm		/*
555238856Smm		 * Get an NFC patterns.
556238856Smm		 */
557238856Smm		should_be_nfc = scan_unicode_pattern(utf8_nfc, wc_nfc,
558238856Smm			utf16be_nfc, utf16le_nfc, nfc, 1);
559238856Smm
560238856Smm		/*
561238856Smm		 * Get an NFD patterns.
562238856Smm		 */
563238856Smm		scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
564238856Smm		    nfd, 0);
565238856Smm
566238856Smm		if (locale_is_utf8) {
567231200Smm			/*
568238856Smm			 * Normalize an NFC string for import.
569238856Smm			 */
570238856Smm			assertEqualInt(0, archive_strcpy_l(
571238856Smm			    &utf8, utf8_nfc, f_sconv8));
572238856Smm			if (should_be_nfc) {
573238856Smm				failure("NFC(%s) should not be converted to"
574238856Smm				    " NFD(%s):%d", nfc, nfd, line);
575238856Smm				assertEqualUTF8String(utf8_nfc, utf8.s);
576238856Smm			} else {
577238856Smm				failure("NFC(%s) should be converted to"
578238856Smm				    " NFD(%s):%d", nfc, nfd, line);
579238856Smm				assertEqualUTF8String(utf8_nfd, utf8.s);
580238856Smm			}
581238856Smm
582238856Smm			/*
583238856Smm			 * Normalize an NFD string for import.
584238856Smm			 */
585238856Smm			assertEqualInt(0, archive_strcpy_l(
586238856Smm			    &utf8, utf8_nfd, f_sconv8));
587238856Smm			failure("NFD(%s) should not be any changed:%d",
588238856Smm			    nfd, line);
589238856Smm			assertEqualUTF8String(utf8_nfd, utf8.s);
590238856Smm
591238856Smm			/*
592238856Smm			 * Copy an NFD string for export.
593238856Smm			 */
594238856Smm			assertEqualInt(0, archive_strcpy_l(
595238856Smm			    &utf8, utf8_nfd, t_sconv8));
596238856Smm			failure("NFD(%s) should not be any changed:%d",
597238856Smm			    nfd, line);
598238856Smm			assertEqualUTF8String(utf8_nfd, utf8.s);
599238856Smm
600238856Smm			/*
601238856Smm			 * Normalize an NFC string in UTF-16BE for import.
602238856Smm			 */
603238856Smm			assertEqualInt(0, archive_strncpy_l(
604238856Smm			    &utf8, utf16be_nfc, 100000, f_sconv16be));
605238856Smm			if (should_be_nfc) {
606238856Smm				failure("NFC(%s) should not be converted to"
607238856Smm				    " NFD(%s):%d", nfc, nfd, line);
608238856Smm				assertEqualUTF8String(utf8_nfc, utf8.s);
609238856Smm			} else {
610238856Smm				failure("NFC(%s) should be converted to"
611238856Smm				    " NFD(%s):%d", nfc, nfd, line);
612238856Smm				assertEqualUTF8String(utf8_nfd, utf8.s);
613238856Smm			}
614238856Smm
615238856Smm			/*
616238856Smm			 * Normalize an NFC string in UTF-16LE for import.
617238856Smm			 */
618238856Smm			assertEqualInt(0, archive_strncpy_l(
619238856Smm			    &utf8, utf16le_nfc, 100000, f_sconv16le));
620238856Smm			if (should_be_nfc) {
621238856Smm				failure("NFC(%s) should not be converted to"
622238856Smm				    " NFD(%s):%d", nfc, nfd, line);
623238856Smm				assertEqualUTF8String(utf8_nfc, utf8.s);
624238856Smm			} else {
625238856Smm				failure("NFC(%s) should be converted to"
626238856Smm				    " NFD(%s):%d", nfc, nfd, line);
627238856Smm				assertEqualUTF8String(utf8_nfd, utf8.s);
628238856Smm			}
629238856Smm		}
630238856Smm
631238856Smm		/*
632238856Smm		 * Test for archive_mstring interface.
633238856Smm		 * In specific, Windows platform UTF-16BE is directly
634238856Smm		 * converted to/from wide-character to avoid the effect of
635238856Smm		 * current locale since windows platform cannot make
636238856Smm		 * locale UTF-8.
637238856Smm		 */
638238856Smm		if (locale_is_utf8 || wc_is_unicode) {
639238856Smm			/*
640231200Smm			 * Normalize an NFD string in UTF-8 for import.
641231200Smm			 */
642231200Smm			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
643238856Smm			    &mstr, utf8_nfc, 100000, f_sconv8));
644231200Smm			assertEqualInt(0,
645231200Smm			    archive_mstring_get_wcs(a, &mstr, &wp));
646238856Smm			if (should_be_nfc) {
647238856Smm				failure("UTF-8 NFC(%s) should not be converted "
648238856Smm				    "to WCS NFD(%s):%d", nfc, nfd, line);
649238856Smm				assertEqualWString(wc_nfc, wp);
650238856Smm			} else {
651238856Smm				failure("UTF-8 NFC(%s) should be converted "
652238856Smm				    "to WCS NFD(%s):%d", nfc, nfd, line);
653238856Smm				assertEqualWString(wc_nfd, wp);
654238856Smm			}
655231200Smm
656231200Smm			/*
657231200Smm			 * Normalize an NFD string in UTF-16BE for import.
658231200Smm			 */
659231200Smm			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
660238856Smm			    &mstr, utf16be_nfc, 100000, f_sconv16be));
661231200Smm			assertEqualInt(0,
662231200Smm			    archive_mstring_get_wcs(a, &mstr, &wp));
663238856Smm			if (should_be_nfc) {
664238856Smm				failure("UTF-16BE NFC(%s) should not be "
665238856Smm				    "converted to WCS NFD(%s):%d",
666238856Smm				    nfc, nfd, line);
667238856Smm				assertEqualWString(wc_nfc, wp);
668238856Smm			} else {
669238856Smm				failure("UTF-16BE NFC(%s) should be converted "
670238856Smm				    "to WCS NFD(%s):%d", nfc, nfd, line);
671238856Smm				assertEqualWString(wc_nfd, wp);
672238856Smm			}
673231200Smm
674231200Smm			/*
675231200Smm			 * Normalize an NFD string in UTF-16LE for import.
676231200Smm			 */
677231200Smm			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
678238856Smm			    &mstr, utf16le_nfc, 100000, f_sconv16le));
679231200Smm			assertEqualInt(0,
680231200Smm			    archive_mstring_get_wcs(a, &mstr, &wp));
681238856Smm			if (should_be_nfc) {
682238856Smm				failure("UTF-16LE NFC(%s) should not be "
683238856Smm				    "converted to WCS NFD(%s):%d",
684238856Smm				    nfc, nfd, line);
685238856Smm				assertEqualWString(wc_nfc, wp);
686238856Smm			} else {
687238856Smm				failure("UTF-16LE NFC(%s) should be converted "
688238856Smm				    "to WCS NFD(%s):%d", nfc, nfd, line);
689238856Smm				assertEqualWString(wc_nfd, wp);
690238856Smm			}
691231200Smm
692231200Smm			/*
693238856Smm			 * Copy an NFD wide-string for export.
694231200Smm			 */
695231200Smm			assertEqualInt(0, archive_mstring_copy_wcs(
696238856Smm			    &mstr, wc_nfd));
697231200Smm			assertEqualInt(0, archive_mstring_get_mbs_l(
698368707Smm			    a, &mstr, &mp, &mplen, t_sconv8));
699238856Smm			failure("WCS NFD(%s) should be UTF-8 NFD:%d"
700238856Smm			    ,nfd, line);
701238856Smm			assertEqualUTF8String(utf8_nfd, mp);
702231200Smm		}
703231200Smm	}
704231200Smm
705231200Smm	archive_string_free(&utf8);
706231200Smm	archive_mstring_clean(&mstr);
707231200Smm	fclose(fp);
708231200Smm	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
709231200Smm	assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
710231200Smm}
711231200Smm
712231200Smmstatic void
713231200Smmtest_archive_string_canonicalization(void)
714231200Smm{
715231200Smm	struct archive *a;
716231200Smm	struct archive_string_conv *sconv;
717231200Smm
718231200Smm	setlocale(LC_ALL, "en_US.UTF-8");
719231200Smm
720231200Smm	assert((a = archive_read_new()) != NULL);
721231200Smm
722231200Smm	assertA(NULL != (sconv =
723231200Smm	    archive_string_conversion_to_charset(a, "UTF-8", 1)));
724231200Smm	failure("Charset name should be UTF-8");
725231200Smm	assertEqualString("UTF-8",
726231200Smm	    archive_string_conversion_charset_name(sconv));
727231200Smm
728231200Smm	assertA(NULL != (sconv =
729231200Smm	    archive_string_conversion_to_charset(a, "UTF8", 1)));
730231200Smm	failure("Charset name should be UTF-8");
731231200Smm	assertEqualString("UTF-8",
732231200Smm	    archive_string_conversion_charset_name(sconv));
733231200Smm
734231200Smm	assertA(NULL != (sconv =
735231200Smm	    archive_string_conversion_to_charset(a, "utf8", 1)));
736231200Smm	failure("Charset name should be UTF-8");
737231200Smm	assertEqualString("UTF-8",
738231200Smm	    archive_string_conversion_charset_name(sconv));
739231200Smm
740231200Smm	assertA(NULL != (sconv =
741231200Smm	    archive_string_conversion_to_charset(a, "UTF-16BE", 1)));
742231200Smm	failure("Charset name should be UTF-16BE");
743231200Smm	assertEqualString("UTF-16BE",
744231200Smm	    archive_string_conversion_charset_name(sconv));
745231200Smm
746231200Smm	assertA(NULL != (sconv =
747231200Smm	    archive_string_conversion_to_charset(a, "UTF16BE", 1)));
748231200Smm	failure("Charset name should be UTF-16BE");
749231200Smm	assertEqualString("UTF-16BE",
750231200Smm	    archive_string_conversion_charset_name(sconv));
751231200Smm
752231200Smm	assertA(NULL != (sconv =
753231200Smm	    archive_string_conversion_to_charset(a, "utf16be", 1)));
754231200Smm	failure("Charset name should be UTF-16BE");
755231200Smm	assertEqualString("UTF-16BE",
756231200Smm	    archive_string_conversion_charset_name(sconv));
757231200Smm
758231200Smm	assertA(NULL != (sconv =
759231200Smm	    archive_string_conversion_to_charset(a, "UTF-16LE", 1)));
760231200Smm	failure("Charset name should be UTF-16LE");
761231200Smm	assertEqualString("UTF-16LE",
762231200Smm	    archive_string_conversion_charset_name(sconv));
763231200Smm
764231200Smm	assertA(NULL != (sconv =
765231200Smm	    archive_string_conversion_to_charset(a, "UTF16LE", 1)));
766231200Smm	failure("Charset name should be UTF-16LE");
767231200Smm	assertEqualString("UTF-16LE",
768231200Smm	    archive_string_conversion_charset_name(sconv));
769231200Smm
770231200Smm	assertA(NULL != (sconv =
771231200Smm	    archive_string_conversion_to_charset(a, "utf16le", 1)));
772231200Smm	failure("Charset name should be UTF-16LE");
773231200Smm	assertEqualString("UTF-16LE",
774231200Smm	    archive_string_conversion_charset_name(sconv));
775231200Smm
776231200Smm	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
777231200Smm
778231200Smm}
779231200Smm
780368707Smmstatic void
781368707Smmcheck_string(struct archive *a, struct archive_mstring *mstr, struct archive_string_conv *sc,
782368707Smm  const char *exp, const wchar_t *wexp)
783368707Smm{
784368707Smm	/* Do all the tests on a copy so that we can have a clear initial state every time */
785368707Smm	struct archive_mstring mstr2;
786368707Smm	const char *p = NULL;
787368707Smm	const wchar_t *wp = NULL;
788368707Smm	size_t len = 0;
789368707Smm
790368707Smm	memset(&mstr2, 0, sizeof(mstr2));
791368707Smm
792368707Smm	archive_mstring_copy(&mstr2, mstr);
793368707Smm	assertEqualInt(0, archive_mstring_get_mbs(a, &mstr2, &p));
794368707Smm	assertEqualString(exp, p);
795368707Smm	p = NULL;
796368707Smm
797368707Smm	archive_mstring_copy(&mstr2, mstr);
798368707Smm	assertEqualInt(0, archive_mstring_get_utf8(a, &mstr2, &p));
799368707Smm	assertEqualString(exp, p);
800368707Smm	p = NULL;
801368707Smm
802368707Smm	archive_mstring_copy(&mstr2, mstr);
803368707Smm	assertEqualInt(0, archive_mstring_get_wcs(a, &mstr2, &wp));
804368707Smm	assertEqualWString(wexp, wp);
805368707Smm	wp = NULL;
806368707Smm
807368707Smm	archive_mstring_copy(&mstr2, mstr);
808368707Smm	assertEqualInt(0, archive_mstring_get_mbs_l(a, &mstr2, &p, &len, sc));
809368707Smm	assertEqualString(exp, p);
810368707Smm	assertEqualInt(len, strlen(exp));
811368707Smm	p = NULL;
812368707Smm	len = 0;
813368707Smm
814368707Smm	archive_mstring_clean(&mstr2);
815368707Smm}
816368707Smm
817368707Smm/*
818368707Smm * Make sure no matter what the input encoding is, the string can be
819368707Smm * converted too all the output encodings.
820368707Smm */
821368707Smmstatic void
822368707Smmtest_archive_string_set_get(void)
823368707Smm{
824368707Smm	struct archive *a;
825368707Smm	struct archive_mstring mstr;
826368707Smm	struct archive_string_conv *sc;
827368707Smm
828368707Smm	setlocale(LC_ALL, "en_US.UTF-8");
829368707Smm
830368707Smm	assert((a = archive_read_new()) != NULL);
831368707Smm	memset(&mstr, 0, sizeof(mstr));
832368707Smm
833368707Smm	assertA(NULL != (sc =
834368707Smm	    archive_string_conversion_to_charset(a, "UTF-8", 1)));
835368707Smm	failure("Charset name should be UTF-8");
836368707Smm	assertEqualString("UTF-8",
837368707Smm	    archive_string_conversion_charset_name(sc));
838368707Smm
839368707Smm	assertEqualInt(0, archive_mstring_copy_mbs(&mstr, "AAA"));
840368707Smm	check_string(a, &mstr, sc, "AAA", L"AAA");
841368707Smm	assertEqualInt(4, archive_mstring_copy_utf8(&mstr, "BBBB"));
842368707Smm	check_string(a, &mstr, sc, "BBBB", L"BBBB");
843368707Smm	assertEqualInt(0, archive_mstring_copy_wcs(&mstr, L"CCC12"));
844368707Smm	check_string(a, &mstr, sc, "CCC12", L"CCC12");
845368707Smm	assertEqualInt(0, archive_mstring_copy_mbs_len_l(&mstr, "DDDD-l", 6, sc));
846368707Smm	check_string(a, &mstr, sc, "DDDD-l", L"DDDD-l");
847368707Smm	assertEqualInt(0, archive_mstring_update_utf8(a, &mstr, "EEEEE---H"));
848368707Smm	check_string(a, &mstr, sc, "EEEEE---H", L"EEEEE---H");
849368707Smm
850368707Smm	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
851368707Smm
852368707Smm}
853368707Smm
854231200SmmDEFINE_TEST(test_archive_string_conversion)
855231200Smm{
856238856Smm	static const char reffile[] = "test_archive_string_conversion.txt.Z";
857238856Smm	static const char testdata[] = "testdata.txt";
858238856Smm	struct archive *a;
859238856Smm	struct archive_entry *ae;
860238856Smm	char buff[512];
861238856Smm	ssize_t size;
862238856Smm	FILE *fp;
863238856Smm
864238856Smm	/*
865238856Smm	 * Extract a test pattern file.
866238856Smm	 */
867238856Smm	extract_reference_file(reffile);
868238856Smm	assert((a = archive_read_new()) != NULL);
869238856Smm	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
870238856Smm	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_raw(a));
871238856Smm        assertEqualIntA(a, ARCHIVE_OK,
872238856Smm            archive_read_open_filename(a, reffile, 512));
873238856Smm
874238856Smm	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
875238856Smm	assert((fp = fopen(testdata, "w")) != NULL);
876238856Smm	while ((size = archive_read_data(a, buff, 512)) > 0)
877305188Smm		assertEqualInt(size, fwrite(buff, 1, size, fp));
878305188Smm	assertEqualInt(0, fclose(fp));
879238856Smm	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
880238856Smm
881238856Smm	test_archive_string_normalization_nfc(testdata);
882238856Smm	test_archive_string_normalization_mac_nfd(testdata);
883231200Smm	test_archive_string_canonicalization();
884368707Smm	test_archive_string_set_get();
885231200Smm}
886