1/*-
2 * Copyright (c) 2011-2012 Michihiro NAKAJIMA
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25#include "test.h"
26__FBSDID("$FreeBSD$");
27
28#include <locale.h>
29
30#define __LIBARCHIVE_TEST
31#include "archive_string.h"
32
33/*
34Execute the following to rebuild the data for this program:
35   tail -n +36 test_archive_string_conversion.c | /bin/sh
36#
37# This requires http://unicode.org/Public/6.0.0/ucd/NormalizationTest.txt
38#
39if="NormalizationTest.txt"
40if [ ! -f ${if} ]; then
41  echo "Not found: \"${if}\""
42  exit 0
43fi
44of=test_archive_string_conversion.txt.Z
45echo "\$FreeBSD\$" > ${of}.uu
46awk -F ';'  '$0 ~/^[0-9A-F]+/ {printf "%s;%s\n", $2, $3}' ${if} | compress | uuencode ${of} >> ${of}.uu
47exit 1
48*/
49
/*
 * Encode the code point `uc` as UTF-8 into the buffer at `p` and return
 * the number of bytes stored (1 to 4).  No terminator is appended, and
 * `uc` is not range-checked: anything above 0xffff takes the 4-byte form.
 */
static int
unicode_to_utf8(char *p, uint32_t uc)
{
	int n = 0;

	if (uc <= 0x7f) {
		/* One byte: plain ASCII. */
		p[n++] = (char)uc;
	} else if (uc <= 0x7ff) {
		/* Two bytes: 110xxxxx 10xxxxxx */
		p[n++] = (char)(0xc0 | ((uc >> 6) & 0x1f));
		p[n++] = (char)(0x80 | (uc & 0x3f));
	} else if (uc <= 0xffff) {
		/* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
		p[n++] = (char)(0xe0 | ((uc >> 12) & 0x0f));
		p[n++] = (char)(0x80 | ((uc >> 6) & 0x3f));
		p[n++] = (char)(0x80 | (uc & 0x3f));
	} else {
		/* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		p[n++] = (char)(0xf0 | ((uc >> 18) & 0x07));
		p[n++] = (char)(0x80 | ((uc >> 12) & 0x3f));
		p[n++] = (char)(0x80 | ((uc >> 6) & 0x3f));
		p[n++] = (char)(0x80 | (uc & 0x3f));
	}
	return (n);
}
73
/*
 * Store the 16-bit value `u` at `pp` in big-endian byte order
 * (most significant byte first).  Exactly two bytes are written.
 */
static void
archive_be16enc(void *pp, uint16_t u)
{
	unsigned char *dst = pp;

	dst[0] = (unsigned char)((u >> 8) & 0xff);
	dst[1] = (unsigned char)(u & 0xff);
}
82
/*
 * Encode the code point `uc` as big-endian UTF-16 at `p`.
 * Returns the number of bytes written: 2 for a single 16-bit unit,
 * 4 for a surrogate pair.  No terminator is appended.
 */
static int
unicode_to_utf16be(char *p, uint32_t uc)
{
	uint32_t v;

	if (uc <= 0xffff) {
		/* Fits in a single 16-bit unit. */
		archive_be16enc(p, uc);
		return (2);
	}

	/* Too large for one unit: emit a high/low surrogate pair. */
	v = uc - 0x10000;
	archive_be16enc(p, 0xD800 + ((v >> 10) & 0x3ff));
	archive_be16enc(p + 2, 0xDC00 + (v & 0x3ff));
	return (4);
}
100
/*
 * Store the 16-bit value `u` at `pp` in little-endian byte order
 * (least significant byte first).  Exactly two bytes are written.
 */
static void
archive_le16enc(void *pp, uint16_t u)
{
	unsigned char *dst = pp;

	dst[0] = (unsigned char)(u & 0xff);
	dst[1] = (unsigned char)((u >> 8) & 0xff);
}
109
/*
 * Encode the code point `uc` as little-endian UTF-16 at `p`.
 * Returns the number of bytes written: 2 for a single 16-bit unit,
 * 4 for a surrogate pair.  No terminator is appended.
 */
static size_t
unicode_to_utf16le(char *p, uint32_t uc)
{
	uint32_t v;

	if (uc <= 0xffff) {
		/* Fits in a single 16-bit unit. */
		archive_le16enc(p, uc);
		return (2);
	}

	/* Too large for one unit: emit a high/low surrogate pair. */
	v = uc - 0x10000;
	archive_le16enc(p, 0xD800 + ((v >> 10) & 0x3ff));
	archive_le16enc(p + 2, 0xDC00 + (v & 0x3ff));
	return (4);
}
127
/* Width of wchar_t on this platform, in bytes. */
static int
wc_size(void)
{
	const int width = sizeof(wchar_t);

	return (width);
}
133
/*
 * Store the code point `uc` at `wp` as wide characters and return how many
 * wchar_t elements were written.  With a 4-byte wchar_t every code point
 * takes one element; otherwise code points above 0xffff are written as a
 * surrogate pair (two elements).  No terminator is appended.
 */
static int
unicode_to_wc(wchar_t *wp, uint32_t uc)
{
	uint32_t v;

	if (wc_size() == 4 || uc <= 0xffff) {
		/* The code point fits into one wchar_t. */
		*wp = (wchar_t)uc;
		return (1);
	}

	/* Narrow wchar_t and a large code point: use a surrogate pair. */
	v = uc - 0x10000;
	wp[0] = (wchar_t)(0xD800 + ((v >> 10) & 0x3ff));
	wp[1] = (wchar_t)(0xDC00 + (v & 0x3ff));
	return (2);
}
153
/*
 * Note: U+2000 - U+2FFF, U+F900 - U+FAFF and U+2F800 - U+2FAFF are not
 * converted to NFD on Mac OS.
 * see also http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html
 *
 * Returns 1 when `uc` is one of the code points this test expects to stay
 * in its composed form under Mac OS style NFD, and 0 otherwise.
 */
static int
is_mac_nfd_exception(unsigned uc)
{
	switch (uc) {
	/* Code points from the U+2000 - U+2FFF block. */
	case 0x2194: case 0x219A: case 0x219B:
	case 0x21AE: case 0x21CD: case 0x21CE:
	case 0x21CF: case 0x2204: case 0x2209:
	case 0x220C: case 0x2224: case 0x2226:
	case 0x2241: case 0x2244: case 0x2247:
	case 0x2249: case 0x2260: case 0x2262:
	case 0x226D: case 0x226E: case 0x226F:
	case 0x2270: case 0x2271: case 0x2274:
	case 0x2275: case 0x2276: case 0x2278:
	case 0x2279: case 0x227A: case 0x227B:
	case 0x2280: case 0x2281: case 0x2284:
	case 0x2285: case 0x2288: case 0x2289:
	case 0x22AC: case 0x22AD: case 0x22AE:
	case 0x22AF: case 0x22E0: case 0x22E1:
	case 0x22E2: case 0x22E3: case 0x22EA:
	case 0x22EB: case 0x22EC: case 0x22ED:

	/*
	 * Those code points are not converted to NFD on Mac OS either;
	 * the reason is undocumented.
	 *   NFC        NFD
	 *   1109A  ==> 11099 110BA
	 *   1109C  ==> 1109B 110BA
	 *   110AB  ==> 110A5 110BA
	 */
	case 0x1109A: case 0x1109C: case 0x110AB:
		return (1);
	default:
		return (0);
	}
}

/*
 * Parse `pattern`, a list of upper-case hexadecimal code points separated
 * by single non-hex characters, and write the resulting string in four
 * encodings: UTF-8 to `out`, wide characters to `wout`, UTF-16BE to `u16be`
 * and UTF-16LE to `u16le`.  Every output is terminated (two zero bytes for
 * the UTF-16 buffers).  The caller must supply buffers that are large
 * enough; no bounds are checked here.
 *
 * Returns 1 only when `mac_nfd` is non-zero and the FIRST code point of the
 * pattern is one that is not decomposed on Mac OS; returns 0 otherwise.
 */
static int
scan_unicode_pattern(char *out, wchar_t *wout, char *u16be, char *u16le,
    const char *pattern, int mac_nfd)
{
	const char *cursor;
	char *u8 = out;
	wchar_t *wc = wout;
	char *be = u16be;
	char *le = u16le;
	unsigned uc = 0;
	int keep_composed = 0;

	for (cursor = pattern; ; cursor++) {
		const char c = *cursor;

		/* Accumulate hex digits into the current code point. */
		if (c >= '0' && c <= '9') {
			uc = (uc << 4) + (c - '0');
			continue;
		}
		if (c >= 'A' && c <= 'F') {
			uc = (uc << 4) + (c - 'A' + 0x0a);
			continue;
		}

		/*
		 * Any other character (including the terminator) ends the
		 * current code point.  Nothing has been emitted yet exactly
		 * when this is the first code point of the pattern.
		 */
		if (mac_nfd && u8 == out && is_mac_nfd_exception(uc))
			keep_composed = 1;

		be += unicode_to_utf16be(be, uc);
		le += unicode_to_utf16le(le, uc);
		wc += unicode_to_wc(wc, uc);
		u8 += unicode_to_utf8(u8, uc);

		if (c == '\0')
			break;
		uc = 0;
	}

	/* Terminate all four outputs. */
	be[0] = 0;
	be[1] = 0;
	le[0] = 0;
	le[1] = 0;
	*wc = L'\0';
	*u8 = '\0';

	return (keep_composed);
}
235
/*
 * Report whether this test treats wchar_t strings as Unicode regardless of
 * the current locale: 1 on native Windows builds (not Cygwin), 0 elsewhere.
 */
static int
is_wc_unicode(void)
{
	int unicode = 0;

#if defined(_WIN32) && !defined(__CYGWIN__)
	unicode = 1;
#endif
	return (unicode);
}
245
246/*
247 * A conversion test that we correctly normalize UTF-8 and UTF-16BE characters.
248 * On Mac OS, the characters to be Form D.
249 * On other platforms, the characters to be Form C.
250 */
251static void
252test_archive_string_normalization_nfc(const char *testdata)
253{
254	struct archive *a, *a2;
255	struct archive_string utf8;
256	struct archive_mstring mstr;
257	struct archive_string_conv *f_sconv8, *t_sconv8;
258	struct archive_string_conv *f_sconv16be, *f_sconv16le;
259	FILE *fp;
260	char buff[512];
261	int line = 0;
262	int locale_is_utf8, wc_is_unicode;
263	int sconv_opt = SCONV_SET_OPT_NORMALIZATION_C;
264
265	locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
266	wc_is_unicode = is_wc_unicode();
267	/* If it doesn't exist, just warn and return. */
268	if (!locale_is_utf8 && !wc_is_unicode) {
269		skipping("A test of string normalization for NFC requires "
270		    "a suitable locale; en_US.UTF-8 not available on this "
271		    "system");
272		return;
273	}
274
275	archive_string_init(&utf8);
276	memset(&mstr, 0, sizeof(mstr));
277
278	/*
279	 * Create string conversion objects.
280	 */
281	assert((a = archive_read_new()) != NULL);
282	assertA(NULL != (f_sconv8 =
283	    archive_string_conversion_from_charset(a, "UTF-8", 0)));
284	assertA(NULL != (f_sconv16be =
285	    archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
286	assertA(NULL != (f_sconv16le =
287	    archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
288	assert((a2 = archive_write_new()) != NULL);
289	assertA(NULL != (t_sconv8 =
290	    archive_string_conversion_to_charset(a2, "UTF-8", 0)));
291	if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
292	    t_sconv8 == NULL) {
293		/* We cannot continue this test. */
294		assertEqualInt(ARCHIVE_OK, archive_read_free(a));
295		return;
296	}
297	archive_string_conversion_set_opt(f_sconv8, sconv_opt);
298	archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
299	archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
300	archive_string_conversion_set_opt(t_sconv8, sconv_opt);
301
302	/* Open a test pattern file. */
303	assert((fp = fopen(testdata, "r")) != NULL);
304
305	/*
306	 * Read test data.
307	 *  Test data format:
308	 *     <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
309	 *  Unicode pattern format:
310	 *     [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
311	 */
312	while (fgets(buff, sizeof(buff), fp) != NULL) {
313		char nfc[80], nfd[80];
314		char utf8_nfc[80], utf8_nfd[80];
315		char utf16be_nfc[80], utf16be_nfd[80];
316		char utf16le_nfc[80], utf16le_nfd[80];
317		wchar_t wc_nfc[40], wc_nfd[40];
318		char *e, *p;
319		const wchar_t *wp;
320		const char *mp;
321		size_t mplen;
322
323		line++;
324		if (buff[0] == '#')
325			continue;
326		p = strchr(buff, ';');
327		if (p == NULL)
328			continue;
329		*p++ = '\0';
330		/* Copy an NFC pattern */
331		strncpy(nfc, buff, sizeof(nfc)-1);
332		nfc[sizeof(nfc)-1] = '\0';
333		e = p;
334		p = strchr(p, '\n');
335		if (p == NULL)
336			continue;
337		*p = '\0';
338		/* Copy an NFD pattern */
339		strncpy(nfd, e, sizeof(nfd)-1);
340		nfd[sizeof(nfd)-1] = '\0';
341
342		/*
343		 * Get an NFC patterns.
344		 */
345		scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc,
346		    nfc, 0);
347
348		/*
349		 * Get an NFD patterns.
350		 */
351		scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
352		    nfd, 0);
353
354		if (locale_is_utf8) {
355			/*
356			 * Normalize an NFD string for import.
357			 */
358			assertEqualInt(0, archive_strcpy_l(
359			    &utf8, utf8_nfd, f_sconv8));
360			failure("NFD(%s) should be converted to NFC(%s):%d",
361			    nfd, nfc, line);
362			assertEqualUTF8String(utf8_nfc, utf8.s);
363
364			/*
365			 * Normalize an NFC string for import.
366			 */
367			assertEqualInt(0, archive_strcpy_l(
368			    &utf8, utf8_nfc, f_sconv8));
369			failure("NFC(%s) should not be any changed:%d",
370			    nfc, line);
371			assertEqualUTF8String(utf8_nfc, utf8.s);
372
373			/*
374			 * Copy an NFC string for export.
375			 */
376			assertEqualInt(0, archive_strcpy_l(
377			    &utf8, utf8_nfc, t_sconv8));
378			failure("NFC(%s) should not be any changed:%d",
379			    nfc, line);
380			assertEqualUTF8String(utf8_nfc, utf8.s);
381
382			/*
383			 * Normalize an NFD string in UTF-16BE for import.
384			 */
385			assertEqualInt(0, archive_strncpy_l(
386			    &utf8, utf16be_nfd, 100000, f_sconv16be));
387			failure("NFD(%s) should be converted to NFC(%s):%d",
388			    nfd, nfc, line);
389			assertEqualUTF8String(utf8_nfc, utf8.s);
390
391			/*
392			 * Normalize an NFD string in UTF-16LE for import.
393			 */
394			assertEqualInt(0, archive_strncpy_l(
395			    &utf8, utf16le_nfd, 100000, f_sconv16le));
396			failure("NFD(%s) should be converted to NFC(%s):%d",
397			    nfd, nfc, line);
398			assertEqualUTF8String(utf8_nfc, utf8.s);
399		}
400
401		/*
402		 * Test for archive_mstring interface.
403		 * In specific, Windows platform UTF-16BE is directly
404		 * converted to/from wide-character to avoid the effect of
405		 * current locale since windows platform cannot make
406		 * locale UTF-8.
407		 */
408		if (locale_is_utf8 || wc_is_unicode) {
409			/*
410			 * Normalize an NFD string in UTF-8 for import.
411			 */
412			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
413			    &mstr, utf8_nfd, 100000, f_sconv8));
414			assertEqualInt(0,
415			    archive_mstring_get_wcs(a, &mstr, &wp));
416			failure("UTF-8 NFD(%s) should be converted "
417			    "to WCS NFC(%s):%d", nfd, nfc, line);
418			assertEqualWString(wc_nfc, wp);
419
420			/*
421			 * Normalize an NFD string in UTF-16BE for import.
422			 */
423			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
424			    &mstr, utf16be_nfd, 100000, f_sconv16be));
425			assertEqualInt(0,
426			    archive_mstring_get_wcs(a, &mstr, &wp));
427			failure("UTF-8 NFD(%s) should be converted "
428			    "to WCS NFC(%s):%d", nfd, nfc, line);
429			assertEqualWString(wc_nfc, wp);
430
431			/*
432			 * Normalize an NFD string in UTF-16LE for import.
433			 */
434			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
435			    &mstr, utf16le_nfd, 100000, f_sconv16le));
436			assertEqualInt(0,
437			    archive_mstring_get_wcs(a, &mstr, &wp));
438			failure("UTF-8 NFD(%s) should be converted "
439			    "to WCS NFC(%s):%d", nfd, nfc, line);
440			assertEqualWString(wc_nfc, wp);
441
442			/*
443			 * Copy an NFC wide-string for export.
444			 */
445			assertEqualInt(0,
446			    archive_mstring_copy_wcs(&mstr, wc_nfc));
447			assertEqualInt(0, archive_mstring_get_mbs_l(
448			    a, &mstr, &mp, &mplen, t_sconv8));
449			failure("WCS NFC(%s) should be UTF-8 NFC:%d"
450			    ,nfc, line);
451			assertEqualUTF8String(utf8_nfc, mp);
452		}
453	}
454
455	archive_string_free(&utf8);
456	archive_mstring_clean(&mstr);
457	fclose(fp);
458	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
459	assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
460}
461
462static void
463test_archive_string_normalization_mac_nfd(const char *testdata)
464{
465	struct archive *a, *a2;
466	struct archive_string utf8;
467	struct archive_mstring mstr;
468	struct archive_string_conv *f_sconv8, *t_sconv8;
469	struct archive_string_conv *f_sconv16be, *f_sconv16le;
470	FILE *fp;
471	char buff[512];
472	int line = 0;
473	int locale_is_utf8, wc_is_unicode;
474	int sconv_opt = SCONV_SET_OPT_NORMALIZATION_D;
475
476	locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
477	wc_is_unicode = is_wc_unicode();
478	/* If it doesn't exist, just warn and return. */
479	if (!locale_is_utf8 && !wc_is_unicode) {
480		skipping("A test of string normalization for NFD requires "
481		    "a suitable locale; en_US.UTF-8 not available on this "
482		    "system");
483		return;
484	}
485
486	archive_string_init(&utf8);
487	memset(&mstr, 0, sizeof(mstr));
488
489	/*
490	 * Create string conversion objects.
491	 */
492	assert((a = archive_read_new()) != NULL);
493	assertA(NULL != (f_sconv8 =
494	    archive_string_conversion_from_charset(a, "UTF-8", 0)));
495	assertA(NULL != (f_sconv16be =
496	    archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
497	assertA(NULL != (f_sconv16le =
498	    archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
499	assert((a2 = archive_write_new()) != NULL);
500	assertA(NULL != (t_sconv8 =
501	    archive_string_conversion_to_charset(a2, "UTF-8", 0)));
502	if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
503	    t_sconv8 == NULL) {
504		/* We cannot continue this test. */
505		assertEqualInt(ARCHIVE_OK, archive_read_free(a));
506		return;
507	}
508	archive_string_conversion_set_opt(f_sconv8, sconv_opt);
509	archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
510	archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
511	archive_string_conversion_set_opt(t_sconv8, sconv_opt);
512
513	/* Open a test pattern file. */
514	assert((fp = fopen(testdata, "r")) != NULL);
515
516	/*
517	 * Read test data.
518	 *  Test data format:
519	 *     <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
520	 *  Unicode pattern format:
521	 *     [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
522	 */
523	while (fgets(buff, sizeof(buff), fp) != NULL) {
524		char nfc[80], nfd[80];
525		char utf8_nfc[80], utf8_nfd[80];
526		char utf16be_nfc[80], utf16be_nfd[80];
527		char utf16le_nfc[80], utf16le_nfd[80];
528		wchar_t wc_nfc[40], wc_nfd[40];
529		char *e, *p;
530		const wchar_t *wp;
531		const char *mp;
532		size_t mplen;
533		int should_be_nfc;
534
535		line++;
536		if (buff[0] == '#')
537			continue;
538		p = strchr(buff, ';');
539		if (p == NULL)
540			continue;
541		*p++ = '\0';
542		/* Copy an NFC pattern */
543		strncpy(nfc, buff, sizeof(nfc)-1);
544		nfc[sizeof(nfc)-1] = '\0';
545		e = p;
546		p = strchr(p, '\n');
547		if (p == NULL)
548			continue;
549		*p = '\0';
550		/* Copy an NFD pattern */
551		strncpy(nfd, e, sizeof(nfd)-1);
552		nfd[sizeof(nfd)-1] = '\0';
553
554		/*
555		 * Get an NFC patterns.
556		 */
557		should_be_nfc = scan_unicode_pattern(utf8_nfc, wc_nfc,
558			utf16be_nfc, utf16le_nfc, nfc, 1);
559
560		/*
561		 * Get an NFD patterns.
562		 */
563		scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
564		    nfd, 0);
565
566		if (locale_is_utf8) {
567			/*
568			 * Normalize an NFC string for import.
569			 */
570			assertEqualInt(0, archive_strcpy_l(
571			    &utf8, utf8_nfc, f_sconv8));
572			if (should_be_nfc) {
573				failure("NFC(%s) should not be converted to"
574				    " NFD(%s):%d", nfc, nfd, line);
575				assertEqualUTF8String(utf8_nfc, utf8.s);
576			} else {
577				failure("NFC(%s) should be converted to"
578				    " NFD(%s):%d", nfc, nfd, line);
579				assertEqualUTF8String(utf8_nfd, utf8.s);
580			}
581
582			/*
583			 * Normalize an NFD string for import.
584			 */
585			assertEqualInt(0, archive_strcpy_l(
586			    &utf8, utf8_nfd, f_sconv8));
587			failure("NFD(%s) should not be any changed:%d",
588			    nfd, line);
589			assertEqualUTF8String(utf8_nfd, utf8.s);
590
591			/*
592			 * Copy an NFD string for export.
593			 */
594			assertEqualInt(0, archive_strcpy_l(
595			    &utf8, utf8_nfd, t_sconv8));
596			failure("NFD(%s) should not be any changed:%d",
597			    nfd, line);
598			assertEqualUTF8String(utf8_nfd, utf8.s);
599
600			/*
601			 * Normalize an NFC string in UTF-16BE for import.
602			 */
603			assertEqualInt(0, archive_strncpy_l(
604			    &utf8, utf16be_nfc, 100000, f_sconv16be));
605			if (should_be_nfc) {
606				failure("NFC(%s) should not be converted to"
607				    " NFD(%s):%d", nfc, nfd, line);
608				assertEqualUTF8String(utf8_nfc, utf8.s);
609			} else {
610				failure("NFC(%s) should be converted to"
611				    " NFD(%s):%d", nfc, nfd, line);
612				assertEqualUTF8String(utf8_nfd, utf8.s);
613			}
614
615			/*
616			 * Normalize an NFC string in UTF-16LE for import.
617			 */
618			assertEqualInt(0, archive_strncpy_l(
619			    &utf8, utf16le_nfc, 100000, f_sconv16le));
620			if (should_be_nfc) {
621				failure("NFC(%s) should not be converted to"
622				    " NFD(%s):%d", nfc, nfd, line);
623				assertEqualUTF8String(utf8_nfc, utf8.s);
624			} else {
625				failure("NFC(%s) should be converted to"
626				    " NFD(%s):%d", nfc, nfd, line);
627				assertEqualUTF8String(utf8_nfd, utf8.s);
628			}
629		}
630
631		/*
632		 * Test for archive_mstring interface.
633		 * In specific, Windows platform UTF-16BE is directly
634		 * converted to/from wide-character to avoid the effect of
635		 * current locale since windows platform cannot make
636		 * locale UTF-8.
637		 */
638		if (locale_is_utf8 || wc_is_unicode) {
639			/*
640			 * Normalize an NFD string in UTF-8 for import.
641			 */
642			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
643			    &mstr, utf8_nfc, 100000, f_sconv8));
644			assertEqualInt(0,
645			    archive_mstring_get_wcs(a, &mstr, &wp));
646			if (should_be_nfc) {
647				failure("UTF-8 NFC(%s) should not be converted "
648				    "to WCS NFD(%s):%d", nfc, nfd, line);
649				assertEqualWString(wc_nfc, wp);
650			} else {
651				failure("UTF-8 NFC(%s) should be converted "
652				    "to WCS NFD(%s):%d", nfc, nfd, line);
653				assertEqualWString(wc_nfd, wp);
654			}
655
656			/*
657			 * Normalize an NFD string in UTF-16BE for import.
658			 */
659			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
660			    &mstr, utf16be_nfc, 100000, f_sconv16be));
661			assertEqualInt(0,
662			    archive_mstring_get_wcs(a, &mstr, &wp));
663			if (should_be_nfc) {
664				failure("UTF-16BE NFC(%s) should not be "
665				    "converted to WCS NFD(%s):%d",
666				    nfc, nfd, line);
667				assertEqualWString(wc_nfc, wp);
668			} else {
669				failure("UTF-16BE NFC(%s) should be converted "
670				    "to WCS NFD(%s):%d", nfc, nfd, line);
671				assertEqualWString(wc_nfd, wp);
672			}
673
674			/*
675			 * Normalize an NFD string in UTF-16LE for import.
676			 */
677			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
678			    &mstr, utf16le_nfc, 100000, f_sconv16le));
679			assertEqualInt(0,
680			    archive_mstring_get_wcs(a, &mstr, &wp));
681			if (should_be_nfc) {
682				failure("UTF-16LE NFC(%s) should not be "
683				    "converted to WCS NFD(%s):%d",
684				    nfc, nfd, line);
685				assertEqualWString(wc_nfc, wp);
686			} else {
687				failure("UTF-16LE NFC(%s) should be converted "
688				    "to WCS NFD(%s):%d", nfc, nfd, line);
689				assertEqualWString(wc_nfd, wp);
690			}
691
692			/*
693			 * Copy an NFD wide-string for export.
694			 */
695			assertEqualInt(0, archive_mstring_copy_wcs(
696			    &mstr, wc_nfd));
697			assertEqualInt(0, archive_mstring_get_mbs_l(
698			    a, &mstr, &mp, &mplen, t_sconv8));
699			failure("WCS NFD(%s) should be UTF-8 NFD:%d"
700			    ,nfd, line);
701			assertEqualUTF8String(utf8_nfd, mp);
702		}
703	}
704
705	archive_string_free(&utf8);
706	archive_mstring_clean(&mstr);
707	fclose(fp);
708	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
709	assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
710}
711
712static void
713test_archive_string_canonicalization(void)
714{
715	struct archive *a;
716	struct archive_string_conv *sconv;
717
718	setlocale(LC_ALL, "en_US.UTF-8");
719
720	assert((a = archive_read_new()) != NULL);
721
722	assertA(NULL != (sconv =
723	    archive_string_conversion_to_charset(a, "UTF-8", 1)));
724	failure("Charset name should be UTF-8");
725	assertEqualString("UTF-8",
726	    archive_string_conversion_charset_name(sconv));
727
728	assertA(NULL != (sconv =
729	    archive_string_conversion_to_charset(a, "UTF8", 1)));
730	failure("Charset name should be UTF-8");
731	assertEqualString("UTF-8",
732	    archive_string_conversion_charset_name(sconv));
733
734	assertA(NULL != (sconv =
735	    archive_string_conversion_to_charset(a, "utf8", 1)));
736	failure("Charset name should be UTF-8");
737	assertEqualString("UTF-8",
738	    archive_string_conversion_charset_name(sconv));
739
740	assertA(NULL != (sconv =
741	    archive_string_conversion_to_charset(a, "UTF-16BE", 1)));
742	failure("Charset name should be UTF-16BE");
743	assertEqualString("UTF-16BE",
744	    archive_string_conversion_charset_name(sconv));
745
746	assertA(NULL != (sconv =
747	    archive_string_conversion_to_charset(a, "UTF16BE", 1)));
748	failure("Charset name should be UTF-16BE");
749	assertEqualString("UTF-16BE",
750	    archive_string_conversion_charset_name(sconv));
751
752	assertA(NULL != (sconv =
753	    archive_string_conversion_to_charset(a, "utf16be", 1)));
754	failure("Charset name should be UTF-16BE");
755	assertEqualString("UTF-16BE",
756	    archive_string_conversion_charset_name(sconv));
757
758	assertA(NULL != (sconv =
759	    archive_string_conversion_to_charset(a, "UTF-16LE", 1)));
760	failure("Charset name should be UTF-16LE");
761	assertEqualString("UTF-16LE",
762	    archive_string_conversion_charset_name(sconv));
763
764	assertA(NULL != (sconv =
765	    archive_string_conversion_to_charset(a, "UTF16LE", 1)));
766	failure("Charset name should be UTF-16LE");
767	assertEqualString("UTF-16LE",
768	    archive_string_conversion_charset_name(sconv));
769
770	assertA(NULL != (sconv =
771	    archive_string_conversion_to_charset(a, "utf16le", 1)));
772	failure("Charset name should be UTF-16LE");
773	assertEqualString("UTF-16LE",
774	    archive_string_conversion_charset_name(sconv));
775
776	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
777
778}
779
780static void
781check_string(struct archive *a, struct archive_mstring *mstr, struct archive_string_conv *sc,
782  const char *exp, const wchar_t *wexp)
783{
784	/* Do all the tests on a copy so that we can have a clear initial state every time */
785	struct archive_mstring mstr2;
786	const char *p = NULL;
787	const wchar_t *wp = NULL;
788	size_t len = 0;
789
790	memset(&mstr2, 0, sizeof(mstr2));
791
792	archive_mstring_copy(&mstr2, mstr);
793	assertEqualInt(0, archive_mstring_get_mbs(a, &mstr2, &p));
794	assertEqualString(exp, p);
795	p = NULL;
796
797	archive_mstring_copy(&mstr2, mstr);
798	assertEqualInt(0, archive_mstring_get_utf8(a, &mstr2, &p));
799	assertEqualString(exp, p);
800	p = NULL;
801
802	archive_mstring_copy(&mstr2, mstr);
803	assertEqualInt(0, archive_mstring_get_wcs(a, &mstr2, &wp));
804	assertEqualWString(wexp, wp);
805	wp = NULL;
806
807	archive_mstring_copy(&mstr2, mstr);
808	assertEqualInt(0, archive_mstring_get_mbs_l(a, &mstr2, &p, &len, sc));
809	assertEqualString(exp, p);
810	assertEqualInt(len, strlen(exp));
811	p = NULL;
812	len = 0;
813
814	archive_mstring_clean(&mstr2);
815}
816
817/*
818 * Make sure no matter what the input encoding is, the string can be
819 * converted too all the output encodings.
820 */
821static void
822test_archive_string_set_get(void)
823{
824	struct archive *a;
825	struct archive_mstring mstr;
826	struct archive_string_conv *sc;
827
828	setlocale(LC_ALL, "en_US.UTF-8");
829
830	assert((a = archive_read_new()) != NULL);
831	memset(&mstr, 0, sizeof(mstr));
832
833	assertA(NULL != (sc =
834	    archive_string_conversion_to_charset(a, "UTF-8", 1)));
835	failure("Charset name should be UTF-8");
836	assertEqualString("UTF-8",
837	    archive_string_conversion_charset_name(sc));
838
839	assertEqualInt(0, archive_mstring_copy_mbs(&mstr, "AAA"));
840	check_string(a, &mstr, sc, "AAA", L"AAA");
841	assertEqualInt(4, archive_mstring_copy_utf8(&mstr, "BBBB"));
842	check_string(a, &mstr, sc, "BBBB", L"BBBB");
843	assertEqualInt(0, archive_mstring_copy_wcs(&mstr, L"CCC12"));
844	check_string(a, &mstr, sc, "CCC12", L"CCC12");
845	assertEqualInt(0, archive_mstring_copy_mbs_len_l(&mstr, "DDDD-l", 6, sc));
846	check_string(a, &mstr, sc, "DDDD-l", L"DDDD-l");
847	assertEqualInt(0, archive_mstring_update_utf8(a, &mstr, "EEEEE---H"));
848	check_string(a, &mstr, sc, "EEEEE---H", L"EEEEE---H");
849
850	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
851
852}
853
854DEFINE_TEST(test_archive_string_conversion)
855{
856	static const char reffile[] = "test_archive_string_conversion.txt.Z";
857	static const char testdata[] = "testdata.txt";
858	struct archive *a;
859	struct archive_entry *ae;
860	char buff[512];
861	ssize_t size;
862	FILE *fp;
863
864	/*
865	 * Extract a test pattern file.
866	 */
867	extract_reference_file(reffile);
868	assert((a = archive_read_new()) != NULL);
869	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
870	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_raw(a));
871        assertEqualIntA(a, ARCHIVE_OK,
872            archive_read_open_filename(a, reffile, 512));
873
874	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
875	assert((fp = fopen(testdata, "w")) != NULL);
876	while ((size = archive_read_data(a, buff, 512)) > 0)
877		assertEqualInt(size, fwrite(buff, 1, size, fp));
878	assertEqualInt(0, fclose(fp));
879	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
880
881	test_archive_string_normalization_nfc(testdata);
882	test_archive_string_normalization_mac_nfd(testdata);
883	test_archive_string_canonicalization();
884	test_archive_string_set_get();
885}
886