1/*-
2 * Copyright (c) 2011-2012 Michihiro NAKAJIMA
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25#include "test.h"
26
27#include <locale.h>
28
29#define __LIBARCHIVE_TEST
30#include "archive_string.h"
31
32/*
33Execute the following to rebuild the data for this program:
34   tail -n +36 test_archive_string_conversion.c | /bin/sh
35#
36# This requires http://unicode.org/Public/6.0.0/ucd/NormalizationTest.txt
37#
38if="NormalizationTest.txt"
39if [ ! -f ${if} ]; then
40  echo "Not found: \"${if}\""
41  exit 0
42fi
43of=test_archive_string_conversion.txt.Z
44awk -F ';'  '$0 ~/^[0-9A-F]+/ {printf "%s;%s\n", $2, $3}' ${if} | compress | uuencode ${of} > ${of}.uu
45exit 1
46*/
47
/*
 * Encode the code point `uc` as UTF-8 into the buffer at `p`.
 *
 * Between one and four bytes are written; no terminating NUL is added.
 * The code point is not range-checked: anything above U+FFFF is emitted
 * as a four-byte sequence.
 *
 * Returns the number of bytes written.
 */
static int
unicode_to_utf8(char *p, uint32_t uc)
{
	int n;

	if (uc <= 0x7f) {
		/* 0xxxxxxx */
		p[0] = (char)uc;
		n = 1;
	} else if (uc <= 0x7ff) {
		/* 110xxxxx 10xxxxxx */
		p[0] = (char)(0xc0 | ((uc >> 6) & 0x1f));
		p[1] = (char)(0x80 | (uc & 0x3f));
		n = 2;
	} else if (uc <= 0xffff) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		p[0] = (char)(0xe0 | ((uc >> 12) & 0x0f));
		p[1] = (char)(0x80 | ((uc >> 6) & 0x3f));
		p[2] = (char)(0x80 | (uc & 0x3f));
		n = 3;
	} else {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		p[0] = (char)(0xf0 | ((uc >> 18) & 0x07));
		p[1] = (char)(0x80 | ((uc >> 12) & 0x3f));
		p[2] = (char)(0x80 | ((uc >> 6) & 0x3f));
		p[3] = (char)(0x80 | (uc & 0x3f));
		n = 4;
	}
	return (n);
}
71
/*
 * Store the 16-bit value `u` at `pp` in big-endian byte order
 * (most significant byte first).  Exactly two bytes are written.
 */
static void
archive_be16enc(void *pp, uint16_t u)
{
	unsigned char *dst = pp;

	dst[0] = (unsigned char)(u >> 8);
	dst[1] = (unsigned char)(u & 0xff);
}
80
/*
 * Encode the code point `uc` as UTF-16BE into the buffer at `p`.
 *
 * Code points up to U+FFFF take one 16-bit unit (2 bytes); anything
 * larger is split into a high/low surrogate pair (4 bytes).  No
 * terminator is written.
 *
 * Returns the number of bytes written (2 or 4).
 */
static int
unicode_to_utf16be(char *p, uint32_t uc)
{
	uint32_t off;

	if (uc <= 0xffff) {
		/* Fits in a single 16-bit unit. */
		archive_be16enc(p, (uint16_t)uc);
		return (2);
	}

	/* Beyond the BMP: emit a surrogate pair. */
	off = uc - 0x10000;
	archive_be16enc(p, (uint16_t)(((off >> 10) & 0x3ff) + 0xD800));
	archive_be16enc(p + 2, (uint16_t)((off & 0x3ff) + 0xDC00));
	return (4);
}
98
/*
 * Store the 16-bit value `u` at `pp` in little-endian byte order
 * (least significant byte first).  Exactly two bytes are written.
 */
static void
archive_le16enc(void *pp, uint16_t u)
{
	unsigned char *dst = pp;

	dst[0] = (unsigned char)(u & 0xff);
	dst[1] = (unsigned char)(u >> 8);
}
107
/*
 * Encode the code point `uc` as UTF-16LE into the buffer at `p`.
 *
 * Code points up to U+FFFF take one 16-bit unit (2 bytes); anything
 * larger is split into a high/low surrogate pair (4 bytes).  No
 * terminator is written.
 *
 * Returns the number of bytes written (2 or 4).
 */
static size_t
unicode_to_utf16le(char *p, uint32_t uc)
{
	uint32_t off;

	if (uc <= 0xffff) {
		/* Fits in a single 16-bit unit. */
		archive_le16enc(p, (uint16_t)uc);
		return (2);
	}

	/* Beyond the BMP: emit a surrogate pair. */
	off = uc - 0x10000;
	archive_le16enc(p, (uint16_t)(((off >> 10) & 0x3ff) + 0xD800));
	archive_le16enc(p + 2, (uint16_t)((off & 0x3ff) + 0xDC00));
	return (4);
}
125
/*
 * Return the size of wchar_t in bytes on this platform, as an int.
 */
static int
wc_size(void)
{
	const size_t width = sizeof(wchar_t);

	return ((int)width);
}
131
/*
 * Store the code point `uc` as wide character(s) at `wp`.
 *
 * When wchar_t is 4 bytes wide, or the code point is at most U+FFFF,
 * a single wchar_t is written.  Otherwise the code point is written
 * as a surrogate pair occupying two wchar_t elements.  No terminator
 * is written.
 *
 * Returns the number of wchar_t elements written (1 or 2).
 */
static int
unicode_to_wc(wchar_t *wp, uint32_t uc)
{
	uint32_t off;

	if (wc_size() == 4 || uc <= 0xffff) {
		*wp = (wchar_t)uc;
		return (1);
	}

	/* The code point does not fit into one wchar_t: split it. */
	off = uc - 0x10000;
	wp[0] = (wchar_t)(((off >> 10) & 0x3ff) + 0xD800);
	wp[1] = (wchar_t)((off & 0x3ff) + 0xDC00);
	return (2);
}
151
/*
 * Parse a pattern of upper-case hexadecimal code points and emit the
 * resulting string in four encodings at once:
 *   out   - UTF-8, NUL terminated
 *   wout  - wide characters, L'\0' terminated
 *   u16be - UTF-16BE, terminated by two zero bytes
 *   u16le - UTF-16LE, terminated by two zero bytes
 *
 * Every character of `pattern` that is not one of [0-9A-F] ends the code
 * point accumulated so far; the terminating NUL ends the last one.
 * The output buffers are not bounds-checked; the caller must size them.
 *
 * When `mac_nfd` is non-zero, the first code point of the pattern is
 * compared against a list of code points that, according to the notes
 * below, Mac OS does not convert to NFD; the function returns 1 on a
 * match.  In every other case it returns 0.
 *
 * Note: U+2000 - U+2FFF, U+F900 - U+FAFF and U+2F800 - U+2FAFF are not
 * converted to NFD on Mac OS.
 * see also http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html
 */
static int
scan_unicode_pattern(char *out, wchar_t *wout, char *u16be, char *u16le,
    const char *pattern, int mac_nfd)
{
	const char *src;
	char *u8_cur = out;
	wchar_t *wc_cur = wout;
	char *be_cur = u16be;
	char *le_cur = u16le;
	unsigned cp = 0;
	int keep_nfc = 0;

	for (src = pattern; ; src++) {
		const char c = *src;

		/* Hex digits extend the code point being accumulated. */
		if (c >= '0' && c <= '9') {
			cp = (cp << 4) + (c - '0');
			continue;
		}
		if (c >= 'A' && c <= 'F') {
			cp = (cp << 4) + (c - 'A' + 0x0a);
			continue;
		}

		/*
		 * Anything else closes the current code point.  Nothing has
		 * been written to the UTF-8 buffer yet only for the first
		 * code point, so the exception list is consulted just once.
		 */
		if (mac_nfd && u8_cur == out) {
			switch (cp) {
			/*
			 * These are not converted to NFD on Mac OS.
			 * U+2000 - U+2FFF
			 * U+F900 - U+FAFF
			 * U+2F800 - U+2FAFF
			 */
			case 0x2194: case 0x219A: case 0x219B:
			case 0x21AE: case 0x21CD: case 0x21CE:
			case 0x21CF: case 0x2204: case 0x2209:
			case 0x220C: case 0x2224: case 0x2226:
			case 0x2241: case 0x2244: case 0x2247:
			case 0x2249: case 0x2260: case 0x2262:
			case 0x226D: case 0x226E: case 0x226F:
			case 0x2270: case 0x2271: case 0x2274:
			case 0x2275: case 0x2276: case 0x2278:
			case 0x2279: case 0x227A: case 0x227B:
			case 0x2280: case 0x2281: case 0x2284:
			case 0x2285: case 0x2288: case 0x2289:
			case 0x22AC: case 0x22AD: case 0x22AE:
			case 0x22AF: case 0x22E0: case 0x22E1:
			case 0x22E2: case 0x22E3: case 0x22EA:
			case 0x22EB: case 0x22EC: case 0x22ED:
			/*
			 * Those code points are not converted to NFD on
			 * Mac OS either; the reason is undocumented.
			 *   NFC        NFD
			 *   1109A  ==> 11099 110BA
			 *   1109C  ==> 1109B 110BA
			 *   110AB  ==> 110A5 110BA
			 */
			case 0x1109A: case 0x1109C: case 0x110AB:
				keep_nfc = 1;
				break;
			default:
				break;
			}
		}

		/* Append the code point to each of the four outputs. */
		be_cur += unicode_to_utf16be(be_cur, cp);
		le_cur += unicode_to_utf16le(le_cur, cp);
		wc_cur += unicode_to_wc(wc_cur, cp);
		u8_cur += unicode_to_utf8(u8_cur, cp);

		if (c == '\0')
			break;
		cp = 0;
	}

	/* Terminate every output string. */
	be_cur[0] = 0;
	be_cur[1] = 0;
	le_cur[0] = 0;
	le_cur[1] = 0;
	*wc_cur = L'\0';
	*u8_cur = '\0';

	return (keep_nfc);
}
233
/*
 * Report whether this build targets native Windows (_WIN32 defined and
 * __CYGWIN__ not defined).  The callers use the result as "wchar_t
 * strings are Unicode regardless of the current locale".
 *
 * Returns 1 on native Windows, 0 everywhere else.
 */
static int
is_wc_unicode(void)
{
	int native_windows = 0;

#if defined(_WIN32) && !defined(__CYGWIN__)
	native_windows = 1;
#endif
	return (native_windows);
}
243
244/*
245 * A conversion test that we correctly normalize UTF-8 and UTF-16BE characters.
246 * On Mac OS, the characters to be Form D.
247 * On other platforms, the characters to be Form C.
248 */
249static void
250test_archive_string_normalization_nfc(const char *testdata)
251{
252	struct archive *a, *a2;
253	struct archive_string utf8;
254	struct archive_mstring mstr;
255	struct archive_string_conv *f_sconv8, *t_sconv8;
256	struct archive_string_conv *f_sconv16be, *f_sconv16le;
257	FILE *fp;
258	char buff[512];
259	int line = 0;
260	int locale_is_utf8, wc_is_unicode;
261	int sconv_opt = SCONV_SET_OPT_NORMALIZATION_C;
262
263	locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
264	wc_is_unicode = is_wc_unicode();
265	/* If it doesn't exist, just warn and return. */
266	if (!locale_is_utf8 && !wc_is_unicode) {
267		skipping("A test of string normalization for NFC requires "
268		    "a suitable locale; en_US.UTF-8 not available on this "
269		    "system");
270		return;
271	}
272
273	archive_string_init(&utf8);
274	memset(&mstr, 0, sizeof(mstr));
275
276	/*
277	 * Create string conversion objects.
278	 */
279	assert((a = archive_read_new()) != NULL);
280	assertA(NULL != (f_sconv8 =
281	    archive_string_conversion_from_charset(a, "UTF-8", 0)));
282	assertA(NULL != (f_sconv16be =
283	    archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
284	assertA(NULL != (f_sconv16le =
285	    archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
286	assert((a2 = archive_write_new()) != NULL);
287	assertA(NULL != (t_sconv8 =
288	    archive_string_conversion_to_charset(a2, "UTF-8", 0)));
289	if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
290	    t_sconv8 == NULL) {
291		/* We cannot continue this test. */
292		assertEqualInt(ARCHIVE_OK, archive_read_free(a));
293		return;
294	}
295	archive_string_conversion_set_opt(f_sconv8, sconv_opt);
296	archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
297	archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
298	archive_string_conversion_set_opt(t_sconv8, sconv_opt);
299
300	/* Open a test pattern file. */
301	assert((fp = fopen(testdata, "r")) != NULL);
302
303	/*
304	 * Read test data.
305	 *  Test data format:
306	 *     <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
307	 *  Unicode pattern format:
308	 *     [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
309	 */
310	while (fgets(buff, sizeof(buff), fp) != NULL) {
311		char nfc[80], nfd[80];
312		char utf8_nfc[80], utf8_nfd[80];
313		char utf16be_nfc[80], utf16be_nfd[80];
314		char utf16le_nfc[80], utf16le_nfd[80];
315		wchar_t wc_nfc[40], wc_nfd[40];
316		char *e, *p;
317		const wchar_t *wp;
318		const char *mp;
319		size_t mplen;
320
321		line++;
322		if (buff[0] == '#')
323			continue;
324		p = strchr(buff, ';');
325		if (p == NULL)
326			continue;
327		*p++ = '\0';
328		/* Copy an NFC pattern */
329		strncpy(nfc, buff, sizeof(nfc)-1);
330		nfc[sizeof(nfc)-1] = '\0';
331		e = p;
332		p = strchr(p, '\n');
333		if (p == NULL)
334			continue;
335		*p = '\0';
336		/* Copy an NFD pattern */
337		strncpy(nfd, e, sizeof(nfd)-1);
338		nfd[sizeof(nfd)-1] = '\0';
339
340		/*
341		 * Get an NFC patterns.
342		 */
343		scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc,
344		    nfc, 0);
345
346		/*
347		 * Get an NFD patterns.
348		 */
349		scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
350		    nfd, 0);
351
352		if (locale_is_utf8) {
353			/*
354			 * Normalize an NFD string for import.
355			 */
356			assertEqualInt(0, archive_strcpy_l(
357			    &utf8, utf8_nfd, f_sconv8));
358			failure("NFD(%s) should be converted to NFC(%s):%d",
359			    nfd, nfc, line);
360			assertEqualUTF8String(utf8_nfc, utf8.s);
361
362			/*
363			 * Normalize an NFC string for import.
364			 */
365			assertEqualInt(0, archive_strcpy_l(
366			    &utf8, utf8_nfc, f_sconv8));
367			failure("NFC(%s) should not be any changed:%d",
368			    nfc, line);
369			assertEqualUTF8String(utf8_nfc, utf8.s);
370
371			/*
372			 * Copy an NFC string for export.
373			 */
374			assertEqualInt(0, archive_strcpy_l(
375			    &utf8, utf8_nfc, t_sconv8));
376			failure("NFC(%s) should not be any changed:%d",
377			    nfc, line);
378			assertEqualUTF8String(utf8_nfc, utf8.s);
379
380			/*
381			 * Normalize an NFD string in UTF-16BE for import.
382			 */
383			assertEqualInt(0, archive_strncpy_l(
384			    &utf8, utf16be_nfd, 100000, f_sconv16be));
385			failure("NFD(%s) should be converted to NFC(%s):%d",
386			    nfd, nfc, line);
387			assertEqualUTF8String(utf8_nfc, utf8.s);
388
389			/*
390			 * Normalize an NFD string in UTF-16LE for import.
391			 */
392			assertEqualInt(0, archive_strncpy_l(
393			    &utf8, utf16le_nfd, 100000, f_sconv16le));
394			failure("NFD(%s) should be converted to NFC(%s):%d",
395			    nfd, nfc, line);
396			assertEqualUTF8String(utf8_nfc, utf8.s);
397		}
398
399		/*
400		 * Test for archive_mstring interface.
401		 * In specific, Windows platform UTF-16BE is directly
402		 * converted to/from wide-character to avoid the effect of
403		 * current locale since windows platform cannot make
404		 * locale UTF-8.
405		 */
406		if (locale_is_utf8 || wc_is_unicode) {
407			/*
408			 * Normalize an NFD string in UTF-8 for import.
409			 */
410			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
411			    &mstr, utf8_nfd, 100000, f_sconv8));
412			assertEqualInt(0,
413			    archive_mstring_get_wcs(a, &mstr, &wp));
414			failure("UTF-8 NFD(%s) should be converted "
415			    "to WCS NFC(%s):%d", nfd, nfc, line);
416			assertEqualWString(wc_nfc, wp);
417
418			/*
419			 * Normalize an NFD string in UTF-16BE for import.
420			 */
421			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
422			    &mstr, utf16be_nfd, 100000, f_sconv16be));
423			assertEqualInt(0,
424			    archive_mstring_get_wcs(a, &mstr, &wp));
425			failure("UTF-8 NFD(%s) should be converted "
426			    "to WCS NFC(%s):%d", nfd, nfc, line);
427			assertEqualWString(wc_nfc, wp);
428
429			/*
430			 * Normalize an NFD string in UTF-16LE for import.
431			 */
432			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
433			    &mstr, utf16le_nfd, 100000, f_sconv16le));
434			assertEqualInt(0,
435			    archive_mstring_get_wcs(a, &mstr, &wp));
436			failure("UTF-8 NFD(%s) should be converted "
437			    "to WCS NFC(%s):%d", nfd, nfc, line);
438			assertEqualWString(wc_nfc, wp);
439
440			/*
441			 * Copy an NFC wide-string for export.
442			 */
443			assertEqualInt(0,
444			    archive_mstring_copy_wcs(&mstr, wc_nfc));
445			assertEqualInt(0, archive_mstring_get_mbs_l(
446			    a, &mstr, &mp, &mplen, t_sconv8));
447			failure("WCS NFC(%s) should be UTF-8 NFC:%d"
448			    ,nfc, line);
449			assertEqualUTF8String(utf8_nfc, mp);
450		}
451	}
452
453	archive_string_free(&utf8);
454	archive_mstring_clean(&mstr);
455	fclose(fp);
456	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
457	assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
458}
459
460static void
461test_archive_string_normalization_mac_nfd(const char *testdata)
462{
463	struct archive *a, *a2;
464	struct archive_string utf8;
465	struct archive_mstring mstr;
466	struct archive_string_conv *f_sconv8, *t_sconv8;
467	struct archive_string_conv *f_sconv16be, *f_sconv16le;
468	FILE *fp;
469	char buff[512];
470	int line = 0;
471	int locale_is_utf8, wc_is_unicode;
472	int sconv_opt = SCONV_SET_OPT_NORMALIZATION_D;
473
474	locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
475	wc_is_unicode = is_wc_unicode();
476	/* If it doesn't exist, just warn and return. */
477	if (!locale_is_utf8 && !wc_is_unicode) {
478		skipping("A test of string normalization for NFD requires "
479		    "a suitable locale; en_US.UTF-8 not available on this "
480		    "system");
481		return;
482	}
483
484	archive_string_init(&utf8);
485	memset(&mstr, 0, sizeof(mstr));
486
487	/*
488	 * Create string conversion objects.
489	 */
490	assert((a = archive_read_new()) != NULL);
491	assertA(NULL != (f_sconv8 =
492	    archive_string_conversion_from_charset(a, "UTF-8", 0)));
493	assertA(NULL != (f_sconv16be =
494	    archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
495	assertA(NULL != (f_sconv16le =
496	    archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
497	assert((a2 = archive_write_new()) != NULL);
498	assertA(NULL != (t_sconv8 =
499	    archive_string_conversion_to_charset(a2, "UTF-8", 0)));
500	if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
501	    t_sconv8 == NULL) {
502		/* We cannot continue this test. */
503		assertEqualInt(ARCHIVE_OK, archive_read_free(a));
504		return;
505	}
506	archive_string_conversion_set_opt(f_sconv8, sconv_opt);
507	archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
508	archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
509	archive_string_conversion_set_opt(t_sconv8, sconv_opt);
510
511	/* Open a test pattern file. */
512	assert((fp = fopen(testdata, "r")) != NULL);
513
514	/*
515	 * Read test data.
516	 *  Test data format:
517	 *     <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
518	 *  Unicode pattern format:
519	 *     [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
520	 */
521	while (fgets(buff, sizeof(buff), fp) != NULL) {
522		char nfc[80], nfd[80];
523		char utf8_nfc[80], utf8_nfd[80];
524		char utf16be_nfc[80], utf16be_nfd[80];
525		char utf16le_nfc[80], utf16le_nfd[80];
526		wchar_t wc_nfc[40], wc_nfd[40];
527		char *e, *p;
528		const wchar_t *wp;
529		const char *mp;
530		size_t mplen;
531		int should_be_nfc;
532
533		line++;
534		if (buff[0] == '#')
535			continue;
536		p = strchr(buff, ';');
537		if (p == NULL)
538			continue;
539		*p++ = '\0';
540		/* Copy an NFC pattern */
541		strncpy(nfc, buff, sizeof(nfc)-1);
542		nfc[sizeof(nfc)-1] = '\0';
543		e = p;
544		p = strchr(p, '\n');
545		if (p == NULL)
546			continue;
547		*p = '\0';
548		/* Copy an NFD pattern */
549		strncpy(nfd, e, sizeof(nfd)-1);
550		nfd[sizeof(nfd)-1] = '\0';
551
552		/*
553		 * Get an NFC patterns.
554		 */
555		should_be_nfc = scan_unicode_pattern(utf8_nfc, wc_nfc,
556			utf16be_nfc, utf16le_nfc, nfc, 1);
557
558		/*
559		 * Get an NFD patterns.
560		 */
561		scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
562		    nfd, 0);
563
564		if (locale_is_utf8) {
565			/*
566			 * Normalize an NFC string for import.
567			 */
568			assertEqualInt(0, archive_strcpy_l(
569			    &utf8, utf8_nfc, f_sconv8));
570			if (should_be_nfc) {
571				failure("NFC(%s) should not be converted to"
572				    " NFD(%s):%d", nfc, nfd, line);
573				assertEqualUTF8String(utf8_nfc, utf8.s);
574			} else {
575				failure("NFC(%s) should be converted to"
576				    " NFD(%s):%d", nfc, nfd, line);
577				assertEqualUTF8String(utf8_nfd, utf8.s);
578			}
579
580			/*
581			 * Normalize an NFD string for import.
582			 */
583			assertEqualInt(0, archive_strcpy_l(
584			    &utf8, utf8_nfd, f_sconv8));
585			failure("NFD(%s) should not be any changed:%d",
586			    nfd, line);
587			assertEqualUTF8String(utf8_nfd, utf8.s);
588
589			/*
590			 * Copy an NFD string for export.
591			 */
592			assertEqualInt(0, archive_strcpy_l(
593			    &utf8, utf8_nfd, t_sconv8));
594			failure("NFD(%s) should not be any changed:%d",
595			    nfd, line);
596			assertEqualUTF8String(utf8_nfd, utf8.s);
597
598			/*
599			 * Normalize an NFC string in UTF-16BE for import.
600			 */
601			assertEqualInt(0, archive_strncpy_l(
602			    &utf8, utf16be_nfc, 100000, f_sconv16be));
603			if (should_be_nfc) {
604				failure("NFC(%s) should not be converted to"
605				    " NFD(%s):%d", nfc, nfd, line);
606				assertEqualUTF8String(utf8_nfc, utf8.s);
607			} else {
608				failure("NFC(%s) should be converted to"
609				    " NFD(%s):%d", nfc, nfd, line);
610				assertEqualUTF8String(utf8_nfd, utf8.s);
611			}
612
613			/*
614			 * Normalize an NFC string in UTF-16LE for import.
615			 */
616			assertEqualInt(0, archive_strncpy_l(
617			    &utf8, utf16le_nfc, 100000, f_sconv16le));
618			if (should_be_nfc) {
619				failure("NFC(%s) should not be converted to"
620				    " NFD(%s):%d", nfc, nfd, line);
621				assertEqualUTF8String(utf8_nfc, utf8.s);
622			} else {
623				failure("NFC(%s) should be converted to"
624				    " NFD(%s):%d", nfc, nfd, line);
625				assertEqualUTF8String(utf8_nfd, utf8.s);
626			}
627		}
628
629		/*
630		 * Test for archive_mstring interface.
631		 * In specific, Windows platform UTF-16BE is directly
632		 * converted to/from wide-character to avoid the effect of
633		 * current locale since windows platform cannot make
634		 * locale UTF-8.
635		 */
636		if (locale_is_utf8 || wc_is_unicode) {
637			/*
638			 * Normalize an NFD string in UTF-8 for import.
639			 */
640			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
641			    &mstr, utf8_nfc, 100000, f_sconv8));
642			assertEqualInt(0,
643			    archive_mstring_get_wcs(a, &mstr, &wp));
644			if (should_be_nfc) {
645				failure("UTF-8 NFC(%s) should not be converted "
646				    "to WCS NFD(%s):%d", nfc, nfd, line);
647				assertEqualWString(wc_nfc, wp);
648			} else {
649				failure("UTF-8 NFC(%s) should be converted "
650				    "to WCS NFD(%s):%d", nfc, nfd, line);
651				assertEqualWString(wc_nfd, wp);
652			}
653
654			/*
655			 * Normalize an NFD string in UTF-16BE for import.
656			 */
657			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
658			    &mstr, utf16be_nfc, 100000, f_sconv16be));
659			assertEqualInt(0,
660			    archive_mstring_get_wcs(a, &mstr, &wp));
661			if (should_be_nfc) {
662				failure("UTF-16BE NFC(%s) should not be "
663				    "converted to WCS NFD(%s):%d",
664				    nfc, nfd, line);
665				assertEqualWString(wc_nfc, wp);
666			} else {
667				failure("UTF-16BE NFC(%s) should be converted "
668				    "to WCS NFD(%s):%d", nfc, nfd, line);
669				assertEqualWString(wc_nfd, wp);
670			}
671
672			/*
673			 * Normalize an NFD string in UTF-16LE for import.
674			 */
675			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
676			    &mstr, utf16le_nfc, 100000, f_sconv16le));
677			assertEqualInt(0,
678			    archive_mstring_get_wcs(a, &mstr, &wp));
679			if (should_be_nfc) {
680				failure("UTF-16LE NFC(%s) should not be "
681				    "converted to WCS NFD(%s):%d",
682				    nfc, nfd, line);
683				assertEqualWString(wc_nfc, wp);
684			} else {
685				failure("UTF-16LE NFC(%s) should be converted "
686				    "to WCS NFD(%s):%d", nfc, nfd, line);
687				assertEqualWString(wc_nfd, wp);
688			}
689
690			/*
691			 * Copy an NFD wide-string for export.
692			 */
693			assertEqualInt(0, archive_mstring_copy_wcs(
694			    &mstr, wc_nfd));
695			assertEqualInt(0, archive_mstring_get_mbs_l(
696			    a, &mstr, &mp, &mplen, t_sconv8));
697			failure("WCS NFD(%s) should be UTF-8 NFD:%d"
698			    ,nfd, line);
699			assertEqualUTF8String(utf8_nfd, mp);
700		}
701	}
702
703	archive_string_free(&utf8);
704	archive_mstring_clean(&mstr);
705	fclose(fp);
706	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
707	assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
708}
709
710static void
711test_archive_string_canonicalization(void)
712{
713	struct archive *a;
714	struct archive_string_conv *sconv;
715
716	setlocale(LC_ALL, "en_US.UTF-8");
717
718	assert((a = archive_read_new()) != NULL);
719
720	assertA(NULL != (sconv =
721	    archive_string_conversion_to_charset(a, "UTF-8", 1)));
722	failure("Charset name should be UTF-8");
723	assertEqualString("UTF-8",
724	    archive_string_conversion_charset_name(sconv));
725
726	assertA(NULL != (sconv =
727	    archive_string_conversion_to_charset(a, "UTF8", 1)));
728	failure("Charset name should be UTF-8");
729	assertEqualString("UTF-8",
730	    archive_string_conversion_charset_name(sconv));
731
732	assertA(NULL != (sconv =
733	    archive_string_conversion_to_charset(a, "utf8", 1)));
734	failure("Charset name should be UTF-8");
735	assertEqualString("UTF-8",
736	    archive_string_conversion_charset_name(sconv));
737
738	assertA(NULL != (sconv =
739	    archive_string_conversion_to_charset(a, "UTF-16BE", 1)));
740	failure("Charset name should be UTF-16BE");
741	assertEqualString("UTF-16BE",
742	    archive_string_conversion_charset_name(sconv));
743
744	assertA(NULL != (sconv =
745	    archive_string_conversion_to_charset(a, "UTF16BE", 1)));
746	failure("Charset name should be UTF-16BE");
747	assertEqualString("UTF-16BE",
748	    archive_string_conversion_charset_name(sconv));
749
750	assertA(NULL != (sconv =
751	    archive_string_conversion_to_charset(a, "utf16be", 1)));
752	failure("Charset name should be UTF-16BE");
753	assertEqualString("UTF-16BE",
754	    archive_string_conversion_charset_name(sconv));
755
756	assertA(NULL != (sconv =
757	    archive_string_conversion_to_charset(a, "UTF-16LE", 1)));
758	failure("Charset name should be UTF-16LE");
759	assertEqualString("UTF-16LE",
760	    archive_string_conversion_charset_name(sconv));
761
762	assertA(NULL != (sconv =
763	    archive_string_conversion_to_charset(a, "UTF16LE", 1)));
764	failure("Charset name should be UTF-16LE");
765	assertEqualString("UTF-16LE",
766	    archive_string_conversion_charset_name(sconv));
767
768	assertA(NULL != (sconv =
769	    archive_string_conversion_to_charset(a, "utf16le", 1)));
770	failure("Charset name should be UTF-16LE");
771	assertEqualString("UTF-16LE",
772	    archive_string_conversion_charset_name(sconv));
773
774	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
775
776}
777
778static void
779check_string(struct archive *a, struct archive_mstring *mstr, struct archive_string_conv *sc,
780  const char *exp, const wchar_t *wexp)
781{
782	/* Do all the tests on a copy so that we can have a clear initial state every time */
783	struct archive_mstring mstr2;
784	const char *p = NULL;
785	const wchar_t *wp = NULL;
786	size_t len = 0;
787
788	memset(&mstr2, 0, sizeof(mstr2));
789
790	archive_mstring_copy(&mstr2, mstr);
791	assertEqualInt(0, archive_mstring_get_mbs(a, &mstr2, &p));
792	assertEqualString(exp, p);
793	p = NULL;
794
795	archive_mstring_copy(&mstr2, mstr);
796	assertEqualInt(0, archive_mstring_get_utf8(a, &mstr2, &p));
797	assertEqualString(exp, p);
798	p = NULL;
799
800	archive_mstring_copy(&mstr2, mstr);
801	assertEqualInt(0, archive_mstring_get_wcs(a, &mstr2, &wp));
802	assertEqualWString(wexp, wp);
803	wp = NULL;
804
805	archive_mstring_copy(&mstr2, mstr);
806	assertEqualInt(0, archive_mstring_get_mbs_l(a, &mstr2, &p, &len, sc));
807	assertEqualString(exp, p);
808	assertEqualInt(len, strlen(exp));
809	p = NULL;
810	len = 0;
811
812	archive_mstring_clean(&mstr2);
813}
814
815/*
816 * Make sure no matter what the input encoding is, the string can be
817 * converted too all the output encodings.
818 */
819static void
820test_archive_string_set_get(void)
821{
822	struct archive *a;
823	struct archive_mstring mstr;
824	struct archive_string_conv *sc;
825
826	setlocale(LC_ALL, "en_US.UTF-8");
827
828	assert((a = archive_read_new()) != NULL);
829	memset(&mstr, 0, sizeof(mstr));
830
831	assertA(NULL != (sc =
832	    archive_string_conversion_to_charset(a, "UTF-8", 1)));
833	failure("Charset name should be UTF-8");
834	assertEqualString("UTF-8",
835	    archive_string_conversion_charset_name(sc));
836
837	assertEqualInt(0, archive_mstring_copy_mbs(&mstr, "AAA"));
838	check_string(a, &mstr, sc, "AAA", L"AAA");
839	assertEqualInt(4, archive_mstring_copy_utf8(&mstr, "BBBB"));
840	check_string(a, &mstr, sc, "BBBB", L"BBBB");
841	assertEqualInt(0, archive_mstring_copy_wcs(&mstr, L"CCC12"));
842	check_string(a, &mstr, sc, "CCC12", L"CCC12");
843	assertEqualInt(0, archive_mstring_copy_mbs_len_l(&mstr, "DDDD-l", 6, sc));
844	check_string(a, &mstr, sc, "DDDD-l", L"DDDD-l");
845	assertEqualInt(0, archive_mstring_update_utf8(a, &mstr, "EEEEE---H"));
846	check_string(a, &mstr, sc, "EEEEE---H", L"EEEEE---H");
847
848        archive_mstring_clean(&mstr);
849	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
850
851}
852
853DEFINE_TEST(test_archive_string_conversion)
854{
855	static const char reffile[] = "test_archive_string_conversion.txt.Z";
856	static const char testdata[] = "testdata.txt";
857	struct archive *a;
858	struct archive_entry *ae;
859	char buff[512];
860	ssize_t size;
861	FILE *fp;
862
863	/*
864	 * Extract a test pattern file.
865	 */
866	extract_reference_file(reffile);
867	assert((a = archive_read_new()) != NULL);
868	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
869	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_raw(a));
870        assertEqualIntA(a, ARCHIVE_OK,
871            archive_read_open_filename(a, reffile, 512));
872
873	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
874	assert((fp = fopen(testdata, "w")) != NULL);
875	while ((size = archive_read_data(a, buff, 512)) > 0)
876		assertEqualInt(size, fwrite(buff, 1, size, fp));
877	assertEqualInt(0, fclose(fp));
878	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
879
880	test_archive_string_normalization_nfc(testdata);
881	test_archive_string_normalization_mac_nfd(testdata);
882	test_archive_string_canonicalization();
883	test_archive_string_set_get();
884}
885