test_pax_filename_encoding.c revision 228763
1228753Smm/*-
2228753Smm * Copyright (c) 2003-2007 Tim Kientzle
3228753Smm * All rights reserved.
4228753Smm *
5228753Smm * Redistribution and use in source and binary forms, with or without
6228753Smm * modification, are permitted provided that the following conditions
7228753Smm * are met:
8228753Smm * 1. Redistributions of source code must retain the above copyright
9228753Smm *    notice, this list of conditions and the following disclaimer.
10228753Smm * 2. Redistributions in binary form must reproduce the above copyright
11228753Smm *    notice, this list of conditions and the following disclaimer in the
12228753Smm *    documentation and/or other materials provided with the distribution.
13228753Smm *
14228753Smm * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15228753Smm * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16228753Smm * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17228753Smm * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18228753Smm * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19228753Smm * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20228753Smm * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21228753Smm * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22228753Smm * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23228753Smm * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24228753Smm */
25228753Smm#include "test.h"
26228763Smm__FBSDID("$FreeBSD: head/contrib/libarchive/libarchive/test/test_pax_filename_encoding.c 228763 2011-12-21 11:13:29Z mm $");
27228753Smm
28228753Smm#include <locale.h>
29228753Smm
30228753Smm/*
31228753Smm * Pax interchange is supposed to encode filenames into
32228753Smm * UTF-8.  Of course, that's not always possible.  This
33228753Smm * test is intended to verify that filenames always get
34228753Smm * stored and restored correctly, regardless of the encodings.
35228753Smm */
36228753Smm
37228753Smm/*
38228753Smm * Read a manually-created archive that has filenames that are
39228753Smm * stored in binary instead of UTF-8 and verify that we get
40228753Smm * the right filename returned and that we get a warning only
41228753Smm * if the header isn't marked as binary.
42228753Smm */
43228753Smmstatic void
44228753Smmtest_pax_filename_encoding_1(void)
45228753Smm{
46228753Smm	static const char testname[] = "test_pax_filename_encoding.tar";
47228753Smm	/*
48228753Smm	 * \314\214 is a valid 2-byte UTF-8 sequence.
49228753Smm	 * \374 is invalid in UTF-8.
50228753Smm	 */
51228753Smm	char filename[] = "abc\314\214mno\374xyz";
52228753Smm	struct archive *a;
53228753Smm	struct archive_entry *entry;
54228753Smm
55228753Smm	/*
56228753Smm	 * Read an archive that has non-UTF8 pax filenames in it.
57228753Smm	 */
58228753Smm	extract_reference_file(testname);
59228753Smm	a = archive_read_new();
60228753Smm	assertEqualInt(ARCHIVE_OK, archive_read_support_format_tar(a));
61228753Smm	assertEqualInt(ARCHIVE_OK, archive_read_support_compression_all(a));
62228753Smm	assertEqualInt(ARCHIVE_OK,
63228753Smm	    archive_read_open_filename(a, testname, 10240));
64228753Smm	/*
65228753Smm	 * First entry in this test archive has an invalid UTF-8 sequence
66228753Smm	 * in it, but the header is not marked as hdrcharset=BINARY, so that
67228753Smm	 * requires a warning.
68228753Smm	 */
69228753Smm	failure("Invalid UTF8 in a pax archive pathname should cause a warning");
70228753Smm	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
71228753Smm	assertEqualString(filename, archive_entry_pathname(entry));
72228753Smm	/*
73228753Smm	 * Second entry is identical except that it does have
74228753Smm	 * hdrcharset=BINARY, so no warning should be generated.
75228753Smm	 */
76228753Smm	failure("A pathname with hdrcharset=BINARY can have invalid UTF8\n"
77228753Smm	    " characters in it without generating a warning");
78228753Smm	assertEqualInt(ARCHIVE_OK, archive_read_next_header(a, &entry));
79228753Smm	assertEqualString(filename, archive_entry_pathname(entry));
80228753Smm	archive_read_finish(a);
81228753Smm}
82228753Smm
83228753Smm/*
84228753Smm * Set the locale and write a pathname containing invalid characters.
85228753Smm * This should work; the underlying implementation should automatically
86228753Smm * fall back to storing the pathname in binary.
87228753Smm */
88228753Smmstatic void
89228753Smmtest_pax_filename_encoding_2(void)
90228753Smm{
91228753Smm	char filename[] = "abc\314\214mno\374xyz";
92228753Smm	struct archive *a;
93228753Smm	struct archive_entry *entry;
94228753Smm	char buff[65536];
95228753Smm	char longname[] = "abc\314\214mno\374xyz"
96228753Smm	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
97228753Smm	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
98228753Smm	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
99228753Smm	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
100228753Smm	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
101228753Smm	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
102228753Smm	    ;
103228753Smm	size_t used;
104228753Smm
105228753Smm	/*
106228753Smm	 * We need a starting locale which has invalid sequences.
107228753Smm	 * de_DE.UTF-8 seems to be commonly supported.
108228753Smm	 */
109228753Smm	/* If it doesn't exist, just warn and return. */
110228753Smm	if (LOCALE_UTF8 == NULL
111228753Smm	    || NULL == setlocale(LC_ALL, LOCALE_UTF8)) {
112228753Smm		skipping("invalid encoding tests require a suitable locale;"
113228753Smm		    " %s not available on this system", LOCALE_UTF8);
114228753Smm		return;
115228753Smm	}
116228753Smm
117228753Smm	assert((a = archive_write_new()) != NULL);
118228753Smm	assertEqualIntA(a, 0, archive_write_set_format_pax(a));
119228753Smm	assertEqualIntA(a, 0, archive_write_set_compression_none(a));
120228753Smm	assertEqualIntA(a, 0, archive_write_set_bytes_per_block(a, 0));
121228753Smm	assertEqualInt(0,
122228753Smm	    archive_write_open_memory(a, buff, sizeof(buff), &used));
123228753Smm
124228753Smm	assert((entry = archive_entry_new()) != NULL);
125228753Smm	/* Set pathname, gname, uname, hardlink to nonconvertible values. */
126228753Smm	archive_entry_copy_pathname(entry, filename);
127228753Smm	archive_entry_copy_gname(entry, filename);
128228753Smm	archive_entry_copy_uname(entry, filename);
129228753Smm	archive_entry_copy_hardlink(entry, filename);
130228753Smm	archive_entry_set_filetype(entry, AE_IFREG);
131228753Smm	failure("This should generate a warning for nonconvertible names.");
132228753Smm	assertEqualInt(ARCHIVE_WARN, archive_write_header(a, entry));
133228753Smm	archive_entry_free(entry);
134228753Smm
135228753Smm	assert((entry = archive_entry_new()) != NULL);
136228753Smm	/* Set path, gname, uname, and symlink to nonconvertible values. */
137228753Smm	archive_entry_copy_pathname(entry, filename);
138228753Smm	archive_entry_copy_gname(entry, filename);
139228753Smm	archive_entry_copy_uname(entry, filename);
140228753Smm	archive_entry_copy_symlink(entry, filename);
141228753Smm	archive_entry_set_filetype(entry, AE_IFLNK);
142228753Smm	failure("This should generate a warning for nonconvertible names.");
143228753Smm	assertEqualInt(ARCHIVE_WARN, archive_write_header(a, entry));
144228753Smm	archive_entry_free(entry);
145228753Smm
146228753Smm	assert((entry = archive_entry_new()) != NULL);
147228753Smm	/* Set pathname to a very long nonconvertible value. */
148228753Smm	archive_entry_copy_pathname(entry, longname);
149228753Smm	archive_entry_set_filetype(entry, AE_IFREG);
150228753Smm	failure("This should generate a warning for nonconvertible names.");
151228753Smm	assertEqualInt(ARCHIVE_WARN, archive_write_header(a, entry));
152228753Smm	archive_entry_free(entry);
153228753Smm
154228753Smm	assertEqualInt(0, archive_write_close(a));
155228753Smm	assertEqualInt(0, archive_write_finish(a));
156228753Smm
157228753Smm	/*
158228753Smm	 * Now read the entries back.
159228753Smm	 */
160228753Smm
161228753Smm	assert((a = archive_read_new()) != NULL);
162228753Smm	assertEqualInt(0, archive_read_support_format_tar(a));
163228753Smm	assertEqualInt(0, archive_read_open_memory(a, buff, used));
164228753Smm
165228753Smm	assertEqualInt(0, archive_read_next_header(a, &entry));
166228753Smm	assertEqualString(filename, archive_entry_pathname(entry));
167228753Smm	assertEqualString(filename, archive_entry_gname(entry));
168228753Smm	assertEqualString(filename, archive_entry_uname(entry));
169228753Smm	assertEqualString(filename, archive_entry_hardlink(entry));
170228753Smm
171228753Smm	assertEqualInt(0, archive_read_next_header(a, &entry));
172228753Smm	assertEqualString(filename, archive_entry_pathname(entry));
173228753Smm	assertEqualString(filename, archive_entry_gname(entry));
174228753Smm	assertEqualString(filename, archive_entry_uname(entry));
175228753Smm	assertEqualString(filename, archive_entry_symlink(entry));
176228753Smm
177228753Smm	assertEqualInt(0, archive_read_next_header(a, &entry));
178228753Smm	assertEqualString(longname, archive_entry_pathname(entry));
179228753Smm
180228753Smm	assertEqualInt(0, archive_read_close(a));
181228753Smm	assertEqualInt(0, archive_read_finish(a));
182228753Smm}
183228753Smm
184228753Smm/*
185228753Smm * Create an entry starting from a wide-character Unicode pathname,
186228753Smm * read it back into "C" locale, which doesn't support the name.
187228753Smm * TODO: Figure out the "right" behavior here.
188228753Smm */
189228753Smmstatic void
190228753Smmtest_pax_filename_encoding_3(void)
191228753Smm{
192228753Smm	wchar_t badname[] = L"xxxAyyyBzzz";
193228753Smm	const char badname_utf8[] = "xxx\xE1\x88\xB4yyy\xE5\x99\xB8zzz";
194228753Smm	struct archive *a;
195228753Smm	struct archive_entry *entry;
196228753Smm	char buff[65536];
197228753Smm	size_t used;
198228753Smm
199228753Smm	badname[3] = 0x1234;
200228753Smm	badname[7] = 0x5678;
201228753Smm
202228753Smm	/* If it doesn't exist, just warn and return. */
203228753Smm	if (NULL == setlocale(LC_ALL, "C")) {
204228753Smm		skipping("Can't set \"C\" locale, so can't exercise "
205228753Smm		    "certain character-conversion failures");
206228753Smm		return;
207228753Smm	}
208228753Smm
209228753Smm	/* If wctomb is broken, warn and return. */
210228753Smm	if (wctomb(buff, 0x1234) > 0) {
211228753Smm		skipping("Cannot test conversion failures because \"C\" "
212228753Smm		    "locale on this system has no invalid characters.");
213228753Smm		return;
214228753Smm	}
215228753Smm
216228753Smm	/* If wctomb is broken, warn and return. */
217228753Smm	if (wctomb(buff, 0x1234) > 0) {
218228753Smm		skipping("Cannot test conversion failures because \"C\" "
219228753Smm		    "locale on this system has no invalid characters.");
220228753Smm		return;
221228753Smm	}
222228753Smm
223228753Smm	/* Skip test if archive_entry_update_pathname_utf8() is broken. */
224228753Smm	/* In particular, this is currently broken on Win32 because
225228753Smm	 * setlocale() does not set the default encoding for CP_ACP. */
226228753Smm	entry = archive_entry_new();
227228753Smm	if (archive_entry_update_pathname_utf8(entry, badname_utf8)) {
228228753Smm		archive_entry_free(entry);
229228753Smm		skipping("Cannot test conversion failures.");
230228753Smm		return;
231228753Smm	}
232228753Smm	archive_entry_free(entry);
233228753Smm
234228753Smm	assert((a = archive_write_new()) != NULL);
235228753Smm	assertEqualIntA(a, 0, archive_write_set_format_pax(a));
236228753Smm	assertEqualIntA(a, 0, archive_write_set_compression_none(a));
237228753Smm	assertEqualIntA(a, 0, archive_write_set_bytes_per_block(a, 0));
238228753Smm	assertEqualInt(0,
239228753Smm	    archive_write_open_memory(a, buff, sizeof(buff), &used));
240228753Smm
241228753Smm	assert((entry = archive_entry_new()) != NULL);
242228753Smm	/* Set pathname to non-convertible wide value. */
243228753Smm	archive_entry_copy_pathname_w(entry, badname);
244228753Smm	archive_entry_set_filetype(entry, AE_IFREG);
245228753Smm	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
246228753Smm	archive_entry_free(entry);
247228753Smm
248228753Smm	assert((entry = archive_entry_new()) != NULL);
249228753Smm	archive_entry_copy_pathname_w(entry, L"abc");
250228753Smm	/* Set gname to non-convertible wide value. */
251228753Smm	archive_entry_copy_gname_w(entry, badname);
252228753Smm	archive_entry_set_filetype(entry, AE_IFREG);
253228753Smm	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
254228753Smm	archive_entry_free(entry);
255228753Smm
256228753Smm	assert((entry = archive_entry_new()) != NULL);
257228753Smm	archive_entry_copy_pathname_w(entry, L"abc");
258228753Smm	/* Set uname to non-convertible wide value. */
259228753Smm	archive_entry_copy_uname_w(entry, badname);
260228753Smm	archive_entry_set_filetype(entry, AE_IFREG);
261228753Smm	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
262228753Smm	archive_entry_free(entry);
263228753Smm
264228753Smm	assert((entry = archive_entry_new()) != NULL);
265228753Smm	archive_entry_copy_pathname_w(entry, L"abc");
266228753Smm	/* Set hardlink to non-convertible wide value. */
267228753Smm	archive_entry_copy_hardlink_w(entry, badname);
268228753Smm	archive_entry_set_filetype(entry, AE_IFREG);
269228753Smm	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
270228753Smm	archive_entry_free(entry);
271228753Smm
272228753Smm	assert((entry = archive_entry_new()) != NULL);
273228753Smm	archive_entry_copy_pathname_w(entry, L"abc");
274228753Smm	/* Set symlink to non-convertible wide value. */
275228753Smm	archive_entry_copy_symlink_w(entry, badname);
276228753Smm	archive_entry_set_filetype(entry, AE_IFLNK);
277228753Smm	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
278228753Smm	archive_entry_free(entry);
279228753Smm
280228753Smm	assertEqualInt(0, archive_write_close(a));
281228753Smm	assertEqualInt(0, archive_write_finish(a));
282228753Smm
283228753Smm	/*
284228753Smm	 * Now read the entries back.
285228753Smm	 */
286228753Smm
287228753Smm	assert((a = archive_read_new()) != NULL);
288228753Smm	assertEqualInt(0, archive_read_support_format_tar(a));
289228753Smm	assertEqualInt(0, archive_read_open_memory(a, buff, used));
290228753Smm
291228753Smm	failure("A non-convertible pathname should cause a warning.");
292228753Smm	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
293228753Smm	assertEqualWString(badname, archive_entry_pathname_w(entry));
294228753Smm	failure("If native locale can't convert, we should get UTF-8 back.");
295228753Smm	assertEqualString(badname_utf8, archive_entry_pathname(entry));
296228753Smm
297228753Smm	failure("A non-convertible gname should cause a warning.");
298228753Smm	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
299228753Smm	assertEqualWString(badname, archive_entry_gname_w(entry));
300228753Smm	failure("If native locale can't convert, we should get UTF-8 back.");
301228753Smm	assertEqualString(badname_utf8, archive_entry_gname(entry));
302228753Smm
303228753Smm	failure("A non-convertible uname should cause a warning.");
304228753Smm	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
305228753Smm	assertEqualWString(badname, archive_entry_uname_w(entry));
306228753Smm	failure("If native locale can't convert, we should get UTF-8 back.");
307228753Smm	assertEqualString(badname_utf8, archive_entry_uname(entry));
308228753Smm
309228753Smm	failure("A non-convertible hardlink should cause a warning.");
310228753Smm	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
311228753Smm	assertEqualWString(badname, archive_entry_hardlink_w(entry));
312228753Smm	failure("If native locale can't convert, we should get UTF-8 back.");
313228753Smm	assertEqualString(badname_utf8, archive_entry_hardlink(entry));
314228753Smm
315228753Smm	failure("A non-convertible symlink should cause a warning.");
316228753Smm	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
317228753Smm	assertEqualWString(badname, archive_entry_symlink_w(entry));
318228753Smm	assertEqualWString(NULL, archive_entry_hardlink_w(entry));
319228753Smm	failure("If native locale can't convert, we should get UTF-8 back.");
320228753Smm	assertEqualString(badname_utf8, archive_entry_symlink(entry));
321228753Smm
322228753Smm	assertEqualInt(ARCHIVE_EOF, archive_read_next_header(a, &entry));
323228753Smm
324228753Smm	assertEqualInt(0, archive_read_close(a));
325228753Smm	assertEqualInt(0, archive_read_finish(a));
326228753Smm}
327228753Smm
328228753SmmDEFINE_TEST(test_pax_filename_encoding)
329228753Smm{
330228753Smm	test_pax_filename_encoding_1();
331228753Smm	test_pax_filename_encoding_2();
332228753Smm	test_pax_filename_encoding_3();
333228753Smm}
334