test_pax_filename_encoding.c revision 228763
1/*-
2 * Copyright (c) 2003-2007 Tim Kientzle
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25#include "test.h"
26__FBSDID("$FreeBSD: head/contrib/libarchive/libarchive/test/test_pax_filename_encoding.c 228763 2011-12-21 11:13:29Z mm $");
27
28#include <locale.h>
29
30/*
31 * Pax interchange is supposed to encode filenames into
32 * UTF-8.  Of course, that's not always possible.  This
33 * test is intended to verify that filenames always get
34 * stored and restored correctly, regardless of the encodings.
35 */
36
37/*
38 * Read a manually-created archive that has filenames that are
39 * stored in binary instead of UTF-8 and verify that we get
40 * the right filename returned and that we get a warning only
41 * if the header isn't marked as binary.
42 */
43static void
44test_pax_filename_encoding_1(void)
45{
46	static const char testname[] = "test_pax_filename_encoding.tar";
47	/*
48	 * \314\214 is a valid 2-byte UTF-8 sequence.
49	 * \374 is invalid in UTF-8.
50	 */
51	char filename[] = "abc\314\214mno\374xyz";
52	struct archive *a;
53	struct archive_entry *entry;
54
55	/*
56	 * Read an archive that has non-UTF8 pax filenames in it.
57	 */
58	extract_reference_file(testname);
59	a = archive_read_new();
60	assertEqualInt(ARCHIVE_OK, archive_read_support_format_tar(a));
61	assertEqualInt(ARCHIVE_OK, archive_read_support_compression_all(a));
62	assertEqualInt(ARCHIVE_OK,
63	    archive_read_open_filename(a, testname, 10240));
64	/*
65	 * First entry in this test archive has an invalid UTF-8 sequence
66	 * in it, but the header is not marked as hdrcharset=BINARY, so that
67	 * requires a warning.
68	 */
69	failure("Invalid UTF8 in a pax archive pathname should cause a warning");
70	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
71	assertEqualString(filename, archive_entry_pathname(entry));
72	/*
73	 * Second entry is identical except that it does have
74	 * hdrcharset=BINARY, so no warning should be generated.
75	 */
76	failure("A pathname with hdrcharset=BINARY can have invalid UTF8\n"
77	    " characters in it without generating a warning");
78	assertEqualInt(ARCHIVE_OK, archive_read_next_header(a, &entry));
79	assertEqualString(filename, archive_entry_pathname(entry));
80	archive_read_finish(a);
81}
82
83/*
84 * Set the locale and write a pathname containing invalid characters.
85 * This should work; the underlying implementation should automatically
86 * fall back to storing the pathname in binary.
87 */
88static void
89test_pax_filename_encoding_2(void)
90{
91	char filename[] = "abc\314\214mno\374xyz";
92	struct archive *a;
93	struct archive_entry *entry;
94	char buff[65536];
95	char longname[] = "abc\314\214mno\374xyz"
96	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
97	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
98	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
99	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
100	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
101	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
102	    ;
103	size_t used;
104
105	/*
106	 * We need a starting locale which has invalid sequences.
107	 * de_DE.UTF-8 seems to be commonly supported.
108	 */
109	/* If it doesn't exist, just warn and return. */
110	if (LOCALE_UTF8 == NULL
111	    || NULL == setlocale(LC_ALL, LOCALE_UTF8)) {
112		skipping("invalid encoding tests require a suitable locale;"
113		    " %s not available on this system", LOCALE_UTF8);
114		return;
115	}
116
117	assert((a = archive_write_new()) != NULL);
118	assertEqualIntA(a, 0, archive_write_set_format_pax(a));
119	assertEqualIntA(a, 0, archive_write_set_compression_none(a));
120	assertEqualIntA(a, 0, archive_write_set_bytes_per_block(a, 0));
121	assertEqualInt(0,
122	    archive_write_open_memory(a, buff, sizeof(buff), &used));
123
124	assert((entry = archive_entry_new()) != NULL);
125	/* Set pathname, gname, uname, hardlink to nonconvertible values. */
126	archive_entry_copy_pathname(entry, filename);
127	archive_entry_copy_gname(entry, filename);
128	archive_entry_copy_uname(entry, filename);
129	archive_entry_copy_hardlink(entry, filename);
130	archive_entry_set_filetype(entry, AE_IFREG);
131	failure("This should generate a warning for nonconvertible names.");
132	assertEqualInt(ARCHIVE_WARN, archive_write_header(a, entry));
133	archive_entry_free(entry);
134
135	assert((entry = archive_entry_new()) != NULL);
136	/* Set path, gname, uname, and symlink to nonconvertible values. */
137	archive_entry_copy_pathname(entry, filename);
138	archive_entry_copy_gname(entry, filename);
139	archive_entry_copy_uname(entry, filename);
140	archive_entry_copy_symlink(entry, filename);
141	archive_entry_set_filetype(entry, AE_IFLNK);
142	failure("This should generate a warning for nonconvertible names.");
143	assertEqualInt(ARCHIVE_WARN, archive_write_header(a, entry));
144	archive_entry_free(entry);
145
146	assert((entry = archive_entry_new()) != NULL);
147	/* Set pathname to a very long nonconvertible value. */
148	archive_entry_copy_pathname(entry, longname);
149	archive_entry_set_filetype(entry, AE_IFREG);
150	failure("This should generate a warning for nonconvertible names.");
151	assertEqualInt(ARCHIVE_WARN, archive_write_header(a, entry));
152	archive_entry_free(entry);
153
154	assertEqualInt(0, archive_write_close(a));
155	assertEqualInt(0, archive_write_finish(a));
156
157	/*
158	 * Now read the entries back.
159	 */
160
161	assert((a = archive_read_new()) != NULL);
162	assertEqualInt(0, archive_read_support_format_tar(a));
163	assertEqualInt(0, archive_read_open_memory(a, buff, used));
164
165	assertEqualInt(0, archive_read_next_header(a, &entry));
166	assertEqualString(filename, archive_entry_pathname(entry));
167	assertEqualString(filename, archive_entry_gname(entry));
168	assertEqualString(filename, archive_entry_uname(entry));
169	assertEqualString(filename, archive_entry_hardlink(entry));
170
171	assertEqualInt(0, archive_read_next_header(a, &entry));
172	assertEqualString(filename, archive_entry_pathname(entry));
173	assertEqualString(filename, archive_entry_gname(entry));
174	assertEqualString(filename, archive_entry_uname(entry));
175	assertEqualString(filename, archive_entry_symlink(entry));
176
177	assertEqualInt(0, archive_read_next_header(a, &entry));
178	assertEqualString(longname, archive_entry_pathname(entry));
179
180	assertEqualInt(0, archive_read_close(a));
181	assertEqualInt(0, archive_read_finish(a));
182}
183
184/*
185 * Create an entry starting from a wide-character Unicode pathname,
186 * read it back into "C" locale, which doesn't support the name.
187 * TODO: Figure out the "right" behavior here.
188 */
189static void
190test_pax_filename_encoding_3(void)
191{
192	wchar_t badname[] = L"xxxAyyyBzzz";
193	const char badname_utf8[] = "xxx\xE1\x88\xB4yyy\xE5\x99\xB8zzz";
194	struct archive *a;
195	struct archive_entry *entry;
196	char buff[65536];
197	size_t used;
198
199	badname[3] = 0x1234;
200	badname[7] = 0x5678;
201
202	/* If it doesn't exist, just warn and return. */
203	if (NULL == setlocale(LC_ALL, "C")) {
204		skipping("Can't set \"C\" locale, so can't exercise "
205		    "certain character-conversion failures");
206		return;
207	}
208
209	/* If wctomb is broken, warn and return. */
210	if (wctomb(buff, 0x1234) > 0) {
211		skipping("Cannot test conversion failures because \"C\" "
212		    "locale on this system has no invalid characters.");
213		return;
214	}
215
216	/* If wctomb is broken, warn and return. */
217	if (wctomb(buff, 0x1234) > 0) {
218		skipping("Cannot test conversion failures because \"C\" "
219		    "locale on this system has no invalid characters.");
220		return;
221	}
222
223	/* Skip test if archive_entry_update_pathname_utf8() is broken. */
224	/* In particular, this is currently broken on Win32 because
225	 * setlocale() does not set the default encoding for CP_ACP. */
226	entry = archive_entry_new();
227	if (archive_entry_update_pathname_utf8(entry, badname_utf8)) {
228		archive_entry_free(entry);
229		skipping("Cannot test conversion failures.");
230		return;
231	}
232	archive_entry_free(entry);
233
234	assert((a = archive_write_new()) != NULL);
235	assertEqualIntA(a, 0, archive_write_set_format_pax(a));
236	assertEqualIntA(a, 0, archive_write_set_compression_none(a));
237	assertEqualIntA(a, 0, archive_write_set_bytes_per_block(a, 0));
238	assertEqualInt(0,
239	    archive_write_open_memory(a, buff, sizeof(buff), &used));
240
241	assert((entry = archive_entry_new()) != NULL);
242	/* Set pathname to non-convertible wide value. */
243	archive_entry_copy_pathname_w(entry, badname);
244	archive_entry_set_filetype(entry, AE_IFREG);
245	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
246	archive_entry_free(entry);
247
248	assert((entry = archive_entry_new()) != NULL);
249	archive_entry_copy_pathname_w(entry, L"abc");
250	/* Set gname to non-convertible wide value. */
251	archive_entry_copy_gname_w(entry, badname);
252	archive_entry_set_filetype(entry, AE_IFREG);
253	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
254	archive_entry_free(entry);
255
256	assert((entry = archive_entry_new()) != NULL);
257	archive_entry_copy_pathname_w(entry, L"abc");
258	/* Set uname to non-convertible wide value. */
259	archive_entry_copy_uname_w(entry, badname);
260	archive_entry_set_filetype(entry, AE_IFREG);
261	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
262	archive_entry_free(entry);
263
264	assert((entry = archive_entry_new()) != NULL);
265	archive_entry_copy_pathname_w(entry, L"abc");
266	/* Set hardlink to non-convertible wide value. */
267	archive_entry_copy_hardlink_w(entry, badname);
268	archive_entry_set_filetype(entry, AE_IFREG);
269	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
270	archive_entry_free(entry);
271
272	assert((entry = archive_entry_new()) != NULL);
273	archive_entry_copy_pathname_w(entry, L"abc");
274	/* Set symlink to non-convertible wide value. */
275	archive_entry_copy_symlink_w(entry, badname);
276	archive_entry_set_filetype(entry, AE_IFLNK);
277	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
278	archive_entry_free(entry);
279
280	assertEqualInt(0, archive_write_close(a));
281	assertEqualInt(0, archive_write_finish(a));
282
283	/*
284	 * Now read the entries back.
285	 */
286
287	assert((a = archive_read_new()) != NULL);
288	assertEqualInt(0, archive_read_support_format_tar(a));
289	assertEqualInt(0, archive_read_open_memory(a, buff, used));
290
291	failure("A non-convertible pathname should cause a warning.");
292	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
293	assertEqualWString(badname, archive_entry_pathname_w(entry));
294	failure("If native locale can't convert, we should get UTF-8 back.");
295	assertEqualString(badname_utf8, archive_entry_pathname(entry));
296
297	failure("A non-convertible gname should cause a warning.");
298	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
299	assertEqualWString(badname, archive_entry_gname_w(entry));
300	failure("If native locale can't convert, we should get UTF-8 back.");
301	assertEqualString(badname_utf8, archive_entry_gname(entry));
302
303	failure("A non-convertible uname should cause a warning.");
304	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
305	assertEqualWString(badname, archive_entry_uname_w(entry));
306	failure("If native locale can't convert, we should get UTF-8 back.");
307	assertEqualString(badname_utf8, archive_entry_uname(entry));
308
309	failure("A non-convertible hardlink should cause a warning.");
310	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
311	assertEqualWString(badname, archive_entry_hardlink_w(entry));
312	failure("If native locale can't convert, we should get UTF-8 back.");
313	assertEqualString(badname_utf8, archive_entry_hardlink(entry));
314
315	failure("A non-convertible symlink should cause a warning.");
316	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
317	assertEqualWString(badname, archive_entry_symlink_w(entry));
318	assertEqualWString(NULL, archive_entry_hardlink_w(entry));
319	failure("If native locale can't convert, we should get UTF-8 back.");
320	assertEqualString(badname_utf8, archive_entry_symlink(entry));
321
322	assertEqualInt(ARCHIVE_EOF, archive_read_next_header(a, &entry));
323
324	assertEqualInt(0, archive_read_close(a));
325	assertEqualInt(0, archive_read_finish(a));
326}
327
328DEFINE_TEST(test_pax_filename_encoding)
329{
330	test_pax_filename_encoding_1();
331	test_pax_filename_encoding_2();
332	test_pax_filename_encoding_3();
333}
334