test_gnutar_filename_encoding.c revision 232153
1/*-
2 * Copyright (c) 2011 Michihiro NAKAJIMA
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25#include "test.h"
26__FBSDID("$FreeBSD$");
27
28#include <locale.h>
29
30static void
31test_gnutar_filename_encoding_UTF8_CP866(void)
32{
33  	struct archive *a;
34  	struct archive_entry *entry;
35	char buff[4096];
36	size_t used;
37
38	if (NULL == setlocale(LC_ALL, "en_US.UTF-8")) {
39		skipping("en_US.UTF-8 locale not available on this system.");
40		return;
41	}
42
43	/*
44	 * Verify that UTF-8 filenames are correctly translated into CP866
45	 * and stored with hdrcharset=CP866 option.
46	 */
47	a = archive_write_new();
48	assertEqualInt(ARCHIVE_OK, archive_write_set_format_gnutar(a));
49	if (archive_write_set_options(a, "hdrcharset=CP866") != ARCHIVE_OK) {
50		skipping("This system cannot convert character-set"
51		    " from UTF-8 to CP866.");
52		archive_write_free(a);
53		return;
54	}
55	assertEqualInt(ARCHIVE_OK,
56	    archive_write_open_memory(a, buff, sizeof(buff), &used));
57
58	entry = archive_entry_new2(a);
59	/* Set a UTF-8 filename. */
60	archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8");
61	archive_entry_set_filetype(entry, AE_IFREG);
62	archive_entry_set_size(entry, 0);
63	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
64	archive_entry_free(entry);
65	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
66
67	/* Above three characters in UTF-8 should translate to the following
68	 * three characters in CP866. */
69	assertEqualMem(buff, "\xAF\xE0\xA8", 3);
70}
71
72static void
73test_gnutar_filename_encoding_KOI8R_UTF8(void)
74{
75  	struct archive *a;
76  	struct archive_entry *entry;
77	char buff[4096];
78	size_t used;
79
80	if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
81		skipping("KOI8-R locale not available on this system.");
82		return;
83	}
84
85	/*
86	 * Verify that KOI8-R filenames are correctly translated into UTF-8
87	 * and stored with hdrcharset=UTF-8 option.
88	 */
89	a = archive_write_new();
90	assertEqualInt(ARCHIVE_OK, archive_write_set_format_gnutar(a));
91	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
92		skipping("This system cannot convert character-set"
93		    " from KOI8-R to UTF-8.");
94		archive_write_free(a);
95		return;
96	}
97	assertEqualInt(ARCHIVE_OK,
98	    archive_write_open_memory(a, buff, sizeof(buff), &used));
99
100	entry = archive_entry_new2(a);
101	/* Set a KOI8-R filename. */
102	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
103	archive_entry_set_filetype(entry, AE_IFREG);
104	archive_entry_set_size(entry, 0);
105	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
106	archive_entry_free(entry);
107	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
108
109	/* Above three characters in KOI8-R should translate to the following
110	 * three characters (two bytes each) in UTF-8. */
111	assertEqualMem(buff, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
112}
113
114static void
115test_gnutar_filename_encoding_KOI8R_CP866(void)
116{
117  	struct archive *a;
118  	struct archive_entry *entry;
119	char buff[4096];
120	size_t used;
121
122	if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
123		skipping("KOI8-R locale not available on this system.");
124		return;
125	}
126
127	/*
128	 * Verify that KOI8-R filenames are correctly translated into CP866
129	 * and stored with hdrcharset=CP866 option.
130	 */
131	a = archive_write_new();
132	assertEqualInt(ARCHIVE_OK, archive_write_set_format_gnutar(a));
133	if (archive_write_set_options(a, "hdrcharset=CP866") != ARCHIVE_OK) {
134		skipping("This system cannot convert character-set"
135		    " from KOI8-R to CP866.");
136		archive_write_free(a);
137		return;
138	}
139	assertEqualInt(ARCHIVE_OK,
140	    archive_write_open_memory(a, buff, sizeof(buff), &used));
141
142	entry = archive_entry_new2(a);
143	/* Set a KOI8-R filename. */
144	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
145	archive_entry_set_filetype(entry, AE_IFREG);
146	archive_entry_set_size(entry, 0);
147	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
148	archive_entry_free(entry);
149	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
150
151	/* Above three characters in KOI8-R should translate to the following
152	 * three characters in CP866. */
153	assertEqualMem(buff, "\xAF\xE0\xA8", 3);
154}
155
156static void
157test_gnutar_filename_encoding_CP1251_UTF8(void)
158{
159  	struct archive *a;
160  	struct archive_entry *entry;
161	char buff[4096];
162	size_t used;
163
164	if (NULL == setlocale(LC_ALL, "Russian_Russia") &&
165	    NULL == setlocale(LC_ALL, "ru_RU.CP1251")) {
166		skipping("KOI8-R locale not available on this system.");
167		return;
168	}
169
170	/*
171	 * Verify that CP1251 filenames are correctly translated into UTF-8
172	 * and stored with hdrcharset=UTF-8 option.
173	 */
174	a = archive_write_new();
175	assertEqualInt(ARCHIVE_OK, archive_write_set_format_gnutar(a));
176	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
177		skipping("This system cannot convert character-set"
178		    " from KOI8-R to UTF-8.");
179		archive_write_free(a);
180		return;
181	}
182	assertEqualInt(ARCHIVE_OK,
183	    archive_write_open_memory(a, buff, sizeof(buff), &used));
184
185	entry = archive_entry_new2(a);
186	/* Set a KOI8-R filename. */
187	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
188	archive_entry_set_filetype(entry, AE_IFREG);
189	archive_entry_set_size(entry, 0);
190	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
191	archive_entry_free(entry);
192	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
193
194	/* Above three characters in CP1251 should translate to the following
195	 * three characters (two bytes each) in UTF-8. */
196	assertEqualMem(buff, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
197}
198
199/*
200 * Do not translate CP1251 into CP866 if non Windows platform.
201 */
202static void
203test_gnutar_filename_encoding_ru_RU_CP1251(void)
204{
205  	struct archive *a;
206  	struct archive_entry *entry;
207	char buff[4096];
208	size_t used;
209
210	if (NULL == setlocale(LC_ALL, "ru_RU.CP1251")) {
211		skipping("KOI8-R locale not available on this system.");
212		return;
213	}
214
215	/*
216	 * Verify that CP1251 filenames are not translated into any
217	 * other character-set, in particular, CP866.
218	 */
219	a = archive_write_new();
220	assertEqualInt(ARCHIVE_OK, archive_write_set_format_gnutar(a));
221	assertEqualInt(ARCHIVE_OK,
222	    archive_write_open_memory(a, buff, sizeof(buff), &used));
223
224	entry = archive_entry_new2(a);
225	/* Set a KOI8-R filename. */
226	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
227	archive_entry_set_filetype(entry, AE_IFREG);
228	archive_entry_set_size(entry, 0);
229	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
230	archive_entry_free(entry);
231	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
232
233	/* Above three characters in CP1251 should not translate to
234	 * any other character-set. */
235	assertEqualMem(buff, "\xEF\xF0\xE8", 3);
236}
237
238/*
239 * Other archiver applications on Windows translate CP1251 filenames
240 * into CP866 filenames and store it in the gnutar file.
241 * Test above behavior works well.
242 */
243static void
244test_gnutar_filename_encoding_Russian_Russia(void)
245{
246  	struct archive *a;
247  	struct archive_entry *entry;
248	char buff[4096];
249	size_t used;
250
251	if (NULL == setlocale(LC_ALL, "Russian_Russia")) {
252		skipping("Russian_Russia locale not available on this system.");
253		return;
254	}
255
256	/*
257	 * Verify that Russian_Russia(CP1251) filenames are correctly translated
258	 * to CP866.
259	 */
260	a = archive_write_new();
261	assertEqualInt(ARCHIVE_OK, archive_write_set_format_gnutar(a));
262	assertEqualInt(ARCHIVE_OK,
263	    archive_write_open_memory(a, buff, sizeof(buff), &used));
264
265	entry = archive_entry_new2(a);
266	/* Set a CP1251 filename. */
267	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
268	archive_entry_set_filetype(entry, AE_IFREG);
269	archive_entry_set_size(entry, 0);
270	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
271	archive_entry_free(entry);
272	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
273
274	/* Above three characters in CP1251 should translate to the following
275	 * three characters in CP866. */
276	assertEqualMem(buff, "\xAF\xE0\xA8", 3);
277}
278
279static void
280test_gnutar_filename_encoding_EUCJP_UTF8(void)
281{
282  	struct archive *a;
283  	struct archive_entry *entry;
284	char buff[4096];
285	size_t used;
286
287	if (NULL == setlocale(LC_ALL, "ja_JP.eucJP")) {
288		skipping("eucJP locale not available on this system.");
289		return;
290	}
291
292	/*
293	 * Verify that EUC-JP filenames are correctly translated to UTF-8.
294	 */
295	a = archive_write_new();
296	assertEqualInt(ARCHIVE_OK, archive_write_set_format_gnutar(a));
297	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
298		skipping("This system cannot convert character-set"
299		    " from eucJP to UTF-8.");
300		archive_write_free(a);
301		return;
302	}
303	assertEqualInt(ARCHIVE_OK,
304	    archive_write_open_memory(a, buff, sizeof(buff), &used));
305
306	entry = archive_entry_new2(a);
307	/* Set an EUC-JP filename. */
308	archive_entry_set_pathname(entry, "\xC9\xBD.txt");
309	/* Check the Unicode version. */
310	archive_entry_set_filetype(entry, AE_IFREG);
311	archive_entry_set_size(entry, 0);
312	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
313	archive_entry_free(entry);
314	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
315
316	/* Check UTF-8 version. */
317	assertEqualMem(buff, "\xE8\xA1\xA8.txt", 7);
318}
319
320static void
321test_gnutar_filename_encoding_EUCJP_CP932(void)
322{
323  	struct archive *a;
324  	struct archive_entry *entry;
325	char buff[4096];
326	size_t used;
327
328	if (NULL == setlocale(LC_ALL, "ja_JP.eucJP")) {
329		skipping("eucJP locale not available on this system.");
330		return;
331	}
332
333	/*
334	 * Verify that EUC-JP filenames are correctly translated to CP932.
335	 */
336	a = archive_write_new();
337	assertEqualInt(ARCHIVE_OK, archive_write_set_format_gnutar(a));
338	if (archive_write_set_options(a, "hdrcharset=CP932") != ARCHIVE_OK) {
339		skipping("This system cannot convert character-set"
340		    " from eucJP to CP932.");
341		archive_write_free(a);
342		return;
343	}
344	assertEqualInt(ARCHIVE_OK,
345	    archive_write_open_memory(a, buff, sizeof(buff), &used));
346
347	entry = archive_entry_new2(a);
348	/* Set an EUC-JP filename. */
349	archive_entry_set_pathname(entry, "\xC9\xBD.txt");
350	/* Check the Unicode version. */
351	archive_entry_set_filetype(entry, AE_IFREG);
352	archive_entry_set_size(entry, 0);
353	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
354	archive_entry_free(entry);
355	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
356
357	/* Check CP932 version. */
358	assertEqualMem(buff, "\x95\x5C.txt", 6);
359}
360
361static void
362test_gnutar_filename_encoding_CP932_UTF8(void)
363{
364  	struct archive *a;
365  	struct archive_entry *entry;
366	char buff[4096];
367	size_t used;
368
369	if (NULL == setlocale(LC_ALL, "Japanese_Japan") &&
370	    NULL == setlocale(LC_ALL, "ja_JP.SJIS")) {
371		skipping("CP932/SJIS locale not available on this system.");
372		return;
373	}
374
375	/*
376	 * Verify that CP932/SJIS filenames are correctly translated to UTF-8.
377	 */
378	a = archive_write_new();
379	assertEqualInt(ARCHIVE_OK, archive_write_set_format_gnutar(a));
380	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
381		skipping("This system cannot convert character-set"
382		    " from CP932/SJIS to UTF-8.");
383		archive_write_free(a);
384		return;
385	}
386	assertEqualInt(ARCHIVE_OK,
387	    archive_write_open_memory(a, buff, sizeof(buff), &used));
388
389	entry = archive_entry_new2(a);
390	/* Set an CP932/SJIS filename. */
391	archive_entry_set_pathname(entry, "\x95\x5C.txt");
392	/* Check the Unicode version. */
393	archive_entry_set_filetype(entry, AE_IFREG);
394	archive_entry_set_size(entry, 0);
395	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
396	archive_entry_free(entry);
397	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
398
399	/* Check UTF-8 version. */
400	assertEqualMem(buff, "\xE8\xA1\xA8.txt", 7);
401}
402
403DEFINE_TEST(test_gnutar_filename_encoding)
404{
405	test_gnutar_filename_encoding_UTF8_CP866();
406	test_gnutar_filename_encoding_KOI8R_UTF8();
407	test_gnutar_filename_encoding_KOI8R_CP866();
408	test_gnutar_filename_encoding_CP1251_UTF8();
409	test_gnutar_filename_encoding_ru_RU_CP1251();
410	test_gnutar_filename_encoding_Russian_Russia();
411	test_gnutar_filename_encoding_EUCJP_UTF8();
412	test_gnutar_filename_encoding_EUCJP_CP932();
413	test_gnutar_filename_encoding_CP932_UTF8();
414}
415