test_zip_filename_encoding.c revision 256281
1/*-
2 * Copyright (c) 2011 Michihiro NAKAJIMA
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25#include "test.h"
26__FBSDID("$FreeBSD$");
27
28#include <locale.h>
29
30static void
31test_zip_filename_encoding_UTF8(void)
32{
33  	struct archive *a;
34  	struct archive_entry *entry;
35	char buff[4096];
36	size_t used;
37
38	if (NULL == setlocale(LC_ALL, "en_US.UTF-8")) {
39		skipping("en_US.UTF-8 locale not available on this system.");
40		return;
41	}
42
43	/*
44	 * Verify that UTF-8 filenames are correctly stored with
45	 * hdrcharset=UTF-8 option.
46	 */
47	a = archive_write_new();
48	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
49	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
50		skipping("This system cannot convert character-set"
51		    " for UTF-8.");
52		archive_write_free(a);
53		return;
54	}
55	assertEqualInt(ARCHIVE_OK,
56	    archive_write_open_memory(a, buff, sizeof(buff), &used));
57
58	entry = archive_entry_new2(a);
59	/* Set a UTF-8 filename. */
60	archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8");
61	archive_entry_set_filetype(entry, AE_IFREG);
62	archive_entry_set_size(entry, 0);
63	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
64	archive_entry_free(entry);
65	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
66
67	/* A bit 11 of general purpose flag should be 0x08,
68	 * which indicates the filename charset is UTF-8. */
69	assertEqualInt(0x08, buff[7]);
70	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
71
72	/*
73	 * Verify that UTF-8 filenames are correctly stored without
74	 * hdrcharset=UTF-8 option.
75	 */
76	a = archive_write_new();
77	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
78	assertEqualInt(ARCHIVE_OK,
79	    archive_write_open_memory(a, buff, sizeof(buff), &used));
80
81	entry = archive_entry_new2(a);
82	/* Set a UTF-8 filename. */
83	archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8");
84	archive_entry_set_filetype(entry, AE_IFREG);
85	archive_entry_set_size(entry, 0);
86	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
87	archive_entry_free(entry);
88	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
89
90	/* A bit 11 of general purpose flag should be 0x08,
91	 * which indicates the filename charset is UTF-8. */
92	assertEqualInt(0x08, buff[7]);
93	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
94
95	/*
96	 * Verify that A bit 11 of general purpose flag is not set
97	 * when ASCII filenames are stored.
98	 */
99	a = archive_write_new();
100	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
101	assertEqualInt(ARCHIVE_OK,
102	    archive_write_open_memory(a, buff, sizeof(buff), &used));
103
104	entry = archive_entry_new2(a);
105	/* Set an ASCII filename. */
106	archive_entry_set_pathname(entry, "abcABC");
107	archive_entry_set_filetype(entry, AE_IFREG);
108	archive_entry_set_size(entry, 0);
109	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
110	archive_entry_free(entry);
111	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
112
113	/* A bit 11 of general purpose flag should be 0,
114	 * which indicates the filename charset is unknown. */
115	assertEqualInt(0, buff[7]);
116	assertEqualMem(buff + 30, "abcABC", 6);
117}
118
119static void
120test_zip_filename_encoding_KOI8R(void)
121{
122  	struct archive *a;
123  	struct archive_entry *entry;
124	char buff[4096];
125	size_t used;
126
127	if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
128		skipping("KOI8-R locale not available on this system.");
129		return;
130	}
131
132	/*
133	 * Verify that KOI8-R filenames are correctly translated to UTF-8.
134	 */
135	a = archive_write_new();
136	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
137	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
138		skipping("This system cannot convert character-set"
139		    " from KOI8-R to UTF-8.");
140		archive_write_free(a);
141		return;
142	}
143	assertEqualInt(ARCHIVE_OK,
144	    archive_write_open_memory(a, buff, sizeof(buff), &used));
145
146	entry = archive_entry_new2(a);
147	/* Set a KOI8-R filename. */
148	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
149	archive_entry_set_filetype(entry, AE_IFREG);
150	archive_entry_set_size(entry, 0);
151	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
152	archive_entry_free(entry);
153	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
154
155	/* A bit 11 of general purpose flag should be 0x08,
156	 * which indicates the filename charset is UTF-8. */
157	assertEqualInt(0x08, buff[7]);
158	/* Above three characters in KOI8-R should translate to the following
159	 * three characters (two bytes each) in UTF-8. */
160	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
161
162	/*
163	 * Verify that KOI8-R filenames are not translated to UTF-8.
164	 */
165	a = archive_write_new();
166	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
167	assertEqualInt(ARCHIVE_OK,
168	    archive_write_open_memory(a, buff, sizeof(buff), &used));
169
170	entry = archive_entry_new2(a);
171	/* Set a KOI8-R filename. */
172	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
173	archive_entry_set_filetype(entry, AE_IFREG);
174	archive_entry_set_size(entry, 0);
175	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
176	archive_entry_free(entry);
177	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
178
179	/* A bit 11 of general purpose flag should be 0,
180	 * which indicates the filename charset is unknown. */
181	assertEqualInt(0, buff[7]);
182	/* Above three characters in KOI8-R should not translate to
183	 * any character-set. */
184	assertEqualMem(buff + 30, "\xD0\xD2\xC9", 3);
185
186	/*
187	 * Verify that A bit 11 of general purpose flag is not set
188	 * when ASCII filenames are stored even if hdrcharset=UTF-8
189	 * is specified.
190	 */
191	a = archive_write_new();
192	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
193	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
194		skipping("This system cannot convert character-set"
195		    " from KOI8-R to UTF-8.");
196		archive_write_free(a);
197		return;
198	}
199	assertEqualInt(ARCHIVE_OK,
200	    archive_write_open_memory(a, buff, sizeof(buff), &used));
201
202	entry = archive_entry_new2(a);
203	/* Set an ASCII filename. */
204	archive_entry_set_pathname(entry, "abcABC");
205	archive_entry_set_filetype(entry, AE_IFREG);
206	archive_entry_set_size(entry, 0);
207	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
208	archive_entry_free(entry);
209	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
210
211	/* A bit 11 of general purpose flag should be 0,
212	 * which indicates the filename charset is unknown. */
213	assertEqualInt(0, buff[7]);
214	assertEqualMem(buff + 30, "abcABC", 6);
215}
216
217/*
218 * Do not translate CP1251 into CP866 if non Windows platform.
219 */
220static void
221test_zip_filename_encoding_ru_RU_CP1251(void)
222{
223  	struct archive *a;
224  	struct archive_entry *entry;
225	char buff[4096];
226	size_t used;
227
228	if (NULL == setlocale(LC_ALL, "ru_RU.CP1251")) {
229		skipping("Russian_Russia locale not available on this system.");
230		return;
231	}
232
233	/*
234	 * Verify that CP1251 filenames are not translated into any
235	 * other character-set, in particular, CP866.
236	 */
237	a = archive_write_new();
238	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
239	assertEqualInt(ARCHIVE_OK,
240	    archive_write_open_memory(a, buff, sizeof(buff), &used));
241
242	entry = archive_entry_new2(a);
243	/* Set a CP1251 filename. */
244	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
245	archive_entry_set_filetype(entry, AE_IFREG);
246	archive_entry_set_size(entry, 0);
247	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
248	archive_entry_free(entry);
249	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
250
251	/* A bit 11 of general purpose flag should be 0,
252	 * which indicates the filename charset is unknown. */
253	assertEqualInt(0, buff[7]);
254	/* Above three characters in CP1251 should not translate into
255	 * any other character-set. */
256	assertEqualMem(buff + 30, "\xEF\xF0\xE8", 3);
257}
258
259/*
260 * Other archiver applications on Windows translate CP1251 filenames
261 * into CP866 filenames and store it in the zip file.
262 * Test above behavior works well.
263 */
264static void
265test_zip_filename_encoding_Russian_Russia(void)
266{
267  	struct archive *a;
268  	struct archive_entry *entry;
269	char buff[4096];
270	size_t used;
271
272	if (NULL == setlocale(LC_ALL, "Russian_Russia")) {
273		skipping("Russian_Russia locale not available on this system.");
274		return;
275	}
276
277	/*
278	 * Verify that Russian_Russia(CP1251) filenames are correctly translated
279	 * to UTF-8.
280	 */
281	a = archive_write_new();
282	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
283	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
284		skipping("This system cannot convert character-set"
285		    " from Russian_Russia.CP1251 to UTF-8.");
286		archive_write_free(a);
287		return;
288	}
289	assertEqualInt(ARCHIVE_OK,
290	    archive_write_open_memory(a, buff, sizeof(buff), &used));
291
292	entry = archive_entry_new2(a);
293	/* Set a CP1251 filename. */
294	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
295	archive_entry_set_filetype(entry, AE_IFREG);
296	archive_entry_set_size(entry, 0);
297	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
298	archive_entry_free(entry);
299	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
300
301	/* A bit 11 of general purpose flag should be 0x08,
302	 * which indicates the filename charset is UTF-8. */
303	assertEqualInt(0x08, buff[7]);
304	/* Above three characters in CP1251 should translate to the following
305	 * three characters (two bytes each) in UTF-8. */
306	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
307
308	/*
309	 * Verify that Russian_Russia(CP1251) filenames are correctly translated
310	 * to CP866.
311	 */
312	a = archive_write_new();
313	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
314	assertEqualInt(ARCHIVE_OK,
315	    archive_write_open_memory(a, buff, sizeof(buff), &used));
316
317	entry = archive_entry_new2(a);
318	/* Set a CP1251 filename. */
319	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
320	archive_entry_set_filetype(entry, AE_IFREG);
321	archive_entry_set_size(entry, 0);
322	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
323	archive_entry_free(entry);
324	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
325
326	/* A bit 11 of general purpose flag should be 0,
327	 * which indicates the filename charset is unknown. */
328	assertEqualInt(0, buff[7]);
329	/* Above three characters in CP1251 should translate to the following
330	 * three characters in CP866. */
331	assertEqualMem(buff + 30, "\xAF\xE0\xA8", 3);
332}
333
334static void
335test_zip_filename_encoding_EUCJP(void)
336{
337  	struct archive *a;
338  	struct archive_entry *entry;
339	char buff[4096];
340	size_t used;
341
342	if (NULL == setlocale(LC_ALL, "ja_JP.eucJP")) {
343		skipping("eucJP locale not available on this system.");
344		return;
345	}
346
347	/*
348	 * Verify that EUC-JP filenames are correctly translated to UTF-8.
349	 */
350	a = archive_write_new();
351	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
352	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
353		skipping("This system cannot convert character-set"
354		    " from eucJP to UTF-8.");
355		archive_write_free(a);
356		return;
357	}
358	assertEqualInt(ARCHIVE_OK,
359	    archive_write_open_memory(a, buff, sizeof(buff), &used));
360
361	entry = archive_entry_new2(a);
362	/* Set an EUC-JP filename. */
363	archive_entry_set_pathname(entry, "\xC9\xBD.txt");
364	/* Check the Unicode version. */
365	archive_entry_set_filetype(entry, AE_IFREG);
366	archive_entry_set_size(entry, 0);
367	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
368	archive_entry_free(entry);
369	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
370
371	/* A bit 11 of general purpose flag should be 0x08,
372	 * which indicates the filename charset is UTF-8. */
373	assertEqualInt(0x08, buff[7]);
374	/* Check UTF-8 version. */
375	assertEqualMem(buff + 30, "\xE8\xA1\xA8.txt", 7);
376
377	/*
378	 * Verify that EUC-JP filenames are not translated to UTF-8.
379	 */
380	a = archive_write_new();
381	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
382	assertEqualInt(ARCHIVE_OK,
383	    archive_write_open_memory(a, buff, sizeof(buff), &used));
384
385	entry = archive_entry_new2(a);
386	/* Set an EUC-JP filename. */
387	archive_entry_set_pathname(entry, "\xC9\xBD.txt");
388	/* Check the Unicode version. */
389	archive_entry_set_filetype(entry, AE_IFREG);
390	archive_entry_set_size(entry, 0);
391	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
392	archive_entry_free(entry);
393	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
394
395	/* A bit 11 of general purpose flag should be 0,
396	 * which indicates the filename charset is unknown. */
397	assertEqualInt(0, buff[7]);
398	/* Above three characters in EUC-JP should not translate to
399	 * any character-set. */
400	assertEqualMem(buff + 30, "\xC9\xBD.txt", 6);
401
402	/*
403	 * Verify that A bit 11 of general purpose flag is not set
404	 * when ASCII filenames are stored even if hdrcharset=UTF-8
405	 * is specified.
406	 */
407	a = archive_write_new();
408	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
409	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
410		skipping("This system cannot convert character-set"
411		    " from eucJP to UTF-8.");
412		archive_write_free(a);
413		return;
414	}
415	assertEqualInt(ARCHIVE_OK,
416	    archive_write_open_memory(a, buff, sizeof(buff), &used));
417
418	entry = archive_entry_new2(a);
419	/* Set an ASCII filename. */
420	archive_entry_set_pathname(entry, "abcABC");
421	/* Check the Unicode version. */
422	archive_entry_set_filetype(entry, AE_IFREG);
423	archive_entry_set_size(entry, 0);
424	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
425	archive_entry_free(entry);
426	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
427
428	/* A bit 11 of general purpose flag should be 0,
429	 * which indicates the filename charset is unknown. */
430	assertEqualInt(0, buff[7]);
431	assertEqualMem(buff + 30, "abcABC", 6);
432}
433
434static void
435test_zip_filename_encoding_CP932(void)
436{
437  	struct archive *a;
438  	struct archive_entry *entry;
439	char buff[4096];
440	size_t used;
441
442	if (NULL == setlocale(LC_ALL, "Japanese_Japan") &&
443	    NULL == setlocale(LC_ALL, "ja_JP.SJIS")) {
444		skipping("CP932/SJIS locale not available on this system.");
445		return;
446	}
447
448	/*
449	 * Verify that EUC-JP filenames are correctly translated to UTF-8.
450	 */
451	a = archive_write_new();
452	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
453	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
454		skipping("This system cannot convert character-set"
455		    " from CP932/SJIS to UTF-8.");
456		archive_write_free(a);
457		return;
458	}
459	assertEqualInt(ARCHIVE_OK,
460	    archive_write_open_memory(a, buff, sizeof(buff), &used));
461
462	entry = archive_entry_new2(a);
463	/* Set a CP932/SJIS filename. */
464	archive_entry_set_pathname(entry, "\x95\x5C.txt");
465	/* Check the Unicode version. */
466	archive_entry_set_filetype(entry, AE_IFREG);
467	archive_entry_set_size(entry, 0);
468	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
469	archive_entry_free(entry);
470	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
471
472	/* A bit 11 of general purpose flag should be 0x08,
473	 * which indicates the filename charset is UTF-8. */
474	assertEqualInt(0x08, buff[7]);
475	/* Check UTF-8 version. */
476	assertEqualMem(buff + 30, "\xE8\xA1\xA8.txt", 7);
477
478	/*
479	 * Verify that CP932/SJIS filenames are not translated to UTF-8.
480	 */
481	a = archive_write_new();
482	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
483	assertEqualInt(ARCHIVE_OK,
484	    archive_write_open_memory(a, buff, sizeof(buff), &used));
485
486	entry = archive_entry_new2(a);
487	/* Set a CP932/SJIS filename. */
488	archive_entry_set_pathname(entry, "\x95\x5C.txt");
489	/* Check the Unicode version. */
490	archive_entry_set_filetype(entry, AE_IFREG);
491	archive_entry_set_size(entry, 0);
492	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
493	archive_entry_free(entry);
494	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
495
496	/* A bit 11 of general purpose flag should be 0,
497	 * which indicates the filename charset is unknown. */
498	assertEqualInt(0, buff[7]);
499	/* Above three characters in CP932/SJIS should not translate to
500	 * any character-set. */
501	assertEqualMem(buff + 30, "\x95\x5C.txt", 6);
502
503	/*
504	 * Verify that A bit 11 of general purpose flag is not set
505	 * when ASCII filenames are stored even if hdrcharset=UTF-8
506	 * is specified.
507	 */
508	a = archive_write_new();
509	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
510	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
511		skipping("This system cannot convert character-set"
512		    " from CP932/SJIS to UTF-8.");
513		archive_write_free(a);
514		return;
515	}
516	assertEqualInt(ARCHIVE_OK,
517	    archive_write_open_memory(a, buff, sizeof(buff), &used));
518
519	entry = archive_entry_new2(a);
520	/* Set an ASCII filename. */
521	archive_entry_set_pathname(entry, "abcABC");
522	/* Check the Unicode version. */
523	archive_entry_set_filetype(entry, AE_IFREG);
524	archive_entry_set_size(entry, 0);
525	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
526	archive_entry_free(entry);
527	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
528
529	/* A bit 11 of general purpose flag should be 0,
530	 * which indicates the filename charset is unknown. */
531	assertEqualInt(0, buff[7]);
532	assertEqualMem(buff + 30, "abcABC", 6);
533}
534
535DEFINE_TEST(test_zip_filename_encoding)
536{
537	test_zip_filename_encoding_UTF8();
538	test_zip_filename_encoding_KOI8R();
539	test_zip_filename_encoding_ru_RU_CP1251();
540	test_zip_filename_encoding_Russian_Russia();
541	test_zip_filename_encoding_EUCJP();
542	test_zip_filename_encoding_CP932();
543}
544