1/*-
2 * Copyright (c) 2011 Michihiro NAKAJIMA
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25#include "test.h"
26__FBSDID("$FreeBSD$");
27
28#include <locale.h>
29
30DEFINE_TEST(test_zip_filename_encoding_UTF8)
31{
32  	struct archive *a;
33  	struct archive_entry *entry;
34	char buff[4096];
35	size_t used;
36
37	if (NULL == setlocale(LC_ALL, "en_US.UTF-8")) {
38		skipping("en_US.UTF-8 locale not available on this system.");
39		return;
40	}
41
42	/*
43	 * Verify that UTF-8 filenames are correctly stored with
44	 * hdrcharset=UTF-8 option.
45	 */
46	a = archive_write_new();
47	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
48	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
49		skipping("This system cannot convert character-set"
50		    " for UTF-8.");
51		archive_write_free(a);
52		return;
53	}
54	assertEqualInt(ARCHIVE_OK,
55	    archive_write_open_memory(a, buff, sizeof(buff), &used));
56
57	entry = archive_entry_new2(a);
58	/* Set a UTF-8 filename. */
59	archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8");
60	archive_entry_set_filetype(entry, AE_IFREG);
61	archive_entry_set_size(entry, 0);
62	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
63	archive_entry_free(entry);
64	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
65
66	/* A bit 11 of general purpose flag should be 0x08,
67	 * which indicates the filename charset is UTF-8. */
68	assertEqualInt(0x08, buff[7]);
69	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
70
71	/*
72	 * Verify that UTF-8 filenames are correctly stored without
73	 * hdrcharset=UTF-8 option.
74	 */
75	a = archive_write_new();
76	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
77	assertEqualInt(ARCHIVE_OK,
78	    archive_write_open_memory(a, buff, sizeof(buff), &used));
79
80	entry = archive_entry_new2(a);
81	/* Set a UTF-8 filename. */
82	archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8");
83	archive_entry_set_filetype(entry, AE_IFREG);
84	archive_entry_set_size(entry, 0);
85	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
86	archive_entry_free(entry);
87	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
88
89	/* A bit 11 of general purpose flag should be 0x08,
90	 * which indicates the filename charset is UTF-8. */
91	assertEqualInt(0x08, buff[7]);
92	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
93
94	/*
95	 * Verify that A bit 11 of general purpose flag is not set
96	 * when ASCII filenames are stored.
97	 */
98	a = archive_write_new();
99	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
100	assertEqualInt(ARCHIVE_OK,
101	    archive_write_open_memory(a, buff, sizeof(buff), &used));
102
103	entry = archive_entry_new2(a);
104	/* Set an ASCII filename. */
105	archive_entry_set_pathname(entry, "abcABC");
106	archive_entry_set_filetype(entry, AE_IFREG);
107	archive_entry_set_size(entry, 0);
108	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
109	archive_entry_free(entry);
110	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
111
112	/* A bit 11 of general purpose flag should be 0,
113	 * which indicates the filename charset is unknown. */
114	assertEqualInt(0, buff[7]);
115	assertEqualMem(buff + 30, "abcABC", 6);
116}
117
118DEFINE_TEST(test_zip_filename_encoding_KOI8R)
119{
120  	struct archive *a;
121  	struct archive_entry *entry;
122	char buff[4096];
123	size_t used;
124
125	if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
126		skipping("KOI8-R locale not available on this system.");
127		return;
128	}
129
130	/*
131	 * Verify that KOI8-R filenames are correctly translated to UTF-8.
132	 */
133	a = archive_write_new();
134	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
135	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
136		skipping("This system cannot convert character-set"
137		    " from KOI8-R to UTF-8.");
138		archive_write_free(a);
139		return;
140	}
141	assertEqualInt(ARCHIVE_OK,
142	    archive_write_open_memory(a, buff, sizeof(buff), &used));
143
144	entry = archive_entry_new2(a);
145	/* Set a KOI8-R filename. */
146	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
147	archive_entry_set_filetype(entry, AE_IFREG);
148	archive_entry_set_size(entry, 0);
149	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
150	archive_entry_free(entry);
151	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
152
153	/* A bit 11 of general purpose flag should be 0x08,
154	 * which indicates the filename charset is UTF-8. */
155	assertEqualInt(0x08, buff[7]);
156	/* Above three characters in KOI8-R should translate to the following
157	 * three characters (two bytes each) in UTF-8. */
158	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
159
160	/*
161	 * Verify that KOI8-R filenames are not translated to UTF-8.
162	 */
163	a = archive_write_new();
164	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
165	assertEqualInt(ARCHIVE_OK,
166	    archive_write_open_memory(a, buff, sizeof(buff), &used));
167
168	entry = archive_entry_new2(a);
169	/* Set a KOI8-R filename. */
170	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
171	archive_entry_set_filetype(entry, AE_IFREG);
172	archive_entry_set_size(entry, 0);
173	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
174	archive_entry_free(entry);
175	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
176
177	/* A bit 11 of general purpose flag should be 0,
178	 * which indicates the filename charset is unknown. */
179	assertEqualInt(0, buff[7]);
180	/* Above three characters in KOI8-R should not translate to
181	 * any character-set. */
182	assertEqualMem(buff + 30, "\xD0\xD2\xC9", 3);
183
184	/*
185	 * Verify that A bit 11 of general purpose flag is not set
186	 * when ASCII filenames are stored even if hdrcharset=UTF-8
187	 * is specified.
188	 */
189	a = archive_write_new();
190	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
191	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
192		skipping("This system cannot convert character-set"
193		    " from KOI8-R to UTF-8.");
194		archive_write_free(a);
195		return;
196	}
197	assertEqualInt(ARCHIVE_OK,
198	    archive_write_open_memory(a, buff, sizeof(buff), &used));
199
200	entry = archive_entry_new2(a);
201	/* Set an ASCII filename. */
202	archive_entry_set_pathname(entry, "abcABC");
203	archive_entry_set_filetype(entry, AE_IFREG);
204	archive_entry_set_size(entry, 0);
205	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
206	archive_entry_free(entry);
207	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
208
209	/* A bit 11 of general purpose flag should be 0,
210	 * which indicates the filename charset is unknown. */
211	assertEqualInt(0, buff[7]);
212	assertEqualMem(buff + 30, "abcABC", 6);
213}
214
/*
 * On non-Windows platforms, CP1251 filenames must not be translated
 * into CP866.
 */
218DEFINE_TEST(test_zip_filename_encoding_ru_RU_CP1251)
219{
220  	struct archive *a;
221  	struct archive_entry *entry;
222	char buff[4096];
223	size_t used;
224
225	if (NULL == setlocale(LC_ALL, "ru_RU.CP1251")) {
226		skipping("Russian_Russia locale not available on this system.");
227		return;
228	}
229
230	/*
231	 * Verify that CP1251 filenames are not translated into any
232	 * other character-set, in particular, CP866.
233	 */
234	a = archive_write_new();
235	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
236	assertEqualInt(ARCHIVE_OK,
237	    archive_write_open_memory(a, buff, sizeof(buff), &used));
238
239	entry = archive_entry_new2(a);
240	/* Set a CP1251 filename. */
241	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
242	archive_entry_set_filetype(entry, AE_IFREG);
243	archive_entry_set_size(entry, 0);
244	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
245	archive_entry_free(entry);
246	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
247
248	/* A bit 11 of general purpose flag should be 0,
249	 * which indicates the filename charset is unknown. */
250	assertEqualInt(0, buff[7]);
251	/* Above three characters in CP1251 should not translate into
252	 * any other character-set. */
253	assertEqualMem(buff + 30, "\xEF\xF0\xE8", 3);
254}
255
/*
 * Other archiver applications on Windows translate CP1251 filenames
 * into CP866 filenames and store them in the zip file.
 * Verify that this behavior works correctly.
 */
261DEFINE_TEST(test_zip_filename_encoding_Russian_Russia)
262{
263  	struct archive *a;
264  	struct archive_entry *entry;
265	char buff[4096];
266	size_t used;
267
268	if (NULL == setlocale(LC_ALL, "Russian_Russia")) {
269		skipping("Russian_Russia locale not available on this system.");
270		return;
271	}
272
273	/*
274	 * Verify that Russian_Russia(CP1251) filenames are correctly translated
275	 * to UTF-8.
276	 */
277	a = archive_write_new();
278	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
279	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
280		skipping("This system cannot convert character-set"
281		    " from Russian_Russia.CP1251 to UTF-8.");
282		archive_write_free(a);
283		return;
284	}
285	assertEqualInt(ARCHIVE_OK,
286	    archive_write_open_memory(a, buff, sizeof(buff), &used));
287
288	entry = archive_entry_new2(a);
289	/* Set a CP1251 filename. */
290	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
291	archive_entry_set_filetype(entry, AE_IFREG);
292	archive_entry_set_size(entry, 0);
293	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
294	archive_entry_free(entry);
295	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
296
297	/* A bit 11 of general purpose flag should be 0x08,
298	 * which indicates the filename charset is UTF-8. */
299	assertEqualInt(0x08, buff[7]);
300	/* Above three characters in CP1251 should translate to the following
301	 * three characters (two bytes each) in UTF-8. */
302	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
303
304	/*
305	 * Verify that Russian_Russia(CP1251) filenames are correctly translated
306	 * to CP866.
307	 */
308	a = archive_write_new();
309	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
310	assertEqualInt(ARCHIVE_OK,
311	    archive_write_open_memory(a, buff, sizeof(buff), &used));
312
313	entry = archive_entry_new2(a);
314	/* Set a CP1251 filename. */
315	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
316	archive_entry_set_filetype(entry, AE_IFREG);
317	archive_entry_set_size(entry, 0);
318	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
319	archive_entry_free(entry);
320	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
321
322	/* A bit 11 of general purpose flag should be 0,
323	 * which indicates the filename charset is unknown. */
324	assertEqualInt(0, buff[7]);
325	/* Above three characters in CP1251 should translate to the following
326	 * three characters in CP866. */
327	assertEqualMem(buff + 30, "\xAF\xE0\xA8", 3);
328}
329
330DEFINE_TEST(test_zip_filename_encoding_EUCJP)
331{
332  	struct archive *a;
333  	struct archive_entry *entry;
334	char buff[4096];
335	size_t used;
336
337	if (NULL == setlocale(LC_ALL, "ja_JP.eucJP")) {
338		skipping("eucJP locale not available on this system.");
339		return;
340	}
341
342	/*
343	 * Verify that EUC-JP filenames are correctly translated to UTF-8.
344	 */
345	a = archive_write_new();
346	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
347	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
348		skipping("This system cannot convert character-set"
349		    " from eucJP to UTF-8.");
350		archive_write_free(a);
351		return;
352	}
353	assertEqualInt(ARCHIVE_OK,
354	    archive_write_open_memory(a, buff, sizeof(buff), &used));
355
356	entry = archive_entry_new2(a);
357	/* Set an EUC-JP filename. */
358	archive_entry_set_pathname(entry, "\xC9\xBD.txt");
359	/* Check the Unicode version. */
360	archive_entry_set_filetype(entry, AE_IFREG);
361	archive_entry_set_size(entry, 0);
362	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
363	archive_entry_free(entry);
364	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
365
366	/* A bit 11 of general purpose flag should be 0x08,
367	 * which indicates the filename charset is UTF-8. */
368	assertEqualInt(0x08, buff[7]);
369	/* Check UTF-8 version. */
370	assertEqualMem(buff + 30, "\xE8\xA1\xA8.txt", 7);
371
372	/*
373	 * Verify that EUC-JP filenames are not translated to UTF-8.
374	 */
375	a = archive_write_new();
376	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
377	assertEqualInt(ARCHIVE_OK,
378	    archive_write_open_memory(a, buff, sizeof(buff), &used));
379
380	entry = archive_entry_new2(a);
381	/* Set an EUC-JP filename. */
382	archive_entry_set_pathname(entry, "\xC9\xBD.txt");
383	/* Check the Unicode version. */
384	archive_entry_set_filetype(entry, AE_IFREG);
385	archive_entry_set_size(entry, 0);
386	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
387	archive_entry_free(entry);
388	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
389
390	/* A bit 11 of general purpose flag should be 0,
391	 * which indicates the filename charset is unknown. */
392	assertEqualInt(0, buff[7]);
393	/* Above three characters in EUC-JP should not translate to
394	 * any character-set. */
395	assertEqualMem(buff + 30, "\xC9\xBD.txt", 6);
396
397	/*
398	 * Verify that A bit 11 of general purpose flag is not set
399	 * when ASCII filenames are stored even if hdrcharset=UTF-8
400	 * is specified.
401	 */
402	a = archive_write_new();
403	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
404	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
405		skipping("This system cannot convert character-set"
406		    " from eucJP to UTF-8.");
407		archive_write_free(a);
408		return;
409	}
410	assertEqualInt(ARCHIVE_OK,
411	    archive_write_open_memory(a, buff, sizeof(buff), &used));
412
413	entry = archive_entry_new2(a);
414	/* Set an ASCII filename. */
415	archive_entry_set_pathname(entry, "abcABC");
416	/* Check the Unicode version. */
417	archive_entry_set_filetype(entry, AE_IFREG);
418	archive_entry_set_size(entry, 0);
419	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
420	archive_entry_free(entry);
421	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
422
423	/* A bit 11 of general purpose flag should be 0,
424	 * which indicates the filename charset is unknown. */
425	assertEqualInt(0, buff[7]);
426	assertEqualMem(buff + 30, "abcABC", 6);
427}
428
429DEFINE_TEST(test_zip_filename_encoding_CP932)
430{
431  	struct archive *a;
432  	struct archive_entry *entry;
433	char buff[4096];
434	size_t used;
435
436	if (NULL == setlocale(LC_ALL, "Japanese_Japan") &&
437	    NULL == setlocale(LC_ALL, "ja_JP.SJIS")) {
438		skipping("CP932/SJIS locale not available on this system.");
439		return;
440	}
441
442	/*
443	 * Verify that EUC-JP filenames are correctly translated to UTF-8.
444	 */
445	a = archive_write_new();
446	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
447	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
448		skipping("This system cannot convert character-set"
449		    " from CP932/SJIS to UTF-8.");
450		archive_write_free(a);
451		return;
452	}
453	assertEqualInt(ARCHIVE_OK,
454	    archive_write_open_memory(a, buff, sizeof(buff), &used));
455
456	entry = archive_entry_new2(a);
457	/* Set a CP932/SJIS filename. */
458	archive_entry_set_pathname(entry, "\x95\x5C.txt");
459	/* Check the Unicode version. */
460	archive_entry_set_filetype(entry, AE_IFREG);
461	archive_entry_set_size(entry, 0);
462	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
463	archive_entry_free(entry);
464	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
465
466	/* A bit 11 of general purpose flag should be 0x08,
467	 * which indicates the filename charset is UTF-8. */
468	assertEqualInt(0x08, buff[7]);
469	/* Check UTF-8 version. */
470	assertEqualMem(buff + 30, "\xE8\xA1\xA8.txt", 7);
471
472	/*
473	 * Verify that CP932/SJIS filenames are not translated to UTF-8.
474	 */
475	a = archive_write_new();
476	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
477	assertEqualInt(ARCHIVE_OK,
478	    archive_write_open_memory(a, buff, sizeof(buff), &used));
479
480	entry = archive_entry_new2(a);
481	/* Set a CP932/SJIS filename. */
482	archive_entry_set_pathname(entry, "\x95\x5C.txt");
483	/* Check the Unicode version. */
484	archive_entry_set_filetype(entry, AE_IFREG);
485	archive_entry_set_size(entry, 0);
486	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
487	archive_entry_free(entry);
488	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
489
490	/* A bit 11 of general purpose flag should be 0,
491	 * which indicates the filename charset is unknown. */
492	assertEqualInt(0, buff[7]);
493	/* Above three characters in CP932/SJIS should not translate to
494	 * any character-set. */
495	assertEqualMem(buff + 30, "\x95\x5C.txt", 6);
496
497	/*
498	 * Verify that A bit 11 of general purpose flag is not set
499	 * when ASCII filenames are stored even if hdrcharset=UTF-8
500	 * is specified.
501	 */
502	a = archive_write_new();
503	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
504	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
505		skipping("This system cannot convert character-set"
506		    " from CP932/SJIS to UTF-8.");
507		archive_write_free(a);
508		return;
509	}
510	assertEqualInt(ARCHIVE_OK,
511	    archive_write_open_memory(a, buff, sizeof(buff), &used));
512
513	entry = archive_entry_new2(a);
514	/* Set an ASCII filename. */
515	archive_entry_set_pathname(entry, "abcABC");
516	/* Check the Unicode version. */
517	archive_entry_set_filetype(entry, AE_IFREG);
518	archive_entry_set_size(entry, 0);
519	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
520	archive_entry_free(entry);
521	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
522
523	/* A bit 11 of general purpose flag should be 0,
524	 * which indicates the filename charset is unknown. */
525	assertEqualInt(0, buff[7]);
526	assertEqualMem(buff + 30, "abcABC", 6);
527}
528