test_zip_filename_encoding.c revision 256281
1/*- 2 * Copyright (c) 2011 Michihiro NAKAJIMA 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25#include "test.h" 26__FBSDID("$FreeBSD$"); 27 28#include <locale.h> 29 30static void 31test_zip_filename_encoding_UTF8(void) 32{ 33 struct archive *a; 34 struct archive_entry *entry; 35 char buff[4096]; 36 size_t used; 37 38 if (NULL == setlocale(LC_ALL, "en_US.UTF-8")) { 39 skipping("en_US.UTF-8 locale not available on this system."); 40 return; 41 } 42 43 /* 44 * Verify that UTF-8 filenames are correctly stored with 45 * hdrcharset=UTF-8 option. 46 */ 47 a = archive_write_new(); 48 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 49 if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) { 50 skipping("This system cannot convert character-set" 51 " for UTF-8."); 52 archive_write_free(a); 53 return; 54 } 55 assertEqualInt(ARCHIVE_OK, 56 archive_write_open_memory(a, buff, sizeof(buff), &used)); 57 58 entry = archive_entry_new2(a); 59 /* Set a UTF-8 filename. */ 60 archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8"); 61 archive_entry_set_filetype(entry, AE_IFREG); 62 archive_entry_set_size(entry, 0); 63 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 64 archive_entry_free(entry); 65 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 66 67 /* A bit 11 of general purpose flag should be 0x08, 68 * which indicates the filename charset is UTF-8. */ 69 assertEqualInt(0x08, buff[7]); 70 assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6); 71 72 /* 73 * Verify that UTF-8 filenames are correctly stored without 74 * hdrcharset=UTF-8 option. 75 */ 76 a = archive_write_new(); 77 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 78 assertEqualInt(ARCHIVE_OK, 79 archive_write_open_memory(a, buff, sizeof(buff), &used)); 80 81 entry = archive_entry_new2(a); 82 /* Set a UTF-8 filename. */ 83 archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8"); 84 archive_entry_set_filetype(entry, AE_IFREG); 85 archive_entry_set_size(entry, 0); 86 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 87 archive_entry_free(entry); 88 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 89 90 /* A bit 11 of general purpose flag should be 0x08, 91 * which indicates the filename charset is UTF-8. */ 92 assertEqualInt(0x08, buff[7]); 93 assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6); 94 95 /* 96 * Verify that A bit 11 of general purpose flag is not set 97 * when ASCII filenames are stored. 98 */ 99 a = archive_write_new(); 100 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 101 assertEqualInt(ARCHIVE_OK, 102 archive_write_open_memory(a, buff, sizeof(buff), &used)); 103 104 entry = archive_entry_new2(a); 105 /* Set an ASCII filename. */ 106 archive_entry_set_pathname(entry, "abcABC"); 107 archive_entry_set_filetype(entry, AE_IFREG); 108 archive_entry_set_size(entry, 0); 109 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 110 archive_entry_free(entry); 111 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 112 113 /* A bit 11 of general purpose flag should be 0, 114 * which indicates the filename charset is unknown. */ 115 assertEqualInt(0, buff[7]); 116 assertEqualMem(buff + 30, "abcABC", 6); 117} 118 119static void 120test_zip_filename_encoding_KOI8R(void) 121{ 122 struct archive *a; 123 struct archive_entry *entry; 124 char buff[4096]; 125 size_t used; 126 127 if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) { 128 skipping("KOI8-R locale not available on this system."); 129 return; 130 } 131 132 /* 133 * Verify that KOI8-R filenames are correctly translated to UTF-8. 134 */ 135 a = archive_write_new(); 136 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 137 if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) { 138 skipping("This system cannot convert character-set" 139 " from KOI8-R to UTF-8."); 140 archive_write_free(a); 141 return; 142 } 143 assertEqualInt(ARCHIVE_OK, 144 archive_write_open_memory(a, buff, sizeof(buff), &used)); 145 146 entry = archive_entry_new2(a); 147 /* Set a KOI8-R filename. */ 148 archive_entry_set_pathname(entry, "\xD0\xD2\xC9"); 149 archive_entry_set_filetype(entry, AE_IFREG); 150 archive_entry_set_size(entry, 0); 151 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 152 archive_entry_free(entry); 153 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 154 155 /* A bit 11 of general purpose flag should be 0x08, 156 * which indicates the filename charset is UTF-8. */ 157 assertEqualInt(0x08, buff[7]); 158 /* Above three characters in KOI8-R should translate to the following 159 * three characters (two bytes each) in UTF-8. */ 160 assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6); 161 162 /* 163 * Verify that KOI8-R filenames are not translated to UTF-8. 164 */ 165 a = archive_write_new(); 166 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 167 assertEqualInt(ARCHIVE_OK, 168 archive_write_open_memory(a, buff, sizeof(buff), &used)); 169 170 entry = archive_entry_new2(a); 171 /* Set a KOI8-R filename. */ 172 archive_entry_set_pathname(entry, "\xD0\xD2\xC9"); 173 archive_entry_set_filetype(entry, AE_IFREG); 174 archive_entry_set_size(entry, 0); 175 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 176 archive_entry_free(entry); 177 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 178 179 /* A bit 11 of general purpose flag should be 0, 180 * which indicates the filename charset is unknown. */ 181 assertEqualInt(0, buff[7]); 182 /* Above three characters in KOI8-R should not translate to 183 * any character-set. */ 184 assertEqualMem(buff + 30, "\xD0\xD2\xC9", 3); 185 186 /* 187 * Verify that A bit 11 of general purpose flag is not set 188 * when ASCII filenames are stored even if hdrcharset=UTF-8 189 * is specified. 190 */ 191 a = archive_write_new(); 192 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 193 if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) { 194 skipping("This system cannot convert character-set" 195 " from KOI8-R to UTF-8."); 196 archive_write_free(a); 197 return; 198 } 199 assertEqualInt(ARCHIVE_OK, 200 archive_write_open_memory(a, buff, sizeof(buff), &used)); 201 202 entry = archive_entry_new2(a); 203 /* Set an ASCII filename. */ 204 archive_entry_set_pathname(entry, "abcABC"); 205 archive_entry_set_filetype(entry, AE_IFREG); 206 archive_entry_set_size(entry, 0); 207 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 208 archive_entry_free(entry); 209 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 210 211 /* A bit 11 of general purpose flag should be 0, 212 * which indicates the filename charset is unknown. */ 213 assertEqualInt(0, buff[7]); 214 assertEqualMem(buff + 30, "abcABC", 6); 215} 216 217/* 218 * Do not translate CP1251 into CP866 if non Windows platform. 219 */ 220static void 221test_zip_filename_encoding_ru_RU_CP1251(void) 222{ 223 struct archive *a; 224 struct archive_entry *entry; 225 char buff[4096]; 226 size_t used; 227 228 if (NULL == setlocale(LC_ALL, "ru_RU.CP1251")) { 229 skipping("Russian_Russia locale not available on this system."); 230 return; 231 } 232 233 /* 234 * Verify that CP1251 filenames are not translated into any 235 * other character-set, in particular, CP866. 236 */ 237 a = archive_write_new(); 238 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 239 assertEqualInt(ARCHIVE_OK, 240 archive_write_open_memory(a, buff, sizeof(buff), &used)); 241 242 entry = archive_entry_new2(a); 243 /* Set a CP1251 filename. */ 244 archive_entry_set_pathname(entry, "\xEF\xF0\xE8"); 245 archive_entry_set_filetype(entry, AE_IFREG); 246 archive_entry_set_size(entry, 0); 247 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 248 archive_entry_free(entry); 249 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 250 251 /* A bit 11 of general purpose flag should be 0, 252 * which indicates the filename charset is unknown. */ 253 assertEqualInt(0, buff[7]); 254 /* Above three characters in CP1251 should not translate into 255 * any other character-set. */ 256 assertEqualMem(buff + 30, "\xEF\xF0\xE8", 3); 257} 258 259/* 260 * Other archiver applications on Windows translate CP1251 filenames 261 * into CP866 filenames and store it in the zip file. 262 * Test above behavior works well. 263 */ 264static void 265test_zip_filename_encoding_Russian_Russia(void) 266{ 267 struct archive *a; 268 struct archive_entry *entry; 269 char buff[4096]; 270 size_t used; 271 272 if (NULL == setlocale(LC_ALL, "Russian_Russia")) { 273 skipping("Russian_Russia locale not available on this system."); 274 return; 275 } 276 277 /* 278 * Verify that Russian_Russia(CP1251) filenames are correctly translated 279 * to UTF-8. 280 */ 281 a = archive_write_new(); 282 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 283 if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) { 284 skipping("This system cannot convert character-set" 285 " from Russian_Russia.CP1251 to UTF-8."); 286 archive_write_free(a); 287 return; 288 } 289 assertEqualInt(ARCHIVE_OK, 290 archive_write_open_memory(a, buff, sizeof(buff), &used)); 291 292 entry = archive_entry_new2(a); 293 /* Set a CP1251 filename. */ 294 archive_entry_set_pathname(entry, "\xEF\xF0\xE8"); 295 archive_entry_set_filetype(entry, AE_IFREG); 296 archive_entry_set_size(entry, 0); 297 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 298 archive_entry_free(entry); 299 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 300 301 /* A bit 11 of general purpose flag should be 0x08, 302 * which indicates the filename charset is UTF-8. */ 303 assertEqualInt(0x08, buff[7]); 304 /* Above three characters in CP1251 should translate to the following 305 * three characters (two bytes each) in UTF-8. */ 306 assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6); 307 308 /* 309 * Verify that Russian_Russia(CP1251) filenames are correctly translated 310 * to CP866. 311 */ 312 a = archive_write_new(); 313 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 314 assertEqualInt(ARCHIVE_OK, 315 archive_write_open_memory(a, buff, sizeof(buff), &used)); 316 317 entry = archive_entry_new2(a); 318 /* Set a CP1251 filename. */ 319 archive_entry_set_pathname(entry, "\xEF\xF0\xE8"); 320 archive_entry_set_filetype(entry, AE_IFREG); 321 archive_entry_set_size(entry, 0); 322 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 323 archive_entry_free(entry); 324 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 325 326 /* A bit 11 of general purpose flag should be 0, 327 * which indicates the filename charset is unknown. */ 328 assertEqualInt(0, buff[7]); 329 /* Above three characters in CP1251 should translate to the following 330 * three characters in CP866. */ 331 assertEqualMem(buff + 30, "\xAF\xE0\xA8", 3); 332} 333 334static void 335test_zip_filename_encoding_EUCJP(void) 336{ 337 struct archive *a; 338 struct archive_entry *entry; 339 char buff[4096]; 340 size_t used; 341 342 if (NULL == setlocale(LC_ALL, "ja_JP.eucJP")) { 343 skipping("eucJP locale not available on this system."); 344 return; 345 } 346 347 /* 348 * Verify that EUC-JP filenames are correctly translated to UTF-8. 349 */ 350 a = archive_write_new(); 351 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 352 if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) { 353 skipping("This system cannot convert character-set" 354 " from eucJP to UTF-8."); 355 archive_write_free(a); 356 return; 357 } 358 assertEqualInt(ARCHIVE_OK, 359 archive_write_open_memory(a, buff, sizeof(buff), &used)); 360 361 entry = archive_entry_new2(a); 362 /* Set an EUC-JP filename. */ 363 archive_entry_set_pathname(entry, "\xC9\xBD.txt"); 364 /* Check the Unicode version. */ 365 archive_entry_set_filetype(entry, AE_IFREG); 366 archive_entry_set_size(entry, 0); 367 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 368 archive_entry_free(entry); 369 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 370 371 /* A bit 11 of general purpose flag should be 0x08, 372 * which indicates the filename charset is UTF-8. */ 373 assertEqualInt(0x08, buff[7]); 374 /* Check UTF-8 version. */ 375 assertEqualMem(buff + 30, "\xE8\xA1\xA8.txt", 7); 376 377 /* 378 * Verify that EUC-JP filenames are not translated to UTF-8. 379 */ 380 a = archive_write_new(); 381 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 382 assertEqualInt(ARCHIVE_OK, 383 archive_write_open_memory(a, buff, sizeof(buff), &used)); 384 385 entry = archive_entry_new2(a); 386 /* Set an EUC-JP filename. */ 387 archive_entry_set_pathname(entry, "\xC9\xBD.txt"); 388 /* Check the Unicode version. */ 389 archive_entry_set_filetype(entry, AE_IFREG); 390 archive_entry_set_size(entry, 0); 391 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 392 archive_entry_free(entry); 393 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 394 395 /* A bit 11 of general purpose flag should be 0, 396 * which indicates the filename charset is unknown. */ 397 assertEqualInt(0, buff[7]); 398 /* Above three characters in EUC-JP should not translate to 399 * any character-set. */ 400 assertEqualMem(buff + 30, "\xC9\xBD.txt", 6); 401 402 /* 403 * Verify that A bit 11 of general purpose flag is not set 404 * when ASCII filenames are stored even if hdrcharset=UTF-8 405 * is specified. 406 */ 407 a = archive_write_new(); 408 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 409 if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) { 410 skipping("This system cannot convert character-set" 411 " from eucJP to UTF-8."); 412 archive_write_free(a); 413 return; 414 } 415 assertEqualInt(ARCHIVE_OK, 416 archive_write_open_memory(a, buff, sizeof(buff), &used)); 417 418 entry = archive_entry_new2(a); 419 /* Set an ASCII filename. */ 420 archive_entry_set_pathname(entry, "abcABC"); 421 /* Check the Unicode version. */ 422 archive_entry_set_filetype(entry, AE_IFREG); 423 archive_entry_set_size(entry, 0); 424 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 425 archive_entry_free(entry); 426 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 427 428 /* A bit 11 of general purpose flag should be 0, 429 * which indicates the filename charset is unknown. */ 430 assertEqualInt(0, buff[7]); 431 assertEqualMem(buff + 30, "abcABC", 6); 432} 433 434static void 435test_zip_filename_encoding_CP932(void) 436{ 437 struct archive *a; 438 struct archive_entry *entry; 439 char buff[4096]; 440 size_t used; 441 442 if (NULL == setlocale(LC_ALL, "Japanese_Japan") && 443 NULL == setlocale(LC_ALL, "ja_JP.SJIS")) { 444 skipping("CP932/SJIS locale not available on this system."); 445 return; 446 } 447 448 /* 449 * Verify that EUC-JP filenames are correctly translated to UTF-8. 450 */ 451 a = archive_write_new(); 452 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 453 if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) { 454 skipping("This system cannot convert character-set" 455 " from CP932/SJIS to UTF-8."); 456 archive_write_free(a); 457 return; 458 } 459 assertEqualInt(ARCHIVE_OK, 460 archive_write_open_memory(a, buff, sizeof(buff), &used)); 461 462 entry = archive_entry_new2(a); 463 /* Set a CP932/SJIS filename. */ 464 archive_entry_set_pathname(entry, "\x95\x5C.txt"); 465 /* Check the Unicode version. */ 466 archive_entry_set_filetype(entry, AE_IFREG); 467 archive_entry_set_size(entry, 0); 468 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 469 archive_entry_free(entry); 470 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 471 472 /* A bit 11 of general purpose flag should be 0x08, 473 * which indicates the filename charset is UTF-8. */ 474 assertEqualInt(0x08, buff[7]); 475 /* Check UTF-8 version. */ 476 assertEqualMem(buff + 30, "\xE8\xA1\xA8.txt", 7); 477 478 /* 479 * Verify that CP932/SJIS filenames are not translated to UTF-8. 480 */ 481 a = archive_write_new(); 482 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 483 assertEqualInt(ARCHIVE_OK, 484 archive_write_open_memory(a, buff, sizeof(buff), &used)); 485 486 entry = archive_entry_new2(a); 487 /* Set a CP932/SJIS filename. */ 488 archive_entry_set_pathname(entry, "\x95\x5C.txt"); 489 /* Check the Unicode version. */ 490 archive_entry_set_filetype(entry, AE_IFREG); 491 archive_entry_set_size(entry, 0); 492 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 493 archive_entry_free(entry); 494 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 495 496 /* A bit 11 of general purpose flag should be 0, 497 * which indicates the filename charset is unknown. */ 498 assertEqualInt(0, buff[7]); 499 /* Above three characters in CP932/SJIS should not translate to 500 * any character-set. */ 501 assertEqualMem(buff + 30, "\x95\x5C.txt", 6); 502 503 /* 504 * Verify that A bit 11 of general purpose flag is not set 505 * when ASCII filenames are stored even if hdrcharset=UTF-8 506 * is specified. 507 */ 508 a = archive_write_new(); 509 assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a)); 510 if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) { 511 skipping("This system cannot convert character-set" 512 " from CP932/SJIS to UTF-8."); 513 archive_write_free(a); 514 return; 515 } 516 assertEqualInt(ARCHIVE_OK, 517 archive_write_open_memory(a, buff, sizeof(buff), &used)); 518 519 entry = archive_entry_new2(a); 520 /* Set an ASCII filename. */ 521 archive_entry_set_pathname(entry, "abcABC"); 522 /* Check the Unicode version. */ 523 archive_entry_set_filetype(entry, AE_IFREG); 524 archive_entry_set_size(entry, 0); 525 assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); 526 archive_entry_free(entry); 527 assertEqualInt(ARCHIVE_OK, archive_write_free(a)); 528 529 /* A bit 11 of general purpose flag should be 0, 530 * which indicates the filename charset is unknown. */ 531 assertEqualInt(0, buff[7]); 532 assertEqualMem(buff + 30, "abcABC", 6); 533} 534 535DEFINE_TEST(test_zip_filename_encoding) 536{ 537 test_zip_filename_encoding_UTF8(); 538 test_zip_filename_encoding_KOI8R(); 539 test_zip_filename_encoding_ru_RU_CP1251(); 540 test_zip_filename_encoding_Russian_Russia(); 541 test_zip_filename_encoding_EUCJP(); 542 test_zip_filename_encoding_CP932(); 543} 544