1/* Test of character set conversion with error handling and autodetection. 2 Copyright (C) 2007-2010 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 16 17/* Written by Bruno Haible <bruno@clisp.org>, 2007. */ 18 19#include <config.h> 20 21#include "striconveha.h" 22 23#if HAVE_ICONV 24# include <iconv.h> 25#endif 26 27#include <errno.h> 28#include <stdlib.h> 29#include <string.h> 30 31#include "macros.h" 32 33/* Magic number for detecting bounds violations. */ 34#define MAGIC 0x1983EFF1 35 36static size_t * 37new_offsets (size_t n) 38{ 39 size_t *offsets = (size_t *) malloc ((n + 1) * sizeof (size_t)); 40 offsets[n] = MAGIC; 41 return offsets; 42} 43 44int 45main () 46{ 47 static enum iconv_ilseq_handler handlers[] = 48 { iconveh_error, iconveh_question_mark, iconveh_escape_sequence }; 49 size_t h; 50 size_t o; 51 size_t i; 52 53#if HAVE_ICONV 54 /* Assume that iconv() supports at least the encodings ASCII, ISO-8859-1, 55 ISO-8859-2, and UTF-8. */ 56 57 /* ------------------------- Test mem_iconveha() ------------------------- */ 58 59 /* Test conversion from ISO-8859-2 to ISO-8859-1 with no errors. */ 60 for (h = 0; h < SIZEOF (handlers); h++) 61 { 62 enum iconv_ilseq_handler handler = handlers[h]; 63 static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; 64 static const char expected[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; 65 for (o = 0; o < 2; o++) 66 { 67 size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); 68 char *result = NULL; 69 size_t length = 0; 70 int retval = mem_iconveha (input, strlen (input), 71 "ISO-8859-2", "ISO-8859-1", 72 false, handler, 73 offsets, 74 &result, &length); 75 ASSERT (retval == 0); 76 ASSERT (length == strlen (expected)); 77 ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); 78 if (o) 79 { 80 for (i = 0; i < 37; i++) 81 ASSERT (offsets[i] == i); 82 ASSERT (offsets[37] == MAGIC); 83 free (offsets); 84 } 85 free (result); 86 } 87 } 88 89 /* Test conversion from ISO-8859-2 to ISO-8859-1 with EILSEQ. */ 90 for (h = 0; h < SIZEOF (handlers); h++) 91 { 92 enum iconv_ilseq_handler handler = handlers[h]; 93 static const char input[] = "Rafa\263 Maszkowski"; /* Rafa�� Maszkowski */ 94 for (o = 0; o < 2; o++) 95 { 96 size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); 97 char *result = NULL; 98 size_t length = 0; 99 int retval = mem_iconveha (input, strlen (input), 100 "ISO-8859-2", "ISO-8859-1", 101 false, handler, 102 offsets, 103 &result, &length); 104 switch (handler) 105 { 106 case iconveh_error: 107 ASSERT (retval == -1 && errno == EILSEQ); 108 ASSERT (result == NULL); 109 if (o) 110 free (offsets); 111 break; 112 case iconveh_question_mark: 113 { 114 static const char expected[] = "Rafa? Maszkowski"; 115 ASSERT (retval == 0); 116 ASSERT (length == strlen (expected)); 117 ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); 118 if (o) 119 { 120 for (i = 0; i < 16; i++) 121 ASSERT (offsets[i] == i); 122 ASSERT (offsets[16] == MAGIC); 123 free (offsets); 124 } 125 free (result); 126 } 127 break; 128 case iconveh_escape_sequence: 129 { 130 static const char expected[] = "Rafa\\u0142 Maszkowski"; 131 ASSERT (retval == 0); 132 ASSERT (length == strlen (expected)); 133 ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); 134 if (o) 135 { 136 for (i = 0; i < 16; i++) 137 ASSERT (offsets[i] == (i < 5 ? i : 138 i + 5)); 139 ASSERT (offsets[16] == MAGIC); 140 free (offsets); 141 } 142 free (result); 143 } 144 break; 145 } 146 } 147 } 148 149 /* Test conversion from ISO-8859-1 to UTF-8 with no errors. */ 150 for (h = 0; h < SIZEOF (handlers); h++) 151 { 152 enum iconv_ilseq_handler handler = handlers[h]; 153 static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; 154 static const char expected[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237"; 155 for (o = 0; o < 2; o++) 156 { 157 size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); 158 char *result = NULL; 159 size_t length = 0; 160 int retval = mem_iconveha (input, strlen (input), 161 "ISO-8859-1", "UTF-8", 162 false, handler, 163 offsets, 164 &result, &length); 165 ASSERT (retval == 0); 166 ASSERT (length == strlen (expected)); 167 ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); 168 if (o) 169 { 170 for (i = 0; i < 37; i++) 171 ASSERT (offsets[i] == (i < 1 ? i : 172 i < 12 ? i + 1 : 173 i < 18 ? i + 2 : 174 i + 3)); 175 ASSERT (offsets[37] == MAGIC); 176 free (offsets); 177 } 178 free (result); 179 } 180 } 181 182 /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */ 183 for (h = 0; h < SIZEOF (handlers); h++) 184 { 185 enum iconv_ilseq_handler handler = handlers[h]; 186 static const char input[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237"; 187 static const char expected[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; 188 for (o = 0; o < 2; o++) 189 { 190 size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); 191 char *result = NULL; 192 size_t length = 0; 193 int retval = mem_iconveha (input, strlen (input), 194 "UTF-8", "ISO-8859-1", 195 false, handler, 196 offsets, 197 &result, &length); 198 ASSERT (retval == 0); 199 ASSERT (length == strlen (expected)); 200 ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); 201 if (o) 202 { 203 for (i = 0; i < 41; i++) 204 ASSERT (offsets[i] == (i < 1 ? i : 205 i == 1 ? (size_t)(-1) : 206 i < 13 ? i - 1 : 207 i == 13 ? (size_t)(-1) : 208 i < 20 ? i - 2 : 209 i == 20 ? (size_t)(-1) : 210 i < 40 ? i - 3 : 211 (size_t)(-1))); 212 ASSERT (offsets[41] == MAGIC); 213 free (offsets); 214 } 215 free (result); 216 } 217 } 218 219 /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ. */ 220 for (h = 0; h < SIZEOF (handlers); h++) 221 { 222 enum iconv_ilseq_handler handler = handlers[h]; 223 static const char input[] = "Rafa\305\202 Maszkowski"; /* Rafa�� Maszkowski */ 224 for (o = 0; o < 2; o++) 225 { 226 size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); 227 char *result = NULL; 228 size_t length = 0; 229 int retval = mem_iconveha (input, strlen (input), 230 "UTF-8", "ISO-8859-1", 231 false, handler, 232 offsets, 233 &result, &length); 234 switch (handler) 235 { 236 case iconveh_error: 237 ASSERT (retval == -1 && errno == EILSEQ); 238 ASSERT (result == NULL); 239 if (o) 240 free (offsets); 241 break; 242 case iconveh_question_mark: 243 { 244 static const char expected[] = "Rafa? Maszkowski"; 245 ASSERT (retval == 0); 246 ASSERT (length == strlen (expected)); 247 ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); 248 if (o) 249 { 250 for (i = 0; i < 17; i++) 251 ASSERT (offsets[i] == (i < 5 ? i : 252 i == 5 ? (size_t)(-1) : 253 i - 1)); 254 ASSERT (offsets[17] == MAGIC); 255 free (offsets); 256 } 257 free (result); 258 } 259 break; 260 case iconveh_escape_sequence: 261 { 262 static const char expected[] = "Rafa\\u0142 Maszkowski"; 263 ASSERT (retval == 0); 264 ASSERT (length == strlen (expected)); 265 ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); 266 if (o) 267 { 268 for (i = 0; i < 17; i++) 269 ASSERT (offsets[i] == (i < 5 ? i : 270 i == 5 ? (size_t)(-1) : 271 i + 4)); 272 ASSERT (offsets[17] == MAGIC); 273 free (offsets); 274 } 275 free (result); 276 } 277 break; 278 } 279 } 280 } 281 282 /* Test conversion from UTF-8 to ISO-8859-1 with EINVAL. */ 283 for (h = 0; h < SIZEOF (handlers); h++) 284 { 285 enum iconv_ilseq_handler handler = handlers[h]; 286 static const char input[] = "\342"; 287 for (o = 0; o < 2; o++) 288 { 289 size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); 290 char *result = NULL; 291 size_t length = 0; 292 int retval = mem_iconveha (input, strlen (input), 293 "UTF-8", "ISO-8859-1", 294 false, handler, 295 offsets, 296 &result, &length); 297 ASSERT (retval == 0); 298 ASSERT (length == 0); 299 if (o) 300 { 301 ASSERT (offsets[0] == 0); 302 ASSERT (offsets[1] == MAGIC); 303 free (offsets); 304 } 305 free (result); 306 } 307 } 308 309 /* autodetect_jp is only supported when iconv() support ISO-2022-JP-2. */ 310# if defined _LIBICONV_VERSION || !(defined _AIX || defined __sgi || defined __hpux || defined __osf__ || defined __sun) 311 /* Test conversions from autodetect_jp to UTF-8. */ 312 for (h = 0; h < SIZEOF (handlers); h++) 313 { 314 enum iconv_ilseq_handler handler = handlers[h]; 315 static const char input[] = "\244\263\244\363\244\313\244\301\244\317"; /* ��������������� in EUC-JP */ 316 static const char expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* ��������������� */ 317 for (o = 0; o < 2; o++) 318 { 319 size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); 320 char *result = NULL; 321 size_t length = 0; 322 int retval = mem_iconveha (input, strlen (input), 323 "autodetect_jp", "UTF-8", 324 false, handler, 325 offsets, 326 &result, &length); 327 ASSERT (retval == 0); 328 ASSERT (length == strlen (expected)); 329 ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); 330 if (o) 331 { 332 for (i = 0; i < 10; i++) 333 ASSERT (offsets[i] == ((i % 2) == 0 ? (i / 2) * 3 : (size_t)(-1))); 334 ASSERT (offsets[10] == MAGIC); 335 free (offsets); 336 } 337 free (result); 338 } 339 } 340 for (h = 0; h < SIZEOF (handlers); h++) 341 { 342 enum iconv_ilseq_handler handler = handlers[h]; 343 static const char input[] = "\202\261\202\361\202\311\202\277\202\315"; /* ��������������� in Shift_JIS */ 344 static const char expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* ��������������� */ 345 for (o = 0; o < 2; o++) 346 { 347 size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); 348 char *result = NULL; 349 size_t length = 0; 350 int retval = mem_iconveha (input, strlen (input), 351 "autodetect_jp", "UTF-8", 352 false, handler, 353 offsets, 354 &result, &length); 355 ASSERT (retval == 0); 356 ASSERT (length == strlen (expected)); 357 ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); 358 if (o) 359 { 360 for (i = 0; i < 10; i++) 361 ASSERT (offsets[i] == ((i % 2) == 0 ? (i / 2) * 3 : (size_t)(-1))); 362 ASSERT (offsets[10] == MAGIC); 363 free (offsets); 364 } 365 free (result); 366 } 367 } 368 for (h = 0; h < SIZEOF (handlers); h++) 369 { 370 enum iconv_ilseq_handler handler = handlers[h]; 371 static const char input[] = "\033$B$3$s$K$A$O\033(B"; /* ��������������� in ISO-2022-JP-2 */ 372 static const char expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* ��������������� */ 373 for (o = 0; o < 2; o++) 374 { 375 size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); 376 char *result = NULL; 377 size_t length = 0; 378 int retval = mem_iconveha (input, strlen (input), 379 "autodetect_jp", "UTF-8", 380 false, handler, 381 offsets, 382 &result, &length); 383 ASSERT (retval == 0); 384 ASSERT (length == strlen (expected)); 385 ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); 386 if (o) 387 { 388 for (i = 0; i < 16; i++) 389 ASSERT (offsets[i] == (i == 0 ? 0 : 390 i == 5 ? 3 : 391 i == 7 ? 6 : 392 i == 9 ? 9 : 393 i == 11 ? 12 : 394 i == 13 ? 15 : 395 (size_t)(-1))); 396 ASSERT (offsets[16] == MAGIC); 397 free (offsets); 398 } 399 free (result); 400 } 401 } 402# endif 403 404# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105 405 /* Test conversion from UTF-8 to ISO-8859-1 with transliteration. */ 406 for (h = 0; h < SIZEOF (handlers); h++) 407 { 408 enum iconv_ilseq_handler handler = handlers[h]; 409 static const char input[] = "Costs: 27 \342\202\254"; /* EURO SIGN */ 410 static const char expected[] = "Costs: 27 EUR"; 411 for (o = 0; o < 2; o++) 412 { 413 size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); 414 char *result = NULL; 415 size_t length = 0; 416 int retval = mem_iconveha (input, strlen (input), 417 "UTF-8", "ISO-8859-1", 418 true, handler, 419 offsets, 420 &result, &length); 421 ASSERT (retval == 0); 422 ASSERT (length == strlen (expected)); 423 ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); 424 if (o) 425 { 426 for (i = 0; i < 13; i++) 427 ASSERT (offsets[i] == (i < 11 ? i : (size_t)(-1))); 428 ASSERT (offsets[13] == MAGIC); 429 free (offsets); 430 } 431 free (result); 432 } 433 } 434# endif 435 436 /* ------------------------- Test str_iconveha() ------------------------- */ 437 438 /* Test conversion from ISO-8859-2 to ISO-8859-1 with no errors. */ 439 for (h = 0; h < SIZEOF (handlers); h++) 440 { 441 enum iconv_ilseq_handler handler = handlers[h]; 442 static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; 443 static const char expected[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; 444 char *result = str_iconveha (input, "ISO-8859-2", "ISO-8859-1", false, handler); 445 ASSERT (result != NULL); 446 ASSERT (strcmp (result, expected) == 0); 447 free (result); 448 } 449 450 /* Test conversion from ISO-8859-2 to ISO-8859-1 with EILSEQ. */ 451 for (h = 0; h < SIZEOF (handlers); h++) 452 { 453 enum iconv_ilseq_handler handler = handlers[h]; 454 static const char input[] = "Rafa\263 Maszkowski"; /* Rafa�� Maszkowski */ 455 char *result = str_iconveha (input, "ISO-8859-2", "ISO-8859-1", false, handler); 456 switch (handler) 457 { 458 case iconveh_error: 459 ASSERT (result == NULL && errno == EILSEQ); 460 break; 461 case iconveh_question_mark: 462 { 463 static const char expected[] = "Rafa? Maszkowski"; 464 ASSERT (result != NULL); 465 ASSERT (strcmp (result, expected) == 0); 466 free (result); 467 } 468 break; 469 case iconveh_escape_sequence: 470 { 471 static const char expected[] = "Rafa\\u0142 Maszkowski"; 472 ASSERT (result != NULL); 473 ASSERT (strcmp (result, expected) == 0); 474 free (result); 475 } 476 break; 477 } 478 } 479 480 /* Test conversion from ISO-8859-1 to UTF-8 with no errors. */ 481 for (h = 0; h < SIZEOF (handlers); h++) 482 { 483 enum iconv_ilseq_handler handler = handlers[h]; 484 static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; 485 static const char expected[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237"; 486 char *result = str_iconveha (input, "ISO-8859-1", "UTF-8", false, handler); 487 ASSERT (result != NULL); 488 ASSERT (strcmp (result, expected) == 0); 489 free (result); 490 } 491 492 /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */ 493 for (h = 0; h < SIZEOF (handlers); h++) 494 { 495 enum iconv_ilseq_handler handler = handlers[h]; 496 static const char input[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237"; 497 static const char expected[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; 498 char *result = str_iconveha (input, "UTF-8", "ISO-8859-1", false, handler); 499 ASSERT (result != NULL); 500 ASSERT (strcmp (result, expected) == 0); 501 free (result); 502 } 503 504 /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ. */ 505 for (h = 0; h < SIZEOF (handlers); h++) 506 { 507 enum iconv_ilseq_handler handler = handlers[h]; 508 static const char input[] = "Costs: 27 \342\202\254"; /* EURO SIGN */ 509 char *result = str_iconveha (input, "UTF-8", "ISO-8859-1", false, handler); 510 switch (handler) 511 { 512 case iconveh_error: 513 ASSERT (result == NULL && errno == EILSEQ); 514 break; 515 case iconveh_question_mark: 516 { 517 static const char expected[] = "Costs: 27 ?"; 518 ASSERT (result != NULL); 519 ASSERT (strcmp (result, expected) == 0); 520 free (result); 521 } 522 break; 523 case iconveh_escape_sequence: 524 { 525 static const char expected[] = "Costs: 27 \\u20AC"; 526 ASSERT (result != NULL); 527 ASSERT (strcmp (result, expected) == 0); 528 free (result); 529 } 530 break; 531 } 532 } 533 534 /* Test conversion from UTF-8 to ISO-8859-1 with EINVAL. */ 535 for (h = 0; h < SIZEOF (handlers); h++) 536 { 537 enum iconv_ilseq_handler handler = handlers[h]; 538 static const char input[] = "\342"; 539 char *result = str_iconveha (input, "UTF-8", "ISO-8859-1", false, handler); 540 ASSERT (result != NULL); 541 ASSERT (strcmp (result, "") == 0); 542 free (result); 543 } 544 545 /* autodetect_jp is only supported when iconv() support ISO-2022-JP-2. */ 546# if defined _LIBICONV_VERSION || !(defined _AIX || defined __sgi || defined __hpux || defined __osf__ || defined __sun) 547 /* Test conversions from autodetect_jp to UTF-8. */ 548 for (h = 0; h < SIZEOF (handlers); h++) 549 { 550 enum iconv_ilseq_handler handler = handlers[h]; 551 static const char input[] = "\244\263\244\363\244\313\244\301\244\317"; /* ��������������� in EUC-JP */ 552 static const char expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* ��������������� */ 553 char *result = str_iconveha (input, "autodetect_jp", "UTF-8", false, handler); 554 ASSERT (result != NULL); 555 ASSERT (strcmp (result, expected) == 0); 556 free (result); 557 } 558 for (h = 0; h < SIZEOF (handlers); h++) 559 { 560 enum iconv_ilseq_handler handler = handlers[h]; 561 static const char input[] = "\202\261\202\361\202\311\202\277\202\315"; /* ��������������� in Shift_JIS */ 562 static const char expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* ��������������� */ 563 char *result = str_iconveha (input, "autodetect_jp", "UTF-8", false, handler); 564 ASSERT (result != NULL); 565 ASSERT (strcmp (result, expected) == 0); 566 free (result); 567 } 568 for (h = 0; h < SIZEOF (handlers); h++) 569 { 570 enum iconv_ilseq_handler handler = handlers[h]; 571 static const char input[] = "\033$B$3$s$K$A$O\033(B"; /* ��������������� in ISO-2022-JP-2 */ 572 static const char expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* ��������������� */ 573 char *result = str_iconveha (input, "autodetect_jp", "UTF-8", false, handler); 574 ASSERT (result != NULL); 575 ASSERT (strcmp (result, expected) == 0); 576 free (result); 577 } 578# endif 579 580# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105 581 /* Test conversion from UTF-8 to ISO-8859-1 with transliteration. */ 582 for (h = 0; h < SIZEOF (handlers); h++) 583 { 584 enum iconv_ilseq_handler handler = handlers[h]; 585 static const char input[] = "Costs: 27 \342\202\254"; /* EURO SIGN */ 586 static const char expected[] = "Costs: 27 EUR"; 587 char *result = str_iconveha (input, "UTF-8", "ISO-8859-1", true, handler); 588 ASSERT (result != NULL); 589 ASSERT (strcmp (result, expected) == 0); 590 free (result); 591 } 592# endif 593 594#endif 595 596 return 0; 597} 598