1/* Character set conversion with error handling. 2 Copyright (C) 2001-2010 Free Software Foundation, Inc. 3 Written by Bruno Haible and Simon Josefsson. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU Lesser General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18#include <config.h> 19 20/* Specification. */ 21#include "striconveh.h" 22 23#include <errno.h> 24#include <stdbool.h> 25#include <stdlib.h> 26#include <string.h> 27 28#if HAVE_ICONV 29# include <iconv.h> 30# include "unistr.h" 31#endif 32 33#include "c-strcase.h" 34#include "c-strcaseeq.h" 35 36#ifndef SIZE_MAX 37# define SIZE_MAX ((size_t) -1) 38#endif 39 40 41#if HAVE_ICONV 42 43/* The caller must provide an iconveh_t, not just an iconv_t, because when a 44 conversion error occurs, we may have to determine the Unicode representation 45 of the inconvertible character. */ 46 47int 48iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp) 49{ 50 iconv_t cd; 51 iconv_t cd1; 52 iconv_t cd2; 53 54 /* Avoid glibc-2.1 bug with EUC-KR. */ 55# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION 56 if (c_strcasecmp (from_codeset, "EUC-KR") == 0 57 || c_strcasecmp (to_codeset, "EUC-KR") == 0) 58 { 59 errno = EINVAL; 60 return -1; 61 } 62# endif 63 64 cd = iconv_open (to_codeset, from_codeset); 65 66 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)) 67 cd1 = (iconv_t)(-1); 68 else 69 { 70 cd1 = iconv_open ("UTF-8", from_codeset); 71 if (cd1 == (iconv_t)(-1)) 72 { 73 int saved_errno = errno; 74 if (cd != (iconv_t)(-1)) 75 iconv_close (cdp->cd); 76 errno = saved_errno; 77 return -1; 78 } 79 } 80 81 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0) 82# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105 83 || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0 84# endif 85 ) 86 cd2 = (iconv_t)(-1); 87 else 88 { 89 cd2 = iconv_open (to_codeset, "UTF-8"); 90 if (cd2 == (iconv_t)(-1)) 91 { 92 int saved_errno = errno; 93 if (cd1 != (iconv_t)(-1)) 94 iconv_close (cd1); 95 if (cd != (iconv_t)(-1)) 96 iconv_close (cd); 97 errno = saved_errno; 98 return -1; 99 } 100 } 101 102 cdp->cd = cd; 103 cdp->cd1 = cd1; 104 cdp->cd2 = cd2; 105 return 0; 106} 107 108int 109iconveh_close (const iconveh_t *cd) 110{ 111 if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0) 112 { 113 /* Return -1, but preserve the errno from iconv_close. */ 114 int saved_errno = errno; 115 if (cd->cd1 != (iconv_t)(-1)) 116 iconv_close (cd->cd1); 117 if (cd->cd != (iconv_t)(-1)) 118 iconv_close (cd->cd); 119 errno = saved_errno; 120 return -1; 121 } 122 if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0) 123 { 124 /* Return -1, but preserve the errno from iconv_close. */ 125 int saved_errno = errno; 126 if (cd->cd != (iconv_t)(-1)) 127 iconv_close (cd->cd); 128 errno = saved_errno; 129 return -1; 130 } 131 if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0) 132 return -1; 133 return 0; 134} 135 136/* iconv_carefully is like iconv, except that it stops as soon as it encounters 137 a conversion error, and it returns in *INCREMENTED a boolean telling whether 138 it has incremented the input pointers past the error location. */ 139# if !defined _LIBICONV_VERSION && !defined __GLIBC__ 140/* Irix iconv() inserts a NUL byte if it cannot convert. 141 NetBSD iconv() inserts a question mark if it cannot convert. 142 Only GNU libiconv and GNU libc are known to prefer to fail rather 143 than doing a lossy conversion. */ 144static size_t 145iconv_carefully (iconv_t cd, 146 const char **inbuf, size_t *inbytesleft, 147 char **outbuf, size_t *outbytesleft, 148 bool *incremented) 149{ 150 const char *inptr = *inbuf; 151 const char *inptr_end = inptr + *inbytesleft; 152 char *outptr = *outbuf; 153 size_t outsize = *outbytesleft; 154 const char *inptr_before; 155 size_t res; 156 157 do 158 { 159 size_t insize; 160 161 inptr_before = inptr; 162 res = (size_t)(-1); 163 164 for (insize = 1; inptr + insize <= inptr_end; insize++) 165 { 166 res = iconv (cd, 167 (ICONV_CONST char **) &inptr, &insize, 168 &outptr, &outsize); 169 if (!(res == (size_t)(-1) && errno == EINVAL)) 170 break; 171 /* iconv can eat up a shift sequence but give EINVAL while attempting 172 to convert the first character. E.g. libiconv does this. */ 173 if (inptr > inptr_before) 174 { 175 res = 0; 176 break; 177 } 178 } 179 180 if (res == 0) 181 { 182 *outbuf = outptr; 183 *outbytesleft = outsize; 184 } 185 } 186 while (res == 0 && inptr < inptr_end); 187 188 *inbuf = inptr; 189 *inbytesleft = inptr_end - inptr; 190 if (res != (size_t)(-1) && res > 0) 191 { 192 /* iconv() has already incremented INPTR. We cannot go back to a 193 previous INPTR, otherwise the state inside CD would become invalid, 194 if FROM_CODESET is a stateful encoding. So, tell the caller that 195 *INBUF has already been incremented. */ 196 *incremented = (inptr > inptr_before); 197 errno = EILSEQ; 198 return (size_t)(-1); 199 } 200 else 201 { 202 *incremented = false; 203 return res; 204 } 205} 206# else 207# define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \ 208 (*(incremented) = false, \ 209 iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft)) 210# endif 211 212/* iconv_carefully_1 is like iconv_carefully, except that it stops after 213 converting one character or one shift sequence. */ 214static size_t 215iconv_carefully_1 (iconv_t cd, 216 const char **inbuf, size_t *inbytesleft, 217 char **outbuf, size_t *outbytesleft, 218 bool *incremented) 219{ 220 const char *inptr_before = *inbuf; 221 const char *inptr = inptr_before; 222 const char *inptr_end = inptr_before + *inbytesleft; 223 char *outptr = *outbuf; 224 size_t outsize = *outbytesleft; 225 size_t res = (size_t)(-1); 226 size_t insize; 227 228 for (insize = 1; inptr_before + insize <= inptr_end; insize++) 229 { 230 inptr = inptr_before; 231 res = iconv (cd, 232 (ICONV_CONST char **) &inptr, &insize, 233 &outptr, &outsize); 234 if (!(res == (size_t)(-1) && errno == EINVAL)) 235 break; 236 /* iconv can eat up a shift sequence but give EINVAL while attempting 237 to convert the first character. E.g. libiconv does this. */ 238 if (inptr > inptr_before) 239 { 240 res = 0; 241 break; 242 } 243 } 244 245 *inbuf = inptr; 246 *inbytesleft = inptr_end - inptr; 247# if !defined _LIBICONV_VERSION && !defined __GLIBC__ 248 /* Irix iconv() inserts a NUL byte if it cannot convert. 249 NetBSD iconv() inserts a question mark if it cannot convert. 250 Only GNU libiconv and GNU libc are known to prefer to fail rather 251 than doing a lossy conversion. */ 252 if (res != (size_t)(-1) && res > 0) 253 { 254 /* iconv() has already incremented INPTR. We cannot go back to a 255 previous INPTR, otherwise the state inside CD would become invalid, 256 if FROM_CODESET is a stateful encoding. So, tell the caller that 257 *INBUF has already been incremented. */ 258 *incremented = (inptr > inptr_before); 259 errno = EILSEQ; 260 return (size_t)(-1); 261 } 262# endif 263 264 if (res != (size_t)(-1)) 265 { 266 *outbuf = outptr; 267 *outbytesleft = outsize; 268 } 269 *incremented = false; 270 return res; 271} 272 273/* utf8conv_carefully is like iconv, except that 274 - it converts from UTF-8 to UTF-8, 275 - it stops as soon as it encounters a conversion error, and it returns 276 in *INCREMENTED a boolean telling whether it has incremented the input 277 pointers past the error location, 278 - if one_character_only is true, it stops after converting one 279 character. */ 280static size_t 281utf8conv_carefully (bool one_character_only, 282 const char **inbuf, size_t *inbytesleft, 283 char **outbuf, size_t *outbytesleft, 284 bool *incremented) 285{ 286 const char *inptr = *inbuf; 287 size_t insize = *inbytesleft; 288 char *outptr = *outbuf; 289 size_t outsize = *outbytesleft; 290 size_t res; 291 292 res = 0; 293 do 294 { 295 ucs4_t uc; 296 int n; 297 int m; 298 299 n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize); 300 if (n < 0) 301 { 302 errno = (n == -2 ? EINVAL : EILSEQ); 303 n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize); 304 inptr += n; 305 insize -= n; 306 res = (size_t)(-1); 307 *incremented = true; 308 break; 309 } 310 if (outsize == 0) 311 { 312 errno = E2BIG; 313 res = (size_t)(-1); 314 *incremented = false; 315 break; 316 } 317 m = u8_uctomb ((uint8_t *) outptr, uc, outsize); 318 if (m == -2) 319 { 320 errno = E2BIG; 321 res = (size_t)(-1); 322 *incremented = false; 323 break; 324 } 325 inptr += n; 326 insize -= n; 327 if (m == -1) 328 { 329 errno = EILSEQ; 330 res = (size_t)(-1); 331 *incremented = true; 332 break; 333 } 334 outptr += m; 335 outsize -= m; 336 } 337 while (!one_character_only && insize > 0); 338 339 *inbuf = inptr; 340 *inbytesleft = insize; 341 *outbuf = outptr; 342 *outbytesleft = outsize; 343 return res; 344} 345 346static int 347mem_cd_iconveh_internal (const char *src, size_t srclen, 348 iconv_t cd, iconv_t cd1, iconv_t cd2, 349 enum iconv_ilseq_handler handler, 350 size_t extra_alloc, 351 size_t *offsets, 352 char **resultp, size_t *lengthp) 353{ 354 /* When a conversion error occurs, we cannot start using CD1 and CD2 at 355 this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR. 356 Instead, we have to start afresh from the beginning of SRC. */ 357 /* Use a temporary buffer, so that for small strings, a single malloc() 358 call will be sufficient. */ 359# define tmpbufsize 4096 360 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or 361 libiconv's UCS-4-INTERNAL encoding. */ 362 union { unsigned int align; char buf[tmpbufsize]; } tmp; 363# define tmpbuf tmp.buf 364 365 char *initial_result; 366 char *result; 367 size_t allocated; 368 size_t length; 369 size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */ 370 371 if (*resultp != NULL && *lengthp >= sizeof (tmpbuf)) 372 { 373 initial_result = *resultp; 374 allocated = *lengthp; 375 } 376 else 377 { 378 initial_result = tmpbuf; 379 allocated = sizeof (tmpbuf); 380 } 381 result = initial_result; 382 383 /* Test whether a direct conversion is possible at all. */ 384 if (cd == (iconv_t)(-1)) 385 goto indirectly; 386 387 if (offsets != NULL) 388 { 389 size_t i; 390 391 for (i = 0; i < srclen; i++) 392 offsets[i] = (size_t)(-1); 393 394 last_length = (size_t)(-1); 395 } 396 length = 0; 397 398 /* First, try a direct conversion, and see whether a conversion error 399 occurs at all. */ 400 { 401 const char *inptr = src; 402 size_t insize = srclen; 403 404 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */ 405# if defined _LIBICONV_VERSION \ 406 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) 407 /* Set to the initial state. */ 408 iconv (cd, NULL, NULL, NULL, NULL); 409# endif 410 411 while (insize > 0) 412 { 413 char *outptr = result + length; 414 size_t outsize = allocated - extra_alloc - length; 415 bool incremented; 416 size_t res; 417 bool grow; 418 419 if (offsets != NULL) 420 { 421 if (length != last_length) /* ensure that offset[] be increasing */ 422 { 423 offsets[inptr - src] = length; 424 last_length = length; 425 } 426 res = iconv_carefully_1 (cd, 427 &inptr, &insize, 428 &outptr, &outsize, 429 &incremented); 430 } 431 else 432 /* Use iconv_carefully instead of iconv here, because: 433 - If TO_CODESET is UTF-8, we can do the error handling in this 434 loop, no need for a second loop, 435 - With iconv() implementations other than GNU libiconv and GNU 436 libc, if we use iconv() in a big swoop, checking for an E2BIG 437 return, we lose the number of irreversible conversions. */ 438 res = iconv_carefully (cd, 439 &inptr, &insize, 440 &outptr, &outsize, 441 &incremented); 442 443 length = outptr - result; 444 grow = (length + extra_alloc > allocated / 2); 445 if (res == (size_t)(-1)) 446 { 447 if (errno == E2BIG) 448 grow = true; 449 else if (errno == EINVAL) 450 break; 451 else if (errno == EILSEQ && handler != iconveh_error) 452 { 453 if (cd2 == (iconv_t)(-1)) 454 { 455 /* TO_CODESET is UTF-8. */ 456 /* Error handling can produce up to 1 byte of output. */ 457 if (length + 1 + extra_alloc > allocated) 458 { 459 char *memory; 460 461 allocated = 2 * allocated; 462 if (length + 1 + extra_alloc > allocated) 463 abort (); 464 if (result == initial_result) 465 memory = (char *) malloc (allocated); 466 else 467 memory = (char *) realloc (result, allocated); 468 if (memory == NULL) 469 { 470 if (result != initial_result) 471 free (result); 472 errno = ENOMEM; 473 return -1; 474 } 475 if (result == initial_result) 476 memcpy (memory, initial_result, length); 477 result = memory; 478 grow = false; 479 } 480 /* The input is invalid in FROM_CODESET. Eat up one byte 481 and emit a question mark. */ 482 if (!incremented) 483 { 484 if (insize == 0) 485 abort (); 486 inptr++; 487 insize--; 488 } 489 result[length] = '?'; 490 length++; 491 } 492 else 493 goto indirectly; 494 } 495 else 496 { 497 if (result != initial_result) 498 { 499 int saved_errno = errno; 500 free (result); 501 errno = saved_errno; 502 } 503 return -1; 504 } 505 } 506 if (insize == 0) 507 break; 508 if (grow) 509 { 510 char *memory; 511 512 allocated = 2 * allocated; 513 if (result == initial_result) 514 memory = (char *) malloc (allocated); 515 else 516 memory = (char *) realloc (result, allocated); 517 if (memory == NULL) 518 { 519 if (result != initial_result) 520 free (result); 521 errno = ENOMEM; 522 return -1; 523 } 524 if (result == initial_result) 525 memcpy (memory, initial_result, length); 526 result = memory; 527 } 528 } 529 } 530 531 /* Now get the conversion state back to the initial state. 532 But avoid glibc-2.1 bug and Solaris 2.7 bug. */ 533#if defined _LIBICONV_VERSION \ 534 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun) 535 for (;;) 536 { 537 char *outptr = result + length; 538 size_t outsize = allocated - extra_alloc - length; 539 size_t res; 540 541 res = iconv (cd, NULL, NULL, &outptr, &outsize); 542 length = outptr - result; 543 if (res == (size_t)(-1)) 544 { 545 if (errno == E2BIG) 546 { 547 char *memory; 548 549 allocated = 2 * allocated; 550 if (result == initial_result) 551 memory = (char *) malloc (allocated); 552 else 553 memory = (char *) realloc (result, allocated); 554 if (memory == NULL) 555 { 556 if (result != initial_result) 557 free (result); 558 errno = ENOMEM; 559 return -1; 560 } 561 if (result == initial_result) 562 memcpy (memory, initial_result, length); 563 result = memory; 564 } 565 else 566 { 567 if (result != initial_result) 568 { 569 int saved_errno = errno; 570 free (result); 571 errno = saved_errno; 572 } 573 return -1; 574 } 575 } 576 else 577 break; 578 } 579#endif 580 581 /* The direct conversion succeeded. */ 582 goto done; 583 584 indirectly: 585 /* The direct conversion failed. 586 Use a conversion through UTF-8. */ 587 if (offsets != NULL) 588 { 589 size_t i; 590 591 for (i = 0; i < srclen; i++) 592 offsets[i] = (size_t)(-1); 593 594 last_length = (size_t)(-1); 595 } 596 length = 0; 597 { 598 const bool slowly = (offsets != NULL || handler == iconveh_error); 599# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */ 600 char utf8buf[utf8bufsize + 1]; 601 size_t utf8len = 0; 602 const char *in1ptr = src; 603 size_t in1size = srclen; 604 bool do_final_flush1 = true; 605 bool do_final_flush2 = true; 606 607 /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */ 608# if defined _LIBICONV_VERSION \ 609 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) 610 /* Set to the initial state. */ 611 if (cd1 != (iconv_t)(-1)) 612 iconv (cd1, NULL, NULL, NULL, NULL); 613 if (cd2 != (iconv_t)(-1)) 614 iconv (cd2, NULL, NULL, NULL, NULL); 615# endif 616 617 while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2) 618 { 619 char *out1ptr = utf8buf + utf8len; 620 size_t out1size = utf8bufsize - utf8len; 621 bool incremented1; 622 size_t res1; 623 int errno1; 624 625 /* Conversion step 1: from FROM_CODESET to UTF-8. */ 626 if (in1size > 0) 627 { 628 if (offsets != NULL 629 && length != last_length) /* ensure that offset[] be increasing */ 630 { 631 offsets[in1ptr - src] = length; 632 last_length = length; 633 } 634 if (cd1 != (iconv_t)(-1)) 635 { 636 if (slowly) 637 res1 = iconv_carefully_1 (cd1, 638 &in1ptr, &in1size, 639 &out1ptr, &out1size, 640 &incremented1); 641 else 642 res1 = iconv_carefully (cd1, 643 &in1ptr, &in1size, 644 &out1ptr, &out1size, 645 &incremented1); 646 } 647 else 648 { 649 /* FROM_CODESET is UTF-8. */ 650 res1 = utf8conv_carefully (slowly, 651 &in1ptr, &in1size, 652 &out1ptr, &out1size, 653 &incremented1); 654 } 655 } 656 else if (do_final_flush1) 657 { 658 /* Now get the conversion state of CD1 back to the initial state. 659 But avoid glibc-2.1 bug and Solaris 2.7 bug. */ 660# if defined _LIBICONV_VERSION \ 661 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun) 662 if (cd1 != (iconv_t)(-1)) 663 res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size); 664 else 665# endif 666 res1 = 0; 667 do_final_flush1 = false; 668 incremented1 = true; 669 } 670 else 671 { 672 res1 = 0; 673 incremented1 = true; 674 } 675 if (res1 == (size_t)(-1) 676 && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ)) 677 { 678 if (result != initial_result) 679 { 680 int saved_errno = errno; 681 free (result); 682 errno = saved_errno; 683 } 684 return -1; 685 } 686 if (res1 == (size_t)(-1) 687 && errno == EILSEQ && handler != iconveh_error) 688 { 689 /* The input is invalid in FROM_CODESET. Eat up one byte and 690 emit a question mark. Room for the question mark was allocated 691 at the end of utf8buf. */ 692 if (!incremented1) 693 { 694 if (in1size == 0) 695 abort (); 696 in1ptr++; 697 in1size--; 698 } 699 *out1ptr++ = '?'; 700 res1 = 0; 701 } 702 errno1 = errno; 703 utf8len = out1ptr - utf8buf; 704 705 if (offsets != NULL 706 || in1size == 0 707 || utf8len > utf8bufsize / 2 708 || (res1 == (size_t)(-1) && errno1 == E2BIG)) 709 { 710 /* Conversion step 2: from UTF-8 to TO_CODESET. */ 711 const char *in2ptr = utf8buf; 712 size_t in2size = utf8len; 713 714 while (in2size > 0 715 || (in1size == 0 && !do_final_flush1 && do_final_flush2)) 716 { 717 char *out2ptr = result + length; 718 size_t out2size = allocated - extra_alloc - length; 719 bool incremented2; 720 size_t res2; 721 bool grow; 722 723 if (in2size > 0) 724 { 725 if (cd2 != (iconv_t)(-1)) 726 res2 = iconv_carefully (cd2, 727 &in2ptr, &in2size, 728 &out2ptr, &out2size, 729 &incremented2); 730 else 731 /* TO_CODESET is UTF-8. */ 732 res2 = utf8conv_carefully (false, 733 &in2ptr, &in2size, 734 &out2ptr, &out2size, 735 &incremented2); 736 } 737 else /* in1size == 0 && !do_final_flush1 738 && in2size == 0 && do_final_flush2 */ 739 { 740 /* Now get the conversion state of CD1 back to the initial 741 state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */ 742# if defined _LIBICONV_VERSION \ 743 || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun) 744 if (cd2 != (iconv_t)(-1)) 745 res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size); 746 else 747# endif 748 res2 = 0; 749 do_final_flush2 = false; 750 incremented2 = true; 751 } 752 753 length = out2ptr - result; 754 grow = (length + extra_alloc > allocated / 2); 755 if (res2 == (size_t)(-1)) 756 { 757 if (errno == E2BIG) 758 grow = true; 759 else if (errno == EINVAL) 760 break; 761 else if (errno == EILSEQ && handler != iconveh_error) 762 { 763 /* Error handling can produce up to 10 bytes of ASCII 764 output. But TO_CODESET may be UCS-2, UTF-16 or 765 UCS-4, so use CD2 here as well. */ 766 char scratchbuf[10]; 767 size_t scratchlen; 768 ucs4_t uc; 769 const char *inptr; 770 size_t insize; 771 size_t res; 772 773 if (incremented2) 774 { 775 if (u8_prev (&uc, (const uint8_t *) in2ptr, 776 (const uint8_t *) utf8buf) 777 == NULL) 778 abort (); 779 } 780 else 781 { 782 int n; 783 if (in2size == 0) 784 abort (); 785 n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr, 786 in2size); 787 in2ptr += n; 788 in2size -= n; 789 } 790 791 if (handler == iconveh_escape_sequence) 792 { 793 static char hex[16] = "0123456789ABCDEF"; 794 scratchlen = 0; 795 scratchbuf[scratchlen++] = '\\'; 796 if (uc < 0x10000) 797 scratchbuf[scratchlen++] = 'u'; 798 else 799 { 800 scratchbuf[scratchlen++] = 'U'; 801 scratchbuf[scratchlen++] = hex[(uc>>28) & 15]; 802 scratchbuf[scratchlen++] = hex[(uc>>24) & 15]; 803 scratchbuf[scratchlen++] = hex[(uc>>20) & 15]; 804 scratchbuf[scratchlen++] = hex[(uc>>16) & 15]; 805 } 806 scratchbuf[scratchlen++] = hex[(uc>>12) & 15]; 807 scratchbuf[scratchlen++] = hex[(uc>>8) & 15]; 808 scratchbuf[scratchlen++] = hex[(uc>>4) & 15]; 809 scratchbuf[scratchlen++] = hex[uc & 15]; 810 } 811 else 812 { 813 scratchbuf[0] = '?'; 814 scratchlen = 1; 815 } 816 817 inptr = scratchbuf; 818 insize = scratchlen; 819 if (cd2 != (iconv_t)(-1)) 820 res = iconv (cd2, 821 (ICONV_CONST char **) &inptr, &insize, 822 &out2ptr, &out2size); 823 else 824 { 825 /* TO_CODESET is UTF-8. */ 826 if (out2size >= insize) 827 { 828 memcpy (out2ptr, inptr, insize); 829 out2ptr += insize; 830 out2size -= insize; 831 inptr += insize; 832 insize = 0; 833 res = 0; 834 } 835 else 836 { 837 errno = E2BIG; 838 res = (size_t)(-1); 839 } 840 } 841 length = out2ptr - result; 842 if (res == (size_t)(-1) && errno == E2BIG) 843 { 844 char *memory; 845 846 allocated = 2 * allocated; 847 if (length + 1 + extra_alloc > allocated) 848 abort (); 849 if (result == initial_result) 850 memory = (char *) malloc (allocated); 851 else 852 memory = (char *) realloc (result, allocated); 853 if (memory == NULL) 854 { 855 if (result != initial_result) 856 free (result); 857 errno = ENOMEM; 858 return -1; 859 } 860 if (result == initial_result) 861 memcpy (memory, initial_result, length); 862 result = memory; 863 grow = false; 864 865 out2ptr = result + length; 866 out2size = allocated - extra_alloc - length; 867 if (cd2 != (iconv_t)(-1)) 868 res = iconv (cd2, 869 (ICONV_CONST char **) &inptr, 870 &insize, 871 &out2ptr, &out2size); 872 else 873 { 874 /* TO_CODESET is UTF-8. */ 875 if (!(out2size >= insize)) 876 abort (); 877 memcpy (out2ptr, inptr, insize); 878 out2ptr += insize; 879 out2size -= insize; 880 inptr += insize; 881 insize = 0; 882 res = 0; 883 } 884 length = out2ptr - result; 885 } 886# if !defined _LIBICONV_VERSION && !defined __GLIBC__ 887 /* Irix iconv() inserts a NUL byte if it cannot convert. 888 NetBSD iconv() inserts a question mark if it cannot 889 convert. 890 Only GNU libiconv and GNU libc are known to prefer 891 to fail rather than doing a lossy conversion. */ 892 if (res != (size_t)(-1) && res > 0) 893 { 894 errno = EILSEQ; 895 res = (size_t)(-1); 896 } 897# endif 898 if (res == (size_t)(-1)) 899 { 900 /* Failure converting the ASCII replacement. */ 901 if (result != initial_result) 902 { 903 int saved_errno = errno; 904 free (result); 905 errno = saved_errno; 906 } 907 return -1; 908 } 909 } 910 else 911 { 912 if (result != initial_result) 913 { 914 int saved_errno = errno; 915 free (result); 916 errno = saved_errno; 917 } 918 return -1; 919 } 920 } 921 if (!(in2size > 0 922 || (in1size == 0 && !do_final_flush1 && do_final_flush2))) 923 break; 924 if (grow) 925 { 926 char *memory; 927 928 allocated = 2 * allocated; 929 if (result == initial_result) 930 memory = (char *) malloc (allocated); 931 else 932 memory = (char *) realloc (result, allocated); 933 if (memory == NULL) 934 { 935 if (result != initial_result) 936 free (result); 937 errno = ENOMEM; 938 return -1; 939 } 940 if (result == initial_result) 941 memcpy (memory, initial_result, length); 942 result = memory; 943 } 944 } 945 946 /* Move the remaining bytes to the beginning of utf8buf. */ 947 if (in2size > 0) 948 memmove (utf8buf, in2ptr, in2size); 949 utf8len = in2size; 950 } 951 952 if (res1 == (size_t)(-1)) 953 { 954 if (errno1 == EINVAL) 955 in1size = 0; 956 else if (errno1 == EILSEQ) 957 { 958 if (result != initial_result) 959 free (result); 960 errno = errno1; 961 return -1; 962 } 963 } 964 } 965# undef utf8bufsize 966 } 967 968 done: 969 /* Now the final memory allocation. */ 970 if (result == tmpbuf) 971 { 972 size_t memsize = length + extra_alloc; 973 char *memory; 974 975 memory = (char *) malloc (memsize > 0 ? memsize : 1); 976 if (memory != NULL) 977 { 978 memcpy (memory, tmpbuf, length); 979 result = memory; 980 } 981 else 982 { 983 errno = ENOMEM; 984 return -1; 985 } 986 } 987 else if (result != *resultp && length + extra_alloc < allocated) 988 { 989 /* Shrink the allocated memory if possible. */ 990 size_t memsize = length + extra_alloc; 991 char *memory; 992 993 memory = (char *) realloc (result, memsize > 0 ? memsize : 1); 994 if (memory != NULL) 995 result = memory; 996 } 997 *resultp = result; 998 *lengthp = length; 999 return 0; 1000# undef tmpbuf 1001# undef tmpbufsize 1002} 1003 1004int 1005mem_cd_iconveh (const char *src, size_t srclen, 1006 const iconveh_t *cd, 1007 enum iconv_ilseq_handler handler, 1008 size_t *offsets, 1009 char **resultp, size_t *lengthp) 1010{ 1011 return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2, 1012 handler, 0, offsets, resultp, lengthp); 1013} 1014 1015char * 1016str_cd_iconveh (const char *src, 1017 const iconveh_t *cd, 1018 enum iconv_ilseq_handler handler) 1019{ 1020 /* For most encodings, a trailing NUL byte in the input will be converted 1021 to a trailing NUL byte in the output. But not for UTF-7. So that this 1022 function is usable for UTF-7, we have to exclude the NUL byte from the 1023 conversion and add it by hand afterwards. */ 1024 char *result = NULL; 1025 size_t length = 0; 1026 int retval = mem_cd_iconveh_internal (src, strlen (src), 1027 cd->cd, cd->cd1, cd->cd2, handler, 1, 1028 NULL, &result, &length); 1029 1030 if (retval < 0) 1031 { 1032 if (result != NULL) 1033 { 1034 int saved_errno = errno; 1035 free (result); 1036 errno = saved_errno; 1037 } 1038 return NULL; 1039 } 1040 1041 /* Add the terminating NUL byte. */ 1042 result[length] = '\0'; 1043 1044 return result; 1045} 1046 1047#endif 1048 1049int 1050mem_iconveh (const char *src, size_t srclen, 1051 const char *from_codeset, const char *to_codeset, 1052 enum iconv_ilseq_handler handler, 1053 size_t *offsets, 1054 char **resultp, size_t *lengthp) 1055{ 1056 if (srclen == 0) 1057 { 1058 /* Nothing to convert. */ 1059 *lengthp = 0; 1060 return 0; 1061 } 1062 else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0) 1063 { 1064 char *result; 1065 1066 if (*resultp != NULL && *lengthp >= srclen) 1067 result = *resultp; 1068 else 1069 { 1070 result = (char *) malloc (srclen); 1071 if (result == NULL) 1072 { 1073 errno = ENOMEM; 1074 return -1; 1075 } 1076 } 1077 memcpy (result, src, srclen); 1078 *resultp = result; 1079 *lengthp = srclen; 1080 return 0; 1081 } 1082 else 1083 { 1084#if HAVE_ICONV 1085 iconveh_t cd; 1086 char *result; 1087 size_t length; 1088 int retval; 1089 1090 if (iconveh_open (to_codeset, from_codeset, &cd) < 0) 1091 return -1; 1092 1093 result = *resultp; 1094 length = *lengthp; 1095 retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets, 1096 &result, &length); 1097 1098 if (retval < 0) 1099 { 1100 /* Close cd, but preserve the errno from str_cd_iconv. */ 1101 int saved_errno = errno; 1102 iconveh_close (&cd); 1103 errno = saved_errno; 1104 } 1105 else 1106 { 1107 if (iconveh_close (&cd) < 0) 1108 { 1109 /* Return -1, but free the allocated memory, and while doing 1110 that, preserve the errno from iconveh_close. */ 1111 int saved_errno = errno; 1112 if (result != *resultp && result != NULL) 1113 free (result); 1114 errno = saved_errno; 1115 return -1; 1116 } 1117 *resultp = result; 1118 *lengthp = length; 1119 } 1120 return retval; 1121#else 1122 /* This is a different error code than if iconv_open existed but didn't 1123 support from_codeset and to_codeset, so that the caller can emit 1124 an error message such as 1125 "iconv() is not supported. Installing GNU libiconv and 1126 then reinstalling this package would fix this." */ 1127 errno = ENOSYS; 1128 return -1; 1129#endif 1130 } 1131} 1132 1133char * 1134str_iconveh (const char *src, 1135 const char *from_codeset, const char *to_codeset, 1136 enum iconv_ilseq_handler handler) 1137{ 1138 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0) 1139 { 1140 char *result = strdup (src); 1141 1142 if (result == NULL) 1143 errno = ENOMEM; 1144 return result; 1145 } 1146 else 1147 { 1148#if HAVE_ICONV 1149 iconveh_t cd; 1150 char *result; 1151 1152 if (iconveh_open (to_codeset, from_codeset, &cd) < 0) 1153 return NULL; 1154 1155 result = str_cd_iconveh (src, &cd, handler); 1156 1157 if (result == NULL) 1158 { 1159 /* Close cd, but preserve the errno from str_cd_iconv. */ 1160 int saved_errno = errno; 1161 iconveh_close (&cd); 1162 errno = saved_errno; 1163 } 1164 else 1165 { 1166 if (iconveh_close (&cd) < 0) 1167 { 1168 /* Return NULL, but free the allocated memory, and while doing 1169 that, preserve the errno from iconveh_close. */ 1170 int saved_errno = errno; 1171 free (result); 1172 errno = saved_errno; 1173 return NULL; 1174 } 1175 } 1176 return result; 1177#else 1178 /* This is a different error code than if iconv_open existed but didn't 1179 support from_codeset and to_codeset, so that the caller can emit 1180 an error message such as 1181 "iconv() is not supported. Installing GNU libiconv and 1182 then reinstalling this package would fix this." */ 1183 errno = ENOSYS; 1184 return NULL; 1185#endif 1186 } 1187} 1188