1/* Character set conversion with error handling.
2   Copyright (C) 2001-2010 Free Software Foundation, Inc.
3   Written by Bruno Haible and Simon Josefsson.
4
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU Lesser General Public License as published by
7   the Free Software Foundation; either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18#include <config.h>
19
20/* Specification.  */
21#include "striconveh.h"
22
23#include <errno.h>
24#include <stdbool.h>
25#include <stdlib.h>
26#include <string.h>
27
28#if HAVE_ICONV
29# include <iconv.h>
30# include "unistr.h"
31#endif
32
33#include "c-strcase.h"
34#include "c-strcaseeq.h"
35
36#ifndef SIZE_MAX
37# define SIZE_MAX ((size_t) -1)
38#endif
39
40
41#if HAVE_ICONV
42
43/* The caller must provide an iconveh_t, not just an iconv_t, because when a
44   conversion error occurs, we may have to determine the Unicode representation
45   of the inconvertible character.  */
46
47int
48iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
49{
50  iconv_t cd;
51  iconv_t cd1;
52  iconv_t cd2;
53
54  /* Avoid glibc-2.1 bug with EUC-KR.  */
55# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
56  if (c_strcasecmp (from_codeset, "EUC-KR") == 0
57      || c_strcasecmp (to_codeset, "EUC-KR") == 0)
58    {
59      errno = EINVAL;
60      return -1;
61    }
62# endif
63
64  cd = iconv_open (to_codeset, from_codeset);
65
66  if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
67    cd1 = (iconv_t)(-1);
68  else
69    {
70      cd1 = iconv_open ("UTF-8", from_codeset);
71      if (cd1 == (iconv_t)(-1))
72        {
73          int saved_errno = errno;
74          if (cd != (iconv_t)(-1))
75            iconv_close (cdp->cd);
76          errno = saved_errno;
77          return -1;
78        }
79    }
80
81  if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
82# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
83      || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
84# endif
85     )
86    cd2 = (iconv_t)(-1);
87  else
88    {
89      cd2 = iconv_open (to_codeset, "UTF-8");
90      if (cd2 == (iconv_t)(-1))
91        {
92          int saved_errno = errno;
93          if (cd1 != (iconv_t)(-1))
94            iconv_close (cd1);
95          if (cd != (iconv_t)(-1))
96            iconv_close (cd);
97          errno = saved_errno;
98          return -1;
99        }
100    }
101
102  cdp->cd = cd;
103  cdp->cd1 = cd1;
104  cdp->cd2 = cd2;
105  return 0;
106}
107
108int
109iconveh_close (const iconveh_t *cd)
110{
111  if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
112    {
113      /* Return -1, but preserve the errno from iconv_close.  */
114      int saved_errno = errno;
115      if (cd->cd1 != (iconv_t)(-1))
116        iconv_close (cd->cd1);
117      if (cd->cd != (iconv_t)(-1))
118        iconv_close (cd->cd);
119      errno = saved_errno;
120      return -1;
121    }
122  if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
123    {
124      /* Return -1, but preserve the errno from iconv_close.  */
125      int saved_errno = errno;
126      if (cd->cd != (iconv_t)(-1))
127        iconv_close (cd->cd);
128      errno = saved_errno;
129      return -1;
130    }
131  if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
132    return -1;
133  return 0;
134}
135
/* iconv_carefully behaves like iconv, with two differences: it stops at the
   first conversion error, and it stores in *INCREMENTED whether the input
   pointer has already been moved past the location of that error.  */
# if !defined _LIBICONV_VERSION && !defined __GLIBC__
/* Irix iconv() inserts a NUL byte if it cannot convert.
   NetBSD iconv() inserts a question mark if it cannot convert.
   Only GNU libiconv and GNU libc are known to prefer to fail rather
   than doing a lossy conversion.
   Therefore feed iconv() one growing chunk of input at a time and inspect
   its return value after every call.  */
static size_t
iconv_carefully (iconv_t cd,
                 const char **inbuf, size_t *inbytesleft,
                 char **outbuf, size_t *outbytesleft,
                 bool *incremented)
{
  const char *src = *inbuf;
  const char *const src_end = src + *inbytesleft;
  char *dst = *outbuf;
  size_t dst_room = *outbytesleft;
  const char *chunk_start;
  size_t rc;

  for (;;)
    {
      size_t chunk = 1;

      chunk_start = src;
      rc = (size_t)(-1);

      /* Offer 1, 2, 3, ... bytes until iconv() reports something other than
         an incomplete input sequence.  */
      while (src + chunk <= src_end)
        {
          rc = iconv (cd,
                      (ICONV_CONST char **) &src, &chunk,
                      &dst, &dst_room);
          if (rc != (size_t)(-1) || errno != EINVAL)
            break;
          /* iconv can eat up a shift sequence but give EINVAL while attempting
             to convert the first character.  E.g. libiconv does this.  */
          if (src > chunk_start)
            {
              rc = 0;
              break;
            }
          chunk++;
        }

      if (rc != 0)
        break;

      /* A clean step: commit the output position to the caller.  */
      *outbuf = dst;
      *outbytesleft = dst_room;

      if (src >= src_end)
        break;
    }

  *inbuf = src;
  *inbytesleft = src_end - src;

  if (rc == (size_t)(-1) || rc == 0)
    {
      *incremented = false;
      return rc;
    }

  /* iconv() reported irreversible conversions and has already advanced the
     input pointer.  Going back to an earlier input position is not possible,
     because the state inside CD would become invalid if FROM_CODESET is a
     stateful encoding.  So report EILSEQ and tell the caller whether *INBUF
     has already been moved.  */
  *incremented = (src > chunk_start);
  errno = EILSEQ;
  return (size_t)(-1);
}
# else
/* With GNU libiconv and GNU libc, a plain iconv() call is used and
   *INCREMENTED is always set to false.  */
#  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
     (*(incremented) = false, \
      iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
# endif
211
212/* iconv_carefully_1 is like iconv_carefully, except that it stops after
213   converting one character or one shift sequence.  */
214static size_t
215iconv_carefully_1 (iconv_t cd,
216                   const char **inbuf, size_t *inbytesleft,
217                   char **outbuf, size_t *outbytesleft,
218                   bool *incremented)
219{
220  const char *inptr_before = *inbuf;
221  const char *inptr = inptr_before;
222  const char *inptr_end = inptr_before + *inbytesleft;
223  char *outptr = *outbuf;
224  size_t outsize = *outbytesleft;
225  size_t res = (size_t)(-1);
226  size_t insize;
227
228  for (insize = 1; inptr_before + insize <= inptr_end; insize++)
229    {
230      inptr = inptr_before;
231      res = iconv (cd,
232                   (ICONV_CONST char **) &inptr, &insize,
233                   &outptr, &outsize);
234      if (!(res == (size_t)(-1) && errno == EINVAL))
235        break;
236      /* iconv can eat up a shift sequence but give EINVAL while attempting
237         to convert the first character.  E.g. libiconv does this.  */
238      if (inptr > inptr_before)
239        {
240          res = 0;
241          break;
242        }
243    }
244
245  *inbuf = inptr;
246  *inbytesleft = inptr_end - inptr;
247# if !defined _LIBICONV_VERSION && !defined __GLIBC__
248  /* Irix iconv() inserts a NUL byte if it cannot convert.
249     NetBSD iconv() inserts a question mark if it cannot convert.
250     Only GNU libiconv and GNU libc are known to prefer to fail rather
251     than doing a lossy conversion.  */
252  if (res != (size_t)(-1) && res > 0)
253    {
254      /* iconv() has already incremented INPTR.  We cannot go back to a
255         previous INPTR, otherwise the state inside CD would become invalid,
256         if FROM_CODESET is a stateful encoding.  So, tell the caller that
257         *INBUF has already been incremented.  */
258      *incremented = (inptr > inptr_before);
259      errno = EILSEQ;
260      return (size_t)(-1);
261    }
262# endif
263
264  if (res != (size_t)(-1))
265    {
266      *outbuf = outptr;
267      *outbytesleft = outsize;
268    }
269  *incremented = false;
270  return res;
271}
272
273/* utf8conv_carefully is like iconv, except that
274     - it converts from UTF-8 to UTF-8,
275     - it stops as soon as it encounters a conversion error, and it returns
276       in *INCREMENTED a boolean telling whether it has incremented the input
277       pointers past the error location,
278     - if one_character_only is true, it stops after converting one
279       character.  */
280static size_t
281utf8conv_carefully (bool one_character_only,
282                    const char **inbuf, size_t *inbytesleft,
283                    char **outbuf, size_t *outbytesleft,
284                    bool *incremented)
285{
286  const char *inptr = *inbuf;
287  size_t insize = *inbytesleft;
288  char *outptr = *outbuf;
289  size_t outsize = *outbytesleft;
290  size_t res;
291
292  res = 0;
293  do
294    {
295      ucs4_t uc;
296      int n;
297      int m;
298
299      n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
300      if (n < 0)
301        {
302          errno = (n == -2 ? EINVAL : EILSEQ);
303          n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
304          inptr += n;
305          insize -= n;
306          res = (size_t)(-1);
307          *incremented = true;
308          break;
309        }
310      if (outsize == 0)
311        {
312          errno = E2BIG;
313          res = (size_t)(-1);
314          *incremented = false;
315          break;
316        }
317      m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
318      if (m == -2)
319        {
320          errno = E2BIG;
321          res = (size_t)(-1);
322          *incremented = false;
323          break;
324        }
325      inptr += n;
326      insize -= n;
327      if (m == -1)
328        {
329          errno = EILSEQ;
330          res = (size_t)(-1);
331          *incremented = true;
332          break;
333        }
334      outptr += m;
335      outsize -= m;
336    }
337  while (!one_character_only && insize > 0);
338
339  *inbuf = inptr;
340  *inbytesleft = insize;
341  *outbuf = outptr;
342  *outbytesleft = outsize;
343  return res;
344}
345
346static int
347mem_cd_iconveh_internal (const char *src, size_t srclen,
348                         iconv_t cd, iconv_t cd1, iconv_t cd2,
349                         enum iconv_ilseq_handler handler,
350                         size_t extra_alloc,
351                         size_t *offsets,
352                         char **resultp, size_t *lengthp)
353{
354  /* When a conversion error occurs, we cannot start using CD1 and CD2 at
355     this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
356     Instead, we have to start afresh from the beginning of SRC.  */
357  /* Use a temporary buffer, so that for small strings, a single malloc()
358     call will be sufficient.  */
359# define tmpbufsize 4096
360  /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
361     libiconv's UCS-4-INTERNAL encoding.  */
362  union { unsigned int align; char buf[tmpbufsize]; } tmp;
363# define tmpbuf tmp.buf
364
365  char *initial_result;
366  char *result;
367  size_t allocated;
368  size_t length;
369  size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
370
371  if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
372    {
373      initial_result = *resultp;
374      allocated = *lengthp;
375    }
376  else
377    {
378      initial_result = tmpbuf;
379      allocated = sizeof (tmpbuf);
380    }
381  result = initial_result;
382
383  /* Test whether a direct conversion is possible at all.  */
384  if (cd == (iconv_t)(-1))
385    goto indirectly;
386
387  if (offsets != NULL)
388    {
389      size_t i;
390
391      for (i = 0; i < srclen; i++)
392        offsets[i] = (size_t)(-1);
393
394      last_length = (size_t)(-1);
395    }
396  length = 0;
397
398  /* First, try a direct conversion, and see whether a conversion error
399     occurs at all.  */
400  {
401    const char *inptr = src;
402    size_t insize = srclen;
403
404    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
405# if defined _LIBICONV_VERSION \
406     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
407    /* Set to the initial state.  */
408    iconv (cd, NULL, NULL, NULL, NULL);
409# endif
410
411    while (insize > 0)
412      {
413        char *outptr = result + length;
414        size_t outsize = allocated - extra_alloc - length;
415        bool incremented;
416        size_t res;
417        bool grow;
418
419        if (offsets != NULL)
420          {
421            if (length != last_length) /* ensure that offset[] be increasing */
422              {
423                offsets[inptr - src] = length;
424                last_length = length;
425              }
426            res = iconv_carefully_1 (cd,
427                                     &inptr, &insize,
428                                     &outptr, &outsize,
429                                     &incremented);
430          }
431        else
432          /* Use iconv_carefully instead of iconv here, because:
433             - If TO_CODESET is UTF-8, we can do the error handling in this
434               loop, no need for a second loop,
435             - With iconv() implementations other than GNU libiconv and GNU
436               libc, if we use iconv() in a big swoop, checking for an E2BIG
437               return, we lose the number of irreversible conversions.  */
438          res = iconv_carefully (cd,
439                                 &inptr, &insize,
440                                 &outptr, &outsize,
441                                 &incremented);
442
443        length = outptr - result;
444        grow = (length + extra_alloc > allocated / 2);
445        if (res == (size_t)(-1))
446          {
447            if (errno == E2BIG)
448              grow = true;
449            else if (errno == EINVAL)
450              break;
451            else if (errno == EILSEQ && handler != iconveh_error)
452              {
453                if (cd2 == (iconv_t)(-1))
454                  {
455                    /* TO_CODESET is UTF-8.  */
456                    /* Error handling can produce up to 1 byte of output.  */
457                    if (length + 1 + extra_alloc > allocated)
458                      {
459                        char *memory;
460
461                        allocated = 2 * allocated;
462                        if (length + 1 + extra_alloc > allocated)
463                          abort ();
464                        if (result == initial_result)
465                          memory = (char *) malloc (allocated);
466                        else
467                          memory = (char *) realloc (result, allocated);
468                        if (memory == NULL)
469                          {
470                            if (result != initial_result)
471                              free (result);
472                            errno = ENOMEM;
473                            return -1;
474                          }
475                        if (result == initial_result)
476                          memcpy (memory, initial_result, length);
477                        result = memory;
478                        grow = false;
479                      }
480                    /* The input is invalid in FROM_CODESET.  Eat up one byte
481                       and emit a question mark.  */
482                    if (!incremented)
483                      {
484                        if (insize == 0)
485                          abort ();
486                        inptr++;
487                        insize--;
488                      }
489                    result[length] = '?';
490                    length++;
491                  }
492                else
493                  goto indirectly;
494              }
495            else
496              {
497                if (result != initial_result)
498                  {
499                    int saved_errno = errno;
500                    free (result);
501                    errno = saved_errno;
502                  }
503                return -1;
504              }
505          }
506        if (insize == 0)
507          break;
508        if (grow)
509          {
510            char *memory;
511
512            allocated = 2 * allocated;
513            if (result == initial_result)
514              memory = (char *) malloc (allocated);
515            else
516              memory = (char *) realloc (result, allocated);
517            if (memory == NULL)
518              {
519                if (result != initial_result)
520                  free (result);
521                errno = ENOMEM;
522                return -1;
523              }
524            if (result == initial_result)
525              memcpy (memory, initial_result, length);
526            result = memory;
527          }
528      }
529  }
530
531  /* Now get the conversion state back to the initial state.
532     But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
533#if defined _LIBICONV_VERSION \
534    || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
535  for (;;)
536    {
537      char *outptr = result + length;
538      size_t outsize = allocated - extra_alloc - length;
539      size_t res;
540
541      res = iconv (cd, NULL, NULL, &outptr, &outsize);
542      length = outptr - result;
543      if (res == (size_t)(-1))
544        {
545          if (errno == E2BIG)
546            {
547              char *memory;
548
549              allocated = 2 * allocated;
550              if (result == initial_result)
551                memory = (char *) malloc (allocated);
552              else
553                memory = (char *) realloc (result, allocated);
554              if (memory == NULL)
555                {
556                  if (result != initial_result)
557                    free (result);
558                  errno = ENOMEM;
559                  return -1;
560                }
561              if (result == initial_result)
562                memcpy (memory, initial_result, length);
563              result = memory;
564            }
565          else
566            {
567              if (result != initial_result)
568                {
569                  int saved_errno = errno;
570                  free (result);
571                  errno = saved_errno;
572                }
573              return -1;
574            }
575        }
576      else
577        break;
578    }
579#endif
580
581  /* The direct conversion succeeded.  */
582  goto done;
583
584 indirectly:
585  /* The direct conversion failed.
586     Use a conversion through UTF-8.  */
587  if (offsets != NULL)
588    {
589      size_t i;
590
591      for (i = 0; i < srclen; i++)
592        offsets[i] = (size_t)(-1);
593
594      last_length = (size_t)(-1);
595    }
596  length = 0;
597  {
598    const bool slowly = (offsets != NULL || handler == iconveh_error);
599# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
600    char utf8buf[utf8bufsize + 1];
601    size_t utf8len = 0;
602    const char *in1ptr = src;
603    size_t in1size = srclen;
604    bool do_final_flush1 = true;
605    bool do_final_flush2 = true;
606
607    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
608# if defined _LIBICONV_VERSION \
609     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
610    /* Set to the initial state.  */
611    if (cd1 != (iconv_t)(-1))
612      iconv (cd1, NULL, NULL, NULL, NULL);
613    if (cd2 != (iconv_t)(-1))
614      iconv (cd2, NULL, NULL, NULL, NULL);
615# endif
616
617    while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
618      {
619        char *out1ptr = utf8buf + utf8len;
620        size_t out1size = utf8bufsize - utf8len;
621        bool incremented1;
622        size_t res1;
623        int errno1;
624
625        /* Conversion step 1: from FROM_CODESET to UTF-8.  */
626        if (in1size > 0)
627          {
628            if (offsets != NULL
629                && length != last_length) /* ensure that offset[] be increasing */
630              {
631                offsets[in1ptr - src] = length;
632                last_length = length;
633              }
634            if (cd1 != (iconv_t)(-1))
635              {
636                if (slowly)
637                  res1 = iconv_carefully_1 (cd1,
638                                            &in1ptr, &in1size,
639                                            &out1ptr, &out1size,
640                                            &incremented1);
641                else
642                  res1 = iconv_carefully (cd1,
643                                          &in1ptr, &in1size,
644                                          &out1ptr, &out1size,
645                                          &incremented1);
646              }
647            else
648              {
649                /* FROM_CODESET is UTF-8.  */
650                res1 = utf8conv_carefully (slowly,
651                                           &in1ptr, &in1size,
652                                           &out1ptr, &out1size,
653                                           &incremented1);
654              }
655          }
656        else if (do_final_flush1)
657          {
658            /* Now get the conversion state of CD1 back to the initial state.
659               But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
660# if defined _LIBICONV_VERSION \
661     || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
662            if (cd1 != (iconv_t)(-1))
663              res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
664            else
665# endif
666              res1 = 0;
667            do_final_flush1 = false;
668            incremented1 = true;
669          }
670        else
671          {
672            res1 = 0;
673            incremented1 = true;
674          }
675        if (res1 == (size_t)(-1)
676            && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
677          {
678            if (result != initial_result)
679              {
680                int saved_errno = errno;
681                free (result);
682                errno = saved_errno;
683              }
684            return -1;
685          }
686        if (res1 == (size_t)(-1)
687            && errno == EILSEQ && handler != iconveh_error)
688          {
689            /* The input is invalid in FROM_CODESET.  Eat up one byte and
690               emit a question mark.  Room for the question mark was allocated
691               at the end of utf8buf.  */
692            if (!incremented1)
693              {
694                if (in1size == 0)
695                  abort ();
696                in1ptr++;
697                in1size--;
698              }
699            *out1ptr++ = '?';
700            res1 = 0;
701          }
702        errno1 = errno;
703        utf8len = out1ptr - utf8buf;
704
705        if (offsets != NULL
706            || in1size == 0
707            || utf8len > utf8bufsize / 2
708            || (res1 == (size_t)(-1) && errno1 == E2BIG))
709          {
710            /* Conversion step 2: from UTF-8 to TO_CODESET.  */
711            const char *in2ptr = utf8buf;
712            size_t in2size = utf8len;
713
714            while (in2size > 0
715                   || (in1size == 0 && !do_final_flush1 && do_final_flush2))
716              {
717                char *out2ptr = result + length;
718                size_t out2size = allocated - extra_alloc - length;
719                bool incremented2;
720                size_t res2;
721                bool grow;
722
723                if (in2size > 0)
724                  {
725                    if (cd2 != (iconv_t)(-1))
726                      res2 = iconv_carefully (cd2,
727                                              &in2ptr, &in2size,
728                                              &out2ptr, &out2size,
729                                              &incremented2);
730                    else
731                      /* TO_CODESET is UTF-8.  */
732                      res2 = utf8conv_carefully (false,
733                                                 &in2ptr, &in2size,
734                                                 &out2ptr, &out2size,
735                                                 &incremented2);
736                  }
737                else /* in1size == 0 && !do_final_flush1
738                        && in2size == 0 && do_final_flush2 */
739                  {
740                    /* Now get the conversion state of CD1 back to the initial
741                       state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
742# if defined _LIBICONV_VERSION \
743     || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
744                    if (cd2 != (iconv_t)(-1))
745                      res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
746                    else
747# endif
748                      res2 = 0;
749                    do_final_flush2 = false;
750                    incremented2 = true;
751                  }
752
753                length = out2ptr - result;
754                grow = (length + extra_alloc > allocated / 2);
755                if (res2 == (size_t)(-1))
756                  {
757                    if (errno == E2BIG)
758                      grow = true;
759                    else if (errno == EINVAL)
760                      break;
761                    else if (errno == EILSEQ && handler != iconveh_error)
762                      {
763                        /* Error handling can produce up to 10 bytes of ASCII
764                           output.  But TO_CODESET may be UCS-2, UTF-16 or
765                           UCS-4, so use CD2 here as well.  */
766                        char scratchbuf[10];
767                        size_t scratchlen;
768                        ucs4_t uc;
769                        const char *inptr;
770                        size_t insize;
771                        size_t res;
772
773                        if (incremented2)
774                          {
775                            if (u8_prev (&uc, (const uint8_t *) in2ptr,
776                                         (const uint8_t *) utf8buf)
777                                == NULL)
778                              abort ();
779                          }
780                        else
781                          {
782                            int n;
783                            if (in2size == 0)
784                              abort ();
785                            n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
786                                                  in2size);
787                            in2ptr += n;
788                            in2size -= n;
789                          }
790
791                        if (handler == iconveh_escape_sequence)
792                          {
793                            static char hex[16] = "0123456789ABCDEF";
794                            scratchlen = 0;
795                            scratchbuf[scratchlen++] = '\\';
796                            if (uc < 0x10000)
797                              scratchbuf[scratchlen++] = 'u';
798                            else
799                              {
800                                scratchbuf[scratchlen++] = 'U';
801                                scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
802                                scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
803                                scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
804                                scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
805                              }
806                            scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
807                            scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
808                            scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
809                            scratchbuf[scratchlen++] = hex[uc & 15];
810                          }
811                        else
812                          {
813                            scratchbuf[0] = '?';
814                            scratchlen = 1;
815                          }
816
817                        inptr = scratchbuf;
818                        insize = scratchlen;
819                        if (cd2 != (iconv_t)(-1))
820                          res = iconv (cd2,
821                                       (ICONV_CONST char **) &inptr, &insize,
822                                       &out2ptr, &out2size);
823                        else
824                          {
825                            /* TO_CODESET is UTF-8.  */
826                            if (out2size >= insize)
827                              {
828                                memcpy (out2ptr, inptr, insize);
829                                out2ptr += insize;
830                                out2size -= insize;
831                                inptr += insize;
832                                insize = 0;
833                                res = 0;
834                              }
835                            else
836                              {
837                                errno = E2BIG;
838                                res = (size_t)(-1);
839                              }
840                          }
841                        length = out2ptr - result;
842                        if (res == (size_t)(-1) && errno == E2BIG)
843                          {
844                            char *memory;
845
846                            allocated = 2 * allocated;
847                            if (length + 1 + extra_alloc > allocated)
848                              abort ();
849                            if (result == initial_result)
850                              memory = (char *) malloc (allocated);
851                            else
852                              memory = (char *) realloc (result, allocated);
853                            if (memory == NULL)
854                              {
855                                if (result != initial_result)
856                                  free (result);
857                                errno = ENOMEM;
858                                return -1;
859                              }
860                            if (result == initial_result)
861                              memcpy (memory, initial_result, length);
862                            result = memory;
863                            grow = false;
864
865                            out2ptr = result + length;
866                            out2size = allocated - extra_alloc - length;
867                            if (cd2 != (iconv_t)(-1))
868                              res = iconv (cd2,
869                                           (ICONV_CONST char **) &inptr,
870                                           &insize,
871                                           &out2ptr, &out2size);
872                            else
873                              {
874                                /* TO_CODESET is UTF-8.  */
875                                if (!(out2size >= insize))
876                                  abort ();
877                                memcpy (out2ptr, inptr, insize);
878                                out2ptr += insize;
879                                out2size -= insize;
880                                inptr += insize;
881                                insize = 0;
882                                res = 0;
883                              }
884                            length = out2ptr - result;
885                          }
886# if !defined _LIBICONV_VERSION && !defined __GLIBC__
887                        /* Irix iconv() inserts a NUL byte if it cannot convert.
888                           NetBSD iconv() inserts a question mark if it cannot
889                           convert.
890                           Only GNU libiconv and GNU libc are known to prefer
891                           to fail rather than doing a lossy conversion.  */
892                        if (res != (size_t)(-1) && res > 0)
893                          {
894                            errno = EILSEQ;
895                            res = (size_t)(-1);
896                          }
897# endif
898                        if (res == (size_t)(-1))
899                          {
900                            /* Failure converting the ASCII replacement.  */
901                            if (result != initial_result)
902                              {
903                                int saved_errno = errno;
904                                free (result);
905                                errno = saved_errno;
906                              }
907                            return -1;
908                          }
909                      }
910                    else
911                      {
912                        if (result != initial_result)
913                          {
914                            int saved_errno = errno;
915                            free (result);
916                            errno = saved_errno;
917                          }
918                        return -1;
919                      }
920                  }
921                if (!(in2size > 0
922                      || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
923                  break;
924                if (grow)
925                  {
926                    char *memory;
927
928                    allocated = 2 * allocated;
929                    if (result == initial_result)
930                      memory = (char *) malloc (allocated);
931                    else
932                      memory = (char *) realloc (result, allocated);
933                    if (memory == NULL)
934                      {
935                        if (result != initial_result)
936                          free (result);
937                        errno = ENOMEM;
938                        return -1;
939                      }
940                    if (result == initial_result)
941                      memcpy (memory, initial_result, length);
942                    result = memory;
943                  }
944              }
945
946            /* Move the remaining bytes to the beginning of utf8buf.  */
947            if (in2size > 0)
948              memmove (utf8buf, in2ptr, in2size);
949            utf8len = in2size;
950          }
951
952        if (res1 == (size_t)(-1))
953          {
954            if (errno1 == EINVAL)
955              in1size = 0;
956            else if (errno1 == EILSEQ)
957              {
958                if (result != initial_result)
959                  free (result);
960                errno = errno1;
961                return -1;
962              }
963          }
964      }
965# undef utf8bufsize
966  }
967
968 done:
969  /* Now the final memory allocation.  */
970  if (result == tmpbuf)
971    {
972      size_t memsize = length + extra_alloc;
973      char *memory;
974
975      memory = (char *) malloc (memsize > 0 ? memsize : 1);
976      if (memory != NULL)
977        {
978          memcpy (memory, tmpbuf, length);
979          result = memory;
980        }
981      else
982        {
983          errno = ENOMEM;
984          return -1;
985        }
986    }
987  else if (result != *resultp && length + extra_alloc < allocated)
988    {
989      /* Shrink the allocated memory if possible.  */
990      size_t memsize = length + extra_alloc;
991      char *memory;
992
993      memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
994      if (memory != NULL)
995        result = memory;
996    }
997  *resultp = result;
998  *lengthp = length;
999  return 0;
1000# undef tmpbuf
1001# undef tmpbufsize
1002}
1003
1004int
1005mem_cd_iconveh (const char *src, size_t srclen,
1006                const iconveh_t *cd,
1007                enum iconv_ilseq_handler handler,
1008                size_t *offsets,
1009                char **resultp, size_t *lengthp)
1010{
1011  return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1012                                  handler, 0, offsets, resultp, lengthp);
1013}
1014
1015char *
1016str_cd_iconveh (const char *src,
1017                const iconveh_t *cd,
1018                enum iconv_ilseq_handler handler)
1019{
1020  /* For most encodings, a trailing NUL byte in the input will be converted
1021     to a trailing NUL byte in the output.  But not for UTF-7.  So that this
1022     function is usable for UTF-7, we have to exclude the NUL byte from the
1023     conversion and add it by hand afterwards.  */
1024  char *result = NULL;
1025  size_t length = 0;
1026  int retval = mem_cd_iconveh_internal (src, strlen (src),
1027                                        cd->cd, cd->cd1, cd->cd2, handler, 1,
1028                                        NULL, &result, &length);
1029
1030  if (retval < 0)
1031    {
1032      if (result != NULL)
1033        {
1034          int saved_errno = errno;
1035          free (result);
1036          errno = saved_errno;
1037        }
1038      return NULL;
1039    }
1040
1041  /* Add the terminating NUL byte.  */
1042  result[length] = '\0';
1043
1044  return result;
1045}
1046
1047#endif
1048
1049int
1050mem_iconveh (const char *src, size_t srclen,
1051             const char *from_codeset, const char *to_codeset,
1052             enum iconv_ilseq_handler handler,
1053             size_t *offsets,
1054             char **resultp, size_t *lengthp)
1055{
1056  if (srclen == 0)
1057    {
1058      /* Nothing to convert.  */
1059      *lengthp = 0;
1060      return 0;
1061    }
1062  else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1063    {
1064      char *result;
1065
1066      if (*resultp != NULL && *lengthp >= srclen)
1067        result = *resultp;
1068      else
1069        {
1070          result = (char *) malloc (srclen);
1071          if (result == NULL)
1072            {
1073              errno = ENOMEM;
1074              return -1;
1075            }
1076        }
1077      memcpy (result, src, srclen);
1078      *resultp = result;
1079      *lengthp = srclen;
1080      return 0;
1081    }
1082  else
1083    {
1084#if HAVE_ICONV
1085      iconveh_t cd;
1086      char *result;
1087      size_t length;
1088      int retval;
1089
1090      if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1091        return -1;
1092
1093      result = *resultp;
1094      length = *lengthp;
1095      retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1096                               &result, &length);
1097
1098      if (retval < 0)
1099        {
1100          /* Close cd, but preserve the errno from str_cd_iconv.  */
1101          int saved_errno = errno;
1102          iconveh_close (&cd);
1103          errno = saved_errno;
1104        }
1105      else
1106        {
1107          if (iconveh_close (&cd) < 0)
1108            {
1109              /* Return -1, but free the allocated memory, and while doing
1110                 that, preserve the errno from iconveh_close.  */
1111              int saved_errno = errno;
1112              if (result != *resultp && result != NULL)
1113                free (result);
1114              errno = saved_errno;
1115              return -1;
1116            }
1117          *resultp = result;
1118          *lengthp = length;
1119        }
1120      return retval;
1121#else
1122      /* This is a different error code than if iconv_open existed but didn't
1123         support from_codeset and to_codeset, so that the caller can emit
1124         an error message such as
1125           "iconv() is not supported. Installing GNU libiconv and
1126            then reinstalling this package would fix this."  */
1127      errno = ENOSYS;
1128      return -1;
1129#endif
1130    }
1131}
1132
1133char *
1134str_iconveh (const char *src,
1135             const char *from_codeset, const char *to_codeset,
1136             enum iconv_ilseq_handler handler)
1137{
1138  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1139    {
1140      char *result = strdup (src);
1141
1142      if (result == NULL)
1143        errno = ENOMEM;
1144      return result;
1145    }
1146  else
1147    {
1148#if HAVE_ICONV
1149      iconveh_t cd;
1150      char *result;
1151
1152      if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1153        return NULL;
1154
1155      result = str_cd_iconveh (src, &cd, handler);
1156
1157      if (result == NULL)
1158        {
1159          /* Close cd, but preserve the errno from str_cd_iconv.  */
1160          int saved_errno = errno;
1161          iconveh_close (&cd);
1162          errno = saved_errno;
1163        }
1164      else
1165        {
1166          if (iconveh_close (&cd) < 0)
1167            {
1168              /* Return NULL, but free the allocated memory, and while doing
1169                 that, preserve the errno from iconveh_close.  */
1170              int saved_errno = errno;
1171              free (result);
1172              errno = saved_errno;
1173              return NULL;
1174            }
1175        }
1176      return result;
1177#else
1178      /* This is a different error code than if iconv_open existed but didn't
1179         support from_codeset and to_codeset, so that the caller can emit
1180         an error message such as
1181           "iconv() is not supported. Installing GNU libiconv and
1182            then reinstalling this package would fix this."  */
1183      errno = ENOSYS;
1184      return NULL;
1185#endif
1186    }
1187}
1188