1/* Test of character set conversion with error handling and autodetection.
2   Copyright (C) 2007-2010 Free Software Foundation, Inc.
3
4   This program is free software: you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 3 of the License, or
7   (at your option) any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
16
17/* Written by Bruno Haible <bruno@clisp.org>, 2007.  */
18
19#include <config.h>
20
21#include "striconveha.h"
22
23#if HAVE_ICONV
24# include <iconv.h>
25#endif
26
27#include <errno.h>
28#include <stdlib.h>
29#include <string.h>
30
31#include "macros.h"
32
/* Magic number for detecting bounds violations.  */
#define MAGIC 0x1983EFF1

/* Allocate an array of N + 1 size_t elements and store MAGIC in the extra
   trailing element, so that the caller can later detect a write past the
   first N elements.  The first N elements are left uninitialized.

   The caller owns the returned array and must free() it.  The function
   never returns NULL: it aborts if the size computation would overflow or
   if the allocation fails, rather than writing through a null pointer.  */
static size_t *
new_offsets (size_t n)
{
  size_t *offsets;

  /* Guard against wrap-around in (n + 1) * sizeof *offsets.  */
  if (n >= (size_t) -1 / sizeof *offsets)
    abort ();

  offsets = (size_t *) malloc ((n + 1) * sizeof *offsets);
  if (offsets == NULL)
    abort ();

  offsets[n] = MAGIC;
  return offsets;
}
43
44int
45main ()
46{
47  static enum iconv_ilseq_handler handlers[] =
48    { iconveh_error, iconveh_question_mark, iconveh_escape_sequence };
49  size_t h;
50  size_t o;
51  size_t i;
52
53#if HAVE_ICONV
54  /* Assume that iconv() supports at least the encodings ASCII, ISO-8859-1,
55     ISO-8859-2, and UTF-8.  */
56
57  /* ------------------------- Test mem_iconveha() ------------------------- */
58
59  /* Test conversion from ISO-8859-2 to ISO-8859-1 with no errors.  */
60  for (h = 0; h < SIZEOF (handlers); h++)
61    {
62      enum iconv_ilseq_handler handler = handlers[h];
63      static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
64      static const char expected[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
65      for (o = 0; o < 2; o++)
66        {
67          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
68          char *result = NULL;
69          size_t length = 0;
70          int retval = mem_iconveha (input, strlen (input),
71                                     "ISO-8859-2", "ISO-8859-1",
72                                     false, handler,
73                                     offsets,
74                                     &result, &length);
75          ASSERT (retval == 0);
76          ASSERT (length == strlen (expected));
77          ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
78          if (o)
79            {
80              for (i = 0; i < 37; i++)
81                ASSERT (offsets[i] == i);
82              ASSERT (offsets[37] == MAGIC);
83              free (offsets);
84            }
85          free (result);
86        }
87    }
88
89  /* Test conversion from ISO-8859-2 to ISO-8859-1 with EILSEQ.  */
90  for (h = 0; h < SIZEOF (handlers); h++)
91    {
92      enum iconv_ilseq_handler handler = handlers[h];
93      static const char input[] = "Rafa\263 Maszkowski"; /* Rafa�� Maszkowski */
94      for (o = 0; o < 2; o++)
95        {
96          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
97          char *result = NULL;
98          size_t length = 0;
99          int retval = mem_iconveha (input, strlen (input),
100                                     "ISO-8859-2", "ISO-8859-1",
101                                     false, handler,
102                                     offsets,
103                                     &result, &length);
104          switch (handler)
105            {
106            case iconveh_error:
107              ASSERT (retval == -1 && errno == EILSEQ);
108              ASSERT (result == NULL);
109              if (o)
110                free (offsets);
111              break;
112            case iconveh_question_mark:
113              {
114                static const char expected[] = "Rafa? Maszkowski";
115                ASSERT (retval == 0);
116                ASSERT (length == strlen (expected));
117                ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
118                if (o)
119                  {
120                    for (i = 0; i < 16; i++)
121                      ASSERT (offsets[i] == i);
122                    ASSERT (offsets[16] == MAGIC);
123                    free (offsets);
124                  }
125                free (result);
126              }
127              break;
128            case iconveh_escape_sequence:
129              {
130                static const char expected[] = "Rafa\\u0142 Maszkowski";
131                ASSERT (retval == 0);
132                ASSERT (length == strlen (expected));
133                ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
134                if (o)
135                  {
136                    for (i = 0; i < 16; i++)
137                      ASSERT (offsets[i] == (i < 5 ? i :
138                                             i + 5));
139                    ASSERT (offsets[16] == MAGIC);
140                    free (offsets);
141                  }
142                free (result);
143              }
144              break;
145            }
146        }
147    }
148
149  /* Test conversion from ISO-8859-1 to UTF-8 with no errors.  */
150  for (h = 0; h < SIZEOF (handlers); h++)
151    {
152      enum iconv_ilseq_handler handler = handlers[h];
153      static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
154      static const char expected[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237";
155      for (o = 0; o < 2; o++)
156        {
157          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
158          char *result = NULL;
159          size_t length = 0;
160          int retval = mem_iconveha (input, strlen (input),
161                                     "ISO-8859-1", "UTF-8",
162                                     false, handler,
163                                     offsets,
164                                     &result, &length);
165          ASSERT (retval == 0);
166          ASSERT (length == strlen (expected));
167          ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
168          if (o)
169            {
170              for (i = 0; i < 37; i++)
171                ASSERT (offsets[i] == (i < 1 ? i :
172                                       i < 12 ? i + 1 :
173                                       i < 18 ? i + 2 :
174                                       i + 3));
175              ASSERT (offsets[37] == MAGIC);
176              free (offsets);
177            }
178          free (result);
179        }
180    }
181
182  /* Test conversion from UTF-8 to ISO-8859-1 with no errors.  */
183  for (h = 0; h < SIZEOF (handlers); h++)
184    {
185      enum iconv_ilseq_handler handler = handlers[h];
186      static const char input[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237";
187      static const char expected[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
188      for (o = 0; o < 2; o++)
189        {
190          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
191          char *result = NULL;
192          size_t length = 0;
193          int retval = mem_iconveha (input, strlen (input),
194                                     "UTF-8", "ISO-8859-1",
195                                     false, handler,
196                                     offsets,
197                                     &result, &length);
198          ASSERT (retval == 0);
199          ASSERT (length == strlen (expected));
200          ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
201          if (o)
202            {
203              for (i = 0; i < 41; i++)
204                ASSERT (offsets[i] == (i < 1 ? i :
205                                       i == 1 ? (size_t)(-1) :
206                                       i < 13 ? i - 1 :
207                                       i == 13 ? (size_t)(-1) :
208                                       i < 20 ? i - 2 :
209                                       i == 20 ? (size_t)(-1) :
210                                       i < 40 ? i - 3 :
211                                       (size_t)(-1)));
212              ASSERT (offsets[41] == MAGIC);
213              free (offsets);
214            }
215          free (result);
216        }
217    }
218
219  /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ.  */
220  for (h = 0; h < SIZEOF (handlers); h++)
221    {
222      enum iconv_ilseq_handler handler = handlers[h];
223      static const char input[] = "Rafa\305\202 Maszkowski"; /* Rafa�� Maszkowski */
224      for (o = 0; o < 2; o++)
225        {
226          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
227          char *result = NULL;
228          size_t length = 0;
229          int retval = mem_iconveha (input, strlen (input),
230                                     "UTF-8", "ISO-8859-1",
231                                     false, handler,
232                                     offsets,
233                                     &result, &length);
234          switch (handler)
235            {
236            case iconveh_error:
237              ASSERT (retval == -1 && errno == EILSEQ);
238              ASSERT (result == NULL);
239              if (o)
240                free (offsets);
241              break;
242            case iconveh_question_mark:
243              {
244                static const char expected[] = "Rafa? Maszkowski";
245                ASSERT (retval == 0);
246                ASSERT (length == strlen (expected));
247                ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
248                if (o)
249                  {
250                    for (i = 0; i < 17; i++)
251                      ASSERT (offsets[i] == (i < 5 ? i :
252                                             i == 5 ? (size_t)(-1) :
253                                             i - 1));
254                    ASSERT (offsets[17] == MAGIC);
255                    free (offsets);
256                  }
257                free (result);
258              }
259              break;
260            case iconveh_escape_sequence:
261              {
262                static const char expected[] = "Rafa\\u0142 Maszkowski";
263                ASSERT (retval == 0);
264                ASSERT (length == strlen (expected));
265                ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
266                if (o)
267                  {
268                    for (i = 0; i < 17; i++)
269                      ASSERT (offsets[i] == (i < 5 ? i :
270                                             i == 5 ? (size_t)(-1) :
271                                             i + 4));
272                    ASSERT (offsets[17] == MAGIC);
273                    free (offsets);
274                  }
275                free (result);
276              }
277              break;
278            }
279        }
280    }
281
282  /* Test conversion from UTF-8 to ISO-8859-1 with EINVAL.  */
283  for (h = 0; h < SIZEOF (handlers); h++)
284    {
285      enum iconv_ilseq_handler handler = handlers[h];
286      static const char input[] = "\342";
287      for (o = 0; o < 2; o++)
288        {
289          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
290          char *result = NULL;
291          size_t length = 0;
292          int retval = mem_iconveha (input, strlen (input),
293                                     "UTF-8", "ISO-8859-1",
294                                     false, handler,
295                                     offsets,
296                                     &result, &length);
297          ASSERT (retval == 0);
298          ASSERT (length == 0);
299          if (o)
300            {
301              ASSERT (offsets[0] == 0);
302              ASSERT (offsets[1] == MAGIC);
303              free (offsets);
304            }
305          free (result);
306        }
307    }
308
309  /* autodetect_jp is only supported when iconv() support ISO-2022-JP-2.  */
310# if defined _LIBICONV_VERSION || !(defined _AIX || defined __sgi || defined __hpux || defined __osf__ || defined __sun)
311  /* Test conversions from autodetect_jp to UTF-8.  */
312  for (h = 0; h < SIZEOF (handlers); h++)
313    {
314      enum iconv_ilseq_handler handler = handlers[h];
315      static const char input[] = "\244\263\244\363\244\313\244\301\244\317"; /* ��������������� in EUC-JP */
316      static const char expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* ��������������� */
317      for (o = 0; o < 2; o++)
318        {
319          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
320          char *result = NULL;
321          size_t length = 0;
322          int retval = mem_iconveha (input, strlen (input),
323                                     "autodetect_jp", "UTF-8",
324                                     false, handler,
325                                     offsets,
326                                     &result, &length);
327          ASSERT (retval == 0);
328          ASSERT (length == strlen (expected));
329          ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
330          if (o)
331            {
332              for (i = 0; i < 10; i++)
333                ASSERT (offsets[i] == ((i % 2) == 0 ? (i / 2) * 3 : (size_t)(-1)));
334              ASSERT (offsets[10] == MAGIC);
335              free (offsets);
336            }
337          free (result);
338        }
339    }
340  for (h = 0; h < SIZEOF (handlers); h++)
341    {
342      enum iconv_ilseq_handler handler = handlers[h];
343      static const char input[] = "\202\261\202\361\202\311\202\277\202\315"; /* ��������������� in Shift_JIS */
344      static const char expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* ��������������� */
345      for (o = 0; o < 2; o++)
346        {
347          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
348          char *result = NULL;
349          size_t length = 0;
350          int retval = mem_iconveha (input, strlen (input),
351                                     "autodetect_jp", "UTF-8",
352                                     false, handler,
353                                     offsets,
354                                     &result, &length);
355          ASSERT (retval == 0);
356          ASSERT (length == strlen (expected));
357          ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
358          if (o)
359            {
360              for (i = 0; i < 10; i++)
361                ASSERT (offsets[i] == ((i % 2) == 0 ? (i / 2) * 3 : (size_t)(-1)));
362              ASSERT (offsets[10] == MAGIC);
363              free (offsets);
364            }
365          free (result);
366        }
367    }
368  for (h = 0; h < SIZEOF (handlers); h++)
369    {
370      enum iconv_ilseq_handler handler = handlers[h];
371      static const char input[] = "\033$B$3$s$K$A$O\033(B"; /* ��������������� in ISO-2022-JP-2 */
372      static const char expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* ��������������� */
373      for (o = 0; o < 2; o++)
374        {
375          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
376          char *result = NULL;
377          size_t length = 0;
378          int retval = mem_iconveha (input, strlen (input),
379                                     "autodetect_jp", "UTF-8",
380                                     false, handler,
381                                     offsets,
382                                     &result, &length);
383          ASSERT (retval == 0);
384          ASSERT (length == strlen (expected));
385          ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
386          if (o)
387            {
388              for (i = 0; i < 16; i++)
389                ASSERT (offsets[i] == (i == 0 ? 0 :
390                                       i == 5 ? 3 :
391                                       i == 7 ? 6 :
392                                       i == 9 ? 9 :
393                                       i == 11 ? 12 :
394                                       i == 13 ? 15 :
395                                       (size_t)(-1)));
396              ASSERT (offsets[16] == MAGIC);
397              free (offsets);
398            }
399          free (result);
400        }
401    }
402# endif
403
404# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
405  /* Test conversion from UTF-8 to ISO-8859-1 with transliteration.  */
406  for (h = 0; h < SIZEOF (handlers); h++)
407    {
408      enum iconv_ilseq_handler handler = handlers[h];
409      static const char input[] = "Costs: 27 \342\202\254"; /* EURO SIGN */
410      static const char expected[] = "Costs: 27 EUR";
411      for (o = 0; o < 2; o++)
412        {
413          size_t *offsets = (o ? new_offsets (strlen (input)) : NULL);
414          char *result = NULL;
415          size_t length = 0;
416          int retval = mem_iconveha (input, strlen (input),
417                                     "UTF-8", "ISO-8859-1",
418                                     true, handler,
419                                     offsets,
420                                     &result, &length);
421          ASSERT (retval == 0);
422          ASSERT (length == strlen (expected));
423          ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0);
424          if (o)
425            {
426              for (i = 0; i < 13; i++)
427                ASSERT (offsets[i] == (i < 11 ? i : (size_t)(-1)));
428              ASSERT (offsets[13] == MAGIC);
429              free (offsets);
430            }
431          free (result);
432        }
433    }
434# endif
435
436  /* ------------------------- Test str_iconveha() ------------------------- */
437
438  /* Test conversion from ISO-8859-2 to ISO-8859-1 with no errors.  */
439  for (h = 0; h < SIZEOF (handlers); h++)
440    {
441      enum iconv_ilseq_handler handler = handlers[h];
442      static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
443      static const char expected[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
444      char *result = str_iconveha (input, "ISO-8859-2", "ISO-8859-1", false, handler);
445      ASSERT (result != NULL);
446      ASSERT (strcmp (result, expected) == 0);
447      free (result);
448    }
449
450  /* Test conversion from ISO-8859-2 to ISO-8859-1 with EILSEQ.  */
451  for (h = 0; h < SIZEOF (handlers); h++)
452    {
453      enum iconv_ilseq_handler handler = handlers[h];
454      static const char input[] = "Rafa\263 Maszkowski"; /* Rafa�� Maszkowski */
455      char *result = str_iconveha (input, "ISO-8859-2", "ISO-8859-1", false, handler);
456      switch (handler)
457        {
458        case iconveh_error:
459          ASSERT (result == NULL && errno == EILSEQ);
460          break;
461        case iconveh_question_mark:
462          {
463            static const char expected[] = "Rafa? Maszkowski";
464            ASSERT (result != NULL);
465            ASSERT (strcmp (result, expected) == 0);
466            free (result);
467          }
468          break;
469        case iconveh_escape_sequence:
470          {
471            static const char expected[] = "Rafa\\u0142 Maszkowski";
472            ASSERT (result != NULL);
473            ASSERT (strcmp (result, expected) == 0);
474            free (result);
475          }
476          break;
477        }
478    }
479
480  /* Test conversion from ISO-8859-1 to UTF-8 with no errors.  */
481  for (h = 0; h < SIZEOF (handlers); h++)
482    {
483      enum iconv_ilseq_handler handler = handlers[h];
484      static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
485      static const char expected[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237";
486      char *result = str_iconveha (input, "ISO-8859-1", "UTF-8", false, handler);
487      ASSERT (result != NULL);
488      ASSERT (strcmp (result, expected) == 0);
489      free (result);
490    }
491
492  /* Test conversion from UTF-8 to ISO-8859-1 with no errors.  */
493  for (h = 0; h < SIZEOF (handlers); h++)
494    {
495      enum iconv_ilseq_handler handler = handlers[h];
496      static const char input[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237";
497      static const char expected[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
498      char *result = str_iconveha (input, "UTF-8", "ISO-8859-1", false, handler);
499      ASSERT (result != NULL);
500      ASSERT (strcmp (result, expected) == 0);
501      free (result);
502    }
503
504  /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ.  */
505  for (h = 0; h < SIZEOF (handlers); h++)
506    {
507      enum iconv_ilseq_handler handler = handlers[h];
508      static const char input[] = "Costs: 27 \342\202\254"; /* EURO SIGN */
509      char *result = str_iconveha (input, "UTF-8", "ISO-8859-1", false, handler);
510      switch (handler)
511        {
512        case iconveh_error:
513          ASSERT (result == NULL && errno == EILSEQ);
514          break;
515        case iconveh_question_mark:
516          {
517            static const char expected[] = "Costs: 27 ?";
518            ASSERT (result != NULL);
519            ASSERT (strcmp (result, expected) == 0);
520            free (result);
521          }
522          break;
523        case iconveh_escape_sequence:
524          {
525            static const char expected[] = "Costs: 27 \\u20AC";
526            ASSERT (result != NULL);
527            ASSERT (strcmp (result, expected) == 0);
528            free (result);
529          }
530          break;
531        }
532    }
533
534  /* Test conversion from UTF-8 to ISO-8859-1 with EINVAL.  */
535  for (h = 0; h < SIZEOF (handlers); h++)
536    {
537      enum iconv_ilseq_handler handler = handlers[h];
538      static const char input[] = "\342";
539      char *result = str_iconveha (input, "UTF-8", "ISO-8859-1", false, handler);
540      ASSERT (result != NULL);
541      ASSERT (strcmp (result, "") == 0);
542      free (result);
543    }
544
545  /* autodetect_jp is only supported when iconv() support ISO-2022-JP-2.  */
546# if defined _LIBICONV_VERSION || !(defined _AIX || defined __sgi || defined __hpux || defined __osf__ || defined __sun)
547  /* Test conversions from autodetect_jp to UTF-8.  */
548  for (h = 0; h < SIZEOF (handlers); h++)
549    {
550      enum iconv_ilseq_handler handler = handlers[h];
551      static const char input[] = "\244\263\244\363\244\313\244\301\244\317"; /* ��������������� in EUC-JP */
552      static const char expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* ��������������� */
553      char *result = str_iconveha (input, "autodetect_jp", "UTF-8", false, handler);
554      ASSERT (result != NULL);
555      ASSERT (strcmp (result, expected) == 0);
556      free (result);
557    }
558  for (h = 0; h < SIZEOF (handlers); h++)
559    {
560      enum iconv_ilseq_handler handler = handlers[h];
561      static const char input[] = "\202\261\202\361\202\311\202\277\202\315"; /* ��������������� in Shift_JIS */
562      static const char expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* ��������������� */
563      char *result = str_iconveha (input, "autodetect_jp", "UTF-8", false, handler);
564      ASSERT (result != NULL);
565      ASSERT (strcmp (result, expected) == 0);
566      free (result);
567    }
568  for (h = 0; h < SIZEOF (handlers); h++)
569    {
570      enum iconv_ilseq_handler handler = handlers[h];
571      static const char input[] = "\033$B$3$s$K$A$O\033(B"; /* ��������������� in ISO-2022-JP-2 */
572      static const char expected[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* ��������������� */
573      char *result = str_iconveha (input, "autodetect_jp", "UTF-8", false, handler);
574      ASSERT (result != NULL);
575      ASSERT (strcmp (result, expected) == 0);
576      free (result);
577    }
578# endif
579
580# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
581  /* Test conversion from UTF-8 to ISO-8859-1 with transliteration.  */
582  for (h = 0; h < SIZEOF (handlers); h++)
583    {
584      enum iconv_ilseq_handler handler = handlers[h];
585      static const char input[] = "Costs: 27 \342\202\254"; /* EURO SIGN */
586      static const char expected[] = "Costs: 27 EUR";
587      char *result = str_iconveha (input, "UTF-8", "ISO-8859-1", true, handler);
588      ASSERT (result != NULL);
589      ASSERT (strcmp (result, expected) == 0);
590      free (result);
591    }
592# endif
593
594#endif
595
596  return 0;
597}
598