1/* Copyright (C) 2000-2002, 2004-2005 Free Software Foundation, Inc.
2   This file is part of the GNU LIBICONV Library.
3
4   The GNU LIBICONV Library is free software; you can redistribute it
5   and/or modify it under the terms of the GNU Library General Public
6   License as published by the Free Software Foundation; either version 2
7   of the License, or (at your option) any later version.
8
9   The GNU LIBICONV Library is distributed in the hope that it will be
10   useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   Library General Public License for more details.
13
14   You should have received a copy of the GNU Library General Public
15   License along with the GNU LIBICONV Library; see the file COPYING.LIB.
16   If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
17   Fifth Floor, Boston, MA 02110-1301, USA.  */
18
19/* Create a table from CHARSET to Unicode. */
20
21#include "config.h"
22
23#include <stddef.h>
24#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27#include <iconv.h>
28#include <errno.h>
29
30#include "binary-io.h"
31
/* If nonzero, ignore conversions outside Unicode plane 0, i.e. conversions
   whose result contains a code point above 0xFFFF.  main() sets it when the
   charset is "UTF-8"; ucs4_decode() consults it.  */
static int bmp_only;
34
/* Formats the first BUFLEN bytes of BUF as a single hexadecimal number:
   a "0x" prefix followed by two uppercase hex digits per byte.
   BUFLEN must be between 1 and 4; any other value aborts the program.
   Returns a pointer to a static buffer that the next call overwrites.  */
static const char* hexbuf (unsigned char buf[], unsigned int buflen)
{
  static char msg[50];
  char* cursor = msg;
  unsigned int idx;

  if (buflen < 1 || buflen > 4)
    abort();

  cursor += sprintf(cursor,"0x");
  for (idx = 0; idx < buflen; idx++)
    cursor += sprintf(cursor,"%02X",buf[idx]);

  return msg;
}
47
48static int try (iconv_t cd, unsigned char buf[], unsigned int buflen, unsigned int* out)
49{
50  const char* inbuf = (const char*) buf;
51  size_t inbytesleft = buflen;
52  char* outbuf = (char*) out;
53  size_t outbytesleft = 3*sizeof(unsigned int);
54  size_t result;
55  iconv(cd,NULL,NULL,NULL,NULL);
56  result = iconv(cd,(ICONV_CONST char**)&inbuf,&inbytesleft,&outbuf,&outbytesleft);
57  if (result != (size_t)(-1))
58    result = iconv(cd,NULL,NULL,&outbuf,&outbytesleft);
59  if (result == (size_t)(-1)) {
60    if (errno == EILSEQ) {
61      return -1;
62    } else if (errno == EINVAL) {
63      return 0;
64    } else {
65      int saved_errno = errno;
66      fprintf(stderr,"%s: iconv error: ",hexbuf(buf,buflen));
67      errno = saved_errno;
68      perror("");
69      exit(1);
70    }
71  } else if (result > 0) /* ignore conversions with transliteration */ {
72    return -1;
73  } else {
74    if (inbytesleft != 0) {
75      fprintf(stderr,"%s: inbytes = %ld, outbytes = %ld\n",hexbuf(buf,buflen),(long)(buflen-inbytesleft),(long)(3*sizeof(unsigned int)-outbytesleft));
76      exit(1);
77    }
78    return (3*sizeof(unsigned int)-outbytesleft)/sizeof(unsigned int);
79  }
80}
81
82/* Returns the out[] buffer as a Unicode value, formatted as 0x%04X. */
83static const char* ucs4_decode (const unsigned int* out, unsigned int outlen)
84{
85  static char hexbuf[21];
86  char* p = hexbuf;
87  while (outlen > 0) {
88    if (p > hexbuf)
89      *p++ = ' ';
90    sprintf (p, "0x%04X", out[0]);
91    out += 1; outlen -= 1;
92    if (bmp_only && strlen(p) > 6)
93      return NULL;
94    p += strlen(p);
95  }
96  return hexbuf;
97}
98
99int main (int argc, char* argv[])
100{
101  const char* charset;
102  iconv_t cd;
103  int search_depth;
104
105  if (argc != 2) {
106    fprintf(stderr,"Usage: table-from charset\n");
107    exit(1);
108  }
109  charset = argv[1];
110
111#if O_BINARY
112  SET_BINARY(fileno(stdout));
113#endif
114
115  cd = iconv_open("UCS-4-INTERNAL",charset);
116  if (cd == (iconv_t)(-1)) {
117    perror("iconv_open");
118    exit(1);
119  }
120
121  /* When testing UTF-8, stop at 0x10000, otherwise the output file gets too
122     big. */
123  bmp_only = (strcmp(charset,"UTF-8") == 0);
124  search_depth = (strcmp(charset,"UTF-8") == 0 ? 3 : 4);
125
126  {
127    unsigned int out[3];
128    unsigned char buf[4];
129    unsigned int i0, i1, i2, i3;
130    int result;
131    for (i0 = 0; i0 < 0x100; i0++) {
132      buf[0] = i0;
133      result = try(cd,buf,1,out);
134      if (result < 0) {
135      } else if (result > 0) {
136        const char* unicode = ucs4_decode(out,result);
137        if (unicode != NULL)
138          printf("0x%02X\t%s\n",i0,unicode);
139      } else {
140        for (i1 = 0; i1 < 0x100; i1++) {
141          buf[1] = i1;
142          result = try(cd,buf,2,out);
143          if (result < 0) {
144          } else if (result > 0) {
145            const char* unicode = ucs4_decode(out,result);
146            if (unicode != NULL)
147              printf("0x%02X%02X\t%s\n",i0,i1,unicode);
148          } else {
149            for (i2 = 0; i2 < 0x100; i2++) {
150              buf[2] = i2;
151              result = try(cd,buf,3,out);
152              if (result < 0) {
153              } else if (result > 0) {
154                const char* unicode = ucs4_decode(out,result);
155                if (unicode != NULL)
156                  printf("0x%02X%02X%02X\t%s\n",i0,i1,i2,unicode);
157              } else if (search_depth > 3) {
158                for (i3 = 0; i3 < 0x100; i3++) {
159                  buf[3] = i3;
160                  result = try(cd,buf,4,out);
161                  if (result < 0) {
162                  } else if (result > 0) {
163                    const char* unicode = ucs4_decode(out,result);
164                    if (unicode != NULL)
165                      printf("0x%02X%02X%02X%02X\t%s\n",i0,i1,i2,i3,unicode);
166                  } else {
167                    fprintf(stderr,"%s: incomplete byte sequence\n",hexbuf(buf,4));
168                    exit(1);
169                  }
170                }
171              }
172            }
173          }
174        }
175      }
176    }
177  }
178
179  if (iconv_close(cd) < 0) {
180    perror("iconv_close");
181    exit(1);
182  }
183
184  if (ferror(stdin) || ferror(stdout) || fclose(stdout)) {
185    fprintf(stderr,"I/O error\n");
186    exit(1);
187  }
188
189  exit(0);
190}
191