1/* Copyright (C) 1999-2002 Free Software Foundation, Inc.
2   This file is part of the GNU LIBICONV Tools.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software Foundation,
16   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
17
18/*
19 * Generates an 8-bit character set table from a .TXT table as found on
20 * ftp.unicode.org or from a table containing the 256 Unicode values as
21 * hexadecimal integers.
22 * Examples:
23 *
24 *   ./8bit_tab_to_h ISO-8859-1 iso8859_1 < tab8859_1
25 *   ./8bit_tab_to_h ISO-8859-2 iso8859_2 < tab8859_2
26 *   ./8bit_tab_to_h ISO-8859-3 iso8859_3 < tab8859_3
27 *   ./8bit_tab_to_h ISO-8859-4 iso8859_4 < tab8859_4
28 *   ./8bit_tab_to_h ISO-8859-5 iso8859_5 < tab8859_5
29 *   ./8bit_tab_to_h ISO-8859-6 iso8859_6 < tab8859_6
30 *   ./8bit_tab_to_h ISO-8859-7 iso8859_7 < tab8859_7
31 *   ./8bit_tab_to_h ISO-8859-8 iso8859_8 < tab8859_8
32 *   ./8bit_tab_to_h ISO-8859-9 iso8859_9 < tab8859_9
33 *   ./8bit_tab_to_h ISO-8859-10 iso8859_10 < tab8859_10
34 *   ./8bit_tab_to_h ISO-8859-14 iso8859_14 < tab8859_14
35 *   ./8bit_tab_to_h ISO-8859-15 iso8859_15 < tab8859_15
36 *   ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < jis0201
37 *   ./8bit_tab_to_h TIS620.2533-1 tis620 < tabtis620
38 *   ./8bit_tab_to_h KOI8-R koi8_r < tabkoi8_r
39 *   ./8bit_tab_to_h KOI8-U koi8_u < tabkoi8_u
40 *   ./8bit_tab_to_h ARMSCII-8 armscii_8 < tabarmscii_8
41 *   ./8bit_tab_to_h CP1133 cp1133 < tabibm_cp1133
42 *   ./8bit_tab_to_h MULELAO-1 mulelao < tabmulelao_1
43 *   ./8bit_tab_to_h VISCII1.1-1 viscii1 < tabviscii
44 *   ./8bit_tab_to_h TCVN-5712 tcvn < tabtcvn
45 *   ./8bit_tab_to_h GEORGIAN-ACADEMY georgian_ac < tabgeorgian_academy
46 *   ./8bit_tab_to_h GEORGIAN-PS georgian_ps < tabgeorgian_ps
47 *
48 *   ./8bit_tab_to_h ISO-8859-1 iso8859_1 < 8859-1.TXT
49 *   ./8bit_tab_to_h ISO-8859-2 iso8859_2 < 8859-2.TXT
50 *   ./8bit_tab_to_h ISO-8859-3 iso8859_3 < 8859-3.TXT
51 *   ./8bit_tab_to_h ISO-8859-4 iso8859_4 < 8859-4.TXT
52 *   ./8bit_tab_to_h ISO-8859-5 iso8859_5 < 8859-5.TXT
53 *   ./8bit_tab_to_h ISO-8859-6 iso8859_6 < 8859-6.TXT
54 *   ./8bit_tab_to_h ISO-8859-7 iso8859_7 < 8859-7.TXT
55 *   ./8bit_tab_to_h ISO-8859-8 iso8859_8 < 8859-8.TXT
56 *   ./8bit_tab_to_h ISO-8859-9 iso8859_9 < 8859-9.TXT
57 *   ./8bit_tab_to_h ISO-8859-10 iso8859_10 < 8859-10.TXT
58 *   ./8bit_tab_to_h ISO-8859-14 iso8859_14 < 8859-14.TXT
59 *   ./8bit_tab_to_h ISO-8859-15 iso8859_15 < 8859-15.TXT
60 *   ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < JIS0201.TXT
61 *   ./8bit_tab_to_h KOI8-R koi8_r < KOI8-R.TXT
62 */
63
64#include <stdio.h>
65#include <stdlib.h>
66#include <stdbool.h>
67#include <string.h>
68
/* Prints MESSAGE on stderr and terminates with exit status 1, the status
   this tool uses for every failure.  */
static void fail (const char *message)
{
  fprintf(stderr, "%s\n", message);
  exit(1);
}

/*
 * Reads an 8-bit charset description from stdin and writes a C header that
 * contains the byte -> Unicode tables, a <c_charsetname>_mbtowc function,
 * the Unicode -> byte page tables and a <c_charsetname>_wctomb function.
 *
 *   argv[1]  charset name; only copied into a comment of the output file
 *   argv[2]  C identifier prefix used for every generated symbol
 *   argv[3]  optional output file name (default: argv[2] followed by ".h")
 *   argv[4]  optional prefix prepended verbatim to the output file name
 *            (no path separator is inserted)
 *
 * If the input starts with '#', it is parsed as a unicode.org style .TXT
 * file ("0xNN 0xNNNN" pairs with '#' comments); otherwise it must consist of
 * exactly 256 hexadecimal values.  0xfffd marks an unmapped byte.
 *
 * Exits with status 0 on success and 1 on any usage, input, memory or
 * output error.
 */
int main (int argc, char *argv[])
{
  const char* charsetname;
  const char* c_charsetname;
  const char* filename;
  const char* directory;
  int charset2uni[0x100];

  if (argc != 3 && argc != 4 && argc != 5)
    fail("usage: 8bit_tab_to_h CHARSETNAME C_CHARSETNAME [FILENAME [DIRECTORY]] < TABLE");
  charsetname = argv[1];
  c_charsetname = argv[2];
  if (argc > 3) {
    filename = argv[3];
  } else {
    char* s = (char*) malloc(strlen(c_charsetname)+strlen(".h")+1);
    if (s == NULL)
      fail("out of memory");
    strcpy(s,c_charsetname); strcat(s,".h");
    filename = s;
  }
  directory = (argc > 4 ? argv[4] : "");

  fprintf(stderr, "Creating %s%s\n", directory, filename);

  /* Read the input into charset2uni[]. */
  {
    /* %x stores through an unsigned int pointer, so values are read into
       unsigned temporaries and validated before being stored as int. */
    unsigned int code, value;
    int i, c;
    /* Peek at the first character to choose the input format. */
    c = getc(stdin);
    ungetc(c,stdin);
    if (c == '#') {
      /* Read a unicode.org style .TXT file. */
      for (i = 0; i < 0x100; i++)
        charset2uni[i] = 0xfffd;
      for (;;) {
        c = getc(stdin);
        if (c == EOF)
          break;
        if (c == '\n' || c == ' ' || c == '\t')
          continue;
        if (c == '#') {
          /* Skip the comment up to the end of the line. */
          do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
          continue;
        }
        ungetc(c,stdin);
        if (scanf("0x%x", &code) != 1 || code >= 0x100)
          fail("invalid input: expected a byte value 0x00..0xff");
        do { c = getc(stdin); } while (c == ' ' || c == '\t');
        if (c != EOF)
          ungetc(c,stdin);
        /* A byte value without a Unicode value stays unmapped. */
        if (c == '\n' || c == '#')
          continue;
        if (scanf("0x%x", &value) != 1)
          fail("invalid input: expected a hexadecimal Unicode value");
        /* Larger values would index past uni2charset[] below and do not
           fit into the generated unsigned short tables. */
        if (value > 0xffff)
          fail("invalid input: Unicode value above 0xffff");
        charset2uni[code] = (int) value;
      }
    } else {
      /* Read a table of hexadecimal Unicode values. */
      for (i = 0; i < 0x100; i++) {
        if (scanf("%x", &value) != 1)
          fail("invalid input: expected 256 hexadecimal values");
        /* Values that would be negative as an int (upper half of the
           unsigned range) and 0xffff both denote an unmapped byte. */
        if (value > ~0u / 2 || value == 0xffff)
          charset2uni[i] = 0xfffd;
        else if (value > 0xffff)
          fail("invalid input: Unicode value above 0xffff");
        else
          charset2uni[i] = (int) value;
      }
      /* Nothing but whitespace may follow the 256 values. */
      if (scanf("%x", &value) != EOF)
        fail("invalid input: trailing data after 256 values");
    }
  }

  /* Write the output file. */
  {
    FILE* f;

    {
      char* fname = malloc(strlen(directory)+strlen(filename)+1);
      if (fname == NULL)
        fail("out of memory");
      strcpy(fname,directory); strcat(fname,filename);
      f = fopen(fname,"w");
      if (f == NULL) {
        perror(fname);
        exit(1);
      }
      free(fname);
    }

    fprintf(f, "/*\n");
    fprintf(f, " * Copyright (C) 1999-2002 Free Software Foundation, Inc.\n");
    fprintf(f, " * This file is part of the GNU LIBICONV Library.\n");
    fprintf(f, " *\n");
    fprintf(f, " * The GNU LIBICONV Library is free software; you can redistribute it\n");
    fprintf(f, " * and/or modify it under the terms of the GNU Library General Public\n");
    fprintf(f, " * License as published by the Free Software Foundation; either version 2\n");
    fprintf(f, " * of the License, or (at your option) any later version.\n");
    fprintf(f, " *\n");
    fprintf(f, " * The GNU LIBICONV Library is distributed in the hope that it will be\n");
    fprintf(f, " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
    fprintf(f, " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n");
    fprintf(f, " * Library General Public License for more details.\n");
    fprintf(f, " *\n");
    fprintf(f, " * You should have received a copy of the GNU Library General Public\n");
    fprintf(f, " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n");
    fprintf(f, " * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,\n");
    fprintf(f, " * Fifth Floor, Boston, MA 02110-1301, USA.\n");
    fprintf(f, " */\n");
    fprintf(f, "\n");
    fprintf(f, "/*\n");
    fprintf(f, " * %s\n", charsetname);
    fprintf(f, " */\n");
    fprintf(f, "\n");

    /* Emit the byte -> Unicode tables and the _mbtowc function. */
    {
      int i, i1, i2, i3;
      /* line[i1] classifies the 16 bytes 16*i1 .. 16*i1+15:
         -2 = all unmapped, -1 = all identity, >= 0 = index into tables[]. */
      int line[16];
      int tableno;
      struct { int minline; int maxline; } tables[16];
      bool some_invalid;
      bool final_ret_reached;

      for (i1 = 0; i1 < 16; i1++) {
        bool all_invalid = true;
        bool all_identity = true;
        for (i2 = 0; i2 < 16; i2++) {
          i = 16*i1+i2;
          if (charset2uni[i] != 0xfffd)
            all_invalid = false;
          if (charset2uni[i] != i)
            all_identity = false;
        }
        if (all_invalid)
          line[i1] = -2;
        else if (all_identity)
          line[i1] = -1;
        else
          line[i1] = 0;
      }
      /* Merge runs of adjacent table lines into one table each. */
      tableno = 0;
      for (i1 = 0; i1 < 16; i1++) {
        if (line[i1] >= 0) {
          if (i1 > 0 && tableno > 0 && line[i1-1] == tableno-1) {
            line[i1] = tableno-1;
            tables[tableno-1].maxline = i1;
          } else {
            tableno++;
            line[i1] = tableno-1;
            tables[tableno-1].minline = tables[tableno-1].maxline = i1;
          }
        }
      }
      some_invalid = false;
      for (i = 0; i < 0x100; i++)
        if (charset2uni[i] == 0xfffd)
          some_invalid = true;
      if (tableno > 0) {
        int t;
        for (t = 0; t < tableno; t++) {
          fprintf(f, "static const unsigned short %s_2uni", c_charsetname);
          /* Tables are numbered only when there is more than one. */
          if (tableno > 1)
            fprintf(f, "_%d", t+1);
          fprintf(f, "[%d] = {\n", 16*(tables[t].maxline-tables[t].minline+1));
          for (i1 = tables[t].minline; i1 <= tables[t].maxline; i1++) {
            fprintf(f, "  /* 0x%02x */\n", 16*i1);
            for (i2 = 0; i2 < 2; i2++) {
              fprintf(f, " ");
              for (i3 = 0; i3 < 8; i3++) {
                i = 16*i1+8*i2+i3;
                fprintf(f, " 0x%04x,", charset2uni[i]);
              }
              fprintf(f, "\n");
            }
          }
          fprintf(f, "};\n");
        }
        fprintf(f, "\n");
      }
      final_ret_reached = false;
      fprintf(f, "static int\n%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", c_charsetname);
      fprintf(f, "{\n");
      fprintf(f, "  unsigned char c = *s;\n");
      if (some_invalid) {
        /* Some byte is unmapped: emit braced branches that may fall through
           to a final "return RET_ILSEQ". */
        for (i1 = 0; i1 < 16;) {
          int t = line[i1];
          const char* indent;
          /* [i1,i2) is the maximal run of lines with the same class. */
          for (i2 = i1; i2 < 16 && line[i2] == t; i2++);
          indent = (i1 == 0 && i2 == 16 ? "  " : "    ");
          if (i1 == 0) {
            if (i2 == 16) {
            } else {
              fprintf(f, "  if (c < 0x%02x) {\n", 16*i2);
            }
          } else {
            if (i2 == 16) {
              fprintf(f, "  else {\n");
            } else {
              fprintf(f, "  else if (c < 0x%02x) {\n", 16*i2);
            }
          }
          if (t == -2) {
            final_ret_reached = true;
          } else if (t == -1) {
            fprintf(f, "%s*pwc = (ucs4_t) c;\n", indent);
            fprintf(f, "%sreturn 1;\n", indent);
          } else {
            fprintf(f, "%s", indent);
            /* From here on some_invalid refers to this run only. */
            some_invalid = false;
            for (i = 16*i1; i < 16*i2; i++)
              if (charset2uni[i] == 0xfffd)
                some_invalid = true;
            if (some_invalid)
              fprintf(f, "unsigned short wc = ");
            else
              fprintf(f, "*pwc = (ucs4_t) ");
            fprintf(f, "%s_2uni", c_charsetname);
            if (tableno > 1)
              fprintf(f, "_%d", t+1);
            fprintf(f, "[c");
            if (tables[t].minline > 0)
              fprintf(f, "-0x%02x", 16*tables[t].minline);
            fprintf(f, "];\n");
            if (some_invalid) {
              fprintf(f, "%sif (wc != 0xfffd) {\n", indent);
              fprintf(f, "%s  *pwc = (ucs4_t) wc;\n", indent);
              fprintf(f, "%s  return 1;\n", indent);
              fprintf(f, "%s}\n", indent);
              final_ret_reached = true;
            } else {
              fprintf(f, "%sreturn 1;\n", indent);
            }
          }
          if (!(i1 == 0 && i2 == 16))
            fprintf(f, "  }\n");
          i1 = i2;
        }
        if (final_ret_reached)
          fprintf(f, "  return RET_ILSEQ;\n");
      } else {
        /* Every byte is mapped: emit unbraced assignments followed by a
           single "return 1". */
        for (i1 = 0; i1 < 16;) {
          int t = line[i1];
          for (i2 = i1; i2 < 16 && line[i2] == t; i2++);
          if (i1 == 0) {
            if (i2 == 16) {
              fprintf(f, "  ");
            } else {
              fprintf(f, "  if (c < 0x%02x)\n    ", 16*i2);
            }
          } else {
            if (i2 == 16) {
              fprintf(f, "  else\n    ");
            } else {
              fprintf(f, "  else if (c < 0x%02x)\n    ", 16*i2);
            }
          }
          if (t == -1)
            fprintf(f, "*pwc = (ucs4_t) c;\n");
          else {
            fprintf(f, "*pwc = (ucs4_t) %s_2uni", c_charsetname);
            if (tableno > 1)
              fprintf(f, "_%d", t+1);
            fprintf(f, "[c");
            if (tables[t].minline > 0)
              fprintf(f, "-0x%02x", 16*tables[t].minline);
            fprintf(f, "];\n");
          }
          i1 = i2;
        }
        fprintf(f, "  return 1;\n");
      }
      fprintf(f, "}\n");

    }

    fprintf(f, "\n");

    /* Emit the Unicode -> byte page tables and the _wctomb function. */
    {
      /* These arrays total several hundred kilobytes; static storage keeps
         them off the stack. */
      /* Inverse mapping; 0 doubles as the "unmapped" marker. */
      static int uni2charset[0x10000];
      bool pages[0x100];
      /* line[j1] classifies the 8 code points 8*j1 .. 8*j1+7:
         -2 = all unmapped, -1 = all identity, >= 0 = index into tables[]. */
      static int line[0x2000];
      int tableno;
      static struct { int minline; int maxline; int usecount; const char* suffix; } tables[0x2000];
      bool need_c;
      bool fix_0000;
      /* True until the first branch of the generated if/else chain has been
         written; that branch must not start with "else". */
      bool first_branch;
      int i, j, p, j1, j2, t;

      for (j = 0; j < 0x10000; j++)
        uni2charset[j] = 0;
      for (p = 0; p < 0x100; p++)
        pages[p] = false;
      for (i = 0; i < 0x100; i++) {
        j = charset2uni[i];
        if (j != 0xfffd) {
          /* j is in 0..0xffff: the input reader rejects anything else. */
          uni2charset[j] = i;
          pages[j>>8] = true;
        }
      }
      for (j1 = 0; j1 < 0x2000; j1++) {
        bool all_invalid = true;
        bool all_identity = true;
        for (j2 = 0; j2 < 8; j2++) {
          j = 8*j1+j2;
          if (uni2charset[j] != 0)
            all_invalid = false;
          if (uni2charset[j] != j)
            all_identity = false;
        }
        if (all_invalid)
          line[j1] = -2;
        else if (all_identity)
          line[j1] = -1;
        else
          line[j1] = 0;
      }
      /* Group table lines: a line joins the previous table when it is
         adjacent to it, or lies in the same 256-code-point page no more
         than 8 lines after the table's last line. */
      tableno = 0;
      for (j1 = 0; j1 < 0x2000; j1++) {
        if (line[j1] >= 0) {
          if (tableno > 0
              && ((j1 > 0 && line[j1-1] == tableno-1)
                  || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
                      && j1 - tables[tableno-1].maxline <= 8))) {
            line[j1] = tableno-1;
            tables[tableno-1].maxline = j1;
          } else {
            tableno++;
            line[j1] = tableno-1;
            tables[tableno-1].minline = tables[tableno-1].maxline = j1;
          }
        }
      }
      /* Count the mapped code points covered by each table. */
      for (t = 0; t < tableno; t++) {
        tables[t].usecount = 0;
        j1 = 8*tables[t].minline;
        j2 = 8*(tables[t].maxline+1);
        for (j = j1; j < j2; j++)
          if (uni2charset[j] != 0)
            tables[t].usecount++;
      }
      /* Name the tables that are emitted as arrays: "<page>" for the first
         one in a page, "<page>_<n>" for further ones.  The counter n is
         shared by all pages. */
      for (t = 0, p = -1, i = 0; t < tableno; t++) {
        if (tables[t].usecount > 1) {
          /* Room for two hex digits, '_', any int and the terminator. */
          enum { SUFFIX_SIZE = 16 };
          char* s = (char*) malloc(SUFFIX_SIZE);
          if (s == NULL)
            fail("out of memory");
          if (p == tables[t].minline >> 5) {
            snprintf(s, SUFFIX_SIZE, "%02x_%d", p, ++i);
          } else {
            p = tables[t].minline >> 5;
            snprintf(s, SUFFIX_SIZE, "%02x", p);
          }
          tables[t].suffix = s;
        } else
          tables[t].suffix = NULL;
      }
      {
        /* p records whether any page table was written. */
        p = -1;
        for (t = 0; t < tableno; t++)
          if (tables[t].usecount > 1) {
            p = 0;
            fprintf(f, "static const unsigned char %s_page%s[%d] = {\n", c_charsetname, tables[t].suffix, 8*(tables[t].maxline-tables[t].minline+1));
            for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
              if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
                fprintf(f, "  /* 0x%04x */\n", 8*j1);
              fprintf(f, " ");
              for (j2 = 0; j2 < 8; j2++) {
                j = 8*j1+j2;
                fprintf(f, " 0x%02x,", uni2charset[j]);
              }
              fprintf(f, " /* 0x%02x-0x%02x */\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
            }
            fprintf(f, "};\n");
          }
        if (p >= 0)
          fprintf(f, "\n");
      }
      /* The generated function needs a local 'c' unless its only branch is
         the identity range starting at 0, which returns directly. */
      need_c = false;
      for (j1 = 0; j1 < 0x2000;) {
        t = line[j1];
        for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
        if (t >= 0)
          j2 = tables[t].maxline+1;
        if (!(t == -2 || (t == -1 && j1 == 0)))
          need_c = true;
        j1 = j2;
      }
      fix_0000 = false;
      first_branch = true;
      fprintf(f, "static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", c_charsetname);
      fprintf(f, "{\n");
      if (need_c)
        fprintf(f, "  unsigned char c = 0;\n");
      for (j1 = 0; j1 < 0x2000;) {
        t = line[j1];
        for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
        if (t >= 0) {
          /* A table may contain unmapped gap lines, so step over the whole
             table rather than just the run of equal line[] values. */
          if (j1 != tables[t].minline) abort();
          if (j2 > tables[t].maxline+1) abort();
          j2 = tables[t].maxline+1;
        }
        if (t == -2) {
        } else {
          /* Only the first emitted branch omits "else", even when the
             unmapped lines before it mean it does not start at wc == 0. */
          if (first_branch)
            fprintf(f, "  ");
          else
            fprintf(f, "  else ");
          first_branch = false;
          if (t >= 0 && tables[t].usecount == 0) abort();
          if (t >= 0 && tables[t].usecount == 1) {
            /* A single mapping is emitted as a comparison, not a table. */
            if (j2 != j1+1) abort();
            for (j = 8*j1; j < 8*j2; j++)
              if (uni2charset[j] != 0) {
                fprintf(f, "if (wc == 0x%04x)\n    c = 0x%02x;\n", j, uni2charset[j]);
                break;
              }
          } else {
            if (j1 == 0) {
              fprintf(f, "if (wc < 0x%04x)", 8*j2);
            } else {
              fprintf(f, "if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
            }
            if (t == -1) {
              if (j1 == 0)
                /* If wc == 0, the function must return 1, not -1. */
                fprintf(f, " {\n    *r = wc;\n    return 1;\n  }\n");
              else
                fprintf(f, "\n    c = wc;\n");
            } else {
              fprintf(f, "\n    c = %s_page%s[wc", c_charsetname, tables[t].suffix);
              if (tables[t].minline > 0)
                fprintf(f, "-0x%04x", 8*j1);
              fprintf(f, "];\n");
              if (j1 == 0 && uni2charset[0] == 0)
                /* If wc == 0, the function must return 1, not -1. */
                fix_0000 = true;
            }
          }
        }
        j1 = j2;
      }
      if (need_c) {
        if (fix_0000)
          fprintf(f, "  if (c != 0 || wc == 0) {\n");
        else
          fprintf(f, "  if (c != 0) {\n");
        fprintf(f, "    *r = c;\n");
        fprintf(f, "    return 1;\n");
        fprintf(f, "  }\n");
      }
      fprintf(f, "  return RET_ILUNI;\n");
      fprintf(f, "}\n");

    }

    /* fclose flushes, so its result is part of the write check. */
    if (ferror(f) || fclose(f))
      fail("error writing the output file");
  }

  exit(0);
}
571