1/* Copyright (C) 1999-2004, 2006 Free Software Foundation, Inc.
2   This file is part of the GNU LIBICONV Tools.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software Foundation,
16   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
17
18/*
19 * Generates a CJK character set table from a .TXT table as found on
20 * ftp.unicode.org or in the X nls directory.
21 * Examples:
22 *
23 *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
24 *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
25 *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
26 *
27 *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
28 *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
29 *   ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
30 *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
31 *   ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
32 *
33 *   ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
34 *
35 *   ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
36 *
37 *   ./cjk_tab_to_h JISX0213:2004 jisx0213 > jisx0213.h < JISX0213.TXT
38 */
39
40#include <stdio.h>
41#include <stdlib.h>
42#include <stdbool.h>
43#include <string.h>
44#include <ctype.h>
45#include <assert.h>
46
/* A half-open range [start, end) of linear table indices.  It is used both
   for runs of rows in the charset table (index = row * cols + col) and for
   runs of 16-code-point summary slots in the Unicode table. */
typedef struct {
  int start;
  int end;
} Block;
51
/* Description of a two-byte character set together with the tables that are
   derived from it while the header file is being generated. */
typedef struct {
  int rows;    /* number of possible values for the 1st byte */
  int cols;    /* number of possible values for the 2nd byte */
  int (*row_byte) (int row); /* returns the 1st byte value for a given row */
  int (*col_byte) (int col); /* returns the 2nd byte value for a given col */
  int (*byte_row) (int byte); /* converts a 1st byte value to a row, else -1 */
  int (*byte_col) (int byte); /* converts a 2nd byte value to a col, else -1 */
  const char* check_row_expr; /* printf format (one %1$s operand): 1st byte value check */
  const char* check_col_expr; /* printf format (one %1$s operand): 2nd byte value check */
  const char* byte_row_expr; /* printf format (one %1$s operand): 1st byte value to row */
  const char* byte_col_expr; /* printf format (one %1$s operand): 2nd byte value to col */
  int** charset2uni; /* charset2uni[0..rows-1][0..cols-1] is valid */
  /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
     Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
  int* charsetpage; /* charsetpage[0..rows]: number of leading columns in use
                       in each row; charsetpage[rows] is a 0 sentinel */
  int ncharsetblocks; /* number of valid entries in charsetblocks */
  Block* charsetblocks; /* charsetblocks[0..ncharsetblocks-1] */
  int* uni2charset; /* uni2charset[0x0000..0x2ffff], 0 means "no mapping" */
  int fffd;    /* representation of the invalid character in charset2uni:
                  0xfffd, or its compacted form after compaction */
} Encoding;
72
/*
 * Writes the header of the generated file to standard output: the fixed
 * LGPL license comment followed by a short comment naming the character set.
 */
static void output_title (const char *charsetname)
{
  /* The license text contains no conversion specifications, so it can be
     emitted verbatim in one piece. */
  static const char license_comment[] =
    "/*\n"
    " * Copyright (C) 1999-2006 Free Software Foundation, Inc.\n"
    " * This file is part of the GNU LIBICONV Library.\n"
    " *\n"
    " * The GNU LIBICONV Library is free software; you can redistribute it\n"
    " * and/or modify it under the terms of the GNU Library General Public\n"
    " * License as published by the Free Software Foundation; either version 2\n"
    " * of the License, or (at your option) any later version.\n"
    " *\n"
    " * The GNU LIBICONV Library is distributed in the hope that it will be\n"
    " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n"
    " * Library General Public License for more details.\n"
    " *\n"
    " * You should have received a copy of the GNU Library General Public\n"
    " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n"
    " * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,\n"
    " * Fifth Floor, Boston, MA 02110-1301, USA.\n"
    " */\n"
    "\n";

  fputs(license_comment, stdout);
  printf("/*\n * %s\n */\n\n", charsetname);
}
103
104/*
105 * Reads the charset2uni table from standard input.
106 */
107static void read_table (Encoding* enc)
108{
109  int row, col, i, i1, i2, c, j;
110
111  enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
112  for (row = 0; row < enc->rows; row++)
113    enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
114
115  for (row = 0; row < enc->rows; row++)
116    for (col = 0; col < enc->cols; col++)
117      enc->charset2uni[row][col] = 0xfffd;
118
119  c = getc(stdin);
120  ungetc(c,stdin);
121  if (c == '#') {
122    /* Read a unicode.org style .TXT file. */
123    for (;;) {
124      c = getc(stdin);
125      if (c == EOF)
126        break;
127      if (c == '\n' || c == ' ' || c == '\t')
128        continue;
129      if (c == '#') {
130        do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
131        continue;
132      }
133      ungetc(c,stdin);
134      if (scanf("0x%x", &j) != 1)
135        exit(1);
136      i1 = j >> 8;
137      i2 = j & 0xff;
138      row = enc->byte_row(i1);
139      col = enc->byte_col(i2);
140      if (row < 0 || col < 0) {
141        fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
142        exit(1);
143      }
144      if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1)
145        exit(1);
146    }
147  } else {
148    /* Read a table of hexadecimal Unicode values. */
149    for (i1 = 32; i1 < 132; i1++)
150      for (i2 = 32; i2 < 132; i2++) {
151        i = scanf("%x", &j);
152        if (i == EOF)
153          goto read_done;
154        if (i != 1)
155          exit(1);
156        if (j < 0 || j == 0xffff)
157          j = 0xfffd;
158        if (j != 0xfffd) {
159          if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
160            fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
161            exit (1);
162          }
163          enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
164        }
165      }
166   read_done: ;
167  }
168}
169
170/*
171 * Determine whether the Unicode range goes outside the BMP.
172 */
173static bool is_charset2uni_large (Encoding* enc)
174{
175  int row, col;
176
177  for (row = 0; row < enc->rows; row++)
178    for (col = 0; col < enc->cols; col++)
179      if (enc->charset2uni[row][col] >= 0x10000)
180        return true;
181  return false;
182}
183
184/*
185 * Compactify the Unicode range by use of an auxiliary table,
186 * so 16 bits suffice to store each value.
187 */
188static int compact_large_charset2uni (Encoding* enc, unsigned int **urows, unsigned int *urowshift)
189{
190  unsigned int shift;
191
192  for (shift = 8; ; shift--) {
193    int *upages = (int *) malloc((0x110000>>shift) * sizeof(int));
194    int i, row, col, nurows;
195
196    for (i = 0; i < 0x110000>>shift; i++)
197      upages[i] = -1;
198
199    for (row = 0; row < enc->rows; row++)
200      for (col = 0; col < enc->cols; col++)
201        upages[enc->charset2uni[row][col] >> shift] = 0;
202
203    nurows = 0;
204    for (i = 0; i < 0x110000>>shift; i++)
205      if (upages[i] == 0)
206        nurows++;
207
208    /* We want all table entries to fit in an 'unsigned short'. */
209    if (nurows <= 1<<(16-shift)) {
210      int** old_charset2uni;
211
212      *urows = (unsigned int *) malloc(nurows * sizeof(unsigned int));
213      *urowshift = shift;
214
215      nurows = 0;
216      for (i = 0; i < 0x110000>>shift; i++)
217        if (upages[i] == 0) {
218          upages[i] = nurows;
219          (*urows)[nurows] = i;
220          nurows++;
221        }
222
223      old_charset2uni = enc->charset2uni;
224      enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
225      for (row = 0; row < enc->rows; row++)
226        enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
227      for (row = 0; row < enc->rows; row++)
228        for (col = 0; col < enc->cols; col++) {
229          int u = old_charset2uni[row][col];
230          enc->charset2uni[row][col] =
231            (upages[u >> shift] << shift) | (u & ((1 << shift) - 1));
232        }
233      enc->fffd =
234        (upages[0xfffd >> shift] << shift) | (0xfffd & ((1 << shift) - 1));
235
236      return nurows;
237    }
238  }
239  abort();
240}
241
242/*
243 * Computes the charsetpage[0..rows] array.
244 */
245static void find_charset2uni_pages (Encoding* enc)
246{
247  int row, col;
248
249  enc->charsetpage = (int*) malloc((enc->rows+1)*sizeof(int));
250
251  for (row = 0; row <= enc->rows; row++)
252    enc->charsetpage[row] = 0;
253
254  for (row = 0; row < enc->rows; row++) {
255    int used = 0;
256    for (col = 0; col < enc->cols; col++)
257      if (enc->charset2uni[row][col] != enc->fffd)
258        used = col+1;
259    enc->charsetpage[row] = used;
260  }
261}
262
263/*
264 * Fills in nblocks and blocks.
265 */
266static void find_charset2uni_blocks (Encoding* enc)
267{
268  int n, row, lastrow;
269
270  enc->charsetblocks = (Block*) malloc(enc->rows*sizeof(Block));
271
272  n = 0;
273  for (row = 0; row < enc->rows; row++)
274    if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
275      for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
276      enc->charsetblocks[n].start = row * enc->cols;
277      enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
278      n++;
279    }
280  enc->ncharsetblocks = n;
281}
282
283/*
284 * Outputs the charset to unicode table and function.
285 */
286static void output_charset2uni (const char* name, Encoding* enc)
287{
288  int nurows, row, col, lastrow, col_max, i, i1_min, i1_max;
289  bool is_large;
290  unsigned int* urows;
291  unsigned int urowshift;
292  Encoding tmpenc;
293
294  is_large = is_charset2uni_large(enc);
295  if (is_large) {
296    /* Use a temporary copy of enc. */
297    tmpenc = *enc;
298    enc = &tmpenc;
299    nurows = compact_large_charset2uni(enc,&urows,&urowshift);
300  } else {
301    nurows = 0; urows = NULL; urowshift = 0; enc->fffd = 0xfffd;
302  }
303
304  find_charset2uni_pages(enc);
305
306  find_charset2uni_blocks(enc);
307
308  for (row = 0; row < enc->rows; row++)
309    if (enc->charsetpage[row] > 0) {
310      if (row == 0 || enc->charsetpage[row-1] == 0) {
311        /* Start a new block. */
312        for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
313        printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
314               name, enc->row_byte(row),
315               (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
316      }
317      printf("  /""* 0x%02x *""/\n ", enc->row_byte(row));
318      col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
319      for (col = 0; col < col_max; col++) {
320        printf(" 0x%04x,", enc->charset2uni[row][col]);
321        if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
322      }
323      printf("\n");
324      if (enc->charsetpage[row+1] == 0) {
325        /* End a block. */
326        printf("};\n");
327      }
328    }
329  printf("\n");
330
331  if (is_large) {
332    printf("static const ucs4_t %s_2uni_upages[%d] = {\n ", name, nurows);
333    for (i = 0; i < nurows; i++) {
334      printf(" 0x%05x,", urows[i] << urowshift);
335      if ((i % 8) == 7 && (i+1 < nurows)) printf("\n ");
336    }
337    printf("\n");
338    printf("};\n");
339    printf("\n");
340  }
341
342  printf("static int\n");
343  printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
344  printf("{\n");
345  printf("  unsigned char c1 = s[0];\n");
346  printf("  if (");
347  for (i = 0; i < enc->ncharsetblocks; i++) {
348    i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
349    i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
350    if (i > 0)
351      printf(" || ");
352    if (i1_min == i1_max)
353      printf("(c1 == 0x%02x)", i1_min);
354    else
355      printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
356  }
357  printf(") {\n");
358  printf("    if (n >= 2) {\n");
359  printf("      unsigned char c2 = s[1];\n");
360  printf("      if (");
361  printf(enc->check_col_expr, "c2");
362  printf(") {\n");
363  printf("        unsigned int i = %d * (", enc->cols);
364  printf(enc->byte_row_expr, "c1");
365  printf(") + (");
366  printf(enc->byte_col_expr, "c2");
367  printf(");\n");
368  printf("        %s wc = 0xfffd;\n", is_large ? "ucs4_t" : "unsigned short");
369  if (is_large) printf("        unsigned short swc;\n");
370  for (i = 0; i < enc->ncharsetblocks; i++) {
371    printf("        ");
372    if (i > 0)
373      printf("} else ");
374    if (i < enc->ncharsetblocks-1)
375      printf("if (i < %d) ", enc->charsetblocks[i+1].start);
376    printf("{\n");
377    printf("          if (i < %d)\n", enc->charsetblocks[i].end);
378    printf("            %s = ", is_large ? "swc" : "wc");
379    printf("%s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
380    if (enc->charsetblocks[i].start > 0)
381      printf("-%d", enc->charsetblocks[i].start);
382    printf("]");
383    if (is_large) printf(",\n            wc = %s_2uni_upages[swc>>%d] | (swc & 0x%x)", name, urowshift, (1 << urowshift) - 1);
384    printf(";\n");
385  }
386  printf("        }\n");
387  printf("        if (wc != 0xfffd) {\n");
388  printf("          *pwc = %swc;\n", is_large ? "" : "(ucs4_t) ");
389  printf("          return 2;\n");
390  printf("        }\n");
391  printf("      }\n");
392  printf("      return RET_ILSEQ;\n");
393  printf("    }\n");
394  printf("    return RET_TOOFEW(0);\n");
395  printf("  }\n");
396  printf("  return RET_ILSEQ;\n");
397  printf("}\n");
398  printf("\n");
399}
400
/*
 * Outputs the charset to unicode table and function.
 * (Suitable if the mapping function is well defined, i.e. has no holes, and
 * is monotonically increasing with small gaps only.)
 *
 * Each row is divided into steps_per_row steps.  Per block of rows, two
 * arrays are emitted: NAME_2uni_main_pageXX ('unsigned short') holds the
 * Unicode value at the start of every step, and NAME_2uni_pageXX
 * ('unsigned char') holds, for every cell, the difference to the value at
 * the start of its step.  The emitted NAME_mbtowc adds the two.
 *
 * Sets enc->charsetpage, enc->charsetblocks and enc->ncharsetblocks.
 */
static void output_charset2uni_noholes_monotonic (const char* name, Encoding* enc)
{
  int row, col, lastrow, r, col_max, i, i1_min, i1_max;

  /* Choose stepsize so that stepsize*steps_per_row >= enc->cols, and
     enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]
     is always < 0x100. */
  int steps_per_row = 2;
  int stepsize = (enc->cols + steps_per_row-1) / steps_per_row;

  find_charset2uni_pages(enc);

  find_charset2uni_blocks(enc);

  for (row = 0; row < enc->rows; row++)
    if (enc->charsetpage[row] > 0) {
      if (row == 0 || enc->charsetpage[row-1] == 0) {
        /* Start a new block. */
        for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
        /* The main table of the block: the base value of every step. */
        printf("static const unsigned short %s_2uni_main_page%02x[%d] = {\n ",
               name, enc->row_byte(row),
               steps_per_row*(lastrow-row+1));
        for (r = row; r <= lastrow; r++) {
          for (i = 0; i < steps_per_row; i++)
            printf(" 0x%04x,", enc->charset2uni[r][i*stepsize]);
          if (((r-row) % 4) == 3 && (r < lastrow)) printf("\n ");
        }
        printf("\n");
        printf("};\n");
        /* The header of the block's table of differences. */
        printf("static const unsigned char %s_2uni_page%02x[%d] = {\n",
               name, enc->row_byte(row),
               (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
      }
      printf("  /""* 0x%02x *""/\n ", enc->row_byte(row));
      /* All rows of a block but the last one are written in full. */
      col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
      for (col = 0; col < col_max; col++) {
        printf(" 0x%02x,", enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]);
        if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
      }
      printf("\n");
      if (enc->charsetpage[row+1] == 0) {
        /* End a block. */
        printf("};\n");
      }
    }
  printf("\n");

  printf("static int\n");
  printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
  printf("{\n");
  printf("  unsigned char c1 = s[0];\n");
  /* Test of the 1st byte: one range per block. */
  printf("  if (");
  for (i = 0; i < enc->ncharsetblocks; i++) {
    i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
    i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
    if (i > 0)
      printf(" || ");
    if (i1_min == i1_max)
      printf("(c1 == 0x%02x)", i1_min);
    else
      printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
  }
  printf(") {\n");
  printf("    if (n >= 2) {\n");
  printf("      unsigned char c2 = s[1];\n");
  printf("      if (");
  /* The *_expr members are format strings taking the operand name. */
  printf(enc->check_col_expr, "c2");
  printf(") {\n");
  printf("        unsigned int row = ");
  printf(enc->byte_row_expr, "c1");
  printf(";\n");
  printf("        unsigned int col = ");
  printf(enc->byte_col_expr, "c2");
  printf(";\n");
  printf("        unsigned int i = %d * row + col;\n", enc->cols);
  printf("        unsigned short wc = 0xfffd;\n");
  /* Lookup: an if/else chain that selects the block containing i. */
  for (i = 0; i < enc->ncharsetblocks; i++) {
    printf("        ");
    if (i > 0)
      printf("} else ");
    if (i < enc->ncharsetblocks-1)
      printf("if (i < %d) ", enc->charsetblocks[i+1].start);
    printf("{\n");
    printf("          if (i < %d)\n", enc->charsetblocks[i].end);
    printf("            wc = %s_2uni_main_page%02x[%d*", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols), steps_per_row);
    if (enc->charsetblocks[i].start > 0)
      printf("(row-%d)", enc->charsetblocks[i].start / enc->cols);
    else
      printf("row");
    printf("+");
    /* With two steps per row a comparison replaces the division. */
    if (steps_per_row == 2)
      printf("(col>=%d?1:0)", stepsize);
    else
      printf("col/%d", stepsize);
    printf("] + %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
    if (enc->charsetblocks[i].start > 0)
      printf("-%d", enc->charsetblocks[i].start);
    printf("];\n");
  }
  printf("        }\n");
  printf("        if (wc != 0xfffd) {\n");
  printf("          *pwc = (ucs4_t) wc;\n");
  printf("          return 2;\n");
  printf("        }\n");
  printf("      }\n");
  printf("      return RET_ILSEQ;\n");
  printf("    }\n");
  printf("    return RET_TOOFEW(0);\n");
  printf("  }\n");
  printf("  return RET_ILSEQ;\n");
  printf("}\n");
  printf("\n");
}
519
520/*
521 * Computes the uni2charset[0x0000..0x2ffff] array.
522 */
523static void invert (Encoding* enc)
524{
525  int row, col, j;
526
527  enc->uni2charset = (int*) malloc(0x30000*sizeof(int));
528
529  for (j = 0; j < 0x30000; j++)
530    enc->uni2charset[j] = 0;
531
532  for (row = 0; row < enc->rows; row++)
533    for (col = 0; col < enc->cols; col++) {
534      j = enc->charset2uni[row][col];
535      if (j != 0xfffd)
536        enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
537    }
538}
539
540/*
541 * Outputs the unicode to charset table and function, using a linear array.
542 * (Suitable if the table is dense.)
543 */
544static void output_uni2charset_dense (const char* name, Encoding* enc)
545{
546  /* Like in 8bit_tab_to_h.c */
547  bool pages[0x300];
548  int line[0x6000];
549  int tableno;
550  struct { int minline; int maxline; int usecount; } tables[0x6000];
551  bool first;
552  int row, col, j, p, j1, j2, t;
553
554  for (p = 0; p < 0x300; p++)
555    pages[p] = false;
556  for (row = 0; row < enc->rows; row++)
557    for (col = 0; col < enc->cols; col++) {
558      j = enc->charset2uni[row][col];
559      if (j != 0xfffd)
560        pages[j>>8] = true;
561    }
562  for (j1 = 0; j1 < 0x6000; j1++) {
563    bool all_invalid = true;
564    for (j2 = 0; j2 < 8; j2++) {
565      j = 8*j1+j2;
566      if (enc->uni2charset[j] != 0)
567        all_invalid = false;
568    }
569    if (all_invalid)
570      line[j1] = -1;
571    else
572      line[j1] = 0;
573  }
574  tableno = 0;
575  for (j1 = 0; j1 < 0x6000; j1++) {
576    if (line[j1] >= 0) {
577      if (tableno > 0
578          && ((j1 > 0 && line[j1-1] == tableno-1)
579              || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
580                  && j1 - tables[tableno-1].maxline <= 8))) {
581        line[j1] = tableno-1;
582        tables[tableno-1].maxline = j1;
583      } else {
584        tableno++;
585        line[j1] = tableno-1;
586        tables[tableno-1].minline = tables[tableno-1].maxline = j1;
587      }
588    }
589  }
590  for (t = 0; t < tableno; t++) {
591    tables[t].usecount = 0;
592    j1 = 8*tables[t].minline;
593    j2 = 8*(tables[t].maxline+1);
594    for (j = j1; j < j2; j++)
595      if (enc->uni2charset[j] != 0)
596        tables[t].usecount++;
597  }
598  {
599    p = -1;
600    for (t = 0; t < tableno; t++)
601      if (tables[t].usecount > 1) {
602        p = tables[t].minline >> 5;
603        printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
604        for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
605          if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
606            printf("  /* 0x%04x */\n", 8*j1);
607          printf(" ");
608          for (j2 = 0; j2 < 8; j2++) {
609            j = 8*j1+j2;
610            printf(" 0x%04x,", enc->uni2charset[j]);
611          }
612          printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
613        }
614        printf("};\n");
615      }
616    if (p >= 0)
617      printf("\n");
618  }
619  printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
620  printf("{\n");
621  printf("  if (n >= 2) {\n");
622  printf("    unsigned short c = 0;\n");
623  first = true;
624  for (j1 = 0; j1 < 0x6000;) {
625    t = line[j1];
626    for (j2 = j1; j2 < 0x6000 && line[j2] == t; j2++);
627    if (t >= 0) {
628      if (j1 != tables[t].minline) abort();
629      if (j2 > tables[t].maxline+1) abort();
630      j2 = tables[t].maxline+1;
631      if (first)
632        printf("    ");
633      else
634        printf("    else ");
635      first = false;
636      if (tables[t].usecount == 0) abort();
637      if (tables[t].usecount == 1) {
638        if (j2 != j1+1) abort();
639        for (j = 8*j1; j < 8*j2; j++)
640          if (enc->uni2charset[j] != 0) {
641            printf("if (wc == 0x%04x)\n      c = 0x%02x;\n", j, enc->uni2charset[j]);
642            break;
643          }
644      } else {
645        if (j1 == 0) {
646          printf("if (wc < 0x%04x)", 8*j2);
647        } else {
648          printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
649        }
650        printf("\n      c = %s_page%02x[wc", name, j1 >> 5);
651        if (tables[t].minline > 0)
652          printf("-0x%04x", 8*j1);
653        printf("];\n");
654      }
655    }
656    j1 = j2;
657  }
658  printf("    if (c != 0) {\n");
659  printf("      r[0] = (c >> 8); r[1] = (c & 0xff);\n");
660  printf("      return 2;\n");
661  printf("    }\n");
662  printf("    return RET_ILUNI;\n");
663  printf("  }\n");
664  printf("  return RET_TOOSMALL;\n");
665  printf("}\n");
666}
667
668/*
669 * Outputs the unicode to charset table and function, using a packed array.
670 * (Suitable if the table is sparse.)
671 * The argument 'monotonic' may be set to true if the mapping is monotonically
672 * increasing with small gaps only.
673 */
674static void output_uni2charset_sparse (const char* name, Encoding* enc, bool monotonic)
675{
676  bool pages[0x300];
677  Block pageblocks[0x300]; int npageblocks;
678  int indx2charset[0x30000];
679  int summary_indx[0x3000];
680  int summary_used[0x3000];
681  int i, row, col, j, p, j1, j2, indx;
682  bool is_large;
683  /* for monotonic: */
684  int log2_stepsize = (!strcmp(name,"uhc_2") ? 6 : 7);
685  int stepsize = 1 << log2_stepsize;
686  int indxsteps;
687
688  /* Fill pages[0x300]. */
689  for (p = 0; p < 0x300; p++)
690    pages[p] = false;
691  for (row = 0; row < enc->rows; row++)
692    for (col = 0; col < enc->cols; col++) {
693      j = enc->charset2uni[row][col];
694      if (j != 0xfffd)
695        pages[j>>8] = true;
696    }
697
698  /* Determine whether two or three bytes are needed for each character. */
699  is_large = false;
700  for (j = 0; j < 0x30000; j++)
701    if (enc->uni2charset[j] >= 0x10000)
702      is_large = true;
703
704#if 0
705  for (p = 0; p < 0x300; p++)
706    if (pages[p]) {
707      printf("static const unsigned short %s_page%02x[256] = {\n", name, p);
708      for (j1 = 0; j1 < 32; j1++) {
709        printf("  ");
710        for (j2 = 0; j2 < 8; j2++)
711          printf("0x%04x, ", enc->uni2charset[256*p+8*j1+j2]);
712        printf("/""*0x%02x-0x%02x*""/\n", 8*j1, 8*j1+7);
713      }
714      printf("};\n");
715    }
716  printf("\n");
717#endif
718
719  /* Fill summary_indx[] and summary_used[]. */
720  indx = 0;
721  for (j1 = 0; j1 < 0x3000; j1++) {
722    summary_indx[j1] = indx;
723    summary_used[j1] = 0;
724    for (j2 = 0; j2 < 16; j2++) {
725      j = 16*j1+j2;
726      if (enc->uni2charset[j] != 0) {
727        indx2charset[indx++] = enc->uni2charset[j];
728        summary_used[j1] |= (1 << j2);
729      }
730    }
731  }
732
733  /* Fill npageblocks and pageblocks[]. */
734  npageblocks = 0;
735  for (p = 0; p < 0x300; ) {
736    if (pages[p] && (p == 0 || !pages[p-1])) {
737      pageblocks[npageblocks].start = 16*p;
738      do p++; while (p < 0x300 && pages[p]);
739      j1 = 16*p;
740      while (summary_used[j1-1] == 0) j1--;
741      pageblocks[npageblocks].end = j1;
742      npageblocks++;
743    } else
744      p++;
745  }
746
747  if (monotonic) {
748    indxsteps = (indx + stepsize-1) / stepsize;
749    printf("static const unsigned short %s_2charset_main[%d] = {\n", name, indxsteps);
750    for (i = 0; i < indxsteps; ) {
751      if ((i % 8) == 0) printf(" ");
752      printf(" 0x%04x,", indx2charset[i*stepsize]);
753      i++;
754      if ((i % 8) == 0 || i == indxsteps) printf("\n");
755    }
756    printf("};\n");
757    printf("static const unsigned char %s_2charset[%d] = {\n", name, indx);
758    for (i = 0; i < indx; ) {
759      if ((i % 8) == 0) printf(" ");
760      printf(" 0x%02x,", indx2charset[i] - indx2charset[i/stepsize*stepsize]);
761      i++;
762      if ((i % 8) == 0 || i == indx) printf("\n");
763    }
764    printf("};\n");
765  } else {
766    if (is_large) {
767      printf("static const unsigned char %s_2charset[3*%d] = {\n", name, indx);
768      for (i = 0; i < indx; ) {
769        if ((i % 4) == 0) printf(" ");
770        printf(" 0x%1x,0x%02x,0x%02x,", indx2charset[i] >> 16,
771               (indx2charset[i] >> 8) & 0xff, indx2charset[i] & 0xff);
772        i++;
773        if ((i % 4) == 0 || i == indx) printf("\n");
774      }
775      printf("};\n");
776    } else {
777      printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
778      for (i = 0; i < indx; ) {
779        if ((i % 8) == 0) printf(" ");
780        printf(" 0x%04x,", indx2charset[i]);
781        i++;
782        if ((i % 8) == 0 || i == indx) printf("\n");
783      }
784      printf("};\n");
785    }
786  }
787  printf("\n");
788  for (i = 0; i < npageblocks; i++) {
789    printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
790           pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
791    for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
792      if (((16*j1) % 0x100) == 0) printf("  /""* 0x%04x *""/\n", 16*j1);
793      if ((j1 % 4) == 0) printf(" ");
794      printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
795      j1++;
796      if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
797    }
798    printf("};\n");
799  }
800  printf("\n");
801
802  printf("static int\n");
803  printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
804  printf("{\n");
805  printf("  if (n >= 2) {\n");
806  printf("    const Summary16 *summary = NULL;\n");
807  for (i = 0; i < npageblocks; i++) {
808    printf("    ");
809    if (i > 0)
810      printf("else ");
811    printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
812           16*pageblocks[i].start, 16*pageblocks[i].end);
813    printf("      summary = &%s_uni2indx_page%02x[(wc>>4)", name,
814           pageblocks[i].start/16);
815    if (pageblocks[i].start > 0)
816      printf("-0x%03x", pageblocks[i].start);
817    printf("];\n");
818  }
819  printf("    if (summary) {\n");
820  printf("      unsigned short used = summary->used;\n");
821  printf("      unsigned int i = wc & 0x0f;\n");
822  printf("      if (used & ((unsigned short) 1 << i)) {\n");
823  if (monotonic || !is_large)
824    printf("        unsigned short c;\n");
825  printf("        /* Keep in `used' only the bits 0..i-1. */\n");
826  printf("        used &= ((unsigned short) 1 << i) - 1;\n");
827  printf("        /* Add `summary->indx' and the number of bits set in `used'. */\n");
828  printf("        used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
829  printf("        used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
830  printf("        used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
831  printf("        used = (used & 0x00ff) + (used >> 8);\n");
832  if (monotonic) {
833    printf("        used += summary->indx;\n");
834    printf("        c = %s_2charset_main[used>>%d] + %s_2charset[used];\n", name, log2_stepsize, name);
835    printf("        r[0] = (c >> 8); r[1] = (c & 0xff);\n");
836    printf("        return 2;\n");
837  } else {
838    if (is_large) {
839      printf("        used += summary->indx;\n");
840      printf("        r[0] = %s_2charset[3*used];\n", name);
841      printf("        r[1] = %s_2charset[3*used+1];\n", name);
842      printf("        r[2] = %s_2charset[3*used+2];\n", name);
843      printf("        return 3;\n");
844    } else {
845      printf("        c = %s_2charset[summary->indx + used];\n", name);
846      printf("        r[0] = (c >> 8); r[1] = (c & 0xff);\n");
847      printf("        return 2;\n");
848    }
849  }
850  printf("      }\n");
851  printf("    }\n");
852  printf("    return RET_ILUNI;\n");
853  printf("  }\n");
854  printf("  return RET_TOOSMALL;\n");
855  printf("}\n");
856}
857
858/* ISO-2022/EUC specifics */
859
/* Rows and columns 0..93 correspond to the byte values 0x21..0x7e.
   The byte-to-index functions do not check the range of their argument. */
static int row_byte_normal (int row)
{
  return row + 0x21;
}

static int col_byte_normal (int col)
{
  return col + 0x21;
}

static int byte_row_normal (int byte)
{
  return -0x21 + byte;
}

static int byte_col_normal (int byte)
{
  return -0x21 + byte;
}
864
865static void do_normal (const char* name)
866{
867  Encoding enc;
868
869  enc.rows = 94;
870  enc.cols = 94;
871  enc.row_byte = row_byte_normal;
872  enc.col_byte = col_byte_normal;
873  enc.byte_row = byte_row_normal;
874  enc.byte_col = byte_col_normal;
875  enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
876  enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
877  enc.byte_row_expr = "%1$s - 0x21";
878  enc.byte_col_expr = "%1$s - 0x21";
879
880  read_table(&enc);
881  output_charset2uni(name,&enc);
882  invert(&enc); output_uni2charset_sparse(name,&enc,false);
883}
884
/* Note: At first sight, the jisx0212_2charset[] table seems to be in order,
   starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
   order. There are 75 out-of-order values, scattered throughout the table.
 */
889
890static void do_normal_only_charset2uni (const char* name)
891{
892  Encoding enc;
893
894  enc.rows = 94;
895  enc.cols = 94;
896  enc.row_byte = row_byte_normal;
897  enc.col_byte = col_byte_normal;
898  enc.byte_row = byte_row_normal;
899  enc.byte_col = byte_col_normal;
900  enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
901  enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
902  enc.byte_row_expr = "%1$s - 0x21";
903  enc.byte_col_expr = "%1$s - 0x21";
904
905  read_table(&enc);
906  output_charset2uni(name,&enc);
907}
908
909/* CNS 11643 specifics - trick to put two tables into one */
910
/* A combined row index counts 94 rows per group.  The "byte" value carries
   the group number above bit 8 and the in-group byte (row-in-group + 0x21)
   in the low 8 bits. */
static int row_byte_cns11643 (int row)
{
  int group = row / 94;
  int within = row % 94;

  return 0x100 * group + within + 0x21;
}
/* Inverse of row_byte_cns11643; no range checking is done. */
static int byte_row_cns11643 (int byte)
{
  int group = byte >> 8;
  int low = byte & 0xff;

  return group * 94 + low - 0x21;
}
917
918static void do_cns11643_only_uni2charset (const char* name)
919{
920  Encoding enc;
921
922  enc.rows = 16*94;
923  enc.cols = 94;
924  enc.row_byte = row_byte_cns11643;
925  enc.col_byte = col_byte_normal;
926  enc.byte_row = byte_row_cns11643;
927  enc.byte_col = byte_col_normal;
928  enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
929  enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
930  enc.byte_row_expr = "%1$s - 0x21";
931  enc.byte_col_expr = "%1$s - 0x21";
932
933  read_table(&enc);
934  invert(&enc);
935  output_uni2charset_sparse(name,&enc,false);
936}
937
938/* GBK specifics */
939
/* GBK (part 1) byte <-> index helpers.
   Rows: bytes 0x81..0xFE map to indices 0..125.
   Columns: bytes 0x40..0x7E map to 0..0x3E and bytes 0x80..0xFE map to
   0x3F..0xBD; byte 0x7F is skipped.
   The byte_* functions return -1 for a byte outside these ranges. */
static int row_byte_gbk1 (int row)
{
  return row + 0x81;
}
static int col_byte_gbk1 (int col)
{
  if (col >= 0x3f)
    return col + 0x41;
  return col + 0x40;
}
static int byte_row_gbk1 (int byte)
{
  return (byte >= 0x81 && byte < 0xff) ? byte - 0x81 : -1;
}
static int byte_col_gbk1 (int byte)
{
  if (byte >= 0x80 && byte < 0xff)
    return byte - 0x41;
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  return -1;
}
960
961static void do_gbk1 (const char* name)
962{
963  Encoding enc;
964
965  enc.rows = 126;
966  enc.cols = 190;
967  enc.row_byte = row_byte_gbk1;
968  enc.col_byte = col_byte_gbk1;
969  enc.byte_row = byte_row_gbk1;
970  enc.byte_col = byte_col_gbk1;
971  enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
972  enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
973  enc.byte_row_expr = "%1$s - 0x81";
974  enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
975
976  read_table(&enc);
977  output_charset2uni(name,&enc);
978  invert(&enc); output_uni2charset_dense(name,&enc);
979}
980
981static void do_gbk1_only_charset2uni (const char* name)
982{
983  Encoding enc;
984
985  enc.rows = 126;
986  enc.cols = 190;
987  enc.row_byte = row_byte_gbk1;
988  enc.col_byte = col_byte_gbk1;
989  enc.byte_row = byte_row_gbk1;
990  enc.byte_col = byte_col_gbk1;
991  enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
992  enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
993  enc.byte_row_expr = "%1$s - 0x81";
994  enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
995
996  read_table(&enc);
997  output_charset2uni(name,&enc);
998}
999
/* GBK (part 2) byte <-> index helpers.
   Rows: bytes 0x81..0xFE map to indices 0..125.
   Columns: bytes 0x40..0x7E map to 0..0x3E and bytes 0x80..0xA0 map to
   0x3F..0x5F; byte 0x7F is skipped.
   The byte_* functions return -1 for a byte outside these ranges. */
static int row_byte_gbk2 (int row)
{
  return row + 0x81;
}
static int col_byte_gbk2 (int col)
{
  if (col >= 0x3f)
    return col + 0x41;
  return col + 0x40;
}
static int byte_row_gbk2 (int byte)
{
  return (byte >= 0x81 && byte < 0xff) ? byte - 0x81 : -1;
}
static int byte_col_gbk2 (int byte)
{
  if (byte >= 0x80 && byte < 0xa1)
    return byte - 0x41;
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  return -1;
}
1020
1021static void do_gbk2_only_charset2uni (const char* name)
1022{
1023  Encoding enc;
1024
1025  enc.rows = 126;
1026  enc.cols = 96;
1027  enc.row_byte = row_byte_gbk2;
1028  enc.col_byte = col_byte_gbk2;
1029  enc.byte_row = byte_row_gbk2;
1030  enc.byte_col = byte_col_gbk2;
1031  enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
1032  enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)";
1033  enc.byte_row_expr = "%1$s - 0x81";
1034  enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1035
1036  read_table(&enc);
1037  output_charset2uni(name,&enc);
1038}
1039
1040static void do_gbk1_only_uni2charset (const char* name)
1041{
1042  Encoding enc;
1043
1044  enc.rows = 126;
1045  enc.cols = 190;
1046  enc.row_byte = row_byte_gbk1;
1047  enc.col_byte = col_byte_gbk1;
1048  enc.byte_row = byte_row_gbk1;
1049  enc.byte_col = byte_col_gbk1;
1050  enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
1051  enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
1052  enc.byte_row_expr = "%1$s - 0x81";
1053  enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1054
1055  read_table(&enc);
1056  invert(&enc); output_uni2charset_sparse(name,&enc,false);
1057}
1058
1059/* KSC 5601 specifics */
1060
1061/*
1062 * Reads the charset2uni table from standard input.
1063 */
1064static void read_table_ksc5601 (Encoding* enc)
1065{
1066  int row, col, i, i1, i2, c, j;
1067
1068  enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
1069  for (row = 0; row < enc->rows; row++)
1070    enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
1071
1072  for (row = 0; row < enc->rows; row++)
1073    for (col = 0; col < enc->cols; col++)
1074      enc->charset2uni[row][col] = 0xfffd;
1075
1076  c = getc(stdin);
1077  ungetc(c,stdin);
1078  if (c == '#') {
1079    /* Read a unicode.org style .TXT file. */
1080    for (;;) {
1081      c = getc(stdin);
1082      if (c == EOF)
1083        break;
1084      if (c == '\n' || c == ' ' || c == '\t')
1085        continue;
1086      if (c == '#') {
1087        do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
1088        continue;
1089      }
1090      ungetc(c,stdin);
1091      if (scanf("0x%x", &j) != 1)
1092        exit(1);
1093      i1 = j >> 8;
1094      i2 = j & 0xff;
1095      if (scanf(" 0x%x", &j) != 1)
1096        exit(1);
1097      /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
1098         = KS X 1001.1992, ignore the rest. */
1099      if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127))
1100        continue;  /* KSC5601 specific */
1101      i1 &= 0x7f;  /* KSC5601 specific */
1102      i2 &= 0x7f;  /* KSC5601 specific */
1103      row = enc->byte_row(i1);
1104      col = enc->byte_col(i2);
1105      if (row < 0 || col < 0) {
1106        fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
1107        exit(1);
1108      }
1109      enc->charset2uni[row][col] = j;
1110    }
1111  } else {
1112    /* Read a table of hexadecimal Unicode values. */
1113    for (i1 = 33; i1 < 127; i1++)
1114      for (i2 = 33; i2 < 127; i2++) {
1115        i = scanf("%x", &j);
1116        if (i == EOF)
1117          goto read_done;
1118        if (i != 1)
1119          exit(1);
1120        if (j < 0 || j == 0xffff)
1121          j = 0xfffd;
1122        if (j != 0xfffd) {
1123          if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
1124            fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
1125            exit (1);
1126          }
1127          enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
1128        }
1129      }
1130   read_done: ;
1131  }
1132}
1133
1134static void do_ksc5601 (const char* name)
1135{
1136  Encoding enc;
1137
1138  enc.rows = 94;
1139  enc.cols = 94;
1140  enc.row_byte = row_byte_normal;
1141  enc.col_byte = col_byte_normal;
1142  enc.byte_row = byte_row_normal;
1143  enc.byte_col = byte_col_normal;
1144  enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
1145  enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
1146  enc.byte_row_expr = "%1$s - 0x21";
1147  enc.byte_col_expr = "%1$s - 0x21";
1148
1149  read_table_ksc5601(&enc);
1150  output_charset2uni(name,&enc);
1151  invert(&enc); output_uni2charset_sparse(name,&enc,false);
1152}
1153
1154/* UHC specifics */
1155
1156/* UHC part 1: 0x{81..A0}{41..5A,61..7A,81..FE} */
1157
/* UHC part 1 byte <-> index helpers.
   Rows: bytes 0x81..0xA0 map to indices 0..31.
   Columns: bytes 0x41..0x5A map to 0..0x19, bytes 0x61..0x7A map to
   0x1A..0x33, and bytes 0x81..0xFE map to 0x34..0xB1.
   The byte_* functions return -1 for a byte outside these ranges. */
static int row_byte_uhc_1 (int row)
{
  return row + 0x81;
}
static int col_byte_uhc_1 (int col)
{
  if (col >= 0x34)
    return col + 0x4d;
  if (col >= 0x1a)
    return col + 0x47;
  return col + 0x41;
}
static int byte_row_uhc_1 (int byte)
{
  return (byte >= 0x81 && byte < 0xa1) ? byte - 0x81 : -1;
}
static int byte_col_uhc_1 (int byte)
{
  if (byte >= 0x81 && byte < 0xff)
    return byte - 0x4d;
  if (byte >= 0x61 && byte < 0x7b)
    return byte - 0x47;
  if (byte >= 0x41 && byte < 0x5b)
    return byte - 0x41;
  return -1;
}
1180
1181static void do_uhc_1 (const char* name)
1182{
1183  Encoding enc;
1184
1185  enc.rows = 32;
1186  enc.cols = 178;
1187  enc.row_byte = row_byte_uhc_1;
1188  enc.col_byte = col_byte_uhc_1;
1189  enc.byte_row = byte_row_uhc_1;
1190  enc.byte_col = byte_col_uhc_1;
1191  enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa1)";
1192  enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xff)";
1193  enc.byte_row_expr = "%1$s - 0x81";
1194  enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
1195
1196  read_table(&enc);
1197  output_charset2uni_noholes_monotonic(name,&enc);
1198  invert(&enc); output_uni2charset_sparse(name,&enc,true);
1199}
1200
1201/* UHC part 2: 0x{A1..C6}{41..5A,61..7A,81..A0} */
1202
/* UHC part 2 byte <-> index helpers.
   Rows: bytes 0xA1..0xFE map to indices 0..93.
   Columns: bytes 0x41..0x5A map to 0..0x19, bytes 0x61..0x7A map to
   0x1A..0x33, and bytes 0x81..0xA0 map to 0x34..0x53.
   The byte_* functions return -1 for a byte outside these ranges. */
static int row_byte_uhc_2 (int row)
{
  return row + 0xa1;
}
static int col_byte_uhc_2 (int col)
{
  if (col >= 0x34)
    return col + 0x4d;
  if (col >= 0x1a)
    return col + 0x47;
  return col + 0x41;
}
static int byte_row_uhc_2 (int byte)
{
  return (byte >= 0xa1 && byte < 0xff) ? byte - 0xa1 : -1;
}
static int byte_col_uhc_2 (int byte)
{
  if (byte >= 0x81 && byte < 0xa1)
    return byte - 0x4d;
  if (byte >= 0x61 && byte < 0x7b)
    return byte - 0x47;
  if (byte >= 0x41 && byte < 0x5b)
    return byte - 0x41;
  return -1;
}
1225
1226static void do_uhc_2 (const char* name)
1227{
1228  Encoding enc;
1229
1230  enc.rows = 94;
1231  enc.cols = 84;
1232  enc.row_byte = row_byte_uhc_2;
1233  enc.col_byte = col_byte_uhc_2;
1234  enc.byte_row = byte_row_uhc_2;
1235  enc.byte_col = byte_col_uhc_2;
1236  enc.check_row_expr = "(%1$s >= 0xa1 && %1$s < 0xff)";
1237  enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xa1)";
1238  enc.byte_row_expr = "%1$s - 0xa1";
1239  enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
1240
1241  read_table(&enc);
1242  output_charset2uni_noholes_monotonic(name,&enc);
1243  invert(&enc); output_uni2charset_sparse(name,&enc,true);
1244}
1245
1246/* Big5 specifics */
1247
/* Big5 byte <-> index helpers.
   Rows: bytes 0xA1..0xFE map to indices 0..93.
   Columns: bytes 0x40..0x7E map to 0..0x3E and bytes 0xA1..0xFE map to
   0x3F..0x9C.
   The byte_* functions return -1 for a byte outside these ranges. */
static int row_byte_big5 (int row)
{
  return row + 0xa1;
}
static int col_byte_big5 (int col)
{
  if (col >= 0x3f)
    return col + 0x62;
  return col + 0x40;
}
static int byte_row_big5 (int byte)
{
  return (byte >= 0xa1 && byte < 0xff) ? byte - 0xa1 : -1;
}
static int byte_col_big5 (int byte)
{
  if (byte >= 0xa1 && byte < 0xff)
    return byte - 0x62;
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  return -1;
}
1268
1269static void do_big5 (const char* name)
1270{
1271  Encoding enc;
1272
1273  enc.rows = 94;
1274  enc.cols = 157;
1275  enc.row_byte = row_byte_big5;
1276  enc.col_byte = col_byte_big5;
1277  enc.byte_row = byte_row_big5;
1278  enc.byte_col = byte_col_big5;
1279  enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff";
1280  enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1281  enc.byte_row_expr = "%1$s - 0xa1";
1282  enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
1283
1284  read_table(&enc);
1285  output_charset2uni(name,&enc);
1286  invert(&enc); output_uni2charset_sparse(name,&enc,false);
1287}
1288
1289/* HKSCS specifics */
1290
/* HKSCS row helpers: bytes 0x80..0xFE map to row indices 0..126.
   byte_row_hkscs returns -1 for a byte outside that range. */
static int row_byte_hkscs (int row)
{
  return row + 0x80;
}
static int byte_row_hkscs (int byte)
{
  return (byte >= 0x80 && byte < 0xff) ? byte - 0x80 : -1;
}
1300
1301static void do_hkscs (const char* name)
1302{
1303  Encoding enc;
1304
1305  enc.rows = 128;
1306  enc.cols = 157;
1307  enc.row_byte = row_byte_hkscs;
1308  enc.col_byte = col_byte_big5;
1309  enc.byte_row = byte_row_hkscs;
1310  enc.byte_col = byte_col_big5;
1311  enc.check_row_expr = "%1$s >= 0x80 && %1$s < 0xff";
1312  enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1313  enc.byte_row_expr = "%1$s - 0x80";
1314  enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
1315
1316  read_table(&enc);
1317  output_charset2uni(name,&enc);
1318  invert(&enc); output_uni2charset_sparse(name,&enc,false);
1319}
1320
1321/* Johab Hangul specifics */
1322
/* Johab Hangul byte <-> index helpers.
   Rows: bytes 0x84..0xD3 map to indices 0..79.
   Columns: bytes 0x41..0x7E map to 0..0x3D and bytes 0x81..0xFE map to
   0x3E..0xBB.
   The byte_* functions return -1 for a byte outside these ranges. */
static int row_byte_johab_hangul (int row)
{
  return row + 0x84;
}
static int col_byte_johab_hangul (int col)
{
  if (col >= 0x3e)
    return col + 0x43;
  return col + 0x41;
}
static int byte_row_johab_hangul (int byte)
{
  return (byte >= 0x84 && byte < 0xd4) ? byte - 0x84 : -1;
}
static int byte_col_johab_hangul (int byte)
{
  if (byte >= 0x81 && byte < 0xff)
    return byte - 0x43;
  if (byte >= 0x41 && byte < 0x7f)
    return byte - 0x41;
  return -1;
}
1343
1344static void do_johab_hangul (const char* name)
1345{
1346  Encoding enc;
1347
1348  enc.rows = 80;
1349  enc.cols = 188;
1350  enc.row_byte = row_byte_johab_hangul;
1351  enc.col_byte = col_byte_johab_hangul;
1352  enc.byte_row = byte_row_johab_hangul;
1353  enc.byte_col = byte_col_johab_hangul;
1354  enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4";
1355  enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)";
1356  enc.byte_row_expr = "%1$s - 0x84";
1357  enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)";
1358
1359  read_table(&enc);
1360  output_charset2uni(name,&enc);
1361  invert(&enc); output_uni2charset_dense(name,&enc);
1362}
1363
1364/* SJIS specifics */
1365
/* SJIS byte <-> index helpers.
   Rows: bytes 0x81..0x9F map to indices 0..0x1E; bytes 0xE0 and above map
   to indices 0x1F and above (byte_row_sjis applies no upper bound).
   Columns: bytes 0x40..0x7E map to 0..0x3E and bytes 0x80..0xFC map to
   0x3F..0xBB.
   The byte_* functions return -1 for a byte outside these ranges. */
static int row_byte_sjis (int row)
{
  if (row >= 0x1f)
    return row + 0xc1;
  return row + 0x81;
}
static int col_byte_sjis (int col)
{
  if (col >= 0x3f)
    return col + 0x41;
  return col + 0x40;
}
static int byte_row_sjis (int byte)
{
  if (byte >= 0xe0)
    return byte - 0xc1;
  if (byte >= 0x81 && byte < 0xa0)
    return byte - 0x81;
  return -1;
}
static int byte_col_sjis (int byte)
{
  if (byte >= 0x80 && byte < 0xfd)
    return byte - 0x41;
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  return -1;
}
1388
1389static void do_sjis (const char* name)
1390{
1391  Encoding enc;
1392
1393  enc.rows = 94;
1394  enc.cols = 188;
1395  enc.row_byte = row_byte_sjis;
1396  enc.col_byte = col_byte_sjis;
1397  enc.byte_row = byte_row_sjis;
1398  enc.byte_col = byte_col_sjis;
1399  enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)";
1400  enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)";
1401  enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)";
1402  enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1403
1404  read_table(&enc);
1405  output_charset2uni(name,&enc);
1406  invert(&enc); output_uni2charset_sparse(name,&enc,false);
1407}
1408
1409/* GB18030 Unicode specifics */
1410
/* Generates the tables and converter functions for four-byte GB18030 codes
   that map to 16-bit Unicode values.

   Input (standard input): a unicode.org style .TXT file with lines of the
   form "0xB1B2B3B4 0xUUUU"; '#' starts a comment that runs to end of line.
   The four bytes must lie in 0x81..0x84, 0x30..0x39, 0x81..0xFE, 0x30..0x39
   and are folded into a linear index i; the Unicode value must be below
   0x10000.

   The mapping i -> Unicode is compressed into at most 256 ranges
   [low,high] with a constant difference per range, plus a bitmap that
   records which indices inside the ranges are actually mapped.

   Output (standard output): NAME_charset2uni_ranges, NAME_uni2charset_ranges,
   NAME_ranges, NAME_bitmap, and the C source of NAME_mbtowc and NAME_wctomb.

   Exits with status 1 when the input is malformed, contains no mapping at
   all, needs more than 256 ranges, or violates the monotonicity properties
   checked below.  */
static void do_gb18030uni (const char* name)
{
  int c;
  unsigned int bytes;
  unsigned int ucs;  /* scanf's %x conversion stores through an unsigned int* */
  int i1, i2, i3, i4, i, j, k;
  int charset2uni[4*10*126*10];  /* linear index -> Unicode; 0 = unmapped */
  struct { int low; int high; int diff; int total; } ranges[256];
  int ranges_count, ranges_total;

  for (i = 0; i < 4*10*126*10; i++)
    charset2uni[i] = 0;

  /* Read a unicode.org style .TXT file. */
  for (;;) {
    c = getc(stdin);
    if (c == EOF)
      break;
    if (c == '\n' || c == ' ' || c == '\t')
      continue;
    if (c == '#') {
      do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
      continue;
    }
    ungetc(c,stdin);
    if (scanf("0x%x", &bytes) != 1)
      exit(1);
    i1 = (bytes >> 24) & 0xff;
    i2 = (bytes >> 16) & 0xff;
    i3 = (bytes >> 8) & 0xff;
    i4 = bytes & 0xff;
    if (!(i1 >= 0x81 && i1 <= 0x84
          && i2 >= 0x30 && i2 <= 0x39
          && i3 >= 0x81 && i3 <= 0xfe
          && i4 >= 0x30 && i4 <= 0x39)) {
      fprintf(stderr, "lost entry for %02x %02x %02x %02x\n", i1, i2, i3, i4);
      exit(1);
    }
    i = (((i1-0x81) * 10 + (i2-0x30)) * 126 + (i3-0x81)) * 10 + (i4-0x30);
    if (scanf(" 0x%x", &ucs) != 1)
      exit(1);
    if (!(ucs < 0x10000))
      exit(1);
    charset2uni[i] = (int) ucs;
  }

  /* Verify that the mapping i -> j is monotonically increasing and
     of the form
        low[k] <= i <= high[k]  =>  j = diff[k] + i
     with a set of disjoint intervals (low[k], high[k]). */
  ranges_count = 0;
  for (i = 0; i < 4*10*126*10; i++)
    if (charset2uni[i] != 0) {
      int diff;
      j = charset2uni[i];
      diff = j - i;
      if (ranges_count > 0) {
        if (!(i > ranges[ranges_count-1].high))
          exit(1);
        if (!(j > ranges[ranges_count-1].high + ranges[ranges_count-1].diff))
          exit(1);
        /* Additional property: The diffs are also increasing. */
        if (!(diff >= ranges[ranges_count-1].diff))
          exit(1);
      }
      if (ranges_count > 0 && diff == ranges[ranges_count-1].diff)
        ranges[ranges_count-1].high = i;
      else {
        if (ranges_count == 256)
          exit(1);
        ranges[ranges_count].low = i;
        ranges[ranges_count].high = i;
        ranges[ranges_count].diff = diff;
        ranges_count++;
      }
    }

  /* Everything below reads ranges[0] and ranges[ranges_count-1]; without
     at least one range those accesses would be out of bounds. */
  if (ranges_count == 0) {
    fprintf(stderr, "no mappings found in input\n");
    exit(1);
  }

  /* Determine size of bitmap. */
  ranges_total = 0;
  for (k = 0; k < ranges_count; k++) {
    ranges[k].total = ranges_total;
    ranges_total += ranges[k].high - ranges[k].low + 1;
  }

  printf("static const unsigned short %s_charset2uni_ranges[%d] = {\n", name, 2*ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf("  0x%04x, 0x%04x", ranges[k].low, ranges[k].high);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const unsigned short %s_uni2charset_ranges[%d] = {\n", name, 2*ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf("  0x%04x, 0x%04x", ranges[k].low + ranges[k].diff, ranges[k].high + ranges[k].diff);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const struct { unsigned short diff; unsigned short bitmap_offset; } %s_ranges[%d] = {\n ", name, ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" { %5d, 0x%04x }", ranges[k].diff, ranges[k].total);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n ");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const unsigned char %s_bitmap[%d] = {\n ", name, (ranges_total + 7) / 8);
  {
    /* Bits are accumulated least significant first and flushed every 8
       positions; i runs over bitmap positions across all ranges. */
    int accu = 0;
    for (k = 0; k < ranges_count; k++) {
      for (i = ranges[k].total; i <= ranges[k].total + (ranges[k].high - ranges[k].low);) {
        if (charset2uni[i - ranges[k].total + ranges[k].low] != 0)
          accu |= (1 << (i % 8));
        i++;
        if ((i % 8) == 0) {
          printf(" 0x%02x", accu);
          if ((i / 8) < (ranges_total + 7) / 8) printf(",");
          if (((i / 8) % 12) == 0)
            printf("\n ");
          accu = 0;
        }
      }
      /* Sanity check: this range must end exactly where the next begins. */
      if (i != (k+1 < ranges_count ? ranges[k+1].total : ranges_total)) abort();
    }
    if ((ranges_total % 8) != 0)
      printf(" 0x%02x", accu);
    printf("\n");
  }
  printf("};\n");

  printf("\n");

  /* Emit the multibyte -> wide character converter. */
  printf("static int\n");
  printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
  printf("{\n");
  printf("  unsigned char c1 = s[0];\n");
  printf("  if (c1 >= 0x81 && c1 <= 0x84) {\n");
  printf("    if (n >= 2) {\n");
  printf("      unsigned char c2 = s[1];\n");
  printf("      if (c2 >= 0x30 && c2 <= 0x39) {\n");
  printf("        if (n >= 3) {\n");
  printf("          unsigned char c3 = s[2];\n");
  printf("          if (c3 >= 0x81 && c3 <= 0xfe) {\n");
  printf("            if (n >= 4) {\n");
  printf("              unsigned char c4 = s[3];\n");
  printf("              if (c4 >= 0x30 && c4 <= 0x39) {\n");
  printf("                unsigned int i = (((c1 - 0x81) * 10 + (c2 - 0x30)) * 126 + (c3 - 0x81)) * 10 + (c4 - 0x30);\n");
  printf("                if (i >= %d && i <= %d) {\n", ranges[0].low, ranges[ranges_count-1].high);
  printf("                  unsigned int k1 = 0;\n");
  printf("                  unsigned int k2 = %d;\n", ranges_count-1);
  printf("                  while (k1 < k2) {\n");
  printf("                    unsigned int k = (k1 + k2) / 2;\n");
  printf("                    if (i <= %s_charset2uni_ranges[2*k+1])\n", name);
  printf("                      k2 = k;\n");
  printf("                    else if (i >= %s_charset2uni_ranges[2*k+2])\n", name);
  printf("                      k1 = k + 1;\n");
  printf("                    else\n");
  printf("                      return RET_ILSEQ;\n");
  printf("                  }\n");
  printf("                  {\n");
  printf("                    unsigned int bitmap_index = i - %s_charset2uni_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name);
  printf("                    if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name);
  printf("                      unsigned int diff = %s_ranges[k1].diff;\n", name);
  printf("                      *pwc = (ucs4_t) (i + diff);\n");
  printf("                      return 4;\n");
  printf("                    }\n");
  printf("                  }\n");
  printf("                }\n");
  printf("              }\n");
  printf("              return RET_ILSEQ;\n");
  printf("            }\n");
  printf("            return RET_TOOFEW(0);\n");
  printf("          }\n");
  printf("          return RET_ILSEQ;\n");
  printf("        }\n");
  printf("        return RET_TOOFEW(0);\n");
  printf("      }\n");
  printf("      return RET_ILSEQ;\n");
  printf("    }\n");
  printf("    return RET_TOOFEW(0);\n");
  printf("  }\n");
  printf("  return RET_ILSEQ;\n");
  printf("}\n");

  printf("\n");

  /* Emit the wide character -> multibyte converter. */
  printf("static int\n");
  printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
  printf("{\n");
  printf("  if (n >= 4) {\n");
  printf("    unsigned int i = wc;\n");
  printf("    if (i >= 0x%04x && i <= 0x%04x) {\n", ranges[0].low + ranges[0].diff, ranges[ranges_count-1].high + ranges[ranges_count-1].diff);
  printf("      unsigned int k1 = 0;\n");
  printf("      unsigned int k2 = %d;\n", ranges_count-1);
  printf("      while (k1 < k2) {\n");
  printf("        unsigned int k = (k1 + k2) / 2;\n");
  printf("        if (i <= %s_uni2charset_ranges[2*k+1])\n", name);
  printf("          k2 = k;\n");
  printf("        else if (i >= %s_uni2charset_ranges[2*k+2])\n", name);
  printf("          k1 = k + 1;\n");
  printf("        else\n");
  printf("          return RET_ILUNI;\n");
  printf("      }\n");
  printf("      {\n");
  printf("        unsigned int bitmap_index = i - %s_uni2charset_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name);
  printf("        if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name);
  printf("          unsigned int diff = %s_ranges[k1].diff;\n", name);
  printf("          i -= diff;\n");
  printf("          r[3] = (i %% 10) + 0x30; i = i / 10;\n");
  printf("          r[2] = (i %% 126) + 0x81; i = i / 126;\n");
  printf("          r[1] = (i %% 10) + 0x30; i = i / 10;\n");
  printf("          r[0] = i + 0x81;\n");
  printf("          return 4;\n");
  printf("        }\n");
  printf("      }\n");
  printf("    }\n");
  printf("    return RET_ILUNI;\n");
  printf("  }\n");
  printf("  return RET_TOOSMALL;\n");
  printf("}\n");
}
1646
1647/* JISX0213 specifics */
1648
1649static void do_jisx0213 (const char* name)
1650{
1651  printf("#ifndef _JISX0213_H\n");
1652  printf("#define _JISX0213_H\n");
1653  printf("\n");
1654  printf("/* JISX0213 plane 1 (= ISO-IR-233) characters are in the range\n");
1655  printf("   0x{21..7E}{21..7E}.\n");
1656  printf("   JISX0213 plane 2 (= ISO-IR-229) characters are in the range\n");
1657  printf("   0x{21,23..25,28,2C..2F,6E..7E}{21..7E}.\n");
1658  printf("   Together this makes 120 rows of 94 characters.\n");
1659  printf("*/\n");
1660  printf("\n");
1661  {
1662#define row_convert(row) \
1663      ((row) >= 0x121 && (row) <= 0x17E ? row-289 : /* 0..93 */    \
1664       (row) == 0x221                   ? row-451 : /* 94 */       \
1665       (row) >= 0x223 && (row) <= 0x225 ? row-452 : /* 95..97 */   \
1666       (row) == 0x228                   ? row-454 : /* 98 */       \
1667       (row) >= 0x22C && (row) <= 0x22F ? row-457 : /* 99..102 */  \
1668       (row) >= 0x26E && (row) <= 0x27E ? row-519 : /* 103..119 */ \
1669       -1)
1670    unsigned int table[120][94];
1671    int pagemin[0x1100];
1672    int pagemax[0x1100];
1673    int pageidx[0x1100];
1674    unsigned int pagestart[0x1100];
1675    unsigned int pagestart_len = 0;
1676    {
1677      unsigned int rowc, colc;
1678      for (rowc = 0; rowc < 120; rowc++)
1679        for (colc = 0; colc < 94; colc++)
1680          table[rowc][colc] = 0;
1681    }
1682    {
1683      unsigned int page;
1684      for (page = 0; page < 0x1100; page++)
1685        pagemin[page] = -1;
1686      for (page = 0; page < 0x1100; page++)
1687        pagemax[page] = -1;
1688      for (page = 0; page < 0x1100; page++)
1689        pageidx[page] = -1;
1690    }
1691    printf("static const unsigned short jisx0213_to_ucs_combining[][2] = {\n");
1692    {
1693      int private_use = 0x0001;
1694      for (;;) {
1695        char line[30];
1696        unsigned int row, col;
1697        unsigned int ucs;
1698        memset(line,0,sizeof(line));
1699        if (scanf("%[^\n]\n",line) < 1)
1700          break;
1701        assert(line[0]=='0');
1702        assert(line[1]=='x');
1703        assert(isxdigit(line[2]));
1704        assert(isxdigit(line[3]));
1705        assert(isxdigit(line[4]));
1706        assert(isxdigit(line[5]));
1707        assert(isxdigit(line[6]));
1708        assert(line[7]=='\t');
1709        line[7] = '\0';
1710        col = strtoul(&line[5],NULL,16);
1711        line[5] = '\0';
1712        row = strtoul(&line[2],NULL,16);
1713        if (line[20] != '\0' && line[21] == '\0') {
1714          unsigned int u1, u2;
1715          assert(line[8]=='0');
1716          assert(line[9]=='x');
1717          assert(isxdigit(line[10]));
1718          assert(isxdigit(line[11]));
1719          assert(isxdigit(line[12]));
1720          assert(isxdigit(line[13]));
1721          assert(line[14]==' ');
1722          assert(line[15]=='0');
1723          assert(line[16]=='x');
1724          assert(isxdigit(line[17]));
1725          assert(isxdigit(line[18]));
1726          assert(isxdigit(line[19]));
1727          assert(isxdigit(line[20]));
1728          u2 = strtoul(&line[17],NULL,16);
1729          line[14] = '\0';
1730          u1 = strtoul(&line[10],NULL,16);
1731          printf("  { 0x%04x, 0x%04x },\n", u1, u2);
1732          ucs = private_use++;
1733        } else {
1734          assert(line[8]=='0');
1735          assert(line[9]=='x');
1736          assert(isxdigit(line[10]));
1737          assert(isxdigit(line[11]));
1738          assert(isxdigit(line[12]));
1739          assert(isxdigit(line[13]));
1740          ucs = strtoul(&line[10],NULL,16);
1741        }
1742        assert((unsigned int) row_convert(row) < 120);
1743        assert((unsigned int) (col-0x21) < 94);
1744        table[row_convert(row)][col-0x21] = ucs;
1745      }
1746    }
1747    printf("};\n");
1748    printf("\n");
1749    {
1750      unsigned int rowc, colc;
1751      for (rowc = 0; rowc < 120; rowc++) {
1752        for (colc = 0; colc < 94; colc++) {
1753          unsigned int value = table[rowc][colc];
1754          unsigned int page = value >> 8;
1755          unsigned int rest = value & 0xff;
1756          if (pagemin[page] < 0 || pagemin[page] > rest) pagemin[page] = rest;
1757          if (pagemax[page] < 0 || pagemax[page] < rest) pagemax[page] = rest;
1758        }
1759      }
1760    }
1761    {
1762      unsigned int index = 0;
1763      unsigned int i;
1764      for (i = 0; i < 0x1100; ) {
1765        if (pagemin[i] >= 0) {
1766          if (pagemin[i+1] >= 0 && pagemin[i] >= 0x80 && pagemax[i+1] < 0x80) {
1767            /* Combine two pages into a single one. */
1768            assert(pagestart_len < sizeof(pagestart)/sizeof(pagestart[0]));
1769            pagestart[pagestart_len++] = (i<<8)+0x80;
1770            pageidx[i] = index;
1771            pageidx[i+1] = index;
1772            index++;
1773            i += 2;
1774          } else {
1775            /* A single page. */
1776            assert(pagestart_len < sizeof(pagestart)/sizeof(pagestart[0]));
1777            pagestart[pagestart_len++] = i<<8;
1778            pageidx[i] = index;
1779            index++;
1780            i += 1;
1781          }
1782        } else
1783          i++;
1784      }
1785    }
1786    printf("static const unsigned short jisx0213_to_ucs_main[120 * 94] = {\n");
1787    {
1788      unsigned int row;
1789      for (row = 0; row < 0x300; row++) {
1790        unsigned int rowc = row_convert(row);
1791        if (rowc != (unsigned int) (-1)) {
1792          printf("  /* 0x%X21..0x%X7E */\n",row,row);
1793          {
1794            unsigned int count = 0;
1795            unsigned int colc;
1796            for (colc = 0; colc < 94; colc++) {
1797              if ((count % 8) == 0) printf(" ");
1798              {
1799                unsigned int value = table[rowc][colc];
1800                unsigned int page = value >> 8;
1801                unsigned int index = pageidx[page];
1802                assert(value-pagestart[index] < 0x100);
1803                printf(" 0x%04x,",(index<<8)|(value-pagestart[index]));
1804              }
1805              count++;
1806              if ((count % 8) == 0) printf("\n");
1807            }
1808          }
1809          printf("\n");
1810        }
1811      }
1812    }
1813    printf("};\n");
1814    printf("\n");
1815    printf("static const ucs4_t jisx0213_to_ucs_pagestart[] = {\n");
1816    {
1817      unsigned int count = 0;
1818      unsigned int i;
1819      for (i = 0; i < pagestart_len; i++) {
1820        char buf[10];
1821        if ((count % 8) == 0) printf(" ");
1822        printf(" ");
1823        sprintf(buf,"0x%04x",pagestart[i]);
1824        if (strlen(buf) < 7) printf("%*s",7-strlen(buf),"");
1825        printf("%s,",buf);
1826        count++;
1827        if ((count % 8) == 0) printf("\n");
1828      }
1829    }
1830    printf("\n");
1831    printf("};\n");
1832#undef row_convert
1833  }
1834  rewind(stdin);
1835  printf("\n");
1836  {
1837    int table[0x110000];
1838    bool pages[0x4400];
1839    int maxpage = -1;
1840    unsigned int combining_prefixes[100];
1841    unsigned int combining_prefixes_len = 0;
1842    {
1843      unsigned int i;
1844      for (i = 0; i < 0x110000; i++)
1845        table[i] = -1;
1846      for (i = 0; i < 0x4400; i++)
1847        pages[i] = false;
1848    }
1849    for (;;) {
1850      char line[30];
1851      unsigned int plane, row, col;
1852      memset(line,0,sizeof(line));
1853      if (scanf("%[^\n]\n",line) < 1)
1854        break;
1855      assert(line[0]=='0');
1856      assert(line[1]=='x');
1857      assert(isxdigit(line[2]));
1858      assert(isxdigit(line[3]));
1859      assert(isxdigit(line[4]));
1860      assert(isxdigit(line[5]));
1861      assert(isxdigit(line[6]));
1862      assert(line[7]=='\t');
1863      line[7] = '\0';
1864      col = strtoul(&line[5],NULL,16);
1865      line[5] = '\0';
1866      row = strtoul(&line[3],NULL,16);
1867      line[3] = '\0';
1868      plane = strtoul(&line[2],NULL,16) - 1;
1869      if (line[20] != '\0' && line[21] == '\0') {
1870        unsigned int u1, u2;
1871        assert(line[8]=='0');
1872        assert(line[9]=='x');
1873        assert(isxdigit(line[10]));
1874        assert(isxdigit(line[11]));
1875        assert(isxdigit(line[12]));
1876        assert(isxdigit(line[13]));
1877        assert(line[14]==' ');
1878        assert(line[15]=='0');
1879        assert(line[16]=='x');
1880        assert(isxdigit(line[17]));
1881        assert(isxdigit(line[18]));
1882        assert(isxdigit(line[19]));
1883        assert(isxdigit(line[20]));
1884        u2 = strtoul(&line[17],NULL,16);
1885        line[14] = '\0';
1886        u1 = strtoul(&line[10],NULL,16);
1887        assert(u2 == 0x02E5 || u2 == 0x02E9 || u2 == 0x0300 || u2 == 0x0301
1888               || u2 == 0x309A);
1889        assert(combining_prefixes_len < sizeof(combining_prefixes)/sizeof(combining_prefixes[0]));
1890        combining_prefixes[combining_prefixes_len++] = u1;
1891      } else {
1892        unsigned int ucs;
1893        assert(line[8]=='0');
1894        assert(line[9]=='x');
1895        assert(isxdigit(line[10]));
1896        assert(isxdigit(line[11]));
1897        assert(isxdigit(line[12]));
1898        assert(isxdigit(line[13]));
1899        ucs = strtoul(&line[10],NULL,16);
1900        /* Add an entry. */
1901        assert(plane <= 1);
1902        assert(row <= 0x7f);
1903        assert(col <= 0x7f);
1904        table[ucs] = (plane << 15) | (row << 8) | col;
1905        pages[ucs>>6] = true;
1906        if (maxpage < 0 || (ucs>>6) > maxpage) maxpage = ucs>>6;
1907      }
1908    }
1909    {
1910      unsigned int i;
1911      for (i = 0; i < combining_prefixes_len; i++) {
1912        unsigned int u1 = combining_prefixes[i];
1913        assert(table[u1] >= 0);
1914        table[u1] |= 0x0080;
1915      }
1916    }
1917    printf("static const short jisx0213_from_ucs_level1[%d] = {\n",maxpage+1);
1918    {
1919      unsigned int index = 0;
1920      unsigned int i;
1921      for (i = 0; i <= maxpage; i++) {
1922        if ((i % 8) == 0) printf(" ");
1923        if (pages[i]) {
1924          printf(" %3u,",index);
1925          index++;
1926        } else {
1927          printf(" %3d,",-1);
1928        }
1929        if (((i+1) % 8) == 0) printf("\n");
1930      }
1931    }
1932    printf("\n");
1933    printf("};\n");
1934    printf("\n");
1935    #if 0 /* Dense array */
1936    printf("static const unsigned short jisx0213_from_ucs_level2[] = {\n");
1937    {
1938      unsigned int i;
1939      for (i = 0; i <= maxpage; i++) {
1940        if (pages[i]) {
1941          printf("  /* 0x%04X */\n",i<<6);
1942          {
1943            unsigned int j;
1944            for (j = 0; j < 0x40; ) {
1945              unsigned int ucs = (i<<6)+j;
1946              int value = table[ucs];
1947              if (value < 0) value = 0;
1948              if ((j % 8) == 0) printf(" ");
1949              printf(" 0x%04x,",value);
1950              j++;
1951              if ((j % 8) == 0) printf("\n");
1952            }
1953          }
1954        }
1955      }
1956    }
1957    printf("};\n");
1958    #else /* Sparse array */
1959    {
1960      int summary_indx[0x11000];
1961      int summary_used[0x11000];
1962      unsigned int i, k, indx;
1963      printf("static const unsigned short jisx0213_from_ucs_level2_data[] = {\n");
1964      /* Fill summary_indx[] and summary_used[]. */
1965      indx = 0;
1966      for (i = 0, k = 0; i <= maxpage; i++) {
1967        if (pages[i]) {
1968          unsigned int j1, j2;
1969          unsigned int count = 0;
1970          printf("  /* 0x%04X */\n",i<<6);
1971          for (j1 = 0; j1 < 4; j1++) {
1972            summary_indx[4*k+j1] = indx;
1973            summary_used[4*k+j1] = 0;
1974            for (j2 = 0; j2 < 16; j2++) {
1975              unsigned int j = 16*j1+j2;
1976              unsigned int ucs = (i<<6)+j;
1977              int value = table[ucs];
1978              if (value < 0) value = 0;
1979              if (value > 0) {
1980                summary_used[4*k+j1] |= (1 << j2);
1981                if ((count % 8) == 0) printf(" ");
1982                printf(" 0x%04x,",value);
1983                count++;
1984                if ((count % 8) == 0) printf("\n");
1985                indx++;
1986              }
1987            }
1988          }
1989          if ((count % 8) > 0)
1990            printf("\n");
1991          k++;
1992        }
1993      }
1994      printf("};\n");
1995      printf("\n");
1996      printf("static const Summary16 jisx0213_from_ucs_level2_2indx[] = {\n");
1997      for (i = 0, k = 0; i <= maxpage; i++) {
1998        if (pages[i]) {
1999          unsigned int j1;
2000          printf("  /* 0x%04X */\n",i<<6);
2001          printf(" ");
2002          for (j1 = 0; j1 < 4; j1++) {
2003            printf(" { %4d, 0x%04x },", summary_indx[4*k+j1], summary_used[4*k+j1]);
2004          }
2005          printf("\n");
2006          k++;
2007        }
2008      }
2009      printf("};\n");
2010    }
2011    #endif
2012    printf("\n");
2013  }
2014  printf("#ifdef __GNUC__\n");
2015  printf("__inline\n");
2016  printf("#else\n");
2017  printf("#ifdef __cplusplus\n");
2018  printf("inline\n");
2019  printf("#endif\n");
2020  printf("#endif\n");
2021  printf("static ucs4_t jisx0213_to_ucs4 (unsigned int row, unsigned int col)\n");
2022  printf("{\n");
2023  printf("  ucs4_t val;\n");
2024  printf("\n");
2025  printf("  if (row >= 0x121 && row <= 0x17e)\n");
2026  printf("    row -= 289;\n");
2027  printf("  else if (row == 0x221)\n");
2028  printf("    row -= 451;\n");
2029  printf("  else if (row >= 0x223 && row <= 0x225)\n");
2030  printf("    row -= 452;\n");
2031  printf("  else if (row == 0x228)\n");
2032  printf("    row -= 454;\n");
2033  printf("  else if (row >= 0x22c && row <= 0x22f)\n");
2034  printf("    row -= 457;\n");
2035  printf("  else if (row >= 0x26e && row <= 0x27e)\n");
2036  printf("    row -= 519;\n");
2037  printf("  else\n");
2038  printf("    return 0x0000;\n");
2039  printf("\n");
2040  printf("  if (col >= 0x21 && col <= 0x7e)\n");
2041  printf("    col -= 0x21;\n");
2042  printf("  else\n");
2043  printf("    return 0x0000;\n");
2044  printf("\n");
2045  printf("  val = jisx0213_to_ucs_main[row * 94 + col];\n");
2046  printf("  val = jisx0213_to_ucs_pagestart[val >> 8] + (val & 0xff);\n");
2047  printf("  if (val == 0xfffd)\n");
2048  printf("    val = 0x0000;\n");
2049  printf("  return val;\n");
2050  printf("}\n");
2051  printf("\n");
2052  printf("#ifdef __GNUC__\n");
2053  printf("__inline\n");
2054  printf("#else\n");
2055  printf("#ifdef __cplusplus\n");
2056  printf("inline\n");
2057  printf("#endif\n");
2058  printf("#endif\n");
2059  printf("static unsigned short ucs4_to_jisx0213 (ucs4_t ucs)\n");
2060  printf("{\n");
2061  printf("  if (ucs < (sizeof(jisx0213_from_ucs_level1)/sizeof(jisx0213_from_ucs_level1[0])) << 6) {\n");
2062  printf("    int index1 = jisx0213_from_ucs_level1[ucs >> 6];\n");
2063  printf("    if (index1 >= 0)");
2064  #if 0 /* Dense array */
2065  printf("\n");
2066  printf("      return jisx0213_from_ucs_level2[(index1 << 6) + (ucs & 0x3f)];\n");
2067  #else /* Sparse array */
2068  printf(" {\n");
2069  printf("      const Summary16 *summary = &jisx0213_from_ucs_level2_2indx[((index1 << 6) + (ucs & 0x3f)) >> 4];\n");
2070  printf("      unsigned short used = summary->used;\n");
2071  printf("      unsigned int i = ucs & 0x0f;\n");
2072  printf("      if (used & ((unsigned short) 1 << i)) {\n");
2073  printf("        /* Keep in `used' only the bits 0..i-1. */\n");
2074  printf("        used &= ((unsigned short) 1 << i) - 1;\n");
2075  printf("        /* Add `summary->indx' and the number of bits set in `used'. */\n");
2076  printf("        used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
2077  printf("        used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
2078  printf("        used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
2079  printf("        used = (used & 0x00ff) + (used >> 8);\n");
2080  printf("        return jisx0213_from_ucs_level2_data[summary->indx + used];\n");
2081  printf("      };\n");
2082  printf("    };\n");
2083  #endif
2084  printf("  }\n");
2085  printf("  return 0x0000;\n");
2086  printf("}\n");
2087  printf("\n");
2088  printf("#endif /* _JISX0213_H */\n");
2089}
2090
2091/* Main program */
2092
/* Program entry point.

   Expects exactly two arguments:
     argv[1]  the charset name, passed to output_title()
     argv[2]  the table name, which selects the generator routine

   As shown in the examples at the top of this file, the input table is
   supplied on stdin and the generated header is written to stdout.

   Returns 0 after the selected generator has run.  Exits with status 1,
   after printing a diagnostic on stderr, when the argument count is wrong
   or the table name is not recognised.  Note that output_title() is called
   before the table name is checked, so whatever it writes is still emitted
   for an unrecognised name.  */
int main (int argc, char *argv[])
{
  /* argv[0] is not guaranteed to be present; fall back to a fixed name
     so that diagnostics never pass a null pointer to %s.  */
  const char* progname =
    (argc > 0 && argv[0] != NULL ? argv[0] : "cjk_tab_to_h");
  const char* charsetname;
  const char* name;

  if (argc != 3) {
    fprintf(stderr,"Usage: %s CHARSETNAME NAME < TABLE.TXT > NAME.h\n",
            progname);
    exit(1);
  }
  charsetname = argv[1];
  name = argv[2];

  output_title(charsetname);

  /* Dispatch on the table name.  Several names share one generator.  */
  if (!strcmp(name,"gb2312")
      || !strcmp(name,"isoir165ext") || !strcmp(name,"gb12345ext")
      || !strcmp(name,"jisx0208") || !strcmp(name,"jisx0212"))
    do_normal(name);
  else if (!strcmp(name,"cns11643_1") || !strcmp(name,"cns11643_2")
           || !strcmp(name,"cns11643_3") || !strcmp(name,"cns11643_4a")
           || !strcmp(name,"cns11643_4b") || !strcmp(name,"cns11643_5")
           || !strcmp(name,"cns11643_6") || !strcmp(name,"cns11643_7")
           || !strcmp(name,"cns11643_15"))
    do_normal_only_charset2uni(name);
  else if (!strcmp(name,"cns11643_inv"))
    do_cns11643_only_uni2charset(name);
  else if (!strcmp(name,"gbkext1"))
    do_gbk1_only_charset2uni(name);
  else if (!strcmp(name,"gbkext2"))
    do_gbk2_only_charset2uni(name);
  else if (!strcmp(name,"gbkext_inv"))
    do_gbk1_only_uni2charset(name);
  else if (!strcmp(name,"cp936ext") || !strcmp(name,"gb18030ext"))
    do_gbk1(name);
  else if (!strcmp(name,"ksc5601"))
    do_ksc5601(name);
  else if (!strcmp(name,"uhc_1"))
    do_uhc_1(name);
  else if (!strcmp(name,"uhc_2"))
    do_uhc_2(name);
  else if (!strcmp(name,"big5") || !strcmp(name,"cp950ext"))
    do_big5(name);
  else if (!strcmp(name,"hkscs1999") || !strcmp(name,"hkscs2001")
           || !strcmp(name,"hkscs2004"))
    do_hkscs(name);
  else if (!strcmp(name,"johab_hangul"))
    do_johab_hangul(name);
  else if (!strcmp(name,"cp932ext"))
    do_sjis(name);
  else if (!strcmp(name,"gb18030uni"))
    do_gb18030uni(name);
  else if (!strcmp(name,"jisx0213"))
    do_jisx0213(name);
  else {
    /* Unknown table name: report it instead of failing silently.  */
    fprintf(stderr,"%s: unknown table name '%s'\n",progname,name);
    exit(1);
  }

  return 0;
}
2149