/* Copyright (C) 1999-2004, 2006 Free Software Foundation, Inc.
   This file is part of the GNU LIBICONV Tools.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */

/*
 * Generates a CJK character set table from a .TXT table as found on
 * ftp.unicode.org or in the X nls directory.
 * Examples:
 *
 *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
 *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
 *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
 *
 *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
 *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
 *   ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
 *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
 *   ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
 *
 *   ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
 *
 *   ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
 *
 *   ./cjk_tab_to_h JISX0213:2004 jisx0213 > jisx0213.h < JISX0213.TXT
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

/* A half-open range [start, end) of linear indices. */
typedef struct {
  int start;
  int end;
} Block;

/* Description of a two-byte character set and of its conversion tables. */
typedef struct {
  int rows;    /* number of possible values for the 1st byte */
  int cols;    /* number of possible values for the 2nd byte */
  int (*row_byte) (int row);  /* returns the 1st byte value for a given row */
  int (*col_byte) (int col);  /* returns the 2nd byte value for a given col */
  int (*byte_row) (int byte); /* converts a 1st byte value to a row, else -1 */
  int (*byte_col) (int byte); /* converts a 2nd byte value to a col, else -1 */
  const char* check_row_expr; /* format string for 1st byte value checking */
  const char* check_col_expr; /* format string for 2nd byte value checking */
  const char* byte_row_expr;  /* format string for 1st byte value to row */
  const char* byte_col_expr;  /* format string for 2nd byte value to col */
  int** charset2uni; /* charset2uni[0..rows-1][0..cols-1] is valid */
  /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
     Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
  int* charsetpage;  /* charsetpage[0..rows]: how large is a page for a row */
  int ncharsetblocks;
  Block* charsetblocks; /* blocks[0..nblocks-1] */
  int* uni2charset;  /* uni2charset[0x0000..0x2ffff] */
  int fffd;          /* uni representation of the invalid character */
} Encoding;

/*
 * Allocates SIZE bytes; prints a message and exits with status 1 when the
 * allocation fails, so that callers never see a NULL pointer.
 */
static void *checked_malloc (size_t size)
{
  void *p = malloc(size > 0 ? size : 1);

  if (p == NULL) {
    fprintf(stderr, "out of memory\n");
    exit(1);
  }
  return p;
}

/*
 * Outputs the file title: the license header of the generated file,
 * followed by a comment containing CHARSETNAME.
 */
static void output_title (const char *charsetname)
{
  static const char *const header_lines[] = {
    "/*\n",
    " * Copyright (C) 1999-2006 Free Software Foundation, Inc.\n",
    " * This file is part of the GNU LIBICONV Library.\n",
    " *\n",
    " * The GNU LIBICONV Library is free software; you can redistribute it\n",
    " * and/or modify it under the terms of the GNU Library General Public\n",
    " * License as published by the Free Software Foundation; either version 2\n",
    " * of the License, or (at your option) any later version.\n",
    " *\n",
    " * The GNU LIBICONV Library is distributed in the hope that it will be\n",
    " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
    " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n",
    " * Library General Public License for more details.\n",
    " *\n",
    " * You should have received a copy of the GNU Library General Public\n",
    " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n",
    " * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,\n",
    " * Fifth Floor, Boston, MA 02110-1301, USA.\n",
    " */\n",
    "\n",
    "/*\n"
  };
  size_t k;

  for (k = 0; k < sizeof header_lines / sizeof header_lines[0]; k++)
    fputs(header_lines[k], stdout);
  printf(" * %s\n", charsetname);
  fputs(" */\n", stdout);
  fputs("\n", stdout);
}

/*
 * Reads the charset2uni table from standard input.
 *
 * Allocates enc->charset2uni[rows][cols], fills it with 0xfffd, and then
 * parses one of two input formats, chosen by the first input character:
 *   - '#': a unicode.org style .TXT file ("0xBBBB 0xUUUU" pairs, with '#'
 *     comments),
 *   - otherwise: a plain table of hexadecimal Unicode values, one for each
 *     byte pair (i1,i2) with 32 <= i1,i2 < 132, in row-major order.
 * A byte pair that does not map to a cell inside the table, or a syntax
 * error, terminates the program with exit status 1.
 */
static void read_table (Encoding* enc)
{
  int row, col, i, i1, i2, c, j;
  unsigned int value;  /* scanf's %x requires an 'unsigned int *' */

  enc->charset2uni = (int**) checked_malloc(enc->rows*sizeof(int*));
  for (row = 0; row < enc->rows; row++)
    enc->charset2uni[row] = (int*) checked_malloc(enc->cols*sizeof(int));

  for (row = 0; row < enc->rows; row++)
    for (col = 0; col < enc->cols; col++)
      enc->charset2uni[row][col] = 0xfffd;

  /* Peek at the first character to determine the input format. */
  c = getc(stdin);
  ungetc(c,stdin);
  if (c == '#') {
    /* Read a unicode.org style .TXT file. */
    for (;;) {
      c = getc(stdin);
      if (c == EOF)
        break;
      if (c == '\n' || c == '\r' || c == ' ' || c == '\t')
        continue;
      if (c == '#') {
        do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
        continue;
      }
      ungetc(c,stdin);
      if (scanf("0x%x", &value) != 1) {
        fprintf(stderr, "syntax error: expected a 0x... charset code\n");
        exit(1);
      }
      j = (int) value;
      i1 = j >> 8;
      i2 = j & 0xff;
      row = enc->byte_row(i1);
      col = enc->byte_col(i2);
      /* Reject cells outside the table as well, not only negative ones:
         not every byte_row/byte_col implementation checks the upper bound. */
      if (row < 0 || row >= enc->rows || col < 0 || col >= enc->cols) {
        fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
        exit(1);
      }
      if (scanf(" 0x%x", &value) != 1) {
        fprintf(stderr, "syntax error: expected a 0x... Unicode value\n");
        exit(1);
      }
      enc->charset2uni[row][col] = (int) value;
    }
  } else {
    /* Read a table of hexadecimal Unicode values. */
    for (i1 = 32; i1 < 132; i1++)
      for (i2 = 32; i2 < 132; i2++) {
        i = scanf("%x", &value);
        if (i == EOF)
          return;
        if (i != 1) {
          fprintf(stderr, "syntax error: expected a hexadecimal value\n");
          exit(1);
        }
        j = (int) value;
        if (j < 0 || j == 0xffff)
          j = 0xfffd;
        if (j != 0xfffd) {
          row = enc->byte_row(i1);
          col = enc->byte_col(i2);
          if (row < 0 || row >= enc->rows || col < 0 || col >= enc->cols) {
            fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
            exit (1);
          }
          enc->charset2uni[row][col] = j;
        }
      }
  }
}

/*
 * Determine whether the Unicode range goes outside the BMP.
 * Returns true if some charset2uni entry is >= 0x10000.
 */
static bool is_charset2uni_large (Encoding* enc)
{
  int row, col;

  for (row = 0; row < enc->rows; row++)
    for (col = 0; col < enc->cols; col++)
      if (enc->charset2uni[row][col] >= 0x10000)
        return true;
  return false;
}

/*
 * Compactify the Unicode range by use of an auxiliary table,
 * so 16 bits suffice to store each value.
 *
 * The code space 0..0x10ffff is cut into pages of 2^shift code points.
 * Every page that is in use (the page of 0xfffd always counts as in use)
 * gets a small index; a value u is replaced by
 *   (index of the page of u) << shift | (u & (2^shift - 1)).
 * The largest shift in 8..0 for which all results fit in 16 bits is used.
 *
 * On return:
 *   - enc->charset2uni points to a newly allocated table with the compacted
 *     values; the previous table is neither modified nor freed, because the
 *     caller may be working on a copy of an Encoding that shares it,
 *   - enc->fffd is the compacted representation of 0xfffd,
 *   - *urows is a newly allocated array mapping page index -> page number,
 *   - *urowshift is the chosen shift.
 * Returns the number of pages in use (the length of *urows).
 * Terminates the program if a table entry is not in 0..0x10ffff, or via
 * abort() if no shift works.
 */
static int compact_large_charset2uni (Encoding* enc, unsigned int **urows, unsigned int *urowshift)
{
  unsigned int shift;
  int row, col;

  /* Every value is used as an index into upages[] below. */
  for (row = 0; row < enc->rows; row++)
    for (col = 0; col < enc->cols; col++) {
      int u = enc->charset2uni[row][col];
      if (u < 0 || u >= 0x110000) {
        fprintf(stderr, "code point 0x%x out of range\n", (unsigned int) u);
        exit(1);
      }
    }

  for (shift = 8; ; shift--) {
    int npages = 0x110000 >> shift;
    int *upages = (int *) checked_malloc(npages * sizeof(int));
    int i, nurows;

    for (i = 0; i < npages; i++)
      upages[i] = -1;

    for (row = 0; row < enc->rows; row++)
      for (col = 0; col < enc->cols; col++)
        upages[enc->charset2uni[row][col] >> shift] = 0;
    /* enc->fffd is computed from this page even if no cell contains 0xfffd. */
    upages[0xfffd >> shift] = 0;

    nurows = 0;
    for (i = 0; i < npages; i++)
      if (upages[i] == 0)
        nurows++;

    /* We want all table entries to fit in an 'unsigned short'. */
    if (nurows <= 1<<(16-shift)) {
      int** compacted;

      *urows = (unsigned int *) checked_malloc(nurows * sizeof(unsigned int));
      *urowshift = shift;

      /* Number the pages in use, in increasing order. */
      nurows = 0;
      for (i = 0; i < npages; i++)
        if (upages[i] == 0) {
          upages[i] = nurows;
          (*urows)[nurows] = i;
          nurows++;
        }

      compacted = (int**) checked_malloc(enc->rows*sizeof(int*));
      for (row = 0; row < enc->rows; row++)
        compacted[row] = (int*) checked_malloc(enc->cols*sizeof(int));
      for (row = 0; row < enc->rows; row++)
        for (col = 0; col < enc->cols; col++) {
          int u = enc->charset2uni[row][col];
          compacted[row][col] =
            (upages[u >> shift] << shift) | (u & ((1 << shift) - 1));
        }
      enc->charset2uni = compacted;
      enc->fffd =
        (upages[0xfffd >> shift] << shift) | (0xfffd & ((1 << shift) - 1));

      free(upages);
      return nurows;
    }

    free(upages);
    if (shift == 0)
      break;  /* 'shift' is unsigned: do not let it wrap around */
  }
  abort();
}

/*
 * Computes the charsetpage[0..rows] array.
 */
244 */ 245static void find_charset2uni_pages (Encoding* enc) 246{ 247 int row, col; 248 249 enc->charsetpage = (int*) malloc((enc->rows+1)*sizeof(int)); 250 251 for (row = 0; row <= enc->rows; row++) 252 enc->charsetpage[row] = 0; 253 254 for (row = 0; row < enc->rows; row++) { 255 int used = 0; 256 for (col = 0; col < enc->cols; col++) 257 if (enc->charset2uni[row][col] != enc->fffd) 258 used = col+1; 259 enc->charsetpage[row] = used; 260 } 261} 262 263/* 264 * Fills in nblocks and blocks. 265 */ 266static void find_charset2uni_blocks (Encoding* enc) 267{ 268 int n, row, lastrow; 269 270 enc->charsetblocks = (Block*) malloc(enc->rows*sizeof(Block)); 271 272 n = 0; 273 for (row = 0; row < enc->rows; row++) 274 if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) { 275 for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++); 276 enc->charsetblocks[n].start = row * enc->cols; 277 enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow]; 278 n++; 279 } 280 enc->ncharsetblocks = n; 281} 282 283/* 284 * Outputs the charset to unicode table and function. 285 */ 286static void output_charset2uni (const char* name, Encoding* enc) 287{ 288 int nurows, row, col, lastrow, col_max, i, i1_min, i1_max; 289 bool is_large; 290 unsigned int* urows; 291 unsigned int urowshift; 292 Encoding tmpenc; 293 294 is_large = is_charset2uni_large(enc); 295 if (is_large) { 296 /* Use a temporary copy of enc. */ 297 tmpenc = *enc; 298 enc = &tmpenc; 299 nurows = compact_large_charset2uni(enc,&urows,&urowshift); 300 } else { 301 nurows = 0; urows = NULL; urowshift = 0; enc->fffd = 0xfffd; 302 } 303 304 find_charset2uni_pages(enc); 305 306 find_charset2uni_blocks(enc); 307 308 for (row = 0; row < enc->rows; row++) 309 if (enc->charsetpage[row] > 0) { 310 if (row == 0 || enc->charsetpage[row-1] == 0) { 311 /* Start a new block. 
*/ 312 for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++); 313 printf("static const unsigned short %s_2uni_page%02x[%d] = {\n", 314 name, enc->row_byte(row), 315 (lastrow-row) * enc->cols + enc->charsetpage[lastrow]); 316 } 317 printf(" /""* 0x%02x *""/\n ", enc->row_byte(row)); 318 col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]); 319 for (col = 0; col < col_max; col++) { 320 printf(" 0x%04x,", enc->charset2uni[row][col]); 321 if ((col % 8) == 7 && (col+1 < col_max)) printf("\n "); 322 } 323 printf("\n"); 324 if (enc->charsetpage[row+1] == 0) { 325 /* End a block. */ 326 printf("};\n"); 327 } 328 } 329 printf("\n"); 330 331 if (is_large) { 332 printf("static const ucs4_t %s_2uni_upages[%d] = {\n ", name, nurows); 333 for (i = 0; i < nurows; i++) { 334 printf(" 0x%05x,", urows[i] << urowshift); 335 if ((i % 8) == 7 && (i+1 < nurows)) printf("\n "); 336 } 337 printf("\n"); 338 printf("};\n"); 339 printf("\n"); 340 } 341 342 printf("static int\n"); 343 printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name); 344 printf("{\n"); 345 printf(" unsigned char c1 = s[0];\n"); 346 printf(" if ("); 347 for (i = 0; i < enc->ncharsetblocks; i++) { 348 i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols); 349 i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols); 350 if (i > 0) 351 printf(" || "); 352 if (i1_min == i1_max) 353 printf("(c1 == 0x%02x)", i1_min); 354 else 355 printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max); 356 } 357 printf(") {\n"); 358 printf(" if (n >= 2) {\n"); 359 printf(" unsigned char c2 = s[1];\n"); 360 printf(" if ("); 361 printf(enc->check_col_expr, "c2"); 362 printf(") {\n"); 363 printf(" unsigned int i = %d * (", enc->cols); 364 printf(enc->byte_row_expr, "c1"); 365 printf(") + ("); 366 printf(enc->byte_col_expr, "c2"); 367 printf(");\n"); 368 printf(" %s wc = 0xfffd;\n", is_large ? 
"ucs4_t" : "unsigned short"); 369 if (is_large) printf(" unsigned short swc;\n"); 370 for (i = 0; i < enc->ncharsetblocks; i++) { 371 printf(" "); 372 if (i > 0) 373 printf("} else "); 374 if (i < enc->ncharsetblocks-1) 375 printf("if (i < %d) ", enc->charsetblocks[i+1].start); 376 printf("{\n"); 377 printf(" if (i < %d)\n", enc->charsetblocks[i].end); 378 printf(" %s = ", is_large ? "swc" : "wc"); 379 printf("%s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols)); 380 if (enc->charsetblocks[i].start > 0) 381 printf("-%d", enc->charsetblocks[i].start); 382 printf("]"); 383 if (is_large) printf(",\n wc = %s_2uni_upages[swc>>%d] | (swc & 0x%x)", name, urowshift, (1 << urowshift) - 1); 384 printf(";\n"); 385 } 386 printf(" }\n"); 387 printf(" if (wc != 0xfffd) {\n"); 388 printf(" *pwc = %swc;\n", is_large ? "" : "(ucs4_t) "); 389 printf(" return 2;\n"); 390 printf(" }\n"); 391 printf(" }\n"); 392 printf(" return RET_ILSEQ;\n"); 393 printf(" }\n"); 394 printf(" return RET_TOOFEW(0);\n"); 395 printf(" }\n"); 396 printf(" return RET_ILSEQ;\n"); 397 printf("}\n"); 398 printf("\n"); 399} 400 401/* 402 * Outputs the charset to unicode table and function. 403 * (Suitable if the mapping function is well defined, i.e. has no holes, and 404 * is monotonically increasing with small gaps only.) 405 */ 406static void output_charset2uni_noholes_monotonic (const char* name, Encoding* enc) 407{ 408 int row, col, lastrow, r, col_max, i, i1_min, i1_max; 409 410 /* Choose stepsize so that stepsize*steps_per_row >= enc->cols, and 411 enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize] 412 is always < 0x100. */ 413 int steps_per_row = 2; 414 int stepsize = (enc->cols + steps_per_row-1) / steps_per_row; 415 416 find_charset2uni_pages(enc); 417 418 find_charset2uni_blocks(enc); 419 420 for (row = 0; row < enc->rows; row++) 421 if (enc->charsetpage[row] > 0) { 422 if (row == 0 || enc->charsetpage[row-1] == 0) { 423 /* Start a new block. 
*/ 424 for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++); 425 printf("static const unsigned short %s_2uni_main_page%02x[%d] = {\n ", 426 name, enc->row_byte(row), 427 steps_per_row*(lastrow-row+1)); 428 for (r = row; r <= lastrow; r++) { 429 for (i = 0; i < steps_per_row; i++) 430 printf(" 0x%04x,", enc->charset2uni[r][i*stepsize]); 431 if (((r-row) % 4) == 3 && (r < lastrow)) printf("\n "); 432 } 433 printf("\n"); 434 printf("};\n"); 435 printf("static const unsigned char %s_2uni_page%02x[%d] = {\n", 436 name, enc->row_byte(row), 437 (lastrow-row) * enc->cols + enc->charsetpage[lastrow]); 438 } 439 printf(" /""* 0x%02x *""/\n ", enc->row_byte(row)); 440 col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]); 441 for (col = 0; col < col_max; col++) { 442 printf(" 0x%02x,", enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]); 443 if ((col % 8) == 7 && (col+1 < col_max)) printf("\n "); 444 } 445 printf("\n"); 446 if (enc->charsetpage[row+1] == 0) { 447 /* End a block. 
*/ 448 printf("};\n"); 449 } 450 } 451 printf("\n"); 452 453 printf("static int\n"); 454 printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name); 455 printf("{\n"); 456 printf(" unsigned char c1 = s[0];\n"); 457 printf(" if ("); 458 for (i = 0; i < enc->ncharsetblocks; i++) { 459 i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols); 460 i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols); 461 if (i > 0) 462 printf(" || "); 463 if (i1_min == i1_max) 464 printf("(c1 == 0x%02x)", i1_min); 465 else 466 printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max); 467 } 468 printf(") {\n"); 469 printf(" if (n >= 2) {\n"); 470 printf(" unsigned char c2 = s[1];\n"); 471 printf(" if ("); 472 printf(enc->check_col_expr, "c2"); 473 printf(") {\n"); 474 printf(" unsigned int row = "); 475 printf(enc->byte_row_expr, "c1"); 476 printf(";\n"); 477 printf(" unsigned int col = "); 478 printf(enc->byte_col_expr, "c2"); 479 printf(";\n"); 480 printf(" unsigned int i = %d * row + col;\n", enc->cols); 481 printf(" unsigned short wc = 0xfffd;\n"); 482 for (i = 0; i < enc->ncharsetblocks; i++) { 483 printf(" "); 484 if (i > 0) 485 printf("} else "); 486 if (i < enc->ncharsetblocks-1) 487 printf("if (i < %d) ", enc->charsetblocks[i+1].start); 488 printf("{\n"); 489 printf(" if (i < %d)\n", enc->charsetblocks[i].end); 490 printf(" wc = %s_2uni_main_page%02x[%d*", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols), steps_per_row); 491 if (enc->charsetblocks[i].start > 0) 492 printf("(row-%d)", enc->charsetblocks[i].start / enc->cols); 493 else 494 printf("row"); 495 printf("+"); 496 if (steps_per_row == 2) 497 printf("(col>=%d?1:0)", stepsize); 498 else 499 printf("col/%d", stepsize); 500 printf("] + %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols)); 501 if (enc->charsetblocks[i].start > 0) 502 printf("-%d", enc->charsetblocks[i].start); 503 printf("];\n"); 504 } 505 printf(" }\n"); 506 printf(" 
if (wc != 0xfffd) {\n"); 507 printf(" *pwc = (ucs4_t) wc;\n"); 508 printf(" return 2;\n"); 509 printf(" }\n"); 510 printf(" }\n"); 511 printf(" return RET_ILSEQ;\n"); 512 printf(" }\n"); 513 printf(" return RET_TOOFEW(0);\n"); 514 printf(" }\n"); 515 printf(" return RET_ILSEQ;\n"); 516 printf("}\n"); 517 printf("\n"); 518} 519 520/* 521 * Computes the uni2charset[0x0000..0x2ffff] array. 522 */ 523static void invert (Encoding* enc) 524{ 525 int row, col, j; 526 527 enc->uni2charset = (int*) malloc(0x30000*sizeof(int)); 528 529 for (j = 0; j < 0x30000; j++) 530 enc->uni2charset[j] = 0; 531 532 for (row = 0; row < enc->rows; row++) 533 for (col = 0; col < enc->cols; col++) { 534 j = enc->charset2uni[row][col]; 535 if (j != 0xfffd) 536 enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col); 537 } 538} 539 540/* 541 * Outputs the unicode to charset table and function, using a linear array. 542 * (Suitable if the table is dense.) 543 */ 544static void output_uni2charset_dense (const char* name, Encoding* enc) 545{ 546 /* Like in 8bit_tab_to_h.c */ 547 bool pages[0x300]; 548 int line[0x6000]; 549 int tableno; 550 struct { int minline; int maxline; int usecount; } tables[0x6000]; 551 bool first; 552 int row, col, j, p, j1, j2, t; 553 554 for (p = 0; p < 0x300; p++) 555 pages[p] = false; 556 for (row = 0; row < enc->rows; row++) 557 for (col = 0; col < enc->cols; col++) { 558 j = enc->charset2uni[row][col]; 559 if (j != 0xfffd) 560 pages[j>>8] = true; 561 } 562 for (j1 = 0; j1 < 0x6000; j1++) { 563 bool all_invalid = true; 564 for (j2 = 0; j2 < 8; j2++) { 565 j = 8*j1+j2; 566 if (enc->uni2charset[j] != 0) 567 all_invalid = false; 568 } 569 if (all_invalid) 570 line[j1] = -1; 571 else 572 line[j1] = 0; 573 } 574 tableno = 0; 575 for (j1 = 0; j1 < 0x6000; j1++) { 576 if (line[j1] >= 0) { 577 if (tableno > 0 578 && ((j1 > 0 && line[j1-1] == tableno-1) 579 || ((tables[tableno-1].maxline >> 5) == (j1 >> 5) 580 && j1 - tables[tableno-1].maxline <= 8))) { 581 
line[j1] = tableno-1; 582 tables[tableno-1].maxline = j1; 583 } else { 584 tableno++; 585 line[j1] = tableno-1; 586 tables[tableno-1].minline = tables[tableno-1].maxline = j1; 587 } 588 } 589 } 590 for (t = 0; t < tableno; t++) { 591 tables[t].usecount = 0; 592 j1 = 8*tables[t].minline; 593 j2 = 8*(tables[t].maxline+1); 594 for (j = j1; j < j2; j++) 595 if (enc->uni2charset[j] != 0) 596 tables[t].usecount++; 597 } 598 { 599 p = -1; 600 for (t = 0; t < tableno; t++) 601 if (tables[t].usecount > 1) { 602 p = tables[t].minline >> 5; 603 printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1)); 604 for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) { 605 if ((j1 % 0x20) == 0 && j1 > tables[t].minline) 606 printf(" /* 0x%04x */\n", 8*j1); 607 printf(" "); 608 for (j2 = 0; j2 < 8; j2++) { 609 j = 8*j1+j2; 610 printf(" 0x%04x,", enc->uni2charset[j]); 611 } 612 printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7); 613 } 614 printf("};\n"); 615 } 616 if (p >= 0) 617 printf("\n"); 618 } 619 printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name); 620 printf("{\n"); 621 printf(" if (n >= 2) {\n"); 622 printf(" unsigned short c = 0;\n"); 623 first = true; 624 for (j1 = 0; j1 < 0x6000;) { 625 t = line[j1]; 626 for (j2 = j1; j2 < 0x6000 && line[j2] == t; j2++); 627 if (t >= 0) { 628 if (j1 != tables[t].minline) abort(); 629 if (j2 > tables[t].maxline+1) abort(); 630 j2 = tables[t].maxline+1; 631 if (first) 632 printf(" "); 633 else 634 printf(" else "); 635 first = false; 636 if (tables[t].usecount == 0) abort(); 637 if (tables[t].usecount == 1) { 638 if (j2 != j1+1) abort(); 639 for (j = 8*j1; j < 8*j2; j++) 640 if (enc->uni2charset[j] != 0) { 641 printf("if (wc == 0x%04x)\n c = 0x%02x;\n", j, enc->uni2charset[j]); 642 break; 643 } 644 } else { 645 if (j1 == 0) { 646 printf("if (wc < 0x%04x)", 8*j2); 647 } else { 648 printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2); 
649 } 650 printf("\n c = %s_page%02x[wc", name, j1 >> 5); 651 if (tables[t].minline > 0) 652 printf("-0x%04x", 8*j1); 653 printf("];\n"); 654 } 655 } 656 j1 = j2; 657 } 658 printf(" if (c != 0) {\n"); 659 printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n"); 660 printf(" return 2;\n"); 661 printf(" }\n"); 662 printf(" return RET_ILUNI;\n"); 663 printf(" }\n"); 664 printf(" return RET_TOOSMALL;\n"); 665 printf("}\n"); 666} 667 668/* 669 * Outputs the unicode to charset table and function, using a packed array. 670 * (Suitable if the table is sparse.) 671 * The argument 'monotonic' may be set to true if the mapping is monotonically 672 * increasing with small gaps only. 673 */ 674static void output_uni2charset_sparse (const char* name, Encoding* enc, bool monotonic) 675{ 676 bool pages[0x300]; 677 Block pageblocks[0x300]; int npageblocks; 678 int indx2charset[0x30000]; 679 int summary_indx[0x3000]; 680 int summary_used[0x3000]; 681 int i, row, col, j, p, j1, j2, indx; 682 bool is_large; 683 /* for monotonic: */ 684 int log2_stepsize = (!strcmp(name,"uhc_2") ? 6 : 7); 685 int stepsize = 1 << log2_stepsize; 686 int indxsteps; 687 688 /* Fill pages[0x300]. */ 689 for (p = 0; p < 0x300; p++) 690 pages[p] = false; 691 for (row = 0; row < enc->rows; row++) 692 for (col = 0; col < enc->cols; col++) { 693 j = enc->charset2uni[row][col]; 694 if (j != 0xfffd) 695 pages[j>>8] = true; 696 } 697 698 /* Determine whether two or three bytes are needed for each character. 
*/ 699 is_large = false; 700 for (j = 0; j < 0x30000; j++) 701 if (enc->uni2charset[j] >= 0x10000) 702 is_large = true; 703 704#if 0 705 for (p = 0; p < 0x300; p++) 706 if (pages[p]) { 707 printf("static const unsigned short %s_page%02x[256] = {\n", name, p); 708 for (j1 = 0; j1 < 32; j1++) { 709 printf(" "); 710 for (j2 = 0; j2 < 8; j2++) 711 printf("0x%04x, ", enc->uni2charset[256*p+8*j1+j2]); 712 printf("/""*0x%02x-0x%02x*""/\n", 8*j1, 8*j1+7); 713 } 714 printf("};\n"); 715 } 716 printf("\n"); 717#endif 718 719 /* Fill summary_indx[] and summary_used[]. */ 720 indx = 0; 721 for (j1 = 0; j1 < 0x3000; j1++) { 722 summary_indx[j1] = indx; 723 summary_used[j1] = 0; 724 for (j2 = 0; j2 < 16; j2++) { 725 j = 16*j1+j2; 726 if (enc->uni2charset[j] != 0) { 727 indx2charset[indx++] = enc->uni2charset[j]; 728 summary_used[j1] |= (1 << j2); 729 } 730 } 731 } 732 733 /* Fill npageblocks and pageblocks[]. */ 734 npageblocks = 0; 735 for (p = 0; p < 0x300; ) { 736 if (pages[p] && (p == 0 || !pages[p-1])) { 737 pageblocks[npageblocks].start = 16*p; 738 do p++; while (p < 0x300 && pages[p]); 739 j1 = 16*p; 740 while (summary_used[j1-1] == 0) j1--; 741 pageblocks[npageblocks].end = j1; 742 npageblocks++; 743 } else 744 p++; 745 } 746 747 if (monotonic) { 748 indxsteps = (indx + stepsize-1) / stepsize; 749 printf("static const unsigned short %s_2charset_main[%d] = {\n", name, indxsteps); 750 for (i = 0; i < indxsteps; ) { 751 if ((i % 8) == 0) printf(" "); 752 printf(" 0x%04x,", indx2charset[i*stepsize]); 753 i++; 754 if ((i % 8) == 0 || i == indxsteps) printf("\n"); 755 } 756 printf("};\n"); 757 printf("static const unsigned char %s_2charset[%d] = {\n", name, indx); 758 for (i = 0; i < indx; ) { 759 if ((i % 8) == 0) printf(" "); 760 printf(" 0x%02x,", indx2charset[i] - indx2charset[i/stepsize*stepsize]); 761 i++; 762 if ((i % 8) == 0 || i == indx) printf("\n"); 763 } 764 printf("};\n"); 765 } else { 766 if (is_large) { 767 printf("static const unsigned char %s_2charset[3*%d] = 
{\n", name, indx); 768 for (i = 0; i < indx; ) { 769 if ((i % 4) == 0) printf(" "); 770 printf(" 0x%1x,0x%02x,0x%02x,", indx2charset[i] >> 16, 771 (indx2charset[i] >> 8) & 0xff, indx2charset[i] & 0xff); 772 i++; 773 if ((i % 4) == 0 || i == indx) printf("\n"); 774 } 775 printf("};\n"); 776 } else { 777 printf("static const unsigned short %s_2charset[%d] = {\n", name, indx); 778 for (i = 0; i < indx; ) { 779 if ((i % 8) == 0) printf(" "); 780 printf(" 0x%04x,", indx2charset[i]); 781 i++; 782 if ((i % 8) == 0 || i == indx) printf("\n"); 783 } 784 printf("};\n"); 785 } 786 } 787 printf("\n"); 788 for (i = 0; i < npageblocks; i++) { 789 printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name, 790 pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start); 791 for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) { 792 if (((16*j1) % 0x100) == 0) printf(" /""* 0x%04x *""/\n", 16*j1); 793 if ((j1 % 4) == 0) printf(" "); 794 printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]); 795 j1++; 796 if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n"); 797 } 798 printf("};\n"); 799 } 800 printf("\n"); 801 802 printf("static int\n"); 803 printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name); 804 printf("{\n"); 805 printf(" if (n >= 2) {\n"); 806 printf(" const Summary16 *summary = NULL;\n"); 807 for (i = 0; i < npageblocks; i++) { 808 printf(" "); 809 if (i > 0) 810 printf("else "); 811 printf("if (wc >= 0x%04x && wc < 0x%04x)\n", 812 16*pageblocks[i].start, 16*pageblocks[i].end); 813 printf(" summary = &%s_uni2indx_page%02x[(wc>>4)", name, 814 pageblocks[i].start/16); 815 if (pageblocks[i].start > 0) 816 printf("-0x%03x", pageblocks[i].start); 817 printf("];\n"); 818 } 819 printf(" if (summary) {\n"); 820 printf(" unsigned short used = summary->used;\n"); 821 printf(" unsigned int i = wc & 0x0f;\n"); 822 printf(" if (used & ((unsigned short) 1 << i)) {\n"); 823 if (monotonic || !is_large) 824 printf(" unsigned short 
c;\n"); 825 printf(" /* Keep in `used' only the bits 0..i-1. */\n"); 826 printf(" used &= ((unsigned short) 1 << i) - 1;\n"); 827 printf(" /* Add `summary->indx' and the number of bits set in `used'. */\n"); 828 printf(" used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n"); 829 printf(" used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n"); 830 printf(" used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n"); 831 printf(" used = (used & 0x00ff) + (used >> 8);\n"); 832 if (monotonic) { 833 printf(" used += summary->indx;\n"); 834 printf(" c = %s_2charset_main[used>>%d] + %s_2charset[used];\n", name, log2_stepsize, name); 835 printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n"); 836 printf(" return 2;\n"); 837 } else { 838 if (is_large) { 839 printf(" used += summary->indx;\n"); 840 printf(" r[0] = %s_2charset[3*used];\n", name); 841 printf(" r[1] = %s_2charset[3*used+1];\n", name); 842 printf(" r[2] = %s_2charset[3*used+2];\n", name); 843 printf(" return 3;\n"); 844 } else { 845 printf(" c = %s_2charset[summary->indx + used];\n", name); 846 printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n"); 847 printf(" return 2;\n"); 848 } 849 } 850 printf(" }\n"); 851 printf(" }\n"); 852 printf(" return RET_ILUNI;\n"); 853 printf(" }\n"); 854 printf(" return RET_TOOSMALL;\n"); 855 printf("}\n"); 856} 857 858/* ISO-2022/EUC specifics */ 859 860static int row_byte_normal (int row) { return 0x21+row; } 861static int col_byte_normal (int col) { return 0x21+col; } 862static int byte_row_normal (int byte) { return byte-0x21; } 863static int byte_col_normal (int byte) { return byte-0x21; } 864 865static void do_normal (const char* name) 866{ 867 Encoding enc; 868 869 enc.rows = 94; 870 enc.cols = 94; 871 enc.row_byte = row_byte_normal; 872 enc.col_byte = col_byte_normal; 873 enc.byte_row = byte_row_normal; 874 enc.byte_col = byte_col_normal; 875 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f"; 876 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f"; 877 enc.byte_row_expr = "%1$s - 0x21"; 878 
enc.byte_col_expr = "%1$s - 0x21"; 879 880 read_table(&enc); 881 output_charset2uni(name,&enc); 882 invert(&enc); output_uni2charset_sparse(name,&enc,false); 883} 884 885/* Note: On first sight, the jisx0212_2charset[] table seems to be in order, 886 starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in 887 order. There are 75 out-of-order values, scattered all throughout the table. 888 */ 889 890static void do_normal_only_charset2uni (const char* name) 891{ 892 Encoding enc; 893 894 enc.rows = 94; 895 enc.cols = 94; 896 enc.row_byte = row_byte_normal; 897 enc.col_byte = col_byte_normal; 898 enc.byte_row = byte_row_normal; 899 enc.byte_col = byte_col_normal; 900 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f"; 901 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f"; 902 enc.byte_row_expr = "%1$s - 0x21"; 903 enc.byte_col_expr = "%1$s - 0x21"; 904 905 read_table(&enc); 906 output_charset2uni(name,&enc); 907} 908 909/* CNS 11643 specifics - trick to put two tables into one */ 910 911static int row_byte_cns11643 (int row) { 912 return 0x100 * (row / 94) + (row % 94) + 0x21; 913} 914static int byte_row_cns11643 (int byte) { 915 return (byte >> 8) * 94 + (byte & 0xff) - 0x21; 916} 917 918static void do_cns11643_only_uni2charset (const char* name) 919{ 920 Encoding enc; 921 922 enc.rows = 16*94; 923 enc.cols = 94; 924 enc.row_byte = row_byte_cns11643; 925 enc.col_byte = col_byte_normal; 926 enc.byte_row = byte_row_cns11643; 927 enc.byte_col = byte_col_normal; 928 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f"; 929 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f"; 930 enc.byte_row_expr = "%1$s - 0x21"; 931 enc.byte_col_expr = "%1$s - 0x21"; 932 933 read_table(&enc); 934 invert(&enc); 935 output_uni2charset_sparse(name,&enc,false); 936} 937 938/* GBK specifics */ 939 940static int row_byte_gbk1 (int row) { 941 return 0x81+row; 942} 943static int col_byte_gbk1 (int col) { 944 return (col >= 0x3f ? 
0x41 : 0x40) + col; 945} 946static int byte_row_gbk1 (int byte) { 947 if (byte >= 0x81 && byte < 0xff) 948 return byte-0x81; 949 else 950 return -1; 951} 952static int byte_col_gbk1 (int byte) { 953 if (byte >= 0x40 && byte < 0x7f) 954 return byte-0x40; 955 else if (byte >= 0x80 && byte < 0xff) 956 return byte-0x41; 957 else 958 return -1; 959} 960 961static void do_gbk1 (const char* name) 962{ 963 Encoding enc; 964 965 enc.rows = 126; 966 enc.cols = 190; 967 enc.row_byte = row_byte_gbk1; 968 enc.col_byte = col_byte_gbk1; 969 enc.byte_row = byte_row_gbk1; 970 enc.byte_col = byte_col_gbk1; 971 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff"; 972 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)"; 973 enc.byte_row_expr = "%1$s - 0x81"; 974 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)"; 975 976 read_table(&enc); 977 output_charset2uni(name,&enc); 978 invert(&enc); output_uni2charset_dense(name,&enc); 979} 980 981static void do_gbk1_only_charset2uni (const char* name) 982{ 983 Encoding enc; 984 985 enc.rows = 126; 986 enc.cols = 190; 987 enc.row_byte = row_byte_gbk1; 988 enc.col_byte = col_byte_gbk1; 989 enc.byte_row = byte_row_gbk1; 990 enc.byte_col = byte_col_gbk1; 991 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff"; 992 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)"; 993 enc.byte_row_expr = "%1$s - 0x81"; 994 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)"; 995 996 read_table(&enc); 997 output_charset2uni(name,&enc); 998} 999 1000static int row_byte_gbk2 (int row) { 1001 return 0x81+row; 1002} 1003static int col_byte_gbk2 (int col) { 1004 return (col >= 0x3f ? 
0x41 : 0x40) + col; 1005} 1006static int byte_row_gbk2 (int byte) { 1007 if (byte >= 0x81 && byte < 0xff) 1008 return byte-0x81; 1009 else 1010 return -1; 1011} 1012static int byte_col_gbk2 (int byte) { 1013 if (byte >= 0x40 && byte < 0x7f) 1014 return byte-0x40; 1015 else if (byte >= 0x80 && byte < 0xa1) 1016 return byte-0x41; 1017 else 1018 return -1; 1019} 1020 1021static void do_gbk2_only_charset2uni (const char* name) 1022{ 1023 Encoding enc; 1024 1025 enc.rows = 126; 1026 enc.cols = 96; 1027 enc.row_byte = row_byte_gbk2; 1028 enc.col_byte = col_byte_gbk2; 1029 enc.byte_row = byte_row_gbk2; 1030 enc.byte_col = byte_col_gbk2; 1031 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff"; 1032 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)"; 1033 enc.byte_row_expr = "%1$s - 0x81"; 1034 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)"; 1035 1036 read_table(&enc); 1037 output_charset2uni(name,&enc); 1038} 1039 1040static void do_gbk1_only_uni2charset (const char* name) 1041{ 1042 Encoding enc; 1043 1044 enc.rows = 126; 1045 enc.cols = 190; 1046 enc.row_byte = row_byte_gbk1; 1047 enc.col_byte = col_byte_gbk1; 1048 enc.byte_row = byte_row_gbk1; 1049 enc.byte_col = byte_col_gbk1; 1050 enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff"; 1051 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)"; 1052 enc.byte_row_expr = "%1$s - 0x81"; 1053 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)"; 1054 1055 read_table(&enc); 1056 invert(&enc); output_uni2charset_sparse(name,&enc,false); 1057} 1058 1059/* KSC 5601 specifics */ 1060 1061/* 1062 * Reads the charset2uni table from standard input. 
1063 */ 1064static void read_table_ksc5601 (Encoding* enc) 1065{ 1066 int row, col, i, i1, i2, c, j; 1067 1068 enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*)); 1069 for (row = 0; row < enc->rows; row++) 1070 enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int)); 1071 1072 for (row = 0; row < enc->rows; row++) 1073 for (col = 0; col < enc->cols; col++) 1074 enc->charset2uni[row][col] = 0xfffd; 1075 1076 c = getc(stdin); 1077 ungetc(c,stdin); 1078 if (c == '#') { 1079 /* Read a unicode.org style .TXT file. */ 1080 for (;;) { 1081 c = getc(stdin); 1082 if (c == EOF) 1083 break; 1084 if (c == '\n' || c == ' ' || c == '\t') 1085 continue; 1086 if (c == '#') { 1087 do { c = getc(stdin); } while (!(c == EOF || c == '\n')); 1088 continue; 1089 } 1090 ungetc(c,stdin); 1091 if (scanf("0x%x", &j) != 1) 1092 exit(1); 1093 i1 = j >> 8; 1094 i2 = j & 0xff; 1095 if (scanf(" 0x%x", &j) != 1) 1096 exit(1); 1097 /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0 1098 = KS X 1001.1992, ignore the rest. */ 1099 if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127)) 1100 continue; /* KSC5601 specific */ 1101 i1 &= 0x7f; /* KSC5601 specific */ 1102 i2 &= 0x7f; /* KSC5601 specific */ 1103 row = enc->byte_row(i1); 1104 col = enc->byte_col(i2); 1105 if (row < 0 || col < 0) { 1106 fprintf(stderr, "lost entry for %02x %02x\n", i1, i2); 1107 exit(1); 1108 } 1109 enc->charset2uni[row][col] = j; 1110 } 1111 } else { 1112 /* Read a table of hexadecimal Unicode values. 
*/ 1113 for (i1 = 33; i1 < 127; i1++) 1114 for (i2 = 33; i2 < 127; i2++) { 1115 i = scanf("%x", &j); 1116 if (i == EOF) 1117 goto read_done; 1118 if (i != 1) 1119 exit(1); 1120 if (j < 0 || j == 0xffff) 1121 j = 0xfffd; 1122 if (j != 0xfffd) { 1123 if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) { 1124 fprintf(stderr, "lost entry at %02x %02x\n", i1, i2); 1125 exit (1); 1126 } 1127 enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j; 1128 } 1129 } 1130 read_done: ; 1131 } 1132} 1133 1134static void do_ksc5601 (const char* name) 1135{ 1136 Encoding enc; 1137 1138 enc.rows = 94; 1139 enc.cols = 94; 1140 enc.row_byte = row_byte_normal; 1141 enc.col_byte = col_byte_normal; 1142 enc.byte_row = byte_row_normal; 1143 enc.byte_col = byte_col_normal; 1144 enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f"; 1145 enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f"; 1146 enc.byte_row_expr = "%1$s - 0x21"; 1147 enc.byte_col_expr = "%1$s - 0x21"; 1148 1149 read_table_ksc5601(&enc); 1150 output_charset2uni(name,&enc); 1151 invert(&enc); output_uni2charset_sparse(name,&enc,false); 1152} 1153 1154/* UHC specifics */ 1155 1156/* UHC part 1: 0x{81..A0}{41..5A,61..7A,81..FE} */ 1157 1158static int row_byte_uhc_1 (int row) { 1159 return 0x81 + row; 1160} 1161static int col_byte_uhc_1 (int col) { 1162 return (col >= 0x34 ? 0x4d : col >= 0x1a ? 
0x47 : 0x41) + col; 1163} 1164static int byte_row_uhc_1 (int byte) { 1165 if (byte >= 0x81 && byte < 0xa1) 1166 return byte-0x81; 1167 else 1168 return -1; 1169} 1170static int byte_col_uhc_1 (int byte) { 1171 if (byte >= 0x41 && byte < 0x5b) 1172 return byte-0x41; 1173 else if (byte >= 0x61 && byte < 0x7b) 1174 return byte-0x47; 1175 else if (byte >= 0x81 && byte < 0xff) 1176 return byte-0x4d; 1177 else 1178 return -1; 1179} 1180 1181static void do_uhc_1 (const char* name) 1182{ 1183 Encoding enc; 1184 1185 enc.rows = 32; 1186 enc.cols = 178; 1187 enc.row_byte = row_byte_uhc_1; 1188 enc.col_byte = col_byte_uhc_1; 1189 enc.byte_row = byte_row_uhc_1; 1190 enc.byte_col = byte_col_uhc_1; 1191 enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa1)"; 1192 enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xff)"; 1193 enc.byte_row_expr = "%1$s - 0x81"; 1194 enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)"; 1195 1196 read_table(&enc); 1197 output_charset2uni_noholes_monotonic(name,&enc); 1198 invert(&enc); output_uni2charset_sparse(name,&enc,true); 1199} 1200 1201/* UHC part 2: 0x{A1..C6}{41..5A,61..7A,81..A0} */ 1202 1203static int row_byte_uhc_2 (int row) { 1204 return 0xa1 + row; 1205} 1206static int col_byte_uhc_2 (int col) { 1207 return (col >= 0x34 ? 0x4d : col >= 0x1a ? 
0x47 : 0x41) + col; 1208} 1209static int byte_row_uhc_2 (int byte) { 1210 if (byte >= 0xa1 && byte < 0xff) 1211 return byte-0xa1; 1212 else 1213 return -1; 1214} 1215static int byte_col_uhc_2 (int byte) { 1216 if (byte >= 0x41 && byte < 0x5b) 1217 return byte-0x41; 1218 else if (byte >= 0x61 && byte < 0x7b) 1219 return byte-0x47; 1220 else if (byte >= 0x81 && byte < 0xa1) 1221 return byte-0x4d; 1222 else 1223 return -1; 1224} 1225 1226static void do_uhc_2 (const char* name) 1227{ 1228 Encoding enc; 1229 1230 enc.rows = 94; 1231 enc.cols = 84; 1232 enc.row_byte = row_byte_uhc_2; 1233 enc.col_byte = col_byte_uhc_2; 1234 enc.byte_row = byte_row_uhc_2; 1235 enc.byte_col = byte_col_uhc_2; 1236 enc.check_row_expr = "(%1$s >= 0xa1 && %1$s < 0xff)"; 1237 enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xa1)"; 1238 enc.byte_row_expr = "%1$s - 0xa1"; 1239 enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)"; 1240 1241 read_table(&enc); 1242 output_charset2uni_noholes_monotonic(name,&enc); 1243 invert(&enc); output_uni2charset_sparse(name,&enc,true); 1244} 1245 1246/* Big5 specifics */ 1247 1248static int row_byte_big5 (int row) { 1249 return 0xa1+row; 1250} 1251static int col_byte_big5 (int col) { 1252 return (col >= 0x3f ? 
0x62 : 0x40) + col; 1253} 1254static int byte_row_big5 (int byte) { 1255 if (byte >= 0xa1 && byte < 0xff) 1256 return byte-0xa1; 1257 else 1258 return -1; 1259} 1260static int byte_col_big5 (int byte) { 1261 if (byte >= 0x40 && byte < 0x7f) 1262 return byte-0x40; 1263 else if (byte >= 0xa1 && byte < 0xff) 1264 return byte-0x62; 1265 else 1266 return -1; 1267} 1268 1269static void do_big5 (const char* name) 1270{ 1271 Encoding enc; 1272 1273 enc.rows = 94; 1274 enc.cols = 157; 1275 enc.row_byte = row_byte_big5; 1276 enc.col_byte = col_byte_big5; 1277 enc.byte_row = byte_row_big5; 1278 enc.byte_col = byte_col_big5; 1279 enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff"; 1280 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)"; 1281 enc.byte_row_expr = "%1$s - 0xa1"; 1282 enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)"; 1283 1284 read_table(&enc); 1285 output_charset2uni(name,&enc); 1286 invert(&enc); output_uni2charset_sparse(name,&enc,false); 1287} 1288 1289/* HKSCS specifics */ 1290 1291static int row_byte_hkscs (int row) { 1292 return 0x80+row; 1293} 1294static int byte_row_hkscs (int byte) { 1295 if (byte >= 0x80 && byte < 0xff) 1296 return byte-0x80; 1297 else 1298 return -1; 1299} 1300 1301static void do_hkscs (const char* name) 1302{ 1303 Encoding enc; 1304 1305 enc.rows = 128; 1306 enc.cols = 157; 1307 enc.row_byte = row_byte_hkscs; 1308 enc.col_byte = col_byte_big5; 1309 enc.byte_row = byte_row_hkscs; 1310 enc.byte_col = byte_col_big5; 1311 enc.check_row_expr = "%1$s >= 0x80 && %1$s < 0xff"; 1312 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)"; 1313 enc.byte_row_expr = "%1$s - 0x80"; 1314 enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 
0x62 : 0x40)"; 1315 1316 read_table(&enc); 1317 output_charset2uni(name,&enc); 1318 invert(&enc); output_uni2charset_sparse(name,&enc,false); 1319} 1320 1321/* Johab Hangul specifics */ 1322 1323static int row_byte_johab_hangul (int row) { 1324 return 0x84+row; 1325} 1326static int col_byte_johab_hangul (int col) { 1327 return (col >= 0x3e ? 0x43 : 0x41) + col; 1328} 1329static int byte_row_johab_hangul (int byte) { 1330 if (byte >= 0x84 && byte < 0xd4) 1331 return byte-0x84; 1332 else 1333 return -1; 1334} 1335static int byte_col_johab_hangul (int byte) { 1336 if (byte >= 0x41 && byte < 0x7f) 1337 return byte-0x41; 1338 else if (byte >= 0x81 && byte < 0xff) 1339 return byte-0x43; 1340 else 1341 return -1; 1342} 1343 1344static void do_johab_hangul (const char* name) 1345{ 1346 Encoding enc; 1347 1348 enc.rows = 80; 1349 enc.cols = 188; 1350 enc.row_byte = row_byte_johab_hangul; 1351 enc.col_byte = col_byte_johab_hangul; 1352 enc.byte_row = byte_row_johab_hangul; 1353 enc.byte_col = byte_col_johab_hangul; 1354 enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4"; 1355 enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)"; 1356 enc.byte_row_expr = "%1$s - 0x84"; 1357 enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)"; 1358 1359 read_table(&enc); 1360 output_charset2uni(name,&enc); 1361 invert(&enc); output_uni2charset_dense(name,&enc); 1362} 1363 1364/* SJIS specifics */ 1365 1366static int row_byte_sjis (int row) { 1367 return (row >= 0x1f ? 0xc1 : 0x81) + row; 1368} 1369static int col_byte_sjis (int col) { 1370 return (col >= 0x3f ? 
0x41 : 0x40) + col; 1371} 1372static int byte_row_sjis (int byte) { 1373 if (byte >= 0x81 && byte < 0xa0) 1374 return byte-0x81; 1375 else if (byte >= 0xe0) 1376 return byte-0xc1; 1377 else 1378 return -1; 1379} 1380static int byte_col_sjis (int byte) { 1381 if (byte >= 0x40 && byte < 0x7f) 1382 return byte-0x40; 1383 else if (byte >= 0x80 && byte < 0xfd) 1384 return byte-0x41; 1385 else 1386 return -1; 1387} 1388 1389static void do_sjis (const char* name) 1390{ 1391 Encoding enc; 1392 1393 enc.rows = 94; 1394 enc.cols = 188; 1395 enc.row_byte = row_byte_sjis; 1396 enc.col_byte = col_byte_sjis; 1397 enc.byte_row = byte_row_sjis; 1398 enc.byte_col = byte_col_sjis; 1399 enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)"; 1400 enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)"; 1401 enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)"; 1402 enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)"; 1403 1404 read_table(&enc); 1405 output_charset2uni(name,&enc); 1406 invert(&enc); output_uni2charset_sparse(name,&enc,false); 1407} 1408 1409/* GB18030 Unicode specifics */ 1410 1411static void do_gb18030uni (const char* name) 1412{ 1413 int c; 1414 unsigned int bytes; 1415 int i1, i2, i3, i4, i, j, k; 1416 int charset2uni[4*10*126*10]; 1417 int uni2charset[0x10000]; 1418 struct { int low; int high; int diff; int total; } ranges[256]; 1419 int ranges_count, ranges_total; 1420 1421 for (i = 0; i < 4*10*126*10; i++) 1422 charset2uni[i] = 0; 1423 for (j = 0; j < 0x10000; j++) 1424 uni2charset[j] = 0; 1425 1426 /* Read a unicode.org style .TXT file. 
*/ 1427 for (;;) { 1428 c = getc(stdin); 1429 if (c == EOF) 1430 break; 1431 if (c == '\n' || c == ' ' || c == '\t') 1432 continue; 1433 if (c == '#') { 1434 do { c = getc(stdin); } while (!(c == EOF || c == '\n')); 1435 continue; 1436 } 1437 ungetc(c,stdin); 1438 if (scanf("0x%x", &bytes) != 1) 1439 exit(1); 1440 i1 = (bytes >> 24) & 0xff; 1441 i2 = (bytes >> 16) & 0xff; 1442 i3 = (bytes >> 8) & 0xff; 1443 i4 = bytes & 0xff; 1444 if (!(i1 >= 0x81 && i1 <= 0x84 1445 && i2 >= 0x30 && i2 <= 0x39 1446 && i3 >= 0x81 && i3 <= 0xfe 1447 && i4 >= 0x30 && i4 <= 0x39)) { 1448 fprintf(stderr, "lost entry for %02x %02x %02x %02x\n", i1, i2, i3, i4); 1449 exit(1); 1450 } 1451 i = (((i1-0x81) * 10 + (i2-0x30)) * 126 + (i3-0x81)) * 10 + (i4-0x30); 1452 if (scanf(" 0x%x", &j) != 1) 1453 exit(1); 1454 if (!(j >= 0 && j < 0x10000)) 1455 exit(1); 1456 charset2uni[i] = j; 1457 uni2charset[j] = i; 1458 } 1459 1460 /* Verify that the mapping i -> j is monotonically increasing and 1461 of the form 1462 low[k] <= i <= high[k] => j = diff[k] + i 1463 with a set of disjoint intervals (low[k], high[k]). */ 1464 ranges_count = 0; 1465 for (i = 0; i < 4*10*126*10; i++) 1466 if (charset2uni[i] != 0) { 1467 int diff; 1468 j = charset2uni[i]; 1469 diff = j - i; 1470 if (ranges_count > 0) { 1471 if (!(i > ranges[ranges_count-1].high)) 1472 exit(1); 1473 if (!(j > ranges[ranges_count-1].high + ranges[ranges_count-1].diff)) 1474 exit(1); 1475 /* Additional property: The diffs are also increasing. */ 1476 if (!(diff >= ranges[ranges_count-1].diff)) 1477 exit(1); 1478 } 1479 if (ranges_count > 0 && diff == ranges[ranges_count-1].diff) 1480 ranges[ranges_count-1].high = i; 1481 else { 1482 if (ranges_count == 256) 1483 exit(1); 1484 ranges[ranges_count].low = i; 1485 ranges[ranges_count].high = i; 1486 ranges[ranges_count].diff = diff; 1487 ranges_count++; 1488 } 1489 } 1490 1491 /* Determine size of bitmap. 
*/ 1492 ranges_total = 0; 1493 for (k = 0; k < ranges_count; k++) { 1494 ranges[k].total = ranges_total; 1495 ranges_total += ranges[k].high - ranges[k].low + 1; 1496 } 1497 1498 printf("static const unsigned short %s_charset2uni_ranges[%d] = {\n", name, 2*ranges_count); 1499 for (k = 0; k < ranges_count; k++) { 1500 printf(" 0x%04x, 0x%04x", ranges[k].low, ranges[k].high); 1501 if (k+1 < ranges_count) printf(","); 1502 if ((k % 4) == 3 && k+1 < ranges_count) printf("\n"); 1503 } 1504 printf("\n"); 1505 printf("};\n"); 1506 1507 printf("\n"); 1508 1509 printf("static const unsigned short %s_uni2charset_ranges[%d] = {\n", name, 2*ranges_count); 1510 for (k = 0; k < ranges_count; k++) { 1511 printf(" 0x%04x, 0x%04x", ranges[k].low + ranges[k].diff, ranges[k].high + ranges[k].diff); 1512 if (k+1 < ranges_count) printf(","); 1513 if ((k % 4) == 3 && k+1 < ranges_count) printf("\n"); 1514 } 1515 printf("\n"); 1516 printf("};\n"); 1517 1518 printf("\n"); 1519 1520 printf("static const struct { unsigned short diff; unsigned short bitmap_offset; } %s_ranges[%d] = {\n ", name, ranges_count); 1521 for (k = 0; k < ranges_count; k++) { 1522 printf(" { %5d, 0x%04x }", ranges[k].diff, ranges[k].total); 1523 if (k+1 < ranges_count) printf(","); 1524 if ((k % 4) == 3 && k+1 < ranges_count) printf("\n "); 1525 } 1526 printf("\n"); 1527 printf("};\n"); 1528 1529 printf("\n"); 1530 1531 printf("static const unsigned char %s_bitmap[%d] = {\n ", name, (ranges_total + 7) / 8); 1532 { 1533 int accu = 0; 1534 for (k = 0; k < ranges_count; k++) { 1535 for (i = ranges[k].total; i <= ranges[k].total + (ranges[k].high - ranges[k].low);) { 1536 if (charset2uni[i - ranges[k].total + ranges[k].low] != 0) 1537 accu |= (1 << (i % 8)); 1538 i++; 1539 if ((i % 8) == 0) { 1540 printf(" 0x%02x", accu); 1541 if ((i / 8) < (ranges_total + 7) / 8) printf(","); 1542 if (((i / 8) % 12) == 0) 1543 printf("\n "); 1544 accu = 0; 1545 } 1546 } 1547 if (i != (k+1 < ranges_count ? 
ranges[k+1].total : ranges_total)) abort(); 1548 } 1549 if ((ranges_total % 8) != 0) 1550 printf(" 0x%02x", accu); 1551 printf("\n"); 1552 } 1553 printf("};\n"); 1554 1555 printf("\n"); 1556 1557 printf("static int\n"); 1558 printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name); 1559 printf("{\n"); 1560 printf(" unsigned char c1 = s[0];\n"); 1561 printf(" if (c1 >= 0x81 && c1 <= 0x84) {\n"); 1562 printf(" if (n >= 2) {\n"); 1563 printf(" unsigned char c2 = s[1];\n"); 1564 printf(" if (c2 >= 0x30 && c2 <= 0x39) {\n"); 1565 printf(" if (n >= 3) {\n"); 1566 printf(" unsigned char c3 = s[2];\n"); 1567 printf(" if (c3 >= 0x81 && c3 <= 0xfe) {\n"); 1568 printf(" if (n >= 4) {\n"); 1569 printf(" unsigned char c4 = s[3];\n"); 1570 printf(" if (c4 >= 0x30 && c4 <= 0x39) {\n"); 1571 printf(" unsigned int i = (((c1 - 0x81) * 10 + (c2 - 0x30)) * 126 + (c3 - 0x81)) * 10 + (c4 - 0x30);\n"); 1572 printf(" if (i >= %d && i <= %d) {\n", ranges[0].low, ranges[ranges_count-1].high); 1573 printf(" unsigned int k1 = 0;\n"); 1574 printf(" unsigned int k2 = %d;\n", ranges_count-1); 1575 printf(" while (k1 < k2) {\n"); 1576 printf(" unsigned int k = (k1 + k2) / 2;\n"); 1577 printf(" if (i <= %s_charset2uni_ranges[2*k+1])\n", name); 1578 printf(" k2 = k;\n"); 1579 printf(" else if (i >= %s_charset2uni_ranges[2*k+2])\n", name); 1580 printf(" k1 = k + 1;\n"); 1581 printf(" else\n"); 1582 printf(" return RET_ILSEQ;\n"); 1583 printf(" }\n"); 1584 printf(" {\n"); 1585 printf(" unsigned int bitmap_index = i - %s_charset2uni_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name); 1586 printf(" if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name); 1587 printf(" unsigned int diff = %s_ranges[k1].diff;\n", name); 1588 printf(" *pwc = (ucs4_t) (i + diff);\n"); 1589 printf(" return 4;\n"); 1590 printf(" }\n"); 1591 printf(" }\n"); 1592 printf(" }\n"); 1593 printf(" }\n"); 1594 printf(" return RET_ILSEQ;\n"); 1595 printf(" }\n"); 1596 printf(" 
return RET_TOOFEW(0);\n"); 1597 printf(" }\n"); 1598 printf(" return RET_ILSEQ;\n"); 1599 printf(" }\n"); 1600 printf(" return RET_TOOFEW(0);\n"); 1601 printf(" }\n"); 1602 printf(" return RET_ILSEQ;\n"); 1603 printf(" }\n"); 1604 printf(" return RET_TOOFEW(0);\n"); 1605 printf(" }\n"); 1606 printf(" return RET_ILSEQ;\n"); 1607 printf("}\n"); 1608 1609 printf("\n"); 1610 1611 printf("static int\n"); 1612 printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name); 1613 printf("{\n"); 1614 printf(" if (n >= 4) {\n"); 1615 printf(" unsigned int i = wc;\n"); 1616 printf(" if (i >= 0x%04x && i <= 0x%04x) {\n", ranges[0].low + ranges[0].diff, ranges[ranges_count-1].high + ranges[ranges_count-1].diff); 1617 printf(" unsigned int k1 = 0;\n"); 1618 printf(" unsigned int k2 = %d;\n", ranges_count-1); 1619 printf(" while (k1 < k2) {\n"); 1620 printf(" unsigned int k = (k1 + k2) / 2;\n"); 1621 printf(" if (i <= %s_uni2charset_ranges[2*k+1])\n", name); 1622 printf(" k2 = k;\n"); 1623 printf(" else if (i >= %s_uni2charset_ranges[2*k+2])\n", name); 1624 printf(" k1 = k + 1;\n"); 1625 printf(" else\n"); 1626 printf(" return RET_ILUNI;\n"); 1627 printf(" }\n"); 1628 printf(" {\n"); 1629 printf(" unsigned int bitmap_index = i - %s_uni2charset_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name); 1630 printf(" if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name); 1631 printf(" unsigned int diff = %s_ranges[k1].diff;\n", name); 1632 printf(" i -= diff;\n"); 1633 printf(" r[3] = (i %% 10) + 0x30; i = i / 10;\n"); 1634 printf(" r[2] = (i %% 126) + 0x81; i = i / 126;\n"); 1635 printf(" r[1] = (i %% 10) + 0x30; i = i / 10;\n"); 1636 printf(" r[0] = i + 0x81;\n"); 1637 printf(" return 4;\n"); 1638 printf(" }\n"); 1639 printf(" }\n"); 1640 printf(" }\n"); 1641 printf(" return RET_ILUNI;\n"); 1642 printf(" }\n"); 1643 printf(" return RET_TOOSMALL;\n"); 1644 printf("}\n"); 1645} 1646 1647/* JISX0213 specifics */ 1648 1649static void do_jisx0213 
(const char* name)
{
  /* Generates the complete jisx0213.h contents on stdout.  The input on
     stdin consists of lines of the form
        0xPRRCC<TAB>0xUUUU            (single Unicode character), or
        0xPRRCC<TAB>0xUUUU 0xVVVV     (base character + combining character).
     The input is read twice (once per conversion direction), so stdin must
     be seekable.  The 'name' argument is not used. */

/* Asserts that buf[from..to] are all hexadecimal digits.  The cast keeps
   isxdigit() well-defined when plain char is signed. */
#define assert_xdigits(buf,from,to) \
  do { \
    int pos_; \
    for (pos_ = (from); pos_ <= (to); pos_++) \
      assert(isxdigit((unsigned char) (buf)[pos_])); \
  } while (0)

  printf("#ifndef _JISX0213_H\n");
  printf("#define _JISX0213_H\n");
  printf("\n");
  printf("/* JISX0213 plane 1 (= ISO-IR-233) characters are in the range\n");
  printf(" 0x{21..7E}{21..7E}.\n");
  printf(" JISX0213 plane 2 (= ISO-IR-229) characters are in the range\n");
  printf(" 0x{21,23..25,28,2C..2F,6E..7E}{21..7E}.\n");
  printf(" Together this makes 120 rows of 94 characters.\n");
  printf("*/\n");
  printf("\n");
  {
/* Maps a plane+row value (0x121..0x27E) to a compact row index 0..119,
   or -1 if the row is not used. */
#define row_convert(row) \
    ((row) >= 0x121 && (row) <= 0x17E ? (row)-289 : /* 0..93 */ \
     (row) == 0x221 ? (row)-451 : /* 94 */ \
     (row) >= 0x223 && (row) <= 0x225 ? (row)-452 : /* 95..97 */ \
     (row) == 0x228 ? (row)-454 : /* 98 */ \
     (row) >= 0x22C && (row) <= 0x22F ? (row)-457 : /* 99..102 */ \
     (row) >= 0x26E && (row) <= 0x27E ? (row)-519 : /* 103..119 */ \
     -1)
    unsigned int table[120][94];
    int pagemin[0x1100];
    int pagemax[0x1100];
    int pageidx[0x1100];
    unsigned int pagestart[0x1100];
    unsigned int pagestart_len = 0;
    {
      unsigned int rowc, colc;
      for (rowc = 0; rowc < 120; rowc++)
        for (colc = 0; colc < 94; colc++)
          table[rowc][colc] = 0;
    }
    {
      unsigned int page;
      for (page = 0; page < 0x1100; page++)
        pagemin[page] = -1;
      for (page = 0; page < 0x1100; page++)
        pagemax[page] = -1;
      for (page = 0; page < 0x1100; page++)
        pageidx[page] = -1;
    }
    printf("static const unsigned short jisx0213_to_ucs_combining[][2] = {\n");
    {
      /* Combining pairs are stored in the main table as small indices
         1, 2, ... into the combining table printed here. */
      int private_use = 0x0001;
      for (;;) {
        char line[30];
        unsigned int row, col;
        unsigned int ucs;
        memset(line,0,sizeof(line));
        /* Bounded read: at most 29 characters plus the terminating NUL. */
        if (scanf("%29[^\n]",line) < 1)
          break;
        /* Drop whatever did not fit into the buffer, then the newline and
           any following whitespace (the latter is what the trailing "\n"
           directive of an unbounded "%[^\n]\n" format would consume). */
        (void) scanf("%*[^\n]");
        (void) scanf(" ");
        assert(line[0]=='0');
        assert(line[1]=='x');
        assert_xdigits(line,2,6);
        assert(line[7]=='\t');
        line[7] = '\0';
        col = strtoul(&line[5],NULL,16);
        line[5] = '\0';
        row = strtoul(&line[2],NULL,16);
        /* A line of exactly 21 characters carries two Unicode values. */
        if (line[20] != '\0' && line[21] == '\0') {
          unsigned int u1, u2;
          assert(line[8]=='0');
          assert(line[9]=='x');
          assert_xdigits(line,10,13);
          assert(line[14]==' ');
          assert(line[15]=='0');
          assert(line[16]=='x');
          assert_xdigits(line,17,20);
          u2 = strtoul(&line[17],NULL,16);
          line[14] = '\0';
          u1 = strtoul(&line[10],NULL,16);
          printf(" { 0x%04x, 0x%04x },\n", u1, u2);
          ucs = private_use++;
        } else {
          assert(line[8]=='0');
          assert(line[9]=='x');
          assert_xdigits(line,10,13);
          ucs = strtoul(&line[10],NULL,16);
        }
        /* ucs >> 8 indexes the page arrays, which have 0x1100 entries. */
        assert(ucs < 0x110000);
        assert((unsigned int) row_convert(row) < 120);
        assert((unsigned int) (col-0x21) < 94);
        table[row_convert(row)][col-0x21] = ucs;
      }
    }
    printf("};\n");
    printf("\n");
    {
      /* Record the smallest and largest low byte used on every 256-value page. */
      unsigned int rowc, colc;
      for (rowc = 0; rowc < 120; rowc++) {
        for (colc = 0; colc < 94; colc++) {
          unsigned int value = table[rowc][colc];
          unsigned int page = value >> 8;
          int rest = (int) (value & 0xff);
          if (pagemin[page] < 0 || pagemin[page] > rest) pagemin[page] = rest;
          if (pagemax[page] < 0 || pagemax[page] < rest) pagemax[page] = rest;
        }
      }
    }
    {
      unsigned int index = 0;
      unsigned int i;
      for (i = 0; i < 0x1100; ) {
        if (pagemin[i] >= 0) {
          /* The i+1 bound keeps the look-ahead inside the arrays. */
          if (i+1 < 0x1100
              && pagemin[i+1] >= 0 && pagemin[i] >= 0x80 && pagemax[i+1] < 0x80) {
            /* Combine two pages into a single one. */
            assert(pagestart_len < sizeof(pagestart)/sizeof(pagestart[0]));
            pagestart[pagestart_len++] = (i<<8)+0x80;
            pageidx[i] = index;
            pageidx[i+1] = index;
            index++;
            i += 2;
          } else {
            /* A single page. */
            assert(pagestart_len < sizeof(pagestart)/sizeof(pagestart[0]));
            pagestart[pagestart_len++] = i<<8;
            pageidx[i] = index;
            index++;
            i += 1;
          }
        } else
          i++;
      }
    }
    printf("static const unsigned short jisx0213_to_ucs_main[120 * 94] = {\n");
    {
      unsigned int row;
      for (row = 0; row < 0x300; row++) {
        unsigned int rowc = row_convert(row);
        if (rowc != (unsigned int) (-1)) {
          printf(" /* 0x%X21..0x%X7E */\n",row,row);
          {
            unsigned int count = 0;
            unsigned int colc;
            for (colc = 0; colc < 94; colc++) {
              if ((count % 8) == 0) printf(" ");
              {
                /* Each entry is (page index << 8) | offset within the page. */
                unsigned int value = table[rowc][colc];
                unsigned int page = value >> 8;
                unsigned int index = pageidx[page];
                assert(value-pagestart[index] < 0x100);
                printf(" 0x%04x,",(index<<8)|(value-pagestart[index]));
              }
              count++;
              if ((count % 8) == 0) printf("\n");
            }
          }
          printf("\n");
        }
      }
    }
    printf("};\n");
    printf("\n");
    printf("static const ucs4_t jisx0213_to_ucs_pagestart[] = {\n");
    {
      unsigned int count = 0;
      unsigned int i;
      for (i = 0; i < pagestart_len; i++) {
        char buf[10];
        if ((count % 8) == 0) printf(" ");
        printf(" ");
        sprintf(buf,"0x%04x",pagestart[i]);
        /* A '*' field width must be an int, not a size_t. */
        if (strlen(buf) < 7) printf("%*s",(int) (7-strlen(buf)),"");
        printf("%s,",buf);
        count++;
        if ((count % 8) == 0) printf("\n");
      }
    }
    printf("\n");
    printf("};\n");
#undef row_convert
  }
  /* Second pass over the same input.  Unlike rewind(), fseek() reports
     failure, e.g. when stdin is a pipe. */
  if (fseek(stdin, 0L, SEEK_SET) != 0) {
    fprintf(stderr, "cannot rewind standard input; it must be a regular file\n");
    exit(1);
  }
  clearerr(stdin);
  printf("\n");
  {
    /* Static storage: table alone is over 4 MB, far too much for the stack.
       Both arrays are fully initialized below. */
    static int table[0x110000];
    static bool pages[0x4400];
    int maxpage = -1;
    unsigned int npages;
    unsigned int combining_prefixes[100];
    unsigned int combining_prefixes_len = 0;
    {
      unsigned int i;
      for (i = 0; i < 0x110000; i++)
        table[i] = -1;
      for (i = 0; i < 0x4400; i++)
        pages[i] = false;
    }
    for (;;) {
      char line[30];
      unsigned int plane, row, col;
      memset(line,0,sizeof(line));
      /* Same bounded line reading as in the first pass. */
      if (scanf("%29[^\n]",line) < 1)
        break;
      (void) scanf("%*[^\n]");
      (void) scanf(" ");
      assert(line[0]=='0');
      assert(line[1]=='x');
      assert_xdigits(line,2,6);
      assert(line[7]=='\t');
      line[7] = '\0';
      col = strtoul(&line[5],NULL,16);
      line[5] = '\0';
      row = strtoul(&line[3],NULL,16);
      line[3] = '\0';
      plane = strtoul(&line[2],NULL,16) - 1;
      if (line[20] != '\0' && line[21] == '\0') {
        unsigned int u1, u2;
        assert(line[8]=='0');
        assert(line[9]=='x');
        assert_xdigits(line,10,13);
        assert(line[14]==' ');
        assert(line[15]=='0');
        assert(line[16]=='x');
        assert_xdigits(line,17,20);
        u2 = strtoul(&line[17],NULL,16);
        line[14] = '\0';
        u1 = strtoul(&line[10],NULL,16);
        assert(u2 == 0x02E5 || u2 == 0x02E9 || u2 == 0x0300 || u2 == 0x0301
               || u2 == 0x309A);
        assert(combining_prefixes_len < sizeof(combining_prefixes)/sizeof(combining_prefixes[0]));
        combining_prefixes[combining_prefixes_len++] = u1;
      } else {
        unsigned int ucs;
        assert(line[8]=='0');
        assert(line[9]=='x');
        assert_xdigits(line,10,13);
        ucs = strtoul(&line[10],NULL,16);
        /* Add an entry. */
        assert(ucs < 0x110000);
        assert(plane <= 1);
        assert(row <= 0x7f);
        assert(col <= 0x7f);
        table[ucs] = (plane << 15) | (row << 8) | col;
        pages[ucs>>6] = true;
        if (maxpage < 0 || (int) (ucs>>6) > maxpage) maxpage = ucs>>6;
      }
    }
    {
      /* Flag every character that can start a combining sequence. */
      unsigned int i;
      for (i = 0; i < combining_prefixes_len; i++) {
        unsigned int u1 = combining_prefixes[i];
        assert(u1 < 0x110000);
        assert(table[u1] >= 0);
        table[u1] |= 0x0080;
      }
    }
    /* Number of 64-value pages to emit.  Comparing an unsigned counter
       against maxpage == -1 directly would never terminate on empty input. */
    npages = (unsigned int) (maxpage + 1);
    printf("static const short jisx0213_from_ucs_level1[%d] = {\n",maxpage+1);
    {
      unsigned int index = 0;
      unsigned int i;
      for (i = 0; i < npages; i++) {
        if ((i % 8) == 0) printf(" ");
        if (pages[i]) {
          printf(" %3u,",index);
          index++;
        } else {
          printf(" %3d,",-1);
        }
        if (((i+1) % 8) == 0) printf("\n");
      }
    }
    printf("\n");
    printf("};\n");
    printf("\n");
#if 0 /* Dense array */
    printf("static const unsigned short jisx0213_from_ucs_level2[] = {\n");
    {
      unsigned int i;
      for (i = 0; i < npages; i++) {
        if (pages[i]) {
          printf(" /* 0x%04X */\n",i<<6);
          {
            unsigned int j;
            for (j = 0; j < 0x40; ) {
              unsigned int ucs = (i<<6)+j;
              int value = table[ucs];
              if (value < 0) value = 0;
              if ((j % 8) == 0) printf(" ");
              printf(" 0x%04x,",value);
              j++;
              if ((j % 8) == 0) printf("\n");
            }
          }
        }
      }
    }
    printf("};\n");
#else /* Sparse array */
    {
      /* Static for the same stack-size reason as above; every element that
         is read later is written first. */
      static int summary_indx[0x11000];
      static int summary_used[0x11000];
      unsigned int i, k, indx;
      printf("static const unsigned short jisx0213_from_ucs_level2_data[] = {\n");
      /* Fill summary_indx[] and summary_used[]. */
      indx = 0;
      for (i = 0, k = 0; i < npages; i++) {
        if (pages[i]) {
          unsigned int j1, j2;
          unsigned int count = 0;
          printf(" /* 0x%04X */\n",i<<6);
          /* Each page is split into four groups of 16 values. */
          for (j1 = 0; j1 < 4; j1++) {
            summary_indx[4*k+j1] = indx;
            summary_used[4*k+j1] = 0;
            for (j2 = 0; j2 < 16; j2++) {
              unsigned int j = 16*j1+j2;
              unsigned int ucs = (i<<6)+j;
              int value = table[ucs];
              if (value < 0) value = 0;
              if (value > 0) {
                summary_used[4*k+j1] |= (1 << j2);
                if ((count % 8) == 0) printf(" ");
                printf(" 0x%04x,",value);
                count++;
                if ((count % 8) == 0) printf("\n");
                indx++;
              }
            }
          }
          if ((count % 8) > 0)
            printf("\n");
          k++;
        }
      }
      printf("};\n");
      printf("\n");
      printf("static const Summary16 jisx0213_from_ucs_level2_2indx[] = {\n");
      for (i = 0, k = 0; i < npages; i++) {
        if (pages[i]) {
          unsigned int j1;
          printf(" /* 0x%04X */\n",i<<6);
          printf(" ");
          for (j1 = 0; j1 < 4; j1++) {
            printf(" { %4d, 0x%04x },", summary_indx[4*k+j1], summary_used[4*k+j1]);
          }
          printf("\n");
          k++;
        }
      }
      printf("};\n");
    }
#endif
    printf("\n");
  }
  printf("#ifdef __GNUC__\n");
  printf("__inline\n");
  printf("#else\n");
  printf("#ifdef __cplusplus\n");
  printf("inline\n");
  printf("#endif\n");
  printf("#endif\n");
  printf("static ucs4_t jisx0213_to_ucs4 (unsigned int row, unsigned int col)\n");
  printf("{\n");
  printf(" ucs4_t val;\n");
  printf("\n");
  printf(" if (row >= 0x121 && row <= 0x17e)\n");
  printf(" row -= 289;\n");
  printf(" else if (row == 0x221)\n");
  printf(" row -= 451;\n");
  printf(" else if (row >= 0x223 && row <= 0x225)\n");
  printf(" row -= 452;\n");
  printf(" else if (row == 0x228)\n");
  printf(" row -= 454;\n");
  printf(" else if (row >= 0x22c && row <= 0x22f)\n");
  printf(" row -= 457;\n");
  printf(" else if (row >= 0x26e && row <= 0x27e)\n");
  printf(" row -= 519;\n");
  printf(" else\n");
  printf(" return 0x0000;\n");
  printf("\n");
  printf(" if (col >= 0x21 && col <= 0x7e)\n");
  printf(" col -= 0x21;\n");
  printf(" else\n");
  printf(" return 0x0000;\n");
  printf("\n");
  printf(" val = jisx0213_to_ucs_main[row * 94 + col];\n");
  printf(" val = jisx0213_to_ucs_pagestart[val >> 8] + (val & 0xff);\n");
  printf(" if (val == 0xfffd)\n");
  printf(" val = 0x0000;\n");
  printf(" return val;\n");
  printf("}\n");
  printf("\n");
  printf("#ifdef __GNUC__\n");
  printf("__inline\n");
  printf("#else\n");
  printf("#ifdef __cplusplus\n");
  printf("inline\n");
  printf("#endif\n");
  printf("#endif\n");
  printf("static unsigned short ucs4_to_jisx0213 (ucs4_t ucs)\n");
  printf("{\n");
  printf(" if (ucs < (sizeof(jisx0213_from_ucs_level1)/sizeof(jisx0213_from_ucs_level1[0])) << 6) {\n");
  printf(" int index1 = jisx0213_from_ucs_level1[ucs >> 6];\n");
  printf(" if (index1 >= 0)");
#if 0 /* Dense array */
  printf("\n");
  printf(" return jisx0213_from_ucs_level2[(index1 << 6) + (ucs & 0x3f)];\n");
#else /* Sparse array */
  printf(" {\n");
  printf(" const Summary16 *summary = &jisx0213_from_ucs_level2_2indx[((index1 << 6) + (ucs & 0x3f)) >> 4];\n");
  printf(" unsigned short used = summary->used;\n");
  printf(" unsigned int i = ucs & 0x0f;\n");
  printf(" if (used & ((unsigned short) 1 << i)) {\n");
  printf(" /* Keep in `used' only the bits 0..i-1. */\n");
  printf(" used &= ((unsigned short) 1 << i) - 1;\n");
  printf(" /* Add `summary->indx' and the number of bits set in `used'. */\n");
  printf(" used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
  printf(" used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
  printf(" used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
  printf(" used = (used & 0x00ff) + (used >> 8);\n");
  printf(" return jisx0213_from_ucs_level2_data[summary->indx + used];\n");
  printf(" };\n");
  printf(" };\n");
#endif
  printf(" }\n");
  printf(" return 0x0000;\n");
  printf("}\n");
  printf("\n");
  printf("#endif /* _JISX0213_H */\n");
#undef assert_xdigits
}

/* Main program */

/* Usage: cjk_tab_to_h CHARSETNAME NAME < TABLE > HEADER
   CHARSETNAME is passed to output_title(); NAME selects the generator and is
   used as the prefix of the generated identifiers.  Exits with status 1 on
   a wrong argument count or an unknown NAME. */
int main (int argc, char *argv[])
{
  const char* charsetname;
  const char* name;

  if (argc != 3) {
    fprintf(stderr, "Usage: %s CHARSETNAME NAME < TABLE > HEADER\n",
            argc > 0 && argv[0] != NULL ? argv[0] : "cjk_tab_to_h");
    exit(1);
  }
  charsetname = argv[1];
  name = argv[2];

  output_title(charsetname);

  if (!strcmp(name,"gb2312")
      || !strcmp(name,"isoir165ext") || !strcmp(name,"gb12345ext")
      || !strcmp(name,"jisx0208") || !strcmp(name,"jisx0212"))
    do_normal(name);
  else if (!strcmp(name,"cns11643_1") || !strcmp(name,"cns11643_2")
           || !strcmp(name,"cns11643_3") || !strcmp(name,"cns11643_4a")
           || !strcmp(name,"cns11643_4b") || !strcmp(name,"cns11643_5")
           || !strcmp(name,"cns11643_6") || !strcmp(name,"cns11643_7")
           || !strcmp(name,"cns11643_15"))
    do_normal_only_charset2uni(name);
  else if (!strcmp(name,"cns11643_inv"))
    do_cns11643_only_uni2charset(name);
  else if (!strcmp(name,"gbkext1"))
    do_gbk1_only_charset2uni(name);
  else if (!strcmp(name,"gbkext2"))
    do_gbk2_only_charset2uni(name);
  else if (!strcmp(name,"gbkext_inv"))
    do_gbk1_only_uni2charset(name);
  else if (!strcmp(name,"cp936ext") || !strcmp(name,"gb18030ext"))
    do_gbk1(name);
  else if (!strcmp(name,"ksc5601"))
    do_ksc5601(name);
  else if (!strcmp(name,"uhc_1"))
    do_uhc_1(name);
  else if (!strcmp(name,"uhc_2"))
    do_uhc_2(name);
  else if (!strcmp(name,"big5") || !strcmp(name,"cp950ext"))
    do_big5(name);
  else if (!strcmp(name,"hkscs1999") || !strcmp(name,"hkscs2001")
           || !strcmp(name,"hkscs2004"))
    do_hkscs(name);
  else if (!strcmp(name,"johab_hangul"))
    do_johab_hangul(name);
  else if (!strcmp(name,"cp932ext"))
    do_sjis(name);
  else if (!strcmp(name,"gb18030uni"))
    do_gb18030uni(name);
  else if (!strcmp(name,"jisx0213"))
    do_jisx0213(name);
  else {
    fprintf(stderr, "%s: unknown table name\n", name);
    exit(1);
  }

  return 0;
}