/* Copyright (C) 1999-2002 Free Software Foundation, Inc.
   This file is part of the GNU LIBICONV Tools.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */

/*
 * Generates an 8-bit character set table from a .TXT table as found on
 * ftp.unicode.org or from a table containing the 256 Unicode values as
 * hexadecimal integers.
 * Examples:
 *
 *   ./8bit_tab_to_h ISO-8859-1 iso8859_1 < tab8859_1
 *   ./8bit_tab_to_h ISO-8859-2 iso8859_2 < tab8859_2
 *   ./8bit_tab_to_h ISO-8859-3 iso8859_3 < tab8859_3
 *   ./8bit_tab_to_h ISO-8859-4 iso8859_4 < tab8859_4
 *   ./8bit_tab_to_h ISO-8859-5 iso8859_5 < tab8859_5
 *   ./8bit_tab_to_h ISO-8859-6 iso8859_6 < tab8859_6
 *   ./8bit_tab_to_h ISO-8859-7 iso8859_7 < tab8859_7
 *   ./8bit_tab_to_h ISO-8859-8 iso8859_8 < tab8859_8
 *   ./8bit_tab_to_h ISO-8859-9 iso8859_9 < tab8859_9
 *   ./8bit_tab_to_h ISO-8859-10 iso8859_10 < tab8859_10
 *   ./8bit_tab_to_h ISO-8859-14 iso8859_14 < tab8859_14
 *   ./8bit_tab_to_h ISO-8859-15 iso8859_15 < tab8859_15
 *   ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < jis0201
 *   ./8bit_tab_to_h TIS620.2533-1 tis620 < tabtis620
 *   ./8bit_tab_to_h KOI8-R koi8_r < tabkoi8_r
 *   ./8bit_tab_to_h KOI8-U koi8_u < tabkoi8_u
 *   ./8bit_tab_to_h ARMSCII-8 armscii_8 < tabarmscii_8
 *   ./8bit_tab_to_h CP1133 cp1133 < tabibm_cp1133
 *   ./8bit_tab_to_h MULELAO-1 mulelao < tabmulelao_1
 *   ./8bit_tab_to_h VISCII1.1-1 viscii1 < tabviscii
 *   ./8bit_tab_to_h TCVN-5712 tcvn < tabtcvn
 *   ./8bit_tab_to_h GEORGIAN-ACADEMY georgian_ac < tabgeorgian_academy
 *   ./8bit_tab_to_h GEORGIAN-PS georgian_ps < tabgeorgian_ps
 *
 *   ./8bit_tab_to_h ISO-8859-1 iso8859_1 < 8859-1.TXT
 *   ./8bit_tab_to_h ISO-8859-2 iso8859_2 < 8859-2.TXT
 *   ./8bit_tab_to_h ISO-8859-3 iso8859_3 < 8859-3.TXT
 *   ./8bit_tab_to_h ISO-8859-4 iso8859_4 < 8859-4.TXT
 *   ./8bit_tab_to_h ISO-8859-5 iso8859_5 < 8859-5.TXT
 *   ./8bit_tab_to_h ISO-8859-6 iso8859_6 < 8859-6.TXT
 *   ./8bit_tab_to_h ISO-8859-7 iso8859_7 < 8859-7.TXT
 *   ./8bit_tab_to_h ISO-8859-8 iso8859_8 < 8859-8.TXT
 *   ./8bit_tab_to_h ISO-8859-9 iso8859_9 < 8859-9.TXT
 *   ./8bit_tab_to_h ISO-8859-10 iso8859_10 < 8859-10.TXT
 *   ./8bit_tab_to_h ISO-8859-14 iso8859_14 < 8859-14.TXT
 *   ./8bit_tab_to_h ISO-8859-15 iso8859_15 < 8859-15.TXT
 *   ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < JIS0201.TXT
 *   ./8bit_tab_to_h KOI8-R koi8_r < KOI8-R.TXT
 */

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Allocates SIZE bytes.  Prints a message and terminates the program with
   exit status 1 when the allocation fails, so callers never receive NULL. */
static void *
xmalloc (size_t size)
{
  void *p = malloc(size);
  if (p == NULL) {
    fprintf(stderr, "8bit_tab_to_h: out of memory\n");
    exit(1);
  }
  return p;
}

/*
 * Entry point.
 *
 *   argv[1]  charset name; written into a comment of the generated file.
 *   argv[2]  C identifier prefix for the generated tables and functions;
 *            also the default output file name (with ".h" appended).
 *   argv[3]  optional output file name.
 *   argv[4]  optional directory; it is prepended to the file name as is,
 *            no separator is inserted.
 *
 * The table is read from standard input.  If the first character is '#',
 * the input is parsed as "0xNN 0xNNNN" pairs with '#' comments; otherwise
 * it must consist of exactly 256 hexadecimal values.
 *
 * Exits with status 0 on success and 1 on bad arguments, malformed input,
 * values that do not fit into 16 bits, or I/O failure.  Internal
 * consistency failures call abort().
 */
int main (int argc, char *argv[])
{
  const char* charsetname;
  const char* c_charsetname;
  const char* filename;
  const char* directory;
  int charset2uni[0x100];

  if (argc != 3 && argc != 4 && argc != 5) {
    fprintf(stderr,
            "Usage: %s charsetname c_charsetname [filename [directory]] < table\n",
            (argc > 0 && argv[0] != NULL ? argv[0] : "8bit_tab_to_h"));
    exit(1);
  }
  charsetname = argv[1];
  c_charsetname = argv[2];
  if (argc > 3) {
    filename = argv[3];
  } else {
    char* s = xmalloc(strlen(c_charsetname)+strlen(".h")+1);
    strcpy(s,c_charsetname); strcat(s,".h");
    filename = s;
  }
  directory = (argc > 4 ? argv[4] : "");

  fprintf(stderr, "Creating %s%s\n", directory, filename);

  {
    int i, c;
    /* scanf's %x conversion requires a pointer to unsigned int. */
    unsigned int value;

    c = getc(stdin);
    ungetc(c,stdin);
    if (c == '#') {
      /* Read a unicode.org style .TXT file. */
      for (i = 0; i < 0x100; i++)
        charset2uni[i] = 0xfffd;
      for (;;) {
        c = getc(stdin);
        if (c == EOF)
          break;
        if (c == '\n' || c == ' ' || c == '\t')
          continue;
        if (c == '#') {
          do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
          continue;
        }
        ungetc(c,stdin);
        if (scanf("0x%x", &value) != 1 || value >= 0x100)
          exit(1);
        i = (int) value;
        do { c = getc(stdin); } while (c == ' ' || c == '\t');
        if (c != EOF)
          ungetc(c,stdin);
        /* A byte without a Unicode value stays at 0xfffd. */
        if (c == '\n' || c == '#')
          continue;
        if (scanf("0x%x", &value) != 1)
          exit(1);
        /* The tables below are indexed by the Unicode value and hold
           0x10000 entries; anything larger would write out of bounds. */
        if (value > 0xffff) {
          fprintf(stderr, "8bit_tab_to_h: Unicode value 0x%x for byte 0x%02x does not fit into 16 bits\n",
                  value, (unsigned int) i);
          exit(1);
        }
        charset2uni[i] = (int) value;
      }
    } else {
      /* Read a table of hexadecimal Unicode values. */
      for (i = 0; i < 0x100; i++) {
        if (scanf("%x", &value) != 1)
          exit(1);
        /* Values that would be negative as an int, and 0xffff, mark an
           unmapped byte. */
        if (value > (unsigned int) INT_MAX || value == 0xffff)
          charset2uni[i] = 0xfffd;
        else if (value > 0xffff) {
          fprintf(stderr, "8bit_tab_to_h: Unicode value 0x%x for byte 0x%02x does not fit into 16 bits\n",
                  value, (unsigned int) i);
          exit(1);
        } else
          charset2uni[i] = (int) value;
      }
      /* Nothing but whitespace may follow the 256 values. */
      if (scanf("%x", &value) != EOF)
        exit(1);
    }
  }

  /* Write the output file. */
  {
    FILE* f;

    {
      char* fname = xmalloc(strlen(directory)+strlen(filename)+1);
      strcpy(fname,directory); strcat(fname,filename);
      f = fopen(fname,"w");
      if (f == NULL) {
        perror(fname);
        exit(1);
      }
      free(fname);
    }

    fprintf(f, "/*\n");
    fprintf(f, " * Copyright (C) 1999-2002 Free Software Foundation, Inc.\n");
    fprintf(f, " * This file is part of the GNU LIBICONV Library.\n");
    fprintf(f, " *\n");
    fprintf(f, " * The GNU LIBICONV Library is free software; you can redistribute it\n");
    fprintf(f, " * and/or modify it under the terms of the GNU Library General Public\n");
    fprintf(f, " * License as published by the Free Software Foundation; either version 2\n");
    fprintf(f, " * of the License, or (at your option) any later version.\n");
    fprintf(f, " *\n");
    fprintf(f, " * The GNU LIBICONV Library is distributed in the hope that it will be\n");
    fprintf(f, " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
    fprintf(f, " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n");
    fprintf(f, " * Library General Public License for more details.\n");
    fprintf(f, " *\n");
    fprintf(f, " * You should have received a copy of the GNU Library General Public\n");
    fprintf(f, " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n");
    fprintf(f, " * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,\n");
    fprintf(f, " * Fifth Floor, Boston, MA 02110-1301, USA.\n");
    fprintf(f, " */\n");
    fprintf(f, "\n");
    fprintf(f, "/*\n");
    fprintf(f, " * %s\n", charsetname);
    fprintf(f, " */\n");
    fprintf(f, "\n");

    /* Byte -> Unicode direction: the <name>_2uni tables and <name>_mbtowc. */
    {
      int i, i1, i2, i3;
      /* Per row of 16 bytes: -2 = all unmapped, -1 = all identity,
         otherwise the index of the table that covers the row. */
      int line[16];
      int tableno;
      struct { int minline; int maxline; } tables[16];
      bool some_invalid;
      bool final_ret_reached;

      for (i1 = 0; i1 < 16; i1++) {
        bool all_invalid = true;
        bool all_identity = true;
        for (i2 = 0; i2 < 16; i2++) {
          i = 16*i1+i2;
          if (charset2uni[i] != 0xfffd)
            all_invalid = false;
          if (charset2uni[i] != i)
            all_identity = false;
        }
        if (all_invalid)
          line[i1] = -2;
        else if (all_identity)
          line[i1] = -1;
        else
          line[i1] = 0;
      }
      /* Merge adjacent rows that need a table into one table each. */
      tableno = 0;
      for (i1 = 0; i1 < 16; i1++) {
        if (line[i1] >= 0) {
          if (i1 > 0 && tableno > 0 && line[i1-1] == tableno-1) {
            line[i1] = tableno-1;
            tables[tableno-1].maxline = i1;
          } else {
            tableno++;
            line[i1] = tableno-1;
            tables[tableno-1].minline = tables[tableno-1].maxline = i1;
          }
        }
      }
      some_invalid = false;
      for (i = 0; i < 0x100; i++)
        if (charset2uni[i] == 0xfffd)
          some_invalid = true;
      if (tableno > 0) {
        int t;
        for (t = 0; t < tableno; t++) {
          fprintf(f, "static const unsigned short %s_2uni", c_charsetname);
          if (tableno > 1)
            fprintf(f, "_%d", t+1);
          fprintf(f, "[%d] = {\n", 16*(tables[t].maxline-tables[t].minline+1));
          for (i1 = tables[t].minline; i1 <= tables[t].maxline; i1++) {
            fprintf(f, " /* 0x%02x */\n", 16*i1);
            for (i2 = 0; i2 < 2; i2++) {
              fprintf(f, " ");
              for (i3 = 0; i3 < 8; i3++) {
                i = 16*i1+8*i2+i3;
                fprintf(f, " 0x%04x,", charset2uni[i]);
              }
              fprintf(f, "\n");
            }
          }
          fprintf(f, "};\n");
        }
        fprintf(f, "\n");
      }
      final_ret_reached = false;
      fprintf(f, "static int\n%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", c_charsetname);
      fprintf(f, "{\n");
      fprintf(f, " unsigned char c = *s;\n");
      if (some_invalid) {
        for (i1 = 0; i1 < 16;) {
          int t = line[i1];
          const char* indent;
          for (i2 = i1; i2 < 16 && line[i2] == t; i2++);
          /* NOTE(review): as written both alternatives are the same
             one-space string; presumably they once differed in width --
             confirm against the upstream file. */
          indent = (i1 == 0 && i2 == 16 ? " " : " ");
          if (i1 == 0) {
            if (i2 == 16) {
            } else {
              fprintf(f, " if (c < 0x%02x) {\n", 16*i2);
            }
          } else {
            if (i2 == 16) {
              fprintf(f, " else {\n");
            } else {
              fprintf(f, " else if (c < 0x%02x) {\n", 16*i2);
            }
          }
          if (t == -2) {
            final_ret_reached = true;
          } else if (t == -1) {
            fprintf(f, "%s*pwc = (ucs4_t) c;\n", indent);
            fprintf(f, "%sreturn 1;\n", indent);
          } else {
            /* Whether this range of rows contains an unmapped byte; kept
               separate from the table-wide some_invalid flag. */
            bool range_has_invalid = false;
            fprintf(f, "%s", indent);
            for (i = 16*i1; i < 16*i2; i++)
              if (charset2uni[i] == 0xfffd)
                range_has_invalid = true;
            if (range_has_invalid)
              fprintf(f, "unsigned short wc = ");
            else
              fprintf(f, "*pwc = (ucs4_t) ");
            fprintf(f, "%s_2uni", c_charsetname);
            if (tableno > 1)
              fprintf(f, "_%d", t+1);
            fprintf(f, "[c");
            if (tables[t].minline > 0)
              fprintf(f, "-0x%02x", 16*tables[t].minline);
            fprintf(f, "];\n");
            if (range_has_invalid) {
              fprintf(f, "%sif (wc != 0xfffd) {\n", indent);
              fprintf(f, "%s *pwc = (ucs4_t) wc;\n", indent);
              fprintf(f, "%s return 1;\n", indent);
              fprintf(f, "%s}\n", indent);
              final_ret_reached = true;
            } else {
              fprintf(f, "%sreturn 1;\n", indent);
            }
          }
          if (!(i1 == 0 && i2 == 16))
            fprintf(f, " }\n");
          i1 = i2;
        }
        if (final_ret_reached)
          fprintf(f, " return RET_ILSEQ;\n");
      } else {
        for (i1 = 0; i1 < 16;) {
          int t = line[i1];
          for (i2 = i1; i2 < 16 && line[i2] == t; i2++);
          if (i1 == 0) {
            if (i2 == 16) {
              fprintf(f, " ");
            } else {
              fprintf(f, " if (c < 0x%02x)\n ", 16*i2);
            }
          } else {
            if (i2 == 16) {
              fprintf(f, " else\n ");
            } else {
              fprintf(f, " else if (c < 0x%02x)\n ", 16*i2);
            }
          }
          if (t == -1)
            fprintf(f, "*pwc = (ucs4_t) c;\n");
          else {
            fprintf(f, "*pwc = (ucs4_t) %s_2uni", c_charsetname);
            if (tableno > 1)
              fprintf(f, "_%d", t+1);
            fprintf(f, "[c");
            if (tables[t].minline > 0)
              fprintf(f, "-0x%02x", 16*tables[t].minline);
            fprintf(f, "];\n");
          }
          i1 = i2;
        }
        fprintf(f, " return 1;\n");
      }
      fprintf(f, "}\n");

    }

    fprintf(f, "\n");

    /* Unicode -> byte direction: the <name>_page* tables and <name>_wctomb. */
    {
      /* The large arrays live in static storage rather than on the stack. */
      static int uni2charset[0x10000];
      bool pages[0x100];
      /* Per row of 8 Unicode values: -2 = nothing mapped, -1 = identity,
         otherwise the index of the table that starts at the row. */
      static int line[0x2000];
      int tableno;
      static struct { int minline; int maxline; int usecount; const char* suffix; } tables[0x2000];
      bool need_c;
      bool fix_0000;
      int i, j, p, j1, j2, t;

      for (j = 0; j < 0x10000; j++)
        uni2charset[j] = 0;
      for (p = 0; p < 0x100; p++)
        pages[p] = false;
      for (i = 0; i < 0x100; i++) {
        j = charset2uni[i];
        if (j != 0xfffd) {
          uni2charset[j] = i;
          pages[j>>8] = true;
        }
      }
      for (j1 = 0; j1 < 0x2000; j1++) {
        bool all_invalid = true;
        bool all_identity = true;
        for (j2 = 0; j2 < 8; j2++) {
          j = 8*j1+j2;
          if (uni2charset[j] != 0)
            all_invalid = false;
          if (uni2charset[j] != j)
            all_identity = false;
        }
        if (all_invalid)
          line[j1] = -2;
        else if (all_identity)
          line[j1] = -1;
        else
          line[j1] = 0;
      }
      /* A row joins the previous table when it is adjacent to it, or when
         it lies in the same block of 32 rows and at most 8 rows further. */
      tableno = 0;
      for (j1 = 0; j1 < 0x2000; j1++) {
        if (line[j1] >= 0) {
          if (tableno > 0
              && ((j1 > 0 && line[j1-1] == tableno-1)
                  || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
                      && j1 - tables[tableno-1].maxline <= 8))) {
            line[j1] = tableno-1;
            tables[tableno-1].maxline = j1;
          } else {
            tableno++;
            line[j1] = tableno-1;
            tables[tableno-1].minline = tables[tableno-1].maxline = j1;
          }
        }
      }
      for (t = 0; t < tableno; t++) {
        tables[t].usecount = 0;
        j1 = 8*tables[t].minline;
        j2 = 8*(tables[t].maxline+1);
        for (j = j1; j < j2; j++)
          if (uni2charset[j] != 0)
            tables[t].usecount++;
      }
      /* Name the tables that get emitted (more than one mapped value). */
      for (t = 0, p = -1, i = 0; t < tableno; t++) {
        if (tables[t].usecount > 1) {
          enum { SUFFIX_SIZE = 16 };
          char* s = xmalloc(SUFFIX_SIZE);
          if (p == tables[t].minline >> 5) {
            snprintf(s, SUFFIX_SIZE, "%02x_%d", p, ++i);
          } else {
            p = tables[t].minline >> 5;
            snprintf(s, SUFFIX_SIZE, "%02x", p);
          }
          tables[t].suffix = s;
        } else
          tables[t].suffix = NULL;
      }
      {
        p = -1;
        for (t = 0; t < tableno; t++)
          if (tables[t].usecount > 1) {
            p = 0;
            fprintf(f, "static const unsigned char %s_page%s[%d] = {\n", c_charsetname, tables[t].suffix, 8*(tables[t].maxline-tables[t].minline+1));
            for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
              if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
                fprintf(f, " /* 0x%04x */\n", 8*j1);
              fprintf(f, " ");
              for (j2 = 0; j2 < 8; j2++) {
                j = 8*j1+j2;
                fprintf(f, " 0x%02x,", uni2charset[j]);
              }
              fprintf(f, " /* 0x%02x-0x%02x */\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
            }
            fprintf(f, "};\n");
          }
        if (p >= 0)
          fprintf(f, "\n");
      }
      /* The generated function needs a local 'c' unless every range is
         either unmapped or the leading identity range. */
      need_c = false;
      for (j1 = 0; j1 < 0x2000;) {
        t = line[j1];
        for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
        if (t >= 0)
          j2 = tables[t].maxline+1;
        if (!(t == -2 || (t == -1 && j1 == 0)))
          need_c = true;
        j1 = j2;
      }
      fix_0000 = false;
      fprintf(f, "static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", c_charsetname);
      fprintf(f, "{\n");
      if (need_c)
        fprintf(f, " unsigned char c = 0;\n");
      for (j1 = 0; j1 < 0x2000;) {
        t = line[j1];
        for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
        if (t >= 0) {
          if (j1 != tables[t].minline) abort();
          if (j2 > tables[t].maxline+1) abort();
          j2 = tables[t].maxline+1;
        }
        if (t == -2) {
        } else {
          if (j1 == 0)
            fprintf(f, " ");
          else
            fprintf(f, " else ");
          if (t >= 0 && tables[t].usecount == 0) abort();
          if (t >= 0 && tables[t].usecount == 1) {
            /* A single mapped value: compare directly, no table. */
            if (j2 != j1+1) abort();
            for (j = 8*j1; j < 8*j2; j++)
              if (uni2charset[j] != 0) {
                fprintf(f, "if (wc == 0x%04x)\n c = 0x%02x;\n", j, uni2charset[j]);
                break;
              }
          } else {
            if (j1 == 0) {
              fprintf(f, "if (wc < 0x%04x)", 8*j2);
            } else {
              fprintf(f, "if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
            }
            if (t == -1) {
              if (j1 == 0)
                /* If wc == 0, the function must return 1, not -1. */
                fprintf(f, " {\n *r = wc;\n return 1;\n }\n");
              else
                fprintf(f, "\n c = wc;\n");
            } else {
              fprintf(f, "\n c = %s_page%s[wc", c_charsetname, tables[t].suffix);
              if (tables[t].minline > 0)
                fprintf(f, "-0x%04x", 8*j1);
              fprintf(f, "];\n");
              if (j1 == 0 && uni2charset[0] == 0)
                /* If wc == 0, the function must return 1, not -1. */
                fix_0000 = true;
            }
          }
        }
        j1 = j2;
      }
      if (need_c) {
        if (fix_0000)
          fprintf(f, " if (c != 0 || wc == 0) {\n");
        else
          fprintf(f, " if (c != 0) {\n");
        fprintf(f, " *r = c;\n");
        fprintf(f, " return 1;\n");
        fprintf(f, " }\n");
      }
      fprintf(f, " return RET_ILUNI;\n");
      fprintf(f, "}\n");

    }

    if (ferror(f) || fclose(f))
      exit(1);
  }

/* Disabled code: everything up to #endif is excluded from compilation.
   It refers to names (for example 'name') that are not declared in this
   file as shown. */
#if 0

    int i1, i2, i3, i1_min, i1_max, j1, j2;

  i1_min = 16;
  i1_max = -1;
  for (i1 = 0; i1 < 16; i1++)
    for (i2 = 0; i2 < 16; i2++)
      if (charset2uni[16*i1+i2] != 0xfffd) {
        if (i1_min > i1) i1_min = i1;
        if (i1_max < i1) i1_max = i1;
      }
  printf("static const unsigned short %s_2uni[%d] = {\n",
         name, 16*(i1_max-i1_min+1));
  for (i1 = i1_min; i1 <= i1_max; i1++) {
    printf(" /""* 0x%02x *""/\n", 16*i1);
    for (i2 = 0; i2 < 2; i2++) {
      printf(" ");
      for (i3 = 0; i3 < 8; i3++) {
        if (i3 > 0) printf(" ");
        printf("0x%04x,", charset2uni[16*i1+8*i2+i3]);
      }
      printf("\n");
    }
  }
  printf("};\n");
  printf("\n");

  for (p = 0; p < 0x100; p++)
    pages[p] = 0;
  for (i = 0; i < 0x100; i++)
    if (charset2uni[i] != 0xfffd)
      pages[charset2uni[i]>>8] = 1;
  for (p = 0; p < 0x100; p++)
    if (pages[p]) {
      int j1_min = 32;
      int j1_max = -1;
      for (j1 = 0; j1 < 32; j1++)
        for (j2 = 0; j2 < 8; j2++)
          if (uni2charset[256*p+8*j1+j2] != 0) {
            if (j1_min > j1) j1_min = j1;
            if (j1_max < j1) j1_max = j1;
          }
      printf("static const unsigned char %s_page%02x[%d] = {\n",
             name, p, 8*(j1_max-j1_min+1));
      for (j1 = j1_min; j1 <= j1_max; j1++) {
        printf(" ");
        for (j2 = 0; j2 < 8; j2++)
          printf("0x%02x, ", uni2charset[256*p+8*j1+j2]);
        printf("/""* 0x%02x-0x%02x *""/\n", 8*j1, 8*j1+7);
      }
      printf("};\n");
    }
  printf("\n");

}
#endif

  exit(0);
}