1/* Generate a Unicode conforming Line Break Properties tables from a 2 UnicodeData file. 3 Written by Bruno Haible <bruno@clisp.org>, 2000-2004. 4 5This program is free software: you can redistribute it and/or modify 6it under the terms of the GNU General Public License as published by 7the Free Software Foundation; either version 3 of the License, or 8(at your option) any later version. 9 10This program is distributed in the hope that it will be useful, 11but WITHOUT ANY WARRANTY; without even the implied warranty of 12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13GNU General Public License for more details. 14 15You should have received a copy of the GNU General Public License 16along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18/* Usage example: 19 $ gen-lbrkprop /usr/local/share/Unidata/UnicodeData.txt \ 20 Combining.txt \ 21 /usr/local/share/Unidata/EastAsianWidth.txt \ 22 /usr/local/share/Unidata/LineBreak.txt \ 23 3.1.0 24 */ 25 26#include <stdio.h> 27#include <stdlib.h> 28#include <stdbool.h> 29#include <stdint.h> 30#include <string.h> 31#include <time.h> 32 33/* This structure represents one line in the UnicodeData.txt file. */ 34struct unicode_attribute 35{ 36 const char *name; /* Character name */ 37 const char *category; /* General category */ 38 const char *combining; /* Canonical combining classes */ 39 const char *bidi; /* Bidirectional category */ 40 const char *decomposition; /* Character decomposition mapping */ 41 const char *decdigit; /* Decimal digit value */ 42 const char *digit; /* Digit value */ 43 const char *numeric; /* Numeric value */ 44 int mirrored; /* mirrored */ 45 const char *oldname; /* Old Unicode 1.0 name */ 46 const char *comment; /* Comment */ 47 unsigned int upper; /* Uppercase mapping */ 48 unsigned int lower; /* Lowercase mapping */ 49 unsigned int title; /* Titlecase mapping */ 50}; 51 52/* Missing fields are represented with "" for strings, and NONE for 53 characters. */ 54#define NONE (~(unsigned int)0) 55 56/* The entire contents of the UnicodeData.txt file. */ 57struct unicode_attribute unicode_attributes [0x110000]; 58 59/* Stores in unicode_attributes[i] the values from the given fields. */ 60static void 61fill_attribute (unsigned int i, 62 const char *field1, const char *field2, 63 const char *field3, const char *field4, 64 const char *field5, const char *field6, 65 const char *field7, const char *field8, 66 const char *field9, const char *field10, 67 const char *field11, const char *field12, 68 const char *field13, const char *field14) 69{ 70 struct unicode_attribute * uni; 71 72 if (i >= 0x110000) 73 { 74 fprintf (stderr, "index too large\n"); 75 exit (1); 76 } 77 uni = &unicode_attributes[i]; 78 /* Copy the strings. */ 79 uni->name = strdup (field1); 80 uni->category = (field2[0] == '\0' ? "" : strdup (field2)); 81 uni->combining = (field3[0] == '\0' ? "" : strdup (field3)); 82 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4)); 83 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5)); 84 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6)); 85 uni->digit = (field7[0] == '\0' ? "" : strdup (field7)); 86 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8)); 87 uni->mirrored = (field9[0] == 'Y'); 88 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10)); 89 uni->comment = (field11[0] == '\0' ? "" : strdup (field11)); 90 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16)); 91 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16)); 92 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16)); 93} 94 95/* Maximum length of a field in the UnicodeData.txt file. */ 96#define FIELDLEN 120 97 98/* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN. 99 Reads up to (but excluding) DELIM. 100 Returns 1 when a field was successfully read, otherwise 0. */ 101static int 102getfield (FILE *stream, char *buffer, int delim) 103{ 104 int count = 0; 105 int c; 106 107 for (; (c = getc (stream)), (c != EOF && c != delim); ) 108 { 109 /* The original unicode.org UnicodeData.txt file happens to have 110 CR/LF line terminators. Silently convert to LF. */ 111 if (c == '\r') 112 continue; 113 114 /* Put c into the buffer. */ 115 if (++count >= FIELDLEN - 1) 116 { 117 fprintf (stderr, "field too long\n"); 118 exit (1); 119 } 120 *buffer++ = c; 121 } 122 123 if (c == EOF) 124 return 0; 125 126 *buffer = '\0'; 127 return 1; 128} 129 130/* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt 131 file. */ 132static void 133fill_attributes (const char *unicodedata_filename) 134{ 135 unsigned int i, j; 136 FILE *stream; 137 char field0[FIELDLEN]; 138 char field1[FIELDLEN]; 139 char field2[FIELDLEN]; 140 char field3[FIELDLEN]; 141 char field4[FIELDLEN]; 142 char field5[FIELDLEN]; 143 char field6[FIELDLEN]; 144 char field7[FIELDLEN]; 145 char field8[FIELDLEN]; 146 char field9[FIELDLEN]; 147 char field10[FIELDLEN]; 148 char field11[FIELDLEN]; 149 char field12[FIELDLEN]; 150 char field13[FIELDLEN]; 151 char field14[FIELDLEN]; 152 int lineno = 0; 153 154 for (i = 0; i < 0x110000; i++) 155 unicode_attributes[i].name = NULL; 156 157 stream = fopen (unicodedata_filename, "r"); 158 if (stream == NULL) 159 { 160 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename); 161 exit (1); 162 } 163 164 for (;;) 165 { 166 int n; 167 168 lineno++; 169 n = getfield (stream, field0, ';'); 170 n += getfield (stream, field1, ';'); 171 n += getfield (stream, field2, ';'); 172 n += getfield (stream, field3, ';'); 173 n += getfield (stream, field4, ';'); 174 n += getfield (stream, field5, ';'); 175 n += getfield (stream, field6, ';'); 176 n += getfield (stream, field7, ';'); 177 n += getfield (stream, field8, ';'); 178 n += getfield (stream, field9, ';'); 179 n += getfield (stream, field10, ';'); 180 n += getfield (stream, field11, ';'); 181 n += getfield (stream, field12, ';'); 182 n += getfield (stream, field13, ';'); 183 n += getfield (stream, field14, '\n'); 184 if (n == 0) 185 break; 186 if (n != 15) 187 { 188 fprintf (stderr, "short line in'%s':%d\n", 189 unicodedata_filename, lineno); 190 exit (1); 191 } 192 i = strtoul (field0, NULL, 16); 193 if (field1[0] == '<' 194 && strlen (field1) >= 9 195 && !strcmp (field1 + strlen(field1) - 8, ", First>")) 196 { 197 /* Deal with a range. */ 198 lineno++; 199 n = getfield (stream, field0, ';'); 200 n += getfield (stream, field1, ';'); 201 n += getfield (stream, field2, ';'); 202 n += getfield (stream, field3, ';'); 203 n += getfield (stream, field4, ';'); 204 n += getfield (stream, field5, ';'); 205 n += getfield (stream, field6, ';'); 206 n += getfield (stream, field7, ';'); 207 n += getfield (stream, field8, ';'); 208 n += getfield (stream, field9, ';'); 209 n += getfield (stream, field10, ';'); 210 n += getfield (stream, field11, ';'); 211 n += getfield (stream, field12, ';'); 212 n += getfield (stream, field13, ';'); 213 n += getfield (stream, field14, '\n'); 214 if (n != 15) 215 { 216 fprintf (stderr, "missing end range in '%s':%d\n", 217 unicodedata_filename, lineno); 218 exit (1); 219 } 220 if (!(field1[0] == '<' 221 && strlen (field1) >= 8 222 && !strcmp (field1 + strlen (field1) - 7, ", Last>"))) 223 { 224 fprintf (stderr, "missing end range in '%s':%d\n", 225 unicodedata_filename, lineno); 226 exit (1); 227 } 228 field1[strlen (field1) - 7] = '\0'; 229 j = strtoul (field0, NULL, 16); 230 for (; i <= j; i++) 231 fill_attribute (i, field1+1, field2, field3, field4, field5, 232 field6, field7, field8, field9, field10, 233 field11, field12, field13, field14); 234 } 235 else 236 { 237 /* Single character line */ 238 fill_attribute (i, field1, field2, field3, field4, field5, 239 field6, field7, field8, field9, field10, 240 field11, field12, field13, field14); 241 } 242 } 243 if (ferror (stream) || fclose (stream)) 244 { 245 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename); 246 exit (1); 247 } 248} 249 250/* The combining property from the PropList.txt file. */ 251char unicode_combining[0x110000]; 252 253/* Stores in unicode_combining[] the Combining property from the 254 Unicode 3.0 PropList.txt file. */ 255static void 256fill_combining (const char *proplist_filename) 257{ 258 unsigned int i; 259 FILE *stream; 260 char buf[100+1]; 261 262 for (i = 0; i < 0x110000; i++) 263 unicode_combining[i] = 0; 264 265 stream = fopen (proplist_filename, "r"); 266 if (stream == NULL) 267 { 268 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename); 269 exit (1); 270 } 271 272 /* Search for the "Property dump for: 0x20000004 (Combining)" line. */ 273 do 274 { 275 if (fscanf (stream, "%100[^\n]\n", buf) < 1) 276 { 277 fprintf (stderr, "no combining property found in '%s'\n", 278 proplist_filename); 279 exit (1); 280 } 281 } 282 while (strstr (buf, "(Combining)") == NULL); 283 284 for (;;) 285 { 286 unsigned int i1, i2; 287 288 if (fscanf (stream, "%100[^\n]\n", buf) < 1) 289 { 290 fprintf (stderr, "premature end of combining property in '%s'\n", 291 proplist_filename); 292 exit (1); 293 } 294 if (buf[0] == '*') 295 break; 296 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.') 297 { 298 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2) 299 { 300 fprintf (stderr, "parse error in combining property in '%s'\n", 301 proplist_filename); 302 exit (1); 303 } 304 } 305 else if (strlen (buf) >= 4) 306 { 307 if (sscanf (buf, "%4X", &i1) < 1) 308 { 309 fprintf (stderr, "parse error in combining property in '%s'\n", 310 proplist_filename); 311 exit (1); 312 } 313 i2 = i1; 314 } 315 else 316 { 317 fprintf (stderr, "parse error in combining property in '%s'\n", 318 proplist_filename); 319 exit (1); 320 } 321 for (i = i1; i <= i2; i++) 322 unicode_combining[i] = 1; 323 } 324 if (ferror (stream) || fclose (stream)) 325 { 326 fprintf (stderr, "error reading from '%s'\n", proplist_filename); 327 exit (1); 328 } 329} 330 331/* The width property from the EastAsianWidth.txt file. 332 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */ 333const char * unicode_width[0x110000]; 334 335/* Stores in unicode_width[] the width property from the EastAsianWidth.txt 336 file. */ 337static void 338fill_width (const char *width_filename) 339{ 340 unsigned int i, j; 341 FILE *stream; 342 char field0[FIELDLEN]; 343 char field1[FIELDLEN]; 344 char field2[FIELDLEN]; 345 int lineno = 0; 346 347 for (i = 0; i < 0x110000; i++) 348 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL); 349 350 stream = fopen (width_filename, "r"); 351 if (stream == NULL) 352 { 353 fprintf (stderr, "error during fopen of '%s'\n", width_filename); 354 exit (1); 355 } 356 357 for (;;) 358 { 359 int n; 360 int c; 361 362 lineno++; 363 c = getc (stream); 364 if (c == EOF) 365 break; 366 if (c == '#') 367 { 368 do c = getc (stream); while (c != EOF && c != '\n'); 369 continue; 370 } 371 ungetc (c, stream); 372 n = getfield (stream, field0, ';'); 373 n += getfield (stream, field1, ' '); 374 n += getfield (stream, field2, '\n'); 375 if (n == 0) 376 break; 377 if (n != 3) 378 { 379 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno); 380 exit (1); 381 } 382 i = strtoul (field0, NULL, 16); 383 if (strstr (field0, "..") != NULL) 384 { 385 /* Deal with a range. */ 386 j = strtoul (strstr (field0, "..") + 2, NULL, 16); 387 for (; i <= j; i++) 388 unicode_width[i] = strdup (field1); 389 } 390 else 391 { 392 /* Single character line. */ 393 unicode_width[i] = strdup (field1); 394 } 395 } 396 if (ferror (stream) || fclose (stream)) 397 { 398 fprintf (stderr, "error reading from '%s'\n", width_filename); 399 exit (1); 400 } 401} 402 403/* Line breaking classification. */ 404 405enum 406{ 407 /* Values >= 20 are resolved at run time. */ 408 LBP_BK = 0, /* mandatory break */ 409/*LBP_CR, carriage return - not used here because it's a DOSism */ 410/*LBP_LF, line feed - not used here because it's a DOSism */ 411 LBP_CM = 20, /* attached characters and combining marks */ 412/*LBP_SG, surrogates - not used here because they are not characters */ 413 LBP_ZW = 1, /* zero width space */ 414 LBP_IN = 2, /* inseparable */ 415 LBP_GL = 3, /* non-breaking (glue) */ 416 LBP_CB = 22, /* contingent break opportunity */ 417 LBP_SP = 21, /* space */ 418 LBP_BA = 4, /* break opportunity after */ 419 LBP_BB = 5, /* break opportunity before */ 420 LBP_B2 = 6, /* break opportunity before and after */ 421 LBP_HY = 7, /* hyphen */ 422 LBP_NS = 8, /* non starter */ 423 LBP_OP = 9, /* opening punctuation */ 424 LBP_CL = 10, /* closing punctuation */ 425 LBP_QU = 11, /* ambiguous quotation */ 426 LBP_EX = 12, /* exclamation/interrogation */ 427 LBP_ID = 13, /* ideographic */ 428 LBP_NU = 14, /* numeric */ 429 LBP_IS = 15, /* infix separator (numeric) */ 430 LBP_SY = 16, /* symbols allowing breaks */ 431 LBP_AL = 17, /* ordinary alphabetic and symbol characters */ 432 LBP_PR = 18, /* prefix (numeric) */ 433 LBP_PO = 19, /* postfix (numeric) */ 434 LBP_SA = 23, /* complex context (South East Asian) */ 435 LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */ 436 LBP_XX = 25 /* unknown */ 437}; 438 439/* Returns the line breaking classification for ch, as a bit mask. */ 440static int 441get_lbp (unsigned int ch) 442{ 443 int attr = 0; 444 445 if (unicode_attributes[ch].name != NULL) 446 { 447 /* mandatory break */ 448 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */ 449 || ch == 0x000C /* form feed */ 450 || ch == 0x2028 /* LINE SEPARATOR */ 451 || ch == 0x2029 /* PARAGRAPH SEPARATOR */) 452 attr |= 1 << LBP_BK; 453 454 /* zero width space */ 455 if (ch == 0x200B /* ZERO WIDTH SPACE */) 456 attr |= 1 << LBP_ZW; 457 458 /* inseparable */ 459 if (ch == 0x2024 /* ONE DOT LEADER */ 460 || ch == 0x2025 /* TWO DOT LEADER */ 461 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */) 462 attr |= 1 << LBP_IN; 463 464 /* non-breaking (glue) */ 465 if (ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */ 466 || ch == 0x00A0 /* NO-BREAK SPACE */ 467 || ch == 0x202F /* NARROW NO-BREAK SPACE */ 468 || ch == 0x2007 /* FIGURE SPACE */ 469 || ch == 0x2011 /* NON-BREAKING HYPHEN */ 470 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */) 471 attr |= 1 << LBP_GL; 472 473 /* contingent break opportunity */ 474 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */) 475 attr |= 1 << LBP_CB; 476 477 /* space */ 478 if (ch == 0x0020 /* SPACE */) 479 attr |= 1 << LBP_SP; 480 481 /* break opportunity after */ 482 if (ch == 0x2000 /* EN QUAD */ 483 || ch == 0x2001 /* EM QUAD */ 484 || ch == 0x2002 /* EN SPACE */ 485 || ch == 0x2003 /* EM SPACE */ 486 || ch == 0x2004 /* THREE-PER-EM SPACE */ 487 || ch == 0x2005 /* FOUR-PER-EM SPACE */ 488 || ch == 0x2006 /* SIX-PER-EM SPACE */ 489 || ch == 0x2008 /* PUNCTUATION SPACE */ 490 || ch == 0x2009 /* THIN SPACE */ 491 || ch == 0x200A /* HAIR SPACE */ 492 || ch == 0x0009 /* tab */ 493 || ch == 0x058A /* ARMENIAN HYPHEN */ 494 || ch == 0x2010 /* HYPHEN */ 495 || ch == 0x2012 /* FIGURE DASH */ 496 || ch == 0x2013 /* EN DASH */ 497 || ch == 0x00AD /* SOFT HYPHEN */ 498 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */ 499 || ch == 0x1361 /* ETHIOPIC WORDSPACE */ 500 || ch == 0x1680 /* OGHAM SPACE MARK */ 501 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */ 502 || ch == 0x2027 /* HYPHENATION POINT */ 503 || ch == 0x007C /* VERTICAL LINE */) 504 attr |= 1 << LBP_BA; 505 506 /* break opportunity before */ 507 if (ch == 0x00B4 /* ACUTE ACCENT */ 508 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */ 509 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */ 510 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */) 511 attr |= 1 << LBP_BB; 512 513 /* break opportunity before and after */ 514 if (ch == 0x2014 /* EM DASH */) 515 attr |= 1 << LBP_B2; 516 517 /* hyphen */ 518 if (ch == 0x002D /* HYPHEN-MINUS */) 519 attr |= 1 << LBP_HY; 520 521 /* exclamation/interrogation */ 522 if (ch == 0x0021 /* EXCLAMATION MARK */ 523 || ch == 0x003F /* QUESTION MARK */ 524 || ch == 0xFE56 /* SMALL QUESTION MARK */ 525 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */ 526 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */ 527 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */) 528 attr |= 1 << LBP_EX; 529 530 /* opening punctuation */ 531 if (unicode_attributes[ch].category[0] == 'P' 532 && unicode_attributes[ch].category[1] == 's') 533 attr |= 1 << LBP_OP; 534 535 /* closing punctuation */ 536 if (ch == 0x3001 /* IDEOGRAPHIC COMMA */ 537 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */ 538 || ch == 0xFE50 /* SMALL COMMA */ 539 || ch == 0xFE52 /* SMALL FULL STOP */ 540 || ch == 0xFF0C /* FULLWIDTH COMMA */ 541 || ch == 0xFF0E /* FULLWIDTH FULL STOP */ 542 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */ 543 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */ 544 || (unicode_attributes[ch].category[0] == 'P' 545 && unicode_attributes[ch].category[1] == 'e')) 546 attr |= 1 << LBP_CL; 547 548 /* ambiguous quotation */ 549 if (ch == 0x0022 /* QUOTATION MARK */ 550 || ch == 0x0027 /* APOSTROPHE */ 551 || (unicode_attributes[ch].category[0] == 'P' 552 && (unicode_attributes[ch].category[1] == 'f' 553 || unicode_attributes[ch].category[1] == 'i'))) 554 attr |= 1 << LBP_QU; 555 556 /* attached characters and combining marks */ 557 if ((unicode_attributes[ch].category[0] == 'M' 558 && (unicode_attributes[ch].category[1] == 'n' 559 || unicode_attributes[ch].category[1] == 'c' 560 || unicode_attributes[ch].category[1] == 'e')) 561 || (ch >= 0x1160 && ch <= 0x11F9) 562 || (unicode_attributes[ch].category[0] == 'C' 563 && (unicode_attributes[ch].category[1] == 'c' 564 || unicode_attributes[ch].category[1] == 'f'))) 565 if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL)))) 566 attr |= 1 << LBP_CM; 567 568 /* non starter */ 569 if (ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */ 570 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */ 571 || ch == 0x17D4 /* KHMER SIGN KHAN */ 572 || ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */ 573 || ch == 0x17D7 /* KHMER SIGN LEK TOO */ 574 || ch == 0x17D8 /* KHMER SIGN BEYYAL */ 575 || ch == 0x17D9 /* KHMER SIGN PHNAEK MUAN */ 576 || ch == 0x17DA /* KHMER SIGN KOOMUUT */ 577 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */ 578 || ch == 0x2044 /* FRACTION SLASH */ 579 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */ 580 || ch == 0x301C /* WAVE DASH */ 581 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */ 582 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */ 583 || ch == 0x309D /* HIRAGANA ITERATION MARK */ 584 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */ 585 || ch == 0x30FB /* KATAKANA MIDDLE DOT */ 586 || ch == 0x30FD /* KATAKANA ITERATION MARK */ 587 || ch == 0xFE54 /* SMALL SEMICOLON */ 588 || ch == 0xFE55 /* SMALL COLON */ 589 || ch == 0xFF1A /* FULLWIDTH COLON */ 590 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */ 591 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */ 592 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */ 593 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */ 594 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */ 595 || (unicode_attributes[ch].category[0] == 'L' 596 && unicode_attributes[ch].category[1] == 'm' 597 && (unicode_width[ch][0] == 'W' 598 || unicode_width[ch][0] == 'H')) 599 || (unicode_attributes[ch].category[0] == 'S' 600 && unicode_attributes[ch].category[1] == 'k' 601 && unicode_width[ch][0] == 'W') 602 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL 603 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL) 604 attr |= 1 << LBP_NS; 605 606 /* numeric */ 607 if (unicode_attributes[ch].category[0] == 'N' 608 && unicode_attributes[ch].category[1] == 'd' 609 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL) 610 attr |= 1 << LBP_NU; 611 612 /* infix separator (numeric) */ 613 if (ch == 0x002C /* COMMA */ 614 || ch == 0x002E /* FULL STOP */ 615 || ch == 0x003A /* COLON */ 616 || ch == 0x003B /* SEMICOLON */ 617 || ch == 0x0589 /* ARMENIAN FULL STOP */) 618 attr |= 1 << LBP_IS; 619 620 /* symbols allowing breaks */ 621 if (ch == 0x002F /* SOLIDUS */) 622 attr |= 1 << LBP_SY; 623 624 /* postfix (numeric) */ 625 if (ch == 0x0025 /* PERCENT SIGN */ 626 || ch == 0x00A2 /* CENT SIGN */ 627 || ch == 0x00B0 /* DEGREE SIGN */ 628 || ch == 0x2030 /* PER MILLE SIGN */ 629 || ch == 0x2031 /* PER TEN THOUSAND SIGN */ 630 || ch == 0x2032 /* PRIME */ 631 || ch == 0x2033 /* DOUBLE PRIME */ 632 || ch == 0x2034 /* TRIPLE PRIME */ 633 || ch == 0x2035 /* REVERSED PRIME */ 634 || ch == 0x2036 /* REVERSED DOUBLE PRIME */ 635 || ch == 0x2037 /* REVERSED TRIPLE PRIME */ 636 || ch == 0x20A7 /* PESETA SIGN */ 637 || ch == 0x2103 /* DEGREE CELSIUS */ 638 || ch == 0x2109 /* DEGREE FAHRENHEIT */ 639 || ch == 0x2126 /* OHM SIGN */ 640 || ch == 0xFE6A /* SMALL PERCENT SIGN */ 641 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */ 642 || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */) 643 attr |= 1 << LBP_PO; 644 645 /* prefix (numeric) */ 646 if (ch == 0x002B /* PLUS SIGN */ 647 || ch == 0x005C /* REVERSE SOLIDUS */ 648 || ch == 0x00B1 /* PLUS-MINUS SIGN */ 649 || ch == 0x2116 /* NUMERO SIGN */ 650 || ch == 0x2212 /* MINUS SIGN */ 651 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */ 652 || (unicode_attributes[ch].category[0] == 'S' 653 && unicode_attributes[ch].category[1] == 'c')) 654 if (!(attr & (1 << LBP_PO))) 655 attr |= 1 << LBP_PR; 656 657 /* complex context (South East Asian) */ 658 if (((ch >= 0x0E00 && ch <= 0x0EFF) 659 || (ch >= 0x1000 && ch <= 0x109F) 660 || (ch >= 0x1780 && ch <= 0x17FF)) 661 && unicode_attributes[ch].category[0] == 'L' 662 && (unicode_attributes[ch].category[1] == 'm' 663 || unicode_attributes[ch].category[1] == 'o')) 664 if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_NU) | (1 << LBP_BA) | (1 << LBP_PR)))) 665 attr |= 1 << LBP_SA; 666 667 /* ideographic */ 668 if ((ch >= 0x1100 && ch <= 0x115F) /* HANGUL CHOSEONG */ 669 || (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */ 670 || ch == 0x3000 /* IDEOGRAPHIC SPACE */ 671 || (ch >= 0x3130 && ch <= 0x318F) /* HANGUL LETTER */ 672 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */ 673 || (ch >= 0x4E00 && ch <= 0x9FAF) /* CJK Ideograph */ 674 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK COMPATIBILITY IDEOGRAPH */ 675 || (ch >= 0xAC00 && ch <= 0xD7AF) /* HANGUL SYLLABLE */ 676 || (ch >= 0xA000 && ch <= 0xA48C) /* YI SYLLABLE */ 677 || (ch >= 0xA490 && ch <= 0xA4C6) /* YI RADICAL */ 678 || ch == 0xFE62 /* SMALL PLUS SIGN */ 679 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */ 680 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */ 681 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */ 682 || ch == 0xFE66 /* SMALL EQUALS SIGN */ 683 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */ 684 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */ 685 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */ 686 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL 687 || (ch >= 0x3000 && ch <= 0x33FF 688 && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL)))) 689 /* Extra characters for compatibility with Unicode LineBreak.txt. */ 690 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */ 691 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */ 692 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */ 693 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */ 694 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */ 695 || ch == 0xFE49 /* DASHED OVERLINE */ 696 || ch == 0xFE4A /* CENTRELINE OVERLINE */ 697 || ch == 0xFE4B /* WAVY OVERLINE */ 698 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */ 699 || ch == 0xFE4D /* DASHED LOW LINE */ 700 || ch == 0xFE4E /* CENTRELINE LOW LINE */ 701 || ch == 0xFE4F /* WAVY LOW LINE */ 702 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */ 703 || ch == 0xFE58 /* SMALL EM DASH */ 704 || ch == 0xFE5F /* SMALL NUMBER SIGN */ 705 || ch == 0xFE60 /* SMALL AMPERSAND */ 706 || ch == 0xFE61 /* SMALL ASTERISK */ 707 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */ 708 || ch == 0xFE6B /* SMALL COMMERCIAL AT */ 709 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */ 710 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */ 711 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */ 712 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */ 713 || ch == 0xFF0A /* FULLWIDTH ASTERISK */ 714 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */ 715 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */ 716 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */ 717 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */ 718 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */ 719 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */ 720 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */ 721 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */ 722 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */ 723 || ch == 0xFF3F /* FULLWIDTH LOW LINE */ 724 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */ 725 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */ 726 || ch == 0xFF5E /* FULLWIDTH TILDE */ 727 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */ 728 || ch == 0xFFE3 /* FULLWIDTH MACRON */ 729 || ch == 0xFFE4) /* FULLWIDTH BROKEN BAR */ 730 { 731 /* ambiguous (ideograph) ? */ 732 if (unicode_width[ch] != NULL 733 && unicode_width[ch][0] == 'A') 734 attr |= 1 << LBP_AI; 735 else 736 attr |= 1 << LBP_ID; 737 } 738 739 /* ordinary alphabetic and symbol characters */ 740 if ((unicode_attributes[ch].category[0] == 'L' 741 && (unicode_attributes[ch].category[1] == 'u' 742 || unicode_attributes[ch].category[1] == 'l' 743 || unicode_attributes[ch].category[1] == 't' 744 || unicode_attributes[ch].category[1] == 'm' 745 || unicode_attributes[ch].category[1] == 'o')) 746 || (unicode_attributes[ch].category[0] == 'S' 747 && (unicode_attributes[ch].category[1] == 'm' 748 || unicode_attributes[ch].category[1] == 'c' 749 || unicode_attributes[ch].category[1] == 'k' 750 || unicode_attributes[ch].category[1] == 'o')) 751 /* Extra characters for compatibility with Unicode LineBreak.txt. */ 752 || ch == 0x0023 /* NUMBER SIGN */ 753 || ch == 0x0026 /* AMPERSAND */ 754 || ch == 0x002A /* ASTERISK */ 755 || ch == 0x0040 /* COMMERCIAL AT */ 756 || ch == 0x005F /* LOW LINE */ 757 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ 758 || ch == 0x00B2 /* SUPERSCRIPT TWO */ 759 || ch == 0x00B3 /* SUPERSCRIPT THREE */ 760 || ch == 0x00B7 /* MIDDLE DOT */ 761 || ch == 0x00B9 /* SUPERSCRIPT ONE */ 762 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */ 763 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */ 764 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */ 765 || ch == 0x00BF /* INVERTED QUESTION MARK */ 766 || ch == 0x037E /* GREEK QUESTION MARK */ 767 || ch == 0x0387 /* GREEK ANO TELEIA */ 768 || ch == 0x055A /* ARMENIAN APOSTROPHE */ 769 || ch == 0x055B /* ARMENIAN EMPHASIS MARK */ 770 || ch == 0x055C /* ARMENIAN EXCLAMATION MARK */ 771 || ch == 0x055D /* ARMENIAN COMMA */ 772 || ch == 0x055E /* ARMENIAN QUESTION MARK */ 773 || ch == 0x055F /* ARMENIAN ABBREVIATION MARK */ 774 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */ 775 || ch == 0x05C0 /* HEBREW PUNCTUATION PASEQ */ 776 || ch == 0x05C3 /* HEBREW PUNCTUATION SOF PASUQ */ 777 || ch == 0x05F3 /* HEBREW PUNCTUATION GERESH */ 778 || ch == 0x05F4 /* HEBREW PUNCTUATION GERSHAYIM */ 779 || ch == 0x060C /* ARABIC COMMA */ 780 || ch == 0x061B /* ARABIC SEMICOLON */ 781 || ch == 0x061F /* ARABIC QUESTION MARK */ 782 || ch == 0x066A /* ARABIC PERCENT SIGN */ 783 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */ 784 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */ 785 || ch == 0x066D /* ARABIC FIVE POINTED STAR */ 786 || ch == 0x06D4 /* ARABIC FULL STOP */ 787 || ch == 0x0700 /* SYRIAC END OF PARAGRAPH */ 788 || ch == 0x0701 /* SYRIAC SUPRALINEAR FULL STOP */ 789 || ch == 0x0702 /* SYRIAC SUBLINEAR FULL STOP */ 790 || ch == 0x0703 /* SYRIAC SUPRALINEAR COLON */ 791 || ch == 0x0704 /* SYRIAC SUBLINEAR COLON */ 792 || ch == 0x0705 /* SYRIAC HORIZONTAL COLON */ 793 || ch == 0x0706 /* SYRIAC COLON SKEWED LEFT */ 794 || ch == 0x0707 /* SYRIAC COLON SKEWED RIGHT */ 795 || ch == 0x0708 /* SYRIAC SUPRALINEAR COLON SKEWED LEFT */ 796 || ch == 0x0709 /* SYRIAC SUBLINEAR COLON SKEWED RIGHT */ 797 || ch == 0x070A /* SYRIAC CONTRACTION */ 798 || ch == 0x070B /* SYRIAC HARKLEAN OBELUS */ 799 || ch == 0x070C /* SYRIAC HARKLEAN METOBELUS */ 800 || ch == 0x070D /* SYRIAC HARKLEAN ASTERISCUS */ 801 || ch == 0x0964 /* DEVANAGARI DANDA */ 802 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */ 803 || ch == 0x0970 /* DEVANAGARI ABBREVIATION SIGN */ 804 || ch == 0x09F4 /* BENGALI CURRENCY NUMERATOR ONE */ 805 || ch == 0x09F5 /* BENGALI CURRENCY NUMERATOR TWO */ 806 || ch == 0x09F6 /* BENGALI CURRENCY NUMERATOR THREE */ 807 || ch == 0x09F7 /* BENGALI CURRENCY NUMERATOR FOUR */ 808 || ch == 0x09F8 /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */ 809 || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */ 810 || ch == 0x0BF0 /* TAMIL NUMBER TEN */ 811 || ch == 0x0BF1 /* TAMIL NUMBER ONE HUNDRED */ 812 || ch == 0x0BF2 /* TAMIL NUMBER ONE THOUSAND */ 813 || ch == 0x0DF4 /* SINHALA PUNCTUATION KUNDDALIYA */ 814 || ch == 0x0E4F /* THAI CHARACTER FONGMAN */ 815 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */ 816 || ch == 0x0F05 /* TIBETAN MARK CLOSING YIG MGO SGAB MA */ 817 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */ 818 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */ 819 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */ 820 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */ 821 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */ 822 || ch == 0x0F0D /* TIBETAN MARK SHAD */ 823 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */ 824 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */ 825 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */ 826 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */ 827 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */ 828 || ch == 0x0F2A /* TIBETAN DIGIT HALF ONE */ 829 || ch == 0x0F2B /* TIBETAN DIGIT HALF TWO */ 830 || ch == 0x0F2C /* TIBETAN DIGIT HALF THREE */ 831 || ch == 0x0F2D /* TIBETAN DIGIT HALF FOUR */ 832 || ch == 0x0F2E /* TIBETAN DIGIT HALF FIVE */ 833 || ch == 0x0F2F /* TIBETAN DIGIT HALF SIX */ 834 || ch == 0x0F30 /* TIBETAN DIGIT HALF SEVEN */ 835 || ch == 0x0F31 /* TIBETAN DIGIT HALF EIGHT */ 836 || ch == 0x0F32 /* TIBETAN DIGIT HALF NINE */ 837 || ch == 0x0F33 /* TIBETAN DIGIT HALF ZERO */ 838 || ch == 0x0F85 /* TIBETAN MARK PALUTA */ 839 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */ 840 || ch == 0x104B /* MYANMAR SIGN SECTION */ 841 || ch == 0x104C /* MYANMAR SYMBOL LOCATIVE */ 842 || ch == 0x104D /* MYANMAR SYMBOL COMPLETED */ 843 || ch == 0x104E /* MYANMAR SYMBOL AFOREMENTIONED */ 844 || ch == 0x104F /* MYANMAR SYMBOL GENITIVE */ 845 || ch == 0x10FB /* GEORGIAN PARAGRAPH SEPARATOR */ 846 || ch == 0x1362 /* ETHIOPIC FULL STOP */ 847 || ch == 0x1363 /* ETHIOPIC COMMA */ 848 || ch == 0x1364 /* ETHIOPIC SEMICOLON */ 849 || ch == 0x1365 /* ETHIOPIC COLON */ 850 || ch == 0x1366 /* ETHIOPIC PREFACE COLON */ 851 || ch == 0x1367 /* ETHIOPIC QUESTION MARK */ 852 || ch == 0x1368 /* ETHIOPIC PARAGRAPH SEPARATOR */ 853 || ch == 0x1372 /* ETHIOPIC NUMBER TEN */ 854 || ch == 0x1373 /* ETHIOPIC NUMBER TWENTY */ 855 || ch == 0x1374 /* ETHIOPIC NUMBER THIRTY */ 856 || ch == 0x1375 /* ETHIOPIC NUMBER FORTY */ 857 || ch == 0x1376 /* ETHIOPIC NUMBER FIFTY */ 858 || ch == 0x1377 /* ETHIOPIC NUMBER SIXTY */ 859 || ch == 0x1378 /* ETHIOPIC NUMBER SEVENTY */ 860 || ch == 0x1379 /* ETHIOPIC NUMBER EIGHTY */ 861 || ch == 0x137A /* ETHIOPIC NUMBER NINETY */ 862 || ch == 0x137B /* ETHIOPIC NUMBER HUNDRED */ 863 || ch == 0x137C /* ETHIOPIC NUMBER TEN THOUSAND */ 864 || ch == 0x166D /* CANADIAN SYLLABICS CHI SIGN */ 865 || ch == 0x166E /* CANADIAN SYLLABICS FULL STOP */ 866 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */ 867 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */ 868 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */ 869 || ch == 0x16EE /* RUNIC ARLAUG SYMBOL */ 870 || ch == 0x16EF /* RUNIC TVIMADUR SYMBOL */ 871 || ch == 0x16F0 /* RUNIC BELGTHOR SYMBOL */ 872 || ch == 0x17DC /* KHMER SIGN AVAKRAHASANYA */ 873 || ch == 0x1800 /* MONGOLIAN BIRGA */ 874 || ch == 0x1801 /* MONGOLIAN ELLIPSIS */ 875 || ch == 0x1802 /* MONGOLIAN COMMA */ 876 || ch == 0x1803 /* MONGOLIAN FULL STOP */ 877 || ch == 0x1804 /* MONGOLIAN COLON */ 878 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */ 879 || ch == 0x1807 /* MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER */ 880 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */ 881 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */ 882 || ch == 0x180A /* MONGOLIAN NIRUGU */ 883 || ch == 0x2015 /* HORIZONTAL BAR */ 884 || ch == 0x2016 /* DOUBLE VERTICAL LINE */ 885 || ch == 0x2017 /* DOUBLE LOW LINE */ 886 || ch == 0x2020 /* DAGGER */ 887 || ch == 0x2021 /* DOUBLE DAGGER */ 888 || ch == 0x2022 /* BULLET */ 889 || ch == 0x2023 /* TRIANGULAR BULLET */ 890 || ch == 0x2038 /* CARET */ 891 || ch == 0x203B /* REFERENCE MARK */ 892 || ch == 0x203D /* INTERROBANG */ 893 || ch == 0x203E /* OVERLINE */ 894 || ch == 0x203F /* UNDERTIE */ 895 || ch == 0x2040 /* CHARACTER TIE */ 896 || ch == 0x2041 /* CARET INSERTION POINT */ 897 || ch == 0x2042 /* ASTERISM */ 898 || ch == 0x2043 /* HYPHEN BULLET */ 899 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */ 900 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */ 901 || ch == 0x204A /* TIRONIAN SIGN ET */ 902 || ch == 0x204B /* REVERSED PILCROW SIGN */ 903 || ch == 0x204C /* BLACK LEFTWARDS BULLET */ 904 || ch == 0x204D /* BLACK RIGHTWARDS BULLET */ 905 || ch == 0x2070 /* SUPERSCRIPT ZERO */ 906 || ch == 0x2074 /* SUPERSCRIPT FOUR */ 907 || ch == 0x2075 /* SUPERSCRIPT FIVE */ 908 || ch == 0x2076 /* SUPERSCRIPT SIX */ 909 || ch == 0x2077 /* SUPERSCRIPT SEVEN */ 910 || ch == 0x2078 /* SUPERSCRIPT EIGHT */ 911 || ch == 0x2079 /* SUPERSCRIPT NINE */ 912 || ch == 0x2080 /* SUBSCRIPT ZERO */ 913 || ch == 0x2081 /* SUBSCRIPT ONE */ 914 || ch == 0x2082 /* SUBSCRIPT TWO */ 915 || ch == 0x2083 /* SUBSCRIPT THREE */ 916 || ch == 0x2084 /* SUBSCRIPT FOUR */ 917 || ch == 0x2085 /* SUBSCRIPT FIVE */ 918 || ch == 0x2086 /* SUBSCRIPT SIX */ 919 || ch == 0x2087 /* SUBSCRIPT SEVEN */ 920 || ch == 0x2088 /* SUBSCRIPT EIGHT */ 921 || ch == 0x2089 /* SUBSCRIPT NINE */ 922 || (ch >= 0x2153 && ch <= 0x215E) /* VULGAR FRACTION */ 923 || ch == 0x215F /* FRACTION NUMERATOR ONE */ 924 || (ch >= 0x2160 && ch <= 0x2183) /* ROMAN NUMERAL */ 925 || (ch >= 0x2460 && ch <= 0x2473) /* CIRCLED NUMBER */ 926 || (ch >= 0x2474 && ch <= 0x2487) /* PARENTHESIZED NUMBER */ 927 || (ch >= 0x2488 && ch <= 0x249B) /* NUMBER FULL STOP */ 928 || ch == 0x24EA /* CIRCLED DIGIT ZERO */ 929 || (ch >= 0x2776 && ch <= 0x2793) /* DINGBAT CIRCLED DIGIT */ 930 || ch == 0x10320 /* OLD ITALIC NUMERAL ONE */ 931 || ch == 0x10321 /* OLD ITALIC NUMERAL FIVE */ 932 || ch == 0x10322 /* OLD ITALIC NUMERAL TEN */ 933 || ch == 0x10323 /* OLD ITALIC NUMERAL FIFTY */ 934 || ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */ 935 if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_ID) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SA) | (1 << LBP_CB)))) 936 { 937 /* ambiguous (alphabetic) ? */ 938 if (unicode_width[ch] != NULL 939 && unicode_width[ch][0] == 'A') 940 attr |= 1 << LBP_AI; 941 else 942 attr |= 1 << LBP_AL; 943 } 944 } 945 946 if (attr == 0) 947 /* unknown */ 948 attr |= 1 << LBP_XX; 949 950 return attr; 951} 952 953/* Output the line breaking properties in a human readable format. */ 954static void 955debug_output_lbp (FILE *stream) 956{ 957 unsigned int i; 958 959 for (i = 0; i < 0x110000; i++) 960 { 961 int attr = get_lbp (i); 962 if (attr != 1 << LBP_XX) 963 { 964 fprintf (stream, "0x%04X", i); 965#define PRINT_BIT(attr,bit) \ 966 if (attr & (1 << bit)) fprintf (stream, " " #bit); 967 PRINT_BIT(attr,LBP_BK); 968 PRINT_BIT(attr,LBP_CM); 969 PRINT_BIT(attr,LBP_ZW); 970 PRINT_BIT(attr,LBP_IN); 971 PRINT_BIT(attr,LBP_GL); 972 PRINT_BIT(attr,LBP_CB); 973 PRINT_BIT(attr,LBP_SP); 974 PRINT_BIT(attr,LBP_BA); 975 PRINT_BIT(attr,LBP_BB); 976 PRINT_BIT(attr,LBP_B2); 977 PRINT_BIT(attr,LBP_HY); 978 PRINT_BIT(attr,LBP_NS); 979 PRINT_BIT(attr,LBP_OP); 980 PRINT_BIT(attr,LBP_CL); 981 PRINT_BIT(attr,LBP_QU); 982 PRINT_BIT(attr,LBP_EX); 983 PRINT_BIT(attr,LBP_ID); 984 PRINT_BIT(attr,LBP_NU); 985 PRINT_BIT(attr,LBP_IS); 986 PRINT_BIT(attr,LBP_SY); 987 PRINT_BIT(attr,LBP_AL); 988 PRINT_BIT(attr,LBP_PR); 989 PRINT_BIT(attr,LBP_PO); 990 PRINT_BIT(attr,LBP_SA); 991 PRINT_BIT(attr,LBP_XX); 992 PRINT_BIT(attr,LBP_AI); 993#undef PRINT_BIT 994 fprintf (stream, "\n"); 995 } 996 } 997} 998 999static void 1000debug_output_tables (const char *filename) 1001{ 1002 FILE *stream; 1003 1004 stream = fopen (filename, "w"); 1005 if (stream == NULL) 1006 { 1007 fprintf (stderr, "cannot open '%s' for writing\n", filename); 1008 exit (1); 1009 } 1010 1011 debug_output_lbp (stream); 1012 1013 if (ferror (stream) || fclose (stream)) 1014 { 1015 fprintf (stderr, "error writing to '%s'\n", filename); 1016 exit (1); 1017 } 1018} 1019 1020/* The line breaking property from the LineBreak.txt file. */ 1021int unicode_org_lbp[0x110000]; 1022 1023/* Stores in unicode_org_lbp[] the line breaking property from the 1024 LineBreak.txt file. */ 1025static void 1026fill_org_lbp (const char *linebreak_filename) 1027{ 1028 unsigned int i, j; 1029 FILE *stream; 1030 char field0[FIELDLEN]; 1031 char field1[FIELDLEN]; 1032 char field2[FIELDLEN]; 1033 int lineno = 0; 1034 1035 for (i = 0; i < 0x110000; i++) 1036 unicode_org_lbp[i] = LBP_XX; 1037 1038 stream = fopen (linebreak_filename, "r"); 1039 if (stream == NULL) 1040 { 1041 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename); 1042 exit (1); 1043 } 1044 1045 for (;;) 1046 { 1047 int n; 1048 int c; 1049 int value; 1050 1051 lineno++; 1052 c = getc (stream); 1053 if (c == EOF) 1054 break; 1055 if (c == '#') 1056 { 1057 do c = getc (stream); while (c != EOF && c != '\n'); 1058 continue; 1059 } 1060 ungetc (c, stream); 1061 n = getfield (stream, field0, ';'); 1062 n += getfield (stream, field1, ' '); 1063 n += getfield (stream, field2, '\n'); 1064 if (n == 0) 1065 break; 1066 if (n != 3) 1067 { 1068 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename, 1069 lineno); 1070 exit (1); 1071 } 1072#define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit; 1073 if (false) {} 1074 TRY(LBP_BK) 1075 TRY(LBP_CM) 1076 TRY(LBP_ZW) 1077 TRY(LBP_IN) 1078 TRY(LBP_GL) 1079 TRY(LBP_CB) 1080 TRY(LBP_SP) 1081 TRY(LBP_BA) 1082 TRY(LBP_BB) 1083 TRY(LBP_B2) 1084 TRY(LBP_HY) 1085 TRY(LBP_NS) 1086 TRY(LBP_OP) 1087 TRY(LBP_CL) 1088 TRY(LBP_QU) 1089 TRY(LBP_EX) 1090 TRY(LBP_ID) 1091 TRY(LBP_NU) 1092 TRY(LBP_IS) 1093 TRY(LBP_SY) 1094 TRY(LBP_AL) 1095 TRY(LBP_PR) 1096 TRY(LBP_PO) 1097 TRY(LBP_SA) 1098 TRY(LBP_XX) 1099 TRY(LBP_AI) 1100#undef TRY 1101 else if (strcmp (field1, "LF") == 0) value = LBP_BK; 1102 else if (strcmp (field1, "CR") == 0) value = LBP_BK; 1103 else if (strcmp (field1, "SG") == 0) value = LBP_XX; 1104 else 1105 { 1106 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n", 1107 field1, linebreak_filename, lineno); 1108 exit (1); 1109 } 1110 i = strtoul (field0, NULL, 16); 1111 if (strstr (field0, "..") != NULL) 1112 { 1113 /* Deal with a range. */ 1114 j = strtoul (strstr (field0, "..") + 2, NULL, 16); 1115 for (; i <= j; i++) 1116 unicode_org_lbp[i] = value; 1117 } 1118 else 1119 { 1120 /* Single character line. */ 1121 unicode_org_lbp[i] = value; 1122 } 1123 } 1124 if (ferror (stream) || fclose (stream)) 1125 { 1126 fprintf (stderr, "error reading from '%s'\n", linebreak_filename); 1127 exit (1); 1128 } 1129} 1130 1131/* Output the line breaking properties in a human readable format. */ 1132static void 1133debug_output_org_lbp (FILE *stream) 1134{ 1135 unsigned int i; 1136 1137 for (i = 0; i < 0x110000; i++) 1138 { 1139 int attr = unicode_org_lbp[i]; 1140 if (attr != LBP_XX) 1141 { 1142 fprintf (stream, "0x%04X", i); 1143#define PRINT_BIT(attr,bit) \ 1144 if (attr == bit) fprintf (stream, " " #bit); 1145 PRINT_BIT(attr,LBP_BK); 1146 PRINT_BIT(attr,LBP_CM); 1147 PRINT_BIT(attr,LBP_ZW); 1148 PRINT_BIT(attr,LBP_IN); 1149 PRINT_BIT(attr,LBP_GL); 1150 PRINT_BIT(attr,LBP_CB); 1151 PRINT_BIT(attr,LBP_SP); 1152 PRINT_BIT(attr,LBP_BA); 1153 PRINT_BIT(attr,LBP_BB); 1154 PRINT_BIT(attr,LBP_B2); 1155 PRINT_BIT(attr,LBP_HY); 1156 PRINT_BIT(attr,LBP_NS); 1157 PRINT_BIT(attr,LBP_OP); 1158 PRINT_BIT(attr,LBP_CL); 1159 PRINT_BIT(attr,LBP_QU); 1160 PRINT_BIT(attr,LBP_EX); 1161 PRINT_BIT(attr,LBP_ID); 1162 PRINT_BIT(attr,LBP_NU); 1163 PRINT_BIT(attr,LBP_IS); 1164 PRINT_BIT(attr,LBP_SY); 1165 PRINT_BIT(attr,LBP_AL); 1166 PRINT_BIT(attr,LBP_PR); 1167 PRINT_BIT(attr,LBP_PO); 1168 PRINT_BIT(attr,LBP_SA); 1169 PRINT_BIT(attr,LBP_XX); 1170 PRINT_BIT(attr,LBP_AI); 1171#undef PRINT_BIT 1172 fprintf (stream, "\n"); 1173 } 1174 } 1175} 1176 1177static void 1178debug_output_org_tables (const char *filename) 1179{ 1180 FILE *stream; 1181 1182 stream = fopen (filename, "w"); 1183 if (stream == NULL) 1184 { 1185 fprintf (stderr, "cannot open '%s' for writing\n", filename); 1186 exit (1); 1187 } 1188 1189 debug_output_org_lbp (stream); 1190 1191 if (ferror (stream) || fclose (stream)) 1192 { 1193 fprintf (stderr, "error writing to '%s'\n", filename); 1194 exit (1); 1195 } 1196} 1197 1198/* Construction of sparse 3-level tables. */ 1199#define TABLE lbp_table 1200#define ELEMENT unsigned char 1201#define DEFAULT LBP_XX 1202#define xmalloc malloc 1203#define xrealloc realloc 1204#include "3level.h" 1205 1206static void 1207output_lbp (FILE *stream) 1208{ 1209 unsigned int i; 1210 struct lbp_table t; 1211 unsigned int level1_offset, level2_offset, level3_offset; 1212 1213 t.p = 7; 1214 t.q = 9; 1215 lbp_table_init (&t); 1216 1217 for (i = 0; i < 0x110000; i++) 1218 { 1219 int attr = get_lbp (i); 1220 1221 /* Now attr should contain exactly one bit. */ 1222 if (attr == 0 || ((attr & (attr - 1)) != 0)) 1223 abort (); 1224 1225 if (attr != 1 << LBP_XX) 1226 { 1227 unsigned int log2_attr; 1228 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); 1229 1230 lbp_table_add (&t, i, log2_attr); 1231 } 1232 } 1233 1234 lbp_table_finalize (&t); 1235 1236 level1_offset = 1237 5 * sizeof (uint32_t); 1238 level2_offset = 1239 5 * sizeof (uint32_t) 1240 + t.level1_size * sizeof (uint32_t); 1241 level3_offset = 1242 5 * sizeof (uint32_t) 1243 + t.level1_size * sizeof (uint32_t) 1244 + (t.level2_size << t.q) * sizeof (uint32_t); 1245 1246 for (i = 0; i < 5; i++) 1247 fprintf (stream, "#define lbrkprop_header_%d %d\n", i, 1248 ((uint32_t *) t.result)[i]); 1249 fprintf (stream, "static const\n"); 1250 fprintf (stream, "struct\n"); 1251 fprintf (stream, " {\n"); 1252 fprintf (stream, " int level1[%d];\n", t.level1_size); 1253 fprintf (stream, " int level2[%d << %d];\n", t.level2_size, t.q); 1254 fprintf (stream, " unsigned char level3[%d << %d];\n", t.level3_size, t.p); 1255 fprintf (stream, " }\n"); 1256 fprintf (stream, "lbrkprop =\n"); 1257 fprintf (stream, "{\n"); 1258 fprintf (stream, " {"); 1259 for (i = 0; i < t.level1_size; i++) 1260 { 1261 uint32_t offset; 1262 if (i > 0 && (i % 8) == 0) 1263 fprintf (stream, "\n "); 1264 offset = ((uint32_t *) (t.result + level1_offset))[i]; 1265 fprintf (stream, " %5d%s", 1266 offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t), 1267 (i+1 < t.level1_size ? "," : "")); 1268 } 1269 fprintf (stream, " },\n"); 1270 fprintf (stream, " {"); 1271 if (t.level2_size << t.q > 8) 1272 fprintf (stream, "\n "); 1273 for (i = 0; i < t.level2_size << t.q; i++) 1274 { 1275 uint32_t offset; 1276 if (i > 0 && (i % 8) == 0) 1277 fprintf (stream, "\n "); 1278 offset = ((uint32_t *) (t.result + level2_offset))[i]; 1279 fprintf (stream, " %5d%s", 1280 offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t), 1281 (i+1 < t.level2_size << t.q ? "," : "")); 1282 } 1283 if (t.level2_size << t.q > 8) 1284 fprintf (stream, "\n "); 1285 fprintf (stream, " },\n"); 1286 fprintf (stream, " {"); 1287 if (t.level3_size << t.p > 8) 1288 fprintf (stream, "\n "); 1289 for (i = 0; i < t.level3_size << t.p; i++) 1290 { 1291 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i]; 1292 const char *value_string; 1293 switch (value) 1294 { 1295#define CASE(x) case x: value_string = #x; break; 1296 CASE(LBP_BK); 1297 CASE(LBP_CM); 1298 CASE(LBP_ZW); 1299 CASE(LBP_IN); 1300 CASE(LBP_GL); 1301 CASE(LBP_CB); 1302 CASE(LBP_SP); 1303 CASE(LBP_BA); 1304 CASE(LBP_BB); 1305 CASE(LBP_B2); 1306 CASE(LBP_HY); 1307 CASE(LBP_NS); 1308 CASE(LBP_OP); 1309 CASE(LBP_CL); 1310 CASE(LBP_QU); 1311 CASE(LBP_EX); 1312 CASE(LBP_ID); 1313 CASE(LBP_NU); 1314 CASE(LBP_IS); 1315 CASE(LBP_SY); 1316 CASE(LBP_AL); 1317 CASE(LBP_PR); 1318 CASE(LBP_PO); 1319 CASE(LBP_SA); 1320 CASE(LBP_XX); 1321 CASE(LBP_AI); 1322#undef CASE 1323 default: 1324 abort (); 1325 } 1326 if (i > 0 && (i % 8) == 0) 1327 fprintf (stream, "\n "); 1328 fprintf (stream, " %s%s", value_string, 1329 (i+1 < t.level3_size << t.p ? "," : "")); 1330 } 1331 if (t.level3_size << t.p > 8) 1332 fprintf (stream, "\n "); 1333 fprintf (stream, " }\n"); 1334 fprintf (stream, "};\n"); 1335} 1336 1337static void 1338output_tables (const char *filename, const char *version) 1339{ 1340 FILE *stream; 1341 1342 stream = fopen (filename, "w"); 1343 if (stream == NULL) 1344 { 1345 fprintf (stderr, "cannot open '%s' for writing\n", filename); 1346 exit (1); 1347 } 1348 1349 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); 1350 fprintf (stream, "/* Generated automatically by gen-lbrkprop for Unicode %s. */\n", 1351 version); 1352 fprintf (stream, "\n"); 1353 1354 /* Put a GPL header on it. The gnulib module is under LGPL (although it 1355 still carries the GPL header), and it's gnulib-tool which replaces the 1356 GPL header with an LGPL header. */ 1357 fprintf (stream, "/* Copyright (C) 2000-2004 Free Software Foundation, Inc.\n"); 1358 fprintf (stream, "\n"); 1359 fprintf (stream, "This program is free software; you can redistribute it and/or modify\n"); 1360 fprintf (stream, "it under the terms of the GNU General Public License as published by\n"); 1361 fprintf (stream, "the Free Software Foundation; either version 2, or (at your option)\n"); 1362 fprintf (stream, "any later version.\n"); 1363 fprintf (stream, "\n"); 1364 fprintf (stream, "This program is distributed in the hope that it will be useful,\n"); 1365 fprintf (stream, "but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); 1366 fprintf (stream, "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); 1367 fprintf (stream, "GNU General Public License for more details.\n"); 1368 fprintf (stream, "\n"); 1369 fprintf (stream, "You should have received a copy of the GNU General Public License\n"); 1370 fprintf (stream, "along with this program; if not, write to the Free Software\n"); 1371 fprintf (stream, "Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */\n"); 1372 fprintf (stream, "\n"); 1373 1374 output_lbp (stream); 1375 1376 if (ferror (stream) || fclose (stream)) 1377 { 1378 fprintf (stderr, "error writing to '%s'\n", filename); 1379 exit (1); 1380 } 1381} 1382 1383int 1384main (int argc, char * argv[]) 1385{ 1386 if (argc != 6) 1387 { 1388 fprintf (stderr, "Usage: %s UnicodeData.txt Combining.txt EastAsianWidth.txt LineBreak.txt version\n", 1389 argv[0]); 1390 exit (1); 1391 } 1392 1393 fill_attributes (argv[1]); 1394 fill_combining (argv[2]); 1395 fill_width (argv[3]); 1396 fill_org_lbp (argv[4]); 1397 1398 debug_output_tables ("lbrkprop.txt"); 1399 debug_output_org_tables ("lbrkprop_org.txt"); 1400 1401 output_tables ("lbrkprop.h", argv[5]); 1402 1403 return 0; 1404} 1405