1/* $NetBSD: ucgendat.c,v 1.3 2021/08/14 16:14:57 christos Exp $ */ 2 3/* $OpenLDAP$ */ 4/* This work is part of OpenLDAP Software <http://www.openldap.org/>. 5 * 6 * Copyright 1998-2021 The OpenLDAP Foundation. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted only as authorized by the OpenLDAP 11 * Public License. 12 * 13 * A copy of this license is available in file LICENSE in the 14 * top-level directory of the distribution or, alternatively, at 15 * <http://www.OpenLDAP.org/license.html>. 16 */ 17/* Copyright 2001 Computing Research Labs, New Mexico State University 18 * 19 * Permission is hereby granted, free of charge, to any person obtaining a 20 * copy of this software and associated documentation files (the "Software"), 21 * to deal in the Software without restriction, including without limitation 22 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 23 * and/or sell copies of the Software, and to permit persons to whom the 24 * Software is furnished to do so, subject to the following conditions: 25 * 26 * The above copyright notice and this permission notice shall be included in 27 * all copies or substantial portions of the Software. 28 * 29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 32 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 33 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 34 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 35 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 36 */ 37/* Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp " */ 38 39#include <sys/cdefs.h> 40__RCSID("$NetBSD: ucgendat.c,v 1.3 2021/08/14 16:14:57 christos Exp $"); 41 42#include "portable.h" 43#include "ldap_config.h" 44 45#include <stdio.h> 46#include <ac/ctype.h> 47#include <ac/stdlib.h> 48#include <ac/string.h> 49#include <ac/unistd.h> 50 51#include <ac/bytes.h> 52 53#include <lutil.h> 54 55#ifndef HARDCODE_DATA 56#define HARDCODE_DATA 1 57#endif 58 59#undef ishdigit 60#define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\ 61 ((cc) >= 'A' && (cc) <= 'F') ||\ 62 ((cc) >= 'a' && (cc) <= 'f')) 63 64/* 65 * A header written to the output file with the byte-order-mark and the number 66 * of property nodes. 67 */ 68static ac_uint2 hdr[2] = {0xfeff, 0}; 69 70#define NUMPROPS 50 71#define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3))) 72 73typedef struct { 74 char *name; 75 int len; 76} _prop_t; 77 78/* 79 * List of properties expected to be found in the Unicode Character Database 80 * including some implementation specific properties. 81 * 82 * The implementation specific properties are: 83 * Cm = Composed (can be decomposed) 84 * Nb = Non-breaking 85 * Sy = Symmetric (has left and right forms) 86 * Hd = Hex digit 87 * Qm = Quote marks 88 * Mr = Mirroring 89 * Ss = Space, other 90 * Cp = Defined character 91 */ 92static _prop_t props[NUMPROPS] = { 93 {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2}, 94 {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2}, 95 {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2}, 96 {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2}, 97 {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L", 1}, {"R", 1}, 98 {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B", 1}, 99 {"S", 1}, {"WS", 2}, {"ON", 2}, 100 {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2}, 101 {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2} 102}; 103 104typedef struct { 105 ac_uint4 *ranges; 106 ac_uint2 used; 107 ac_uint2 size; 108} _ranges_t; 109 110static _ranges_t proptbl[NUMPROPS]; 111 112/* 113 * Make sure this array is sized to be on a 4-byte boundary at compile time. 114 */ 115static ac_uint2 propcnt[NEEDPROPS]; 116 117/* 118 * Array used to collect a decomposition before adding it to the decomposition 119 * table. 120 */ 121static ac_uint4 dectmp[64]; 122static ac_uint4 dectmp_size; 123 124typedef struct { 125 ac_uint4 code; 126 ac_uint2 size; 127 ac_uint2 used; 128 ac_uint4 *decomp; 129} _decomp_t; 130 131/* 132 * List of decomposition. Created and expanded in order as the characters are 133 * encountered. First list contains canonical mappings, second also includes 134 * compatibility mappings. 135 */ 136static _decomp_t *decomps; 137static ac_uint4 decomps_used; 138static ac_uint4 decomps_size; 139 140static _decomp_t *kdecomps; 141static ac_uint4 kdecomps_used; 142static ac_uint4 kdecomps_size; 143 144/* 145 * Composition exclusion table stuff. 146 */ 147#define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31))) 148#define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31))) 149static ac_uint4 compexs[8192]; 150 151/* 152 * Struct for holding a composition pair, and array of composition pairs 153 */ 154typedef struct { 155 ac_uint4 comp; 156 ac_uint4 count; 157 ac_uint4 code1; 158 ac_uint4 code2; 159} _comp_t; 160 161static _comp_t *comps; 162static ac_uint4 comps_used; 163 164/* 165 * Types and lists for handling lists of case mappings. 166 */ 167typedef struct { 168 ac_uint4 key; 169 ac_uint4 other1; 170 ac_uint4 other2; 171} _case_t; 172 173static _case_t *upper; 174static _case_t *lower; 175static _case_t *title; 176static ac_uint4 upper_used; 177static ac_uint4 upper_size; 178static ac_uint4 lower_used; 179static ac_uint4 lower_size; 180static ac_uint4 title_used; 181static ac_uint4 title_size; 182 183/* 184 * Array used to collect case mappings before adding them to a list. 185 */ 186static ac_uint4 cases[3]; 187 188/* 189 * An array to hold ranges for combining classes. 190 */ 191static ac_uint4 *ccl; 192static ac_uint4 ccl_used; 193static ac_uint4 ccl_size; 194 195/* 196 * Structures for handling numbers. 197 */ 198typedef struct { 199 ac_uint4 code; 200 ac_uint4 idx; 201} _codeidx_t; 202 203typedef struct { 204 short numerator; 205 short denominator; 206} _num_t; 207 208/* 209 * Arrays to hold the mapping of codes to numbers. 210 */ 211static _codeidx_t *ncodes; 212static ac_uint4 ncodes_used; 213static ac_uint4 ncodes_size; 214 215static _num_t *nums; 216static ac_uint4 nums_used; 217static ac_uint4 nums_size; 218 219/* 220 * Array for holding numbers. 221 */ 222static _num_t *nums; 223static ac_uint4 nums_used; 224static ac_uint4 nums_size; 225 226static void 227add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2) 228{ 229 int i, j, k, len; 230 _ranges_t *rlp; 231 char *name; 232 233 for (k = 0; k < 2; k++) { 234 if (k == 0) { 235 name = p1; 236 len = 2; 237 } else { 238 if (p2 == 0) 239 break; 240 241 name = p2; 242 len = 1; 243 } 244 245 for (i = 0; i < NUMPROPS; i++) { 246 if (props[i].len == len && memcmp(props[i].name, name, len) == 0) 247 break; 248 } 249 250 if (i == NUMPROPS) 251 continue; 252 253 rlp = &proptbl[i]; 254 255 /* 256 * Resize the range list if necessary. 257 */ 258 if (rlp->used == rlp->size) { 259 if (rlp->size == 0) 260 rlp->ranges = (ac_uint4 *) 261 malloc(sizeof(ac_uint4) << 3); 262 else 263 rlp->ranges = (ac_uint4 *) 264 realloc((char *) rlp->ranges, 265 sizeof(ac_uint4) * (rlp->size + 8)); 266 rlp->size += 8; 267 } 268 269 /* 270 * If this is the first code for this property list, just add it 271 * and return. 272 */ 273 if (rlp->used == 0) { 274 rlp->ranges[0] = start; 275 rlp->ranges[1] = end; 276 rlp->used += 2; 277 continue; 278 } 279 280 /* 281 * Optimize the case of adding the range to the end. 282 */ 283 j = rlp->used - 1; 284 if (start > rlp->ranges[j]) { 285 j = rlp->used; 286 rlp->ranges[j++] = start; 287 rlp->ranges[j++] = end; 288 rlp->used = j; 289 continue; 290 } 291 292 /* 293 * Need to locate the insertion point. 294 */ 295 for (i = 0; 296 i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ; 297 298 /* 299 * If the start value lies in the current range, then simply set the 300 * new end point of the range to the end value passed as a parameter. 301 */ 302 if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) { 303 rlp->ranges[i + 1] = end; 304 return; 305 } 306 307 /* 308 * Shift following values up by two. 309 */ 310 for (j = rlp->used; j > i; j -= 2) { 311 rlp->ranges[j] = rlp->ranges[j - 2]; 312 rlp->ranges[j + 1] = rlp->ranges[j - 1]; 313 } 314 315 /* 316 * Add the new range at the insertion point. 317 */ 318 rlp->ranges[i] = start; 319 rlp->ranges[i + 1] = end; 320 rlp->used += 2; 321 } 322} 323 324static void 325ordered_range_insert(ac_uint4 c, char *name, int len) 326{ 327 int i, j; 328 ac_uint4 s, e; 329 _ranges_t *rlp; 330 331 if (len == 0) 332 return; 333 334 /* 335 * Deal with directionality codes introduced in Unicode 3.0. 336 */ 337 if ((len == 2 && memcmp(name, "BN", 2) == 0) || 338 (len == 3 && 339 (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 || 340 memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 || 341 memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0))) { 342 /* 343 * Mark all of these as Other Neutral to preserve compatibility with 344 * older versions. 345 */ 346 len = 2; 347 name = "ON"; 348 } 349 350 for (i = 0; i < NUMPROPS; i++) { 351 if (props[i].len == len && memcmp(props[i].name, name, len) == 0) 352 break; 353 } 354 355 if (i == NUMPROPS) 356 return; 357 358 /* 359 * Have a match, so insert the code in order. 360 */ 361 rlp = &proptbl[i]; 362 363 /* 364 * Resize the range list if necessary. 365 */ 366 if (rlp->used == rlp->size) { 367 if (rlp->size == 0) 368 rlp->ranges = (ac_uint4 *) 369 malloc(sizeof(ac_uint4) << 3); 370 else 371 rlp->ranges = (ac_uint4 *) 372 realloc((char *) rlp->ranges, 373 sizeof(ac_uint4) * (rlp->size + 8)); 374 rlp->size += 8; 375 } 376 377 /* 378 * If this is the first code for this property list, just add it 379 * and return. 380 */ 381 if (rlp->used == 0) { 382 rlp->ranges[0] = rlp->ranges[1] = c; 383 rlp->used += 2; 384 return; 385 } 386 387 /* 388 * Optimize the cases of extending the last range and adding new ranges to 389 * the end. 390 */ 391 j = rlp->used - 1; 392 e = rlp->ranges[j]; 393 s = rlp->ranges[j - 1]; 394 395 if (c == e + 1) { 396 /* 397 * Extend the last range. 398 */ 399 rlp->ranges[j] = c; 400 return; 401 } 402 403 if (c > e + 1) { 404 /* 405 * Start another range on the end. 406 */ 407 j = rlp->used; 408 rlp->ranges[j] = rlp->ranges[j + 1] = c; 409 rlp->used += 2; 410 return; 411 } 412 413 if (c >= s) 414 /* 415 * The code is a duplicate of a code in the last range, so just return. 416 */ 417 return; 418 419 /* 420 * The code should be inserted somewhere before the last range in the 421 * list. Locate the insertion point. 422 */ 423 for (i = 0; 424 i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ; 425 426 s = rlp->ranges[i]; 427 e = rlp->ranges[i + 1]; 428 429 if (c == e + 1) 430 /* 431 * Simply extend the current range. 432 */ 433 rlp->ranges[i + 1] = c; 434 else if (c < s) { 435 /* 436 * Add a new entry before the current location. Shift all entries 437 * before the current one up by one to make room. 438 */ 439 for (j = rlp->used; j > i; j -= 2) { 440 rlp->ranges[j] = rlp->ranges[j - 2]; 441 rlp->ranges[j + 1] = rlp->ranges[j - 1]; 442 } 443 rlp->ranges[i] = rlp->ranges[i + 1] = c; 444 445 rlp->used += 2; 446 } 447} 448 449static void 450add_decomp(ac_uint4 code, short compat) 451{ 452 ac_uint4 i, j, size; 453 _decomp_t **pdecomps; 454 ac_uint4 *pdecomps_used; 455 ac_uint4 *pdecomps_size; 456 457 if (compat) { 458 pdecomps = &kdecomps; 459 pdecomps_used = &kdecomps_used; 460 pdecomps_size = &kdecomps_size; 461 } else { 462 pdecomps = &decomps; 463 pdecomps_used = &decomps_used; 464 pdecomps_size = &decomps_size; 465 } 466 467 /* 468 * Add the code to the composite property. 469 */ 470 if (!compat) { 471 ordered_range_insert(code, "Cm", 2); 472 } 473 474 /* 475 * Locate the insertion point for the code. 476 */ 477 for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ; 478 479 /* 480 * Allocate space for a new decomposition. 481 */ 482 if (*pdecomps_used == *pdecomps_size) { 483 if (*pdecomps_size == 0) 484 *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3); 485 else 486 *pdecomps = (_decomp_t *) 487 realloc((char *) *pdecomps, 488 sizeof(_decomp_t) * (*pdecomps_size + 8)); 489 (void) memset((char *) (*pdecomps + *pdecomps_size), '\0', 490 sizeof(_decomp_t) << 3); 491 *pdecomps_size += 8; 492 } 493 494 if (i < *pdecomps_used && code != (*pdecomps)[i].code) { 495 /* 496 * Shift the decomps up by one if the codes don't match. 497 */ 498 for (j = *pdecomps_used; j > i; j--) 499 (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1], 500 sizeof(_decomp_t)); 501 } 502 503 /* 504 * Insert or replace a decomposition. 505 */ 506 size = dectmp_size + (4 - (dectmp_size & 3)); 507 if ((*pdecomps)[i].size < size) { 508 if ((*pdecomps)[i].size == 0) 509 (*pdecomps)[i].decomp = (ac_uint4 *) 510 malloc(sizeof(ac_uint4) * size); 511 else 512 (*pdecomps)[i].decomp = (ac_uint4 *) 513 realloc((char *) (*pdecomps)[i].decomp, 514 sizeof(ac_uint4) * size); 515 (*pdecomps)[i].size = size; 516 } 517 518 if ((*pdecomps)[i].code != code) 519 (*pdecomps_used)++; 520 521 (*pdecomps)[i].code = code; 522 (*pdecomps)[i].used = dectmp_size; 523 (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp, 524 sizeof(ac_uint4) * dectmp_size); 525 526 /* 527 * NOTICE: This needs changing later so it is more general than simply 528 * pairs. This calculation is done here to simplify allocation elsewhere. 529 */ 530 if (!compat && dectmp_size == 2) 531 comps_used++; 532} 533 534static void 535add_title(ac_uint4 code) 536{ 537 ac_uint4 i, j; 538 539 /* 540 * Always map the code to itself. 541 */ 542 cases[2] = code; 543 544 /* 545 * If the upper case character is not present, then make it the same as 546 * the title case. 547 */ 548 if (cases[0] == 0) 549 cases[0] = code; 550 551 if (title_used == title_size) { 552 if (title_size == 0) 553 title = (_case_t *) malloc(sizeof(_case_t) << 3); 554 else 555 title = (_case_t *) realloc((char *) title, 556 sizeof(_case_t) * (title_size + 8)); 557 title_size += 8; 558 } 559 560 /* 561 * Locate the insertion point. 562 */ 563 for (i = 0; i < title_used && code > title[i].key; i++) ; 564 565 if (i < title_used) { 566 /* 567 * Shift the array up by one. 568 */ 569 for (j = title_used; j > i; j--) 570 (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1], 571 sizeof(_case_t)); 572 } 573 574 title[i].key = cases[2]; /* Title */ 575 title[i].other1 = cases[0]; /* Upper */ 576 title[i].other2 = cases[1]; /* Lower */ 577 578 title_used++; 579} 580 581static void 582add_upper(ac_uint4 code) 583{ 584 ac_uint4 i, j; 585 586 /* 587 * Always map the code to itself. 588 */ 589 cases[0] = code; 590 591 /* 592 * If the title case character is not present, then make it the same as 593 * the upper case. 594 */ 595 if (cases[2] == 0) 596 cases[2] = code; 597 598 if (upper_used == upper_size) { 599 if (upper_size == 0) 600 upper = (_case_t *) malloc(sizeof(_case_t) << 3); 601 else 602 upper = (_case_t *) realloc((char *) upper, 603 sizeof(_case_t) * (upper_size + 8)); 604 upper_size += 8; 605 } 606 607 /* 608 * Locate the insertion point. 609 */ 610 for (i = 0; i < upper_used && code > upper[i].key; i++) ; 611 612 if (i < upper_used) { 613 /* 614 * Shift the array up by one. 615 */ 616 for (j = upper_used; j > i; j--) 617 (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1], 618 sizeof(_case_t)); 619 } 620 621 upper[i].key = cases[0]; /* Upper */ 622 upper[i].other1 = cases[1]; /* Lower */ 623 upper[i].other2 = cases[2]; /* Title */ 624 625 upper_used++; 626} 627 628static void 629add_lower(ac_uint4 code) 630{ 631 ac_uint4 i, j; 632 633 /* 634 * Always map the code to itself. 635 */ 636 cases[1] = code; 637 638 /* 639 * If the title case character is empty, then make it the same as the 640 * upper case. 641 */ 642 if (cases[2] == 0) 643 cases[2] = cases[0]; 644 645 if (lower_used == lower_size) { 646 if (lower_size == 0) 647 lower = (_case_t *) malloc(sizeof(_case_t) << 3); 648 else 649 lower = (_case_t *) realloc((char *) lower, 650 sizeof(_case_t) * (lower_size + 8)); 651 lower_size += 8; 652 } 653 654 /* 655 * Locate the insertion point. 656 */ 657 for (i = 0; i < lower_used && code > lower[i].key; i++) ; 658 659 if (i < lower_used) { 660 /* 661 * Shift the array up by one. 662 */ 663 for (j = lower_used; j > i; j--) 664 (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1], 665 sizeof(_case_t)); 666 } 667 668 lower[i].key = cases[1]; /* Lower */ 669 lower[i].other1 = cases[0]; /* Upper */ 670 lower[i].other2 = cases[2]; /* Title */ 671 672 lower_used++; 673} 674 675static void 676ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code) 677{ 678 ac_uint4 i, j; 679 680 if (ccl_used == ccl_size) { 681 if (ccl_size == 0) 682 ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24); 683 else 684 ccl = (ac_uint4 *) 685 realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24)); 686 ccl_size += 24; 687 } 688 689 /* 690 * Optimize adding the first item. 691 */ 692 if (ccl_used == 0) { 693 ccl[0] = ccl[1] = c; 694 ccl[2] = ccl_code; 695 ccl_used += 3; 696 return; 697 } 698 699 /* 700 * Handle the special case of extending the range on the end. This 701 * requires that the combining class codes are the same. 702 */ 703 if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) { 704 ccl[ccl_used - 2] = c; 705 return; 706 } 707 708 /* 709 * Handle the special case of adding another range on the end. 710 */ 711 if (c > ccl[ccl_used - 2] + 1 || 712 (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) { 713 ccl[ccl_used++] = c; 714 ccl[ccl_used++] = c; 715 ccl[ccl_used++] = ccl_code; 716 return; 717 } 718 719 /* 720 * Locate either the insertion point or range for the code. 721 */ 722 for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ; 723 724 if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) { 725 /* 726 * Extend an existing range. 727 */ 728 ccl[i + 1] = c; 729 return; 730 } else if (c < ccl[i]) { 731 /* 732 * Start a new range before the current location. 733 */ 734 for (j = ccl_used; j > i; j -= 3) { 735 ccl[j] = ccl[j - 3]; 736 ccl[j - 1] = ccl[j - 4]; 737 ccl[j - 2] = ccl[j - 5]; 738 } 739 ccl[i] = ccl[i + 1] = c; 740 ccl[i + 2] = ccl_code; 741 } 742} 743 744/* 745 * Adds a number if it does not already exist and returns an index value 746 * multiplied by 2. 747 */ 748static ac_uint4 749make_number(short num, short denom) 750{ 751 ac_uint4 n; 752 753 /* 754 * Determine if the number already exists. 755 */ 756 for (n = 0; n < nums_used; n++) { 757 if (nums[n].numerator == num && nums[n].denominator == denom) 758 return n << 1; 759 } 760 761 if (nums_used == nums_size) { 762 if (nums_size == 0) 763 nums = (_num_t *) malloc(sizeof(_num_t) << 3); 764 else 765 nums = (_num_t *) realloc((char *) nums, 766 sizeof(_num_t) * (nums_size + 8)); 767 nums_size += 8; 768 } 769 770 n = nums_used++; 771 nums[n].numerator = num; 772 nums[n].denominator = denom; 773 774 return n << 1; 775} 776 777static void 778add_number(ac_uint4 code, short num, short denom) 779{ 780 ac_uint4 i, j; 781 782 /* 783 * Insert the code in order. 784 */ 785 for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ; 786 787 /* 788 * Handle the case of the codes matching and simply replace the number 789 * that was there before. 790 */ 791 if (i < ncodes_used && code == ncodes[i].code) { 792 ncodes[i].idx = make_number(num, denom); 793 return; 794 } 795 796 /* 797 * Resize the array if necessary. 798 */ 799 if (ncodes_used == ncodes_size) { 800 if (ncodes_size == 0) 801 ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3); 802 else 803 ncodes = (_codeidx_t *) 804 realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8)); 805 806 ncodes_size += 8; 807 } 808 809 /* 810 * Shift things around to insert the code if necessary. 811 */ 812 if (i < ncodes_used) { 813 for (j = ncodes_used; j > i; j--) { 814 ncodes[j].code = ncodes[j - 1].code; 815 ncodes[j].idx = ncodes[j - 1].idx; 816 } 817 } 818 ncodes[i].code = code; 819 ncodes[i].idx = make_number(num, denom); 820 821 ncodes_used++; 822} 823 824/* 825 * This routine assumes that the line is a valid Unicode Character Database 826 * entry. 827 */ 828static void 829read_cdata(FILE *in) 830{ 831 ac_uint4 i, lineno, skip, code, ccl_code; 832 short wnum, neg, number[2], compat; 833 char line[512], *s, *e, *first_prop; 834 835 lineno = skip = 0; 836 while (fgets(line, sizeof(line), in)) { 837 if( (s=strchr(line, '\n')) ) *s = '\0'; 838 lineno++; 839 840 /* 841 * Skip blank lines and lines that start with a '#'. 842 */ 843 if (line[0] == 0 || line[0] == '#') 844 continue; 845 846 /* 847 * If lines need to be skipped, do it here. 848 */ 849 if (skip) { 850 skip--; 851 continue; 852 } 853 854 /* 855 * Collect the code. The code can be up to 6 hex digits in length to 856 * allow surrogates to be specified. 857 */ 858 for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) { 859 code <<= 4; 860 if (*s >= '0' && *s <= '9') 861 code += *s - '0'; 862 else if (*s >= 'A' && *s <= 'F') 863 code += (*s - 'A') + 10; 864 else if (*s >= 'a' && *s <= 'f') 865 code += (*s - 'a') + 10; 866 } 867 868 /* 869 * Handle the following special cases: 870 * 1. 4E00-9FA5 CJK Ideographs. 871 * 2. AC00-D7A3 Hangul Syllables. 872 * 3. D800-DFFF Surrogates. 873 * 4. E000-F8FF Private Use Area. 874 * 5. F900-FA2D Han compatibility. 875 * ...Plus additional ranges in newer Unicode versions... 876 */ 877 switch (code) { 878 case 0x3400: 879 /* CJK Ideograph Extension A */ 880 add_range(0x3400, 0x4db5, "Lo", "L"); 881 882 add_range(0x3400, 0x4db5, "Cp", 0); 883 884 skip = 1; 885 break; 886 case 0x4e00: 887 /* 888 * The Han ideographs. 889 */ 890 add_range(0x4e00, 0x9fff, "Lo", "L"); 891 892 /* 893 * Add the characters to the defined category. 894 */ 895 add_range(0x4e00, 0x9fa5, "Cp", 0); 896 897 skip = 1; 898 break; 899 case 0xac00: 900 /* 901 * The Hangul syllables. 902 */ 903 add_range(0xac00, 0xd7a3, "Lo", "L"); 904 905 /* 906 * Add the characters to the defined category. 907 */ 908 add_range(0xac00, 0xd7a3, "Cp", 0); 909 910 skip = 1; 911 break; 912 case 0xd800: 913 /* 914 * Make a range of all surrogates and assume some default 915 * properties. 916 */ 917 add_range(0x010000, 0x10ffff, "Cs", "L"); 918 skip = 5; 919 break; 920 case 0xe000: 921 /* 922 * The Private Use area. Add with a default set of properties. 923 */ 924 add_range(0xe000, 0xf8ff, "Co", "L"); 925 skip = 1; 926 break; 927 case 0xf900: 928 /* 929 * The CJK compatibility area. 930 */ 931 add_range(0xf900, 0xfaff, "Lo", "L"); 932 933 /* 934 * Add the characters to the defined category. 935 */ 936 add_range(0xf900, 0xfaff, "Cp", 0); 937 938 skip = 1; 939 break; 940 case 0x20000: 941 /* CJK Ideograph Extension B */ 942 add_range(0x20000, 0x2a6d6, "Lo", "L"); 943 944 add_range(0x20000, 0x2a6d6, "Cp", 0); 945 946 skip = 1; 947 break; 948 case 0xf0000: 949 /* Plane 15 private use */ 950 add_range(0xf0000, 0xffffd, "Co", "L"); 951 skip = 1; 952 break; 953 954 case 0x100000: 955 /* Plane 16 private use */ 956 add_range(0x100000, 0x10fffd, "Co", "L"); 957 skip = 1; 958 break; 959 } 960 961 if (skip) 962 continue; 963 964 /* 965 * Add the code to the defined category. 966 */ 967 ordered_range_insert(code, "Cp", 2); 968 969 /* 970 * Locate the first character property field. 971 */ 972 for (i = 0; *s != 0 && i < 2; s++) { 973 if (*s == ';') 974 i++; 975 } 976 for (e = s; *e && *e != ';'; e++) ; 977 978 first_prop = s; 979 980 ordered_range_insert(code, s, e - s); 981 982 /* 983 * Locate the combining class code. 984 */ 985 for (s = e; *s != 0 && i < 3; s++) { 986 if (*s == ';') 987 i++; 988 } 989 990 /* 991 * Convert the combining class code from decimal. 992 */ 993 for (ccl_code = 0, e = s; *e && *e != ';'; e++) 994 ccl_code = (ccl_code * 10) + (*e - '0'); 995 996 /* 997 * Add the code if it not 0. 998 */ 999 if (ccl_code != 0) 1000 ordered_ccl_insert(code, ccl_code); 1001 1002 /* 1003 * Locate the second character property field. 1004 */ 1005 for (s = e; *s != 0 && i < 4; s++) { 1006 if (*s == ';') 1007 i++; 1008 } 1009 for (e = s; *e && *e != ';'; e++) ; 1010 1011 ordered_range_insert(code, s, e - s); 1012 1013 /* 1014 * Check for a decomposition. 1015 */ 1016 s = ++e; 1017 if (*s != ';') { 1018 compat = *s == '<'; 1019 if (compat) { 1020 /* 1021 * Skip compatibility formatting tag. 1022 */ 1023 while (*s++ != '>'); 1024 } 1025 /* 1026 * Collect the codes of the decomposition. 1027 */ 1028 for (dectmp_size = 0; *s != ';'; ) { 1029 /* 1030 * Skip all leading non-hex digits. 1031 */ 1032 while (!ishdigit(*s)) 1033 s++; 1034 1035 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) { 1036 dectmp[dectmp_size] <<= 4; 1037 if (*s >= '0' && *s <= '9') 1038 dectmp[dectmp_size] += *s - '0'; 1039 else if (*s >= 'A' && *s <= 'F') 1040 dectmp[dectmp_size] += (*s - 'A') + 10; 1041 else if (*s >= 'a' && *s <= 'f') 1042 dectmp[dectmp_size] += (*s - 'a') + 10; 1043 } 1044 dectmp_size++; 1045 } 1046 1047 /* 1048 * If there are any codes in the temporary decomposition array, 1049 * then add the character with its decomposition. 1050 */ 1051 if (dectmp_size > 0) { 1052 if (!compat) { 1053 add_decomp(code, 0); 1054 } 1055 add_decomp(code, 1); 1056 } 1057 } 1058 1059 /* 1060 * Skip to the number field. 1061 */ 1062 for (i = 0; i < 3 && *s; s++) { 1063 if (*s == ';') 1064 i++; 1065 } 1066 1067 /* 1068 * Scan the number in. 1069 */ 1070 number[0] = number[1] = 0; 1071 for (e = s, neg = wnum = 0; *e && *e != ';'; e++) { 1072 if (*e == '-') { 1073 neg = 1; 1074 continue; 1075 } 1076 1077 if (*e == '/') { 1078 /* 1079 * Move the the denominator of the fraction. 1080 */ 1081 if (neg) 1082 number[wnum] *= -1; 1083 neg = 0; 1084 e++; 1085 wnum++; 1086 } 1087 number[wnum] = (number[wnum] * 10) + (*e - '0'); 1088 } 1089 1090 if (e > s) { 1091 /* 1092 * Adjust the denominator in case of integers and add the number. 1093 */ 1094 if (wnum == 0) 1095 number[1] = 1; 1096 1097 add_number(code, number[0], number[1]); 1098 } 1099 1100 /* 1101 * Skip to the start of the possible case mappings. 1102 */ 1103 for (s = e, i = 0; i < 4 && *s; s++) { 1104 if (*s == ';') 1105 i++; 1106 } 1107 1108 /* 1109 * Collect the case mappings. 1110 */ 1111 cases[0] = cases[1] = cases[2] = 0; 1112 for (i = 0; i < 3; i++) { 1113 while (ishdigit(*s)) { 1114 cases[i] <<= 4; 1115 if (*s >= '0' && *s <= '9') 1116 cases[i] += *s - '0'; 1117 else if (*s >= 'A' && *s <= 'F') 1118 cases[i] += (*s - 'A') + 10; 1119 else if (*s >= 'a' && *s <= 'f') 1120 cases[i] += (*s - 'a') + 10; 1121 s++; 1122 } 1123 if (*s == ';') 1124 s++; 1125 } 1126 if (!strncmp(first_prop,"Lt",2) && (cases[0] || cases[1])) 1127 /* 1128 * Add the upper and lower mappings for a title case character. 1129 */ 1130 add_title(code); 1131 else if (cases[1]) 1132 /* 1133 * Add the lower and title case mappings for the upper case 1134 * character. 1135 */ 1136 add_upper(code); 1137 else if (cases[0]) 1138 /* 1139 * Add the upper and title case mappings for the lower case 1140 * character. 1141 */ 1142 add_lower(code); 1143 } 1144} 1145 1146static _decomp_t * 1147find_decomp(ac_uint4 code, short compat) 1148{ 1149 long l, r, m; 1150 _decomp_t *decs; 1151 1152 l = 0; 1153 r = (compat ? kdecomps_used : decomps_used) - 1; 1154 decs = compat ? kdecomps : decomps; 1155 while (l <= r) { 1156 m = (l + r) >> 1; 1157 if (code > decs[m].code) 1158 l = m + 1; 1159 else if (code < decs[m].code) 1160 r = m - 1; 1161 else 1162 return &decs[m]; 1163 } 1164 return 0; 1165} 1166 1167static void 1168decomp_it(_decomp_t *d, short compat) 1169{ 1170 ac_uint4 i; 1171 _decomp_t *dp; 1172 1173 for (i = 0; i < d->used; i++) { 1174 if ((dp = find_decomp(d->decomp[i], compat)) != 0) 1175 decomp_it(dp, compat); 1176 else 1177 dectmp[dectmp_size++] = d->decomp[i]; 1178 } 1179} 1180 1181/* 1182 * Expand all decompositions by recursively decomposing each character 1183 * in the decomposition. 1184 */ 1185static void 1186expand_decomp(void) 1187{ 1188 ac_uint4 i; 1189 1190 for (i = 0; i < decomps_used; i++) { 1191 dectmp_size = 0; 1192 decomp_it(&decomps[i], 0); 1193 if (dectmp_size > 0) 1194 add_decomp(decomps[i].code, 0); 1195 } 1196 1197 for (i = 0; i < kdecomps_used; i++) { 1198 dectmp_size = 0; 1199 decomp_it(&kdecomps[i], 1); 1200 if (dectmp_size > 0) 1201 add_decomp(kdecomps[i].code, 1); 1202 } 1203} 1204 1205static int 1206cmpcomps(const void *v_comp1, const void *v_comp2) 1207{ 1208 const _comp_t *comp1 = v_comp1, *comp2 = v_comp2; 1209 long diff = comp1->code1 - comp2->code1; 1210 1211 if (!diff) 1212 diff = comp1->code2 - comp2->code2; 1213 return (int) diff; 1214} 1215 1216/* 1217 * Load composition exclusion data 1218 */ 1219static void 1220read_compexdata(FILE *in) 1221{ 1222 ac_uint2 i; 1223 ac_uint4 code; 1224 char line[512], *s; 1225 1226 (void) memset((char *) compexs, 0, sizeof(compexs)); 1227 1228 while (fgets(line, sizeof(line), in)) { 1229 if( (s=strchr(line, '\n')) ) *s = '\0'; 1230 /* 1231 * Skip blank lines and lines that start with a '#'. 1232 */ 1233 if (line[0] == 0 || line[0] == '#') 1234 continue; 1235 1236 /* 1237 * Collect the code. Assume max 6 digits 1238 */ 1239 1240 for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) { 1241 if (isspace((unsigned char)*s)) break; 1242 code <<= 4; 1243 if (*s >= '0' && *s <= '9') 1244 code += *s - '0'; 1245 else if (*s >= 'A' && *s <= 'F') 1246 code += (*s - 'A') + 10; 1247 else if (*s >= 'a' && *s <= 'f') 1248 code += (*s - 'a') + 10; 1249 } 1250 COMPEX_SET(code); 1251 } 1252} 1253 1254/* 1255 * Creates array of compositions from decomposition array 1256 */ 1257static void 1258create_comps(void) 1259{ 1260 ac_uint4 i, cu; 1261 1262 comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t)); 1263 1264 for (i = cu = 0; i < decomps_used; i++) { 1265 if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code)) 1266 continue; 1267 comps[cu].comp = decomps[i].code; 1268 comps[cu].count = 2; 1269 comps[cu].code1 = decomps[i].decomp[0]; 1270 comps[cu].code2 = decomps[i].decomp[1]; 1271 cu++; 1272 } 1273 comps_used = cu; 1274 qsort(comps, comps_used, sizeof(_comp_t), cmpcomps); 1275} 1276 1277#if HARDCODE_DATA 1278static void 1279write_case(FILE *out, _case_t *tab, int num, int first) 1280{ 1281 int i; 1282 1283 for (i=0; i<num; i++) { 1284 if (first) first = 0; 1285 else fprintf(out, ","); 1286 fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx", 1287 (unsigned long) tab[i].key, (unsigned long) tab[i].other1, 1288 (unsigned long) tab[i].other2); 1289 } 1290} 1291 1292#define PREF "static const " 1293 1294#endif 1295 1296static void 1297write_cdata(char *opath) 1298{ 1299 FILE *out; 1300 ac_uint4 bytes; 1301 ac_uint4 i, idx, nprops; 1302#if !(HARDCODE_DATA) 1303 ac_uint2 casecnt[2]; 1304#endif 1305 char path[BUFSIZ]; 1306#if HARDCODE_DATA 1307 int j, k; 1308 1309 /***************************************************************** 1310 * 1311 * Generate the ctype data. 1312 * 1313 *****************************************************************/ 1314 1315 /* 1316 * Open the output file. 1317 */ 1318 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "uctable.h", opath); 1319 if ((out = fopen(path, "w")) == 0) 1320 return; 1321#else 1322 /* 1323 * Open the ctype.dat file. 1324 */ 1325 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "ctype.dat", opath); 1326 if ((out = fopen(path, "wb")) == 0) 1327 return; 1328#endif 1329 1330 /* 1331 * Collect the offsets for the properties. The offsets array is 1332 * on a 4-byte boundary to keep things efficient for architectures 1333 * that need such a thing. 1334 */ 1335 for (i = idx = 0; i < NUMPROPS; i++) { 1336 propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff; 1337 idx += proptbl[i].used; 1338 } 1339 1340 /* 1341 * Add the sentinel index which is used by the binary search as the upper 1342 * bound for a search. 1343 */ 1344 propcnt[i] = idx; 1345 1346 /* 1347 * Record the actual number of property lists. This may be different than 1348 * the number of offsets actually written because of aligning on a 4-byte 1349 * boundary. 1350 */ 1351 hdr[1] = NUMPROPS; 1352 1353 /* 1354 * Calculate the byte count needed and pad the property counts array to a 1355 * 4-byte boundary. 1356 */ 1357 if ((bytes = sizeof(ac_uint2) * (NUMPROPS + 1)) & 3) 1358 bytes += 4 - (bytes & 3); 1359 nprops = bytes / sizeof(ac_uint2); 1360 bytes += sizeof(ac_uint4) * idx; 1361 1362#if HARDCODE_DATA 1363 fprintf(out, PREF "ac_uint4 _ucprop_size = %d;\n\n", NUMPROPS); 1364 1365 fprintf(out, PREF "ac_uint2 _ucprop_offsets[] = {"); 1366 1367 for (i = 0; i<nprops; i++) { 1368 if (i) fprintf(out, ","); 1369 if (!(i&7)) fprintf(out, "\n\t"); 1370 else fprintf(out, " "); 1371 fprintf(out, "0x%04x", propcnt[i]); 1372 } 1373 fprintf(out, "\n};\n\n"); 1374 1375 fprintf(out, PREF "ac_uint4 _ucprop_ranges[] = {"); 1376 1377 k = 0; 1378 for (i = 0; i < NUMPROPS; i++) { 1379 if (proptbl[i].used > 0) { 1380 for (j=0; j<proptbl[i].used; j++) { 1381 if (k) fprintf(out, ","); 1382 if (!(k&3)) fprintf(out,"\n\t"); 1383 else fprintf(out, " "); 1384 k++; 1385 fprintf(out, "0x%08lx", (unsigned long) proptbl[i].ranges[j]); 1386 } 1387 } 1388 } 1389 fprintf(out, "\n};\n\n"); 1390#else 1391 /* 1392 * Write the header. 1393 */ 1394 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1395 1396 /* 1397 * Write the byte count. 1398 */ 1399 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1400 1401 /* 1402 * Write the property list counts. 1403 */ 1404 fwrite((char *) propcnt, sizeof(ac_uint2), nprops, out); 1405 1406 /* 1407 * Write the property lists. 1408 */ 1409 for (i = 0; i < NUMPROPS; i++) { 1410 if (proptbl[i].used > 0) 1411 fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4), 1412 proptbl[i].used, out); 1413 } 1414 1415 fclose(out); 1416#endif 1417 1418 /***************************************************************** 1419 * 1420 * Generate the case mapping data. 1421 * 1422 *****************************************************************/ 1423 1424#if HARDCODE_DATA 1425 fprintf(out, PREF "ac_uint4 _uccase_size = %ld;\n\n", 1426 (long) (upper_used + lower_used + title_used)); 1427 1428 fprintf(out, PREF "ac_uint2 _uccase_len[2] = {%ld, %ld};\n\n", 1429 (long) upper_used, (long) lower_used); 1430 fprintf(out, PREF "ac_uint4 _uccase_map[] = {"); 1431 1432 if (upper_used > 0) 1433 /* 1434 * Write the upper case table. 1435 */ 1436 write_case(out, upper, upper_used, 1); 1437 1438 if (lower_used > 0) 1439 /* 1440 * Write the lower case table. 1441 */ 1442 write_case(out, lower, lower_used, !upper_used); 1443 1444 if (title_used > 0) 1445 /* 1446 * Write the title case table. 1447 */ 1448 write_case(out, title, title_used, !(upper_used||lower_used)); 1449 1450 if (!(upper_used || lower_used || title_used)) 1451 fprintf(out, "\t0"); 1452 1453 fprintf(out, "\n};\n\n"); 1454#else 1455 /* 1456 * Open the case.dat file. 1457 */ 1458 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath); 1459 if ((out = fopen(path, "wb")) == 0) 1460 return; 1461 1462 /* 1463 * Write the case mapping tables. 1464 */ 1465 hdr[1] = upper_used + lower_used + title_used; 1466 casecnt[0] = upper_used; 1467 casecnt[1] = lower_used; 1468 1469 /* 1470 * Write the header. 1471 */ 1472 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1473 1474 /* 1475 * Write the upper and lower case table sizes. 1476 */ 1477 fwrite((char *) casecnt, sizeof(ac_uint2), 2, out); 1478 1479 if (upper_used > 0) 1480 /* 1481 * Write the upper case table. 1482 */ 1483 fwrite((char *) upper, sizeof(_case_t), upper_used, out); 1484 1485 if (lower_used > 0) 1486 /* 1487 * Write the lower case table. 1488 */ 1489 fwrite((char *) lower, sizeof(_case_t), lower_used, out); 1490 1491 if (title_used > 0) 1492 /* 1493 * Write the title case table. 1494 */ 1495 fwrite((char *) title, sizeof(_case_t), title_used, out); 1496 1497 fclose(out); 1498#endif 1499 1500 /***************************************************************** 1501 * 1502 * Generate the composition data. 1503 * 1504 *****************************************************************/ 1505 1506 /* 1507 * Create compositions from decomposition data 1508 */ 1509 create_comps(); 1510 1511#if HARDCODE_DATA 1512 fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n", 1513 comps_used * 4L); 1514 1515 fprintf(out, PREF "ac_uint4 _uccomp_data[] = {"); 1516 1517 /* 1518 * Now, if comps exist, write them out. 1519 */ 1520 if (comps_used > 0) { 1521 for (i=0; i<comps_used; i++) { 1522 if (i) fprintf(out, ","); 1523 fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx, 0x%08lx", 1524 (unsigned long) comps[i].comp, (unsigned long) comps[i].count, 1525 (unsigned long) comps[i].code1, (unsigned long) comps[i].code2); 1526 } 1527 } else { 1528 fprintf(out, "\t0"); 1529 } 1530 fprintf(out, "\n};\n\n"); 1531#else 1532 /* 1533 * Open the comp.dat file. 1534 */ 1535 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "comp.dat", opath); 1536 if ((out = fopen(path, "wb")) == 0) 1537 return; 1538 1539 /* 1540 * Write the header. 1541 */ 1542 hdr[1] = (ac_uint2) comps_used * 4; 1543 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1544 1545 /* 1546 * Write out the byte count to maintain header size. 1547 */ 1548 bytes = comps_used * sizeof(_comp_t); 1549 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1550 1551 /* 1552 * Now, if comps exist, write them out. 1553 */ 1554 if (comps_used > 0) 1555 fwrite((char *) comps, sizeof(_comp_t), comps_used, out); 1556 1557 fclose(out); 1558#endif 1559 1560 /***************************************************************** 1561 * 1562 * Generate the decomposition data. 1563 * 1564 *****************************************************************/ 1565 1566 /* 1567 * Fully expand all decompositions before generating the output file. 1568 */ 1569 expand_decomp(); 1570 1571#if HARDCODE_DATA 1572 fprintf(out, PREF "ac_uint4 _ucdcmp_size = %ld;\n\n", 1573 decomps_used * 2L); 1574 1575 fprintf(out, PREF "ac_uint4 _ucdcmp_nodes[] = {"); 1576 1577 if (decomps_used) { 1578 /* 1579 * Write the list of decomp nodes. 1580 */ 1581 for (i = idx = 0; i < decomps_used; i++) { 1582 fprintf(out, "\n\t0x%08lx, 0x%08lx,", 1583 (unsigned long) decomps[i].code, (unsigned long) idx); 1584 idx += decomps[i].used; 1585 } 1586 1587 /* 1588 * Write the sentinel index as the last decomp node. 1589 */ 1590 fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx); 1591 1592 fprintf(out, PREF "ac_uint4 _ucdcmp_decomp[] = {"); 1593 /* 1594 * Write the decompositions themselves. 1595 */ 1596 k = 0; 1597 for (i = 0; i < decomps_used; i++) 1598 for (j=0; j<decomps[i].used; j++) { 1599 if (k) fprintf(out, ","); 1600 if (!(k&3)) fprintf(out,"\n\t"); 1601 else fprintf(out, " "); 1602 k++; 1603 fprintf(out, "0x%08lx", (unsigned long) decomps[i].decomp[j]); 1604 } 1605 fprintf(out, "\n};\n\n"); 1606 } 1607#else 1608 /* 1609 * Open the decomp.dat file. 1610 */ 1611 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "decomp.dat", opath); 1612 if ((out = fopen(path, "wb")) == 0) 1613 return; 1614 1615 hdr[1] = decomps_used; 1616 1617 /* 1618 * Write the header. 1619 */ 1620 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1621 1622 /* 1623 * Write a temporary byte count which will be calculated as the 1624 * decompositions are written out. 1625 */ 1626 bytes = 0; 1627 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1628 1629 if (decomps_used) { 1630 /* 1631 * Write the list of decomp nodes. 1632 */ 1633 for (i = idx = 0; i < decomps_used; i++) { 1634 fwrite((char *) &decomps[i].code, sizeof(ac_uint4), 1, out); 1635 fwrite((char *) &idx, sizeof(ac_uint4), 1, out); 1636 idx += decomps[i].used; 1637 } 1638 1639 /* 1640 * Write the sentinel index as the last decomp node. 1641 */ 1642 fwrite((char *) &idx, sizeof(ac_uint4), 1, out); 1643 1644 /* 1645 * Write the decompositions themselves. 1646 */ 1647 for (i = 0; i < decomps_used; i++) 1648 fwrite((char *) decomps[i].decomp, sizeof(ac_uint4), 1649 decomps[i].used, out); 1650 1651 /* 1652 * Seek back to the beginning and write the byte count. 1653 */ 1654 bytes = (sizeof(ac_uint4) * idx) + 1655 (sizeof(ac_uint4) * ((hdr[1] << 1) + 1)); 1656 fseek(out, sizeof(ac_uint2) << 1, 0L); 1657 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1658 1659 fclose(out); 1660 } 1661#endif 1662 1663#ifdef HARDCODE_DATA 1664 fprintf(out, PREF "ac_uint4 _uckdcmp_size = %ld;\n\n", 1665 kdecomps_used * 2L); 1666 1667 fprintf(out, PREF "ac_uint4 _uckdcmp_nodes[] = {"); 1668 1669 if (kdecomps_used) { 1670 /* 1671 * Write the list of kdecomp nodes. 1672 */ 1673 for (i = idx = 0; i < kdecomps_used; i++) { 1674 fprintf(out, "\n\t0x%08lx, 0x%08lx,", 1675 (unsigned long) kdecomps[i].code, (unsigned long) idx); 1676 idx += kdecomps[i].used; 1677 } 1678 1679 /* 1680 * Write the sentinel index as the last decomp node. 1681 */ 1682 fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx); 1683 1684 fprintf(out, PREF "ac_uint4 _uckdcmp_decomp[] = {"); 1685 1686 /* 1687 * Write the decompositions themselves. 1688 */ 1689 k = 0; 1690 for (i = 0; i < kdecomps_used; i++) 1691 for (j=0; j<kdecomps[i].used; j++) { 1692 if (k) fprintf(out, ","); 1693 if (!(k&3)) fprintf(out,"\n\t"); 1694 else fprintf(out, " "); 1695 k++; 1696 fprintf(out, "0x%08lx", (unsigned long) kdecomps[i].decomp[j]); 1697 } 1698 fprintf(out, "\n};\n\n"); 1699 } 1700#else 1701 /* 1702 * Open the kdecomp.dat file. 1703 */ 1704 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "kdecomp.dat", opath); 1705 if ((out = fopen(path, "wb")) == 0) 1706 return; 1707 1708 hdr[1] = kdecomps_used; 1709 1710 /* 1711 * Write the header. 1712 */ 1713 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1714 1715 /* 1716 * Write a temporary byte count which will be calculated as the 1717 * decompositions are written out. 1718 */ 1719 bytes = 0; 1720 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1721 1722 if (kdecomps_used) { 1723 /* 1724 * Write the list of kdecomp nodes. 1725 */ 1726 for (i = idx = 0; i < kdecomps_used; i++) { 1727 fwrite((char *) &kdecomps[i].code, sizeof(ac_uint4), 1, out); 1728 fwrite((char *) &idx, sizeof(ac_uint4), 1, out); 1729 idx += kdecomps[i].used; 1730 } 1731 1732 /* 1733 * Write the sentinel index as the last decomp node. 1734 */ 1735 fwrite((char *) &idx, sizeof(ac_uint4), 1, out); 1736 1737 /* 1738 * Write the decompositions themselves. 1739 */ 1740 for (i = 0; i < kdecomps_used; i++) 1741 fwrite((char *) kdecomps[i].decomp, sizeof(ac_uint4), 1742 kdecomps[i].used, out); 1743 1744 /* 1745 * Seek back to the beginning and write the byte count. 1746 */ 1747 bytes = (sizeof(ac_uint4) * idx) + 1748 (sizeof(ac_uint4) * ((hdr[1] << 1) + 1)); 1749 fseek(out, sizeof(ac_uint2) << 1, 0L); 1750 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1751 1752 fclose(out); 1753 } 1754#endif 1755 1756 /***************************************************************** 1757 * 1758 * Generate the combining class data. 1759 * 1760 *****************************************************************/ 1761#ifdef HARDCODE_DATA 1762 fprintf(out, PREF "ac_uint4 _uccmcl_size = %ld;\n\n", (long) ccl_used); 1763 1764 fprintf(out, PREF "ac_uint4 _uccmcl_nodes[] = {"); 1765 1766 if (ccl_used > 0) { 1767 /* 1768 * Write the combining class ranges out. 1769 */ 1770 for (i = 0; i<ccl_used; i++) { 1771 if (i) fprintf(out, ","); 1772 if (!(i&3)) fprintf(out, "\n\t"); 1773 else fprintf(out, " "); 1774 fprintf(out, "0x%08lx", (unsigned long) ccl[i]); 1775 } 1776 } else { 1777 fprintf(out, "\t0"); 1778 } 1779 fprintf(out, "\n};\n\n"); 1780#else 1781 /* 1782 * Open the cmbcl.dat file. 1783 */ 1784 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "cmbcl.dat", opath); 1785 if ((out = fopen(path, "wb")) == 0) 1786 return; 1787 1788 /* 1789 * Set the number of ranges used. Each range has a combining class which 1790 * means each entry is a 3-tuple. 1791 */ 1792 hdr[1] = ccl_used / 3; 1793 1794 /* 1795 * Write the header. 1796 */ 1797 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1798 1799 /* 1800 * Write out the byte count to maintain header size. 1801 */ 1802 bytes = ccl_used * sizeof(ac_uint4); 1803 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1804 1805 if (ccl_used > 0) 1806 /* 1807 * Write the combining class ranges out. 1808 */ 1809 fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out); 1810 1811 fclose(out); 1812#endif 1813 1814 /***************************************************************** 1815 * 1816 * Generate the number data. 1817 * 1818 *****************************************************************/ 1819 1820#if HARDCODE_DATA 1821 fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n", 1822 (unsigned long)ncodes_used<<1); 1823 1824 fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {"); 1825 1826 /* 1827 * Now, if number mappings exist, write them out. 1828 */ 1829 if (ncodes_used > 0) { 1830 for (i = 0; i<ncodes_used; i++) { 1831 if (i) fprintf(out, ","); 1832 if (!(i&1)) fprintf(out, "\n\t"); 1833 else fprintf(out, " "); 1834 fprintf(out, "0x%08lx, 0x%08lx", 1835 (unsigned long) ncodes[i].code, (unsigned long) ncodes[i].idx); 1836 } 1837 fprintf(out, "\n};\n\n"); 1838 1839 fprintf(out, PREF "short _ucnum_vals[] = {"); 1840 for (i = 0; i<nums_used; i++) { 1841 if (i) fprintf(out, ","); 1842 if (!(i&3)) fprintf(out, "\n\t"); 1843 else fprintf(out, " "); 1844 if (nums[i].numerator < 0) { 1845 fprintf(out, "%6d, 0x%04x", 1846 nums[i].numerator, nums[i].denominator); 1847 } else { 1848 fprintf(out, "0x%04x, 0x%04x", 1849 nums[i].numerator, nums[i].denominator); 1850 } 1851 } 1852 fprintf(out, "\n};\n\n"); 1853 } 1854#else 1855 /* 1856 * Open the num.dat file. 1857 */ 1858 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "num.dat", opath); 1859 if ((out = fopen(path, "wb")) == 0) 1860 return; 1861 1862 /* 1863 * The count part of the header will be the total number of codes that 1864 * have numbers. 1865 */ 1866 hdr[1] = (ac_uint2) (ncodes_used << 1); 1867 bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t)); 1868 1869 /* 1870 * Write the header. 1871 */ 1872 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1873 1874 /* 1875 * Write out the byte count to maintain header size. 1876 */ 1877 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1878 1879 /* 1880 * Now, if number mappings exist, write them out. 1881 */ 1882 if (ncodes_used > 0) { 1883 fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out); 1884 fwrite((char *) nums, sizeof(_num_t), nums_used, out); 1885 } 1886#endif 1887 1888 fclose(out); 1889} 1890 1891static void 1892usage(char *prog) 1893{ 1894 fprintf(stderr, 1895 "Usage: %s [-o output-directory|-x composition-exclusions]", prog); 1896 fprintf(stderr, " datafile1 datafile2 ...\n\n"); 1897 fprintf(stderr, 1898 "-o output-directory\n\t\tWrite the output files to a different"); 1899 fprintf(stderr, " directory (default: .).\n"); 1900 fprintf(stderr, 1901 "-x composition-exclusion\n\t\tFile of composition codes"); 1902 fprintf(stderr, " that should be excluded.\n"); 1903 exit(1); 1904} 1905 1906int 1907main(int argc, char *argv[]) 1908{ 1909 FILE *in; 1910 char *prog, *opath; 1911 1912 prog = lutil_progname( "ucgendat", argc, argv ); 1913 1914 opath = 0; 1915 in = stdin; 1916 1917 argc--; 1918 argv++; 1919 1920 while (argc > 0) { 1921 if (argv[0][0] == '-') { 1922 switch (argv[0][1]) { 1923 case 'o': 1924 argc--; 1925 argv++; 1926 opath = argv[0]; 1927 break; 1928 case 'x': 1929 argc--; 1930 argv++; 1931 if ((in = fopen(argv[0], "r")) == 0) 1932 fprintf(stderr, 1933 "%s: unable to open composition exclusion file %s\n", 1934 prog, argv[0]); 1935 else { 1936 read_compexdata(in); 1937 fclose(in); 1938 in = 0; 1939 } 1940 break; 1941 default: 1942 usage(prog); 1943 } 1944 } else { 1945 if (in != stdin && in != NULL) 1946 fclose(in); 1947 if ((in = fopen(argv[0], "r")) == 0) 1948 fprintf(stderr, "%s: unable to open ctype file %s\n", 1949 prog, argv[0]); 1950 else { 1951 read_cdata(in); 1952 fclose(in); 1953 in = 0; 1954 } 1955 } 1956 argc--; 1957 argv++; 1958 } 1959 1960 if (opath == 0) 1961 opath = "."; 1962 write_cdata(opath); 1963 1964 return 0; 1965} 1966