1/* $OpenLDAP$ */ 2/* This work is part of OpenLDAP Software <http://www.openldap.org/>. 3 * 4 * Copyright 1998-2011 The OpenLDAP Foundation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted only as authorized by the OpenLDAP 9 * Public License. 10 * 11 * A copy of this license is available in file LICENSE in the 12 * top-level directory of the distribution or, alternatively, at 13 * <http://www.OpenLDAP.org/license.html>. 14 */ 15/* Copyright 2001 Computing Research Labs, New Mexico State University 16 * 17 * Permission is hereby granted, free of charge, to any person obtaining a 18 * copy of this software and associated documentation files (the "Software"), 19 * to deal in the Software without restriction, including without limitation 20 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 21 * and/or sell copies of the Software, and to permit persons to whom the 22 * Software is furnished to do so, subject to the following conditions: 23 * 24 * The above copyright notice and this permission notice shall be included in 25 * all copies or substantial portions of the Software. 26 * 27 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 28 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 29 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 30 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 31 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 32 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 33 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 34 */ 35/* $Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp $" */ 36 37#include "portable.h" 38#include "ldap_config.h" 39 40#include <stdio.h> 41#include <ac/ctype.h> 42#include <ac/stdlib.h> 43#include <ac/string.h> 44#include <ac/unistd.h> 45 46#include <ac/bytes.h> 47 48#include <lutil.h> 49 50#ifndef HARDCODE_DATA 51#define HARDCODE_DATA 1 52#endif 53 54#undef ishdigit 55#define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\ 56 ((cc) >= 'A' && (cc) <= 'F') ||\ 57 ((cc) >= 'a' && (cc) <= 'f')) 58 59/* 60 * A header written to the output file with the byte-order-mark and the number 61 * of property nodes. 62 */ 63static ac_uint2 hdr[2] = {0xfeff, 0}; 64 65#define NUMPROPS 50 66#define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3))) 67 68typedef struct { 69 char *name; 70 int len; 71} _prop_t; 72 73/* 74 * List of properties expected to be found in the Unicode Character Database 75 * including some implementation specific properties. 76 * 77 * The implementation specific properties are: 78 * Cm = Composed (can be decomposed) 79 * Nb = Non-breaking 80 * Sy = Symmetric (has left and right forms) 81 * Hd = Hex digit 82 * Qm = Quote marks 83 * Mr = Mirroring 84 * Ss = Space, other 85 * Cp = Defined character 86 */ 87static _prop_t props[NUMPROPS] = { 88 {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2}, 89 {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2}, 90 {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2}, 91 {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2}, 92 {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L", 1}, {"R", 1}, 93 {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B", 1}, 94 {"S", 1}, {"WS", 2}, {"ON", 2}, 95 {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2}, 96 {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2} 97}; 98 99typedef struct { 100 ac_uint4 *ranges; 101 ac_uint2 used; 102 ac_uint2 size; 103} _ranges_t; 104 105static _ranges_t proptbl[NUMPROPS]; 106 107/* 108 * Make sure this array is sized to be on a 4-byte boundary at compile time. 109 */ 110static ac_uint2 propcnt[NEEDPROPS]; 111 112/* 113 * Array used to collect a decomposition before adding it to the decomposition 114 * table. 115 */ 116static ac_uint4 dectmp[64]; 117static ac_uint4 dectmp_size; 118 119typedef struct { 120 ac_uint4 code; 121 ac_uint2 size; 122 ac_uint2 used; 123 ac_uint4 *decomp; 124} _decomp_t; 125 126/* 127 * List of decomposition. Created and expanded in order as the characters are 128 * encountered. First list contains canonical mappings, second also includes 129 * compatibility mappings. 130 */ 131static _decomp_t *decomps; 132static ac_uint4 decomps_used; 133static ac_uint4 decomps_size; 134 135static _decomp_t *kdecomps; 136static ac_uint4 kdecomps_used; 137static ac_uint4 kdecomps_size; 138 139/* 140 * Composition exclusion table stuff. 141 */ 142#define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31))) 143#define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31))) 144static ac_uint4 compexs[8192]; 145 146/* 147 * Struct for holding a composition pair, and array of composition pairs 148 */ 149typedef struct { 150 ac_uint4 comp; 151 ac_uint4 count; 152 ac_uint4 code1; 153 ac_uint4 code2; 154} _comp_t; 155 156static _comp_t *comps; 157static ac_uint4 comps_used; 158 159/* 160 * Types and lists for handling lists of case mappings. 161 */ 162typedef struct { 163 ac_uint4 key; 164 ac_uint4 other1; 165 ac_uint4 other2; 166} _case_t; 167 168static _case_t *upper; 169static _case_t *lower; 170static _case_t *title; 171static ac_uint4 upper_used; 172static ac_uint4 upper_size; 173static ac_uint4 lower_used; 174static ac_uint4 lower_size; 175static ac_uint4 title_used; 176static ac_uint4 title_size; 177 178/* 179 * Array used to collect case mappings before adding them to a list. 180 */ 181static ac_uint4 cases[3]; 182 183/* 184 * An array to hold ranges for combining classes. 185 */ 186static ac_uint4 *ccl; 187static ac_uint4 ccl_used; 188static ac_uint4 ccl_size; 189 190/* 191 * Structures for handling numbers. 192 */ 193typedef struct { 194 ac_uint4 code; 195 ac_uint4 idx; 196} _codeidx_t; 197 198typedef struct { 199 short numerator; 200 short denominator; 201} _num_t; 202 203/* 204 * Arrays to hold the mapping of codes to numbers. 205 */ 206static _codeidx_t *ncodes; 207static ac_uint4 ncodes_used; 208static ac_uint4 ncodes_size; 209 210static _num_t *nums; 211static ac_uint4 nums_used; 212static ac_uint4 nums_size; 213 214/* 215 * Array for holding numbers. 216 */ 217static _num_t *nums; 218static ac_uint4 nums_used; 219static ac_uint4 nums_size; 220 221static void 222add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2) 223{ 224 int i, j, k, len; 225 _ranges_t *rlp; 226 char *name; 227 228 for (k = 0; k < 2; k++) { 229 if (k == 0) { 230 name = p1; 231 len = 2; 232 } else { 233 if (p2 == 0) 234 break; 235 236 name = p2; 237 len = 1; 238 } 239 240 for (i = 0; i < NUMPROPS; i++) { 241 if (props[i].len == len && memcmp(props[i].name, name, len) == 0) 242 break; 243 } 244 245 if (i == NUMPROPS) 246 continue; 247 248 rlp = &proptbl[i]; 249 250 /* 251 * Resize the range list if necessary. 252 */ 253 if (rlp->used == rlp->size) { 254 if (rlp->size == 0) 255 rlp->ranges = (ac_uint4 *) 256 malloc(sizeof(ac_uint4) << 3); 257 else 258 rlp->ranges = (ac_uint4 *) 259 realloc((char *) rlp->ranges, 260 sizeof(ac_uint4) * (rlp->size + 8)); 261 rlp->size += 8; 262 } 263 264 /* 265 * If this is the first code for this property list, just add it 266 * and return. 267 */ 268 if (rlp->used == 0) { 269 rlp->ranges[0] = start; 270 rlp->ranges[1] = end; 271 rlp->used += 2; 272 continue; 273 } 274 275 /* 276 * Optimize the case of adding the range to the end. 277 */ 278 j = rlp->used - 1; 279 if (start > rlp->ranges[j]) { 280 j = rlp->used; 281 rlp->ranges[j++] = start; 282 rlp->ranges[j++] = end; 283 rlp->used = j; 284 continue; 285 } 286 287 /* 288 * Need to locate the insertion point. 289 */ 290 for (i = 0; 291 i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ; 292 293 /* 294 * If the start value lies in the current range, then simply set the 295 * new end point of the range to the end value passed as a parameter. 296 */ 297 if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) { 298 rlp->ranges[i + 1] = end; 299 return; 300 } 301 302 /* 303 * Shift following values up by two. 304 */ 305 for (j = rlp->used; j > i; j -= 2) { 306 rlp->ranges[j] = rlp->ranges[j - 2]; 307 rlp->ranges[j + 1] = rlp->ranges[j - 1]; 308 } 309 310 /* 311 * Add the new range at the insertion point. 312 */ 313 rlp->ranges[i] = start; 314 rlp->ranges[i + 1] = end; 315 rlp->used += 2; 316 } 317} 318 319static void 320ordered_range_insert(ac_uint4 c, char *name, int len) 321{ 322 int i, j; 323 ac_uint4 s, e; 324 _ranges_t *rlp; 325 326 if (len == 0) 327 return; 328 329 /* 330 * Deal with directionality codes introduced in Unicode 3.0. 331 */ 332 if ((len == 2 && memcmp(name, "BN", 2) == 0) || 333 (len == 3 && 334 (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 || 335 memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 || 336 memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0))) { 337 /* 338 * Mark all of these as Other Neutral to preserve compatibility with 339 * older versions. 340 */ 341 len = 2; 342 name = "ON"; 343 } 344 345 for (i = 0; i < NUMPROPS; i++) { 346 if (props[i].len == len && memcmp(props[i].name, name, len) == 0) 347 break; 348 } 349 350 if (i == NUMPROPS) 351 return; 352 353 /* 354 * Have a match, so insert the code in order. 355 */ 356 rlp = &proptbl[i]; 357 358 /* 359 * Resize the range list if necessary. 360 */ 361 if (rlp->used == rlp->size) { 362 if (rlp->size == 0) 363 rlp->ranges = (ac_uint4 *) 364 malloc(sizeof(ac_uint4) << 3); 365 else 366 rlp->ranges = (ac_uint4 *) 367 realloc((char *) rlp->ranges, 368 sizeof(ac_uint4) * (rlp->size + 8)); 369 rlp->size += 8; 370 } 371 372 /* 373 * If this is the first code for this property list, just add it 374 * and return. 375 */ 376 if (rlp->used == 0) { 377 rlp->ranges[0] = rlp->ranges[1] = c; 378 rlp->used += 2; 379 return; 380 } 381 382 /* 383 * Optimize the cases of extending the last range and adding new ranges to 384 * the end. 385 */ 386 j = rlp->used - 1; 387 e = rlp->ranges[j]; 388 s = rlp->ranges[j - 1]; 389 390 if (c == e + 1) { 391 /* 392 * Extend the last range. 393 */ 394 rlp->ranges[j] = c; 395 return; 396 } 397 398 if (c > e + 1) { 399 /* 400 * Start another range on the end. 401 */ 402 j = rlp->used; 403 rlp->ranges[j] = rlp->ranges[j + 1] = c; 404 rlp->used += 2; 405 return; 406 } 407 408 if (c >= s) 409 /* 410 * The code is a duplicate of a code in the last range, so just return. 411 */ 412 return; 413 414 /* 415 * The code should be inserted somewhere before the last range in the 416 * list. Locate the insertion point. 417 */ 418 for (i = 0; 419 i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ; 420 421 s = rlp->ranges[i]; 422 e = rlp->ranges[i + 1]; 423 424 if (c == e + 1) 425 /* 426 * Simply extend the current range. 427 */ 428 rlp->ranges[i + 1] = c; 429 else if (c < s) { 430 /* 431 * Add a new entry before the current location. Shift all entries 432 * before the current one up by one to make room. 433 */ 434 for (j = rlp->used; j > i; j -= 2) { 435 rlp->ranges[j] = rlp->ranges[j - 2]; 436 rlp->ranges[j + 1] = rlp->ranges[j - 1]; 437 } 438 rlp->ranges[i] = rlp->ranges[i + 1] = c; 439 440 rlp->used += 2; 441 } 442} 443 444static void 445add_decomp(ac_uint4 code, short compat) 446{ 447 ac_uint4 i, j, size; 448 _decomp_t **pdecomps; 449 ac_uint4 *pdecomps_used; 450 ac_uint4 *pdecomps_size; 451 452 if (compat) { 453 pdecomps = &kdecomps; 454 pdecomps_used = &kdecomps_used; 455 pdecomps_size = &kdecomps_size; 456 } else { 457 pdecomps = &decomps; 458 pdecomps_used = &decomps_used; 459 pdecomps_size = &decomps_size; 460 } 461 462 /* 463 * Add the code to the composite property. 464 */ 465 if (!compat) { 466 ordered_range_insert(code, "Cm", 2); 467 } 468 469 /* 470 * Locate the insertion point for the code. 471 */ 472 for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ; 473 474 /* 475 * Allocate space for a new decomposition. 476 */ 477 if (*pdecomps_used == *pdecomps_size) { 478 if (*pdecomps_size == 0) 479 *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3); 480 else 481 *pdecomps = (_decomp_t *) 482 realloc((char *) *pdecomps, 483 sizeof(_decomp_t) * (*pdecomps_size + 8)); 484 (void) memset((char *) (*pdecomps + *pdecomps_size), '\0', 485 sizeof(_decomp_t) << 3); 486 *pdecomps_size += 8; 487 } 488 489 if (i < *pdecomps_used && code != (*pdecomps)[i].code) { 490 /* 491 * Shift the decomps up by one if the codes don't match. 492 */ 493 for (j = *pdecomps_used; j > i; j--) 494 (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1], 495 sizeof(_decomp_t)); 496 } 497 498 /* 499 * Insert or replace a decomposition. 500 */ 501 size = dectmp_size + (4 - (dectmp_size & 3)); 502 if ((*pdecomps)[i].size < size) { 503 if ((*pdecomps)[i].size == 0) 504 (*pdecomps)[i].decomp = (ac_uint4 *) 505 malloc(sizeof(ac_uint4) * size); 506 else 507 (*pdecomps)[i].decomp = (ac_uint4 *) 508 realloc((char *) (*pdecomps)[i].decomp, 509 sizeof(ac_uint4) * size); 510 (*pdecomps)[i].size = size; 511 } 512 513 if ((*pdecomps)[i].code != code) 514 (*pdecomps_used)++; 515 516 (*pdecomps)[i].code = code; 517 (*pdecomps)[i].used = dectmp_size; 518 (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp, 519 sizeof(ac_uint4) * dectmp_size); 520 521 /* 522 * NOTICE: This needs changing later so it is more general than simply 523 * pairs. This calculation is done here to simplify allocation elsewhere. 524 */ 525 if (!compat && dectmp_size == 2) 526 comps_used++; 527} 528 529static void 530add_title(ac_uint4 code) 531{ 532 ac_uint4 i, j; 533 534 /* 535 * Always map the code to itself. 536 */ 537 cases[2] = code; 538 539 if (title_used == title_size) { 540 if (title_size == 0) 541 title = (_case_t *) malloc(sizeof(_case_t) << 3); 542 else 543 title = (_case_t *) realloc((char *) title, 544 sizeof(_case_t) * (title_size + 8)); 545 title_size += 8; 546 } 547 548 /* 549 * Locate the insertion point. 550 */ 551 for (i = 0; i < title_used && code > title[i].key; i++) ; 552 553 if (i < title_used) { 554 /* 555 * Shift the array up by one. 556 */ 557 for (j = title_used; j > i; j--) 558 (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1], 559 sizeof(_case_t)); 560 } 561 562 title[i].key = cases[2]; /* Title */ 563 title[i].other1 = cases[0]; /* Upper */ 564 title[i].other2 = cases[1]; /* Lower */ 565 566 title_used++; 567} 568 569static void 570add_upper(ac_uint4 code) 571{ 572 ac_uint4 i, j; 573 574 /* 575 * Always map the code to itself. 576 */ 577 cases[0] = code; 578 579 /* 580 * If the title case character is not present, then make it the same as 581 * the upper case. 582 */ 583 if (cases[2] == 0) 584 cases[2] = code; 585 586 if (upper_used == upper_size) { 587 if (upper_size == 0) 588 upper = (_case_t *) malloc(sizeof(_case_t) << 3); 589 else 590 upper = (_case_t *) realloc((char *) upper, 591 sizeof(_case_t) * (upper_size + 8)); 592 upper_size += 8; 593 } 594 595 /* 596 * Locate the insertion point. 597 */ 598 for (i = 0; i < upper_used && code > upper[i].key; i++) ; 599 600 if (i < upper_used) { 601 /* 602 * Shift the array up by one. 603 */ 604 for (j = upper_used; j > i; j--) 605 (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1], 606 sizeof(_case_t)); 607 } 608 609 upper[i].key = cases[0]; /* Upper */ 610 upper[i].other1 = cases[1]; /* Lower */ 611 upper[i].other2 = cases[2]; /* Title */ 612 613 upper_used++; 614} 615 616static void 617add_lower(ac_uint4 code) 618{ 619 ac_uint4 i, j; 620 621 /* 622 * Always map the code to itself. 623 */ 624 cases[1] = code; 625 626 /* 627 * If the title case character is empty, then make it the same as the 628 * upper case. 629 */ 630 if (cases[2] == 0) 631 cases[2] = cases[0]; 632 633 if (lower_used == lower_size) { 634 if (lower_size == 0) 635 lower = (_case_t *) malloc(sizeof(_case_t) << 3); 636 else 637 lower = (_case_t *) realloc((char *) lower, 638 sizeof(_case_t) * (lower_size + 8)); 639 lower_size += 8; 640 } 641 642 /* 643 * Locate the insertion point. 644 */ 645 for (i = 0; i < lower_used && code > lower[i].key; i++) ; 646 647 if (i < lower_used) { 648 /* 649 * Shift the array up by one. 650 */ 651 for (j = lower_used; j > i; j--) 652 (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1], 653 sizeof(_case_t)); 654 } 655 656 lower[i].key = cases[1]; /* Lower */ 657 lower[i].other1 = cases[0]; /* Upper */ 658 lower[i].other2 = cases[2]; /* Title */ 659 660 lower_used++; 661} 662 663static void 664ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code) 665{ 666 ac_uint4 i, j; 667 668 if (ccl_used == ccl_size) { 669 if (ccl_size == 0) 670 ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24); 671 else 672 ccl = (ac_uint4 *) 673 realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24)); 674 ccl_size += 24; 675 } 676 677 /* 678 * Optimize adding the first item. 679 */ 680 if (ccl_used == 0) { 681 ccl[0] = ccl[1] = c; 682 ccl[2] = ccl_code; 683 ccl_used += 3; 684 return; 685 } 686 687 /* 688 * Handle the special case of extending the range on the end. This 689 * requires that the combining class codes are the same. 690 */ 691 if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) { 692 ccl[ccl_used - 2] = c; 693 return; 694 } 695 696 /* 697 * Handle the special case of adding another range on the end. 698 */ 699 if (c > ccl[ccl_used - 2] + 1 || 700 (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) { 701 ccl[ccl_used++] = c; 702 ccl[ccl_used++] = c; 703 ccl[ccl_used++] = ccl_code; 704 return; 705 } 706 707 /* 708 * Locate either the insertion point or range for the code. 709 */ 710 for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ; 711 712 if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) { 713 /* 714 * Extend an existing range. 715 */ 716 ccl[i + 1] = c; 717 return; 718 } else if (c < ccl[i]) { 719 /* 720 * Start a new range before the current location. 721 */ 722 for (j = ccl_used; j > i; j -= 3) { 723 ccl[j] = ccl[j - 3]; 724 ccl[j - 1] = ccl[j - 4]; 725 ccl[j - 2] = ccl[j - 5]; 726 } 727 ccl[i] = ccl[i + 1] = c; 728 ccl[i + 2] = ccl_code; 729 } 730} 731 732/* 733 * Adds a number if it does not already exist and returns an index value 734 * multiplied by 2. 735 */ 736static ac_uint4 737make_number(short num, short denom) 738{ 739 ac_uint4 n; 740 741 /* 742 * Determine if the number already exists. 743 */ 744 for (n = 0; n < nums_used; n++) { 745 if (nums[n].numerator == num && nums[n].denominator == denom) 746 return n << 1; 747 } 748 749 if (nums_used == nums_size) { 750 if (nums_size == 0) 751 nums = (_num_t *) malloc(sizeof(_num_t) << 3); 752 else 753 nums = (_num_t *) realloc((char *) nums, 754 sizeof(_num_t) * (nums_size + 8)); 755 nums_size += 8; 756 } 757 758 n = nums_used++; 759 nums[n].numerator = num; 760 nums[n].denominator = denom; 761 762 return n << 1; 763} 764 765static void 766add_number(ac_uint4 code, short num, short denom) 767{ 768 ac_uint4 i, j; 769 770 /* 771 * Insert the code in order. 772 */ 773 for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ; 774 775 /* 776 * Handle the case of the codes matching and simply replace the number 777 * that was there before. 778 */ 779 if (i < ncodes_used && code == ncodes[i].code) { 780 ncodes[i].idx = make_number(num, denom); 781 return; 782 } 783 784 /* 785 * Resize the array if necessary. 786 */ 787 if (ncodes_used == ncodes_size) { 788 if (ncodes_size == 0) 789 ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3); 790 else 791 ncodes = (_codeidx_t *) 792 realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8)); 793 794 ncodes_size += 8; 795 } 796 797 /* 798 * Shift things around to insert the code if necessary. 799 */ 800 if (i < ncodes_used) { 801 for (j = ncodes_used; j > i; j--) { 802 ncodes[j].code = ncodes[j - 1].code; 803 ncodes[j].idx = ncodes[j - 1].idx; 804 } 805 } 806 ncodes[i].code = code; 807 ncodes[i].idx = make_number(num, denom); 808 809 ncodes_used++; 810} 811 812/* 813 * This routine assumes that the line is a valid Unicode Character Database 814 * entry. 815 */ 816static void 817read_cdata(FILE *in) 818{ 819 ac_uint4 i, lineno, skip, code, ccl_code; 820 short wnum, neg, number[2], compat; 821 char line[512], *s, *e; 822 823 lineno = skip = 0; 824 while (fgets(line, sizeof(line), in)) { 825 if( (s=strchr(line, '\n')) ) *s = '\0'; 826 lineno++; 827 828 /* 829 * Skip blank lines and lines that start with a '#'. 830 */ 831 if (line[0] == 0 || line[0] == '#') 832 continue; 833 834 /* 835 * If lines need to be skipped, do it here. 836 */ 837 if (skip) { 838 skip--; 839 continue; 840 } 841 842 /* 843 * Collect the code. The code can be up to 6 hex digits in length to 844 * allow surrogates to be specified. 845 */ 846 for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) { 847 code <<= 4; 848 if (*s >= '0' && *s <= '9') 849 code += *s - '0'; 850 else if (*s >= 'A' && *s <= 'F') 851 code += (*s - 'A') + 10; 852 else if (*s >= 'a' && *s <= 'f') 853 code += (*s - 'a') + 10; 854 } 855 856 /* 857 * Handle the following special cases: 858 * 1. 4E00-9FA5 CJK Ideographs. 859 * 2. AC00-D7A3 Hangul Syllables. 860 * 3. D800-DFFF Surrogates. 861 * 4. E000-F8FF Private Use Area. 862 * 5. F900-FA2D Han compatibility. 863 * ...Plus additional ranges in newer Unicode versions... 864 */ 865 switch (code) { 866 case 0x3400: 867 /* CJK Ideograph Extension A */ 868 add_range(0x3400, 0x4db5, "Lo", "L"); 869 870 add_range(0x3400, 0x4db5, "Cp", 0); 871 872 skip = 1; 873 break; 874 case 0x4e00: 875 /* 876 * The Han ideographs. 877 */ 878 add_range(0x4e00, 0x9fff, "Lo", "L"); 879 880 /* 881 * Add the characters to the defined category. 882 */ 883 add_range(0x4e00, 0x9fa5, "Cp", 0); 884 885 skip = 1; 886 break; 887 case 0xac00: 888 /* 889 * The Hangul syllables. 890 */ 891 add_range(0xac00, 0xd7a3, "Lo", "L"); 892 893 /* 894 * Add the characters to the defined category. 895 */ 896 add_range(0xac00, 0xd7a3, "Cp", 0); 897 898 skip = 1; 899 break; 900 case 0xd800: 901 /* 902 * Make a range of all surrogates and assume some default 903 * properties. 904 */ 905 add_range(0x010000, 0x10ffff, "Cs", "L"); 906 skip = 5; 907 break; 908 case 0xe000: 909 /* 910 * The Private Use area. Add with a default set of properties. 911 */ 912 add_range(0xe000, 0xf8ff, "Co", "L"); 913 skip = 1; 914 break; 915 case 0xf900: 916 /* 917 * The CJK compatibility area. 918 */ 919 add_range(0xf900, 0xfaff, "Lo", "L"); 920 921 /* 922 * Add the characters to the defined category. 923 */ 924 add_range(0xf900, 0xfaff, "Cp", 0); 925 926 skip = 1; 927 break; 928 case 0x20000: 929 /* CJK Ideograph Extension B */ 930 add_range(0x20000, 0x2a6d6, "Lo", "L"); 931 932 add_range(0x20000, 0x2a6d6, "Cp", 0); 933 934 skip = 1; 935 break; 936 case 0xf0000: 937 /* Plane 15 private use */ 938 add_range(0xf0000, 0xffffd, "Co", "L"); 939 skip = 1; 940 break; 941 942 case 0x100000: 943 /* Plane 16 private use */ 944 add_range(0x100000, 0x10fffd, "Co", "L"); 945 skip = 1; 946 break; 947 } 948 949 if (skip) 950 continue; 951 952 /* 953 * Add the code to the defined category. 954 */ 955 ordered_range_insert(code, "Cp", 2); 956 957 /* 958 * Locate the first character property field. 959 */ 960 for (i = 0; *s != 0 && i < 2; s++) { 961 if (*s == ';') 962 i++; 963 } 964 for (e = s; *e && *e != ';'; e++) ; 965 966 ordered_range_insert(code, s, e - s); 967 968 /* 969 * Locate the combining class code. 970 */ 971 for (s = e; *s != 0 && i < 3; s++) { 972 if (*s == ';') 973 i++; 974 } 975 976 /* 977 * Convert the combining class code from decimal. 978 */ 979 for (ccl_code = 0, e = s; *e && *e != ';'; e++) 980 ccl_code = (ccl_code * 10) + (*e - '0'); 981 982 /* 983 * Add the code if it not 0. 984 */ 985 if (ccl_code != 0) 986 ordered_ccl_insert(code, ccl_code); 987 988 /* 989 * Locate the second character property field. 990 */ 991 for (s = e; *s != 0 && i < 4; s++) { 992 if (*s == ';') 993 i++; 994 } 995 for (e = s; *e && *e != ';'; e++) ; 996 997 ordered_range_insert(code, s, e - s); 998 999 /* 1000 * Check for a decomposition. 1001 */ 1002 s = ++e; 1003 if (*s != ';') { 1004 compat = *s == '<'; 1005 if (compat) { 1006 /* 1007 * Skip compatibility formatting tag. 1008 */ 1009 while (*s++ != '>'); 1010 } 1011 /* 1012 * Collect the codes of the decomposition. 1013 */ 1014 for (dectmp_size = 0; *s != ';'; ) { 1015 /* 1016 * Skip all leading non-hex digits. 1017 */ 1018 while (!ishdigit(*s)) 1019 s++; 1020 1021 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) { 1022 dectmp[dectmp_size] <<= 4; 1023 if (*s >= '0' && *s <= '9') 1024 dectmp[dectmp_size] += *s - '0'; 1025 else if (*s >= 'A' && *s <= 'F') 1026 dectmp[dectmp_size] += (*s - 'A') + 10; 1027 else if (*s >= 'a' && *s <= 'f') 1028 dectmp[dectmp_size] += (*s - 'a') + 10; 1029 } 1030 dectmp_size++; 1031 } 1032 1033 /* 1034 * If there are any codes in the temporary decomposition array, 1035 * then add the character with its decomposition. 1036 */ 1037 if (dectmp_size > 0) { 1038 if (!compat) { 1039 add_decomp(code, 0); 1040 } 1041 add_decomp(code, 1); 1042 } 1043 } 1044 1045 /* 1046 * Skip to the number field. 1047 */ 1048 for (i = 0; i < 3 && *s; s++) { 1049 if (*s == ';') 1050 i++; 1051 } 1052 1053 /* 1054 * Scan the number in. 1055 */ 1056 number[0] = number[1] = 0; 1057 for (e = s, neg = wnum = 0; *e && *e != ';'; e++) { 1058 if (*e == '-') { 1059 neg = 1; 1060 continue; 1061 } 1062 1063 if (*e == '/') { 1064 /* 1065 * Move the the denominator of the fraction. 1066 */ 1067 if (neg) 1068 number[wnum] *= -1; 1069 neg = 0; 1070 e++; 1071 wnum++; 1072 } 1073 number[wnum] = (number[wnum] * 10) + (*e - '0'); 1074 } 1075 1076 if (e > s) { 1077 /* 1078 * Adjust the denominator in case of integers and add the number. 1079 */ 1080 if (wnum == 0) 1081 number[1] = 1; 1082 1083 add_number(code, number[0], number[1]); 1084 } 1085 1086 /* 1087 * Skip to the start of the possible case mappings. 1088 */ 1089 for (s = e, i = 0; i < 4 && *s; s++) { 1090 if (*s == ';') 1091 i++; 1092 } 1093 1094 /* 1095 * Collect the case mappings. 1096 */ 1097 cases[0] = cases[1] = cases[2] = 0; 1098 for (i = 0; i < 3; i++) { 1099 while (ishdigit(*s)) { 1100 cases[i] <<= 4; 1101 if (*s >= '0' && *s <= '9') 1102 cases[i] += *s - '0'; 1103 else if (*s >= 'A' && *s <= 'F') 1104 cases[i] += (*s - 'A') + 10; 1105 else if (*s >= 'a' && *s <= 'f') 1106 cases[i] += (*s - 'a') + 10; 1107 s++; 1108 } 1109 if (*s == ';') 1110 s++; 1111 } 1112 if (cases[0] && cases[1]) 1113 /* 1114 * Add the upper and lower mappings for a title case character. 1115 */ 1116 add_title(code); 1117 else if (cases[1]) 1118 /* 1119 * Add the lower and title case mappings for the upper case 1120 * character. 1121 */ 1122 add_upper(code); 1123 else if (cases[0]) 1124 /* 1125 * Add the upper and title case mappings for the lower case 1126 * character. 1127 */ 1128 add_lower(code); 1129 } 1130} 1131 1132static _decomp_t * 1133find_decomp(ac_uint4 code, short compat) 1134{ 1135 long l, r, m; 1136 _decomp_t *decs; 1137 1138 l = 0; 1139 r = (compat ? kdecomps_used : decomps_used) - 1; 1140 decs = compat ? kdecomps : decomps; 1141 while (l <= r) { 1142 m = (l + r) >> 1; 1143 if (code > decs[m].code) 1144 l = m + 1; 1145 else if (code < decs[m].code) 1146 r = m - 1; 1147 else 1148 return &decs[m]; 1149 } 1150 return 0; 1151} 1152 1153static void 1154decomp_it(_decomp_t *d, short compat) 1155{ 1156 ac_uint4 i; 1157 _decomp_t *dp; 1158 1159 for (i = 0; i < d->used; i++) { 1160 if ((dp = find_decomp(d->decomp[i], compat)) != 0) 1161 decomp_it(dp, compat); 1162 else 1163 dectmp[dectmp_size++] = d->decomp[i]; 1164 } 1165} 1166 1167/* 1168 * Expand all decompositions by recursively decomposing each character 1169 * in the decomposition. 1170 */ 1171static void 1172expand_decomp(void) 1173{ 1174 ac_uint4 i; 1175 1176 for (i = 0; i < decomps_used; i++) { 1177 dectmp_size = 0; 1178 decomp_it(&decomps[i], 0); 1179 if (dectmp_size > 0) 1180 add_decomp(decomps[i].code, 0); 1181 } 1182 1183 for (i = 0; i < kdecomps_used; i++) { 1184 dectmp_size = 0; 1185 decomp_it(&kdecomps[i], 1); 1186 if (dectmp_size > 0) 1187 add_decomp(kdecomps[i].code, 1); 1188 } 1189} 1190 1191static int 1192cmpcomps(const void *v_comp1, const void *v_comp2) 1193{ 1194 const _comp_t *comp1 = v_comp1, *comp2 = v_comp2; 1195 long diff = comp1->code1 - comp2->code1; 1196 1197 if (!diff) 1198 diff = comp1->code2 - comp2->code2; 1199 return (int) diff; 1200} 1201 1202/* 1203 * Load composition exclusion data 1204 */ 1205static void 1206read_compexdata(FILE *in) 1207{ 1208 ac_uint2 i; 1209 ac_uint4 code; 1210 char line[512], *s; 1211 1212 (void) memset((char *) compexs, 0, sizeof(compexs)); 1213 1214 while (fgets(line, sizeof(line), in)) { 1215 if( (s=strchr(line, '\n')) ) *s = '\0'; 1216 /* 1217 * Skip blank lines and lines that start with a '#'. 1218 */ 1219 if (line[0] == 0 || line[0] == '#') 1220 continue; 1221 1222 /* 1223 * Collect the code. Assume max 6 digits 1224 */ 1225 1226 for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) { 1227 if (isspace((unsigned char)*s)) break; 1228 code <<= 4; 1229 if (*s >= '0' && *s <= '9') 1230 code += *s - '0'; 1231 else if (*s >= 'A' && *s <= 'F') 1232 code += (*s - 'A') + 10; 1233 else if (*s >= 'a' && *s <= 'f') 1234 code += (*s - 'a') + 10; 1235 } 1236 COMPEX_SET(code); 1237 } 1238} 1239 1240/* 1241 * Creates array of compositions from decomposition array 1242 */ 1243static void 1244create_comps(void) 1245{ 1246 ac_uint4 i, cu; 1247 1248 comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t)); 1249 1250 for (i = cu = 0; i < decomps_used; i++) { 1251 if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code)) 1252 continue; 1253 comps[cu].comp = decomps[i].code; 1254 comps[cu].count = 2; 1255 comps[cu].code1 = decomps[i].decomp[0]; 1256 comps[cu].code2 = decomps[i].decomp[1]; 1257 cu++; 1258 } 1259 comps_used = cu; 1260 qsort(comps, comps_used, sizeof(_comp_t), cmpcomps); 1261} 1262 1263#if HARDCODE_DATA 1264static void 1265write_case(FILE *out, _case_t *tab, int num, int first) 1266{ 1267 int i; 1268 1269 for (i=0; i<num; i++) { 1270 if (first) first = 0; 1271 else fprintf(out, ","); 1272 fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx", 1273 (unsigned long) tab[i].key, (unsigned long) tab[i].other1, 1274 (unsigned long) tab[i].other2); 1275 } 1276} 1277 1278#define PREF "static const " 1279 1280#endif 1281 1282static void 1283write_cdata(char *opath) 1284{ 1285 FILE *out; 1286 ac_uint4 bytes; 1287 ac_uint4 i, idx, nprops; 1288#if !(HARDCODE_DATA) 1289 ac_uint2 casecnt[2]; 1290#endif 1291 char path[BUFSIZ]; 1292#if HARDCODE_DATA 1293 int j, k; 1294 1295 /***************************************************************** 1296 * 1297 * Generate the ctype data. 1298 * 1299 *****************************************************************/ 1300 1301 /* 1302 * Open the output file. 1303 */ 1304 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "uctable.h", opath); 1305 if ((out = fopen(path, "w")) == 0) 1306 return; 1307#else 1308 /* 1309 * Open the ctype.dat file. 1310 */ 1311 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "ctype.dat", opath); 1312 if ((out = fopen(path, "wb")) == 0) 1313 return; 1314#endif 1315 1316 /* 1317 * Collect the offsets for the properties. The offsets array is 1318 * on a 4-byte boundary to keep things efficient for architectures 1319 * that need such a thing. 1320 */ 1321 for (i = idx = 0; i < NUMPROPS; i++) { 1322 propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff; 1323 idx += proptbl[i].used; 1324 } 1325 1326 /* 1327 * Add the sentinel index which is used by the binary search as the upper 1328 * bound for a search. 1329 */ 1330 propcnt[i] = idx; 1331 1332 /* 1333 * Record the actual number of property lists. This may be different than 1334 * the number of offsets actually written because of aligning on a 4-byte 1335 * boundary. 1336 */ 1337 hdr[1] = NUMPROPS; 1338 1339 /* 1340 * Calculate the byte count needed and pad the property counts array to a 1341 * 4-byte boundary. 1342 */ 1343 if ((bytes = sizeof(ac_uint2) * (NUMPROPS + 1)) & 3) 1344 bytes += 4 - (bytes & 3); 1345 nprops = bytes / sizeof(ac_uint2); 1346 bytes += sizeof(ac_uint4) * idx; 1347 1348#if HARDCODE_DATA 1349 fprintf(out, PREF "ac_uint4 _ucprop_size = %d;\n\n", NUMPROPS); 1350 1351 fprintf(out, PREF "ac_uint2 _ucprop_offsets[] = {"); 1352 1353 for (i = 0; i<nprops; i++) { 1354 if (i) fprintf(out, ","); 1355 if (!(i&7)) fprintf(out, "\n\t"); 1356 else fprintf(out, " "); 1357 fprintf(out, "0x%04x", propcnt[i]); 1358 } 1359 fprintf(out, "\n};\n\n"); 1360 1361 fprintf(out, PREF "ac_uint4 _ucprop_ranges[] = {"); 1362 1363 k = 0; 1364 for (i = 0; i < NUMPROPS; i++) { 1365 if (proptbl[i].used > 0) { 1366 for (j=0; j<proptbl[i].used; j++) { 1367 if (k) fprintf(out, ","); 1368 if (!(k&3)) fprintf(out,"\n\t"); 1369 else fprintf(out, " "); 1370 k++; 1371 fprintf(out, "0x%08lx", (unsigned long) proptbl[i].ranges[j]); 1372 } 1373 } 1374 } 1375 fprintf(out, "\n};\n\n"); 1376#else 1377 /* 1378 * Write the header. 1379 */ 1380 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1381 1382 /* 1383 * Write the byte count. 1384 */ 1385 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1386 1387 /* 1388 * Write the property list counts. 1389 */ 1390 fwrite((char *) propcnt, sizeof(ac_uint2), nprops, out); 1391 1392 /* 1393 * Write the property lists. 1394 */ 1395 for (i = 0; i < NUMPROPS; i++) { 1396 if (proptbl[i].used > 0) 1397 fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4), 1398 proptbl[i].used, out); 1399 } 1400 1401 fclose(out); 1402#endif 1403 1404 /***************************************************************** 1405 * 1406 * Generate the case mapping data. 1407 * 1408 *****************************************************************/ 1409 1410#if HARDCODE_DATA 1411 fprintf(out, PREF "ac_uint4 _uccase_size = %ld;\n\n", 1412 (long) (upper_used + lower_used + title_used)); 1413 1414 fprintf(out, PREF "ac_uint2 _uccase_len[2] = {%ld, %ld};\n\n", 1415 (long) upper_used, (long) lower_used); 1416 fprintf(out, PREF "ac_uint4 _uccase_map[] = {"); 1417 1418 if (upper_used > 0) 1419 /* 1420 * Write the upper case table. 1421 */ 1422 write_case(out, upper, upper_used, 1); 1423 1424 if (lower_used > 0) 1425 /* 1426 * Write the lower case table. 1427 */ 1428 write_case(out, lower, lower_used, !upper_used); 1429 1430 if (title_used > 0) 1431 /* 1432 * Write the title case table. 1433 */ 1434 write_case(out, title, title_used, !(upper_used||lower_used)); 1435 1436 if (!(upper_used || lower_used || title_used)) 1437 fprintf(out, "\t0"); 1438 1439 fprintf(out, "\n};\n\n"); 1440#else 1441 /* 1442 * Open the case.dat file. 1443 */ 1444 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath); 1445 if ((out = fopen(path, "wb")) == 0) 1446 return; 1447 1448 /* 1449 * Write the case mapping tables. 1450 */ 1451 hdr[1] = upper_used + lower_used + title_used; 1452 casecnt[0] = upper_used; 1453 casecnt[1] = lower_used; 1454 1455 /* 1456 * Write the header. 1457 */ 1458 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1459 1460 /* 1461 * Write the upper and lower case table sizes. 1462 */ 1463 fwrite((char *) casecnt, sizeof(ac_uint2), 2, out); 1464 1465 if (upper_used > 0) 1466 /* 1467 * Write the upper case table. 1468 */ 1469 fwrite((char *) upper, sizeof(_case_t), upper_used, out); 1470 1471 if (lower_used > 0) 1472 /* 1473 * Write the lower case table. 1474 */ 1475 fwrite((char *) lower, sizeof(_case_t), lower_used, out); 1476 1477 if (title_used > 0) 1478 /* 1479 * Write the title case table. 1480 */ 1481 fwrite((char *) title, sizeof(_case_t), title_used, out); 1482 1483 fclose(out); 1484#endif 1485 1486 /***************************************************************** 1487 * 1488 * Generate the composition data. 1489 * 1490 *****************************************************************/ 1491 1492 /* 1493 * Create compositions from decomposition data 1494 */ 1495 create_comps(); 1496 1497#if HARDCODE_DATA 1498 fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n", 1499 comps_used * 4L); 1500 1501 fprintf(out, PREF "ac_uint4 _uccomp_data[] = {"); 1502 1503 /* 1504 * Now, if comps exist, write them out. 1505 */ 1506 if (comps_used > 0) { 1507 for (i=0; i<comps_used; i++) { 1508 if (i) fprintf(out, ","); 1509 fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx, 0x%08lx", 1510 (unsigned long) comps[i].comp, (unsigned long) comps[i].count, 1511 (unsigned long) comps[i].code1, (unsigned long) comps[i].code2); 1512 } 1513 } else { 1514 fprintf(out, "\t0"); 1515 } 1516 fprintf(out, "\n};\n\n"); 1517#else 1518 /* 1519 * Open the comp.dat file. 1520 */ 1521 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "comp.dat", opath); 1522 if ((out = fopen(path, "wb")) == 0) 1523 return; 1524 1525 /* 1526 * Write the header. 1527 */ 1528 hdr[1] = (ac_uint2) comps_used * 4; 1529 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1530 1531 /* 1532 * Write out the byte count to maintain header size. 1533 */ 1534 bytes = comps_used * sizeof(_comp_t); 1535 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1536 1537 /* 1538 * Now, if comps exist, write them out. 1539 */ 1540 if (comps_used > 0) 1541 fwrite((char *) comps, sizeof(_comp_t), comps_used, out); 1542 1543 fclose(out); 1544#endif 1545 1546 /***************************************************************** 1547 * 1548 * Generate the decomposition data. 1549 * 1550 *****************************************************************/ 1551 1552 /* 1553 * Fully expand all decompositions before generating the output file. 1554 */ 1555 expand_decomp(); 1556 1557#if HARDCODE_DATA 1558 fprintf(out, PREF "ac_uint4 _ucdcmp_size = %ld;\n\n", 1559 decomps_used * 2L); 1560 1561 fprintf(out, PREF "ac_uint4 _ucdcmp_nodes[] = {"); 1562 1563 if (decomps_used) { 1564 /* 1565 * Write the list of decomp nodes. 1566 */ 1567 for (i = idx = 0; i < decomps_used; i++) { 1568 fprintf(out, "\n\t0x%08lx, 0x%08lx,", 1569 (unsigned long) decomps[i].code, (unsigned long) idx); 1570 idx += decomps[i].used; 1571 } 1572 1573 /* 1574 * Write the sentinel index as the last decomp node. 1575 */ 1576 fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx); 1577 1578 fprintf(out, PREF "ac_uint4 _ucdcmp_decomp[] = {"); 1579 /* 1580 * Write the decompositions themselves. 1581 */ 1582 k = 0; 1583 for (i = 0; i < decomps_used; i++) 1584 for (j=0; j<decomps[i].used; j++) { 1585 if (k) fprintf(out, ","); 1586 if (!(k&3)) fprintf(out,"\n\t"); 1587 else fprintf(out, " "); 1588 k++; 1589 fprintf(out, "0x%08lx", (unsigned long) decomps[i].decomp[j]); 1590 } 1591 fprintf(out, "\n};\n\n"); 1592 } 1593#else 1594 /* 1595 * Open the decomp.dat file. 1596 */ 1597 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "decomp.dat", opath); 1598 if ((out = fopen(path, "wb")) == 0) 1599 return; 1600 1601 hdr[1] = decomps_used; 1602 1603 /* 1604 * Write the header. 1605 */ 1606 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1607 1608 /* 1609 * Write a temporary byte count which will be calculated as the 1610 * decompositions are written out. 1611 */ 1612 bytes = 0; 1613 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1614 1615 if (decomps_used) { 1616 /* 1617 * Write the list of decomp nodes. 1618 */ 1619 for (i = idx = 0; i < decomps_used; i++) { 1620 fwrite((char *) &decomps[i].code, sizeof(ac_uint4), 1, out); 1621 fwrite((char *) &idx, sizeof(ac_uint4), 1, out); 1622 idx += decomps[i].used; 1623 } 1624 1625 /* 1626 * Write the sentinel index as the last decomp node. 1627 */ 1628 fwrite((char *) &idx, sizeof(ac_uint4), 1, out); 1629 1630 /* 1631 * Write the decompositions themselves. 1632 */ 1633 for (i = 0; i < decomps_used; i++) 1634 fwrite((char *) decomps[i].decomp, sizeof(ac_uint4), 1635 decomps[i].used, out); 1636 1637 /* 1638 * Seek back to the beginning and write the byte count. 1639 */ 1640 bytes = (sizeof(ac_uint4) * idx) + 1641 (sizeof(ac_uint4) * ((hdr[1] << 1) + 1)); 1642 fseek(out, sizeof(ac_uint2) << 1, 0L); 1643 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1644 1645 fclose(out); 1646 } 1647#endif 1648 1649#ifdef HARDCODE_DATA 1650 fprintf(out, PREF "ac_uint4 _uckdcmp_size = %ld;\n\n", 1651 kdecomps_used * 2L); 1652 1653 fprintf(out, PREF "ac_uint4 _uckdcmp_nodes[] = {"); 1654 1655 if (kdecomps_used) { 1656 /* 1657 * Write the list of kdecomp nodes. 1658 */ 1659 for (i = idx = 0; i < kdecomps_used; i++) { 1660 fprintf(out, "\n\t0x%08lx, 0x%08lx,", 1661 (unsigned long) kdecomps[i].code, (unsigned long) idx); 1662 idx += kdecomps[i].used; 1663 } 1664 1665 /* 1666 * Write the sentinel index as the last decomp node. 1667 */ 1668 fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx); 1669 1670 fprintf(out, PREF "ac_uint4 _uckdcmp_decomp[] = {"); 1671 1672 /* 1673 * Write the decompositions themselves. 1674 */ 1675 k = 0; 1676 for (i = 0; i < kdecomps_used; i++) 1677 for (j=0; j<kdecomps[i].used; j++) { 1678 if (k) fprintf(out, ","); 1679 if (!(k&3)) fprintf(out,"\n\t"); 1680 else fprintf(out, " "); 1681 k++; 1682 fprintf(out, "0x%08lx", (unsigned long) kdecomps[i].decomp[j]); 1683 } 1684 fprintf(out, "\n};\n\n"); 1685 } 1686#else 1687 /* 1688 * Open the kdecomp.dat file. 1689 */ 1690 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "kdecomp.dat", opath); 1691 if ((out = fopen(path, "wb")) == 0) 1692 return; 1693 1694 hdr[1] = kdecomps_used; 1695 1696 /* 1697 * Write the header. 1698 */ 1699 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1700 1701 /* 1702 * Write a temporary byte count which will be calculated as the 1703 * decompositions are written out. 1704 */ 1705 bytes = 0; 1706 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1707 1708 if (kdecomps_used) { 1709 /* 1710 * Write the list of kdecomp nodes. 1711 */ 1712 for (i = idx = 0; i < kdecomps_used; i++) { 1713 fwrite((char *) &kdecomps[i].code, sizeof(ac_uint4), 1, out); 1714 fwrite((char *) &idx, sizeof(ac_uint4), 1, out); 1715 idx += kdecomps[i].used; 1716 } 1717 1718 /* 1719 * Write the sentinel index as the last decomp node. 1720 */ 1721 fwrite((char *) &idx, sizeof(ac_uint4), 1, out); 1722 1723 /* 1724 * Write the decompositions themselves. 1725 */ 1726 for (i = 0; i < kdecomps_used; i++) 1727 fwrite((char *) kdecomps[i].decomp, sizeof(ac_uint4), 1728 kdecomps[i].used, out); 1729 1730 /* 1731 * Seek back to the beginning and write the byte count. 1732 */ 1733 bytes = (sizeof(ac_uint4) * idx) + 1734 (sizeof(ac_uint4) * ((hdr[1] << 1) + 1)); 1735 fseek(out, sizeof(ac_uint2) << 1, 0L); 1736 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1737 1738 fclose(out); 1739 } 1740#endif 1741 1742 /***************************************************************** 1743 * 1744 * Generate the combining class data. 1745 * 1746 *****************************************************************/ 1747#ifdef HARDCODE_DATA 1748 fprintf(out, PREF "ac_uint4 _uccmcl_size = %ld;\n\n", (long) ccl_used); 1749 1750 fprintf(out, PREF "ac_uint4 _uccmcl_nodes[] = {"); 1751 1752 if (ccl_used > 0) { 1753 /* 1754 * Write the combining class ranges out. 1755 */ 1756 for (i = 0; i<ccl_used; i++) { 1757 if (i) fprintf(out, ","); 1758 if (!(i&3)) fprintf(out, "\n\t"); 1759 else fprintf(out, " "); 1760 fprintf(out, "0x%08lx", (unsigned long) ccl[i]); 1761 } 1762 } else { 1763 fprintf(out, "\t0"); 1764 } 1765 fprintf(out, "\n};\n\n"); 1766#else 1767 /* 1768 * Open the cmbcl.dat file. 1769 */ 1770 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "cmbcl.dat", opath); 1771 if ((out = fopen(path, "wb")) == 0) 1772 return; 1773 1774 /* 1775 * Set the number of ranges used. Each range has a combining class which 1776 * means each entry is a 3-tuple. 1777 */ 1778 hdr[1] = ccl_used / 3; 1779 1780 /* 1781 * Write the header. 1782 */ 1783 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1784 1785 /* 1786 * Write out the byte count to maintain header size. 1787 */ 1788 bytes = ccl_used * sizeof(ac_uint4); 1789 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1790 1791 if (ccl_used > 0) 1792 /* 1793 * Write the combining class ranges out. 1794 */ 1795 fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out); 1796 1797 fclose(out); 1798#endif 1799 1800 /***************************************************************** 1801 * 1802 * Generate the number data. 1803 * 1804 *****************************************************************/ 1805 1806#if HARDCODE_DATA 1807 fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n", 1808 (unsigned long)ncodes_used<<1); 1809 1810 fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {"); 1811 1812 /* 1813 * Now, if number mappings exist, write them out. 1814 */ 1815 if (ncodes_used > 0) { 1816 for (i = 0; i<ncodes_used; i++) { 1817 if (i) fprintf(out, ","); 1818 if (!(i&1)) fprintf(out, "\n\t"); 1819 else fprintf(out, " "); 1820 fprintf(out, "0x%08lx, 0x%08lx", 1821 (unsigned long) ncodes[i].code, (unsigned long) ncodes[i].idx); 1822 } 1823 fprintf(out, "\n};\n\n"); 1824 1825 fprintf(out, PREF "short _ucnum_vals[] = {"); 1826 for (i = 0; i<nums_used; i++) { 1827 if (i) fprintf(out, ","); 1828 if (!(i&3)) fprintf(out, "\n\t"); 1829 else fprintf(out, " "); 1830 if (nums[i].numerator < 0) { 1831 fprintf(out, "%6d, 0x%04x", 1832 nums[i].numerator, nums[i].denominator); 1833 } else { 1834 fprintf(out, "0x%04x, 0x%04x", 1835 nums[i].numerator, nums[i].denominator); 1836 } 1837 } 1838 fprintf(out, "\n};\n\n"); 1839 } 1840#else 1841 /* 1842 * Open the num.dat file. 1843 */ 1844 snprintf(path, sizeof path, "%s" LDAP_DIRSEP "num.dat", opath); 1845 if ((out = fopen(path, "wb")) == 0) 1846 return; 1847 1848 /* 1849 * The count part of the header will be the total number of codes that 1850 * have numbers. 1851 */ 1852 hdr[1] = (ac_uint2) (ncodes_used << 1); 1853 bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t)); 1854 1855 /* 1856 * Write the header. 1857 */ 1858 fwrite((char *) hdr, sizeof(ac_uint2), 2, out); 1859 1860 /* 1861 * Write out the byte count to maintain header size. 1862 */ 1863 fwrite((char *) &bytes, sizeof(ac_uint4), 1, out); 1864 1865 /* 1866 * Now, if number mappings exist, write them out. 1867 */ 1868 if (ncodes_used > 0) { 1869 fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out); 1870 fwrite((char *) nums, sizeof(_num_t), nums_used, out); 1871 } 1872#endif 1873 1874 fclose(out); 1875} 1876 1877static void 1878usage(char *prog) 1879{ 1880 fprintf(stderr, 1881 "Usage: %s [-o output-directory|-x composition-exclusions]", prog); 1882 fprintf(stderr, " datafile1 datafile2 ...\n\n"); 1883 fprintf(stderr, 1884 "-o output-directory\n\t\tWrite the output files to a different"); 1885 fprintf(stderr, " directory (default: .).\n"); 1886 fprintf(stderr, 1887 "-x composition-exclusion\n\t\tFile of composition codes"); 1888 fprintf(stderr, " that should be excluded.\n"); 1889 exit(1); 1890} 1891 1892int 1893main(int argc, char *argv[]) 1894{ 1895 FILE *in; 1896 char *prog, *opath; 1897 1898 prog = lutil_progname( "ucgendat", argc, argv ); 1899 1900 opath = 0; 1901 in = stdin; 1902 1903 argc--; 1904 argv++; 1905 1906 while (argc > 0) { 1907 if (argv[0][0] == '-') { 1908 switch (argv[0][1]) { 1909 case 'o': 1910 argc--; 1911 argv++; 1912 opath = argv[0]; 1913 break; 1914 case 'x': 1915 argc--; 1916 argv++; 1917 if ((in = fopen(argv[0], "r")) == 0) 1918 fprintf(stderr, 1919 "%s: unable to open composition exclusion file %s\n", 1920 prog, argv[0]); 1921 else { 1922 read_compexdata(in); 1923 fclose(in); 1924 in = 0; 1925 } 1926 break; 1927 default: 1928 usage(prog); 1929 } 1930 } else { 1931 if (in != stdin && in != NULL) 1932 fclose(in); 1933 if ((in = fopen(argv[0], "r")) == 0) 1934 fprintf(stderr, "%s: unable to open ctype file %s\n", 1935 prog, argv[0]); 1936 else { 1937 read_cdata(in); 1938 fclose(in); 1939 in = 0; 1940 } 1941 } 1942 argc--; 1943 argv++; 1944 } 1945 1946 if (opath == 0) 1947 opath = "."; 1948 write_cdata(opath); 1949 1950 return 0; 1951} 1952