1/* $OpenLDAP$ */ 2/* This work is part of OpenLDAP Software <http://www.openldap.org/>. 3 * 4 * Copyright 1998-2011 The OpenLDAP Foundation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted only as authorized by the OpenLDAP 9 * Public License. 10 * 11 * A copy of this license is available in file LICENSE in the 12 * top-level directory of the distribution or, alternatively, at 13 * <http://www.OpenLDAP.org/license.html>. 14 */ 15/* Copyright 2001 Computing Research Labs, New Mexico State University 16 * 17 * Permission is hereby granted, free of charge, to any person obtaining a 18 * copy of this software and associated documentation files (the "Software"), 19 * to deal in the Software without restriction, including without limitation 20 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 21 * and/or sell copies of the Software, and to permit persons to whom the 22 * Software is furnished to do so, subject to the following conditions: 23 * 24 * The above copyright notice and this permission notice shall be included in 25 * all copies or substantial portions of the Software. 26 * 27 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 28 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 29 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 30 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 31 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 32 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 33 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 34 */ 35/* $Id: ucdata.c,v 1.4 2001/01/02 18:46:20 mleisher Exp $" */ 36 37#include "portable.h" 38#include "ldap_config.h" 39 40#include <stdio.h> 41#include <ac/stdlib.h> 42#include <ac/string.h> 43#include <ac/unistd.h> 44 45#include <ac/bytes.h> 46 47#include "lber_pvt.h" 48#include "ucdata.h" 49 50#ifndef HARDCODE_DATA 51#define HARDCODE_DATA 1 52#endif 53 54#if HARDCODE_DATA 55#include "uctable.h" 56#endif 57 58/************************************************************************** 59 * 60 * Miscellaneous types, data, and support functions. 61 * 62 **************************************************************************/ 63 64typedef struct { 65 ac_uint2 bom; 66 ac_uint2 cnt; 67 union { 68 ac_uint4 bytes; 69 ac_uint2 len[2]; 70 } size; 71} _ucheader_t; 72 73/* 74 * A simple array of 32-bit masks for lookup. 75 */ 76static ac_uint4 masks32[32] = { 77 0x00000001UL, 0x00000002UL, 0x00000004UL, 0x00000008UL, 78 0x00000010UL, 0x00000020UL, 0x00000040UL, 0x00000080UL, 79 0x00000100UL, 0x00000200UL, 0x00000400UL, 0x00000800UL, 80 0x00001000UL, 0x00002000UL, 0x00004000UL, 0x00008000UL, 81 0x00010000UL, 0x00020000UL, 0x00040000UL, 0x00080000UL, 82 0x00100000UL, 0x00200000UL, 0x00400000UL, 0x00800000UL, 83 0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL, 84 0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL 85}; 86 87#define endian_short(cc) (((cc) >> 8) | (((cc) & 0xff) << 8)) 88#define endian_long(cc) ((((cc) & 0xff) << 24)|((((cc) >> 8) & 0xff) << 16)|\ 89 ((((cc) >> 16) & 0xff) << 8)|((cc) >> 24)) 90 91#if !HARDCODE_DATA 92static FILE * 93_ucopenfile(char *paths, char *filename, char *mode) 94{ 95 FILE *f; 96 char *fp, *dp, *pp, path[BUFSIZ]; 97 98 if (filename == 0 || *filename == 0) 99 return 0; 100 101 dp = paths; 102 while (dp && *dp) { 103 pp = path; 104 while (*dp && *dp != ':') 105 *pp++ = *dp++; 106 *pp++ = *LDAP_DIRSEP; 107 108 fp = filename; 109 while (*fp) 110 *pp++ = *fp++; 111 *pp = 0; 112 113 if ((f = fopen(path, mode)) != 0) 114 return f; 115 116 if (*dp == ':') 117 dp++; 118 } 119 120 return 0; 121} 122#endif 123 124/************************************************************************** 125 * 126 * Support for the character properties. 127 * 128 **************************************************************************/ 129 130#if !HARDCODE_DATA 131 132static ac_uint4 _ucprop_size; 133static ac_uint2 *_ucprop_offsets; 134static ac_uint4 *_ucprop_ranges; 135 136/* 137 * Return -1 on error, 0 if okay 138 */ 139static int 140_ucprop_load(char *paths, int reload) 141{ 142 FILE *in; 143 ac_uint4 size, i; 144 _ucheader_t hdr; 145 146 if (_ucprop_size > 0) { 147 if (!reload) 148 /* 149 * The character properties have already been loaded. 150 */ 151 return 0; 152 153 /* 154 * Unload the current character property data in preparation for 155 * loading a new copy. Only the first array has to be deallocated 156 * because all the memory for the arrays is allocated as a single 157 * block. 158 */ 159 free((char *) _ucprop_offsets); 160 _ucprop_size = 0; 161 } 162 163 if ((in = _ucopenfile(paths, "ctype.dat", "rb")) == 0) 164 return -1; 165 166 /* 167 * Load the header. 168 */ 169 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 170 171 if (hdr.bom == 0xfffe) { 172 hdr.cnt = endian_short(hdr.cnt); 173 hdr.size.bytes = endian_long(hdr.size.bytes); 174 } 175 176 if ((_ucprop_size = hdr.cnt) == 0) { 177 fclose(in); 178 return -1; 179 } 180 181 /* 182 * Allocate all the storage needed for the lookup table. 183 */ 184 _ucprop_offsets = (ac_uint2 *) malloc(hdr.size.bytes); 185 186 /* 187 * Calculate the offset into the storage for the ranges. The offsets 188 * array is on a 4-byte boundary and one larger than the value provided in 189 * the header count field. This means the offset to the ranges must be 190 * calculated after aligning the count to a 4-byte boundary. 191 */ 192 if ((size = ((hdr.cnt + 1) * sizeof(ac_uint2))) & 3) 193 size += 4 - (size & 3); 194 size >>= 1; 195 _ucprop_ranges = (ac_uint4 *) (_ucprop_offsets + size); 196 197 /* 198 * Load the offset array. 199 */ 200 fread((char *) _ucprop_offsets, sizeof(ac_uint2), size, in); 201 202 /* 203 * Do an endian swap if necessary. Don't forget there is an extra node on 204 * the end with the final index. 205 */ 206 if (hdr.bom == 0xfffe) { 207 for (i = 0; i <= _ucprop_size; i++) 208 _ucprop_offsets[i] = endian_short(_ucprop_offsets[i]); 209 } 210 211 /* 212 * Load the ranges. The number of elements is in the last array position 213 * of the offsets. 214 */ 215 fread((char *) _ucprop_ranges, sizeof(ac_uint4), 216 _ucprop_offsets[_ucprop_size], in); 217 218 fclose(in); 219 220 /* 221 * Do an endian swap if necessary. 222 */ 223 if (hdr.bom == 0xfffe) { 224 for (i = 0; i < _ucprop_offsets[_ucprop_size]; i++) 225 _ucprop_ranges[i] = endian_long(_ucprop_ranges[i]); 226 } 227 return 0; 228} 229 230static void 231_ucprop_unload(void) 232{ 233 if (_ucprop_size == 0) 234 return; 235 236 /* 237 * Only need to free the offsets because the memory is allocated as a 238 * single block. 239 */ 240 free((char *) _ucprop_offsets); 241 _ucprop_size = 0; 242} 243#endif 244 245static int 246_ucprop_lookup(ac_uint4 code, ac_uint4 n) 247{ 248 long l, r, m; 249 250 if (_ucprop_size == 0) 251 return 0; 252 253 /* 254 * There is an extra node on the end of the offsets to allow this routine 255 * to work right. If the index is 0xffff, then there are no nodes for the 256 * property. 257 */ 258 if ((l = _ucprop_offsets[n]) == 0xffff) 259 return 0; 260 261 /* 262 * Locate the next offset that is not 0xffff. The sentinel at the end of 263 * the array is the max index value. 264 */ 265 for (m = 1; 266 n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++) ; 267 268 r = _ucprop_offsets[n + m] - 1; 269 270 while (l <= r) { 271 /* 272 * Determine a "mid" point and adjust to make sure the mid point is at 273 * the beginning of a range pair. 274 */ 275 m = (l + r) >> 1; 276 m -= (m & 1); 277 if (code > _ucprop_ranges[m + 1]) 278 l = m + 2; 279 else if (code < _ucprop_ranges[m]) 280 r = m - 2; 281 else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1]) 282 return 1; 283 } 284 return 0; 285} 286 287int 288ucisprop(ac_uint4 code, ac_uint4 mask1, ac_uint4 mask2) 289{ 290 ac_uint4 i; 291 292 if (mask1 == 0 && mask2 == 0) 293 return 0; 294 295 for (i = 0; mask1 && i < 32; i++) { 296 if ((mask1 & masks32[i]) && _ucprop_lookup(code, i)) 297 return 1; 298 } 299 300 for (i = 32; mask2 && i < _ucprop_size; i++) { 301 if ((mask2 & masks32[i & 31]) && _ucprop_lookup(code, i)) 302 return 1; 303 } 304 305 return 0; 306} 307 308/************************************************************************** 309 * 310 * Support for case mapping. 311 * 312 **************************************************************************/ 313 314#if !HARDCODE_DATA 315 316/* These record the number of slots in the map. 317 * There are 3 words per slot. 318 */ 319static ac_uint4 _uccase_size; 320static ac_uint2 _uccase_len[2]; 321static ac_uint4 *_uccase_map; 322 323/* 324 * Return -1 on error, 0 if okay 325 */ 326static int 327_uccase_load(char *paths, int reload) 328{ 329 FILE *in; 330 ac_uint4 i; 331 _ucheader_t hdr; 332 333 if (_uccase_size > 0) { 334 if (!reload) 335 /* 336 * The case mappings have already been loaded. 337 */ 338 return 0; 339 340 free((char *) _uccase_map); 341 _uccase_size = 0; 342 } 343 344 if ((in = _ucopenfile(paths, "case.dat", "rb")) == 0) 345 return -1; 346 347 /* 348 * Load the header. 349 */ 350 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 351 352 if (hdr.bom == 0xfffe) { 353 hdr.cnt = endian_short(hdr.cnt); 354 hdr.size.len[0] = endian_short(hdr.size.len[0]); 355 hdr.size.len[1] = endian_short(hdr.size.len[1]); 356 } 357 358 /* 359 * Set the node count and lengths of the upper and lower case mapping 360 * tables. 361 */ 362 _uccase_size = hdr.cnt; 363 _uccase_len[0] = hdr.size.len[0]; 364 _uccase_len[1] = hdr.size.len[1]; 365 366 _uccase_map = (ac_uint4 *) 367 malloc(_uccase_size * 3 * sizeof(ac_uint4)); 368 369 /* 370 * Load the case mapping table. 371 */ 372 fread((char *) _uccase_map, sizeof(ac_uint4), _uccase_size * 3, in); 373 374 /* 375 * Do an endian swap if necessary. 376 */ 377 if (hdr.bom == 0xfffe) { 378 for (i = 0; i < _uccase_size * 3; i++) 379 _uccase_map[i] = endian_long(_uccase_map[i]); 380 } 381 fclose(in); 382 return 0; 383} 384 385static void 386_uccase_unload(void) 387{ 388 if (_uccase_size == 0) 389 return; 390 391 free((char *) _uccase_map); 392 _uccase_size = 0; 393} 394#endif 395 396static ac_uint4 397_uccase_lookup(ac_uint4 code, long l, long r, int field) 398{ 399 long m; 400 const ac_uint4 *tmp; 401 402 /* 403 * Do the binary search. 404 */ 405 while (l <= r) { 406 /* 407 * Determine a "mid" point and adjust to make sure the mid point is at 408 * the beginning of a case mapping triple. 409 */ 410 m = (l + r) >> 1; 411 tmp = &_uccase_map[m*3]; 412 if (code > *tmp) 413 l = m + 1; 414 else if (code < *tmp) 415 r = m - 1; 416 else if (code == *tmp) 417 return tmp[field]; 418 } 419 420 return code; 421} 422 423ac_uint4 424uctoupper(ac_uint4 code) 425{ 426 int field; 427 long l, r; 428 429 if (ucisupper(code)) 430 return code; 431 432 if (ucislower(code)) { 433 /* 434 * The character is lower case. 435 */ 436 field = 2; 437 l = _uccase_len[0]; 438 r = (l + _uccase_len[1]) - 1; 439 } else { 440 /* 441 * The character is title case. 442 */ 443 field = 1; 444 l = _uccase_len[0] + _uccase_len[1]; 445 r = _uccase_size - 1; 446 } 447 return _uccase_lookup(code, l, r, field); 448} 449 450ac_uint4 451uctolower(ac_uint4 code) 452{ 453 int field; 454 long l, r; 455 456 if (ucislower(code)) 457 return code; 458 459 if (ucisupper(code)) { 460 /* 461 * The character is upper case. 462 */ 463 field = 1; 464 l = 0; 465 r = _uccase_len[0] - 1; 466 } else { 467 /* 468 * The character is title case. 469 */ 470 field = 2; 471 l = _uccase_len[0] + _uccase_len[1]; 472 r = _uccase_size - 1; 473 } 474 return _uccase_lookup(code, l, r, field); 475} 476 477ac_uint4 478uctotitle(ac_uint4 code) 479{ 480 int field; 481 long l, r; 482 483 if (ucistitle(code)) 484 return code; 485 486 /* 487 * The offset will always be the same for converting to title case. 488 */ 489 field = 2; 490 491 if (ucisupper(code)) { 492 /* 493 * The character is upper case. 494 */ 495 l = 0; 496 r = _uccase_len[0] - 1; 497 } else { 498 /* 499 * The character is lower case. 500 */ 501 l = _uccase_len[0]; 502 r = (l + _uccase_len[1]) - 1; 503 } 504 return _uccase_lookup(code, l, r, field); 505} 506 507/************************************************************************** 508 * 509 * Support for compositions. 510 * 511 **************************************************************************/ 512 513#if !HARDCODE_DATA 514 515static ac_uint4 _uccomp_size; 516static ac_uint4 *_uccomp_data; 517 518/* 519 * Return -1 on error, 0 if okay 520 */ 521static int 522_uccomp_load(char *paths, int reload) 523{ 524 FILE *in; 525 ac_uint4 size, i; 526 _ucheader_t hdr; 527 528 if (_uccomp_size > 0) { 529 if (!reload) 530 /* 531 * The compositions have already been loaded. 532 */ 533 return 0; 534 535 free((char *) _uccomp_data); 536 _uccomp_size = 0; 537 } 538 539 if ((in = _ucopenfile(paths, "comp.dat", "rb")) == 0) 540 return -1; 541 542 /* 543 * Load the header. 544 */ 545 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 546 547 if (hdr.bom == 0xfffe) { 548 hdr.cnt = endian_short(hdr.cnt); 549 hdr.size.bytes = endian_long(hdr.size.bytes); 550 } 551 552 _uccomp_size = hdr.cnt; 553 _uccomp_data = (ac_uint4 *) malloc(hdr.size.bytes); 554 555 /* 556 * Read the composition data in. 557 */ 558 size = hdr.size.bytes / sizeof(ac_uint4); 559 fread((char *) _uccomp_data, sizeof(ac_uint4), size, in); 560 561 /* 562 * Do an endian swap if necessary. 563 */ 564 if (hdr.bom == 0xfffe) { 565 for (i = 0; i < size; i++) 566 _uccomp_data[i] = endian_long(_uccomp_data[i]); 567 } 568 569 /* 570 * Assume that the data is ordered on count, so that all compositions 571 * of length 2 come first. Only handling length 2 for now. 572 */ 573 for (i = 1; i < size; i += 4) 574 if (_uccomp_data[i] != 2) 575 break; 576 _uccomp_size = i - 1; 577 578 fclose(in); 579 return 0; 580} 581 582static void 583_uccomp_unload(void) 584{ 585 if (_uccomp_size == 0) 586 return; 587 588 free((char *) _uccomp_data); 589 _uccomp_size = 0; 590} 591#endif 592 593int 594uccomp(ac_uint4 node1, ac_uint4 node2, ac_uint4 *comp) 595{ 596 int l, r, m; 597 598 l = 0; 599 r = _uccomp_size - 1; 600 601 while (l <= r) { 602 m = ((r + l) >> 1); 603 m -= m & 3; 604 if (node1 > _uccomp_data[m+2]) 605 l = m + 4; 606 else if (node1 < _uccomp_data[m+2]) 607 r = m - 4; 608 else if (node2 > _uccomp_data[m+3]) 609 l = m + 4; 610 else if (node2 < _uccomp_data[m+3]) 611 r = m - 4; 612 else { 613 *comp = _uccomp_data[m]; 614 return 1; 615 } 616 } 617 return 0; 618} 619 620int 621uccomp_hangul(ac_uint4 *str, int len) 622{ 623 const int SBase = 0xAC00, LBase = 0x1100, 624 VBase = 0x1161, TBase = 0x11A7, 625 LCount = 19, VCount = 21, TCount = 28, 626 NCount = VCount * TCount, /* 588 */ 627 SCount = LCount * NCount; /* 11172 */ 628 629 int i, rlen; 630 ac_uint4 ch, last, lindex, sindex; 631 632 last = str[0]; 633 rlen = 1; 634 for ( i = 1; i < len; i++ ) { 635 ch = str[i]; 636 637 /* check if two current characters are L and V */ 638 lindex = last - LBase; 639 if (lindex < (ac_uint4) LCount) { 640 ac_uint4 vindex = ch - VBase; 641 if (vindex < (ac_uint4) VCount) { 642 /* make syllable of form LV */ 643 last = SBase + (lindex * VCount + vindex) * TCount; 644 str[rlen-1] = last; /* reset last */ 645 continue; 646 } 647 } 648 649 /* check if two current characters are LV and T */ 650 sindex = last - SBase; 651 if (sindex < (ac_uint4) SCount 652 && (sindex % TCount) == 0) 653 { 654 ac_uint4 tindex = ch - TBase; 655 if (tindex <= (ac_uint4) TCount) { 656 /* make syllable of form LVT */ 657 last += tindex; 658 str[rlen-1] = last; /* reset last */ 659 continue; 660 } 661 } 662 663 /* if neither case was true, just add the character */ 664 last = ch; 665 str[rlen] = ch; 666 rlen++; 667 } 668 return rlen; 669} 670 671int 672uccanoncomp(ac_uint4 *str, int len) 673{ 674 int i, stpos, copos; 675 ac_uint4 cl, prevcl, st, ch, co; 676 677 st = str[0]; 678 stpos = 0; 679 copos = 1; 680 prevcl = uccombining_class(st) == 0 ? 0 : 256; 681 682 for (i = 1; i < len; i++) { 683 ch = str[i]; 684 cl = uccombining_class(ch); 685 if (uccomp(st, ch, &co) && (prevcl < cl || prevcl == 0)) 686 st = str[stpos] = co; 687 else { 688 if (cl == 0) { 689 stpos = copos; 690 st = ch; 691 } 692 prevcl = cl; 693 str[copos++] = ch; 694 } 695 } 696 697 return uccomp_hangul(str, copos); 698} 699 700/************************************************************************** 701 * 702 * Support for decompositions. 703 * 704 **************************************************************************/ 705 706#if !HARDCODE_DATA 707 708static ac_uint4 _ucdcmp_size; 709static ac_uint4 *_ucdcmp_nodes; 710static ac_uint4 *_ucdcmp_decomp; 711 712static ac_uint4 _uckdcmp_size; 713static ac_uint4 *_uckdcmp_nodes; 714static ac_uint4 *_uckdcmp_decomp; 715 716/* 717 * Return -1 on error, 0 if okay 718 */ 719static int 720_ucdcmp_load(char *paths, int reload) 721{ 722 FILE *in; 723 ac_uint4 size, i; 724 _ucheader_t hdr; 725 726 if (_ucdcmp_size > 0) { 727 if (!reload) 728 /* 729 * The decompositions have already been loaded. 730 */ 731 return 0; 732 733 free((char *) _ucdcmp_nodes); 734 _ucdcmp_size = 0; 735 } 736 737 if ((in = _ucopenfile(paths, "decomp.dat", "rb")) == 0) 738 return -1; 739 740 /* 741 * Load the header. 742 */ 743 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 744 745 if (hdr.bom == 0xfffe) { 746 hdr.cnt = endian_short(hdr.cnt); 747 hdr.size.bytes = endian_long(hdr.size.bytes); 748 } 749 750 _ucdcmp_size = hdr.cnt << 1; 751 _ucdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 752 _ucdcmp_decomp = _ucdcmp_nodes + (_ucdcmp_size + 1); 753 754 /* 755 * Read the decomposition data in. 756 */ 757 size = hdr.size.bytes / sizeof(ac_uint4); 758 fread((char *) _ucdcmp_nodes, sizeof(ac_uint4), size, in); 759 760 /* 761 * Do an endian swap if necessary. 762 */ 763 if (hdr.bom == 0xfffe) { 764 for (i = 0; i < size; i++) 765 _ucdcmp_nodes[i] = endian_long(_ucdcmp_nodes[i]); 766 } 767 fclose(in); 768 return 0; 769} 770 771/* 772 * Return -1 on error, 0 if okay 773 */ 774static int 775_uckdcmp_load(char *paths, int reload) 776{ 777 FILE *in; 778 ac_uint4 size, i; 779 _ucheader_t hdr; 780 781 if (_uckdcmp_size > 0) { 782 if (!reload) 783 /* 784 * The decompositions have already been loaded. 785 */ 786 return 0; 787 788 free((char *) _uckdcmp_nodes); 789 _uckdcmp_size = 0; 790 } 791 792 if ((in = _ucopenfile(paths, "kdecomp.dat", "rb")) == 0) 793 return -1; 794 795 /* 796 * Load the header. 797 */ 798 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 799 800 if (hdr.bom == 0xfffe) { 801 hdr.cnt = endian_short(hdr.cnt); 802 hdr.size.bytes = endian_long(hdr.size.bytes); 803 } 804 805 _uckdcmp_size = hdr.cnt << 1; 806 _uckdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 807 _uckdcmp_decomp = _uckdcmp_nodes + (_uckdcmp_size + 1); 808 809 /* 810 * Read the decomposition data in. 811 */ 812 size = hdr.size.bytes / sizeof(ac_uint4); 813 fread((char *) _uckdcmp_nodes, sizeof(ac_uint4), size, in); 814 815 /* 816 * Do an endian swap if necessary. 817 */ 818 if (hdr.bom == 0xfffe) { 819 for (i = 0; i < size; i++) 820 _uckdcmp_nodes[i] = endian_long(_uckdcmp_nodes[i]); 821 } 822 fclose(in); 823 return 0; 824} 825 826static void 827_ucdcmp_unload(void) 828{ 829 if (_ucdcmp_size == 0) 830 return; 831 832 /* 833 * Only need to free the offsets because the memory is allocated as a 834 * single block. 835 */ 836 free((char *) _ucdcmp_nodes); 837 _ucdcmp_size = 0; 838} 839 840static void 841_uckdcmp_unload(void) 842{ 843 if (_uckdcmp_size == 0) 844 return; 845 846 /* 847 * Only need to free the offsets because the memory is allocated as a 848 * single block. 849 */ 850 free((char *) _uckdcmp_nodes); 851 _uckdcmp_size = 0; 852} 853#endif 854 855int 856ucdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp) 857{ 858 long l, r, m; 859 860 if (code < _ucdcmp_nodes[0]) { 861 return 0; 862 } 863 864 l = 0; 865 r = _ucdcmp_nodes[_ucdcmp_size] - 1; 866 867 while (l <= r) { 868 /* 869 * Determine a "mid" point and adjust to make sure the mid point is at 870 * the beginning of a code+offset pair. 871 */ 872 m = (l + r) >> 1; 873 m -= (m & 1); 874 if (code > _ucdcmp_nodes[m]) 875 l = m + 2; 876 else if (code < _ucdcmp_nodes[m]) 877 r = m - 2; 878 else if (code == _ucdcmp_nodes[m]) { 879 *num = _ucdcmp_nodes[m + 3] - _ucdcmp_nodes[m + 1]; 880 *decomp = (ac_uint4*)&_ucdcmp_decomp[_ucdcmp_nodes[m + 1]]; 881 return 1; 882 } 883 } 884 return 0; 885} 886 887int 888uckdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp) 889{ 890 long l, r, m; 891 892 if (code < _uckdcmp_nodes[0]) { 893 return 0; 894 } 895 896 l = 0; 897 r = _uckdcmp_nodes[_uckdcmp_size] - 1; 898 899 while (l <= r) { 900 /* 901 * Determine a "mid" point and adjust to make sure the mid point is at 902 * the beginning of a code+offset pair. 903 */ 904 m = (l + r) >> 1; 905 m -= (m & 1); 906 if (code > _uckdcmp_nodes[m]) 907 l = m + 2; 908 else if (code < _uckdcmp_nodes[m]) 909 r = m - 2; 910 else if (code == _uckdcmp_nodes[m]) { 911 *num = _uckdcmp_nodes[m + 3] - _uckdcmp_nodes[m + 1]; 912 *decomp = (ac_uint4*)&_uckdcmp_decomp[_uckdcmp_nodes[m + 1]]; 913 return 1; 914 } 915 } 916 return 0; 917} 918 919int 920ucdecomp_hangul(ac_uint4 code, ac_uint4 *num, ac_uint4 decomp[]) 921{ 922 if (!ucishangul(code)) 923 return 0; 924 925 code -= 0xac00; 926 decomp[0] = 0x1100 + (ac_uint4) (code / 588); 927 decomp[1] = 0x1161 + (ac_uint4) ((code % 588) / 28); 928 decomp[2] = 0x11a7 + (ac_uint4) (code % 28); 929 *num = (decomp[2] != 0x11a7) ? 3 : 2; 930 931 return 1; 932} 933 934/* mode == 0 for canonical, mode == 1 for compatibility */ 935static int 936uccanoncompatdecomp(const ac_uint4 *in, int inlen, 937 ac_uint4 **out, int *outlen, short mode, void *ctx) 938{ 939 int l, size; 940 unsigned i, j, k; 941 ac_uint4 num, class, *decomp, hangdecomp[3]; 942 943 size = inlen * 2; 944 *out = (ac_uint4 *) ber_memalloc_x(size * sizeof(**out), ctx); 945 if (*out == NULL) 946 return *outlen = -1; 947 948 i = 0; 949 for (j = 0; j < (unsigned) inlen; j++) { 950 if (mode ? uckdecomp(in[j], &num, &decomp) : ucdecomp(in[j], &num, &decomp)) { 951 if ( size - i < num) { 952 size = inlen + i - j + num - 1; 953 *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx ); 954 if (*out == NULL) 955 return *outlen = -1; 956 } 957 for (k = 0; k < num; k++) { 958 class = uccombining_class(decomp[k]); 959 if (class == 0) { 960 (*out)[i] = decomp[k]; 961 } else { 962 for (l = i; l > 0; l--) 963 if (class >= uccombining_class((*out)[l-1])) 964 break; 965 AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out)); 966 (*out)[l] = decomp[k]; 967 } 968 i++; 969 } 970 } else if (ucdecomp_hangul(in[j], &num, hangdecomp)) { 971 if (size - i < num) { 972 size = inlen + i - j + num - 1; 973 *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx); 974 if (*out == NULL) 975 return *outlen = -1; 976 } 977 for (k = 0; k < num; k++) { 978 (*out)[i] = hangdecomp[k]; 979 i++; 980 } 981 } else { 982 if (size - i < 1) { 983 size = inlen + i - j; 984 *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx); 985 if (*out == NULL) 986 return *outlen = -1; 987 } 988 class = uccombining_class(in[j]); 989 if (class == 0) { 990 (*out)[i] = in[j]; 991 } else { 992 for (l = i; l > 0; l--) 993 if (class >= uccombining_class((*out)[l-1])) 994 break; 995 AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out)); 996 (*out)[l] = in[j]; 997 } 998 i++; 999 } 1000 } 1001 return *outlen = i; 1002} 1003 1004int 1005uccanondecomp(const ac_uint4 *in, int inlen, 1006 ac_uint4 **out, int *outlen, void *ctx) 1007{ 1008 return uccanoncompatdecomp(in, inlen, out, outlen, 0, ctx); 1009} 1010 1011int 1012uccompatdecomp(const ac_uint4 *in, int inlen, 1013 ac_uint4 **out, int *outlen, void *ctx) 1014{ 1015 return uccanoncompatdecomp(in, inlen, out, outlen, 1, ctx); 1016} 1017 1018/************************************************************************** 1019 * 1020 * Support for combining classes. 1021 * 1022 **************************************************************************/ 1023 1024#if !HARDCODE_DATA 1025static ac_uint4 _uccmcl_size; 1026static ac_uint4 *_uccmcl_nodes; 1027 1028/* 1029 * Return -1 on error, 0 if okay 1030 */ 1031static int 1032_uccmcl_load(char *paths, int reload) 1033{ 1034 FILE *in; 1035 ac_uint4 i; 1036 _ucheader_t hdr; 1037 1038 if (_uccmcl_size > 0) { 1039 if (!reload) 1040 /* 1041 * The combining classes have already been loaded. 1042 */ 1043 return 0; 1044 1045 free((char *) _uccmcl_nodes); 1046 _uccmcl_size = 0; 1047 } 1048 1049 if ((in = _ucopenfile(paths, "cmbcl.dat", "rb")) == 0) 1050 return -1; 1051 1052 /* 1053 * Load the header. 1054 */ 1055 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 1056 1057 if (hdr.bom == 0xfffe) { 1058 hdr.cnt = endian_short(hdr.cnt); 1059 hdr.size.bytes = endian_long(hdr.size.bytes); 1060 } 1061 1062 _uccmcl_size = hdr.cnt * 3; 1063 _uccmcl_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 1064 1065 /* 1066 * Read the combining classes in. 1067 */ 1068 fread((char *) _uccmcl_nodes, sizeof(ac_uint4), _uccmcl_size, in); 1069 1070 /* 1071 * Do an endian swap if necessary. 1072 */ 1073 if (hdr.bom == 0xfffe) { 1074 for (i = 0; i < _uccmcl_size; i++) 1075 _uccmcl_nodes[i] = endian_long(_uccmcl_nodes[i]); 1076 } 1077 fclose(in); 1078 return 0; 1079} 1080 1081static void 1082_uccmcl_unload(void) 1083{ 1084 if (_uccmcl_size == 0) 1085 return; 1086 1087 free((char *) _uccmcl_nodes); 1088 _uccmcl_size = 0; 1089} 1090#endif 1091 1092ac_uint4 1093uccombining_class(ac_uint4 code) 1094{ 1095 long l, r, m; 1096 1097 l = 0; 1098 r = _uccmcl_size - 1; 1099 1100 while (l <= r) { 1101 m = (l + r) >> 1; 1102 m -= (m % 3); 1103 if (code > _uccmcl_nodes[m + 1]) 1104 l = m + 3; 1105 else if (code < _uccmcl_nodes[m]) 1106 r = m - 3; 1107 else if (code >= _uccmcl_nodes[m] && code <= _uccmcl_nodes[m + 1]) 1108 return _uccmcl_nodes[m + 2]; 1109 } 1110 return 0; 1111} 1112 1113/************************************************************************** 1114 * 1115 * Support for numeric values. 1116 * 1117 **************************************************************************/ 1118 1119#if !HARDCODE_DATA 1120static ac_uint4 *_ucnum_nodes; 1121static ac_uint4 _ucnum_size; 1122static short *_ucnum_vals; 1123 1124/* 1125 * Return -1 on error, 0 if okay 1126 */ 1127static int 1128_ucnumb_load(char *paths, int reload) 1129{ 1130 FILE *in; 1131 ac_uint4 size, i; 1132 _ucheader_t hdr; 1133 1134 if (_ucnum_size > 0) { 1135 if (!reload) 1136 /* 1137 * The numbers have already been loaded. 1138 */ 1139 return 0; 1140 1141 free((char *) _ucnum_nodes); 1142 _ucnum_size = 0; 1143 } 1144 1145 if ((in = _ucopenfile(paths, "num.dat", "rb")) == 0) 1146 return -1; 1147 1148 /* 1149 * Load the header. 1150 */ 1151 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 1152 1153 if (hdr.bom == 0xfffe) { 1154 hdr.cnt = endian_short(hdr.cnt); 1155 hdr.size.bytes = endian_long(hdr.size.bytes); 1156 } 1157 1158 _ucnum_size = hdr.cnt; 1159 _ucnum_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 1160 _ucnum_vals = (short *) (_ucnum_nodes + _ucnum_size); 1161 1162 /* 1163 * Read the combining classes in. 1164 */ 1165 fread((char *) _ucnum_nodes, sizeof(unsigned char), hdr.size.bytes, in); 1166 1167 /* 1168 * Do an endian swap if necessary. 1169 */ 1170 if (hdr.bom == 0xfffe) { 1171 for (i = 0; i < _ucnum_size; i++) 1172 _ucnum_nodes[i] = endian_long(_ucnum_nodes[i]); 1173 1174 /* 1175 * Determine the number of values that have to be adjusted. 1176 */ 1177 size = (hdr.size.bytes - 1178 (_ucnum_size * (sizeof(ac_uint4) << 1))) / 1179 sizeof(short); 1180 1181 for (i = 0; i < size; i++) 1182 _ucnum_vals[i] = endian_short(_ucnum_vals[i]); 1183 } 1184 fclose(in); 1185 return 0; 1186} 1187 1188static void 1189_ucnumb_unload(void) 1190{ 1191 if (_ucnum_size == 0) 1192 return; 1193 1194 free((char *) _ucnum_nodes); 1195 _ucnum_size = 0; 1196} 1197#endif 1198 1199int 1200ucnumber_lookup(ac_uint4 code, struct ucnumber *num) 1201{ 1202 long l, r, m; 1203 short *vp; 1204 1205 l = 0; 1206 r = _ucnum_size - 1; 1207 while (l <= r) { 1208 /* 1209 * Determine a "mid" point and adjust to make sure the mid point is at 1210 * the beginning of a code+offset pair. 1211 */ 1212 m = (l + r) >> 1; 1213 m -= (m & 1); 1214 if (code > _ucnum_nodes[m]) 1215 l = m + 2; 1216 else if (code < _ucnum_nodes[m]) 1217 r = m - 2; 1218 else { 1219 vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1]; 1220 num->numerator = (int) *vp++; 1221 num->denominator = (int) *vp; 1222 return 1; 1223 } 1224 } 1225 return 0; 1226} 1227 1228int 1229ucdigit_lookup(ac_uint4 code, int *digit) 1230{ 1231 long l, r, m; 1232 short *vp; 1233 1234 l = 0; 1235 r = _ucnum_size - 1; 1236 while (l <= r) { 1237 /* 1238 * Determine a "mid" point and adjust to make sure the mid point is at 1239 * the beginning of a code+offset pair. 1240 */ 1241 m = (l + r) >> 1; 1242 m -= (m & 1); 1243 if (code > _ucnum_nodes[m]) 1244 l = m + 2; 1245 else if (code < _ucnum_nodes[m]) 1246 r = m - 2; 1247 else { 1248 vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1]; 1249 if (*vp == *(vp + 1)) { 1250 *digit = *vp; 1251 return 1; 1252 } 1253 return 0; 1254 } 1255 } 1256 return 0; 1257} 1258 1259struct ucnumber 1260ucgetnumber(ac_uint4 code) 1261{ 1262 struct ucnumber num; 1263 1264 /* 1265 * Initialize with some arbitrary value, because the caller simply cannot 1266 * tell for sure if the code is a number without calling the ucisnumber() 1267 * macro before calling this function. 1268 */ 1269 num.numerator = num.denominator = -111; 1270 1271 (void) ucnumber_lookup(code, &num); 1272 1273 return num; 1274} 1275 1276int 1277ucgetdigit(ac_uint4 code) 1278{ 1279 int dig; 1280 1281 /* 1282 * Initialize with some arbitrary value, because the caller simply cannot 1283 * tell for sure if the code is a number without calling the ucisdigit() 1284 * macro before calling this function. 1285 */ 1286 dig = -111; 1287 1288 (void) ucdigit_lookup(code, &dig); 1289 1290 return dig; 1291} 1292 1293/************************************************************************** 1294 * 1295 * Setup and cleanup routines. 1296 * 1297 **************************************************************************/ 1298 1299#if HARDCODE_DATA 1300int ucdata_load(char *paths, int masks) { return 0; } 1301void ucdata_unload(int masks) { } 1302int ucdata_reload(char *paths, int masks) { return 0; } 1303#else 1304/* 1305 * Return 0 if okay, negative on error 1306 */ 1307int 1308ucdata_load(char *paths, int masks) 1309{ 1310 int error = 0; 1311 1312 if (masks & UCDATA_CTYPE) 1313 error |= _ucprop_load(paths, 0) < 0 ? UCDATA_CTYPE : 0; 1314 if (masks & UCDATA_CASE) 1315 error |= _uccase_load(paths, 0) < 0 ? UCDATA_CASE : 0; 1316 if (masks & UCDATA_DECOMP) 1317 error |= _ucdcmp_load(paths, 0) < 0 ? UCDATA_DECOMP : 0; 1318 if (masks & UCDATA_CMBCL) 1319 error |= _uccmcl_load(paths, 0) < 0 ? UCDATA_CMBCL : 0; 1320 if (masks & UCDATA_NUM) 1321 error |= _ucnumb_load(paths, 0) < 0 ? UCDATA_NUM : 0; 1322 if (masks & UCDATA_COMP) 1323 error |= _uccomp_load(paths, 0) < 0 ? UCDATA_COMP : 0; 1324 if (masks & UCDATA_KDECOMP) 1325 error |= _uckdcmp_load(paths, 0) < 0 ? UCDATA_KDECOMP : 0; 1326 1327 return -error; 1328} 1329 1330void 1331ucdata_unload(int masks) 1332{ 1333 if (masks & UCDATA_CTYPE) 1334 _ucprop_unload(); 1335 if (masks & UCDATA_CASE) 1336 _uccase_unload(); 1337 if (masks & UCDATA_DECOMP) 1338 _ucdcmp_unload(); 1339 if (masks & UCDATA_CMBCL) 1340 _uccmcl_unload(); 1341 if (masks & UCDATA_NUM) 1342 _ucnumb_unload(); 1343 if (masks & UCDATA_COMP) 1344 _uccomp_unload(); 1345 if (masks & UCDATA_KDECOMP) 1346 _uckdcmp_unload(); 1347} 1348 1349/* 1350 * Return 0 if okay, negative on error 1351 */ 1352int 1353ucdata_reload(char *paths, int masks) 1354{ 1355 int error = 0; 1356 1357 if (masks & UCDATA_CTYPE) 1358 error |= _ucprop_load(paths, 1) < 0 ? UCDATA_CTYPE : 0; 1359 if (masks & UCDATA_CASE) 1360 error |= _uccase_load(paths, 1) < 0 ? UCDATA_CASE : 0; 1361 if (masks & UCDATA_DECOMP) 1362 error |= _ucdcmp_load(paths, 1) < 0 ? UCDATA_DECOMP : 0; 1363 if (masks & UCDATA_CMBCL) 1364 error |= _uccmcl_load(paths, 1) < 0 ? UCDATA_CMBCL : 0; 1365 if (masks & UCDATA_NUM) 1366 error |= _ucnumb_load(paths, 1) < 0 ? UCDATA_NUM : 0; 1367 if (masks & UCDATA_COMP) 1368 error |= _uccomp_load(paths, 1) < 0 ? UCDATA_COMP : 0; 1369 if (masks & UCDATA_KDECOMP) 1370 error |= _uckdcmp_load(paths, 1) < 0 ? UCDATA_KDECOMP : 0; 1371 1372 return -error; 1373} 1374#endif 1375 1376#ifdef TEST 1377 1378void 1379main(void) 1380{ 1381 int dig; 1382 ac_uint4 i, lo, *dec; 1383 struct ucnumber num; 1384 1385/* ucdata_setup("."); */ 1386 1387 if (ucisweak(0x30)) 1388 printf("WEAK\n"); 1389 else 1390 printf("NOT WEAK\n"); 1391 1392 printf("LOWER 0x%04lX\n", uctolower(0xff3a)); 1393 printf("UPPER 0x%04lX\n", uctoupper(0xff5a)); 1394 1395 if (ucisalpha(0x1d5)) 1396 printf("ALPHA\n"); 1397 else 1398 printf("NOT ALPHA\n"); 1399 1400 if (ucisupper(0x1d5)) { 1401 printf("UPPER\n"); 1402 lo = uctolower(0x1d5); 1403 printf("0x%04lx\n", lo); 1404 lo = uctotitle(0x1d5); 1405 printf("0x%04lx\n", lo); 1406 } else 1407 printf("NOT UPPER\n"); 1408 1409 if (ucistitle(0x1d5)) 1410 printf("TITLE\n"); 1411 else 1412 printf("NOT TITLE\n"); 1413 1414 if (uciscomposite(0x1d5)) 1415 printf("COMPOSITE\n"); 1416 else 1417 printf("NOT COMPOSITE\n"); 1418 1419 if (ucdecomp(0x1d5, &lo, &dec)) { 1420 for (i = 0; i < lo; i++) 1421 printf("0x%04lx ", dec[i]); 1422 putchar('\n'); 1423 } 1424 1425 if ((lo = uccombining_class(0x41)) != 0) 1426 printf("0x41 CCL %ld\n", lo); 1427 1428 if (ucisxdigit(0xfeff)) 1429 printf("0xFEFF HEX DIGIT\n"); 1430 else 1431 printf("0xFEFF NOT HEX DIGIT\n"); 1432 1433 if (ucisdefined(0x10000)) 1434 printf("0x10000 DEFINED\n"); 1435 else 1436 printf("0x10000 NOT DEFINED\n"); 1437 1438 if (ucnumber_lookup(0x30, &num)) { 1439 if (num.denominator != 1) 1440 printf("UCNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator); 1441 else 1442 printf("UCNUMBER: 0x30 = %d\n", num.numerator); 1443 } else 1444 printf("UCNUMBER: 0x30 NOT A NUMBER\n"); 1445 1446 if (ucnumber_lookup(0xbc, &num)) { 1447 if (num.denominator != 1) 1448 printf("UCNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator); 1449 else 1450 printf("UCNUMBER: 0xbc = %d\n", num.numerator); 1451 } else 1452 printf("UCNUMBER: 0xbc NOT A NUMBER\n"); 1453 1454 1455 if (ucnumber_lookup(0xff19, &num)) { 1456 if (num.denominator != 1) 1457 printf("UCNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator); 1458 else 1459 printf("UCNUMBER: 0xff19 = %d\n", num.numerator); 1460 } else 1461 printf("UCNUMBER: 0xff19 NOT A NUMBER\n"); 1462 1463 if (ucnumber_lookup(0x4e00, &num)) { 1464 if (num.denominator != 1) 1465 printf("UCNUMBER: 0x4e00 = %d/%d\n", num.numerator, num.denominator); 1466 else 1467 printf("UCNUMBER: 0x4e00 = %d\n", num.numerator); 1468 } else 1469 printf("UCNUMBER: 0x4e00 NOT A NUMBER\n"); 1470 1471 if (ucdigit_lookup(0x06f9, &dig)) 1472 printf("UCDIGIT: 0x6f9 = %d\n", dig); 1473 else 1474 printf("UCDIGIT: 0x6f9 NOT A NUMBER\n"); 1475 1476 dig = ucgetdigit(0x0969); 1477 printf("UCGETDIGIT: 0x969 = %d\n", dig); 1478 1479 num = ucgetnumber(0x30); 1480 if (num.denominator != 1) 1481 printf("UCGETNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator); 1482 else 1483 printf("UCGETNUMBER: 0x30 = %d\n", num.numerator); 1484 1485 num = ucgetnumber(0xbc); 1486 if (num.denominator != 1) 1487 printf("UCGETNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator); 1488 else 1489 printf("UCGETNUMBER: 0xbc = %d\n", num.numerator); 1490 1491 num = ucgetnumber(0xff19); 1492 if (num.denominator != 1) 1493 printf("UCGETNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator); 1494 else 1495 printf("UCGETNUMBER: 0xff19 = %d\n", num.numerator); 1496 1497/* ucdata_cleanup(); */ 1498 exit(0); 1499} 1500 1501#endif /* TEST */ 1502