1/* $NetBSD: ucdata.c,v 1.1.1.3 2010/12/12 15:21:56 adam Exp $ */ 2 3/* OpenLDAP: pkg/ldap/libraries/liblunicode/ucdata/ucdata.c,v 1.32.2.5 2010/04/13 20:23:04 kurt Exp */ 4/* This work is part of OpenLDAP Software <http://www.openldap.org/>. 5 * 6 * Copyright 1998-2010 The OpenLDAP Foundation. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted only as authorized by the OpenLDAP 11 * Public License. 12 * 13 * A copy of this license is available in file LICENSE in the 14 * top-level directory of the distribution or, alternatively, at 15 * <http://www.OpenLDAP.org/license.html>. 16 */ 17/* Copyright 2001 Computing Research Labs, New Mexico State University 18 * 19 * Permission is hereby granted, free of charge, to any person obtaining a 20 * copy of this software and associated documentation files (the "Software"), 21 * to deal in the Software without restriction, including without limitation 22 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 23 * and/or sell copies of the Software, and to permit persons to whom the 24 * Software is furnished to do so, subject to the following conditions: 25 * 26 * The above copyright notice and this permission notice shall be included in 27 * all copies or substantial portions of the Software. 28 * 29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 32 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 33 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 34 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 35 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 36 */ 37/* Id: ucdata.c,v 1.4 2001/01/02 18:46:20 mleisher Exp" */ 38 39#include "portable.h" 40#include "ldap_config.h" 41 42#include <stdio.h> 43#include <ac/stdlib.h> 44#include <ac/string.h> 45#include <ac/unistd.h> 46 47#include <ac/bytes.h> 48 49#include "lber_pvt.h" 50#include "ucdata.h" 51 52#ifndef HARDCODE_DATA 53#define HARDCODE_DATA 1 54#endif 55 56#if HARDCODE_DATA 57#include "uctable.h" 58#endif 59 60/************************************************************************** 61 * 62 * Miscellaneous types, data, and support functions. 63 * 64 **************************************************************************/ 65 66typedef struct { 67 ac_uint2 bom; 68 ac_uint2 cnt; 69 union { 70 ac_uint4 bytes; 71 ac_uint2 len[2]; 72 } size; 73} _ucheader_t; 74 75/* 76 * A simple array of 32-bit masks for lookup. 77 */ 78static ac_uint4 masks32[32] = { 79 0x00000001UL, 0x00000002UL, 0x00000004UL, 0x00000008UL, 80 0x00000010UL, 0x00000020UL, 0x00000040UL, 0x00000080UL, 81 0x00000100UL, 0x00000200UL, 0x00000400UL, 0x00000800UL, 82 0x00001000UL, 0x00002000UL, 0x00004000UL, 0x00008000UL, 83 0x00010000UL, 0x00020000UL, 0x00040000UL, 0x00080000UL, 84 0x00100000UL, 0x00200000UL, 0x00400000UL, 0x00800000UL, 85 0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL, 86 0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL 87}; 88 89#define endian_short(cc) (((cc) >> 8) | (((cc) & 0xff) << 8)) 90#define endian_long(cc) ((((cc) & 0xff) << 24)|((((cc) >> 8) & 0xff) << 16)|\ 91 ((((cc) >> 16) & 0xff) << 8)|((cc) >> 24)) 92 93#if !HARDCODE_DATA 94static FILE * 95_ucopenfile(char *paths, char *filename, char *mode) 96{ 97 FILE *f; 98 char *fp, *dp, *pp, path[BUFSIZ]; 99 100 if (filename == 0 || *filename == 0) 101 return 0; 102 103 dp = paths; 104 while (dp && *dp) { 105 pp = path; 106 while (*dp && *dp != ':') 107 *pp++ = *dp++; 108 *pp++ = *LDAP_DIRSEP; 109 110 fp = filename; 111 while (*fp) 112 *pp++ = *fp++; 113 *pp = 0; 114 115 if ((f = fopen(path, mode)) != 0) 116 return f; 117 118 if (*dp == ':') 119 dp++; 120 } 121 122 return 0; 123} 124#endif 125 126/************************************************************************** 127 * 128 * Support for the character properties. 129 * 130 **************************************************************************/ 131 132#if !HARDCODE_DATA 133 134static ac_uint4 _ucprop_size; 135static ac_uint2 *_ucprop_offsets; 136static ac_uint4 *_ucprop_ranges; 137 138/* 139 * Return -1 on error, 0 if okay 140 */ 141static int 142_ucprop_load(char *paths, int reload) 143{ 144 FILE *in; 145 ac_uint4 size, i; 146 _ucheader_t hdr; 147 148 if (_ucprop_size > 0) { 149 if (!reload) 150 /* 151 * The character properties have already been loaded. 152 */ 153 return 0; 154 155 /* 156 * Unload the current character property data in preparation for 157 * loading a new copy. Only the first array has to be deallocated 158 * because all the memory for the arrays is allocated as a single 159 * block. 160 */ 161 free((char *) _ucprop_offsets); 162 _ucprop_size = 0; 163 } 164 165 if ((in = _ucopenfile(paths, "ctype.dat", "rb")) == 0) 166 return -1; 167 168 /* 169 * Load the header. 170 */ 171 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 172 173 if (hdr.bom == 0xfffe) { 174 hdr.cnt = endian_short(hdr.cnt); 175 hdr.size.bytes = endian_long(hdr.size.bytes); 176 } 177 178 if ((_ucprop_size = hdr.cnt) == 0) { 179 fclose(in); 180 return -1; 181 } 182 183 /* 184 * Allocate all the storage needed for the lookup table. 185 */ 186 _ucprop_offsets = (ac_uint2 *) malloc(hdr.size.bytes); 187 188 /* 189 * Calculate the offset into the storage for the ranges. The offsets 190 * array is on a 4-byte boundary and one larger than the value provided in 191 * the header count field. This means the offset to the ranges must be 192 * calculated after aligning the count to a 4-byte boundary. 193 */ 194 if ((size = ((hdr.cnt + 1) * sizeof(ac_uint2))) & 3) 195 size += 4 - (size & 3); 196 size >>= 1; 197 _ucprop_ranges = (ac_uint4 *) (_ucprop_offsets + size); 198 199 /* 200 * Load the offset array. 201 */ 202 fread((char *) _ucprop_offsets, sizeof(ac_uint2), size, in); 203 204 /* 205 * Do an endian swap if necessary. Don't forget there is an extra node on 206 * the end with the final index. 207 */ 208 if (hdr.bom == 0xfffe) { 209 for (i = 0; i <= _ucprop_size; i++) 210 _ucprop_offsets[i] = endian_short(_ucprop_offsets[i]); 211 } 212 213 /* 214 * Load the ranges. The number of elements is in the last array position 215 * of the offsets. 216 */ 217 fread((char *) _ucprop_ranges, sizeof(ac_uint4), 218 _ucprop_offsets[_ucprop_size], in); 219 220 fclose(in); 221 222 /* 223 * Do an endian swap if necessary. 224 */ 225 if (hdr.bom == 0xfffe) { 226 for (i = 0; i < _ucprop_offsets[_ucprop_size]; i++) 227 _ucprop_ranges[i] = endian_long(_ucprop_ranges[i]); 228 } 229 return 0; 230} 231 232static void 233_ucprop_unload(void) 234{ 235 if (_ucprop_size == 0) 236 return; 237 238 /* 239 * Only need to free the offsets because the memory is allocated as a 240 * single block. 241 */ 242 free((char *) _ucprop_offsets); 243 _ucprop_size = 0; 244} 245#endif 246 247static int 248_ucprop_lookup(ac_uint4 code, ac_uint4 n) 249{ 250 long l, r, m; 251 252 if (_ucprop_size == 0) 253 return 0; 254 255 /* 256 * There is an extra node on the end of the offsets to allow this routine 257 * to work right. If the index is 0xffff, then there are no nodes for the 258 * property. 259 */ 260 if ((l = _ucprop_offsets[n]) == 0xffff) 261 return 0; 262 263 /* 264 * Locate the next offset that is not 0xffff. The sentinel at the end of 265 * the array is the max index value. 266 */ 267 for (m = 1; 268 n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++) ; 269 270 r = _ucprop_offsets[n + m] - 1; 271 272 while (l <= r) { 273 /* 274 * Determine a "mid" point and adjust to make sure the mid point is at 275 * the beginning of a range pair. 276 */ 277 m = (l + r) >> 1; 278 m -= (m & 1); 279 if (code > _ucprop_ranges[m + 1]) 280 l = m + 2; 281 else if (code < _ucprop_ranges[m]) 282 r = m - 2; 283 else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1]) 284 return 1; 285 } 286 return 0; 287} 288 289int 290ucisprop(ac_uint4 code, ac_uint4 mask1, ac_uint4 mask2) 291{ 292 ac_uint4 i; 293 294 if (mask1 == 0 && mask2 == 0) 295 return 0; 296 297 for (i = 0; mask1 && i < 32; i++) { 298 if ((mask1 & masks32[i]) && _ucprop_lookup(code, i)) 299 return 1; 300 } 301 302 for (i = 32; mask2 && i < _ucprop_size; i++) { 303 if ((mask2 & masks32[i & 31]) && _ucprop_lookup(code, i)) 304 return 1; 305 } 306 307 return 0; 308} 309 310/************************************************************************** 311 * 312 * Support for case mapping. 313 * 314 **************************************************************************/ 315 316#if !HARDCODE_DATA 317 318/* These record the number of slots in the map. 319 * There are 3 words per slot. 320 */ 321static ac_uint4 _uccase_size; 322static ac_uint2 _uccase_len[2]; 323static ac_uint4 *_uccase_map; 324 325/* 326 * Return -1 on error, 0 if okay 327 */ 328static int 329_uccase_load(char *paths, int reload) 330{ 331 FILE *in; 332 ac_uint4 i; 333 _ucheader_t hdr; 334 335 if (_uccase_size > 0) { 336 if (!reload) 337 /* 338 * The case mappings have already been loaded. 339 */ 340 return 0; 341 342 free((char *) _uccase_map); 343 _uccase_size = 0; 344 } 345 346 if ((in = _ucopenfile(paths, "case.dat", "rb")) == 0) 347 return -1; 348 349 /* 350 * Load the header. 351 */ 352 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 353 354 if (hdr.bom == 0xfffe) { 355 hdr.cnt = endian_short(hdr.cnt); 356 hdr.size.len[0] = endian_short(hdr.size.len[0]); 357 hdr.size.len[1] = endian_short(hdr.size.len[1]); 358 } 359 360 /* 361 * Set the node count and lengths of the upper and lower case mapping 362 * tables. 363 */ 364 _uccase_size = hdr.cnt; 365 _uccase_len[0] = hdr.size.len[0]; 366 _uccase_len[1] = hdr.size.len[1]; 367 368 _uccase_map = (ac_uint4 *) 369 malloc(_uccase_size * 3 * sizeof(ac_uint4)); 370 371 /* 372 * Load the case mapping table. 373 */ 374 fread((char *) _uccase_map, sizeof(ac_uint4), _uccase_size * 3, in); 375 376 /* 377 * Do an endian swap if necessary. 378 */ 379 if (hdr.bom == 0xfffe) { 380 for (i = 0; i < _uccase_size * 3; i++) 381 _uccase_map[i] = endian_long(_uccase_map[i]); 382 } 383 fclose(in); 384 return 0; 385} 386 387static void 388_uccase_unload(void) 389{ 390 if (_uccase_size == 0) 391 return; 392 393 free((char *) _uccase_map); 394 _uccase_size = 0; 395} 396#endif 397 398static ac_uint4 399_uccase_lookup(ac_uint4 code, long l, long r, int field) 400{ 401 long m; 402 const ac_uint4 *tmp; 403 404 /* 405 * Do the binary search. 406 */ 407 while (l <= r) { 408 /* 409 * Determine a "mid" point and adjust to make sure the mid point is at 410 * the beginning of a case mapping triple. 411 */ 412 m = (l + r) >> 1; 413 tmp = &_uccase_map[m*3]; 414 if (code > *tmp) 415 l = m + 1; 416 else if (code < *tmp) 417 r = m - 1; 418 else if (code == *tmp) 419 return tmp[field]; 420 } 421 422 return code; 423} 424 425ac_uint4 426uctoupper(ac_uint4 code) 427{ 428 int field; 429 long l, r; 430 431 if (ucisupper(code)) 432 return code; 433 434 if (ucislower(code)) { 435 /* 436 * The character is lower case. 437 */ 438 field = 2; 439 l = _uccase_len[0]; 440 r = (l + _uccase_len[1]) - 1; 441 } else { 442 /* 443 * The character is title case. 444 */ 445 field = 1; 446 l = _uccase_len[0] + _uccase_len[1]; 447 r = _uccase_size - 1; 448 } 449 return _uccase_lookup(code, l, r, field); 450} 451 452ac_uint4 453uctolower(ac_uint4 code) 454{ 455 int field; 456 long l, r; 457 458 if (ucislower(code)) 459 return code; 460 461 if (ucisupper(code)) { 462 /* 463 * The character is upper case. 464 */ 465 field = 1; 466 l = 0; 467 r = _uccase_len[0] - 1; 468 } else { 469 /* 470 * The character is title case. 471 */ 472 field = 2; 473 l = _uccase_len[0] + _uccase_len[1]; 474 r = _uccase_size - 1; 475 } 476 return _uccase_lookup(code, l, r, field); 477} 478 479ac_uint4 480uctotitle(ac_uint4 code) 481{ 482 int field; 483 long l, r; 484 485 if (ucistitle(code)) 486 return code; 487 488 /* 489 * The offset will always be the same for converting to title case. 490 */ 491 field = 2; 492 493 if (ucisupper(code)) { 494 /* 495 * The character is upper case. 496 */ 497 l = 0; 498 r = _uccase_len[0] - 1; 499 } else { 500 /* 501 * The character is lower case. 502 */ 503 l = _uccase_len[0]; 504 r = (l + _uccase_len[1]) - 1; 505 } 506 return _uccase_lookup(code, l, r, field); 507} 508 509/************************************************************************** 510 * 511 * Support for compositions. 512 * 513 **************************************************************************/ 514 515#if !HARDCODE_DATA 516 517static ac_uint4 _uccomp_size; 518static ac_uint4 *_uccomp_data; 519 520/* 521 * Return -1 on error, 0 if okay 522 */ 523static int 524_uccomp_load(char *paths, int reload) 525{ 526 FILE *in; 527 ac_uint4 size, i; 528 _ucheader_t hdr; 529 530 if (_uccomp_size > 0) { 531 if (!reload) 532 /* 533 * The compositions have already been loaded. 534 */ 535 return 0; 536 537 free((char *) _uccomp_data); 538 _uccomp_size = 0; 539 } 540 541 if ((in = _ucopenfile(paths, "comp.dat", "rb")) == 0) 542 return -1; 543 544 /* 545 * Load the header. 546 */ 547 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 548 549 if (hdr.bom == 0xfffe) { 550 hdr.cnt = endian_short(hdr.cnt); 551 hdr.size.bytes = endian_long(hdr.size.bytes); 552 } 553 554 _uccomp_size = hdr.cnt; 555 _uccomp_data = (ac_uint4 *) malloc(hdr.size.bytes); 556 557 /* 558 * Read the composition data in. 559 */ 560 size = hdr.size.bytes / sizeof(ac_uint4); 561 fread((char *) _uccomp_data, sizeof(ac_uint4), size, in); 562 563 /* 564 * Do an endian swap if necessary. 565 */ 566 if (hdr.bom == 0xfffe) { 567 for (i = 0; i < size; i++) 568 _uccomp_data[i] = endian_long(_uccomp_data[i]); 569 } 570 571 /* 572 * Assume that the data is ordered on count, so that all compositions 573 * of length 2 come first. Only handling length 2 for now. 574 */ 575 for (i = 1; i < size; i += 4) 576 if (_uccomp_data[i] != 2) 577 break; 578 _uccomp_size = i - 1; 579 580 fclose(in); 581 return 0; 582} 583 584static void 585_uccomp_unload(void) 586{ 587 if (_uccomp_size == 0) 588 return; 589 590 free((char *) _uccomp_data); 591 _uccomp_size = 0; 592} 593#endif 594 595int 596uccomp(ac_uint4 node1, ac_uint4 node2, ac_uint4 *comp) 597{ 598 int l, r, m; 599 600 l = 0; 601 r = _uccomp_size - 1; 602 603 while (l <= r) { 604 m = ((r + l) >> 1); 605 m -= m & 3; 606 if (node1 > _uccomp_data[m+2]) 607 l = m + 4; 608 else if (node1 < _uccomp_data[m+2]) 609 r = m - 4; 610 else if (node2 > _uccomp_data[m+3]) 611 l = m + 4; 612 else if (node2 < _uccomp_data[m+3]) 613 r = m - 4; 614 else { 615 *comp = _uccomp_data[m]; 616 return 1; 617 } 618 } 619 return 0; 620} 621 622int 623uccomp_hangul(ac_uint4 *str, int len) 624{ 625 const int SBase = 0xAC00, LBase = 0x1100, 626 VBase = 0x1161, TBase = 0x11A7, 627 LCount = 19, VCount = 21, TCount = 28, 628 NCount = VCount * TCount, /* 588 */ 629 SCount = LCount * NCount; /* 11172 */ 630 631 int i, rlen; 632 ac_uint4 ch, last, lindex, sindex; 633 634 last = str[0]; 635 rlen = 1; 636 for ( i = 1; i < len; i++ ) { 637 ch = str[i]; 638 639 /* check if two current characters are L and V */ 640 lindex = last - LBase; 641 if (lindex < (ac_uint4) LCount) { 642 ac_uint4 vindex = ch - VBase; 643 if (vindex < (ac_uint4) VCount) { 644 /* make syllable of form LV */ 645 last = SBase + (lindex * VCount + vindex) * TCount; 646 str[rlen-1] = last; /* reset last */ 647 continue; 648 } 649 } 650 651 /* check if two current characters are LV and T */ 652 sindex = last - SBase; 653 if (sindex < (ac_uint4) SCount 654 && (sindex % TCount) == 0) 655 { 656 ac_uint4 tindex = ch - TBase; 657 if (tindex <= (ac_uint4) TCount) { 658 /* make syllable of form LVT */ 659 last += tindex; 660 str[rlen-1] = last; /* reset last */ 661 continue; 662 } 663 } 664 665 /* if neither case was true, just add the character */ 666 last = ch; 667 str[rlen] = ch; 668 rlen++; 669 } 670 return rlen; 671} 672 673int 674uccanoncomp(ac_uint4 *str, int len) 675{ 676 int i, stpos, copos; 677 ac_uint4 cl, prevcl, st, ch, co; 678 679 st = str[0]; 680 stpos = 0; 681 copos = 1; 682 prevcl = uccombining_class(st) == 0 ? 0 : 256; 683 684 for (i = 1; i < len; i++) { 685 ch = str[i]; 686 cl = uccombining_class(ch); 687 if (uccomp(st, ch, &co) && (prevcl < cl || prevcl == 0)) 688 st = str[stpos] = co; 689 else { 690 if (cl == 0) { 691 stpos = copos; 692 st = ch; 693 } 694 prevcl = cl; 695 str[copos++] = ch; 696 } 697 } 698 699 return uccomp_hangul(str, copos); 700} 701 702/************************************************************************** 703 * 704 * Support for decompositions. 705 * 706 **************************************************************************/ 707 708#if !HARDCODE_DATA 709 710static ac_uint4 _ucdcmp_size; 711static ac_uint4 *_ucdcmp_nodes; 712static ac_uint4 *_ucdcmp_decomp; 713 714static ac_uint4 _uckdcmp_size; 715static ac_uint4 *_uckdcmp_nodes; 716static ac_uint4 *_uckdcmp_decomp; 717 718/* 719 * Return -1 on error, 0 if okay 720 */ 721static int 722_ucdcmp_load(char *paths, int reload) 723{ 724 FILE *in; 725 ac_uint4 size, i; 726 _ucheader_t hdr; 727 728 if (_ucdcmp_size > 0) { 729 if (!reload) 730 /* 731 * The decompositions have already been loaded. 732 */ 733 return 0; 734 735 free((char *) _ucdcmp_nodes); 736 _ucdcmp_size = 0; 737 } 738 739 if ((in = _ucopenfile(paths, "decomp.dat", "rb")) == 0) 740 return -1; 741 742 /* 743 * Load the header. 744 */ 745 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 746 747 if (hdr.bom == 0xfffe) { 748 hdr.cnt = endian_short(hdr.cnt); 749 hdr.size.bytes = endian_long(hdr.size.bytes); 750 } 751 752 _ucdcmp_size = hdr.cnt << 1; 753 _ucdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 754 _ucdcmp_decomp = _ucdcmp_nodes + (_ucdcmp_size + 1); 755 756 /* 757 * Read the decomposition data in. 758 */ 759 size = hdr.size.bytes / sizeof(ac_uint4); 760 fread((char *) _ucdcmp_nodes, sizeof(ac_uint4), size, in); 761 762 /* 763 * Do an endian swap if necessary. 764 */ 765 if (hdr.bom == 0xfffe) { 766 for (i = 0; i < size; i++) 767 _ucdcmp_nodes[i] = endian_long(_ucdcmp_nodes[i]); 768 } 769 fclose(in); 770 return 0; 771} 772 773/* 774 * Return -1 on error, 0 if okay 775 */ 776static int 777_uckdcmp_load(char *paths, int reload) 778{ 779 FILE *in; 780 ac_uint4 size, i; 781 _ucheader_t hdr; 782 783 if (_uckdcmp_size > 0) { 784 if (!reload) 785 /* 786 * The decompositions have already been loaded. 787 */ 788 return 0; 789 790 free((char *) _uckdcmp_nodes); 791 _uckdcmp_size = 0; 792 } 793 794 if ((in = _ucopenfile(paths, "kdecomp.dat", "rb")) == 0) 795 return -1; 796 797 /* 798 * Load the header. 799 */ 800 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 801 802 if (hdr.bom == 0xfffe) { 803 hdr.cnt = endian_short(hdr.cnt); 804 hdr.size.bytes = endian_long(hdr.size.bytes); 805 } 806 807 _uckdcmp_size = hdr.cnt << 1; 808 _uckdcmp_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 809 _uckdcmp_decomp = _uckdcmp_nodes + (_uckdcmp_size + 1); 810 811 /* 812 * Read the decomposition data in. 813 */ 814 size = hdr.size.bytes / sizeof(ac_uint4); 815 fread((char *) _uckdcmp_nodes, sizeof(ac_uint4), size, in); 816 817 /* 818 * Do an endian swap if necessary. 819 */ 820 if (hdr.bom == 0xfffe) { 821 for (i = 0; i < size; i++) 822 _uckdcmp_nodes[i] = endian_long(_uckdcmp_nodes[i]); 823 } 824 fclose(in); 825 return 0; 826} 827 828static void 829_ucdcmp_unload(void) 830{ 831 if (_ucdcmp_size == 0) 832 return; 833 834 /* 835 * Only need to free the offsets because the memory is allocated as a 836 * single block. 837 */ 838 free((char *) _ucdcmp_nodes); 839 _ucdcmp_size = 0; 840} 841 842static void 843_uckdcmp_unload(void) 844{ 845 if (_uckdcmp_size == 0) 846 return; 847 848 /* 849 * Only need to free the offsets because the memory is allocated as a 850 * single block. 851 */ 852 free((char *) _uckdcmp_nodes); 853 _uckdcmp_size = 0; 854} 855#endif 856 857int 858ucdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp) 859{ 860 long l, r, m; 861 862 if (code < _ucdcmp_nodes[0]) { 863 return 0; 864 } 865 866 l = 0; 867 r = _ucdcmp_nodes[_ucdcmp_size] - 1; 868 869 while (l <= r) { 870 /* 871 * Determine a "mid" point and adjust to make sure the mid point is at 872 * the beginning of a code+offset pair. 873 */ 874 m = (l + r) >> 1; 875 m -= (m & 1); 876 if (code > _ucdcmp_nodes[m]) 877 l = m + 2; 878 else if (code < _ucdcmp_nodes[m]) 879 r = m - 2; 880 else if (code == _ucdcmp_nodes[m]) { 881 *num = _ucdcmp_nodes[m + 3] - _ucdcmp_nodes[m + 1]; 882 *decomp = (ac_uint4*)&_ucdcmp_decomp[_ucdcmp_nodes[m + 1]]; 883 return 1; 884 } 885 } 886 return 0; 887} 888 889int 890uckdecomp(ac_uint4 code, ac_uint4 *num, ac_uint4 **decomp) 891{ 892 long l, r, m; 893 894 if (code < _uckdcmp_nodes[0]) { 895 return 0; 896 } 897 898 l = 0; 899 r = _uckdcmp_nodes[_uckdcmp_size] - 1; 900 901 while (l <= r) { 902 /* 903 * Determine a "mid" point and adjust to make sure the mid point is at 904 * the beginning of a code+offset pair. 905 */ 906 m = (l + r) >> 1; 907 m -= (m & 1); 908 if (code > _uckdcmp_nodes[m]) 909 l = m + 2; 910 else if (code < _uckdcmp_nodes[m]) 911 r = m - 2; 912 else if (code == _uckdcmp_nodes[m]) { 913 *num = _uckdcmp_nodes[m + 3] - _uckdcmp_nodes[m + 1]; 914 *decomp = (ac_uint4*)&_uckdcmp_decomp[_uckdcmp_nodes[m + 1]]; 915 return 1; 916 } 917 } 918 return 0; 919} 920 921int 922ucdecomp_hangul(ac_uint4 code, ac_uint4 *num, ac_uint4 decomp[]) 923{ 924 if (!ucishangul(code)) 925 return 0; 926 927 code -= 0xac00; 928 decomp[0] = 0x1100 + (ac_uint4) (code / 588); 929 decomp[1] = 0x1161 + (ac_uint4) ((code % 588) / 28); 930 decomp[2] = 0x11a7 + (ac_uint4) (code % 28); 931 *num = (decomp[2] != 0x11a7) ? 3 : 2; 932 933 return 1; 934} 935 936/* mode == 0 for canonical, mode == 1 for compatibility */ 937static int 938uccanoncompatdecomp(const ac_uint4 *in, int inlen, 939 ac_uint4 **out, int *outlen, short mode, void *ctx) 940{ 941 int l, size; 942 unsigned i, j, k; 943 ac_uint4 num, class, *decomp, hangdecomp[3]; 944 945 size = inlen * 2; 946 *out = (ac_uint4 *) ber_memalloc_x(size * sizeof(**out), ctx); 947 if (*out == NULL) 948 return *outlen = -1; 949 950 i = 0; 951 for (j = 0; j < (unsigned) inlen; j++) { 952 if (mode ? uckdecomp(in[j], &num, &decomp) : ucdecomp(in[j], &num, &decomp)) { 953 if ( size - i < num) { 954 size = inlen + i - j + num - 1; 955 *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx ); 956 if (*out == NULL) 957 return *outlen = -1; 958 } 959 for (k = 0; k < num; k++) { 960 class = uccombining_class(decomp[k]); 961 if (class == 0) { 962 (*out)[i] = decomp[k]; 963 } else { 964 for (l = i; l > 0; l--) 965 if (class >= uccombining_class((*out)[l-1])) 966 break; 967 AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out)); 968 (*out)[l] = decomp[k]; 969 } 970 i++; 971 } 972 } else if (ucdecomp_hangul(in[j], &num, hangdecomp)) { 973 if (size - i < num) { 974 size = inlen + i - j + num - 1; 975 *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx); 976 if (*out == NULL) 977 return *outlen = -1; 978 } 979 for (k = 0; k < num; k++) { 980 (*out)[i] = hangdecomp[k]; 981 i++; 982 } 983 } else { 984 if (size - i < 1) { 985 size = inlen + i - j; 986 *out = (ac_uint4 *) ber_memrealloc_x(*out, size * sizeof(**out), ctx); 987 if (*out == NULL) 988 return *outlen = -1; 989 } 990 class = uccombining_class(in[j]); 991 if (class == 0) { 992 (*out)[i] = in[j]; 993 } else { 994 for (l = i; l > 0; l--) 995 if (class >= uccombining_class((*out)[l-1])) 996 break; 997 AC_MEMCPY(*out + l + 1, *out + l, (i - l) * sizeof(**out)); 998 (*out)[l] = in[j]; 999 } 1000 i++; 1001 } 1002 } 1003 return *outlen = i; 1004} 1005 1006int 1007uccanondecomp(const ac_uint4 *in, int inlen, 1008 ac_uint4 **out, int *outlen, void *ctx) 1009{ 1010 return uccanoncompatdecomp(in, inlen, out, outlen, 0, ctx); 1011} 1012 1013int 1014uccompatdecomp(const ac_uint4 *in, int inlen, 1015 ac_uint4 **out, int *outlen, void *ctx) 1016{ 1017 return uccanoncompatdecomp(in, inlen, out, outlen, 1, ctx); 1018} 1019 1020/************************************************************************** 1021 * 1022 * Support for combining classes. 1023 * 1024 **************************************************************************/ 1025 1026#if !HARDCODE_DATA 1027static ac_uint4 _uccmcl_size; 1028static ac_uint4 *_uccmcl_nodes; 1029 1030/* 1031 * Return -1 on error, 0 if okay 1032 */ 1033static int 1034_uccmcl_load(char *paths, int reload) 1035{ 1036 FILE *in; 1037 ac_uint4 i; 1038 _ucheader_t hdr; 1039 1040 if (_uccmcl_size > 0) { 1041 if (!reload) 1042 /* 1043 * The combining classes have already been loaded. 1044 */ 1045 return 0; 1046 1047 free((char *) _uccmcl_nodes); 1048 _uccmcl_size = 0; 1049 } 1050 1051 if ((in = _ucopenfile(paths, "cmbcl.dat", "rb")) == 0) 1052 return -1; 1053 1054 /* 1055 * Load the header. 1056 */ 1057 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 1058 1059 if (hdr.bom == 0xfffe) { 1060 hdr.cnt = endian_short(hdr.cnt); 1061 hdr.size.bytes = endian_long(hdr.size.bytes); 1062 } 1063 1064 _uccmcl_size = hdr.cnt * 3; 1065 _uccmcl_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 1066 1067 /* 1068 * Read the combining classes in. 1069 */ 1070 fread((char *) _uccmcl_nodes, sizeof(ac_uint4), _uccmcl_size, in); 1071 1072 /* 1073 * Do an endian swap if necessary. 1074 */ 1075 if (hdr.bom == 0xfffe) { 1076 for (i = 0; i < _uccmcl_size; i++) 1077 _uccmcl_nodes[i] = endian_long(_uccmcl_nodes[i]); 1078 } 1079 fclose(in); 1080 return 0; 1081} 1082 1083static void 1084_uccmcl_unload(void) 1085{ 1086 if (_uccmcl_size == 0) 1087 return; 1088 1089 free((char *) _uccmcl_nodes); 1090 _uccmcl_size = 0; 1091} 1092#endif 1093 1094ac_uint4 1095uccombining_class(ac_uint4 code) 1096{ 1097 long l, r, m; 1098 1099 l = 0; 1100 r = _uccmcl_size - 1; 1101 1102 while (l <= r) { 1103 m = (l + r) >> 1; 1104 m -= (m % 3); 1105 if (code > _uccmcl_nodes[m + 1]) 1106 l = m + 3; 1107 else if (code < _uccmcl_nodes[m]) 1108 r = m - 3; 1109 else if (code >= _uccmcl_nodes[m] && code <= _uccmcl_nodes[m + 1]) 1110 return _uccmcl_nodes[m + 2]; 1111 } 1112 return 0; 1113} 1114 1115/************************************************************************** 1116 * 1117 * Support for numeric values. 1118 * 1119 **************************************************************************/ 1120 1121#if !HARDCODE_DATA 1122static ac_uint4 *_ucnum_nodes; 1123static ac_uint4 _ucnum_size; 1124static short *_ucnum_vals; 1125 1126/* 1127 * Return -1 on error, 0 if okay 1128 */ 1129static int 1130_ucnumb_load(char *paths, int reload) 1131{ 1132 FILE *in; 1133 ac_uint4 size, i; 1134 _ucheader_t hdr; 1135 1136 if (_ucnum_size > 0) { 1137 if (!reload) 1138 /* 1139 * The numbers have already been loaded. 1140 */ 1141 return 0; 1142 1143 free((char *) _ucnum_nodes); 1144 _ucnum_size = 0; 1145 } 1146 1147 if ((in = _ucopenfile(paths, "num.dat", "rb")) == 0) 1148 return -1; 1149 1150 /* 1151 * Load the header. 1152 */ 1153 fread((char *) &hdr, sizeof(_ucheader_t), 1, in); 1154 1155 if (hdr.bom == 0xfffe) { 1156 hdr.cnt = endian_short(hdr.cnt); 1157 hdr.size.bytes = endian_long(hdr.size.bytes); 1158 } 1159 1160 _ucnum_size = hdr.cnt; 1161 _ucnum_nodes = (ac_uint4 *) malloc(hdr.size.bytes); 1162 _ucnum_vals = (short *) (_ucnum_nodes + _ucnum_size); 1163 1164 /* 1165 * Read the combining classes in. 1166 */ 1167 fread((char *) _ucnum_nodes, sizeof(unsigned char), hdr.size.bytes, in); 1168 1169 /* 1170 * Do an endian swap if necessary. 1171 */ 1172 if (hdr.bom == 0xfffe) { 1173 for (i = 0; i < _ucnum_size; i++) 1174 _ucnum_nodes[i] = endian_long(_ucnum_nodes[i]); 1175 1176 /* 1177 * Determine the number of values that have to be adjusted. 1178 */ 1179 size = (hdr.size.bytes - 1180 (_ucnum_size * (sizeof(ac_uint4) << 1))) / 1181 sizeof(short); 1182 1183 for (i = 0; i < size; i++) 1184 _ucnum_vals[i] = endian_short(_ucnum_vals[i]); 1185 } 1186 fclose(in); 1187 return 0; 1188} 1189 1190static void 1191_ucnumb_unload(void) 1192{ 1193 if (_ucnum_size == 0) 1194 return; 1195 1196 free((char *) _ucnum_nodes); 1197 _ucnum_size = 0; 1198} 1199#endif 1200 1201int 1202ucnumber_lookup(ac_uint4 code, struct ucnumber *num) 1203{ 1204 long l, r, m; 1205 short *vp; 1206 1207 l = 0; 1208 r = _ucnum_size - 1; 1209 while (l <= r) { 1210 /* 1211 * Determine a "mid" point and adjust to make sure the mid point is at 1212 * the beginning of a code+offset pair. 1213 */ 1214 m = (l + r) >> 1; 1215 m -= (m & 1); 1216 if (code > _ucnum_nodes[m]) 1217 l = m + 2; 1218 else if (code < _ucnum_nodes[m]) 1219 r = m - 2; 1220 else { 1221 vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1]; 1222 num->numerator = (int) *vp++; 1223 num->denominator = (int) *vp; 1224 return 1; 1225 } 1226 } 1227 return 0; 1228} 1229 1230int 1231ucdigit_lookup(ac_uint4 code, int *digit) 1232{ 1233 long l, r, m; 1234 short *vp; 1235 1236 l = 0; 1237 r = _ucnum_size - 1; 1238 while (l <= r) { 1239 /* 1240 * Determine a "mid" point and adjust to make sure the mid point is at 1241 * the beginning of a code+offset pair. 1242 */ 1243 m = (l + r) >> 1; 1244 m -= (m & 1); 1245 if (code > _ucnum_nodes[m]) 1246 l = m + 2; 1247 else if (code < _ucnum_nodes[m]) 1248 r = m - 2; 1249 else { 1250 vp = (short *)_ucnum_vals + _ucnum_nodes[m + 1]; 1251 if (*vp == *(vp + 1)) { 1252 *digit = *vp; 1253 return 1; 1254 } 1255 return 0; 1256 } 1257 } 1258 return 0; 1259} 1260 1261struct ucnumber 1262ucgetnumber(ac_uint4 code) 1263{ 1264 struct ucnumber num; 1265 1266 /* 1267 * Initialize with some arbitrary value, because the caller simply cannot 1268 * tell for sure if the code is a number without calling the ucisnumber() 1269 * macro before calling this function. 1270 */ 1271 num.numerator = num.denominator = -111; 1272 1273 (void) ucnumber_lookup(code, &num); 1274 1275 return num; 1276} 1277 1278int 1279ucgetdigit(ac_uint4 code) 1280{ 1281 int dig; 1282 1283 /* 1284 * Initialize with some arbitrary value, because the caller simply cannot 1285 * tell for sure if the code is a number without calling the ucisdigit() 1286 * macro before calling this function. 1287 */ 1288 dig = -111; 1289 1290 (void) ucdigit_lookup(code, &dig); 1291 1292 return dig; 1293} 1294 1295/************************************************************************** 1296 * 1297 * Setup and cleanup routines. 1298 * 1299 **************************************************************************/ 1300 1301#if HARDCODE_DATA 1302int ucdata_load(char *paths, int masks) { return 0; } 1303void ucdata_unload(int masks) { } 1304int ucdata_reload(char *paths, int masks) { return 0; } 1305#else 1306/* 1307 * Return 0 if okay, negative on error 1308 */ 1309int 1310ucdata_load(char *paths, int masks) 1311{ 1312 int error = 0; 1313 1314 if (masks & UCDATA_CTYPE) 1315 error |= _ucprop_load(paths, 0) < 0 ? UCDATA_CTYPE : 0; 1316 if (masks & UCDATA_CASE) 1317 error |= _uccase_load(paths, 0) < 0 ? UCDATA_CASE : 0; 1318 if (masks & UCDATA_DECOMP) 1319 error |= _ucdcmp_load(paths, 0) < 0 ? UCDATA_DECOMP : 0; 1320 if (masks & UCDATA_CMBCL) 1321 error |= _uccmcl_load(paths, 0) < 0 ? UCDATA_CMBCL : 0; 1322 if (masks & UCDATA_NUM) 1323 error |= _ucnumb_load(paths, 0) < 0 ? UCDATA_NUM : 0; 1324 if (masks & UCDATA_COMP) 1325 error |= _uccomp_load(paths, 0) < 0 ? UCDATA_COMP : 0; 1326 if (masks & UCDATA_KDECOMP) 1327 error |= _uckdcmp_load(paths, 0) < 0 ? UCDATA_KDECOMP : 0; 1328 1329 return -error; 1330} 1331 1332void 1333ucdata_unload(int masks) 1334{ 1335 if (masks & UCDATA_CTYPE) 1336 _ucprop_unload(); 1337 if (masks & UCDATA_CASE) 1338 _uccase_unload(); 1339 if (masks & UCDATA_DECOMP) 1340 _ucdcmp_unload(); 1341 if (masks & UCDATA_CMBCL) 1342 _uccmcl_unload(); 1343 if (masks & UCDATA_NUM) 1344 _ucnumb_unload(); 1345 if (masks & UCDATA_COMP) 1346 _uccomp_unload(); 1347 if (masks & UCDATA_KDECOMP) 1348 _uckdcmp_unload(); 1349} 1350 1351/* 1352 * Return 0 if okay, negative on error 1353 */ 1354int 1355ucdata_reload(char *paths, int masks) 1356{ 1357 int error = 0; 1358 1359 if (masks & UCDATA_CTYPE) 1360 error |= _ucprop_load(paths, 1) < 0 ? UCDATA_CTYPE : 0; 1361 if (masks & UCDATA_CASE) 1362 error |= _uccase_load(paths, 1) < 0 ? UCDATA_CASE : 0; 1363 if (masks & UCDATA_DECOMP) 1364 error |= _ucdcmp_load(paths, 1) < 0 ? UCDATA_DECOMP : 0; 1365 if (masks & UCDATA_CMBCL) 1366 error |= _uccmcl_load(paths, 1) < 0 ? UCDATA_CMBCL : 0; 1367 if (masks & UCDATA_NUM) 1368 error |= _ucnumb_load(paths, 1) < 0 ? UCDATA_NUM : 0; 1369 if (masks & UCDATA_COMP) 1370 error |= _uccomp_load(paths, 1) < 0 ? UCDATA_COMP : 0; 1371 if (masks & UCDATA_KDECOMP) 1372 error |= _uckdcmp_load(paths, 1) < 0 ? UCDATA_KDECOMP : 0; 1373 1374 return -error; 1375} 1376#endif 1377 1378#ifdef TEST 1379 1380void 1381main(void) 1382{ 1383 int dig; 1384 ac_uint4 i, lo, *dec; 1385 struct ucnumber num; 1386 1387/* ucdata_setup("."); */ 1388 1389 if (ucisweak(0x30)) 1390 printf("WEAK\n"); 1391 else 1392 printf("NOT WEAK\n"); 1393 1394 printf("LOWER 0x%04lX\n", uctolower(0xff3a)); 1395 printf("UPPER 0x%04lX\n", uctoupper(0xff5a)); 1396 1397 if (ucisalpha(0x1d5)) 1398 printf("ALPHA\n"); 1399 else 1400 printf("NOT ALPHA\n"); 1401 1402 if (ucisupper(0x1d5)) { 1403 printf("UPPER\n"); 1404 lo = uctolower(0x1d5); 1405 printf("0x%04lx\n", lo); 1406 lo = uctotitle(0x1d5); 1407 printf("0x%04lx\n", lo); 1408 } else 1409 printf("NOT UPPER\n"); 1410 1411 if (ucistitle(0x1d5)) 1412 printf("TITLE\n"); 1413 else 1414 printf("NOT TITLE\n"); 1415 1416 if (uciscomposite(0x1d5)) 1417 printf("COMPOSITE\n"); 1418 else 1419 printf("NOT COMPOSITE\n"); 1420 1421 if (ucdecomp(0x1d5, &lo, &dec)) { 1422 for (i = 0; i < lo; i++) 1423 printf("0x%04lx ", dec[i]); 1424 putchar('\n'); 1425 } 1426 1427 if ((lo = uccombining_class(0x41)) != 0) 1428 printf("0x41 CCL %ld\n", lo); 1429 1430 if (ucisxdigit(0xfeff)) 1431 printf("0xFEFF HEX DIGIT\n"); 1432 else 1433 printf("0xFEFF NOT HEX DIGIT\n"); 1434 1435 if (ucisdefined(0x10000)) 1436 printf("0x10000 DEFINED\n"); 1437 else 1438 printf("0x10000 NOT DEFINED\n"); 1439 1440 if (ucnumber_lookup(0x30, &num)) { 1441 if (num.denominator != 1) 1442 printf("UCNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator); 1443 else 1444 printf("UCNUMBER: 0x30 = %d\n", num.numerator); 1445 } else 1446 printf("UCNUMBER: 0x30 NOT A NUMBER\n"); 1447 1448 if (ucnumber_lookup(0xbc, &num)) { 1449 if (num.denominator != 1) 1450 printf("UCNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator); 1451 else 1452 printf("UCNUMBER: 0xbc = %d\n", num.numerator); 1453 } else 1454 printf("UCNUMBER: 0xbc NOT A NUMBER\n"); 1455 1456 1457 if (ucnumber_lookup(0xff19, &num)) { 1458 if (num.denominator != 1) 1459 printf("UCNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator); 1460 else 1461 printf("UCNUMBER: 0xff19 = %d\n", num.numerator); 1462 } else 1463 printf("UCNUMBER: 0xff19 NOT A NUMBER\n"); 1464 1465 if (ucnumber_lookup(0x4e00, &num)) { 1466 if (num.denominator != 1) 1467 printf("UCNUMBER: 0x4e00 = %d/%d\n", num.numerator, num.denominator); 1468 else 1469 printf("UCNUMBER: 0x4e00 = %d\n", num.numerator); 1470 } else 1471 printf("UCNUMBER: 0x4e00 NOT A NUMBER\n"); 1472 1473 if (ucdigit_lookup(0x06f9, &dig)) 1474 printf("UCDIGIT: 0x6f9 = %d\n", dig); 1475 else 1476 printf("UCDIGIT: 0x6f9 NOT A NUMBER\n"); 1477 1478 dig = ucgetdigit(0x0969); 1479 printf("UCGETDIGIT: 0x969 = %d\n", dig); 1480 1481 num = ucgetnumber(0x30); 1482 if (num.denominator != 1) 1483 printf("UCGETNUMBER: 0x30 = %d/%d\n", num.numerator, num.denominator); 1484 else 1485 printf("UCGETNUMBER: 0x30 = %d\n", num.numerator); 1486 1487 num = ucgetnumber(0xbc); 1488 if (num.denominator != 1) 1489 printf("UCGETNUMBER: 0xbc = %d/%d\n", num.numerator, num.denominator); 1490 else 1491 printf("UCGETNUMBER: 0xbc = %d\n", num.numerator); 1492 1493 num = ucgetnumber(0xff19); 1494 if (num.denominator != 1) 1495 printf("UCGETNUMBER: 0xff19 = %d/%d\n", num.numerator, num.denominator); 1496 else 1497 printf("UCGETNUMBER: 0xff19 = %d\n", num.numerator); 1498 1499/* ucdata_cleanup(); */ 1500 exit(0); 1501} 1502 1503#endif /* TEST */ 1504