1/* Association between Unicode characters and their names. 2 Copyright (C) 2000-2002, 2005-2007, 2009-2010 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify it 5 under the terms of the GNU Lesser General Public License as published 6 by the Free Software Foundation; either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 Lesser General Public License for more details. 13 14 You should have received a copy of the GNU Lesser General Public License 15 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 16 17#include <config.h> 18 19/* Specification. */ 20#include "uniname.h" 21 22#include <assert.h> 23#include <stdbool.h> 24#include <stdint.h> 25#include <stdio.h> 26#include <string.h> 27 28#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 29 30 31/* Table of Unicode character names, derived from UnicodeData.txt. 32 This table is generated in a way to minimize the memory footprint: 33 1. its compiled size is small (less than 350 KB), 34 2. it resides entirely in the text or read-only data segment of the 35 executable or shared library: the table contains only immediate 36 integers, no pointers, and the functions don't do heap allocation. 37 */ 38#include "uninames.h" 39/* It contains: 40 static const char unicode_name_words[36303] = ...; 41 #define UNICODE_CHARNAME_NUM_WORDS 6260 42 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...; 43 #define UNICODE_CHARNAME_WORD_HANGUL 3902 44 #define UNICODE_CHARNAME_WORD_SYLLABLE 4978 45 #define UNICODE_CHARNAME_WORD_CJK 417 46 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107 47 static const uint16_t unicode_names[68940] = ...; 48 static const struct { uint16_t code; uint32_t name:24; } unicode_name_to_code[16626] = ...; 49 static const struct { uint16_t code; uint32_t name:24; } unicode_code_to_name[16626] = ...; 50 #define UNICODE_CHARNAME_MAX_LENGTH 83 51 #define UNICODE_CHARNAME_MAX_WORDS 13 52*/ 53 54/* Returns the word with a given index. */ 55static const char * 56unicode_name_word (unsigned int index, unsigned int *lengthp) 57{ 58 unsigned int i1; 59 unsigned int i2; 60 unsigned int i; 61 62 assert (index < UNICODE_CHARNAME_NUM_WORDS); 63 64 /* Binary search for i with 65 unicode_name_by_length[i].ind_offset <= index 66 and 67 index < unicode_name_by_length[i+1].ind_offset 68 */ 69 70 i1 = 0; 71 i2 = SIZEOF (unicode_name_by_length) - 1; 72 while (i2 - i1 > 1) 73 { 74 unsigned int i = (i1 + i2) >> 1; 75 if (unicode_name_by_length[i].ind_offset <= index) 76 i1 = i; 77 else 78 i2 = i; 79 } 80 i = i1; 81 assert (unicode_name_by_length[i].ind_offset <= index 82 && index < unicode_name_by_length[i+1].ind_offset); 83 *lengthp = i; 84 return &unicode_name_words[unicode_name_by_length[i].extra_offset 85 + (index-unicode_name_by_length[i].ind_offset)*i]; 86} 87 88/* Looks up the index of a word. */ 89static int 90unicode_name_word_lookup (const char *word, unsigned int length) 91{ 92 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1) 93 { 94 /* Binary search among the words of given length. */ 95 unsigned int extra_offset = unicode_name_by_length[length].extra_offset; 96 unsigned int i0 = unicode_name_by_length[length].ind_offset; 97 unsigned int i1 = i0; 98 unsigned int i2 = unicode_name_by_length[length+1].ind_offset; 99 while (i2 - i1 > 0) 100 { 101 unsigned int i = (i1 + i2) >> 1; 102 const char *p = &unicode_name_words[extra_offset + (i-i0)*length]; 103 const char *w = word; 104 unsigned int n = length; 105 for (;;) 106 { 107 if (*p < *w) 108 { 109 if (i1 == i) 110 return -1; 111 /* Note here: i1 < i < i2. */ 112 i1 = i; 113 break; 114 } 115 if (*p > *w) 116 { 117 /* Note here: i1 <= i < i2. */ 118 i2 = i; 119 break; 120 } 121 p++; w++; n--; 122 if (n == 0) 123 return i; 124 } 125 } 126 } 127 return -1; 128} 129 130/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book, 131 sections 3.11 and 4.4. */ 132static const char jamo_initial_short_name[19][3] = 133{ 134 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ", 135 "C", "K", "T", "P", "H" 136}; 137static const char jamo_medial_short_name[21][4] = 138{ 139 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO", 140 "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I" 141}; 142static const char jamo_final_short_name[28][3] = 143{ 144 "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT", 145 "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H" 146}; 147 148/* Looks up the name of a Unicode character, in uppercase ASCII. 149 Returns the filled buf, or NULL if the character does not have a name. */ 150char * 151unicode_character_name (ucs4_t c, char *buf) 152{ 153 if (c >= 0xAC00 && c <= 0xD7A3) 154 { 155 /* Special case for Hangul syllables. Keeps the tables small. */ 156 char *ptr; 157 unsigned int tmp; 158 unsigned int index1; 159 unsigned int index2; 160 unsigned int index3; 161 const char *q; 162 163 /* buf needs to have at least 16 + 7 bytes here. */ 164 memcpy (buf, "HANGUL SYLLABLE ", 16); 165 ptr = buf + 16; 166 167 tmp = c - 0xAC00; 168 index3 = tmp % 28; tmp = tmp / 28; 169 index2 = tmp % 21; tmp = tmp / 21; 170 index1 = tmp; 171 172 q = jamo_initial_short_name[index1]; 173 while (*q != '\0') 174 *ptr++ = *q++; 175 q = jamo_medial_short_name[index2]; 176 while (*q != '\0') 177 *ptr++ = *q++; 178 q = jamo_final_short_name[index3]; 179 while (*q != '\0') 180 *ptr++ = *q++; 181 *ptr = '\0'; 182 return buf; 183 } 184 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A) 185 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D)) 186 { 187 /* Special case for CJK compatibility ideographs. Keeps the tables 188 small. */ 189 char *ptr; 190 int i; 191 192 /* buf needs to have at least 28 + 5 bytes here. */ 193 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28); 194 ptr = buf + 28; 195 196 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4) 197 { 198 unsigned int x = (c >> i) & 0xf; 199 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x; 200 } 201 *ptr = '\0'; 202 return buf; 203 } 204 else 205 { 206 const uint16_t *words; 207 208 /* Transform the code so that it fits in 16 bits. */ 209 switch (c >> 12) 210 { 211 case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: 212 break; 213 case 0x0A: 214 c -= 0x05000; 215 break; 216 case 0x0F: 217 c -= 0x09000; 218 break; 219 case 0x10: 220 c -= 0x09000; 221 break; 222 case 0x12: 223 c -= 0x0A000; 224 break; 225 case 0x1D: 226 c -= 0x14000; 227 break; 228 case 0x1F: 229 c -= 0x15000; 230 break; 231 case 0x2F: 232 c -= 0x24000; 233 break; 234 case 0xE0: 235 c -= 0xD4000; 236 break; 237 default: 238 return NULL; 239 } 240 241 { 242 /* Binary search in unicode_code_to_name. */ 243 unsigned int i1 = 0; 244 unsigned int i2 = SIZEOF (unicode_code_to_name); 245 for (;;) 246 { 247 unsigned int i = (i1 + i2) >> 1; 248 if (unicode_code_to_name[i].code == c) 249 { 250 words = &unicode_names[unicode_code_to_name[i].name]; 251 break; 252 } 253 else if (unicode_code_to_name[i].code < c) 254 { 255 if (i1 == i) 256 { 257 words = NULL; 258 break; 259 } 260 /* Note here: i1 < i < i2. */ 261 i1 = i; 262 } 263 else if (unicode_code_to_name[i].code > c) 264 { 265 if (i2 == i) 266 { 267 words = NULL; 268 break; 269 } 270 /* Note here: i1 <= i < i2. */ 271 i2 = i; 272 } 273 } 274 } 275 if (words != NULL) 276 { 277 /* Found it in unicode_code_to_name. Now concatenate the words. */ 278 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */ 279 char *ptr = buf; 280 for (;;) 281 { 282 unsigned int wordlen; 283 const char *word = unicode_name_word (*words>>1, &wordlen); 284 do 285 *ptr++ = *word++; 286 while (--wordlen > 0); 287 if ((*words & 1) == 0) 288 break; 289 *ptr++ = ' '; 290 words++; 291 } 292 *ptr = '\0'; 293 return buf; 294 } 295 return NULL; 296 } 297} 298 299/* Looks up the Unicode character with a given name, in upper- or lowercase 300 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */ 301ucs4_t 302unicode_name_character (const char *name) 303{ 304 unsigned int len = strlen (name); 305 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH) 306 { 307 /* Test for "word1 word2 ..." syntax. */ 308 char buf[UNICODE_CHARNAME_MAX_LENGTH]; 309 char *ptr = buf; 310 for (;;) 311 { 312 char c = *name++; 313 if (!(c >= ' ' && c <= '~')) 314 break; 315 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c); 316 if (--len == 0) 317 goto filled_buf; 318 } 319 if (false) 320 filled_buf: 321 { 322 /* Convert the constituents to uint16_t words. */ 323 uint16_t words[UNICODE_CHARNAME_MAX_WORDS]; 324 uint16_t *wordptr = words; 325 { 326 const char *p1 = buf; 327 for (;;) 328 { 329 { 330 int word; 331 const char *p2 = p1; 332 while (p2 < ptr && *p2 != ' ') 333 p2++; 334 word = unicode_name_word_lookup (p1, p2 - p1); 335 if (word < 0) 336 break; 337 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS]) 338 break; 339 *wordptr++ = word; 340 if (p2 == ptr) 341 goto filled_words; 342 p1 = p2 + 1; 343 } 344 /* Special case for Hangul syllables. Keeps the tables small. */ 345 if (wordptr == &words[2] 346 && words[0] == UNICODE_CHARNAME_WORD_HANGUL 347 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE) 348 { 349 /* Split the last word [p1..ptr) into three parts: 350 1) [BCDGHJKMNPRST] 351 2) [AEIOUWY] 352 3) [BCDGHIJKLMNPST] 353 */ 354 const char *p2; 355 const char *p3; 356 const char *p4; 357 358 p2 = p1; 359 while (p2 < ptr 360 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D' 361 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J' 362 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N' 363 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S' 364 || *p2 == 'T')) 365 p2++; 366 p3 = p2; 367 while (p3 < ptr 368 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I' 369 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W' 370 || *p3 == 'Y')) 371 p3++; 372 p4 = p3; 373 while (p4 < ptr 374 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D' 375 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I' 376 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L' 377 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P' 378 || *p4 == 'S' || *p4 == 'T')) 379 p4++; 380 if (p4 == ptr) 381 { 382 unsigned int n1 = p2 - p1; 383 unsigned int n2 = p3 - p2; 384 unsigned int n3 = p4 - p3; 385 386 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2) 387 { 388 unsigned int index1; 389 390 for (index1 = 0; index1 < 19; index1++) 391 if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0 392 && jamo_initial_short_name[index1][n1] == '\0') 393 { 394 unsigned int index2; 395 396 for (index2 = 0; index2 < 21; index2++) 397 if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0 398 && jamo_medial_short_name[index2][n2] == '\0') 399 { 400 unsigned int index3; 401 402 for (index3 = 0; index3 < 28; index3++) 403 if (memcmp (jamo_final_short_name[index3], p3, n3) == 0 404 && jamo_final_short_name[index3][n3] == '\0') 405 { 406 return 0xAC00 + (index1 * 21 + index2) * 28 + index3; 407 } 408 break; 409 } 410 break; 411 } 412 } 413 } 414 } 415 /* Special case for CJK compatibility ideographs. Keeps the 416 tables small. */ 417 if (wordptr == &words[2] 418 && words[0] == UNICODE_CHARNAME_WORD_CJK 419 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY 420 && p1 + 14 <= ptr 421 && p1 + 15 >= ptr 422 && memcmp (p1, "IDEOGRAPH-", 10) == 0) 423 { 424 const char *p2 = p1 + 10; 425 426 if (*p2 != '0') 427 { 428 unsigned int c = 0; 429 430 for (;;) 431 { 432 if (*p2 >= '0' && *p2 <= '9') 433 c += (*p2 - '0'); 434 else if (*p2 >= 'A' && *p2 <= 'F') 435 c += (*p2 - 'A' + 10); 436 else 437 break; 438 p2++; 439 if (p2 == ptr) 440 { 441 if ((c >= 0xF900 && c <= 0xFA2D) 442 || (c >= 0xFA30 && c <= 0xFA6A) 443 || (c >= 0xFA70 && c <= 0xFAD9) 444 || (c >= 0x2F800 && c <= 0x2FA1D)) 445 return c; 446 else 447 break; 448 } 449 c = c << 4; 450 } 451 } 452 } 453 } 454 } 455 if (false) 456 filled_words: 457 { 458 /* Multiply by 2, to simplify later comparisons. */ 459 unsigned int words_length = wordptr - words; 460 { 461 int i = words_length - 1; 462 words[i] = 2 * words[i]; 463 for (; --i >= 0; ) 464 words[i] = 2 * words[i] + 1; 465 } 466 /* Binary search in unicode_name_to_code. */ 467 { 468 unsigned int i1 = 0; 469 unsigned int i2 = SIZEOF (unicode_name_to_code); 470 for (;;) 471 { 472 unsigned int i = (i1 + i2) >> 1; 473 const uint16_t *w = words; 474 const uint16_t *p = &unicode_names[unicode_name_to_code[i].name]; 475 unsigned int n = words_length; 476 for (;;) 477 { 478 if (*p < *w) 479 { 480 if (i1 == i) 481 goto name_not_found; 482 /* Note here: i1 < i < i2. */ 483 i1 = i; 484 break; 485 } 486 else if (*p > *w) 487 { 488 if (i2 == i) 489 goto name_not_found; 490 /* Note here: i1 <= i < i2. */ 491 i2 = i; 492 break; 493 } 494 p++; w++; n--; 495 if (n == 0) 496 { 497 unsigned int c = unicode_name_to_code[i].code; 498 499 /* Undo the transformation to 16-bit space. */ 500 static const unsigned int offset[13] = 501 { 502 0x00000, 0x00000, 0x00000, 0x00000, 0x00000, 503 0x05000, 0x09000, 0x09000, 0x0A000, 0x14000, 504 0x15000, 0x24000, 0xD4000 505 }; 506 return c + offset[c >> 12]; 507 } 508 } 509 } 510 } 511 name_not_found: ; 512 } 513 } 514 } 515 return UNINAME_INVALID; 516} 517