1/* Association between Unicode characters and their names. 2 Copyright (C) 2000-2002, 2005-2007 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify it 5 under the terms of the GNU General Public License as published 6 by the Free Software Foundation; either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 Lesser General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 16 17#include <config.h> 18 19/* Specification. */ 20#include "uniname.h" 21 22#include <assert.h> 23#include <stdbool.h> 24#include <stdint.h> 25#include <stdio.h> 26#include <string.h> 27 28#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 29 30 31/* Table of Unicode character names, derived from UnicodeData.txt. 32 This table is generated in a way to minimize the memory footprint: 33 1. its compiled size is small (less than 350 KB), 34 2. it resides entirely in the text or read-only data segment of the 35 executable or shared library: the table contains only immediate 36 integers, no pointers, and the functions don't do heap allocation. 37 */ 38#include "uninames.h" 39/* It contains: 40 static const char unicode_name_words[36303] = ...; 41 #define UNICODE_CHARNAME_NUM_WORDS 6260 42 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...; 43 #define UNICODE_CHARNAME_WORD_HANGUL 3902 44 #define UNICODE_CHARNAME_WORD_SYLLABLE 4978 45 #define UNICODE_CHARNAME_WORD_CJK 417 46 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107 47 static const uint16_t unicode_names[68940] = ...; 48 static const struct { uint16_t code; uint32_t name:24; } unicode_name_to_code[16626] = ...; 49 static const struct { uint16_t code; uint32_t name:24; } unicode_code_to_name[16626] = ...; 50 #define UNICODE_CHARNAME_MAX_LENGTH 83 51 #define UNICODE_CHARNAME_MAX_WORDS 13 52*/ 53 54/* Returns the word with a given index. */ 55static const char * 56unicode_name_word (unsigned int index, unsigned int *lengthp) 57{ 58 unsigned int i1; 59 unsigned int i2; 60 unsigned int i; 61 62 assert (index < UNICODE_CHARNAME_NUM_WORDS); 63 64 /* Binary search for i with 65 unicode_name_by_length[i].ind_offset <= index 66 and 67 index < unicode_name_by_length[i+1].ind_offset 68 */ 69 70 i1 = 0; 71 i2 = SIZEOF (unicode_name_by_length) - 1; 72 while (i2 - i1 > 1) 73 { 74 unsigned int i = (i1 + i2) >> 1; 75 if (unicode_name_by_length[i].ind_offset <= index) 76 i1 = i; 77 else 78 i2 = i; 79 } 80 i = i1; 81 assert (unicode_name_by_length[i].ind_offset <= index 82 && index < unicode_name_by_length[i+1].ind_offset); 83 *lengthp = i; 84 return &unicode_name_words[unicode_name_by_length[i].extra_offset 85 + (index-unicode_name_by_length[i].ind_offset)*i]; 86} 87 88/* Looks up the index of a word. */ 89static int 90unicode_name_word_lookup (const char *word, unsigned int length) 91{ 92 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1) 93 { 94 /* Binary search among the words of given length. */ 95 unsigned int extra_offset = unicode_name_by_length[length].extra_offset; 96 unsigned int i0 = unicode_name_by_length[length].ind_offset; 97 unsigned int i1 = i0; 98 unsigned int i2 = unicode_name_by_length[length+1].ind_offset; 99 while (i2 - i1 > 0) 100 { 101 unsigned int i = (i1 + i2) >> 1; 102 const char *p = &unicode_name_words[extra_offset + (i-i0)*length]; 103 const char *w = word; 104 unsigned int n = length; 105 for (;;) 106 { 107 if (*p < *w) 108 { 109 if (i1 == i) 110 return -1; 111 /* Note here: i1 < i < i2. */ 112 i1 = i; 113 break; 114 } 115 if (*p > *w) 116 { 117 /* Note here: i1 <= i < i2. */ 118 i2 = i; 119 break; 120 } 121 p++; w++; n--; 122 if (n == 0) 123 return i; 124 } 125 } 126 } 127 return -1; 128} 129 130/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book, 131 sections 3.11 and 4.4. */ 132static const char jamo_initial_short_name[19][3] = 133{ 134 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ", 135 "C", "K", "T", "P", "H" 136}; 137static const char jamo_medial_short_name[21][4] = 138{ 139 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO", 140 "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I" 141}; 142static const char jamo_final_short_name[28][3] = 143{ 144 "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT", 145 "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H" 146}; 147 148/* Looks up the name of a Unicode character, in uppercase ASCII. 149 Returns the filled buf, or NULL if the character does not have a name. */ 150char * 151unicode_character_name (ucs4_t c, char *buf) 152{ 153 if (c >= 0xAC00 && c <= 0xD7A3) 154 { 155 /* Special case for Hangul syllables. Keeps the tables small. */ 156 char *ptr; 157 unsigned int tmp; 158 unsigned int index1; 159 unsigned int index2; 160 unsigned int index3; 161 const char *q; 162 163 /* buf needs to have at least 16 + 7 bytes here. */ 164 memcpy (buf, "HANGUL SYLLABLE ", 16); 165 ptr = buf + 16; 166 167 tmp = c - 0xAC00; 168 index3 = tmp % 28; tmp = tmp / 28; 169 index2 = tmp % 21; tmp = tmp / 21; 170 index1 = tmp; 171 172 q = jamo_initial_short_name[index1]; 173 while (*q != '\0') 174 *ptr++ = *q++; 175 q = jamo_medial_short_name[index2]; 176 while (*q != '\0') 177 *ptr++ = *q++; 178 q = jamo_final_short_name[index3]; 179 while (*q != '\0') 180 *ptr++ = *q++; 181 *ptr = '\0'; 182 return buf; 183 } 184 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A) 185 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D)) 186 { 187 /* Special case for CJK compatibility ideographs. Keeps the tables 188 small. */ 189 char *ptr; 190 int i; 191 192 /* buf needs to have at least 28 + 5 bytes here. */ 193 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28); 194 ptr = buf + 28; 195 196 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4) 197 { 198 unsigned int x = (c >> i) & 0xf; 199 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x; 200 } 201 *ptr = '\0'; 202 return buf; 203 } 204 else 205 { 206 const uint16_t *words; 207 208 /* Transform the code so that it fits in 16 bits. */ 209 switch (c >> 12) 210 { 211 case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: 212 break; 213 case 0x0A: 214 c -= 0x05000; 215 break; 216 case 0x0F: 217 c -= 0x09000; 218 break; 219 case 0x10: 220 c -= 0x09000; 221 break; 222 case 0x12: 223 c -= 0x0A000; 224 break; 225 case 0x1D: 226 c -= 0x14000; 227 break; 228 case 0x2F: 229 c -= 0x25000; 230 break; 231 case 0xE0: 232 c -= 0xD5000; 233 break; 234 default: 235 return NULL; 236 } 237 238 { 239 /* Binary search in unicode_code_to_name. */ 240 unsigned int i1 = 0; 241 unsigned int i2 = SIZEOF (unicode_code_to_name); 242 for (;;) 243 { 244 unsigned int i = (i1 + i2) >> 1; 245 if (unicode_code_to_name[i].code == c) 246 { 247 words = &unicode_names[unicode_code_to_name[i].name]; 248 break; 249 } 250 else if (unicode_code_to_name[i].code < c) 251 { 252 if (i1 == i) 253 { 254 words = NULL; 255 break; 256 } 257 /* Note here: i1 < i < i2. */ 258 i1 = i; 259 } 260 else if (unicode_code_to_name[i].code > c) 261 { 262 if (i2 == i) 263 { 264 words = NULL; 265 break; 266 } 267 /* Note here: i1 <= i < i2. */ 268 i2 = i; 269 } 270 } 271 } 272 if (words != NULL) 273 { 274 /* Found it in unicode_code_to_name. Now concatenate the words. */ 275 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */ 276 char *ptr = buf; 277 for (;;) 278 { 279 unsigned int wordlen; 280 const char *word = unicode_name_word (*words>>1, &wordlen); 281 do 282 *ptr++ = *word++; 283 while (--wordlen > 0); 284 if ((*words & 1) == 0) 285 break; 286 *ptr++ = ' '; 287 words++; 288 } 289 *ptr = '\0'; 290 return buf; 291 } 292 return NULL; 293 } 294} 295 296/* Looks up the Unicode character with a given name, in upper- or lowercase 297 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */ 298ucs4_t 299unicode_name_character (const char *name) 300{ 301 unsigned int len = strlen (name); 302 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH) 303 { 304 /* Test for "word1 word2 ..." syntax. */ 305 char buf[UNICODE_CHARNAME_MAX_LENGTH]; 306 char *ptr = buf; 307 for (;;) 308 { 309 char c = *name++; 310 if (!(c >= ' ' && c <= '~')) 311 break; 312 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c); 313 if (--len == 0) 314 goto filled_buf; 315 } 316 if (false) 317 filled_buf: 318 { 319 /* Convert the constituents to uint16_t words. */ 320 uint16_t words[UNICODE_CHARNAME_MAX_WORDS]; 321 uint16_t *wordptr = words; 322 { 323 const char *p1 = buf; 324 for (;;) 325 { 326 { 327 int word; 328 const char *p2 = p1; 329 while (p2 < ptr && *p2 != ' ') 330 p2++; 331 word = unicode_name_word_lookup (p1, p2 - p1); 332 if (word < 0) 333 break; 334 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS]) 335 break; 336 *wordptr++ = word; 337 if (p2 == ptr) 338 goto filled_words; 339 p1 = p2 + 1; 340 } 341 /* Special case for Hangul syllables. Keeps the tables small. */ 342 if (wordptr == &words[2] 343 && words[0] == UNICODE_CHARNAME_WORD_HANGUL 344 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE) 345 { 346 /* Split the last word [p1..ptr) into three parts: 347 1) [BCDGHJKMNPRST] 348 2) [AEIOUWY] 349 3) [BCDGHIJKLMNPST] 350 */ 351 const char *p2; 352 const char *p3; 353 const char *p4; 354 355 p2 = p1; 356 while (p2 < ptr 357 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D' 358 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J' 359 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N' 360 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S' 361 || *p2 == 'T')) 362 p2++; 363 p3 = p2; 364 while (p3 < ptr 365 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I' 366 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W' 367 || *p3 == 'Y')) 368 p3++; 369 p4 = p3; 370 while (p4 < ptr 371 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D' 372 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I' 373 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L' 374 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P' 375 || *p4 == 'S' || *p4 == 'T')) 376 p4++; 377 if (p4 == ptr) 378 { 379 unsigned int n1 = p2 - p1; 380 unsigned int n2 = p3 - p2; 381 unsigned int n3 = p4 - p3; 382 383 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2) 384 { 385 unsigned int index1; 386 387 for (index1 = 0; index1 < 19; index1++) 388 if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0 389 && jamo_initial_short_name[index1][n1] == '\0') 390 { 391 unsigned int index2; 392 393 for (index2 = 0; index2 < 21; index2++) 394 if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0 395 && jamo_medial_short_name[index2][n2] == '\0') 396 { 397 unsigned int index3; 398 399 for (index3 = 0; index3 < 28; index3++) 400 if (memcmp(jamo_final_short_name[index3], p3, n3) == 0 401 && jamo_final_short_name[index3][n3] == '\0') 402 { 403 return 0xAC00 + (index1 * 21 + index2) * 28 + index3; 404 } 405 break; 406 } 407 break; 408 } 409 } 410 } 411 } 412 /* Special case for CJK compatibility ideographs. Keeps the 413 tables small. */ 414 if (wordptr == &words[2] 415 && words[0] == UNICODE_CHARNAME_WORD_CJK 416 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY 417 && p1 + 14 <= ptr 418 && p1 + 15 >= ptr 419 && memcmp (p1, "IDEOGRAPH-", 10) == 0) 420 { 421 const char *p2 = p1 + 10; 422 423 if (*p2 != '0') 424 { 425 unsigned int c = 0; 426 427 for (;;) 428 { 429 if (*p2 >= '0' && *p2 <= '9') 430 c += (*p2 - '0'); 431 else if (*p2 >= 'A' && *p2 <= 'F') 432 c += (*p2 - 'A' + 10); 433 else 434 break; 435 p2++; 436 if (p2 == ptr) 437 { 438 if ((c >= 0xF900 && c <= 0xFA2D) 439 || (c >= 0xFA30 && c <= 0xFA6A) 440 || (c >= 0xFA70 && c <= 0xFAD9) 441 || (c >= 0x2F800 && c <= 0x2FA1D)) 442 return c; 443 else 444 break; 445 } 446 c = c << 4; 447 } 448 } 449 } 450 } 451 } 452 if (false) 453 filled_words: 454 { 455 /* Multiply by 2, to simplify later comparisons. */ 456 unsigned int words_length = wordptr - words; 457 { 458 int i = words_length - 1; 459 words[i] = 2 * words[i]; 460 for (; --i >= 0; ) 461 words[i] = 2 * words[i] + 1; 462 } 463 /* Binary search in unicode_name_to_code. */ 464 { 465 unsigned int i1 = 0; 466 unsigned int i2 = SIZEOF (unicode_name_to_code); 467 for (;;) 468 { 469 unsigned int i = (i1 + i2) >> 1; 470 const uint16_t *w = words; 471 const uint16_t *p = &unicode_names[unicode_name_to_code[i].name]; 472 unsigned int n = words_length; 473 for (;;) 474 { 475 if (*p < *w) 476 { 477 if (i1 == i) 478 goto name_not_found; 479 /* Note here: i1 < i < i2. */ 480 i1 = i; 481 break; 482 } 483 else if (*p > *w) 484 { 485 if (i2 == i) 486 goto name_not_found; 487 /* Note here: i1 <= i < i2. */ 488 i2 = i; 489 break; 490 } 491 p++; w++; n--; 492 if (n == 0) 493 { 494 unsigned int c = unicode_name_to_code[i].code; 495 496 /* Undo the transformation to 16-bit space. */ 497 static const unsigned int offset[12] = 498 { 499 0x00000, 0x00000, 0x00000, 0x00000, 0x00000, 500 0x05000, 0x09000, 0x09000, 0x0A000, 0x14000, 501 0x25000, 0xD5000 502 }; 503 return c + offset[c >> 12]; 504 } 505 } 506 } 507 } 508 name_not_found: ; 509 } 510 } 511 } 512 return UNINAME_INVALID; 513} 514