1/* Association between Unicode characters and their names.
2   Copyright (C) 2000-2002, 2005-2007, 2009-2010 Free Software Foundation, Inc.
3
4   This program is free software: you can redistribute it and/or modify it
5   under the terms of the GNU Lesser General Public License as published
6   by the Free Software Foundation; either version 3 of the License, or
7   (at your option) any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   Lesser General Public License for more details.
13
14   You should have received a copy of the GNU Lesser General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
16
17#include <config.h>
18
19/* Specification.  */
20#include "uniname.h"
21
22#include <assert.h>
23#include <stdbool.h>
24#include <stdint.h>
25#include <stdio.h>
26#include <string.h>
27
28#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
29
30
31/* Table of Unicode character names, derived from UnicodeData.txt.
32   This table is generated in a way to minimize the memory footprint:
33     1. its compiled size is small (less than 350 KB),
34     2. it resides entirely in the text or read-only data segment of the
35        executable or shared library: the table contains only immediate
36        integers, no pointers, and the functions don't do heap allocation.
37 */
38#include "uninames.h"
39/* It contains:
40  static const char unicode_name_words[36303] = ...;
41  #define UNICODE_CHARNAME_NUM_WORDS 6260
42  static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
43  #define UNICODE_CHARNAME_WORD_HANGUL 3902
44  #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
45  #define UNICODE_CHARNAME_WORD_CJK 417
46  #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
47  static const uint16_t unicode_names[68940] = ...;
48  static const struct { uint16_t code; uint32_t name:24; } unicode_name_to_code[16626] = ...;
49  static const struct { uint16_t code; uint32_t name:24; } unicode_code_to_name[16626] = ...;
50  #define UNICODE_CHARNAME_MAX_LENGTH 83
51  #define UNICODE_CHARNAME_MAX_WORDS 13
52*/
53
54/* Returns the word with a given index.  */
55static const char *
56unicode_name_word (unsigned int index, unsigned int *lengthp)
57{
58  unsigned int i1;
59  unsigned int i2;
60  unsigned int i;
61
62  assert (index < UNICODE_CHARNAME_NUM_WORDS);
63
64  /* Binary search for i with
65       unicode_name_by_length[i].ind_offset <= index
66     and
67       index < unicode_name_by_length[i+1].ind_offset
68   */
69
70  i1 = 0;
71  i2 = SIZEOF (unicode_name_by_length) - 1;
72  while (i2 - i1 > 1)
73    {
74      unsigned int i = (i1 + i2) >> 1;
75      if (unicode_name_by_length[i].ind_offset <= index)
76        i1 = i;
77      else
78        i2 = i;
79    }
80  i = i1;
81  assert (unicode_name_by_length[i].ind_offset <= index
82          && index < unicode_name_by_length[i+1].ind_offset);
83  *lengthp = i;
84  return &unicode_name_words[unicode_name_by_length[i].extra_offset
85                             + (index-unicode_name_by_length[i].ind_offset)*i];
86}
87
88/* Looks up the index of a word.  */
89static int
90unicode_name_word_lookup (const char *word, unsigned int length)
91{
92  if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
93    {
94      /* Binary search among the words of given length.  */
95      unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
96      unsigned int i0 = unicode_name_by_length[length].ind_offset;
97      unsigned int i1 = i0;
98      unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
99      while (i2 - i1 > 0)
100        {
101          unsigned int i = (i1 + i2) >> 1;
102          const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
103          const char *w = word;
104          unsigned int n = length;
105          for (;;)
106            {
107              if (*p < *w)
108                {
109                  if (i1 == i)
110                    return -1;
111                  /* Note here: i1 < i < i2.  */
112                  i1 = i;
113                  break;
114                }
115              if (*p > *w)
116                {
117                  /* Note here: i1 <= i < i2.  */
118                  i2 = i;
119                  break;
120                }
121              p++; w++; n--;
122              if (n == 0)
123                return i;
124            }
125        }
126    }
127  return -1;
128}
129
130/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
131   sections 3.11 and 4.4.  */
132static const char jamo_initial_short_name[19][3] =
133{
134  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
135  "C", "K", "T", "P", "H"
136};
137static const char jamo_medial_short_name[21][4] =
138{
139  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
140  "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
141};
142static const char jamo_final_short_name[28][3] =
143{
144  "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
145  "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
146};
147
148/* Looks up the name of a Unicode character, in uppercase ASCII.
149   Returns the filled buf, or NULL if the character does not have a name.  */
150char *
151unicode_character_name (ucs4_t c, char *buf)
152{
153  if (c >= 0xAC00 && c <= 0xD7A3)
154    {
155      /* Special case for Hangul syllables. Keeps the tables small.  */
156      char *ptr;
157      unsigned int tmp;
158      unsigned int index1;
159      unsigned int index2;
160      unsigned int index3;
161      const char *q;
162
163      /* buf needs to have at least 16 + 7 bytes here.  */
164      memcpy (buf, "HANGUL SYLLABLE ", 16);
165      ptr = buf + 16;
166
167      tmp = c - 0xAC00;
168      index3 = tmp % 28; tmp = tmp / 28;
169      index2 = tmp % 21; tmp = tmp / 21;
170      index1 = tmp;
171
172      q = jamo_initial_short_name[index1];
173      while (*q != '\0')
174        *ptr++ = *q++;
175      q = jamo_medial_short_name[index2];
176      while (*q != '\0')
177        *ptr++ = *q++;
178      q = jamo_final_short_name[index3];
179      while (*q != '\0')
180        *ptr++ = *q++;
181      *ptr = '\0';
182      return buf;
183    }
184  else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
185           || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
186    {
187      /* Special case for CJK compatibility ideographs. Keeps the tables
188         small.  */
189      char *ptr;
190      int i;
191
192      /* buf needs to have at least 28 + 5 bytes here.  */
193      memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
194      ptr = buf + 28;
195
196      for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
197        {
198          unsigned int x = (c >> i) & 0xf;
199          *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
200        }
201      *ptr = '\0';
202      return buf;
203    }
204  else
205    {
206      const uint16_t *words;
207
208      /* Transform the code so that it fits in 16 bits.  */
209      switch (c >> 12)
210        {
211        case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
212          break;
213        case 0x0A:
214          c -= 0x05000;
215          break;
216        case 0x0F:
217          c -= 0x09000;
218          break;
219        case 0x10:
220          c -= 0x09000;
221          break;
222        case 0x12:
223          c -= 0x0A000;
224          break;
225        case 0x1D:
226          c -= 0x14000;
227          break;
228        case 0x1F:
229          c -= 0x15000;
230          break;
231        case 0x2F:
232          c -= 0x24000;
233          break;
234        case 0xE0:
235          c -= 0xD4000;
236          break;
237        default:
238          return NULL;
239        }
240
241      {
242        /* Binary search in unicode_code_to_name.  */
243        unsigned int i1 = 0;
244        unsigned int i2 = SIZEOF (unicode_code_to_name);
245        for (;;)
246          {
247            unsigned int i = (i1 + i2) >> 1;
248            if (unicode_code_to_name[i].code == c)
249              {
250                words = &unicode_names[unicode_code_to_name[i].name];
251                break;
252              }
253            else if (unicode_code_to_name[i].code < c)
254              {
255                if (i1 == i)
256                  {
257                    words = NULL;
258                    break;
259                  }
260                /* Note here: i1 < i < i2.  */
261                i1 = i;
262              }
263            else if (unicode_code_to_name[i].code > c)
264              {
265                if (i2 == i)
266                  {
267                    words = NULL;
268                    break;
269                  }
270                /* Note here: i1 <= i < i2.  */
271                i2 = i;
272              }
273          }
274      }
275      if (words != NULL)
276        {
277          /* Found it in unicode_code_to_name. Now concatenate the words.  */
278          /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
279          char *ptr = buf;
280          for (;;)
281            {
282              unsigned int wordlen;
283              const char *word = unicode_name_word (*words>>1, &wordlen);
284              do
285                *ptr++ = *word++;
286              while (--wordlen > 0);
287              if ((*words & 1) == 0)
288                break;
289              *ptr++ = ' ';
290              words++;
291            }
292          *ptr = '\0';
293          return buf;
294        }
295      return NULL;
296    }
297}
298
299/* Looks up the Unicode character with a given name, in upper- or lowercase
300   ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
301ucs4_t
302unicode_name_character (const char *name)
303{
304  unsigned int len = strlen (name);
305  if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
306    {
307      /* Test for "word1 word2 ..." syntax.  */
308      char buf[UNICODE_CHARNAME_MAX_LENGTH];
309      char *ptr = buf;
310      for (;;)
311        {
312          char c = *name++;
313          if (!(c >= ' ' && c <= '~'))
314            break;
315          *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
316          if (--len == 0)
317            goto filled_buf;
318        }
319      if (false)
320      filled_buf:
321        {
322          /* Convert the constituents to uint16_t words.  */
323          uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
324          uint16_t *wordptr = words;
325          {
326            const char *p1 = buf;
327            for (;;)
328              {
329                {
330                  int word;
331                  const char *p2 = p1;
332                  while (p2 < ptr && *p2 != ' ')
333                    p2++;
334                  word = unicode_name_word_lookup (p1, p2 - p1);
335                  if (word < 0)
336                    break;
337                  if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
338                    break;
339                  *wordptr++ = word;
340                  if (p2 == ptr)
341                    goto filled_words;
342                  p1 = p2 + 1;
343                }
344                /* Special case for Hangul syllables. Keeps the tables small. */
345                if (wordptr == &words[2]
346                    && words[0] == UNICODE_CHARNAME_WORD_HANGUL
347                    && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
348                  {
349                    /* Split the last word [p1..ptr) into three parts:
350                         1) [BCDGHJKMNPRST]
351                         2) [AEIOUWY]
352                         3) [BCDGHIJKLMNPST]
353                     */
354                    const char *p2;
355                    const char *p3;
356                    const char *p4;
357
358                    p2 = p1;
359                    while (p2 < ptr
360                           && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
361                               || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
362                               || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
363                               || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
364                               || *p2 == 'T'))
365                      p2++;
366                    p3 = p2;
367                    while (p3 < ptr
368                           && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
369                               || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
370                               || *p3 == 'Y'))
371                      p3++;
372                    p4 = p3;
373                    while (p4 < ptr
374                           && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
375                               || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
376                               || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
377                               || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
378                               || *p4 == 'S' || *p4 == 'T'))
379                      p4++;
380                    if (p4 == ptr)
381                      {
382                        unsigned int n1 = p2 - p1;
383                        unsigned int n2 = p3 - p2;
384                        unsigned int n3 = p4 - p3;
385
386                        if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
387                          {
388                            unsigned int index1;
389
390                            for (index1 = 0; index1 < 19; index1++)
391                              if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
392                                  && jamo_initial_short_name[index1][n1] == '\0')
393                                {
394                                  unsigned int index2;
395
396                                  for (index2 = 0; index2 < 21; index2++)
397                                    if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
398                                        && jamo_medial_short_name[index2][n2] == '\0')
399                                      {
400                                        unsigned int index3;
401
402                                        for (index3 = 0; index3 < 28; index3++)
403                                          if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
404                                              && jamo_final_short_name[index3][n3] == '\0')
405                                            {
406                                              return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
407                                            }
408                                        break;
409                                      }
410                                  break;
411                                }
412                          }
413                      }
414                  }
415                /* Special case for CJK compatibility ideographs. Keeps the
416                   tables small.  */
417                if (wordptr == &words[2]
418                    && words[0] == UNICODE_CHARNAME_WORD_CJK
419                    && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
420                    && p1 + 14 <= ptr
421                    && p1 + 15 >= ptr
422                    && memcmp (p1, "IDEOGRAPH-", 10) == 0)
423                  {
424                    const char *p2 = p1 + 10;
425
426                    if (*p2 != '0')
427                      {
428                        unsigned int c = 0;
429
430                        for (;;)
431                          {
432                            if (*p2 >= '0' && *p2 <= '9')
433                              c += (*p2 - '0');
434                            else if (*p2 >= 'A' && *p2 <= 'F')
435                              c += (*p2 - 'A' + 10);
436                            else
437                              break;
438                            p2++;
439                            if (p2 == ptr)
440                              {
441                                if ((c >= 0xF900 && c <= 0xFA2D)
442                                    || (c >= 0xFA30 && c <= 0xFA6A)
443                                    || (c >= 0xFA70 && c <= 0xFAD9)
444                                    || (c >= 0x2F800 && c <= 0x2FA1D))
445                                  return c;
446                                else
447                                  break;
448                              }
449                            c = c << 4;
450                          }
451                      }
452                  }
453              }
454          }
455          if (false)
456          filled_words:
457            {
458              /* Multiply by 2, to simplify later comparisons.  */
459              unsigned int words_length = wordptr - words;
460              {
461                int i = words_length - 1;
462                words[i] = 2 * words[i];
463                for (; --i >= 0; )
464                  words[i] = 2 * words[i] + 1;
465              }
466              /* Binary search in unicode_name_to_code.  */
467              {
468                unsigned int i1 = 0;
469                unsigned int i2 = SIZEOF (unicode_name_to_code);
470                for (;;)
471                  {
472                    unsigned int i = (i1 + i2) >> 1;
473                    const uint16_t *w = words;
474                    const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
475                    unsigned int n = words_length;
476                    for (;;)
477                      {
478                        if (*p < *w)
479                          {
480                            if (i1 == i)
481                              goto name_not_found;
482                            /* Note here: i1 < i < i2.  */
483                            i1 = i;
484                            break;
485                          }
486                        else if (*p > *w)
487                          {
488                            if (i2 == i)
489                              goto name_not_found;
490                            /* Note here: i1 <= i < i2.  */
491                            i2 = i;
492                            break;
493                          }
494                        p++; w++; n--;
495                        if (n == 0)
496                          {
497                            unsigned int c = unicode_name_to_code[i].code;
498
499                            /* Undo the transformation to 16-bit space.  */
500                            static const unsigned int offset[13] =
501                              {
502                                0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
503                                0x05000, 0x09000, 0x09000, 0x0A000, 0x14000,
504                                0x15000, 0x24000, 0xD4000
505                              };
506                            return c + offset[c >> 12];
507                          }
508                      }
509                  }
510              }
511            name_not_found: ;
512            }
513        }
514    }
515  return UNINAME_INVALID;
516}
517