• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /netgear-WNDR4500-V1.0.1.40_1.0.68/ap/gpl/timemachine/gettext-0.17/gettext-tools/gnulib-lib/uniname/
1/* Association between Unicode characters and their names.
2   Copyright (C) 2000-2002, 2005-2007 Free Software Foundation, Inc.
3
4   This program is free software: you can redistribute it and/or modify it
5   under the terms of the GNU General Public License as published
6   by the Free Software Foundation; either version 3 of the License, or
7   (at your option) any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   Lesser General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
16
17#include <config.h>
18
19/* Specification.  */
20#include "uniname.h"
21
22#include <assert.h>
23#include <stdbool.h>
24#include <stdint.h>
25#include <stdio.h>
26#include <string.h>
27
28#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
29
30
31/* Table of Unicode character names, derived from UnicodeData.txt.
32   This table is generated in a way to minimize the memory footprint:
33     1. its compiled size is small (less than 350 KB),
34     2. it resides entirely in the text or read-only data segment of the
35        executable or shared library: the table contains only immediate
36        integers, no pointers, and the functions don't do heap allocation.
37 */
38#include "uninames.h"
39/* It contains:
40  static const char unicode_name_words[36303] = ...;
41  #define UNICODE_CHARNAME_NUM_WORDS 6260
42  static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
43  #define UNICODE_CHARNAME_WORD_HANGUL 3902
44  #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
45  #define UNICODE_CHARNAME_WORD_CJK 417
46  #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
47  static const uint16_t unicode_names[68940] = ...;
48  static const struct { uint16_t code; uint32_t name:24; } unicode_name_to_code[16626] = ...;
49  static const struct { uint16_t code; uint32_t name:24; } unicode_code_to_name[16626] = ...;
50  #define UNICODE_CHARNAME_MAX_LENGTH 83
51  #define UNICODE_CHARNAME_MAX_WORDS 13
52*/
53
54/* Returns the word with a given index.  */
55static const char *
56unicode_name_word (unsigned int index, unsigned int *lengthp)
57{
58  unsigned int i1;
59  unsigned int i2;
60  unsigned int i;
61
62  assert (index < UNICODE_CHARNAME_NUM_WORDS);
63
64  /* Binary search for i with
65       unicode_name_by_length[i].ind_offset <= index
66     and
67       index < unicode_name_by_length[i+1].ind_offset
68   */
69
70  i1 = 0;
71  i2 = SIZEOF (unicode_name_by_length) - 1;
72  while (i2 - i1 > 1)
73    {
74      unsigned int i = (i1 + i2) >> 1;
75      if (unicode_name_by_length[i].ind_offset <= index)
76	i1 = i;
77      else
78	i2 = i;
79    }
80  i = i1;
81  assert (unicode_name_by_length[i].ind_offset <= index
82	  && index < unicode_name_by_length[i+1].ind_offset);
83  *lengthp = i;
84  return &unicode_name_words[unicode_name_by_length[i].extra_offset
85			     + (index-unicode_name_by_length[i].ind_offset)*i];
86}
87
88/* Looks up the index of a word.  */
89static int
90unicode_name_word_lookup (const char *word, unsigned int length)
91{
92  if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
93    {
94      /* Binary search among the words of given length.  */
95      unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
96      unsigned int i0 = unicode_name_by_length[length].ind_offset;
97      unsigned int i1 = i0;
98      unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
99      while (i2 - i1 > 0)
100	{
101	  unsigned int i = (i1 + i2) >> 1;
102	  const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
103	  const char *w = word;
104	  unsigned int n = length;
105	  for (;;)
106	    {
107	      if (*p < *w)
108		{
109		  if (i1 == i)
110		    return -1;
111		  /* Note here: i1 < i < i2.  */
112		  i1 = i;
113		  break;
114		}
115	      if (*p > *w)
116		{
117		  /* Note here: i1 <= i < i2.  */
118		  i2 = i;
119		  break;
120		}
121	      p++; w++; n--;
122	      if (n == 0)
123		return i;
124	    }
125	}
126    }
127  return -1;
128}
129
/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
   sections 3.11 and 4.4.  */

/* Short names of the 19 initial jamo.  The index is the quotient
   (code - 0xAC00) / (21 * 28) of a Hangul syllable code.  Entry 11 is the
   empty string.  */
static const char jamo_initial_short_name[19][3] =
{
  /*  0 ..  4 */ "G", "GG", "N", "D", "DD",
  /*  5 ..  9 */ "R", "M", "B", "BB", "S",
  /* 10 .. 14 */ "SS", "", "J", "JJ", "C",
  /* 15 .. 18 */ "K", "T", "P", "H"
};
/* Short names of the 21 medial jamo (vowels) used in Hangul syllable
   names.  The index is ((code - 0xAC00) / 28) % 21 of a Hangul syllable
   code.  */
static const char jamo_medial_short_name[21][4] =
{
  /*  0 ..  4 */ "A", "AE", "YA", "YAE", "EO",
  /*  5 ..  9 */ "E", "YEO", "YE", "O", "WA",
  /* 10 .. 14 */ "WAE", "OE", "YO", "U", "WEO",
  /* 15 .. 19 */ "WE", "WI", "YU", "EU", "YI",
  /* 20       */ "I"
};
/* Short names of the 28 final jamo used in Hangul syllable names.  The
   index is (code - 0xAC00) % 28 of a Hangul syllable code; entry 0 is the
   empty string (syllable without final consonant).
   Entry 5 is U+11AC HANGUL JONGSEONG NIEUN-CIEUC, whose short name in the
   Unicode Character Database (Jamo.txt) is "NJ", e.g. U+C549 is
   HANGUL SYLLABLE ANJ.  */
static const char jamo_final_short_name[28][3] =
{
  /*  0 ..  4 */ "", "G", "GG", "GS", "N",
  /*  5 ..  9 */ "NJ", "NH", "D", "L", "LG",
  /* 10 .. 14 */ "LM", "LB", "LS", "LT", "LP",
  /* 15 .. 19 */ "LH", "M", "B", "BS", "S",
  /* 20 .. 24 */ "SS", "NG", "J", "C", "K",
  /* 25 .. 27 */ "T", "P", "H"
};
147
148/* Looks up the name of a Unicode character, in uppercase ASCII.
149   Returns the filled buf, or NULL if the character does not have a name.  */
150char *
151unicode_character_name (ucs4_t c, char *buf)
152{
153  if (c >= 0xAC00 && c <= 0xD7A3)
154    {
155      /* Special case for Hangul syllables. Keeps the tables small.  */
156      char *ptr;
157      unsigned int tmp;
158      unsigned int index1;
159      unsigned int index2;
160      unsigned int index3;
161      const char *q;
162
163      /* buf needs to have at least 16 + 7 bytes here.  */
164      memcpy (buf, "HANGUL SYLLABLE ", 16);
165      ptr = buf + 16;
166
167      tmp = c - 0xAC00;
168      index3 = tmp % 28; tmp = tmp / 28;
169      index2 = tmp % 21; tmp = tmp / 21;
170      index1 = tmp;
171
172      q = jamo_initial_short_name[index1];
173      while (*q != '\0')
174	*ptr++ = *q++;
175      q = jamo_medial_short_name[index2];
176      while (*q != '\0')
177	*ptr++ = *q++;
178      q = jamo_final_short_name[index3];
179      while (*q != '\0')
180	*ptr++ = *q++;
181      *ptr = '\0';
182      return buf;
183    }
184  else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
185	   || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
186    {
187      /* Special case for CJK compatibility ideographs. Keeps the tables
188	 small.  */
189      char *ptr;
190      int i;
191
192      /* buf needs to have at least 28 + 5 bytes here.  */
193      memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
194      ptr = buf + 28;
195
196      for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
197	{
198	  unsigned int x = (c >> i) & 0xf;
199	  *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
200	}
201      *ptr = '\0';
202      return buf;
203    }
204  else
205    {
206      const uint16_t *words;
207
208      /* Transform the code so that it fits in 16 bits.  */
209      switch (c >> 12)
210	{
211	case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
212	  break;
213	case 0x0A:
214	  c -= 0x05000;
215	  break;
216	case 0x0F:
217	  c -= 0x09000;
218	  break;
219	case 0x10:
220	  c -= 0x09000;
221	  break;
222	case 0x12:
223	  c -= 0x0A000;
224	  break;
225	case 0x1D:
226	  c -= 0x14000;
227	  break;
228	case 0x2F:
229	  c -= 0x25000;
230	  break;
231	case 0xE0:
232	  c -= 0xD5000;
233	  break;
234	default:
235	  return NULL;
236	}
237
238      {
239	/* Binary search in unicode_code_to_name.  */
240	unsigned int i1 = 0;
241	unsigned int i2 = SIZEOF (unicode_code_to_name);
242	for (;;)
243	  {
244	    unsigned int i = (i1 + i2) >> 1;
245	    if (unicode_code_to_name[i].code == c)
246	      {
247		words = &unicode_names[unicode_code_to_name[i].name];
248		break;
249	      }
250	    else if (unicode_code_to_name[i].code < c)
251	      {
252		if (i1 == i)
253		  {
254		    words = NULL;
255		    break;
256		  }
257		/* Note here: i1 < i < i2.  */
258		i1 = i;
259	      }
260	    else if (unicode_code_to_name[i].code > c)
261	      {
262		if (i2 == i)
263		  {
264		    words = NULL;
265		    break;
266		  }
267		/* Note here: i1 <= i < i2.  */
268		i2 = i;
269	      }
270	  }
271      }
272      if (words != NULL)
273	{
274	  /* Found it in unicode_code_to_name. Now concatenate the words.  */
275	  /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
276	  char *ptr = buf;
277	  for (;;)
278	    {
279	      unsigned int wordlen;
280	      const char *word = unicode_name_word (*words>>1, &wordlen);
281	      do
282		*ptr++ = *word++;
283	      while (--wordlen > 0);
284	      if ((*words & 1) == 0)
285		break;
286	      *ptr++ = ' ';
287	      words++;
288	    }
289	  *ptr = '\0';
290	  return buf;
291	}
292      return NULL;
293    }
294}
295
296/* Looks up the Unicode character with a given name, in upper- or lowercase
297   ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
298ucs4_t
299unicode_name_character (const char *name)
300{
301  unsigned int len = strlen (name);
302  if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
303    {
304      /* Test for "word1 word2 ..." syntax.  */
305      char buf[UNICODE_CHARNAME_MAX_LENGTH];
306      char *ptr = buf;
307      for (;;)
308	{
309	  char c = *name++;
310	  if (!(c >= ' ' && c <= '~'))
311	    break;
312	  *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
313	  if (--len == 0)
314	    goto filled_buf;
315	}
316      if (false)
317      filled_buf:
318	{
319	  /* Convert the constituents to uint16_t words.  */
320	  uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
321	  uint16_t *wordptr = words;
322	  {
323	    const char *p1 = buf;
324	    for (;;)
325	      {
326		{
327		  int word;
328		  const char *p2 = p1;
329		  while (p2 < ptr && *p2 != ' ')
330		    p2++;
331		  word = unicode_name_word_lookup (p1, p2 - p1);
332		  if (word < 0)
333		    break;
334		  if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
335		    break;
336		  *wordptr++ = word;
337		  if (p2 == ptr)
338		    goto filled_words;
339		  p1 = p2 + 1;
340		}
341		/* Special case for Hangul syllables. Keeps the tables small. */
342		if (wordptr == &words[2]
343		    && words[0] == UNICODE_CHARNAME_WORD_HANGUL
344		    && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
345		  {
346		    /* Split the last word [p1..ptr) into three parts:
347			 1) [BCDGHJKMNPRST]
348			 2) [AEIOUWY]
349			 3) [BCDGHIJKLMNPST]
350		     */
351		    const char *p2;
352		    const char *p3;
353		    const char *p4;
354
355		    p2 = p1;
356		    while (p2 < ptr
357			   && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
358			       || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
359			       || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
360			       || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
361			       || *p2 == 'T'))
362		      p2++;
363		    p3 = p2;
364		    while (p3 < ptr
365			   && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
366			       || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
367			       || *p3 == 'Y'))
368		      p3++;
369		    p4 = p3;
370		    while (p4 < ptr
371			   && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
372			       || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
373			       || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
374			       || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
375			       || *p4 == 'S' || *p4 == 'T'))
376		      p4++;
377		    if (p4 == ptr)
378		      {
379			unsigned int n1 = p2 - p1;
380			unsigned int n2 = p3 - p2;
381			unsigned int n3 = p4 - p3;
382
383			if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
384			  {
385			    unsigned int index1;
386
387			    for (index1 = 0; index1 < 19; index1++)
388			      if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
389				  && jamo_initial_short_name[index1][n1] == '\0')
390				{
391				  unsigned int index2;
392
393				  for (index2 = 0; index2 < 21; index2++)
394				    if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
395					&& jamo_medial_short_name[index2][n2] == '\0')
396				      {
397					unsigned int index3;
398
399					for (index3 = 0; index3 < 28; index3++)
400					  if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
401					      && jamo_final_short_name[index3][n3] == '\0')
402					    {
403					      return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
404					    }
405					break;
406				      }
407				  break;
408				}
409			  }
410		      }
411		  }
412		/* Special case for CJK compatibility ideographs. Keeps the
413		   tables small.  */
414		if (wordptr == &words[2]
415		    && words[0] == UNICODE_CHARNAME_WORD_CJK
416		    && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
417		    && p1 + 14 <= ptr
418		    && p1 + 15 >= ptr
419		    && memcmp (p1, "IDEOGRAPH-", 10) == 0)
420		  {
421		    const char *p2 = p1 + 10;
422
423		    if (*p2 != '0')
424		      {
425			unsigned int c = 0;
426
427			for (;;)
428			  {
429			    if (*p2 >= '0' && *p2 <= '9')
430			      c += (*p2 - '0');
431			    else if (*p2 >= 'A' && *p2 <= 'F')
432			      c += (*p2 - 'A' + 10);
433			    else
434			      break;
435			    p2++;
436			    if (p2 == ptr)
437			      {
438				if ((c >= 0xF900 && c <= 0xFA2D)
439				    || (c >= 0xFA30 && c <= 0xFA6A)
440				    || (c >= 0xFA70 && c <= 0xFAD9)
441				    || (c >= 0x2F800 && c <= 0x2FA1D))
442				  return c;
443				else
444				  break;
445			      }
446			    c = c << 4;
447			  }
448		      }
449		  }
450	      }
451	  }
452	  if (false)
453	  filled_words:
454	    {
455	      /* Multiply by 2, to simplify later comparisons.  */
456	      unsigned int words_length = wordptr - words;
457	      {
458		int i = words_length - 1;
459		words[i] = 2 * words[i];
460		for (; --i >= 0; )
461		  words[i] = 2 * words[i] + 1;
462	      }
463	      /* Binary search in unicode_name_to_code.  */
464	      {
465		unsigned int i1 = 0;
466		unsigned int i2 = SIZEOF (unicode_name_to_code);
467		for (;;)
468		  {
469		    unsigned int i = (i1 + i2) >> 1;
470		    const uint16_t *w = words;
471		    const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
472		    unsigned int n = words_length;
473		    for (;;)
474		      {
475			if (*p < *w)
476			  {
477			    if (i1 == i)
478			      goto name_not_found;
479			    /* Note here: i1 < i < i2.  */
480			    i1 = i;
481			    break;
482			  }
483			else if (*p > *w)
484			  {
485			    if (i2 == i)
486			      goto name_not_found;
487			    /* Note here: i1 <= i < i2.  */
488			    i2 = i;
489			    break;
490			  }
491			p++; w++; n--;
492			if (n == 0)
493			  {
494			    unsigned int c = unicode_name_to_code[i].code;
495
496			    /* Undo the transformation to 16-bit space.  */
497			    static const unsigned int offset[12] =
498			      {
499				0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
500				0x05000, 0x09000, 0x09000, 0x0A000, 0x14000,
501				0x25000, 0xD5000
502			      };
503			    return c + offset[c >> 12];
504			  }
505		      }
506		  }
507	      }
508	    name_not_found: ;
509	    }
510	}
511    }
512  return UNINAME_INVALID;
513}
514