1/* Test the Unicode character name functions.
2   Copyright (C) 2000-2003, 2005 Free Software Foundation, Inc.
3
4   This program is free software; you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation; either version 2, or (at your option)
7   any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program; if not, write to the Free Software Foundation,
16   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
17
18#ifdef HAVE_CONFIG_H
19# include <config.h>
20#endif
21
22#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25
26#include "exit.h"
27#include "xalloc.h"
28#include "uniname.h"
29
/* Table of character names, indexed by Unicode code point (0 .. 0x10FFFF).
   The names are those of the UnicodeData.txt file, modified to contain the
   Hangul syllable names, as described in the Unicode 3.0 book.  */
const char *unicode_names[0x110000];
33
/* Size of the buffers that hold one field of the UnicodeData.txt file,
   including the terminating NUL byte.  */
#define FIELDLEN 120

/* Reads the next field from STREAM into BUFFER, which must have room for
   FIELDLEN bytes.  Characters are consumed up to and including DELIM; DELIM
   itself is not stored.
   Returns 1 when a field terminated by DELIM was read, and 0 when getc()
   returned EOF first.  BUFFER is NUL-terminated in both cases.
   A field that does not fit into BUFFER is fatal: a message is printed to
   stderr and the program exits with EXIT_FAILURE.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      /* Keep exactly one byte in reserve for the terminating NUL, so that
         fields of up to FIELDLEN - 1 characters are accepted.  */
      if (count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field too long\n");
          exit (EXIT_FAILURE);
        }
      buffer[count++] = (char) c;
    }

  /* Terminate unconditionally, so that the caller never sees an
     unterminated buffer even when the field was cut short by EOF.  */
  buffer[count] = '\0';

  return c != EOF;
}
63
64/* Stores in unicode_names[] the relevant contents of the UnicodeData.txt
65   file.  */
66static void
67fill_names (const char *unicodedata_filename)
68{
69  unsigned int i;
70  FILE *stream;
71  char field0[FIELDLEN];
72  char field1[FIELDLEN];
73  int lineno = 0;
74
75  for (i = 0; i < 0x110000; i++)
76    unicode_names[i] = NULL;
77
78  stream = fopen (unicodedata_filename, "r");
79  if (stream == NULL)
80    {
81      fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
82      exit (EXIT_FAILURE);
83    }
84
85  for (;;)
86    {
87      int n;
88      int c;
89
90      lineno++;
91      n = getfield (stream, field0, ';');
92      n += getfield (stream, field1, ';');
93      if (n == 0)
94	break;
95      if (n != 2)
96	{
97	  fprintf (stderr, "short line in '%s':%d\n",
98		   unicodedata_filename, lineno);
99	  exit (EXIT_FAILURE);
100	}
101      for (; (c = getc (stream)), (c != EOF && c != '\n'); )
102	;
103      i = strtoul (field0, NULL, 16);
104      if (i >= 0x110000)
105	{
106	  fprintf (stderr, "index too large\n");
107	  exit (EXIT_FAILURE);
108	}
109      unicode_names[i] = xstrdup (field1);
110    }
111  if (ferror (stream) || fclose (stream))
112    {
113      fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
114      exit (1);
115    }
116}
117
118/* Perform an exhaustive test of the unicode_character_name function.  */
119static int
120test_name_lookup ()
121{
122  int error = 0;
123  unsigned int i;
124  char buf[UNINAME_MAX];
125
126  for (i = 0; i < 0x11000; i++)
127    {
128      char *result = unicode_character_name (i, buf);
129
130      if (unicode_names[i] != NULL)
131	{
132	  if (result == NULL)
133	    {
134	      fprintf (stderr, "\\u%04X name lookup failed!\n", i);
135	      error = 1;
136	    }
137	  else if (strcmp (result, unicode_names[i]) != 0)
138	    {
139	      fprintf (stderr, "\\u%04X name lookup returned wrong name: %s\n",
140			       i, result);
141	      error = 1;
142	    }
143	}
144      else
145	{
146	  if (result != NULL)
147	    {
148	      fprintf (stderr, "\\u%04X name lookup returned wrong name: %s\n",
149			       i, result);
150	      error = 1;
151	    }
152	}
153    }
154
155  for (i = 0x110000; i < 0x1000000; i++)
156    {
157      char *result = unicode_character_name (i, buf);
158
159      if (result != NULL)
160	{
161	  fprintf (stderr, "\\u%04X name lookup returned wrong name: %s\n",
162			   i, result);
163	  error = 1;
164	}
165    }
166
167  return error;
168}
169
170/* Perform a test of the unicode_name_character function.  */
171static int
172test_inverse_lookup ()
173{
174  int error = 0;
175  unsigned int i;
176
177  /* First, verify all valid character names are recognized.  */
178  for (i = 0; i < 0x110000; i++)
179    if (unicode_names[i] != NULL)
180      {
181	unsigned int result = unicode_name_character (unicode_names[i]);
182	if (result != i)
183	  {
184	    if (result == UNINAME_INVALID)
185	      fprintf (stderr, "inverse name lookup of \"%s\" failed\n",
186		       unicode_names[i]);
187	    else
188	      fprintf (stderr,
189		       "inverse name lookup of \"%s\" returned 0x%04X\n",
190		       unicode_names[i], result);
191	    error = 1;
192	  }
193      }
194
195  /* Second, generate random but likely names and verify they are not
196     recognized unless really valid.  */
197  for (i = 0; i < 10000; i++)
198    {
199      unsigned int i1, i2;
200      const char *s1;
201      const char *s2;
202      unsigned int l1, l2, j1, j2;
203      char buf[2*UNINAME_MAX];
204      unsigned int result;
205
206      do i1 = ((rand () % 0x11) << 16)
207	      + ((rand () & 0xff) << 8)
208	      + (rand () & 0xff);
209      while (unicode_names[i1] == NULL);
210
211      do i2 = ((rand () % 0x11) << 16)
212	      + ((rand () & 0xff) << 8)
213	      + (rand () & 0xff);
214      while (unicode_names[i2] == NULL);
215
216      s1 = unicode_names[i1];
217      l1 = strlen (s1);
218      s2 = unicode_names[i2];
219      l2 = strlen (s2);
220
221      /* Concatenate a starting piece of s1 with an ending piece of s2.  */
222      for (j1 = 1; j1 <= l1; j1++)
223	if (j1 == l1 || s1[j1] == ' ')
224	  for (j2 = 0; j2 < l2; j2++)
225	    if (j2 == 0 || s2[j2-1] == ' ')
226	      {
227		memcpy (buf, s1, j1);
228		buf[j1] = ' ';
229		memcpy (buf + j1 + 1, s2 + j2, l2 - j2 + 1);
230
231		result = unicode_name_character (buf);
232		if (result != UNINAME_INVALID
233		    && !(unicode_names[result] != NULL
234			 && strcmp (unicode_names[result], buf) == 0))
235		  {
236		    fprintf (stderr,
237			     "inverse name lookup of \"%s\" returned 0x%04X\n",
238			     unicode_names[i], result);
239		    error = 1;
240		  }
241	      }
242    }
243
244  /* Third, some extreme case that used to loop.  */
245  if (unicode_name_character ("A A") != UNINAME_INVALID)
246    error = 1;
247
248  return error;
249}
250
/* Entry point.  ARGV[1] is the name of the UnicodeData.txt file to test
   against.  Returns 0 if all tests passed and a nonzero value otherwise.  */
int
main (int argc, char *argv[])
{
  int error = 0;

  /* Without this check a missing argument would hand a NULL file name to
     fill_names and thus to fopen.  */
  if (argc < 2)
    {
      fprintf (stderr, "missing argument: name of the UnicodeData.txt file\n");
      return EXIT_FAILURE;
    }

  fill_names (argv[1]);

  error |= test_name_lookup ();
  error |= test_inverse_lookup ();

  return error;
}
263