makeucnid.c revision 169695
1/* Make ucnid.h from various sources.
2   Copyright (C) 2005 Free Software Foundation, Inc.
3
4This program is free software; you can redistribute it and/or modify it
5under the terms of the GNU General Public License as published by the
6Free Software Foundation; either version 2, or (at your option) any
7later version.
8
9This program is distributed in the hope that it will be useful,
10but WITHOUT ANY WARRANTY; without even the implied warranty of
11MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12GNU General Public License for more details.
13
14You should have received a copy of the GNU General Public License
15along with this program; if not, write to the Free Software
16Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
17
18/* Run this program as
19   ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \
20       > ucnid.h
21*/
22
23#include <stdio.h>
24#include <string.h>
25#include <ctype.h>
26#include <stdbool.h>
27#include <stdlib.h>
28
/* Property bits kept for each code point in flags[].  Every constant
   occupies its own bit so that several can be OR-ed together.  */
enum {
  C99           = 1 << 0,  /* Set by read_ucnid for the [C99] section.  */
  CXX           = 1 << 1,  /* Set by read_ucnid for the [CXX] section.  */
  digit         = 1 << 2,  /* General category starts with 'N'.  */
  not_NFC       = 1 << 3,  /* Listed with "NFC_QC; N".  */
  not_NFKC      = 1 << 4,  /* Listed with "NFKC_QC; N".  */
  maybe_not_NFC = 1 << 5   /* Listed with "NFC_QC; M".  */
};
37
/* Tables indexed by code point; each has one slot for every value in
   the range 0 .. 0xFFFF.  */

/* OR of the flag bits from the enum above.  */
static unsigned int flags[0x10000];
/* Up to two code points of decomposition; a zero first element is
   treated by write_table as "no decomposition recorded".  */
static unsigned short decomp[0x10000][2];
/* Combining class as read from UnicodeData.txt.  */
static unsigned char combining_value[0x10000];
41
/* Write the message S and a newline to stderr, then terminate the
   program with exit status 1.  Never returns.  */

static void
fail (const char *s)
{
  fputs (s, stderr);
  fputc ('\n', stderr);
  exit (1);
}
50
51/* Read ucnid.tab and set the C99 and CXX flags in header[].  */
52
53static void
54read_ucnid (const char *fname)
55{
56  FILE *f = fopen (fname, "r");
57  unsigned fl = 0;
58
59  if (!f)
60    fail ("opening ucnid.tab");
61  for (;;)
62    {
63      char line[256];
64
65      if (!fgets (line, sizeof (line), f))
66	break;
67      if (strcmp (line, "[C99]\n") == 0)
68	fl = C99;
69      else if (strcmp (line, "[CXX]\n") == 0)
70	fl = CXX;
71      else if (isxdigit (line[0]))
72	{
73	  char *l = line;
74	  while (*l)
75	    {
76	      unsigned long start, end;
77	      char *endptr;
78	      start = strtoul (l, &endptr, 16);
79	      if (endptr == l || (*endptr != '-' && ! isspace (*endptr)))
80		fail ("parsing ucnid.tab [1]");
81	      l = endptr;
82	      if (*l != '-')
83		end = start;
84	      else
85		{
86		  end = strtoul (l + 1, &endptr, 16);
87		  if (end < start)
88		    fail ("parsing ucnid.tab, end before start");
89		  l = endptr;
90		  if (! isspace (*l))
91		    fail ("parsing ucnid.tab, junk after range");
92		}
93	      while (isspace (*l))
94		l++;
95	      if (end > 0xFFFF)
96		fail ("parsing ucnid.tab, end too large");
97	      while (start <= end)
98		flags[start++] |= fl;
99	    }
100	}
101    }
102  if (ferror (f))
103    fail ("reading ucnid.tab");
104  fclose (f);
105}
106
107/* Read UnicodeData.txt and set the 'digit' flag, and
108   also fill in the 'decomp' table to be the decompositions of
109   characters for which both the character decomposed and all the code
110   points in the decomposition are either C99 or CXX.  */
111
112static void
113read_table (char *fname)
114{
115  FILE * f = fopen (fname, "r");
116
117  if (!f)
118    fail ("opening UnicodeData.txt");
119  for (;;)
120    {
121      char line[256];
122      unsigned long codepoint, this_decomp[4];
123      char *l;
124      int i;
125      int decomp_useful;
126
127      if (!fgets (line, sizeof (line), f))
128	break;
129      codepoint = strtoul (line, &l, 16);
130      if (l == line || *l != ';')
131	fail ("parsing UnicodeData.txt, reading code point");
132      if (codepoint > 0xffff || ! (flags[codepoint] & (C99 | CXX)))
133	continue;
134
135      do {
136	l++;
137      } while (*l != ';');
138      /* Category value; things starting with 'N' are numbers of some
139	 kind.  */
140      if (*++l == 'N')
141	flags[codepoint] |= digit;
142
143      do {
144	l++;
145      } while (*l != ';');
146      /* Canonical combining class; in NFC/NFKC, they must be increasing
147	 (or zero).  */
148      if (! isdigit (*++l))
149	fail ("parsing UnicodeData.txt, combining class not number");
150      combining_value[codepoint] = strtoul (l, &l, 10);
151      if (*l++ != ';')
152	fail ("parsing UnicodeData.txt, junk after combining class");
153
154      /* Skip over bidi value.  */
155      do {
156	l++;
157      } while (*l != ';');
158
159      /* Decomposition mapping.  */
160      decomp_useful = flags[codepoint];
161      if (*++l == '<')  /* Compatibility mapping. */
162	continue;
163      for (i = 0; i < 4; i++)
164	{
165	  if (*l == ';')
166	    break;
167	  if (!isxdigit (*l))
168	    fail ("parsing UnicodeData.txt, decomposition format");
169	  this_decomp[i] = strtoul (l, &l, 16);
170	  decomp_useful &= flags[this_decomp[i]];
171	  while (isspace (*l))
172	    l++;
173	}
174      if (i > 2)  /* Decomposition too long.  */
175	fail ("parsing UnicodeData.txt, decomposition too long");
176      if (decomp_useful)
177	while (--i >= 0)
178	  decomp[codepoint][i] = this_decomp[i];
179    }
180  if (ferror (f))
181    fail ("reading UnicodeData.txt");
182  fclose (f);
183}
184
185/* Read DerivedNormalizationProps.txt and set the flags that say whether
186   a character is in NFC, NFKC, or is context-dependent.  */
187
188static void
189read_derived (const char *fname)
190{
191  FILE * f = fopen (fname, "r");
192
193  if (!f)
194    fail ("opening DerivedNormalizationProps.txt");
195  for (;;)
196    {
197      char line[256];
198      unsigned long start, end;
199      char *l;
200      bool not_NFC_p, not_NFKC_p, maybe_not_NFC_p;
201
202      if (!fgets (line, sizeof (line), f))
203	break;
204      not_NFC_p = (strstr (line, "; NFC_QC; N") != NULL);
205      not_NFKC_p = (strstr (line, "; NFKC_QC; N") != NULL);
206      maybe_not_NFC_p = (strstr (line, "; NFC_QC; M") != NULL);
207      if (! not_NFC_p && ! not_NFKC_p && ! maybe_not_NFC_p)
208	continue;
209
210      start = strtoul (line, &l, 16);
211      if (l == line)
212	fail ("parsing DerivedNormalizationProps.txt, reading start");
213      if (start > 0xffff)
214	continue;
215      if (*l == '.' && l[1] == '.')
216	end = strtoul (l + 2, &l, 16);
217      else
218	end = start;
219
220      while (start <= end)
221	flags[start++] |= ((not_NFC_p ? not_NFC : 0)
222			   | (not_NFKC_p ? not_NFKC : 0)
223			   | (maybe_not_NFC_p ? maybe_not_NFC : 0)
224			   );
225    }
226  if (ferror (f))
227    fail ("reading DerivedNormalizationProps.txt");
228  fclose (f);
229}
230
231/* Write out the table.
232   The table consists of two words per entry.  The first word is the flags
233   for the unicode code points up to and including the second word.  */
234
235static void
236write_table (void)
237{
238  unsigned i;
239  unsigned last_flag = flags[0];
240  bool really_safe = decomp[0][0] == 0;
241  unsigned char last_combine = combining_value[0];
242
243  for (i = 1; i <= 65536; i++)
244    if (i == 65536
245	|| (flags[i] != last_flag && ((flags[i] | last_flag) & (C99 | CXX)))
246	|| really_safe != (decomp[i][0] == 0)
247	|| combining_value[i] != last_combine)
248      {
249	printf ("{ %s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
250		last_flag & C99 ? "C99" : "  0",
251		last_flag & digit ? "DIG" : "  0",
252		last_flag & CXX ? "CXX" : "  0",
253		really_safe ? "CID" : "  0",
254		last_flag & not_NFC ? "  0" : "NFC",
255		last_flag & not_NFKC ? "  0" : "NKC",
256		last_flag & maybe_not_NFC ? "CTX" : "  0",
257		combining_value[i - 1],
258		i - 1);
259	last_flag = flags[i];
260	last_combine = combining_value[0];
261	really_safe = decomp[i][0] == 0;
262      }
263}
264
/* Emit the fixed copyright header (itself a C comment) on stdout,
   followed by one extra newline supplied by puts.  */

static void
write_copyright (void)
{
  static const char notice[] =
    "/* Unicode characters and various properties.\n"
    "   Copyright (C) 2003, 2005 Free Software Foundation, Inc.\n"
    "\n"
    "   This program is free software; you can redistribute it and/or modify it\n"
    "   under the terms of the GNU General Public License as published by the\n"
    "   Free Software Foundation; either version 2, or (at your option) any\n"
    "   later version.\n"
    "\n"
    "   This program is distributed in the hope that it will be useful,\n"
    "   but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    "   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
    "   GNU General Public License for more details.\n"
    "\n"
    "   You should have received a copy of the GNU General Public License\n"
    "   along with this program; if not, write to the Free Software\n"
    "   Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n"
    "\n"
    "\n"
    "   Copyright (C) 1991-2005 Unicode, Inc.  All rights reserved.\n"
    "   Distributed under the Terms of Use in\n"
    "   http://www.unicode.org/copyright.html.\n"
    "\n"
    "   Permission is hereby granted, free of charge, to any person\n"
    "   obtaining a copy of the Unicode data files and any associated\n"
    "   documentation (the \"Data Files\") or Unicode software and any\n"
    "   associated documentation (the \"Software\") to deal in the Data Files\n"
    "   or Software without restriction, including without limitation the\n"
    "   rights to use, copy, modify, merge, publish, distribute, and/or\n"
    "   sell copies of the Data Files or Software, and to permit persons to\n"
    "   whom the Data Files or Software are furnished to do so, provided\n"
    "   that (a) the above copyright notice(s) and this permission notice\n"
    "   appear with all copies of the Data Files or Software, (b) both the\n"
    "   above copyright notice(s) and this permission notice appear in\n"
    "   associated documentation, and (c) there is clear notice in each\n"
    "   modified Data File or in the Software as well as in the\n"
    "   documentation associated with the Data File(s) or Software that the\n"
    "   data or software has been modified.\n"
    "\n"
    "   THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n"
    "   OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n"
    "   WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n"
    "   NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n"
    "   COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n"
    "   ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n"
    "   DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n"
    "   WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n"
    "   ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n"
    "   OF THE DATA FILES OR SOFTWARE.\n"
    "\n"
    "   Except as contained in this notice, the name of a copyright holder\n"
    "   shall not be used in advertising or otherwise to promote the sale,\n"
    "   use or other dealings in these Data Files or Software without prior\n"
    "   written authorization of the copyright holder.  */\n";

  puts (notice);
}
327
/* Main program.  Expects exactly three arguments: the paths of
   ucnid.tab, UnicodeData.txt and DerivedNormalizationProps.txt, in that
   order.  Reads them in that order, then writes the copyright notice
   and the table to stdout.  Returns 0 on success; every failure exits
   with status 1 through fail.  */

int
main (int argc, char **argv)
{
  /* Both too few and too many arguments end up here, so show the
     expected command line rather than guessing which it was.  */
  if (argc != 4)
    fail ("usage: makeucnid ucnid.tab UnicodeData.txt"
          " DerivedNormalizationProps.txt > ucnid.h");
  read_ucnid (argv[1]);
  read_table (argv[2]);
  read_derived (argv[3]);

  write_copyright ();
  write_table ();
  /* The output is normally redirected into a header; a failed write
     must not be reported as success.  */
  if (fflush (stdout) != 0 || ferror (stdout))
    fail ("writing output");
  return 0;
}
343