/* Generate Unicode conforming Line Break Properties tables from a
   UnicodeData file.
3   Written by Bruno Haible <bruno@clisp.org>, 2000-2004.
4
5This program is free software: you can redistribute it and/or modify
6it under the terms of the GNU General Public License as published by
7the Free Software Foundation; either version 3 of the License, or
8(at your option) any later version.
9
10This program is distributed in the hope that it will be useful,
11but WITHOUT ANY WARRANTY; without even the implied warranty of
12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License
16along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17
18/* Usage example:
19     $ gen-lbrkprop /usr/local/share/Unidata/UnicodeData.txt \
20		    Combining.txt \
21		    /usr/local/share/Unidata/EastAsianWidth.txt \
22		    /usr/local/share/Unidata/LineBreak.txt \
23		    3.1.0
24 */
25
26#include <stdio.h>
27#include <stdlib.h>
28#include <stdbool.h>
29#include <stdint.h>
30#include <string.h>
31#include <time.h>
32
/* The properties of one code point, as listed on one line of the
   UnicodeData.txt file.  The members follow the order of the fields on
   such a line; the code point itself is not stored here, it is the index
   into unicode_attributes[].  */
struct unicode_attribute
{
  /* Character name.  */
  const char *name;
  /* General category.  */
  const char *category;
  /* Canonical combining classes.  */
  const char *combining;
  /* Bidirectional category.  */
  const char *bidi;
  /* Character decomposition mapping.  */
  const char *decomposition;
  /* Decimal digit value.  */
  const char *decdigit;
  /* Digit value.  */
  const char *digit;
  /* Numeric value.  */
  const char *numeric;
  /* Mirrored flag: nonzero when the field starts with 'Y'.  */
  int mirrored;
  /* Old Unicode 1.0 name.  */
  const char *oldname;
  /* Comment.  */
  const char *comment;
  /* Uppercase mapping.  */
  unsigned int upper;
  /* Lowercase mapping.  */
  unsigned int lower;
  /* Titlecase mapping.  */
  unsigned int title;
};

/* A field that is missing in the file is stored as "" when it is a string
   and as NONE when it is a character (i.e. one of the case mappings).  */
#define NONE (~0U)

/* The whole UnicodeData.txt file, indexed by code point.  */
struct unicode_attribute unicode_attributes[0x110000];
58
59/* Stores in unicode_attributes[i] the values from the given fields.  */
60static void
61fill_attribute (unsigned int i,
62		const char *field1, const char *field2,
63		const char *field3, const char *field4,
64		const char *field5, const char *field6,
65		const char *field7, const char *field8,
66		const char *field9, const char *field10,
67		const char *field11, const char *field12,
68		const char *field13, const char *field14)
69{
70  struct unicode_attribute * uni;
71
72  if (i >= 0x110000)
73    {
74      fprintf (stderr, "index too large\n");
75      exit (1);
76    }
77  uni = &unicode_attributes[i];
78  /* Copy the strings.  */
79  uni->name          = strdup (field1);
80  uni->category      = (field2[0] == '\0' ? "" : strdup (field2));
81  uni->combining     = (field3[0] == '\0' ? "" : strdup (field3));
82  uni->bidi          = (field4[0] == '\0' ? "" : strdup (field4));
83  uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
84  uni->decdigit      = (field6[0] == '\0' ? "" : strdup (field6));
85  uni->digit         = (field7[0] == '\0' ? "" : strdup (field7));
86  uni->numeric       = (field8[0] == '\0' ? "" : strdup (field8));
87  uni->mirrored      = (field9[0] == 'Y');
88  uni->oldname       = (field10[0] == '\0' ? "" : strdup (field10));
89  uni->comment       = (field11[0] == '\0' ? "" : strdup (field11));
90  uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
91  uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
92  uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
93}
94
/* Maximum length of a field in the UnicodeData.txt file.  */
#define FIELDLEN 120

/* Reads the next field from STREAM into BUFFER, which has room for
   FIELDLEN bytes.  The field extends up to, but does not include, the
   next occurrence of DELIM; the delimiter itself is consumed.  Carriage
   returns are dropped.
   Returns 1 when the delimiter was found; BUFFER then holds the field as a
   NUL terminated string.  Returns 0 when getc() reported EOF first; in
   that case BUFFER is not NUL terminated.
   Exits with status 1 when the field does not fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  char *out = buffer;
  int stored = 0;

  for (;;)
    {
      int ch = getc (stream);

      if (ch == EOF)
        return 0;
      if (ch == delim)
        break;

      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (ch == '\r')
        continue;

      /* Append ch, unless that would exceed the permitted length.  */
      stored++;
      if (stored >= FIELDLEN - 1)
        {
          fprintf (stderr, "field too long\n");
          exit (1);
        }
      *out++ = (char) ch;
    }

  *out = '\0';
  return 1;
}
129
130/* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
131   file.  */
132static void
133fill_attributes (const char *unicodedata_filename)
134{
135  unsigned int i, j;
136  FILE *stream;
137  char field0[FIELDLEN];
138  char field1[FIELDLEN];
139  char field2[FIELDLEN];
140  char field3[FIELDLEN];
141  char field4[FIELDLEN];
142  char field5[FIELDLEN];
143  char field6[FIELDLEN];
144  char field7[FIELDLEN];
145  char field8[FIELDLEN];
146  char field9[FIELDLEN];
147  char field10[FIELDLEN];
148  char field11[FIELDLEN];
149  char field12[FIELDLEN];
150  char field13[FIELDLEN];
151  char field14[FIELDLEN];
152  int lineno = 0;
153
154  for (i = 0; i < 0x110000; i++)
155    unicode_attributes[i].name = NULL;
156
157  stream = fopen (unicodedata_filename, "r");
158  if (stream == NULL)
159    {
160      fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
161      exit (1);
162    }
163
164  for (;;)
165    {
166      int n;
167
168      lineno++;
169      n = getfield (stream, field0, ';');
170      n += getfield (stream, field1, ';');
171      n += getfield (stream, field2, ';');
172      n += getfield (stream, field3, ';');
173      n += getfield (stream, field4, ';');
174      n += getfield (stream, field5, ';');
175      n += getfield (stream, field6, ';');
176      n += getfield (stream, field7, ';');
177      n += getfield (stream, field8, ';');
178      n += getfield (stream, field9, ';');
179      n += getfield (stream, field10, ';');
180      n += getfield (stream, field11, ';');
181      n += getfield (stream, field12, ';');
182      n += getfield (stream, field13, ';');
183      n += getfield (stream, field14, '\n');
184      if (n == 0)
185	break;
186      if (n != 15)
187	{
188	  fprintf (stderr, "short line in'%s':%d\n",
189		   unicodedata_filename, lineno);
190	  exit (1);
191	}
192      i = strtoul (field0, NULL, 16);
193      if (field1[0] == '<'
194	  && strlen (field1) >= 9
195	  && !strcmp (field1 + strlen(field1) - 8, ", First>"))
196	{
197	  /* Deal with a range. */
198	  lineno++;
199	  n = getfield (stream, field0, ';');
200	  n += getfield (stream, field1, ';');
201	  n += getfield (stream, field2, ';');
202	  n += getfield (stream, field3, ';');
203	  n += getfield (stream, field4, ';');
204	  n += getfield (stream, field5, ';');
205	  n += getfield (stream, field6, ';');
206	  n += getfield (stream, field7, ';');
207	  n += getfield (stream, field8, ';');
208	  n += getfield (stream, field9, ';');
209	  n += getfield (stream, field10, ';');
210	  n += getfield (stream, field11, ';');
211	  n += getfield (stream, field12, ';');
212	  n += getfield (stream, field13, ';');
213	  n += getfield (stream, field14, '\n');
214	  if (n != 15)
215	    {
216	      fprintf (stderr, "missing end range in '%s':%d\n",
217		       unicodedata_filename, lineno);
218	      exit (1);
219	    }
220	  if (!(field1[0] == '<'
221		&& strlen (field1) >= 8
222		&& !strcmp (field1 + strlen (field1) - 7, ", Last>")))
223	    {
224	      fprintf (stderr, "missing end range in '%s':%d\n",
225		       unicodedata_filename, lineno);
226	      exit (1);
227	    }
228	  field1[strlen (field1) - 7] = '\0';
229	  j = strtoul (field0, NULL, 16);
230	  for (; i <= j; i++)
231	    fill_attribute (i, field1+1, field2, field3, field4, field5,
232			       field6, field7, field8, field9, field10,
233			       field11, field12, field13, field14);
234	}
235      else
236	{
237	  /* Single character line */
238	  fill_attribute (i, field1, field2, field3, field4, field5,
239			     field6, field7, field8, field9, field10,
240			     field11, field12, field13, field14);
241	}
242    }
243  if (ferror (stream) || fclose (stream))
244    {
245      fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
246      exit (1);
247    }
248}
249
/* The combining property from the PropList.txt file: 1 for the code
   points that have it, 0 for all others.  */
char unicode_combining[0x110000];

/* Stores in unicode_combining[] the Combining property from the
   Unicode 3.0 PropList.txt file.
   The relevant part of the file starts after the first line that contains
   "(Combining)" and ends at the first following line that starts with
   '*'.  Each line in between starts with either a four digit hexadecimal
   code point or a range of the form XXXX..XXXX.
   Exits with status 1 when the file cannot be opened or read, or when it
   is malformed.  */
static void
fill_combining (const char *proplist_filename)
{
  FILE *stream;
  char line[100+1];
  bool header_seen = false;

  memset (unicode_combining, 0, sizeof unicode_combining);

  stream = fopen (proplist_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Search for the "Property dump for: 0x20000004 (Combining)" line.  */
  while (!header_seen)
    {
      if (fscanf (stream, "%100[^\n]\n", line) < 1)
        {
          fprintf (stderr, "no combining property found in '%s'\n",
                   proplist_filename);
          exit (1);
        }
      header_seen = (strstr (line, "(Combining)") != NULL);
    }

  /* Process the entries up to the terminating '*' line.  */
  for (;;)
    {
      unsigned int first = 0;
      unsigned int last = 0;
      unsigned int ch;
      size_t len;
      bool parsed;

      if (fscanf (stream, "%100[^\n]\n", line) < 1)
        {
          fprintf (stderr, "premature end of combining property in '%s'\n",
                   proplist_filename);
          exit (1);
        }
      if (line[0] == '*')
        break;

      len = strlen (line);
      if (len >= 10 && line[4] == '.' && line[5] == '.')
        /* A range of code points.  */
        parsed = (sscanf (line, "%4X..%4X", &first, &last) >= 2);
      else if (len >= 4)
        {
          /* A single code point.  */
          parsed = (sscanf (line, "%4X", &first) >= 1);
          last = first;
        }
      else
        parsed = false;

      if (!parsed)
        {
          fprintf (stderr, "parse error in combining property in '%s'\n",
                   proplist_filename);
          exit (1);
        }

      for (ch = first; ch <= last; ch++)
        unicode_combining[ch] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
330
331/* The width property from the EastAsianWidth.txt file.
332   Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".  */
333const char * unicode_width[0x110000];
334
335/* Stores in unicode_width[] the width property from the EastAsianWidth.txt
336   file.  */
337static void
338fill_width (const char *width_filename)
339{
340  unsigned int i, j;
341  FILE *stream;
342  char field0[FIELDLEN];
343  char field1[FIELDLEN];
344  char field2[FIELDLEN];
345  int lineno = 0;
346
347  for (i = 0; i < 0x110000; i++)
348    unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
349
350  stream = fopen (width_filename, "r");
351  if (stream == NULL)
352    {
353      fprintf (stderr, "error during fopen of '%s'\n", width_filename);
354      exit (1);
355    }
356
357  for (;;)
358    {
359      int n;
360      int c;
361
362      lineno++;
363      c = getc (stream);
364      if (c == EOF)
365	break;
366      if (c == '#')
367	{
368	  do c = getc (stream); while (c != EOF && c != '\n');
369	  continue;
370	}
371      ungetc (c, stream);
372      n = getfield (stream, field0, ';');
373      n += getfield (stream, field1, ' ');
374      n += getfield (stream, field2, '\n');
375      if (n == 0)
376	break;
377      if (n != 3)
378	{
379	  fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
380	  exit (1);
381	}
382      i = strtoul (field0, NULL, 16);
383      if (strstr (field0, "..") != NULL)
384	{
385	  /* Deal with a range.  */
386	  j = strtoul (strstr (field0, "..") + 2, NULL, 16);
387	  for (; i <= j; i++)
388	    unicode_width[i] = strdup (field1);
389	}
390      else
391	{
392	  /* Single character line.  */
393	  unicode_width[i] = strdup (field1);
394	}
395    }
396  if (ferror (stream) || fclose (stream))
397    {
398      fprintf (stderr, "error reading from '%s'\n", width_filename);
399      exit (1);
400    }
401}
402
/* Line breaking classification.  */

/* The classes, listed by numeric value.  Three classes are deliberately
   left out:
     CR (carriage return) - not used here because it's a DOSism,
     LF (line feed)       - not used here because it's a DOSism,
     SG (surrogates)      - not used here because they are not
                            characters.  */
enum
{
  /* Values < 20.  */
  LBP_BK =  0, /* mandatory break */
  LBP_ZW =  1, /* zero width space */
  LBP_IN =  2, /* inseparable */
  LBP_GL =  3, /* non-breaking (glue) */
  LBP_BA =  4, /* break opportunity after */
  LBP_BB =  5, /* break opportunity before */
  LBP_B2 =  6, /* break opportunity before and after */
  LBP_HY =  7, /* hyphen */
  LBP_NS =  8, /* non starter */
  LBP_OP =  9, /* opening punctuation */
  LBP_CL = 10, /* closing punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_EX = 12, /* exclamation/interrogation */
  LBP_ID = 13, /* ideographic */
  LBP_NU = 14, /* numeric */
  LBP_IS = 15, /* infix separator (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_PR = 18, /* prefix (numeric) */
  LBP_PO = 19, /* postfix (numeric) */

  /* Values >= 20 are resolved at run time.  */
  LBP_CM = 20, /* attached characters and combining marks */
  LBP_SP = 21, /* space */
  LBP_CB = 22, /* contingent break opportunity */
  LBP_SA = 23, /* complex context (South East Asian) */
  LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25  /* unknown */
};
438
439/* Returns the line breaking classification for ch, as a bit mask.  */
440static int
441get_lbp (unsigned int ch)
442{
443  int attr = 0;
444
445  if (unicode_attributes[ch].name != NULL)
446    {
447      /* mandatory break */
448      if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
449	  || ch == 0x000C /* form feed */
450	  || ch == 0x2028 /* LINE SEPARATOR */
451	  || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
452	attr |= 1 << LBP_BK;
453
454      /* zero width space */
455      if (ch == 0x200B /* ZERO WIDTH SPACE */)
456	attr |= 1 << LBP_ZW;
457
458      /* inseparable */
459      if (ch == 0x2024 /* ONE DOT LEADER */
460	  || ch == 0x2025 /* TWO DOT LEADER */
461	  || ch == 0x2026 /* HORIZONTAL ELLIPSIS */)
462	attr |= 1 << LBP_IN;
463
464      /* non-breaking (glue) */
465      if (ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */
466	  || ch == 0x00A0 /* NO-BREAK SPACE */
467	  || ch == 0x202F /* NARROW NO-BREAK SPACE */
468	  || ch == 0x2007 /* FIGURE SPACE */
469	  || ch == 0x2011 /* NON-BREAKING HYPHEN */
470	  || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */)
471	attr |= 1 << LBP_GL;
472
473      /* contingent break opportunity */
474      if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
475	attr |= 1 << LBP_CB;
476
477      /* space */
478      if (ch == 0x0020 /* SPACE */)
479	attr |= 1 << LBP_SP;
480
481      /* break opportunity after */
482      if (ch == 0x2000 /* EN QUAD */
483	  || ch == 0x2001 /* EM QUAD */
484	  || ch == 0x2002 /* EN SPACE */
485	  || ch == 0x2003 /* EM SPACE */
486	  || ch == 0x2004 /* THREE-PER-EM SPACE */
487	  || ch == 0x2005 /* FOUR-PER-EM SPACE */
488	  || ch == 0x2006 /* SIX-PER-EM SPACE */
489	  || ch == 0x2008 /* PUNCTUATION SPACE */
490	  || ch == 0x2009 /* THIN SPACE */
491	  || ch == 0x200A /* HAIR SPACE */
492	  || ch == 0x0009 /* tab */
493	  || ch == 0x058A /* ARMENIAN HYPHEN */
494	  || ch == 0x2010 /* HYPHEN */
495	  || ch == 0x2012 /* FIGURE DASH */
496	  || ch == 0x2013 /* EN DASH */
497	  || ch == 0x00AD /* SOFT HYPHEN */
498	  || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
499	  || ch == 0x1361 /* ETHIOPIC WORDSPACE */
500	  || ch == 0x1680 /* OGHAM SPACE MARK */
501	  || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
502	  || ch == 0x2027 /* HYPHENATION POINT */
503	  || ch == 0x007C /* VERTICAL LINE */)
504	attr |= 1 << LBP_BA;
505
506      /* break opportunity before */
507      if (ch == 0x00B4 /* ACUTE ACCENT */
508	  || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
509	  || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
510	  || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
511	attr |= 1 << LBP_BB;
512
513      /* break opportunity before and after */
514      if (ch == 0x2014 /* EM DASH */)
515	attr |= 1 << LBP_B2;
516
517      /* hyphen */
518      if (ch == 0x002D /* HYPHEN-MINUS */)
519	attr |= 1 << LBP_HY;
520
521      /* exclamation/interrogation */
522      if (ch == 0x0021 /* EXCLAMATION MARK */
523	  || ch == 0x003F /* QUESTION MARK */
524	  || ch == 0xFE56 /* SMALL QUESTION MARK */
525	  || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
526	  || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
527	  || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
528	attr |= 1 << LBP_EX;
529
530      /* opening punctuation */
531      if (unicode_attributes[ch].category[0] == 'P'
532	  && unicode_attributes[ch].category[1] == 's')
533	attr |= 1 << LBP_OP;
534
535      /* closing punctuation */
536      if (ch == 0x3001 /* IDEOGRAPHIC COMMA */
537	  || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
538	  || ch == 0xFE50 /* SMALL COMMA */
539	  || ch == 0xFE52 /* SMALL FULL STOP */
540	  || ch == 0xFF0C /* FULLWIDTH COMMA */
541	  || ch == 0xFF0E /* FULLWIDTH FULL STOP */
542	  || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
543	  || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
544	  || (unicode_attributes[ch].category[0] == 'P'
545	      && unicode_attributes[ch].category[1] == 'e'))
546	attr |= 1 << LBP_CL;
547
548      /* ambiguous quotation */
549      if (ch == 0x0022 /* QUOTATION MARK */
550	  || ch == 0x0027 /* APOSTROPHE */
551	  || (unicode_attributes[ch].category[0] == 'P'
552	      && (unicode_attributes[ch].category[1] == 'f'
553		  || unicode_attributes[ch].category[1] == 'i')))
554	attr |= 1 << LBP_QU;
555
556      /* attached characters and combining marks */
557      if ((unicode_attributes[ch].category[0] == 'M'
558	   && (unicode_attributes[ch].category[1] == 'n'
559	       || unicode_attributes[ch].category[1] == 'c'
560	       || unicode_attributes[ch].category[1] == 'e'))
561	  || (ch >= 0x1160 && ch <= 0x11F9)
562	  || (unicode_attributes[ch].category[0] == 'C'
563	      && (unicode_attributes[ch].category[1] == 'c'
564		  || unicode_attributes[ch].category[1] == 'f')))
565	if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL))))
566	  attr |= 1 << LBP_CM;
567
568      /* non starter */
569      if (ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
570	  || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
571	  || ch == 0x17D4 /* KHMER SIGN KHAN */
572	  || ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
573	  || ch == 0x17D7 /* KHMER SIGN LEK TOO */
574	  || ch == 0x17D8 /* KHMER SIGN BEYYAL */
575	  || ch == 0x17D9 /* KHMER SIGN PHNAEK MUAN */
576	  || ch == 0x17DA /* KHMER SIGN KOOMUUT */
577	  || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
578	  || ch == 0x2044 /* FRACTION SLASH */
579	  || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
580	  || ch == 0x301C /* WAVE DASH */
581	  || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
582	  || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
583	  || ch == 0x309D /* HIRAGANA ITERATION MARK */
584	  || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
585	  || ch == 0x30FB /* KATAKANA MIDDLE DOT */
586	  || ch == 0x30FD /* KATAKANA ITERATION MARK */
587	  || ch == 0xFE54 /* SMALL SEMICOLON */
588	  || ch == 0xFE55 /* SMALL COLON */
589	  || ch == 0xFF1A /* FULLWIDTH COLON */
590	  || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
591	  || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
592	  || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
593	  || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
594	  || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
595	  || (unicode_attributes[ch].category[0] == 'L'
596	      && unicode_attributes[ch].category[1] == 'm'
597	      && (unicode_width[ch][0] == 'W'
598		  || unicode_width[ch][0] == 'H'))
599	  || (unicode_attributes[ch].category[0] == 'S'
600	      && unicode_attributes[ch].category[1] == 'k'
601	      && unicode_width[ch][0] == 'W')
602	  || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
603	  || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
604	attr |= 1 << LBP_NS;
605
606      /* numeric */
607      if (unicode_attributes[ch].category[0] == 'N'
608	  && unicode_attributes[ch].category[1] == 'd'
609	  && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
610	attr |= 1 << LBP_NU;
611
612      /* infix separator (numeric) */
613      if (ch == 0x002C /* COMMA */
614	  || ch == 0x002E /* FULL STOP */
615	  || ch == 0x003A /* COLON */
616	  || ch == 0x003B /* SEMICOLON */
617	  || ch == 0x0589 /* ARMENIAN FULL STOP */)
618	attr |= 1 << LBP_IS;
619
620      /* symbols allowing breaks */
621      if (ch == 0x002F /* SOLIDUS */)
622	attr |= 1 << LBP_SY;
623
624      /* postfix (numeric) */
625      if (ch == 0x0025 /* PERCENT SIGN */
626	  || ch == 0x00A2 /* CENT SIGN */
627	  || ch == 0x00B0 /* DEGREE SIGN */
628	  || ch == 0x2030 /* PER MILLE SIGN */
629	  || ch == 0x2031 /* PER TEN THOUSAND SIGN */
630	  || ch == 0x2032 /* PRIME */
631	  || ch == 0x2033 /* DOUBLE PRIME */
632	  || ch == 0x2034 /* TRIPLE PRIME */
633	  || ch == 0x2035 /* REVERSED PRIME */
634	  || ch == 0x2036 /* REVERSED DOUBLE PRIME */
635	  || ch == 0x2037 /* REVERSED TRIPLE PRIME */
636	  || ch == 0x20A7 /* PESETA SIGN */
637	  || ch == 0x2103 /* DEGREE CELSIUS */
638	  || ch == 0x2109 /* DEGREE FAHRENHEIT */
639	  || ch == 0x2126 /* OHM SIGN */
640	  || ch == 0xFE6A /* SMALL PERCENT SIGN */
641	  || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
642	  || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */)
643	attr |= 1 << LBP_PO;
644
645      /* prefix (numeric) */
646      if (ch == 0x002B /* PLUS SIGN */
647	  || ch == 0x005C /* REVERSE SOLIDUS */
648	  || ch == 0x00B1 /* PLUS-MINUS SIGN */
649	  || ch == 0x2116 /* NUMERO SIGN */
650	  || ch == 0x2212 /* MINUS SIGN */
651	  || ch == 0x2213 /* MINUS-OR-PLUS SIGN */
652	  || (unicode_attributes[ch].category[0] == 'S'
653	      && unicode_attributes[ch].category[1] == 'c'))
654	if (!(attr & (1 << LBP_PO)))
655	  attr |= 1 << LBP_PR;
656
657      /* complex context (South East Asian) */
658      if (((ch >= 0x0E00 && ch <= 0x0EFF)
659	   || (ch >= 0x1000 && ch <= 0x109F)
660	   || (ch >= 0x1780 && ch <= 0x17FF))
661	  && unicode_attributes[ch].category[0] == 'L'
662	  && (unicode_attributes[ch].category[1] == 'm'
663	      || unicode_attributes[ch].category[1] == 'o'))
664	if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_NU) | (1 << LBP_BA) | (1 << LBP_PR))))
665	  attr |= 1 << LBP_SA;
666
667      /* ideographic */
668      if ((ch >= 0x1100 && ch <= 0x115F) /* HANGUL CHOSEONG */
669	  || (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
670	  || ch == 0x3000 /* IDEOGRAPHIC SPACE */
671	  || (ch >= 0x3130 && ch <= 0x318F) /* HANGUL LETTER */
672	  || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
673	  || (ch >= 0x4E00 && ch <= 0x9FAF) /* CJK Ideograph */
674	  || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK COMPATIBILITY IDEOGRAPH */
675	  || (ch >= 0xAC00 && ch <= 0xD7AF) /* HANGUL SYLLABLE */
676	  || (ch >= 0xA000 && ch <= 0xA48C) /* YI SYLLABLE */
677	  || (ch >= 0xA490 && ch <= 0xA4C6) /* YI RADICAL */
678	  || ch == 0xFE62 /* SMALL PLUS SIGN */
679	  || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
680	  || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
681	  || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
682	  || ch == 0xFE66 /* SMALL EQUALS SIGN */
683	  || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
684	  || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
685	  || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
686	  || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
687	  || (ch >= 0x3000 && ch <= 0x33FF
688	      && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
689	  /* Extra characters for compatibility with Unicode LineBreak.txt.  */
690	  || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
691	  || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
692	  || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
693	  || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
694	  || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
695	  || ch == 0xFE49 /* DASHED OVERLINE */
696	  || ch == 0xFE4A /* CENTRELINE OVERLINE */
697	  || ch == 0xFE4B /* WAVY OVERLINE */
698	  || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
699	  || ch == 0xFE4D /* DASHED LOW LINE */
700	  || ch == 0xFE4E /* CENTRELINE LOW LINE */
701	  || ch == 0xFE4F /* WAVY LOW LINE */
702	  || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
703	  || ch == 0xFE58 /* SMALL EM DASH */
704	  || ch == 0xFE5F /* SMALL NUMBER SIGN */
705	  || ch == 0xFE60 /* SMALL AMPERSAND */
706	  || ch == 0xFE61 /* SMALL ASTERISK */
707	  || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
708	  || ch == 0xFE6B /* SMALL COMMERCIAL AT */
709	  || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
710	  || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
711	  || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
712	  || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
713	  || ch == 0xFF0A /* FULLWIDTH ASTERISK */
714	  || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
715	  || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
716	  || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
717	  || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
718	  || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
719	  || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
720	  || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
721	  || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
722	  || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
723	  || ch == 0xFF3F /* FULLWIDTH LOW LINE */
724	  || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
725	  || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
726	  || ch == 0xFF5E /* FULLWIDTH TILDE */
727	  || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
728	  || ch == 0xFFE3 /* FULLWIDTH MACRON */
729	  || ch == 0xFFE4) /* FULLWIDTH BROKEN BAR */
730	{
731	  /* ambiguous (ideograph) ? */
732	  if (unicode_width[ch] != NULL
733	      && unicode_width[ch][0] == 'A')
734	    attr |= 1 << LBP_AI;
735	  else
736	    attr |= 1 << LBP_ID;
737	}
738
739      /* ordinary alphabetic and symbol characters */
740      if ((unicode_attributes[ch].category[0] == 'L'
741	   && (unicode_attributes[ch].category[1] == 'u'
742	       || unicode_attributes[ch].category[1] == 'l'
743	       || unicode_attributes[ch].category[1] == 't'
744	       || unicode_attributes[ch].category[1] == 'm'
745	       || unicode_attributes[ch].category[1] == 'o'))
746	  || (unicode_attributes[ch].category[0] == 'S'
747	      && (unicode_attributes[ch].category[1] == 'm'
748		  || unicode_attributes[ch].category[1] == 'c'
749		  || unicode_attributes[ch].category[1] == 'k'
750		  || unicode_attributes[ch].category[1] == 'o'))
751	  /* Extra characters for compatibility with Unicode LineBreak.txt.  */
752	  || ch == 0x0023 /* NUMBER SIGN */
753	  || ch == 0x0026 /* AMPERSAND */
754	  || ch == 0x002A /* ASTERISK */
755	  || ch == 0x0040 /* COMMERCIAL AT */
756	  || ch == 0x005F /* LOW LINE */
757	  || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
758	  || ch == 0x00B2 /* SUPERSCRIPT TWO */
759	  || ch == 0x00B3 /* SUPERSCRIPT THREE */
760	  || ch == 0x00B7 /* MIDDLE DOT */
761	  || ch == 0x00B9 /* SUPERSCRIPT ONE */
762	  || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
763	  || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
764	  || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
765	  || ch == 0x00BF /* INVERTED QUESTION MARK */
766	  || ch == 0x037E /* GREEK QUESTION MARK */
767	  || ch == 0x0387 /* GREEK ANO TELEIA */
768	  || ch == 0x055A /* ARMENIAN APOSTROPHE */
769	  || ch == 0x055B /* ARMENIAN EMPHASIS MARK */
770	  || ch == 0x055C /* ARMENIAN EXCLAMATION MARK */
771	  || ch == 0x055D /* ARMENIAN COMMA */
772	  || ch == 0x055E /* ARMENIAN QUESTION MARK */
773	  || ch == 0x055F /* ARMENIAN ABBREVIATION MARK */
774	  || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
775	  || ch == 0x05C0 /* HEBREW PUNCTUATION PASEQ */
776	  || ch == 0x05C3 /* HEBREW PUNCTUATION SOF PASUQ */
777	  || ch == 0x05F3 /* HEBREW PUNCTUATION GERESH */
778	  || ch == 0x05F4 /* HEBREW PUNCTUATION GERSHAYIM */
779	  || ch == 0x060C /* ARABIC COMMA */
780	  || ch == 0x061B /* ARABIC SEMICOLON */
781	  || ch == 0x061F /* ARABIC QUESTION MARK */
782	  || ch == 0x066A /* ARABIC PERCENT SIGN */
783	  || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
784	  || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */
785	  || ch == 0x066D /* ARABIC FIVE POINTED STAR */
786	  || ch == 0x06D4 /* ARABIC FULL STOP */
787	  || ch == 0x0700 /* SYRIAC END OF PARAGRAPH */
788	  || ch == 0x0701 /* SYRIAC SUPRALINEAR FULL STOP */
789	  || ch == 0x0702 /* SYRIAC SUBLINEAR FULL STOP */
790	  || ch == 0x0703 /* SYRIAC SUPRALINEAR COLON */
791	  || ch == 0x0704 /* SYRIAC SUBLINEAR COLON */
792	  || ch == 0x0705 /* SYRIAC HORIZONTAL COLON */
793	  || ch == 0x0706 /* SYRIAC COLON SKEWED LEFT */
794	  || ch == 0x0707 /* SYRIAC COLON SKEWED RIGHT */
795	  || ch == 0x0708 /* SYRIAC SUPRALINEAR COLON SKEWED LEFT */
796	  || ch == 0x0709 /* SYRIAC SUBLINEAR COLON SKEWED RIGHT */
797	  || ch == 0x070A /* SYRIAC CONTRACTION */
798	  || ch == 0x070B /* SYRIAC HARKLEAN OBELUS */
799	  || ch == 0x070C /* SYRIAC HARKLEAN METOBELUS */
800	  || ch == 0x070D /* SYRIAC HARKLEAN ASTERISCUS */
801	  || ch == 0x0964 /* DEVANAGARI DANDA */
802	  || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
803	  || ch == 0x0970 /* DEVANAGARI ABBREVIATION SIGN */
804	  || ch == 0x09F4 /* BENGALI CURRENCY NUMERATOR ONE */
805	  || ch == 0x09F5 /* BENGALI CURRENCY NUMERATOR TWO */
806	  || ch == 0x09F6 /* BENGALI CURRENCY NUMERATOR THREE */
807	  || ch == 0x09F7 /* BENGALI CURRENCY NUMERATOR FOUR */
808	  || ch == 0x09F8 /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
809	  || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
810	  || ch == 0x0BF0 /* TAMIL NUMBER TEN */
811	  || ch == 0x0BF1 /* TAMIL NUMBER ONE HUNDRED */
812	  || ch == 0x0BF2 /* TAMIL NUMBER ONE THOUSAND */
813	  || ch == 0x0DF4 /* SINHALA PUNCTUATION KUNDDALIYA */
814	  || ch == 0x0E4F /* THAI CHARACTER FONGMAN */
815	  || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
816	  || ch == 0x0F05 /* TIBETAN MARK CLOSING YIG MGO SGAB MA */
817	  || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
818	  || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
819	  || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
820	  || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
821	  || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
822	  || ch == 0x0F0D /* TIBETAN MARK SHAD */
823	  || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
824	  || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
825	  || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
826	  || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
827	  || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
828	  || ch == 0x0F2A /* TIBETAN DIGIT HALF ONE */
829	  || ch == 0x0F2B /* TIBETAN DIGIT HALF TWO */
830	  || ch == 0x0F2C /* TIBETAN DIGIT HALF THREE */
831	  || ch == 0x0F2D /* TIBETAN DIGIT HALF FOUR */
832	  || ch == 0x0F2E /* TIBETAN DIGIT HALF FIVE */
833	  || ch == 0x0F2F /* TIBETAN DIGIT HALF SIX */
834	  || ch == 0x0F30 /* TIBETAN DIGIT HALF SEVEN */
835	  || ch == 0x0F31 /* TIBETAN DIGIT HALF EIGHT */
836	  || ch == 0x0F32 /* TIBETAN DIGIT HALF NINE */
837	  || ch == 0x0F33 /* TIBETAN DIGIT HALF ZERO */
838	  || ch == 0x0F85 /* TIBETAN MARK PALUTA */
839	  || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
840	  || ch == 0x104B /* MYANMAR SIGN SECTION */
841	  || ch == 0x104C /* MYANMAR SYMBOL LOCATIVE */
842	  || ch == 0x104D /* MYANMAR SYMBOL COMPLETED */
843	  || ch == 0x104E /* MYANMAR SYMBOL AFOREMENTIONED */
844	  || ch == 0x104F /* MYANMAR SYMBOL GENITIVE */
845	  || ch == 0x10FB /* GEORGIAN PARAGRAPH SEPARATOR */
846	  || ch == 0x1362 /* ETHIOPIC FULL STOP */
847	  || ch == 0x1363 /* ETHIOPIC COMMA */
848	  || ch == 0x1364 /* ETHIOPIC SEMICOLON */
849	  || ch == 0x1365 /* ETHIOPIC COLON */
850	  || ch == 0x1366 /* ETHIOPIC PREFACE COLON */
851	  || ch == 0x1367 /* ETHIOPIC QUESTION MARK */
852	  || ch == 0x1368 /* ETHIOPIC PARAGRAPH SEPARATOR */
853	  || ch == 0x1372 /* ETHIOPIC NUMBER TEN */
854	  || ch == 0x1373 /* ETHIOPIC NUMBER TWENTY */
855	  || ch == 0x1374 /* ETHIOPIC NUMBER THIRTY */
856	  || ch == 0x1375 /* ETHIOPIC NUMBER FORTY */
857	  || ch == 0x1376 /* ETHIOPIC NUMBER FIFTY */
858	  || ch == 0x1377 /* ETHIOPIC NUMBER SIXTY */
859	  || ch == 0x1378 /* ETHIOPIC NUMBER SEVENTY */
860	  || ch == 0x1379 /* ETHIOPIC NUMBER EIGHTY */
861	  || ch == 0x137A /* ETHIOPIC NUMBER NINETY */
862	  || ch == 0x137B /* ETHIOPIC NUMBER HUNDRED */
863	  || ch == 0x137C /* ETHIOPIC NUMBER TEN THOUSAND */
864	  || ch == 0x166D /* CANADIAN SYLLABICS CHI SIGN */
865	  || ch == 0x166E /* CANADIAN SYLLABICS FULL STOP */
866	  || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
867	  || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
868	  || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
869	  || ch == 0x16EE /* RUNIC ARLAUG SYMBOL */
870	  || ch == 0x16EF /* RUNIC TVIMADUR SYMBOL */
871	  || ch == 0x16F0 /* RUNIC BELGTHOR SYMBOL */
872	  || ch == 0x17DC /* KHMER SIGN AVAKRAHASANYA */
873	  || ch == 0x1800 /* MONGOLIAN BIRGA */
874	  || ch == 0x1801 /* MONGOLIAN ELLIPSIS */
875	  || ch == 0x1802 /* MONGOLIAN COMMA */
876	  || ch == 0x1803 /* MONGOLIAN FULL STOP */
877	  || ch == 0x1804 /* MONGOLIAN COLON */
878	  || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
879	  || ch == 0x1807 /* MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER */
880	  || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
881	  || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
882	  || ch == 0x180A /* MONGOLIAN NIRUGU */
883	  || ch == 0x2015 /* HORIZONTAL BAR */
884	  || ch == 0x2016 /* DOUBLE VERTICAL LINE */
885	  || ch == 0x2017 /* DOUBLE LOW LINE */
886	  || ch == 0x2020 /* DAGGER */
887	  || ch == 0x2021 /* DOUBLE DAGGER */
888	  || ch == 0x2022 /* BULLET */
889	  || ch == 0x2023 /* TRIANGULAR BULLET */
890	  || ch == 0x2038 /* CARET */
891	  || ch == 0x203B /* REFERENCE MARK */
892	  || ch == 0x203D /* INTERROBANG */
893	  || ch == 0x203E /* OVERLINE */
894	  || ch == 0x203F /* UNDERTIE */
895	  || ch == 0x2040 /* CHARACTER TIE */
896	  || ch == 0x2041 /* CARET INSERTION POINT */
897	  || ch == 0x2042 /* ASTERISM */
898	  || ch == 0x2043 /* HYPHEN BULLET */
899	  || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
900	  || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
901	  || ch == 0x204A /* TIRONIAN SIGN ET */
902	  || ch == 0x204B /* REVERSED PILCROW SIGN */
903	  || ch == 0x204C /* BLACK LEFTWARDS BULLET */
904	  || ch == 0x204D /* BLACK RIGHTWARDS BULLET */
905	  || ch == 0x2070 /* SUPERSCRIPT ZERO */
906	  || ch == 0x2074 /* SUPERSCRIPT FOUR */
907	  || ch == 0x2075 /* SUPERSCRIPT FIVE */
908	  || ch == 0x2076 /* SUPERSCRIPT SIX */
909	  || ch == 0x2077 /* SUPERSCRIPT SEVEN */
910	  || ch == 0x2078 /* SUPERSCRIPT EIGHT */
911	  || ch == 0x2079 /* SUPERSCRIPT NINE */
912	  || ch == 0x2080 /* SUBSCRIPT ZERO */
913	  || ch == 0x2081 /* SUBSCRIPT ONE */
914	  || ch == 0x2082 /* SUBSCRIPT TWO */
915	  || ch == 0x2083 /* SUBSCRIPT THREE */
916	  || ch == 0x2084 /* SUBSCRIPT FOUR */
917	  || ch == 0x2085 /* SUBSCRIPT FIVE */
918	  || ch == 0x2086 /* SUBSCRIPT SIX */
919	  || ch == 0x2087 /* SUBSCRIPT SEVEN */
920	  || ch == 0x2088 /* SUBSCRIPT EIGHT */
921	  || ch == 0x2089 /* SUBSCRIPT NINE */
922	  || (ch >= 0x2153 && ch <= 0x215E) /* VULGAR FRACTION */
923	  || ch == 0x215F /* FRACTION NUMERATOR ONE */
924	  || (ch >= 0x2160 && ch <= 0x2183) /* ROMAN NUMERAL */
925	  || (ch >= 0x2460 && ch <= 0x2473) /* CIRCLED NUMBER */
926	  || (ch >= 0x2474 && ch <= 0x2487) /* PARENTHESIZED NUMBER */
927	  || (ch >= 0x2488 && ch <= 0x249B) /* NUMBER FULL STOP */
928	  || ch == 0x24EA /* CIRCLED DIGIT ZERO */
929	  || (ch >= 0x2776 && ch <= 0x2793) /* DINGBAT CIRCLED DIGIT */
930	  || ch == 0x10320 /* OLD ITALIC NUMERAL ONE */
931	  || ch == 0x10321 /* OLD ITALIC NUMERAL FIVE */
932	  || ch == 0x10322 /* OLD ITALIC NUMERAL TEN */
933	  || ch == 0x10323 /* OLD ITALIC NUMERAL FIFTY */
934	  || ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
935	if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_ID) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SA) | (1 << LBP_CB))))
936	  {
937	    /* ambiguous (alphabetic) ? */
938	    if (unicode_width[ch] != NULL
939		&& unicode_width[ch][0] == 'A')
940	      attr |= 1 << LBP_AI;
941	    else
942	      attr |= 1 << LBP_AL;
943	  }
944    }
945
946  if (attr == 0)
947    /* unknown */
948    attr |= 1 << LBP_XX;
949
950  return attr;
951}
952
953/* Output the line breaking properties in a human readable format.  */
954static void
955debug_output_lbp (FILE *stream)
956{
957  unsigned int i;
958
959  for (i = 0; i < 0x110000; i++)
960    {
961      int attr = get_lbp (i);
962      if (attr != 1 << LBP_XX)
963	{
964	  fprintf (stream, "0x%04X", i);
965#define PRINT_BIT(attr,bit) \
966  if (attr & (1 << bit)) fprintf (stream, " " #bit);
967	  PRINT_BIT(attr,LBP_BK);
968	  PRINT_BIT(attr,LBP_CM);
969	  PRINT_BIT(attr,LBP_ZW);
970	  PRINT_BIT(attr,LBP_IN);
971	  PRINT_BIT(attr,LBP_GL);
972	  PRINT_BIT(attr,LBP_CB);
973	  PRINT_BIT(attr,LBP_SP);
974	  PRINT_BIT(attr,LBP_BA);
975	  PRINT_BIT(attr,LBP_BB);
976	  PRINT_BIT(attr,LBP_B2);
977	  PRINT_BIT(attr,LBP_HY);
978	  PRINT_BIT(attr,LBP_NS);
979	  PRINT_BIT(attr,LBP_OP);
980	  PRINT_BIT(attr,LBP_CL);
981	  PRINT_BIT(attr,LBP_QU);
982	  PRINT_BIT(attr,LBP_EX);
983	  PRINT_BIT(attr,LBP_ID);
984	  PRINT_BIT(attr,LBP_NU);
985	  PRINT_BIT(attr,LBP_IS);
986	  PRINT_BIT(attr,LBP_SY);
987	  PRINT_BIT(attr,LBP_AL);
988	  PRINT_BIT(attr,LBP_PR);
989	  PRINT_BIT(attr,LBP_PO);
990	  PRINT_BIT(attr,LBP_SA);
991	  PRINT_BIT(attr,LBP_XX);
992	  PRINT_BIT(attr,LBP_AI);
993#undef PRINT_BIT
994	  fprintf (stream, "\n");
995	}
996    }
997}
998
/* Open FILENAME for writing and dump into it the output of
   debug_output_lbp.  Prints a message on stderr and terminates the program
   with exit status 1 if the file cannot be opened or if writing fails.  */
static void
debug_output_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out != NULL)
    {
      debug_output_lbp (out);

      /* fclose is only attempted when no write error has been recorded;
         its own result is checked too.  */
      if (!ferror (out) && fclose (out) == 0)
        return;

      fprintf (stderr, "error writing to '%s'\n", filename);
    }
  else
    fprintf (stderr, "cannot open '%s' for writing\n", filename);

  exit (1);
}
1019
1020/* The line breaking property from the LineBreak.txt file.  */
1021int unicode_org_lbp[0x110000];
1022
1023/* Stores in unicode_org_lbp[] the line breaking property from the
1024   LineBreak.txt file.  */
1025static void
1026fill_org_lbp (const char *linebreak_filename)
1027{
1028  unsigned int i, j;
1029  FILE *stream;
1030  char field0[FIELDLEN];
1031  char field1[FIELDLEN];
1032  char field2[FIELDLEN];
1033  int lineno = 0;
1034
1035  for (i = 0; i < 0x110000; i++)
1036    unicode_org_lbp[i] = LBP_XX;
1037
1038  stream = fopen (linebreak_filename, "r");
1039  if (stream == NULL)
1040    {
1041      fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
1042      exit (1);
1043    }
1044
1045  for (;;)
1046    {
1047      int n;
1048      int c;
1049      int value;
1050
1051      lineno++;
1052      c = getc (stream);
1053      if (c == EOF)
1054	break;
1055      if (c == '#')
1056	{
1057	  do c = getc (stream); while (c != EOF && c != '\n');
1058	  continue;
1059	}
1060      ungetc (c, stream);
1061      n = getfield (stream, field0, ';');
1062      n += getfield (stream, field1, ' ');
1063      n += getfield (stream, field2, '\n');
1064      if (n == 0)
1065	break;
1066      if (n != 3)
1067	{
1068	  fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
1069		   lineno);
1070	  exit (1);
1071	}
1072#define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
1073      if (false) {}
1074      TRY(LBP_BK)
1075      TRY(LBP_CM)
1076      TRY(LBP_ZW)
1077      TRY(LBP_IN)
1078      TRY(LBP_GL)
1079      TRY(LBP_CB)
1080      TRY(LBP_SP)
1081      TRY(LBP_BA)
1082      TRY(LBP_BB)
1083      TRY(LBP_B2)
1084      TRY(LBP_HY)
1085      TRY(LBP_NS)
1086      TRY(LBP_OP)
1087      TRY(LBP_CL)
1088      TRY(LBP_QU)
1089      TRY(LBP_EX)
1090      TRY(LBP_ID)
1091      TRY(LBP_NU)
1092      TRY(LBP_IS)
1093      TRY(LBP_SY)
1094      TRY(LBP_AL)
1095      TRY(LBP_PR)
1096      TRY(LBP_PO)
1097      TRY(LBP_SA)
1098      TRY(LBP_XX)
1099      TRY(LBP_AI)
1100#undef TRY
1101      else if (strcmp (field1, "LF") == 0) value = LBP_BK;
1102      else if (strcmp (field1, "CR") == 0) value = LBP_BK;
1103      else if (strcmp (field1, "SG") == 0) value = LBP_XX;
1104      else
1105	{
1106	  fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
1107		   field1, linebreak_filename, lineno);
1108	  exit (1);
1109	}
1110      i = strtoul (field0, NULL, 16);
1111      if (strstr (field0, "..") != NULL)
1112	{
1113	  /* Deal with a range.  */
1114	  j = strtoul (strstr (field0, "..") + 2, NULL, 16);
1115	  for (; i <= j; i++)
1116	    unicode_org_lbp[i] = value;
1117	}
1118      else
1119	{
1120	  /* Single character line.  */
1121	  unicode_org_lbp[i] = value;
1122	}
1123    }
1124  if (ferror (stream) || fclose (stream))
1125    {
1126      fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
1127      exit (1);
1128    }
1129}
1130
1131/* Output the line breaking properties in a human readable format.  */
1132static void
1133debug_output_org_lbp (FILE *stream)
1134{
1135  unsigned int i;
1136
1137  for (i = 0; i < 0x110000; i++)
1138    {
1139      int attr = unicode_org_lbp[i];
1140      if (attr != LBP_XX)
1141	{
1142	  fprintf (stream, "0x%04X", i);
1143#define PRINT_BIT(attr,bit) \
1144  if (attr == bit) fprintf (stream, " " #bit);
1145	  PRINT_BIT(attr,LBP_BK);
1146	  PRINT_BIT(attr,LBP_CM);
1147	  PRINT_BIT(attr,LBP_ZW);
1148	  PRINT_BIT(attr,LBP_IN);
1149	  PRINT_BIT(attr,LBP_GL);
1150	  PRINT_BIT(attr,LBP_CB);
1151	  PRINT_BIT(attr,LBP_SP);
1152	  PRINT_BIT(attr,LBP_BA);
1153	  PRINT_BIT(attr,LBP_BB);
1154	  PRINT_BIT(attr,LBP_B2);
1155	  PRINT_BIT(attr,LBP_HY);
1156	  PRINT_BIT(attr,LBP_NS);
1157	  PRINT_BIT(attr,LBP_OP);
1158	  PRINT_BIT(attr,LBP_CL);
1159	  PRINT_BIT(attr,LBP_QU);
1160	  PRINT_BIT(attr,LBP_EX);
1161	  PRINT_BIT(attr,LBP_ID);
1162	  PRINT_BIT(attr,LBP_NU);
1163	  PRINT_BIT(attr,LBP_IS);
1164	  PRINT_BIT(attr,LBP_SY);
1165	  PRINT_BIT(attr,LBP_AL);
1166	  PRINT_BIT(attr,LBP_PR);
1167	  PRINT_BIT(attr,LBP_PO);
1168	  PRINT_BIT(attr,LBP_SA);
1169	  PRINT_BIT(attr,LBP_XX);
1170	  PRINT_BIT(attr,LBP_AI);
1171#undef PRINT_BIT
1172	  fprintf (stream, "\n");
1173	}
1174    }
1175}
1176
/* Open FILENAME for writing and dump into it the output of
   debug_output_org_lbp.  Prints a message on stderr and terminates the
   program with exit status 1 if the file cannot be opened or if writing
   fails.  */
static void
debug_output_org_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out != NULL)
    {
      debug_output_org_lbp (out);

      /* fclose is only attempted when no write error has been recorded;
         its own result is checked too.  */
      if (!ferror (out) && fclose (out) == 0)
        return;

      fprintf (stderr, "error writing to '%s'\n", filename);
    }
  else
    fprintf (stderr, "cannot open '%s' for writing\n", filename);

  exit (1);
}
1197
1198/* Construction of sparse 3-level tables.  */
1199#define TABLE lbp_table
1200#define ELEMENT unsigned char
1201#define DEFAULT LBP_XX
1202#define xmalloc malloc
1203#define xrealloc realloc
1204#include "3level.h"
1205
1206static void
1207output_lbp (FILE *stream)
1208{
1209  unsigned int i;
1210  struct lbp_table t;
1211  unsigned int level1_offset, level2_offset, level3_offset;
1212
1213  t.p = 7;
1214  t.q = 9;
1215  lbp_table_init (&t);
1216
1217  for (i = 0; i < 0x110000; i++)
1218    {
1219      int attr = get_lbp (i);
1220
1221      /* Now attr should contain exactly one bit.  */
1222      if (attr == 0 || ((attr & (attr - 1)) != 0))
1223	abort ();
1224
1225      if (attr != 1 << LBP_XX)
1226	{
1227	  unsigned int log2_attr;
1228	  for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
1229
1230	  lbp_table_add (&t, i, log2_attr);
1231	}
1232    }
1233
1234  lbp_table_finalize (&t);
1235
1236  level1_offset =
1237    5 * sizeof (uint32_t);
1238  level2_offset =
1239    5 * sizeof (uint32_t)
1240    + t.level1_size * sizeof (uint32_t);
1241  level3_offset =
1242    5 * sizeof (uint32_t)
1243    + t.level1_size * sizeof (uint32_t)
1244    + (t.level2_size << t.q) * sizeof (uint32_t);
1245
1246  for (i = 0; i < 5; i++)
1247    fprintf (stream, "#define lbrkprop_header_%d %d\n", i,
1248	     ((uint32_t *) t.result)[i]);
1249  fprintf (stream, "static const\n");
1250  fprintf (stream, "struct\n");
1251  fprintf (stream, "  {\n");
1252  fprintf (stream, "    int level1[%d];\n", t.level1_size);
1253  fprintf (stream, "    int level2[%d << %d];\n", t.level2_size, t.q);
1254  fprintf (stream, "    unsigned char level3[%d << %d];\n", t.level3_size, t.p);
1255  fprintf (stream, "  }\n");
1256  fprintf (stream, "lbrkprop =\n");
1257  fprintf (stream, "{\n");
1258  fprintf (stream, "  {");
1259  for (i = 0; i < t.level1_size; i++)
1260    {
1261      uint32_t offset;
1262      if (i > 0 && (i % 8) == 0)
1263	fprintf (stream, "\n   ");
1264      offset = ((uint32_t *) (t.result + level1_offset))[i];
1265      fprintf (stream, " %5d%s",
1266	       offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
1267	       (i+1 < t.level1_size ? "," : ""));
1268    }
1269  fprintf (stream, " },\n");
1270  fprintf (stream, "  {");
1271  if (t.level2_size << t.q > 8)
1272    fprintf (stream, "\n   ");
1273  for (i = 0; i < t.level2_size << t.q; i++)
1274    {
1275      uint32_t offset;
1276      if (i > 0 && (i % 8) == 0)
1277	fprintf (stream, "\n   ");
1278      offset = ((uint32_t *) (t.result + level2_offset))[i];
1279      fprintf (stream, " %5d%s",
1280	       offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
1281	       (i+1 < t.level2_size << t.q ? "," : ""));
1282    }
1283  if (t.level2_size << t.q > 8)
1284    fprintf (stream, "\n ");
1285  fprintf (stream, " },\n");
1286  fprintf (stream, "  {");
1287  if (t.level3_size << t.p > 8)
1288    fprintf (stream, "\n   ");
1289  for (i = 0; i < t.level3_size << t.p; i++)
1290    {
1291      unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
1292      const char *value_string;
1293      switch (value)
1294	{
1295#define CASE(x) case x: value_string = #x; break;
1296	  CASE(LBP_BK);
1297	  CASE(LBP_CM);
1298	  CASE(LBP_ZW);
1299	  CASE(LBP_IN);
1300	  CASE(LBP_GL);
1301	  CASE(LBP_CB);
1302	  CASE(LBP_SP);
1303	  CASE(LBP_BA);
1304	  CASE(LBP_BB);
1305	  CASE(LBP_B2);
1306	  CASE(LBP_HY);
1307	  CASE(LBP_NS);
1308	  CASE(LBP_OP);
1309	  CASE(LBP_CL);
1310	  CASE(LBP_QU);
1311	  CASE(LBP_EX);
1312	  CASE(LBP_ID);
1313	  CASE(LBP_NU);
1314	  CASE(LBP_IS);
1315	  CASE(LBP_SY);
1316	  CASE(LBP_AL);
1317	  CASE(LBP_PR);
1318	  CASE(LBP_PO);
1319	  CASE(LBP_SA);
1320	  CASE(LBP_XX);
1321	  CASE(LBP_AI);
1322#undef CASE
1323	  default:
1324	    abort ();
1325	}
1326      if (i > 0 && (i % 8) == 0)
1327	fprintf (stream, "\n   ");
1328      fprintf (stream, " %s%s", value_string,
1329	       (i+1 < t.level3_size << t.p ? "," : ""));
1330    }
1331  if (t.level3_size << t.p > 8)
1332    fprintf (stream, "\n ");
1333  fprintf (stream, " }\n");
1334  fprintf (stream, "};\n");
1335}
1336
/* Create FILENAME and write into it the generated C table: two comment
   lines (the second one mentioning VERSION as the Unicode version), a
   license header, and the output of output_lbp.  Prints a message on stderr
   and terminates the program with exit status 1 if the file cannot be
   opened or if writing fails.  */
static void
output_tables (const char *filename, const char *version)
{
  /* Put a GPL header on it.  The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header.  */
  static const char *const license_header[] =
    {
      "/* Copyright (C) 2000-2004 Free Software Foundation, Inc.\n",
      "\n",
      "This program is free software; you can redistribute it and/or modify\n",
      "it under the terms of the GNU General Public License as published by\n",
      "the Free Software Foundation; either version 2, or (at your option)\n",
      "any later version.\n",
      "\n",
      "This program is distributed in the hope that it will be useful,\n",
      "but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n",
      "GNU General Public License for more details.\n",
      "\n",
      "You should have received a copy of the GNU General Public License\n",
      "along with this program; if not, write to the Free Software\n",
      "Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */\n",
      "\n"
    };
  const size_t n_lines = sizeof (license_header) / sizeof (license_header[0]);
  size_t k;
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* Line breaking properties of Unicode characters.  */\n", out);
  fprintf (out, "/* Generated automatically by gen-lbrkprop for Unicode %s.  */\n",
           version);
  fputs ("\n", out);

  for (k = 0; k < n_lines; k++)
    fputs (license_header[k], out);

  output_lbp (out);

  /* fclose is only attempted when no write error has been recorded.  */
  if (!ferror (out) && fclose (out) == 0)
    return;

  fprintf (stderr, "error writing to '%s'\n", filename);
  exit (1);
}
1382
/* Program entry point.  Expects exactly five arguments, in the order given
   by the usage message: the four input file names and the Unicode version
   string.  Reads the input files, writes the debugging dumps lbrkprop.txt
   and lbrkprop_org.txt and the generated table lbrkprop.h into the current
   directory.  Returns 0 on success, 1 on wrong usage.  */
int
main (int argc, char * argv[])
{
  const char *unicodedata_filename;
  const char *combining_filename;
  const char *width_filename;
  const char *linebreak_filename;
  const char *version;

  if (argc != 6)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt Combining.txt EastAsianWidth.txt LineBreak.txt version\n",
               argv[0]);
      return 1;
    }

  unicodedata_filename = argv[1];
  combining_filename = argv[2];
  width_filename = argv[3];
  linebreak_filename = argv[4];
  version = argv[5];

  /* Load all input data first.  */
  fill_attributes (unicodedata_filename);
  fill_combining (combining_filename);
  fill_width (width_filename);
  fill_org_lbp (linebreak_filename);

  /* Human readable dumps of the computed and of the original properties.  */
  debug_output_tables ("lbrkprop.txt");
  debug_output_org_tables ("lbrkprop_org.txt");

  /* The generated C table.  */
  output_tables ("lbrkprop.h", version);

  return 0;
}
1405