1/*
2 * Copyright (c) 2014 SGI.
3 * All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18
19/* Generator for a compact trie for unicode normalization */
20
21#include <sys/types.h>
22#include <stddef.h>
23#include <stdlib.h>
24#include <stdio.h>
25#include <assert.h>
26#include <string.h>
27#include <unistd.h>
28#include <errno.h>
29
30/* Default names of the in- and output files. */
31
32#define AGE_NAME	"DerivedAge.txt"
33#define CCC_NAME	"DerivedCombiningClass.txt"
34#define PROP_NAME	"DerivedCoreProperties.txt"
35#define DATA_NAME	"UnicodeData.txt"
36#define FOLD_NAME	"CaseFolding.txt"
37#define NORM_NAME	"NormalizationCorrections.txt"
38#define TEST_NAME	"NormalizationTest.txt"
39#define UTF8_NAME	"utf8data.h"
40
41const char	*age_name  = AGE_NAME;
42const char	*ccc_name  = CCC_NAME;
43const char	*prop_name = PROP_NAME;
44const char	*data_name = DATA_NAME;
45const char	*fold_name = FOLD_NAME;
46const char	*norm_name = NORM_NAME;
47const char	*test_name = TEST_NAME;
48const char	*utf8_name = UTF8_NAME;
49
50int verbose = 0;
51
52/* An arbitrary line size limit on input lines. */
53
54#define LINESIZE	1024
55char line[LINESIZE];
56char buf0[LINESIZE];
57char buf1[LINESIZE];
58char buf2[LINESIZE];
59char buf3[LINESIZE];
60
61const char *argv0;
62
63#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
64
65/* ------------------------------------------------------------------ */
66
67/*
68 * Unicode version numbers consist of three parts: major, minor, and a
69 * revision.  These numbers are packed into an unsigned int to obtain
70 * a single version number.
71 *
72 * To save space in the generated trie, the unicode version is not
73 * stored directly, instead we calculate a generation number from the
74 * unicode versions seen in the DerivedAge file, and use that as an
75 * index into a table of unicode versions.
76 */
77#define UNICODE_MAJ_SHIFT		(16)
78#define UNICODE_MIN_SHIFT		(8)
79
80#define UNICODE_MAJ_MAX			((unsigned short)-1)
81#define UNICODE_MIN_MAX			((unsigned char)-1)
82#define UNICODE_REV_MAX			((unsigned char)-1)
83
84#define UNICODE_AGE(MAJ,MIN,REV)			\
85	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
86	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
87	 ((unsigned int)(REV)))
88
89unsigned int *ages;
90int ages_count;
91
92unsigned int unicode_maxage;
93
94static int age_valid(unsigned int major, unsigned int minor,
95		     unsigned int revision)
96{
97	if (major > UNICODE_MAJ_MAX)
98		return 0;
99	if (minor > UNICODE_MIN_MAX)
100		return 0;
101	if (revision > UNICODE_REV_MAX)
102		return 0;
103	return 1;
104}
105
106/* ------------------------------------------------------------------ */
107
108/*
109 * utf8trie_t
110 *
111 * A compact binary tree, used to decode UTF-8 characters.
112 *
113 * Internal nodes are one byte for the node itself, and up to three
114 * bytes for an offset into the tree.  The first byte contains the
115 * following information:
116 *  NEXTBYTE  - flag        - advance to next byte if set
117 *  BITNUM    - 3 bit field - the bit number to tested
118 *  OFFLEN    - 2 bit field - number of bytes in the offset
119 * if offlen == 0 (non-branching node)
120 *  RIGHTPATH - 1 bit field - set if the following node is for the
121 *                            right-hand path (tested bit is set)
122 *  TRIENODE  - 1 bit field - set if the following node is an internal
123 *                            node, otherwise it is a leaf node
124 * if offlen != 0 (branching node)
125 *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
126 *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
127 *
128 * Due to the way utf8 works, there cannot be branching nodes with
129 * NEXTBYTE set, and moreover those nodes always have a righthand
130 * descendant.
131 */
132typedef unsigned char utf8trie_t;
133#define BITNUM		0x07
134#define NEXTBYTE	0x08
135#define OFFLEN		0x30
136#define OFFLEN_SHIFT	4
137#define RIGHTPATH	0x40
138#define TRIENODE	0x80
139#define RIGHTNODE	0x40
140#define LEFTNODE	0x80
141
142/*
143 * utf8leaf_t
144 *
145 * The leaves of the trie are embedded in the trie, and so the same
146 * underlying datatype, unsigned char.
147 *
148 * leaf[0]: The unicode version, stored as a generation number that is
149 *          an index into utf8agetab[].  With this we can filter code
150 *          points based on the unicode version in which they were
151 *          defined.  The CCC of a non-defined code point is 0.
152 * leaf[1]: Canonical Combining Class. During normalization, we need
153 *          to do a stable sort into ascending order of all characters
154 *          with a non-zero CCC that occur between two characters with
155 *          a CCC of 0, or at the begin or end of a string.
156 *          The unicode standard guarantees that all CCC values are
157 *          between 0 and 254 inclusive, which leaves 255 available as
158 *          a special value.
159 *          Code points with CCC 0 are known as stoppers.
160 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
161 *          start of a NUL-terminated string that is the decomposition
162 *          of the character.
163 *          The CCC of a decomposable character is the same as the CCC
164 *          of the first character of its decomposition.
165 *          Some characters decompose as the empty string: these are
166 *          characters with the Default_Ignorable_Code_Point property.
167 *          These do affect normalization, as they all have CCC 0.
168 *
169 * The decompositions in the trie have been fully expanded.
170 *
171 * Casefolding, if applicable, is also done using decompositions.
172 */
173typedef unsigned char utf8leaf_t;
174
175#define LEAF_GEN(LEAF)	((LEAF)[0])
176#define LEAF_CCC(LEAF)	((LEAF)[1])
177#define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))
178
179#define MAXGEN		(255)
180
181#define MINCCC		(0)
182#define MAXCCC		(254)
183#define STOPPER		(0)
184#define DECOMPOSE	(255)
185#define HANGUL		((char)(255))
186
187#define UTF8HANGULLEAF	(12)
188
189struct tree;
190static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
191			       const char *, size_t);
192static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
193
194unsigned char *utf8data;
195size_t utf8data_size;
196
197utf8trie_t *nfdi;
198utf8trie_t *nfdicf;
199
200/* ------------------------------------------------------------------ */
201
202/*
203 * UTF8 valid ranges.
204 *
205 * The UTF-8 encoding spreads the bits of a 32bit word over several
206 * bytes. This table gives the ranges that can be held and how they'd
207 * be represented.
208 *
209 * 0x00000000 0x0000007F: 0xxxxxxx
210 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
211 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
212 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
213 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
214 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
215 *
216 * There is an additional requirement on UTF-8, in that only the
217 * shortest representation of a 32bit value is to be used.  A decoder
218 * must not decode sequences that do not satisfy this requirement.
219 * Thus the allowed ranges have a lower bound.
220 *
221 * 0x00000000 0x0000007F: 0xxxxxxx
222 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
223 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
224 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
225 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
226 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
227 *
228 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
229 * 17 planes of 65536 values.  This limits the sequences actually seen
230 * even more, to just the following.
231 *
232 *          0 -     0x7f: 0                     0x7f
233 *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
234 *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
235 *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
236 *
237 * Even within those ranges not all values are allowed: the surrogates
238 * 0xd800 - 0xdfff should never be seen.
239 *
240 * Note that the longest sequence seen with valid usage is 4 bytes,
241 * the same a single UTF-32 character.  This makes the UTF-8
242 * representation of Unicode strictly smaller than UTF-32.
243 *
244 * The shortest sequence requirement was introduced by:
245 *    Corrigendum #1: UTF-8 Shortest Form
246 * It can be found here:
247 *    http://www.unicode.org/versions/corrigendum1.html
248 *
249 */
250
251#define UTF8_2_BITS     0xC0
252#define UTF8_3_BITS     0xE0
253#define UTF8_4_BITS     0xF0
254#define UTF8_N_BITS     0x80
255#define UTF8_2_MASK     0xE0
256#define UTF8_3_MASK     0xF0
257#define UTF8_4_MASK     0xF8
258#define UTF8_N_MASK     0xC0
259#define UTF8_V_MASK     0x3F
260#define UTF8_V_SHIFT    6
261
262static int utf8encode(char *str, unsigned int val)
263{
264	int len;
265
266	if (val < 0x80) {
267		str[0] = val;
268		len = 1;
269	} else if (val < 0x800) {
270		str[1] = val & UTF8_V_MASK;
271		str[1] |= UTF8_N_BITS;
272		val >>= UTF8_V_SHIFT;
273		str[0] = val;
274		str[0] |= UTF8_2_BITS;
275		len = 2;
276	} else if (val < 0x10000) {
277		str[2] = val & UTF8_V_MASK;
278		str[2] |= UTF8_N_BITS;
279		val >>= UTF8_V_SHIFT;
280		str[1] = val & UTF8_V_MASK;
281		str[1] |= UTF8_N_BITS;
282		val >>= UTF8_V_SHIFT;
283		str[0] = val;
284		str[0] |= UTF8_3_BITS;
285		len = 3;
286	} else if (val < 0x110000) {
287		str[3] = val & UTF8_V_MASK;
288		str[3] |= UTF8_N_BITS;
289		val >>= UTF8_V_SHIFT;
290		str[2] = val & UTF8_V_MASK;
291		str[2] |= UTF8_N_BITS;
292		val >>= UTF8_V_SHIFT;
293		str[1] = val & UTF8_V_MASK;
294		str[1] |= UTF8_N_BITS;
295		val >>= UTF8_V_SHIFT;
296		str[0] = val;
297		str[0] |= UTF8_4_BITS;
298		len = 4;
299	} else {
300		printf("%#x: illegal val\n", val);
301		len = 0;
302	}
303	return len;
304}
305
306static unsigned int utf8decode(const char *str)
307{
308	const unsigned char *s = (const unsigned char*)str;
309	unsigned int unichar = 0;
310
311	if (*s < 0x80) {
312		unichar = *s;
313	} else if (*s < UTF8_3_BITS) {
314		unichar = *s++ & 0x1F;
315		unichar <<= UTF8_V_SHIFT;
316		unichar |= *s & 0x3F;
317	} else if (*s < UTF8_4_BITS) {
318		unichar = *s++ & 0x0F;
319		unichar <<= UTF8_V_SHIFT;
320		unichar |= *s++ & 0x3F;
321		unichar <<= UTF8_V_SHIFT;
322		unichar |= *s & 0x3F;
323	} else {
324		unichar = *s++ & 0x0F;
325		unichar <<= UTF8_V_SHIFT;
326		unichar |= *s++ & 0x3F;
327		unichar <<= UTF8_V_SHIFT;
328		unichar |= *s++ & 0x3F;
329		unichar <<= UTF8_V_SHIFT;
330		unichar |= *s & 0x3F;
331	}
332	return unichar;
333}
334
335static int utf32valid(unsigned int unichar)
336{
337	return unichar < 0x110000;
338}
339
340#define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)
341
342#define NODE 1
343#define LEAF 0
344
345struct tree {
346	void *root;
347	int childnode;
348	const char *type;
349	unsigned int maxage;
350	struct tree *next;
351	int (*leaf_equal)(void *, void *);
352	void (*leaf_print)(void *, int);
353	int (*leaf_mark)(void *);
354	int (*leaf_size)(void *);
355	int *(*leaf_index)(struct tree *, void *);
356	unsigned char *(*leaf_emit)(void *, unsigned char *);
357	int leafindex[0x110000];
358	int index;
359};
360
361struct node {
362	int index;
363	int offset;
364	int mark;
365	int size;
366	struct node *parent;
367	void *left;
368	void *right;
369	unsigned char bitnum;
370	unsigned char nextbyte;
371	unsigned char leftnode;
372	unsigned char rightnode;
373	unsigned int keybits;
374	unsigned int keymask;
375};
376
377/*
378 * Example lookup function for a tree.
379 */
380static void *lookup(struct tree *tree, const char *key)
381{
382	struct node *node;
383	void *leaf = NULL;
384
385	node = tree->root;
386	while (!leaf && node) {
387		if (node->nextbyte)
388			key++;
389		if (*key & (1 << (node->bitnum & 7))) {
390			/* Right leg */
391			if (node->rightnode == NODE) {
392				node = node->right;
393			} else if (node->rightnode == LEAF) {
394				leaf = node->right;
395			} else {
396				node = NULL;
397			}
398		} else {
399			/* Left leg */
400			if (node->leftnode == NODE) {
401				node = node->left;
402			} else if (node->leftnode == LEAF) {
403				leaf = node->left;
404			} else {
405				node = NULL;
406			}
407		}
408	}
409
410	return leaf;
411}
412
413/*
414 * A simple non-recursive tree walker: keep track of visits to the
415 * left and right branches in the leftmask and rightmask.
416 */
417static void tree_walk(struct tree *tree)
418{
419	struct node *node;
420	unsigned int leftmask;
421	unsigned int rightmask;
422	unsigned int bitmask;
423	int indent = 1;
424	int nodes, singletons, leaves;
425
426	nodes = singletons = leaves = 0;
427
428	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
429	if (tree->childnode == LEAF) {
430		assert(tree->root);
431		tree->leaf_print(tree->root, indent);
432		leaves = 1;
433	} else {
434		assert(tree->childnode == NODE);
435		node = tree->root;
436		leftmask = rightmask = 0;
437		while (node) {
438			printf("%*snode @ %p bitnum %d nextbyte %d"
439			       " left %p right %p mask %x bits %x\n",
440				indent, "", node,
441				node->bitnum, node->nextbyte,
442				node->left, node->right,
443				node->keymask, node->keybits);
444			nodes += 1;
445			if (!(node->left && node->right))
446				singletons += 1;
447
448			while (node) {
449				bitmask = 1 << node->bitnum;
450				if ((leftmask & bitmask) == 0) {
451					leftmask |= bitmask;
452					if (node->leftnode == LEAF) {
453						assert(node->left);
454						tree->leaf_print(node->left,
455								 indent+1);
456						leaves += 1;
457					} else if (node->left) {
458						assert(node->leftnode == NODE);
459						indent += 1;
460						node = node->left;
461						break;
462					}
463				}
464				if ((rightmask & bitmask) == 0) {
465					rightmask |= bitmask;
466					if (node->rightnode == LEAF) {
467						assert(node->right);
468						tree->leaf_print(node->right,
469								 indent+1);
470						leaves += 1;
471					} else if (node->right) {
472						assert(node->rightnode == NODE);
473						indent += 1;
474						node = node->right;
475						break;
476					}
477				}
478				leftmask &= ~bitmask;
479				rightmask &= ~bitmask;
480				node = node->parent;
481				indent -= 1;
482			}
483		}
484	}
485	printf("nodes %d leaves %d singletons %d\n",
486	       nodes, leaves, singletons);
487}
488
489/*
490 * Allocate an initialize a new internal node.
491 */
492static struct node *alloc_node(struct node *parent)
493{
494	struct node *node;
495	int bitnum;
496
497	node = malloc(sizeof(*node));
498	node->left = node->right = NULL;
499	node->parent = parent;
500	node->leftnode = NODE;
501	node->rightnode = NODE;
502	node->keybits = 0;
503	node->keymask = 0;
504	node->mark = 0;
505	node->index = 0;
506	node->offset = -1;
507	node->size = 4;
508
509	if (node->parent) {
510		bitnum = parent->bitnum;
511		if ((bitnum & 7) == 0) {
512			node->bitnum = bitnum + 7 + 8;
513			node->nextbyte = 1;
514		} else {
515			node->bitnum = bitnum - 1;
516			node->nextbyte = 0;
517		}
518	} else {
519		node->bitnum = 7;
520		node->nextbyte = 0;
521	}
522
523	return node;
524}
525
526/*
527 * Insert a new leaf into the tree, and collapse any subtrees that are
528 * fully populated and end in identical leaves. A nextbyte tagged
529 * internal node will not be removed to preserve the tree's integrity.
530 * Note that due to the structure of utf8, no nextbyte tagged node
531 * will be a candidate for removal.
532 */
533static int insert(struct tree *tree, char *key, int keylen, void *leaf)
534{
535	struct node *node;
536	struct node *parent;
537	void **cursor;
538	int keybits;
539
540	assert(keylen >= 1 && keylen <= 4);
541
542	node = NULL;
543	cursor = &tree->root;
544	keybits = 8 * keylen;
545
546	/* Insert, creating path along the way. */
547	while (keybits) {
548		if (!*cursor)
549			*cursor = alloc_node(node);
550		node = *cursor;
551		if (node->nextbyte)
552			key++;
553		if (*key & (1 << (node->bitnum & 7)))
554			cursor = &node->right;
555		else
556			cursor = &node->left;
557		keybits--;
558	}
559	*cursor = leaf;
560
561	/* Merge subtrees if possible. */
562	while (node) {
563		if (*key & (1 << (node->bitnum & 7)))
564			node->rightnode = LEAF;
565		else
566			node->leftnode = LEAF;
567		if (node->nextbyte)
568			break;
569		if (node->leftnode == NODE || node->rightnode == NODE)
570			break;
571		assert(node->left);
572		assert(node->right);
573		/* Compare */
574		if (! tree->leaf_equal(node->left, node->right))
575			break;
576		/* Keep left, drop right leaf. */
577		leaf = node->left;
578		/* Check in parent */
579		parent = node->parent;
580		if (!parent) {
581			/* root of tree! */
582			tree->root = leaf;
583			tree->childnode = LEAF;
584		} else if (parent->left == node) {
585			parent->left = leaf;
586			parent->leftnode = LEAF;
587			if (parent->right) {
588				parent->keymask = 0;
589				parent->keybits = 0;
590			} else {
591				parent->keymask |= (1 << node->bitnum);
592			}
593		} else if (parent->right == node) {
594			parent->right = leaf;
595			parent->rightnode = LEAF;
596			if (parent->left) {
597				parent->keymask = 0;
598				parent->keybits = 0;
599			} else {
600				parent->keymask |= (1 << node->bitnum);
601				parent->keybits |= (1 << node->bitnum);
602			}
603		} else {
604			/* internal tree error */
605			assert(0);
606		}
607		free(node);
608		node = parent;
609	}
610
611	/* Propagate keymasks up along singleton chains. */
612	while (node) {
613		parent = node->parent;
614		if (!parent)
615			break;
616		/* Nix the mask for parents with two children. */
617		if (node->keymask == 0) {
618			parent->keymask = 0;
619			parent->keybits = 0;
620		} else if (parent->left && parent->right) {
621			parent->keymask = 0;
622			parent->keybits = 0;
623		} else {
624			assert((parent->keymask & node->keymask) == 0);
625			parent->keymask |= node->keymask;
626			parent->keymask |= (1 << parent->bitnum);
627			parent->keybits |= node->keybits;
628			if (parent->right)
629				parent->keybits |= (1 << parent->bitnum);
630		}
631		node = parent;
632	}
633
634	return 0;
635}
636
637/*
638 * Prune internal nodes.
639 *
640 * Fully populated subtrees that end at the same leaf have already
641 * been collapsed.  There are still internal nodes that have for both
642 * their left and right branches a sequence of singletons that make
643 * identical choices and end in identical leaves.  The keymask and
644 * keybits collected in the nodes describe the choices made in these
645 * singleton chains.  When they are identical for the left and right
646 * branch of a node, and the two leaves comare identical, the node in
647 * question can be removed.
648 *
649 * Note that nodes with the nextbyte tag set will not be removed by
650 * this to ensure tree integrity.  Note as well that the structure of
651 * utf8 ensures that these nodes would not have been candidates for
652 * removal in any case.
653 */
654static void prune(struct tree *tree)
655{
656	struct node *node;
657	struct node *left;
658	struct node *right;
659	struct node *parent;
660	void *leftleaf;
661	void *rightleaf;
662	unsigned int leftmask;
663	unsigned int rightmask;
664	unsigned int bitmask;
665	int count;
666
667	if (verbose > 0)
668		printf("Pruning %s_%x\n", tree->type, tree->maxage);
669
670	count = 0;
671	if (tree->childnode == LEAF)
672		return;
673	if (!tree->root)
674		return;
675
676	leftmask = rightmask = 0;
677	node = tree->root;
678	while (node) {
679		if (node->nextbyte)
680			goto advance;
681		if (node->leftnode == LEAF)
682			goto advance;
683		if (node->rightnode == LEAF)
684			goto advance;
685		if (!node->left)
686			goto advance;
687		if (!node->right)
688			goto advance;
689		left = node->left;
690		right = node->right;
691		if (left->keymask == 0)
692			goto advance;
693		if (right->keymask == 0)
694			goto advance;
695		if (left->keymask != right->keymask)
696			goto advance;
697		if (left->keybits != right->keybits)
698			goto advance;
699		leftleaf = NULL;
700		while (!leftleaf) {
701			assert(left->left || left->right);
702			if (left->leftnode == LEAF)
703				leftleaf = left->left;
704			else if (left->rightnode == LEAF)
705				leftleaf = left->right;
706			else if (left->left)
707				left = left->left;
708			else if (left->right)
709				left = left->right;
710			else
711				assert(0);
712		}
713		rightleaf = NULL;
714		while (!rightleaf) {
715			assert(right->left || right->right);
716			if (right->leftnode == LEAF)
717				rightleaf = right->left;
718			else if (right->rightnode == LEAF)
719				rightleaf = right->right;
720			else if (right->left)
721				right = right->left;
722			else if (right->right)
723				right = right->right;
724			else
725				assert(0);
726		}
727		if (! tree->leaf_equal(leftleaf, rightleaf))
728			goto advance;
729		/*
730		 * This node has identical singleton-only subtrees.
731		 * Remove it.
732		 */
733		parent = node->parent;
734		left = node->left;
735		right = node->right;
736		if (parent->left == node)
737			parent->left = left;
738		else if (parent->right == node)
739			parent->right = left;
740		else
741			assert(0);
742		left->parent = parent;
743		left->keymask |= (1 << node->bitnum);
744		node->left = NULL;
745		while (node) {
746			bitmask = 1 << node->bitnum;
747			leftmask &= ~bitmask;
748			rightmask &= ~bitmask;
749			if (node->leftnode == NODE && node->left) {
750				left = node->left;
751				free(node);
752				count++;
753				node = left;
754			} else if (node->rightnode == NODE && node->right) {
755				right = node->right;
756				free(node);
757				count++;
758				node = right;
759			} else {
760				node = NULL;
761			}
762		}
763		/* Propagate keymasks up along singleton chains. */
764		node = parent;
765		/* Force re-check */
766		bitmask = 1 << node->bitnum;
767		leftmask &= ~bitmask;
768		rightmask &= ~bitmask;
769		for (;;) {
770			if (node->left && node->right)
771				break;
772			if (node->left) {
773				left = node->left;
774				node->keymask |= left->keymask;
775				node->keybits |= left->keybits;
776			}
777			if (node->right) {
778				right = node->right;
779				node->keymask |= right->keymask;
780				node->keybits |= right->keybits;
781			}
782			node->keymask |= (1 << node->bitnum);
783			node = node->parent;
784			/* Force re-check */
785			bitmask = 1 << node->bitnum;
786			leftmask &= ~bitmask;
787			rightmask &= ~bitmask;
788		}
789	advance:
790		bitmask = 1 << node->bitnum;
791		if ((leftmask & bitmask) == 0 &&
792		    node->leftnode == NODE &&
793		    node->left) {
794			leftmask |= bitmask;
795			node = node->left;
796		} else if ((rightmask & bitmask) == 0 &&
797			   node->rightnode == NODE &&
798			   node->right) {
799			rightmask |= bitmask;
800			node = node->right;
801		} else {
802			leftmask &= ~bitmask;
803			rightmask &= ~bitmask;
804			node = node->parent;
805		}
806	}
807	if (verbose > 0)
808		printf("Pruned %d nodes\n", count);
809}
810
811/*
812 * Mark the nodes in the tree that lead to leaves that must be
813 * emitted.
814 */
815static void mark_nodes(struct tree *tree)
816{
817	struct node *node;
818	struct node *n;
819	unsigned int leftmask;
820	unsigned int rightmask;
821	unsigned int bitmask;
822	int marked;
823
824	marked = 0;
825	if (verbose > 0)
826		printf("Marking %s_%x\n", tree->type, tree->maxage);
827	if (tree->childnode == LEAF)
828		goto done;
829
830	assert(tree->childnode == NODE);
831	node = tree->root;
832	leftmask = rightmask = 0;
833	while (node) {
834		bitmask = 1 << node->bitnum;
835		if ((leftmask & bitmask) == 0) {
836			leftmask |= bitmask;
837			if (node->leftnode == LEAF) {
838				assert(node->left);
839				if (tree->leaf_mark(node->left)) {
840					n = node;
841					while (n && !n->mark) {
842						marked++;
843						n->mark = 1;
844						n = n->parent;
845					}
846				}
847			} else if (node->left) {
848				assert(node->leftnode == NODE);
849				node = node->left;
850				continue;
851			}
852		}
853		if ((rightmask & bitmask) == 0) {
854			rightmask |= bitmask;
855			if (node->rightnode == LEAF) {
856				assert(node->right);
857				if (tree->leaf_mark(node->right)) {
858					n = node;
859					while (n && !n->mark) {
860						marked++;
861						n->mark = 1;
862						n = n->parent;
863					}
864				}
865			} else if (node->right) {
866				assert(node->rightnode == NODE);
867				node = node->right;
868				continue;
869			}
870		}
871		leftmask &= ~bitmask;
872		rightmask &= ~bitmask;
873		node = node->parent;
874	}
875
876	/* second pass: left siblings and singletons */
877
878	assert(tree->childnode == NODE);
879	node = tree->root;
880	leftmask = rightmask = 0;
881	while (node) {
882		bitmask = 1 << node->bitnum;
883		if ((leftmask & bitmask) == 0) {
884			leftmask |= bitmask;
885			if (node->leftnode == LEAF) {
886				assert(node->left);
887				if (tree->leaf_mark(node->left)) {
888					n = node;
889					while (n && !n->mark) {
890						marked++;
891						n->mark = 1;
892						n = n->parent;
893					}
894				}
895			} else if (node->left) {
896				assert(node->leftnode == NODE);
897				node = node->left;
898				if (!node->mark && node->parent->mark) {
899					marked++;
900					node->mark = 1;
901				}
902				continue;
903			}
904		}
905		if ((rightmask & bitmask) == 0) {
906			rightmask |= bitmask;
907			if (node->rightnode == LEAF) {
908				assert(node->right);
909				if (tree->leaf_mark(node->right)) {
910					n = node;
911					while (n && !n->mark) {
912						marked++;
913						n->mark = 1;
914						n = n->parent;
915					}
916				}
917			} else if (node->right) {
918				assert(node->rightnode == NODE);
919				node = node->right;
920				if (!node->mark && node->parent->mark &&
921				    !node->parent->left) {
922					marked++;
923					node->mark = 1;
924				}
925				continue;
926			}
927		}
928		leftmask &= ~bitmask;
929		rightmask &= ~bitmask;
930		node = node->parent;
931	}
932done:
933	if (verbose > 0)
934		printf("Marked %d nodes\n", marked);
935}
936
937/*
938 * Compute the index of each node and leaf, which is the offset in the
939 * emitted trie.  These values must be pre-computed because relative
940 * offsets between nodes are used to navigate the tree.
941 */
942static int index_nodes(struct tree *tree, int index)
943{
944	struct node *node;
945	unsigned int leftmask;
946	unsigned int rightmask;
947	unsigned int bitmask;
948	int count;
949	int indent;
950
951	/* Align to a cache line (or half a cache line?). */
952	while (index % 64)
953		index++;
954	tree->index = index;
955	indent = 1;
956	count = 0;
957
958	if (verbose > 0)
959		printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
960	if (tree->childnode == LEAF) {
961		index += tree->leaf_size(tree->root);
962		goto done;
963	}
964
965	assert(tree->childnode == NODE);
966	node = tree->root;
967	leftmask = rightmask = 0;
968	while (node) {
969		if (!node->mark)
970			goto skip;
971		count++;
972		if (node->index != index)
973			node->index = index;
974		index += node->size;
975skip:
976		while (node) {
977			bitmask = 1 << node->bitnum;
978			if (node->mark && (leftmask & bitmask) == 0) {
979				leftmask |= bitmask;
980				if (node->leftnode == LEAF) {
981					assert(node->left);
982					*tree->leaf_index(tree, node->left) =
983									index;
984					index += tree->leaf_size(node->left);
985					count++;
986				} else if (node->left) {
987					assert(node->leftnode == NODE);
988					indent += 1;
989					node = node->left;
990					break;
991				}
992			}
993			if (node->mark && (rightmask & bitmask) == 0) {
994				rightmask |= bitmask;
995				if (node->rightnode == LEAF) {
996					assert(node->right);
997					*tree->leaf_index(tree, node->right) = index;
998					index += tree->leaf_size(node->right);
999					count++;
1000				} else if (node->right) {
1001					assert(node->rightnode == NODE);
1002					indent += 1;
1003					node = node->right;
1004					break;
1005				}
1006			}
1007			leftmask &= ~bitmask;
1008			rightmask &= ~bitmask;
1009			node = node->parent;
1010			indent -= 1;
1011		}
1012	}
1013done:
1014	/* Round up to a multiple of 16 */
1015	while (index % 16)
1016		index++;
1017	if (verbose > 0)
1018		printf("Final index %d\n", index);
1019	return index;
1020}
1021
1022/*
1023 * Mark the nodes in a subtree, helper for size_nodes().
1024 */
1025static int mark_subtree(struct node *node)
1026{
1027	int changed;
1028
1029	if (!node || node->mark)
1030		return 0;
1031	node->mark = 1;
1032	node->index = node->parent->index;
1033	changed = 1;
1034	if (node->leftnode == NODE)
1035		changed += mark_subtree(node->left);
1036	if (node->rightnode == NODE)
1037		changed += mark_subtree(node->right);
1038	return changed;
1039}
1040
1041/*
1042 * Compute the size of nodes and leaves. We start by assuming that
1043 * each node needs to store a three-byte offset. The indexes of the
1044 * nodes are calculated based on that, and then this function is
1045 * called to see if the sizes of some nodes can be reduced.  This is
1046 * repeated until no more changes are seen.
1047 */
1048static int size_nodes(struct tree *tree)
1049{
1050	struct tree *next;
1051	struct node *node;
1052	struct node *right;
1053	struct node *n;
1054	unsigned int leftmask;
1055	unsigned int rightmask;
1056	unsigned int bitmask;
1057	unsigned int pathbits;
1058	unsigned int pathmask;
1059	unsigned int nbit;
1060	int changed;
1061	int offset;
1062	int size;
1063	int indent;
1064
1065	indent = 1;
1066	changed = 0;
1067	size = 0;
1068
1069	if (verbose > 0)
1070		printf("Sizing %s_%x\n", tree->type, tree->maxage);
1071	if (tree->childnode == LEAF)
1072		goto done;
1073
1074	assert(tree->childnode == NODE);
1075	pathbits = 0;
1076	pathmask = 0;
1077	node = tree->root;
1078	leftmask = rightmask = 0;
1079	while (node) {
1080		if (!node->mark)
1081			goto skip;
1082		offset = 0;
1083		if (!node->left || !node->right) {
1084			size = 1;
1085		} else {
1086			if (node->rightnode == NODE) {
1087				/*
1088				 * If the right node is not marked,
1089				 * look for a corresponding node in
1090				 * the next tree.  Such a node need
1091				 * not exist.
1092				 */
1093				right = node->right;
1094				next = tree->next;
1095				while (!right->mark) {
1096					assert(next);
1097					n = next->root;
1098					while (n->bitnum != node->bitnum) {
1099						nbit = 1 << n->bitnum;
1100						if (!(pathmask & nbit))
1101							break;
1102						if (pathbits & nbit) {
1103							if (n->rightnode == LEAF)
1104								break;
1105							n = n->right;
1106						} else {
1107							if (n->leftnode == LEAF)
1108								break;
1109							n = n->left;
1110						}
1111					}
1112					if (n->bitnum != node->bitnum)
1113						break;
1114					n = n->right;
1115					right = n;
1116					next = next->next;
1117				}
1118				/* Make sure the right node is marked. */
1119				if (!right->mark)
1120					changed += mark_subtree(right);
1121				offset = right->index - node->index;
1122			} else {
1123				offset = *tree->leaf_index(tree, node->right);
1124				offset -= node->index;
1125			}
1126			assert(offset >= 0);
1127			assert(offset <= 0xffffff);
1128			if (offset <= 0xff) {
1129				size = 2;
1130			} else if (offset <= 0xffff) {
1131				size = 3;
1132			} else { /* offset <= 0xffffff */
1133				size = 4;
1134			}
1135		}
1136		if (node->size != size || node->offset != offset) {
1137			node->size = size;
1138			node->offset = offset;
1139			changed++;
1140		}
1141skip:
1142		while (node) {
1143			bitmask = 1 << node->bitnum;
1144			pathmask |= bitmask;
1145			if (node->mark && (leftmask & bitmask) == 0) {
1146				leftmask |= bitmask;
1147				if (node->leftnode == LEAF) {
1148					assert(node->left);
1149				} else if (node->left) {
1150					assert(node->leftnode == NODE);
1151					indent += 1;
1152					node = node->left;
1153					break;
1154				}
1155			}
1156			if (node->mark && (rightmask & bitmask) == 0) {
1157				rightmask |= bitmask;
1158				pathbits |= bitmask;
1159				if (node->rightnode == LEAF) {
1160					assert(node->right);
1161				} else if (node->right) {
1162					assert(node->rightnode == NODE);
1163					indent += 1;
1164					node = node->right;
1165					break;
1166				}
1167			}
1168			leftmask &= ~bitmask;
1169			rightmask &= ~bitmask;
1170			pathmask &= ~bitmask;
1171			pathbits &= ~bitmask;
1172			node = node->parent;
1173			indent -= 1;
1174		}
1175	}
1176done:
1177	if (verbose > 0)
1178		printf("Found %d changes\n", changed);
1179	return changed;
1180}
1181
1182/*
1183 * Emit a trie for the given tree into the data array.
1184 */
1185static void emit(struct tree *tree, unsigned char *data)
1186{
1187	struct node *node;
1188	unsigned int leftmask;
1189	unsigned int rightmask;
1190	unsigned int bitmask;
1191	int offlen;
1192	int offset;
1193	int index;
1194	int indent;
1195	int size;
1196	int bytes;
1197	int leaves;
1198	int nodes[4];
1199	unsigned char byte;
1200
1201	nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1202	leaves = 0;
1203	bytes = 0;
1204	index = tree->index;
1205	data += index;
1206	indent = 1;
1207	if (verbose > 0)
1208		printf("Emitting %s_%x\n", tree->type, tree->maxage);
1209	if (tree->childnode == LEAF) {
1210		assert(tree->root);
1211		tree->leaf_emit(tree->root, data);
1212		size = tree->leaf_size(tree->root);
1213		index += size;
1214		leaves++;
1215		goto done;
1216	}
1217
1218	assert(tree->childnode == NODE);
1219	node = tree->root;
1220	leftmask = rightmask = 0;
1221	while (node) {
1222		if (!node->mark)
1223			goto skip;
1224		assert(node->offset != -1);
1225		assert(node->index == index);
1226
1227		byte = 0;
1228		if (node->nextbyte)
1229			byte |= NEXTBYTE;
1230		byte |= (node->bitnum & BITNUM);
1231		if (node->left && node->right) {
1232			if (node->leftnode == NODE)
1233				byte |= LEFTNODE;
1234			if (node->rightnode == NODE)
1235				byte |= RIGHTNODE;
1236			if (node->offset <= 0xff)
1237				offlen = 1;
1238			else if (node->offset <= 0xffff)
1239				offlen = 2;
1240			else
1241				offlen = 3;
1242			nodes[offlen]++;
1243			offset = node->offset;
1244			byte |= offlen << OFFLEN_SHIFT;
1245			*data++ = byte;
1246			index++;
1247			while (offlen--) {
1248				*data++ = offset & 0xff;
1249				index++;
1250				offset >>= 8;
1251			}
1252		} else if (node->left) {
1253			if (node->leftnode == NODE)
1254				byte |= TRIENODE;
1255			nodes[0]++;
1256			*data++ = byte;
1257			index++;
1258		} else if (node->right) {
1259			byte |= RIGHTNODE;
1260			if (node->rightnode == NODE)
1261				byte |= TRIENODE;
1262			nodes[0]++;
1263			*data++ = byte;
1264			index++;
1265		} else {
1266			assert(0);
1267		}
1268skip:
1269		while (node) {
1270			bitmask = 1 << node->bitnum;
1271			if (node->mark && (leftmask & bitmask) == 0) {
1272				leftmask |= bitmask;
1273				if (node->leftnode == LEAF) {
1274					assert(node->left);
1275					data = tree->leaf_emit(node->left,
1276							       data);
1277					size = tree->leaf_size(node->left);
1278					index += size;
1279					bytes += size;
1280					leaves++;
1281				} else if (node->left) {
1282					assert(node->leftnode == NODE);
1283					indent += 1;
1284					node = node->left;
1285					break;
1286				}
1287			}
1288			if (node->mark && (rightmask & bitmask) == 0) {
1289				rightmask |= bitmask;
1290				if (node->rightnode == LEAF) {
1291					assert(node->right);
1292					data = tree->leaf_emit(node->right,
1293							       data);
1294					size = tree->leaf_size(node->right);
1295					index += size;
1296					bytes += size;
1297					leaves++;
1298				} else if (node->right) {
1299					assert(node->rightnode == NODE);
1300					indent += 1;
1301					node = node->right;
1302					break;
1303				}
1304			}
1305			leftmask &= ~bitmask;
1306			rightmask &= ~bitmask;
1307			node = node->parent;
1308			indent -= 1;
1309		}
1310	}
1311done:
1312	if (verbose > 0) {
1313		printf("Emitted %d (%d) leaves",
1314			leaves, bytes);
1315		printf(" %d (%d+%d+%d+%d) nodes",
1316			nodes[0] + nodes[1] + nodes[2] + nodes[3],
1317			nodes[0], nodes[1], nodes[2], nodes[3]);
1318		printf(" %d total\n", index - tree->index);
1319	}
1320}
1321
1322/* ------------------------------------------------------------------ */
1323
1324/*
1325 * Unicode data.
1326 *
1327 * We need to keep track of the Canonical Combining Class, the Age,
1328 * and decompositions for a code point.
1329 *
1330 * For the Age, we store the index into the ages table.  Effectively
1331 * this is a generation number that the table maps to a unicode
1332 * version.
1333 *
1334 * The correction field is used to indicate that this entry is in the
1335 * corrections array, which contains decompositions that were
1336 * corrected in later revisions.  The value of the correction field is
1337 * the Unicode version in which the mapping was corrected.
1338 */
1339struct unicode_data {
1340	unsigned int code;
1341	int ccc;
1342	int gen;
1343	int correction;
1344	unsigned int *utf32nfdi;
1345	unsigned int *utf32nfdicf;
1346	char *utf8nfdi;
1347	char *utf8nfdicf;
1348};
1349
1350struct unicode_data unicode_data[0x110000];
1351struct unicode_data *corrections;
1352int    corrections_count;
1353
1354struct tree *nfdi_tree;
1355struct tree *nfdicf_tree;
1356
1357struct tree *trees;
1358int          trees_count;
1359
1360/*
1361 * Check the corrections array to see if this entry was corrected at
1362 * some point.
1363 */
1364static struct unicode_data *corrections_lookup(struct unicode_data *u)
1365{
1366	int i;
1367
1368	for (i = 0; i != corrections_count; i++)
1369		if (u->code == corrections[i].code)
1370			return &corrections[i];
1371	return u;
1372}
1373
1374static int nfdi_equal(void *l, void *r)
1375{
1376	struct unicode_data *left = l;
1377	struct unicode_data *right = r;
1378
1379	if (left->gen != right->gen)
1380		return 0;
1381	if (left->ccc != right->ccc)
1382		return 0;
1383	if (left->utf8nfdi && right->utf8nfdi &&
1384	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1385		return 1;
1386	if (left->utf8nfdi || right->utf8nfdi)
1387		return 0;
1388	return 1;
1389}
1390
1391static int nfdicf_equal(void *l, void *r)
1392{
1393	struct unicode_data *left = l;
1394	struct unicode_data *right = r;
1395
1396	if (left->gen != right->gen)
1397		return 0;
1398	if (left->ccc != right->ccc)
1399		return 0;
1400	if (left->utf8nfdicf && right->utf8nfdicf &&
1401	    strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
1402		return 1;
1403	if (left->utf8nfdicf && right->utf8nfdicf)
1404		return 0;
1405	if (left->utf8nfdicf || right->utf8nfdicf)
1406		return 0;
1407	if (left->utf8nfdi && right->utf8nfdi &&
1408	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1409		return 1;
1410	if (left->utf8nfdi || right->utf8nfdi)
1411		return 0;
1412	return 1;
1413}
1414
1415static void nfdi_print(void *l, int indent)
1416{
1417	struct unicode_data *leaf = l;
1418
1419	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1420		leaf->code, leaf->ccc, leaf->gen);
1421
1422	if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1423		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1424	else if (leaf->utf8nfdi)
1425		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1426
1427	printf("\n");
1428}
1429
1430static void nfdicf_print(void *l, int indent)
1431{
1432	struct unicode_data *leaf = l;
1433
1434	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1435		leaf->code, leaf->ccc, leaf->gen);
1436
1437	if (leaf->utf8nfdicf)
1438		printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1439	else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1440		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1441	else if (leaf->utf8nfdi)
1442		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1443	printf("\n");
1444}
1445
1446static int nfdi_mark(void *l)
1447{
1448	return 1;
1449}
1450
1451static int nfdicf_mark(void *l)
1452{
1453	struct unicode_data *leaf = l;
1454
1455	if (leaf->utf8nfdicf)
1456		return 1;
1457	return 0;
1458}
1459
1460static int correction_mark(void *l)
1461{
1462	struct unicode_data *leaf = l;
1463
1464	return leaf->correction;
1465}
1466
1467static int nfdi_size(void *l)
1468{
1469	struct unicode_data *leaf = l;
1470	int size = 2;
1471
1472	if (HANGUL_SYLLABLE(leaf->code))
1473		size += 1;
1474	else if (leaf->utf8nfdi)
1475		size += strlen(leaf->utf8nfdi) + 1;
1476	return size;
1477}
1478
1479static int nfdicf_size(void *l)
1480{
1481	struct unicode_data *leaf = l;
1482	int size = 2;
1483
1484	if (HANGUL_SYLLABLE(leaf->code))
1485		size += 1;
1486	else if (leaf->utf8nfdicf)
1487		size += strlen(leaf->utf8nfdicf) + 1;
1488	else if (leaf->utf8nfdi)
1489		size += strlen(leaf->utf8nfdi) + 1;
1490	return size;
1491}
1492
1493static int *nfdi_index(struct tree *tree, void *l)
1494{
1495	struct unicode_data *leaf = l;
1496
1497	return &tree->leafindex[leaf->code];
1498}
1499
1500static int *nfdicf_index(struct tree *tree, void *l)
1501{
1502	struct unicode_data *leaf = l;
1503
1504	return &tree->leafindex[leaf->code];
1505}
1506
1507static unsigned char *nfdi_emit(void *l, unsigned char *data)
1508{
1509	struct unicode_data *leaf = l;
1510	unsigned char *s;
1511
1512	*data++ = leaf->gen;
1513
1514	if (HANGUL_SYLLABLE(leaf->code)) {
1515		*data++ = DECOMPOSE;
1516		*data++ = HANGUL;
1517	} else if (leaf->utf8nfdi) {
1518		*data++ = DECOMPOSE;
1519		s = (unsigned char*)leaf->utf8nfdi;
1520		while ((*data++ = *s++) != 0)
1521			;
1522	} else {
1523		*data++ = leaf->ccc;
1524	}
1525	return data;
1526}
1527
1528static unsigned char *nfdicf_emit(void *l, unsigned char *data)
1529{
1530	struct unicode_data *leaf = l;
1531	unsigned char *s;
1532
1533	*data++ = leaf->gen;
1534
1535	if (HANGUL_SYLLABLE(leaf->code)) {
1536		*data++ = DECOMPOSE;
1537		*data++ = HANGUL;
1538	} else if (leaf->utf8nfdicf) {
1539		*data++ = DECOMPOSE;
1540		s = (unsigned char*)leaf->utf8nfdicf;
1541		while ((*data++ = *s++) != 0)
1542			;
1543	} else if (leaf->utf8nfdi) {
1544		*data++ = DECOMPOSE;
1545		s = (unsigned char*)leaf->utf8nfdi;
1546		while ((*data++ = *s++) != 0)
1547			;
1548	} else {
1549		*data++ = leaf->ccc;
1550	}
1551	return data;
1552}
1553
1554static void utf8_create(struct unicode_data *data)
1555{
1556	char utf[18*4+1];
1557	char *u;
1558	unsigned int *um;
1559	int i;
1560
1561	if (data->utf8nfdi) {
1562		assert(data->utf8nfdi[0] == HANGUL);
1563		return;
1564	}
1565
1566	u = utf;
1567	um = data->utf32nfdi;
1568	if (um) {
1569		for (i = 0; um[i]; i++)
1570			u += utf8encode(u, um[i]);
1571		*u = '\0';
1572		data->utf8nfdi = strdup(utf);
1573	}
1574	u = utf;
1575	um = data->utf32nfdicf;
1576	if (um) {
1577		for (i = 0; um[i]; i++)
1578			u += utf8encode(u, um[i]);
1579		*u = '\0';
1580		if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
1581			data->utf8nfdicf = strdup(utf);
1582	}
1583}
1584
1585static void utf8_init(void)
1586{
1587	unsigned int unichar;
1588	int i;
1589
1590	for (unichar = 0; unichar != 0x110000; unichar++)
1591		utf8_create(&unicode_data[unichar]);
1592
1593	for (i = 0; i != corrections_count; i++)
1594		utf8_create(&corrections[i]);
1595}
1596
1597static void trees_init(void)
1598{
1599	struct unicode_data *data;
1600	unsigned int maxage;
1601	unsigned int nextage;
1602	int count;
1603	int i;
1604	int j;
1605
1606	/* Count the number of different ages. */
1607	count = 0;
1608	nextage = (unsigned int)-1;
1609	do {
1610		maxage = nextage;
1611		nextage = 0;
1612		for (i = 0; i <= corrections_count; i++) {
1613			data = &corrections[i];
1614			if (nextage < data->correction &&
1615			    data->correction < maxage)
1616				nextage = data->correction;
1617		}
1618		count++;
1619	} while (nextage);
1620
1621	/* Two trees per age: nfdi and nfdicf */
1622	trees_count = count * 2;
1623	trees = calloc(trees_count, sizeof(struct tree));
1624
1625	/* Assign ages to the trees. */
1626	count = trees_count;
1627	nextage = (unsigned int)-1;
1628	do {
1629		maxage = nextage;
1630		trees[--count].maxage = maxage;
1631		trees[--count].maxage = maxage;
1632		nextage = 0;
1633		for (i = 0; i <= corrections_count; i++) {
1634			data = &corrections[i];
1635			if (nextage < data->correction &&
1636			    data->correction < maxage)
1637				nextage = data->correction;
1638		}
1639	} while (nextage);
1640
1641	/* The ages assigned above are off by one. */
1642	for (i = 0; i != trees_count; i++) {
1643		j = 0;
1644		while (ages[j] < trees[i].maxage)
1645			j++;
1646		trees[i].maxage = ages[j-1];
1647	}
1648
1649	/* Set up the forwarding between trees. */
1650	trees[trees_count-2].next = &trees[trees_count-1];
1651	trees[trees_count-1].leaf_mark = nfdi_mark;
1652	trees[trees_count-2].leaf_mark = nfdicf_mark;
1653	for (i = 0; i != trees_count-2; i += 2) {
1654		trees[i].next = &trees[trees_count-2];
1655		trees[i].leaf_mark = correction_mark;
1656		trees[i+1].next = &trees[trees_count-1];
1657		trees[i+1].leaf_mark = correction_mark;
1658	}
1659
1660	/* Assign the callouts. */
1661	for (i = 0; i != trees_count; i += 2) {
1662		trees[i].type = "nfdicf";
1663		trees[i].leaf_equal = nfdicf_equal;
1664		trees[i].leaf_print = nfdicf_print;
1665		trees[i].leaf_size = nfdicf_size;
1666		trees[i].leaf_index = nfdicf_index;
1667		trees[i].leaf_emit = nfdicf_emit;
1668
1669		trees[i+1].type = "nfdi";
1670		trees[i+1].leaf_equal = nfdi_equal;
1671		trees[i+1].leaf_print = nfdi_print;
1672		trees[i+1].leaf_size = nfdi_size;
1673		trees[i+1].leaf_index = nfdi_index;
1674		trees[i+1].leaf_emit = nfdi_emit;
1675	}
1676
1677	/* Finish init. */
1678	for (i = 0; i != trees_count; i++)
1679		trees[i].childnode = NODE;
1680}
1681
1682static void trees_populate(void)
1683{
1684	struct unicode_data *data;
1685	unsigned int unichar;
1686	char keyval[4];
1687	int keylen;
1688	int i;
1689
1690	for (i = 0; i != trees_count; i++) {
1691		if (verbose > 0) {
1692			printf("Populating %s_%x\n",
1693				trees[i].type, trees[i].maxage);
1694		}
1695		for (unichar = 0; unichar != 0x110000; unichar++) {
1696			if (unicode_data[unichar].gen < 0)
1697				continue;
1698			keylen = utf8encode(keyval, unichar);
1699			data = corrections_lookup(&unicode_data[unichar]);
1700			if (data->correction <= trees[i].maxage)
1701				data = &unicode_data[unichar];
1702			insert(&trees[i], keyval, keylen, data);
1703		}
1704	}
1705}
1706
1707static void trees_reduce(void)
1708{
1709	int i;
1710	int size;
1711	int changed;
1712
1713	for (i = 0; i != trees_count; i++)
1714		prune(&trees[i]);
1715	for (i = 0; i != trees_count; i++)
1716		mark_nodes(&trees[i]);
1717	do {
1718		size = 0;
1719		for (i = 0; i != trees_count; i++)
1720			size = index_nodes(&trees[i], size);
1721		changed = 0;
1722		for (i = 0; i != trees_count; i++)
1723			changed += size_nodes(&trees[i]);
1724	} while (changed);
1725
1726	utf8data = calloc(size, 1);
1727	utf8data_size = size;
1728	for (i = 0; i != trees_count; i++)
1729		emit(&trees[i], utf8data);
1730
1731	if (verbose > 0) {
1732		for (i = 0; i != trees_count; i++) {
1733			printf("%s_%x idx %d\n",
1734				trees[i].type, trees[i].maxage, trees[i].index);
1735		}
1736	}
1737
1738	nfdi = utf8data + trees[trees_count-1].index;
1739	nfdicf = utf8data + trees[trees_count-2].index;
1740
1741	nfdi_tree = &trees[trees_count-1];
1742	nfdicf_tree = &trees[trees_count-2];
1743}
1744
1745static void verify(struct tree *tree)
1746{
1747	struct unicode_data *data;
1748	utf8leaf_t	*leaf;
1749	unsigned int	unichar;
1750	char		key[4];
1751	unsigned char	hangul[UTF8HANGULLEAF];
1752	int		report;
1753	int		nocf;
1754
1755	if (verbose > 0)
1756		printf("Verifying %s_%x\n", tree->type, tree->maxage);
1757	nocf = strcmp(tree->type, "nfdicf");
1758
1759	for (unichar = 0; unichar != 0x110000; unichar++) {
1760		report = 0;
1761		data = corrections_lookup(&unicode_data[unichar]);
1762		if (data->correction <= tree->maxage)
1763			data = &unicode_data[unichar];
1764		utf8encode(key,unichar);
1765		leaf = utf8lookup(tree, hangul, key);
1766
1767		if (!leaf) {
1768			if (data->gen != -1)
1769				report++;
1770			if (unichar < 0xd800 || unichar > 0xdfff)
1771				report++;
1772		} else {
1773			if (unichar >= 0xd800 && unichar <= 0xdfff)
1774				report++;
1775			if (data->gen == -1)
1776				report++;
1777			if (data->gen != LEAF_GEN(leaf))
1778				report++;
1779			if (LEAF_CCC(leaf) == DECOMPOSE) {
1780				if (HANGUL_SYLLABLE(data->code)) {
1781					if (data->utf8nfdi[0] != HANGUL)
1782						report++;
1783				} else if (nocf) {
1784					if (!data->utf8nfdi) {
1785						report++;
1786					} else if (strcmp(data->utf8nfdi,
1787							  LEAF_STR(leaf))) {
1788						report++;
1789					}
1790				} else {
1791					if (!data->utf8nfdicf &&
1792					    !data->utf8nfdi) {
1793						report++;
1794					} else if (data->utf8nfdicf) {
1795						if (strcmp(data->utf8nfdicf,
1796							   LEAF_STR(leaf)))
1797							report++;
1798					} else if (strcmp(data->utf8nfdi,
1799							  LEAF_STR(leaf))) {
1800						report++;
1801					}
1802				}
1803			} else if (data->ccc != LEAF_CCC(leaf)) {
1804				report++;
1805			}
1806		}
1807		if (report) {
1808			printf("%X code %X gen %d ccc %d"
1809				" nfdi -> \"%s\"",
1810				unichar, data->code, data->gen,
1811				data->ccc,
1812				data->utf8nfdi);
1813			if (leaf) {
1814				printf(" gen %d ccc %d"
1815					" nfdi -> \"%s\"",
1816					LEAF_GEN(leaf),
1817					LEAF_CCC(leaf),
1818					LEAF_CCC(leaf) == DECOMPOSE ?
1819						LEAF_STR(leaf) : "");
1820			}
1821			printf("\n");
1822		}
1823	}
1824}
1825
1826static void trees_verify(void)
1827{
1828	int i;
1829
1830	for (i = 0; i != trees_count; i++)
1831		verify(&trees[i]);
1832}
1833
1834/* ------------------------------------------------------------------ */
1835
1836static void help(void)
1837{
1838	printf("Usage: %s [options]\n", argv0);
1839	printf("\n");
1840	printf("This program creates an a data trie used for parsing and\n");
1841	printf("normalization of UTF-8 strings. The trie is derived from\n");
1842	printf("a set of input files from the Unicode character database\n");
1843	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1844	printf("\n");
1845	printf("The generated tree supports two normalization forms:\n");
1846	printf("\n");
1847	printf("\tnfdi:\n");
1848	printf("\t- Apply unicode normalization form NFD.\n");
1849	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1850	printf("\n");
1851	printf("\tnfdicf:\n");
1852	printf("\t- Apply unicode normalization form NFD.\n");
1853	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1854	printf("\t- Apply a full casefold (C + F).\n");
1855	printf("\n");
1856	printf("These forms were chosen as being most useful when dealing\n");
1857	printf("with file names: NFD catches most cases where characters\n");
1858	printf("should be considered equivalent. The ignorables are mostly\n");
1859	printf("invisible, making names hard to type.\n");
1860	printf("\n");
1861	printf("The options to specify the files to be used are listed\n");
1862	printf("below with their default values, which are the names used\n");
1863	printf("by version 11.0.0 of the Unicode Character Database.\n");
1864	printf("\n");
1865	printf("The input files:\n");
1866	printf("\t-a %s\n", AGE_NAME);
1867	printf("\t-c %s\n", CCC_NAME);
1868	printf("\t-p %s\n", PROP_NAME);
1869	printf("\t-d %s\n", DATA_NAME);
1870	printf("\t-f %s\n", FOLD_NAME);
1871	printf("\t-n %s\n", NORM_NAME);
1872	printf("\n");
1873	printf("Additionally, the generated tables are tested using:\n");
1874	printf("\t-t %s\n", TEST_NAME);
1875	printf("\n");
1876	printf("Finally, the output file:\n");
1877	printf("\t-o %s\n", UTF8_NAME);
1878	printf("\n");
1879}
1880
1881static void usage(void)
1882{
1883	help();
1884	exit(1);
1885}
1886
1887static void open_fail(const char *name, int error)
1888{
1889	printf("Error %d opening %s: %s\n", error, name, strerror(error));
1890	exit(1);
1891}
1892
1893static void file_fail(const char *filename)
1894{
1895	printf("Error parsing %s\n", filename);
1896	exit(1);
1897}
1898
1899static void line_fail(const char *filename, const char *line)
1900{
1901	printf("Error parsing %s:%s\n", filename, line);
1902	exit(1);
1903}
1904
1905/* ------------------------------------------------------------------ */
1906
1907static void print_utf32(unsigned int *utf32str)
1908{
1909	int	i;
1910
1911	for (i = 0; utf32str[i]; i++)
1912		printf(" %X", utf32str[i]);
1913}
1914
1915static void print_utf32nfdi(unsigned int unichar)
1916{
1917	printf(" %X ->", unichar);
1918	print_utf32(unicode_data[unichar].utf32nfdi);
1919	printf("\n");
1920}
1921
1922static void print_utf32nfdicf(unsigned int unichar)
1923{
1924	printf(" %X ->", unichar);
1925	print_utf32(unicode_data[unichar].utf32nfdicf);
1926	printf("\n");
1927}
1928
1929/* ------------------------------------------------------------------ */
1930
1931static void age_init(void)
1932{
1933	FILE *file;
1934	unsigned int first;
1935	unsigned int last;
1936	unsigned int unichar;
1937	unsigned int major;
1938	unsigned int minor;
1939	unsigned int revision;
1940	int gen;
1941	int count;
1942	int ret;
1943
1944	if (verbose > 0)
1945		printf("Parsing %s\n", age_name);
1946
1947	file = fopen(age_name, "r");
1948	if (!file)
1949		open_fail(age_name, errno);
1950	count = 0;
1951
1952	gen = 0;
1953	while (fgets(line, LINESIZE, file)) {
1954		ret = sscanf(line, "# Age=V%d_%d_%d",
1955				&major, &minor, &revision);
1956		if (ret == 3) {
1957			ages_count++;
1958			if (verbose > 1)
1959				printf(" Age V%d_%d_%d\n",
1960					major, minor, revision);
1961			if (!age_valid(major, minor, revision))
1962				line_fail(age_name, line);
1963			continue;
1964		}
1965		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1966		if (ret == 2) {
1967			ages_count++;
1968			if (verbose > 1)
1969				printf(" Age V%d_%d\n", major, minor);
1970			if (!age_valid(major, minor, 0))
1971				line_fail(age_name, line);
1972			continue;
1973		}
1974	}
1975
1976	/* We must have found something above. */
1977	if (verbose > 1)
1978		printf("%d age entries\n", ages_count);
1979	if (ages_count == 0 || ages_count > MAXGEN)
1980		file_fail(age_name);
1981
1982	/* There is a 0 entry. */
1983	ages_count++;
1984	ages = calloc(ages_count + 1, sizeof(*ages));
1985	/* And a guard entry. */
1986	ages[ages_count] = (unsigned int)-1;
1987
1988	rewind(file);
1989	count = 0;
1990	gen = 0;
1991	while (fgets(line, LINESIZE, file)) {
1992		ret = sscanf(line, "# Age=V%d_%d_%d",
1993				&major, &minor, &revision);
1994		if (ret == 3) {
1995			ages[++gen] =
1996				UNICODE_AGE(major, minor, revision);
1997			if (verbose > 1)
1998				printf(" Age V%d_%d_%d = gen %d\n",
1999					major, minor, revision, gen);
2000			if (!age_valid(major, minor, revision))
2001				line_fail(age_name, line);
2002			continue;
2003		}
2004		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
2005		if (ret == 2) {
2006			ages[++gen] = UNICODE_AGE(major, minor, 0);
2007			if (verbose > 1)
2008				printf(" Age V%d_%d = %d\n",
2009					major, minor, gen);
2010			if (!age_valid(major, minor, 0))
2011				line_fail(age_name, line);
2012			continue;
2013		}
2014		ret = sscanf(line, "%X..%X ; %d.%d #",
2015			     &first, &last, &major, &minor);
2016		if (ret == 4) {
2017			for (unichar = first; unichar <= last; unichar++)
2018				unicode_data[unichar].gen = gen;
2019			count += 1 + last - first;
2020			if (verbose > 1)
2021				printf("  %X..%X gen %d\n", first, last, gen);
2022			if (!utf32valid(first) || !utf32valid(last))
2023				line_fail(age_name, line);
2024			continue;
2025		}
2026		ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2027		if (ret == 3) {
2028			unicode_data[unichar].gen = gen;
2029			count++;
2030			if (verbose > 1)
2031				printf("  %X gen %d\n", unichar, gen);
2032			if (!utf32valid(unichar))
2033				line_fail(age_name, line);
2034			continue;
2035		}
2036	}
2037	unicode_maxage = ages[gen];
2038	fclose(file);
2039
2040	/* Nix surrogate block */
2041	if (verbose > 1)
2042		printf(" Removing surrogate block D800..DFFF\n");
2043	for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2044		unicode_data[unichar].gen = -1;
2045
2046	if (verbose > 0)
2047	        printf("Found %d entries\n", count);
2048	if (count == 0)
2049		file_fail(age_name);
2050}
2051
2052static void ccc_init(void)
2053{
2054	FILE *file;
2055	unsigned int first;
2056	unsigned int last;
2057	unsigned int unichar;
2058	unsigned int value;
2059	int count;
2060	int ret;
2061
2062	if (verbose > 0)
2063		printf("Parsing %s\n", ccc_name);
2064
2065	file = fopen(ccc_name, "r");
2066	if (!file)
2067		open_fail(ccc_name, errno);
2068
2069	count = 0;
2070	while (fgets(line, LINESIZE, file)) {
2071		ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2072		if (ret == 3) {
2073			for (unichar = first; unichar <= last; unichar++) {
2074				unicode_data[unichar].ccc = value;
2075                                count++;
2076			}
2077			if (verbose > 1)
2078				printf(" %X..%X ccc %d\n", first, last, value);
2079			if (!utf32valid(first) || !utf32valid(last))
2080				line_fail(ccc_name, line);
2081			continue;
2082		}
2083		ret = sscanf(line, "%X ; %d #", &unichar, &value);
2084		if (ret == 2) {
2085			unicode_data[unichar].ccc = value;
2086                        count++;
2087			if (verbose > 1)
2088				printf(" %X ccc %d\n", unichar, value);
2089			if (!utf32valid(unichar))
2090				line_fail(ccc_name, line);
2091			continue;
2092		}
2093	}
2094	fclose(file);
2095
2096	if (verbose > 0)
2097		printf("Found %d entries\n", count);
2098	if (count == 0)
2099		file_fail(ccc_name);
2100}
2101
2102static int ignore_compatibility_form(char *type)
2103{
2104	int i;
2105	char *ignored_types[] = {"font", "noBreak", "initial", "medial",
2106				 "final", "isolated", "circle", "super",
2107				 "sub", "vertical", "wide", "narrow",
2108				 "small", "square", "fraction", "compat"};
2109
2110	for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
2111		if (strcmp(type, ignored_types[i]) == 0)
2112			return 1;
2113	return 0;
2114}
2115
2116static void nfdi_init(void)
2117{
2118	FILE *file;
2119	unsigned int unichar;
2120	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2121	char *s;
2122	char *type;
2123	unsigned int *um;
2124	int count;
2125	int i;
2126	int ret;
2127
2128	if (verbose > 0)
2129		printf("Parsing %s\n", data_name);
2130	file = fopen(data_name, "r");
2131	if (!file)
2132		open_fail(data_name, errno);
2133
2134	count = 0;
2135	while (fgets(line, LINESIZE, file)) {
2136		ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2137			     &unichar, buf0);
2138		if (ret != 2)
2139			continue;
2140		if (!utf32valid(unichar))
2141			line_fail(data_name, line);
2142
2143		s = buf0;
2144		/* skip over <tag> */
2145		if (*s == '<') {
2146			type = ++s;
2147			while (*++s != '>');
2148			*s++ = '\0';
2149			if(ignore_compatibility_form(type))
2150				continue;
2151		}
2152		/* decode the decomposition into UTF-32 */
2153		i = 0;
2154		while (*s) {
2155			mapping[i] = strtoul(s, &s, 16);
2156			if (!utf32valid(mapping[i]))
2157				line_fail(data_name, line);
2158			i++;
2159		}
2160		mapping[i++] = 0;
2161
2162		um = malloc(i * sizeof(unsigned int));
2163		memcpy(um, mapping, i * sizeof(unsigned int));
2164		unicode_data[unichar].utf32nfdi = um;
2165
2166		if (verbose > 1)
2167			print_utf32nfdi(unichar);
2168		count++;
2169	}
2170	fclose(file);
2171	if (verbose > 0)
2172		printf("Found %d entries\n", count);
2173	if (count == 0)
2174		file_fail(data_name);
2175}
2176
2177static void nfdicf_init(void)
2178{
2179	FILE *file;
2180	unsigned int unichar;
2181	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2182	char status;
2183	char *s;
2184	unsigned int *um;
2185	int i;
2186	int count;
2187	int ret;
2188
2189	if (verbose > 0)
2190		printf("Parsing %s\n", fold_name);
2191	file = fopen(fold_name, "r");
2192	if (!file)
2193		open_fail(fold_name, errno);
2194
2195	count = 0;
2196	while (fgets(line, LINESIZE, file)) {
2197		ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2198		if (ret != 3)
2199			continue;
2200		if (!utf32valid(unichar))
2201			line_fail(fold_name, line);
2202		/* Use the C+F casefold. */
2203		if (status != 'C' && status != 'F')
2204			continue;
2205		s = buf0;
2206		if (*s == '<')
2207			while (*s++ != ' ')
2208				;
2209		i = 0;
2210		while (*s) {
2211			mapping[i] = strtoul(s, &s, 16);
2212			if (!utf32valid(mapping[i]))
2213				line_fail(fold_name, line);
2214			i++;
2215		}
2216		mapping[i++] = 0;
2217
2218		um = malloc(i * sizeof(unsigned int));
2219		memcpy(um, mapping, i * sizeof(unsigned int));
2220		unicode_data[unichar].utf32nfdicf = um;
2221
2222		if (verbose > 1)
2223			print_utf32nfdicf(unichar);
2224		count++;
2225	}
2226	fclose(file);
2227	if (verbose > 0)
2228		printf("Found %d entries\n", count);
2229	if (count == 0)
2230		file_fail(fold_name);
2231}
2232
2233static void ignore_init(void)
2234{
2235	FILE *file;
2236	unsigned int unichar;
2237	unsigned int first;
2238	unsigned int last;
2239	unsigned int *um;
2240	int count;
2241	int ret;
2242
2243	if (verbose > 0)
2244		printf("Parsing %s\n", prop_name);
2245	file = fopen(prop_name, "r");
2246	if (!file)
2247		open_fail(prop_name, errno);
2248	assert(file);
2249	count = 0;
2250	while (fgets(line, LINESIZE, file)) {
2251		ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2252		if (ret == 3) {
2253			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2254				continue;
2255			if (!utf32valid(first) || !utf32valid(last))
2256				line_fail(prop_name, line);
2257			for (unichar = first; unichar <= last; unichar++) {
2258				free(unicode_data[unichar].utf32nfdi);
2259				um = malloc(sizeof(unsigned int));
2260				*um = 0;
2261				unicode_data[unichar].utf32nfdi = um;
2262				free(unicode_data[unichar].utf32nfdicf);
2263				um = malloc(sizeof(unsigned int));
2264				*um = 0;
2265				unicode_data[unichar].utf32nfdicf = um;
2266				count++;
2267			}
2268			if (verbose > 1)
2269				printf(" %X..%X Default_Ignorable_Code_Point\n",
2270					first, last);
2271			continue;
2272		}
2273		ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2274		if (ret == 2) {
2275			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2276				continue;
2277			if (!utf32valid(unichar))
2278				line_fail(prop_name, line);
2279			free(unicode_data[unichar].utf32nfdi);
2280			um = malloc(sizeof(unsigned int));
2281			*um = 0;
2282			unicode_data[unichar].utf32nfdi = um;
2283			free(unicode_data[unichar].utf32nfdicf);
2284			um = malloc(sizeof(unsigned int));
2285			*um = 0;
2286			unicode_data[unichar].utf32nfdicf = um;
2287			if (verbose > 1)
2288				printf(" %X Default_Ignorable_Code_Point\n",
2289					unichar);
2290			count++;
2291			continue;
2292		}
2293	}
2294	fclose(file);
2295
2296	if (verbose > 0)
2297		printf("Found %d entries\n", count);
2298	if (count == 0)
2299		file_fail(prop_name);
2300}
2301
2302static void corrections_init(void)
2303{
2304	FILE *file;
2305	unsigned int unichar;
2306	unsigned int major;
2307	unsigned int minor;
2308	unsigned int revision;
2309	unsigned int age;
2310	unsigned int *um;
2311	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2312	char *s;
2313	int i;
2314	int count;
2315	int ret;
2316
2317	if (verbose > 0)
2318		printf("Parsing %s\n", norm_name);
2319	file = fopen(norm_name, "r");
2320	if (!file)
2321		open_fail(norm_name, errno);
2322
2323	count = 0;
2324	while (fgets(line, LINESIZE, file)) {
2325		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2326				&unichar, buf0, buf1,
2327				&major, &minor, &revision);
2328		if (ret != 6)
2329			continue;
2330		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2331			line_fail(norm_name, line);
2332		count++;
2333	}
2334	corrections = calloc(count, sizeof(struct unicode_data));
2335	corrections_count = count;
2336	rewind(file);
2337
2338	count = 0;
2339	while (fgets(line, LINESIZE, file)) {
2340		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2341				&unichar, buf0, buf1,
2342				&major, &minor, &revision);
2343		if (ret != 6)
2344			continue;
2345		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2346			line_fail(norm_name, line);
2347		corrections[count] = unicode_data[unichar];
2348		assert(corrections[count].code == unichar);
2349		age = UNICODE_AGE(major, minor, revision);
2350		corrections[count].correction = age;
2351
2352		i = 0;
2353		s = buf0;
2354		while (*s) {
2355			mapping[i] = strtoul(s, &s, 16);
2356			if (!utf32valid(mapping[i]))
2357				line_fail(norm_name, line);
2358			i++;
2359		}
2360		mapping[i++] = 0;
2361
2362		um = malloc(i * sizeof(unsigned int));
2363		memcpy(um, mapping, i * sizeof(unsigned int));
2364		corrections[count].utf32nfdi = um;
2365
2366		if (verbose > 1)
2367			printf(" %X -> %s -> %s V%d_%d_%d\n",
2368				unichar, buf0, buf1, major, minor, revision);
2369		count++;
2370	}
2371	fclose(file);
2372
2373	if (verbose > 0)
2374	        printf("Found %d entries\n", count);
2375	if (count == 0)
2376		file_fail(norm_name);
2377}
2378
2379/* ------------------------------------------------------------------ */
2380
2381/*
2382 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2383 *
2384 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2385 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2386 *
2387 * SBase = 0xAC00
2388 * LBase = 0x1100
2389 * VBase = 0x1161
2390 * TBase = 0x11A7
2391 * LCount = 19
2392 * VCount = 21
2393 * TCount = 28
2394 * NCount = 588 (VCount * TCount)
2395 * SCount = 11172 (LCount * NCount)
2396 *
2397 * Decomposition:
2398 *   SIndex = s - SBase
2399 *
2400 * LV (Canonical/Full)
2401 *   LIndex = SIndex / NCount
2402 *   VIndex = (Sindex % NCount) / TCount
2403 *   LPart = LBase + LIndex
2404 *   VPart = VBase + VIndex
2405 *
2406 * LVT (Canonical)
2407 *   LVIndex = (SIndex / TCount) * TCount
2408 *   TIndex = (Sindex % TCount)
2409 *   LVPart = SBase + LVIndex
2410 *   TPart = TBase + TIndex
2411 *
2412 * LVT (Full)
2413 *   LIndex = SIndex / NCount
2414 *   VIndex = (Sindex % NCount) / TCount
2415 *   TIndex = (Sindex % TCount)
2416 *   LPart = LBase + LIndex
2417 *   VPart = VBase + VIndex
2418 *   if (TIndex == 0) {
2419 *          d = <LPart, VPart>
2420 *   } else {
2421 *          TPart = TBase + TIndex
2422 *          d = <LPart, VPart, TPart>
2423 *   }
2424 *
2425 */
2426
2427static void hangul_decompose(void)
2428{
2429	unsigned int sb = 0xAC00;
2430	unsigned int lb = 0x1100;
2431	unsigned int vb = 0x1161;
2432	unsigned int tb = 0x11a7;
2433	/* unsigned int lc = 19; */
2434	unsigned int vc = 21;
2435	unsigned int tc = 28;
2436	unsigned int nc = (vc * tc);
2437	/* unsigned int sc = (lc * nc); */
2438	unsigned int unichar;
2439	unsigned int mapping[4];
2440	unsigned int *um;
2441        int count;
2442	int i;
2443
2444	if (verbose > 0)
2445		printf("Decomposing hangul\n");
2446	/* Hangul */
2447	count = 0;
2448	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2449		unsigned int si = unichar - sb;
2450		unsigned int li = si / nc;
2451		unsigned int vi = (si % nc) / tc;
2452		unsigned int ti = si % tc;
2453
2454		i = 0;
2455		mapping[i++] = lb + li;
2456		mapping[i++] = vb + vi;
2457		if (ti)
2458			mapping[i++] = tb + ti;
2459		mapping[i++] = 0;
2460
2461		assert(!unicode_data[unichar].utf32nfdi);
2462		um = malloc(i * sizeof(unsigned int));
2463		memcpy(um, mapping, i * sizeof(unsigned int));
2464		unicode_data[unichar].utf32nfdi = um;
2465
2466		assert(!unicode_data[unichar].utf32nfdicf);
2467		um = malloc(i * sizeof(unsigned int));
2468		memcpy(um, mapping, i * sizeof(unsigned int));
2469		unicode_data[unichar].utf32nfdicf = um;
2470
2471		/*
2472		 * Add a cookie as a reminder that the hangul syllable
2473		 * decompositions must not be stored in the generated
2474		 * trie.
2475		 */
2476		unicode_data[unichar].utf8nfdi = malloc(2);
2477		unicode_data[unichar].utf8nfdi[0] = HANGUL;
2478		unicode_data[unichar].utf8nfdi[1] = '\0';
2479
2480		if (verbose > 1)
2481			print_utf32nfdi(unichar);
2482
2483		count++;
2484	}
2485	if (verbose > 0)
2486		printf("Created %d entries\n", count);
2487}
2488
2489static void nfdi_decompose(void)
2490{
2491	unsigned int unichar;
2492	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2493	unsigned int *um;
2494	unsigned int *dc;
2495	int count;
2496	int i;
2497	int j;
2498	int ret;
2499
2500	if (verbose > 0)
2501		printf("Decomposing nfdi\n");
2502
2503	count = 0;
2504	for (unichar = 0; unichar != 0x110000; unichar++) {
2505		if (!unicode_data[unichar].utf32nfdi)
2506			continue;
2507		for (;;) {
2508			ret = 1;
2509			i = 0;
2510			um = unicode_data[unichar].utf32nfdi;
2511			while (*um) {
2512				dc = unicode_data[*um].utf32nfdi;
2513				if (dc) {
2514					for (j = 0; dc[j]; j++)
2515						mapping[i++] = dc[j];
2516					ret = 0;
2517				} else {
2518					mapping[i++] = *um;
2519				}
2520				um++;
2521			}
2522			mapping[i++] = 0;
2523			if (ret)
2524				break;
2525			free(unicode_data[unichar].utf32nfdi);
2526			um = malloc(i * sizeof(unsigned int));
2527			memcpy(um, mapping, i * sizeof(unsigned int));
2528			unicode_data[unichar].utf32nfdi = um;
2529		}
2530		/* Add this decomposition to nfdicf if there is no entry. */
2531		if (!unicode_data[unichar].utf32nfdicf) {
2532			um = malloc(i * sizeof(unsigned int));
2533			memcpy(um, mapping, i * sizeof(unsigned int));
2534			unicode_data[unichar].utf32nfdicf = um;
2535		}
2536		if (verbose > 1)
2537			print_utf32nfdi(unichar);
2538		count++;
2539	}
2540	if (verbose > 0)
2541		printf("Processed %d entries\n", count);
2542}
2543
2544static void nfdicf_decompose(void)
2545{
2546	unsigned int unichar;
2547	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2548	unsigned int *um;
2549	unsigned int *dc;
2550	int count;
2551	int i;
2552	int j;
2553	int ret;
2554
2555	if (verbose > 0)
2556		printf("Decomposing nfdicf\n");
2557	count = 0;
2558	for (unichar = 0; unichar != 0x110000; unichar++) {
2559		if (!unicode_data[unichar].utf32nfdicf)
2560			continue;
2561		for (;;) {
2562			ret = 1;
2563			i = 0;
2564			um = unicode_data[unichar].utf32nfdicf;
2565			while (*um) {
2566				dc = unicode_data[*um].utf32nfdicf;
2567				if (dc) {
2568					for (j = 0; dc[j]; j++)
2569						mapping[i++] = dc[j];
2570					ret = 0;
2571				} else {
2572					mapping[i++] = *um;
2573				}
2574				um++;
2575			}
2576			mapping[i++] = 0;
2577			if (ret)
2578				break;
2579			free(unicode_data[unichar].utf32nfdicf);
2580			um = malloc(i * sizeof(unsigned int));
2581			memcpy(um, mapping, i * sizeof(unsigned int));
2582			unicode_data[unichar].utf32nfdicf = um;
2583		}
2584		if (verbose > 1)
2585			print_utf32nfdicf(unichar);
2586		count++;
2587	}
2588	if (verbose > 0)
2589		printf("Processed %d entries\n", count);
2590}
2591
2592/* ------------------------------------------------------------------ */
2593
2594int utf8agemax(struct tree *, const char *);
2595int utf8nagemax(struct tree *, const char *, size_t);
2596int utf8agemin(struct tree *, const char *);
2597int utf8nagemin(struct tree *, const char *, size_t);
2598ssize_t utf8len(struct tree *, const char *);
2599ssize_t utf8nlen(struct tree *, const char *, size_t);
2600struct utf8cursor;
2601int utf8cursor(struct utf8cursor *, struct tree *, const char *);
2602int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
2603int utf8byte(struct utf8cursor *);
2604
2605/*
2606 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2607 *
2608 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2609 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2610 *
2611 * SBase = 0xAC00
2612 * LBase = 0x1100
2613 * VBase = 0x1161
2614 * TBase = 0x11A7
2615 * LCount = 19
2616 * VCount = 21
2617 * TCount = 28
2618 * NCount = 588 (VCount * TCount)
2619 * SCount = 11172 (LCount * NCount)
2620 *
2621 * Decomposition:
2622 *   SIndex = s - SBase
2623 *
2624 * LV (Canonical/Full)
2625 *   LIndex = SIndex / NCount
2626 *   VIndex = (Sindex % NCount) / TCount
2627 *   LPart = LBase + LIndex
2628 *   VPart = VBase + VIndex
2629 *
2630 * LVT (Canonical)
2631 *   LVIndex = (SIndex / TCount) * TCount
2632 *   TIndex = (Sindex % TCount)
2633 *   LVPart = SBase + LVIndex
2634 *   TPart = TBase + TIndex
2635 *
2636 * LVT (Full)
2637 *   LIndex = SIndex / NCount
2638 *   VIndex = (Sindex % NCount) / TCount
2639 *   TIndex = (Sindex % TCount)
2640 *   LPart = LBase + LIndex
2641 *   VPart = VBase + VIndex
2642 *   if (TIndex == 0) {
2643 *          d = <LPart, VPart>
2644 *   } else {
2645 *          TPart = TBase + TIndex
2646 *          d = <LPart, VPart, TPart>
2647 *   }
2648 */
2649
2650/* Constants */
2651#define SB	(0xAC00)
2652#define LB	(0x1100)
2653#define VB	(0x1161)
2654#define TB	(0x11A7)
2655#define LC	(19)
2656#define VC	(21)
2657#define TC	(28)
2658#define NC	(VC * TC)
2659#define SC	(LC * NC)
2660
2661/* Algorithmic decomposition of hangul syllable. */
2662static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2663{
2664	unsigned int	si;
2665	unsigned int	li;
2666	unsigned int	vi;
2667	unsigned int	ti;
2668	unsigned char	*h;
2669
2670	/* Calculate the SI, LI, VI, and TI values. */
2671	si = utf8decode(str) - SB;
2672	li = si / NC;
2673	vi = (si % NC) / TC;
2674	ti = si % TC;
2675
2676	/* Fill in base of leaf. */
2677	h = hangul;
2678	LEAF_GEN(h) = 2;
2679	LEAF_CCC(h) = DECOMPOSE;
2680	h += 2;
2681
2682	/* Add LPart, a 3-byte UTF-8 sequence. */
2683	h += utf8encode((char *)h, li + LB);
2684
2685	/* Add VPart, a 3-byte UTF-8 sequence. */
2686	h += utf8encode((char *)h, vi + VB);
2687
2688	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
2689	if (ti)
2690		h += utf8encode((char *)h, ti + TB);
2691
2692	/* Terminate string. */
2693	h[0] = '\0';
2694
2695	return hangul;
2696}
2697
2698/*
2699 * Use trie to scan s, touching at most len bytes.
2700 * Returns the leaf if one exists, NULL otherwise.
2701 *
2702 * A non-NULL return guarantees that the UTF-8 sequence starting at s
2703 * is well-formed and corresponds to a known unicode code point.  The
2704 * shorthand for this will be "is valid UTF-8 unicode".
2705 */
2706static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
2707			       const char *s, size_t len)
2708{
2709	utf8trie_t	*trie;
2710	int		offlen;
2711	int		offset;
2712	int		mask;
2713	int		node;
2714
2715	if (!tree)
2716		return NULL;
2717	if (len == 0)
2718		return NULL;
2719	node = 1;
2720	trie = utf8data + tree->index;
2721	while (node) {
2722		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2723		if (*trie & NEXTBYTE) {
2724			if (--len == 0)
2725				return NULL;
2726			s++;
2727		}
2728		mask = 1 << (*trie & BITNUM);
2729		if (*s & mask) {
2730			/* Right leg */
2731			if (offlen) {
2732				/* Right node at offset of trie */
2733				node = (*trie & RIGHTNODE);
2734				offset = trie[offlen];
2735				while (--offlen) {
2736					offset <<= 8;
2737					offset |= trie[offlen];
2738				}
2739				trie += offset;
2740			} else if (*trie & RIGHTPATH) {
2741				/* Right node after this node */
2742				node = (*trie & TRIENODE);
2743				trie++;
2744			} else {
2745				/* No right node. */
2746				return NULL;
2747			}
2748		} else {
2749			/* Left leg */
2750			if (offlen) {
2751				/* Left node after this node. */
2752				node = (*trie & LEFTNODE);
2753				trie += offlen + 1;
2754			} else if (*trie & RIGHTPATH) {
2755				/* No left node. */
2756				return NULL;
2757			} else {
2758				/* Left node after this node */
2759				node = (*trie & TRIENODE);
2760				trie++;
2761			}
2762		}
2763	}
2764	/*
2765	 * Hangul decomposition is done algorithmically. These are the
2766	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2767	 * always 3 bytes long, so s has been advanced twice, and the
2768	 * start of the sequence is at s-2.
2769	 */
2770	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
2771		trie = utf8hangul(s - 2, hangul);
2772	return trie;
2773}
2774
2775/*
2776 * Use trie to scan s.
2777 * Returns the leaf if one exists, NULL otherwise.
2778 *
2779 * Forwards to trie_nlookup().
2780 */
2781static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2782			      const char *s)
2783{
2784	return utf8nlookup(tree, hangul, s, (size_t)-1);
2785}
2786
2787/*
2788 * Return the number of bytes used by the current UTF-8 sequence.
2789 * Assumes the input points to the first byte of a valid UTF-8
2790 * sequence.
2791 */
2792static inline int utf8clen(const char *s)
2793{
2794	unsigned char c = *s;
2795	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
2796}
2797
2798/*
2799 * Maximum age of any character in s.
2800 * Return -1 if s is not valid UTF-8 unicode.
2801 * Return 0 if only non-assigned code points are used.
2802 */
2803int utf8agemax(struct tree *tree, const char *s)
2804{
2805	utf8leaf_t	*leaf;
2806	int		age = 0;
2807	int		leaf_age;
2808	unsigned char	hangul[UTF8HANGULLEAF];
2809
2810	if (!tree)
2811		return -1;
2812
2813	while (*s) {
2814		leaf = utf8lookup(tree, hangul, s);
2815		if (!leaf)
2816			return -1;
2817		leaf_age = ages[LEAF_GEN(leaf)];
2818		if (leaf_age <= tree->maxage && leaf_age > age)
2819			age = leaf_age;
2820		s += utf8clen(s);
2821	}
2822	return age;
2823}
2824
2825/*
2826 * Minimum age of any character in s.
2827 * Return -1 if s is not valid UTF-8 unicode.
2828 * Return 0 if non-assigned code points are used.
2829 */
2830int utf8agemin(struct tree *tree, const char *s)
2831{
2832	utf8leaf_t	*leaf;
2833	int		age;
2834	int		leaf_age;
2835	unsigned char	hangul[UTF8HANGULLEAF];
2836
2837	if (!tree)
2838		return -1;
2839	age = tree->maxage;
2840	while (*s) {
2841		leaf = utf8lookup(tree, hangul, s);
2842		if (!leaf)
2843			return -1;
2844		leaf_age = ages[LEAF_GEN(leaf)];
2845		if (leaf_age <= tree->maxage && leaf_age < age)
2846			age = leaf_age;
2847		s += utf8clen(s);
2848	}
2849	return age;
2850}
2851
2852/*
2853 * Maximum age of any character in s, touch at most len bytes.
2854 * Return -1 if s is not valid UTF-8 unicode.
2855 */
2856int utf8nagemax(struct tree *tree, const char *s, size_t len)
2857{
2858	utf8leaf_t	*leaf;
2859	int		age = 0;
2860	int		leaf_age;
2861	unsigned char	hangul[UTF8HANGULLEAF];
2862
2863	if (!tree)
2864		return -1;
2865
2866        while (len && *s) {
2867		leaf = utf8nlookup(tree, hangul, s, len);
2868		if (!leaf)
2869			return -1;
2870		leaf_age = ages[LEAF_GEN(leaf)];
2871		if (leaf_age <= tree->maxage && leaf_age > age)
2872			age = leaf_age;
2873		len -= utf8clen(s);
2874		s += utf8clen(s);
2875	}
2876	return age;
2877}
2878
2879/*
2880 * Maximum age of any character in s, touch at most len bytes.
2881 * Return -1 if s is not valid UTF-8 unicode.
2882 */
2883int utf8nagemin(struct tree *tree, const char *s, size_t len)
2884{
2885	utf8leaf_t	*leaf;
2886	int		leaf_age;
2887	int		age;
2888	unsigned char	hangul[UTF8HANGULLEAF];
2889
2890	if (!tree)
2891		return -1;
2892	age = tree->maxage;
2893        while (len && *s) {
2894		leaf = utf8nlookup(tree, hangul, s, len);
2895		if (!leaf)
2896			return -1;
2897		leaf_age = ages[LEAF_GEN(leaf)];
2898		if (leaf_age <= tree->maxage && leaf_age < age)
2899			age = leaf_age;
2900		len -= utf8clen(s);
2901		s += utf8clen(s);
2902	}
2903	return age;
2904}
2905
2906/*
2907 * Length of the normalization of s.
2908 * Return -1 if s is not valid UTF-8 unicode.
2909 *
2910 * A string of Default_Ignorable_Code_Point has length 0.
2911 */
2912ssize_t utf8len(struct tree *tree, const char *s)
2913{
2914	utf8leaf_t	*leaf;
2915	size_t		ret = 0;
2916	unsigned char	hangul[UTF8HANGULLEAF];
2917
2918	if (!tree)
2919		return -1;
2920	while (*s) {
2921		leaf = utf8lookup(tree, hangul, s);
2922		if (!leaf)
2923			return -1;
2924		if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925			ret += utf8clen(s);
2926		else if (LEAF_CCC(leaf) == DECOMPOSE)
2927			ret += strlen(LEAF_STR(leaf));
2928		else
2929			ret += utf8clen(s);
2930		s += utf8clen(s);
2931	}
2932	return ret;
2933}
2934
2935/*
2936 * Length of the normalization of s, touch at most len bytes.
2937 * Return -1 if s is not valid UTF-8 unicode.
2938 */
2939ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2940{
2941	utf8leaf_t	*leaf;
2942	size_t		ret = 0;
2943	unsigned char	hangul[UTF8HANGULLEAF];
2944
2945	if (!tree)
2946		return -1;
2947	while (len && *s) {
2948		leaf = utf8nlookup(tree, hangul, s, len);
2949		if (!leaf)
2950			return -1;
2951		if (ages[LEAF_GEN(leaf)] > tree->maxage)
2952			ret += utf8clen(s);
2953		else if (LEAF_CCC(leaf) == DECOMPOSE)
2954			ret += strlen(LEAF_STR(leaf));
2955		else
2956			ret += utf8clen(s);
2957		len -= utf8clen(s);
2958		s += utf8clen(s);
2959	}
2960	return ret;
2961}
2962
2963/*
2964 * Cursor structure used by the normalizer.
2965 */
2966struct utf8cursor {
2967	struct tree	*tree;
2968	const char	*s;
2969	const char	*p;
2970	const char	*ss;
2971	const char	*sp;
2972	unsigned int	len;
2973	unsigned int	slen;
2974	short int	ccc;
2975	short int	nccc;
2976	unsigned int	unichar;
2977	unsigned char	hangul[UTF8HANGULLEAF];
2978};
2979
2980/*
2981 * Set up an utf8cursor for use by utf8byte().
2982 *
2983 *   s      : string.
2984 *   len    : length of s.
2985 *   u8c    : pointer to cursor.
2986 *   trie   : utf8trie_t to use for normalization.
2987 *
2988 * Returns -1 on error, 0 on success.
2989 */
2990int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2991		size_t len)
2992{
2993	if (!tree)
2994		return -1;
2995	if (!s)
2996		return -1;
2997	u8c->tree = tree;
2998	u8c->s = s;
2999	u8c->p = NULL;
3000	u8c->ss = NULL;
3001	u8c->sp = NULL;
3002	u8c->len = len;
3003	u8c->slen = 0;
3004	u8c->ccc = STOPPER;
3005	u8c->nccc = STOPPER;
3006	u8c->unichar = 0;
3007	/* Check we didn't clobber the maximum length. */
3008	if (u8c->len != len)
3009		return -1;
3010	/* The first byte of s may not be an utf8 continuation. */
3011	if (len > 0 && (*s & 0xC0) == 0x80)
3012		return -1;
3013	return 0;
3014}
3015
3016/*
3017 * Set up an utf8cursor for use by utf8byte().
3018 *
3019 *   s      : NUL-terminated string.
3020 *   u8c    : pointer to cursor.
3021 *   trie   : utf8trie_t to use for normalization.
3022 *
3023 * Returns -1 on error, 0 on success.
3024 */
3025int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
3026{
3027	return utf8ncursor(u8c, tree, s, (unsigned int)-1);
3028}
3029
3030/*
3031 * Get one byte from the normalized form of the string described by u8c.
3032 *
3033 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
3034 *
3035 * The cursor keeps track of the location in the string in u8c->s.
3036 * When a character is decomposed, the current location is stored in
3037 * u8c->p, and u8c->s is set to the start of the decomposition. Note
3038 * that bytes from a decomposition do not count against u8c->len.
3039 *
3040 * Characters are emitted if they match the current CCC in u8c->ccc.
3041 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3042 * and the function returns 0 in that case.
3043 *
3044 * Sorting by CCC is done by repeatedly scanning the string.  The
3045 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3046 * the start of the scan.  The first pass finds the lowest CCC to be
3047 * emitted and stores it in u8c->nccc, the second pass emits the
3048 * characters with this CCC and finds the next lowest CCC. This limits
3049 * the number of passes to 1 + the number of different CCCs in the
3050 * sequence being scanned.
3051 *
3052 * Therefore:
3053 *  u8c->p  != NULL -> a decomposition is being scanned.
3054 *  u8c->ss != NULL -> this is a repeating scan.
3055 *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
3056 */
3057int utf8byte(struct utf8cursor *u8c)
3058{
3059	utf8leaf_t *leaf;
3060	int ccc;
3061
3062	for (;;) {
3063		/* Check for the end of a decomposed character. */
3064		if (u8c->p && *u8c->s == '\0') {
3065			u8c->s = u8c->p;
3066			u8c->p = NULL;
3067		}
3068
3069		/* Check for end-of-string. */
3070		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
3071			/* There is no next byte. */
3072			if (u8c->ccc == STOPPER)
3073				return 0;
3074			/* End-of-string during a scan counts as a stopper. */
3075			ccc = STOPPER;
3076			goto ccc_mismatch;
3077		} else if ((*u8c->s & 0xC0) == 0x80) {
3078			/* This is a continuation of the current character. */
3079			if (!u8c->p)
3080				u8c->len--;
3081			return (unsigned char)*u8c->s++;
3082		}
3083
3084		/* Look up the data for the current character. */
3085		if (u8c->p) {
3086			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3087		} else {
3088			leaf = utf8nlookup(u8c->tree, u8c->hangul,
3089					   u8c->s, u8c->len);
3090		}
3091
3092		/* No leaf found implies that the input is a binary blob. */
3093		if (!leaf)
3094			return -1;
3095
3096		/* Characters that are too new have CCC 0. */
3097		if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3098			ccc = STOPPER;
3099		} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3100			u8c->len -= utf8clen(u8c->s);
3101			u8c->p = u8c->s + utf8clen(u8c->s);
3102			u8c->s = LEAF_STR(leaf);
3103			/* Empty decomposition implies CCC 0. */
3104			if (*u8c->s == '\0') {
3105				if (u8c->ccc == STOPPER)
3106					continue;
3107				ccc = STOPPER;
3108				goto ccc_mismatch;
3109			}
3110			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3111			ccc = LEAF_CCC(leaf);
3112		}
3113		u8c->unichar = utf8decode(u8c->s);
3114
3115		/*
3116		 * If this is not a stopper, then see if it updates
3117		 * the next canonical class to be emitted.
3118		 */
3119		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3120			u8c->nccc = ccc;
3121
3122		/*
3123		 * Return the current byte if this is the current
3124		 * combining class.
3125		 */
3126		if (ccc == u8c->ccc) {
3127			if (!u8c->p)
3128				u8c->len--;
3129			return (unsigned char)*u8c->s++;
3130		}
3131
3132		/* Current combining class mismatch. */
3133	ccc_mismatch:
3134		if (u8c->nccc == STOPPER) {
3135			/*
3136			 * Scan forward for the first canonical class
3137			 * to be emitted.  Save the position from
3138			 * which to restart.
3139			 */
3140			assert(u8c->ccc == STOPPER);
3141			u8c->ccc = MINCCC - 1;
3142			u8c->nccc = ccc;
3143			u8c->sp = u8c->p;
3144			u8c->ss = u8c->s;
3145			u8c->slen = u8c->len;
3146			if (!u8c->p)
3147				u8c->len -= utf8clen(u8c->s);
3148			u8c->s += utf8clen(u8c->s);
3149		} else if (ccc != STOPPER) {
3150			/* Not a stopper, and not the ccc we're emitting. */
3151			if (!u8c->p)
3152				u8c->len -= utf8clen(u8c->s);
3153			u8c->s += utf8clen(u8c->s);
3154		} else if (u8c->nccc != MAXCCC + 1) {
3155			/* At a stopper, restart for next ccc. */
3156			u8c->ccc = u8c->nccc;
3157			u8c->nccc = MAXCCC + 1;
3158			u8c->s = u8c->ss;
3159			u8c->p = u8c->sp;
3160			u8c->len = u8c->slen;
3161		} else {
3162			/* All done, proceed from here. */
3163			u8c->ccc = STOPPER;
3164			u8c->nccc = STOPPER;
3165			u8c->sp = NULL;
3166			u8c->ss = NULL;
3167			u8c->slen = 0;
3168		}
3169	}
3170}
3171
3172/* ------------------------------------------------------------------ */
3173
3174static int normalize_line(struct tree *tree)
3175{
3176	char *s;
3177	char *t;
3178	int c;
3179	struct utf8cursor u8c;
3180
3181	/* First test: null-terminated string. */
3182	s = buf2;
3183	t = buf3;
3184	if (utf8cursor(&u8c, tree, s))
3185		return -1;
3186	while ((c = utf8byte(&u8c)) > 0)
3187		if (c != (unsigned char)*t++)
3188			return -1;
3189	if (c < 0)
3190		return -1;
3191	if (*t != 0)
3192		return -1;
3193
3194	/* Second test: length-limited string. */
3195	s = buf2;
3196	/* Replace NUL with a value that will cause an error if seen. */
3197	s[strlen(s) + 1] = -1;
3198	t = buf3;
3199	if (utf8cursor(&u8c, tree, s))
3200		return -1;
3201	while ((c = utf8byte(&u8c)) > 0)
3202		if (c != (unsigned char)*t++)
3203			return -1;
3204	if (c < 0)
3205		return -1;
3206	if (*t != 0)
3207		return -1;
3208
3209	return 0;
3210}
3211
3212static void normalization_test(void)
3213{
3214	FILE *file;
3215	unsigned int unichar;
3216	struct unicode_data *data;
3217	char *s;
3218	char *t;
3219	int ret;
3220	int ignorables;
3221	int tests = 0;
3222	int failures = 0;
3223
3224	if (verbose > 0)
3225		printf("Parsing %s\n", test_name);
3226	/* Step one, read data from file. */
3227	file = fopen(test_name, "r");
3228	if (!file)
3229		open_fail(test_name, errno);
3230
3231	while (fgets(line, LINESIZE, file)) {
3232		ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3233			     buf0, buf1);
3234		if (ret != 2 || *line == '#')
3235			continue;
3236		s = buf0;
3237		t = buf2;
3238		while (*s) {
3239			unichar = strtoul(s, &s, 16);
3240			t += utf8encode(t, unichar);
3241		}
3242		*t = '\0';
3243
3244		ignorables = 0;
3245		s = buf1;
3246		t = buf3;
3247		while (*s) {
3248			unichar = strtoul(s, &s, 16);
3249			data = &unicode_data[unichar];
3250			if (data->utf8nfdi && !*data->utf8nfdi)
3251				ignorables = 1;
3252			else
3253				t += utf8encode(t, unichar);
3254		}
3255		*t = '\0';
3256
3257		tests++;
3258		if (normalize_line(nfdi_tree) < 0) {
3259			printf("Line %s -> %s", buf0, buf1);
3260			if (ignorables)
3261				printf(" (ignorables removed)");
3262			printf(" failure\n");
3263			failures++;
3264		}
3265	}
3266	fclose(file);
3267	if (verbose > 0)
3268		printf("Ran %d tests with %d failures\n", tests, failures);
3269	if (failures)
3270		file_fail(test_name);
3271}
3272
3273/* ------------------------------------------------------------------ */
3274
3275static void write_file(void)
3276{
3277	FILE *file;
3278	int i;
3279	int j;
3280	int t;
3281	int gen;
3282
3283	if (verbose > 0)
3284		printf("Writing %s\n", utf8_name);
3285	file = fopen(utf8_name, "w");
3286	if (!file)
3287		open_fail(utf8_name, errno);
3288
3289	fprintf(file, "/* This file is generated code, do not edit. */\n");
3290	fprintf(file, "\n");
3291	fprintf(file, "#include <linux/module.h>\n");
3292	fprintf(file, "#include <linux/kernel.h>\n");
3293	fprintf(file, "#include \"utf8n.h\"\n");
3294	fprintf(file, "\n");
3295	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3296	for (i = 0; i != ages_count; i++)
3297		fprintf(file, "\t%#x%s\n", ages[i],
3298			ages[i] == unicode_maxage ? "" : ",");
3299	fprintf(file, "};\n");
3300	fprintf(file, "\n");
3301	fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3302	t = 0;
3303	for (gen = 0; gen < ages_count; gen++) {
3304		fprintf(file, "\t{ %#x, %d }%s\n",
3305			ages[gen], trees[t].index,
3306			ages[gen] == unicode_maxage ? "" : ",");
3307		if (trees[t].maxage == ages[gen])
3308			t += 2;
3309	}
3310	fprintf(file, "};\n");
3311	fprintf(file, "\n");
3312	fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3313	t = 1;
3314	for (gen = 0; gen < ages_count; gen++) {
3315		fprintf(file, "\t{ %#x, %d }%s\n",
3316			ages[gen], trees[t].index,
3317			ages[gen] == unicode_maxage ? "" : ",");
3318		if (trees[t].maxage == ages[gen])
3319			t += 2;
3320	}
3321	fprintf(file, "};\n");
3322	fprintf(file, "\n");
3323	fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3324		utf8data_size);
3325	t = 0;
3326	for (i = 0; i != utf8data_size; i += 16) {
3327		if (i == trees[t].index) {
3328			fprintf(file, "\t/* %s_%x */\n",
3329				trees[t].type, trees[t].maxage);
3330			if (t < trees_count-1)
3331				t++;
3332		}
3333		fprintf(file, "\t");
3334		for (j = i; j != i + 16; j++)
3335			fprintf(file, "0x%.2x%s", utf8data[j],
3336				(j < utf8data_size -1 ? "," : ""));
3337		fprintf(file, "\n");
3338	}
3339	fprintf(file, "};\n");
3340	fprintf(file, "\n");
3341	fprintf(file, "struct utf8data_table utf8_data_table = {\n");
3342	fprintf(file, "\t.utf8agetab = utf8agetab,\n");
3343	fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
3344	fprintf(file, "\n");
3345	fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
3346	fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
3347	fprintf(file, "\n");
3348	fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
3349	fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
3350	fprintf(file, "\n");
3351	fprintf(file, "\t.utf8data = utf8data,\n");
3352	fprintf(file, "};\n");
3353	fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
3354	fprintf(file, "\n");
3355	fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
3356	fclose(file);
3357}
3358
3359/* ------------------------------------------------------------------ */
3360
3361int main(int argc, char *argv[])
3362{
3363	unsigned int unichar;
3364	int opt;
3365
3366	argv0 = argv[0];
3367
3368	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3369		switch (opt) {
3370		case 'a':
3371			age_name = optarg;
3372			break;
3373		case 'c':
3374			ccc_name = optarg;
3375			break;
3376		case 'd':
3377			data_name = optarg;
3378			break;
3379		case 'f':
3380			fold_name = optarg;
3381			break;
3382		case 'n':
3383			norm_name = optarg;
3384			break;
3385		case 'o':
3386			utf8_name = optarg;
3387			break;
3388		case 'p':
3389			prop_name = optarg;
3390			break;
3391		case 't':
3392			test_name = optarg;
3393			break;
3394		case 'v':
3395			verbose++;
3396			break;
3397		case 'h':
3398			help();
3399			exit(0);
3400		default:
3401			usage();
3402		}
3403	}
3404
3405	if (verbose > 1)
3406		help();
3407	for (unichar = 0; unichar != 0x110000; unichar++)
3408		unicode_data[unichar].code = unichar;
3409	age_init();
3410	ccc_init();
3411	nfdi_init();
3412	nfdicf_init();
3413	ignore_init();
3414	corrections_init();
3415	hangul_decompose();
3416	nfdi_decompose();
3417	nfdicf_decompose();
3418	utf8_init();
3419	trees_init();
3420	trees_populate();
3421	trees_reduce();
3422	trees_verify();
3423	/* Prevent "unused function" warning. */
3424	(void)lookup(nfdi_tree, " ");
3425	if (verbose > 2)
3426		tree_walk(nfdi_tree);
3427	if (verbose > 2)
3428		tree_walk(nfdicf_tree);
3429	normalization_test();
3430	write_file();
3431
3432	return 0;
3433}
3434