1286432Sbapt/*
2286432Sbapt * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
3286432Sbapt * Copyright 2015 John Marino <draco@marino.st>
4286432Sbapt *
5286432Sbapt * This source code is derived from the illumos localedef command, and
6286432Sbapt * provided under BSD-style license terms by Nexenta Systems, Inc.
7286432Sbapt *
8286432Sbapt * Redistribution and use in source and binary forms, with or without
9286432Sbapt * modification, are permitted provided that the following conditions
10286432Sbapt * are met:
11286432Sbapt *
12286432Sbapt * 1. Redistributions of source code must retain the above copyright
13286432Sbapt *    notice, this list of conditions and the following disclaimer.
14286432Sbapt * 2. Redistributions in binary form must reproduce the above copyright
15286432Sbapt *    notice, this list of conditions and the following disclaimer in the
16286432Sbapt *    documentation and/or other materials provided with the distribution.
17286432Sbapt *
18286432Sbapt * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19286432Sbapt * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20286432Sbapt * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21286432Sbapt * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22286432Sbapt * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23286432Sbapt * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24286432Sbapt * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25286432Sbapt * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26286432Sbapt * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27286432Sbapt * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28286432Sbapt * POSSIBILITY OF SUCH DAMAGE.
29286432Sbapt */
30286432Sbapt
31286432Sbapt/*
32286432Sbapt * LC_COLLATE database generation routines for localedef.
33286432Sbapt */
34286432Sbapt#include <sys/cdefs.h>
35286432Sbapt__FBSDID("$FreeBSD: stable/11/usr.bin/localedef/collate.c 307697 2016-10-21 03:10:05Z araujo $");
36286432Sbapt
37286432Sbapt#include <sys/types.h>
38286484Sbapt#include <sys/tree.h>
39286432Sbapt
40286432Sbapt#include <stdio.h>
41286432Sbapt#include <stddef.h>
42286432Sbapt#include <stdlib.h>
43286432Sbapt#include <errno.h>
44286432Sbapt#include <string.h>
45286432Sbapt#include <unistd.h>
46286432Sbapt#include <wchar.h>
47286432Sbapt#include <limits.h>
48286432Sbapt#include "localedef.h"
49286432Sbapt#include "parser.h"
50286432Sbapt#include "collate.h"
51286432Sbapt
52286432Sbapt/*
53286432Sbapt * Design notes.
54286432Sbapt *
55286432Sbapt * It will be extremely helpful to the reader if they have access to
56286432Sbapt * the localedef and locale file format specifications available.
57286432Sbapt * Latest versions of these are available from www.opengroup.org.
58286432Sbapt *
59286432Sbapt * The design for the collation code is a bit complex.  The goal is a
60286432Sbapt * single collation database as described in collate.h (in
61286432Sbapt * libc/port/locale).  However, there are some other tidbits:
62286432Sbapt *
63286432Sbapt * a) The substitution entries are now a directly indexable array.  A
64286432Sbapt * priority elsewhere in the table is taken as an index into the
65286432Sbapt * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
66286432Sbapt * set.  (The bit is cleared and the result is the index into the
67286432Sbapt * table.
68286432Sbapt *
69286432Sbapt * b) We eliminate duplicate entries into the substitution table.
70286432Sbapt * This saves a lot of space.
71286432Sbapt *
72286432Sbapt * c) The priorities for each level are "compressed", so that each
73286432Sbapt * sorting level has consecutively numbered priorities starting at 1.
74286432Sbapt * (O is reserved for the ignore priority.)  This means sort levels
75286432Sbapt * which only have a few distinct priorities can represent the
76286432Sbapt * priority level in fewer bits, which makes the strxfrm output
77286432Sbapt * smaller.
78286432Sbapt *
79286432Sbapt * d) We record the total number of priorities so that strxfrm can
80286432Sbapt * figure out how many bytes to expand a numeric priority into.
81286432Sbapt *
82286432Sbapt * e) For the UNDEFINED pass (the last pass), we record the maximum
83286432Sbapt * number of bits needed to uniquely prioritize these entries, so that
84286432Sbapt * the last pass can also use smaller strxfrm output when possible.
85286432Sbapt *
86286432Sbapt * f) Priorities with the sign bit set are verboten.  This works out
87286432Sbapt * because no active character set needs that bit to carry significant
88286432Sbapt * information once the character is in wide form.
89286432Sbapt *
90286432Sbapt * To process the entire data to make the database, we actually run
91286432Sbapt * multiple passes over the data.
92286432Sbapt *
93286432Sbapt * The first pass, which is done at parse time, identifies elements,
94286432Sbapt * substitutions, and such, and records them in priority order.  As
95286432Sbapt * some priorities can refer to other priorities, using forward
96286432Sbapt * references, we use a table of references indicating whether the
97286432Sbapt * priority's value has been resolved, or whether it is still a
98286432Sbapt * reference.
99286432Sbapt *
100286432Sbapt * The second pass walks over all the items in priority order, noting
101286432Sbapt * that they are used directly, and not just an indirect reference.
102286432Sbapt * This is done by creating a "weight" structure for the item.  The
103286484Sbapt * weights are stashed in an RB tree sorted by relative "priority".
104286432Sbapt *
105286432Sbapt * The third pass walks over all the weight structures, in priority
106286432Sbapt * order, and assigns a new monotonically increasing (per sort level)
107286432Sbapt * weight value to them.  These are the values that will actually be
108286432Sbapt * written to the file.
109286432Sbapt *
110286432Sbapt * The fourth pass just writes the data out.
111286432Sbapt */
112286432Sbapt
113286432Sbapt/*
114286432Sbapt * In order to resolve the priorities, we create a table of priorities.
115286432Sbapt * Entries in the table can be in one of three states.
116286432Sbapt *
117286432Sbapt * UNKNOWN is for newly allocated entries, and indicates that nothing
118286432Sbapt * is known about the priority.  (For example, when new entries are created
119286432Sbapt * for collating-symbols, this is the value assigned for them until the
120286432Sbapt * collating symbol's order has been determined.
121286432Sbapt *
122286432Sbapt * RESOLVED is used for an entry where the priority indicates the final
123286432Sbapt * numeric weight.
124286432Sbapt *
125286432Sbapt * REFER is used for entries that reference other entries.  Typically
126286432Sbapt * this is used for forward references.  A collating-symbol can never
127286432Sbapt * have this value.
128286432Sbapt *
129286432Sbapt * The "pass" field is used during final resolution to aid in detection
130286432Sbapt * of referencing loops.  (For example <A> depends on <B>, but <B> has its
131286432Sbapt * priority dependent on <A>.)
132286432Sbapt */
133286432Sbapttypedef enum {
134286432Sbapt	UNKNOWN,	/* priority is totally unknown */
135286432Sbapt	RESOLVED,	/* priority value fully resolved */
136286432Sbapt	REFER		/* priority is a reference (index) */
137286432Sbapt} res_t;
138286432Sbapt
139286432Sbapttypedef struct weight {
140286432Sbapt	int32_t		pri;
141286432Sbapt	int		opt;
142286484Sbapt	RB_ENTRY(weight) entry;
143286432Sbapt} weight_t;
144286432Sbapt
145286432Sbapttypedef struct priority {
146286432Sbapt	res_t		res;
147286432Sbapt	int32_t		pri;
148286432Sbapt	int		pass;
149286432Sbapt	int		lineno;
150286432Sbapt} collpri_t;
151286432Sbapt
152286432Sbapt#define	NUM_WT	collinfo.directive_count
153286432Sbapt
154286432Sbapt/*
155286432Sbapt * These are the abstract collating symbols, which are just a symbolic
156286432Sbapt * way to reference a priority.
157286432Sbapt */
158286432Sbaptstruct collsym {
159286432Sbapt	char		*name;
160286432Sbapt	int32_t		ref;
161286484Sbapt	RB_ENTRY(collsym) entry;
162286432Sbapt};
163286432Sbapt
164286432Sbapt/*
165286432Sbapt * These are also abstract collating symbols, but we allow them to have
166286432Sbapt * different priorities at different levels.
167286432Sbapt */
168286432Sbapttypedef struct collundef {
169286432Sbapt	char		*name;
170286432Sbapt	int32_t		ref[COLL_WEIGHTS_MAX];
171286484Sbapt	RB_ENTRY(collundef) entry;
172286432Sbapt} collundef_t;
173286432Sbapt
174286432Sbapt/*
175286432Sbapt * These are called "chains" in libc.  This records the fact that two
176286432Sbapt * more characters should be treated as a single collating entity when
177286432Sbapt * they appear together.  For example, in Spanish <C><h> gets collated
178286432Sbapt * as a character between <C> and <D>.
179286432Sbapt */
180286432Sbaptstruct collelem {
181286432Sbapt	char		*symbol;
182286432Sbapt	wchar_t		*expand;
183286432Sbapt	int32_t		ref[COLL_WEIGHTS_MAX];
184286484Sbapt	RB_ENTRY(collelem) rb_bysymbol;
185286484Sbapt	RB_ENTRY(collelem) rb_byexpand;
186286432Sbapt};
187286432Sbapt
188286432Sbapt/*
189286432Sbapt * Individual characters have a sequence of weights as well.
190286432Sbapt */
191286432Sbapttypedef struct collchar {
192286432Sbapt	wchar_t		wc;
193286432Sbapt	int32_t		ref[COLL_WEIGHTS_MAX];
194286484Sbapt	RB_ENTRY(collchar) entry;
195286432Sbapt} collchar_t;
196286432Sbapt
197286432Sbapt/*
198286432Sbapt * Substitution entries.  The key is itself a priority.  Note that
199286432Sbapt * when we create one of these, we *automatically* wind up with a
200286432Sbapt * fully resolved priority for the key, because creation of
201286432Sbapt * substitutions creates a resolved priority at the same time.
202286432Sbapt */
203286484Sbapttypedef struct subst{
204286432Sbapt	int32_t		key;
205286432Sbapt	int32_t		ref[COLLATE_STR_LEN];
206286484Sbapt	RB_ENTRY(subst)	entry;
207286484Sbapt	RB_ENTRY(subst)	entry_ref;
208286432Sbapt} subst_t;
209286432Sbapt
210286484Sbaptstatic RB_HEAD(collsyms, collsym) collsyms;
211286484Sbaptstatic RB_HEAD(collundefs, collundef) collundefs;
212286484Sbaptstatic RB_HEAD(elem_by_symbol, collelem) elem_by_symbol;
213286484Sbaptstatic RB_HEAD(elem_by_expand, collelem) elem_by_expand;
214286484Sbaptstatic RB_HEAD(collchars, collchar) collchars;
215286484Sbaptstatic RB_HEAD(substs, subst) substs[COLL_WEIGHTS_MAX];
216286484Sbaptstatic RB_HEAD(substs_ref, subst) substs_ref[COLL_WEIGHTS_MAX];
217286484Sbaptstatic RB_HEAD(weights, weight) weights[COLL_WEIGHTS_MAX];
218286432Sbaptstatic int32_t		nweight[COLL_WEIGHTS_MAX];
219286432Sbapt
220286432Sbapt/*
221286432Sbapt * This is state tracking for the ellipsis token.  Note that we start
222286432Sbapt * the initial values so that the ellipsis logic will think we got a
223286432Sbapt * magic starting value of NUL.  It starts at minus one because the
224286432Sbapt * starting point is exclusive -- i.e. the starting point is not
225286432Sbapt * itself handled by the ellipsis code.
226286432Sbapt */
227286432Sbaptstatic int currorder = EOF;
228286432Sbaptstatic int lastorder = EOF;
229286432Sbaptstatic collelem_t *currelem;
230286432Sbaptstatic collchar_t *currchar;
231286432Sbaptstatic collundef_t *currundef;
232286432Sbaptstatic wchar_t ellipsis_start = 0;
233286432Sbaptstatic int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
234286432Sbapt
235286432Sbapt/*
236286432Sbapt * We keep a running tally of weights.
237286432Sbapt */
238286432Sbaptstatic int nextpri = 1;
239286432Sbaptstatic int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
240286432Sbapt
241286432Sbapt/*
242286432Sbapt * This array collects up the weights for each level.
243286432Sbapt */
244286432Sbaptstatic int32_t order_weights[COLL_WEIGHTS_MAX];
245286432Sbaptstatic int curr_weight = 0;
246286432Sbaptstatic int32_t subst_weights[COLLATE_STR_LEN];
247286432Sbaptstatic int curr_subst = 0;
248286432Sbapt
249286432Sbapt/*
250286432Sbapt * Some initial priority values.
251286432Sbapt */
252286432Sbaptstatic int32_t pri_undefined[COLL_WEIGHTS_MAX];
253286432Sbaptstatic int32_t pri_ignore;
254286432Sbapt
255286432Sbaptstatic collate_info_t collinfo;
256286432Sbapt
257286432Sbaptstatic collpri_t	*prilist = NULL;
258286432Sbaptstatic int		numpri = 0;
259286432Sbaptstatic int		maxpri = 0;
260286432Sbapt
261286432Sbaptstatic void start_order(int);
262286432Sbapt
263286432Sbaptstatic int32_t
264286432Sbaptnew_pri(void)
265286432Sbapt{
266286432Sbapt	int i;
267286432Sbapt
268286432Sbapt	if (numpri >= maxpri) {
269286432Sbapt		maxpri = maxpri ? maxpri * 2 : 1024;
270286432Sbapt		prilist = realloc(prilist, sizeof (collpri_t) * maxpri);
271286432Sbapt		if (prilist == NULL) {
272286432Sbapt			fprintf(stderr,"out of memory");
273286432Sbapt			return (-1);
274286432Sbapt		}
275286432Sbapt		for (i = numpri; i < maxpri; i++) {
276286432Sbapt			prilist[i].res = UNKNOWN;
277286432Sbapt			prilist[i].pri = 0;
278286432Sbapt			prilist[i].pass = 0;
279286432Sbapt		}
280286432Sbapt	}
281286432Sbapt	return (numpri++);
282286432Sbapt}
283286432Sbapt
284286432Sbaptstatic collpri_t *
285286432Sbaptget_pri(int32_t ref)
286286432Sbapt{
287286432Sbapt	if ((ref < 0) || (ref > numpri)) {
288286432Sbapt		INTERR;
289286432Sbapt		return (NULL);
290286432Sbapt	}
291286432Sbapt	return (&prilist[ref]);
292286432Sbapt}
293286432Sbapt
294286432Sbaptstatic void
295286432Sbaptset_pri(int32_t ref, int32_t v, res_t res)
296286432Sbapt{
297286432Sbapt	collpri_t	*pri;
298286432Sbapt
299286432Sbapt	pri = get_pri(ref);
300286432Sbapt
301286432Sbapt	if ((res == REFER) && ((v < 0) || (v >= numpri))) {
302286432Sbapt		INTERR;
303286432Sbapt	}
304286432Sbapt
305286432Sbapt	/* Resolve self references */
306286432Sbapt	if ((res == REFER) && (ref == v)) {
307286432Sbapt		v = nextpri;
308286432Sbapt		res = RESOLVED;
309286432Sbapt	}
310286432Sbapt
311286432Sbapt	if (pri->res != UNKNOWN) {
312286432Sbapt		warn("repeated item in order list (first on %d)",
313286432Sbapt		    pri->lineno);
314286432Sbapt		return;
315286432Sbapt	}
316286432Sbapt	pri->lineno = lineno;
317286432Sbapt	pri->pri = v;
318286432Sbapt	pri->res = res;
319286432Sbapt}
320286432Sbapt
321286432Sbaptstatic int32_t
322286432Sbaptresolve_pri(int32_t ref)
323286432Sbapt{
324286432Sbapt	collpri_t	*pri;
325286432Sbapt	static int32_t	pass = 0;
326286432Sbapt
327286432Sbapt	pri = get_pri(ref);
328286432Sbapt	pass++;
329286432Sbapt	while (pri->res == REFER) {
330286432Sbapt		if (pri->pass == pass) {
331286432Sbapt			/* report a line with the circular symbol */
332286432Sbapt			lineno = pri->lineno;
333286432Sbapt			fprintf(stderr,"circular reference in order list");
334286432Sbapt			return (-1);
335286432Sbapt		}
336286432Sbapt		if ((pri->pri < 0) || (pri->pri >= numpri)) {
337286432Sbapt			INTERR;
338286432Sbapt			return (-1);
339286432Sbapt		}
340286432Sbapt		pri->pass = pass;
341286432Sbapt		pri = &prilist[pri->pri];
342286432Sbapt	}
343286432Sbapt
344286432Sbapt	if (pri->res == UNKNOWN) {
345286432Sbapt		return (-1);
346286432Sbapt	}
347286432Sbapt	if (pri->res != RESOLVED)
348286432Sbapt		INTERR;
349286432Sbapt
350286432Sbapt	return (pri->pri);
351286432Sbapt}
352286432Sbapt
353286432Sbaptstatic int
354286432Sbaptweight_compare(const void *n1, const void *n2)
355286432Sbapt{
356286432Sbapt	int32_t	k1 = ((const weight_t *)n1)->pri;
357286432Sbapt	int32_t	k2 = ((const weight_t *)n2)->pri;
358286432Sbapt
359286432Sbapt	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
360286432Sbapt}
361286432Sbapt
362286484SbaptRB_GENERATE_STATIC(weights, weight, entry, weight_compare);
363286484Sbapt
364286432Sbaptstatic int
365286432Sbaptcollsym_compare(const void *n1, const void *n2)
366286432Sbapt{
367286432Sbapt	const collsym_t *c1 = n1;
368286432Sbapt	const collsym_t *c2 = n2;
369286432Sbapt	int rv;
370286432Sbapt
371286432Sbapt	rv = strcmp(c1->name, c2->name);
372286432Sbapt	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
373286432Sbapt}
374286432Sbapt
375286484SbaptRB_GENERATE_STATIC(collsyms, collsym, entry, collsym_compare);
376286484Sbapt
377286432Sbaptstatic int
378286432Sbaptcollundef_compare(const void *n1, const void *n2)
379286432Sbapt{
380286432Sbapt	const collundef_t *c1 = n1;
381286432Sbapt	const collundef_t *c2 = n2;
382286432Sbapt	int rv;
383286432Sbapt
384286432Sbapt	rv = strcmp(c1->name, c2->name);
385286432Sbapt	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
386286432Sbapt}
387286432Sbapt
388286484SbaptRB_GENERATE_STATIC(collundefs, collundef, entry, collundef_compare);
389286484Sbapt
390286432Sbaptstatic int
391286432Sbaptelement_compare_symbol(const void *n1, const void *n2)
392286432Sbapt{
393286432Sbapt	const collelem_t *c1 = n1;
394286432Sbapt	const collelem_t *c2 = n2;
395286432Sbapt	int rv;
396286432Sbapt
397286432Sbapt	rv = strcmp(c1->symbol, c2->symbol);
398286432Sbapt	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
399286432Sbapt}
400286432Sbapt
401286484SbaptRB_GENERATE_STATIC(elem_by_symbol, collelem, rb_bysymbol, element_compare_symbol);
402286484Sbapt
403286432Sbaptstatic int
404286432Sbaptelement_compare_expand(const void *n1, const void *n2)
405286432Sbapt{
406286432Sbapt	const collelem_t *c1 = n1;
407286432Sbapt	const collelem_t *c2 = n2;
408286432Sbapt	int rv;
409286432Sbapt
410286432Sbapt	rv = wcscmp(c1->expand, c2->expand);
411286432Sbapt	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
412286432Sbapt}
413286432Sbapt
414286484SbaptRB_GENERATE_STATIC(elem_by_expand, collelem, rb_byexpand, element_compare_expand);
415286484Sbapt
416286432Sbaptstatic int
417286432Sbaptcollchar_compare(const void *n1, const void *n2)
418286432Sbapt{
419286432Sbapt	wchar_t	k1 = ((const collchar_t *)n1)->wc;
420286432Sbapt	wchar_t	k2 = ((const collchar_t *)n2)->wc;
421286432Sbapt
422286432Sbapt	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
423286432Sbapt}
424286432Sbapt
425286484SbaptRB_GENERATE_STATIC(collchars, collchar, entry, collchar_compare);
426286484Sbapt
427286432Sbaptstatic int
428286432Sbaptsubst_compare(const void *n1, const void *n2)
429286432Sbapt{
430286432Sbapt	int32_t	k1 = ((const subst_t *)n1)->key;
431286432Sbapt	int32_t	k2 = ((const subst_t *)n2)->key;
432286432Sbapt
433286432Sbapt	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
434286432Sbapt}
435286432Sbapt
436286484SbaptRB_GENERATE_STATIC(substs, subst, entry, subst_compare);
437286484Sbapt
438286432Sbaptstatic int
439286432Sbaptsubst_compare_ref(const void *n1, const void *n2)
440286432Sbapt{
441290559Sbapt	const wchar_t *c1 = ((const subst_t *)n1)->ref;
442290559Sbapt	const wchar_t *c2 = ((const subst_t *)n2)->ref;
443286432Sbapt	int rv;
444286432Sbapt
445290559Sbapt	rv = wcscmp(c1, c2);
446286432Sbapt	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
447286432Sbapt}
448286432Sbapt
449286484SbaptRB_GENERATE_STATIC(substs_ref, subst, entry_ref, subst_compare_ref);
450286484Sbapt
451286432Sbaptvoid
452286432Sbaptinit_collate(void)
453286432Sbapt{
454286432Sbapt	int i;
455286432Sbapt
456286484Sbapt	RB_INIT(&collsyms);
457286432Sbapt
458286484Sbapt	RB_INIT(&collundefs);
459286432Sbapt
460286484Sbapt	RB_INIT(&elem_by_symbol);
461286432Sbapt
462286484Sbapt	RB_INIT(&elem_by_expand);
463286432Sbapt
464286484Sbapt	RB_INIT(&collchars);
465286484Sbapt
466286432Sbapt	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
467286484Sbapt		RB_INIT(&substs[i]);
468286484Sbapt		RB_INIT(&substs_ref[i]);
469286484Sbapt		RB_INIT(&weights[i]);
470286432Sbapt		nweight[i] = 1;
471286432Sbapt	}
472286432Sbapt
473286432Sbapt	(void) memset(&collinfo, 0, sizeof (collinfo));
474286432Sbapt
475286432Sbapt	/* allocate some initial priorities */
476286432Sbapt	pri_ignore = new_pri();
477286432Sbapt
478286432Sbapt	set_pri(pri_ignore, 0, RESOLVED);
479286432Sbapt
480286432Sbapt	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
481286432Sbapt		pri_undefined[i] = new_pri();
482286432Sbapt
483286432Sbapt		/* we will override this later */
484286432Sbapt		set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
485286432Sbapt	}
486286432Sbapt}
487286432Sbapt
488286432Sbaptvoid
489286432Sbaptdefine_collsym(char *name)
490286432Sbapt{
491286432Sbapt	collsym_t	*sym;
492286432Sbapt
493307697Saraujo	if ((sym = calloc(1, sizeof(*sym))) == NULL) {
494286432Sbapt		fprintf(stderr,"out of memory");
495286432Sbapt		return;
496286432Sbapt	}
497286432Sbapt	sym->name = name;
498286432Sbapt	sym->ref = new_pri();
499286432Sbapt
500286484Sbapt	if (RB_FIND(collsyms, &collsyms, sym) != NULL) {
501286432Sbapt		/*
502286432Sbapt		 * This should never happen because we are only called
503286432Sbapt		 * for undefined symbols.
504286432Sbapt		 */
505298378Sbapt		free(sym);
506286432Sbapt		INTERR;
507286432Sbapt		return;
508286432Sbapt	}
509286484Sbapt	RB_INSERT(collsyms, &collsyms, sym);
510286432Sbapt}
511286432Sbapt
512286432Sbaptcollsym_t *
513286432Sbaptlookup_collsym(char *name)
514286432Sbapt{
515286432Sbapt	collsym_t	srch;
516286432Sbapt
517286432Sbapt	srch.name = name;
518286484Sbapt	return (RB_FIND(collsyms, &collsyms, &srch));
519286432Sbapt}
520286432Sbapt
521286432Sbaptcollelem_t *
522286432Sbaptlookup_collelem(char *symbol)
523286432Sbapt{
524286432Sbapt	collelem_t	srch;
525286432Sbapt
526286432Sbapt	srch.symbol = symbol;
527286484Sbapt	return (RB_FIND(elem_by_symbol, &elem_by_symbol, &srch));
528286432Sbapt}
529286432Sbapt
530286432Sbaptstatic collundef_t *
531286432Sbaptget_collundef(char *name)
532286432Sbapt{
533286432Sbapt	collundef_t	srch;
534286432Sbapt	collundef_t	*ud;
535286432Sbapt	int		i;
536286432Sbapt
537286432Sbapt	srch.name = name;
538286484Sbapt	if ((ud = RB_FIND(collundefs, &collundefs, &srch)) == NULL) {
539307697Saraujo		if (((ud = calloc(1, sizeof(*ud))) == NULL) ||
540286432Sbapt		    ((ud->name = strdup(name)) == NULL)) {
541286432Sbapt			fprintf(stderr,"out of memory");
542298378Sbapt			free(ud);
543286432Sbapt			return (NULL);
544286432Sbapt		}
545286432Sbapt		for (i = 0; i < NUM_WT; i++) {
546286432Sbapt			ud->ref[i] = new_pri();
547286432Sbapt		}
548286484Sbapt		RB_INSERT(collundefs, &collundefs, ud);
549286432Sbapt	}
550286432Sbapt	add_charmap_undefined(name);
551286432Sbapt	return (ud);
552286432Sbapt}
553286432Sbapt
554286432Sbaptstatic collchar_t *
555286432Sbaptget_collchar(wchar_t wc, int create)
556286432Sbapt{
557286432Sbapt	collchar_t	srch;
558286432Sbapt	collchar_t	*cc;
559286432Sbapt	int		i;
560286432Sbapt
561286432Sbapt	srch.wc = wc;
562286484Sbapt	cc = RB_FIND(collchars, &collchars, &srch);
563286432Sbapt	if ((cc == NULL) && create) {
564307697Saraujo		if ((cc = calloc(1, sizeof(*cc))) == NULL) {
565286432Sbapt			fprintf(stderr, "out of memory");
566286432Sbapt			return (NULL);
567286432Sbapt		}
568286432Sbapt		for (i = 0; i < NUM_WT; i++) {
569286432Sbapt			cc->ref[i] = new_pri();
570286432Sbapt		}
571286432Sbapt		cc->wc = wc;
572286484Sbapt		RB_INSERT(collchars, &collchars, cc);
573286432Sbapt	}
574286432Sbapt	return (cc);
575286432Sbapt}
576286432Sbapt
577286432Sbaptvoid
578286432Sbaptend_order_collsym(collsym_t *sym)
579286432Sbapt{
580286432Sbapt	start_order(T_COLLSYM);
581286432Sbapt	/* update the weight */
582286432Sbapt
583286432Sbapt	set_pri(sym->ref, nextpri, RESOLVED);
584286432Sbapt	nextpri++;
585286432Sbapt}
586286432Sbapt
587286432Sbaptvoid
588286432Sbaptend_order(void)
589286432Sbapt{
590286432Sbapt	int		i;
591286432Sbapt	int32_t		pri;
592286432Sbapt	int32_t		ref;
593286432Sbapt	collpri_t	*p;
594286432Sbapt
595286432Sbapt	/* advance the priority/weight */
596286432Sbapt	pri = nextpri;
597286432Sbapt
598286432Sbapt	switch (currorder) {
599286432Sbapt	case T_CHAR:
600286432Sbapt		for (i = 0; i < NUM_WT; i++) {
601286432Sbapt			if (((ref = order_weights[i]) < 0) ||
602286432Sbapt			    ((p = get_pri(ref)) == NULL) ||
603286432Sbapt			    (p->pri == -1)) {
604286432Sbapt				/* unspecified weight is a self reference */
605286432Sbapt				set_pri(currchar->ref[i], pri, RESOLVED);
606286432Sbapt			} else {
607286432Sbapt				set_pri(currchar->ref[i], ref, REFER);
608286432Sbapt			}
609286432Sbapt			order_weights[i] = -1;
610286432Sbapt		}
611286432Sbapt
612286432Sbapt		/* leave a cookie trail in case next symbol is ellipsis */
613286432Sbapt		ellipsis_start = currchar->wc + 1;
614286432Sbapt		currchar = NULL;
615286432Sbapt		break;
616286432Sbapt
617286432Sbapt	case T_ELLIPSIS:
618286432Sbapt		/* save off the weights were we can find them */
619286432Sbapt		for (i = 0; i < NUM_WT; i++) {
620286432Sbapt			ellipsis_weights[i] = order_weights[i];
621286432Sbapt			order_weights[i] = -1;
622286432Sbapt		}
623286432Sbapt		break;
624286432Sbapt
625286432Sbapt	case T_COLLELEM:
626286432Sbapt		if (currelem == NULL) {
627286432Sbapt			INTERR;
628286432Sbapt		} else {
629286432Sbapt			for (i = 0; i < NUM_WT; i++) {
630286432Sbapt
631286432Sbapt				if (((ref = order_weights[i]) < 0) ||
632286432Sbapt				    ((p = get_pri(ref)) == NULL) ||
633286432Sbapt				    (p->pri == -1)) {
634286432Sbapt					set_pri(currelem->ref[i], pri,
635286432Sbapt					    RESOLVED);
636286432Sbapt				} else {
637286432Sbapt					set_pri(currelem->ref[i], ref, REFER);
638286432Sbapt				}
639286432Sbapt				order_weights[i] = -1;
640286432Sbapt			}
641286432Sbapt		}
642286432Sbapt		break;
643286432Sbapt
644286432Sbapt	case T_UNDEFINED:
645286432Sbapt		for (i = 0; i < NUM_WT; i++) {
646286432Sbapt			if (((ref = order_weights[i]) < 0) ||
647286432Sbapt			    ((p = get_pri(ref)) == NULL) ||
648286432Sbapt			    (p->pri == -1)) {
649286432Sbapt				set_pri(pri_undefined[i], -1, RESOLVED);
650286432Sbapt			} else {
651286432Sbapt				set_pri(pri_undefined[i], ref, REFER);
652286432Sbapt			}
653286432Sbapt			order_weights[i] = -1;
654286432Sbapt		}
655286432Sbapt		break;
656286432Sbapt
657286432Sbapt	case T_SYMBOL:
658286432Sbapt		for (i = 0; i < NUM_WT; i++) {
659286432Sbapt			if (((ref = order_weights[i]) < 0) ||
660286432Sbapt			    ((p = get_pri(ref)) == NULL) ||
661286432Sbapt			    (p->pri == -1)) {
662286432Sbapt				set_pri(currundef->ref[i], pri, RESOLVED);
663286432Sbapt			} else {
664286432Sbapt				set_pri(currundef->ref[i], ref, REFER);
665286432Sbapt			}
666286432Sbapt			order_weights[i] = -1;
667286432Sbapt		}
668286432Sbapt		break;
669286432Sbapt
670286432Sbapt	default:
671286432Sbapt		INTERR;
672286432Sbapt	}
673286432Sbapt
674286432Sbapt	nextpri++;
675286432Sbapt}
676286432Sbapt
677286432Sbaptstatic void
678286432Sbaptstart_order(int type)
679286432Sbapt{
680286432Sbapt	int	i;
681286432Sbapt
682286432Sbapt	lastorder = currorder;
683286432Sbapt	currorder = type;
684286432Sbapt
685286432Sbapt	/* this is used to protect ELLIPSIS processing */
686286432Sbapt	if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
687286432Sbapt		fprintf(stderr, "character value expected");
688286432Sbapt	}
689286432Sbapt
690286432Sbapt	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
691286432Sbapt		order_weights[i] = -1;
692286432Sbapt	}
693286432Sbapt	curr_weight = 0;
694286432Sbapt}
695286432Sbapt
696286432Sbaptvoid
697286432Sbaptstart_order_undefined(void)
698286432Sbapt{
699286432Sbapt	start_order(T_UNDEFINED);
700286432Sbapt}
701286432Sbapt
702286432Sbaptvoid
703286432Sbaptstart_order_symbol(char *name)
704286432Sbapt{
705286432Sbapt	currundef = get_collundef(name);
706286432Sbapt	start_order(T_SYMBOL);
707286432Sbapt}
708286432Sbapt
709286432Sbaptvoid
710286432Sbaptstart_order_char(wchar_t wc)
711286432Sbapt{
712286432Sbapt	collchar_t	*cc;
713286432Sbapt	int32_t		ref;
714286432Sbapt
715286432Sbapt	start_order(T_CHAR);
716286432Sbapt
717286432Sbapt	/*
718286432Sbapt	 * If we last saw an ellipsis, then we need to close the range.
719286432Sbapt	 * Handle that here.  Note that we have to be careful because the
720286432Sbapt	 * items *inside* the range are treated exclusiveley to the items
721286432Sbapt	 * outside of the range.  The ends of the range can have quite
722286432Sbapt	 * different weights than the range members.
723286432Sbapt	 */
724286432Sbapt	if (lastorder == T_ELLIPSIS) {
725286432Sbapt		int		i;
726286432Sbapt
727286432Sbapt		if (wc < ellipsis_start) {
728286432Sbapt			fprintf(stderr, "malformed range!");
729286432Sbapt			return;
730286432Sbapt		}
731286432Sbapt		while (ellipsis_start < wc) {
732286432Sbapt			/*
733286432Sbapt			 * pick all of the saved weights for the
734286432Sbapt			 * ellipsis.  note that -1 encodes for the
735286432Sbapt			 * ellipsis itself, which means to take the
736286432Sbapt			 * current relative priority.
737286432Sbapt			 */
738286432Sbapt			if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
739286432Sbapt				INTERR;
740286432Sbapt				return;
741286432Sbapt			}
742286432Sbapt			for (i = 0; i < NUM_WT; i++) {
743286432Sbapt				collpri_t *p;
744286432Sbapt				if (((ref = ellipsis_weights[i]) == -1) ||
745286432Sbapt				    ((p = get_pri(ref)) == NULL) ||
746286432Sbapt				    (p->pri == -1)) {
747286432Sbapt					set_pri(cc->ref[i], nextpri, RESOLVED);
748286432Sbapt				} else {
749286432Sbapt					set_pri(cc->ref[i], ref, REFER);
750286432Sbapt				}
751286432Sbapt				ellipsis_weights[i] = 0;
752286432Sbapt			}
753286432Sbapt			ellipsis_start++;
754286432Sbapt			nextpri++;
755286432Sbapt		}
756286432Sbapt	}
757286432Sbapt
758286432Sbapt	currchar = get_collchar(wc, 1);
759286432Sbapt}
760286432Sbapt
761286432Sbaptvoid
762286432Sbaptstart_order_collelem(collelem_t *e)
763286432Sbapt{
764286432Sbapt	start_order(T_COLLELEM);
765286432Sbapt	currelem = e;
766286432Sbapt}
767286432Sbapt
768286432Sbaptvoid
769286432Sbaptstart_order_ellipsis(void)
770286432Sbapt{
771286432Sbapt	int	i;
772286432Sbapt
773286432Sbapt	start_order(T_ELLIPSIS);
774286432Sbapt
775286432Sbapt	if (lastorder != T_CHAR) {
776286432Sbapt		fprintf(stderr, "illegal starting point for range");
777286432Sbapt		return;
778286432Sbapt	}
779286432Sbapt
780286432Sbapt	for (i = 0; i < NUM_WT; i++) {
781286432Sbapt		ellipsis_weights[i] = order_weights[i];
782286432Sbapt	}
783286432Sbapt}
784286432Sbapt
785286432Sbaptvoid
786286432Sbaptdefine_collelem(char *name, wchar_t *wcs)
787286432Sbapt{
788286432Sbapt	collelem_t	*e;
789286432Sbapt	int		i;
790286432Sbapt
791286432Sbapt	if (wcslen(wcs) >= COLLATE_STR_LEN) {
792286432Sbapt		fprintf(stderr,"expanded collation element too long");
793286432Sbapt		return;
794286432Sbapt	}
795286432Sbapt
796307697Saraujo	if ((e = calloc(1, sizeof(*e))) == NULL) {
797286432Sbapt		fprintf(stderr, "out of memory");
798286432Sbapt		return;
799286432Sbapt	}
800286432Sbapt	e->expand = wcs;
801286432Sbapt	e->symbol = name;
802286432Sbapt
803286432Sbapt	/*
804286432Sbapt	 * This is executed before the order statement, so we don't
805286432Sbapt	 * know how many priorities we *really* need.  We allocate one
806286432Sbapt	 * for each possible weight.  Not a big deal, as collating-elements
807286432Sbapt	 * prove to be quite rare.
808286432Sbapt	 */
809286432Sbapt	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
810286432Sbapt		e->ref[i] = new_pri();
811286432Sbapt	}
812286432Sbapt
813286432Sbapt	/* A character sequence can only reduce to one element. */
814286484Sbapt	if ((RB_FIND(elem_by_symbol, &elem_by_symbol, e) != NULL) ||
815286484Sbapt	    (RB_FIND(elem_by_expand, &elem_by_expand, e) != NULL)) {
816286432Sbapt		fprintf(stderr, "duplicate collating element definition");
817298378Sbapt		free(e);
818286432Sbapt		return;
819286432Sbapt	}
820286484Sbapt	RB_INSERT(elem_by_symbol, &elem_by_symbol, e);
821286484Sbapt	RB_INSERT(elem_by_expand, &elem_by_expand, e);
822286432Sbapt}
823286432Sbapt
824286432Sbaptvoid
825286432Sbaptadd_order_bit(int kw)
826286432Sbapt{
827286432Sbapt	uint8_t bit = DIRECTIVE_UNDEF;
828286432Sbapt
829286432Sbapt	switch (kw) {
830286432Sbapt	case T_FORWARD:
831286432Sbapt		bit = DIRECTIVE_FORWARD;
832286432Sbapt		break;
833286432Sbapt	case T_BACKWARD:
834286432Sbapt		bit = DIRECTIVE_BACKWARD;
835286432Sbapt		break;
836286432Sbapt	case T_POSITION:
837286432Sbapt		bit = DIRECTIVE_POSITION;
838286432Sbapt		break;
839286432Sbapt	default:
840286432Sbapt		INTERR;
841286432Sbapt		break;
842286432Sbapt	}
843286432Sbapt	collinfo.directive[collinfo.directive_count] |= bit;
844286432Sbapt}
845286432Sbapt
846286432Sbaptvoid
847286432Sbaptadd_order_directive(void)
848286432Sbapt{
849286432Sbapt	if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
850286432Sbapt		fprintf(stderr,"too many directives (max %d)", COLL_WEIGHTS_MAX);
851286432Sbapt	}
852286432Sbapt	collinfo.directive_count++;
853286432Sbapt}
854286432Sbapt
855286432Sbaptstatic void
856286432Sbaptadd_order_pri(int32_t ref)
857286432Sbapt{
858286432Sbapt	if (curr_weight >= NUM_WT) {
859286432Sbapt		fprintf(stderr,"too many weights (max %d)", NUM_WT);
860286432Sbapt		return;
861286432Sbapt	}
862286432Sbapt	order_weights[curr_weight] = ref;
863286432Sbapt	curr_weight++;
864286432Sbapt}
865286432Sbapt
866286432Sbaptvoid
867286432Sbaptadd_order_collsym(collsym_t *s)
868286432Sbapt{
869286432Sbapt	add_order_pri(s->ref);
870286432Sbapt}
871286432Sbapt
872286432Sbaptvoid
873286432Sbaptadd_order_char(wchar_t wc)
874286432Sbapt{
875286432Sbapt	collchar_t *cc;
876286432Sbapt
877286432Sbapt	if ((cc = get_collchar(wc, 1)) == NULL) {
878286432Sbapt		INTERR;
879286432Sbapt		return;
880286432Sbapt	}
881286432Sbapt
882286432Sbapt	add_order_pri(cc->ref[curr_weight]);
883286432Sbapt}
884286432Sbapt
885286432Sbaptvoid
886286432Sbaptadd_order_collelem(collelem_t *e)
887286432Sbapt{
888286432Sbapt	add_order_pri(e->ref[curr_weight]);
889286432Sbapt}
890286432Sbapt
891286432Sbaptvoid
892286432Sbaptadd_order_ignore(void)
893286432Sbapt{
894286432Sbapt	add_order_pri(pri_ignore);
895286432Sbapt}
896286432Sbapt
897286432Sbaptvoid
898286432Sbaptadd_order_symbol(char *sym)
899286432Sbapt{
900286432Sbapt	collundef_t *c;
901286432Sbapt	if ((c = get_collundef(sym)) == NULL) {
902286432Sbapt		INTERR;
903286432Sbapt		return;
904286432Sbapt	}
905286432Sbapt	add_order_pri(c->ref[curr_weight]);
906286432Sbapt}
907286432Sbapt
908286432Sbaptvoid
909286432Sbaptadd_order_ellipsis(void)
910286432Sbapt{
911286432Sbapt	/* special NULL value indicates self reference */
912286432Sbapt	add_order_pri(0);
913286432Sbapt}
914286432Sbapt
915286432Sbaptvoid
916286432Sbaptadd_order_subst(void)
917286432Sbapt{
918286432Sbapt	subst_t srch;
919286432Sbapt	subst_t	*s;
920286432Sbapt	int i;
921286432Sbapt
922286432Sbapt	(void) memset(&srch, 0, sizeof (srch));
923286432Sbapt	for (i = 0; i < curr_subst; i++) {
924286432Sbapt		srch.ref[i] = subst_weights[i];
925286432Sbapt		subst_weights[i] = 0;
926286432Sbapt	}
927286484Sbapt	s = RB_FIND(substs_ref, &substs_ref[curr_weight], &srch);
928286432Sbapt
929286432Sbapt	if (s == NULL) {
930307697Saraujo		if ((s = calloc(1, sizeof(*s))) == NULL) {
931286432Sbapt			fprintf(stderr,"out of memory");
932286432Sbapt			return;
933286432Sbapt		}
934286432Sbapt		s->key = new_pri();
935286432Sbapt
936286432Sbapt		/*
937286432Sbapt		 * We use a self reference for our key, but we set a
938286432Sbapt		 * high bit to indicate that this is a substitution
939286432Sbapt		 * reference.  This will expedite table lookups later,
940286432Sbapt		 * and prevent table lookups for situations that don't
941286432Sbapt		 * require it.  (In short, its a big win, because we
942286432Sbapt		 * can skip a lot of binary searching.)
943286432Sbapt		 */
944286432Sbapt		set_pri(s->key,
945286432Sbapt		    (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
946286432Sbapt		    RESOLVED);
947286432Sbapt		nextsubst[curr_weight] += 1;
948286432Sbapt
949286432Sbapt		for (i = 0; i < curr_subst; i++) {
950286432Sbapt			s->ref[i] = srch.ref[i];
951286432Sbapt		}
952286432Sbapt
953286484Sbapt		RB_INSERT(substs_ref, &substs_ref[curr_weight], s);
954286432Sbapt
955286484Sbapt		if (RB_FIND(substs, &substs[curr_weight], s) != NULL) {
956286432Sbapt			INTERR;
957286432Sbapt			return;
958286432Sbapt		}
959286484Sbapt		RB_INSERT(substs, &substs[curr_weight], s);
960286432Sbapt	}
961286432Sbapt	curr_subst = 0;
962286432Sbapt
963286432Sbapt
964286432Sbapt	/*
965286432Sbapt	 * We are using the current (unique) priority as a search key
966286432Sbapt	 * in the substitution table.
967286432Sbapt	 */
968286432Sbapt	add_order_pri(s->key);
969286432Sbapt}
970286432Sbapt
971286432Sbaptstatic void
972286432Sbaptadd_subst_pri(int32_t ref)
973286432Sbapt{
974286432Sbapt	if (curr_subst >= COLLATE_STR_LEN) {
975286432Sbapt		fprintf(stderr,"substitution string is too long");
976286432Sbapt		return;
977286432Sbapt	}
978286432Sbapt	subst_weights[curr_subst] = ref;
979286432Sbapt	curr_subst++;
980286432Sbapt}
981286432Sbapt
982286432Sbaptvoid
983286432Sbaptadd_subst_char(wchar_t wc)
984286432Sbapt{
985286432Sbapt	collchar_t *cc;
986286432Sbapt
987286432Sbapt
988286432Sbapt	if (((cc = get_collchar(wc, 1)) == NULL) ||
989286432Sbapt	    (cc->wc != wc)) {
990286432Sbapt		INTERR;
991286432Sbapt		return;
992286432Sbapt	}
993286432Sbapt	/* we take the weight for the character at that position */
994286432Sbapt	add_subst_pri(cc->ref[curr_weight]);
995286432Sbapt}
996286432Sbapt
997286432Sbaptvoid
998286432Sbaptadd_subst_collelem(collelem_t *e)
999286432Sbapt{
1000286432Sbapt	add_subst_pri(e->ref[curr_weight]);
1001286432Sbapt}
1002286432Sbapt
1003286432Sbaptvoid
1004286432Sbaptadd_subst_collsym(collsym_t *s)
1005286432Sbapt{
1006286432Sbapt	add_subst_pri(s->ref);
1007286432Sbapt}
1008286432Sbapt
1009286432Sbaptvoid
1010286432Sbaptadd_subst_symbol(char *ptr)
1011286432Sbapt{
1012286432Sbapt	collundef_t *cu;
1013286432Sbapt
1014286432Sbapt	if ((cu = get_collundef(ptr)) != NULL) {
1015286432Sbapt		add_subst_pri(cu->ref[curr_weight]);
1016286432Sbapt	}
1017286432Sbapt}
1018286432Sbapt
1019286432Sbaptvoid
1020286432Sbaptadd_weight(int32_t ref, int pass)
1021286432Sbapt{
1022286432Sbapt	weight_t srch;
1023286432Sbapt	weight_t *w;
1024286432Sbapt
1025286432Sbapt	srch.pri = resolve_pri(ref);
1026286432Sbapt
1027286432Sbapt	/* No translation of ignores */
1028286432Sbapt	if (srch.pri == 0)
1029286432Sbapt		return;
1030286432Sbapt
1031286432Sbapt	/* Substitution priorities are not weights */
1032286432Sbapt	if (srch.pri & COLLATE_SUBST_PRIORITY)
1033286432Sbapt		return;
1034286432Sbapt
1035286484Sbapt	if (RB_FIND(weights, &weights[pass], &srch) != NULL)
1036286432Sbapt		return;
1037286432Sbapt
1038307697Saraujo	if ((w = calloc(1, sizeof(*w))) == NULL) {
1039286432Sbapt		fprintf(stderr, "out of memory");
1040286432Sbapt		return;
1041286432Sbapt	}
1042286432Sbapt	w->pri = srch.pri;
1043286484Sbapt	RB_INSERT(weights, &weights[pass], w);
1044286432Sbapt}
1045286432Sbapt
1046286432Sbaptvoid
1047286432Sbaptadd_weights(int32_t *refs)
1048286432Sbapt{
1049286432Sbapt	int i;
1050286432Sbapt	for (i = 0; i < NUM_WT; i++) {
1051286432Sbapt		add_weight(refs[i], i);
1052286432Sbapt	}
1053286432Sbapt}
1054286432Sbapt
1055286432Sbaptint32_t
1056286432Sbaptget_weight(int32_t ref, int pass)
1057286432Sbapt{
1058286432Sbapt	weight_t	srch;
1059286432Sbapt	weight_t	*w;
1060286432Sbapt	int32_t		pri;
1061286432Sbapt
1062286432Sbapt	pri = resolve_pri(ref);
1063286432Sbapt	if (pri & COLLATE_SUBST_PRIORITY) {
1064286432Sbapt		return (pri);
1065286432Sbapt	}
1066286432Sbapt	if (pri <= 0) {
1067286432Sbapt		return (pri);
1068286432Sbapt	}
1069286432Sbapt	srch.pri = pri;
1070286484Sbapt	if ((w = RB_FIND(weights, &weights[pass], &srch)) == NULL) {
1071286432Sbapt		INTERR;
1072286432Sbapt		return (-1);
1073286432Sbapt	}
1074286432Sbapt	return (w->opt);
1075286432Sbapt}
1076286432Sbapt
1077286432Sbaptwchar_t *
1078286432Sbaptwsncpy(wchar_t *s1, const wchar_t *s2, size_t n)
1079286432Sbapt{
1080286432Sbapt	wchar_t *os1 = s1;
1081286432Sbapt
1082286432Sbapt	n++;
1083286432Sbapt	while (--n > 0 && (*s1++ = *s2++) != 0)
1084286432Sbapt		continue;
1085286432Sbapt	if (n > 0)
1086286432Sbapt		while (--n > 0)
1087286432Sbapt			*s1++ = 0;
1088286432Sbapt	return (os1);
1089286432Sbapt}
1090286432Sbapt
1091287142Sbapt#define RB_COUNT(x, name, head, cnt) do { \
1092287142Sbapt	(cnt) = 0; \
1093287142Sbapt	RB_FOREACH(x, name, (head)) { \
1094287142Sbapt		(cnt)++; \
1095287142Sbapt	} \
1096287142Sbapt} while (0)
1097287142Sbapt
1098286484Sbapt#define RB_NUMNODES(type, name, head, cnt) do { \
1099286484Sbapt	type *t; \
1100286484Sbapt	cnt = 0; \
1101286484Sbapt	RB_FOREACH(t, name, head) { \
1102286484Sbapt		cnt++; \
1103286484Sbapt	} \
1104287142Sbapt} while (0)
1105286484Sbapt
1106286432Sbaptvoid
1107286432Sbaptdump_collate(void)
1108286432Sbapt{
1109286432Sbapt	FILE			*f;
1110286432Sbapt	int			i, j, n;
1111286432Sbapt	size_t			sz;
1112286432Sbapt	int32_t			pri;
1113286432Sbapt	collelem_t		*ce;
1114286432Sbapt	collchar_t		*cc;
1115286432Sbapt	subst_t			*sb;
1116286432Sbapt	char			vers[COLLATE_STR_LEN];
1117286432Sbapt	collate_char_t		chars[UCHAR_MAX + 1];
1118286432Sbapt	collate_large_t		*large;
1119286432Sbapt	collate_subst_t		*subst[COLL_WEIGHTS_MAX];
1120286432Sbapt	collate_chain_t		*chain;
1121286432Sbapt
1122286432Sbapt	/*
1123298878Spfg	 * We have to run through a preliminary pass to identify all the
1124286432Sbapt	 * weights that we use for each sorting level.
1125286432Sbapt	 */
1126286432Sbapt	for (i = 0; i < NUM_WT; i++) {
1127286432Sbapt		add_weight(pri_ignore, i);
1128286432Sbapt	}
1129286432Sbapt	for (i = 0; i < NUM_WT; i++) {
1130286484Sbapt		RB_FOREACH(sb, substs, &substs[i]) {
1131286432Sbapt			for (j = 0; sb->ref[j]; j++) {
1132286432Sbapt				add_weight(sb->ref[j], i);
1133286432Sbapt			}
1134286432Sbapt		}
1135286432Sbapt	}
1136286484Sbapt	RB_FOREACH(ce, elem_by_expand, &elem_by_expand) {
1137286432Sbapt		add_weights(ce->ref);
1138286432Sbapt	}
1139286484Sbapt	RB_FOREACH(cc, collchars, &collchars) {
1140286432Sbapt		add_weights(cc->ref);
1141286432Sbapt	}
1142286432Sbapt
1143286432Sbapt	/*
1144286432Sbapt	 * Now we walk the entire set of weights, removing the gaps
1145286432Sbapt	 * in the weights.  This gives us optimum usage.  The walk
1146286432Sbapt	 * occurs in priority.
1147286432Sbapt	 */
1148286432Sbapt	for (i = 0; i < NUM_WT; i++) {
1149286432Sbapt		weight_t *w;
1150286484Sbapt		RB_FOREACH(w, weights, &weights[i]) {
1151286432Sbapt			w->opt = nweight[i];
1152286432Sbapt			nweight[i] += 1;
1153286432Sbapt		}
1154286432Sbapt	}
1155286432Sbapt
1156286432Sbapt	(void) memset(&chars, 0, sizeof (chars));
1157286432Sbapt	(void) memset(vers, 0, COLLATE_STR_LEN);
1158286432Sbapt	(void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
1159286432Sbapt
1160286432Sbapt	/*
1161286432Sbapt	 * We need to make sure we arrange for the UNDEFINED field
1162286432Sbapt	 * to show up.  Also, set the total weight counts.
1163286432Sbapt	 */
1164286432Sbapt	for (i = 0; i < NUM_WT; i++) {
1165286432Sbapt		if (resolve_pri(pri_undefined[i]) == -1) {
1166286432Sbapt			set_pri(pri_undefined[i], -1, RESOLVED);
1167286432Sbapt			/* they collate at the end of everything else */
1168286432Sbapt			collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
1169286432Sbapt		}
1170286432Sbapt		collinfo.pri_count[i] = nweight[i];
1171286432Sbapt	}
1172286432Sbapt
1173286432Sbapt	collinfo.pri_count[NUM_WT] = max_wide();
1174286432Sbapt	collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
1175286432Sbapt	collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
1176286432Sbapt
1177286432Sbapt	/*
1178286432Sbapt	 * Ordinary character priorities
1179286432Sbapt	 */
1180286432Sbapt	for (i = 0; i <= UCHAR_MAX; i++) {
1181286432Sbapt		if ((cc = get_collchar(i, 0)) != NULL) {
1182286432Sbapt			for (j = 0; j < NUM_WT; j++) {
1183286432Sbapt				chars[i].pri[j] = get_weight(cc->ref[j], j);
1184286432Sbapt			}
1185286432Sbapt		} else {
1186286432Sbapt			for (j = 0; j < NUM_WT; j++) {
1187286432Sbapt				chars[i].pri[j] =
1188286432Sbapt				    get_weight(pri_undefined[j], j);
1189286432Sbapt			}
1190286432Sbapt			/*
1191286432Sbapt			 * Per POSIX, for undefined characters, we
1192286432Sbapt			 * also have to add a last item, which is the
1193286432Sbapt			 * character code.
1194286432Sbapt			 */
1195286432Sbapt			chars[i].pri[NUM_WT] = i;
1196286432Sbapt		}
1197286432Sbapt	}
1198286432Sbapt
1199286432Sbapt	/*
1200286432Sbapt	 * Substitution tables
1201286432Sbapt	 */
1202286432Sbapt	for (i = 0; i < NUM_WT; i++) {
1203286432Sbapt		collate_subst_t *st = NULL;
1204287142Sbapt		subst_t *temp;
1205287142Sbapt		RB_COUNT(temp, substs, &substs[i], n);
1206286484Sbapt		collinfo.subst_count[i] = n;
1207306912Spfg		if ((st = calloc(n, sizeof(collate_subst_t))) == NULL) {
1208286432Sbapt			fprintf(stderr, "out of memory");
1209286432Sbapt			return;
1210286432Sbapt		}
1211286432Sbapt		n = 0;
1212286484Sbapt		RB_FOREACH(sb, substs, &substs[i]) {
1213286432Sbapt			if ((st[n].key = resolve_pri(sb->key)) < 0) {
1214286432Sbapt				/* by definition these resolve! */
1215286432Sbapt				INTERR;
1216286432Sbapt			}
1217286432Sbapt			if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
1218286432Sbapt				INTERR;
1219286432Sbapt			}
1220286432Sbapt			for (j = 0; sb->ref[j]; j++) {
1221286432Sbapt				st[n].pri[j] = get_weight(sb->ref[j], i);
1222286432Sbapt			}
1223286432Sbapt			n++;
1224286432Sbapt		}
1225286432Sbapt		if (n != collinfo.subst_count[i])
1226286432Sbapt			INTERR;
1227286432Sbapt		subst[i] = st;
1228286432Sbapt	}
1229286432Sbapt
1230286432Sbapt
1231286432Sbapt	/*
1232286432Sbapt	 * Chains, i.e. collating elements
1233286432Sbapt	 */
1234286484Sbapt	RB_NUMNODES(collelem_t, elem_by_expand, &elem_by_expand,
1235286484Sbapt	    collinfo.chain_count);
1236306912Spfg	chain = calloc(collinfo.chain_count, sizeof(collate_chain_t));
1237286432Sbapt	if (chain == NULL) {
1238286432Sbapt		fprintf(stderr, "out of memory");
1239286432Sbapt		return;
1240286432Sbapt	}
1241286484Sbapt	n = 0;
1242286484Sbapt	RB_FOREACH(ce, elem_by_expand, &elem_by_expand) {
1243286432Sbapt		(void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
1244286432Sbapt		for (i = 0; i < NUM_WT; i++) {
1245286432Sbapt			chain[n].pri[i] = get_weight(ce->ref[i], i);
1246286432Sbapt		}
1247290508Sbapt		n++;
1248286432Sbapt	}
1249286432Sbapt	if (n != collinfo.chain_count)
1250286432Sbapt		INTERR;
1251286432Sbapt
1252286432Sbapt	/*
1253286432Sbapt	 * Large (> UCHAR_MAX) character priorities
1254286432Sbapt	 */
1255286484Sbapt	RB_NUMNODES(collchar_t, collchars, &collchars, n);
1256306912Spfg	large = calloc(n, sizeof(collate_large_t));
1257286432Sbapt	if (large == NULL) {
1258286432Sbapt		fprintf(stderr, "out of memory");
1259286432Sbapt		return;
1260286432Sbapt	}
1261286432Sbapt
1262286432Sbapt	i = 0;
1263286484Sbapt	RB_FOREACH(cc, collchars, &collchars) {
1264286432Sbapt		int	undef = 0;
1265286432Sbapt		/* we already gathered those */
1266286432Sbapt		if (cc->wc <= UCHAR_MAX)
1267286432Sbapt			continue;
1268286432Sbapt		for (j = 0; j < NUM_WT; j++) {
1269286432Sbapt			if ((pri = get_weight(cc->ref[j], j)) < 0) {
1270286432Sbapt				undef = 1;
1271286432Sbapt			}
1272286432Sbapt			if (undef && (pri >= 0)) {
1273286432Sbapt				/* if undefined, then all priorities are */
1274286432Sbapt				INTERR;
1275286432Sbapt			} else {
1276286432Sbapt				large[i].pri.pri[j] = pri;
1277286432Sbapt			}
1278286432Sbapt		}
1279286432Sbapt		if (!undef) {
1280286432Sbapt			large[i].val = cc->wc;
1281286432Sbapt			collinfo.large_count = i++;
1282286432Sbapt		}
1283286432Sbapt	}
1284286432Sbapt
1285286432Sbapt	if ((f = open_category()) == NULL) {
1286286432Sbapt		return;
1287286432Sbapt	}
1288286432Sbapt
1289286432Sbapt	/* Time to write the entire data set out */
1290286432Sbapt
1291286432Sbapt	if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
1292286432Sbapt	    (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
1293286432Sbapt	    (wr_category(&chars, sizeof (chars), f) < 0)) {
1294286432Sbapt		return;
1295286432Sbapt	}
1296286432Sbapt
1297286432Sbapt	for (i = 0; i < NUM_WT; i++) {
1298286432Sbapt		sz =  sizeof (collate_subst_t) * collinfo.subst_count[i];
1299286432Sbapt		if (wr_category(subst[i], sz, f) < 0) {
1300286432Sbapt			return;
1301286432Sbapt		}
1302286432Sbapt	}
1303286432Sbapt	sz = sizeof (collate_chain_t) * collinfo.chain_count;
1304286432Sbapt	if (wr_category(chain, sz, f) < 0) {
1305286432Sbapt		return;
1306286432Sbapt	}
1307286432Sbapt	sz = sizeof (collate_large_t) * collinfo.large_count;
1308286432Sbapt	if (wr_category(large, sz, f) < 0) {
1309286432Sbapt		return;
1310286432Sbapt	}
1311286432Sbapt
1312286432Sbapt	close_category(f);
1313286432Sbapt}
1314