1/*-
2 * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
4 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
5 *		at Electronni Visti IA, Kiev, Ukraine.
6 *			All rights reserved.
7 *
8 * Copyright (c) 2011 The FreeBSD Foundation
9 * All rights reserved.
10 * Portions of this software were developed by David Chisnall
11 * under sponsorship from the FreeBSD Foundation.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * Adapted to xlocale by John Marino <draco@marino.st>
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD$");
39
40#include "namespace.h"
41
42#include <sys/types.h>
43#include <sys/stat.h>
44#include <sys/mman.h>
45
46#include <assert.h>
47#include <stdio.h>
48#include <stdlib.h>
49#include <string.h>
50#include <wchar.h>
51#include <errno.h>
52#include <unistd.h>
53#include <fcntl.h>
54#include "un-namespace.h"
55
56#include "collate.h"
57#include "setlocale.h"
58#include "ldpart.h"
59#include "libc_private.h"
60
61struct xlocale_collate __xlocale_global_collate = {
62	{{0}, "C"}, 1, 0, 0, 0
63};
64
65struct xlocale_collate __xlocale_C_collate = {
66	{{0}, "C"}, 1, 0, 0, 0
67};
68
69static int
70__collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
71
72static void
73destruct_collate(void *t)
74{
75	struct xlocale_collate *table = t;
76	if (table->map && (table->maplen > 0)) {
77		(void) munmap(table->map, table->maplen);
78	}
79	free(t);
80}
81
82void *
83__collate_load(const char *encoding, __unused locale_t unused)
84{
85	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
86		return &__xlocale_C_collate;
87	}
88	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
89	table->header.header.destructor = destruct_collate;
90	// FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
91	// the caching outside of this section
92	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
93		xlocale_release(table);
94		return NULL;
95	}
96	return table;
97}
98
99/**
100 * Load the collation tables for the specified encoding into the global table.
101 */
102int
103__collate_load_tables(const char *encoding)
104{
105
106	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
107}
108
109int
110__collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
111{
112	int i, chains, z;
113	char *buf;
114	char *TMP;
115	char *map;
116	collate_info_t *info;
117	struct stat sbuf;
118	int fd;
119
120	table->__collate_load_error = 1;
121
122	/* 'encoding' must be already checked. */
123	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
124		return (_LDP_CACHE);
125	}
126
127	asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding);
128	if (buf == NULL)
129		return (_LDP_ERROR);
130
131	if ((fd = _open(buf, O_RDONLY)) < 0) {
132		free(buf);
133		return (_LDP_ERROR);
134	}
135	free(buf);
136	if (_fstat(fd, &sbuf) < 0) {
137		(void) _close(fd);
138		return (_LDP_ERROR);
139	}
140	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
141		(void) _close(fd);
142		errno = EINVAL;
143		return (_LDP_ERROR);
144	}
145	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
146	(void) _close(fd);
147	if ((TMP = map) == NULL) {
148		return (_LDP_ERROR);
149	}
150
151	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
152		(void) munmap(map, sbuf.st_size);
153		errno = EINVAL;
154		return (_LDP_ERROR);
155	}
156	TMP += COLLATE_STR_LEN;
157
158	info = (void *)TMP;
159	TMP += sizeof (*info);
160
161	if ((info->directive_count < 1) ||
162	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
163	    ((chains = info->chain_count) < 0)) {
164		(void) munmap(map, sbuf.st_size);
165		errno = EINVAL;
166		return (_LDP_ERROR);
167	}
168
169	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
170	    (sizeof (collate_chain_t) * chains) +
171	    (sizeof (collate_large_t) * info->large_count);
172	for (z = 0; z < info->directive_count; z++) {
173		i += sizeof (collate_subst_t) * info->subst_count[z];
174	}
175	if (i != (sbuf.st_size - (TMP - map))) {
176		(void) munmap(map, sbuf.st_size);
177		errno = EINVAL;
178		return (_LDP_ERROR);
179	}
180
181	table->info = info;
182	table->char_pri_table = (void *)TMP;
183	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
184
185	for (z = 0; z < info->directive_count; z++) {
186		if (info->subst_count[z] > 0) {
187			table->subst_table[z] = (void *)TMP;
188			TMP += info->subst_count[z] * sizeof (collate_subst_t);
189		} else {
190			table->subst_table[z] = NULL;
191		}
192	}
193
194	if (chains > 0) {
195		table->chain_pri_table = (void *)TMP;
196		TMP += chains * sizeof (collate_chain_t);
197	} else
198		table->chain_pri_table = NULL;
199	if (info->large_count > 0)
200		table->large_pri_table = (void *)TMP;
201	else
202		table->large_pri_table = NULL;
203
204	table->__collate_load_error = 0;
205	return (_LDP_LOADED);
206}
207
208static const int32_t *
209substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
210{
211	const collate_subst_t *p;
212	int n = table->info->subst_count[pass];
213
214	if (n == 0)
215		return (NULL);
216
217	if (pass >= table->info->directive_count)
218		return (NULL);
219
220	if (!(key & COLLATE_SUBST_PRIORITY))
221		return (NULL);
222
223	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
224	assert(p->key == key);
225	return (p->pri);
226}
227
228static collate_chain_t *
229chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
230{
231	int low = 0;
232	int high = table->info->chain_count - 1;;
233	int next, compar, l;
234	collate_chain_t *p;
235	collate_chain_t *tab = table->chain_pri_table;
236
237	if (high < 0)
238		return (NULL);
239
240	while (low <= high) {
241		next = (low + high) / 2;
242		p = tab + next;
243		compar = *key - *p->str;
244		if (compar == 0) {
245			l = wcsnlen(p->str, COLLATE_STR_LEN);
246			compar = wcsncmp(key, p->str, l);
247			if (compar == 0) {
248				*len = l;
249				return (p);
250			}
251		}
252		if (compar > 0)
253			low = next + 1;
254		else
255			high = next - 1;
256	}
257	return (NULL);
258}
259
260static collate_large_t *
261largesearch(struct xlocale_collate *table, const wchar_t key)
262{
263	int low = 0;
264	int high = table->info->large_count - 1;
265	int next, compar;
266	collate_large_t *p;
267	collate_large_t *tab = table->large_pri_table;
268
269	if (high < 0)
270		return (NULL);
271
272	while (low <= high) {
273		next = (low + high) / 2;
274		p = tab + next;
275		compar = key - p->val;
276		if (compar == 0)
277			return (p);
278		if (compar > 0)
279			low = next + 1;
280		else
281			high = next - 1;
282	}
283	return (NULL);
284}
285
286void
287_collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
288    int *pri, int which, const int **state)
289{
290	collate_chain_t *p2;
291	collate_large_t *match;
292	int p, l;
293	const int *sptr;
294
295	/*
296	 * If this is the "last" pass for the UNDEFINED, then
297	 * we just return the priority itself.
298	 */
299	if (which >= table->info->directive_count) {
300		*pri = *t;
301		*len = 1;
302		*state = NULL;
303		return;
304	}
305
306	/*
307	 * If we have remaining substitution data from a previous
308	 * call, consume it first.
309	 */
310	if ((sptr = *state) != NULL) {
311		*pri = *sptr;
312		sptr++;
313		if ((sptr == *state) || (sptr == NULL))
314			*state = NULL;
315		else
316			*state = sptr;
317		*len = 0;
318		return;
319	}
320
321	/* No active substitutions */
322	*len = 1;
323
324	/*
325	 * Check for composites such as diphthongs that collate as a
326	 * single element (aka chains or collating-elements).
327	 */
328	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
329	    ((p = p2->pri[which]) >= 0)) {
330
331		*len = l;
332		*pri = p;
333
334	} else if (*t <= UCHAR_MAX) {
335
336		/*
337		 * Character is a small (8-bit) character.
338		 * We just look these up directly for speed.
339		 */
340		*pri = table->char_pri_table[*t].pri[which];
341
342	} else if ((table->info->large_count > 0) &&
343	    ((match = largesearch(table, *t)) != NULL)) {
344
345		/*
346		 * Character was found in the extended table.
347		 */
348		*pri = match->pri.pri[which];
349
350	} else {
351		/*
352		 * Character lacks a specific definition.
353		 */
354		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
355			/* Mask off sign bit to prevent ordering confusion. */
356			*pri = (*t & COLLATE_MAX_PRIORITY);
357		} else {
358			*pri = table->info->undef_pri[which];
359		}
360		/* No substitutions for undefined characters! */
361		return;
362	}
363
364	/*
365	 * Try substituting (expanding) the character.  We are
366	 * currently doing this *after* the chain compression.  I
367	 * think it should not matter, but this way might be slightly
368	 * faster.
369	 *
370	 * We do this after the priority search, as this will help us
371	 * to identify a single key value.  In order for this to work,
372	 * its important that the priority assigned to a given element
373	 * to be substituted be unique for that level.  The localedef
374	 * code ensures this for us.
375	 */
376	if ((sptr = substsearch(table, *pri, which)) != NULL) {
377		if ((*pri = *sptr) > 0) {
378			sptr++;
379			*state = *sptr ? sptr : NULL;
380		}
381	}
382
383}
384
385/*
386 * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
387 * NOT NULL terminate.  That is left to the caller.
388 */
389size_t
390_collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
391    size_t room)
392{
393	int		pri;
394	int		len;
395	const wchar_t	*t;
396	wchar_t		*tr = NULL;
397	int		direc;
398	int		pass;
399	const int32_t 	*state;
400	size_t		want = 0;
401	size_t		need = 0;
402	int		ndir = table->info->directive_count;
403
404	assert(src);
405
406	for (pass = 0; pass <= ndir; pass++) {
407
408		state = NULL;
409
410		if (pass != 0) {
411			/* insert level separator from the previous pass */
412			if (room) {
413				*xf++ = 1;
414				room--;
415			}
416			want++;
417		}
418
419		/* special pass for undefined */
420		if (pass == ndir) {
421			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
422		} else {
423			direc = table->info->directive[pass];
424		}
425
426		t = src;
427
428		if (direc & DIRECTIVE_BACKWARD) {
429			wchar_t *bp, *fp, c;
430			free(tr);
431			if ((tr = wcsdup(t)) == NULL) {
432				errno = ENOMEM;
433				goto fail;
434			}
435			bp = tr;
436			fp = tr + wcslen(tr) - 1;
437			while (bp < fp) {
438				c = *bp;
439				*bp++ = *fp;
440				*fp-- = c;
441			}
442			t = (const wchar_t *)tr;
443		}
444
445		if (direc & DIRECTIVE_POSITION) {
446			while (*t || state) {
447				_collate_lookup(table, t, &len, &pri, pass, &state);
448				t += len;
449				if (pri <= 0) {
450					if (pri < 0) {
451						errno = EINVAL;
452						goto fail;
453					}
454					state = NULL;
455					pri = COLLATE_MAX_PRIORITY;
456				}
457				if (room) {
458					*xf++ = pri;
459					room--;
460				}
461				want++;
462				need = want;
463			}
464		} else {
465			while (*t || state) {
466				_collate_lookup(table, t, &len, &pri, pass, &state);
467				t += len;
468				if (pri <= 0) {
469					if (pri < 0) {
470						errno = EINVAL;
471						goto fail;
472					}
473					state = NULL;
474					continue;
475				}
476				if (room) {
477					*xf++ = pri;
478					room--;
479				}
480				want++;
481				need = want;
482			}
483		}
484	}
485	free(tr);
486	return (need);
487
488fail:
489	free(tr);
490	return ((size_t)(-1));
491}
492
493/*
494 * In the non-POSIX case, we transform each character into a string of
495 * characters representing the character's priority.  Since char is usually
496 * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
497 * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
498 * bits per byte.
499 *
500 * It turns out that we sometimes have real priorities that are
501 * 31-bits wide.  (But: be careful using priorities where the high
502 * order bit is set -- i.e. the priority is negative.  The sort order
503 * may be surprising!)
504 *
505 * TODO: This would be a good area to optimize somewhat.  It turns out
506 * that real prioririties *except for the last UNDEFINED pass* are generally
507 * very small.  We need the localedef code to precalculate the max
508 * priority for us, and ideally also give us a mask, and then we could
509 * severely limit what we expand to.
510 */
511#define	XFRM_BYTES	6
512#define	XFRM_OFFSET	('0')	/* make all printable characters */
513#define	XFRM_SHIFT	6
514#define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
515#define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
516
517static int
518xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
519{
520	/* we use unsigned to ensure zero fill on right shift */
521	uint32_t val = (uint32_t)table->info->pri_count[pass];
522	int nc = 0;
523
524	while (val) {
525		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
526		pri >>= XFRM_SHIFT;
527		val >>= XFRM_SHIFT;
528		p++;
529		nc++;
530	}
531	return (nc);
532}
533
534size_t
535_collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
536    size_t room)
537{
538	int		pri;
539	int		len;
540	const wchar_t	*t;
541	wchar_t		*tr = NULL;
542	int		direc;
543	int		pass;
544	const int32_t 	*state;
545	size_t		want = 0;
546	size_t		need = 0;
547	int		b;
548	uint8_t		buf[XFRM_BYTES];
549	int		ndir = table->info->directive_count;
550
551	assert(src);
552
553	for (pass = 0; pass <= ndir; pass++) {
554
555		state = NULL;
556
557		if (pass != 0) {
558			/* insert level separator from the previous pass */
559			if (room) {
560				*xf++ = XFRM_SEP;
561				room--;
562			}
563			want++;
564		}
565
566		/* special pass for undefined */
567		if (pass == ndir) {
568			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
569		} else {
570			direc = table->info->directive[pass];
571		}
572
573		t = src;
574
575		if (direc & DIRECTIVE_BACKWARD) {
576			wchar_t *bp, *fp, c;
577			free(tr);
578			if ((tr = wcsdup(t)) == NULL) {
579				errno = ENOMEM;
580				goto fail;
581			}
582			bp = tr;
583			fp = tr + wcslen(tr) - 1;
584			while (bp < fp) {
585				c = *bp;
586				*bp++ = *fp;
587				*fp-- = c;
588			}
589			t = (const wchar_t *)tr;
590		}
591
592		if (direc & DIRECTIVE_POSITION) {
593			while (*t || state) {
594
595				_collate_lookup(table, t, &len, &pri, pass, &state);
596				t += len;
597				if (pri <= 0) {
598					if (pri < 0) {
599						errno = EINVAL;
600						goto fail;
601					}
602					state = NULL;
603					pri = COLLATE_MAX_PRIORITY;
604				}
605
606				b = xfrm(table, buf, pri, pass);
607				want += b;
608				if (room) {
609					while (b) {
610						b--;
611						if (room) {
612							*xf++ = buf[b];
613							room--;
614						}
615					}
616				}
617				need = want;
618			}
619		} else {
620			while (*t || state) {
621				_collate_lookup(table, t, &len, &pri, pass, &state);
622				t += len;
623				if (pri <= 0) {
624					if (pri < 0) {
625						errno = EINVAL;
626						goto fail;
627					}
628					state = NULL;
629					continue;
630				}
631
632				b = xfrm(table, buf, pri, pass);
633				want += b;
634				if (room) {
635
636					while (b) {
637						b--;
638						if (room) {
639							*xf++ = buf[b];
640							room--;
641						}
642					}
643				}
644				need = want;
645			}
646		}
647	}
648	free(tr);
649	return (need);
650
651fail:
652	free(tr);
653	return ((size_t)(-1));
654}
655
656/*
657 * __collate_equiv_value returns the primary collation value for the given
658 * collating symbol specified by str and len.  Zero or negative is returned
659 * if the collating symbol was not found.  This function is used by bracket
660 * code in the TRE regex library.
661 */
662int
663__collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
664{
665	int32_t e;
666
667	if (len < 1 || len >= COLLATE_STR_LEN)
668		return (-1);
669
670	FIX_LOCALE(locale);
671	struct xlocale_collate *table =
672		(struct xlocale_collate*)locale->components[XLC_COLLATE];
673
674	if (table->__collate_load_error)
675		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
676
677	if (len == 1) {
678		e = -1;
679		if (*str <= UCHAR_MAX)
680			e = table->char_pri_table[*str].pri[0];
681		else if (table->info->large_count > 0) {
682			collate_large_t *match_large;
683			match_large = largesearch(table, *str);
684			if (match_large)
685				e = match_large->pri.pri[0];
686		}
687		if (e == 0)
688			return (1);
689		return (e > 0 ? e : 0);
690	}
691	if (table->info->chain_count > 0) {
692		wchar_t name[COLLATE_STR_LEN];
693		collate_chain_t *match_chain;
694		int clen;
695
696		wcsncpy (name, str, len);
697		name[len] = 0;
698		match_chain = chainsearch(table, name, &clen);
699		if (match_chain) {
700			e = match_chain->pri[0];
701			if (e == 0)
702				return (1);
703			return (e < 0 ? -e : e);
704		}
705	}
706	return (0);
707}
708