1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright 2014 Garrett D'Amore <garrett@damore.org>
5 * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
6 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
7 *		at Electronni Visti IA, Kiev, Ukraine.
8 *			All rights reserved.
9 *
10 * Copyright (c) 2011 The FreeBSD Foundation
11 * All rights reserved.
12 * Portions of this software were developed by David Chisnall
13 * under sponsorship from the FreeBSD Foundation.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 *    notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 *    notice, this list of conditions and the following disclaimer in the
22 *    documentation and/or other materials provided with the distribution.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * Adapted to xlocale by John Marino <draco@marino.st>
37 */
38
39#include <sys/cdefs.h>
40__FBSDID("$FreeBSD$");
41
42#include "namespace.h"
43
44#include <sys/types.h>
45#include <sys/stat.h>
46#include <sys/mman.h>
47
48#include <assert.h>
49#include <stdio.h>
50#include <stdlib.h>
51#include <string.h>
52#include <wchar.h>
53#include <errno.h>
54#include <unistd.h>
55#include <fcntl.h>
56#include "un-namespace.h"
57
58#include "collate.h"
59#include "setlocale.h"
60#include "ldpart.h"
61#include "libc_private.h"
62
63struct xlocale_collate __xlocale_global_collate = {
64	{{0}, "C"}, 1, 0, 0, 0
65};
66
67struct xlocale_collate __xlocale_C_collate = {
68	{{0}, "C"}, 1, 0, 0, 0
69};
70
71static int
72__collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
73
74static void
75destruct_collate(void *t)
76{
77	struct xlocale_collate *table = t;
78	if (table->map && (table->maplen > 0)) {
79		(void) munmap(table->map, table->maplen);
80	}
81	free(t);
82}
83
84void *
85__collate_load(const char *encoding, __unused locale_t unused)
86{
87	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 ||
88	    strncmp(encoding, "C.", 2) == 0) {
89		return &__xlocale_C_collate;
90	}
91	struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
92	table->header.header.destructor = destruct_collate;
93	// FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
94	// the caching outside of this section
95	if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
96		xlocale_release(table);
97		return NULL;
98	}
99	return table;
100}
101
102/**
103 * Load the collation tables for the specified encoding into the global table.
104 */
105int
106__collate_load_tables(const char *encoding)
107{
108
109	return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
110}
111
112static int
113__collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
114{
115	int i, chains, z;
116	char *buf;
117	char *TMP;
118	char *map;
119	collate_info_t *info;
120	struct stat sbuf;
121	int fd;
122
123	table->__collate_load_error = 1;
124
125	/* 'encoding' must be already checked. */
126	if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0 ||
127	    strncmp(encoding, "C.", 2) == 0) {
128		return (_LDP_CACHE);
129	}
130
131	if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1)
132		return (_LDP_ERROR);
133
134	if ((fd = _open(buf, O_RDONLY)) < 0) {
135		free(buf);
136		return (_LDP_ERROR);
137	}
138	free(buf);
139	if (_fstat(fd, &sbuf) < 0) {
140		(void) _close(fd);
141		return (_LDP_ERROR);
142	}
143	if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
144		(void) _close(fd);
145		errno = EINVAL;
146		return (_LDP_ERROR);
147	}
148	map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
149	(void) _close(fd);
150	if ((TMP = map) == MAP_FAILED) {
151		return (_LDP_ERROR);
152	}
153
154	if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
155		(void) munmap(map, sbuf.st_size);
156		errno = EINVAL;
157		return (_LDP_ERROR);
158	}
159	TMP += COLLATE_STR_LEN;
160
161	info = (void *)TMP;
162	TMP += sizeof (*info);
163
164	if ((info->directive_count < 1) ||
165	    (info->directive_count >= COLL_WEIGHTS_MAX) ||
166	    ((chains = info->chain_count) < 0)) {
167		(void) munmap(map, sbuf.st_size);
168		errno = EINVAL;
169		return (_LDP_ERROR);
170	}
171
172	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
173	    (sizeof (collate_chain_t) * chains) +
174	    (sizeof (collate_large_t) * info->large_count);
175	for (z = 0; z < info->directive_count; z++) {
176		i += sizeof (collate_subst_t) * info->subst_count[z];
177	}
178	if (i != (sbuf.st_size - (TMP - map))) {
179		(void) munmap(map, sbuf.st_size);
180		errno = EINVAL;
181		return (_LDP_ERROR);
182	}
183
184	if (table->map && (table->maplen > 0)) {
185		(void) munmap(table->map, table->maplen);
186	}
187	table->map = map;
188	table->maplen = sbuf.st_size;
189	table->info = info;
190	table->char_pri_table = (void *)TMP;
191	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
192
193	for (z = 0; z < info->directive_count; z++) {
194		if (info->subst_count[z] > 0) {
195			table->subst_table[z] = (void *)TMP;
196			TMP += info->subst_count[z] * sizeof (collate_subst_t);
197		} else {
198			table->subst_table[z] = NULL;
199		}
200	}
201
202	if (chains > 0) {
203		table->chain_pri_table = (void *)TMP;
204		TMP += chains * sizeof (collate_chain_t);
205	} else
206		table->chain_pri_table = NULL;
207	if (info->large_count > 0)
208		table->large_pri_table = (void *)TMP;
209	else
210		table->large_pri_table = NULL;
211
212	table->__collate_load_error = 0;
213	return (_LDP_LOADED);
214}
215
216static const int32_t *
217substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
218{
219	const collate_subst_t *p;
220	int n = table->info->subst_count[pass];
221
222	if (n == 0)
223		return (NULL);
224
225	if (pass >= table->info->directive_count)
226		return (NULL);
227
228	if (!(key & COLLATE_SUBST_PRIORITY))
229		return (NULL);
230
231	p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
232	assert(p->key == key);
233
234	return (p->pri);
235}
236
237static collate_chain_t *
238chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
239{
240	int low = 0;
241	int high = table->info->chain_count - 1;
242	int next, compar, l;
243	collate_chain_t *p;
244	collate_chain_t *tab = table->chain_pri_table;
245
246	if (high < 0)
247		return (NULL);
248
249	while (low <= high) {
250		next = (low + high) / 2;
251		p = tab + next;
252		compar = *key - *p->str;
253		if (compar == 0) {
254			l = wcsnlen(p->str, COLLATE_STR_LEN);
255			compar = wcsncmp(key, p->str, l);
256			if (compar == 0) {
257				*len = l;
258				return (p);
259			}
260		}
261		if (compar > 0)
262			low = next + 1;
263		else
264			high = next - 1;
265	}
266	return (NULL);
267}
268
269static collate_large_t *
270largesearch(struct xlocale_collate *table, const wchar_t key)
271{
272	int low = 0;
273	int high = table->info->large_count - 1;
274	int next, compar;
275	collate_large_t *p;
276	collate_large_t *tab = table->large_pri_table;
277
278	if (high < 0)
279		return (NULL);
280
281	while (low <= high) {
282		next = (low + high) / 2;
283		p = tab + next;
284		compar = key - p->val;
285		if (compar == 0)
286			return (p);
287		if (compar > 0)
288			low = next + 1;
289		else
290			high = next - 1;
291	}
292	return (NULL);
293}
294
295void
296_collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
297    int *pri, int which, const int **state)
298{
299	collate_chain_t *p2;
300	collate_large_t *match;
301	int p, l;
302	const int *sptr;
303
304	/*
305	 * If this is the "last" pass for the UNDEFINED, then
306	 * we just return the priority itself.
307	 */
308	if (which >= table->info->directive_count) {
309		*pri = *t;
310		*len = 1;
311		*state = NULL;
312		return;
313	}
314
315	/*
316	 * If we have remaining substitution data from a previous
317	 * call, consume it first.
318	 */
319	if ((sptr = *state) != NULL) {
320		*pri = *sptr;
321		sptr++;
322		if ((sptr == *state) || (sptr == NULL))
323			*state = NULL;
324		else
325			*state = sptr;
326		*len = 0;
327		return;
328	}
329
330	/* No active substitutions */
331	*len = 1;
332
333	/*
334	 * Check for composites such as diphthongs that collate as a
335	 * single element (aka chains or collating-elements).
336	 */
337	if (((p2 = chainsearch(table, t, &l)) != NULL) &&
338	    ((p = p2->pri[which]) >= 0)) {
339
340		*len = l;
341		*pri = p;
342
343	} else if (*t <= UCHAR_MAX) {
344
345		/*
346		 * Character is a small (8-bit) character.
347		 * We just look these up directly for speed.
348		 */
349		*pri = table->char_pri_table[*t].pri[which];
350
351	} else if ((table->info->large_count > 0) &&
352	    ((match = largesearch(table, *t)) != NULL)) {
353
354		/*
355		 * Character was found in the extended table.
356		 */
357		*pri = match->pri.pri[which];
358
359	} else {
360		/*
361		 * Character lacks a specific definition.
362		 */
363		if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
364			/* Mask off sign bit to prevent ordering confusion. */
365			*pri = (*t & COLLATE_MAX_PRIORITY);
366		} else {
367			*pri = table->info->undef_pri[which];
368		}
369		/* No substitutions for undefined characters! */
370		return;
371	}
372
373	/*
374	 * Try substituting (expanding) the character.  We are
375	 * currently doing this *after* the chain compression.  I
376	 * think it should not matter, but this way might be slightly
377	 * faster.
378	 *
379	 * We do this after the priority search, as this will help us
380	 * to identify a single key value.  In order for this to work,
381	 * its important that the priority assigned to a given element
382	 * to be substituted be unique for that level.  The localedef
383	 * code ensures this for us.
384	 */
385	if ((sptr = substsearch(table, *pri, which)) != NULL) {
386		if ((*pri = *sptr) > 0) {
387			sptr++;
388			*state = *sptr ? sptr : NULL;
389		}
390	}
391
392}
393
394/*
395 * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
396 * NOT NULL terminate.  That is left to the caller.
397 */
398size_t
399_collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
400    size_t room)
401{
402	int		pri;
403	int		len;
404	const wchar_t	*t;
405	wchar_t		*tr = NULL;
406	int		direc;
407	int		pass;
408	const int32_t 	*state;
409	size_t		want = 0;
410	size_t		need = 0;
411	int		ndir = table->info->directive_count;
412
413	assert(src);
414
415	for (pass = 0; pass <= ndir; pass++) {
416
417		state = NULL;
418
419		if (pass != 0) {
420			/* insert level separator from the previous pass */
421			if (room) {
422				*xf++ = 1;
423				room--;
424			}
425			want++;
426		}
427
428		/* special pass for undefined */
429		if (pass == ndir) {
430			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
431		} else {
432			direc = table->info->directive[pass];
433		}
434
435		t = src;
436
437		if (direc & DIRECTIVE_BACKWARD) {
438			wchar_t *bp, *fp, c;
439			free(tr);
440			if ((tr = wcsdup(t)) == NULL) {
441				errno = ENOMEM;
442				goto fail;
443			}
444			bp = tr;
445			fp = tr + wcslen(tr) - 1;
446			while (bp < fp) {
447				c = *bp;
448				*bp++ = *fp;
449				*fp-- = c;
450			}
451			t = (const wchar_t *)tr;
452		}
453
454		if (direc & DIRECTIVE_POSITION) {
455			while (*t || state) {
456				_collate_lookup(table, t, &len, &pri, pass, &state);
457				t += len;
458				if (pri <= 0) {
459					if (pri < 0) {
460						errno = EINVAL;
461						goto fail;
462					}
463					state = NULL;
464					pri = COLLATE_MAX_PRIORITY;
465				}
466				if (room) {
467					*xf++ = pri;
468					room--;
469				}
470				want++;
471				need = want;
472			}
473		} else {
474			while (*t || state) {
475				_collate_lookup(table, t, &len, &pri, pass, &state);
476				t += len;
477				if (pri <= 0) {
478					if (pri < 0) {
479						errno = EINVAL;
480						goto fail;
481					}
482					state = NULL;
483					continue;
484				}
485				if (room) {
486					*xf++ = pri;
487					room--;
488				}
489				want++;
490				need = want;
491			}
492		}
493	}
494	free(tr);
495	return (need);
496
497fail:
498	free(tr);
499	return ((size_t)(-1));
500}
501
502/*
503 * In the non-POSIX case, we transform each character into a string of
504 * characters representing the character's priority.  Since char is usually
505 * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
506 * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
507 * bits per byte.
508 *
509 * It turns out that we sometimes have real priorities that are
510 * 31-bits wide.  (But: be careful using priorities where the high
511 * order bit is set -- i.e. the priority is negative.  The sort order
512 * may be surprising!)
513 *
514 * TODO: This would be a good area to optimize somewhat.  It turns out
515 * that real prioririties *except for the last UNDEFINED pass* are generally
516 * very small.  We need the localedef code to precalculate the max
517 * priority for us, and ideally also give us a mask, and then we could
518 * severely limit what we expand to.
519 */
520#define	XFRM_BYTES	6
521#define	XFRM_OFFSET	('0')	/* make all printable characters */
522#define	XFRM_SHIFT	6
523#define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
524#define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
525
526static int
527xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
528{
529	/* we use unsigned to ensure zero fill on right shift */
530	uint32_t val = (uint32_t)table->info->pri_count[pass];
531	int nc = 0;
532
533	while (val) {
534		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
535		pri >>= XFRM_SHIFT;
536		val >>= XFRM_SHIFT;
537		p++;
538		nc++;
539	}
540	return (nc);
541}
542
543size_t
544_collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
545    size_t room)
546{
547	int		pri;
548	int		len;
549	const wchar_t	*t;
550	wchar_t		*tr = NULL;
551	int		direc;
552	int		pass;
553	const int32_t 	*state;
554	size_t		want = 0;
555	size_t		need = 0;
556	int		b;
557	uint8_t		buf[XFRM_BYTES];
558	int		ndir = table->info->directive_count;
559
560	assert(src);
561
562	for (pass = 0; pass <= ndir; pass++) {
563
564		state = NULL;
565
566		if (pass != 0) {
567			/* insert level separator from the previous pass */
568			if (room) {
569				*xf++ = XFRM_SEP;
570				room--;
571			}
572			want++;
573		}
574
575		/* special pass for undefined */
576		if (pass == ndir) {
577			direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
578		} else {
579			direc = table->info->directive[pass];
580		}
581
582		t = src;
583
584		if (direc & DIRECTIVE_BACKWARD) {
585			wchar_t *bp, *fp, c;
586			free(tr);
587			if ((tr = wcsdup(t)) == NULL) {
588				errno = ENOMEM;
589				goto fail;
590			}
591			bp = tr;
592			fp = tr + wcslen(tr) - 1;
593			while (bp < fp) {
594				c = *bp;
595				*bp++ = *fp;
596				*fp-- = c;
597			}
598			t = (const wchar_t *)tr;
599		}
600
601		if (direc & DIRECTIVE_POSITION) {
602			while (*t || state) {
603
604				_collate_lookup(table, t, &len, &pri, pass, &state);
605				t += len;
606				if (pri <= 0) {
607					if (pri < 0) {
608						errno = EINVAL;
609						goto fail;
610					}
611					state = NULL;
612					pri = COLLATE_MAX_PRIORITY;
613				}
614
615				b = xfrm(table, buf, pri, pass);
616				want += b;
617				if (room) {
618					while (b) {
619						b--;
620						if (room) {
621							*xf++ = buf[b];
622							room--;
623						}
624					}
625				}
626				need = want;
627			}
628		} else {
629			while (*t || state) {
630				_collate_lookup(table, t, &len, &pri, pass, &state);
631				t += len;
632				if (pri <= 0) {
633					if (pri < 0) {
634						errno = EINVAL;
635						goto fail;
636					}
637					state = NULL;
638					continue;
639				}
640
641				b = xfrm(table, buf, pri, pass);
642				want += b;
643				if (room) {
644
645					while (b) {
646						b--;
647						if (room) {
648							*xf++ = buf[b];
649							room--;
650						}
651					}
652				}
653				need = want;
654			}
655		}
656	}
657	free(tr);
658	return (need);
659
660fail:
661	free(tr);
662	return ((size_t)(-1));
663}
664
665/*
666 * __collate_equiv_value returns the primary collation value for the given
667 * collating symbol specified by str and len.  Zero or negative is returned
668 * if the collating symbol was not found.  This function is used by bracket
669 * code in the TRE regex library.
670 */
671int
672__collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
673{
674	int32_t e;
675
676	if (len < 1 || len >= COLLATE_STR_LEN)
677		return (-1);
678
679	FIX_LOCALE(locale);
680	struct xlocale_collate *table =
681		(struct xlocale_collate*)locale->components[XLC_COLLATE];
682
683	if (table->__collate_load_error)
684		return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
685
686	if (len == 1) {
687		e = -1;
688		if (*str <= UCHAR_MAX)
689			e = table->char_pri_table[*str].pri[0];
690		else if (table->info->large_count > 0) {
691			collate_large_t *match_large;
692			match_large = largesearch(table, *str);
693			if (match_large)
694				e = match_large->pri.pri[0];
695		}
696		if (e == 0)
697			return (1);
698		return (e > 0 ? e : 0);
699	}
700	if (table->info->chain_count > 0) {
701		wchar_t name[COLLATE_STR_LEN];
702		collate_chain_t *match_chain;
703		int clen;
704
705		wcsncpy (name, str, len);
706		name[len] = 0;
707		match_chain = chainsearch(table, name, &clen);
708		if (match_chain) {
709			e = match_chain->pri[0];
710			if (e == 0)
711				return (1);
712			return (e < 0 ? -e : e);
713		}
714	}
715	return (0);
716}
717