ctype.c revision 286484
1/*
2 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
3 * Copyright 2012 Garrett D'Amore <garrett@damore.org>  All rights reserved.
4 * Copyright 2015 John Marino <draco@marino.st>
5 *
6 * This source code is derived from the illumos localedef command, and
7 * provided under BSD-style license terms by Nexenta Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*
33 * LC_CTYPE database generation routines for localedef.
34 */
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: projects/collation/usr.bin/localedef/ctype.c 286484 2015-08-08 22:57:17Z bapt $");
37
38#include <sys/tree.h>
39
40#include <stdio.h>
41#include <stdlib.h>
42#include <stddef.h>
43#include <string.h>
44#include <sys/types.h>
45#include <wchar.h>
46#include <ctype.h>
47#include <wctype.h>
48#include <unistd.h>
49#include "localedef.h"
50#include "parser.h"
51#include "runefile.h"
52
53
/*
 * Local shorthand for the libc character-class bits used while building
 * the LC_CTYPE tables.
 */

/* Case and alphabetic classes. */
#define	_ISUPPER	_CTYPE_U
#define	_ISLOWER	_CTYPE_L
#define	_ISALPHA	_CTYPE_A

/* Numeric classes. */
#define	_ISDIGIT	_CTYPE_D
#define	_ISXDIGIT	_CTYPE_X

/* Whitespace classes. */
#define	_ISSPACE	_CTYPE_S
#define	_ISBLANK	_CTYPE_B

/* Remaining standard classes. */
#define	_ISPUNCT	_CTYPE_P
#define	_ISGRAPH	_CTYPE_G
#define	_ISPRINT	_CTYPE_R
#define	_ISCNTRL	_CTYPE_C

/*
 * Extension classes, applied by add_ctype_impl() for the phonogram (_E1),
 * ideogram (_E2), english (_E3), number (_E4) and special (_E5) keywords.
 * _E3 and _E4 are 0, so those two keywords contribute no bit of their own.
 */
#define	_E1		_CTYPE_Q
#define	_E2		_CTYPE_I
#define	_E3		0
#define	_E4		0
#define	_E5		_CTYPE_T
70
71static wchar_t		last_ctype;
72static int ctype_compare(const void *n1, const void *n2);
73
74typedef struct ctype_node {
75	wchar_t wc;
76	int32_t	ctype;
77	int32_t	toupper;
78	int32_t	tolower;
79	RB_ENTRY(ctype_node) entry;
80} ctype_node_t;
81
82static RB_HEAD(ctypes, ctype_node) ctypes;
83RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare);
84
85static int
86ctype_compare(const void *n1, const void *n2)
87{
88	const ctype_node_t *c1 = n1;
89	const ctype_node_t *c2 = n2;
90
91	return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
92}
93
94void
95init_ctype(void)
96{
97	RB_INIT(&ctypes);
98}
99
100
101static void
102add_ctype_impl(ctype_node_t *ctn)
103{
104	switch (last_kw) {
105	case T_ISUPPER:
106		ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
107		break;
108	case T_ISLOWER:
109		ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
110		break;
111	case T_ISALPHA:
112		ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
113		break;
114	case T_ISDIGIT:
115		ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
116		break;
117	case T_ISSPACE:
118		ctn->ctype |= _ISSPACE;
119		break;
120	case T_ISCNTRL:
121		ctn->ctype |= _ISCNTRL;
122		break;
123	case T_ISGRAPH:
124		ctn->ctype |= (_ISGRAPH | _ISPRINT);
125		break;
126	case T_ISPRINT:
127		ctn->ctype |= _ISPRINT;
128		break;
129	case T_ISPUNCT:
130		ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
131		break;
132	case T_ISXDIGIT:
133		ctn->ctype |= (_ISXDIGIT | _ISPRINT);
134		break;
135	case T_ISBLANK:
136		ctn->ctype |= (_ISBLANK | _ISSPACE);
137		break;
138	case T_ISPHONOGRAM:
139		ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
140		break;
141	case T_ISIDEOGRAM:
142		ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
143		break;
144	case T_ISENGLISH:
145		ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
146		break;
147	case T_ISNUMBER:
148		ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
149		break;
150	case T_ISSPECIAL:
151		ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
152		break;
153	case T_ISALNUM:
154		/*
155		 * We can't do anything with this.  The character
156		 * should already be specified as a digit or alpha.
157		 */
158		break;
159	default:
160		errf("not a valid character class");
161	}
162}
163
164static ctype_node_t *
165get_ctype(wchar_t wc)
166{
167	ctype_node_t	srch;
168	ctype_node_t	*ctn;
169
170	srch.wc = wc;
171	if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) {
172		if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
173			errf("out of memory");
174			return (NULL);
175		}
176		ctn->wc = wc;
177
178		RB_INSERT(ctypes, &ctypes, ctn);
179	}
180	return (ctn);
181}
182
183void
184add_ctype(int val)
185{
186	ctype_node_t	*ctn;
187
188	if ((ctn = get_ctype(val)) == NULL) {
189		INTERR;
190		return;
191	}
192	add_ctype_impl(ctn);
193	last_ctype = ctn->wc;
194}
195
196void
197add_ctype_range(int end)
198{
199	ctype_node_t	*ctn;
200	wchar_t		cur;
201
202	if (end < last_ctype) {
203		errf("malformed character range (%u ... %u))",
204		    last_ctype, end);
205		return;
206	}
207	for (cur = last_ctype + 1; cur <= end; cur++) {
208		if ((ctn = get_ctype(cur)) == NULL) {
209			INTERR;
210			return;
211		}
212		add_ctype_impl(ctn);
213	}
214	last_ctype = end;
215
216}
217
218/*
219 * A word about widths: if the width mask is specified, then libc
220 * unconditionally honors it.  Otherwise, it assumes printable
221 * characters have width 1, and non-printable characters have width
222 * -1 (except for NULL which is special with with 0).  Hence, we have
223 * no need to inject defaults here -- the "default" unset value of 0
224 * indicates that libc should use its own logic in wcwidth as described.
225 */
226void
227add_width(int wc, int width)
228{
229	ctype_node_t	*ctn;
230
231	if ((ctn = get_ctype(wc)) == NULL) {
232		INTERR;
233		return;
234	}
235	ctn->ctype &= ~(_CTYPE_SWM);
236	switch (width) {
237	case 0:
238		ctn->ctype |= _CTYPE_SW0;
239		break;
240	case 1:
241		ctn->ctype |= _CTYPE_SW1;
242		break;
243	case 2:
244		ctn->ctype |= _CTYPE_SW2;
245		break;
246	case 3:
247		ctn->ctype |= _CTYPE_SW3;
248		break;
249	}
250}
251
/*
 * Record `width` for every character from `start` through `end`,
 * inclusive.  Nothing happens when start is greater than end.
 */
void
add_width_range(int start, int end, int width)
{
	int	wc;

	for (wc = start; wc <= end; wc++)
		add_width(wc, width);
}
259
260void
261add_caseconv(int val, int wc)
262{
263	ctype_node_t	*ctn;
264
265	ctn = get_ctype(val);
266	if (ctn == NULL) {
267		INTERR;
268		return;
269	}
270
271	switch (last_kw) {
272	case T_TOUPPER:
273		ctn->toupper = wc;
274		break;
275	case T_TOLOWER:
276		ctn->tolower = wc;
277		break;
278	default:
279		INTERR;
280		break;
281	}
282}
283
284void
285dump_ctype(void)
286{
287	FILE		*f;
288	_FileRuneLocale	rl;
289	ctype_node_t	*ctn, *last_ct, *last_lo, *last_up;
290	_FileRuneEntry	*ct = NULL;
291	_FileRuneEntry	*lo = NULL;
292	_FileRuneEntry	*up = NULL;
293	wchar_t		wc;
294
295	(void) memset(&rl, 0, sizeof (rl));
296	last_ct = NULL;
297	last_lo = NULL;
298	last_up = NULL;
299
300	if ((f = open_category()) == NULL)
301		return;
302
303	(void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
304	(void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
305
306	/*
307	 * Initialize the identity map.
308	 */
309	for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
310		rl.maplower[wc] = wc;
311		rl.mapupper[wc] = wc;
312	}
313
314	RB_FOREACH(ctn, ctypes, &ctypes) {
315		int conflict = 0;
316
317		wc = ctn->wc;
318
319		/*
320		 * POSIX requires certain portable characters have
321		 * certain types.  Add them if they are missing.
322		 */
323		if ((wc >= 1) && (wc <= 127)) {
324			if ((wc >= 'A') && (wc <= 'Z'))
325				ctn->ctype |= _ISUPPER;
326			if ((wc >= 'a') && (wc <= 'z'))
327				ctn->ctype |= _ISLOWER;
328			if ((wc >= '0') && (wc <= '9'))
329				ctn->ctype |= _ISDIGIT;
330			if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
331				ctn->ctype |= _ISSPACE;
332			if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
333				ctn->ctype |= _ISXDIGIT;
334			if (strchr(" \t", (char)wc))
335				ctn->ctype |= _ISBLANK;
336
337			/*
338			 * Technically these settings are only
339			 * required for the C locale.  However, it
340			 * turns out that because of the historical
341			 * version of isprint(), we need them for all
342			 * locales as well.  Note that these are not
343			 * necessarily valid punctation characters in
344			 * the current language, but ispunct() needs
345			 * to return TRUE for them.
346			 */
347			if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
348			    (char)wc))
349				ctn->ctype |= _ISPUNCT;
350		}
351
352		/*
353		 * POSIX also requires that certain types imply
354		 * others.  Add any inferred types here.
355		 */
356		if (ctn->ctype & (_ISUPPER |_ISLOWER))
357			ctn->ctype |= _ISALPHA;
358		if (ctn->ctype & _ISDIGIT)
359			ctn->ctype |= _ISXDIGIT;
360		if (ctn->ctype & _ISBLANK)
361			ctn->ctype |= _ISSPACE;
362		if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
363			ctn->ctype |= _ISGRAPH;
364		if (ctn->ctype & _ISGRAPH)
365			ctn->ctype |= _ISPRINT;
366
367		/*
368		 * Finally, POSIX requires that certain combinations
369		 * are invalid.  We don't flag this as a fatal error,
370		 * but we will warn about.
371		 */
372		if ((ctn->ctype & _ISALPHA) &&
373		    (ctn->ctype & (_ISPUNCT|_ISDIGIT)))
374			conflict++;
375		if ((ctn->ctype & _ISPUNCT) &
376		    (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
377			conflict++;
378		if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
379			conflict++;
380		if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
381			conflict++;
382		if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
383			conflict++;
384
385		if (conflict) {
386			warn("conflicting classes for character 0x%x (%x)",
387			    wc, ctn->ctype);
388		}
389		/*
390		 * Handle the lower 256 characters using the simple
391		 * optimization.  Note that if we have not defined the
392		 * upper/lower case, then we identity map it.
393		 */
394		if ((unsigned)wc < _CACHED_RUNES) {
395			rl.runetype[wc] = ctn->ctype;
396			if (ctn->tolower)
397				rl.maplower[wc] = ctn->tolower;
398			if (ctn->toupper)
399				rl.mapupper[wc] = ctn->toupper;
400			continue;
401		}
402
403		if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
404			ct[rl.runetype_ext_nranges-1].max = wc;
405			last_ct = ctn;
406		} else {
407			rl.runetype_ext_nranges++;
408			ct = realloc(ct,
409			    sizeof (*ct) * rl.runetype_ext_nranges);
410			ct[rl.runetype_ext_nranges - 1].min = wc;
411			ct[rl.runetype_ext_nranges - 1].max = wc;
412			ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
413			last_ct = ctn;
414		}
415		if (ctn->tolower == 0) {
416			last_lo = NULL;
417		} else if ((last_lo != NULL) &&
418		    (last_lo->tolower + 1 == ctn->tolower)) {
419			lo[rl.maplower_ext_nranges-1].max = wc;
420			last_lo = ctn;
421		} else {
422			rl.maplower_ext_nranges++;
423			lo = realloc(lo,
424			    sizeof (*lo) * rl.maplower_ext_nranges);
425			lo[rl.maplower_ext_nranges - 1].min = wc;
426			lo[rl.maplower_ext_nranges - 1].max = wc;
427			lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
428			last_lo = ctn;
429		}
430
431		if (ctn->toupper == 0) {
432			last_up = NULL;
433		} else if ((last_up != NULL) &&
434		    (last_up->toupper + 1 == ctn->toupper)) {
435			up[rl.mapupper_ext_nranges-1].max = wc;
436			last_up = ctn;
437		} else {
438			rl.mapupper_ext_nranges++;
439			up = realloc(up,
440			    sizeof (*up) * rl.mapupper_ext_nranges);
441			up[rl.mapupper_ext_nranges - 1].min = wc;
442			up[rl.mapupper_ext_nranges - 1].max = wc;
443			up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
444			last_up = ctn;
445		}
446	}
447
448	if ((wr_category(&rl, sizeof (rl), f) < 0) ||
449	    (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
450	    (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
451	    (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
452		return;
453	}
454
455	close_category(f);
456}
457