ctype.c revision 286484
1/* 2 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 3 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved. 4 * Copyright 2015 John Marino <draco@marino.st> 5 * 6 * This source code is derived from the illumos localedef command, and 7 * provided under BSD-style license terms by Nexenta Systems, Inc. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32/* 33 * LC_CTYPE database generation routines for localedef. 34 */ 35#include <sys/cdefs.h> 36__FBSDID("$FreeBSD: projects/collation/usr.bin/localedef/ctype.c 286484 2015-08-08 22:57:17Z bapt $"); 37 38#include <sys/tree.h> 39 40#include <stdio.h> 41#include <stdlib.h> 42#include <stddef.h> 43#include <string.h> 44#include <sys/types.h> 45#include <wchar.h> 46#include <ctype.h> 47#include <wctype.h> 48#include <unistd.h> 49#include "localedef.h" 50#include "parser.h" 51#include "runefile.h" 52 53 54#define _ISUPPER _CTYPE_U 55#define _ISLOWER _CTYPE_L 56#define _ISDIGIT _CTYPE_D 57#define _ISXDIGIT _CTYPE_X 58#define _ISSPACE _CTYPE_S 59#define _ISBLANK _CTYPE_B 60#define _ISALPHA _CTYPE_A 61#define _ISPUNCT _CTYPE_P 62#define _ISGRAPH _CTYPE_G 63#define _ISPRINT _CTYPE_R 64#define _ISCNTRL _CTYPE_C 65#define _E1 _CTYPE_Q 66#define _E2 _CTYPE_I 67#define _E3 0 68#define _E4 0 69#define _E5 _CTYPE_T 70 71static wchar_t last_ctype; 72static int ctype_compare(const void *n1, const void *n2); 73 74typedef struct ctype_node { 75 wchar_t wc; 76 int32_t ctype; 77 int32_t toupper; 78 int32_t tolower; 79 RB_ENTRY(ctype_node) entry; 80} ctype_node_t; 81 82static RB_HEAD(ctypes, ctype_node) ctypes; 83RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare); 84 85static int 86ctype_compare(const void *n1, const void *n2) 87{ 88 const ctype_node_t *c1 = n1; 89 const ctype_node_t *c2 = n2; 90 91 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 92} 93 94void 95init_ctype(void) 96{ 97 RB_INIT(&ctypes); 98} 99 100 101static void 102add_ctype_impl(ctype_node_t *ctn) 103{ 104 switch (last_kw) { 105 case T_ISUPPER: 106 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 107 break; 108 case T_ISLOWER: 109 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 110 break; 111 case T_ISALPHA: 112 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 113 break; 114 case T_ISDIGIT: 115 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT); 116 break; 117 case T_ISSPACE: 118 ctn->ctype |= _ISSPACE; 119 break; 120 case T_ISCNTRL: 121 ctn->ctype |= _ISCNTRL; 122 break; 123 case T_ISGRAPH: 124 ctn->ctype |= (_ISGRAPH | _ISPRINT); 125 break; 126 case T_ISPRINT: 127 ctn->ctype |= _ISPRINT; 128 break; 129 case T_ISPUNCT: 130 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 131 break; 132 case T_ISXDIGIT: 133 ctn->ctype |= (_ISXDIGIT | _ISPRINT); 134 break; 135 case T_ISBLANK: 136 ctn->ctype |= (_ISBLANK | _ISSPACE); 137 break; 138 case T_ISPHONOGRAM: 139 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 140 break; 141 case T_ISIDEOGRAM: 142 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 143 break; 144 case T_ISENGLISH: 145 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 146 break; 147 case T_ISNUMBER: 148 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 149 break; 150 case T_ISSPECIAL: 151 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 152 break; 153 case T_ISALNUM: 154 /* 155 * We can't do anything with this. The character 156 * should already be specified as a digit or alpha. 157 */ 158 break; 159 default: 160 errf("not a valid character class"); 161 } 162} 163 164static ctype_node_t * 165get_ctype(wchar_t wc) 166{ 167 ctype_node_t srch; 168 ctype_node_t *ctn; 169 170 srch.wc = wc; 171 if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) { 172 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 173 errf("out of memory"); 174 return (NULL); 175 } 176 ctn->wc = wc; 177 178 RB_INSERT(ctypes, &ctypes, ctn); 179 } 180 return (ctn); 181} 182 183void 184add_ctype(int val) 185{ 186 ctype_node_t *ctn; 187 188 if ((ctn = get_ctype(val)) == NULL) { 189 INTERR; 190 return; 191 } 192 add_ctype_impl(ctn); 193 last_ctype = ctn->wc; 194} 195 196void 197add_ctype_range(int end) 198{ 199 ctype_node_t *ctn; 200 wchar_t cur; 201 202 if (end < last_ctype) { 203 errf("malformed character range (%u ... %u))", 204 last_ctype, end); 205 return; 206 } 207 for (cur = last_ctype + 1; cur <= end; cur++) { 208 if ((ctn = get_ctype(cur)) == NULL) { 209 INTERR; 210 return; 211 } 212 add_ctype_impl(ctn); 213 } 214 last_ctype = end; 215 216} 217 218/* 219 * A word about widths: if the width mask is specified, then libc 220 * unconditionally honors it. Otherwise, it assumes printable 221 * characters have width 1, and non-printable characters have width 222 * -1 (except for NULL which is special with with 0). Hence, we have 223 * no need to inject defaults here -- the "default" unset value of 0 224 * indicates that libc should use its own logic in wcwidth as described. 225 */ 226void 227add_width(int wc, int width) 228{ 229 ctype_node_t *ctn; 230 231 if ((ctn = get_ctype(wc)) == NULL) { 232 INTERR; 233 return; 234 } 235 ctn->ctype &= ~(_CTYPE_SWM); 236 switch (width) { 237 case 0: 238 ctn->ctype |= _CTYPE_SW0; 239 break; 240 case 1: 241 ctn->ctype |= _CTYPE_SW1; 242 break; 243 case 2: 244 ctn->ctype |= _CTYPE_SW2; 245 break; 246 case 3: 247 ctn->ctype |= _CTYPE_SW3; 248 break; 249 } 250} 251 252void 253add_width_range(int start, int end, int width) 254{ 255 for (; start <= end; start++) { 256 add_width(start, width); 257 } 258} 259 260void 261add_caseconv(int val, int wc) 262{ 263 ctype_node_t *ctn; 264 265 ctn = get_ctype(val); 266 if (ctn == NULL) { 267 INTERR; 268 return; 269 } 270 271 switch (last_kw) { 272 case T_TOUPPER: 273 ctn->toupper = wc; 274 break; 275 case T_TOLOWER: 276 ctn->tolower = wc; 277 break; 278 default: 279 INTERR; 280 break; 281 } 282} 283 284void 285dump_ctype(void) 286{ 287 FILE *f; 288 _FileRuneLocale rl; 289 ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 290 _FileRuneEntry *ct = NULL; 291 _FileRuneEntry *lo = NULL; 292 _FileRuneEntry *up = NULL; 293 wchar_t wc; 294 295 (void) memset(&rl, 0, sizeof (rl)); 296 last_ct = NULL; 297 last_lo = NULL; 298 last_up = NULL; 299 300 if ((f = open_category()) == NULL) 301 return; 302 303 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 304 (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 305 306 /* 307 * Initialize the identity map. 308 */ 309 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { 310 rl.maplower[wc] = wc; 311 rl.mapupper[wc] = wc; 312 } 313 314 RB_FOREACH(ctn, ctypes, &ctypes) { 315 int conflict = 0; 316 317 wc = ctn->wc; 318 319 /* 320 * POSIX requires certain portable characters have 321 * certain types. Add them if they are missing. 322 */ 323 if ((wc >= 1) && (wc <= 127)) { 324 if ((wc >= 'A') && (wc <= 'Z')) 325 ctn->ctype |= _ISUPPER; 326 if ((wc >= 'a') && (wc <= 'z')) 327 ctn->ctype |= _ISLOWER; 328 if ((wc >= '0') && (wc <= '9')) 329 ctn->ctype |= _ISDIGIT; 330 if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 331 ctn->ctype |= _ISSPACE; 332 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 333 ctn->ctype |= _ISXDIGIT; 334 if (strchr(" \t", (char)wc)) 335 ctn->ctype |= _ISBLANK; 336 337 /* 338 * Technically these settings are only 339 * required for the C locale. However, it 340 * turns out that because of the historical 341 * version of isprint(), we need them for all 342 * locales as well. Note that these are not 343 * necessarily valid punctation characters in 344 * the current language, but ispunct() needs 345 * to return TRUE for them. 346 */ 347 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 348 (char)wc)) 349 ctn->ctype |= _ISPUNCT; 350 } 351 352 /* 353 * POSIX also requires that certain types imply 354 * others. Add any inferred types here. 355 */ 356 if (ctn->ctype & (_ISUPPER |_ISLOWER)) 357 ctn->ctype |= _ISALPHA; 358 if (ctn->ctype & _ISDIGIT) 359 ctn->ctype |= _ISXDIGIT; 360 if (ctn->ctype & _ISBLANK) 361 ctn->ctype |= _ISSPACE; 362 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 363 ctn->ctype |= _ISGRAPH; 364 if (ctn->ctype & _ISGRAPH) 365 ctn->ctype |= _ISPRINT; 366 367 /* 368 * Finally, POSIX requires that certain combinations 369 * are invalid. We don't flag this as a fatal error, 370 * but we will warn about. 371 */ 372 if ((ctn->ctype & _ISALPHA) && 373 (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 374 conflict++; 375 if ((ctn->ctype & _ISPUNCT) & 376 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 377 conflict++; 378 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 379 conflict++; 380 if ((ctn->ctype & _ISCNTRL) & _ISPRINT) 381 conflict++; 382 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 383 conflict++; 384 385 if (conflict) { 386 warn("conflicting classes for character 0x%x (%x)", 387 wc, ctn->ctype); 388 } 389 /* 390 * Handle the lower 256 characters using the simple 391 * optimization. Note that if we have not defined the 392 * upper/lower case, then we identity map it. 393 */ 394 if ((unsigned)wc < _CACHED_RUNES) { 395 rl.runetype[wc] = ctn->ctype; 396 if (ctn->tolower) 397 rl.maplower[wc] = ctn->tolower; 398 if (ctn->toupper) 399 rl.mapupper[wc] = ctn->toupper; 400 continue; 401 } 402 403 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) { 404 ct[rl.runetype_ext_nranges-1].max = wc; 405 last_ct = ctn; 406 } else { 407 rl.runetype_ext_nranges++; 408 ct = realloc(ct, 409 sizeof (*ct) * rl.runetype_ext_nranges); 410 ct[rl.runetype_ext_nranges - 1].min = wc; 411 ct[rl.runetype_ext_nranges - 1].max = wc; 412 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; 413 last_ct = ctn; 414 } 415 if (ctn->tolower == 0) { 416 last_lo = NULL; 417 } else if ((last_lo != NULL) && 418 (last_lo->tolower + 1 == ctn->tolower)) { 419 lo[rl.maplower_ext_nranges-1].max = wc; 420 last_lo = ctn; 421 } else { 422 rl.maplower_ext_nranges++; 423 lo = realloc(lo, 424 sizeof (*lo) * rl.maplower_ext_nranges); 425 lo[rl.maplower_ext_nranges - 1].min = wc; 426 lo[rl.maplower_ext_nranges - 1].max = wc; 427 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; 428 last_lo = ctn; 429 } 430 431 if (ctn->toupper == 0) { 432 last_up = NULL; 433 } else if ((last_up != NULL) && 434 (last_up->toupper + 1 == ctn->toupper)) { 435 up[rl.mapupper_ext_nranges-1].max = wc; 436 last_up = ctn; 437 } else { 438 rl.mapupper_ext_nranges++; 439 up = realloc(up, 440 sizeof (*up) * rl.mapupper_ext_nranges); 441 up[rl.mapupper_ext_nranges - 1].min = wc; 442 up[rl.mapupper_ext_nranges - 1].max = wc; 443 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; 444 last_up = ctn; 445 } 446 } 447 448 if ((wr_category(&rl, sizeof (rl), f) < 0) || 449 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || 450 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || 451 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { 452 return; 453 } 454 455 close_category(f); 456} 457