1/* 2 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 3 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved. 4 * Copyright 2015 John Marino <draco@marino.st> 5 * 6 * This source code is derived from the illumos localedef command, and 7 * provided under BSD-style license terms by Nexenta Systems, Inc. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32/* 33 * LC_CTYPE database generation routines for localedef. 34 */ 35#include <sys/cdefs.h> 36__FBSDID("$FreeBSD: stable/11/usr.bin/localedef/ctype.c 341631 2018-12-06 11:52:07Z yuripv $"); 37 38#include <sys/tree.h> 39 40#include <stdio.h> 41#include <stdlib.h> 42#include <stddef.h> 43#include <string.h> 44#include <sys/types.h> 45#include <wchar.h> 46#include <ctype.h> 47#include <wctype.h> 48#include <unistd.h> 49#include "localedef.h" 50#include "parser.h" 51#include "runefile.h" 52 53 54/* Needed for bootstrapping, _CTYPE_N */ 55#ifndef _CTYPE_N 56#define _CTYPE_N 0x00400000L 57#endif 58 59#define _ISUPPER _CTYPE_U 60#define _ISLOWER _CTYPE_L 61#define _ISDIGIT _CTYPE_D 62#define _ISXDIGIT _CTYPE_X 63#define _ISSPACE _CTYPE_S 64#define _ISBLANK _CTYPE_B 65#define _ISALPHA _CTYPE_A 66#define _ISPUNCT _CTYPE_P 67#define _ISGRAPH _CTYPE_G 68#define _ISPRINT _CTYPE_R 69#define _ISCNTRL _CTYPE_C 70#define _E1 _CTYPE_Q 71#define _E2 _CTYPE_I 72#define _E3 0 73#define _E4 _CTYPE_N 74#define _E5 _CTYPE_T 75 76static wchar_t last_ctype; 77static int ctype_compare(const void *n1, const void *n2); 78 79typedef struct ctype_node { 80 wchar_t wc; 81 int32_t ctype; 82 int32_t toupper; 83 int32_t tolower; 84 RB_ENTRY(ctype_node) entry; 85} ctype_node_t; 86 87static RB_HEAD(ctypes, ctype_node) ctypes; 88RB_GENERATE_STATIC(ctypes, ctype_node, entry, ctype_compare); 89 90static int 91ctype_compare(const void *n1, const void *n2) 92{ 93 const ctype_node_t *c1 = n1; 94 const ctype_node_t *c2 = n2; 95 96 return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 97} 98 99void 100init_ctype(void) 101{ 102 RB_INIT(&ctypes); 103} 104 105 106static void 107add_ctype_impl(ctype_node_t *ctn) 108{ 109 switch (last_kw) { 110 case T_ISUPPER: 111 ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 112 break; 113 case T_ISLOWER: 114 ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 115 break; 116 case T_ISALPHA: 117 ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 118 break; 119 case T_ISDIGIT: 120 ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4); 121 break; 122 case T_ISSPACE: 123 /* 124 * This can be troublesome as <form-feed>, <newline>, 125 * <carriage-return>, <tab>, and <vertical-tab> are defined both 126 * as space and cntrl, and POSIX doesn't allow cntrl/print 127 * combination. We will take care of this in dump_ctype(). 128 */ 129 ctn->ctype |= (_ISSPACE | _ISPRINT); 130 break; 131 case T_ISCNTRL: 132 ctn->ctype |= _ISCNTRL; 133 break; 134 case T_ISGRAPH: 135 ctn->ctype |= (_ISGRAPH | _ISPRINT); 136 break; 137 case T_ISPRINT: 138 ctn->ctype |= _ISPRINT; 139 break; 140 case T_ISPUNCT: 141 ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 142 break; 143 case T_ISXDIGIT: 144 ctn->ctype |= (_ISXDIGIT | _ISPRINT); 145 break; 146 case T_ISBLANK: 147 ctn->ctype |= (_ISBLANK | _ISSPACE); 148 break; 149 case T_ISPHONOGRAM: 150 ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 151 break; 152 case T_ISIDEOGRAM: 153 ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 154 break; 155 case T_ISENGLISH: 156 ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 157 break; 158 case T_ISNUMBER: 159 ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 160 break; 161 case T_ISSPECIAL: 162 ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 163 break; 164 case T_ISALNUM: 165 /* 166 * We can't do anything with this. The character 167 * should already be specified as a digit or alpha. 168 */ 169 break; 170 default: 171 errf("not a valid character class"); 172 } 173} 174 175static ctype_node_t * 176get_ctype(wchar_t wc) 177{ 178 ctype_node_t srch; 179 ctype_node_t *ctn; 180 181 srch.wc = wc; 182 if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) { 183 if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 184 errf("out of memory"); 185 return (NULL); 186 } 187 ctn->wc = wc; 188 189 RB_INSERT(ctypes, &ctypes, ctn); 190 } 191 return (ctn); 192} 193 194void 195add_ctype(int val) 196{ 197 ctype_node_t *ctn; 198 199 if ((ctn = get_ctype(val)) == NULL) { 200 INTERR; 201 return; 202 } 203 add_ctype_impl(ctn); 204 last_ctype = ctn->wc; 205} 206 207void 208add_ctype_range(wchar_t end) 209{ 210 ctype_node_t *ctn; 211 wchar_t cur; 212 213 if (end < last_ctype) { 214 errf("malformed character range (%u ... %u))", 215 last_ctype, end); 216 return; 217 } 218 for (cur = last_ctype + 1; cur <= end; cur++) { 219 if ((ctn = get_ctype(cur)) == NULL) { 220 INTERR; 221 return; 222 } 223 add_ctype_impl(ctn); 224 } 225 last_ctype = end; 226 227} 228 229/* 230 * A word about widths: if the width mask is specified, then libc 231 * unconditionally honors it. Otherwise, it assumes printable 232 * characters have width 1, and non-printable characters have width 233 * -1 (except for NULL which is special with with 0). Hence, we have 234 * no need to inject defaults here -- the "default" unset value of 0 235 * indicates that libc should use its own logic in wcwidth as described. 236 */ 237void 238add_width(int wc, int width) 239{ 240 ctype_node_t *ctn; 241 242 if ((ctn = get_ctype(wc)) == NULL) { 243 INTERR; 244 return; 245 } 246 ctn->ctype &= ~(_CTYPE_SWM); 247 switch (width) { 248 case 0: 249 ctn->ctype |= _CTYPE_SW0; 250 break; 251 case 1: 252 ctn->ctype |= _CTYPE_SW1; 253 break; 254 case 2: 255 ctn->ctype |= _CTYPE_SW2; 256 break; 257 case 3: 258 ctn->ctype |= _CTYPE_SW3; 259 break; 260 } 261} 262 263void 264add_width_range(int start, int end, int width) 265{ 266 for (; start <= end; start++) { 267 add_width(start, width); 268 } 269} 270 271void 272add_caseconv(int val, int wc) 273{ 274 ctype_node_t *ctn; 275 276 ctn = get_ctype(val); 277 if (ctn == NULL) { 278 INTERR; 279 return; 280 } 281 282 switch (last_kw) { 283 case T_TOUPPER: 284 ctn->toupper = wc; 285 break; 286 case T_TOLOWER: 287 ctn->tolower = wc; 288 break; 289 default: 290 INTERR; 291 break; 292 } 293} 294 295void 296dump_ctype(void) 297{ 298 FILE *f; 299 _FileRuneLocale rl; 300 ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 301 _FileRuneEntry *ct = NULL; 302 _FileRuneEntry *lo = NULL; 303 _FileRuneEntry *up = NULL; 304 wchar_t wc; 305 306 (void) memset(&rl, 0, sizeof (rl)); 307 last_ct = NULL; 308 last_lo = NULL; 309 last_up = NULL; 310 311 if ((f = open_category()) == NULL) 312 return; 313 314 (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 315 (void) strlcpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 316 317 /* 318 * Initialize the identity map. 319 */ 320 for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { 321 rl.maplower[wc] = wc; 322 rl.mapupper[wc] = wc; 323 } 324 325 RB_FOREACH(ctn, ctypes, &ctypes) { 326 int conflict = 0; 327 328 wc = ctn->wc; 329 330 /* 331 * POSIX requires certain portable characters have 332 * certain types. Add them if they are missing. 333 */ 334 if ((wc >= 1) && (wc <= 127)) { 335 if ((wc >= 'A') && (wc <= 'Z')) 336 ctn->ctype |= _ISUPPER; 337 if ((wc >= 'a') && (wc <= 'z')) 338 ctn->ctype |= _ISLOWER; 339 if ((wc >= '0') && (wc <= '9')) 340 ctn->ctype |= _ISDIGIT; 341 if (wc == ' ') 342 ctn->ctype |= _ISPRINT; 343 if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 344 ctn->ctype |= _ISSPACE; 345 if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 346 ctn->ctype |= _ISXDIGIT; 347 if (strchr(" \t", (char)wc)) 348 ctn->ctype |= _ISBLANK; 349 350 /* 351 * Technically these settings are only 352 * required for the C locale. However, it 353 * turns out that because of the historical 354 * version of isprint(), we need them for all 355 * locales as well. Note that these are not 356 * necessarily valid punctation characters in 357 * the current language, but ispunct() needs 358 * to return TRUE for them. 359 */ 360 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 361 (char)wc)) 362 ctn->ctype |= _ISPUNCT; 363 } 364 365 /* 366 * POSIX also requires that certain types imply 367 * others. Add any inferred types here. 368 */ 369 if (ctn->ctype & (_ISUPPER |_ISLOWER)) 370 ctn->ctype |= _ISALPHA; 371 if (ctn->ctype & _ISDIGIT) 372 ctn->ctype |= _ISXDIGIT; 373 if (ctn->ctype & _ISBLANK) 374 ctn->ctype |= _ISSPACE; 375 if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 376 ctn->ctype |= _ISGRAPH; 377 if (ctn->ctype & _ISGRAPH) 378 ctn->ctype |= _ISPRINT; 379 380 /* 381 * POSIX requires that certain combinations are invalid. 382 * Try fixing the cases we know about (see add_ctype_impl()). 383 */ 384 if ((ctn->ctype & (_ISSPACE|_ISCNTRL)) == (_ISSPACE|_ISCNTRL)) 385 ctn->ctype &= ~_ISPRINT; 386 387 /* 388 * Finally, don't flag remaining cases as a fatal error, 389 * and just warn about them. 390 */ 391 if ((ctn->ctype & _ISALPHA) && 392 (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 393 conflict++; 394 if ((ctn->ctype & _ISPUNCT) && 395 (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 396 conflict++; 397 if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 398 conflict++; 399 if ((ctn->ctype & _ISCNTRL) && (ctn->ctype & _ISPRINT)) 400 conflict++; 401 if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 402 conflict++; 403 404 if (conflict) { 405 warn("conflicting classes for character 0x%x (%x)", 406 wc, ctn->ctype); 407 } 408 /* 409 * Handle the lower 256 characters using the simple 410 * optimization. Note that if we have not defined the 411 * upper/lower case, then we identity map it. 412 */ 413 if ((unsigned)wc < _CACHED_RUNES) { 414 rl.runetype[wc] = ctn->ctype; 415 if (ctn->tolower) 416 rl.maplower[wc] = ctn->tolower; 417 if (ctn->toupper) 418 rl.mapupper[wc] = ctn->toupper; 419 continue; 420 } 421 422 if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) && 423 (last_ct->wc + 1 == wc)) { 424 ct[rl.runetype_ext_nranges-1].max = wc; 425 } else { 426 rl.runetype_ext_nranges++; 427 ct = realloc(ct, 428 sizeof (*ct) * rl.runetype_ext_nranges); 429 ct[rl.runetype_ext_nranges - 1].min = wc; 430 ct[rl.runetype_ext_nranges - 1].max = wc; 431 ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; 432 } 433 last_ct = ctn; 434 if (ctn->tolower == 0) { 435 last_lo = NULL; 436 } else if ((last_lo != NULL) && 437 (last_lo->tolower + 1 == ctn->tolower)) { 438 lo[rl.maplower_ext_nranges-1].max = wc; 439 last_lo = ctn; 440 } else { 441 rl.maplower_ext_nranges++; 442 lo = realloc(lo, 443 sizeof (*lo) * rl.maplower_ext_nranges); 444 lo[rl.maplower_ext_nranges - 1].min = wc; 445 lo[rl.maplower_ext_nranges - 1].max = wc; 446 lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; 447 last_lo = ctn; 448 } 449 450 if (ctn->toupper == 0) { 451 last_up = NULL; 452 } else if ((last_up != NULL) && 453 (last_up->toupper + 1 == ctn->toupper)) { 454 up[rl.mapupper_ext_nranges-1].max = wc; 455 last_up = ctn; 456 } else { 457 rl.mapupper_ext_nranges++; 458 up = realloc(up, 459 sizeof (*up) * rl.mapupper_ext_nranges); 460 up[rl.mapupper_ext_nranges - 1].min = wc; 461 up[rl.mapupper_ext_nranges - 1].max = wc; 462 up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; 463 last_up = ctn; 464 } 465 } 466 467 if ((wr_category(&rl, sizeof (rl), f) < 0) || 468 (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || 469 (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || 470 (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { 471 return; 472 } 473 474 close_category(f); 475} 476