/* ctype.c revision 315224 */
1286432Sbapt/* 2286432Sbapt * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 3286432Sbapt * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved. 4286432Sbapt * Copyright 2015 John Marino <draco@marino.st> 5286432Sbapt * 6286432Sbapt * This source code is derived from the illumos localedef command, and 7286432Sbapt * provided under BSD-style license terms by Nexenta Systems, Inc. 8286432Sbapt * 9286432Sbapt * Redistribution and use in source and binary forms, with or without 10286432Sbapt * modification, are permitted provided that the following conditions 11286432Sbapt * are met: 12286432Sbapt * 13286432Sbapt * 1. Redistributions of source code must retain the above copyright 14286432Sbapt * notice, this list of conditions and the following disclaimer. 15286432Sbapt * 2. Redistributions in binary form must reproduce the above copyright 16286432Sbapt * notice, this list of conditions and the following disclaimer in the 17286432Sbapt * documentation and/or other materials provided with the distribution. 18286432Sbapt * 19286432Sbapt * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20286432Sbapt * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21286432Sbapt * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22286432Sbapt * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23286432Sbapt * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24286432Sbapt * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25286432Sbapt * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26286432Sbapt * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27286432Sbapt * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28286432Sbapt * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29286432Sbapt * POSSIBILITY OF SUCH DAMAGE. 
30286432Sbapt */ 31286432Sbapt 32286432Sbapt/* 33286432Sbapt * LC_CTYPE database generation routines for localedef. 34286432Sbapt */ 35286432Sbapt#include <sys/cdefs.h> 36286432Sbapt__FBSDID("$FreeBSD: stable/11/usr.bin/localedef/ctype.c 315224 2017-03-14 02:13:59Z pfg $"); 37286432Sbapt 38286481Sbapt#include <sys/tree.h> 39286432Sbapt 40286432Sbapt#include <stdio.h> 41286432Sbapt#include <stdlib.h> 42286432Sbapt#include <stddef.h> 43286432Sbapt#include <string.h> 44286432Sbapt#include <sys/types.h> 45286432Sbapt#include <wchar.h> 46286432Sbapt#include <ctype.h> 47286432Sbapt#include <wctype.h> 48286432Sbapt#include <unistd.h> 49286432Sbapt#include "localedef.h" 50286432Sbapt#include "parser.h" 51286432Sbapt#include "runefile.h" 52286432Sbapt 53286432Sbapt 54289263Sbapt/* Needed for bootstrapping, _CTYPE_N */ 55289263Sbapt#ifndef _CTYPE_N 56289263Sbapt#define _CTYPE_N 0x00400000L 57289263Sbapt#endif 58289263Sbapt 59286432Sbapt#define _ISUPPER _CTYPE_U 60286432Sbapt#define _ISLOWER _CTYPE_L 61286432Sbapt#define _ISDIGIT _CTYPE_D 62286432Sbapt#define _ISXDIGIT _CTYPE_X 63286432Sbapt#define _ISSPACE _CTYPE_S 64286432Sbapt#define _ISBLANK _CTYPE_B 65286432Sbapt#define _ISALPHA _CTYPE_A 66286432Sbapt#define _ISPUNCT _CTYPE_P 67286432Sbapt#define _ISGRAPH _CTYPE_G 68286432Sbapt#define _ISPRINT _CTYPE_R 69286432Sbapt#define _ISCNTRL _CTYPE_C 70286432Sbapt#define _E1 _CTYPE_Q 71286432Sbapt#define _E2 _CTYPE_I 72286432Sbapt#define _E3 0 73289263Sbapt#define _E4 _CTYPE_N 74286432Sbapt#define _E5 _CTYPE_T 75286432Sbapt 76286432Sbaptstatic wchar_t last_ctype; 77286481Sbaptstatic int ctype_compare(const void *n1, const void *n2); 78286432Sbapt 79286432Sbapttypedef struct ctype_node { 80286432Sbapt wchar_t wc; 81286432Sbapt int32_t ctype; 82286432Sbapt int32_t toupper; 83286432Sbapt int32_t tolower; 84286481Sbapt RB_ENTRY(ctype_node) entry; 85286432Sbapt} ctype_node_t; 86286432Sbapt 87286484Sbaptstatic RB_HEAD(ctypes, ctype_node) ctypes; 88286482SbaptRB_GENERATE_STATIC(ctypes, 
ctype_node, entry, ctype_compare); 89286432Sbapt 90286432Sbaptstatic int 91286432Sbaptctype_compare(const void *n1, const void *n2) 92286432Sbapt{ 93286432Sbapt const ctype_node_t *c1 = n1; 94286432Sbapt const ctype_node_t *c2 = n2; 95286432Sbapt 96286432Sbapt return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 97286432Sbapt} 98286432Sbapt 99286432Sbaptvoid 100286432Sbaptinit_ctype(void) 101286432Sbapt{ 102286481Sbapt RB_INIT(&ctypes); 103286432Sbapt} 104286432Sbapt 105286432Sbapt 106286432Sbaptstatic void 107286432Sbaptadd_ctype_impl(ctype_node_t *ctn) 108286432Sbapt{ 109286432Sbapt switch (last_kw) { 110286432Sbapt case T_ISUPPER: 111286432Sbapt ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 112286432Sbapt break; 113286432Sbapt case T_ISLOWER: 114286432Sbapt ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 115286432Sbapt break; 116286432Sbapt case T_ISALPHA: 117286432Sbapt ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 118286432Sbapt break; 119286432Sbapt case T_ISDIGIT: 120289586Sbapt ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT | _E4); 121286432Sbapt break; 122286432Sbapt case T_ISSPACE: 123286432Sbapt ctn->ctype |= _ISSPACE; 124286432Sbapt break; 125286432Sbapt case T_ISCNTRL: 126286432Sbapt ctn->ctype |= _ISCNTRL; 127286432Sbapt break; 128286432Sbapt case T_ISGRAPH: 129286432Sbapt ctn->ctype |= (_ISGRAPH | _ISPRINT); 130286432Sbapt break; 131286432Sbapt case T_ISPRINT: 132286432Sbapt ctn->ctype |= _ISPRINT; 133286432Sbapt break; 134286432Sbapt case T_ISPUNCT: 135286432Sbapt ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 136286432Sbapt break; 137286432Sbapt case T_ISXDIGIT: 138289590Sbapt ctn->ctype |= (_ISXDIGIT | _ISPRINT); 139286432Sbapt break; 140286432Sbapt case T_ISBLANK: 141286432Sbapt ctn->ctype |= (_ISBLANK | _ISSPACE); 142286432Sbapt break; 143286432Sbapt case T_ISPHONOGRAM: 144286432Sbapt ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 145286432Sbapt break; 146286432Sbapt case T_ISIDEOGRAM: 147286432Sbapt 
ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 148286432Sbapt break; 149286432Sbapt case T_ISENGLISH: 150286432Sbapt ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 151286432Sbapt break; 152286432Sbapt case T_ISNUMBER: 153286432Sbapt ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 154286432Sbapt break; 155286432Sbapt case T_ISSPECIAL: 156286432Sbapt ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 157286432Sbapt break; 158286432Sbapt case T_ISALNUM: 159286432Sbapt /* 160286432Sbapt * We can't do anything with this. The character 161286432Sbapt * should already be specified as a digit or alpha. 162286432Sbapt */ 163286432Sbapt break; 164286432Sbapt default: 165286432Sbapt errf("not a valid character class"); 166286432Sbapt } 167286432Sbapt} 168286432Sbapt 169286432Sbaptstatic ctype_node_t * 170286432Sbaptget_ctype(wchar_t wc) 171286432Sbapt{ 172286432Sbapt ctype_node_t srch; 173286432Sbapt ctype_node_t *ctn; 174286432Sbapt 175286432Sbapt srch.wc = wc; 176286481Sbapt if ((ctn = RB_FIND(ctypes, &ctypes, &srch)) == NULL) { 177286432Sbapt if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 178286432Sbapt errf("out of memory"); 179286432Sbapt return (NULL); 180286432Sbapt } 181286432Sbapt ctn->wc = wc; 182286432Sbapt 183286481Sbapt RB_INSERT(ctypes, &ctypes, ctn); 184286432Sbapt } 185286432Sbapt return (ctn); 186286432Sbapt} 187286432Sbapt 188286432Sbaptvoid 189286432Sbaptadd_ctype(int val) 190286432Sbapt{ 191286432Sbapt ctype_node_t *ctn; 192286432Sbapt 193286432Sbapt if ((ctn = get_ctype(val)) == NULL) { 194286432Sbapt INTERR; 195286432Sbapt return; 196286432Sbapt } 197286432Sbapt add_ctype_impl(ctn); 198286432Sbapt last_ctype = ctn->wc; 199286432Sbapt} 200286432Sbapt 201286432Sbaptvoid 202290517Sbaptadd_ctype_range(wchar_t end) 203286432Sbapt{ 204286432Sbapt ctype_node_t *ctn; 205286432Sbapt wchar_t cur; 206286432Sbapt 207286432Sbapt if (end < last_ctype) { 208286432Sbapt errf("malformed character range (%u ... 
%u))", 209286432Sbapt last_ctype, end); 210286432Sbapt return; 211286432Sbapt } 212286432Sbapt for (cur = last_ctype + 1; cur <= end; cur++) { 213286432Sbapt if ((ctn = get_ctype(cur)) == NULL) { 214286432Sbapt INTERR; 215286432Sbapt return; 216286432Sbapt } 217286432Sbapt add_ctype_impl(ctn); 218286432Sbapt } 219286432Sbapt last_ctype = end; 220286432Sbapt 221286432Sbapt} 222286432Sbapt 223286432Sbapt/* 224286432Sbapt * A word about widths: if the width mask is specified, then libc 225286432Sbapt * unconditionally honors it. Otherwise, it assumes printable 226286432Sbapt * characters have width 1, and non-printable characters have width 227286432Sbapt * -1 (except for NULL which is special with with 0). Hence, we have 228286432Sbapt * no need to inject defaults here -- the "default" unset value of 0 229286432Sbapt * indicates that libc should use its own logic in wcwidth as described. 230286432Sbapt */ 231286432Sbaptvoid 232286432Sbaptadd_width(int wc, int width) 233286432Sbapt{ 234286432Sbapt ctype_node_t *ctn; 235286432Sbapt 236286432Sbapt if ((ctn = get_ctype(wc)) == NULL) { 237286432Sbapt INTERR; 238286432Sbapt return; 239286432Sbapt } 240286432Sbapt ctn->ctype &= ~(_CTYPE_SWM); 241286432Sbapt switch (width) { 242286432Sbapt case 0: 243286432Sbapt ctn->ctype |= _CTYPE_SW0; 244286432Sbapt break; 245286432Sbapt case 1: 246286432Sbapt ctn->ctype |= _CTYPE_SW1; 247286432Sbapt break; 248286432Sbapt case 2: 249286432Sbapt ctn->ctype |= _CTYPE_SW2; 250286432Sbapt break; 251286432Sbapt case 3: 252286432Sbapt ctn->ctype |= _CTYPE_SW3; 253286432Sbapt break; 254286432Sbapt } 255286432Sbapt} 256286432Sbapt 257286432Sbaptvoid 258286432Sbaptadd_width_range(int start, int end, int width) 259286432Sbapt{ 260286432Sbapt for (; start <= end; start++) { 261286432Sbapt add_width(start, width); 262286432Sbapt } 263286432Sbapt} 264286432Sbapt 265286432Sbaptvoid 266286432Sbaptadd_caseconv(int val, int wc) 267286432Sbapt{ 268286432Sbapt ctype_node_t *ctn; 269286432Sbapt 
270286432Sbapt ctn = get_ctype(val); 271286432Sbapt if (ctn == NULL) { 272286432Sbapt INTERR; 273286432Sbapt return; 274286432Sbapt } 275286432Sbapt 276286432Sbapt switch (last_kw) { 277286432Sbapt case T_TOUPPER: 278286432Sbapt ctn->toupper = wc; 279286432Sbapt break; 280286432Sbapt case T_TOLOWER: 281286432Sbapt ctn->tolower = wc; 282286432Sbapt break; 283286432Sbapt default: 284286432Sbapt INTERR; 285286432Sbapt break; 286286432Sbapt } 287286432Sbapt} 288286432Sbapt 289286432Sbaptvoid 290286432Sbaptdump_ctype(void) 291286432Sbapt{ 292286432Sbapt FILE *f; 293286432Sbapt _FileRuneLocale rl; 294286432Sbapt ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 295286432Sbapt _FileRuneEntry *ct = NULL; 296286432Sbapt _FileRuneEntry *lo = NULL; 297286432Sbapt _FileRuneEntry *up = NULL; 298286432Sbapt wchar_t wc; 299286432Sbapt 300286432Sbapt (void) memset(&rl, 0, sizeof (rl)); 301286432Sbapt last_ct = NULL; 302286432Sbapt last_lo = NULL; 303286432Sbapt last_up = NULL; 304286432Sbapt 305286432Sbapt if ((f = open_category()) == NULL) 306286432Sbapt return; 307286432Sbapt 308286432Sbapt (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 309315224Spfg (void) strlcpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 310286432Sbapt 311286432Sbapt /* 312286432Sbapt * Initialize the identity map. 313286432Sbapt */ 314286432Sbapt for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { 315286432Sbapt rl.maplower[wc] = wc; 316286432Sbapt rl.mapupper[wc] = wc; 317286432Sbapt } 318286432Sbapt 319286481Sbapt RB_FOREACH(ctn, ctypes, &ctypes) { 320286432Sbapt int conflict = 0; 321286432Sbapt 322286432Sbapt wc = ctn->wc; 323286432Sbapt 324286432Sbapt /* 325286432Sbapt * POSIX requires certain portable characters have 326286432Sbapt * certain types. Add them if they are missing. 
327286432Sbapt */ 328286432Sbapt if ((wc >= 1) && (wc <= 127)) { 329286432Sbapt if ((wc >= 'A') && (wc <= 'Z')) 330286432Sbapt ctn->ctype |= _ISUPPER; 331286432Sbapt if ((wc >= 'a') && (wc <= 'z')) 332286432Sbapt ctn->ctype |= _ISLOWER; 333286432Sbapt if ((wc >= '0') && (wc <= '9')) 334286432Sbapt ctn->ctype |= _ISDIGIT; 335297057Spfg if (wc == ' ') 336297057Spfg ctn->ctype |= _ISPRINT; 337286432Sbapt if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 338286432Sbapt ctn->ctype |= _ISSPACE; 339286432Sbapt if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 340286432Sbapt ctn->ctype |= _ISXDIGIT; 341286432Sbapt if (strchr(" \t", (char)wc)) 342286432Sbapt ctn->ctype |= _ISBLANK; 343286432Sbapt 344286432Sbapt /* 345286432Sbapt * Technically these settings are only 346286432Sbapt * required for the C locale. However, it 347286432Sbapt * turns out that because of the historical 348286432Sbapt * version of isprint(), we need them for all 349286432Sbapt * locales as well. Note that these are not 350286432Sbapt * necessarily valid punctation characters in 351286432Sbapt * the current language, but ispunct() needs 352286432Sbapt * to return TRUE for them. 353286432Sbapt */ 354286432Sbapt if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 355286432Sbapt (char)wc)) 356286432Sbapt ctn->ctype |= _ISPUNCT; 357286432Sbapt } 358286432Sbapt 359286432Sbapt /* 360286432Sbapt * POSIX also requires that certain types imply 361286432Sbapt * others. Add any inferred types here. 
362286432Sbapt */ 363286432Sbapt if (ctn->ctype & (_ISUPPER |_ISLOWER)) 364286432Sbapt ctn->ctype |= _ISALPHA; 365286432Sbapt if (ctn->ctype & _ISDIGIT) 366286432Sbapt ctn->ctype |= _ISXDIGIT; 367286432Sbapt if (ctn->ctype & _ISBLANK) 368286432Sbapt ctn->ctype |= _ISSPACE; 369286432Sbapt if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 370286432Sbapt ctn->ctype |= _ISGRAPH; 371286432Sbapt if (ctn->ctype & _ISGRAPH) 372286432Sbapt ctn->ctype |= _ISPRINT; 373286432Sbapt 374286432Sbapt /* 375286432Sbapt * Finally, POSIX requires that certain combinations 376286432Sbapt * are invalid. We don't flag this as a fatal error, 377286432Sbapt * but we will warn about. 378286432Sbapt */ 379286432Sbapt if ((ctn->ctype & _ISALPHA) && 380286432Sbapt (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 381286432Sbapt conflict++; 382315224Spfg if ((ctn->ctype & _ISPUNCT) && 383286432Sbapt (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 384286432Sbapt conflict++; 385286432Sbapt if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 386286432Sbapt conflict++; 387315224Spfg if ((ctn->ctype & _ISCNTRL) && (ctn->ctype & _ISPRINT)) 388286432Sbapt conflict++; 389286432Sbapt if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 390286432Sbapt conflict++; 391286432Sbapt 392286432Sbapt if (conflict) { 393286432Sbapt warn("conflicting classes for character 0x%x (%x)", 394286432Sbapt wc, ctn->ctype); 395286432Sbapt } 396286432Sbapt /* 397286432Sbapt * Handle the lower 256 characters using the simple 398286432Sbapt * optimization. Note that if we have not defined the 399286432Sbapt * upper/lower case, then we identity map it. 
400286432Sbapt */ 401286432Sbapt if ((unsigned)wc < _CACHED_RUNES) { 402286432Sbapt rl.runetype[wc] = ctn->ctype; 403286432Sbapt if (ctn->tolower) 404286432Sbapt rl.maplower[wc] = ctn->tolower; 405286432Sbapt if (ctn->toupper) 406286432Sbapt rl.mapupper[wc] = ctn->toupper; 407286432Sbapt continue; 408286432Sbapt } 409286432Sbapt 410308330Sbapt if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype) && 411308330Sbapt (last_ct->wc + 1 == wc)) { 412286432Sbapt ct[rl.runetype_ext_nranges-1].max = wc; 413286432Sbapt } else { 414286432Sbapt rl.runetype_ext_nranges++; 415286432Sbapt ct = realloc(ct, 416286432Sbapt sizeof (*ct) * rl.runetype_ext_nranges); 417286432Sbapt ct[rl.runetype_ext_nranges - 1].min = wc; 418286432Sbapt ct[rl.runetype_ext_nranges - 1].max = wc; 419286432Sbapt ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; 420286432Sbapt } 421308330Sbapt last_ct = ctn; 422286432Sbapt if (ctn->tolower == 0) { 423286432Sbapt last_lo = NULL; 424286432Sbapt } else if ((last_lo != NULL) && 425286432Sbapt (last_lo->tolower + 1 == ctn->tolower)) { 426286432Sbapt lo[rl.maplower_ext_nranges-1].max = wc; 427286432Sbapt last_lo = ctn; 428286432Sbapt } else { 429286432Sbapt rl.maplower_ext_nranges++; 430286432Sbapt lo = realloc(lo, 431286432Sbapt sizeof (*lo) * rl.maplower_ext_nranges); 432286432Sbapt lo[rl.maplower_ext_nranges - 1].min = wc; 433286432Sbapt lo[rl.maplower_ext_nranges - 1].max = wc; 434286432Sbapt lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; 435286432Sbapt last_lo = ctn; 436286432Sbapt } 437286432Sbapt 438286432Sbapt if (ctn->toupper == 0) { 439286432Sbapt last_up = NULL; 440286432Sbapt } else if ((last_up != NULL) && 441286432Sbapt (last_up->toupper + 1 == ctn->toupper)) { 442286432Sbapt up[rl.mapupper_ext_nranges-1].max = wc; 443286432Sbapt last_up = ctn; 444286432Sbapt } else { 445286432Sbapt rl.mapupper_ext_nranges++; 446286432Sbapt up = realloc(up, 447286432Sbapt sizeof (*up) * rl.mapupper_ext_nranges); 448286432Sbapt up[rl.mapupper_ext_nranges - 
1].min = wc; 449286432Sbapt up[rl.mapupper_ext_nranges - 1].max = wc; 450286432Sbapt up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; 451286432Sbapt last_up = ctn; 452286432Sbapt } 453286432Sbapt } 454286432Sbapt 455286432Sbapt if ((wr_category(&rl, sizeof (rl), f) < 0) || 456286432Sbapt (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || 457286432Sbapt (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || 458286432Sbapt (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { 459286432Sbapt return; 460286432Sbapt } 461286432Sbapt 462286432Sbapt close_category(f); 463286432Sbapt} 464