ctype.c revision 286432
1286432Sbapt/* 2286432Sbapt * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 3286432Sbapt * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved. 4286432Sbapt * Copyright 2015 John Marino <draco@marino.st> 5286432Sbapt * 6286432Sbapt * This source code is derived from the illumos localedef command, and 7286432Sbapt * provided under BSD-style license terms by Nexenta Systems, Inc. 8286432Sbapt * 9286432Sbapt * Redistribution and use in source and binary forms, with or without 10286432Sbapt * modification, are permitted provided that the following conditions 11286432Sbapt * are met: 12286432Sbapt * 13286432Sbapt * 1. Redistributions of source code must retain the above copyright 14286432Sbapt * notice, this list of conditions and the following disclaimer. 15286432Sbapt * 2. Redistributions in binary form must reproduce the above copyright 16286432Sbapt * notice, this list of conditions and the following disclaimer in the 17286432Sbapt * documentation and/or other materials provided with the distribution. 18286432Sbapt * 19286432Sbapt * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20286432Sbapt * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21286432Sbapt * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22286432Sbapt * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23286432Sbapt * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24286432Sbapt * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25286432Sbapt * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26286432Sbapt * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27286432Sbapt * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28286432Sbapt * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29286432Sbapt * POSSIBILITY OF SUCH DAMAGE. 30286432Sbapt */ 31286432Sbapt 32286432Sbapt/* 33286432Sbapt * LC_CTYPE database generation routines for localedef. 34286432Sbapt */ 35286432Sbapt#include <sys/cdefs.h> 36286432Sbapt__FBSDID("$FreeBSD: projects/collation/usr.bin/localedef/ctype.c 286432 2015-08-07 23:53:31Z bapt $"); 37286432Sbapt 38286432Sbapt#include <sys/avl.h> 39286432Sbapt 40286432Sbapt#include <stdio.h> 41286432Sbapt#include <stdlib.h> 42286432Sbapt#include <stddef.h> 43286432Sbapt#include <string.h> 44286432Sbapt#include <sys/types.h> 45286432Sbapt#include <wchar.h> 46286432Sbapt#include <ctype.h> 47286432Sbapt#include <wctype.h> 48286432Sbapt#include <unistd.h> 49286432Sbapt#include "localedef.h" 50286432Sbapt#include "parser.h" 51286432Sbapt#include "runefile.h" 52286432Sbapt 53286432Sbapt 54286432Sbapt#define _ISUPPER _CTYPE_U 55286432Sbapt#define _ISLOWER _CTYPE_L 56286432Sbapt#define _ISDIGIT _CTYPE_D 57286432Sbapt#define _ISXDIGIT _CTYPE_X 58286432Sbapt#define _ISSPACE _CTYPE_S 59286432Sbapt#define _ISBLANK _CTYPE_B 60286432Sbapt#define _ISALPHA _CTYPE_A 61286432Sbapt#define _ISPUNCT _CTYPE_P 62286432Sbapt#define _ISGRAPH _CTYPE_G 63286432Sbapt#define _ISPRINT _CTYPE_R 64286432Sbapt#define _ISCNTRL _CTYPE_C 65286432Sbapt#define _E1 _CTYPE_Q 66286432Sbapt#define _E2 _CTYPE_I 67286432Sbapt#define _E3 0 68286432Sbapt#define _E4 0 69286432Sbapt#define _E5 _CTYPE_T 70286432Sbapt 71286432Sbaptstatic avl_tree_t ctypes; 72286432Sbapt 73286432Sbaptstatic wchar_t last_ctype; 74286432Sbapt 75286432Sbapttypedef struct ctype_node { 76286432Sbapt wchar_t wc; 77286432Sbapt int32_t ctype; 78286432Sbapt int32_t toupper; 79286432Sbapt int32_t tolower; 80286432Sbapt avl_node_t avl; 81286432Sbapt} ctype_node_t; 82286432Sbapt 83286432Sbapttypedef struct width_node { 84286432Sbapt wchar_t start; 85286432Sbapt wchar_t end; 86286432Sbapt int8_t width; 87286432Sbapt avl_node_t avl; 88286432Sbapt} width_node_t; 89286432Sbapt 90286432Sbaptstatic int 91286432Sbaptctype_compare(const void *n1, const void *n2) 92286432Sbapt{ 93286432Sbapt const ctype_node_t *c1 = n1; 94286432Sbapt const ctype_node_t *c2 = n2; 95286432Sbapt 96286432Sbapt return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); 97286432Sbapt} 98286432Sbapt 99286432Sbaptvoid 100286432Sbaptinit_ctype(void) 101286432Sbapt{ 102286432Sbapt avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t), 103286432Sbapt offsetof(ctype_node_t, avl)); 104286432Sbapt} 105286432Sbapt 106286432Sbapt 107286432Sbaptstatic void 108286432Sbaptadd_ctype_impl(ctype_node_t *ctn) 109286432Sbapt{ 110286432Sbapt switch (last_kw) { 111286432Sbapt case T_ISUPPER: 112286432Sbapt ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); 113286432Sbapt break; 114286432Sbapt case T_ISLOWER: 115286432Sbapt ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); 116286432Sbapt break; 117286432Sbapt case T_ISALPHA: 118286432Sbapt ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); 119286432Sbapt break; 120286432Sbapt case T_ISDIGIT: 121286432Sbapt ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT); 122286432Sbapt break; 123286432Sbapt case T_ISSPACE: 124286432Sbapt ctn->ctype |= _ISSPACE; 125286432Sbapt break; 126286432Sbapt case T_ISCNTRL: 127286432Sbapt ctn->ctype |= _ISCNTRL; 128286432Sbapt break; 129286432Sbapt case T_ISGRAPH: 130286432Sbapt ctn->ctype |= (_ISGRAPH | _ISPRINT); 131286432Sbapt break; 132286432Sbapt case T_ISPRINT: 133286432Sbapt ctn->ctype |= _ISPRINT; 134286432Sbapt break; 135286432Sbapt case T_ISPUNCT: 136286432Sbapt ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); 137286432Sbapt break; 138286432Sbapt case T_ISXDIGIT: 139286432Sbapt ctn->ctype |= (_ISXDIGIT | _ISPRINT); 140286432Sbapt break; 141286432Sbapt case T_ISBLANK: 142286432Sbapt ctn->ctype |= (_ISBLANK | _ISSPACE); 143286432Sbapt break; 144286432Sbapt case T_ISPHONOGRAM: 145286432Sbapt ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); 146286432Sbapt break; 147286432Sbapt case T_ISIDEOGRAM: 148286432Sbapt ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); 149286432Sbapt break; 150286432Sbapt case T_ISENGLISH: 151286432Sbapt ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); 152286432Sbapt break; 153286432Sbapt case T_ISNUMBER: 154286432Sbapt ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); 155286432Sbapt break; 156286432Sbapt case T_ISSPECIAL: 157286432Sbapt ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); 158286432Sbapt break; 159286432Sbapt case T_ISALNUM: 160286432Sbapt /* 161286432Sbapt * We can't do anything with this. The character 162286432Sbapt * should already be specified as a digit or alpha. 163286432Sbapt */ 164286432Sbapt break; 165286432Sbapt default: 166286432Sbapt errf("not a valid character class"); 167286432Sbapt } 168286432Sbapt} 169286432Sbapt 170286432Sbaptstatic ctype_node_t * 171286432Sbaptget_ctype(wchar_t wc) 172286432Sbapt{ 173286432Sbapt ctype_node_t srch; 174286432Sbapt ctype_node_t *ctn; 175286432Sbapt avl_index_t where; 176286432Sbapt 177286432Sbapt srch.wc = wc; 178286432Sbapt if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) { 179286432Sbapt if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { 180286432Sbapt errf("out of memory"); 181286432Sbapt return (NULL); 182286432Sbapt } 183286432Sbapt ctn->wc = wc; 184286432Sbapt 185286432Sbapt avl_insert(&ctypes, ctn, where); 186286432Sbapt } 187286432Sbapt return (ctn); 188286432Sbapt} 189286432Sbapt 190286432Sbaptvoid 191286432Sbaptadd_ctype(int val) 192286432Sbapt{ 193286432Sbapt ctype_node_t *ctn; 194286432Sbapt 195286432Sbapt if ((ctn = get_ctype(val)) == NULL) { 196286432Sbapt INTERR; 197286432Sbapt return; 198286432Sbapt } 199286432Sbapt add_ctype_impl(ctn); 200286432Sbapt last_ctype = ctn->wc; 201286432Sbapt} 202286432Sbapt 203286432Sbaptvoid 204286432Sbaptadd_ctype_range(int end) 205286432Sbapt{ 206286432Sbapt ctype_node_t *ctn; 207286432Sbapt wchar_t cur; 208286432Sbapt 209286432Sbapt if (end < last_ctype) { 210286432Sbapt errf("malformed character range (%u ... %u))", 211286432Sbapt last_ctype, end); 212286432Sbapt return; 213286432Sbapt } 214286432Sbapt for (cur = last_ctype + 1; cur <= end; cur++) { 215286432Sbapt if ((ctn = get_ctype(cur)) == NULL) { 216286432Sbapt INTERR; 217286432Sbapt return; 218286432Sbapt } 219286432Sbapt add_ctype_impl(ctn); 220286432Sbapt } 221286432Sbapt last_ctype = end; 222286432Sbapt 223286432Sbapt} 224286432Sbapt 225286432Sbapt/* 226286432Sbapt * A word about widths: if the width mask is specified, then libc 227286432Sbapt * unconditionally honors it. Otherwise, it assumes printable 228286432Sbapt * characters have width 1, and non-printable characters have width 229286432Sbapt * -1 (except for NULL which is special with with 0). Hence, we have 230286432Sbapt * no need to inject defaults here -- the "default" unset value of 0 231286432Sbapt * indicates that libc should use its own logic in wcwidth as described. 232286432Sbapt */ 233286432Sbaptvoid 234286432Sbaptadd_width(int wc, int width) 235286432Sbapt{ 236286432Sbapt ctype_node_t *ctn; 237286432Sbapt 238286432Sbapt if ((ctn = get_ctype(wc)) == NULL) { 239286432Sbapt INTERR; 240286432Sbapt return; 241286432Sbapt } 242286432Sbapt ctn->ctype &= ~(_CTYPE_SWM); 243286432Sbapt switch (width) { 244286432Sbapt case 0: 245286432Sbapt ctn->ctype |= _CTYPE_SW0; 246286432Sbapt break; 247286432Sbapt case 1: 248286432Sbapt ctn->ctype |= _CTYPE_SW1; 249286432Sbapt break; 250286432Sbapt case 2: 251286432Sbapt ctn->ctype |= _CTYPE_SW2; 252286432Sbapt break; 253286432Sbapt case 3: 254286432Sbapt ctn->ctype |= _CTYPE_SW3; 255286432Sbapt break; 256286432Sbapt } 257286432Sbapt} 258286432Sbapt 259286432Sbaptvoid 260286432Sbaptadd_width_range(int start, int end, int width) 261286432Sbapt{ 262286432Sbapt for (; start <= end; start++) { 263286432Sbapt add_width(start, width); 264286432Sbapt } 265286432Sbapt} 266286432Sbapt 267286432Sbaptvoid 268286432Sbaptadd_caseconv(int val, int wc) 269286432Sbapt{ 270286432Sbapt ctype_node_t *ctn; 271286432Sbapt 272286432Sbapt ctn = get_ctype(val); 273286432Sbapt if (ctn == NULL) { 274286432Sbapt INTERR; 275286432Sbapt return; 276286432Sbapt } 277286432Sbapt 278286432Sbapt switch (last_kw) { 279286432Sbapt case T_TOUPPER: 280286432Sbapt ctn->toupper = wc; 281286432Sbapt break; 282286432Sbapt case T_TOLOWER: 283286432Sbapt ctn->tolower = wc; 284286432Sbapt break; 285286432Sbapt default: 286286432Sbapt INTERR; 287286432Sbapt break; 288286432Sbapt } 289286432Sbapt} 290286432Sbapt 291286432Sbaptvoid 292286432Sbaptdump_ctype(void) 293286432Sbapt{ 294286432Sbapt FILE *f; 295286432Sbapt _FileRuneLocale rl; 296286432Sbapt ctype_node_t *ctn, *last_ct, *last_lo, *last_up; 297286432Sbapt _FileRuneEntry *ct = NULL; 298286432Sbapt _FileRuneEntry *lo = NULL; 299286432Sbapt _FileRuneEntry *up = NULL; 300286432Sbapt wchar_t wc; 301286432Sbapt 302286432Sbapt (void) memset(&rl, 0, sizeof (rl)); 303286432Sbapt last_ct = NULL; 304286432Sbapt last_lo = NULL; 305286432Sbapt last_up = NULL; 306286432Sbapt 307286432Sbapt if ((f = open_category()) == NULL) 308286432Sbapt return; 309286432Sbapt 310286432Sbapt (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); 311286432Sbapt (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); 312286432Sbapt 313286432Sbapt /* 314286432Sbapt * Initialize the identity map. 315286432Sbapt */ 316286432Sbapt for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { 317286432Sbapt rl.maplower[wc] = wc; 318286432Sbapt rl.mapupper[wc] = wc; 319286432Sbapt } 320286432Sbapt 321286432Sbapt for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) { 322286432Sbapt int conflict = 0; 323286432Sbapt 324286432Sbapt 325286432Sbapt wc = ctn->wc; 326286432Sbapt 327286432Sbapt /* 328286432Sbapt * POSIX requires certain portable characters have 329286432Sbapt * certain types. Add them if they are missing. 330286432Sbapt */ 331286432Sbapt if ((wc >= 1) && (wc <= 127)) { 332286432Sbapt if ((wc >= 'A') && (wc <= 'Z')) 333286432Sbapt ctn->ctype |= _ISUPPER; 334286432Sbapt if ((wc >= 'a') && (wc <= 'z')) 335286432Sbapt ctn->ctype |= _ISLOWER; 336286432Sbapt if ((wc >= '0') && (wc <= '9')) 337286432Sbapt ctn->ctype |= _ISDIGIT; 338286432Sbapt if (strchr(" \f\n\r\t\v", (char)wc) != NULL) 339286432Sbapt ctn->ctype |= _ISSPACE; 340286432Sbapt if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) 341286432Sbapt ctn->ctype |= _ISXDIGIT; 342286432Sbapt if (strchr(" \t", (char)wc)) 343286432Sbapt ctn->ctype |= _ISBLANK; 344286432Sbapt 345286432Sbapt /* 346286432Sbapt * Technically these settings are only 347286432Sbapt * required for the C locale. However, it 348286432Sbapt * turns out that because of the historical 349286432Sbapt * version of isprint(), we need them for all 350286432Sbapt * locales as well. Note that these are not 351286432Sbapt * necessarily valid punctation characters in 352286432Sbapt * the current language, but ispunct() needs 353286432Sbapt * to return TRUE for them. 354286432Sbapt */ 355286432Sbapt if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", 356286432Sbapt (char)wc)) 357286432Sbapt ctn->ctype |= _ISPUNCT; 358286432Sbapt } 359286432Sbapt 360286432Sbapt /* 361286432Sbapt * POSIX also requires that certain types imply 362286432Sbapt * others. Add any inferred types here. 363286432Sbapt */ 364286432Sbapt if (ctn->ctype & (_ISUPPER |_ISLOWER)) 365286432Sbapt ctn->ctype |= _ISALPHA; 366286432Sbapt if (ctn->ctype & _ISDIGIT) 367286432Sbapt ctn->ctype |= _ISXDIGIT; 368286432Sbapt if (ctn->ctype & _ISBLANK) 369286432Sbapt ctn->ctype |= _ISSPACE; 370286432Sbapt if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) 371286432Sbapt ctn->ctype |= _ISGRAPH; 372286432Sbapt if (ctn->ctype & _ISGRAPH) 373286432Sbapt ctn->ctype |= _ISPRINT; 374286432Sbapt 375286432Sbapt /* 376286432Sbapt * Finally, POSIX requires that certain combinations 377286432Sbapt * are invalid. We don't flag this as a fatal error, 378286432Sbapt * but we will warn about. 379286432Sbapt */ 380286432Sbapt if ((ctn->ctype & _ISALPHA) && 381286432Sbapt (ctn->ctype & (_ISPUNCT|_ISDIGIT))) 382286432Sbapt conflict++; 383286432Sbapt if ((ctn->ctype & _ISPUNCT) & 384286432Sbapt (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) 385286432Sbapt conflict++; 386286432Sbapt if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) 387286432Sbapt conflict++; 388286432Sbapt if ((ctn->ctype & _ISCNTRL) & _ISPRINT) 389286432Sbapt conflict++; 390286432Sbapt if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) 391286432Sbapt conflict++; 392286432Sbapt 393286432Sbapt if (conflict) { 394286432Sbapt warn("conflicting classes for character 0x%x (%x)", 395286432Sbapt wc, ctn->ctype); 396286432Sbapt } 397286432Sbapt /* 398286432Sbapt * Handle the lower 256 characters using the simple 399286432Sbapt * optimization. Note that if we have not defined the 400286432Sbapt * upper/lower case, then we identity map it. 401286432Sbapt */ 402286432Sbapt if ((unsigned)wc < _CACHED_RUNES) { 403286432Sbapt rl.runetype[wc] = ctn->ctype; 404286432Sbapt if (ctn->tolower) 405286432Sbapt rl.maplower[wc] = ctn->tolower; 406286432Sbapt if (ctn->toupper) 407286432Sbapt rl.mapupper[wc] = ctn->toupper; 408286432Sbapt continue; 409286432Sbapt } 410286432Sbapt 411286432Sbapt if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) { 412286432Sbapt ct[rl.runetype_ext_nranges-1].max = wc; 413286432Sbapt last_ct = ctn; 414286432Sbapt } else { 415286432Sbapt rl.runetype_ext_nranges++; 416286432Sbapt ct = realloc(ct, 417286432Sbapt sizeof (*ct) * rl.runetype_ext_nranges); 418286432Sbapt ct[rl.runetype_ext_nranges - 1].min = wc; 419286432Sbapt ct[rl.runetype_ext_nranges - 1].max = wc; 420286432Sbapt ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; 421286432Sbapt last_ct = ctn; 422286432Sbapt } 423286432Sbapt if (ctn->tolower == 0) { 424286432Sbapt last_lo = NULL; 425286432Sbapt } else if ((last_lo != NULL) && 426286432Sbapt (last_lo->tolower + 1 == ctn->tolower)) { 427286432Sbapt lo[rl.maplower_ext_nranges-1].max = wc; 428286432Sbapt last_lo = ctn; 429286432Sbapt } else { 430286432Sbapt rl.maplower_ext_nranges++; 431286432Sbapt lo = realloc(lo, 432286432Sbapt sizeof (*lo) * rl.maplower_ext_nranges); 433286432Sbapt lo[rl.maplower_ext_nranges - 1].min = wc; 434286432Sbapt lo[rl.maplower_ext_nranges - 1].max = wc; 435286432Sbapt lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; 436286432Sbapt last_lo = ctn; 437286432Sbapt } 438286432Sbapt 439286432Sbapt if (ctn->toupper == 0) { 440286432Sbapt last_up = NULL; 441286432Sbapt } else if ((last_up != NULL) && 442286432Sbapt (last_up->toupper + 1 == ctn->toupper)) { 443286432Sbapt up[rl.mapupper_ext_nranges-1].max = wc; 444286432Sbapt last_up = ctn; 445286432Sbapt } else { 446286432Sbapt rl.mapupper_ext_nranges++; 447286432Sbapt up = realloc(up, 448286432Sbapt sizeof (*up) * rl.mapupper_ext_nranges); 449286432Sbapt up[rl.mapupper_ext_nranges - 1].min = wc; 450286432Sbapt up[rl.mapupper_ext_nranges - 1].max = wc; 451286432Sbapt up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; 452286432Sbapt last_up = ctn; 453286432Sbapt } 454286432Sbapt } 455286432Sbapt 456286432Sbapt if ((wr_category(&rl, sizeof (rl), f) < 0) || 457286432Sbapt (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || 458286432Sbapt (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || 459286432Sbapt (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { 460286432Sbapt return; 461286432Sbapt } 462286432Sbapt 463286432Sbapt close_category(f); 464286432Sbapt} 465