1169695Skan/* CPP Library - charsets 2169695Skan Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004 3169695Skan Free Software Foundation, Inc. 4169695Skan 5169695Skan Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges. 6169695Skan 7169695SkanThis program is free software; you can redistribute it and/or modify it 8169695Skanunder the terms of the GNU General Public License as published by the 9169695SkanFree Software Foundation; either version 2, or (at your option) any 10169695Skanlater version. 11169695Skan 12169695SkanThis program is distributed in the hope that it will be useful, 13169695Skanbut WITHOUT ANY WARRANTY; without even the implied warranty of 14169695SkanMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15169695SkanGNU General Public License for more details. 16169695Skan 17169695SkanYou should have received a copy of the GNU General Public License 18169695Skanalong with this program; if not, write to the Free Software 19169695SkanFoundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 20169695Skan 21169695Skan#include "config.h" 22169695Skan#include "system.h" 23169695Skan#include "cpplib.h" 24169695Skan#include "internal.h" 25169695Skan 26169695Skan/* Character set handling for C-family languages. 27169695Skan 28169695Skan Terminological note: In what follows, "charset" or "character set" 29169695Skan will be taken to mean both an abstract set of characters and an 30169695Skan encoding for that set. 31169695Skan 32169695Skan The C99 standard discusses two character sets: source and execution. 33169695Skan The source character set is used for internal processing in translation 34169695Skan phases 1 through 4; the execution character set is used thereafter. 35169695Skan Both are required by 5.2.1.2p1 to be multibyte encodings, not wide 36169695Skan character encodings (see 3.7.2, 3.7.3 for the standardese meanings 37169695Skan of these terms). 
Furthermore, the "basic character set" (listed in 38169695Skan 5.2.1p3) is to be encoded in each with values one byte wide, and is 39169695Skan to appear in the initial shift state. 40169695Skan 41169695Skan It is not explicitly mentioned, but there is also a "wide execution 42169695Skan character set" used to encode wide character constants and wide 43169695Skan string literals; this is supposed to be the result of applying the 44169695Skan standard library function mbstowcs() to an equivalent narrow string 45169695Skan (6.4.5p5). However, the behavior of hexadecimal and octal 46169695Skan \-escapes is at odds with this; they are supposed to be translated 47169695Skan directly to wchar_t values (6.4.4.4p5,6). 48169695Skan 49169695Skan The source character set is not necessarily the character set used 50169695Skan to encode physical source files on disk; translation phase 1 converts 51169695Skan from whatever that encoding is to the source character set. 52169695Skan 53169695Skan The presence of universal character names in C99 (6.4.3 et seq.) 54169695Skan forces the source character set to be isomorphic to ISO 10646, 55169695Skan that is, Unicode. There is no such constraint on the execution 56169695Skan character set; note also that the conversion from source to 57169695Skan execution character set does not occur for identifiers (5.1.1.2p1#5). 58169695Skan 59169695Skan For convenience of implementation, the source character set's 60169695Skan encoding of the basic character set should be identical to the 61169695Skan execution character set OF THE HOST SYSTEM's encoding of the basic 62169695Skan character set, and it should not be a state-dependent encoding. 63169695Skan 64169695Skan cpplib uses UTF-8 or UTF-EBCDIC for the source character set, 65169695Skan depending on whether the host is based on ASCII or EBCDIC (see 66169695Skan respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode 67169695Skan Technical Report #16). 
With limited exceptions, it relies on the
   system library's iconv() primitive to do charset conversion
   (specified in SUSv2).  */

#if !HAVE_ICONV
/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
   below, which are guarded only by if statements with compile-time
   constant conditions, do not cause link errors.  The stubs behave as
   a failing call would: they set errno to EINVAL and yield the
   corresponding failure value.  */
#define iconv_open(x, y)      (errno = EINVAL, (iconv_t)-1)
#define iconv(a,b,c,d,e)      (errno = EINVAL, (size_t)-1)
#define iconv_close(x)        (void)0
#define ICONV_CONST
#endif

/* Name of the source character set, and the largest byte value that
   could possibly be a member of the basic source character set in it.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
#error "Unrecognized basic host character set"
#endif

/* Hosts without EILSEQ report ill-formed input as EINVAL instead.  */
#ifndef EILSEQ
#define EILSEQ EINVAL
#endif

/* This structure is used for a resizable string buffer throughout.  */
/* Don't call it strbuf, as that conflicts with unistd.h on systems
   such as DYNIX/ptx where unistd.h includes stropts.h.  */
struct _cpp_strbuf
{
  /* Start of the buffer storage (resized by the converters below).  */
  uchar *text;
  /* Number of bytes currently allocated at TEXT.  */
  size_t asize;
  /* Number of bytes of TEXT already in use; output is appended here.  */
  size_t len;
};

/* This is enough to hold any string that fits on a single 80-column
   line, even if iconv quadruples its size (e.g. conversion from
   ASCII to UTF-32) rounded up to a power of two.  It is also the
   increment by which output buffers are grown.  */
#define OUTBUF_BLOCK_SIZE 256

/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
   logic.
This is because a depressing number of systems lack iconv,
   or have iconv libraries that do not do these conversions, so
   we need a fallback implementation for them.  To ensure the fallback
   doesn't break due to neglect, it is used on all systems.

   UTF-32 encoding is nice and simple: a four-byte binary number,
   constrained to the range 00000000-7FFFFFFF to avoid questions of
   signedness.  We do have to cope with big- and little-endian
   variants.

   UTF-16 encoding uses two-byte binary numbers, again in big- and
   little-endian variants, for all values in the 00000000-0000FFFF
   range.  Values in the 00010000-0010FFFF range are encoded as pairs
   of two-byte numbers, called "surrogate pairs": given a number S in
   this range, it is mapped to a pair (H, L) as follows:

     H = (S - 0x10000) / 0x400 + 0xD800
     L = (S - 0x10000) % 0x400 + 0xDC00

   Two-byte values in the D800...DFFF range are ill-formed except as a
   component of a surrogate pair.  Even if the encoding within a
   two-byte value is little-endian, the H member of the surrogate pair
   comes first.

   There is no way to encode values in the 00110000-7FFFFFFF range,
   which is not currently a problem as there are no assigned code
   points in that range; however, the author expects that it will
   eventually become necessary to abandon UTF-16 due to this
   limitation.  Note also that, because of these pairs, UTF-16 does
   not meet the requirements of the C standard for a wide character
   encoding (see 3.7.3 and 6.4.4.4p11).
142169695Skan 143169695Skan UTF-8 encoding looks like this: 144169695Skan 145169695Skan value range encoded as 146169695Skan 00000000-0000007F 0xxxxxxx 147169695Skan 00000080-000007FF 110xxxxx 10xxxxxx 148169695Skan 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx 149169695Skan 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 150169695Skan 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 151169695Skan 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 152169695Skan 153169695Skan Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid, 154169695Skan which means that three-byte sequences ED xx yy, with A0 <= xx <= BF, 155169695Skan never occur. Note also that any value that can be encoded by a 156169695Skan given row of the table can also be encoded by all successive rows, 157169695Skan but this is not done; only the shortest possible encoding for any 158169695Skan given value is valid. For instance, the character 07C0 could be 159169695Skan encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or 160169695Skan FC 80 80 80 9F 80. Only the first is valid. 161169695Skan 162169695Skan An implementation note: the transformation from UTF-16 to UTF-8, or 163169695Skan vice versa, is easiest done by using UTF-32 as an intermediary. */ 164169695Skan 165169695Skan/* Internal primitives which go from an UTF-8 byte stream to native-endian 166169695Skan UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal 167169695Skan operation in several places below. 
*/ 168169695Skanstatic inline int 169169695Skanone_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp, 170169695Skan cppchar_t *cp) 171169695Skan{ 172169695Skan static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 }; 173169695Skan static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 174169695Skan 175169695Skan cppchar_t c; 176169695Skan const uchar *inbuf = *inbufp; 177169695Skan size_t nbytes, i; 178169695Skan 179169695Skan if (*inbytesleftp < 1) 180169695Skan return EINVAL; 181169695Skan 182169695Skan c = *inbuf; 183169695Skan if (c < 0x80) 184169695Skan { 185169695Skan *cp = c; 186169695Skan *inbytesleftp -= 1; 187169695Skan *inbufp += 1; 188169695Skan return 0; 189169695Skan } 190169695Skan 191169695Skan /* The number of leading 1-bits in the first byte indicates how many 192169695Skan bytes follow. */ 193169695Skan for (nbytes = 2; nbytes < 7; nbytes++) 194169695Skan if ((c & ~masks[nbytes-1]) == patns[nbytes-1]) 195169695Skan goto found; 196169695Skan return EILSEQ; 197169695Skan found: 198169695Skan 199169695Skan if (*inbytesleftp < nbytes) 200169695Skan return EINVAL; 201169695Skan 202169695Skan c = (c & masks[nbytes-1]); 203169695Skan inbuf++; 204169695Skan for (i = 1; i < nbytes; i++) 205169695Skan { 206169695Skan cppchar_t n = *inbuf++; 207169695Skan if ((n & 0xC0) != 0x80) 208169695Skan return EILSEQ; 209169695Skan c = ((c << 6) + (n & 0x3F)); 210169695Skan } 211169695Skan 212169695Skan /* Make sure the shortest possible encoding was used. */ 213169695Skan if (c <= 0x7F && nbytes > 1) return EILSEQ; 214169695Skan if (c <= 0x7FF && nbytes > 2) return EILSEQ; 215169695Skan if (c <= 0xFFFF && nbytes > 3) return EILSEQ; 216169695Skan if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ; 217169695Skan if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ; 218169695Skan 219169695Skan /* Make sure the character is valid. 
*/ 220169695Skan if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ; 221169695Skan 222169695Skan *cp = c; 223169695Skan *inbufp = inbuf; 224169695Skan *inbytesleftp -= nbytes; 225169695Skan return 0; 226169695Skan} 227169695Skan 228169695Skanstatic inline int 229169695Skanone_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp) 230169695Skan{ 231169695Skan static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 232169695Skan static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE }; 233169695Skan size_t nbytes; 234169695Skan uchar buf[6], *p = &buf[6]; 235169695Skan uchar *outbuf = *outbufp; 236169695Skan 237169695Skan nbytes = 1; 238169695Skan if (c < 0x80) 239169695Skan *--p = c; 240169695Skan else 241169695Skan { 242169695Skan do 243169695Skan { 244169695Skan *--p = ((c & 0x3F) | 0x80); 245169695Skan c >>= 6; 246169695Skan nbytes++; 247169695Skan } 248169695Skan while (c >= 0x3F || (c & limits[nbytes-1])); 249169695Skan *--p = (c | masks[nbytes-1]); 250169695Skan } 251169695Skan 252169695Skan if (*outbytesleftp < nbytes) 253169695Skan return E2BIG; 254169695Skan 255169695Skan while (p < &buf[6]) 256169695Skan *outbuf++ = *p++; 257169695Skan *outbytesleftp -= nbytes; 258169695Skan *outbufp = outbuf; 259169695Skan return 0; 260169695Skan} 261169695Skan 262169695Skan/* The following four functions transform one character between the two 263169695Skan encodings named in the function name. All have the signature 264169695Skan int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp, 265169695Skan uchar **outbufp, size_t *outbytesleftp) 266169695Skan 267169695Skan BIGEND must have the value 0 or 1, coerced to (iconv_t); it is 268169695Skan interpreted as a boolean indicating whether big-endian or 269169695Skan little-endian encoding is to be used for the member of the pair 270169695Skan that is not UTF-8. 
271169695Skan 272169695Skan INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they 273169695Skan do for iconv. 274169695Skan 275169695Skan The return value is either 0 for success, or an errno value for 276169695Skan failure, which may be E2BIG (need more space), EILSEQ (ill-formed 277169695Skan input sequence), ir EINVAL (incomplete input sequence). */ 278169695Skan 279169695Skanstatic inline int 280169695Skanone_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp, 281169695Skan uchar **outbufp, size_t *outbytesleftp) 282169695Skan{ 283169695Skan uchar *outbuf; 284169695Skan cppchar_t s = 0; 285169695Skan int rval; 286169695Skan 287169695Skan /* Check for space first, since we know exactly how much we need. */ 288169695Skan if (*outbytesleftp < 4) 289169695Skan return E2BIG; 290169695Skan 291169695Skan rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s); 292169695Skan if (rval) 293169695Skan return rval; 294169695Skan 295169695Skan outbuf = *outbufp; 296169695Skan outbuf[bigend ? 3 : 0] = (s & 0x000000FF); 297169695Skan outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8; 298169695Skan outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16; 299169695Skan outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24; 300169695Skan 301169695Skan *outbufp += 4; 302169695Skan *outbytesleftp -= 4; 303169695Skan return 0; 304169695Skan} 305169695Skan 306169695Skanstatic inline int 307169695Skanone_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp, 308169695Skan uchar **outbufp, size_t *outbytesleftp) 309169695Skan{ 310169695Skan cppchar_t s; 311169695Skan int rval; 312169695Skan const uchar *inbuf; 313169695Skan 314169695Skan if (*inbytesleftp < 4) 315169695Skan return EINVAL; 316169695Skan 317169695Skan inbuf = *inbufp; 318169695Skan 319169695Skan s = inbuf[bigend ? 0 : 3] << 24; 320169695Skan s += inbuf[bigend ? 1 : 2] << 16; 321169695Skan s += inbuf[bigend ? 2 : 1] << 8; 322169695Skan s += inbuf[bigend ? 
3 : 0]; 323169695Skan 324169695Skan if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF)) 325169695Skan return EILSEQ; 326169695Skan 327169695Skan rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp); 328169695Skan if (rval) 329169695Skan return rval; 330169695Skan 331169695Skan *inbufp += 4; 332169695Skan *inbytesleftp -= 4; 333169695Skan return 0; 334169695Skan} 335169695Skan 336169695Skanstatic inline int 337169695Skanone_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp, 338169695Skan uchar **outbufp, size_t *outbytesleftp) 339169695Skan{ 340169695Skan int rval; 341169695Skan cppchar_t s = 0; 342169695Skan const uchar *save_inbuf = *inbufp; 343169695Skan size_t save_inbytesleft = *inbytesleftp; 344169695Skan uchar *outbuf = *outbufp; 345169695Skan 346169695Skan rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s); 347169695Skan if (rval) 348169695Skan return rval; 349169695Skan 350169695Skan if (s > 0x0010FFFF) 351169695Skan { 352169695Skan *inbufp = save_inbuf; 353169695Skan *inbytesleftp = save_inbytesleft; 354169695Skan return EILSEQ; 355169695Skan } 356169695Skan 357169695Skan if (s < 0xFFFF) 358169695Skan { 359169695Skan if (*outbytesleftp < 2) 360169695Skan { 361169695Skan *inbufp = save_inbuf; 362169695Skan *inbytesleftp = save_inbytesleft; 363169695Skan return E2BIG; 364169695Skan } 365169695Skan outbuf[bigend ? 1 : 0] = (s & 0x00FF); 366169695Skan outbuf[bigend ? 
0 : 1] = (s & 0xFF00) >> 8; 367169695Skan 368169695Skan *outbufp += 2; 369169695Skan *outbytesleftp -= 2; 370169695Skan return 0; 371169695Skan } 372169695Skan else 373169695Skan { 374169695Skan cppchar_t hi, lo; 375169695Skan 376169695Skan if (*outbytesleftp < 4) 377169695Skan { 378169695Skan *inbufp = save_inbuf; 379169695Skan *inbytesleftp = save_inbytesleft; 380169695Skan return E2BIG; 381169695Skan } 382169695Skan 383169695Skan hi = (s - 0x10000) / 0x400 + 0xD800; 384169695Skan lo = (s - 0x10000) % 0x400 + 0xDC00; 385169695Skan 386169695Skan /* Even if we are little-endian, put the high surrogate first. 387169695Skan ??? Matches practice? */ 388169695Skan outbuf[bigend ? 1 : 0] = (hi & 0x00FF); 389169695Skan outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8; 390169695Skan outbuf[bigend ? 3 : 2] = (lo & 0x00FF); 391169695Skan outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8; 392169695Skan 393169695Skan *outbufp += 4; 394169695Skan *outbytesleftp -= 4; 395169695Skan return 0; 396169695Skan } 397169695Skan} 398169695Skan 399169695Skanstatic inline int 400169695Skanone_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp, 401169695Skan uchar **outbufp, size_t *outbytesleftp) 402169695Skan{ 403169695Skan cppchar_t s; 404169695Skan const uchar *inbuf = *inbufp; 405169695Skan int rval; 406169695Skan 407169695Skan if (*inbytesleftp < 2) 408169695Skan return EINVAL; 409169695Skan s = inbuf[bigend ? 0 : 1] << 8; 410169695Skan s += inbuf[bigend ? 1 : 0]; 411169695Skan 412169695Skan /* Low surrogate without immediately preceding high surrogate is invalid. */ 413169695Skan if (s >= 0xDC00 && s <= 0xDFFF) 414169695Skan return EILSEQ; 415169695Skan /* High surrogate must have a following low surrogate. */ 416169695Skan else if (s >= 0xD800 && s <= 0xDBFF) 417169695Skan { 418169695Skan cppchar_t hi = s, lo; 419169695Skan if (*inbytesleftp < 4) 420169695Skan return EINVAL; 421169695Skan 422169695Skan lo = inbuf[bigend ? 2 : 3] << 8; 423169695Skan lo += inbuf[bigend ? 
3 : 2]; 424169695Skan 425169695Skan if (lo < 0xDC00 || lo > 0xDFFF) 426169695Skan return EILSEQ; 427169695Skan 428169695Skan s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000; 429169695Skan } 430169695Skan 431169695Skan rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp); 432169695Skan if (rval) 433169695Skan return rval; 434169695Skan 435169695Skan /* Success - update the input pointers (one_cppchar_to_utf8 has done 436169695Skan the output pointers for us). */ 437169695Skan if (s <= 0xFFFF) 438169695Skan { 439169695Skan *inbufp += 2; 440169695Skan *inbytesleftp -= 2; 441169695Skan } 442169695Skan else 443169695Skan { 444169695Skan *inbufp += 4; 445169695Skan *inbytesleftp -= 4; 446169695Skan } 447169695Skan return 0; 448169695Skan} 449169695Skan 450169695Skan/* Helper routine for the next few functions. The 'const' on 451169695Skan one_conversion means that we promise not to modify what function is 452169695Skan pointed to, which lets the inliner see through it. */ 453169695Skan 454169695Skanstatic inline bool 455169695Skanconversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *, 456169695Skan uchar **, size_t *), 457169695Skan iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to) 458169695Skan{ 459169695Skan const uchar *inbuf; 460169695Skan uchar *outbuf; 461169695Skan size_t inbytesleft, outbytesleft; 462169695Skan int rval; 463169695Skan 464169695Skan inbuf = from; 465169695Skan inbytesleft = flen; 466169695Skan outbuf = to->text + to->len; 467169695Skan outbytesleft = to->asize - to->len; 468169695Skan 469169695Skan for (;;) 470169695Skan { 471169695Skan do 472169695Skan rval = one_conversion (cd, &inbuf, &inbytesleft, 473169695Skan &outbuf, &outbytesleft); 474169695Skan while (inbytesleft && !rval); 475169695Skan 476169695Skan if (__builtin_expect (inbytesleft == 0, 1)) 477169695Skan { 478169695Skan to->len = to->asize - outbytesleft; 479169695Skan return true; 480169695Skan } 481169695Skan if (rval != E2BIG) 
482169695Skan { 483169695Skan errno = rval; 484169695Skan return false; 485169695Skan } 486169695Skan 487169695Skan outbytesleft += OUTBUF_BLOCK_SIZE; 488169695Skan to->asize += OUTBUF_BLOCK_SIZE; 489169695Skan to->text = XRESIZEVEC (uchar, to->text, to->asize); 490169695Skan outbuf = to->text + to->asize - outbytesleft; 491169695Skan } 492169695Skan} 493169695Skan 494169695Skan 495169695Skan/* These functions convert entire strings between character sets. 496169695Skan They all have the signature 497169695Skan 498169695Skan bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to); 499169695Skan 500169695Skan The input string FROM is converted as specified by the function 501169695Skan name plus the iconv descriptor CD (which may be fake), and the 502169695Skan result appended to TO. On any error, false is returned, otherwise true. */ 503169695Skan 504169695Skan/* These four use the custom conversion code above. */ 505169695Skanstatic bool 506169695Skanconvert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen, 507169695Skan struct _cpp_strbuf *to) 508169695Skan{ 509169695Skan return conversion_loop (one_utf8_to_utf16, cd, from, flen, to); 510169695Skan} 511169695Skan 512169695Skanstatic bool 513169695Skanconvert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen, 514169695Skan struct _cpp_strbuf *to) 515169695Skan{ 516169695Skan return conversion_loop (one_utf8_to_utf32, cd, from, flen, to); 517169695Skan} 518169695Skan 519169695Skanstatic bool 520169695Skanconvert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen, 521169695Skan struct _cpp_strbuf *to) 522169695Skan{ 523169695Skan return conversion_loop (one_utf16_to_utf8, cd, from, flen, to); 524169695Skan} 525169695Skan 526169695Skanstatic bool 527169695Skanconvert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen, 528169695Skan struct _cpp_strbuf *to) 529169695Skan{ 530169695Skan return conversion_loop (one_utf32_to_utf8, cd, from, flen, to); 531169695Skan} 532169695Skan 
533169695Skan/* Identity conversion, used when we have no alternative. */ 534169695Skanstatic bool 535169695Skanconvert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED, 536169695Skan const uchar *from, size_t flen, struct _cpp_strbuf *to) 537169695Skan{ 538169695Skan if (to->len + flen > to->asize) 539169695Skan { 540169695Skan to->asize = to->len + flen; 541169695Skan to->text = XRESIZEVEC (uchar, to->text, to->asize); 542169695Skan } 543169695Skan memcpy (to->text + to->len, from, flen); 544169695Skan to->len += flen; 545169695Skan return true; 546169695Skan} 547169695Skan 548169695Skan/* And this one uses the system iconv primitive. It's a little 549169695Skan different, since iconv's interface is a little different. */ 550169695Skan#if HAVE_ICONV 551169695Skanstatic bool 552169695Skanconvert_using_iconv (iconv_t cd, const uchar *from, size_t flen, 553169695Skan struct _cpp_strbuf *to) 554169695Skan{ 555169695Skan ICONV_CONST char *inbuf; 556169695Skan char *outbuf; 557169695Skan size_t inbytesleft, outbytesleft; 558169695Skan 559169695Skan /* Reset conversion descriptor and check that it is valid. 
*/ 560169695Skan if (iconv (cd, 0, 0, 0, 0) == (size_t)-1) 561169695Skan return false; 562169695Skan 563169695Skan inbuf = (ICONV_CONST char *)from; 564169695Skan inbytesleft = flen; 565169695Skan outbuf = (char *)to->text + to->len; 566169695Skan outbytesleft = to->asize - to->len; 567169695Skan 568169695Skan for (;;) 569169695Skan { 570169695Skan iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); 571169695Skan if (__builtin_expect (inbytesleft == 0, 1)) 572169695Skan { 573169695Skan to->len = to->asize - outbytesleft; 574169695Skan return true; 575169695Skan } 576169695Skan if (errno != E2BIG) 577169695Skan return false; 578169695Skan 579169695Skan outbytesleft += OUTBUF_BLOCK_SIZE; 580169695Skan to->asize += OUTBUF_BLOCK_SIZE; 581169695Skan to->text = XRESIZEVEC (uchar, to->text, to->asize); 582169695Skan outbuf = (char *)to->text + to->asize - outbytesleft; 583169695Skan } 584169695Skan} 585169695Skan#else 586169695Skan#define convert_using_iconv 0 /* prevent undefined symbol error below */ 587169695Skan#endif 588169695Skan 589169695Skan/* Arrange for the above custom conversion logic to be used automatically 590169695Skan when conversion between a suitable pair of character sets is requested. 
*/ 591169695Skan 592169695Skan#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \ 593169695Skan CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO) 594169695Skan 595169695Skanstruct conversion 596169695Skan{ 597169695Skan const char *pair; 598169695Skan convert_f func; 599169695Skan iconv_t fake_cd; 600169695Skan}; 601169695Skanstatic const struct conversion conversion_tab[] = { 602169695Skan { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 }, 603169695Skan { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 }, 604169695Skan { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 }, 605169695Skan { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 }, 606169695Skan { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 }, 607169695Skan { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 }, 608169695Skan { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 }, 609169695Skan { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 }, 610169695Skan}; 611169695Skan 612169695Skan/* Subroutine of cpp_init_iconv: initialize and return a 613169695Skan cset_converter structure for conversion from FROM to TO. If 614169695Skan iconv_open() fails, issue an error and return an identity 615169695Skan converter. Silently return an identity converter if FROM and TO 616169695Skan are identical. 
*/ 617169695Skanstatic struct cset_converter 618169695Skaninit_iconv_desc (cpp_reader *pfile, const char *to, const char *from) 619169695Skan{ 620169695Skan struct cset_converter ret; 621169695Skan char *pair; 622169695Skan size_t i; 623169695Skan 624169695Skan if (!strcasecmp (to, from)) 625169695Skan { 626169695Skan ret.func = convert_no_conversion; 627169695Skan ret.cd = (iconv_t) -1; 628169695Skan return ret; 629169695Skan } 630169695Skan 631169695Skan pair = (char *) alloca(strlen(to) + strlen(from) + 2); 632169695Skan 633169695Skan strcpy(pair, from); 634169695Skan strcat(pair, "/"); 635169695Skan strcat(pair, to); 636169695Skan for (i = 0; i < ARRAY_SIZE (conversion_tab); i++) 637169695Skan if (!strcasecmp (pair, conversion_tab[i].pair)) 638169695Skan { 639169695Skan ret.func = conversion_tab[i].func; 640169695Skan ret.cd = conversion_tab[i].fake_cd; 641169695Skan return ret; 642169695Skan } 643169695Skan 644169695Skan /* No custom converter - try iconv. */ 645169695Skan if (HAVE_ICONV) 646169695Skan { 647169695Skan ret.func = convert_using_iconv; 648169695Skan ret.cd = iconv_open (to, from); 649169695Skan 650169695Skan if (ret.cd == (iconv_t) -1) 651169695Skan { 652169695Skan if (errno == EINVAL) 653169695Skan cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */ 654169695Skan "conversion from %s to %s not supported by iconv", 655169695Skan from, to); 656169695Skan else 657169695Skan cpp_errno (pfile, CPP_DL_ERROR, "iconv_open"); 658169695Skan 659169695Skan ret.func = convert_no_conversion; 660169695Skan } 661169695Skan } 662169695Skan else 663169695Skan { 664169695Skan cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ 665169695Skan "no iconv implementation, cannot convert from %s to %s", 666169695Skan from, to); 667169695Skan ret.func = convert_no_conversion; 668169695Skan ret.cd = (iconv_t) -1; 669169695Skan } 670169695Skan return ret; 671169695Skan} 672169695Skan 673169695Skan/* If charset conversion is requested, initialize 
iconv(3) descriptors 674169695Skan for conversion from the source character set to the execution 675169695Skan character sets. If iconv is not present in the C library, and 676169695Skan conversion is requested, issue an error. */ 677169695Skan 678169695Skanvoid 679169695Skancpp_init_iconv (cpp_reader *pfile) 680169695Skan{ 681169695Skan const char *ncset = CPP_OPTION (pfile, narrow_charset); 682169695Skan const char *wcset = CPP_OPTION (pfile, wide_charset); 683169695Skan const char *default_wcset; 684169695Skan 685169695Skan bool be = CPP_OPTION (pfile, bytes_big_endian); 686169695Skan 687169695Skan if (CPP_OPTION (pfile, wchar_precision) >= 32) 688169695Skan default_wcset = be ? "UTF-32BE" : "UTF-32LE"; 689169695Skan else if (CPP_OPTION (pfile, wchar_precision) >= 16) 690169695Skan default_wcset = be ? "UTF-16BE" : "UTF-16LE"; 691169695Skan else 692169695Skan /* This effectively means that wide strings are not supported, 693169695Skan so don't do any conversion at all. */ 694169695Skan default_wcset = SOURCE_CHARSET; 695169695Skan 696169695Skan if (!ncset) 697169695Skan ncset = SOURCE_CHARSET; 698169695Skan if (!wcset) 699169695Skan wcset = default_wcset; 700169695Skan 701169695Skan pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET); 702169695Skan pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET); 703169695Skan} 704169695Skan 705169695Skan/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */ 706169695Skanvoid 707169695Skan_cpp_destroy_iconv (cpp_reader *pfile) 708169695Skan{ 709169695Skan if (HAVE_ICONV) 710169695Skan { 711169695Skan if (pfile->narrow_cset_desc.func == convert_using_iconv) 712169695Skan iconv_close (pfile->narrow_cset_desc.cd); 713169695Skan if (pfile->wide_cset_desc.func == convert_using_iconv) 714169695Skan iconv_close (pfile->wide_cset_desc.cd); 715169695Skan } 716169695Skan} 717169695Skan 718169695Skan/* Utility routine for use by a full compiler. 
C is a character taken 719169695Skan from the *basic* source character set, encoded in the host's 720169695Skan execution encoding. Convert it to (the target's) execution 721169695Skan encoding, and return that value. 722169695Skan 723169695Skan Issues an internal error if C's representation in the narrow 724169695Skan execution character set fails to be a single-byte value (C99 725169695Skan 5.2.1p3: "The representation of each member of the source and 726169695Skan execution character sets shall fit in a byte.") May also issue an 727169695Skan internal error if C fails to be a member of the basic source 728169695Skan character set (testing this exactly is too hard, especially when 729169695Skan the host character set is EBCDIC). */ 730169695Skancppchar_t 731169695Skancpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c) 732169695Skan{ 733169695Skan uchar sbuf[1]; 734169695Skan struct _cpp_strbuf tbuf; 735169695Skan 736169695Skan /* This test is merely an approximation, but it suffices to catch 737169695Skan the most important thing, which is that we don't get handed a 738169695Skan character outside the unibyte range of the host character set. */ 739169695Skan if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR) 740169695Skan { 741169695Skan cpp_error (pfile, CPP_DL_ICE, 742169695Skan "character 0x%lx is not in the basic source character set\n", 743169695Skan (unsigned long)c); 744169695Skan return 0; 745169695Skan } 746169695Skan 747169695Skan /* Being a character in the unibyte range of the host character set, 748169695Skan we can safely splat it into a one-byte buffer and trust that that 749169695Skan is a well-formed string. */ 750169695Skan sbuf[0] = c; 751169695Skan 752169695Skan /* This should never need to reallocate, but just in case... 
*/ 753169695Skan tbuf.asize = 1; 754169695Skan tbuf.text = XNEWVEC (uchar, tbuf.asize); 755169695Skan tbuf.len = 0; 756169695Skan 757169695Skan if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf)) 758169695Skan { 759169695Skan cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set"); 760169695Skan return 0; 761169695Skan } 762169695Skan if (tbuf.len != 1) 763169695Skan { 764169695Skan cpp_error (pfile, CPP_DL_ICE, 765169695Skan "character 0x%lx is not unibyte in execution character set", 766169695Skan (unsigned long)c); 767169695Skan return 0; 768169695Skan } 769169695Skan c = tbuf.text[0]; 770169695Skan free(tbuf.text); 771169695Skan return c; 772169695Skan} 773169695Skan 774169695Skan 775169695Skan 776169695Skan/* Utility routine that computes a mask of the form 0000...111... with 777169695Skan WIDTH 1-bits. */ 778169695Skanstatic inline size_t 779169695Skanwidth_to_mask (size_t width) 780169695Skan{ 781169695Skan width = MIN (width, BITS_PER_CPPCHAR_T); 782169695Skan if (width >= CHAR_BIT * sizeof (size_t)) 783169695Skan return ~(size_t) 0; 784169695Skan else 785169695Skan return ((size_t) 1 << width) - 1; 786169695Skan} 787169695Skan 788169695Skan/* A large table of unicode character information. */ 789169695Skanenum { 790169695Skan /* Valid in a C99 identifier? */ 791169695Skan C99 = 1, 792169695Skan /* Valid in a C99 identifier, but not as the first character? */ 793169695Skan DIG = 2, 794169695Skan /* Valid in a C++ identifier? */ 795169695Skan CXX = 4, 796169695Skan /* NFC representation is not valid in an identifier? */ 797169695Skan CID = 8, 798169695Skan /* Might be valid NFC form? */ 799169695Skan NFC = 16, 800169695Skan /* Might be valid NFKC form? */ 801169695Skan NKC = 32, 802169695Skan /* Certain preceding characters might make it not valid NFC/NKFC form? */ 803169695Skan CTX = 64 804169695Skan}; 805169695Skan 806169695Skanstatic const struct { 807169695Skan /* Bitmap of flags above. 
*/ 808169695Skan unsigned char flags; 809169695Skan /* Combining class of the character. */ 810169695Skan unsigned char combine; 811169695Skan /* Last character in the range described by this entry. */ 812169695Skan unsigned short end; 813169695Skan} ucnranges[] = { 814169695Skan#include "ucnid.h" 815169695Skan}; 816169695Skan 817169695Skan/* Returns 1 if C is valid in an identifier, 2 if C is valid except at 818169695Skan the start of an identifier, and 0 if C is not valid in an 819169695Skan identifier. We assume C has already gone through the checks of 820169695Skan _cpp_valid_ucn. Also update NST for C if returning nonzero. The 821169695Skan algorithm is a simple binary search on the table defined in 822169695Skan ucnid.h. */ 823169695Skan 824169695Skanstatic int 825169695Skanucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c, 826169695Skan struct normalize_state *nst) 827169695Skan{ 828169695Skan int mn, mx, md; 829169695Skan 830169695Skan if (c > 0xFFFF) 831169695Skan return 0; 832169695Skan 833169695Skan mn = 0; 834169695Skan mx = ARRAY_SIZE (ucnranges) - 1; 835169695Skan while (mx != mn) 836169695Skan { 837169695Skan md = (mn + mx) / 2; 838169695Skan if (c <= ucnranges[md].end) 839169695Skan mx = md; 840169695Skan else 841169695Skan mn = md + 1; 842169695Skan } 843169695Skan 844169695Skan /* When -pedantic, we require the character to have been listed by 845169695Skan the standard for the current language. Otherwise, we accept the 846169695Skan union of the acceptable sets for C++98 and C99. */ 847169695Skan if (! (ucnranges[mn].flags & (C99 | CXX))) 848169695Skan return 0; 849169695Skan 850169695Skan if (CPP_PEDANTIC (pfile) 851169695Skan && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99)) 852169695Skan || (CPP_OPTION (pfile, cplusplus) 853169695Skan && !(ucnranges[mn].flags & CXX)))) 854169695Skan return 0; 855169695Skan 856169695Skan /* Update NST. 
*/ 857169695Skan if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class) 858169695Skan nst->level = normalized_none; 859169695Skan else if (ucnranges[mn].flags & CTX) 860169695Skan { 861169695Skan bool safe; 862169695Skan cppchar_t p = nst->previous; 863169695Skan 864169695Skan /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam. */ 865169695Skan if (c == 0x09BE) 866169695Skan safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */ 867169695Skan else if (c == 0x0B3E) 868169695Skan safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */ 869169695Skan else if (c == 0x0BBE) 870169695Skan safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */ 871169695Skan else if (c == 0x0CC2) 872169695Skan safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */ 873169695Skan else if (c == 0x0D3E) 874169695Skan safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */ 875169695Skan /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC, 876169695Skan and are combined algorithmically from a sequence of the form 877169695Skan 1100-1112 1161-1175 11A8-11C2 878169695Skan (if the third is not present, it is treated as 11A7, which is not 879169695Skan really a valid character). 880169695Skan Unfortunately, C99 allows (only) the NFC form, but C++ allows 881169695Skan only the combining characters. */ 882169695Skan else if (c >= 0x1161 && c <= 0x1175) 883169695Skan safe = p < 0x1100 || p > 0x1112; 884169695Skan else if (c >= 0x11A8 && c <= 0x11C2) 885169695Skan safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0); 886169695Skan else 887169695Skan { 888169695Skan /* Uh-oh, someone updated ucnid.h without updating this code. 
*/ 889169695Skan cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c); 890169695Skan safe = true; 891169695Skan } 892169695Skan if (!safe && c < 0x1161) 893169695Skan nst->level = normalized_none; 894169695Skan else if (!safe) 895169695Skan nst->level = MAX (nst->level, normalized_identifier_C); 896169695Skan } 897169695Skan else if (ucnranges[mn].flags & NKC) 898169695Skan ; 899169695Skan else if (ucnranges[mn].flags & NFC) 900169695Skan nst->level = MAX (nst->level, normalized_C); 901169695Skan else if (ucnranges[mn].flags & CID) 902169695Skan nst->level = MAX (nst->level, normalized_identifier_C); 903169695Skan else 904169695Skan nst->level = normalized_none; 905169695Skan nst->previous = c; 906169695Skan nst->prev_class = ucnranges[mn].combine; 907169695Skan 908169695Skan /* In C99, UCN digits may not begin identifiers. */ 909169695Skan if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG)) 910169695Skan return 2; 911169695Skan 912169695Skan return 1; 913169695Skan} 914169695Skan 915169695Skan/* [lex.charset]: The character designated by the universal character 916169695Skan name \UNNNNNNNN is that character whose character short name in 917169695Skan ISO/IEC 10646 is NNNNNNNN; the character designated by the 918169695Skan universal character name \uNNNN is that character whose character 919169695Skan short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value 920169695Skan for a universal character name is less than 0x20 or in the range 921169695Skan 0x7F-0x9F (inclusive), or if the universal character name 922169695Skan designates a character in the basic source character set, then the 923169695Skan program is ill-formed. 924169695Skan 925169695Skan *PSTR must be preceded by "\u" or "\U"; it is assumed that the 926169695Skan buffer end is delimited by a non-hex digit. Returns zero if the 927169695Skan UCN has not been consumed. 
928169695Skan 929169695Skan Otherwise the nonzero value of the UCN, whether valid or invalid, 930169695Skan is returned. Diagnostics are emitted for invalid values. PSTR 931169695Skan is updated to point one beyond the UCN, or to the syntactically 932169695Skan invalid character. 933169695Skan 934169695Skan IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of 935169695Skan an identifier, or 2 otherwise. */ 936169695Skan 937169695Skancppchar_t 938169695Skan_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, 939169695Skan const uchar *limit, int identifier_pos, 940169695Skan struct normalize_state *nst) 941169695Skan{ 942169695Skan cppchar_t result, c; 943169695Skan unsigned int length; 944169695Skan const uchar *str = *pstr; 945169695Skan const uchar *base = str - 2; 946169695Skan 947169695Skan if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99)) 948169695Skan cpp_error (pfile, CPP_DL_WARNING, 949169695Skan "universal character names are only valid in C++ and C99"); 950169695Skan else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0) 951169695Skan cpp_error (pfile, CPP_DL_WARNING, 952169695Skan "the meaning of '\\%c' is different in traditional C", 953169695Skan (int) str[-1]); 954169695Skan 955169695Skan if (str[-1] == 'u') 956169695Skan length = 4; 957169695Skan else if (str[-1] == 'U') 958169695Skan length = 8; 959169695Skan else 960169695Skan { 961169695Skan cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN"); 962169695Skan length = 4; 963169695Skan } 964169695Skan 965169695Skan result = 0; 966169695Skan do 967169695Skan { 968169695Skan c = *str; 969169695Skan if (!ISXDIGIT (c)) 970169695Skan break; 971169695Skan str++; 972169695Skan result = (result << 4) + hex_value (c); 973169695Skan } 974169695Skan while (--length && str < limit); 975169695Skan 976169695Skan /* Partial UCNs are not valid in strings, but decompose into 977169695Skan multiple tokens in identifiers, so we can't give a helpful 978169695Skan error 
message in that case. */ 979169695Skan if (length && identifier_pos) 980169695Skan return 0; 981169695Skan 982169695Skan *pstr = str; 983169695Skan if (length) 984169695Skan { 985169695Skan cpp_error (pfile, CPP_DL_ERROR, 986169695Skan "incomplete universal character name %.*s", 987169695Skan (int) (str - base), base); 988169695Skan result = 1; 989169695Skan } 990169695Skan /* The standard permits $, @ and ` to be specified as UCNs. We use 991169695Skan hex escapes so that this also works with EBCDIC hosts. */ 992169695Skan else if ((result < 0xa0 993169695Skan && (result != 0x24 && result != 0x40 && result != 0x60)) 994169695Skan || (result & 0x80000000) 995169695Skan || (result >= 0xD800 && result <= 0xDFFF)) 996169695Skan { 997169695Skan cpp_error (pfile, CPP_DL_ERROR, 998169695Skan "%.*s is not a valid universal character", 999169695Skan (int) (str - base), base); 1000169695Skan result = 1; 1001169695Skan } 1002169695Skan else if (identifier_pos && result == 0x24 1003169695Skan && CPP_OPTION (pfile, dollars_in_ident)) 1004169695Skan { 1005169695Skan if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping) 1006169695Skan { 1007169695Skan CPP_OPTION (pfile, warn_dollars) = 0; 1008169695Skan cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number"); 1009169695Skan } 1010169695Skan NORMALIZE_STATE_UPDATE_IDNUM (nst); 1011169695Skan } 1012169695Skan else if (identifier_pos) 1013169695Skan { 1014169695Skan int validity = ucn_valid_in_identifier (pfile, result, nst); 1015169695Skan 1016169695Skan if (validity == 0) 1017169695Skan cpp_error (pfile, CPP_DL_ERROR, 1018169695Skan "universal character %.*s is not valid in an identifier", 1019169695Skan (int) (str - base), base); 1020169695Skan else if (validity == 2 && identifier_pos == 1) 1021169695Skan cpp_error (pfile, CPP_DL_ERROR, 1022169695Skan "universal character %.*s is not valid at the start of an identifier", 1023169695Skan (int) (str - base), base); 1024169695Skan } 1025169695Skan 1026169695Skan 
if (result == 0) 1027169695Skan result = 1; 1028169695Skan 1029169695Skan return result; 1030169695Skan} 1031169695Skan 1032169695Skan/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate 1033169695Skan it to the execution character set and write the result into TBUF. 1034169695Skan An advanced pointer is returned. Issues all relevant diagnostics. */ 1035169695Skanstatic const uchar * 1036169695Skanconvert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit, 1037169695Skan struct _cpp_strbuf *tbuf, bool wide) 1038169695Skan{ 1039169695Skan cppchar_t ucn; 1040169695Skan uchar buf[6]; 1041169695Skan uchar *bufp = buf; 1042169695Skan size_t bytesleft = 6; 1043169695Skan int rval; 1044169695Skan struct cset_converter cvt 1045169695Skan = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc; 1046169695Skan struct normalize_state nst = INITIAL_NORMALIZE_STATE; 1047169695Skan 1048169695Skan from++; /* Skip u/U. */ 1049169695Skan ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst); 1050169695Skan 1051169695Skan rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft); 1052169695Skan if (rval) 1053169695Skan { 1054169695Skan errno = rval; 1055169695Skan cpp_errno (pfile, CPP_DL_ERROR, 1056169695Skan "converting UCN to source character set"); 1057169695Skan } 1058169695Skan else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf)) 1059169695Skan cpp_errno (pfile, CPP_DL_ERROR, 1060169695Skan "converting UCN to execution character set"); 1061169695Skan 1062169695Skan return from; 1063169695Skan} 1064169695Skan 1065169695Skan/* Subroutine of convert_hex and convert_oct. N is the representation 1066169695Skan in the execution character set of a numeric escape; write it into the 1067169695Skan string buffer TBUF and update the end-of-string pointer therein. WIDE 1068169695Skan is true if it's a wide string that's being assembled in TBUF. This 1069169695Skan function issues no diagnostics and never fails. 
*/ 1070169695Skanstatic void 1071169695Skanemit_numeric_escape (cpp_reader *pfile, cppchar_t n, 1072169695Skan struct _cpp_strbuf *tbuf, bool wide) 1073169695Skan{ 1074169695Skan if (wide) 1075169695Skan { 1076169695Skan /* We have to render this into the target byte order, which may not 1077169695Skan be our byte order. */ 1078169695Skan bool bigend = CPP_OPTION (pfile, bytes_big_endian); 1079169695Skan size_t width = CPP_OPTION (pfile, wchar_precision); 1080169695Skan size_t cwidth = CPP_OPTION (pfile, char_precision); 1081169695Skan size_t cmask = width_to_mask (cwidth); 1082169695Skan size_t nbwc = width / cwidth; 1083169695Skan size_t i; 1084169695Skan size_t off = tbuf->len; 1085169695Skan cppchar_t c; 1086169695Skan 1087169695Skan if (tbuf->len + nbwc > tbuf->asize) 1088169695Skan { 1089169695Skan tbuf->asize += OUTBUF_BLOCK_SIZE; 1090169695Skan tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize); 1091169695Skan } 1092169695Skan 1093169695Skan for (i = 0; i < nbwc; i++) 1094169695Skan { 1095169695Skan c = n & cmask; 1096169695Skan n >>= cwidth; 1097169695Skan tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c; 1098169695Skan } 1099169695Skan tbuf->len += nbwc; 1100169695Skan } 1101169695Skan else 1102169695Skan { 1103169695Skan /* Note: this code does not handle the case where the target 1104169695Skan and host have a different number of bits in a byte. */ 1105169695Skan if (tbuf->len + 1 > tbuf->asize) 1106169695Skan { 1107169695Skan tbuf->asize += OUTBUF_BLOCK_SIZE; 1108169695Skan tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize); 1109169695Skan } 1110169695Skan tbuf->text[tbuf->len++] = n; 1111169695Skan } 1112169695Skan} 1113169695Skan 1114169695Skan/* Convert a hexadecimal escape, pointed to by FROM, to the execution 1115169695Skan character set and write it into the string buffer TBUF. Returns an 1116169695Skan advanced pointer, and issues diagnostics as necessary. 
1117169695Skan No character set translation occurs; this routine always produces the 1118169695Skan execution-set character with numeric value equal to the given hex 1119169695Skan number. You can, e.g. generate surrogate pairs this way. */ 1120169695Skanstatic const uchar * 1121169695Skanconvert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit, 1122169695Skan struct _cpp_strbuf *tbuf, bool wide) 1123169695Skan{ 1124169695Skan cppchar_t c, n = 0, overflow = 0; 1125169695Skan int digits_found = 0; 1126169695Skan size_t width = (wide ? CPP_OPTION (pfile, wchar_precision) 1127169695Skan : CPP_OPTION (pfile, char_precision)); 1128169695Skan size_t mask = width_to_mask (width); 1129169695Skan 1130169695Skan if (CPP_WTRADITIONAL (pfile)) 1131169695Skan cpp_error (pfile, CPP_DL_WARNING, 1132169695Skan "the meaning of '\\x' is different in traditional C"); 1133169695Skan 1134169695Skan from++; /* Skip 'x'. */ 1135169695Skan while (from < limit) 1136169695Skan { 1137169695Skan c = *from; 1138169695Skan if (! hex_p (c)) 1139169695Skan break; 1140169695Skan from++; 1141169695Skan overflow |= n ^ (n << 4 >> 4); 1142169695Skan n = (n << 4) + hex_value (c); 1143169695Skan digits_found = 1; 1144169695Skan } 1145169695Skan 1146169695Skan if (!digits_found) 1147169695Skan { 1148169695Skan cpp_error (pfile, CPP_DL_ERROR, 1149169695Skan "\\x used with no following hex digits"); 1150169695Skan return from; 1151169695Skan } 1152169695Skan 1153169695Skan if (overflow | (n != (n & mask))) 1154169695Skan { 1155169695Skan cpp_error (pfile, CPP_DL_PEDWARN, 1156169695Skan "hex escape sequence out of range"); 1157169695Skan n &= mask; 1158169695Skan } 1159169695Skan 1160169695Skan emit_numeric_escape (pfile, n, tbuf, wide); 1161169695Skan 1162169695Skan return from; 1163169695Skan} 1164169695Skan 1165169695Skan/* Convert an octal escape, pointed to by FROM, to the execution 1166169695Skan character set and write it into the string buffer TBUF. 
Returns an 1167169695Skan advanced pointer, and issues diagnostics as necessary. 1168169695Skan No character set translation occurs; this routine always produces the 1169169695Skan execution-set character with numeric value equal to the given octal 1170169695Skan number. */ 1171169695Skanstatic const uchar * 1172169695Skanconvert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit, 1173169695Skan struct _cpp_strbuf *tbuf, bool wide) 1174169695Skan{ 1175169695Skan size_t count = 0; 1176169695Skan cppchar_t c, n = 0; 1177169695Skan size_t width = (wide ? CPP_OPTION (pfile, wchar_precision) 1178169695Skan : CPP_OPTION (pfile, char_precision)); 1179169695Skan size_t mask = width_to_mask (width); 1180169695Skan bool overflow = false; 1181169695Skan 1182169695Skan while (from < limit && count++ < 3) 1183169695Skan { 1184169695Skan c = *from; 1185169695Skan if (c < '0' || c > '7') 1186169695Skan break; 1187169695Skan from++; 1188169695Skan overflow |= n ^ (n << 3 >> 3); 1189169695Skan n = (n << 3) + c - '0'; 1190169695Skan } 1191169695Skan 1192169695Skan if (n != (n & mask)) 1193169695Skan { 1194169695Skan cpp_error (pfile, CPP_DL_PEDWARN, 1195169695Skan "octal escape sequence out of range"); 1196169695Skan n &= mask; 1197169695Skan } 1198169695Skan 1199169695Skan emit_numeric_escape (pfile, n, tbuf, wide); 1200169695Skan 1201169695Skan return from; 1202169695Skan} 1203169695Skan 1204169695Skan/* Convert an escape sequence (pointed to by FROM) to its value on 1205169695Skan the target, and to the execution character set. Do not scan past 1206169695Skan LIMIT. Write the converted value into TBUF. Returns an advanced 1207169695Skan pointer. Handles all relevant diagnostics. */ 1208169695Skanstatic const uchar * 1209169695Skanconvert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit, 1210169695Skan struct _cpp_strbuf *tbuf, bool wide) 1211169695Skan{ 1212169695Skan /* Values of \a \b \e \f \n \r \t \v respectively. 
*/ 1213169695Skan#if HOST_CHARSET == HOST_CHARSET_ASCII 1214169695Skan static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 }; 1215169695Skan#elif HOST_CHARSET == HOST_CHARSET_EBCDIC 1216169695Skan static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 }; 1217169695Skan#else 1218169695Skan#error "unknown host character set" 1219169695Skan#endif 1220169695Skan 1221169695Skan uchar c; 1222169695Skan struct cset_converter cvt 1223169695Skan = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc; 1224169695Skan 1225169695Skan c = *from; 1226169695Skan switch (c) 1227169695Skan { 1228169695Skan /* UCNs, hex escapes, and octal escapes are processed separately. */ 1229169695Skan case 'u': case 'U': 1230169695Skan return convert_ucn (pfile, from, limit, tbuf, wide); 1231169695Skan 1232169695Skan case 'x': 1233169695Skan return convert_hex (pfile, from, limit, tbuf, wide); 1234169695Skan break; 1235169695Skan 1236169695Skan case '0': case '1': case '2': case '3': 1237169695Skan case '4': case '5': case '6': case '7': 1238169695Skan return convert_oct (pfile, from, limit, tbuf, wide); 1239169695Skan 1240169695Skan /* Various letter escapes. Get the appropriate host-charset 1241169695Skan value into C. */ 1242169695Skan case '\\': case '\'': case '"': case '?': break; 1243169695Skan 1244169695Skan case '(': case '{': case '[': case '%': 1245169695Skan /* '\(', etc, can be used at the beginning of a line in a long 1246169695Skan string split onto multiple lines with \-newline, to prevent 1247169695Skan Emacs or other text editors from getting confused. '\%' can 1248169695Skan be used to prevent SCCS from mangling printf format strings. 
*/ 1249169695Skan if (CPP_PEDANTIC (pfile)) 1250169695Skan goto unknown; 1251169695Skan break; 1252169695Skan 1253169695Skan case 'b': c = charconsts[1]; break; 1254169695Skan case 'f': c = charconsts[3]; break; 1255169695Skan case 'n': c = charconsts[4]; break; 1256169695Skan case 'r': c = charconsts[5]; break; 1257169695Skan case 't': c = charconsts[6]; break; 1258169695Skan case 'v': c = charconsts[7]; break; 1259169695Skan 1260169695Skan case 'a': 1261169695Skan if (CPP_WTRADITIONAL (pfile)) 1262169695Skan cpp_error (pfile, CPP_DL_WARNING, 1263169695Skan "the meaning of '\\a' is different in traditional C"); 1264169695Skan c = charconsts[0]; 1265169695Skan break; 1266169695Skan 1267169695Skan case 'e': case 'E': 1268169695Skan if (CPP_PEDANTIC (pfile)) 1269169695Skan cpp_error (pfile, CPP_DL_PEDWARN, 1270169695Skan "non-ISO-standard escape sequence, '\\%c'", (int) c); 1271169695Skan c = charconsts[2]; 1272169695Skan break; 1273169695Skan 1274169695Skan default: 1275169695Skan unknown: 1276169695Skan if (ISGRAPH (c)) 1277169695Skan cpp_error (pfile, CPP_DL_PEDWARN, 1278169695Skan "unknown escape sequence '\\%c'", (int) c); 1279169695Skan else 1280169695Skan { 1281169695Skan /* diagnostic.c does not support "%03o". When it does, this 1282169695Skan code can use %03o directly in the diagnostic again. */ 1283169695Skan char buf[32]; 1284169695Skan sprintf(buf, "%03o", (int) c); 1285169695Skan cpp_error (pfile, CPP_DL_PEDWARN, 1286169695Skan "unknown escape sequence: '\\%s'", buf); 1287169695Skan } 1288169695Skan } 1289169695Skan 1290169695Skan /* Now convert what we have to the execution character set. */ 1291169695Skan if (!APPLY_CONVERSION (cvt, &c, 1, tbuf)) 1292169695Skan cpp_errno (pfile, CPP_DL_ERROR, 1293169695Skan "converting escape sequence to execution character set"); 1294169695Skan 1295169695Skan return from + 1; 1296169695Skan} 1297169695Skan 1298169695Skan/* FROM is an array of cpp_string structures of length COUNT. 
These 1299169695Skan are to be converted from the source to the execution character set, 1300169695Skan escape sequences translated, and finally all are to be 1301169695Skan concatenated. WIDE indicates whether or not to produce a wide 1302169695Skan string. The result is written into TO. Returns true for success, 1303169695Skan false for failure. */ 1304169695Skanbool 1305169695Skancpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count, 1306169695Skan cpp_string *to, bool wide) 1307169695Skan{ 1308169695Skan struct _cpp_strbuf tbuf; 1309169695Skan const uchar *p, *base, *limit; 1310169695Skan size_t i; 1311169695Skan struct cset_converter cvt 1312169695Skan = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc; 1313169695Skan 1314169695Skan tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len); 1315169695Skan tbuf.text = XNEWVEC (uchar, tbuf.asize); 1316169695Skan tbuf.len = 0; 1317169695Skan 1318169695Skan for (i = 0; i < count; i++) 1319169695Skan { 1320169695Skan p = from[i].text; 1321169695Skan if (*p == 'L') p++; 1322169695Skan p++; /* Skip leading quote. */ 1323169695Skan limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */ 1324169695Skan 1325169695Skan for (;;) 1326169695Skan { 1327169695Skan base = p; 1328169695Skan while (p < limit && *p != '\\') 1329169695Skan p++; 1330169695Skan if (p > base) 1331169695Skan { 1332169695Skan /* We have a run of normal characters; these can be fed 1333169695Skan directly to convert_cset. */ 1334169695Skan if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf)) 1335169695Skan goto fail; 1336169695Skan } 1337169695Skan if (p == limit) 1338169695Skan break; 1339169695Skan 1340169695Skan p = convert_escape (pfile, p + 1, limit, &tbuf, wide); 1341169695Skan } 1342169695Skan } 1343169695Skan /* NUL-terminate the 'to' buffer and translate it to a cpp_string 1344169695Skan structure. 
*/ 1345169695Skan emit_numeric_escape (pfile, 0, &tbuf, wide); 1346169695Skan tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len); 1347169695Skan to->text = tbuf.text; 1348169695Skan to->len = tbuf.len; 1349169695Skan return true; 1350169695Skan 1351169695Skan fail: 1352169695Skan cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set"); 1353169695Skan free (tbuf.text); 1354169695Skan return false; 1355169695Skan} 1356169695Skan 1357169695Skan/* Subroutine of do_line and do_linemarker. Convert escape sequences 1358169695Skan in a string, but do not perform character set conversion. */ 1359169695Skanbool 1360169695Skancpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from, 1361169695Skan size_t count, cpp_string *to, bool wide) 1362169695Skan{ 1363169695Skan struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc; 1364169695Skan bool retval; 1365169695Skan 1366169695Skan pfile->narrow_cset_desc.func = convert_no_conversion; 1367169695Skan pfile->narrow_cset_desc.cd = (iconv_t) -1; 1368169695Skan 1369169695Skan retval = cpp_interpret_string (pfile, from, count, to, wide); 1370169695Skan 1371169695Skan pfile->narrow_cset_desc = save_narrow_cset_desc; 1372169695Skan return retval; 1373169695Skan} 1374169695Skan 1375169695Skan 1376169695Skan/* Subroutine of cpp_interpret_charconst which performs the conversion 1377169695Skan to a number, for narrow strings. STR is the string structure returned 1378169695Skan by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for 1379169695Skan cpp_interpret_charconst. 
*/ 1380169695Skanstatic cppchar_t 1381169695Skannarrow_str_to_charconst (cpp_reader *pfile, cpp_string str, 1382169695Skan unsigned int *pchars_seen, int *unsignedp) 1383169695Skan{ 1384169695Skan size_t width = CPP_OPTION (pfile, char_precision); 1385169695Skan size_t max_chars = CPP_OPTION (pfile, int_precision) / width; 1386169695Skan size_t mask = width_to_mask (width); 1387169695Skan size_t i; 1388169695Skan cppchar_t result, c; 1389169695Skan bool unsigned_p; 1390169695Skan 1391169695Skan /* The value of a multi-character character constant, or a 1392169695Skan single-character character constant whose representation in the 1393169695Skan execution character set is more than one byte long, is 1394169695Skan implementation defined. This implementation defines it to be the 1395169695Skan number formed by interpreting the byte sequence in memory as a 1396169695Skan big-endian binary number. If overflow occurs, the high bytes are 1397169695Skan lost, and a warning is issued. 1398169695Skan 1399169695Skan We don't want to process the NUL terminator handed back by 1400169695Skan cpp_interpret_string. */ 1401169695Skan result = 0; 1402169695Skan for (i = 0; i < str.len - 1; i++) 1403169695Skan { 1404169695Skan c = str.text[i] & mask; 1405169695Skan if (width < BITS_PER_CPPCHAR_T) 1406169695Skan result = (result << width) | c; 1407169695Skan else 1408169695Skan result = c; 1409169695Skan } 1410169695Skan 1411169695Skan if (i > max_chars) 1412169695Skan { 1413169695Skan i = max_chars; 1414169695Skan cpp_error (pfile, CPP_DL_WARNING, 1415169695Skan "character constant too long for its type"); 1416169695Skan } 1417169695Skan else if (i > 1 && CPP_OPTION (pfile, warn_multichar)) 1418169695Skan cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant"); 1419169695Skan 1420169695Skan /* Multichar constants are of type int and therefore signed. 
*/ 1421169695Skan if (i > 1) 1422169695Skan unsigned_p = 0; 1423169695Skan else 1424169695Skan unsigned_p = CPP_OPTION (pfile, unsigned_char); 1425169695Skan 1426169695Skan /* Truncate the constant to its natural width, and simultaneously 1427169695Skan sign- or zero-extend to the full width of cppchar_t. 1428169695Skan For single-character constants, the value is WIDTH bits wide. 1429169695Skan For multi-character constants, the value is INT_PRECISION bits wide. */ 1430169695Skan if (i > 1) 1431169695Skan width = CPP_OPTION (pfile, int_precision); 1432169695Skan if (width < BITS_PER_CPPCHAR_T) 1433169695Skan { 1434169695Skan mask = ((cppchar_t) 1 << width) - 1; 1435169695Skan if (unsigned_p || !(result & (1 << (width - 1)))) 1436169695Skan result &= mask; 1437169695Skan else 1438169695Skan result |= ~mask; 1439169695Skan } 1440169695Skan *pchars_seen = i; 1441169695Skan *unsignedp = unsigned_p; 1442169695Skan return result; 1443169695Skan} 1444169695Skan 1445169695Skan/* Subroutine of cpp_interpret_charconst which performs the conversion 1446169695Skan to a number, for wide strings. STR is the string structure returned 1447169695Skan by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for 1448169695Skan cpp_interpret_charconst. 
*/ 1449169695Skanstatic cppchar_t 1450169695Skanwide_str_to_charconst (cpp_reader *pfile, cpp_string str, 1451169695Skan unsigned int *pchars_seen, int *unsignedp) 1452169695Skan{ 1453169695Skan bool bigend = CPP_OPTION (pfile, bytes_big_endian); 1454169695Skan size_t width = CPP_OPTION (pfile, wchar_precision); 1455169695Skan size_t cwidth = CPP_OPTION (pfile, char_precision); 1456169695Skan size_t mask = width_to_mask (width); 1457169695Skan size_t cmask = width_to_mask (cwidth); 1458169695Skan size_t nbwc = width / cwidth; 1459169695Skan size_t off, i; 1460169695Skan cppchar_t result = 0, c; 1461169695Skan 1462169695Skan /* This is finicky because the string is in the target's byte order, 1463169695Skan which may not be our byte order. Only the last character, ignoring 1464169695Skan the NUL terminator, is relevant. */ 1465169695Skan off = str.len - (nbwc * 2); 1466169695Skan result = 0; 1467169695Skan for (i = 0; i < nbwc; i++) 1468169695Skan { 1469169695Skan c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1]; 1470169695Skan result = (result << cwidth) | (c & cmask); 1471169695Skan } 1472169695Skan 1473169695Skan /* Wide character constants have type wchar_t, and a single 1474169695Skan character exactly fills a wchar_t, so a multi-character wide 1475169695Skan character constant is guaranteed to overflow. */ 1476169695Skan if (off > 0) 1477169695Skan cpp_error (pfile, CPP_DL_WARNING, 1478169695Skan "character constant too long for its type"); 1479169695Skan 1480169695Skan /* Truncate the constant to its natural width, and simultaneously 1481169695Skan sign- or zero-extend to the full width of cppchar_t. 
*/ 1482169695Skan if (width < BITS_PER_CPPCHAR_T) 1483169695Skan { 1484169695Skan if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1)))) 1485169695Skan result &= mask; 1486169695Skan else 1487169695Skan result |= ~mask; 1488169695Skan } 1489169695Skan 1490169695Skan *unsignedp = CPP_OPTION (pfile, unsigned_wchar); 1491169695Skan *pchars_seen = 1; 1492169695Skan return result; 1493169695Skan} 1494169695Skan 1495169695Skan/* Interpret a (possibly wide) character constant in TOKEN. 1496169695Skan PCHARS_SEEN points to a variable that is filled in with the number 1497169695Skan of characters seen, and UNSIGNEDP to a variable that indicates 1498169695Skan whether the result has signed type. */ 1499169695Skancppchar_t 1500169695Skancpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token, 1501169695Skan unsigned int *pchars_seen, int *unsignedp) 1502169695Skan{ 1503169695Skan cpp_string str = { 0, 0 }; 1504169695Skan bool wide = (token->type == CPP_WCHAR); 1505169695Skan cppchar_t result; 1506169695Skan 1507169695Skan /* an empty constant will appear as L'' or '' */ 1508169695Skan if (token->val.str.len == (size_t) (2 + wide)) 1509169695Skan { 1510169695Skan cpp_error (pfile, CPP_DL_ERROR, "empty character constant"); 1511169695Skan return 0; 1512169695Skan } 1513169695Skan else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide)) 1514169695Skan return 0; 1515169695Skan 1516169695Skan if (wide) 1517169695Skan result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp); 1518169695Skan else 1519169695Skan result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp); 1520169695Skan 1521169695Skan if (str.text != token->val.str.text) 1522169695Skan free ((void *)str.text); 1523169695Skan 1524169695Skan return result; 1525169695Skan} 1526169695Skan 1527169695Skan/* Convert an identifier denoted by ID and LEN, which might contain 1528169695Skan UCN escapes, to the source character set, either UTF-8 or 1529169695Skan 
UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */ 1530169695Skancpp_hashnode * 1531169695Skan_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) 1532169695Skan{ 1533169695Skan /* It turns out that a UCN escape always turns into fewer characters 1534169695Skan than the escape itself, so we can allocate a temporary in advance. */ 1535169695Skan uchar * buf = (uchar *) alloca (len + 1); 1536169695Skan uchar * bufp = buf; 1537169695Skan size_t idp; 1538169695Skan 1539169695Skan for (idp = 0; idp < len; idp++) 1540169695Skan if (id[idp] != '\\') 1541169695Skan *bufp++ = id[idp]; 1542169695Skan else 1543169695Skan { 1544169695Skan unsigned length = id[idp+1] == 'u' ? 4 : 8; 1545169695Skan cppchar_t value = 0; 1546169695Skan size_t bufleft = len - (bufp - buf); 1547169695Skan int rval; 1548169695Skan 1549169695Skan idp += 2; 1550169695Skan while (length && idp < len && ISXDIGIT (id[idp])) 1551169695Skan { 1552169695Skan value = (value << 4) + hex_value (id[idp]); 1553169695Skan idp++; 1554169695Skan length--; 1555169695Skan } 1556169695Skan idp--; 1557169695Skan 1558169695Skan /* Special case for EBCDIC: if the identifier contains 1559169695Skan a '$' specified using a UCN, translate it to EBCDIC. 
*/ 1560169695Skan if (value == 0x24) 1561169695Skan { 1562169695Skan *bufp++ = '$'; 1563169695Skan continue; 1564169695Skan } 1565169695Skan 1566169695Skan rval = one_cppchar_to_utf8 (value, &bufp, &bufleft); 1567169695Skan if (rval) 1568169695Skan { 1569169695Skan errno = rval; 1570169695Skan cpp_errno (pfile, CPP_DL_ERROR, 1571169695Skan "converting UCN to source character set"); 1572169695Skan break; 1573169695Skan } 1574169695Skan } 1575169695Skan 1576169695Skan return CPP_HASHNODE (ht_lookup (pfile->hash_table, 1577169695Skan buf, bufp - buf, HT_ALLOC)); 1578169695Skan} 1579169695Skan 1580169695Skan/* Convert an input buffer (containing the complete contents of one 1581169695Skan source file) from INPUT_CHARSET to the source character set. INPUT 1582169695Skan points to the input buffer, SIZE is its allocated size, and LEN is 1583169695Skan the length of the meaningful data within the buffer. The 1584169695Skan translated buffer is returned, and *ST_SIZE is set to the length of 1585169695Skan the meaningful data within the translated buffer. 1586169695Skan 1587169695Skan INPUT is expected to have been allocated with xmalloc. This function 1588169695Skan will either return INPUT, or free it and return a pointer to another 1589169695Skan xmalloc-allocated block of memory. */ 1590169695Skanuchar * 1591169695Skan_cpp_convert_input (cpp_reader *pfile, const char *input_charset, 1592169695Skan uchar *input, size_t size, size_t len, off_t *st_size) 1593169695Skan{ 1594169695Skan struct cset_converter input_cset; 1595169695Skan struct _cpp_strbuf to; 1596169695Skan 1597169695Skan input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset); 1598169695Skan if (input_cset.func == convert_no_conversion) 1599169695Skan { 1600260574Spfg /* APPLE LOCAL begin UTF-8 BOM 5774975 */ 1601260574Spfg /* Eat the UTF-8 BOM. 
*/ 1602260574Spfg if (len >= 3 1603260574Spfg && input[0] == 0xef 1604260574Spfg && input[1] == 0xbb 1605260574Spfg && input[2] == 0xbf) 1606260574Spfg { 1607260574Spfg memmove (&input[0], &input[3], size-3); 1608260574Spfg len -= 3; 1609260574Spfg } 1610260574Spfg /* APPLE LOCAL end UTF-8 BOM 5774975 */ 1611169695Skan to.text = input; 1612169695Skan to.asize = size; 1613169695Skan to.len = len; 1614169695Skan } 1615169695Skan else 1616169695Skan { 1617169695Skan to.asize = MAX (65536, len); 1618169695Skan to.text = XNEWVEC (uchar, to.asize); 1619169695Skan to.len = 0; 1620169695Skan 1621169695Skan if (!APPLY_CONVERSION (input_cset, input, len, &to)) 1622169695Skan cpp_error (pfile, CPP_DL_ERROR, 1623169695Skan "failure to convert %s to %s", 1624169695Skan CPP_OPTION (pfile, input_charset), SOURCE_CHARSET); 1625169695Skan 1626169695Skan free (input); 1627169695Skan } 1628169695Skan 1629169695Skan /* Clean up the mess. */ 1630169695Skan if (input_cset.func == convert_using_iconv) 1631169695Skan iconv_close (input_cset.cd); 1632169695Skan 1633169695Skan /* Resize buffer if we allocated substantially too much, or if we 1634169695Skan haven't enough space for the \n-terminator. */ 1635169695Skan if (to.len + 4096 < to.asize || to.len >= to.asize) 1636169695Skan to.text = XRESIZEVEC (uchar, to.text, to.len + 1); 1637169695Skan 1638169695Skan /* If the file is using old-school Mac line endings (\r only), 1639169695Skan terminate with another \r, not an \n, so that we do not mistake 1640169695Skan the \r\n sequence for a single DOS line ending and erroneously 1641169695Skan issue the "No newline at end of file" diagnostic. 
*/ 1642259891Spfg /* APPLE LOCAL don't access to.text[-1] radar 6121572 */ 1643259272Spfg if (to.len > 0 && to.text[to.len - 1] == '\r') 1644169695Skan to.text[to.len] = '\r'; 1645169695Skan else 1646169695Skan to.text[to.len] = '\n'; 1647169695Skan 1648169695Skan *st_size = to.len; 1649169695Skan return to.text; 1650169695Skan} 1651169695Skan 1652169695Skan/* Decide on the default encoding to assume for input files. */ 1653169695Skanconst char * 1654169695Skan_cpp_default_encoding (void) 1655169695Skan{ 1656169695Skan const char *current_encoding = NULL; 1657169695Skan 1658169695Skan /* We disable this because the default codeset is 7-bit ASCII on 1659169695Skan most platforms, and this causes conversion failures on every 1660169695Skan file in GCC that happens to have one of the upper 128 characters 1661169695Skan in it -- most likely, as part of the name of a contributor. 1662169695Skan We should definitely recognize in-band markers of file encoding, 1663169695Skan like: 1664169695Skan - the appropriate Unicode byte-order mark (FE FF) to recognize 1665169695Skan UTF16 and UCS4 (in both big-endian and little-endian flavors) 1666169695Skan and UTF8 1667169695Skan - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to 1668169695Skan distinguish ASCII and EBCDIC. 1669169695Skan - now we can parse something like "#pragma GCC encoding <xyz> 1670169695Skan on the first line, or even Emacs/VIM's mode line tags (there's 1671169695Skan a problem here in that VIM uses the last line, and Emacs has 1672169695Skan its more elaborate "local variables" convention). 1673169695Skan - investigate whether Java has another common convention, which 1674169695Skan would be friendly to support. 
1675169695Skan (Zack Weinberg and Paolo Bonzini, May 20th 2004) */ 1676169695Skan#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0 1677169695Skan setlocale (LC_CTYPE, ""); 1678169695Skan current_encoding = nl_langinfo (CODESET); 1679169695Skan#endif 1680169695Skan if (current_encoding == NULL || *current_encoding == '\0') 1681169695Skan current_encoding = SOURCE_CHARSET; 1682169695Skan 1683169695Skan return current_encoding; 1684169695Skan} 1685