charset.c revision 259891
1219820Sjeff/* CPP Library - charsets 2219820Sjeff Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004 3272407Shselasky Free Software Foundation, Inc. 4219820Sjeff 5219820Sjeff Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges. 6219820Sjeff 7219820SjeffThis program is free software; you can redistribute it and/or modify it 8219820Sjeffunder the terms of the GNU General Public License as published by the 9219820SjeffFree Software Foundation; either version 2, or (at your option) any 10219820Sjefflater version. 11219820Sjeff 12219820SjeffThis program is distributed in the hope that it will be useful, 13219820Sjeffbut WITHOUT ANY WARRANTY; without even the implied warranty of 14219820SjeffMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15219820SjeffGNU General Public License for more details. 16219820Sjeff 17219820SjeffYou should have received a copy of the GNU General Public License 18219820Sjeffalong with this program; if not, write to the Free Software 19219820SjeffFoundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 20219820Sjeff 21219820Sjeff#include "config.h" 22219820Sjeff#include "system.h" 23219820Sjeff#include "cpplib.h" 24219820Sjeff#include "internal.h" 25219820Sjeff 26219820Sjeff/* Character set handling for C-family languages. 27219820Sjeff 28219820Sjeff Terminological note: In what follows, "charset" or "character set" 29219820Sjeff will be taken to mean both an abstract set of characters and an 30219820Sjeff encoding for that set. 31219820Sjeff 32219820Sjeff The C99 standard discusses two character sets: source and execution. 33219820Sjeff The source character set is used for internal processing in translation 34219820Sjeff phases 1 through 4; the execution character set is used thereafter. 35272407Shselasky Both are required by 5.2.1.2p1 to be multibyte encodings, not wide 36219820Sjeff character encodings (see 3.7.2, 3.7.3 for the standardese meanings 37219820Sjeff of these terms). 
Furthermore, the "basic character set" (listed in 38272407Shselasky 5.2.1p3) is to be encoded in each with values one byte wide, and is 39279731Shselasky to appear in the initial shift state. 40219820Sjeff 41219820Sjeff It is not explicitly mentioned, but there is also a "wide execution 42219820Sjeff character set" used to encode wide character constants and wide 43255932Salfred string literals; this is supposed to be the result of applying the 44219820Sjeff standard library function mbstowcs() to an equivalent narrow string 45255932Salfred (6.4.5p5). However, the behavior of hexadecimal and octal 46255932Salfred \-escapes is at odds with this; they are supposed to be translated 47255932Salfred directly to wchar_t values (6.4.4.4p5,6). 48255932Salfred 49255932Salfred The source character set is not necessarily the character set used 50255932Salfred to encode physical source files on disk; translation phase 1 converts 51255932Salfred from whatever that encoding is to the source character set. 52255932Salfred 53255932Salfred The presence of universal character names in C99 (6.4.3 et seq.) 54255932Salfred forces the source character set to be isomorphic to ISO 10646, 55255932Salfred that is, Unicode. There is no such constraint on the execution 56255932Salfred character set; note also that the conversion from source to 57255932Salfred execution character set does not occur for identifiers (5.1.1.2p1#5). 58255932Salfred 59255932Salfred For convenience of implementation, the source character set's 60255932Salfred encoding of the basic character set should be identical to the 61255932Salfred execution character set OF THE HOST SYSTEM's encoding of the basic 62255932Salfred character set, and it should not be a state-dependent encoding. 
63255932Salfred 64255932Salfred cpplib uses UTF-8 or UTF-EBCDIC for the source character set, 65255932Salfred depending on whether the host is based on ASCII or EBCDIC (see 66255932Salfred respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode 67255932Salfred Technical Report #16). With limited exceptions, it relies on the 68255932Salfred system library's iconv() primitive to do charset conversion 69255932Salfred (specified in SUSv2). */ 70255932Salfred 71255932Salfred#if !HAVE_ICONV 72255932Salfred/* Make certain that the uses of iconv(), iconv_open(), iconv_close() 73255932Salfred below, which are guarded only by if statements with compile-time 74255932Salfred constant conditions, do not cause link errors. */ 75255932Salfred#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1) 76255932Salfred#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1) 77255932Salfred#define iconv_close(x) (void)0 78255932Salfred#define ICONV_CONST 79255932Salfred#endif 80255932Salfred 81255932Salfred#if HOST_CHARSET == HOST_CHARSET_ASCII 82255932Salfred#define SOURCE_CHARSET "UTF-8" 83255932Salfred#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e 84255932Salfred#elif HOST_CHARSET == HOST_CHARSET_EBCDIC 85219820Sjeff#define SOURCE_CHARSET "UTF-EBCDIC" 86255932Salfred#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF 87219820Sjeff#else 88219820Sjeff#error "Unrecognized basic host character set" 89255932Salfred#endif 90255932Salfred 91219820Sjeff#ifndef EILSEQ 92219820Sjeff#define EILSEQ EINVAL 93255932Salfred#endif 94219820Sjeff 95219820Sjeff/* This structure is used for a resizable string buffer throughout. */ 96255932Salfred/* Don't call it strbuf, as that conflicts with unistd.h on systems 97255932Salfred such as DYNIX/ptx where unistd.h includes stropts.h. 
*/ 98219820Sjeffstruct _cpp_strbuf 99255932Salfred{ 100255932Salfred uchar *text; 101255932Salfred size_t asize; 102255932Salfred size_t len; 103255932Salfred}; 104255932Salfred 105255932Salfred/* This is enough to hold any string that fits on a single 80-column 106255932Salfred line, even if iconv quadruples its size (e.g. conversion from 107255932Salfred ASCII to UTF-32) rounded up to a power of two. */ 108255932Salfred#define OUTBUF_BLOCK_SIZE 256 109255932Salfred 110219820Sjeff/* Conversions between UTF-8 and UTF-16/32 are implemented by custom 111219820Sjeff logic. This is because a depressing number of systems lack iconv, 112219820Sjeff or have have iconv libraries that do not do these conversions, so 113255932Salfred we need a fallback implementation for them. To ensure the fallback 114255932Salfred doesn't break due to neglect, it is used on all systems. 115255932Salfred 116219820Sjeff UTF-32 encoding is nice and simple: a four-byte binary number, 117219820Sjeff constrained to the range 00000000-7FFFFFFF to avoid questions of 118219820Sjeff signedness. We do have to cope with big- and little-endian 119219820Sjeff variants. 120219820Sjeff 121219820Sjeff UTF-16 encoding uses two-byte binary numbers, again in big- and 122219820Sjeff little-endian variants, for all values in the 00000000-0000FFFF 123255932Salfred range. Values in the 00010000-0010FFFF range are encoded as pairs 124255932Salfred of two-byte numbers, called "surrogate pairs": given a number S in 125255932Salfred this range, it is mapped to a pair (H, L) as follows: 126255932Salfred 127272407Shselasky H = (S - 0x10000) / 0x400 + 0xD800 128255932Salfred L = (S - 0x10000) % 0x400 + 0xDC00 129255932Salfred 130272407Shselasky Two-byte values in the D800...DFFF range are ill-formed except as a 131272407Shselasky component of a surrogate pair. Even if the encoding within a 132272407Shselasky two-byte value is little-endian, the H member of the surrogate pair 133272407Shselasky comes first. 

   There is no way to encode values in the 00110000-7FFFFFFF range,
   which is not currently a problem as there are no assigned code
   points in that range; however, the author expects that it will
   eventually become necessary to abandon UTF-16 due to this
   limitation.  Note also that, because of these pairs, UTF-16 does
   not meet the requirements of the C standard for a wide character
   encoding (see 3.7.3 and 6.4.4.4p11).

   UTF-8 encoding looks like this:

   value range         encoded as
   00000000-0000007F   0xxxxxxx
   00000080-000007FF   110xxxxx 10xxxxxx
   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
   never occur.  Note also that any value that can be encoded by a
   given row of the table can also be encoded by all successive rows,
   but this is not done; only the shortest possible encoding for any
   given value is valid.  For instance, the character 07C0 could be
   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
   FC 80 80 80 9F 80.  Only the first is valid.

   An implementation note: the transformation from UTF-16 to UTF-8, or
   vice versa, is easiest done by using UTF-32 as an intermediary.
*/ 164255932Salfred 165255932Salfred/* Internal primitives which go from an UTF-8 byte stream to native-endian 166255932Salfred UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal 167255932Salfred operation in several places below. */ 168255932Salfredstatic inline int 169255932Salfredone_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp, 170255932Salfred cppchar_t *cp) 171255932Salfred{ 172255932Salfred static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 }; 173255932Salfred static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 174255932Salfred 175255932Salfred cppchar_t c; 176255932Salfred const uchar *inbuf = *inbufp; 177255932Salfred size_t nbytes, i; 178255932Salfred 179255932Salfred if (*inbytesleftp < 1) 180255932Salfred return EINVAL; 181255932Salfred 182255932Salfred c = *inbuf; 183255932Salfred if (c < 0x80) 184255932Salfred { 185255932Salfred *cp = c; 186255932Salfred *inbytesleftp -= 1; 187255932Salfred *inbufp += 1; 188255932Salfred return 0; 189255932Salfred } 190255932Salfred 191255932Salfred /* The number of leading 1-bits in the first byte indicates how many 192255932Salfred bytes follow. */ 193255932Salfred for (nbytes = 2; nbytes < 7; nbytes++) 194255932Salfred if ((c & ~masks[nbytes-1]) == patns[nbytes-1]) 195255932Salfred goto found; 196255932Salfred return EILSEQ; 197255932Salfred found: 198255932Salfred 199255932Salfred if (*inbytesleftp < nbytes) 200255932Salfred return EINVAL; 201255932Salfred 202255932Salfred c = (c & masks[nbytes-1]); 203255932Salfred inbuf++; 204255932Salfred for (i = 1; i < nbytes; i++) 205255932Salfred { 206255932Salfred cppchar_t n = *inbuf++; 207255932Salfred if ((n & 0xC0) != 0x80) 208255932Salfred return EILSEQ; 209255932Salfred c = ((c << 6) + (n & 0x3F)); 210255932Salfred } 211255932Salfred 212255932Salfred /* Make sure the shortest possible encoding was used. 
*/ 213255932Salfred if (c <= 0x7F && nbytes > 1) return EILSEQ; 214255932Salfred if (c <= 0x7FF && nbytes > 2) return EILSEQ; 215255932Salfred if (c <= 0xFFFF && nbytes > 3) return EILSEQ; 216255932Salfred if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ; 217255932Salfred if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ; 218255932Salfred 219255932Salfred /* Make sure the character is valid. */ 220255932Salfred if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ; 221255932Salfred 222255932Salfred *cp = c; 223255932Salfred *inbufp = inbuf; 224255932Salfred *inbytesleftp -= nbytes; 225255932Salfred return 0; 226255932Salfred} 227255932Salfred 228255932Salfredstatic inline int 229255932Salfredone_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp) 230255932Salfred{ 231255932Salfred static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 232255932Salfred static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE }; 233255932Salfred size_t nbytes; 234255932Salfred uchar buf[6], *p = &buf[6]; 235255932Salfred uchar *outbuf = *outbufp; 236255932Salfred 237255932Salfred nbytes = 1; 238255932Salfred if (c < 0x80) 239255932Salfred *--p = c; 240255932Salfred else 241255932Salfred { 242255932Salfred do 243255932Salfred { 244255932Salfred *--p = ((c & 0x3F) | 0x80); 245255932Salfred c >>= 6; 246255932Salfred nbytes++; 247255932Salfred } 248272407Shselasky while (c >= 0x3F || (c & limits[nbytes-1])); 249272407Shselasky *--p = (c | masks[nbytes-1]); 250272407Shselasky } 251255932Salfred 252255932Salfred if (*outbytesleftp < nbytes) 253255932Salfred return E2BIG; 254255932Salfred 255255932Salfred while (p < &buf[6]) 256255932Salfred *outbuf++ = *p++; 257255932Salfred *outbytesleftp -= nbytes; 258255932Salfred *outbufp = outbuf; 259255932Salfred return 0; 260255932Salfred} 261255932Salfred 262255932Salfred/* The following four functions transform one character between the two 263255932Salfred encodings named in the function name. 
All have the signature 264255932Salfred int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp, 265255932Salfred uchar **outbufp, size_t *outbytesleftp) 266255932Salfred 267255932Salfred BIGEND must have the value 0 or 1, coerced to (iconv_t); it is 268255932Salfred interpreted as a boolean indicating whether big-endian or 269255932Salfred little-endian encoding is to be used for the member of the pair 270255932Salfred that is not UTF-8. 271255932Salfred 272272407Shselasky INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they 273255932Salfred do for iconv. 274255932Salfred 275255932Salfred The return value is either 0 for success, or an errno value for 276255932Salfred failure, which may be E2BIG (need more space), EILSEQ (ill-formed 277255932Salfred input sequence), ir EINVAL (incomplete input sequence). */ 278255932Salfred 279255932Salfredstatic inline int 280255932Salfredone_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp, 281255932Salfred uchar **outbufp, size_t *outbytesleftp) 282255932Salfred{ 283255932Salfred uchar *outbuf; 284255932Salfred cppchar_t s = 0; 285255932Salfred int rval; 286255932Salfred 287255932Salfred /* Check for space first, since we know exactly how much we need. */ 288255932Salfred if (*outbytesleftp < 4) 289255932Salfred return E2BIG; 290255932Salfred 291255932Salfred rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s); 292255932Salfred if (rval) 293255932Salfred return rval; 294255932Salfred 295255932Salfred outbuf = *outbufp; 296272407Shselasky outbuf[bigend ? 3 : 0] = (s & 0x000000FF); 297272407Shselasky outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8; 298272407Shselasky outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16; 299255932Salfred outbuf[bigend ? 
0 : 3] = (s & 0xFF000000) >> 24; 300255932Salfred 301255932Salfred *outbufp += 4; 302255932Salfred *outbytesleftp -= 4; 303255932Salfred return 0; 304255932Salfred} 305255932Salfred 306255932Salfredstatic inline int 307255932Salfredone_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp, 308255932Salfred uchar **outbufp, size_t *outbytesleftp) 309255932Salfred{ 310255932Salfred cppchar_t s; 311255932Salfred int rval; 312255932Salfred const uchar *inbuf; 313255932Salfred 314255932Salfred if (*inbytesleftp < 4) 315255932Salfred return EINVAL; 316255932Salfred 317255932Salfred inbuf = *inbufp; 318255932Salfred 319255932Salfred s = inbuf[bigend ? 0 : 3] << 24; 320255932Salfred s += inbuf[bigend ? 1 : 2] << 16; 321255932Salfred s += inbuf[bigend ? 2 : 1] << 8; 322255932Salfred s += inbuf[bigend ? 3 : 0]; 323255932Salfred 324255932Salfred if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF)) 325255932Salfred return EILSEQ; 326272407Shselasky 327272407Shselasky rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp); 328272407Shselasky if (rval) 329272407Shselasky return rval; 330272407Shselasky 331255932Salfred *inbufp += 4; 332272407Shselasky *inbytesleftp -= 4; 333255932Salfred return 0; 334255932Salfred} 335255932Salfred 336255932Salfredstatic inline int 337272407Shselaskyone_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp, 338255932Salfred uchar **outbufp, size_t *outbytesleftp) 339255932Salfred{ 340255932Salfred int rval; 341272407Shselasky cppchar_t s = 0; 342272407Shselasky const uchar *save_inbuf = *inbufp; 343272407Shselasky size_t save_inbytesleft = *inbytesleftp; 344255932Salfred uchar *outbuf = *outbufp; 345255932Salfred 346255932Salfred rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s); 347255932Salfred if (rval) 348255932Salfred return rval; 349255932Salfred 350255932Salfred if (s > 0x0010FFFF) 351255932Salfred { 352255932Salfred *inbufp = save_inbuf; 353272407Shselasky *inbytesleftp = save_inbytesleft; 
354272407Shselasky return EILSEQ; 355272407Shselasky } 356272407Shselasky 357272407Shselasky if (s < 0xFFFF) 358272407Shselasky { 359255932Salfred if (*outbytesleftp < 2) 360255932Salfred { 361255932Salfred *inbufp = save_inbuf; 362255932Salfred *inbytesleftp = save_inbytesleft; 363255932Salfred return E2BIG; 364272407Shselasky } 365272407Shselasky outbuf[bigend ? 1 : 0] = (s & 0x00FF); 366272407Shselasky outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8; 367272407Shselasky 368272407Shselasky *outbufp += 2; 369272407Shselasky *outbytesleftp -= 2; 370272407Shselasky return 0; 371272407Shselasky } 372272407Shselasky else 373272407Shselasky { 374272407Shselasky cppchar_t hi, lo; 375272407Shselasky 376272407Shselasky if (*outbytesleftp < 4) 377272407Shselasky { 378272407Shselasky *inbufp = save_inbuf; 379272407Shselasky *inbytesleftp = save_inbytesleft; 380272407Shselasky return E2BIG; 381272407Shselasky } 382272407Shselasky 383272407Shselasky hi = (s - 0x10000) / 0x400 + 0xD800; 384272407Shselasky lo = (s - 0x10000) % 0x400 + 0xDC00; 385272407Shselasky 386272407Shselasky /* Even if we are little-endian, put the high surrogate first. 387272407Shselasky ??? Matches practice? */ 388272407Shselasky outbuf[bigend ? 1 : 0] = (hi & 0x00FF); 389255932Salfred outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8; 390255932Salfred outbuf[bigend ? 3 : 2] = (lo & 0x00FF); 391255932Salfred outbuf[bigend ? 
2 : 3] = (lo & 0xFF00) >> 8; 392255932Salfred 393255932Salfred *outbufp += 4; 394255932Salfred *outbytesleftp -= 4; 395255932Salfred return 0; 396255932Salfred } 397255932Salfred} 398255932Salfred 399255932Salfredstatic inline int 400255932Salfredone_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp, 401255932Salfred uchar **outbufp, size_t *outbytesleftp) 402255932Salfred{ 403255932Salfred cppchar_t s; 404255932Salfred const uchar *inbuf = *inbufp; 405255932Salfred int rval; 406255932Salfred 407255932Salfred if (*inbytesleftp < 2) 408255932Salfred return EINVAL; 409255932Salfred s = inbuf[bigend ? 0 : 1] << 8; 410255932Salfred s += inbuf[bigend ? 1 : 0]; 411255932Salfred 412255932Salfred /* Low surrogate without immediately preceding high surrogate is invalid. */ 413255932Salfred if (s >= 0xDC00 && s <= 0xDFFF) 414255932Salfred return EILSEQ; 415255932Salfred /* High surrogate must have a following low surrogate. */ 416255932Salfred else if (s >= 0xD800 && s <= 0xDBFF) 417255932Salfred { 418255932Salfred cppchar_t hi = s, lo; 419255932Salfred if (*inbytesleftp < 4) 420255932Salfred return EINVAL; 421255932Salfred 422255932Salfred lo = inbuf[bigend ? 2 : 3] << 8; 423255932Salfred lo += inbuf[bigend ? 3 : 2]; 424255932Salfred 425255932Salfred if (lo < 0xDC00 || lo > 0xDFFF) 426255932Salfred return EILSEQ; 427255932Salfred 428255932Salfred s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000; 429255932Salfred } 430255932Salfred 431255932Salfred rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp); 432255932Salfred if (rval) 433255932Salfred return rval; 434255932Salfred 435255932Salfred /* Success - update the input pointers (one_cppchar_to_utf8 has done 436272407Shselasky the output pointers for us). 
*/ 437272407Shselasky if (s <= 0xFFFF) 438272407Shselasky { 439255932Salfred *inbufp += 2; 440255932Salfred *inbytesleftp -= 2; 441255932Salfred } 442255932Salfred else 443255932Salfred { 444255932Salfred *inbufp += 4; 445255932Salfred *inbytesleftp -= 4; 446255932Salfred } 447255932Salfred return 0; 448255932Salfred} 449255932Salfred 450255932Salfred/* Helper routine for the next few functions. The 'const' on 451255932Salfred one_conversion means that we promise not to modify what function is 452255932Salfred pointed to, which lets the inliner see through it. */ 453255932Salfred 454255932Salfredstatic inline bool 455255932Salfredconversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *, 456255932Salfred uchar **, size_t *), 457255932Salfred iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to) 458255932Salfred{ 459255932Salfred const uchar *inbuf; 460255932Salfred uchar *outbuf; 461255932Salfred size_t inbytesleft, outbytesleft; 462272407Shselasky int rval; 463272407Shselasky 464272407Shselasky inbuf = from; 465272407Shselasky inbytesleft = flen; 466272407Shselasky outbuf = to->text + to->len; 467272407Shselasky outbytesleft = to->asize - to->len; 468272407Shselasky 469272407Shselasky for (;;) 470255932Salfred { 471272407Shselasky do 472272407Shselasky rval = one_conversion (cd, &inbuf, &inbytesleft, 473272407Shselasky &outbuf, &outbytesleft); 474272407Shselasky while (inbytesleft && !rval); 475272407Shselasky 476272407Shselasky if (__builtin_expect (inbytesleft == 0, 1)) 477272407Shselasky { 478272407Shselasky to->len = to->asize - outbytesleft; 479272407Shselasky return true; 480272407Shselasky } 481272407Shselasky if (rval != E2BIG) 482272407Shselasky { 483272407Shselasky errno = rval; 484272407Shselasky return false; 485272407Shselasky } 486272407Shselasky 487272407Shselasky outbytesleft += OUTBUF_BLOCK_SIZE; 488272407Shselasky to->asize += OUTBUF_BLOCK_SIZE; 489272407Shselasky to->text = XRESIZEVEC (uchar, to->text, 
to->asize); 490272407Shselasky outbuf = to->text + to->asize - outbytesleft; 491255932Salfred } 492255932Salfred} 493255932Salfred 494272407Shselasky 495272407Shselasky/* These functions convert entire strings between character sets. 496272407Shselasky They all have the signature 497272407Shselasky 498272407Shselasky bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to); 499255932Salfred 500255932Salfred The input string FROM is converted as specified by the function 501255932Salfred name plus the iconv descriptor CD (which may be fake), and the 502255932Salfred result appended to TO. On any error, false is returned, otherwise true. */ 503255932Salfred 504255932Salfred/* These four use the custom conversion code above. */ 505255932Salfredstatic bool 506255932Salfredconvert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen, 507255932Salfred struct _cpp_strbuf *to) 508255932Salfred{ 509255932Salfred return conversion_loop (one_utf8_to_utf16, cd, from, flen, to); 510255932Salfred} 511255932Salfred 512255932Salfredstatic bool 513255932Salfredconvert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen, 514255932Salfred struct _cpp_strbuf *to) 515255932Salfred{ 516255932Salfred return conversion_loop (one_utf8_to_utf32, cd, from, flen, to); 517255932Salfred} 518255932Salfred 519255932Salfredstatic bool 520255932Salfredconvert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen, 521255932Salfred struct _cpp_strbuf *to) 522255932Salfred{ 523255932Salfred return conversion_loop (one_utf16_to_utf8, cd, from, flen, to); 524255932Salfred} 525255932Salfred 526255932Salfredstatic bool 527255932Salfredconvert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen, 528255932Salfred struct _cpp_strbuf *to) 529255932Salfred{ 530255932Salfred return conversion_loop (one_utf32_to_utf8, cd, from, flen, to); 531255932Salfred} 532255932Salfred 533255932Salfred/* Identity conversion, used when we have no alternative. 
*/ 534255932Salfredstatic bool 535255932Salfredconvert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED, 536255932Salfred const uchar *from, size_t flen, struct _cpp_strbuf *to) 537255932Salfred{ 538255932Salfred if (to->len + flen > to->asize) 539255932Salfred { 540255932Salfred to->asize = to->len + flen; 541255932Salfred to->text = XRESIZEVEC (uchar, to->text, to->asize); 542255932Salfred } 543255932Salfred memcpy (to->text + to->len, from, flen); 544272407Shselasky to->len += flen; 545255932Salfred return true; 546255932Salfred} 547255932Salfred 548255932Salfred/* And this one uses the system iconv primitive. It's a little 549255932Salfred different, since iconv's interface is a little different. */ 550255932Salfred#if HAVE_ICONV 551255932Salfredstatic bool 552255932Salfredconvert_using_iconv (iconv_t cd, const uchar *from, size_t flen, 553272407Shselasky struct _cpp_strbuf *to) 554272407Shselasky{ 555272407Shselasky ICONV_CONST char *inbuf; 556255932Salfred char *outbuf; 557255932Salfred size_t inbytesleft, outbytesleft; 558255932Salfred 559255932Salfred /* Reset conversion descriptor and check that it is valid. 
*/ 560255932Salfred if (iconv (cd, 0, 0, 0, 0) == (size_t)-1) 561255932Salfred return false; 562255932Salfred 563255932Salfred inbuf = (ICONV_CONST char *)from; 564255932Salfred inbytesleft = flen; 565255932Salfred outbuf = (char *)to->text + to->len; 566255932Salfred outbytesleft = to->asize - to->len; 567255932Salfred 568255932Salfred for (;;) 569255932Salfred { 570255932Salfred iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); 571255932Salfred if (__builtin_expect (inbytesleft == 0, 1)) 572255932Salfred { 573255932Salfred to->len = to->asize - outbytesleft; 574255932Salfred return true; 575255932Salfred } 576255932Salfred if (errno != E2BIG) 577255932Salfred return false; 578255932Salfred 579255932Salfred outbytesleft += OUTBUF_BLOCK_SIZE; 580255932Salfred to->asize += OUTBUF_BLOCK_SIZE; 581255932Salfred to->text = XRESIZEVEC (uchar, to->text, to->asize); 582255932Salfred outbuf = (char *)to->text + to->asize - outbytesleft; 583255932Salfred } 584255932Salfred} 585255932Salfred#else 586255932Salfred#define convert_using_iconv 0 /* prevent undefined symbol error below */ 587255932Salfred#endif 588272407Shselasky 589272407Shselasky/* Arrange for the above custom conversion logic to be used automatically 590272407Shselasky when conversion between a suitable pair of character sets is requested. 
*/ 591272407Shselasky 592272407Shselasky#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \ 593272407Shselasky CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO) 594272407Shselasky 595255932Salfredstruct conversion 596255932Salfred{ 597255932Salfred const char *pair; 598272407Shselasky convert_f func; 599272407Shselasky iconv_t fake_cd; 600272407Shselasky}; 601272407Shselaskystatic const struct conversion conversion_tab[] = { 602272407Shselasky { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 }, 603272407Shselasky { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 }, 604272407Shselasky { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 }, 605272407Shselasky { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 }, 606272407Shselasky { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 }, 607272407Shselasky { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 }, 608272407Shselasky { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 }, 609272407Shselasky { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 }, 610272407Shselasky}; 611272407Shselasky 612272407Shselasky/* Subroutine of cpp_init_iconv: initialize and return a 613272407Shselasky cset_converter structure for conversion from FROM to TO. If 614272407Shselasky iconv_open() fails, issue an error and return an identity 615255932Salfred converter. Silently return an identity converter if FROM and TO 616272407Shselasky are identical. 
*/ 617272407Shselaskystatic struct cset_converter 618272407Shselaskyinit_iconv_desc (cpp_reader *pfile, const char *to, const char *from) 619272407Shselasky{ 620272407Shselasky struct cset_converter ret; 621255932Salfred char *pair; 622272407Shselasky size_t i; 623272407Shselasky 624272407Shselasky if (!strcasecmp (to, from)) 625272407Shselasky { 626255932Salfred ret.func = convert_no_conversion; 627272407Shselasky ret.cd = (iconv_t) -1; 628272407Shselasky return ret; 629272407Shselasky } 630272407Shselasky 631272407Shselasky pair = (char *) alloca(strlen(to) + strlen(from) + 2); 632272407Shselasky 633272407Shselasky strcpy(pair, from); 634272407Shselasky strcat(pair, "/"); 635272407Shselasky strcat(pair, to); 636272407Shselasky for (i = 0; i < ARRAY_SIZE (conversion_tab); i++) 637272407Shselasky if (!strcasecmp (pair, conversion_tab[i].pair)) 638272407Shselasky { 639255932Salfred ret.func = conversion_tab[i].func; 640255932Salfred ret.cd = conversion_tab[i].fake_cd; 641255932Salfred return ret; 642255932Salfred } 643255932Salfred 644255932Salfred /* No custom converter - try iconv. 
*/ 645255932Salfred if (HAVE_ICONV) 646255932Salfred { 647255932Salfred ret.func = convert_using_iconv; 648255932Salfred ret.cd = iconv_open (to, from); 649255932Salfred 650255932Salfred if (ret.cd == (iconv_t) -1) 651255932Salfred { 652255932Salfred if (errno == EINVAL) 653255932Salfred cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */ 654255932Salfred "conversion from %s to %s not supported by iconv", 655219820Sjeff from, to); 656219820Sjeff else 657219820Sjeff cpp_errno (pfile, CPP_DL_ERROR, "iconv_open"); 658219820Sjeff 659219820Sjeff ret.func = convert_no_conversion; 660219820Sjeff } 661219820Sjeff } 662219820Sjeff else 663219820Sjeff { 664219820Sjeff cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ 665219820Sjeff "no iconv implementation, cannot convert from %s to %s", 666219820Sjeff from, to); 667219820Sjeff ret.func = convert_no_conversion; 668219820Sjeff ret.cd = (iconv_t) -1; 669255932Salfred } 670255932Salfred return ret; 671255932Salfred} 672255932Salfred 673219820Sjeff/* If charset conversion is requested, initialize iconv(3) descriptors 674219820Sjeff for conversion from the source character set to the execution 675219820Sjeff character sets. If iconv is not present in the C library, and 676219820Sjeff conversion is requested, issue an error. */ 677219820Sjeff 678255932Salfredvoid 679255932Salfredcpp_init_iconv (cpp_reader *pfile) 680255932Salfred{ 681219820Sjeff const char *ncset = CPP_OPTION (pfile, narrow_charset); 682219820Sjeff const char *wcset = CPP_OPTION (pfile, wide_charset); 683219820Sjeff const char *default_wcset; 684219820Sjeff 685219820Sjeff bool be = CPP_OPTION (pfile, bytes_big_endian); 686219820Sjeff 687219820Sjeff if (CPP_OPTION (pfile, wchar_precision) >= 32) 688219820Sjeff default_wcset = be ? "UTF-32BE" : "UTF-32LE"; 689255932Salfred else if (CPP_OPTION (pfile, wchar_precision) >= 16) 690219820Sjeff default_wcset = be ? 
"UTF-16BE" : "UTF-16LE"; 691219820Sjeff else 692219820Sjeff /* This effectively means that wide strings are not supported, 693219820Sjeff so don't do any conversion at all. */ 694279731Shselasky default_wcset = SOURCE_CHARSET; 695279731Shselasky 696279731Shselasky if (!ncset) 697279731Shselasky ncset = SOURCE_CHARSET; 698219820Sjeff if (!wcset) 699255932Salfred wcset = default_wcset; 700219820Sjeff 701219820Sjeff pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET); 702219820Sjeff pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET); 703255932Salfred} 704219820Sjeff 705219820Sjeff/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */ 706219820Sjeffvoid 707255932Salfred_cpp_destroy_iconv (cpp_reader *pfile) 708255932Salfred{ 709219820Sjeff if (HAVE_ICONV) 710219820Sjeff { 711219820Sjeff if (pfile->narrow_cset_desc.func == convert_using_iconv) 712219820Sjeff iconv_close (pfile->narrow_cset_desc.cd); 713219820Sjeff if (pfile->wide_cset_desc.func == convert_using_iconv) 714219820Sjeff iconv_close (pfile->wide_cset_desc.cd); 715219820Sjeff } 716255932Salfred} 717219820Sjeff 718219820Sjeff/* Utility routine for use by a full compiler. C is a character taken 719219820Sjeff from the *basic* source character set, encoded in the host's 720219820Sjeff execution encoding. Convert it to (the target's) execution 721219820Sjeff encoding, and return that value. 722219820Sjeff 723219820Sjeff Issues an internal error if C's representation in the narrow 724219820Sjeff execution character set fails to be a single-byte value (C99 725219820Sjeff 5.2.1p3: "The representation of each member of the source and 726219820Sjeff execution character sets shall fit in a byte.") May also issue an 727272407Shselasky internal error if C fails to be a member of the basic source 728272407Shselasky character set (testing this exactly is too hard, especially when 729272407Shselasky the host character set is EBCDIC). 
*/ 730272407Shselaskycppchar_t 731272407Shselaskycpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c) 732272407Shselasky{ 733272407Shselasky uchar sbuf[1]; 734272407Shselasky struct _cpp_strbuf tbuf; 735272407Shselasky 736272407Shselasky /* This test is merely an approximation, but it suffices to catch 737272407Shselasky the most important thing, which is that we don't get handed a 738272407Shselasky character outside the unibyte range of the host character set. */ 739272407Shselasky if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR) 740272407Shselasky { 741272407Shselasky cpp_error (pfile, CPP_DL_ICE, 742272407Shselasky "character 0x%lx is not in the basic source character set\n", 743272407Shselasky (unsigned long)c); 744272407Shselasky return 0; 745272407Shselasky } 746255932Salfred 747255932Salfred /* Being a character in the unibyte range of the host character set, 748219820Sjeff we can safely splat it into a one-byte buffer and trust that that 749272407Shselasky is a well-formed string. */ 750255932Salfred sbuf[0] = c; 751272407Shselasky 752272407Shselasky /* This should never need to reallocate, but just in case... 
*/ 753272407Shselasky tbuf.asize = 1; 754255932Salfred tbuf.text = XNEWVEC (uchar, tbuf.asize); 755272407Shselasky tbuf.len = 0; 756272407Shselasky 757272407Shselasky if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf)) 758255932Salfred { 759255932Salfred cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set"); 760255932Salfred return 0; 761255932Salfred } 762255932Salfred if (tbuf.len != 1) 763255932Salfred { 764255932Salfred cpp_error (pfile, CPP_DL_ICE, 765255932Salfred "character 0x%lx is not unibyte in execution character set", 766255932Salfred (unsigned long)c); 767255932Salfred return 0; 768255932Salfred } 769255932Salfred c = tbuf.text[0]; 770255932Salfred free(tbuf.text); 771272407Shselasky return c; 772272407Shselasky} 773272407Shselasky 774272407Shselasky 775272407Shselasky 776272407Shselasky/* Utility routine that computes a mask of the form 0000...111... with 777272407Shselasky WIDTH 1-bits. */ 778272407Shselaskystatic inline size_t 779272407Shselaskywidth_to_mask (size_t width) 780272407Shselasky{ 781272407Shselasky width = MIN (width, BITS_PER_CPPCHAR_T); 782272407Shselasky if (width >= CHAR_BIT * sizeof (size_t)) 783272407Shselasky return ~(size_t) 0; 784272407Shselasky else 785272407Shselasky return ((size_t) 1 << width) - 1; 786272407Shselasky} 787272407Shselasky 788272407Shselasky/* A large table of unicode character information. */ 789272407Shselaskyenum { 790272407Shselasky /* Valid in a C99 identifier? */ 791272407Shselasky C99 = 1, 792272407Shselasky /* Valid in a C99 identifier, but not as the first character? */ 793272407Shselasky DIG = 2, 794272407Shselasky /* Valid in a C++ identifier? */ 795272407Shselasky CXX = 4, 796272407Shselasky /* NFC representation is not valid in an identifier? */ 797272407Shselasky CID = 8, 798272407Shselasky /* Might be valid NFC form? */ 799272407Shselasky NFC = 16, 800272407Shselasky /* Might be valid NFKC form? 
*/ 801272407Shselasky NKC = 32, 802272407Shselasky /* Certain preceding characters might make it not valid NFC/NKFC form? */ 803272407Shselasky CTX = 64 804272407Shselasky}; 805272407Shselasky 806272407Shselaskystatic const struct { 807272407Shselasky /* Bitmap of flags above. */ 808255932Salfred unsigned char flags; 809255932Salfred /* Combining class of the character. */ 810255932Salfred unsigned char combine; 811272407Shselasky /* Last character in the range described by this entry. */ 812255932Salfred unsigned short end; 813272407Shselasky} ucnranges[] = { 814255932Salfred#include "ucnid.h" 815272407Shselasky}; 816255932Salfred 817255932Salfred/* Returns 1 if C is valid in an identifier, 2 if C is valid except at 818255932Salfred the start of an identifier, and 0 if C is not valid in an 819255932Salfred identifier. We assume C has already gone through the checks of 820255932Salfred _cpp_valid_ucn. Also update NST for C if returning nonzero. The 821255932Salfred algorithm is a simple binary search on the table defined in 822255932Salfred ucnid.h. */ 823255932Salfred 824255932Salfredstatic int 825255932Salfreducn_valid_in_identifier (cpp_reader *pfile, cppchar_t c, 826255932Salfred struct normalize_state *nst) 827255932Salfred{ 828255932Salfred int mn, mx, md; 829272407Shselasky 830272407Shselasky if (c > 0xFFFF) 831255932Salfred return 0; 832255932Salfred 833255932Salfred mn = 0; 834272407Shselasky mx = ARRAY_SIZE (ucnranges) - 1; 835255932Salfred while (mx != mn) 836255932Salfred { 837255932Salfred md = (mn + mx) / 2; 838255932Salfred if (c <= ucnranges[md].end) 839255932Salfred mx = md; 840255932Salfred else 841255932Salfred mn = md + 1; 842255932Salfred } 843255932Salfred 844255932Salfred /* When -pedantic, we require the character to have been listed by 845255932Salfred the standard for the current language. Otherwise, we accept the 846255932Salfred union of the acceptable sets for C++98 and C99. */ 847255932Salfred if (! 
(ucnranges[mn].flags & (C99 | CXX))) 848255932Salfred return 0; 849255932Salfred 850255932Salfred if (CPP_PEDANTIC (pfile) 851255932Salfred && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99)) 852255932Salfred || (CPP_OPTION (pfile, cplusplus) 853255932Salfred && !(ucnranges[mn].flags & CXX)))) 854255932Salfred return 0; 855255932Salfred 856255932Salfred /* Update NST. */ 857255932Salfred if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class) 858255932Salfred nst->level = normalized_none; 859255932Salfred else if (ucnranges[mn].flags & CTX) 860255932Salfred { 861255932Salfred bool safe; 862255932Salfred cppchar_t p = nst->previous; 863255932Salfred 864255932Salfred /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam. */ 865255932Salfred if (c == 0x09BE) 866255932Salfred safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */ 867255932Salfred else if (c == 0x0B3E) 868255932Salfred safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */ 869255932Salfred else if (c == 0x0BBE) 870255932Salfred safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */ 871255932Salfred else if (c == 0x0CC2) 872255932Salfred safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */ 873255932Salfred else if (c == 0x0D3E) 874255932Salfred safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */ 875255932Salfred /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC, 876255932Salfred and are combined algorithmically from a sequence of the form 877255932Salfred 1100-1112 1161-1175 11A8-11C2 878255932Salfred (if the third is not present, it is treated as 11A7, which is not 879255932Salfred really a valid character). 880255932Salfred Unfortunately, C99 allows (only) the NFC form, but C++ allows 881255932Salfred only the combining characters. 
*/ 882255932Salfred else if (c >= 0x1161 && c <= 0x1175) 883255932Salfred safe = p < 0x1100 || p > 0x1112; 884255932Salfred else if (c >= 0x11A8 && c <= 0x11C2) 885255932Salfred safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0); 886255932Salfred else 887255932Salfred { 888255932Salfred /* Uh-oh, someone updated ucnid.h without updating this code. */ 889255932Salfred cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c); 890255932Salfred safe = true; 891255932Salfred } 892255932Salfred if (!safe && c < 0x1161) 893255932Salfred nst->level = normalized_none; 894255932Salfred else if (!safe) 895255932Salfred nst->level = MAX (nst->level, normalized_identifier_C); 896255932Salfred } 897255932Salfred else if (ucnranges[mn].flags & NKC) 898255932Salfred ; 899255932Salfred else if (ucnranges[mn].flags & NFC) 900255932Salfred nst->level = MAX (nst->level, normalized_C); 901255932Salfred else if (ucnranges[mn].flags & CID) 902255932Salfred nst->level = MAX (nst->level, normalized_identifier_C); 903255932Salfred else 904255932Salfred nst->level = normalized_none; 905255932Salfred nst->previous = c; 906255932Salfred nst->prev_class = ucnranges[mn].combine; 907255932Salfred 908255932Salfred /* In C99, UCN digits may not begin identifiers. */ 909255932Salfred if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG)) 910255932Salfred return 2; 911255932Salfred 912255932Salfred return 1; 913255932Salfred} 914255932Salfred 915255932Salfred/* [lex.charset]: The character designated by the universal character 916255932Salfred name \UNNNNNNNN is that character whose character short name in 917255932Salfred ISO/IEC 10646 is NNNNNNNN; the character designated by the 918279731Shselasky universal character name \uNNNN is that character whose character 919279731Shselasky short name in ISO/IEC 10646 is 0000NNNN. 
If the hexadecimal value 920255932Salfred for a universal character name is less than 0x20 or in the range 921279731Shselasky 0x7F-0x9F (inclusive), or if the universal character name 922279731Shselasky designates a character in the basic source character set, then the 923255932Salfred program is ill-formed. 924255932Salfred 925255932Salfred *PSTR must be preceded by "\u" or "\U"; it is assumed that the 926255932Salfred buffer end is delimited by a non-hex digit. Returns zero if the 927255932Salfred UCN has not been consumed. 928255932Salfred 929255932Salfred Otherwise the nonzero value of the UCN, whether valid or invalid, 930255932Salfred is returned. Diagnostics are emitted for invalid values. PSTR 931255932Salfred is updated to point one beyond the UCN, or to the syntactically 932255932Salfred invalid character. 933255932Salfred 934255932Salfred IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of 935255932Salfred an identifier, or 2 otherwise. */ 936255932Salfred 937255932Salfredcppchar_t 938255932Salfred_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, 939255932Salfred const uchar *limit, int identifier_pos, 940255932Salfred struct normalize_state *nst) 941255932Salfred{ 942255932Salfred cppchar_t result, c; 943255932Salfred unsigned int length; 944255932Salfred const uchar *str = *pstr; 945255932Salfred const uchar *base = str - 2; 946255932Salfred 947255932Salfred if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99)) 948255932Salfred cpp_error (pfile, CPP_DL_WARNING, 949255932Salfred "universal character names are only valid in C++ and C99"); 950255932Salfred else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0) 951255932Salfred cpp_error (pfile, CPP_DL_WARNING, 952255932Salfred "the meaning of '\\%c' is different in traditional C", 953255932Salfred (int) str[-1]); 954255932Salfred 955255932Salfred if (str[-1] == 'u') 956255932Salfred length = 4; 957255932Salfred else if (str[-1] == 'U') 958255932Salfred length = 8; 
959255932Salfred else 960255932Salfred { 961255932Salfred cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN"); 962255932Salfred length = 4; 963255932Salfred } 964255932Salfred 965255932Salfred result = 0; 966255932Salfred do 967255932Salfred { 968255932Salfred c = *str; 969255932Salfred if (!ISXDIGIT (c)) 970255932Salfred break; 971255932Salfred str++; 972255932Salfred result = (result << 4) + hex_value (c); 973255932Salfred } 974255932Salfred while (--length && str < limit); 975255932Salfred 976255932Salfred /* Partial UCNs are not valid in strings, but decompose into 977255932Salfred multiple tokens in identifiers, so we can't give a helpful 978255932Salfred error message in that case. */ 979255932Salfred if (length && identifier_pos) 980255932Salfred return 0; 981255932Salfred 982255932Salfred *pstr = str; 983255932Salfred if (length) 984255932Salfred { 985255932Salfred cpp_error (pfile, CPP_DL_ERROR, 986272407Shselasky "incomplete universal character name %.*s", 987255932Salfred (int) (str - base), base); 988255932Salfred result = 1; 989255932Salfred } 990255932Salfred /* The standard permits $, @ and ` to be specified as UCNs. We use 991255932Salfred hex escapes so that this also works with EBCDIC hosts. 
*/ 992255932Salfred else if ((result < 0xa0 993255932Salfred && (result != 0x24 && result != 0x40 && result != 0x60)) 994255932Salfred || (result & 0x80000000) 995255932Salfred || (result >= 0xD800 && result <= 0xDFFF)) 996255932Salfred { 997255932Salfred cpp_error (pfile, CPP_DL_ERROR, 998255932Salfred "%.*s is not a valid universal character", 999255932Salfred (int) (str - base), base); 1000255932Salfred result = 1; 1001255932Salfred } 1002255932Salfred else if (identifier_pos && result == 0x24 1003255932Salfred && CPP_OPTION (pfile, dollars_in_ident)) 1004255932Salfred { 1005255932Salfred if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping) 1006255932Salfred { 1007255932Salfred CPP_OPTION (pfile, warn_dollars) = 0; 1008255932Salfred cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number"); 1009255932Salfred } 1010255932Salfred NORMALIZE_STATE_UPDATE_IDNUM (nst); 1011219820Sjeff } 1012219820Sjeff else if (identifier_pos) 1013219820Sjeff { 1014219820Sjeff int validity = ucn_valid_in_identifier (pfile, result, nst); 1015219820Sjeff 1016219820Sjeff if (validity == 0) 1017219820Sjeff cpp_error (pfile, CPP_DL_ERROR, 1018219820Sjeff "universal character %.*s is not valid in an identifier", 1019255932Salfred (int) (str - base), base); 1020255932Salfred else if (validity == 2 && identifier_pos == 1) 1021219820Sjeff cpp_error (pfile, CPP_DL_ERROR, 1022219820Sjeff "universal character %.*s is not valid at the start of an identifier", 1023219820Sjeff (int) (str - base), base); 1024219820Sjeff } 1025219820Sjeff 1026219820Sjeff if (result == 0) 1027219820Sjeff result = 1; 1028255932Salfred 1029255932Salfred return result; 1030219820Sjeff} 1031219820Sjeff 1032219820Sjeff/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate 1033219820Sjeff it to the execution character set and write the result into TBUF. 1034255932Salfred An advanced pointer is returned. Issues all relevant diagnostics. 
*/ 1035255932Salfredstatic const uchar * 1036219820Sjeffconvert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit, 1037255932Salfred struct _cpp_strbuf *tbuf, bool wide) 1038219820Sjeff{ 1039219820Sjeff cppchar_t ucn; 1040219820Sjeff uchar buf[6]; 1041219820Sjeff uchar *bufp = buf; 1042219820Sjeff size_t bytesleft = 6; 1043219820Sjeff int rval; 1044219820Sjeff struct cset_converter cvt 1045219820Sjeff = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc; 1046219820Sjeff struct normalize_state nst = INITIAL_NORMALIZE_STATE; 1047219820Sjeff 1048219820Sjeff from++; /* Skip u/U. */ 1049255932Salfred ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst); 1050219820Sjeff 1051219820Sjeff rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft); 1052219820Sjeff if (rval) 1053219820Sjeff { 1054219820Sjeff errno = rval; 1055255932Salfred cpp_errno (pfile, CPP_DL_ERROR, 1056219820Sjeff "converting UCN to source character set"); 1057219820Sjeff } 1058219820Sjeff else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf)) 1059219820Sjeff cpp_errno (pfile, CPP_DL_ERROR, 1060219820Sjeff "converting UCN to execution character set"); 1061219820Sjeff 1062219820Sjeff return from; 1063219820Sjeff} 1064219820Sjeff 1065219820Sjeff/* Subroutine of convert_hex and convert_oct. N is the representation 1066219820Sjeff in the execution character set of a numeric escape; write it into the 1067219820Sjeff string buffer TBUF and update the end-of-string pointer therein. WIDE 1068219820Sjeff is true if it's a wide string that's being assembled in TBUF. This 1069219820Sjeff function issues no diagnostics and never fails. */ 1070219820Sjeffstatic void 1071255932Salfredemit_numeric_escape (cpp_reader *pfile, cppchar_t n, 1072219820Sjeff struct _cpp_strbuf *tbuf, bool wide) 1073255932Salfred{ 1074219820Sjeff if (wide) 1075219820Sjeff { 1076219820Sjeff /* We have to render this into the target byte order, which may not 1077272407Shselasky be our byte order. 
*/ 1078219820Sjeff bool bigend = CPP_OPTION (pfile, bytes_big_endian); 1079272407Shselasky size_t width = CPP_OPTION (pfile, wchar_precision); 1080219820Sjeff size_t cwidth = CPP_OPTION (pfile, char_precision); 1081255932Salfred size_t cmask = width_to_mask (cwidth); 1082219820Sjeff size_t nbwc = width / cwidth; 1083219820Sjeff size_t i; 1084219820Sjeff size_t off = tbuf->len; 1085219820Sjeff cppchar_t c; 1086219820Sjeff 1087255932Salfred if (tbuf->len + nbwc > tbuf->asize) 1088219820Sjeff { 1089219820Sjeff tbuf->asize += OUTBUF_BLOCK_SIZE; 1090219820Sjeff tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize); 1091272407Shselasky } 1092255932Salfred 1093255932Salfred for (i = 0; i < nbwc; i++) 1094255932Salfred { 1095255932Salfred c = n & cmask; 1096255932Salfred n >>= cwidth; 1097255932Salfred tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c; 1098255932Salfred } 1099255932Salfred tbuf->len += nbwc; 1100255932Salfred } 1101219820Sjeff else 1102219820Sjeff { 1103219820Sjeff /* Note: this code does not handle the case where the target 1104219820Sjeff and host have a different number of bits in a byte. */ 1105219820Sjeff if (tbuf->len + 1 > tbuf->asize) 1106219820Sjeff { 1107219820Sjeff tbuf->asize += OUTBUF_BLOCK_SIZE; 1108272407Shselasky tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize); 1109219820Sjeff } 1110219820Sjeff tbuf->text[tbuf->len++] = n; 1111219820Sjeff } 1112219820Sjeff} 1113219820Sjeff 1114219820Sjeff/* Convert a hexadecimal escape, pointed to by FROM, to the execution 1115219820Sjeff character set and write it into the string buffer TBUF. Returns an 1116255932Salfred advanced pointer, and issues diagnostics as necessary. 1117255932Salfred No character set translation occurs; this routine always produces the 1118219820Sjeff execution-set character with numeric value equal to the given hex 1119219820Sjeff number. You can, e.g. generate surrogate pairs this way. 
*/ 1120219820Sjeffstatic const uchar * 1121219820Sjeffconvert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit, 1122219820Sjeff struct _cpp_strbuf *tbuf, bool wide) 1123219820Sjeff{ 1124255932Salfred cppchar_t c, n = 0, overflow = 0; 1125219820Sjeff int digits_found = 0; 1126255932Salfred size_t width = (wide ? CPP_OPTION (pfile, wchar_precision) 1127255932Salfred : CPP_OPTION (pfile, char_precision)); 1128219820Sjeff size_t mask = width_to_mask (width); 1129219820Sjeff 1130219820Sjeff if (CPP_WTRADITIONAL (pfile)) 1131219820Sjeff cpp_error (pfile, CPP_DL_WARNING, 1132219820Sjeff "the meaning of '\\x' is different in traditional C"); 1133219820Sjeff 1134219820Sjeff from++; /* Skip 'x'. */ 1135219820Sjeff while (from < limit) 1136255932Salfred { 1137255932Salfred c = *from; 1138219820Sjeff if (! hex_p (c)) 1139219820Sjeff break; 1140219820Sjeff from++; 1141219820Sjeff overflow |= n ^ (n << 4 >> 4); 1142279731Shselasky n = (n << 4) + hex_value (c); 1143279731Shselasky digits_found = 1; 1144219820Sjeff } 1145219820Sjeff 1146219820Sjeff if (!digits_found) 1147219820Sjeff { 1148272407Shselasky cpp_error (pfile, CPP_DL_ERROR, 1149272407Shselasky "\\x used with no following hex digits"); 1150272407Shselasky return from; 1151272407Shselasky } 1152255932Salfred 1153272407Shselasky if (overflow | (n != (n & mask))) 1154272407Shselasky { 1155272407Shselasky cpp_error (pfile, CPP_DL_PEDWARN, 1156255932Salfred "hex escape sequence out of range"); 1157219820Sjeff n &= mask; 1158255932Salfred } 1159255932Salfred 1160219820Sjeff emit_numeric_escape (pfile, n, tbuf, wide); 1161255932Salfred 1162255932Salfred return from; 1163219820Sjeff} 1164219820Sjeff 1165219820Sjeff/* Convert an octal escape, pointed to by FROM, to the execution 1166219820Sjeff character set and write it into the string buffer TBUF. Returns an 1167219820Sjeff advanced pointer, and issues diagnostics as necessary. 
1168219820Sjeff No character set translation occurs; this routine always produces the 1169219820Sjeff execution-set character with numeric value equal to the given octal 1170255932Salfred number. */ 1171255932Salfredstatic const uchar * 1172255932Salfredconvert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit, 1173255932Salfred struct _cpp_strbuf *tbuf, bool wide) 1174219820Sjeff{ 1175255932Salfred size_t count = 0; 1176255932Salfred cppchar_t c, n = 0; 1177255932Salfred size_t width = (wide ? CPP_OPTION (pfile, wchar_precision) 1178255932Salfred : CPP_OPTION (pfile, char_precision)); 1179255932Salfred size_t mask = width_to_mask (width); 1180219820Sjeff bool overflow = false; 1181219820Sjeff 1182219820Sjeff while (from < limit && count++ < 3) 1183255932Salfred { 1184255932Salfred c = *from; 1185255932Salfred if (c < '0' || c > '7') 1186219820Sjeff break; 1187219820Sjeff from++; 1188219820Sjeff overflow |= n ^ (n << 3 >> 3); 1189219820Sjeff n = (n << 3) + c - '0'; 1190255932Salfred } 1191219820Sjeff 1192219820Sjeff if (n != (n & mask)) 1193219820Sjeff { 1194219820Sjeff cpp_error (pfile, CPP_DL_PEDWARN, 1195219820Sjeff "octal escape sequence out of range"); 1196255932Salfred n &= mask; 1197219820Sjeff } 1198219820Sjeff 1199219820Sjeff emit_numeric_escape (pfile, n, tbuf, wide); 1200219820Sjeff 1201219820Sjeff return from; 1202219820Sjeff} 1203219820Sjeff 1204219820Sjeff/* Convert an escape sequence (pointed to by FROM) to its value on 1205219820Sjeff the target, and to the execution character set. Do not scan past 1206272407Shselasky LIMIT. Write the converted value into TBUF. Returns an advanced 1207219820Sjeff pointer. Handles all relevant diagnostics. */ 1208219820Sjeffstatic const uchar * 1209219820Sjeffconvert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit, 1210219820Sjeff struct _cpp_strbuf *tbuf, bool wide) 1211255932Salfred{ 1212219820Sjeff /* Values of \a \b \e \f \n \r \t \v respectively. 
*/ 1213219820Sjeff#if HOST_CHARSET == HOST_CHARSET_ASCII 1214219820Sjeff static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 }; 1215219820Sjeff#elif HOST_CHARSET == HOST_CHARSET_EBCDIC 1216219820Sjeff static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 }; 1217255932Salfred#else 1218219820Sjeff#error "unknown host character set" 1219219820Sjeff#endif 1220219820Sjeff 1221219820Sjeff uchar c; 1222219820Sjeff struct cset_converter cvt 1223219820Sjeff = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc; 1224219820Sjeff 1225219820Sjeff c = *from; 1226272407Shselasky switch (c) 1227219820Sjeff { 1228219820Sjeff /* UCNs, hex escapes, and octal escapes are processed separately. */ 1229219820Sjeff case 'u': case 'U': 1230219820Sjeff return convert_ucn (pfile, from, limit, tbuf, wide); 1231219820Sjeff 1232219820Sjeff case 'x': 1233219820Sjeff return convert_hex (pfile, from, limit, tbuf, wide); 1234219820Sjeff break; 1235255932Salfred 1236255932Salfred case '0': case '1': case '2': case '3': 1237255932Salfred case '4': case '5': case '6': case '7': 1238255932Salfred return convert_oct (pfile, from, limit, tbuf, wide); 1239255932Salfred 1240255932Salfred /* Various letter escapes. Get the appropriate host-charset 1241255932Salfred value into C. */ 1242255932Salfred case '\\': case '\'': case '"': case '?': break; 1243255932Salfred 1244255932Salfred case '(': case '{': case '[': case '%': 1245255932Salfred /* '\(', etc, can be used at the beginning of a line in a long 1246255932Salfred string split onto multiple lines with \-newline, to prevent 1247255932Salfred Emacs or other text editors from getting confused. '\%' can 1248255932Salfred be used to prevent SCCS from mangling printf format strings. 
*/ 1249255932Salfred if (CPP_PEDANTIC (pfile)) 1250255932Salfred goto unknown; 1251255932Salfred break; 1252255932Salfred 1253255932Salfred case 'b': c = charconsts[1]; break; 1254255932Salfred case 'f': c = charconsts[3]; break; 1255272407Shselasky case 'n': c = charconsts[4]; break; 1256255932Salfred case 'r': c = charconsts[5]; break; 1257255932Salfred case 't': c = charconsts[6]; break; 1258255932Salfred case 'v': c = charconsts[7]; break; 1259255932Salfred 1260255932Salfred case 'a': 1261255932Salfred if (CPP_WTRADITIONAL (pfile)) 1262255932Salfred cpp_error (pfile, CPP_DL_WARNING, 1263255932Salfred "the meaning of '\\a' is different in traditional C"); 1264255932Salfred c = charconsts[0]; 1265272407Shselasky break; 1266272407Shselasky 1267272407Shselasky case 'e': case 'E': 1268272407Shselasky if (CPP_PEDANTIC (pfile)) 1269255932Salfred cpp_error (pfile, CPP_DL_PEDWARN, 1270255932Salfred "non-ISO-standard escape sequence, '\\%c'", (int) c); 1271255932Salfred c = charconsts[2]; 1272255932Salfred break; 1273255932Salfred 1274255932Salfred default: 1275255932Salfred unknown: 1276255932Salfred if (ISGRAPH (c)) 1277255932Salfred cpp_error (pfile, CPP_DL_PEDWARN, 1278255932Salfred "unknown escape sequence '\\%c'", (int) c); 1279255932Salfred else 1280255932Salfred { 1281255932Salfred /* diagnostic.c does not support "%03o". When it does, this 1282255932Salfred code can use %03o directly in the diagnostic again. */ 1283255932Salfred char buf[32]; 1284255932Salfred sprintf(buf, "%03o", (int) c); 1285255932Salfred cpp_error (pfile, CPP_DL_PEDWARN, 1286255932Salfred "unknown escape sequence: '\\%s'", buf); 1287255932Salfred } 1288255932Salfred } 1289255932Salfred 1290255932Salfred /* Now convert what we have to the execution character set. 
*/ 1291255932Salfred if (!APPLY_CONVERSION (cvt, &c, 1, tbuf)) 1292255932Salfred cpp_errno (pfile, CPP_DL_ERROR, 1293255932Salfred "converting escape sequence to execution character set"); 1294255932Salfred 1295255932Salfred return from + 1; 1296255932Salfred} 1297255932Salfred 1298255932Salfred/* FROM is an array of cpp_string structures of length COUNT. These 1299255932Salfred are to be converted from the source to the execution character set, 1300255932Salfred escape sequences translated, and finally all are to be 1301255932Salfred concatenated. WIDE indicates whether or not to produce a wide 1302255932Salfred string. The result is written into TO. Returns true for success, 1303272407Shselasky false for failure. */ 1304255932Salfredbool 1305272407Shselaskycpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count, 1306272407Shselasky cpp_string *to, bool wide) 1307272407Shselasky{ 1308272407Shselasky struct _cpp_strbuf tbuf; 1309272407Shselasky const uchar *p, *base, *limit; 1310272407Shselasky size_t i; 1311272407Shselasky struct cset_converter cvt 1312272407Shselasky = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc; 1313272407Shselasky 1314272407Shselasky tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len); 1315272407Shselasky tbuf.text = XNEWVEC (uchar, tbuf.asize); 1316272407Shselasky tbuf.len = 0; 1317272407Shselasky 1318272407Shselasky for (i = 0; i < count; i++) 1319272407Shselasky { 1320272407Shselasky p = from[i].text; 1321272407Shselasky if (*p == 'L') p++; 1322272407Shselasky p++; /* Skip leading quote. */ 1323272407Shselasky limit = from[i].text + from[i].len - 1; /* Skip trailing quote. 
*/ 1324272407Shselasky 1325272407Shselasky for (;;) 1326272407Shselasky { 1327272407Shselasky base = p; 1328272407Shselasky while (p < limit && *p != '\\') 1329272407Shselasky p++; 1330272407Shselasky if (p > base) 1331272407Shselasky { 1332255932Salfred /* We have a run of normal characters; these can be fed 1333255932Salfred directly to convert_cset. */ 1334255932Salfred if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf)) 1335255932Salfred goto fail; 1336255932Salfred } 1337255932Salfred if (p == limit) 1338255932Salfred break; 1339255932Salfred 1340255932Salfred p = convert_escape (pfile, p + 1, limit, &tbuf, wide); 1341272407Shselasky } 1342272407Shselasky } 1343272407Shselasky /* NUL-terminate the 'to' buffer and translate it to a cpp_string 1344255932Salfred structure. */ 1345255932Salfred emit_numeric_escape (pfile, 0, &tbuf, wide); 1346255932Salfred tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len); 1347255932Salfred to->text = tbuf.text; 1348255932Salfred to->len = tbuf.len; 1349255932Salfred return true; 1350255932Salfred 1351272407Shselasky fail: 1352255932Salfred cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set"); 1353255932Salfred free (tbuf.text); 1354255932Salfred return false; 1355255932Salfred} 1356255932Salfred 1357255932Salfred/* Subroutine of do_line and do_linemarker. Convert escape sequences 1358255932Salfred in a string, but do not perform character set conversion. 
*/ 1359255932Salfredbool 1360255932Salfredcpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from, 1361255932Salfred size_t count, cpp_string *to, bool wide) 1362255932Salfred{ 1363255932Salfred struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc; 1364255932Salfred bool retval; 1365255932Salfred 1366219820Sjeff pfile->narrow_cset_desc.func = convert_no_conversion; 1367219820Sjeff pfile->narrow_cset_desc.cd = (iconv_t) -1; 1368255932Salfred 1369255932Salfred retval = cpp_interpret_string (pfile, from, count, to, wide); 1370255932Salfred 1371255932Salfred pfile->narrow_cset_desc = save_narrow_cset_desc; 1372255932Salfred return retval; 1373255932Salfred} 1374255932Salfred 1375255932Salfred 1376255932Salfred/* Subroutine of cpp_interpret_charconst which performs the conversion 1377255932Salfred to a number, for narrow strings. STR is the string structure returned 1378255932Salfred by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for 1379255932Salfred cpp_interpret_charconst. */ 1380255932Salfredstatic cppchar_t 1381255932Salfrednarrow_str_to_charconst (cpp_reader *pfile, cpp_string str, 1382255932Salfred unsigned int *pchars_seen, int *unsignedp) 1383255932Salfred{ 1384255932Salfred size_t width = CPP_OPTION (pfile, char_precision); 1385255932Salfred size_t max_chars = CPP_OPTION (pfile, int_precision) / width; 1386255932Salfred size_t mask = width_to_mask (width); 1387255932Salfred size_t i; 1388255932Salfred cppchar_t result, c; 1389255932Salfred bool unsigned_p; 1390255932Salfred 1391255932Salfred /* The value of a multi-character character constant, or a 1392255932Salfred single-character character constant whose representation in the 1393255932Salfred execution character set is more than one byte long, is 1394255932Salfred implementation defined. This implementation defines it to be the 1395255932Salfred number formed by interpreting the byte sequence in memory as a 1396255932Salfred big-endian binary number. 
If overflow occurs, the high bytes are 1397255932Salfred lost, and a warning is issued. 1398255932Salfred 1399255932Salfred We don't want to process the NUL terminator handed back by 1400255932Salfred cpp_interpret_string. */ 1401255932Salfred result = 0; 1402255932Salfred for (i = 0; i < str.len - 1; i++) 1403255932Salfred { 1404255932Salfred c = str.text[i] & mask; 1405255932Salfred if (width < BITS_PER_CPPCHAR_T) 1406255932Salfred result = (result << width) | c; 1407255932Salfred else 1408255932Salfred result = c; 1409255932Salfred } 1410255932Salfred 1411255932Salfred if (i > max_chars) 1412255932Salfred { 1413255932Salfred i = max_chars; 1414255932Salfred cpp_error (pfile, CPP_DL_WARNING, 1415255932Salfred "character constant too long for its type"); 1416255932Salfred } 1417255932Salfred else if (i > 1 && CPP_OPTION (pfile, warn_multichar)) 1418255932Salfred cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant"); 1419255932Salfred 1420255932Salfred /* Multichar constants are of type int and therefore signed. */ 1421255932Salfred if (i > 1) 1422255932Salfred unsigned_p = 0; 1423255932Salfred else 1424255932Salfred unsigned_p = CPP_OPTION (pfile, unsigned_char); 1425255932Salfred 1426255932Salfred /* Truncate the constant to its natural width, and simultaneously 1427255932Salfred sign- or zero-extend to the full width of cppchar_t. 1428255932Salfred For single-character constants, the value is WIDTH bits wide. 1429255932Salfred For multi-character constants, the value is INT_PRECISION bits wide. 
*/ 1430255932Salfred if (i > 1) 1431255932Salfred width = CPP_OPTION (pfile, int_precision); 1432255932Salfred if (width < BITS_PER_CPPCHAR_T) 1433255932Salfred { 1434255932Salfred mask = ((cppchar_t) 1 << width) - 1; 1435255932Salfred if (unsigned_p || !(result & (1 << (width - 1)))) 1436255932Salfred result &= mask; 1437255932Salfred else 1438255932Salfred result |= ~mask; 1439255932Salfred } 1440255932Salfred *pchars_seen = i; 1441255932Salfred *unsignedp = unsigned_p; 1442255932Salfred return result; 1443255932Salfred} 1444255932Salfred 1445255932Salfred/* Subroutine of cpp_interpret_charconst which performs the conversion 1446255932Salfred to a number, for wide strings. STR is the string structure returned 1447255932Salfred by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for 1448255932Salfred cpp_interpret_charconst. */ 1449255932Salfredstatic cppchar_t 1450255932Salfredwide_str_to_charconst (cpp_reader *pfile, cpp_string str, 1451255932Salfred unsigned int *pchars_seen, int *unsignedp) 1452255932Salfred{ 1453255932Salfred bool bigend = CPP_OPTION (pfile, bytes_big_endian); 1454255932Salfred size_t width = CPP_OPTION (pfile, wchar_precision); 1455255932Salfred size_t cwidth = CPP_OPTION (pfile, char_precision); 1456255932Salfred size_t mask = width_to_mask (width); 1457255932Salfred size_t cmask = width_to_mask (cwidth); 1458255932Salfred size_t nbwc = width / cwidth; 1459255932Salfred size_t off, i; 1460255932Salfred cppchar_t result = 0, c; 1461255932Salfred 1462255932Salfred /* This is finicky because the string is in the target's byte order, 1463255932Salfred which may not be our byte order. Only the last character, ignoring 1464255932Salfred the NUL terminator, is relevant. */ 1465272407Shselasky off = str.len - (nbwc * 2); 1466272407Shselasky result = 0; 1467255932Salfred for (i = 0; i < nbwc; i++) 1468255932Salfred { 1469255932Salfred c = bigend ? 
str.text[off + i] : str.text[off + nbwc - i - 1]; 1470255932Salfred result = (result << cwidth) | (c & cmask); 1471255932Salfred } 1472255932Salfred 1473255932Salfred /* Wide character constants have type wchar_t, and a single 1474255932Salfred character exactly fills a wchar_t, so a multi-character wide 1475255932Salfred character constant is guaranteed to overflow. */ 1476255932Salfred if (off > 0) 1477255932Salfred cpp_error (pfile, CPP_DL_WARNING, 1478255932Salfred "character constant too long for its type"); 1479255932Salfred 1480255932Salfred /* Truncate the constant to its natural width, and simultaneously 1481255932Salfred sign- or zero-extend to the full width of cppchar_t. */ 1482255932Salfred if (width < BITS_PER_CPPCHAR_T) 1483255932Salfred { 1484255932Salfred if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1)))) 1485255932Salfred result &= mask; 1486255932Salfred else 1487255932Salfred result |= ~mask; 1488255932Salfred } 1489255932Salfred 1490255932Salfred *unsignedp = CPP_OPTION (pfile, unsigned_wchar); 1491255932Salfred *pchars_seen = 1; 1492255932Salfred return result; 1493255932Salfred} 1494255932Salfred 1495255932Salfred/* Interpret a (possibly wide) character constant in TOKEN. 1496255932Salfred PCHARS_SEEN points to a variable that is filled in with the number 1497255932Salfred of characters seen, and UNSIGNEDP to a variable that indicates 1498255932Salfred whether the result has signed type. 
*/ 1499255932Salfredcppchar_t 1500255932Salfredcpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token, 1501255932Salfred unsigned int *pchars_seen, int *unsignedp) 1502255932Salfred{ 1503255932Salfred cpp_string str = { 0, 0 }; 1504255932Salfred bool wide = (token->type == CPP_WCHAR); 1505255932Salfred cppchar_t result; 1506255932Salfred 1507255932Salfred /* an empty constant will appear as L'' or '' */ 1508255932Salfred if (token->val.str.len == (size_t) (2 + wide)) 1509255932Salfred { 1510255932Salfred cpp_error (pfile, CPP_DL_ERROR, "empty character constant"); 1511255932Salfred return 0; 1512255932Salfred } 1513255932Salfred else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide)) 1514255932Salfred return 0; 1515255932Salfred 1516255932Salfred if (wide) 1517255932Salfred result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp); 1518255932Salfred else 1519219820Sjeff result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp); 1520219820Sjeff 1521219820Sjeff if (str.text != token->val.str.text) 1522219820Sjeff free ((void *)str.text); 1523219820Sjeff 1524255932Salfred return result; 1525255932Salfred} 1526255932Salfred 1527255932Salfred/* Convert an identifier denoted by ID and LEN, which might contain 1528219820Sjeff UCN escapes, to the source character set, either UTF-8 or 1529219820Sjeff UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */ 1530219820Sjeffcpp_hashnode * 1531219820Sjeff_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) 1532219820Sjeff{ 1533219820Sjeff /* It turns out that a UCN escape always turns into fewer characters 1534219820Sjeff than the escape itself, so we can allocate a temporary in advance. 
*/ 1535219820Sjeff uchar * buf = (uchar *) alloca (len + 1); 1536219820Sjeff uchar * bufp = buf; 1537219820Sjeff size_t idp; 1538219820Sjeff 1539219820Sjeff for (idp = 0; idp < len; idp++) 1540255932Salfred if (id[idp] != '\\') 1541255932Salfred *bufp++ = id[idp]; 1542255932Salfred else 1543219820Sjeff { 1544 unsigned length = id[idp+1] == 'u' ? 4 : 8; 1545 cppchar_t value = 0; 1546 size_t bufleft = len - (bufp - buf); 1547 int rval; 1548 1549 idp += 2; 1550 while (length && idp < len && ISXDIGIT (id[idp])) 1551 { 1552 value = (value << 4) + hex_value (id[idp]); 1553 idp++; 1554 length--; 1555 } 1556 idp--; 1557 1558 /* Special case for EBCDIC: if the identifier contains 1559 a '$' specified using a UCN, translate it to EBCDIC. */ 1560 if (value == 0x24) 1561 { 1562 *bufp++ = '$'; 1563 continue; 1564 } 1565 1566 rval = one_cppchar_to_utf8 (value, &bufp, &bufleft); 1567 if (rval) 1568 { 1569 errno = rval; 1570 cpp_errno (pfile, CPP_DL_ERROR, 1571 "converting UCN to source character set"); 1572 break; 1573 } 1574 } 1575 1576 return CPP_HASHNODE (ht_lookup (pfile->hash_table, 1577 buf, bufp - buf, HT_ALLOC)); 1578} 1579 1580/* Convert an input buffer (containing the complete contents of one 1581 source file) from INPUT_CHARSET to the source character set. INPUT 1582 points to the input buffer, SIZE is its allocated size, and LEN is 1583 the length of the meaningful data within the buffer. The 1584 translated buffer is returned, and *ST_SIZE is set to the length of 1585 the meaningful data within the translated buffer. 1586 1587 INPUT is expected to have been allocated with xmalloc. This function 1588 will either return INPUT, or free it and return a pointer to another 1589 xmalloc-allocated block of memory. 
*/ 1590uchar * 1591_cpp_convert_input (cpp_reader *pfile, const char *input_charset, 1592 uchar *input, size_t size, size_t len, off_t *st_size) 1593{ 1594 struct cset_converter input_cset; 1595 struct _cpp_strbuf to; 1596 1597 input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset); 1598 if (input_cset.func == convert_no_conversion) 1599 { 1600 to.text = input; 1601 to.asize = size; 1602 to.len = len; 1603 } 1604 else 1605 { 1606 to.asize = MAX (65536, len); 1607 to.text = XNEWVEC (uchar, to.asize); 1608 to.len = 0; 1609 1610 if (!APPLY_CONVERSION (input_cset, input, len, &to)) 1611 cpp_error (pfile, CPP_DL_ERROR, 1612 "failure to convert %s to %s", 1613 CPP_OPTION (pfile, input_charset), SOURCE_CHARSET); 1614 1615 free (input); 1616 } 1617 1618 /* Clean up the mess. */ 1619 if (input_cset.func == convert_using_iconv) 1620 iconv_close (input_cset.cd); 1621 1622 /* Resize buffer if we allocated substantially too much, or if we 1623 haven't enough space for the \n-terminator. */ 1624 if (to.len + 4096 < to.asize || to.len >= to.asize) 1625 to.text = XRESIZEVEC (uchar, to.text, to.len + 1); 1626 1627 /* If the file is using old-school Mac line endings (\r only), 1628 terminate with another \r, not an \n, so that we do not mistake 1629 the \r\n sequence for a single DOS line ending and erroneously 1630 issue the "No newline at end of file" diagnostic. */ 1631 /* APPLE LOCAL don't access to.text[-1] radar 6121572 */ 1632 if (to.len > 0 && to.text[to.len - 1] == '\r') 1633 to.text[to.len] = '\r'; 1634 else 1635 to.text[to.len] = '\n'; 1636 1637 *st_size = to.len; 1638 return to.text; 1639} 1640 1641/* Decide on the default encoding to assume for input files. 
*/
const char *
_cpp_default_encoding (void)
{
  const char *encoding = NULL;

  /* The locale query below is deliberately compiled out (note the
     trailing "&& 0").  The default codeset is 7-bit ASCII on most
     platforms, and this causes conversion failures on every file in
     GCC that happens to have one of the upper 128 characters in it --
     most likely, as part of the name of a contributor.

     We should definitely recognize in-band markers of file encoding,
     like:
     - the appropriate Unicode byte-order mark (FE FF) to recognize
       UTF16 and UCS4 (in both big-endian and little-endian flavors)
       and UTF8
     - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
       distinguish ASCII and EBCDIC.
     - now we can parse something like "#pragma GCC encoding <xyz>"
       on the first line, or even Emacs/VIM's mode line tags (there's
       a problem here in that VIM uses the last line, and Emacs has
       its more elaborate "local variables" convention).
     - investigate whether Java has another common convention, which
       would be friendly to support.
     (Zack Weinberg and Paolo Bonzini, May 20th 2004)  */
#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
  setlocale (LC_CTYPE, "");
  encoding = nl_langinfo (CODESET);
#endif

  /* Use whatever was detected, provided it is a non-empty name.  */
  if (encoding != NULL && *encoding != '\0')
    return encoding;

  /* Otherwise assume input is already in the source character set.  */
  return SOURCE_CHARSET;
}