gcclibs/libcpp/charset.c

219820Sjeff/* CPP Library - charsets
219820Sjeff   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
272407Shselasky   Free Software Foundation, Inc.
219820Sjeff
219820Sjeff   Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
219820Sjeff
219820SjeffThis program is free software; you can redistribute it and/or modify it
219820Sjeffunder the terms of the GNU General Public License as published by the
219820SjeffFree Software Foundation; either version 2, or (at your option) any
219820Sjefflater version.
219820Sjeff
219820SjeffThis program is distributed in the hope that it will be useful,
219820Sjeffbut WITHOUT ANY WARRANTY; without even the implied warranty of
219820SjeffMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
219820SjeffGNU General Public License for more details.
219820Sjeff
219820SjeffYou should have received a copy of the GNU General Public License
219820Sjeffalong with this program; if not, write to the Free Software
219820SjeffFoundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
219820Sjeff
219820Sjeff#include "config.h"
219820Sjeff#include "system.h"
219820Sjeff#include "cpplib.h"
219820Sjeff#include "internal.h"
219820Sjeff
219820Sjeff/* Character set handling for C-family languages.
219820Sjeff
219820Sjeff   Terminological note: In what follows, "charset" or "character set"
219820Sjeff   will be taken to mean both an abstract set of characters and an
219820Sjeff   encoding for that set.
219820Sjeff
219820Sjeff   The C99 standard discusses two character sets: source and execution.
219820Sjeff   The source character set is used for internal processing in translation
219820Sjeff   phases 1 through 4; the execution character set is used thereafter.
272407Shselasky   Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
219820Sjeff   character encodings (see 3.7.2, 3.7.3 for the standardese meanings
219820Sjeff   of these terms).  Furthermore, the "basic character set" (listed in
272407Shselasky   5.2.1p3) is to be encoded in each with values one byte wide, and is
279731Shselasky   to appear in the initial shift state.
219820Sjeff
219820Sjeff   It is not explicitly mentioned, but there is also a "wide execution
219820Sjeff   character set" used to encode wide character constants and wide
255932Salfred   string literals; this is supposed to be the result of applying the
219820Sjeff   standard library function mbstowcs() to an equivalent narrow string
255932Salfred   (6.4.5p5).  However, the behavior of hexadecimal and octal
255932Salfred   \-escapes is at odds with this; they are supposed to be translated
255932Salfred   directly to wchar_t values (6.4.4.4p5,6).
255932Salfred
255932Salfred   The source character set is not necessarily the character set used
255932Salfred   to encode physical source files on disk; translation phase 1 converts
255932Salfred   from whatever that encoding is to the source character set.
255932Salfred
255932Salfred   The presence of universal character names in C99 (6.4.3 et seq.)
255932Salfred   forces the source character set to be isomorphic to ISO 10646,
255932Salfred   that is, Unicode.  There is no such constraint on the execution
255932Salfred   character set; note also that the conversion from source to
255932Salfred   execution character set does not occur for identifiers (5.1.1.2p1#5).
255932Salfred
255932Salfred   For convenience of implementation, the source character set's
255932Salfred   encoding of the basic character set should be identical to the
255932Salfred   execution character set OF THE HOST SYSTEM's encoding of the basic
255932Salfred   character set, and it should not be a state-dependent encoding.
255932Salfred
255932Salfred   cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
255932Salfred   depending on whether the host is based on ASCII or EBCDIC (see
255932Salfred   respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
255932Salfred   Technical Report #16).  With limited exceptions, it relies on the
255932Salfred   system library's iconv() primitive to do charset conversion
255932Salfred   (specified in SUSv2).  */
255932Salfred
#if !HAVE_ICONV
/* No system iconv: supply stand-ins that always fail.  The calls
   further down are guarded only by ordinary `if' tests on a
   compile-time constant, so the names must still resolve or the
   link would fail.  */
# define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
# define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
# define iconv_close(x)   (void)0
# define ICONV_CONST
#endif
255932Salfred
/* Pick the source character set from the host's basic character set,
   together with the upper bound used when checking that a value
   could be a member of the basic source character set.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
# define SOURCE_CHARSET "UTF-8"
# define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
# define SOURCE_CHARSET "UTF-EBCDIC"
# define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
# error "Unrecognized basic host character set"
#endif
255932Salfred
/* Where errno.h provides no EILSEQ, ill-formed input is reported as
   EINVAL instead.  */
#ifndef EILSEQ
# define EILSEQ EINVAL
#endif
219820Sjeff
219820Sjeff/* This structure is used for a resizable string buffer throughout.  */
255932Salfred/* Don't call it strbuf, as that conflicts with unistd.h on systems
255932Salfred   such as DYNIX/ptx where unistd.h includes stropts.h.  */
219820Sjeffstruct _cpp_strbuf
255932Salfred{
255932Salfred  uchar *text;
255932Salfred  size_t asize;
255932Salfred  size_t len;
255932Salfred};
255932Salfred
/* Granularity, in bytes, by which output buffers are enlarged.  An
   80-column line still fits in one block after iconv has quadrupled
   its size (e.g. ASCII to UTF-32), rounded up to a power of two.  */
#define OUTBUF_BLOCK_SIZE 256
255932Salfred
219820Sjeff/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
219820Sjeff   logic.  This is because a depressing number of systems lack iconv,
   or have iconv libraries that do not do these conversions, so
255932Salfred   we need a fallback implementation for them.  To ensure the fallback
255932Salfred   doesn't break due to neglect, it is used on all systems.
255932Salfred
219820Sjeff   UTF-32 encoding is nice and simple: a four-byte binary number,
219820Sjeff   constrained to the range 00000000-7FFFFFFF to avoid questions of
219820Sjeff   signedness.  We do have to cope with big- and little-endian
219820Sjeff   variants.
219820Sjeff
219820Sjeff   UTF-16 encoding uses two-byte binary numbers, again in big- and
219820Sjeff   little-endian variants, for all values in the 00000000-0000FFFF
255932Salfred   range.  Values in the 00010000-0010FFFF range are encoded as pairs
255932Salfred   of two-byte numbers, called "surrogate pairs": given a number S in
255932Salfred   this range, it is mapped to a pair (H, L) as follows:
255932Salfred
272407Shselasky     H = (S - 0x10000) / 0x400 + 0xD800
255932Salfred     L = (S - 0x10000) % 0x400 + 0xDC00
255932Salfred
272407Shselasky   Two-byte values in the D800...DFFF range are ill-formed except as a
272407Shselasky   component of a surrogate pair.  Even if the encoding within a
272407Shselasky   two-byte value is little-endian, the H member of the surrogate pair
272407Shselasky   comes first.
272407Shselasky
255932Salfred   There is no way to encode values in the 00110000-7FFFFFFF range,
255932Salfred   which is not currently a problem as there are no assigned code
255932Salfred   points in that range; however, the author expects that it will
255932Salfred   eventually become necessary to abandon UTF-16 due to this
255932Salfred   limitation.  Note also that, because of these pairs, UTF-16 does
255932Salfred   not meet the requirements of the C standard for a wide character
255932Salfred   encoding (see 3.7.3 and 6.4.4.4p11).
255932Salfred
219820Sjeff   UTF-8 encoding looks like this:
255932Salfred
255932Salfred   value range	       encoded as
255932Salfred   00000000-0000007F   0xxxxxxx
255932Salfred   00000080-000007FF   110xxxxx 10xxxxxx
255932Salfred   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
255932Salfred   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
255932Salfred   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
255932Salfred   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
255932Salfred
255932Salfred   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
255932Salfred   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
255932Salfred   never occur.  Note also that any value that can be encoded by a
255932Salfred   given row of the table can also be encoded by all successive rows,
255932Salfred   but this is not done; only the shortest possible encoding for any
255932Salfred   given value is valid.  For instance, the character 07C0 could be
255932Salfred   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
255932Salfred   FC 80 80 80 9F 80.  Only the first is valid.
272407Shselasky
272407Shselasky   An implementation note: the transformation from UTF-16 to UTF-8, or
272407Shselasky   vice versa, is easiest done by using UTF-32 as an intermediary.  */
255932Salfred
255932Salfred/* Internal primitives which go from an UTF-8 byte stream to native-endian
255932Salfred   UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
255932Salfred   operation in several places below.  */
255932Salfredstatic inline int
255932Salfredone_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
255932Salfred		     cppchar_t *cp)
255932Salfred{
255932Salfred  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
255932Salfred  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
255932Salfred
255932Salfred  cppchar_t c;
255932Salfred  const uchar *inbuf = *inbufp;
255932Salfred  size_t nbytes, i;
255932Salfred
255932Salfred  if (*inbytesleftp < 1)
255932Salfred    return EINVAL;
255932Salfred
255932Salfred  c = *inbuf;
255932Salfred  if (c < 0x80)
255932Salfred    {
255932Salfred      *cp = c;
255932Salfred      *inbytesleftp -= 1;
255932Salfred      *inbufp += 1;
255932Salfred      return 0;
255932Salfred    }
255932Salfred
255932Salfred  /* The number of leading 1-bits in the first byte indicates how many
255932Salfred     bytes follow.  */
255932Salfred  for (nbytes = 2; nbytes < 7; nbytes++)
255932Salfred    if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
255932Salfred      goto found;
255932Salfred  return EILSEQ;
255932Salfred found:
255932Salfred
255932Salfred  if (*inbytesleftp < nbytes)
255932Salfred    return EINVAL;
255932Salfred
255932Salfred  c = (c & masks[nbytes-1]);
255932Salfred  inbuf++;
255932Salfred  for (i = 1; i < nbytes; i++)
255932Salfred    {
255932Salfred      cppchar_t n = *inbuf++;
255932Salfred      if ((n & 0xC0) != 0x80)
255932Salfred	return EILSEQ;
255932Salfred      c = ((c << 6) + (n & 0x3F));
255932Salfred    }
255932Salfred
255932Salfred  /* Make sure the shortest possible encoding was used.  */
255932Salfred  if (c <=      0x7F && nbytes > 1) return EILSEQ;
255932Salfred  if (c <=     0x7FF && nbytes > 2) return EILSEQ;
255932Salfred  if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
255932Salfred  if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
255932Salfred  if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
255932Salfred
255932Salfred  /* Make sure the character is valid.  */
255932Salfred  if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
255932Salfred
255932Salfred  *cp = c;
255932Salfred  *inbufp = inbuf;
255932Salfred  *inbytesleftp -= nbytes;
255932Salfred  return 0;
255932Salfred}
255932Salfred
255932Salfredstatic inline int
255932Salfredone_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
255932Salfred{
255932Salfred  static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
255932Salfred  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
255932Salfred  size_t nbytes;
255932Salfred  uchar buf[6], *p = &buf[6];
255932Salfred  uchar *outbuf = *outbufp;
255932Salfred
255932Salfred  nbytes = 1;
255932Salfred  if (c < 0x80)
255932Salfred    *--p = c;
255932Salfred  else
255932Salfred    {
255932Salfred      do
255932Salfred	{
255932Salfred	  *--p = ((c & 0x3F) | 0x80);
255932Salfred	  c >>= 6;
255932Salfred	  nbytes++;
255932Salfred	}
272407Shselasky      while (c >= 0x3F || (c & limits[nbytes-1]));
272407Shselasky      *--p = (c | masks[nbytes-1]);
272407Shselasky    }
255932Salfred
255932Salfred  if (*outbytesleftp < nbytes)
255932Salfred    return E2BIG;
255932Salfred
255932Salfred  while (p < &buf[6])
255932Salfred    *outbuf++ = *p++;
255932Salfred  *outbytesleftp -= nbytes;
255932Salfred  *outbufp = outbuf;
255932Salfred  return 0;
255932Salfred}
255932Salfred
255932Salfred/* The following four functions transform one character between the two
255932Salfred   encodings named in the function name.  All have the signature
255932Salfred   int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
255932Salfred           uchar **outbufp, size_t *outbytesleftp)
255932Salfred
255932Salfred   BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
255932Salfred   interpreted as a boolean indicating whether big-endian or
255932Salfred   little-endian encoding is to be used for the member of the pair
255932Salfred   that is not UTF-8.
255932Salfred
272407Shselasky   INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
255932Salfred   do for iconv.
255932Salfred
255932Salfred   The return value is either 0 for success, or an errno value for
255932Salfred   failure, which may be E2BIG (need more space), EILSEQ (ill-formed
   input sequence), or EINVAL (incomplete input sequence).  */
255932Salfred
255932Salfredstatic inline int
255932Salfredone_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
255932Salfred		   uchar **outbufp, size_t *outbytesleftp)
255932Salfred{
255932Salfred  uchar *outbuf;
255932Salfred  cppchar_t s = 0;
255932Salfred  int rval;
255932Salfred
255932Salfred  /* Check for space first, since we know exactly how much we need.  */
255932Salfred  if (*outbytesleftp < 4)
255932Salfred    return E2BIG;
255932Salfred
255932Salfred  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
255932Salfred  if (rval)
255932Salfred    return rval;
255932Salfred
255932Salfred  outbuf = *outbufp;
272407Shselasky  outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
272407Shselasky  outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
272407Shselasky  outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
255932Salfred  outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
255932Salfred
255932Salfred  *outbufp += 4;
255932Salfred  *outbytesleftp -= 4;
255932Salfred  return 0;
255932Salfred}
255932Salfred
255932Salfredstatic inline int
255932Salfredone_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
255932Salfred		   uchar **outbufp, size_t *outbytesleftp)
255932Salfred{
255932Salfred  cppchar_t s;
255932Salfred  int rval;
255932Salfred  const uchar *inbuf;
255932Salfred
255932Salfred  if (*inbytesleftp < 4)
255932Salfred    return EINVAL;
255932Salfred
255932Salfred  inbuf = *inbufp;
255932Salfred
255932Salfred  s  = inbuf[bigend ? 0 : 3] << 24;
255932Salfred  s += inbuf[bigend ? 1 : 2] << 16;
255932Salfred  s += inbuf[bigend ? 2 : 1] << 8;
255932Salfred  s += inbuf[bigend ? 3 : 0];
255932Salfred
255932Salfred  if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
255932Salfred    return EILSEQ;
272407Shselasky
272407Shselasky  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
272407Shselasky  if (rval)
272407Shselasky    return rval;
272407Shselasky
255932Salfred  *inbufp += 4;
272407Shselasky  *inbytesleftp -= 4;
255932Salfred  return 0;
255932Salfred}
255932Salfred
255932Salfredstatic inline int
272407Shselaskyone_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
255932Salfred		   uchar **outbufp, size_t *outbytesleftp)
255932Salfred{
255932Salfred  int rval;
272407Shselasky  cppchar_t s = 0;
272407Shselasky  const uchar *save_inbuf = *inbufp;
272407Shselasky  size_t save_inbytesleft = *inbytesleftp;
255932Salfred  uchar *outbuf = *outbufp;
255932Salfred
255932Salfred  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
255932Salfred  if (rval)
255932Salfred    return rval;
255932Salfred
255932Salfred  if (s > 0x0010FFFF)
255932Salfred    {
255932Salfred      *inbufp = save_inbuf;
272407Shselasky      *inbytesleftp = save_inbytesleft;
272407Shselasky      return EILSEQ;
272407Shselasky    }
272407Shselasky
272407Shselasky  if (s < 0xFFFF)
272407Shselasky    {
255932Salfred      if (*outbytesleftp < 2)
255932Salfred	{
255932Salfred	  *inbufp = save_inbuf;
255932Salfred	  *inbytesleftp = save_inbytesleft;
255932Salfred	  return E2BIG;
272407Shselasky	}
272407Shselasky      outbuf[bigend ? 1 : 0] = (s & 0x00FF);
272407Shselasky      outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
272407Shselasky
272407Shselasky      *outbufp += 2;
272407Shselasky      *outbytesleftp -= 2;
272407Shselasky      return 0;
272407Shselasky    }
272407Shselasky  else
272407Shselasky    {
272407Shselasky      cppchar_t hi, lo;
272407Shselasky
272407Shselasky      if (*outbytesleftp < 4)
272407Shselasky	{
272407Shselasky	  *inbufp = save_inbuf;
272407Shselasky	  *inbytesleftp = save_inbytesleft;
272407Shselasky	  return E2BIG;
272407Shselasky	}
272407Shselasky
272407Shselasky      hi = (s - 0x10000) / 0x400 + 0xD800;
272407Shselasky      lo = (s - 0x10000) % 0x400 + 0xDC00;
272407Shselasky
272407Shselasky      /* Even if we are little-endian, put the high surrogate first.
272407Shselasky	 ??? Matches practice?  */
272407Shselasky      outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
255932Salfred      outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
255932Salfred      outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
255932Salfred      outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
255932Salfred
255932Salfred      *outbufp += 4;
255932Salfred      *outbytesleftp -= 4;
255932Salfred      return 0;
255932Salfred    }
255932Salfred}
255932Salfred
255932Salfredstatic inline int
255932Salfredone_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
255932Salfred		   uchar **outbufp, size_t *outbytesleftp)
255932Salfred{
255932Salfred  cppchar_t s;
255932Salfred  const uchar *inbuf = *inbufp;
255932Salfred  int rval;
255932Salfred
255932Salfred  if (*inbytesleftp < 2)
255932Salfred    return EINVAL;
255932Salfred  s  = inbuf[bigend ? 0 : 1] << 8;
255932Salfred  s += inbuf[bigend ? 1 : 0];
255932Salfred
255932Salfred  /* Low surrogate without immediately preceding high surrogate is invalid.  */
255932Salfred  if (s >= 0xDC00 && s <= 0xDFFF)
255932Salfred    return EILSEQ;
255932Salfred  /* High surrogate must have a following low surrogate.  */
255932Salfred  else if (s >= 0xD800 && s <= 0xDBFF)
255932Salfred    {
255932Salfred      cppchar_t hi = s, lo;
255932Salfred      if (*inbytesleftp < 4)
255932Salfred	return EINVAL;
255932Salfred
255932Salfred      lo  = inbuf[bigend ? 2 : 3] << 8;
255932Salfred      lo += inbuf[bigend ? 3 : 2];
255932Salfred
255932Salfred      if (lo < 0xDC00 || lo > 0xDFFF)
255932Salfred	return EILSEQ;
255932Salfred
255932Salfred      s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
255932Salfred    }
255932Salfred
255932Salfred  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
255932Salfred  if (rval)
255932Salfred    return rval;
255932Salfred
255932Salfred  /* Success - update the input pointers (one_cppchar_to_utf8 has done
272407Shselasky     the output pointers for us).  */
272407Shselasky  if (s <= 0xFFFF)
272407Shselasky    {
255932Salfred      *inbufp += 2;
255932Salfred      *inbytesleftp -= 2;
255932Salfred    }
255932Salfred  else
255932Salfred    {
255932Salfred      *inbufp += 4;
255932Salfred      *inbytesleftp -= 4;
255932Salfred    }
255932Salfred  return 0;
255932Salfred}
255932Salfred
255932Salfred/* Helper routine for the next few functions.  The 'const' on
255932Salfred   one_conversion means that we promise not to modify what function is
255932Salfred   pointed to, which lets the inliner see through it.  */
255932Salfred
255932Salfredstatic inline bool
255932Salfredconversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
255932Salfred					     uchar **, size_t *),
255932Salfred		 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
255932Salfred{
255932Salfred  const uchar *inbuf;
255932Salfred  uchar *outbuf;
255932Salfred  size_t inbytesleft, outbytesleft;
272407Shselasky  int rval;
272407Shselasky
272407Shselasky  inbuf = from;
272407Shselasky  inbytesleft = flen;
272407Shselasky  outbuf = to->text + to->len;
272407Shselasky  outbytesleft = to->asize - to->len;
272407Shselasky
272407Shselasky  for (;;)
255932Salfred    {
272407Shselasky      do
272407Shselasky	rval = one_conversion (cd, &inbuf, &inbytesleft,
272407Shselasky			       &outbuf, &outbytesleft);
272407Shselasky      while (inbytesleft && !rval);
272407Shselasky
272407Shselasky      if (__builtin_expect (inbytesleft == 0, 1))
272407Shselasky	{
272407Shselasky	  to->len = to->asize - outbytesleft;
272407Shselasky	  return true;
272407Shselasky	}
272407Shselasky      if (rval != E2BIG)
272407Shselasky	{
272407Shselasky	  errno = rval;
272407Shselasky	  return false;
272407Shselasky	}
272407Shselasky
272407Shselasky      outbytesleft += OUTBUF_BLOCK_SIZE;
272407Shselasky      to->asize += OUTBUF_BLOCK_SIZE;
272407Shselasky      to->text = XRESIZEVEC (uchar, to->text, to->asize);
272407Shselasky      outbuf = to->text + to->asize - outbytesleft;
255932Salfred    }
255932Salfred}
255932Salfred
272407Shselasky
272407Shselasky/* These functions convert entire strings between character sets.
272407Shselasky   They all have the signature
272407Shselasky
272407Shselasky   bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
255932Salfred
255932Salfred   The input string FROM is converted as specified by the function
255932Salfred   name plus the iconv descriptor CD (which may be fake), and the
255932Salfred   result appended to TO.  On any error, false is returned, otherwise true.  */
255932Salfred
255932Salfred/* These four use the custom conversion code above.  */
255932Salfredstatic bool
255932Salfredconvert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
255932Salfred		    struct _cpp_strbuf *to)
255932Salfred{
255932Salfred  return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
255932Salfred}
255932Salfred
255932Salfredstatic bool
255932Salfredconvert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
255932Salfred		    struct _cpp_strbuf *to)
255932Salfred{
255932Salfred  return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
255932Salfred}
255932Salfred
255932Salfredstatic bool
255932Salfredconvert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
255932Salfred		    struct _cpp_strbuf *to)
255932Salfred{
255932Salfred  return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
255932Salfred}
255932Salfred
255932Salfredstatic bool
255932Salfredconvert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
255932Salfred		    struct _cpp_strbuf *to)
255932Salfred{
255932Salfred  return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
255932Salfred}
255932Salfred
255932Salfred/* Identity conversion, used when we have no alternative.  */
255932Salfredstatic bool
255932Salfredconvert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
255932Salfred		       const uchar *from, size_t flen, struct _cpp_strbuf *to)
255932Salfred{
255932Salfred  if (to->len + flen > to->asize)
255932Salfred    {
255932Salfred      to->asize = to->len + flen;
255932Salfred      to->text = XRESIZEVEC (uchar, to->text, to->asize);
255932Salfred    }
255932Salfred  memcpy (to->text + to->len, from, flen);
272407Shselasky  to->len += flen;
255932Salfred  return true;
255932Salfred}
255932Salfred
255932Salfred/* And this one uses the system iconv primitive.  It's a little
255932Salfred   different, since iconv's interface is a little different.  */
255932Salfred#if HAVE_ICONV
255932Salfredstatic bool
255932Salfredconvert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
272407Shselasky		     struct _cpp_strbuf *to)
272407Shselasky{
272407Shselasky  ICONV_CONST char *inbuf;
255932Salfred  char *outbuf;
255932Salfred  size_t inbytesleft, outbytesleft;
255932Salfred
255932Salfred  /* Reset conversion descriptor and check that it is valid.  */
255932Salfred  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
255932Salfred    return false;
255932Salfred
255932Salfred  inbuf = (ICONV_CONST char *)from;
255932Salfred  inbytesleft = flen;
255932Salfred  outbuf = (char *)to->text + to->len;
255932Salfred  outbytesleft = to->asize - to->len;
255932Salfred
255932Salfred  for (;;)
255932Salfred    {
255932Salfred      iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
255932Salfred      if (__builtin_expect (inbytesleft == 0, 1))
255932Salfred	{
255932Salfred	  to->len = to->asize - outbytesleft;
255932Salfred	  return true;
255932Salfred	}
255932Salfred      if (errno != E2BIG)
255932Salfred	return false;
255932Salfred
255932Salfred      outbytesleft += OUTBUF_BLOCK_SIZE;
255932Salfred      to->asize += OUTBUF_BLOCK_SIZE;
255932Salfred      to->text = XRESIZEVEC (uchar, to->text, to->asize);
255932Salfred      outbuf = (char *)to->text + to->asize - outbytesleft;
255932Salfred    }
255932Salfred}
255932Salfred#else
255932Salfred#define convert_using_iconv 0 /* prevent undefined symbol error below */
255932Salfred#endif
272407Shselasky
272407Shselasky/* Arrange for the above custom conversion logic to be used automatically
272407Shselasky   when conversion between a suitable pair of character sets is requested.  */
272407Shselasky
/* Invoke CONVERTER's conversion function, passing its descriptor,
   on the FLEN bytes at FROM, appending to TO.  CONVERTER is
   evaluated twice, so it must be free of side effects.  Every
   argument and the whole expansion are parenthesized so that
   arbitrary expressions (such as *PTR) can be passed safely.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   ((CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO)))
272407Shselasky
255932Salfredstruct conversion
255932Salfred{
255932Salfred  const char *pair;
272407Shselasky  convert_f func;
272407Shselasky  iconv_t fake_cd;
272407Shselasky};
272407Shselaskystatic const struct conversion conversion_tab[] = {
272407Shselasky  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
272407Shselasky  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
272407Shselasky  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
272407Shselasky  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
272407Shselasky  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
272407Shselasky  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
272407Shselasky  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
272407Shselasky  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
272407Shselasky};
272407Shselasky
272407Shselasky/* Subroutine of cpp_init_iconv: initialize and return a
272407Shselasky   cset_converter structure for conversion from FROM to TO.  If
272407Shselasky   iconv_open() fails, issue an error and return an identity
255932Salfred   converter.  Silently return an identity converter if FROM and TO
272407Shselasky   are identical.  */
272407Shselaskystatic struct cset_converter
272407Shselaskyinit_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
272407Shselasky{
272407Shselasky  struct cset_converter ret;
255932Salfred  char *pair;
272407Shselasky  size_t i;
272407Shselasky
272407Shselasky  if (!strcasecmp (to, from))
272407Shselasky    {
255932Salfred      ret.func = convert_no_conversion;
272407Shselasky      ret.cd = (iconv_t) -1;
272407Shselasky      return ret;
272407Shselasky    }
272407Shselasky
272407Shselasky  pair = (char *) alloca(strlen(to) + strlen(from) + 2);
272407Shselasky
272407Shselasky  strcpy(pair, from);
272407Shselasky  strcat(pair, "/");
272407Shselasky  strcat(pair, to);
272407Shselasky  for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
272407Shselasky    if (!strcasecmp (pair, conversion_tab[i].pair))
272407Shselasky      {
255932Salfred	ret.func = conversion_tab[i].func;
255932Salfred	ret.cd = conversion_tab[i].fake_cd;
255932Salfred	return ret;
255932Salfred      }
255932Salfred
255932Salfred  /* No custom converter - try iconv.  */
255932Salfred  if (HAVE_ICONV)
255932Salfred    {
255932Salfred      ret.func = convert_using_iconv;
255932Salfred      ret.cd = iconv_open (to, from);
255932Salfred
255932Salfred      if (ret.cd == (iconv_t) -1)
255932Salfred	{
255932Salfred	  if (errno == EINVAL)
255932Salfred	    cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
255932Salfred		       "conversion from %s to %s not supported by iconv",
219820Sjeff		       from, to);
219820Sjeff	  else
219820Sjeff	    cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
219820Sjeff
219820Sjeff	  ret.func = convert_no_conversion;
219820Sjeff	}
219820Sjeff    }
219820Sjeff  else
219820Sjeff    {
219820Sjeff      cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
219820Sjeff		 "no iconv implementation, cannot convert from %s to %s",
219820Sjeff		 from, to);
219820Sjeff      ret.func = convert_no_conversion;
219820Sjeff      ret.cd = (iconv_t) -1;
255932Salfred    }
255932Salfred  return ret;
255932Salfred}
255932Salfred
219820Sjeff/* If charset conversion is requested, initialize iconv(3) descriptors
219820Sjeff   for conversion from the source character set to the execution
219820Sjeff   character sets.  If iconv is not present in the C library, and
219820Sjeff   conversion is requested, issue an error.  */
219820Sjeff
255932Salfredvoid
255932Salfredcpp_init_iconv (cpp_reader *pfile)
255932Salfred{
219820Sjeff  const char *ncset = CPP_OPTION (pfile, narrow_charset);
219820Sjeff  const char *wcset = CPP_OPTION (pfile, wide_charset);
219820Sjeff  const char *default_wcset;
219820Sjeff
219820Sjeff  bool be = CPP_OPTION (pfile, bytes_big_endian);
219820Sjeff
219820Sjeff  if (CPP_OPTION (pfile, wchar_precision) >= 32)
219820Sjeff    default_wcset = be ? "UTF-32BE" : "UTF-32LE";
255932Salfred  else if (CPP_OPTION (pfile, wchar_precision) >= 16)
219820Sjeff    default_wcset = be ? "UTF-16BE" : "UTF-16LE";
219820Sjeff  else
219820Sjeff    /* This effectively means that wide strings are not supported,
219820Sjeff       so don't do any conversion at all.  */
279731Shselasky   default_wcset = SOURCE_CHARSET;
279731Shselasky
279731Shselasky  if (!ncset)
279731Shselasky    ncset = SOURCE_CHARSET;
219820Sjeff  if (!wcset)
255932Salfred    wcset = default_wcset;
219820Sjeff
219820Sjeff  pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
219820Sjeff  pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
255932Salfred}
219820Sjeff
219820Sjeff/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary.  */
219820Sjeffvoid
255932Salfred_cpp_destroy_iconv (cpp_reader *pfile)
255932Salfred{
219820Sjeff  if (HAVE_ICONV)
219820Sjeff    {
219820Sjeff      if (pfile->narrow_cset_desc.func == convert_using_iconv)
219820Sjeff	iconv_close (pfile->narrow_cset_desc.cd);
219820Sjeff      if (pfile->wide_cset_desc.func == convert_using_iconv)
219820Sjeff	iconv_close (pfile->wide_cset_desc.cd);
219820Sjeff    }
255932Salfred}
219820Sjeff
219820Sjeff/* Utility routine for use by a full compiler.  C is a character taken
219820Sjeff   from the *basic* source character set, encoded in the host's
219820Sjeff   execution encoding.  Convert it to (the target's) execution
219820Sjeff   encoding, and return that value.
219820Sjeff
219820Sjeff   Issues an internal error if C's representation in the narrow
219820Sjeff   execution character set fails to be a single-byte value (C99
219820Sjeff   5.2.1p3: "The representation of each member of the source and
219820Sjeff   execution character sets shall fit in a byte.")  May also issue an
272407Shselasky   internal error if C fails to be a member of the basic source
272407Shselasky   character set (testing this exactly is too hard, especially when
272407Shselasky   the host character set is EBCDIC).  */
272407Shselaskycppchar_t
272407Shselaskycpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
272407Shselasky{
272407Shselasky  uchar sbuf[1];
272407Shselasky  struct _cpp_strbuf tbuf;
272407Shselasky
272407Shselasky  /* This test is merely an approximation, but it suffices to catch
272407Shselasky     the most important thing, which is that we don't get handed a
272407Shselasky     character outside the unibyte range of the host character set.  */
272407Shselasky  if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
272407Shselasky    {
272407Shselasky      cpp_error (pfile, CPP_DL_ICE,
272407Shselasky		 "character 0x%lx is not in the basic source character set\n",
272407Shselasky		 (unsigned long)c);
272407Shselasky      return 0;
272407Shselasky    }
255932Salfred
255932Salfred  /* Being a character in the unibyte range of the host character set,
219820Sjeff     we can safely splat it into a one-byte buffer and trust that that
272407Shselasky     is a well-formed string.  */
255932Salfred  sbuf[0] = c;
272407Shselasky
272407Shselasky  /* This should never need to reallocate, but just in case... */
272407Shselasky  tbuf.asize = 1;
255932Salfred  tbuf.text = XNEWVEC (uchar, tbuf.asize);
272407Shselasky  tbuf.len = 0;
272407Shselasky
272407Shselasky  if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
255932Salfred    {
255932Salfred      cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
255932Salfred      return 0;
255932Salfred    }
255932Salfred  if (tbuf.len != 1)
255932Salfred    {
255932Salfred      cpp_error (pfile, CPP_DL_ICE,
255932Salfred		 "character 0x%lx is not unibyte in execution character set",
255932Salfred		 (unsigned long)c);
255932Salfred      return 0;
255932Salfred    }
255932Salfred  c = tbuf.text[0];
255932Salfred  free(tbuf.text);
272407Shselasky  return c;
272407Shselasky}
272407Shselasky
272407Shselasky
272407Shselasky
272407Shselasky/* Utility routine that computes a mask of the form 0000...111... with
272407Shselasky   WIDTH 1-bits.  */
272407Shselaskystatic inline size_t
272407Shselaskywidth_to_mask (size_t width)
272407Shselasky{
272407Shselasky  width = MIN (width, BITS_PER_CPPCHAR_T);
272407Shselasky  if (width >= CHAR_BIT * sizeof (size_t))
272407Shselasky    return ~(size_t) 0;
272407Shselasky  else
272407Shselasky    return ((size_t) 1 << width) - 1;
272407Shselasky}
272407Shselasky
/* A large table of unicode character information.  These are the
   property bits that may be combined in each entry's `flags' byte.  */
enum {
  C99 = 1 << 0,  /* Valid in a C99 identifier?  */
  DIG = 1 << 1,  /* Valid in a C99 identifier, but not as the first
                    character?  */
  CXX = 1 << 2,  /* Valid in a C++ identifier?  */
  CID = 1 << 3,  /* NFC representation is not valid in an identifier?  */
  NFC = 1 << 4,  /* Might be valid NFC form?  */
  NKC = 1 << 5,  /* Might be valid NFKC form?  */
  CTX = 1 << 6   /* Certain preceding characters might make it not
                    valid NFC/NFKC form?  */
};
272407Shselasky
272407Shselaskystatic const struct {
272407Shselasky  /* Bitmap of flags above.  */
255932Salfred  unsigned char flags;
255932Salfred  /* Combining class of the character.  */
255932Salfred  unsigned char combine;
272407Shselasky  /* Last character in the range described by this entry.  */
255932Salfred  unsigned short end;
272407Shselasky} ucnranges[] = {
255932Salfred#include "ucnid.h"
272407Shselasky};
255932Salfred
255932Salfred/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
255932Salfred   the start of an identifier, and 0 if C is not valid in an
255932Salfred   identifier.  We assume C has already gone through the checks of
255932Salfred   _cpp_valid_ucn.  Also update NST for C if returning nonzero.  The
255932Salfred   algorithm is a simple binary search on the table defined in
255932Salfred   ucnid.h.  */
255932Salfred
255932Salfredstatic int
255932Salfreducn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
255932Salfred			 struct normalize_state *nst)
255932Salfred{
255932Salfred  int mn, mx, md;
272407Shselasky
272407Shselasky  if (c > 0xFFFF)
255932Salfred    return 0;
255932Salfred
255932Salfred  mn = 0;
272407Shselasky  mx = ARRAY_SIZE (ucnranges) - 1;
255932Salfred  while (mx != mn)
255932Salfred    {
255932Salfred      md = (mn + mx) / 2;
255932Salfred      if (c <= ucnranges[md].end)
255932Salfred	mx = md;
255932Salfred      else
255932Salfred	mn = md + 1;
255932Salfred    }
255932Salfred
255932Salfred  /* When -pedantic, we require the character to have been listed by
255932Salfred     the standard for the current language.  Otherwise, we accept the
255932Salfred     union of the acceptable sets for C++98 and C99.  */
255932Salfred  if (! (ucnranges[mn].flags & (C99 | CXX)))
255932Salfred      return 0;
255932Salfred
255932Salfred  if (CPP_PEDANTIC (pfile)
255932Salfred      && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
255932Salfred	  || (CPP_OPTION (pfile, cplusplus)
255932Salfred	      && !(ucnranges[mn].flags & CXX))))
255932Salfred    return 0;
255932Salfred
255932Salfred  /* Update NST.  */
255932Salfred  if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
255932Salfred    nst->level = normalized_none;
255932Salfred  else if (ucnranges[mn].flags & CTX)
255932Salfred    {
255932Salfred      bool safe;
255932Salfred      cppchar_t p = nst->previous;
255932Salfred
255932Salfred      /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam.  */
255932Salfred      if (c == 0x09BE)
255932Salfred	safe = p != 0x09C7;  /* Use 09CB instead of 09C7 09BE.  */
255932Salfred      else if (c == 0x0B3E)
255932Salfred	safe = p != 0x0B47;  /* Use 0B4B instead of 0B47 0B3E.  */
255932Salfred      else if (c == 0x0BBE)
255932Salfred	safe = p != 0x0BC6 && p != 0x0BC7;  /* Use 0BCA/0BCB instead.  */
255932Salfred      else if (c == 0x0CC2)
255932Salfred	safe = p != 0x0CC6;  /* Use 0CCA instead of 0CC6 0CC2.  */
255932Salfred      else if (c == 0x0D3E)
255932Salfred	safe = p != 0x0D46 && p != 0x0D47;  /* Use 0D4A/0D4B instead.  */
255932Salfred      /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
255932Salfred	 and are combined algorithmically from a sequence of the form
255932Salfred	 1100-1112 1161-1175 11A8-11C2
255932Salfred	 (if the third is not present, it is treated as 11A7, which is not
255932Salfred	 really a valid character).
255932Salfred	 Unfortunately, C99 allows (only) the NFC form, but C++ allows
255932Salfred	 only the combining characters.  */
255932Salfred      else if (c >= 0x1161 && c <= 0x1175)
255932Salfred	safe = p < 0x1100 || p > 0x1112;
255932Salfred      else if (c >= 0x11A8 && c <= 0x11C2)
255932Salfred	safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
255932Salfred      else
255932Salfred	{
255932Salfred	  /* Uh-oh, someone updated ucnid.h without updating this code.  */
255932Salfred	  cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
255932Salfred	  safe = true;
255932Salfred	}
255932Salfred      if (!safe && c < 0x1161)
255932Salfred	nst->level = normalized_none;
255932Salfred      else if (!safe)
255932Salfred	nst->level = MAX (nst->level, normalized_identifier_C);
255932Salfred    }
255932Salfred  else if (ucnranges[mn].flags & NKC)
255932Salfred    ;
255932Salfred  else if (ucnranges[mn].flags & NFC)
255932Salfred    nst->level = MAX (nst->level, normalized_C);
255932Salfred  else if (ucnranges[mn].flags & CID)
255932Salfred    nst->level = MAX (nst->level, normalized_identifier_C);
255932Salfred  else
255932Salfred    nst->level = normalized_none;
255932Salfred  nst->previous = c;
255932Salfred  nst->prev_class = ucnranges[mn].combine;
255932Salfred
255932Salfred  /* In C99, UCN digits may not begin identifiers.  */
255932Salfred  if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
255932Salfred    return 2;
255932Salfred
255932Salfred  return 1;
255932Salfred}
255932Salfred
255932Salfred/* [lex.charset]: The character designated by the universal character
255932Salfred   name \UNNNNNNNN is that character whose character short name in
255932Salfred   ISO/IEC 10646 is NNNNNNNN; the character designated by the
279731Shselasky   universal character name \uNNNN is that character whose character
279731Shselasky   short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
255932Salfred   for a universal character name is less than 0x20 or in the range
279731Shselasky   0x7F-0x9F (inclusive), or if the universal character name
279731Shselasky   designates a character in the basic source character set, then the
255932Salfred   program is ill-formed.
255932Salfred
255932Salfred   *PSTR must be preceded by "\u" or "\U"; it is assumed that the
255932Salfred   buffer end is delimited by a non-hex digit.  Returns zero if the
255932Salfred   UCN has not been consumed.
255932Salfred
255932Salfred   Otherwise the nonzero value of the UCN, whether valid or invalid,
255932Salfred   is returned.  Diagnostics are emitted for invalid values.  PSTR
255932Salfred   is updated to point one beyond the UCN, or to the syntactically
255932Salfred   invalid character.
255932Salfred
255932Salfred   IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
255932Salfred   an identifier, or 2 otherwise.  */
255932Salfred
255932Salfredcppchar_t
255932Salfred_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
255932Salfred		const uchar *limit, int identifier_pos,
255932Salfred		struct normalize_state *nst)
255932Salfred{
255932Salfred  cppchar_t result, c;
255932Salfred  unsigned int length;
255932Salfred  const uchar *str = *pstr;
255932Salfred  const uchar *base = str - 2;
255932Salfred
255932Salfred  if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
255932Salfred    cpp_error (pfile, CPP_DL_WARNING,
255932Salfred	       "universal character names are only valid in C++ and C99");
255932Salfred  else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
255932Salfred    cpp_error (pfile, CPP_DL_WARNING,
255932Salfred	       "the meaning of '\\%c' is different in traditional C",
255932Salfred	       (int) str[-1]);
255932Salfred
255932Salfred  if (str[-1] == 'u')
255932Salfred    length = 4;
255932Salfred  else if (str[-1] == 'U')
255932Salfred    length = 8;
255932Salfred  else
255932Salfred    {
255932Salfred      cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
255932Salfred      length = 4;
255932Salfred    }
255932Salfred
255932Salfred  result = 0;
255932Salfred  do
255932Salfred    {
255932Salfred      c = *str;
255932Salfred      if (!ISXDIGIT (c))
255932Salfred	break;
255932Salfred      str++;
255932Salfred      result = (result << 4) + hex_value (c);
255932Salfred    }
255932Salfred  while (--length && str < limit);
255932Salfred
255932Salfred  /* Partial UCNs are not valid in strings, but decompose into
255932Salfred     multiple tokens in identifiers, so we can't give a helpful
255932Salfred     error message in that case.  */
255932Salfred  if (length && identifier_pos)
255932Salfred    return 0;
255932Salfred
255932Salfred  *pstr = str;
255932Salfred  if (length)
255932Salfred    {
255932Salfred      cpp_error (pfile, CPP_DL_ERROR,
272407Shselasky		 "incomplete universal character name %.*s",
255932Salfred		 (int) (str - base), base);
255932Salfred      result = 1;
255932Salfred    }
255932Salfred  /* The standard permits $, @ and ` to be specified as UCNs.  We use
255932Salfred     hex escapes so that this also works with EBCDIC hosts.  */
255932Salfred  else if ((result < 0xa0
255932Salfred	    && (result != 0x24 && result != 0x40 && result != 0x60))
255932Salfred	   || (result & 0x80000000)
255932Salfred	   || (result >= 0xD800 && result <= 0xDFFF))
255932Salfred    {
255932Salfred      cpp_error (pfile, CPP_DL_ERROR,
255932Salfred		 "%.*s is not a valid universal character",
255932Salfred		 (int) (str - base), base);
255932Salfred      result = 1;
255932Salfred    }
255932Salfred  else if (identifier_pos && result == 0x24
255932Salfred	   && CPP_OPTION (pfile, dollars_in_ident))
255932Salfred    {
255932Salfred      if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
255932Salfred	{
255932Salfred	  CPP_OPTION (pfile, warn_dollars) = 0;
255932Salfred	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
255932Salfred	}
255932Salfred      NORMALIZE_STATE_UPDATE_IDNUM (nst);
219820Sjeff    }
219820Sjeff  else if (identifier_pos)
219820Sjeff    {
219820Sjeff      int validity = ucn_valid_in_identifier (pfile, result, nst);
219820Sjeff
219820Sjeff      if (validity == 0)
219820Sjeff	cpp_error (pfile, CPP_DL_ERROR,
219820Sjeff		   "universal character %.*s is not valid in an identifier",
255932Salfred		   (int) (str - base), base);
255932Salfred      else if (validity == 2 && identifier_pos == 1)
219820Sjeff	cpp_error (pfile, CPP_DL_ERROR,
219820Sjeff   "universal character %.*s is not valid at the start of an identifier",
219820Sjeff		   (int) (str - base), base);
219820Sjeff    }
219820Sjeff
219820Sjeff  if (result == 0)
219820Sjeff    result = 1;
255932Salfred
255932Salfred  return result;
219820Sjeff}
219820Sjeff
219820Sjeff/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
219820Sjeff   it to the execution character set and write the result into TBUF.
255932Salfred   An advanced pointer is returned.  Issues all relevant diagnostics.  */
255932Salfredstatic const uchar *
219820Sjeffconvert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
255932Salfred	     struct _cpp_strbuf *tbuf, bool wide)
219820Sjeff{
219820Sjeff  cppchar_t ucn;
219820Sjeff  uchar buf[6];
219820Sjeff  uchar *bufp = buf;
219820Sjeff  size_t bytesleft = 6;
219820Sjeff  int rval;
219820Sjeff  struct cset_converter cvt
219820Sjeff    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
219820Sjeff  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
219820Sjeff
219820Sjeff  from++;  /* Skip u/U.  */
255932Salfred  ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
219820Sjeff
219820Sjeff  rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
219820Sjeff  if (rval)
219820Sjeff    {
219820Sjeff      errno = rval;
255932Salfred      cpp_errno (pfile, CPP_DL_ERROR,
219820Sjeff		 "converting UCN to source character set");
219820Sjeff    }
219820Sjeff  else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
219820Sjeff    cpp_errno (pfile, CPP_DL_ERROR,
219820Sjeff	       "converting UCN to execution character set");
219820Sjeff
219820Sjeff  return from;
219820Sjeff}
219820Sjeff
219820Sjeff/* Subroutine of convert_hex and convert_oct.  N is the representation
219820Sjeff   in the execution character set of a numeric escape; write it into the
219820Sjeff   string buffer TBUF and update the end-of-string pointer therein.  WIDE
219820Sjeff   is true if it's a wide string that's being assembled in TBUF.  This
219820Sjeff   function issues no diagnostics and never fails.  */
219820Sjeffstatic void
255932Salfredemit_numeric_escape (cpp_reader *pfile, cppchar_t n,
219820Sjeff		     struct _cpp_strbuf *tbuf, bool wide)
255932Salfred{
219820Sjeff  if (wide)
219820Sjeff    {
219820Sjeff      /* We have to render this into the target byte order, which may not
272407Shselasky	 be our byte order.  */
219820Sjeff      bool bigend = CPP_OPTION (pfile, bytes_big_endian);
272407Shselasky      size_t width = CPP_OPTION (pfile, wchar_precision);
219820Sjeff      size_t cwidth = CPP_OPTION (pfile, char_precision);
255932Salfred      size_t cmask = width_to_mask (cwidth);
219820Sjeff      size_t nbwc = width / cwidth;
219820Sjeff      size_t i;
219820Sjeff      size_t off = tbuf->len;
219820Sjeff      cppchar_t c;
219820Sjeff
255932Salfred      if (tbuf->len + nbwc > tbuf->asize)
219820Sjeff	{
219820Sjeff	  tbuf->asize += OUTBUF_BLOCK_SIZE;
219820Sjeff	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
272407Shselasky	}
255932Salfred
255932Salfred      for (i = 0; i < nbwc; i++)
255932Salfred	{
255932Salfred	  c = n & cmask;
255932Salfred	  n >>= cwidth;
255932Salfred	  tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
255932Salfred	}
255932Salfred      tbuf->len += nbwc;
255932Salfred    }
219820Sjeff  else
219820Sjeff    {
219820Sjeff      /* Note: this code does not handle the case where the target
219820Sjeff	 and host have a different number of bits in a byte.  */
219820Sjeff      if (tbuf->len + 1 > tbuf->asize)
219820Sjeff	{
219820Sjeff	  tbuf->asize += OUTBUF_BLOCK_SIZE;
272407Shselasky	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
219820Sjeff	}
219820Sjeff      tbuf->text[tbuf->len++] = n;
219820Sjeff    }
219820Sjeff}
219820Sjeff
219820Sjeff/* Convert a hexadecimal escape, pointed to by FROM, to the execution
219820Sjeff   character set and write it into the string buffer TBUF.  Returns an
255932Salfred   advanced pointer, and issues diagnostics as necessary.
255932Salfred   No character set translation occurs; this routine always produces the
219820Sjeff   execution-set character with numeric value equal to the given hex
219820Sjeff   number.  You can, e.g. generate surrogate pairs this way.  */
219820Sjeffstatic const uchar *
219820Sjeffconvert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
219820Sjeff	     struct _cpp_strbuf *tbuf, bool wide)
219820Sjeff{
255932Salfred  cppchar_t c, n = 0, overflow = 0;
219820Sjeff  int digits_found = 0;
255932Salfred  size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
255932Salfred		  : CPP_OPTION (pfile, char_precision));
219820Sjeff  size_t mask = width_to_mask (width);
219820Sjeff
219820Sjeff  if (CPP_WTRADITIONAL (pfile))
219820Sjeff    cpp_error (pfile, CPP_DL_WARNING,
219820Sjeff	       "the meaning of '\\x' is different in traditional C");
219820Sjeff
219820Sjeff  from++;  /* Skip 'x'.  */
219820Sjeff  while (from < limit)
255932Salfred    {
255932Salfred      c = *from;
219820Sjeff      if (! hex_p (c))
219820Sjeff	break;
219820Sjeff      from++;
219820Sjeff      overflow |= n ^ (n << 4 >> 4);
279731Shselasky      n = (n << 4) + hex_value (c);
279731Shselasky      digits_found = 1;
219820Sjeff    }
219820Sjeff
219820Sjeff  if (!digits_found)
219820Sjeff    {
272407Shselasky      cpp_error (pfile, CPP_DL_ERROR,
272407Shselasky		 "\\x used with no following hex digits");
272407Shselasky      return from;
272407Shselasky    }
255932Salfred
272407Shselasky  if (overflow | (n != (n & mask)))
272407Shselasky    {
272407Shselasky      cpp_error (pfile, CPP_DL_PEDWARN,
255932Salfred		 "hex escape sequence out of range");
219820Sjeff      n &= mask;
255932Salfred    }
255932Salfred
219820Sjeff  emit_numeric_escape (pfile, n, tbuf, wide);
255932Salfred
255932Salfred  return from;
219820Sjeff}
219820Sjeff
219820Sjeff/* Convert an octal escape, pointed to by FROM, to the execution
219820Sjeff   character set and write it into the string buffer TBUF.  Returns an
219820Sjeff   advanced pointer, and issues diagnostics as necessary.
219820Sjeff   No character set translation occurs; this routine always produces the
219820Sjeff   execution-set character with numeric value equal to the given octal
255932Salfred   number.  */
255932Salfredstatic const uchar *
255932Salfredconvert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
255932Salfred	     struct _cpp_strbuf *tbuf, bool wide)
219820Sjeff{
255932Salfred  size_t count = 0;
255932Salfred  cppchar_t c, n = 0;
255932Salfred  size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
255932Salfred		  : CPP_OPTION (pfile, char_precision));
255932Salfred  size_t mask = width_to_mask (width);
219820Sjeff  bool overflow = false;
219820Sjeff
219820Sjeff  while (from < limit && count++ < 3)
255932Salfred    {
255932Salfred      c = *from;
255932Salfred      if (c < '0' || c > '7')
219820Sjeff	break;
219820Sjeff      from++;
219820Sjeff      overflow |= n ^ (n << 3 >> 3);
219820Sjeff      n = (n << 3) + c - '0';
255932Salfred    }
219820Sjeff
219820Sjeff  if (n != (n & mask))
219820Sjeff    {
219820Sjeff      cpp_error (pfile, CPP_DL_PEDWARN,
219820Sjeff		 "octal escape sequence out of range");
255932Salfred      n &= mask;
219820Sjeff    }
219820Sjeff
219820Sjeff  emit_numeric_escape (pfile, n, tbuf, wide);
219820Sjeff
219820Sjeff  return from;
219820Sjeff}
219820Sjeff
219820Sjeff/* Convert an escape sequence (pointed to by FROM) to its value on
219820Sjeff   the target, and to the execution character set.  Do not scan past
272407Shselasky   LIMIT.  Write the converted value into TBUF.  Returns an advanced
219820Sjeff   pointer.  Handles all relevant diagnostics.  */
219820Sjeffstatic const uchar *
219820Sjeffconvert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
219820Sjeff		struct _cpp_strbuf *tbuf, bool wide)
255932Salfred{
219820Sjeff  /* Values of \a \b \e \f \n \r \t \v respectively.  */
219820Sjeff#if HOST_CHARSET == HOST_CHARSET_ASCII
219820Sjeff  static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
219820Sjeff#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
219820Sjeff  static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
255932Salfred#else
219820Sjeff#error "unknown host character set"
219820Sjeff#endif
219820Sjeff
219820Sjeff  uchar c;
219820Sjeff  struct cset_converter cvt
219820Sjeff    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
219820Sjeff
219820Sjeff  c = *from;
272407Shselasky  switch (c)
219820Sjeff    {
219820Sjeff      /* UCNs, hex escapes, and octal escapes are processed separately.  */
219820Sjeff    case 'u': case 'U':
219820Sjeff      return convert_ucn (pfile, from, limit, tbuf, wide);
219820Sjeff
219820Sjeff    case 'x':
219820Sjeff      return convert_hex (pfile, from, limit, tbuf, wide);
219820Sjeff      break;
255932Salfred
255932Salfred    case '0':  case '1':  case '2':  case '3':
255932Salfred    case '4':  case '5':  case '6':  case '7':
255932Salfred      return convert_oct (pfile, from, limit, tbuf, wide);
255932Salfred
255932Salfred      /* Various letter escapes.  Get the appropriate host-charset
255932Salfred	 value into C.  */
255932Salfred    case '\\': case '\'': case '"': case '?': break;
255932Salfred
255932Salfred    case '(': case '{': case '[': case '%':
255932Salfred      /* '\(', etc, can be used at the beginning of a line in a long
255932Salfred	 string split onto multiple lines with \-newline, to prevent
255932Salfred	 Emacs or other text editors from getting confused.  '\%' can
255932Salfred	 be used to prevent SCCS from mangling printf format strings.  */
255932Salfred      if (CPP_PEDANTIC (pfile))
255932Salfred	goto unknown;
255932Salfred      break;
255932Salfred
255932Salfred    case 'b': c = charconsts[1];  break;
255932Salfred    case 'f': c = charconsts[3];  break;
272407Shselasky    case 'n': c = charconsts[4];  break;
255932Salfred    case 'r': c = charconsts[5];  break;
255932Salfred    case 't': c = charconsts[6];  break;
255932Salfred    case 'v': c = charconsts[7];  break;
255932Salfred
255932Salfred    case 'a':
255932Salfred      if (CPP_WTRADITIONAL (pfile))
255932Salfred	cpp_error (pfile, CPP_DL_WARNING,
255932Salfred		   "the meaning of '\\a' is different in traditional C");
255932Salfred      c = charconsts[0];
272407Shselasky      break;
272407Shselasky
272407Shselasky    case 'e': case 'E':
272407Shselasky      if (CPP_PEDANTIC (pfile))
255932Salfred	cpp_error (pfile, CPP_DL_PEDWARN,
255932Salfred		   "non-ISO-standard escape sequence, '\\%c'", (int) c);
255932Salfred      c = charconsts[2];
255932Salfred      break;
255932Salfred
255932Salfred    default:
255932Salfred    unknown:
255932Salfred      if (ISGRAPH (c))
255932Salfred	cpp_error (pfile, CPP_DL_PEDWARN,
255932Salfred		   "unknown escape sequence '\\%c'", (int) c);
255932Salfred      else
255932Salfred	{
255932Salfred	  /* diagnostic.c does not support "%03o".  When it does, this
255932Salfred	     code can use %03o directly in the diagnostic again.  */
255932Salfred	  char buf[32];
255932Salfred	  sprintf(buf, "%03o", (int) c);
255932Salfred	  cpp_error (pfile, CPP_DL_PEDWARN,
255932Salfred		     "unknown escape sequence: '\\%s'", buf);
255932Salfred	}
255932Salfred    }
255932Salfred
255932Salfred  /* Now convert what we have to the execution character set.  */
255932Salfred  if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
255932Salfred    cpp_errno (pfile, CPP_DL_ERROR,
255932Salfred	       "converting escape sequence to execution character set");
255932Salfred
255932Salfred  return from + 1;
255932Salfred}
255932Salfred
255932Salfred/* FROM is an array of cpp_string structures of length COUNT.  These
255932Salfred   are to be converted from the source to the execution character set,
255932Salfred   escape sequences translated, and finally all are to be
255932Salfred   concatenated.  WIDE indicates whether or not to produce a wide
255932Salfred   string.  The result is written into TO.  Returns true for success,
272407Shselasky   false for failure.  */
255932Salfredbool
272407Shselaskycpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
272407Shselasky		      cpp_string *to, bool wide)
272407Shselasky{
272407Shselasky  struct _cpp_strbuf tbuf;
272407Shselasky  const uchar *p, *base, *limit;
272407Shselasky  size_t i;
272407Shselasky  struct cset_converter cvt
272407Shselasky    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
272407Shselasky
272407Shselasky  tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
272407Shselasky  tbuf.text = XNEWVEC (uchar, tbuf.asize);
272407Shselasky  tbuf.len = 0;
272407Shselasky
272407Shselasky  for (i = 0; i < count; i++)
272407Shselasky    {
272407Shselasky      p = from[i].text;
272407Shselasky      if (*p == 'L') p++;
272407Shselasky      p++; /* Skip leading quote.  */
272407Shselasky      limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */
272407Shselasky
272407Shselasky      for (;;)
272407Shselasky	{
272407Shselasky	  base = p;
272407Shselasky	  while (p < limit && *p != '\\')
272407Shselasky	    p++;
272407Shselasky	  if (p > base)
272407Shselasky	    {
255932Salfred	      /* We have a run of normal characters; these can be fed
255932Salfred		 directly to convert_cset.  */
255932Salfred	      if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
255932Salfred		goto fail;
255932Salfred	    }
255932Salfred	  if (p == limit)
255932Salfred	    break;
255932Salfred
255932Salfred	  p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
272407Shselasky	}
272407Shselasky    }
272407Shselasky  /* NUL-terminate the 'to' buffer and translate it to a cpp_string
255932Salfred     structure.  */
255932Salfred  emit_numeric_escape (pfile, 0, &tbuf, wide);
255932Salfred  tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
255932Salfred  to->text = tbuf.text;
255932Salfred  to->len = tbuf.len;
255932Salfred  return true;
255932Salfred
272407Shselasky fail:
255932Salfred  cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
255932Salfred  free (tbuf.text);
255932Salfred  return false;
255932Salfred}
255932Salfred
255932Salfred/* Subroutine of do_line and do_linemarker.  Convert escape sequences
255932Salfred   in a string, but do not perform character set conversion.  */
255932Salfredbool
255932Salfredcpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
255932Salfred				  size_t count,	cpp_string *to, bool wide)
255932Salfred{
255932Salfred  struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
255932Salfred  bool retval;
255932Salfred
219820Sjeff  pfile->narrow_cset_desc.func = convert_no_conversion;
219820Sjeff  pfile->narrow_cset_desc.cd = (iconv_t) -1;
255932Salfred
255932Salfred  retval = cpp_interpret_string (pfile, from, count, to, wide);
255932Salfred
255932Salfred  pfile->narrow_cset_desc = save_narrow_cset_desc;
255932Salfred  return retval;
255932Salfred}
255932Salfred
255932Salfred
255932Salfred/* Subroutine of cpp_interpret_charconst which performs the conversion
255932Salfred   to a number, for narrow strings.  STR is the string structure returned
255932Salfred   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
255932Salfred   cpp_interpret_charconst.  */
255932Salfredstatic cppchar_t
255932Salfrednarrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
255932Salfred			 unsigned int *pchars_seen, int *unsignedp)
255932Salfred{
255932Salfred  size_t width = CPP_OPTION (pfile, char_precision);
255932Salfred  size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
255932Salfred  size_t mask = width_to_mask (width);
255932Salfred  size_t i;
255932Salfred  cppchar_t result, c;
255932Salfred  bool unsigned_p;
255932Salfred
255932Salfred  /* The value of a multi-character character constant, or a
255932Salfred     single-character character constant whose representation in the
255932Salfred     execution character set is more than one byte long, is
255932Salfred     implementation defined.  This implementation defines it to be the
255932Salfred     number formed by interpreting the byte sequence in memory as a
255932Salfred     big-endian binary number.  If overflow occurs, the high bytes are
255932Salfred     lost, and a warning is issued.
255932Salfred
255932Salfred     We don't want to process the NUL terminator handed back by
255932Salfred     cpp_interpret_string.  */
255932Salfred  result = 0;
255932Salfred  for (i = 0; i < str.len - 1; i++)
255932Salfred    {
255932Salfred      c = str.text[i] & mask;
255932Salfred      if (width < BITS_PER_CPPCHAR_T)
255932Salfred	result = (result << width) | c;
255932Salfred      else
255932Salfred	result = c;
255932Salfred    }
255932Salfred
255932Salfred  if (i > max_chars)
255932Salfred    {
255932Salfred      i = max_chars;
255932Salfred      cpp_error (pfile, CPP_DL_WARNING,
255932Salfred		 "character constant too long for its type");
255932Salfred    }
255932Salfred  else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
255932Salfred    cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
255932Salfred
255932Salfred  /* Multichar constants are of type int and therefore signed.  */
255932Salfred  if (i > 1)
255932Salfred    unsigned_p = 0;
255932Salfred  else
255932Salfred    unsigned_p = CPP_OPTION (pfile, unsigned_char);
255932Salfred
255932Salfred  /* Truncate the constant to its natural width, and simultaneously
255932Salfred     sign- or zero-extend to the full width of cppchar_t.
255932Salfred     For single-character constants, the value is WIDTH bits wide.
255932Salfred     For multi-character constants, the value is INT_PRECISION bits wide.  */
255932Salfred  if (i > 1)
255932Salfred    width = CPP_OPTION (pfile, int_precision);
255932Salfred  if (width < BITS_PER_CPPCHAR_T)
255932Salfred    {
255932Salfred      mask = ((cppchar_t) 1 << width) - 1;
255932Salfred      if (unsigned_p || !(result & (1 << (width - 1))))
255932Salfred	result &= mask;
255932Salfred      else
255932Salfred	result |= ~mask;
255932Salfred    }
255932Salfred  *pchars_seen = i;
255932Salfred  *unsignedp = unsigned_p;
255932Salfred  return result;
255932Salfred}
255932Salfred
255932Salfred/* Subroutine of cpp_interpret_charconst which performs the conversion
255932Salfred   to a number, for wide strings.  STR is the string structure returned
255932Salfred   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
255932Salfred   cpp_interpret_charconst.  */
255932Salfredstatic cppchar_t
255932Salfredwide_str_to_charconst (cpp_reader *pfile, cpp_string str,
255932Salfred		       unsigned int *pchars_seen, int *unsignedp)
255932Salfred{
255932Salfred  bool bigend = CPP_OPTION (pfile, bytes_big_endian);
255932Salfred  size_t width = CPP_OPTION (pfile, wchar_precision);
255932Salfred  size_t cwidth = CPP_OPTION (pfile, char_precision);
255932Salfred  size_t mask = width_to_mask (width);
255932Salfred  size_t cmask = width_to_mask (cwidth);
255932Salfred  size_t nbwc = width / cwidth;
255932Salfred  size_t off, i;
255932Salfred  cppchar_t result = 0, c;
255932Salfred
255932Salfred  /* This is finicky because the string is in the target's byte order,
255932Salfred     which may not be our byte order.  Only the last character, ignoring
255932Salfred     the NUL terminator, is relevant.  */
272407Shselasky  off = str.len - (nbwc * 2);
272407Shselasky  result = 0;
255932Salfred  for (i = 0; i < nbwc; i++)
255932Salfred    {
255932Salfred      c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
255932Salfred      result = (result << cwidth) | (c & cmask);
255932Salfred    }
255932Salfred
255932Salfred  /* Wide character constants have type wchar_t, and a single
255932Salfred     character exactly fills a wchar_t, so a multi-character wide
255932Salfred     character constant is guaranteed to overflow.  */
255932Salfred  if (off > 0)
255932Salfred    cpp_error (pfile, CPP_DL_WARNING,
255932Salfred	       "character constant too long for its type");
255932Salfred
255932Salfred  /* Truncate the constant to its natural width, and simultaneously
255932Salfred     sign- or zero-extend to the full width of cppchar_t.  */
255932Salfred  if (width < BITS_PER_CPPCHAR_T)
255932Salfred    {
255932Salfred      if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
255932Salfred	result &= mask;
255932Salfred      else
255932Salfred	result |= ~mask;
255932Salfred    }
255932Salfred
255932Salfred  *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
255932Salfred  *pchars_seen = 1;
255932Salfred  return result;
255932Salfred}
255932Salfred
255932Salfred/* Interpret a (possibly wide) character constant in TOKEN.
255932Salfred   PCHARS_SEEN points to a variable that is filled in with the number
255932Salfred   of characters seen, and UNSIGNEDP to a variable that indicates
255932Salfred   whether the result has signed type.  */
255932Salfredcppchar_t
255932Salfredcpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
255932Salfred			 unsigned int *pchars_seen, int *unsignedp)
255932Salfred{
255932Salfred  cpp_string str = { 0, 0 };
255932Salfred  bool wide = (token->type == CPP_WCHAR);
255932Salfred  cppchar_t result;
255932Salfred
255932Salfred  /* an empty constant will appear as L'' or '' */
255932Salfred  if (token->val.str.len == (size_t) (2 + wide))
255932Salfred    {
255932Salfred      cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
255932Salfred      return 0;
255932Salfred    }
255932Salfred  else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
255932Salfred    return 0;
255932Salfred
255932Salfred  if (wide)
255932Salfred    result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
255932Salfred  else
219820Sjeff    result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
219820Sjeff
219820Sjeff  if (str.text != token->val.str.text)
219820Sjeff    free ((void *)str.text);
219820Sjeff
255932Salfred  return result;
255932Salfred}
255932Salfred
255932Salfred/* Convert an identifier denoted by ID and LEN, which might contain
219820Sjeff   UCN escapes, to the source character set, either UTF-8 or
219820Sjeff   UTF-EBCDIC.  Assumes that the identifier is actually a valid identifier.  */
219820Sjeffcpp_hashnode *
219820Sjeff_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
219820Sjeff{
219820Sjeff  /* It turns out that a UCN escape always turns into fewer characters
219820Sjeff     than the escape itself, so we can allocate a temporary in advance.  */
219820Sjeff  uchar * buf = (uchar *) alloca (len + 1);
219820Sjeff  uchar * bufp = buf;
219820Sjeff  size_t idp;
219820Sjeff
219820Sjeff  for (idp = 0; idp < len; idp++)
255932Salfred    if (id[idp] != '\\')
255932Salfred      *bufp++ = id[idp];
255932Salfred    else
219820Sjeff      {
	unsigned length = id[idp+1] == 'u' ? 4 : 8;
	cppchar_t value = 0;
	size_t bufleft = len - (bufp - buf);
	int rval;

	idp += 2;
	while (length && idp < len && ISXDIGIT (id[idp]))
	  {
	    value = (value << 4) + hex_value (id[idp]);
	    idp++;
	    length--;
	  }
	idp--;

	/* Special case for EBCDIC: if the identifier contains
	   a '$' specified using a UCN, translate it to EBCDIC.  */
	if (value == 0x24)
	  {
	    *bufp++ = '$';
	    continue;
	  }

	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
	if (rval)
	  {
	    errno = rval;
	    cpp_errno (pfile, CPP_DL_ERROR,
		       "converting UCN to source character set");
	    break;
	  }
      }

  return CPP_HASHNODE (ht_lookup (pfile->hash_table,
				  buf, bufp - buf, HT_ALLOC));
}

/* Convert an input buffer (containing the complete contents of one
   source file) from INPUT_CHARSET to the source character set.  INPUT
   points to the input buffer, SIZE is its allocated size, and LEN is
   the length of the meaningful data within the buffer.  The
   translated buffer is returned, and *ST_SIZE is set to the length of
   the meaningful data within the translated buffer.

   One terminator byte is stored just past the meaningful data: '\r'
   if the data ends in '\r', otherwise '\n'.  It is not counted in
   *ST_SIZE.

   A conversion failure is reported through cpp_error, naming
   INPUT_CHARSET; whatever was converted before the failure is still
   returned.

   INPUT is expected to have been allocated with xmalloc.  This function
   will either return INPUT, or free it and return a pointer to another
   xmalloc-allocated block of memory.  */
uchar *
_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
		    uchar *input, size_t size, size_t len, off_t *st_size)
{
  struct cset_converter input_cset;
  struct _cpp_strbuf to;

  input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
  if (input_cset.func == convert_no_conversion)
    {
      /* Nothing to translate: keep working in the caller's buffer.  */
      to.text = input;
      to.asize = size;
      to.len = len;
    }
  else
    {
      to.asize = MAX (65536, len);
      to.text = XNEWVEC (uchar, to.asize);
      to.len = 0;

      /* Name the charset this call was actually asked to convert
	 from, i.e. the one handed to init_iconv_desc above.  */
      if (!APPLY_CONVERSION (input_cset, input, len, &to))
	cpp_error (pfile, CPP_DL_ERROR,
		   "failure to convert %s to %s",
		   input_charset, SOURCE_CHARSET);

      free (input);
    }

  /* Clean up the mess.  */
  if (input_cset.func == convert_using_iconv)
    iconv_close (input_cset.cd);

  /* Resize buffer if we allocated substantially too much, or if we
     haven't enough space for the \n-terminator.  */
  if (to.len + 4096 < to.asize || to.len >= to.asize)
    to.text = XRESIZEVEC (uchar, to.text, to.len + 1);

  /* If the file is using old-school Mac line endings (\r only),
     terminate with another \r, not an \n, so that we do not mistake
     the \r\n sequence for a single DOS line ending and erroneously
     issue the "No newline at end of file" diagnostic.  */
  /* APPLE LOCAL don't access to.text[-1] radar 6121572 */
  if (to.len > 0 && to.text[to.len - 1] == '\r')
    to.text[to.len] = '\r';
  else
    to.text[to.len] = '\n';

  *st_size = to.len;
  return to.text;
}

/* Pick the encoding to assume for input files when none has been
   specified.  With locale detection compiled out (see below) this
   always yields SOURCE_CHARSET.  */
const char *
_cpp_default_encoding (void)
{
  const char *locale_codeset = NULL;

  /* Asking the locale for its codeset is deliberately switched off
     (note the "&& 0" below).  The default codeset is 7-bit ASCII on
     most platforms, and this causes conversion failures on every
     file in GCC that happens to have one of the upper 128 characters
     in it -- most likely, as part of the name of a contributor.
     We should definitely recognize in-band markers of file encoding,
     like:
     - the appropriate Unicode byte-order mark (FE FF) to recognize
       UTF16 and UCS4 (in both big-endian and little-endian flavors)
       and UTF8
     - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
       distinguish ASCII and EBCDIC.
     - after that we could parse something like
       "#pragma GCC encoding <xyz>" on the first line, or even
       Emacs/VIM's mode line tags (there's a problem here in that VIM
       uses the last line, and Emacs has its more elaborate "local
       variables" convention).
     - investigate whether Java has another common convention, which
       would be friendly to support.
     (Zack Weinberg and Paolo Bonzini, May 20th 2004)  */
#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
  setlocale (LC_CTYPE, "");
  locale_codeset = nl_langinfo (CODESET);
#endif

  /* Use the locale's answer only if there is a non-empty one.  */
  if (locale_codeset != NULL && *locale_codeset != '\0')
    return locale_codeset;

  return SOURCE_CHARSET;
}