gcclibs/libcpp/charset.c

169695Skan/* CPP Library - charsets
169695Skan   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
169695Skan   Free Software Foundation, Inc.
169695Skan
169695Skan   Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
169695Skan
169695SkanThis program is free software; you can redistribute it and/or modify it
169695Skanunder the terms of the GNU General Public License as published by the
169695SkanFree Software Foundation; either version 2, or (at your option) any
169695Skanlater version.
169695Skan
169695SkanThis program is distributed in the hope that it will be useful,
169695Skanbut WITHOUT ANY WARRANTY; without even the implied warranty of
169695SkanMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
169695SkanGNU General Public License for more details.
169695Skan
169695SkanYou should have received a copy of the GNU General Public License
169695Skanalong with this program; if not, write to the Free Software
169695SkanFoundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
169695Skan
169695Skan#include "config.h"
169695Skan#include "system.h"
169695Skan#include "cpplib.h"
169695Skan#include "internal.h"
169695Skan
169695Skan/* Character set handling for C-family languages.
169695Skan
169695Skan   Terminological note: In what follows, "charset" or "character set"
169695Skan   will be taken to mean both an abstract set of characters and an
169695Skan   encoding for that set.
169695Skan
169695Skan   The C99 standard discusses two character sets: source and execution.
169695Skan   The source character set is used for internal processing in translation
169695Skan   phases 1 through 4; the execution character set is used thereafter.
169695Skan   Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
169695Skan   character encodings (see 3.7.2, 3.7.3 for the standardese meanings
169695Skan   of these terms).  Furthermore, the "basic character set" (listed in
169695Skan   5.2.1p3) is to be encoded in each with values one byte wide, and is
169695Skan   to appear in the initial shift state.
169695Skan
169695Skan   It is not explicitly mentioned, but there is also a "wide execution
169695Skan   character set" used to encode wide character constants and wide
169695Skan   string literals; this is supposed to be the result of applying the
169695Skan   standard library function mbstowcs() to an equivalent narrow string
169695Skan   (6.4.5p5).  However, the behavior of hexadecimal and octal
169695Skan   \-escapes is at odds with this; they are supposed to be translated
169695Skan   directly to wchar_t values (6.4.4.4p5,6).
169695Skan
169695Skan   The source character set is not necessarily the character set used
169695Skan   to encode physical source files on disk; translation phase 1 converts
169695Skan   from whatever that encoding is to the source character set.
169695Skan
169695Skan   The presence of universal character names in C99 (6.4.3 et seq.)
169695Skan   forces the source character set to be isomorphic to ISO 10646,
169695Skan   that is, Unicode.  There is no such constraint on the execution
169695Skan   character set; note also that the conversion from source to
169695Skan   execution character set does not occur for identifiers (5.1.1.2p1#5).
169695Skan
169695Skan   For convenience of implementation, the source character set's
169695Skan   encoding of the basic character set should be identical to the
169695Skan   execution character set OF THE HOST SYSTEM's encoding of the basic
169695Skan   character set, and it should not be a state-dependent encoding.
169695Skan
169695Skan   cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
169695Skan   depending on whether the host is based on ASCII or EBCDIC (see
169695Skan   respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
169695Skan   Technical Report #16).  With limited exceptions, it relies on the
169695Skan   system library's iconv() primitive to do charset conversion
169695Skan   (specified in SUSv2).  */
169695Skan
/* When the host has no iconv, provide always-failing stand-ins.  The
   calls to iconv(), iconv_open() and iconv_close() further down are
   guarded only by ordinary if statements on a compile-time constant,
   so they must still compile and must not produce link errors.  */
#if !HAVE_ICONV
# define iconv_open(tocode, fromcode) (errno = EINVAL, (iconv_t) -1)
# define iconv(cd, inbuf, inleft, outbuf, outleft) (errno = EINVAL, (size_t) -1)
# define iconv_close(cd) (void) 0
# define ICONV_CONST
#endif

/* Choose the source character set name, and the largest value that
   could be a member of the basic source character set, from the host
   character set family.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
# define SOURCE_CHARSET "UTF-8"
# define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
# define SOURCE_CHARSET "UTF-EBCDIC"
# define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
# error "Unrecognized basic host character set"
#endif

/* Hosts whose <errno.h> lacks EILSEQ report ill-formed input as
   EINVAL instead.  */
#if !defined (EILSEQ)
# define EILSEQ EINVAL
#endif
169695Skan
/* This structure is used for a resizable string buffer throughout.
   The conversion routines in this file append at TEXT + LEN and treat
   ASIZE - LEN as the space still available.  */
/* Don't call it strbuf, as that conflicts with unistd.h on systems
   such as DYNIX/ptx where unistd.h includes stropts.h.  */
struct _cpp_strbuf
{
  /* Start of the buffer; reallocated with XRESIZEVEC when it grows.  */
  uchar *text;
  /* Number of bytes currently allocated for TEXT.  */
  size_t asize;
  /* Number of bytes of TEXT currently in use.  */
  size_t len;
};

/* Number of bytes by which the converters in this file grow a
   _cpp_strbuf each time a conversion step reports E2BIG.

   The stated rationale is that this is enough to hold any string that
   fits on a single 80-column line, even if iconv quadruples its size
   (e.g. conversion from ASCII to UTF-32) rounded up to a power of two.
   NOTE(review): 80 * 4 is 320, which is more than 256; this looks
   harmless because the buffer is grown again whenever E2BIG recurs,
   but the rationale should be confirmed or reworded.  */
#define OUTBUF_BLOCK_SIZE 256
169695Skan
169695Skan/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
169695Skan   logic.  This is because a depressing number of systems lack iconv,
   or have iconv libraries that do not do these conversions, so
169695Skan   we need a fallback implementation for them.  To ensure the fallback
169695Skan   doesn't break due to neglect, it is used on all systems.
169695Skan
169695Skan   UTF-32 encoding is nice and simple: a four-byte binary number,
169695Skan   constrained to the range 00000000-7FFFFFFF to avoid questions of
169695Skan   signedness.  We do have to cope with big- and little-endian
169695Skan   variants.
169695Skan
169695Skan   UTF-16 encoding uses two-byte binary numbers, again in big- and
169695Skan   little-endian variants, for all values in the 00000000-0000FFFF
169695Skan   range.  Values in the 00010000-0010FFFF range are encoded as pairs
169695Skan   of two-byte numbers, called "surrogate pairs": given a number S in
169695Skan   this range, it is mapped to a pair (H, L) as follows:
169695Skan
169695Skan     H = (S - 0x10000) / 0x400 + 0xD800
169695Skan     L = (S - 0x10000) % 0x400 + 0xDC00
169695Skan
169695Skan   Two-byte values in the D800...DFFF range are ill-formed except as a
169695Skan   component of a surrogate pair.  Even if the encoding within a
169695Skan   two-byte value is little-endian, the H member of the surrogate pair
169695Skan   comes first.
169695Skan
169695Skan   There is no way to encode values in the 00110000-7FFFFFFF range,
169695Skan   which is not currently a problem as there are no assigned code
169695Skan   points in that range; however, the author expects that it will
169695Skan   eventually become necessary to abandon UTF-16 due to this
169695Skan   limitation.  Note also that, because of these pairs, UTF-16 does
169695Skan   not meet the requirements of the C standard for a wide character
169695Skan   encoding (see 3.7.3 and 6.4.4.4p11).
169695Skan
169695Skan   UTF-8 encoding looks like this:
169695Skan
169695Skan   value range	       encoded as
169695Skan   00000000-0000007F   0xxxxxxx
169695Skan   00000080-000007FF   110xxxxx 10xxxxxx
169695Skan   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
169695Skan   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
169695Skan   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
169695Skan   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
169695Skan
169695Skan   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
169695Skan   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
169695Skan   never occur.  Note also that any value that can be encoded by a
169695Skan   given row of the table can also be encoded by all successive rows,
169695Skan   but this is not done; only the shortest possible encoding for any
169695Skan   given value is valid.  For instance, the character 07C0 could be
169695Skan   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
169695Skan   FC 80 80 80 9F 80.  Only the first is valid.
169695Skan
169695Skan   An implementation note: the transformation from UTF-16 to UTF-8, or
169695Skan   vice versa, is easiest done by using UTF-32 as an intermediary.  */
169695Skan
169695Skan/* Internal primitives which go from an UTF-8 byte stream to native-endian
169695Skan   UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
169695Skan   operation in several places below.  */
169695Skanstatic inline int
169695Skanone_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
169695Skan		     cppchar_t *cp)
169695Skan{
169695Skan  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
169695Skan  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
169695Skan
169695Skan  cppchar_t c;
169695Skan  const uchar *inbuf = *inbufp;
169695Skan  size_t nbytes, i;
169695Skan
169695Skan  if (*inbytesleftp < 1)
169695Skan    return EINVAL;
169695Skan
169695Skan  c = *inbuf;
169695Skan  if (c < 0x80)
169695Skan    {
169695Skan      *cp = c;
169695Skan      *inbytesleftp -= 1;
169695Skan      *inbufp += 1;
169695Skan      return 0;
169695Skan    }
169695Skan
169695Skan  /* The number of leading 1-bits in the first byte indicates how many
169695Skan     bytes follow.  */
169695Skan  for (nbytes = 2; nbytes < 7; nbytes++)
169695Skan    if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
169695Skan      goto found;
169695Skan  return EILSEQ;
169695Skan found:
169695Skan
169695Skan  if (*inbytesleftp < nbytes)
169695Skan    return EINVAL;
169695Skan
169695Skan  c = (c & masks[nbytes-1]);
169695Skan  inbuf++;
169695Skan  for (i = 1; i < nbytes; i++)
169695Skan    {
169695Skan      cppchar_t n = *inbuf++;
169695Skan      if ((n & 0xC0) != 0x80)
169695Skan	return EILSEQ;
169695Skan      c = ((c << 6) + (n & 0x3F));
169695Skan    }
169695Skan
169695Skan  /* Make sure the shortest possible encoding was used.  */
169695Skan  if (c <=      0x7F && nbytes > 1) return EILSEQ;
169695Skan  if (c <=     0x7FF && nbytes > 2) return EILSEQ;
169695Skan  if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
169695Skan  if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
169695Skan  if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
169695Skan
169695Skan  /* Make sure the character is valid.  */
169695Skan  if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
169695Skan
169695Skan  *cp = c;
169695Skan  *inbufp = inbuf;
169695Skan  *inbytesleftp -= nbytes;
169695Skan  return 0;
169695Skan}
169695Skan
169695Skanstatic inline int
169695Skanone_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
169695Skan{
169695Skan  static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
169695Skan  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
169695Skan  size_t nbytes;
169695Skan  uchar buf[6], *p = &buf[6];
169695Skan  uchar *outbuf = *outbufp;
169695Skan
169695Skan  nbytes = 1;
169695Skan  if (c < 0x80)
169695Skan    *--p = c;
169695Skan  else
169695Skan    {
169695Skan      do
169695Skan	{
169695Skan	  *--p = ((c & 0x3F) | 0x80);
169695Skan	  c >>= 6;
169695Skan	  nbytes++;
169695Skan	}
169695Skan      while (c >= 0x3F || (c & limits[nbytes-1]));
169695Skan      *--p = (c | masks[nbytes-1]);
169695Skan    }
169695Skan
169695Skan  if (*outbytesleftp < nbytes)
169695Skan    return E2BIG;
169695Skan
169695Skan  while (p < &buf[6])
169695Skan    *outbuf++ = *p++;
169695Skan  *outbytesleftp -= nbytes;
169695Skan  *outbufp = outbuf;
169695Skan  return 0;
169695Skan}
169695Skan
169695Skan/* The following four functions transform one character between the two
169695Skan   encodings named in the function name.  All have the signature
169695Skan   int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
169695Skan           uchar **outbufp, size_t *outbytesleftp)
169695Skan
169695Skan   BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
169695Skan   interpreted as a boolean indicating whether big-endian or
169695Skan   little-endian encoding is to be used for the member of the pair
169695Skan   that is not UTF-8.
169695Skan
169695Skan   INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
169695Skan   do for iconv.
169695Skan
169695Skan   The return value is either 0 for success, or an errno value for
169695Skan   failure, which may be E2BIG (need more space), EILSEQ (ill-formed
   input sequence), or EINVAL (incomplete input sequence).  */
169695Skan
169695Skanstatic inline int
169695Skanone_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
169695Skan		   uchar **outbufp, size_t *outbytesleftp)
169695Skan{
169695Skan  uchar *outbuf;
169695Skan  cppchar_t s = 0;
169695Skan  int rval;
169695Skan
169695Skan  /* Check for space first, since we know exactly how much we need.  */
169695Skan  if (*outbytesleftp < 4)
169695Skan    return E2BIG;
169695Skan
169695Skan  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
169695Skan  if (rval)
169695Skan    return rval;
169695Skan
169695Skan  outbuf = *outbufp;
169695Skan  outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
169695Skan  outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
169695Skan  outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
169695Skan  outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
169695Skan
169695Skan  *outbufp += 4;
169695Skan  *outbytesleftp -= 4;
169695Skan  return 0;
169695Skan}
169695Skan
169695Skanstatic inline int
169695Skanone_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
169695Skan		   uchar **outbufp, size_t *outbytesleftp)
169695Skan{
169695Skan  cppchar_t s;
169695Skan  int rval;
169695Skan  const uchar *inbuf;
169695Skan
169695Skan  if (*inbytesleftp < 4)
169695Skan    return EINVAL;
169695Skan
169695Skan  inbuf = *inbufp;
169695Skan
169695Skan  s  = inbuf[bigend ? 0 : 3] << 24;
169695Skan  s += inbuf[bigend ? 1 : 2] << 16;
169695Skan  s += inbuf[bigend ? 2 : 1] << 8;
169695Skan  s += inbuf[bigend ? 3 : 0];
169695Skan
169695Skan  if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
169695Skan    return EILSEQ;
169695Skan
169695Skan  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
169695Skan  if (rval)
169695Skan    return rval;
169695Skan
169695Skan  *inbufp += 4;
169695Skan  *inbytesleftp -= 4;
169695Skan  return 0;
169695Skan}
169695Skan
169695Skanstatic inline int
169695Skanone_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
169695Skan		   uchar **outbufp, size_t *outbytesleftp)
169695Skan{
169695Skan  int rval;
169695Skan  cppchar_t s = 0;
169695Skan  const uchar *save_inbuf = *inbufp;
169695Skan  size_t save_inbytesleft = *inbytesleftp;
169695Skan  uchar *outbuf = *outbufp;
169695Skan
169695Skan  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
169695Skan  if (rval)
169695Skan    return rval;
169695Skan
169695Skan  if (s > 0x0010FFFF)
169695Skan    {
169695Skan      *inbufp = save_inbuf;
169695Skan      *inbytesleftp = save_inbytesleft;
169695Skan      return EILSEQ;
169695Skan    }
169695Skan
169695Skan  if (s < 0xFFFF)
169695Skan    {
169695Skan      if (*outbytesleftp < 2)
169695Skan	{
169695Skan	  *inbufp = save_inbuf;
169695Skan	  *inbytesleftp = save_inbytesleft;
169695Skan	  return E2BIG;
169695Skan	}
169695Skan      outbuf[bigend ? 1 : 0] = (s & 0x00FF);
169695Skan      outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
169695Skan
169695Skan      *outbufp += 2;
169695Skan      *outbytesleftp -= 2;
169695Skan      return 0;
169695Skan    }
169695Skan  else
169695Skan    {
169695Skan      cppchar_t hi, lo;
169695Skan
169695Skan      if (*outbytesleftp < 4)
169695Skan	{
169695Skan	  *inbufp = save_inbuf;
169695Skan	  *inbytesleftp = save_inbytesleft;
169695Skan	  return E2BIG;
169695Skan	}
169695Skan
169695Skan      hi = (s - 0x10000) / 0x400 + 0xD800;
169695Skan      lo = (s - 0x10000) % 0x400 + 0xDC00;
169695Skan
169695Skan      /* Even if we are little-endian, put the high surrogate first.
169695Skan	 ??? Matches practice?  */
169695Skan      outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
169695Skan      outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
169695Skan      outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
169695Skan      outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
169695Skan
169695Skan      *outbufp += 4;
169695Skan      *outbytesleftp -= 4;
169695Skan      return 0;
169695Skan    }
169695Skan}
169695Skan
169695Skanstatic inline int
169695Skanone_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
169695Skan		   uchar **outbufp, size_t *outbytesleftp)
169695Skan{
169695Skan  cppchar_t s;
169695Skan  const uchar *inbuf = *inbufp;
169695Skan  int rval;
169695Skan
169695Skan  if (*inbytesleftp < 2)
169695Skan    return EINVAL;
169695Skan  s  = inbuf[bigend ? 0 : 1] << 8;
169695Skan  s += inbuf[bigend ? 1 : 0];
169695Skan
169695Skan  /* Low surrogate without immediately preceding high surrogate is invalid.  */
169695Skan  if (s >= 0xDC00 && s <= 0xDFFF)
169695Skan    return EILSEQ;
169695Skan  /* High surrogate must have a following low surrogate.  */
169695Skan  else if (s >= 0xD800 && s <= 0xDBFF)
169695Skan    {
169695Skan      cppchar_t hi = s, lo;
169695Skan      if (*inbytesleftp < 4)
169695Skan	return EINVAL;
169695Skan
169695Skan      lo  = inbuf[bigend ? 2 : 3] << 8;
169695Skan      lo += inbuf[bigend ? 3 : 2];
169695Skan
169695Skan      if (lo < 0xDC00 || lo > 0xDFFF)
169695Skan	return EILSEQ;
169695Skan
169695Skan      s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
169695Skan    }
169695Skan
169695Skan  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
169695Skan  if (rval)
169695Skan    return rval;
169695Skan
169695Skan  /* Success - update the input pointers (one_cppchar_to_utf8 has done
169695Skan     the output pointers for us).  */
169695Skan  if (s <= 0xFFFF)
169695Skan    {
169695Skan      *inbufp += 2;
169695Skan      *inbytesleftp -= 2;
169695Skan    }
169695Skan  else
169695Skan    {
169695Skan      *inbufp += 4;
169695Skan      *inbytesleftp -= 4;
169695Skan    }
169695Skan  return 0;
169695Skan}
169695Skan
169695Skan/* Helper routine for the next few functions.  The 'const' on
169695Skan   one_conversion means that we promise not to modify what function is
169695Skan   pointed to, which lets the inliner see through it.  */
169695Skan
169695Skanstatic inline bool
169695Skanconversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
169695Skan					     uchar **, size_t *),
169695Skan		 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
169695Skan{
169695Skan  const uchar *inbuf;
169695Skan  uchar *outbuf;
169695Skan  size_t inbytesleft, outbytesleft;
169695Skan  int rval;
169695Skan
169695Skan  inbuf = from;
169695Skan  inbytesleft = flen;
169695Skan  outbuf = to->text + to->len;
169695Skan  outbytesleft = to->asize - to->len;
169695Skan
169695Skan  for (;;)
169695Skan    {
169695Skan      do
169695Skan	rval = one_conversion (cd, &inbuf, &inbytesleft,
169695Skan			       &outbuf, &outbytesleft);
169695Skan      while (inbytesleft && !rval);
169695Skan
169695Skan      if (__builtin_expect (inbytesleft == 0, 1))
169695Skan	{
169695Skan	  to->len = to->asize - outbytesleft;
169695Skan	  return true;
169695Skan	}
169695Skan      if (rval != E2BIG)
169695Skan	{
169695Skan	  errno = rval;
169695Skan	  return false;
169695Skan	}
169695Skan
169695Skan      outbytesleft += OUTBUF_BLOCK_SIZE;
169695Skan      to->asize += OUTBUF_BLOCK_SIZE;
169695Skan      to->text = XRESIZEVEC (uchar, to->text, to->asize);
169695Skan      outbuf = to->text + to->asize - outbytesleft;
169695Skan    }
169695Skan}
169695Skan
169695Skan
169695Skan/* These functions convert entire strings between character sets.
169695Skan   They all have the signature
169695Skan
169695Skan   bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
169695Skan
169695Skan   The input string FROM is converted as specified by the function
169695Skan   name plus the iconv descriptor CD (which may be fake), and the
169695Skan   result appended to TO.  On any error, false is returned, otherwise true.  */
169695Skan
169695Skan/* These four use the custom conversion code above.  */
169695Skanstatic bool
169695Skanconvert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
169695Skan		    struct _cpp_strbuf *to)
169695Skan{
169695Skan  return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
169695Skan}
169695Skan
169695Skanstatic bool
169695Skanconvert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
169695Skan		    struct _cpp_strbuf *to)
169695Skan{
169695Skan  return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
169695Skan}
169695Skan
169695Skanstatic bool
169695Skanconvert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
169695Skan		    struct _cpp_strbuf *to)
169695Skan{
169695Skan  return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
169695Skan}
169695Skan
169695Skanstatic bool
169695Skanconvert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
169695Skan		    struct _cpp_strbuf *to)
169695Skan{
169695Skan  return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
169695Skan}
169695Skan
169695Skan/* Identity conversion, used when we have no alternative.  */
169695Skanstatic bool
169695Skanconvert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
169695Skan		       const uchar *from, size_t flen, struct _cpp_strbuf *to)
169695Skan{
169695Skan  if (to->len + flen > to->asize)
169695Skan    {
169695Skan      to->asize = to->len + flen;
169695Skan      to->text = XRESIZEVEC (uchar, to->text, to->asize);
169695Skan    }
169695Skan  memcpy (to->text + to->len, from, flen);
169695Skan  to->len += flen;
169695Skan  return true;
169695Skan}
169695Skan
169695Skan/* And this one uses the system iconv primitive.  It's a little
169695Skan   different, since iconv's interface is a little different.  */
169695Skan#if HAVE_ICONV
169695Skanstatic bool
169695Skanconvert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
169695Skan		     struct _cpp_strbuf *to)
169695Skan{
169695Skan  ICONV_CONST char *inbuf;
169695Skan  char *outbuf;
169695Skan  size_t inbytesleft, outbytesleft;
169695Skan
169695Skan  /* Reset conversion descriptor and check that it is valid.  */
169695Skan  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
169695Skan    return false;
169695Skan
169695Skan  inbuf = (ICONV_CONST char *)from;
169695Skan  inbytesleft = flen;
169695Skan  outbuf = (char *)to->text + to->len;
169695Skan  outbytesleft = to->asize - to->len;
169695Skan
169695Skan  for (;;)
169695Skan    {
169695Skan      iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
169695Skan      if (__builtin_expect (inbytesleft == 0, 1))
169695Skan	{
169695Skan	  to->len = to->asize - outbytesleft;
169695Skan	  return true;
169695Skan	}
169695Skan      if (errno != E2BIG)
169695Skan	return false;
169695Skan
169695Skan      outbytesleft += OUTBUF_BLOCK_SIZE;
169695Skan      to->asize += OUTBUF_BLOCK_SIZE;
169695Skan      to->text = XRESIZEVEC (uchar, to->text, to->asize);
169695Skan      outbuf = (char *)to->text + to->asize - outbytesleft;
169695Skan    }
169695Skan}
169695Skan#else
169695Skan#define convert_using_iconv 0 /* prevent undefined symbol error below */
169695Skan#endif
169695Skan
169695Skan/* Arrange for the above custom conversion logic to be used automatically
169695Skan   when conversion between a suitable pair of character sets is requested.  */
169695Skan
/* Run CONVERTER's conversion function on the FLEN bytes at FROM,
   appending to TO, and yield that function's result.  Every argument
   and the whole expansion are parenthesized so that arbitrary
   expressions (for instance *ptr) can be passed.  CONVERTER is
   evaluated twice, so it must be free of side effects.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   ((CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO)))
169695Skan
169695Skanstruct conversion
169695Skan{
169695Skan  const char *pair;
169695Skan  convert_f func;
169695Skan  iconv_t fake_cd;
169695Skan};
169695Skanstatic const struct conversion conversion_tab[] = {
169695Skan  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
169695Skan  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
169695Skan  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
169695Skan  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
169695Skan  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
169695Skan  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
169695Skan  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
169695Skan  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
169695Skan};
169695Skan
169695Skan/* Subroutine of cpp_init_iconv: initialize and return a
169695Skan   cset_converter structure for conversion from FROM to TO.  If
169695Skan   iconv_open() fails, issue an error and return an identity
169695Skan   converter.  Silently return an identity converter if FROM and TO
169695Skan   are identical.  */
169695Skanstatic struct cset_converter
169695Skaninit_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
169695Skan{
169695Skan  struct cset_converter ret;
169695Skan  char *pair;
169695Skan  size_t i;
169695Skan
169695Skan  if (!strcasecmp (to, from))
169695Skan    {
169695Skan      ret.func = convert_no_conversion;
169695Skan      ret.cd = (iconv_t) -1;
169695Skan      return ret;
169695Skan    }
169695Skan
169695Skan  pair = (char *) alloca(strlen(to) + strlen(from) + 2);
169695Skan
169695Skan  strcpy(pair, from);
169695Skan  strcat(pair, "/");
169695Skan  strcat(pair, to);
169695Skan  for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
169695Skan    if (!strcasecmp (pair, conversion_tab[i].pair))
169695Skan      {
169695Skan	ret.func = conversion_tab[i].func;
169695Skan	ret.cd = conversion_tab[i].fake_cd;
169695Skan	return ret;
169695Skan      }
169695Skan
169695Skan  /* No custom converter - try iconv.  */
169695Skan  if (HAVE_ICONV)
169695Skan    {
169695Skan      ret.func = convert_using_iconv;
169695Skan      ret.cd = iconv_open (to, from);
169695Skan
169695Skan      if (ret.cd == (iconv_t) -1)
169695Skan	{
169695Skan	  if (errno == EINVAL)
169695Skan	    cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
169695Skan		       "conversion from %s to %s not supported by iconv",
169695Skan		       from, to);
169695Skan	  else
169695Skan	    cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
169695Skan
169695Skan	  ret.func = convert_no_conversion;
169695Skan	}
169695Skan    }
169695Skan  else
169695Skan    {
169695Skan      cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
169695Skan		 "no iconv implementation, cannot convert from %s to %s",
169695Skan		 from, to);
169695Skan      ret.func = convert_no_conversion;
169695Skan      ret.cd = (iconv_t) -1;
169695Skan    }
169695Skan  return ret;
169695Skan}
169695Skan
169695Skan/* If charset conversion is requested, initialize iconv(3) descriptors
169695Skan   for conversion from the source character set to the execution
169695Skan   character sets.  If iconv is not present in the C library, and
169695Skan   conversion is requested, issue an error.  */
169695Skan
169695Skanvoid
169695Skancpp_init_iconv (cpp_reader *pfile)
169695Skan{
169695Skan  const char *ncset = CPP_OPTION (pfile, narrow_charset);
169695Skan  const char *wcset = CPP_OPTION (pfile, wide_charset);
169695Skan  const char *default_wcset;
169695Skan
169695Skan  bool be = CPP_OPTION (pfile, bytes_big_endian);
169695Skan
169695Skan  if (CPP_OPTION (pfile, wchar_precision) >= 32)
169695Skan    default_wcset = be ? "UTF-32BE" : "UTF-32LE";
169695Skan  else if (CPP_OPTION (pfile, wchar_precision) >= 16)
169695Skan    default_wcset = be ? "UTF-16BE" : "UTF-16LE";
169695Skan  else
169695Skan    /* This effectively means that wide strings are not supported,
169695Skan       so don't do any conversion at all.  */
169695Skan   default_wcset = SOURCE_CHARSET;
169695Skan
169695Skan  if (!ncset)
169695Skan    ncset = SOURCE_CHARSET;
169695Skan  if (!wcset)
169695Skan    wcset = default_wcset;
169695Skan
169695Skan  pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
169695Skan  pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
169695Skan}
169695Skan
169695Skan/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary.  */
169695Skanvoid
169695Skan_cpp_destroy_iconv (cpp_reader *pfile)
169695Skan{
169695Skan  if (HAVE_ICONV)
169695Skan    {
169695Skan      if (pfile->narrow_cset_desc.func == convert_using_iconv)
169695Skan	iconv_close (pfile->narrow_cset_desc.cd);
169695Skan      if (pfile->wide_cset_desc.func == convert_using_iconv)
169695Skan	iconv_close (pfile->wide_cset_desc.cd);
169695Skan    }
169695Skan}
169695Skan
169695Skan/* Utility routine for use by a full compiler.  C is a character taken
169695Skan   from the *basic* source character set, encoded in the host's
169695Skan   execution encoding.  Convert it to (the target's) execution
169695Skan   encoding, and return that value.
169695Skan
169695Skan   Issues an internal error if C's representation in the narrow
169695Skan   execution character set fails to be a single-byte value (C99
169695Skan   5.2.1p3: "The representation of each member of the source and
169695Skan   execution character sets shall fit in a byte.")  May also issue an
169695Skan   internal error if C fails to be a member of the basic source
169695Skan   character set (testing this exactly is too hard, especially when
169695Skan   the host character set is EBCDIC).  */
169695Skancppchar_t
169695Skancpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
169695Skan{
169695Skan  uchar sbuf[1];
169695Skan  struct _cpp_strbuf tbuf;
169695Skan
169695Skan  /* This test is merely an approximation, but it suffices to catch
169695Skan     the most important thing, which is that we don't get handed a
169695Skan     character outside the unibyte range of the host character set.  */
169695Skan  if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
169695Skan    {
169695Skan      cpp_error (pfile, CPP_DL_ICE,
169695Skan		 "character 0x%lx is not in the basic source character set\n",
169695Skan		 (unsigned long)c);
169695Skan      return 0;
169695Skan    }
169695Skan
169695Skan  /* Being a character in the unibyte range of the host character set,
169695Skan     we can safely splat it into a one-byte buffer and trust that that
169695Skan     is a well-formed string.  */
169695Skan  sbuf[0] = c;
169695Skan
169695Skan  /* This should never need to reallocate, but just in case... */
169695Skan  tbuf.asize = 1;
169695Skan  tbuf.text = XNEWVEC (uchar, tbuf.asize);
169695Skan  tbuf.len = 0;
169695Skan
169695Skan  if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
169695Skan    {
169695Skan      cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
169695Skan      return 0;
169695Skan    }
169695Skan  if (tbuf.len != 1)
169695Skan    {
169695Skan      cpp_error (pfile, CPP_DL_ICE,
169695Skan		 "character 0x%lx is not unibyte in execution character set",
169695Skan		 (unsigned long)c);
169695Skan      return 0;
169695Skan    }
169695Skan  c = tbuf.text[0];
169695Skan  free(tbuf.text);
169695Skan  return c;
169695Skan}
169695Skan
169695Skan
169695Skan
169695Skan/* Return a value whose low WIDTH bits are ones and whose remaining
169695Skan   bits are zeros.  WIDTH is first clamped to BITS_PER_CPPCHAR_T.  When
169695Skan   the clamped width covers every bit of a size_t, the all-ones value
169695Skan   is returned directly instead of shifting by the full type width.  */
169695Skanstatic inline size_t
169695Skanwidth_to_mask (size_t width)
169695Skan{
169695Skan  const size_t nbits = MIN (width, BITS_PER_CPPCHAR_T);
169695Skan
169695Skan  if (nbits < CHAR_BIT * sizeof (size_t))
169695Skan    return ((size_t) 1 << nbits) - 1;
169695Skan
169695Skan  return ~(size_t) 0;
169695Skan}
169695Skan
169695Skan/* A large table of unicode character information.  The enumeration
169695Skan   gives the flag bits stored in each table entry; every value is a
169695Skan   distinct bit, so several flags may be combined in one entry.  */
169695Skanenum {
169695Skan  /* Valid in a C99 identifier?  */
169695Skan  C99 = 1,
169695Skan  /* Valid in a C99 identifier, but not as the first character?  */
169695Skan  DIG = 2,
169695Skan  /* Valid in a C++ identifier?  */
169695Skan  CXX = 4,
169695Skan  /* NFC representation is not valid in an identifier?  */
169695Skan  CID = 8,
169695Skan  /* Might be valid NFC form?  */
169695Skan  NFC = 16,
169695Skan  /* Might be valid NFKC form?  */
169695Skan  NKC = 32,
169695Skan  /* Certain preceding characters might make it not valid NFC/NFKC form?  */
169695Skan  CTX = 64
169695Skan};
169695Skan
169695Skanstatic const struct {
169695Skan  /* Bitmap of flags above.  */
169695Skan  unsigned char flags;
169695Skan  /* Combining class of the character.  */
169695Skan  unsigned char combine;
169695Skan  /* Last character in the range described by this entry.  This is the
169695Skan     field that ucn_valid_in_identifier compares against when it
169695Skan     searches the table.  */
169695Skan  unsigned short end;
169695Skan} ucnranges[] = {
169695Skan#include "ucnid.h"
169695Skan};
169695Skan
169695Skan/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
169695Skan   the start of an identifier, and 0 if C is not valid in an
169695Skan   identifier.  We assume C has already gone through the checks of
169695Skan   _cpp_valid_ucn.  Also update NST for C if returning nonzero; NST is
169695Skan   not modified when 0 is returned.
169695Skan
169695Skan   Characters above 0xFFFF are always rejected.  For the rest, the
169695Skan   algorithm is a simple binary search on the table defined in
169695Skan   ucnid.h: each entry records the last character of its range, so
169695Skan   (presuming the entries are sorted by END) the search settles on the
169695Skan   first entry whose END is not less than C.  The value 2 is returned
169695Skan   only when the c99 option is set and that entry has the DIG flag.  */
169695Skan
169695Skanstatic int
169695Skanucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
169695Skan			 struct normalize_state *nst)
169695Skan{
169695Skan  int mn, mx, md;
169695Skan
169695Skan  if (c > 0xFFFF)
169695Skan    return 0;
169695Skan
169695Skan  mn = 0;
169695Skan  mx = ARRAY_SIZE (ucnranges) - 1;
169695Skan  while (mx != mn)
169695Skan    {
169695Skan      md = (mn + mx) / 2;
169695Skan      if (c <= ucnranges[md].end)
169695Skan	mx = md;
169695Skan      else
169695Skan	mn = md + 1;
169695Skan    }
169695Skan
169695Skan  /* When -pedantic, we require the character to have been listed by
169695Skan     the standard for the current language.  Otherwise, we accept the
169695Skan     union of the acceptable sets for C++98 and C99.  A character
169695Skan     listed by neither standard is rejected in either mode.  */
169695Skan  if (! (ucnranges[mn].flags & (C99 | CXX)))
169695Skan      return 0;
169695Skan
169695Skan  if (CPP_PEDANTIC (pfile)
169695Skan      && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
169695Skan	  || (CPP_OPTION (pfile, cplusplus)
169695Skan	      && !(ucnranges[mn].flags & CXX))))
169695Skan    return 0;
169695Skan
169695Skan  /* Update NST.  A nonzero combining class that is lower than the
169695Skan     class of the previous character sets the level to
169695Skan     normalized_none.  Otherwise the entry's flags decide, tested in
169695Skan     the order CTX, NKC, NFC, CID; CTX characters additionally need a
169695Skan     look at the previous character.  */
169695Skan  if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
169695Skan    nst->level = normalized_none;
169695Skan  else if (ucnranges[mn].flags & CTX)
169695Skan    {
169695Skan      bool safe;
169695Skan      cppchar_t p = nst->previous;
169695Skan
169695Skan      /* Easy cases from Bengali, Oriya, Tamil, Kannada, and Malayalam.  */
169695Skan      if (c == 0x09BE)
169695Skan	safe = p != 0x09C7;  /* Use 09CB instead of 09C7 09BE.  */
169695Skan      else if (c == 0x0B3E)
169695Skan	safe = p != 0x0B47;  /* Use 0B4B instead of 0B47 0B3E.  */
169695Skan      else if (c == 0x0BBE)
169695Skan	safe = p != 0x0BC6 && p != 0x0BC7;  /* Use 0BCA/0BCB instead.  */
169695Skan      else if (c == 0x0CC2)
169695Skan	safe = p != 0x0CC6;  /* Use 0CCA instead of 0CC6 0CC2.  */
169695Skan      else if (c == 0x0D3E)
169695Skan	safe = p != 0x0D46 && p != 0x0D47;  /* Use 0D4A/0D4B instead.  */
169695Skan      /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
169695Skan	 and are combined algorithmically from a sequence of the form
169695Skan	 1100-1112 1161-1175 11A8-11C2
169695Skan	 (if the third is not present, it is treated as 11A7, which is not
169695Skan	 really a valid character).
169695Skan	 Unfortunately, C99 allows (only) the NFC form, but C++ allows
169695Skan	 only the combining characters.  */
169695Skan      else if (c >= 0x1161 && c <= 0x1175)
169695Skan	safe = p < 0x1100 || p > 0x1112;
169695Skan      else if (c >= 0x11A8 && c <= 0x11C2)
169695Skan	safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
169695Skan      else
169695Skan	{
169695Skan	  /* Uh-oh, someone updated ucnid.h without updating this code.  */
169695Skan	  cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
169695Skan	  safe = true;
169695Skan	}
169695Skan      if (!safe && c < 0x1161)  /* One of the Indic cases above.  */
169695Skan	nst->level = normalized_none;
169695Skan      else if (!safe)  /* One of the Hangul cases above.  */
169695Skan	nst->level = MAX (nst->level, normalized_identifier_C);
169695Skan    }
169695Skan  else if (ucnranges[mn].flags & NKC)
169695Skan    ;  /* Nothing to do: the level is left as it is.  */
169695Skan  else if (ucnranges[mn].flags & NFC)
169695Skan    nst->level = MAX (nst->level, normalized_C);
169695Skan  else if (ucnranges[mn].flags & CID)
169695Skan    nst->level = MAX (nst->level, normalized_identifier_C);
169695Skan  else
169695Skan    nst->level = normalized_none;
169695Skan  nst->previous = c;
169695Skan  nst->prev_class = ucnranges[mn].combine;
169695Skan
169695Skan  /* In C99, UCN digits may not begin identifiers.  */
169695Skan  if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
169695Skan    return 2;
169695Skan
169695Skan  return 1;
169695Skan}
169695Skan
169695Skan/* [lex.charset]: The character designated by the universal character
169695Skan   name \UNNNNNNNN is that character whose character short name in
169695Skan   ISO/IEC 10646 is NNNNNNNN; the character designated by the
169695Skan   universal character name \uNNNN is that character whose character
169695Skan   short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
169695Skan   for a universal character name is less than 0x20 or in the range
169695Skan   0x7F-0x9F (inclusive), or if the universal character name
169695Skan   designates a character in the basic source character set, then the
169695Skan   program is ill-formed.
169695Skan
169695Skan   *PSTR must be preceded by "\u" or "\U"; it is assumed that the
169695Skan   buffer end is delimited by a non-hex digit.  Returns zero if the
169695Skan   UCN has not been consumed; this happens only for an incomplete UCN
169695Skan   inside an identifier, in which case *PSTR is left unchanged.
169695Skan
169695Skan   Otherwise a nonzero value is returned: the value of the UCN, or 1
169695Skan   when the UCN is incomplete or is not a valid universal character.
169695Skan   Diagnostics are emitted for invalid values.  PSTR is updated to
169695Skan   point one beyond the UCN, or to the syntactically invalid
169695Skan   character.
169695Skan
169695Skan   IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
169695Skan   an identifier, or 2 otherwise.  When it is nonzero, NST is updated
169695Skan   for the character.  */
169695Skan
169695Skancppchar_t
169695Skan_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
169695Skan		const uchar *limit, int identifier_pos,
169695Skan		struct normalize_state *nst)
169695Skan{
169695Skan  cppchar_t result, c;
169695Skan  unsigned int length;
169695Skan  const uchar *str = *pstr;
169695Skan  const uchar *base = str - 2;  /* The backslash, for diagnostics.  */
169695Skan
169695Skan  if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
169695Skan    cpp_error (pfile, CPP_DL_WARNING,
169695Skan	       "universal character names are only valid in C++ and C99");
169695Skan  else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
169695Skan    cpp_error (pfile, CPP_DL_WARNING,
169695Skan	       "the meaning of '\\%c' is different in traditional C",
169695Skan	       (int) str[-1]);
169695Skan
169695Skan  if (str[-1] == 'u')
169695Skan    length = 4;
169695Skan  else if (str[-1] == 'U')
169695Skan    length = 8;
169695Skan  else
169695Skan    {
169695Skan      cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
169695Skan      length = 4;
169695Skan    }
169695Skan
169695Skan  result = 0;
169695Skan  do
169695Skan    {
169695Skan      c = *str;
169695Skan      if (!ISXDIGIT (c))
169695Skan	break;
169695Skan      str++;
169695Skan      result = (result << 4) + hex_value (c);
169695Skan    }
169695Skan  while (--length && str < limit);
169695Skan
169695Skan  /* Partial UCNs are not valid in strings, but decompose into
169695Skan     multiple tokens in identifiers, so we can't give a helpful
169695Skan     error message in that case.  LENGTH is still nonzero here if
169695Skan     fewer hex digits were found than the UCN requires.  */
169695Skan  if (length && identifier_pos)
169695Skan    return 0;
169695Skan
169695Skan  *pstr = str;
169695Skan  if (length)
169695Skan    {
169695Skan      cpp_error (pfile, CPP_DL_ERROR,
169695Skan		 "incomplete universal character name %.*s",
169695Skan		 (int) (str - base), base);
169695Skan      result = 1;
169695Skan    }
169695Skan  /* The standard permits $, @ and ` to be specified as UCNs.  We use
169695Skan     hex escapes so that this also works with EBCDIC hosts.  Values
169695Skan     with the top bit set and the range D800-DFFF are rejected too.  */
169695Skan  else if ((result < 0xa0
169695Skan	    && (result != 0x24 && result != 0x40 && result != 0x60))
169695Skan	   || (result & 0x80000000)
169695Skan	   || (result >= 0xD800 && result <= 0xDFFF))
169695Skan    {
169695Skan      cpp_error (pfile, CPP_DL_ERROR,
169695Skan		 "%.*s is not a valid universal character",
169695Skan		 (int) (str - base), base);
169695Skan      result = 1;
169695Skan    }
169695Skan  else if (identifier_pos && result == 0x24
169695Skan	   && CPP_OPTION (pfile, dollars_in_ident))
169695Skan    {
169695Skan      if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
169695Skan	{
169695Skan	  CPP_OPTION (pfile, warn_dollars) = 0;  /* Warn only once.  */
169695Skan	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
169695Skan	}
169695Skan      NORMALIZE_STATE_UPDATE_IDNUM (nst);
169695Skan    }
169695Skan  else if (identifier_pos)
169695Skan    {
169695Skan      int validity = ucn_valid_in_identifier (pfile, result, nst);
169695Skan
169695Skan      if (validity == 0)
169695Skan	cpp_error (pfile, CPP_DL_ERROR,
169695Skan		   "universal character %.*s is not valid in an identifier",
169695Skan		   (int) (str - base), base);
169695Skan      else if (validity == 2 && identifier_pos == 1)
169695Skan	cpp_error (pfile, CPP_DL_ERROR,
169695Skan   "universal character %.*s is not valid at the start of an identifier",
169695Skan		   (int) (str - base), base);
169695Skan    }
169695Skan
169695Skan  if (result == 0)
169695Skan    result = 1;  /* Zero is reserved for "UCN not consumed".  */
169695Skan
169695Skan  return result;
169695Skan}
169695Skan
169695Skan/* FROM points at the 'u' or 'U' of a universal character name.  Parse
169695Skan   the UCN (scanning no further than LIMIT), encode its value as UTF-8,
169695Skan   translate that to the narrow or wide execution character set
169695Skan   according to WIDE, and append the result to TBUF.  Every relevant
169695Skan   diagnostic is issued here.  The return value points just past the
169695Skan   text that was consumed.  */
169695Skanstatic const uchar *
169695Skanconvert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
169695Skan	     struct _cpp_strbuf *tbuf, bool wide)
169695Skan{
169695Skan  uchar utf8[6];
169695Skan  uchar *out = utf8;
169695Skan  size_t room = 6;
169695Skan  struct normalize_state norm = INITIAL_NORMALIZE_STATE;
169695Skan  struct cset_converter conv;
169695Skan  const uchar *cursor = from + 1;  /* Step over the u/U.  */
169695Skan  cppchar_t value;
169695Skan  int err;
169695Skan
169695Skan  if (wide)
169695Skan    conv = pfile->wide_cset_desc;
169695Skan  else
169695Skan    conv = pfile->narrow_cset_desc;
169695Skan
169695Skan  value = _cpp_valid_ucn (pfile, &cursor, limit, 0, &norm);
169695Skan
169695Skan  err = one_cppchar_to_utf8 (value, &out, &room);
169695Skan  if (err != 0)
169695Skan    {
169695Skan      errno = err;
169695Skan      cpp_errno (pfile, CPP_DL_ERROR,
169695Skan		 "converting UCN to source character set");
169695Skan      return cursor;
169695Skan    }
169695Skan
169695Skan  /* 6 - ROOM is the number of UTF-8 bytes that were produced.  */
169695Skan  if (!APPLY_CONVERSION (conv, utf8, 6 - room, tbuf))
169695Skan    cpp_errno (pfile, CPP_DL_ERROR,
169695Skan	       "converting UCN to execution character set");
169695Skan
169695Skan  return cursor;
169695Skan}
169695Skan
169695Skan/* Helper shared by convert_hex and convert_oct.  N is the value of a
169695Skan   numeric escape as it should appear in the execution character set;
169695Skan   append it to the string buffer TBUF, enlarging the buffer by
169695Skan   OUTBUF_BLOCK_SIZE when there is not enough room, and advance
169695Skan   TBUF->len past it.  WIDE is true when TBUF holds a wide string, in
169695Skan   which case N is laid out as wchar_precision / char_precision units
169695Skan   in the target's byte order.  No diagnostics are issued and the
169695Skan   function never fails.  */
169695Skanstatic void
169695Skanemit_numeric_escape (cpp_reader *pfile, cppchar_t n,
169695Skan		     struct _cpp_strbuf *tbuf, bool wide)
169695Skan{
169695Skan  if (!wide)
169695Skan    {
169695Skan      /* Note: this code does not handle the case where the target
169695Skan	 and host have a different number of bits in a byte.  */
169695Skan      if (tbuf->asize < tbuf->len + 1)
169695Skan	{
169695Skan	  tbuf->asize += OUTBUF_BLOCK_SIZE;
169695Skan	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
169695Skan	}
169695Skan      tbuf->text[tbuf->len] = n;
169695Skan      tbuf->len++;
169695Skan    }
169695Skan  else
169695Skan    {
169695Skan      /* We have to render this into the target byte order, which may not
169695Skan	 be our byte order.  */
169695Skan      const bool big_endian = CPP_OPTION (pfile, bytes_big_endian);
169695Skan      const size_t unit_bits = CPP_OPTION (pfile, char_precision);
169695Skan      const size_t unit_mask = width_to_mask (unit_bits);
169695Skan      const size_t nunits = CPP_OPTION (pfile, wchar_precision) / unit_bits;
169695Skan      const size_t start = tbuf->len;
169695Skan      size_t k;
169695Skan
169695Skan      if (tbuf->asize < start + nunits)
169695Skan	{
169695Skan	  tbuf->asize += OUTBUF_BLOCK_SIZE;
169695Skan	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
169695Skan	}
169695Skan
169695Skan      /* Peel units off the low end of N; the least significant unit
169695Skan	 goes last on a big-endian target and first otherwise.  */
169695Skan      for (k = 0; k < nunits; k++)
169695Skan	{
169695Skan	  size_t slot = big_endian ? nunits - k - 1 : k;
169695Skan
169695Skan	  tbuf->text[start + slot] = n & unit_mask;
169695Skan	  n >>= unit_bits;
169695Skan	}
169695Skan      tbuf->len = start + nunits;
169695Skan    }
169695Skan}
169695Skan
169695Skan/* FROM points at the 'x' of a hexadecimal escape.  Read the hex
169695Skan   digits that follow (never scanning past LIMIT) and append the
169695Skan   execution-set character having exactly that numeric value to the
169695Skan   string buffer TBUF; WIDE selects the wide character width.  No
169695Skan   character set translation is performed, so this can be used, e.g.,
169695Skan   to generate surrogate pairs.  An escape with no digits is an error
169695Skan   and emits nothing; a value that does not fit draws a pedwarn and is
169695Skan   truncated.  Returns a pointer just past the consumed text.  */
169695Skanstatic const uchar *
169695Skanconvert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
169695Skan	     struct _cpp_strbuf *tbuf, bool wide)
169695Skan{
169695Skan  cppchar_t digit, value = 0, lost = 0;
169695Skan  bool any_digits = false;
169695Skan  size_t nbits, value_mask;
169695Skan
169695Skan  if (wide)
169695Skan    nbits = CPP_OPTION (pfile, wchar_precision);
169695Skan  else
169695Skan    nbits = CPP_OPTION (pfile, char_precision);
169695Skan  value_mask = width_to_mask (nbits);
169695Skan
169695Skan  if (CPP_WTRADITIONAL (pfile))
169695Skan    cpp_error (pfile, CPP_DL_WARNING,
169695Skan	       "the meaning of '\\x' is different in traditional C");
169695Skan
169695Skan  /* Step over the 'x', then gather hex digits up to LIMIT.  */
169695Skan  for (from++; from < limit; from++)
169695Skan    {
169695Skan      digit = *from;
169695Skan      if (! hex_p (digit))
169695Skan	break;
169695Skan      /* Remember any bits about to be shifted out of the top.  */
169695Skan      lost |= value ^ (value << 4 >> 4);
169695Skan      value = (value << 4) + hex_value (digit);
169695Skan      any_digits = true;
169695Skan    }
169695Skan
169695Skan  if (!any_digits)
169695Skan    {
169695Skan      cpp_error (pfile, CPP_DL_ERROR,
169695Skan		 "\\x used with no following hex digits");
169695Skan      return from;
169695Skan    }
169695Skan
169695Skan  if (lost != 0 || value != (value & value_mask))
169695Skan    {
169695Skan      cpp_error (pfile, CPP_DL_PEDWARN,
169695Skan		 "hex escape sequence out of range");
169695Skan      value &= value_mask;
169695Skan    }
169695Skan
169695Skan  emit_numeric_escape (pfile, value, tbuf, wide);
169695Skan  return from;
169695Skan}
169695Skan
169695Skan/* Convert an octal escape, pointed to by FROM, to the execution
169695Skan   character set and write it into the string buffer TBUF.  At most
169695Skan   three octal digits are consumed, and scanning never passes LIMIT.
169695Skan   Returns an advanced pointer, and issues diagnostics as necessary:
169695Skan   a value that does not fit in the target character width (the wide
169695Skan   width when WIDE is true) draws a pedwarn and is truncated.
169695Skan   No character set translation occurs; this routine always produces the
169695Skan   execution-set character with numeric value equal to the given octal
169695Skan   number.  */
169695Skanstatic const uchar *
169695Skanconvert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
169695Skan	     struct _cpp_strbuf *tbuf, bool wide)
169695Skan{
169695Skan  size_t count = 0;
169695Skan  cppchar_t c, n = 0;
169695Skan  size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
169695Skan		  : CPP_OPTION (pfile, char_precision));
169695Skan  size_t mask = width_to_mask (width);
169695Skan
169695Skan  /* No separate overflow flag is kept: three octal digits contribute
169695Skan     at most nine bits, so the comparison against MASK after the loop
169695Skan     is the only range check that is needed.  */
169695Skan  while (from < limit && count++ < 3)
169695Skan    {
169695Skan      c = *from;
169695Skan      if (c < '0' || c > '7')
169695Skan	break;
169695Skan      from++;
169695Skan      n = (n << 3) + c - '0';
169695Skan    }
169695Skan
169695Skan  if (n != (n & mask))
169695Skan    {
169695Skan      cpp_error (pfile, CPP_DL_PEDWARN,
169695Skan		 "octal escape sequence out of range");
169695Skan      n &= mask;
169695Skan    }
169695Skan
169695Skan  emit_numeric_escape (pfile, n, tbuf, wide);
169695Skan
169695Skan  return from;
169695Skan}
169695Skan
169695Skan/* Convert an escape sequence (pointed to by FROM) to its value on
169695Skan   the target, and to the execution character set.  FROM points at
169695Skan   the character following the backslash.  Do not scan past LIMIT.
169695Skan   Write the converted value into TBUF; WIDE selects the wide
169695Skan   execution character set.  Returns an advanced pointer.  Handles
169695Skan   all relevant diagnostics.
169695Skan
169695Skan   UCNs, hex escapes and octal escapes are handed to convert_ucn,
169695Skan   convert_hex and convert_oct.  For every other escape a single
169695Skan   host-charset character is chosen and then converted; an unknown
169695Skan   escape draws a pedwarn and converts the character itself.  */
169695Skanstatic const uchar *
169695Skanconvert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
169695Skan		struct _cpp_strbuf *tbuf, bool wide)
169695Skan{
169695Skan  /* Values of \a \b \e \f \n \r \t \v respectively.  */
169695Skan#if HOST_CHARSET == HOST_CHARSET_ASCII
169695Skan  static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
169695Skan#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
169695Skan  static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
169695Skan#else
169695Skan#error "unknown host character set"
169695Skan#endif
169695Skan
169695Skan  uchar c;
169695Skan  struct cset_converter cvt
169695Skan    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
169695Skan
169695Skan  c = *from;
169695Skan  switch (c)
169695Skan    {
169695Skan      /* UCNs, hex escapes, and octal escapes are processed separately.  */
169695Skan    case 'u': case 'U':
169695Skan      return convert_ucn (pfile, from, limit, tbuf, wide);
169695Skan
169695Skan    case 'x':
169695Skan      return convert_hex (pfile, from, limit, tbuf, wide);
169695Skan
169695Skan    case '0':  case '1':  case '2':  case '3':
169695Skan    case '4':  case '5':  case '6':  case '7':
169695Skan      return convert_oct (pfile, from, limit, tbuf, wide);
169695Skan
169695Skan      /* Various letter escapes.  Get the appropriate host-charset
169695Skan	 value into C.  */
169695Skan    case '\\': case '\'': case '"': case '?': break;
169695Skan
169695Skan    case '(': case '{': case '[': case '%':
169695Skan      /* '\(', etc, can be used at the beginning of a line in a long
169695Skan	 string split onto multiple lines with \-newline, to prevent
169695Skan	 Emacs or other text editors from getting confused.  '\%' can
169695Skan	 be used to prevent SCCS from mangling printf format strings.  */
169695Skan      if (CPP_PEDANTIC (pfile))
169695Skan	goto unknown;
169695Skan      break;
169695Skan
169695Skan    case 'b': c = charconsts[1];  break;
169695Skan    case 'f': c = charconsts[3];  break;
169695Skan    case 'n': c = charconsts[4];  break;
169695Skan    case 'r': c = charconsts[5];  break;
169695Skan    case 't': c = charconsts[6];  break;
169695Skan    case 'v': c = charconsts[7];  break;
169695Skan
169695Skan    case 'a':
169695Skan      if (CPP_WTRADITIONAL (pfile))
169695Skan	cpp_error (pfile, CPP_DL_WARNING,
169695Skan		   "the meaning of '\\a' is different in traditional C");
169695Skan      c = charconsts[0];
169695Skan      break;
169695Skan
169695Skan    case 'e': case 'E':
169695Skan      if (CPP_PEDANTIC (pfile))
169695Skan	cpp_error (pfile, CPP_DL_PEDWARN,
169695Skan		   "non-ISO-standard escape sequence, '\\%c'", (int) c);
169695Skan      c = charconsts[2];
169695Skan      break;
169695Skan
169695Skan    default:
169695Skan    unknown:
169695Skan      /* C is left as it is, so after the warning the unrecognized
169695Skan	 character itself is what gets converted below.  */
169695Skan      if (ISGRAPH (c))
169695Skan	cpp_error (pfile, CPP_DL_PEDWARN,
169695Skan		   "unknown escape sequence '\\%c'", (int) c);
169695Skan      else
169695Skan	{
169695Skan	  /* diagnostic.c does not support "%03o".  When it does, this
169695Skan	     code can use %03o directly in the diagnostic again.  */
169695Skan	  char buf[32];
169695Skan	  sprintf(buf, "%03o", (int) c);
169695Skan	  cpp_error (pfile, CPP_DL_PEDWARN,
169695Skan		     "unknown escape sequence: '\\%s'", buf);
169695Skan	}
169695Skan    }
169695Skan
169695Skan  /* Now convert what we have to the execution character set.  */
169695Skan  if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
169695Skan    cpp_errno (pfile, CPP_DL_ERROR,
169695Skan	       "converting escape sequence to execution character set");
169695Skan
169695Skan  return from + 1;
169695Skan}
169695Skan
169695Skan/* FROM is an array of cpp_string structures of length COUNT.  These
169695Skan   are to be converted from the source to the execution character set,
169695Skan   escape sequences translated, and finally all are to be
169695Skan   concatenated.  WIDE indicates whether or not to produce a wide
169695Skan   string.  The result is written into TO.  Returns true for success,
169695Skan   false for failure.
169695Skan
169695Skan   Each input string is expected to carry its surrounding quotes (and
169695Skan   an optional leading 'L'), which are skipped.  On success TO->text
169695Skan   points at a buffer allocated here that ends with a terminating NUL
169695Skan   character, and TO->len counts that terminator.  Failure is reported
169695Skan   only when converting a run of ordinary characters fails; in that
169695Skan   case an error is issued, the buffer is freed and TO is not
169695Skan   modified.  */
169695Skanbool
169695Skancpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
169695Skan		      cpp_string *to, bool wide)
169695Skan{
169695Skan  struct _cpp_strbuf tbuf;
169695Skan  const uchar *p, *base, *limit;
169695Skan  size_t i;
169695Skan  struct cset_converter cvt
169695Skan    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
169695Skan
169695Skan  tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);  /* First string only.  */
169695Skan  tbuf.text = XNEWVEC (uchar, tbuf.asize);
169695Skan  tbuf.len = 0;
169695Skan
169695Skan  for (i = 0; i < count; i++)
169695Skan    {
169695Skan      p = from[i].text;
169695Skan      if (*p == 'L') p++;
169695Skan      p++; /* Skip leading quote.  */
169695Skan      limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */
169695Skan
169695Skan      for (;;)
169695Skan	{
169695Skan	  base = p;
169695Skan	  while (p < limit && *p != '\\')
169695Skan	    p++;
169695Skan	  if (p > base)
169695Skan	    {
169695Skan	      /* We have a run of normal characters; these can be fed
169695Skan		 directly to convert_cset.  */
169695Skan	      if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
169695Skan		goto fail;
169695Skan	    }
169695Skan	  if (p == limit)
169695Skan	    break;
169695Skan
169695Skan	  p = convert_escape (pfile, p + 1, limit, &tbuf, wide);  /* At '\\'.  */
169695Skan	}
169695Skan    }
169695Skan  /* NUL-terminate the 'to' buffer and translate it to a cpp_string
169695Skan     structure.  The terminator is emitted as a numeric escape so that
169695Skan     it has the width selected by WIDE.  */
169695Skan  emit_numeric_escape (pfile, 0, &tbuf, wide);
169695Skan  tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);  /* Trim to fit.  */
169695Skan  to->text = tbuf.text;
169695Skan  to->len = tbuf.len;
169695Skan  return true;
169695Skan
169695Skan fail:
169695Skan  cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
169695Skan  free (tbuf.text);
169695Skan  return false;
169695Skan}
169695Skan
169695Skan/* Subroutine of do_line and do_linemarker.  Behaves like
169695Skan   cpp_interpret_string, so escape sequences are still processed, but
169695Skan   the narrow character set converter of PFILE is replaced by
169695Skan   convert_no_conversion for the duration of the call and restored
169695Skan   afterwards.  Returns whatever cpp_interpret_string returns.  */
169695Skanbool
169695Skancpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
169695Skan				  size_t count, cpp_string *to, bool wide)
169695Skan{
169695Skan  const struct cset_converter saved = pfile->narrow_cset_desc;
169695Skan  bool ok;
169695Skan
169695Skan  pfile->narrow_cset_desc.cd = (iconv_t) -1;
169695Skan  pfile->narrow_cset_desc.func = convert_no_conversion;
169695Skan
169695Skan  ok = cpp_interpret_string (pfile, from, count, to, wide);
169695Skan
169695Skan  /* Put the real converter back before reporting the outcome.  */
169695Skan  pfile->narrow_cset_desc = saved;
169695Skan  return ok;
169695Skan}
169695Skan
169695Skan
169695Skan/* Subroutine of cpp_interpret_charconst which performs the conversion
169695Skan   to a number, for narrow strings.  STR is the string structure returned
169695Skan   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
169695Skan   cpp_interpret_charconst: *PCHARS_SEEN receives the number of
169695Skan   characters in the constant (capped at the number that fit in an
169695Skan   int), and *UNSIGNEDP is nonzero when the result is unsigned.
169695Skan   Returns the value, sign- or zero-extended to cppchar_t.  */
169695Skanstatic cppchar_t
169695Skannarrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
169695Skan			 unsigned int *pchars_seen, int *unsignedp)
169695Skan{
169695Skan  size_t width = CPP_OPTION (pfile, char_precision);
169695Skan  size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
169695Skan  size_t mask = width_to_mask (width);
169695Skan  size_t i;
169695Skan  cppchar_t result, c;
169695Skan  bool unsigned_p;
169695Skan
169695Skan  /* The value of a multi-character character constant, or a
169695Skan     single-character character constant whose representation in the
169695Skan     execution character set is more than one byte long, is
169695Skan     implementation defined.  This implementation defines it to be the
169695Skan     number formed by interpreting the byte sequence in memory as a
169695Skan     big-endian binary number.  If overflow occurs, the high bytes are
169695Skan     lost, and a warning is issued.
169695Skan
169695Skan     We don't want to process the NUL terminator handed back by
169695Skan     cpp_interpret_string.  */
169695Skan  result = 0;
169695Skan  for (i = 0; i < str.len - 1; i++)
169695Skan    {
169695Skan      c = str.text[i] & mask;
169695Skan      if (width < BITS_PER_CPPCHAR_T)
169695Skan	result = (result << width) | c;
169695Skan      else
169695Skan	result = c;
169695Skan    }
169695Skan
169695Skan  if (i > max_chars)
169695Skan    {
169695Skan      i = max_chars;
169695Skan      cpp_error (pfile, CPP_DL_WARNING,
169695Skan		 "character constant too long for its type");
169695Skan    }
169695Skan  else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
169695Skan    cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
169695Skan
169695Skan  /* Multichar constants are of type int and therefore signed.  */
169695Skan  if (i > 1)
169695Skan    unsigned_p = 0;
169695Skan  else
169695Skan    unsigned_p = CPP_OPTION (pfile, unsigned_char);
169695Skan
169695Skan  /* Truncate the constant to its natural width, and simultaneously
169695Skan     sign- or zero-extend to the full width of cppchar_t.
169695Skan     For single-character constants, the value is WIDTH bits wide.
169695Skan     For multi-character constants, the value is INT_PRECISION bits wide.  */
169695Skan  if (i > 1)
169695Skan    width = CPP_OPTION (pfile, int_precision);
169695Skan  if (width < BITS_PER_CPPCHAR_T)
169695Skan    {
169695Skan      mask = ((cppchar_t) 1 << width) - 1;
169695Skan      /* The sign-bit probe is built in cppchar_t, like the mask above.
169695Skan	 A plain int 1 would only be safe while WIDTH - 1 stays below
169695Skan	 the sign bit of int.  */
169695Skan      if (unsigned_p || !(result & ((cppchar_t) 1 << (width - 1))))
169695Skan	result &= mask;
169695Skan      else
169695Skan	result |= ~mask;
169695Skan    }
169695Skan  *pchars_seen = i;
169695Skan  *unsignedp = unsigned_p;
169695Skan  return result;
169695Skan}
169695Skan
169695Skan/* Subroutine of cpp_interpret_charconst which performs the conversion
169695Skan   to a number, for wide strings.  STR is the string structure returned
169695Skan   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
169695Skan   cpp_interpret_charconst: *PCHARS_SEEN is always set to 1, and
169695Skan   *UNSIGNEDP is set from the unsigned_wchar option.  Returns the value
169695Skan   of the last wide character in STR, sign- or zero-extended to
169695Skan   cppchar_t; any earlier characters only trigger a warning.  */
169695Skanstatic cppchar_t
169695Skanwide_str_to_charconst (cpp_reader *pfile, cpp_string str,
169695Skan		       unsigned int *pchars_seen, int *unsignedp)
169695Skan{
169695Skan  bool bigend = CPP_OPTION (pfile, bytes_big_endian);
169695Skan  size_t width = CPP_OPTION (pfile, wchar_precision);
169695Skan  size_t cwidth = CPP_OPTION (pfile, char_precision);
169695Skan  size_t mask = width_to_mask (width);
169695Skan  size_t cmask = width_to_mask (cwidth);
169695Skan  size_t nbwc = width / cwidth;
169695Skan  size_t off, i;
169695Skan  cppchar_t result = 0, c;
169695Skan
169695Skan  /* This is finicky because the string is in the target's byte order,
169695Skan     which may not be our byte order.  Only the last character, ignoring
169695Skan     the NUL terminator, is relevant.  OFF is where that character
169695Skan     starts: two wide characters back from the end of the string.  */
169695Skan  off = str.len - (nbwc * 2);
169695Skan  result = 0;
169695Skan  for (i = 0; i < nbwc; i++)
169695Skan    {
169695Skan      c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
169695Skan      result = (result << cwidth) | (c & cmask);
169695Skan    }
169695Skan
169695Skan  /* Wide character constants have type wchar_t, and a single
169695Skan     character exactly fills a wchar_t, so a multi-character wide
169695Skan     character constant is guaranteed to overflow.  */
169695Skan  if (off > 0)
169695Skan    cpp_error (pfile, CPP_DL_WARNING,
169695Skan	       "character constant too long for its type");
169695Skan
169695Skan  /* Truncate the constant to its natural width, and simultaneously
169695Skan     sign- or zero-extend to the full width of cppchar_t.  The sign-bit
169695Skan     probe is built in cppchar_t; a plain int 1 would only be safe
169695Skan     while WIDTH - 1 stays below the sign bit of int.  */
169695Skan  if (width < BITS_PER_CPPCHAR_T)
169695Skan    {
169695Skan      if (CPP_OPTION (pfile, unsigned_wchar)
169695Skan	  || !(result & ((cppchar_t) 1 << (width - 1))))
169695Skan	result &= mask;
169695Skan      else
169695Skan	result |= ~mask;
169695Skan    }
169695Skan
169695Skan  *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
169695Skan  *pchars_seen = 1;
169695Skan  return result;
169695Skan}
169695Skan
169695Skan/* Compute the value of the character constant held in TOKEN, which is
169695Skan   treated as wide when its type is CPP_WCHAR.  *PCHARS_SEEN receives
169695Skan   the number of characters seen and *UNSIGNEDP is set nonzero when
169695Skan   the result is unsigned.  An empty constant is diagnosed and yields
169695Skan   0; 0 is also returned if the string cannot be interpreted.  In both
169695Skan   of those cases the two output variables are not written.  */
169695Skancppchar_t
169695Skancpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
169695Skan			 unsigned int *pchars_seen, int *unsignedp)
169695Skan{
169695Skan  cpp_string converted = { 0, 0 };
169695Skan  bool is_wide = (token->type == CPP_WCHAR);
169695Skan  cppchar_t value;
169695Skan
169695Skan  /* Nothing but the quotes, plus the 'L' of a wide constant.  */
169695Skan  if (token->val.str.len == (size_t) (2 + is_wide))
169695Skan    {
169695Skan      cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
169695Skan      return 0;
169695Skan    }
169695Skan
169695Skan  if (!cpp_interpret_string (pfile, &token->val.str, 1, &converted, is_wide))
169695Skan    return 0;
169695Skan
169695Skan  value = (is_wide
169695Skan	   ? wide_str_to_charconst (pfile, converted, pchars_seen, unsignedp)
169695Skan	   : narrow_str_to_charconst (pfile, converted, pchars_seen,
169695Skan				      unsignedp));
169695Skan
169695Skan  /* Release the interpreted text unless it aliases the token's own.  */
169695Skan  if (converted.text != token->val.str.text)
169695Skan    free ((void *)converted.text);
169695Skan
169695Skan  return value;
169695Skan}
169695Skan
169695Skan/* Convert an identifier denoted by ID and LEN, which might contain
169695Skan   UCN escapes, to the source character set, either UTF-8 or
169695Skan   UTF-EBCDIC.  Assumes that the identifier is actually a valid identifier.
169695Skan   Returns the hash node obtained by looking the converted spelling up
169695Skan   in PFILE's hash table with HT_ALLOC.  If a UCN cannot be converted,
169695Skan   an error is reported and only the part converted so far is looked
169695Skan   up.  */
169695Skancpp_hashnode *
169695Skan_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
169695Skan{
169695Skan  /* It turns out that a UCN escape always turns into fewer characters
169695Skan     than the escape itself, so we can allocate a temporary in advance.  */
169695Skan  uchar * buf = (uchar *) alloca (len + 1);
169695Skan  uchar * bufp = buf;
169695Skan  size_t idp;
169695Skan
169695Skan  for (idp = 0; idp < len; idp++)
169695Skan    if (id[idp] != '\\')
169695Skan      *bufp++ = id[idp];
169695Skan    else
169695Skan      {
169695Skan	unsigned length = id[idp+1] == 'u' ? 4 : 8;  /* Hex digits wanted.  */
169695Skan	cppchar_t value = 0;
169695Skan	size_t bufleft = len - (bufp - buf);  /* LEN less what is used.  */
169695Skan	int rval;
169695Skan
169695Skan	idp += 2;  /* Skip the backslash and the u/U.  */
169695Skan	while (length && idp < len && ISXDIGIT (id[idp]))
169695Skan	  {
169695Skan	    value = (value << 4) + hex_value (id[idp]);
169695Skan	    idp++;
169695Skan	    length--;
169695Skan	  }
169695Skan	idp--;  /* Compensate for the idp++ of the enclosing loop.  */
169695Skan
169695Skan	/* Special case for EBCDIC: if the identifier contains
169695Skan	   a '$' specified using a UCN, translate it to EBCDIC.  */
169695Skan	if (value == 0x24)
169695Skan	  {
169695Skan	    *bufp++ = '$';
169695Skan	    continue;
169695Skan	  }
169695Skan
169695Skan	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
169695Skan	if (rval)
169695Skan	  {
169695Skan	    errno = rval;
169695Skan	    cpp_errno (pfile, CPP_DL_ERROR,
169695Skan		       "converting UCN to source character set");
169695Skan	    break;  /* Give up on the rest of the identifier.  */
169695Skan	  }
169695Skan      }
169695Skan
169695Skan  return CPP_HASHNODE (ht_lookup (pfile->hash_table,
169695Skan				  buf, bufp - buf, HT_ALLOC));
169695Skan}
169695Skan
169695Skan/* Convert an input buffer (containing the complete contents of one
169695Skan   source file) from INPUT_CHARSET to the source character set.  INPUT
169695Skan   points to the input buffer, SIZE is its allocated size, and LEN is
169695Skan   the length of the meaningful data within the buffer.  The
169695Skan   translated buffer is returned, and *ST_SIZE is set to the length of
169695Skan   the meaningful data within the translated buffer.
169695Skan
169695Skan   INPUT is expected to have been allocated with xmalloc.  This function
169695Skan   will either return INPUT, or free it and return a pointer to another
169695Skan   xmalloc-allocated block of memory.  */
169695Skanuchar *
169695Skan_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
169695Skan		    uchar *input, size_t size, size_t len, off_t *st_size)
169695Skan{
169695Skan  struct cset_converter input_cset;
169695Skan  struct _cpp_strbuf to;
169695Skan
169695Skan  input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
169695Skan  if (input_cset.func == convert_no_conversion)
169695Skan    {
260574Spfg      /* APPLE LOCAL begin UTF-8 BOM 5774975 */
260574Spfg      /* Eat the UTF-8 BOM.  */
260574Spfg      if (len >= 3
260574Spfg	  && input[0] == 0xef
260574Spfg	  && input[1] == 0xbb
260574Spfg	  && input[2] == 0xbf)
260574Spfg	{
260574Spfg	  memmove (&input[0], &input[3], size-3);
260574Spfg	  len -= 3;
260574Spfg	}
260574Spfg      /* APPLE LOCAL end UTF-8 BOM 5774975 */
169695Skan      to.text = input;
169695Skan      to.asize = size;
169695Skan      to.len = len;
169695Skan    }
169695Skan  else
169695Skan    {
169695Skan      to.asize = MAX (65536, len);
169695Skan      to.text = XNEWVEC (uchar, to.asize);
169695Skan      to.len = 0;
169695Skan
169695Skan      if (!APPLY_CONVERSION (input_cset, input, len, &to))
169695Skan	cpp_error (pfile, CPP_DL_ERROR,
169695Skan		   "failure to convert %s to %s",
169695Skan		   CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
169695Skan
169695Skan      free (input);
169695Skan    }
169695Skan
169695Skan  /* Clean up the mess.  */
169695Skan  if (input_cset.func == convert_using_iconv)
169695Skan    iconv_close (input_cset.cd);
169695Skan
169695Skan  /* Resize buffer if we allocated substantially too much, or if we
169695Skan     haven't enough space for the \n-terminator.  */
169695Skan  if (to.len + 4096 < to.asize || to.len >= to.asize)
169695Skan    to.text = XRESIZEVEC (uchar, to.text, to.len + 1);
169695Skan
169695Skan  /* If the file is using old-school Mac line endings (\r only),
169695Skan     terminate with another \r, not an \n, so that we do not mistake
169695Skan     the \r\n sequence for a single DOS line ending and erroneously
169695Skan     issue the "No newline at end of file" diagnostic.  */
259891Spfg  /* APPLE LOCAL don't access to.text[-1] radar 6121572 */
259272Spfg  if (to.len > 0 && to.text[to.len - 1] == '\r')
169695Skan    to.text[to.len] = '\r';
169695Skan  else
169695Skan    to.text[to.len] = '\n';
169695Skan
169695Skan  *st_size = to.len;
169695Skan  return to.text;
169695Skan}
169695Skan
169695Skan/* Decide on the default encoding to assume for input files.  */
169695Skanconst char *
169695Skan_cpp_default_encoding (void)
169695Skan{
169695Skan  const char *current_encoding = NULL;
169695Skan
169695Skan  /* We disable this because the default codeset is 7-bit ASCII on
169695Skan     most platforms, and this causes conversion failures on every
169695Skan     file in GCC that happens to have one of the upper 128 characters
169695Skan     in it -- most likely, as part of the name of a contributor.
169695Skan     We should definitely recognize in-band markers of file encoding,
169695Skan     like:
169695Skan     - the appropriate Unicode byte-order mark (FE FF) to recognize
169695Skan       UTF16 and UCS4 (in both big-endian and little-endian flavors)
169695Skan       and UTF8
169695Skan     - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
169695Skan       distinguish ASCII and EBCDIC.
169695Skan     - now we can parse something like "#pragma GCC encoding <xyz>
169695Skan       on the first line, or even Emacs/VIM's mode line tags (there's
169695Skan       a problem here in that VIM uses the last line, and Emacs has
169695Skan       its more elaborate "local variables" convention).
169695Skan     - investigate whether Java has another common convention, which
169695Skan       would be friendly to support.
169695Skan     (Zack Weinberg and Paolo Bonzini, May 20th 2004)  */
169695Skan#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
169695Skan  setlocale (LC_CTYPE, "");
169695Skan  current_encoding = nl_langinfo (CODESET);
169695Skan#endif
169695Skan  if (current_encoding == NULL || *current_encoding == '\0')
169695Skan    current_encoding = SOURCE_CHARSET;
169695Skan
169695Skan  return current_encoding;
169695Skan}