1169695Skan/* CPP Library - charsets
2169695Skan   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
3169695Skan   Free Software Foundation, Inc.
4169695Skan
5169695Skan   Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
6169695Skan
7169695SkanThis program is free software; you can redistribute it and/or modify it
8169695Skanunder the terms of the GNU General Public License as published by the
9169695SkanFree Software Foundation; either version 2, or (at your option) any
10169695Skanlater version.
11169695Skan
12169695SkanThis program is distributed in the hope that it will be useful,
13169695Skanbut WITHOUT ANY WARRANTY; without even the implied warranty of
14169695SkanMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15169695SkanGNU General Public License for more details.
16169695Skan
17169695SkanYou should have received a copy of the GNU General Public License
18169695Skanalong with this program; if not, write to the Free Software
19169695SkanFoundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
20169695Skan
21169695Skan#include "config.h"
22169695Skan#include "system.h"
23169695Skan#include "cpplib.h"
24169695Skan#include "internal.h"
25169695Skan
26169695Skan/* Character set handling for C-family languages.
27169695Skan
28169695Skan   Terminological note: In what follows, "charset" or "character set"
29169695Skan   will be taken to mean both an abstract set of characters and an
30169695Skan   encoding for that set.
31169695Skan
32169695Skan   The C99 standard discusses two character sets: source and execution.
33169695Skan   The source character set is used for internal processing in translation
34169695Skan   phases 1 through 4; the execution character set is used thereafter.
35169695Skan   Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
36169695Skan   character encodings (see 3.7.2, 3.7.3 for the standardese meanings
37169695Skan   of these terms).  Furthermore, the "basic character set" (listed in
38169695Skan   5.2.1p3) is to be encoded in each with values one byte wide, and is
39169695Skan   to appear in the initial shift state.
40169695Skan
41169695Skan   It is not explicitly mentioned, but there is also a "wide execution
42169695Skan   character set" used to encode wide character constants and wide
43169695Skan   string literals; this is supposed to be the result of applying the
44169695Skan   standard library function mbstowcs() to an equivalent narrow string
45169695Skan   (6.4.5p5).  However, the behavior of hexadecimal and octal
46169695Skan   \-escapes is at odds with this; they are supposed to be translated
47169695Skan   directly to wchar_t values (6.4.4.4p5,6).
48169695Skan
49169695Skan   The source character set is not necessarily the character set used
50169695Skan   to encode physical source files on disk; translation phase 1 converts
51169695Skan   from whatever that encoding is to the source character set.
52169695Skan
53169695Skan   The presence of universal character names in C99 (6.4.3 et seq.)
54169695Skan   forces the source character set to be isomorphic to ISO 10646,
55169695Skan   that is, Unicode.  There is no such constraint on the execution
56169695Skan   character set; note also that the conversion from source to
57169695Skan   execution character set does not occur for identifiers (5.1.1.2p1#5).
58169695Skan
59169695Skan   For convenience of implementation, the source character set's
60169695Skan   encoding of the basic character set should be identical to the
61169695Skan   execution character set OF THE HOST SYSTEM's encoding of the basic
62169695Skan   character set, and it should not be a state-dependent encoding.
63169695Skan
64169695Skan   cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
65169695Skan   depending on whether the host is based on ASCII or EBCDIC (see
66169695Skan   respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
67169695Skan   Technical Report #16).  With limited exceptions, it relies on the
68169695Skan   system library's iconv() primitive to do charset conversion
69169695Skan   (specified in SUSv2).  */
70169695Skan
#if !HAVE_ICONV
/* There is no system iconv.  The calls to iconv(), iconv_open() and
   iconv_close() further down are guarded only by ordinary `if'
   statements whose conditions are compile-time constants, so they
   still have to compile and link.  Replace them with stubs: the
   first two fail with EINVAL and iconv_close does nothing.  */
# define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
# define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
# define iconv_close(x)   (void)0
# define ICONV_CONST
#endif
80169695Skan
/* Choose the name of the source character set, and the largest value
   that could possibly belong to the basic source character set,
   according to the host's basic character set.  Any other host
   character set is a build-time error.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
# define SOURCE_CHARSET "UTF-8"
# define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
# define SOURCE_CHARSET "UTF-EBCDIC"
# define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
# error "Unrecognized basic host character set"
#endif
90169695Skan
/* If the host headers do not provide EILSEQ, report ill-formed input
   sequences as EINVAL instead.  */
#if !defined EILSEQ
# define EILSEQ EINVAL
#endif
94169695Skan
95169695Skan/* This structure is used for a resizable string buffer throughout.  */
96169695Skan/* Don't call it strbuf, as that conflicts with unistd.h on systems
97169695Skan   such as DYNIX/ptx where unistd.h includes stropts.h.  */
98169695Skanstruct _cpp_strbuf
99169695Skan{
100169695Skan  uchar *text;
101169695Skan  size_t asize;
102169695Skan  size_t len;
103169695Skan};
104169695Skan
/* Number of bytes by which an output buffer is enlarged each time a
   conversion runs out of room.  */
#define OUTBUF_BLOCK_SIZE 256
109169695Skan
110169695Skan/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
111169695Skan   logic.  This is because a depressing number of systems lack iconv,
   or have iconv libraries that do not do these conversions, so
113169695Skan   we need a fallback implementation for them.  To ensure the fallback
114169695Skan   doesn't break due to neglect, it is used on all systems.
115169695Skan
116169695Skan   UTF-32 encoding is nice and simple: a four-byte binary number,
117169695Skan   constrained to the range 00000000-7FFFFFFF to avoid questions of
118169695Skan   signedness.  We do have to cope with big- and little-endian
119169695Skan   variants.
120169695Skan
121169695Skan   UTF-16 encoding uses two-byte binary numbers, again in big- and
122169695Skan   little-endian variants, for all values in the 00000000-0000FFFF
123169695Skan   range.  Values in the 00010000-0010FFFF range are encoded as pairs
124169695Skan   of two-byte numbers, called "surrogate pairs": given a number S in
125169695Skan   this range, it is mapped to a pair (H, L) as follows:
126169695Skan
127169695Skan     H = (S - 0x10000) / 0x400 + 0xD800
128169695Skan     L = (S - 0x10000) % 0x400 + 0xDC00
129169695Skan
130169695Skan   Two-byte values in the D800...DFFF range are ill-formed except as a
131169695Skan   component of a surrogate pair.  Even if the encoding within a
132169695Skan   two-byte value is little-endian, the H member of the surrogate pair
133169695Skan   comes first.
134169695Skan
135169695Skan   There is no way to encode values in the 00110000-7FFFFFFF range,
136169695Skan   which is not currently a problem as there are no assigned code
137169695Skan   points in that range; however, the author expects that it will
138169695Skan   eventually become necessary to abandon UTF-16 due to this
139169695Skan   limitation.  Note also that, because of these pairs, UTF-16 does
140169695Skan   not meet the requirements of the C standard for a wide character
141169695Skan   encoding (see 3.7.3 and 6.4.4.4p11).
142169695Skan
143169695Skan   UTF-8 encoding looks like this:
144169695Skan
145169695Skan   value range	       encoded as
146169695Skan   00000000-0000007F   0xxxxxxx
147169695Skan   00000080-000007FF   110xxxxx 10xxxxxx
148169695Skan   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
149169695Skan   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
150169695Skan   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
151169695Skan   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
152169695Skan
153169695Skan   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
154169695Skan   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
155169695Skan   never occur.  Note also that any value that can be encoded by a
156169695Skan   given row of the table can also be encoded by all successive rows,
157169695Skan   but this is not done; only the shortest possible encoding for any
158169695Skan   given value is valid.  For instance, the character 07C0 could be
159169695Skan   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
160169695Skan   FC 80 80 80 9F 80.  Only the first is valid.
161169695Skan
162169695Skan   An implementation note: the transformation from UTF-16 to UTF-8, or
163169695Skan   vice versa, is easiest done by using UTF-32 as an intermediary.  */
164169695Skan
165169695Skan/* Internal primitives which go from an UTF-8 byte stream to native-endian
166169695Skan   UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
167169695Skan   operation in several places below.  */
168169695Skanstatic inline int
169169695Skanone_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
170169695Skan		     cppchar_t *cp)
171169695Skan{
172169695Skan  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
173169695Skan  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
174169695Skan
175169695Skan  cppchar_t c;
176169695Skan  const uchar *inbuf = *inbufp;
177169695Skan  size_t nbytes, i;
178169695Skan
179169695Skan  if (*inbytesleftp < 1)
180169695Skan    return EINVAL;
181169695Skan
182169695Skan  c = *inbuf;
183169695Skan  if (c < 0x80)
184169695Skan    {
185169695Skan      *cp = c;
186169695Skan      *inbytesleftp -= 1;
187169695Skan      *inbufp += 1;
188169695Skan      return 0;
189169695Skan    }
190169695Skan
191169695Skan  /* The number of leading 1-bits in the first byte indicates how many
192169695Skan     bytes follow.  */
193169695Skan  for (nbytes = 2; nbytes < 7; nbytes++)
194169695Skan    if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
195169695Skan      goto found;
196169695Skan  return EILSEQ;
197169695Skan found:
198169695Skan
199169695Skan  if (*inbytesleftp < nbytes)
200169695Skan    return EINVAL;
201169695Skan
202169695Skan  c = (c & masks[nbytes-1]);
203169695Skan  inbuf++;
204169695Skan  for (i = 1; i < nbytes; i++)
205169695Skan    {
206169695Skan      cppchar_t n = *inbuf++;
207169695Skan      if ((n & 0xC0) != 0x80)
208169695Skan	return EILSEQ;
209169695Skan      c = ((c << 6) + (n & 0x3F));
210169695Skan    }
211169695Skan
212169695Skan  /* Make sure the shortest possible encoding was used.  */
213169695Skan  if (c <=      0x7F && nbytes > 1) return EILSEQ;
214169695Skan  if (c <=     0x7FF && nbytes > 2) return EILSEQ;
215169695Skan  if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
216169695Skan  if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
217169695Skan  if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
218169695Skan
219169695Skan  /* Make sure the character is valid.  */
220169695Skan  if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
221169695Skan
222169695Skan  *cp = c;
223169695Skan  *inbufp = inbuf;
224169695Skan  *inbytesleftp -= nbytes;
225169695Skan  return 0;
226169695Skan}
227169695Skan
228169695Skanstatic inline int
229169695Skanone_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
230169695Skan{
231169695Skan  static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
232169695Skan  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
233169695Skan  size_t nbytes;
234169695Skan  uchar buf[6], *p = &buf[6];
235169695Skan  uchar *outbuf = *outbufp;
236169695Skan
237169695Skan  nbytes = 1;
238169695Skan  if (c < 0x80)
239169695Skan    *--p = c;
240169695Skan  else
241169695Skan    {
242169695Skan      do
243169695Skan	{
244169695Skan	  *--p = ((c & 0x3F) | 0x80);
245169695Skan	  c >>= 6;
246169695Skan	  nbytes++;
247169695Skan	}
248169695Skan      while (c >= 0x3F || (c & limits[nbytes-1]));
249169695Skan      *--p = (c | masks[nbytes-1]);
250169695Skan    }
251169695Skan
252169695Skan  if (*outbytesleftp < nbytes)
253169695Skan    return E2BIG;
254169695Skan
255169695Skan  while (p < &buf[6])
256169695Skan    *outbuf++ = *p++;
257169695Skan  *outbytesleftp -= nbytes;
258169695Skan  *outbufp = outbuf;
259169695Skan  return 0;
260169695Skan}
261169695Skan
262169695Skan/* The following four functions transform one character between the two
263169695Skan   encodings named in the function name.  All have the signature
264169695Skan   int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
265169695Skan           uchar **outbufp, size_t *outbytesleftp)
266169695Skan
267169695Skan   BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
268169695Skan   interpreted as a boolean indicating whether big-endian or
269169695Skan   little-endian encoding is to be used for the member of the pair
270169695Skan   that is not UTF-8.
271169695Skan
272169695Skan   INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
273169695Skan   do for iconv.
274169695Skan
275169695Skan   The return value is either 0 for success, or an errno value for
276169695Skan   failure, which may be E2BIG (need more space), EILSEQ (ill-formed
   input sequence), or EINVAL (incomplete input sequence).  */
278169695Skan
279169695Skanstatic inline int
280169695Skanone_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
281169695Skan		   uchar **outbufp, size_t *outbytesleftp)
282169695Skan{
283169695Skan  uchar *outbuf;
284169695Skan  cppchar_t s = 0;
285169695Skan  int rval;
286169695Skan
287169695Skan  /* Check for space first, since we know exactly how much we need.  */
288169695Skan  if (*outbytesleftp < 4)
289169695Skan    return E2BIG;
290169695Skan
291169695Skan  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
292169695Skan  if (rval)
293169695Skan    return rval;
294169695Skan
295169695Skan  outbuf = *outbufp;
296169695Skan  outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
297169695Skan  outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
298169695Skan  outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
299169695Skan  outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
300169695Skan
301169695Skan  *outbufp += 4;
302169695Skan  *outbytesleftp -= 4;
303169695Skan  return 0;
304169695Skan}
305169695Skan
306169695Skanstatic inline int
307169695Skanone_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
308169695Skan		   uchar **outbufp, size_t *outbytesleftp)
309169695Skan{
310169695Skan  cppchar_t s;
311169695Skan  int rval;
312169695Skan  const uchar *inbuf;
313169695Skan
314169695Skan  if (*inbytesleftp < 4)
315169695Skan    return EINVAL;
316169695Skan
317169695Skan  inbuf = *inbufp;
318169695Skan
319169695Skan  s  = inbuf[bigend ? 0 : 3] << 24;
320169695Skan  s += inbuf[bigend ? 1 : 2] << 16;
321169695Skan  s += inbuf[bigend ? 2 : 1] << 8;
322169695Skan  s += inbuf[bigend ? 3 : 0];
323169695Skan
324169695Skan  if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
325169695Skan    return EILSEQ;
326169695Skan
327169695Skan  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
328169695Skan  if (rval)
329169695Skan    return rval;
330169695Skan
331169695Skan  *inbufp += 4;
332169695Skan  *inbytesleftp -= 4;
333169695Skan  return 0;
334169695Skan}
335169695Skan
336169695Skanstatic inline int
337169695Skanone_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
338169695Skan		   uchar **outbufp, size_t *outbytesleftp)
339169695Skan{
340169695Skan  int rval;
341169695Skan  cppchar_t s = 0;
342169695Skan  const uchar *save_inbuf = *inbufp;
343169695Skan  size_t save_inbytesleft = *inbytesleftp;
344169695Skan  uchar *outbuf = *outbufp;
345169695Skan
346169695Skan  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
347169695Skan  if (rval)
348169695Skan    return rval;
349169695Skan
350169695Skan  if (s > 0x0010FFFF)
351169695Skan    {
352169695Skan      *inbufp = save_inbuf;
353169695Skan      *inbytesleftp = save_inbytesleft;
354169695Skan      return EILSEQ;
355169695Skan    }
356169695Skan
357169695Skan  if (s < 0xFFFF)
358169695Skan    {
359169695Skan      if (*outbytesleftp < 2)
360169695Skan	{
361169695Skan	  *inbufp = save_inbuf;
362169695Skan	  *inbytesleftp = save_inbytesleft;
363169695Skan	  return E2BIG;
364169695Skan	}
365169695Skan      outbuf[bigend ? 1 : 0] = (s & 0x00FF);
366169695Skan      outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
367169695Skan
368169695Skan      *outbufp += 2;
369169695Skan      *outbytesleftp -= 2;
370169695Skan      return 0;
371169695Skan    }
372169695Skan  else
373169695Skan    {
374169695Skan      cppchar_t hi, lo;
375169695Skan
376169695Skan      if (*outbytesleftp < 4)
377169695Skan	{
378169695Skan	  *inbufp = save_inbuf;
379169695Skan	  *inbytesleftp = save_inbytesleft;
380169695Skan	  return E2BIG;
381169695Skan	}
382169695Skan
383169695Skan      hi = (s - 0x10000) / 0x400 + 0xD800;
384169695Skan      lo = (s - 0x10000) % 0x400 + 0xDC00;
385169695Skan
386169695Skan      /* Even if we are little-endian, put the high surrogate first.
387169695Skan	 ??? Matches practice?  */
388169695Skan      outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
389169695Skan      outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
390169695Skan      outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
391169695Skan      outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
392169695Skan
393169695Skan      *outbufp += 4;
394169695Skan      *outbytesleftp -= 4;
395169695Skan      return 0;
396169695Skan    }
397169695Skan}
398169695Skan
399169695Skanstatic inline int
400169695Skanone_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
401169695Skan		   uchar **outbufp, size_t *outbytesleftp)
402169695Skan{
403169695Skan  cppchar_t s;
404169695Skan  const uchar *inbuf = *inbufp;
405169695Skan  int rval;
406169695Skan
407169695Skan  if (*inbytesleftp < 2)
408169695Skan    return EINVAL;
409169695Skan  s  = inbuf[bigend ? 0 : 1] << 8;
410169695Skan  s += inbuf[bigend ? 1 : 0];
411169695Skan
412169695Skan  /* Low surrogate without immediately preceding high surrogate is invalid.  */
413169695Skan  if (s >= 0xDC00 && s <= 0xDFFF)
414169695Skan    return EILSEQ;
415169695Skan  /* High surrogate must have a following low surrogate.  */
416169695Skan  else if (s >= 0xD800 && s <= 0xDBFF)
417169695Skan    {
418169695Skan      cppchar_t hi = s, lo;
419169695Skan      if (*inbytesleftp < 4)
420169695Skan	return EINVAL;
421169695Skan
422169695Skan      lo  = inbuf[bigend ? 2 : 3] << 8;
423169695Skan      lo += inbuf[bigend ? 3 : 2];
424169695Skan
425169695Skan      if (lo < 0xDC00 || lo > 0xDFFF)
426169695Skan	return EILSEQ;
427169695Skan
428169695Skan      s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
429169695Skan    }
430169695Skan
431169695Skan  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
432169695Skan  if (rval)
433169695Skan    return rval;
434169695Skan
435169695Skan  /* Success - update the input pointers (one_cppchar_to_utf8 has done
436169695Skan     the output pointers for us).  */
437169695Skan  if (s <= 0xFFFF)
438169695Skan    {
439169695Skan      *inbufp += 2;
440169695Skan      *inbytesleftp -= 2;
441169695Skan    }
442169695Skan  else
443169695Skan    {
444169695Skan      *inbufp += 4;
445169695Skan      *inbytesleftp -= 4;
446169695Skan    }
447169695Skan  return 0;
448169695Skan}
449169695Skan
450169695Skan/* Helper routine for the next few functions.  The 'const' on
451169695Skan   one_conversion means that we promise not to modify what function is
452169695Skan   pointed to, which lets the inliner see through it.  */
453169695Skan
454169695Skanstatic inline bool
455169695Skanconversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
456169695Skan					     uchar **, size_t *),
457169695Skan		 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
458169695Skan{
459169695Skan  const uchar *inbuf;
460169695Skan  uchar *outbuf;
461169695Skan  size_t inbytesleft, outbytesleft;
462169695Skan  int rval;
463169695Skan
464169695Skan  inbuf = from;
465169695Skan  inbytesleft = flen;
466169695Skan  outbuf = to->text + to->len;
467169695Skan  outbytesleft = to->asize - to->len;
468169695Skan
469169695Skan  for (;;)
470169695Skan    {
471169695Skan      do
472169695Skan	rval = one_conversion (cd, &inbuf, &inbytesleft,
473169695Skan			       &outbuf, &outbytesleft);
474169695Skan      while (inbytesleft && !rval);
475169695Skan
476169695Skan      if (__builtin_expect (inbytesleft == 0, 1))
477169695Skan	{
478169695Skan	  to->len = to->asize - outbytesleft;
479169695Skan	  return true;
480169695Skan	}
481169695Skan      if (rval != E2BIG)
482169695Skan	{
483169695Skan	  errno = rval;
484169695Skan	  return false;
485169695Skan	}
486169695Skan
487169695Skan      outbytesleft += OUTBUF_BLOCK_SIZE;
488169695Skan      to->asize += OUTBUF_BLOCK_SIZE;
489169695Skan      to->text = XRESIZEVEC (uchar, to->text, to->asize);
490169695Skan      outbuf = to->text + to->asize - outbytesleft;
491169695Skan    }
492169695Skan}
493169695Skan
494169695Skan
495169695Skan/* These functions convert entire strings between character sets.
496169695Skan   They all have the signature
497169695Skan
498169695Skan   bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
499169695Skan
500169695Skan   The input string FROM is converted as specified by the function
501169695Skan   name plus the iconv descriptor CD (which may be fake), and the
502169695Skan   result appended to TO.  On any error, false is returned, otherwise true.  */
503169695Skan
504169695Skan/* These four use the custom conversion code above.  */
505169695Skanstatic bool
506169695Skanconvert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
507169695Skan		    struct _cpp_strbuf *to)
508169695Skan{
509169695Skan  return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
510169695Skan}
511169695Skan
512169695Skanstatic bool
513169695Skanconvert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
514169695Skan		    struct _cpp_strbuf *to)
515169695Skan{
516169695Skan  return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
517169695Skan}
518169695Skan
519169695Skanstatic bool
520169695Skanconvert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
521169695Skan		    struct _cpp_strbuf *to)
522169695Skan{
523169695Skan  return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
524169695Skan}
525169695Skan
526169695Skanstatic bool
527169695Skanconvert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
528169695Skan		    struct _cpp_strbuf *to)
529169695Skan{
530169695Skan  return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
531169695Skan}
532169695Skan
533169695Skan/* Identity conversion, used when we have no alternative.  */
534169695Skanstatic bool
535169695Skanconvert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
536169695Skan		       const uchar *from, size_t flen, struct _cpp_strbuf *to)
537169695Skan{
538169695Skan  if (to->len + flen > to->asize)
539169695Skan    {
540169695Skan      to->asize = to->len + flen;
541169695Skan      to->text = XRESIZEVEC (uchar, to->text, to->asize);
542169695Skan    }
543169695Skan  memcpy (to->text + to->len, from, flen);
544169695Skan  to->len += flen;
545169695Skan  return true;
546169695Skan}
547169695Skan
/* Conversion through the system iconv primitive.  It has its own
   loop rather than using conversion_loop because iconv converts as
   much as it can per call and reports errors through errno.

   The descriptor CD is reset first; if that fails, false is returned.
   The FLEN bytes at FROM are then converted and appended to TO, which
   is enlarged by OUTBUF_BLOCK_SIZE each time iconv reports E2BIG.
   Returns true once all input has been consumed, false on any other
   error (errno is left as iconv set it).  */
#if HAVE_ICONV
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
                     struct _cpp_strbuf *to)
{
  ICONV_CONST char *src;
  char *dst;
  size_t src_left, dst_left;

  /* Reset conversion descriptor and check that it is valid.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  src = (ICONV_CONST char *)from;
  src_left = flen;
  dst = (char *)to->text + to->len;
  dst_left = to->asize - to->len;

  while (1)
    {
      iconv (cd, &src, &src_left, &dst, &dst_left);
      if (__builtin_expect (src_left == 0, 1))
        break;

      if (errno != E2BIG)
        return false;

      /* Out of room: add one block and recompute the write position
         from the buffer address returned by the resize.  */
      to->asize += OUTBUF_BLOCK_SIZE;
      dst_left += OUTBUF_BLOCK_SIZE;
      to->text = XRESIZEVEC (uchar, to->text, to->asize);
      dst = (char *)to->text + (to->asize - dst_left);
    }

  to->len = to->asize - dst_left;
  return true;
}
#else
# define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif
588169695Skan
/* Arrange for the above custom conversion logic to be used automatically
   when conversion between a suitable pair of character sets is requested.

   APPLY_CONVERSION calls the conversion function stored in CONVERTER,
   passing it CONVERTER's descriptor, to append the conversion of the
   FLEN bytes at FROM to TO.  CONVERTER is evaluated twice, so it must
   not have side effects.  Every parameter and the whole expansion are
   parenthesized so that any expression can be passed safely.  */

#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   ((CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO)))
594169695Skan
595169695Skanstruct conversion
596169695Skan{
597169695Skan  const char *pair;
598169695Skan  convert_f func;
599169695Skan  iconv_t fake_cd;
600169695Skan};
601169695Skanstatic const struct conversion conversion_tab[] = {
602169695Skan  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
603169695Skan  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
604169695Skan  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
605169695Skan  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
606169695Skan  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
607169695Skan  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
608169695Skan  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
609169695Skan  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
610169695Skan};
611169695Skan
612169695Skan/* Subroutine of cpp_init_iconv: initialize and return a
613169695Skan   cset_converter structure for conversion from FROM to TO.  If
614169695Skan   iconv_open() fails, issue an error and return an identity
615169695Skan   converter.  Silently return an identity converter if FROM and TO
616169695Skan   are identical.  */
617169695Skanstatic struct cset_converter
618169695Skaninit_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
619169695Skan{
620169695Skan  struct cset_converter ret;
621169695Skan  char *pair;
622169695Skan  size_t i;
623169695Skan
624169695Skan  if (!strcasecmp (to, from))
625169695Skan    {
626169695Skan      ret.func = convert_no_conversion;
627169695Skan      ret.cd = (iconv_t) -1;
628169695Skan      return ret;
629169695Skan    }
630169695Skan
631169695Skan  pair = (char *) alloca(strlen(to) + strlen(from) + 2);
632169695Skan
633169695Skan  strcpy(pair, from);
634169695Skan  strcat(pair, "/");
635169695Skan  strcat(pair, to);
636169695Skan  for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
637169695Skan    if (!strcasecmp (pair, conversion_tab[i].pair))
638169695Skan      {
639169695Skan	ret.func = conversion_tab[i].func;
640169695Skan	ret.cd = conversion_tab[i].fake_cd;
641169695Skan	return ret;
642169695Skan      }
643169695Skan
644169695Skan  /* No custom converter - try iconv.  */
645169695Skan  if (HAVE_ICONV)
646169695Skan    {
647169695Skan      ret.func = convert_using_iconv;
648169695Skan      ret.cd = iconv_open (to, from);
649169695Skan
650169695Skan      if (ret.cd == (iconv_t) -1)
651169695Skan	{
652169695Skan	  if (errno == EINVAL)
653169695Skan	    cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
654169695Skan		       "conversion from %s to %s not supported by iconv",
655169695Skan		       from, to);
656169695Skan	  else
657169695Skan	    cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
658169695Skan
659169695Skan	  ret.func = convert_no_conversion;
660169695Skan	}
661169695Skan    }
662169695Skan  else
663169695Skan    {
664169695Skan      cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
665169695Skan		 "no iconv implementation, cannot convert from %s to %s",
666169695Skan		 from, to);
667169695Skan      ret.func = convert_no_conversion;
668169695Skan      ret.cd = (iconv_t) -1;
669169695Skan    }
670169695Skan  return ret;
671169695Skan}
672169695Skan
673169695Skan/* If charset conversion is requested, initialize iconv(3) descriptors
674169695Skan   for conversion from the source character set to the execution
675169695Skan   character sets.  If iconv is not present in the C library, and
676169695Skan   conversion is requested, issue an error.  */
677169695Skan
678169695Skanvoid
679169695Skancpp_init_iconv (cpp_reader *pfile)
680169695Skan{
681169695Skan  const char *ncset = CPP_OPTION (pfile, narrow_charset);
682169695Skan  const char *wcset = CPP_OPTION (pfile, wide_charset);
683169695Skan  const char *default_wcset;
684169695Skan
685169695Skan  bool be = CPP_OPTION (pfile, bytes_big_endian);
686169695Skan
687169695Skan  if (CPP_OPTION (pfile, wchar_precision) >= 32)
688169695Skan    default_wcset = be ? "UTF-32BE" : "UTF-32LE";
689169695Skan  else if (CPP_OPTION (pfile, wchar_precision) >= 16)
690169695Skan    default_wcset = be ? "UTF-16BE" : "UTF-16LE";
691169695Skan  else
692169695Skan    /* This effectively means that wide strings are not supported,
693169695Skan       so don't do any conversion at all.  */
694169695Skan   default_wcset = SOURCE_CHARSET;
695169695Skan
696169695Skan  if (!ncset)
697169695Skan    ncset = SOURCE_CHARSET;
698169695Skan  if (!wcset)
699169695Skan    wcset = default_wcset;
700169695Skan
701169695Skan  pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
702169695Skan  pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
703169695Skan}
704169695Skan
705169695Skan/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary.  */
706169695Skanvoid
707169695Skan_cpp_destroy_iconv (cpp_reader *pfile)
708169695Skan{
709169695Skan  if (HAVE_ICONV)
710169695Skan    {
711169695Skan      if (pfile->narrow_cset_desc.func == convert_using_iconv)
712169695Skan	iconv_close (pfile->narrow_cset_desc.cd);
713169695Skan      if (pfile->wide_cset_desc.func == convert_using_iconv)
714169695Skan	iconv_close (pfile->wide_cset_desc.cd);
715169695Skan    }
716169695Skan}
717169695Skan
718169695Skan/* Utility routine for use by a full compiler.  C is a character taken
719169695Skan   from the *basic* source character set, encoded in the host's
720169695Skan   execution encoding.  Convert it to (the target's) execution
721169695Skan   encoding, and return that value.
722169695Skan
723169695Skan   Issues an internal error if C's representation in the narrow
724169695Skan   execution character set fails to be a single-byte value (C99
725169695Skan   5.2.1p3: "The representation of each member of the source and
726169695Skan   execution character sets shall fit in a byte.")  May also issue an
727169695Skan   internal error if C fails to be a member of the basic source
728169695Skan   character set (testing this exactly is too hard, especially when
729169695Skan   the host character set is EBCDIC).  */
730169695Skancppchar_t
731169695Skancpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
732169695Skan{
733169695Skan  uchar sbuf[1];
734169695Skan  struct _cpp_strbuf tbuf;
735169695Skan
736169695Skan  /* This test is merely an approximation, but it suffices to catch
737169695Skan     the most important thing, which is that we don't get handed a
738169695Skan     character outside the unibyte range of the host character set.  */
739169695Skan  if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
740169695Skan    {
741169695Skan      cpp_error (pfile, CPP_DL_ICE,
742169695Skan		 "character 0x%lx is not in the basic source character set\n",
743169695Skan		 (unsigned long)c);
744169695Skan      return 0;
745169695Skan    }
746169695Skan
747169695Skan  /* Being a character in the unibyte range of the host character set,
748169695Skan     we can safely splat it into a one-byte buffer and trust that that
749169695Skan     is a well-formed string.  */
750169695Skan  sbuf[0] = c;
751169695Skan
752169695Skan  /* This should never need to reallocate, but just in case... */
753169695Skan  tbuf.asize = 1;
754169695Skan  tbuf.text = XNEWVEC (uchar, tbuf.asize);
755169695Skan  tbuf.len = 0;
756169695Skan
757169695Skan  if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
758169695Skan    {
759169695Skan      cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
760169695Skan      return 0;
761169695Skan    }
762169695Skan  if (tbuf.len != 1)
763169695Skan    {
764169695Skan      cpp_error (pfile, CPP_DL_ICE,
765169695Skan		 "character 0x%lx is not unibyte in execution character set",
766169695Skan		 (unsigned long)c);
767169695Skan      return 0;
768169695Skan    }
769169695Skan  c = tbuf.text[0];
770169695Skan  free(tbuf.text);
771169695Skan  return c;
772169695Skan}
773169695Skan
774169695Skan
775169695Skan
776169695Skan/* Utility routine that computes a mask of the form 0000...111... with
777169695Skan   WIDTH 1-bits.  */
778169695Skanstatic inline size_t
779169695Skanwidth_to_mask (size_t width)
780169695Skan{
781169695Skan  width = MIN (width, BITS_PER_CPPCHAR_T);
782169695Skan  if (width >= CHAR_BIT * sizeof (size_t))
783169695Skan    return ~(size_t) 0;
784169695Skan  else
785169695Skan    return ((size_t) 1 << width) - 1;
786169695Skan}
787169695Skan
/* A large table of unicode character information.

   The enumerators below are single-bit flags; they are combined in
   the `flags' member of each ucnranges entry and tested by
   ucn_valid_in_identifier.  */
enum {
  /* Valid in a C99 identifier?  */
  C99 = 1,
  /* Valid in a C99 identifier, but not as the first character?  */
  DIG = 2,
  /* Valid in a C++ identifier?  */
  CXX = 4,
  /* NFC representation is not valid in an identifier?  */
  CID = 8,
  /* Might be valid NFC form?  */
  NFC = 16,
  /* Might be valid NFKC form?  */
  NKC = 32,
  /* Certain preceding characters might make it not valid NFC/NFKC form?  */
  CTX = 64
};
805169695Skan
/* Table of character ranges, with its entries supplied by ucnid.h.
   Each entry describes every character up to and including END that
   is not covered by an earlier entry.  ucn_valid_in_identifier does a
   binary search on END, so the entries are presumably stored in
   ascending order of END (verify against how ucnid.h is generated).
   END is 16 bits wide; the lookup rejects characters above 0xFFFF
   before consulting the table.  */
static const struct {
  /* Bitmap of flags above.  */
  unsigned char flags;
  /* Combining class of the character.  Compared against the previous
     character's class when tracking normalization state.  */
  unsigned char combine;
  /* Last character in the range described by this entry.  */
  unsigned short end;
} ucnranges[] = {
#include "ucnid.h"
};
816169695Skan
/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
   the start of an identifier, and 0 if C is not valid in an
   identifier.  (2 is only returned when the c99 option is set and the
   table entry for C carries the DIG flag.)  We assume C has already
   gone through the checks of _cpp_valid_ucn.  Also update NST for C
   if returning nonzero.  The algorithm is a simple binary search on
   the table defined in ucnid.h.  */

static int
ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
			 struct normalize_state *nst)
{
  int mn, mx, md;

  /* The table's END field is 16 bits wide, so nothing above 0xFFFF
     can be looked up; such characters are rejected outright.  */
  if (c > 0xFFFF)
    return 0;

  /* Binary search for the first entry whose END is >= C.  On exit
     MN == MX indexes the entry describing C.  */
  mn = 0;
  mx = ARRAY_SIZE (ucnranges) - 1;
  while (mx != mn)
    {
      md = (mn + mx) / 2;
      if (c <= ucnranges[md].end)
	mx = md;
      else
	mn = md + 1;
    }

  /* When -pedantic, we require the character to have been listed by
     the standard for the current language.  Otherwise, we accept the
     union of the acceptable sets for C++98 and C99.  */
  if (! (ucnranges[mn].flags & (C99 | CXX)))
      return 0;

  if (CPP_PEDANTIC (pfile)
      && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
	  || (CPP_OPTION (pfile, cplusplus)
	      && !(ucnranges[mn].flags & CXX))))
    return 0;

  /* Update NST.  A nonzero combining class lower than the previous
     character's class drops the normalization level to
     normalized_none.  */
  if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
    nst->level = normalized_none;
  else if (ucnranges[mn].flags & CTX)
    {
      /* Context-sensitive characters: whether C is acceptable depends
	 on the previous character P.  */
      bool safe;
      cppchar_t p = nst->previous;

      /* Easy cases from Bengali, Oriya, Tamil, Kannada, and Malayalam.  */
      if (c == 0x09BE)
	safe = p != 0x09C7;  /* Use 09CB instead of 09C7 09BE.  */
      else if (c == 0x0B3E)
	safe = p != 0x0B47;  /* Use 0B4B instead of 0B47 0B3E.  */
      else if (c == 0x0BBE)
	safe = p != 0x0BC6 && p != 0x0BC7;  /* Use 0BCA/0BCB instead.  */
      else if (c == 0x0CC2)
	safe = p != 0x0CC6;  /* Use 0CCA instead of 0CC6 0CC2.  */
      else if (c == 0x0D3E)
	safe = p != 0x0D46 && p != 0x0D47;  /* Use 0D4A/0D4B instead.  */
      /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
	 and are combined algorithmically from a sequence of the form
	 1100-1112 1161-1175 11A8-11C2
	 (if the third is not present, it is treated as 11A7, which is not
	 really a valid character).
	 Unfortunately, C99 allows (only) the NFC form, but C++ allows
	 only the combining characters.  */
      else if (c >= 0x1161 && c <= 0x1175)
	safe = p < 0x1100 || p > 0x1112;
      else if (c >= 0x11A8 && c <= 0x11C2)
	safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
      else
	{
	  /* Uh-oh, someone updated ucnid.h without updating this code.  */
	  cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
	  safe = true;
	}
      /* All of the Indic cases above lie below 0x1161, so this first
	 branch handles them; the second branch handles the Hangul
	 cases, which only raise the level to at least
	 normalized_identifier_C.  */
      if (!safe && c < 0x1161)
	nst->level = normalized_none;
      else if (!safe)
	nst->level = MAX (nst->level, normalized_identifier_C);
    }
  else if (ucnranges[mn].flags & NKC)
    /* NKC flag set: the level is left unchanged.  */
    ;
  else if (ucnranges[mn].flags & NFC)
    nst->level = MAX (nst->level, normalized_C);
  else if (ucnranges[mn].flags & CID)
    nst->level = MAX (nst->level, normalized_identifier_C);
  else
    nst->level = normalized_none;
  /* Remember this character for the context checks on the next one.  */
  nst->previous = c;
  nst->prev_class = ucnranges[mn].combine;

  /* In C99, UCN digits may not begin identifiers.  */
  if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
    return 2;

  return 1;
}
914169695Skan
/* [lex.charset]: The character designated by the universal character
   name \UNNNNNNNN is that character whose character short name in
   ISO/IEC 10646 is NNNNNNNN; the character designated by the
   universal character name \uNNNN is that character whose character
   short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
   for a universal character name is less than 0x20 or in the range
   0x7F-0x9F (inclusive), or if the universal character name
   designates a character in the basic source character set, then the
   program is ill-formed.

   *PSTR must be preceded by "\u" or "\U"; it is assumed that the
   buffer end is delimited by a non-hex digit.  Returns zero if the
   UCN has not been consumed.

   Otherwise the nonzero value of the UCN, whether valid or invalid,
   is returned.  Diagnostics are emitted for invalid values.  PSTR
   is updated to point one beyond the UCN, or to the syntactically
   invalid character.

   IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
   an identifier, or 2 otherwise.  */

cppchar_t
_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
		const uchar *limit, int identifier_pos,
		struct normalize_state *nst)
{
  cppchar_t result, c;
  unsigned int length;
  const uchar *str = *pstr;
  /* Start of the text quoted in diagnostics: two bytes before the
     first digit, which per the contract above is the backslash.  */
  const uchar *base = str - 2;

  if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
    cpp_error (pfile, CPP_DL_WARNING,
	       "universal character names are only valid in C++ and C99");
  else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
    cpp_error (pfile, CPP_DL_WARNING,
	       "the meaning of '\\%c' is different in traditional C",
	       (int) str[-1]);

  /* \u takes four hex digits, \U takes eight.  */
  if (str[-1] == 'u')
    length = 4;
  else if (str[-1] == 'U')
    length = 8;
  else
    {
      cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
      length = 4;
    }

  /* Accumulate up to LENGTH hex digits.  The first byte is read
     without comparing STR against LIMIT; this relies on the non-hex
     delimiter mentioned in the header comment.  LENGTH counts the
     digits still missing when the loop ends.  */
  result = 0;
  do
    {
      c = *str;
      if (!ISXDIGIT (c))
	break;
      str++;
      result = (result << 4) + hex_value (c);
    }
  while (--length && str < limit);

  /* Partial UCNs are not valid in strings, but decompose into
     multiple tokens in identifiers, so we can't give a helpful
     error message in that case.  */
  if (length && identifier_pos)
    return 0;

  *pstr = str;
  if (length)
    {
      cpp_error (pfile, CPP_DL_ERROR,
		 "incomplete universal character name %.*s",
		 (int) (str - base), base);
      /* Any nonzero value will do: zero would mean "not consumed".  */
      result = 1;
    }
  /* The standard permits $, @ and ` to be specified as UCNs.  We use
     hex escapes so that this also works with EBCDIC hosts.  */
  else if ((result < 0xa0
	    && (result != 0x24 && result != 0x40 && result != 0x60))
	   || (result & 0x80000000)
	   || (result >= 0xD800 && result <= 0xDFFF))
    {
      cpp_error (pfile, CPP_DL_ERROR,
		 "%.*s is not a valid universal character",
		 (int) (str - base), base);
      result = 1;
    }
  else if (identifier_pos && result == 0x24
	   && CPP_OPTION (pfile, dollars_in_ident))
    {
      /* '$' spelled as a UCN inside an identifier.  The pedantic
	 warning is issued at most once: the option is cleared before
	 the diagnostic is emitted.  */
      if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
	{
	  CPP_OPTION (pfile, warn_dollars) = 0;
	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
	}
      NORMALIZE_STATE_UPDATE_IDNUM (nst);
    }
  else if (identifier_pos)
    {
      /* Check identifier validity; this also updates NST.  */
      int validity = ucn_valid_in_identifier (pfile, result, nst);

      if (validity == 0)
	cpp_error (pfile, CPP_DL_ERROR,
		   "universal character %.*s is not valid in an identifier",
		   (int) (str - base), base);
      else if (validity == 2 && identifier_pos == 1)
	cpp_error (pfile, CPP_DL_ERROR,
   "universal character %.*s is not valid at the start of an identifier",
		   (int) (str - base), base);
    }

  /* Zero is reserved for "UCN not consumed", so never return it from
     this point.  */
  if (result == 0)
    result = 1;

  return result;
}
1031169695Skan
1032169695Skan/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
1033169695Skan   it to the execution character set and write the result into TBUF.
1034169695Skan   An advanced pointer is returned.  Issues all relevant diagnostics.  */
1035169695Skanstatic const uchar *
1036169695Skanconvert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
1037169695Skan	     struct _cpp_strbuf *tbuf, bool wide)
1038169695Skan{
1039169695Skan  cppchar_t ucn;
1040169695Skan  uchar buf[6];
1041169695Skan  uchar *bufp = buf;
1042169695Skan  size_t bytesleft = 6;
1043169695Skan  int rval;
1044169695Skan  struct cset_converter cvt
1045169695Skan    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1046169695Skan  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1047169695Skan
1048169695Skan  from++;  /* Skip u/U.  */
1049169695Skan  ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
1050169695Skan
1051169695Skan  rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
1052169695Skan  if (rval)
1053169695Skan    {
1054169695Skan      errno = rval;
1055169695Skan      cpp_errno (pfile, CPP_DL_ERROR,
1056169695Skan		 "converting UCN to source character set");
1057169695Skan    }
1058169695Skan  else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
1059169695Skan    cpp_errno (pfile, CPP_DL_ERROR,
1060169695Skan	       "converting UCN to execution character set");
1061169695Skan
1062169695Skan  return from;
1063169695Skan}
1064169695Skan
1065169695Skan/* Subroutine of convert_hex and convert_oct.  N is the representation
1066169695Skan   in the execution character set of a numeric escape; write it into the
1067169695Skan   string buffer TBUF and update the end-of-string pointer therein.  WIDE
1068169695Skan   is true if it's a wide string that's being assembled in TBUF.  This
1069169695Skan   function issues no diagnostics and never fails.  */
1070169695Skanstatic void
1071169695Skanemit_numeric_escape (cpp_reader *pfile, cppchar_t n,
1072169695Skan		     struct _cpp_strbuf *tbuf, bool wide)
1073169695Skan{
1074169695Skan  if (wide)
1075169695Skan    {
1076169695Skan      /* We have to render this into the target byte order, which may not
1077169695Skan	 be our byte order.  */
1078169695Skan      bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1079169695Skan      size_t width = CPP_OPTION (pfile, wchar_precision);
1080169695Skan      size_t cwidth = CPP_OPTION (pfile, char_precision);
1081169695Skan      size_t cmask = width_to_mask (cwidth);
1082169695Skan      size_t nbwc = width / cwidth;
1083169695Skan      size_t i;
1084169695Skan      size_t off = tbuf->len;
1085169695Skan      cppchar_t c;
1086169695Skan
1087169695Skan      if (tbuf->len + nbwc > tbuf->asize)
1088169695Skan	{
1089169695Skan	  tbuf->asize += OUTBUF_BLOCK_SIZE;
1090169695Skan	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1091169695Skan	}
1092169695Skan
1093169695Skan      for (i = 0; i < nbwc; i++)
1094169695Skan	{
1095169695Skan	  c = n & cmask;
1096169695Skan	  n >>= cwidth;
1097169695Skan	  tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
1098169695Skan	}
1099169695Skan      tbuf->len += nbwc;
1100169695Skan    }
1101169695Skan  else
1102169695Skan    {
1103169695Skan      /* Note: this code does not handle the case where the target
1104169695Skan	 and host have a different number of bits in a byte.  */
1105169695Skan      if (tbuf->len + 1 > tbuf->asize)
1106169695Skan	{
1107169695Skan	  tbuf->asize += OUTBUF_BLOCK_SIZE;
1108169695Skan	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1109169695Skan	}
1110169695Skan      tbuf->text[tbuf->len++] = n;
1111169695Skan    }
1112169695Skan}
1113169695Skan
1114169695Skan/* Convert a hexadecimal escape, pointed to by FROM, to the execution
1115169695Skan   character set and write it into the string buffer TBUF.  Returns an
1116169695Skan   advanced pointer, and issues diagnostics as necessary.
1117169695Skan   No character set translation occurs; this routine always produces the
1118169695Skan   execution-set character with numeric value equal to the given hex
1119169695Skan   number.  You can, e.g. generate surrogate pairs this way.  */
1120169695Skanstatic const uchar *
1121169695Skanconvert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
1122169695Skan	     struct _cpp_strbuf *tbuf, bool wide)
1123169695Skan{
1124169695Skan  cppchar_t c, n = 0, overflow = 0;
1125169695Skan  int digits_found = 0;
1126169695Skan  size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1127169695Skan		  : CPP_OPTION (pfile, char_precision));
1128169695Skan  size_t mask = width_to_mask (width);
1129169695Skan
1130169695Skan  if (CPP_WTRADITIONAL (pfile))
1131169695Skan    cpp_error (pfile, CPP_DL_WARNING,
1132169695Skan	       "the meaning of '\\x' is different in traditional C");
1133169695Skan
1134169695Skan  from++;  /* Skip 'x'.  */
1135169695Skan  while (from < limit)
1136169695Skan    {
1137169695Skan      c = *from;
1138169695Skan      if (! hex_p (c))
1139169695Skan	break;
1140169695Skan      from++;
1141169695Skan      overflow |= n ^ (n << 4 >> 4);
1142169695Skan      n = (n << 4) + hex_value (c);
1143169695Skan      digits_found = 1;
1144169695Skan    }
1145169695Skan
1146169695Skan  if (!digits_found)
1147169695Skan    {
1148169695Skan      cpp_error (pfile, CPP_DL_ERROR,
1149169695Skan		 "\\x used with no following hex digits");
1150169695Skan      return from;
1151169695Skan    }
1152169695Skan
1153169695Skan  if (overflow | (n != (n & mask)))
1154169695Skan    {
1155169695Skan      cpp_error (pfile, CPP_DL_PEDWARN,
1156169695Skan		 "hex escape sequence out of range");
1157169695Skan      n &= mask;
1158169695Skan    }
1159169695Skan
1160169695Skan  emit_numeric_escape (pfile, n, tbuf, wide);
1161169695Skan
1162169695Skan  return from;
1163169695Skan}
1164169695Skan
1165169695Skan/* Convert an octal escape, pointed to by FROM, to the execution
1166169695Skan   character set and write it into the string buffer TBUF.  Returns an
1167169695Skan   advanced pointer, and issues diagnostics as necessary.
1168169695Skan   No character set translation occurs; this routine always produces the
1169169695Skan   execution-set character with numeric value equal to the given octal
1170169695Skan   number.  */
1171169695Skanstatic const uchar *
1172169695Skanconvert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
1173169695Skan	     struct _cpp_strbuf *tbuf, bool wide)
1174169695Skan{
1175169695Skan  size_t count = 0;
1176169695Skan  cppchar_t c, n = 0;
1177169695Skan  size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1178169695Skan		  : CPP_OPTION (pfile, char_precision));
1179169695Skan  size_t mask = width_to_mask (width);
1180169695Skan  bool overflow = false;
1181169695Skan
1182169695Skan  while (from < limit && count++ < 3)
1183169695Skan    {
1184169695Skan      c = *from;
1185169695Skan      if (c < '0' || c > '7')
1186169695Skan	break;
1187169695Skan      from++;
1188169695Skan      overflow |= n ^ (n << 3 >> 3);
1189169695Skan      n = (n << 3) + c - '0';
1190169695Skan    }
1191169695Skan
1192169695Skan  if (n != (n & mask))
1193169695Skan    {
1194169695Skan      cpp_error (pfile, CPP_DL_PEDWARN,
1195169695Skan		 "octal escape sequence out of range");
1196169695Skan      n &= mask;
1197169695Skan    }
1198169695Skan
1199169695Skan  emit_numeric_escape (pfile, n, tbuf, wide);
1200169695Skan
1201169695Skan  return from;
1202169695Skan}
1203169695Skan
1204169695Skan/* Convert an escape sequence (pointed to by FROM) to its value on
1205169695Skan   the target, and to the execution character set.  Do not scan past
1206169695Skan   LIMIT.  Write the converted value into TBUF.  Returns an advanced
1207169695Skan   pointer.  Handles all relevant diagnostics.  */
1208169695Skanstatic const uchar *
1209169695Skanconvert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
1210169695Skan		struct _cpp_strbuf *tbuf, bool wide)
1211169695Skan{
1212169695Skan  /* Values of \a \b \e \f \n \r \t \v respectively.  */
1213169695Skan#if HOST_CHARSET == HOST_CHARSET_ASCII
1214169695Skan  static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
1215169695Skan#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1216169695Skan  static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
1217169695Skan#else
1218169695Skan#error "unknown host character set"
1219169695Skan#endif
1220169695Skan
1221169695Skan  uchar c;
1222169695Skan  struct cset_converter cvt
1223169695Skan    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1224169695Skan
1225169695Skan  c = *from;
1226169695Skan  switch (c)
1227169695Skan    {
1228169695Skan      /* UCNs, hex escapes, and octal escapes are processed separately.  */
1229169695Skan    case 'u': case 'U':
1230169695Skan      return convert_ucn (pfile, from, limit, tbuf, wide);
1231169695Skan
1232169695Skan    case 'x':
1233169695Skan      return convert_hex (pfile, from, limit, tbuf, wide);
1234169695Skan      break;
1235169695Skan
1236169695Skan    case '0':  case '1':  case '2':  case '3':
1237169695Skan    case '4':  case '5':  case '6':  case '7':
1238169695Skan      return convert_oct (pfile, from, limit, tbuf, wide);
1239169695Skan
1240169695Skan      /* Various letter escapes.  Get the appropriate host-charset
1241169695Skan	 value into C.  */
1242169695Skan    case '\\': case '\'': case '"': case '?': break;
1243169695Skan
1244169695Skan    case '(': case '{': case '[': case '%':
1245169695Skan      /* '\(', etc, can be used at the beginning of a line in a long
1246169695Skan	 string split onto multiple lines with \-newline, to prevent
1247169695Skan	 Emacs or other text editors from getting confused.  '\%' can
1248169695Skan	 be used to prevent SCCS from mangling printf format strings.  */
1249169695Skan      if (CPP_PEDANTIC (pfile))
1250169695Skan	goto unknown;
1251169695Skan      break;
1252169695Skan
1253169695Skan    case 'b': c = charconsts[1];  break;
1254169695Skan    case 'f': c = charconsts[3];  break;
1255169695Skan    case 'n': c = charconsts[4];  break;
1256169695Skan    case 'r': c = charconsts[5];  break;
1257169695Skan    case 't': c = charconsts[6];  break;
1258169695Skan    case 'v': c = charconsts[7];  break;
1259169695Skan
1260169695Skan    case 'a':
1261169695Skan      if (CPP_WTRADITIONAL (pfile))
1262169695Skan	cpp_error (pfile, CPP_DL_WARNING,
1263169695Skan		   "the meaning of '\\a' is different in traditional C");
1264169695Skan      c = charconsts[0];
1265169695Skan      break;
1266169695Skan
1267169695Skan    case 'e': case 'E':
1268169695Skan      if (CPP_PEDANTIC (pfile))
1269169695Skan	cpp_error (pfile, CPP_DL_PEDWARN,
1270169695Skan		   "non-ISO-standard escape sequence, '\\%c'", (int) c);
1271169695Skan      c = charconsts[2];
1272169695Skan      break;
1273169695Skan
1274169695Skan    default:
1275169695Skan    unknown:
1276169695Skan      if (ISGRAPH (c))
1277169695Skan	cpp_error (pfile, CPP_DL_PEDWARN,
1278169695Skan		   "unknown escape sequence '\\%c'", (int) c);
1279169695Skan      else
1280169695Skan	{
1281169695Skan	  /* diagnostic.c does not support "%03o".  When it does, this
1282169695Skan	     code can use %03o directly in the diagnostic again.  */
1283169695Skan	  char buf[32];
1284169695Skan	  sprintf(buf, "%03o", (int) c);
1285169695Skan	  cpp_error (pfile, CPP_DL_PEDWARN,
1286169695Skan		     "unknown escape sequence: '\\%s'", buf);
1287169695Skan	}
1288169695Skan    }
1289169695Skan
1290169695Skan  /* Now convert what we have to the execution character set.  */
1291169695Skan  if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1292169695Skan    cpp_errno (pfile, CPP_DL_ERROR,
1293169695Skan	       "converting escape sequence to execution character set");
1294169695Skan
1295169695Skan  return from + 1;
1296169695Skan}
1297169695Skan
1298169695Skan/* FROM is an array of cpp_string structures of length COUNT.  These
1299169695Skan   are to be converted from the source to the execution character set,
1300169695Skan   escape sequences translated, and finally all are to be
1301169695Skan   concatenated.  WIDE indicates whether or not to produce a wide
1302169695Skan   string.  The result is written into TO.  Returns true for success,
1303169695Skan   false for failure.  */
1304169695Skanbool
1305169695Skancpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
1306169695Skan		      cpp_string *to, bool wide)
1307169695Skan{
1308169695Skan  struct _cpp_strbuf tbuf;
1309169695Skan  const uchar *p, *base, *limit;
1310169695Skan  size_t i;
1311169695Skan  struct cset_converter cvt
1312169695Skan    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1313169695Skan
1314169695Skan  tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1315169695Skan  tbuf.text = XNEWVEC (uchar, tbuf.asize);
1316169695Skan  tbuf.len = 0;
1317169695Skan
1318169695Skan  for (i = 0; i < count; i++)
1319169695Skan    {
1320169695Skan      p = from[i].text;
1321169695Skan      if (*p == 'L') p++;
1322169695Skan      p++; /* Skip leading quote.  */
1323169695Skan      limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */
1324169695Skan
1325169695Skan      for (;;)
1326169695Skan	{
1327169695Skan	  base = p;
1328169695Skan	  while (p < limit && *p != '\\')
1329169695Skan	    p++;
1330169695Skan	  if (p > base)
1331169695Skan	    {
1332169695Skan	      /* We have a run of normal characters; these can be fed
1333169695Skan		 directly to convert_cset.  */
1334169695Skan	      if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1335169695Skan		goto fail;
1336169695Skan	    }
1337169695Skan	  if (p == limit)
1338169695Skan	    break;
1339169695Skan
1340169695Skan	  p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
1341169695Skan	}
1342169695Skan    }
1343169695Skan  /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1344169695Skan     structure.  */
1345169695Skan  emit_numeric_escape (pfile, 0, &tbuf, wide);
1346169695Skan  tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
1347169695Skan  to->text = tbuf.text;
1348169695Skan  to->len = tbuf.len;
1349169695Skan  return true;
1350169695Skan
1351169695Skan fail:
1352169695Skan  cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1353169695Skan  free (tbuf.text);
1354169695Skan  return false;
1355169695Skan}
1356169695Skan
1357169695Skan/* Subroutine of do_line and do_linemarker.  Convert escape sequences
1358169695Skan   in a string, but do not perform character set conversion.  */
1359169695Skanbool
1360169695Skancpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
1361169695Skan				  size_t count,	cpp_string *to, bool wide)
1362169695Skan{
1363169695Skan  struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
1364169695Skan  bool retval;
1365169695Skan
1366169695Skan  pfile->narrow_cset_desc.func = convert_no_conversion;
1367169695Skan  pfile->narrow_cset_desc.cd = (iconv_t) -1;
1368169695Skan
1369169695Skan  retval = cpp_interpret_string (pfile, from, count, to, wide);
1370169695Skan
1371169695Skan  pfile->narrow_cset_desc = save_narrow_cset_desc;
1372169695Skan  return retval;
1373169695Skan}
1374169695Skan
1375169695Skan
1376169695Skan/* Subroutine of cpp_interpret_charconst which performs the conversion
1377169695Skan   to a number, for narrow strings.  STR is the string structure returned
1378169695Skan   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1379169695Skan   cpp_interpret_charconst.  */
1380169695Skanstatic cppchar_t
1381169695Skannarrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
1382169695Skan			 unsigned int *pchars_seen, int *unsignedp)
1383169695Skan{
1384169695Skan  size_t width = CPP_OPTION (pfile, char_precision);
1385169695Skan  size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
1386169695Skan  size_t mask = width_to_mask (width);
1387169695Skan  size_t i;
1388169695Skan  cppchar_t result, c;
1389169695Skan  bool unsigned_p;
1390169695Skan
1391169695Skan  /* The value of a multi-character character constant, or a
1392169695Skan     single-character character constant whose representation in the
1393169695Skan     execution character set is more than one byte long, is
1394169695Skan     implementation defined.  This implementation defines it to be the
1395169695Skan     number formed by interpreting the byte sequence in memory as a
1396169695Skan     big-endian binary number.  If overflow occurs, the high bytes are
1397169695Skan     lost, and a warning is issued.
1398169695Skan
1399169695Skan     We don't want to process the NUL terminator handed back by
1400169695Skan     cpp_interpret_string.  */
1401169695Skan  result = 0;
1402169695Skan  for (i = 0; i < str.len - 1; i++)
1403169695Skan    {
1404169695Skan      c = str.text[i] & mask;
1405169695Skan      if (width < BITS_PER_CPPCHAR_T)
1406169695Skan	result = (result << width) | c;
1407169695Skan      else
1408169695Skan	result = c;
1409169695Skan    }
1410169695Skan
1411169695Skan  if (i > max_chars)
1412169695Skan    {
1413169695Skan      i = max_chars;
1414169695Skan      cpp_error (pfile, CPP_DL_WARNING,
1415169695Skan		 "character constant too long for its type");
1416169695Skan    }
1417169695Skan  else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
1418169695Skan    cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
1419169695Skan
1420169695Skan  /* Multichar constants are of type int and therefore signed.  */
1421169695Skan  if (i > 1)
1422169695Skan    unsigned_p = 0;
1423169695Skan  else
1424169695Skan    unsigned_p = CPP_OPTION (pfile, unsigned_char);
1425169695Skan
1426169695Skan  /* Truncate the constant to its natural width, and simultaneously
1427169695Skan     sign- or zero-extend to the full width of cppchar_t.
1428169695Skan     For single-character constants, the value is WIDTH bits wide.
1429169695Skan     For multi-character constants, the value is INT_PRECISION bits wide.  */
1430169695Skan  if (i > 1)
1431169695Skan    width = CPP_OPTION (pfile, int_precision);
1432169695Skan  if (width < BITS_PER_CPPCHAR_T)
1433169695Skan    {
1434169695Skan      mask = ((cppchar_t) 1 << width) - 1;
1435169695Skan      if (unsigned_p || !(result & (1 << (width - 1))))
1436169695Skan	result &= mask;
1437169695Skan      else
1438169695Skan	result |= ~mask;
1439169695Skan    }
1440169695Skan  *pchars_seen = i;
1441169695Skan  *unsignedp = unsigned_p;
1442169695Skan  return result;
1443169695Skan}
1444169695Skan
1445169695Skan/* Subroutine of cpp_interpret_charconst which performs the conversion
1446169695Skan   to a number, for wide strings.  STR is the string structure returned
1447169695Skan   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1448169695Skan   cpp_interpret_charconst.  */
1449169695Skanstatic cppchar_t
1450169695Skanwide_str_to_charconst (cpp_reader *pfile, cpp_string str,
1451169695Skan		       unsigned int *pchars_seen, int *unsignedp)
1452169695Skan{
1453169695Skan  bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1454169695Skan  size_t width = CPP_OPTION (pfile, wchar_precision);
1455169695Skan  size_t cwidth = CPP_OPTION (pfile, char_precision);
1456169695Skan  size_t mask = width_to_mask (width);
1457169695Skan  size_t cmask = width_to_mask (cwidth);
1458169695Skan  size_t nbwc = width / cwidth;
1459169695Skan  size_t off, i;
1460169695Skan  cppchar_t result = 0, c;
1461169695Skan
1462169695Skan  /* This is finicky because the string is in the target's byte order,
1463169695Skan     which may not be our byte order.  Only the last character, ignoring
1464169695Skan     the NUL terminator, is relevant.  */
1465169695Skan  off = str.len - (nbwc * 2);
1466169695Skan  result = 0;
1467169695Skan  for (i = 0; i < nbwc; i++)
1468169695Skan    {
1469169695Skan      c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
1470169695Skan      result = (result << cwidth) | (c & cmask);
1471169695Skan    }
1472169695Skan
1473169695Skan  /* Wide character constants have type wchar_t, and a single
1474169695Skan     character exactly fills a wchar_t, so a multi-character wide
1475169695Skan     character constant is guaranteed to overflow.  */
1476169695Skan  if (off > 0)
1477169695Skan    cpp_error (pfile, CPP_DL_WARNING,
1478169695Skan	       "character constant too long for its type");
1479169695Skan
1480169695Skan  /* Truncate the constant to its natural width, and simultaneously
1481169695Skan     sign- or zero-extend to the full width of cppchar_t.  */
1482169695Skan  if (width < BITS_PER_CPPCHAR_T)
1483169695Skan    {
1484169695Skan      if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
1485169695Skan	result &= mask;
1486169695Skan      else
1487169695Skan	result |= ~mask;
1488169695Skan    }
1489169695Skan
1490169695Skan  *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
1491169695Skan  *pchars_seen = 1;
1492169695Skan  return result;
1493169695Skan}
1494169695Skan
1495169695Skan/* Interpret a (possibly wide) character constant in TOKEN.
1496169695Skan   PCHARS_SEEN points to a variable that is filled in with the number
1497169695Skan   of characters seen, and UNSIGNEDP to a variable that indicates
1498169695Skan   whether the result has signed type.  */
1499169695Skancppchar_t
1500169695Skancpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
1501169695Skan			 unsigned int *pchars_seen, int *unsignedp)
1502169695Skan{
1503169695Skan  cpp_string str = { 0, 0 };
1504169695Skan  bool wide = (token->type == CPP_WCHAR);
1505169695Skan  cppchar_t result;
1506169695Skan
1507169695Skan  /* an empty constant will appear as L'' or '' */
1508169695Skan  if (token->val.str.len == (size_t) (2 + wide))
1509169695Skan    {
1510169695Skan      cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
1511169695Skan      return 0;
1512169695Skan    }
1513169695Skan  else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
1514169695Skan    return 0;
1515169695Skan
1516169695Skan  if (wide)
1517169695Skan    result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1518169695Skan  else
1519169695Skan    result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1520169695Skan
1521169695Skan  if (str.text != token->val.str.text)
1522169695Skan    free ((void *)str.text);
1523169695Skan
1524169695Skan  return result;
1525169695Skan}
1526169695Skan
1527169695Skan/* Convert an identifier denoted by ID and LEN, which might contain
1528169695Skan   UCN escapes, to the source character set, either UTF-8 or
1529169695Skan   UTF-EBCDIC.  Assumes that the identifier is actually a valid identifier.  */
1530169695Skancpp_hashnode *
1531169695Skan_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
1532169695Skan{
1533169695Skan  /* It turns out that a UCN escape always turns into fewer characters
1534169695Skan     than the escape itself, so we can allocate a temporary in advance.  */
1535169695Skan  uchar * buf = (uchar *) alloca (len + 1);
1536169695Skan  uchar * bufp = buf;
1537169695Skan  size_t idp;
1538169695Skan
1539169695Skan  for (idp = 0; idp < len; idp++)
1540169695Skan    if (id[idp] != '\\')
1541169695Skan      *bufp++ = id[idp];
1542169695Skan    else
1543169695Skan      {
1544169695Skan	unsigned length = id[idp+1] == 'u' ? 4 : 8;
1545169695Skan	cppchar_t value = 0;
1546169695Skan	size_t bufleft = len - (bufp - buf);
1547169695Skan	int rval;
1548169695Skan
1549169695Skan	idp += 2;
1550169695Skan	while (length && idp < len && ISXDIGIT (id[idp]))
1551169695Skan	  {
1552169695Skan	    value = (value << 4) + hex_value (id[idp]);
1553169695Skan	    idp++;
1554169695Skan	    length--;
1555169695Skan	  }
1556169695Skan	idp--;
1557169695Skan
1558169695Skan	/* Special case for EBCDIC: if the identifier contains
1559169695Skan	   a '$' specified using a UCN, translate it to EBCDIC.  */
1560169695Skan	if (value == 0x24)
1561169695Skan	  {
1562169695Skan	    *bufp++ = '$';
1563169695Skan	    continue;
1564169695Skan	  }
1565169695Skan
1566169695Skan	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
1567169695Skan	if (rval)
1568169695Skan	  {
1569169695Skan	    errno = rval;
1570169695Skan	    cpp_errno (pfile, CPP_DL_ERROR,
1571169695Skan		       "converting UCN to source character set");
1572169695Skan	    break;
1573169695Skan	  }
1574169695Skan      }
1575169695Skan
1576169695Skan  return CPP_HASHNODE (ht_lookup (pfile->hash_table,
1577169695Skan				  buf, bufp - buf, HT_ALLOC));
1578169695Skan}
1579169695Skan
1580169695Skan/* Convert an input buffer (containing the complete contents of one
1581169695Skan   source file) from INPUT_CHARSET to the source character set.  INPUT
1582169695Skan   points to the input buffer, SIZE is its allocated size, and LEN is
1583169695Skan   the length of the meaningful data within the buffer.  The
1584169695Skan   translated buffer is returned, and *ST_SIZE is set to the length of
1585169695Skan   the meaningful data within the translated buffer.
1586169695Skan
1587169695Skan   INPUT is expected to have been allocated with xmalloc.  This function
1588169695Skan   will either return INPUT, or free it and return a pointer to another
1589169695Skan   xmalloc-allocated block of memory.  */
1590169695Skanuchar *
1591169695Skan_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
1592169695Skan		    uchar *input, size_t size, size_t len, off_t *st_size)
1593169695Skan{
1594169695Skan  struct cset_converter input_cset;
1595169695Skan  struct _cpp_strbuf to;
1596169695Skan
1597169695Skan  input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
1598169695Skan  if (input_cset.func == convert_no_conversion)
1599169695Skan    {
1600260574Spfg      /* APPLE LOCAL begin UTF-8 BOM 5774975 */
1601260574Spfg      /* Eat the UTF-8 BOM.  */
1602260574Spfg      if (len >= 3
1603260574Spfg	  && input[0] == 0xef
1604260574Spfg	  && input[1] == 0xbb
1605260574Spfg	  && input[2] == 0xbf)
1606260574Spfg	{
1607260574Spfg	  memmove (&input[0], &input[3], size-3);
1608260574Spfg	  len -= 3;
1609260574Spfg	}
1610260574Spfg      /* APPLE LOCAL end UTF-8 BOM 5774975 */
1611169695Skan      to.text = input;
1612169695Skan      to.asize = size;
1613169695Skan      to.len = len;
1614169695Skan    }
1615169695Skan  else
1616169695Skan    {
1617169695Skan      to.asize = MAX (65536, len);
1618169695Skan      to.text = XNEWVEC (uchar, to.asize);
1619169695Skan      to.len = 0;
1620169695Skan
1621169695Skan      if (!APPLY_CONVERSION (input_cset, input, len, &to))
1622169695Skan	cpp_error (pfile, CPP_DL_ERROR,
1623169695Skan		   "failure to convert %s to %s",
1624169695Skan		   CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
1625169695Skan
1626169695Skan      free (input);
1627169695Skan    }
1628169695Skan
1629169695Skan  /* Clean up the mess.  */
1630169695Skan  if (input_cset.func == convert_using_iconv)
1631169695Skan    iconv_close (input_cset.cd);
1632169695Skan
1633169695Skan  /* Resize buffer if we allocated substantially too much, or if we
1634169695Skan     haven't enough space for the \n-terminator.  */
1635169695Skan  if (to.len + 4096 < to.asize || to.len >= to.asize)
1636169695Skan    to.text = XRESIZEVEC (uchar, to.text, to.len + 1);
1637169695Skan
1638169695Skan  /* If the file is using old-school Mac line endings (\r only),
1639169695Skan     terminate with another \r, not an \n, so that we do not mistake
1640169695Skan     the \r\n sequence for a single DOS line ending and erroneously
1641169695Skan     issue the "No newline at end of file" diagnostic.  */
1642259891Spfg  /* APPLE LOCAL don't access to.text[-1] radar 6121572 */
1643259272Spfg  if (to.len > 0 && to.text[to.len - 1] == '\r')
1644169695Skan    to.text[to.len] = '\r';
1645169695Skan  else
1646169695Skan    to.text[to.len] = '\n';
1647169695Skan
1648169695Skan  *st_size = to.len;
1649169695Skan  return to.text;
1650169695Skan}
1651169695Skan
1652169695Skan/* Decide on the default encoding to assume for input files.  */
1653169695Skanconst char *
1654169695Skan_cpp_default_encoding (void)
1655169695Skan{
1656169695Skan  const char *current_encoding = NULL;
1657169695Skan
1658169695Skan  /* We disable this because the default codeset is 7-bit ASCII on
1659169695Skan     most platforms, and this causes conversion failures on every
1660169695Skan     file in GCC that happens to have one of the upper 128 characters
1661169695Skan     in it -- most likely, as part of the name of a contributor.
1662169695Skan     We should definitely recognize in-band markers of file encoding,
1663169695Skan     like:
1664169695Skan     - the appropriate Unicode byte-order mark (FE FF) to recognize
1665169695Skan       UTF16 and UCS4 (in both big-endian and little-endian flavors)
1666169695Skan       and UTF8
1667169695Skan     - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
1668169695Skan       distinguish ASCII and EBCDIC.
1669169695Skan     - now we can parse something like "#pragma GCC encoding <xyz>
1670169695Skan       on the first line, or even Emacs/VIM's mode line tags (there's
1671169695Skan       a problem here in that VIM uses the last line, and Emacs has
1672169695Skan       its more elaborate "local variables" convention).
1673169695Skan     - investigate whether Java has another common convention, which
1674169695Skan       would be friendly to support.
1675169695Skan     (Zack Weinberg and Paolo Bonzini, May 20th 2004)  */
1676169695Skan#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
1677169695Skan  setlocale (LC_CTYPE, "");
1678169695Skan  current_encoding = nl_langinfo (CODESET);
1679169695Skan#endif
1680169695Skan  if (current_encoding == NULL || *current_encoding == '\0')
1681169695Skan    current_encoding = SOURCE_CHARSET;
1682169695Skan
1683169695Skan  return current_encoding;
1684169695Skan}
1685