1/* CPP Library - charsets
2   Copyright (C) 1998-2022 Free Software Foundation, Inc.
3
4   Broken out of c-lex.cc Apr 2003, adding valid C99 UCN ranges.
5
6This program is free software; you can redistribute it and/or modify it
7under the terms of the GNU General Public License as published by the
8Free Software Foundation; either version 3, or (at your option) any
9later version.
10
11This program is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14GNU General Public License for more details.
15
16You should have received a copy of the GNU General Public License
17along with this program; see the file COPYING3.  If not see
18<http://www.gnu.org/licenses/>.  */
19
20#include "config.h"
21#include "system.h"
22#include "cpplib.h"
23#include "internal.h"
24
25/* Character set handling for C-family languages.
26
27   Terminological note: In what follows, "charset" or "character set"
28   will be taken to mean both an abstract set of characters and an
29   encoding for that set.
30
31   The C99 standard discusses two character sets: source and execution.
32   The source character set is used for internal processing in translation
33   phases 1 through 4; the execution character set is used thereafter.
34   Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
35   character encodings (see 3.7.2, 3.7.3 for the standardese meanings
36   of these terms).  Furthermore, the "basic character set" (listed in
37   5.2.1p3) is to be encoded in each with values one byte wide, and is
38   to appear in the initial shift state.
39
40   It is not explicitly mentioned, but there is also a "wide execution
41   character set" used to encode wide character constants and wide
42   string literals; this is supposed to be the result of applying the
43   standard library function mbstowcs() to an equivalent narrow string
44   (6.4.5p5).  However, the behavior of hexadecimal and octal
45   \-escapes is at odds with this; they are supposed to be translated
46   directly to wchar_t values (6.4.4.4p5,6).
47
48   The source character set is not necessarily the character set used
49   to encode physical source files on disk; translation phase 1 converts
50   from whatever that encoding is to the source character set.
51
52   The presence of universal character names in C99 (6.4.3 et seq.)
53   forces the source character set to be isomorphic to ISO 10646,
54   that is, Unicode.  There is no such constraint on the execution
55   character set; note also that the conversion from source to
56   execution character set does not occur for identifiers (5.1.1.2p1#5).
57
58   For convenience of implementation, the source character set's
59   encoding of the basic character set should be identical to the
60   execution character set OF THE HOST SYSTEM's encoding of the basic
61   character set, and it should not be a state-dependent encoding.
62
63   cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
64   depending on whether the host is based on ASCII or EBCDIC (see
65   respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
66   Technical Report #16).  With limited exceptions, it relies on the
67   system library's iconv() primitive to do charset conversion
68   (specified in SUSv2).  */
69
#if !HAVE_ICONV
/* No system iconv is available.  The calls to iconv(), iconv_open()
   and iconv_close() further down are protected only by `if'
   statements with compile-time constant conditions, so they are
   still compiled; provide failing stand-ins so that they do not
   cause link errors.  */
# define ICONV_CONST
# define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
# define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
# define iconv_close(x)   (void)0
#endif
79
/* Pick the name of the source character set, and the largest value
   that a member of the basic source character set could possibly
   have, according to the host's basic character set.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
# define SOURCE_CHARSET "UTF-8"
# define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
# define SOURCE_CHARSET "UTF-EBCDIC"
# define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
# error "Unrecognized basic host character set"
#endif
89
/* Where the host does not define EILSEQ, ill-formed input is reported
   as EINVAL instead.  */
#if !defined EILSEQ
# define EILSEQ EINVAL
#endif
93
94/* This structure is used for a resizable string buffer throughout.  */
95/* Don't call it strbuf, as that conflicts with unistd.h on systems
96   such as DYNIX/ptx where unistd.h includes stropts.h.  */
97struct _cpp_strbuf
98{
99  uchar *text;
100  size_t asize;
101  size_t len;
102};
103
/* Number of bytes by which an output buffer is enlarged each time a
   conversion runs out of room in it.  A power of two.  */
#define OUTBUF_BLOCK_SIZE 256
108
109/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
110   logic.  This is because a depressing number of systems lack iconv,
   or have iconv libraries that do not do these conversions, so
112   we need a fallback implementation for them.  To ensure the fallback
113   doesn't break due to neglect, it is used on all systems.
114
115   UTF-32 encoding is nice and simple: a four-byte binary number,
116   constrained to the range 00000000-7FFFFFFF to avoid questions of
117   signedness.  We do have to cope with big- and little-endian
118   variants.
119
120   UTF-16 encoding uses two-byte binary numbers, again in big- and
121   little-endian variants, for all values in the 00000000-0000FFFF
122   range.  Values in the 00010000-0010FFFF range are encoded as pairs
123   of two-byte numbers, called "surrogate pairs": given a number S in
124   this range, it is mapped to a pair (H, L) as follows:
125
126     H = (S - 0x10000) / 0x400 + 0xD800
127     L = (S - 0x10000) % 0x400 + 0xDC00
128
129   Two-byte values in the D800...DFFF range are ill-formed except as a
130   component of a surrogate pair.  Even if the encoding within a
131   two-byte value is little-endian, the H member of the surrogate pair
132   comes first.
133
134   There is no way to encode values in the 00110000-7FFFFFFF range,
135   which is not currently a problem as there are no assigned code
136   points in that range; however, the author expects that it will
137   eventually become necessary to abandon UTF-16 due to this
138   limitation.  Note also that, because of these pairs, UTF-16 does
139   not meet the requirements of the C standard for a wide character
140   encoding (see 3.7.3 and 6.4.4.4p11).
141
142   UTF-8 encoding looks like this:
143
144   value range	       encoded as
145   00000000-0000007F   0xxxxxxx
146   00000080-000007FF   110xxxxx 10xxxxxx
147   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
148   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
149   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
150   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
151
152   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
153   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
154   never occur.  Note also that any value that can be encoded by a
155   given row of the table can also be encoded by all successive rows,
156   but this is not done; only the shortest possible encoding for any
157   given value is valid.  For instance, the character 07C0 could be
158   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
159   FC 80 80 80 9F 80.  Only the first is valid.
160
161   An implementation note: the transformation from UTF-16 to UTF-8, or
162   vice versa, is easiest done by using UTF-32 as an intermediary.  */
163
164/* Internal primitives which go from an UTF-8 byte stream to native-endian
165   UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
166   operation in several places below.  */
167static inline int
168one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
169		     cppchar_t *cp)
170{
171  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
172  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
173
174  cppchar_t c;
175  const uchar *inbuf = *inbufp;
176  size_t nbytes, i;
177
178  if (*inbytesleftp < 1)
179    return EINVAL;
180
181  c = *inbuf;
182  if (c < 0x80)
183    {
184      *cp = c;
185      *inbytesleftp -= 1;
186      *inbufp += 1;
187      return 0;
188    }
189
190  /* The number of leading 1-bits in the first byte indicates how many
191     bytes follow.  */
192  for (nbytes = 2; nbytes < 7; nbytes++)
193    if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
194      goto found;
195  return EILSEQ;
196 found:
197
198  if (*inbytesleftp < nbytes)
199    return EINVAL;
200
201  c = (c & masks[nbytes-1]);
202  inbuf++;
203  for (i = 1; i < nbytes; i++)
204    {
205      cppchar_t n = *inbuf++;
206      if ((n & 0xC0) != 0x80)
207	return EILSEQ;
208      c = ((c << 6) + (n & 0x3F));
209    }
210
211  /* Make sure the shortest possible encoding was used.  */
212  if (c <=      0x7F && nbytes > 1) return EILSEQ;
213  if (c <=     0x7FF && nbytes > 2) return EILSEQ;
214  if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
215  if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
216  if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
217
218  /* Make sure the character is valid.  */
219  if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
220
221  *cp = c;
222  *inbufp = inbuf;
223  *inbytesleftp -= nbytes;
224  return 0;
225}
226
227static inline int
228one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
229{
230  static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
231  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
232  size_t nbytes;
233  uchar buf[6], *p = &buf[6];
234  uchar *outbuf = *outbufp;
235
236  nbytes = 1;
237  if (c < 0x80)
238    *--p = c;
239  else
240    {
241      do
242	{
243	  *--p = ((c & 0x3F) | 0x80);
244	  c >>= 6;
245	  nbytes++;
246	}
247      while (c >= 0x3F || (c & limits[nbytes-1]));
248      *--p = (c | masks[nbytes-1]);
249    }
250
251  if (*outbytesleftp < nbytes)
252    return E2BIG;
253
254  while (p < &buf[6])
255    *outbuf++ = *p++;
256  *outbytesleftp -= nbytes;
257  *outbufp = outbuf;
258  return 0;
259}
260
261/* The following four functions transform one character between the two
262   encodings named in the function name.  All have the signature
263   int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
264           uchar **outbufp, size_t *outbytesleftp)
265
266   BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
267   interpreted as a boolean indicating whether big-endian or
268   little-endian encoding is to be used for the member of the pair
269   that is not UTF-8.
270
271   INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
272   do for iconv.
273
274   The return value is either 0 for success, or an errno value for
275   failure, which may be E2BIG (need more space), EILSEQ (ill-formed
   input sequence), or EINVAL (incomplete input sequence).  */
277
278static inline int
279one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
280		   uchar **outbufp, size_t *outbytesleftp)
281{
282  uchar *outbuf;
283  cppchar_t s = 0;
284  int rval;
285
286  /* Check for space first, since we know exactly how much we need.  */
287  if (*outbytesleftp < 4)
288    return E2BIG;
289
290  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
291  if (rval)
292    return rval;
293
294  outbuf = *outbufp;
295  outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
296  outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
297  outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
298  outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
299
300  *outbufp += 4;
301  *outbytesleftp -= 4;
302  return 0;
303}
304
305static inline int
306one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
307		   uchar **outbufp, size_t *outbytesleftp)
308{
309  cppchar_t s;
310  int rval;
311  const uchar *inbuf;
312
313  if (*inbytesleftp < 4)
314    return EINVAL;
315
316  inbuf = *inbufp;
317
318  s  = inbuf[bigend ? 0 : 3] << 24;
319  s += inbuf[bigend ? 1 : 2] << 16;
320  s += inbuf[bigend ? 2 : 1] << 8;
321  s += inbuf[bigend ? 3 : 0];
322
323  if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
324    return EILSEQ;
325
326  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
327  if (rval)
328    return rval;
329
330  *inbufp += 4;
331  *inbytesleftp -= 4;
332  return 0;
333}
334
335static inline int
336one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
337		   uchar **outbufp, size_t *outbytesleftp)
338{
339  int rval;
340  cppchar_t s = 0;
341  const uchar *save_inbuf = *inbufp;
342  size_t save_inbytesleft = *inbytesleftp;
343  uchar *outbuf = *outbufp;
344
345  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
346  if (rval)
347    return rval;
348
349  if (s > 0x0010FFFF)
350    {
351      *inbufp = save_inbuf;
352      *inbytesleftp = save_inbytesleft;
353      return EILSEQ;
354    }
355
356  if (s <= 0xFFFF)
357    {
358      if (*outbytesleftp < 2)
359	{
360	  *inbufp = save_inbuf;
361	  *inbytesleftp = save_inbytesleft;
362	  return E2BIG;
363	}
364      outbuf[bigend ? 1 : 0] = (s & 0x00FF);
365      outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
366
367      *outbufp += 2;
368      *outbytesleftp -= 2;
369      return 0;
370    }
371  else
372    {
373      cppchar_t hi, lo;
374
375      if (*outbytesleftp < 4)
376	{
377	  *inbufp = save_inbuf;
378	  *inbytesleftp = save_inbytesleft;
379	  return E2BIG;
380	}
381
382      hi = (s - 0x10000) / 0x400 + 0xD800;
383      lo = (s - 0x10000) % 0x400 + 0xDC00;
384
385      /* Even if we are little-endian, put the high surrogate first.
386	 ??? Matches practice?  */
387      outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
388      outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
389      outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
390      outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
391
392      *outbufp += 4;
393      *outbytesleftp -= 4;
394      return 0;
395    }
396}
397
398static inline int
399one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
400		   uchar **outbufp, size_t *outbytesleftp)
401{
402  cppchar_t s;
403  const uchar *inbuf = *inbufp;
404  int rval;
405
406  if (*inbytesleftp < 2)
407    return EINVAL;
408  s  = inbuf[bigend ? 0 : 1] << 8;
409  s += inbuf[bigend ? 1 : 0];
410
411  /* Low surrogate without immediately preceding high surrogate is invalid.  */
412  if (s >= 0xDC00 && s <= 0xDFFF)
413    return EILSEQ;
414  /* High surrogate must have a following low surrogate.  */
415  else if (s >= 0xD800 && s <= 0xDBFF)
416    {
417      cppchar_t hi = s, lo;
418      if (*inbytesleftp < 4)
419	return EINVAL;
420
421      lo  = inbuf[bigend ? 2 : 3] << 8;
422      lo += inbuf[bigend ? 3 : 2];
423
424      if (lo < 0xDC00 || lo > 0xDFFF)
425	return EILSEQ;
426
427      s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
428    }
429
430  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
431  if (rval)
432    return rval;
433
434  /* Success - update the input pointers (one_cppchar_to_utf8 has done
435     the output pointers for us).  */
436  if (s <= 0xFFFF)
437    {
438      *inbufp += 2;
439      *inbytesleftp -= 2;
440    }
441  else
442    {
443      *inbufp += 4;
444      *inbytesleftp -= 4;
445    }
446  return 0;
447}
448
449/* Helper routine for the next few functions.  The 'const' on
450   one_conversion means that we promise not to modify what function is
451   pointed to, which lets the inliner see through it.  */
452
453static inline bool
454conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
455					     uchar **, size_t *),
456		 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
457{
458  const uchar *inbuf;
459  uchar *outbuf;
460  size_t inbytesleft, outbytesleft;
461  int rval;
462
463  inbuf = from;
464  inbytesleft = flen;
465  outbuf = to->text + to->len;
466  outbytesleft = to->asize - to->len;
467
468  for (;;)
469    {
470      do
471	rval = one_conversion (cd, &inbuf, &inbytesleft,
472			       &outbuf, &outbytesleft);
473      while (inbytesleft && !rval);
474
475      if (__builtin_expect (inbytesleft == 0, 1))
476	{
477	  to->len = to->asize - outbytesleft;
478	  return true;
479	}
480      if (rval != E2BIG)
481	{
482	  errno = rval;
483	  return false;
484	}
485
486      outbytesleft += OUTBUF_BLOCK_SIZE;
487      to->asize += OUTBUF_BLOCK_SIZE;
488      to->text = XRESIZEVEC (uchar, to->text, to->asize);
489      outbuf = to->text + to->asize - outbytesleft;
490    }
491}
492
493
494/* These functions convert entire strings between character sets.
495   They all have the signature
496
497   bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
498
499   The input string FROM is converted as specified by the function
500   name plus the iconv descriptor CD (which may be fake), and the
501   result appended to TO.  On any error, false is returned, otherwise true.  */
502
503/* These four use the custom conversion code above.  */
504static bool
505convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
506		    struct _cpp_strbuf *to)
507{
508  return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
509}
510
511static bool
512convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
513		    struct _cpp_strbuf *to)
514{
515  return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
516}
517
518static bool
519convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
520		    struct _cpp_strbuf *to)
521{
522  return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
523}
524
525static bool
526convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
527		    struct _cpp_strbuf *to)
528{
529  return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
530}
531
532/* Identity conversion, used when we have no alternative.  */
533static bool
534convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
535		       const uchar *from, size_t flen, struct _cpp_strbuf *to)
536{
537  if (to->len + flen > to->asize)
538    {
539      to->asize = to->len + flen;
540      to->asize += to->asize / 4;
541      to->text = XRESIZEVEC (uchar, to->text, to->asize);
542    }
543  memcpy (to->text + to->len, from, flen);
544  to->len += flen;
545  return true;
546}
547
/* And this one uses the system iconv primitive.  It's a little
   different, since iconv's interface is a little different.  */
#if HAVE_ICONV

/* Enlarge TO by one block and re-derive OUTBUF from the new storage.
   Relies on the locals `to', `outbuf' and `outbytesleft' of the
   function it is used in.  */
#define CONVERT_ICONV_GROW_BUFFER \
  do { \
      outbytesleft += OUTBUF_BLOCK_SIZE; \
      to->asize += OUTBUF_BLOCK_SIZE; \
      to->text = XRESIZEVEC (uchar, to->text, to->asize); \
      outbuf = (char *)to->text + to->asize - outbytesleft; \
  } while (0)

/* Append the FLEN bytes at FROM to TO, converted by the iconv
   descriptor CD.  Return true on success; return false if CD cannot
   be reset, or if iconv stops for any reason other than lack of
   output space.  */
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
                     struct _cpp_strbuf *to)
{
  ICONV_CONST char *src;
  char *outbuf;
  size_t src_left, outbytesleft;

  /* Reset conversion descriptor and check that it is valid.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  src = (ICONV_CONST char *)from;
  src_left = flen;
  outbuf = (char *)to->text + to->len;
  outbytesleft = to->asize - to->len;

  /* Feed the input through, growing the output each time iconv
     stops with E2BIG, until none is left.  */
  for (;;)
    {
      iconv (cd, &src, &src_left, &outbuf, &outbytesleft);
      if (__builtin_expect (src_left == 0, 1))
        break;
      if (errno != E2BIG)
        return false;

      CONVERT_ICONV_GROW_BUFFER;
    }

  /* Close out any shift states, returning to the initial state.  If
     that needs more room, grow the buffer once and try again.  */
  if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
    {
      if (errno != E2BIG)
        return false;

      CONVERT_ICONV_GROW_BUFFER;
      if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
        return false;
    }

  to->len = to->asize - outbytesleft;
  return true;
}
#else
#define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif
605
606/* Arrange for the above custom conversion logic to be used automatically
607   when conversion between a suitable pair of character sets is requested.  */
608
/* Run CONVERTER's conversion function on the FLEN bytes at FROM,
   appending to TO, and yield that function's result.  CONVERTER is
   evaluated twice.  Every argument and the expansion as a whole are
   parenthesized so that arbitrary expressions can be passed and the
   result can be used as an operand.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   ((CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO)))
611
612struct cpp_conversion
613{
614  const char *pair;
615  convert_f func;
616  iconv_t fake_cd;
617};
618static const struct cpp_conversion conversion_tab[] = {
619  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
620  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
621  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
622  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
623  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
624  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
625  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
626  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
627};
628
629/* Subroutine of cpp_init_iconv: initialize and return a
630   cset_converter structure for conversion from FROM to TO.  If
631   iconv_open() fails, issue an error and return an identity
632   converter.  Silently return an identity converter if FROM and TO
633   are identical.
634
635   PFILE is only used for generating diagnostics; setting it to NULL
636   suppresses diagnostics.  */
637
638static struct cset_converter
639init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
640{
641  struct cset_converter ret;
642  char *pair;
643  size_t i;
644
645  ret.to = to;
646  ret.from = from;
647
648  if (!strcasecmp (to, from))
649    {
650      ret.func = convert_no_conversion;
651      ret.cd = (iconv_t) -1;
652      ret.width = -1;
653      return ret;
654    }
655
656  pair = (char *) alloca(strlen(to) + strlen(from) + 2);
657
658  strcpy(pair, from);
659  strcat(pair, "/");
660  strcat(pair, to);
661  for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
662    if (!strcasecmp (pair, conversion_tab[i].pair))
663      {
664	ret.func = conversion_tab[i].func;
665	ret.cd = conversion_tab[i].fake_cd;
666	ret.width = -1;
667	return ret;
668      }
669
670  /* No custom converter - try iconv.  */
671  if (HAVE_ICONV)
672    {
673      ret.func = convert_using_iconv;
674      ret.cd = iconv_open (to, from);
675      ret.width = -1;
676
677      if (ret.cd == (iconv_t) -1)
678	{
679	  if (pfile)
680	    {
681	      if (errno == EINVAL)
682		cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
683			   "conversion from %s to %s not supported by iconv",
684			   from, to);
685	      else
686		cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
687	    }
688	  ret.func = convert_no_conversion;
689	}
690    }
691  else
692    {
693      if (pfile)
694	{
695	  cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
696		     "no iconv implementation, cannot convert from %s to %s",
697		     from, to);
698	}
699      ret.func = convert_no_conversion;
700      ret.cd = (iconv_t) -1;
701      ret.width = -1;
702    }
703
704  return ret;
705}
706
707/* If charset conversion is requested, initialize iconv(3) descriptors
708   for conversion from the source character set to the execution
709   character sets.  If iconv is not present in the C library, and
710   conversion is requested, issue an error.  */
711
712void
713cpp_init_iconv (cpp_reader *pfile)
714{
715  const char *ncset = CPP_OPTION (pfile, narrow_charset);
716  const char *wcset = CPP_OPTION (pfile, wide_charset);
717  const char *default_wcset;
718
719  bool be = CPP_OPTION (pfile, bytes_big_endian);
720
721  if (CPP_OPTION (pfile, wchar_precision) >= 32)
722    default_wcset = be ? "UTF-32BE" : "UTF-32LE";
723  else if (CPP_OPTION (pfile, wchar_precision) >= 16)
724    default_wcset = be ? "UTF-16BE" : "UTF-16LE";
725  else
726    /* This effectively means that wide strings are not supported,
727       so don't do any conversion at all.  */
728   default_wcset = SOURCE_CHARSET;
729
730  if (!ncset)
731    ncset = SOURCE_CHARSET;
732  if (!wcset)
733    wcset = default_wcset;
734
735  pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
736  pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
737  pfile->utf8_cset_desc = init_iconv_desc (pfile, "UTF-8", SOURCE_CHARSET);
738  pfile->utf8_cset_desc.width = CPP_OPTION (pfile, char_precision);
739  pfile->char16_cset_desc = init_iconv_desc (pfile,
740					     be ? "UTF-16BE" : "UTF-16LE",
741					     SOURCE_CHARSET);
742  pfile->char16_cset_desc.width = 16;
743  pfile->char32_cset_desc = init_iconv_desc (pfile,
744					     be ? "UTF-32BE" : "UTF-32LE",
745					     SOURCE_CHARSET);
746  pfile->char32_cset_desc.width = 32;
747  pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
748  pfile->wide_cset_desc.width = CPP_OPTION (pfile, wchar_precision);
749}
750
751/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary.  */
752void
753_cpp_destroy_iconv (cpp_reader *pfile)
754{
755  if (HAVE_ICONV)
756    {
757      if (pfile->narrow_cset_desc.func == convert_using_iconv)
758	iconv_close (pfile->narrow_cset_desc.cd);
759      if (pfile->utf8_cset_desc.func == convert_using_iconv)
760	iconv_close (pfile->utf8_cset_desc.cd);
761      if (pfile->char16_cset_desc.func == convert_using_iconv)
762	iconv_close (pfile->char16_cset_desc.cd);
763      if (pfile->char32_cset_desc.func == convert_using_iconv)
764	iconv_close (pfile->char32_cset_desc.cd);
765      if (pfile->wide_cset_desc.func == convert_using_iconv)
766	iconv_close (pfile->wide_cset_desc.cd);
767    }
768}
769
770/* Utility routine for use by a full compiler.  C is a character taken
771   from the *basic* source character set, encoded in the host's
772   execution encoding.  Convert it to (the target's) execution
773   encoding, and return that value.
774
775   Issues an internal error if C's representation in the narrow
776   execution character set fails to be a single-byte value (C99
777   5.2.1p3: "The representation of each member of the source and
778   execution character sets shall fit in a byte.")  May also issue an
779   internal error if C fails to be a member of the basic source
780   character set (testing this exactly is too hard, especially when
781   the host character set is EBCDIC).  */
782cppchar_t
783cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
784{
785  uchar sbuf[1];
786  struct _cpp_strbuf tbuf;
787
788  /* This test is merely an approximation, but it suffices to catch
789     the most important thing, which is that we don't get handed a
790     character outside the unibyte range of the host character set.  */
791  if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
792    {
793      cpp_error (pfile, CPP_DL_ICE,
794		 "character 0x%lx is not in the basic source character set\n",
795		 (unsigned long)c);
796      return 0;
797    }
798
799  /* Being a character in the unibyte range of the host character set,
800     we can safely splat it into a one-byte buffer and trust that that
801     is a well-formed string.  */
802  sbuf[0] = c;
803
804  /* This should never need to reallocate, but just in case... */
805  tbuf.asize = 1;
806  tbuf.text = XNEWVEC (uchar, tbuf.asize);
807  tbuf.len = 0;
808
809  if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
810    {
811      cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
812      return 0;
813    }
814  if (tbuf.len != 1)
815    {
816      cpp_error (pfile, CPP_DL_ICE,
817		 "character 0x%lx is not unibyte in execution character set",
818		 (unsigned long)c);
819      return 0;
820    }
821  c = tbuf.text[0];
822  free(tbuf.text);
823  return c;
824}
825
826
827
828/* cpp_substring_ranges's constructor. */
829
830cpp_substring_ranges::cpp_substring_ranges () :
831  m_ranges (NULL),
832  m_num_ranges (0),
833  m_alloc_ranges (8)
834{
835  m_ranges = XNEWVEC (source_range, m_alloc_ranges);
836}
837
838/* cpp_substring_ranges's destructor. */
839
840cpp_substring_ranges::~cpp_substring_ranges ()
841{
842  free (m_ranges);
843}
844
845/* Add RANGE to the vector of source_range information.  */
846
847void
848cpp_substring_ranges::add_range (source_range range)
849{
850  if (m_num_ranges >= m_alloc_ranges)
851    {
852      m_alloc_ranges *= 2;
853      m_ranges
854	= (source_range *)xrealloc (m_ranges,
855				    sizeof (source_range) * m_alloc_ranges);
856    }
857  m_ranges[m_num_ranges++] = range;
858}
859
860/* Read NUM ranges from LOC_READER, adding them to the vector of source_range
861   information.  */
862
863void
864cpp_substring_ranges::add_n_ranges (int num,
865				    cpp_string_location_reader &loc_reader)
866{
867  for (int i = 0; i < num; i++)
868    add_range (loc_reader.get_next ());
869}
870
871
872
873/* Utility routine that computes a mask of the form 0000...111... with
874   WIDTH 1-bits.  */
875static inline size_t
876width_to_mask (size_t width)
877{
878  width = MIN (width, BITS_PER_CPPCHAR_T);
879  if (width >= CHAR_BIT * sizeof (size_t))
880    return ~(size_t) 0;
881  else
882    return ((size_t) 1 << width) - 1;
883}
884
/* A large table of unicode character information.  These are the
   flag bits stored for each range in that table.  */
enum {
  /* Valid in a C99 identifier?  */
  C99 = 1 << 0,
  /* Valid in a C99 identifier, but not as the first character?  */
  N99 = 1 << 1,
  /* Valid in a C++ identifier?  */
  CXX = 1 << 2,
  /* Valid in a C11/C++11 identifier?  */
  C11 = 1 << 3,
  /* Valid in a C11/C++11 identifier, but not as the first character?  */
  N11 = 1 << 4,
  /* Valid in a C++23 identifier?  */
  CXX23 = 1 << 5,
  /* Valid in a C++23 identifier, but not as the first character?  */
  NXX23 = 1 << 6,
  /* NFC representation is not valid in an identifier?  */
  CID = 1 << 7,
  /* Might be valid NFC form?  */
  NFC = 1 << 8,
  /* Might be valid NFKC form?  */
  NKC = 1 << 9,
  /* Certain preceding characters might make it not valid NFC/NFKC form?  */
  CTX = 1 << 10
};
910
/* One entry of the character information table.  An entry describes
   every character up to and including END that is not covered by an
   earlier entry.  */
struct ucnrange {
  /* Bitmap of the flags in the enumeration above.  */
  unsigned short flags;
  /* Combining class shared by the characters in the range.  */
  unsigned char combine;
  /* Last character in the range described by this entry.  */
  unsigned int end;
};
919#include "ucnid.h"
920
/* Largest value in the UCS codespace, which ISO 10646 defines as the
   range 0-0x10FFFF inclusive.  */
#define UCS_LIMIT 0x10FFFF
923
924/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
925   the start of an identifier, and 0 if C is not valid in an
926   identifier.  We assume C has already gone through the checks of
927   _cpp_valid_ucn.  Also update NST for C if returning nonzero.  The
928   algorithm is a simple binary search on the table defined in
929   ucnid.h.  */
930
931static int
932ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
933			 struct normalize_state *nst)
934{
935  int mn, mx, md;
936  unsigned short valid_flags, invalid_start_flags;
937
938  if (c > UCS_LIMIT)
939    return 0;
940
941  mn = 0;
942  mx = ARRAY_SIZE (ucnranges) - 1;
943  while (mx != mn)
944    {
945      md = (mn + mx) / 2;
946      if (c <= ucnranges[md].end)
947	mx = md;
948      else
949	mn = md + 1;
950    }
951
952  /* When -pedantic, we require the character to have been listed by
953     the standard for the current language.  Otherwise, we accept the
954     union of the acceptable sets for all supported language versions.  */
955  valid_flags = C99 | CXX | C11 | CXX23;
956  if (CPP_PEDANTIC (pfile))
957    {
958      if (CPP_OPTION (pfile, cplusplus))
959	valid_flags = CXX23;
960      else if (CPP_OPTION (pfile, c11_identifiers))
961	valid_flags = C11;
962      else if (CPP_OPTION (pfile, c99))
963	valid_flags = C99;
964    }
965  if (! (ucnranges[mn].flags & valid_flags))
966      return 0;
967
968  /* Update NST.  */
969  if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
970    nst->level = normalized_none;
971  else if (ucnranges[mn].flags & CTX)
972    {
973      bool safe;
974      cppchar_t p = nst->previous;
975
976      /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
977	 and are combined algorithmically from a sequence of the form
978	 1100-1112 1161-1175 11A8-11C2
979	 (if the third is not present, it is treated as 11A7, which is not
980	 really a valid character).
981	 Unfortunately, C99 allows (only) the NFC form, but C++ allows
982	 only the combining characters.  */
983      if (c >= 0x1161 && c <= 0x1175)
984	safe = p < 0x1100 || p > 0x1112;
985      else if (c >= 0x11A8 && c <= 0x11C2)
986	safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
987      else
988	safe = check_nfc (pfile, c, p);
989      if (!safe)
990	{
991	  if ((c >= 0x1161 && c <= 0x1175) || (c >= 0x11A8 && c <= 0x11C2))
992	    nst->level = MAX (nst->level, normalized_identifier_C);
993	  else
994	    nst->level = normalized_none;
995	}
996    }
997  else if (ucnranges[mn].flags & NKC)
998    ;
999  else if (ucnranges[mn].flags & NFC)
1000    nst->level = MAX (nst->level, normalized_C);
1001  else if (ucnranges[mn].flags & CID)
1002    nst->level = MAX (nst->level, normalized_identifier_C);
1003  else
1004    nst->level = normalized_none;
1005  if (ucnranges[mn].combine == 0)
1006    nst->previous = c;
1007  nst->prev_class = ucnranges[mn].combine;
1008
1009  if (!CPP_PEDANTIC (pfile))
1010    {
1011      /* If not -pedantic, accept as character that may
1012	 begin an identifier a union of characters allowed
1013	 at that position in each of the character sets.  */
1014      if ((ucnranges[mn].flags & (C99 | N99)) == C99
1015	  || (ucnranges[mn].flags & CXX) != 0
1016	  || (ucnranges[mn].flags & (C11 | N11)) == C11
1017	  || (ucnranges[mn].flags & (CXX23 | NXX23)) == CXX23)
1018	return 1;
1019      return 2;
1020    }
1021
1022  if (CPP_OPTION (pfile, cplusplus))
1023    invalid_start_flags = NXX23;
1024  else if (CPP_OPTION (pfile, c11_identifiers))
1025    invalid_start_flags = N11;
1026  else if (CPP_OPTION (pfile, c99))
1027    invalid_start_flags = N99;
1028  else
1029    invalid_start_flags = 0;
1030
1031  /* In C99, UCN digits may not begin identifiers.  In C11 and C++11,
1032     UCN combining characters may not begin identifiers.  */
1033  if (ucnranges[mn].flags & invalid_start_flags)
1034    return 2;
1035
1036  return 1;
1037}
1038
1039/* [lex.charset]: The character designated by the universal character
1040   name \UNNNNNNNN is that character whose character short name in
1041   ISO/IEC 10646 is NNNNNNNN; the character designated by the
1042   universal character name \uNNNN is that character whose character
1043   short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
1044   for a universal character name corresponds to a surrogate code point
1045   (in the range 0xD800-0xDFFF, inclusive), the program is ill-formed.
1046   Additionally, if the hexadecimal value for a universal-character-name
1047   outside a character or string literal corresponds to a control character
1048   (in either of the ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a
1049   character in the basic source character set, the program is ill-formed.
1050
1051   C99 6.4.3: A universal character name shall not specify a character
1052   whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
1053   or 0060 (`), nor one in the range D800 through DFFF inclusive.
1054
1055   If the hexadecimal value is larger than the upper bound of the UCS
1056   codespace specified in ISO/IEC 10646, a pedantic warning is issued
1057   in all versions of C and in the C++20 or later versions of C++.
1058
1059   *PSTR must be preceded by "\u" or "\U"; it is assumed that the
1060   buffer end is delimited by a non-hex digit.  Returns false if the
1061   UCN has not been consumed, true otherwise.
1062
1063   The value of the UCN, whether valid or invalid, is returned in *CP.
1064   Diagnostics are emitted for invalid values.  PSTR is updated to point
1065   one beyond the UCN, or to the syntactically invalid character.
1066
1067   IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
1068   an identifier, or 2 otherwise.
1069
1070   If LOC_READER is non-NULL, then position information is
1071   read from *LOC_READER and CHAR_RANGE->m_finish is updated accordingly.  */
1072
1073bool
1074_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
1075		const uchar *limit, int identifier_pos,
1076		struct normalize_state *nst, cppchar_t *cp,
1077		source_range *char_range,
1078		cpp_string_location_reader *loc_reader)
1079{
1080  cppchar_t result, c;
1081  unsigned int length;
1082  const uchar *str = *pstr;
1083  const uchar *base = str - 2;
1084
1085  if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
1086    cpp_error (pfile, CPP_DL_WARNING,
1087	       "universal character names are only valid in C++ and C99");
1088  else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
1089	   && !CPP_OPTION (pfile, cplusplus))
1090    cpp_error (pfile, CPP_DL_WARNING,
1091	       "C99's universal character names are incompatible with C90");
1092  else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
1093    cpp_warning (pfile, CPP_W_TRADITIONAL,
1094	         "the meaning of '\\%c' is different in traditional C",
1095	         (int) str[-1]);
1096
1097  if (str[-1] == 'u')
1098    length = 4;
1099  else if (str[-1] == 'U')
1100    length = 8;
1101  else
1102    {
1103      cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
1104      length = 4;
1105    }
1106
1107  result = 0;
1108  do
1109    {
1110      c = *str;
1111      if (!ISXDIGIT (c))
1112	break;
1113      str++;
1114      if (loc_reader)
1115	{
1116	  gcc_assert (char_range);
1117	  char_range->m_finish = loc_reader->get_next ().m_finish;
1118	}
1119      result = (result << 4) + hex_value (c);
1120    }
1121  while (--length && str < limit);
1122
1123  /* Partial UCNs are not valid in strings, but decompose into
1124     multiple tokens in identifiers, so we can't give a helpful
1125     error message in that case.  */
1126  if (length && identifier_pos)
1127    {
1128      *cp = 0;
1129      return false;
1130    }
1131
1132  *pstr = str;
1133  if (length)
1134    {
1135      cpp_error (pfile, CPP_DL_ERROR,
1136		 "incomplete universal character name %.*s",
1137		 (int) (str - base), base);
1138      result = 1;
1139    }
1140  /* The C99 standard permits $, @ and ` to be specified as UCNs.  We use
1141     hex escapes so that this also works with EBCDIC hosts.
1142     C++0x permits everything below 0xa0 within literals;
1143     ucn_valid_in_identifier will complain about identifiers.  */
1144  else if ((result < 0xa0
1145	    && !CPP_OPTION (pfile, cplusplus)
1146	    && (result != 0x24 && result != 0x40 && result != 0x60))
1147	   || (result & 0x80000000)
1148	   || (result >= 0xD800 && result <= 0xDFFF))
1149    {
1150      cpp_error (pfile, CPP_DL_ERROR,
1151		 "%.*s is not a valid universal character",
1152		 (int) (str - base), base);
1153      result = 1;
1154    }
1155  else if (identifier_pos && result == 0x24
1156	   && CPP_OPTION (pfile, dollars_in_ident))
1157    {
1158      if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1159	{
1160	  CPP_OPTION (pfile, warn_dollars) = 0;
1161	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1162	}
1163      NORMALIZE_STATE_UPDATE_IDNUM (nst, result);
1164    }
1165  else if (identifier_pos)
1166    {
1167      int validity = ucn_valid_in_identifier (pfile, result, nst);
1168
1169      if (validity == 0)
1170	cpp_error (pfile, CPP_DL_ERROR,
1171		   "universal character %.*s is not valid in an identifier",
1172		   (int) (str - base), base);
1173      else if (validity == 2 && identifier_pos == 1)
1174	cpp_error (pfile, CPP_DL_ERROR,
1175   "universal character %.*s is not valid at the start of an identifier",
1176		   (int) (str - base), base);
1177    }
1178  else if (result > UCS_LIMIT
1179	   && (!CPP_OPTION (pfile, cplusplus)
1180	       || CPP_OPTION (pfile, lang) > CLK_CXX17))
1181    cpp_error (pfile, CPP_DL_PEDWARN,
1182	       "%.*s is outside the UCS codespace",
1183	       (int) (str - base), base);
1184
1185  *cp = result;
1186  return true;
1187}
1188
1189/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
1190   it to the execution character set and write the result into TBUF,
1191   if TBUF is non-NULL.
1192   An advanced pointer is returned.  Issues all relevant diagnostics.
1193   If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
1194   contains the location of the character so far: location information
1195   is read from *LOC_READER, and *RANGES is updated accordingly.  */
1196static const uchar *
1197convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
1198	     struct _cpp_strbuf *tbuf, struct cset_converter cvt,
1199	     source_range char_range,
1200	     cpp_string_location_reader *loc_reader,
1201	     cpp_substring_ranges *ranges)
1202{
1203  cppchar_t ucn;
1204  uchar buf[6];
1205  uchar *bufp = buf;
1206  size_t bytesleft = 6;
1207  int rval;
1208  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1209
1210  /* loc_reader and ranges must either be both NULL, or both be non-NULL.  */
1211  gcc_assert ((loc_reader != NULL) == (ranges != NULL));
1212
1213  from++;  /* Skip u/U.  */
1214
1215  if (loc_reader)
1216    /* The u/U is part of the spelling of this character.  */
1217    char_range.m_finish = loc_reader->get_next ().m_finish;
1218
1219  _cpp_valid_ucn (pfile, &from, limit, 0, &nst,
1220		  &ucn, &char_range, loc_reader);
1221
1222  rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
1223  if (rval)
1224    {
1225      errno = rval;
1226      cpp_errno (pfile, CPP_DL_ERROR,
1227		 "converting UCN to source character set");
1228    }
1229  else
1230    {
1231      if (tbuf)
1232	if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
1233	  cpp_errno (pfile, CPP_DL_ERROR,
1234		     "converting UCN to execution character set");
1235
1236      if (loc_reader)
1237	{
1238	  int num_encoded_bytes = 6 - bytesleft;
1239	  for (int i = 0; i < num_encoded_bytes; i++)
1240	    ranges->add_range (char_range);
1241	}
1242    }
1243
1244  return from;
1245}
1246
1247/*  Performs a similar task as _cpp_valid_ucn, but parses UTF-8-encoded
1248    extended characters rather than UCNs.  If the return value is TRUE, then a
1249    character was successfully decoded and stored in *CP; *PSTR has been
1250    updated to point one past the valid UTF-8 sequence.  Diagnostics may have
1251    been emitted if the character parsed is not allowed in the current context.
1252    If the return value is FALSE, then *PSTR has not been modified and *CP may
1253    equal 0, to indicate that *PSTR does not form a valid UTF-8 sequence, or it
1254    may, when processing an identifier in C mode, equal a codepoint that was
1255    validly encoded but is not allowed to appear in an identifier.  In either
1256    case, no diagnostic is emitted, and the return value of FALSE should cause
1257    a new token to be formed.
1258
1259    Unlike _cpp_valid_ucn, this will never be called when lexing a string; only
1260    a potential identifier, or a CPP_OTHER token.  NST is unused in the latter
1261    case.
1262
1263    As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for
1264    the start of an identifier, or 2 otherwise.  */
1265
1266extern bool
1267_cpp_valid_utf8 (cpp_reader *pfile,
1268		 const uchar **pstr,
1269		 const uchar *limit,
1270		 int identifier_pos,
1271		 struct normalize_state *nst,
1272		 cppchar_t *cp)
1273{
1274  const uchar *base = *pstr;
1275  size_t inbytesleft = limit - base;
1276  if (one_utf8_to_cppchar (pstr, &inbytesleft, cp))
1277    {
1278      /* No diagnostic here as this byte will rather become a
1279	 new token.  */
1280      *cp = 0;
1281      return false;
1282    }
1283
1284  if (identifier_pos)
1285    {
1286      switch (ucn_valid_in_identifier (pfile, *cp, nst))
1287	{
1288
1289	case 0:
1290	  /* In C++, this is an error for invalid character in an identifier
1291	     because logically, the UTF-8 was converted to a UCN during
1292	     translation phase 1 (even though we don't physically do it that
1293	     way).  In C, this byte rather becomes grammatically a separate
1294	     token.  */
1295
1296	  if (CPP_OPTION (pfile, cplusplus))
1297	    cpp_error (pfile, CPP_DL_ERROR,
1298		       "extended character %.*s is not valid in an identifier",
1299		       (int) (*pstr - base), base);
1300	  else
1301	    {
1302	      *pstr = base;
1303	      return false;
1304	    }
1305
1306	  break;
1307
1308	case 2:
1309	  if (identifier_pos == 1)
1310	    {
1311	      /* This is treated the same way in C++ or C99 -- lexed as an
1312		 identifier which is then invalid because an identifier is
1313		 not allowed to start with this character.  */
1314	      cpp_error (pfile, CPP_DL_ERROR,
1315	  "extended character %.*s is not valid at the start of an identifier",
1316			 (int) (*pstr - base), base);
1317	    }
1318	  break;
1319	}
1320    }
1321
1322  return true;
1323}
1324
1325/* Subroutine of convert_hex and convert_oct.  N is the representation
1326   in the execution character set of a numeric escape; write it into the
1327   string buffer TBUF and update the end-of-string pointer therein.  WIDE
1328   is true if it's a wide string that's being assembled in TBUF.  This
1329   function issues no diagnostics and never fails.  */
1330static void
1331emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
1332		     struct _cpp_strbuf *tbuf, struct cset_converter cvt)
1333{
1334  size_t width = cvt.width;
1335
1336  if (width != CPP_OPTION (pfile, char_precision))
1337    {
1338      /* We have to render this into the target byte order, which may not
1339	 be our byte order.  */
1340      bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1341      size_t cwidth = CPP_OPTION (pfile, char_precision);
1342      size_t cmask = width_to_mask (cwidth);
1343      size_t nbwc = width / cwidth;
1344      size_t i;
1345      size_t off = tbuf->len;
1346      cppchar_t c;
1347
1348      if (tbuf->len + nbwc > tbuf->asize)
1349	{
1350	  tbuf->asize += OUTBUF_BLOCK_SIZE;
1351	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1352	}
1353
1354      for (i = 0; i < nbwc; i++)
1355	{
1356	  c = n & cmask;
1357	  n >>= cwidth;
1358	  tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
1359	}
1360      tbuf->len += nbwc;
1361    }
1362  else
1363    {
1364      /* Note: this code does not handle the case where the target
1365	 and host have a different number of bits in a byte.  */
1366      if (tbuf->len + 1 > tbuf->asize)
1367	{
1368	  tbuf->asize += OUTBUF_BLOCK_SIZE;
1369	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1370	}
1371      tbuf->text[tbuf->len++] = n;
1372    }
1373}
1374
1375/* Convert a hexadecimal escape, pointed to by FROM, to the execution
1376   character set and write it into the string buffer TBUF (if non-NULL).
1377   Returns an advanced pointer, and issues diagnostics as necessary.
1378   No character set translation occurs; this routine always produces the
1379   execution-set character with numeric value equal to the given hex
1380   number.  You can, e.g. generate surrogate pairs this way.
1381   If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
1382   contains the location of the character so far: location information
1383   is read from *LOC_READER, and *RANGES is updated accordingly.  */
1384static const uchar *
1385convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
1386	     struct _cpp_strbuf *tbuf, struct cset_converter cvt,
1387	     source_range char_range,
1388	     cpp_string_location_reader *loc_reader,
1389	     cpp_substring_ranges *ranges)
1390{
1391  cppchar_t c, n = 0, overflow = 0;
1392  int digits_found = 0;
1393  size_t width = cvt.width;
1394  size_t mask = width_to_mask (width);
1395
1396  /* loc_reader and ranges must either be both NULL, or both be non-NULL.  */
1397  gcc_assert ((loc_reader != NULL) == (ranges != NULL));
1398
1399  if (CPP_WTRADITIONAL (pfile))
1400    cpp_warning (pfile, CPP_W_TRADITIONAL,
1401	         "the meaning of '\\x' is different in traditional C");
1402
1403  /* Skip 'x'.  */
1404  from++;
1405
1406  /* The 'x' is part of the spelling of this character.  */
1407  if (loc_reader)
1408    char_range.m_finish = loc_reader->get_next ().m_finish;
1409
1410  while (from < limit)
1411    {
1412      c = *from;
1413      if (! hex_p (c))
1414	break;
1415      from++;
1416      if (loc_reader)
1417	char_range.m_finish = loc_reader->get_next ().m_finish;
1418      overflow |= n ^ (n << 4 >> 4);
1419      n = (n << 4) + hex_value (c);
1420      digits_found = 1;
1421    }
1422
1423  if (!digits_found)
1424    {
1425      cpp_error (pfile, CPP_DL_ERROR,
1426		 "\\x used with no following hex digits");
1427      return from;
1428    }
1429
1430  if (overflow | (n != (n & mask)))
1431    {
1432      cpp_error (pfile, CPP_DL_PEDWARN,
1433		 "hex escape sequence out of range");
1434      n &= mask;
1435    }
1436
1437  if (tbuf)
1438    emit_numeric_escape (pfile, n, tbuf, cvt);
1439  if (ranges)
1440    ranges->add_range (char_range);
1441
1442  return from;
1443}
1444
1445/* Convert an octal escape, pointed to by FROM, to the execution
1446   character set and write it into the string buffer TBUF.  Returns an
1447   advanced pointer, and issues diagnostics as necessary.
1448   No character set translation occurs; this routine always produces the
1449   execution-set character with numeric value equal to the given octal
1450   number.
1451   If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
1452   contains the location of the character so far: location information
1453   is read from *LOC_READER, and *RANGES is updated accordingly.  */
1454static const uchar *
1455convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
1456	     struct _cpp_strbuf *tbuf, struct cset_converter cvt,
1457	     source_range char_range,
1458	     cpp_string_location_reader *loc_reader,
1459	     cpp_substring_ranges *ranges)
1460{
1461  size_t count = 0;
1462  cppchar_t c, n = 0;
1463  size_t width = cvt.width;
1464  size_t mask = width_to_mask (width);
1465
1466  /* loc_reader and ranges must either be both NULL, or both be non-NULL.  */
1467  gcc_assert ((loc_reader != NULL) == (ranges != NULL));
1468
1469  while (from < limit && count++ < 3)
1470    {
1471      c = *from;
1472      if (c < '0' || c > '7')
1473	break;
1474      from++;
1475      if (loc_reader)
1476	char_range.m_finish = loc_reader->get_next ().m_finish;
1477      n = (n << 3) + c - '0';
1478    }
1479
1480  if (n != (n & mask))
1481    {
1482      cpp_error (pfile, CPP_DL_PEDWARN,
1483		 "octal escape sequence out of range");
1484      n &= mask;
1485    }
1486
1487  if (tbuf)
1488    emit_numeric_escape (pfile, n, tbuf, cvt);
1489  if (ranges)
1490    ranges->add_range (char_range);
1491
1492  return from;
1493}
1494
1495/* Convert an escape sequence (pointed to by FROM) to its value on
1496   the target, and to the execution character set.  Do not scan past
1497   LIMIT.  Write the converted value into TBUF, if TBUF is non-NULL.
1498   Returns an advanced pointer.  Handles all relevant diagnostics.
1499   If LOC_READER is non-NULL, then RANGES must be non-NULL: location
1500   information is read from *LOC_READER, and *RANGES is updated
1501   accordingly.  */
1502static const uchar *
1503convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
1504		struct _cpp_strbuf *tbuf, struct cset_converter cvt,
1505		cpp_string_location_reader *loc_reader,
1506		cpp_substring_ranges *ranges)
1507{
1508  /* Values of \a \b \e \f \n \r \t \v respectively.  */
1509#if HOST_CHARSET == HOST_CHARSET_ASCII
1510  static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
1511#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1512  static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
1513#else
1514#error "unknown host character set"
1515#endif
1516
1517  uchar c;
1518
1519  /* Record the location of the backslash.  */
1520  source_range char_range;
1521  if (loc_reader)
1522    char_range = loc_reader->get_next ();
1523
1524  c = *from;
1525  switch (c)
1526    {
1527      /* UCNs, hex escapes, and octal escapes are processed separately.  */
1528    case 'u': case 'U':
1529      return convert_ucn (pfile, from, limit, tbuf, cvt,
1530			  char_range, loc_reader, ranges);
1531
1532    case 'x':
1533      return convert_hex (pfile, from, limit, tbuf, cvt,
1534			  char_range, loc_reader, ranges);
1535
1536    case '0':  case '1':  case '2':  case '3':
1537    case '4':  case '5':  case '6':  case '7':
1538      return convert_oct (pfile, from, limit, tbuf, cvt,
1539			  char_range, loc_reader, ranges);
1540
1541      /* Various letter escapes.  Get the appropriate host-charset
1542	 value into C.  */
1543    case '\\': case '\'': case '"': case '?': break;
1544
1545    case '(': case '{': case '[': case '%':
1546      /* '\(', etc, can be used at the beginning of a line in a long
1547	 string split onto multiple lines with \-newline, to prevent
1548	 Emacs or other text editors from getting confused.  '\%' can
1549	 be used to prevent SCCS from mangling printf format strings.  */
1550      if (CPP_PEDANTIC (pfile))
1551	goto unknown;
1552      break;
1553
1554    case 'b': c = charconsts[1];  break;
1555    case 'f': c = charconsts[3];  break;
1556    case 'n': c = charconsts[4];  break;
1557    case 'r': c = charconsts[5];  break;
1558    case 't': c = charconsts[6];  break;
1559    case 'v': c = charconsts[7];  break;
1560
1561    case 'a':
1562      if (CPP_WTRADITIONAL (pfile))
1563	cpp_warning (pfile, CPP_W_TRADITIONAL,
1564		     "the meaning of '\\a' is different in traditional C");
1565      c = charconsts[0];
1566      break;
1567
1568    case 'e': case 'E':
1569      if (CPP_PEDANTIC (pfile))
1570	cpp_error (pfile, CPP_DL_PEDWARN,
1571		   "non-ISO-standard escape sequence, '\\%c'", (int) c);
1572      c = charconsts[2];
1573      break;
1574
1575    default:
1576    unknown:
1577      if (ISGRAPH (c))
1578	cpp_error (pfile, CPP_DL_PEDWARN,
1579		   "unknown escape sequence: '\\%c'", (int) c);
1580      else
1581	{
1582	  encoding_rich_location rich_loc (pfile);
1583
1584	  /* diagnostic.cc does not support "%03o".  When it does, this
1585	     code can use %03o directly in the diagnostic again.  */
1586	  char buf[32];
1587	  sprintf(buf, "%03o", (int) c);
1588	  cpp_error_at (pfile, CPP_DL_PEDWARN, &rich_loc,
1589			"unknown escape sequence: '\\%s'", buf);
1590	}
1591    }
1592
1593  if (tbuf)
1594    /* Now convert what we have to the execution character set.  */
1595    if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1596      cpp_errno (pfile, CPP_DL_ERROR,
1597		 "converting escape sequence to execution character set");
1598
1599  if (loc_reader)
1600    {
1601      char_range.m_finish = loc_reader->get_next ().m_finish;
1602      ranges->add_range (char_range);
1603    }
1604
1605  return from + 1;
1606}
1607
1608/* TYPE is a token type.  The return value is the conversion needed to
1609   convert from source to execution character set for the given type. */
1610static struct cset_converter
1611converter_for_type (cpp_reader *pfile, enum cpp_ttype type)
1612{
1613  switch (type)
1614    {
1615    default:
1616	return pfile->narrow_cset_desc;
1617    case CPP_UTF8CHAR:
1618    case CPP_UTF8STRING:
1619	return pfile->utf8_cset_desc;
1620    case CPP_CHAR16:
1621    case CPP_STRING16:
1622	return pfile->char16_cset_desc;
1623    case CPP_CHAR32:
1624    case CPP_STRING32:
1625	return pfile->char32_cset_desc;
1626    case CPP_WCHAR:
1627    case CPP_WSTRING:
1628	return pfile->wide_cset_desc;
1629    }
1630}
1631
1632/* FROM is an array of cpp_string structures of length COUNT.  These
1633   are to be converted from the source to the execution character set,
1634   escape sequences translated, and finally all are to be
1635   concatenated.  WIDE indicates whether or not to produce a wide
1636   string.  If TO is non-NULL, the result is written into TO.
1637   If LOC_READERS and OUT are non-NULL, then location information
1638   is read from LOC_READERS (which must be an array of length COUNT),
1639   and location information is written to *RANGES.
1640
1641   Returns true for success, false for failure.  */
1642
1643static bool
1644cpp_interpret_string_1 (cpp_reader *pfile, const cpp_string *from, size_t count,
1645			cpp_string *to,  enum cpp_ttype type,
1646			cpp_string_location_reader *loc_readers,
1647			cpp_substring_ranges *out)
1648{
1649  struct _cpp_strbuf tbuf;
1650  const uchar *p, *base, *limit;
1651  size_t i;
1652  struct cset_converter cvt = converter_for_type (pfile, type);
1653
1654  /* loc_readers and out must either be both NULL, or both be non-NULL.  */
1655  gcc_assert ((loc_readers != NULL) == (out != NULL));
1656
1657  if (to)
1658    {
1659      tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1660      tbuf.text = XNEWVEC (uchar, tbuf.asize);
1661      tbuf.len = 0;
1662    }
1663
1664  cpp_string_location_reader *loc_reader = NULL;
1665  for (i = 0; i < count; i++)
1666    {
1667      if (loc_readers)
1668	loc_reader = &loc_readers[i];
1669
1670      p = from[i].text;
1671      if (*p == 'u')
1672	{
1673	  p++;
1674	  if (loc_reader)
1675	    loc_reader->get_next ();
1676	  if (*p == '8')
1677	    {
1678	      p++;
1679	      if (loc_reader)
1680		loc_reader->get_next ();
1681	    }
1682	}
1683      else if (*p == 'L' || *p == 'U') p++;
1684      if (*p == 'R')
1685	{
1686	  const uchar *prefix;
1687
1688	  /* Skip over 'R"'.  */
1689	  p += 2;
1690	  if (loc_reader)
1691	    {
1692	      loc_reader->get_next ();
1693	      loc_reader->get_next ();
1694	    }
1695	  prefix = p;
1696	  while (*p != '(')
1697	    {
1698	      p++;
1699	      if (loc_reader)
1700		loc_reader->get_next ();
1701	    }
1702	  p++;
1703	  if (loc_reader)
1704	    loc_reader->get_next ();
1705	  limit = from[i].text + from[i].len;
1706	  if (limit >= p + (p - prefix) + 1)
1707	    limit -= (p - prefix) + 1;
1708
1709	  /* Raw strings are all normal characters; these can be fed
1710	     directly to convert_cset.  */
1711	  if (to)
1712	    if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf))
1713	      goto fail;
1714
1715	  if (loc_reader)
1716	    {
1717	      /* If generating source ranges, assume we have a 1:1
1718		 correspondence between bytes in the source encoding and bytes
1719		 in the execution encoding (e.g. if we have a UTF-8 to UTF-8
1720		 conversion), so that this run of bytes in the source file
1721		 corresponds to a run of bytes in the execution string.
1722		 This requirement is guaranteed by an early-reject in
1723		 cpp_interpret_string_ranges.  */
1724	      gcc_assert (cvt.func == convert_no_conversion);
1725	      out->add_n_ranges (limit - p, *loc_reader);
1726	    }
1727
1728	  continue;
1729	}
1730
1731      /* If we don't now have a leading quote, something has gone wrong.
1732	 This can occur if cpp_interpret_string_ranges is handling a
1733	 stringified macro argument, but should not be possible otherwise.  */
1734      if (*p != '"' && *p != '\'')
1735	{
1736	  gcc_assert (out != NULL);
1737	  cpp_error (pfile, CPP_DL_ERROR, "missing open quote");
1738	  if (to)
1739	    free (tbuf.text);
1740	  return false;
1741	}
1742
1743      /* Skip leading quote.  */
1744      p++;
1745      if (loc_reader)
1746	loc_reader->get_next ();
1747
1748      limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */
1749
1750      for (;;)
1751	{
1752	  base = p;
1753	  while (p < limit && *p != '\\')
1754	    p++;
1755	  if (p > base)
1756	    {
1757	      /* We have a run of normal characters; these can be fed
1758		 directly to convert_cset.  */
1759	      if (to)
1760		if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1761		  goto fail;
1762	    /* Similar to above: assumes we have a 1:1 correspondence
1763	       between bytes in the source encoding and bytes in the
1764	       execution encoding.  */
1765	      if (loc_reader)
1766		{
1767		  gcc_assert (cvt.func == convert_no_conversion);
1768		  out->add_n_ranges (p - base, *loc_reader);
1769		}
1770	    }
1771	  if (p >= limit)
1772	    break;
1773
1774	  struct _cpp_strbuf *tbuf_ptr = to ? &tbuf : NULL;
1775	  p = convert_escape (pfile, p + 1, limit, tbuf_ptr, cvt,
1776			      loc_reader, out);
1777	}
1778    }
1779
1780  if (to)
1781    {
1782      /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1783	 structure.  */
1784      emit_numeric_escape (pfile, 0, &tbuf, cvt);
1785      tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
1786      to->text = tbuf.text;
1787      to->len = tbuf.len;
1788    }
1789  /* Use the location of the trailing quote as the location of the
1790     NUL-terminator.  */
1791  if (loc_reader)
1792    {
1793      source_range range = loc_reader->get_next ();
1794      out->add_range (range);
1795    }
1796
1797  return true;
1798
1799 fail:
1800  cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1801  if (to)
1802    free (tbuf.text);
1803  return false;
1804}
1805
1806/* FROM is an array of cpp_string structures of length COUNT.  These
1807   are to be converted from the source to the execution character set,
1808   escape sequences translated, and finally all are to be
1809   concatenated.  WIDE indicates whether or not to produce a wide
1810   string.  The result is written into TO.  Returns true for success,
1811   false for failure.  */
1812bool
1813cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
1814		      cpp_string *to,  enum cpp_ttype type)
1815{
1816  return cpp_interpret_string_1 (pfile, from, count, to, type, NULL, NULL);
1817}
1818
1819/* A "do nothing" diagnostic-handling callback for use by
1820   cpp_interpret_string_ranges, so that it can temporarily suppress
1821   diagnostic-handling.  */
1822
1823static bool
1824noop_diagnostic_cb (cpp_reader *, enum cpp_diagnostic_level,
1825		    enum cpp_warning_reason, rich_location *,
1826		    const char *, va_list *)
1827{
1828  /* no-op.  */
1829  return true;
1830}
1831
1832/* This function mimics the behavior of cpp_interpret_string, but
1833   rather than generating a string in the execution character set,
1834   *OUT is written to with the source code ranges of the characters
1835   in such a string.
1836   FROM and LOC_READERS should both be arrays of length COUNT.
1837   Returns NULL for success, or an error message for failure.  */
1838
1839const char *
1840cpp_interpret_string_ranges (cpp_reader *pfile, const cpp_string *from,
1841			     cpp_string_location_reader *loc_readers,
1842			     size_t count,
1843			     cpp_substring_ranges *out,
1844			     enum cpp_ttype type)
1845{
1846  /* There are a couple of cases in the range-handling in
1847     cpp_interpret_string_1 that rely on there being a 1:1 correspondence
1848     between bytes in the source encoding and bytes in the execution
1849     encoding, so that each byte in the execution string can correspond
1850     to the location of a byte in the source string.
1851
1852     This holds for the typical case of a UTF-8 to UTF-8 conversion.
1853     Enforce this requirement by only attempting to track substring
1854     locations if we have source encoding == execution encoding.
1855
1856     This is a stronger condition than we need, since we could e.g.
1857     have ASCII to EBCDIC (with 1 byte per character before and after),
1858     but it seems to be a reasonable restriction.  */
1859  struct cset_converter cvt = converter_for_type (pfile, type);
1860  if (cvt.func != convert_no_conversion)
1861    return "execution character set != source character set";
1862
1863  /* For on-demand strings we have already lexed the strings, so there
1864     should be no diagnostics.  However, if we have bogus source location
1865     data (or stringified macro arguments), the attempt to lex the
1866     strings could fail with an diagnostic.  Temporarily install an
1867     diagnostic-handler to catch the diagnostic, so that it can lead to this call
1868     failing, rather than being emitted as a user-visible diagnostic.
1869     If an diagnostic does occur, we should see it via the return value of
1870     cpp_interpret_string_1.  */
1871  bool (*saved_diagnostic_handler) (cpp_reader *, enum cpp_diagnostic_level,
1872				    enum cpp_warning_reason, rich_location *,
1873				    const char *, va_list *)
1874    ATTRIBUTE_FPTR_PRINTF(5,0);
1875
1876  saved_diagnostic_handler = pfile->cb.diagnostic;
1877  pfile->cb.diagnostic = noop_diagnostic_cb;
1878
1879  bool result = cpp_interpret_string_1 (pfile, from, count, NULL, type,
1880					loc_readers, out);
1881
1882  /* Restore the saved diagnostic-handler.  */
1883  pfile->cb.diagnostic = saved_diagnostic_handler;
1884
1885  if (!result)
1886    return "cpp_interpret_string_1 failed";
1887
1888  /* Success.  */
1889  return NULL;
1890}
1891
1892/* Subroutine of do_line and do_linemarker.  Convert escape sequences
1893   in a string, but do not perform character set conversion.  */
1894bool
1895cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
1896				  size_t count,	cpp_string *to,
1897				  enum cpp_ttype type ATTRIBUTE_UNUSED)
1898{
1899  struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
1900  bool retval;
1901
1902  pfile->narrow_cset_desc.func = convert_no_conversion;
1903  pfile->narrow_cset_desc.cd = (iconv_t) -1;
1904  pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
1905
1906  retval = cpp_interpret_string (pfile, from, count, to, CPP_STRING);
1907
1908  pfile->narrow_cset_desc = save_narrow_cset_desc;
1909  return retval;
1910}
1911
1912
1913/* Subroutine of cpp_interpret_charconst which performs the conversion
1914   to a number, for narrow strings.  STR is the string structure returned
1915   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1916   cpp_interpret_charconst.  TYPE is the token type.  */
1917static cppchar_t
1918narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
1919			 unsigned int *pchars_seen, int *unsignedp,
1920			 enum cpp_ttype type)
1921{
1922  size_t width = CPP_OPTION (pfile, char_precision);
1923  size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
1924  size_t mask = width_to_mask (width);
1925  size_t i;
1926  cppchar_t result, c;
1927  bool unsigned_p;
1928
1929  /* The value of a multi-character character constant, or a
1930     single-character character constant whose representation in the
1931     execution character set is more than one byte long, is
1932     implementation defined.  This implementation defines it to be the
1933     number formed by interpreting the byte sequence in memory as a
1934     big-endian binary number.  If overflow occurs, the high bytes are
1935     lost, and a warning is issued.
1936
1937     We don't want to process the NUL terminator handed back by
1938     cpp_interpret_string.  */
1939  result = 0;
1940  for (i = 0; i < str.len - 1; i++)
1941    {
1942      c = str.text[i] & mask;
1943      if (width < BITS_PER_CPPCHAR_T)
1944	result = (result << width) | c;
1945      else
1946	result = c;
1947    }
1948
1949  if (type == CPP_UTF8CHAR)
1950    max_chars = 1;
1951  if (i > max_chars)
1952    {
1953      i = max_chars;
1954      cpp_error (pfile, type == CPP_UTF8CHAR ? CPP_DL_ERROR : CPP_DL_WARNING,
1955		 "character constant too long for its type");
1956    }
1957  else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
1958    cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant");
1959
1960  /* Multichar constants are of type int and therefore signed.  */
1961  if (i > 1)
1962    unsigned_p = 0;
1963  else if (type == CPP_UTF8CHAR && !CPP_OPTION (pfile, cplusplus))
1964    unsigned_p = 1;
1965  else
1966    unsigned_p = CPP_OPTION (pfile, unsigned_char);
1967
1968  /* Truncate the constant to its natural width, and simultaneously
1969     sign- or zero-extend to the full width of cppchar_t.
1970     For single-character constants, the value is WIDTH bits wide.
1971     For multi-character constants, the value is INT_PRECISION bits wide.  */
1972  if (i > 1)
1973    width = CPP_OPTION (pfile, int_precision);
1974  if (width < BITS_PER_CPPCHAR_T)
1975    {
1976      mask = ((cppchar_t) 1 << width) - 1;
1977      if (unsigned_p || !(result & (1 << (width - 1))))
1978	result &= mask;
1979      else
1980	result |= ~mask;
1981    }
1982  *pchars_seen = i;
1983  *unsignedp = unsigned_p;
1984  return result;
1985}
1986
1987/* Subroutine of cpp_interpret_charconst which performs the conversion
1988   to a number, for wide strings.  STR is the string structure returned
1989   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1990   cpp_interpret_charconst.  TYPE is the token type.  */
1991static cppchar_t
1992wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
1993		       unsigned int *pchars_seen, int *unsignedp,
1994		       enum cpp_ttype type)
1995{
1996  bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1997  size_t width = converter_for_type (pfile, type).width;
1998  size_t cwidth = CPP_OPTION (pfile, char_precision);
1999  size_t mask = width_to_mask (width);
2000  size_t cmask = width_to_mask (cwidth);
2001  size_t nbwc = width / cwidth;
2002  size_t off, i;
2003  cppchar_t result = 0, c;
2004
2005  if (str.len <= nbwc)
2006    {
2007      /* Error recovery, if no errors have been diagnosed previously,
2008	 there should be at least two wide characters.  Empty literals
2009	 are diagnosed earlier and we can get just the zero terminator
2010	 only if there were errors diagnosed during conversion.  */
2011      *pchars_seen = 0;
2012      *unsignedp = 0;
2013      return 0;
2014    }
2015
2016  /* This is finicky because the string is in the target's byte order,
2017     which may not be our byte order.  Only the last character, ignoring
2018     the NUL terminator, is relevant.  */
2019  off = str.len - (nbwc * 2);
2020  result = 0;
2021  for (i = 0; i < nbwc; i++)
2022    {
2023      c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
2024      result = (result << cwidth) | (c & cmask);
2025    }
2026
2027  /* Wide character constants have type wchar_t, and a single
2028     character exactly fills a wchar_t, so a multi-character wide
2029     character constant is guaranteed to overflow.  */
2030  if (str.len > nbwc * 2)
2031    cpp_error (pfile, (CPP_OPTION (pfile, cplusplus)
2032		       && (type == CPP_CHAR16 || type == CPP_CHAR32))
2033		      ? CPP_DL_ERROR : CPP_DL_WARNING,
2034	       "character constant too long for its type");
2035
2036  /* Truncate the constant to its natural width, and simultaneously
2037     sign- or zero-extend to the full width of cppchar_t.  */
2038  if (width < BITS_PER_CPPCHAR_T)
2039    {
2040      if (type == CPP_CHAR16 || type == CPP_CHAR32
2041	  || CPP_OPTION (pfile, unsigned_wchar)
2042	  || !(result & (1 << (width - 1))))
2043	result &= mask;
2044      else
2045	result |= ~mask;
2046    }
2047
2048  if (type == CPP_CHAR16 || type == CPP_CHAR32
2049      || CPP_OPTION (pfile, unsigned_wchar))
2050    *unsignedp = 1;
2051  else
2052    *unsignedp = 0;
2053
2054  *pchars_seen = 1;
2055  return result;
2056}
2057
2058/* Interpret a (possibly wide) character constant in TOKEN.
2059   PCHARS_SEEN points to a variable that is filled in with the number
2060   of characters seen, and UNSIGNEDP to a variable that indicates
2061   whether the result has signed type.  */
2062cppchar_t
2063cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
2064			 unsigned int *pchars_seen, int *unsignedp)
2065{
2066  cpp_string str = { 0, 0 };
2067  bool wide = (token->type != CPP_CHAR && token->type != CPP_UTF8CHAR);
2068  int u8 = 2 * int(token->type == CPP_UTF8CHAR);
2069  cppchar_t result;
2070
2071  /* An empty constant will appear as L'', u'', U'', u8'', or '' */
2072  if (token->val.str.len == (size_t) (2 + wide + u8))
2073    {
2074      cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
2075      *pchars_seen = 0;
2076      *unsignedp = 0;
2077      return 0;
2078    }
2079  else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str,
2080				  token->type))
2081    {
2082      *pchars_seen = 0;
2083      *unsignedp = 0;
2084      return 0;
2085    }
2086
2087  if (wide)
2088    result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
2089				    token->type);
2090  else
2091    result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp,
2092				      token->type);
2093
2094  if (str.text != token->val.str.text)
2095    free ((void *)str.text);
2096
2097  return result;
2098}
2099
2100/* Convert an identifier denoted by ID and LEN, which might contain
2101   UCN escapes or UTF-8 multibyte chars, to the source character set,
2102   either UTF-8 or UTF-EBCDIC.  Assumes that the identifier is actually
2103   a valid identifier.  */
2104cpp_hashnode *
2105_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
2106{
2107  /* It turns out that a UCN escape always turns into fewer characters
2108     than the escape itself, so we can allocate a temporary in advance.  */
2109  uchar * buf = (uchar *) alloca (len + 1);
2110  uchar * bufp = buf;
2111  size_t idp;
2112
2113  for (idp = 0; idp < len; idp++)
2114    if (id[idp] != '\\')
2115      *bufp++ = id[idp];
2116    else
2117      {
2118	unsigned length = id[idp+1] == 'u' ? 4 : 8;
2119	cppchar_t value = 0;
2120	size_t bufleft = len - (bufp - buf);
2121	int rval;
2122
2123	idp += 2;
2124	while (length && idp < len && ISXDIGIT (id[idp]))
2125	  {
2126	    value = (value << 4) + hex_value (id[idp]);
2127	    idp++;
2128	    length--;
2129	  }
2130	idp--;
2131
2132	/* Special case for EBCDIC: if the identifier contains
2133	   a '$' specified using a UCN, translate it to EBCDIC.  */
2134	if (value == 0x24)
2135	  {
2136	    *bufp++ = '$';
2137	    continue;
2138	  }
2139
2140	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
2141	if (rval)
2142	  {
2143	    errno = rval;
2144	    cpp_errno (pfile, CPP_DL_ERROR,
2145		       "converting UCN to source character set");
2146	    break;
2147	  }
2148      }
2149
2150  return CPP_HASHNODE (ht_lookup (pfile->hash_table,
2151				  buf, bufp - buf, HT_ALLOC));
2152}
2153
2154
/* Utility to detect a UTF-8 byte order mark at the beginning of the
   DATA_LENGTH bytes at DATA.  Returns the number of bytes to skip,
   which currently will be either 0 or 3.  The check is only performed
   when the host character set is ASCII; otherwise 0 is returned.  */
int
cpp_check_utf8_bom (const char *data, size_t data_length)
{
#if HOST_CHARSET == HOST_CHARSET_ASCII
  /* The UTF-8 encoding of U+FEFF.  */
  static const unsigned char bom[] = { 0xef, 0xbb, 0xbf };

  if (data_length < sizeof bom)
    return 0;

  for (size_t i = 0; i < sizeof bom; i++)
    if ((unsigned char) data[i] != bom[i])
      return 0;

  return (int) sizeof bom;
#else
  return 0;
#endif
}
2171
2172
2173/* Convert an input buffer (containing the complete contents of one
2174   source file) from INPUT_CHARSET to the source character set.  INPUT
2175   points to the input buffer, SIZE is its allocated size, and LEN is
2176   the length of the meaningful data within the buffer.  The
2177   translated buffer is returned, *ST_SIZE is set to the length of
2178   the meaningful data within the translated buffer, and *BUFFER_START
2179   is set to the start of the returned buffer.  *BUFFER_START may
2180   differ from the return value in the case of a BOM or other ignored
2181   marker information.
2182
2183   INPUT is expected to have been allocated with xmalloc.  This
2184   function will either set *BUFFER_START to INPUT, or free it and set
2185   *BUFFER_START to a pointer to another xmalloc-allocated block of
2186   memory.
2187
2188   PFILE is only used to generate diagnostics; setting it to NULL suppresses
2189   diagnostics, and causes a return of NULL if there was any error instead.  */
2190
2191uchar *
2192_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
2193		    uchar *input, size_t size, size_t len,
2194		    const unsigned char **buffer_start, off_t *st_size)
2195{
2196  struct cset_converter input_cset;
2197  struct _cpp_strbuf to;
2198  unsigned char *buffer;
2199
2200  input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
2201  if (input_cset.func == convert_no_conversion)
2202    {
2203      to.text = input;
2204      to.asize = size;
2205      to.len = len;
2206    }
2207  else
2208    {
2209      to.asize = MAX (65536, len);
2210      to.text = XNEWVEC (uchar, to.asize);
2211      to.len = 0;
2212
2213      const bool ok = APPLY_CONVERSION (input_cset, input, len, &to);
2214      free (input);
2215
2216      /* Clean up the mess.  */
2217      if (input_cset.func == convert_using_iconv)
2218	iconv_close (input_cset.cd);
2219
2220      /* Handle conversion failure.  */
2221      if (!ok)
2222	{
2223	  if (!pfile)
2224	    {
2225	      XDELETEVEC (to.text);
2226	      *buffer_start = NULL;
2227	      *st_size = 0;
2228	      return NULL;
2229	    }
2230	  cpp_error (pfile, CPP_DL_ERROR, "failure to convert %s to %s",
2231		     input_charset, SOURCE_CHARSET);
2232	}
2233    }
2234
2235  /* Resize buffer if we allocated substantially too much, or if we
2236     haven't enough space for the \n-terminator or following
2237     15 bytes of padding (used to quiet warnings from valgrind or
2238     Address Sanitizer, when the optimized lexer accesses aligned
2239     16-byte memory chunks, including the bytes after the malloced,
2240     area, and stops lexing on '\n').  */
2241  if (to.len + 4096 < to.asize || to.len + 16 > to.asize)
2242    to.text = XRESIZEVEC (uchar, to.text, to.len + 16);
2243
2244  memset (to.text + to.len, '\0', 16);
2245
2246  /* If the file is using old-school Mac line endings (\r only),
2247     terminate with another \r, not an \n, so that we do not mistake
2248     the \r\n sequence for a single DOS line ending and erroneously
2249     issue the "No newline at end of file" diagnostic.  */
2250  if (to.len && to.text[to.len - 1] == '\r')
2251    to.text[to.len] = '\r';
2252  else
2253    to.text[to.len] = '\n';
2254
2255  buffer = to.text;
2256  *st_size = to.len;
2257
2258  /* Ignore a UTF-8 BOM if we see one and the source charset is UTF-8.  Note
2259     that glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
2260     BOM -- however, even if it did, we would still need this code due
2261     to the 'convert_no_conversion' case.  */
2262  const int bom_len = cpp_check_utf8_bom ((const char *) to.text, to.len);
2263  *st_size -= bom_len;
2264  buffer += bom_len;
2265
2266  *buffer_start = to.text;
2267  return buffer;
2268}
2269
2270/* Decide on the default encoding to assume for input files.  */
2271const char *
2272_cpp_default_encoding (void)
2273{
2274  const char *current_encoding = NULL;
2275
2276  /* We disable this because the default codeset is 7-bit ASCII on
2277     most platforms, and this causes conversion failures on every
2278     file in GCC that happens to have one of the upper 128 characters
2279     in it -- most likely, as part of the name of a contributor.
2280     We should definitely recognize in-band markers of file encoding,
2281     like:
2282     - the appropriate Unicode byte-order mark (FE FF) to recognize
2283       UTF16 and UCS4 (in both big-endian and little-endian flavors)
2284       and UTF8
2285     - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
2286       distinguish ASCII and EBCDIC.
2287     - now we can parse something like "#pragma GCC encoding <xyz>
2288       on the first line, or even Emacs/VIM's mode line tags (there's
2289       a problem here in that VIM uses the last line, and Emacs has
2290       its more elaborate "local variables" convention).
2291     - investigate whether Java has another common convention, which
2292       would be friendly to support.
2293     (Zack Weinberg and Paolo Bonzini, May 20th 2004)  */
2294#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
2295  setlocale (LC_CTYPE, "");
2296  current_encoding = nl_langinfo (CODESET);
2297#endif
2298  if (current_encoding == NULL || *current_encoding == '\0')
2299    current_encoding = SOURCE_CHARSET;
2300
2301  return current_encoding;
2302}
2303
2304/* Check if the configured input charset requires no conversion, other than
2305   possibly stripping a UTF-8 BOM.  */
2306bool cpp_input_conversion_is_trivial (const char *input_charset)
2307{
2308  return !strcasecmp (input_charset, SOURCE_CHARSET);
2309}
2310
2311/* Implementation of class cpp_string_location_reader.  */
2312
2313/* Constructor for cpp_string_location_reader.  */
2314
2315cpp_string_location_reader::
2316cpp_string_location_reader (location_t src_loc,
2317			    line_maps *line_table)
2318{
2319  src_loc = get_range_from_loc (line_table, src_loc).m_start;
2320
2321  /* SRC_LOC might be a macro location.  It only makes sense to do
2322     column-by-column calculations on ordinary maps, so get the
2323     corresponding location in an ordinary map.  */
2324  m_loc
2325    = linemap_resolve_location (line_table, src_loc,
2326				LRK_SPELLING_LOCATION, NULL);
2327
2328  const line_map_ordinary *map
2329    = linemap_check_ordinary (linemap_lookup (line_table, m_loc));
2330  m_offset_per_column = (1 << map->m_range_bits);
2331}
2332
2333/* Get the range of the next source byte.  */
2334
2335source_range
2336cpp_string_location_reader::get_next ()
2337{
2338  source_range result;
2339  result.m_start = m_loc;
2340  result.m_finish = m_loc;
2341  if (m_loc <= LINE_MAP_MAX_LOCATION_WITH_COLS)
2342    m_loc += m_offset_per_column;
2343  return result;
2344}
2345
2346cpp_display_width_computation::
2347cpp_display_width_computation (const char *data, int data_length,
2348			       const cpp_char_column_policy &policy) :
2349  m_begin (data),
2350  m_next (m_begin),
2351  m_bytes_left (data_length),
2352  m_policy (policy),
2353  m_display_cols (0)
2354{
2355  gcc_assert (policy.m_tabstop > 0);
2356  gcc_assert (policy.m_width_cb);
2357}
2358
2359
2360/* The main implementation function for class cpp_display_width_computation.
2361   m_next points on entry to the start of the UTF-8 encoding of the next
2362   character, and is updated to point just after the last byte of the encoding.
2363   m_bytes_left contains on entry the remaining size of the buffer into which
2364   m_next points, and this is also updated accordingly.  If m_next does not
2365   point to a valid UTF-8-encoded sequence, then it will be treated as a single
2366   byte with display width 1.  m_cur_display_col is the current display column,
2367   relative to which tab stops should be expanded.  Returns the display width of
2368   the codepoint just processed.
2369   If OUT is non-NULL, it is populated.  */
2370
2371int
2372cpp_display_width_computation::process_next_codepoint (cpp_decoded_char *out)
2373{
2374  cppchar_t c;
2375  int next_width;
2376
2377  if (out)
2378    out->m_start_byte = m_next;
2379
2380  if (*m_next == '\t')
2381    {
2382      ++m_next;
2383      --m_bytes_left;
2384      next_width = m_policy.m_tabstop - (m_display_cols % m_policy.m_tabstop);
2385      if (out)
2386	{
2387	  out->m_ch = '\t';
2388	  out->m_valid_ch = true;
2389	}
2390    }
2391  else if (one_utf8_to_cppchar ((const uchar **) &m_next, &m_bytes_left, &c)
2392	   != 0)
2393    {
2394      /* Input is not convertible to UTF-8.  This could be fine, e.g. in a
2395	 string literal, so don't complain.  Just treat it as if it has a width
2396	 of one.  */
2397      ++m_next;
2398      --m_bytes_left;
2399      next_width = m_policy.m_undecoded_byte_width;
2400      if (out)
2401	out->m_valid_ch = false;
2402    }
2403  else
2404    {
2405      /*  one_utf8_to_cppchar() has updated m_next and m_bytes_left for us.  */
2406      next_width = m_policy.m_width_cb (c);
2407      if (out)
2408	{
2409	  out->m_ch = c;
2410	  out->m_valid_ch = true;
2411	}
2412    }
2413
2414  if (out)
2415    out->m_next_byte = m_next;
2416
2417  m_display_cols += next_width;
2418  return next_width;
2419}
2420
2421/*  Utility to advance the byte stream by the minimum amount needed to consume
2422    N display columns.  Returns the number of display columns that were
2423    actually skipped.  This could be less than N, if there was not enough data,
2424    or more than N, if the last character to be skipped had a sufficiently large
2425    display width.  */
2426int
2427cpp_display_width_computation::advance_display_cols (int n)
2428{
2429  const int start = m_display_cols;
2430  const int target = start + n;
2431  while (m_display_cols < target && !done ())
2432    process_next_codepoint (NULL);
2433  return m_display_cols - start;
2434}
2435
2436/*  For the string of length DATA_LENGTH bytes that begins at DATA, compute
2437    how many display columns are occupied by the first COLUMN bytes.  COLUMN
2438    may exceed DATA_LENGTH, in which case the phantom bytes at the end are
2439    treated as if they have display width 1.  Tabs are expanded to the next tab
2440    stop, relative to the start of DATA, and non-printable-ASCII characters
2441    will be escaped as per POLICY.  */
2442
2443int
2444cpp_byte_column_to_display_column (const char *data, int data_length,
2445				   int column,
2446				   const cpp_char_column_policy &policy)
2447{
2448  const int offset = MAX (0, column - data_length);
2449  cpp_display_width_computation dw (data, column - offset, policy);
2450  while (!dw.done ())
2451    dw.process_next_codepoint (NULL);
2452  return dw.display_cols_processed () + offset;
2453}
2454
2455/*  For the string of length DATA_LENGTH bytes that begins at DATA, compute
2456    the least number of bytes that will result in at least DISPLAY_COL display
2457    columns.  The return value may exceed DATA_LENGTH if the entire string does
2458    not occupy enough display columns.  Non-printable-ASCII characters
2459    will be escaped as per POLICY.  */
2460
2461int
2462cpp_display_column_to_byte_column (const char *data, int data_length,
2463				   int display_col,
2464				   const cpp_char_column_policy &policy)
2465{
2466  cpp_display_width_computation dw (data, data_length, policy);
2467  const int avail_display = dw.advance_display_cols (display_col);
2468  return dw.bytes_processed () + MAX (0, display_col - avail_display);
2469}
2470
2471/* Our own version of wcwidth().  We don't use the actual wcwidth() in glibc,
2472   because that will inspect the user's locale, and in particular in an ASCII
2473   locale, it will not return anything useful for extended characters.  But GCC
2474   in other respects (see e.g. _cpp_default_encoding()) behaves as if
2475   everything is UTF-8.  We also make some tweaks that are useful for the way
2476   GCC needs to use this data, e.g. tabs and other control characters should be
2477   treated as having width 1.  The lookup tables are generated from
2478   contrib/unicode/gen_wcwidth.py and were made by simply calling glibc
2479   wcwidth() on all codepoints, then applying the small tweaks.  These tables
2480   are not highly optimized, but for the present purpose of outputting
2481   diagnostics, they are sufficient.  */
2482
2483#include "generated_cpp_wcwidth.h"
2484int cpp_wcwidth (cppchar_t c)
2485{
2486  if (__builtin_expect (c <= wcwidth_range_ends[0], true))
2487    return wcwidth_widths[0];
2488
2489  /* Binary search the tables.  */
2490  int begin = 1;
2491  static const int end
2492      = sizeof wcwidth_range_ends / sizeof (*wcwidth_range_ends);
2493  int len = end - begin;
2494  do
2495    {
2496      int half = len/2;
2497      int middle = begin + half;
2498      if (c > wcwidth_range_ends[middle])
2499	{
2500	  begin = middle + 1;
2501	  len -= half + 1;
2502	}
2503      else
2504	len = half;
2505    } while (len);
2506
2507  if (__builtin_expect (begin != end, true))
2508    return wcwidth_widths[begin];
2509  return 1;
2510}
2511