charset.c revision 259891
1219820Sjeff/* CPP Library - charsets
2219820Sjeff   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
3272407Shselasky   Free Software Foundation, Inc.
4219820Sjeff
5219820Sjeff   Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
6219820Sjeff
7219820SjeffThis program is free software; you can redistribute it and/or modify it
8219820Sjeffunder the terms of the GNU General Public License as published by the
9219820SjeffFree Software Foundation; either version 2, or (at your option) any
10219820Sjefflater version.
11219820Sjeff
12219820SjeffThis program is distributed in the hope that it will be useful,
13219820Sjeffbut WITHOUT ANY WARRANTY; without even the implied warranty of
14219820SjeffMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15219820SjeffGNU General Public License for more details.
16219820Sjeff
17219820SjeffYou should have received a copy of the GNU General Public License
18219820Sjeffalong with this program; if not, write to the Free Software
19219820SjeffFoundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
20219820Sjeff
21219820Sjeff#include "config.h"
22219820Sjeff#include "system.h"
23219820Sjeff#include "cpplib.h"
24219820Sjeff#include "internal.h"
25219820Sjeff
26219820Sjeff/* Character set handling for C-family languages.
27219820Sjeff
28219820Sjeff   Terminological note: In what follows, "charset" or "character set"
29219820Sjeff   will be taken to mean both an abstract set of characters and an
30219820Sjeff   encoding for that set.
31219820Sjeff
32219820Sjeff   The C99 standard discusses two character sets: source and execution.
33219820Sjeff   The source character set is used for internal processing in translation
34219820Sjeff   phases 1 through 4; the execution character set is used thereafter.
35272407Shselasky   Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
36219820Sjeff   character encodings (see 3.7.2, 3.7.3 for the standardese meanings
37219820Sjeff   of these terms).  Furthermore, the "basic character set" (listed in
38272407Shselasky   5.2.1p3) is to be encoded in each with values one byte wide, and is
39279731Shselasky   to appear in the initial shift state.
40219820Sjeff
41219820Sjeff   It is not explicitly mentioned, but there is also a "wide execution
42219820Sjeff   character set" used to encode wide character constants and wide
43255932Salfred   string literals; this is supposed to be the result of applying the
44219820Sjeff   standard library function mbstowcs() to an equivalent narrow string
45255932Salfred   (6.4.5p5).  However, the behavior of hexadecimal and octal
46255932Salfred   \-escapes is at odds with this; they are supposed to be translated
47255932Salfred   directly to wchar_t values (6.4.4.4p5,6).
48255932Salfred
49255932Salfred   The source character set is not necessarily the character set used
50255932Salfred   to encode physical source files on disk; translation phase 1 converts
51255932Salfred   from whatever that encoding is to the source character set.
52255932Salfred
53255932Salfred   The presence of universal character names in C99 (6.4.3 et seq.)
54255932Salfred   forces the source character set to be isomorphic to ISO 10646,
55255932Salfred   that is, Unicode.  There is no such constraint on the execution
56255932Salfred   character set; note also that the conversion from source to
57255932Salfred   execution character set does not occur for identifiers (5.1.1.2p1#5).
58255932Salfred
59255932Salfred   For convenience of implementation, the source character set's
60255932Salfred   encoding of the basic character set should be identical to the
61255932Salfred   execution character set OF THE HOST SYSTEM's encoding of the basic
62255932Salfred   character set, and it should not be a state-dependent encoding.
63255932Salfred
64255932Salfred   cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
65255932Salfred   depending on whether the host is based on ASCII or EBCDIC (see
66255932Salfred   respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
67255932Salfred   Technical Report #16).  With limited exceptions, it relies on the
68255932Salfred   system library's iconv() primitive to do charset conversion
69255932Salfred   (specified in SUSv2).  */
70255932Salfred
#if !HAVE_ICONV
/* Without a system iconv, stub out iconv(), iconv_open() and
   iconv_close().  Their uses below are guarded only by if statements
   whose conditions are compile-time constants, so the calls are still
   compiled and must not cause link errors.  The two stubs that yield
   a value report failure with errno set to EINVAL.  */
#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
#define iconv_close(x)   ((void)0)
#define ICONV_CONST
#endif

/* Select the name of the source character set from the host's basic
   character set.  LAST_POSSIBLY_BASIC_SOURCE_CHAR is, going by its
   name, the highest byte value that can belong to the basic source
   character set in that encoding.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
#error "Unrecognized basic host character set"
#endif

/* Hosts that do not define EILSEQ get EINVAL in its place.  */
#ifndef EILSEQ
#define EILSEQ EINVAL
#endif
94219820Sjeff
95219820Sjeff/* This structure is used for a resizable string buffer throughout.  */
96255932Salfred/* Don't call it strbuf, as that conflicts with unistd.h on systems
97255932Salfred   such as DYNIX/ptx where unistd.h includes stropts.h.  */
98219820Sjeffstruct _cpp_strbuf
99255932Salfred{
100255932Salfred  uchar *text;
101255932Salfred  size_t asize;
102255932Salfred  size_t len;
103255932Salfred};
104255932Salfred
/* Amount by which an output buffer grows each time a converter runs
   out of room.  256 bytes hold any string that fits on a single
   80-column line even if conversion quadruples its size (e.g. ASCII
   to UTF-32), rounded up to a power of two.  */
#define OUTBUF_BLOCK_SIZE 256
109255932Salfred
/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
   logic.  This is because a depressing number of systems lack iconv,
   or have iconv libraries that do not do these conversions, so
   we need a fallback implementation for them.  To ensure the fallback
   doesn't break due to neglect, it is used on all systems.
115255932Salfred
116219820Sjeff   UTF-32 encoding is nice and simple: a four-byte binary number,
117219820Sjeff   constrained to the range 00000000-7FFFFFFF to avoid questions of
118219820Sjeff   signedness.  We do have to cope with big- and little-endian
119219820Sjeff   variants.
120219820Sjeff
121219820Sjeff   UTF-16 encoding uses two-byte binary numbers, again in big- and
122219820Sjeff   little-endian variants, for all values in the 00000000-0000FFFF
123255932Salfred   range.  Values in the 00010000-0010FFFF range are encoded as pairs
124255932Salfred   of two-byte numbers, called "surrogate pairs": given a number S in
125255932Salfred   this range, it is mapped to a pair (H, L) as follows:
126255932Salfred
127272407Shselasky     H = (S - 0x10000) / 0x400 + 0xD800
128255932Salfred     L = (S - 0x10000) % 0x400 + 0xDC00
129255932Salfred
130272407Shselasky   Two-byte values in the D800...DFFF range are ill-formed except as a
131272407Shselasky   component of a surrogate pair.  Even if the encoding within a
132272407Shselasky   two-byte value is little-endian, the H member of the surrogate pair
133272407Shselasky   comes first.
134272407Shselasky
135255932Salfred   There is no way to encode values in the 00110000-7FFFFFFF range,
136255932Salfred   which is not currently a problem as there are no assigned code
137255932Salfred   points in that range; however, the author expects that it will
138255932Salfred   eventually become necessary to abandon UTF-16 due to this
139255932Salfred   limitation.  Note also that, because of these pairs, UTF-16 does
140255932Salfred   not meet the requirements of the C standard for a wide character
141255932Salfred   encoding (see 3.7.3 and 6.4.4.4p11).
142255932Salfred
143219820Sjeff   UTF-8 encoding looks like this:
144255932Salfred
145255932Salfred   value range	       encoded as
146255932Salfred   00000000-0000007F   0xxxxxxx
147255932Salfred   00000080-000007FF   110xxxxx 10xxxxxx
148255932Salfred   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
149255932Salfred   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
150255932Salfred   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
151255932Salfred   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
152255932Salfred
153255932Salfred   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
154255932Salfred   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
155255932Salfred   never occur.  Note also that any value that can be encoded by a
156255932Salfred   given row of the table can also be encoded by all successive rows,
157255932Salfred   but this is not done; only the shortest possible encoding for any
158255932Salfred   given value is valid.  For instance, the character 07C0 could be
159255932Salfred   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
160255932Salfred   FC 80 80 80 9F 80.  Only the first is valid.
161272407Shselasky
162272407Shselasky   An implementation note: the transformation from UTF-16 to UTF-8, or
163272407Shselasky   vice versa, is easiest done by using UTF-32 as an intermediary.  */
164255932Salfred
/* Internal primitives which go from a UTF-8 byte stream to native-endian
   UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
   operation in several places below.  */
168255932Salfredstatic inline int
169255932Salfredone_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
170255932Salfred		     cppchar_t *cp)
171255932Salfred{
172255932Salfred  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
173255932Salfred  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
174255932Salfred
175255932Salfred  cppchar_t c;
176255932Salfred  const uchar *inbuf = *inbufp;
177255932Salfred  size_t nbytes, i;
178255932Salfred
179255932Salfred  if (*inbytesleftp < 1)
180255932Salfred    return EINVAL;
181255932Salfred
182255932Salfred  c = *inbuf;
183255932Salfred  if (c < 0x80)
184255932Salfred    {
185255932Salfred      *cp = c;
186255932Salfred      *inbytesleftp -= 1;
187255932Salfred      *inbufp += 1;
188255932Salfred      return 0;
189255932Salfred    }
190255932Salfred
191255932Salfred  /* The number of leading 1-bits in the first byte indicates how many
192255932Salfred     bytes follow.  */
193255932Salfred  for (nbytes = 2; nbytes < 7; nbytes++)
194255932Salfred    if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
195255932Salfred      goto found;
196255932Salfred  return EILSEQ;
197255932Salfred found:
198255932Salfred
199255932Salfred  if (*inbytesleftp < nbytes)
200255932Salfred    return EINVAL;
201255932Salfred
202255932Salfred  c = (c & masks[nbytes-1]);
203255932Salfred  inbuf++;
204255932Salfred  for (i = 1; i < nbytes; i++)
205255932Salfred    {
206255932Salfred      cppchar_t n = *inbuf++;
207255932Salfred      if ((n & 0xC0) != 0x80)
208255932Salfred	return EILSEQ;
209255932Salfred      c = ((c << 6) + (n & 0x3F));
210255932Salfred    }
211255932Salfred
212255932Salfred  /* Make sure the shortest possible encoding was used.  */
213255932Salfred  if (c <=      0x7F && nbytes > 1) return EILSEQ;
214255932Salfred  if (c <=     0x7FF && nbytes > 2) return EILSEQ;
215255932Salfred  if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
216255932Salfred  if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
217255932Salfred  if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
218255932Salfred
219255932Salfred  /* Make sure the character is valid.  */
220255932Salfred  if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
221255932Salfred
222255932Salfred  *cp = c;
223255932Salfred  *inbufp = inbuf;
224255932Salfred  *inbytesleftp -= nbytes;
225255932Salfred  return 0;
226255932Salfred}
227255932Salfred
228255932Salfredstatic inline int
229255932Salfredone_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
230255932Salfred{
231255932Salfred  static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
232255932Salfred  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
233255932Salfred  size_t nbytes;
234255932Salfred  uchar buf[6], *p = &buf[6];
235255932Salfred  uchar *outbuf = *outbufp;
236255932Salfred
237255932Salfred  nbytes = 1;
238255932Salfred  if (c < 0x80)
239255932Salfred    *--p = c;
240255932Salfred  else
241255932Salfred    {
242255932Salfred      do
243255932Salfred	{
244255932Salfred	  *--p = ((c & 0x3F) | 0x80);
245255932Salfred	  c >>= 6;
246255932Salfred	  nbytes++;
247255932Salfred	}
248272407Shselasky      while (c >= 0x3F || (c & limits[nbytes-1]));
249272407Shselasky      *--p = (c | masks[nbytes-1]);
250272407Shselasky    }
251255932Salfred
252255932Salfred  if (*outbytesleftp < nbytes)
253255932Salfred    return E2BIG;
254255932Salfred
255255932Salfred  while (p < &buf[6])
256255932Salfred    *outbuf++ = *p++;
257255932Salfred  *outbytesleftp -= nbytes;
258255932Salfred  *outbufp = outbuf;
259255932Salfred  return 0;
260255932Salfred}
261255932Salfred
262255932Salfred/* The following four functions transform one character between the two
263255932Salfred   encodings named in the function name.  All have the signature
264255932Salfred   int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
265255932Salfred           uchar **outbufp, size_t *outbytesleftp)
266255932Salfred
267255932Salfred   BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
268255932Salfred   interpreted as a boolean indicating whether big-endian or
269255932Salfred   little-endian encoding is to be used for the member of the pair
270255932Salfred   that is not UTF-8.
271255932Salfred
272272407Shselasky   INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
273255932Salfred   do for iconv.
274255932Salfred
   The return value is either 0 for success, or an errno value for
   failure, which may be E2BIG (need more space), EILSEQ (ill-formed
   input sequence), or EINVAL (incomplete input sequence).  */
278255932Salfred
279255932Salfredstatic inline int
280255932Salfredone_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
281255932Salfred		   uchar **outbufp, size_t *outbytesleftp)
282255932Salfred{
283255932Salfred  uchar *outbuf;
284255932Salfred  cppchar_t s = 0;
285255932Salfred  int rval;
286255932Salfred
287255932Salfred  /* Check for space first, since we know exactly how much we need.  */
288255932Salfred  if (*outbytesleftp < 4)
289255932Salfred    return E2BIG;
290255932Salfred
291255932Salfred  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
292255932Salfred  if (rval)
293255932Salfred    return rval;
294255932Salfred
295255932Salfred  outbuf = *outbufp;
296272407Shselasky  outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
297272407Shselasky  outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
298272407Shselasky  outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
299255932Salfred  outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
300255932Salfred
301255932Salfred  *outbufp += 4;
302255932Salfred  *outbytesleftp -= 4;
303255932Salfred  return 0;
304255932Salfred}
305255932Salfred
306255932Salfredstatic inline int
307255932Salfredone_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
308255932Salfred		   uchar **outbufp, size_t *outbytesleftp)
309255932Salfred{
310255932Salfred  cppchar_t s;
311255932Salfred  int rval;
312255932Salfred  const uchar *inbuf;
313255932Salfred
314255932Salfred  if (*inbytesleftp < 4)
315255932Salfred    return EINVAL;
316255932Salfred
317255932Salfred  inbuf = *inbufp;
318255932Salfred
319255932Salfred  s  = inbuf[bigend ? 0 : 3] << 24;
320255932Salfred  s += inbuf[bigend ? 1 : 2] << 16;
321255932Salfred  s += inbuf[bigend ? 2 : 1] << 8;
322255932Salfred  s += inbuf[bigend ? 3 : 0];
323255932Salfred
324255932Salfred  if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
325255932Salfred    return EILSEQ;
326272407Shselasky
327272407Shselasky  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
328272407Shselasky  if (rval)
329272407Shselasky    return rval;
330272407Shselasky
331255932Salfred  *inbufp += 4;
332272407Shselasky  *inbytesleftp -= 4;
333255932Salfred  return 0;
334255932Salfred}
335255932Salfred
336255932Salfredstatic inline int
337272407Shselaskyone_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
338255932Salfred		   uchar **outbufp, size_t *outbytesleftp)
339255932Salfred{
340255932Salfred  int rval;
341272407Shselasky  cppchar_t s = 0;
342272407Shselasky  const uchar *save_inbuf = *inbufp;
343272407Shselasky  size_t save_inbytesleft = *inbytesleftp;
344255932Salfred  uchar *outbuf = *outbufp;
345255932Salfred
346255932Salfred  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
347255932Salfred  if (rval)
348255932Salfred    return rval;
349255932Salfred
350255932Salfred  if (s > 0x0010FFFF)
351255932Salfred    {
352255932Salfred      *inbufp = save_inbuf;
353272407Shselasky      *inbytesleftp = save_inbytesleft;
354272407Shselasky      return EILSEQ;
355272407Shselasky    }
356272407Shselasky
357272407Shselasky  if (s < 0xFFFF)
358272407Shselasky    {
359255932Salfred      if (*outbytesleftp < 2)
360255932Salfred	{
361255932Salfred	  *inbufp = save_inbuf;
362255932Salfred	  *inbytesleftp = save_inbytesleft;
363255932Salfred	  return E2BIG;
364272407Shselasky	}
365272407Shselasky      outbuf[bigend ? 1 : 0] = (s & 0x00FF);
366272407Shselasky      outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
367272407Shselasky
368272407Shselasky      *outbufp += 2;
369272407Shselasky      *outbytesleftp -= 2;
370272407Shselasky      return 0;
371272407Shselasky    }
372272407Shselasky  else
373272407Shselasky    {
374272407Shselasky      cppchar_t hi, lo;
375272407Shselasky
376272407Shselasky      if (*outbytesleftp < 4)
377272407Shselasky	{
378272407Shselasky	  *inbufp = save_inbuf;
379272407Shselasky	  *inbytesleftp = save_inbytesleft;
380272407Shselasky	  return E2BIG;
381272407Shselasky	}
382272407Shselasky
383272407Shselasky      hi = (s - 0x10000) / 0x400 + 0xD800;
384272407Shselasky      lo = (s - 0x10000) % 0x400 + 0xDC00;
385272407Shselasky
386272407Shselasky      /* Even if we are little-endian, put the high surrogate first.
387272407Shselasky	 ??? Matches practice?  */
388272407Shselasky      outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
389255932Salfred      outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
390255932Salfred      outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
391255932Salfred      outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
392255932Salfred
393255932Salfred      *outbufp += 4;
394255932Salfred      *outbytesleftp -= 4;
395255932Salfred      return 0;
396255932Salfred    }
397255932Salfred}
398255932Salfred
399255932Salfredstatic inline int
400255932Salfredone_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
401255932Salfred		   uchar **outbufp, size_t *outbytesleftp)
402255932Salfred{
403255932Salfred  cppchar_t s;
404255932Salfred  const uchar *inbuf = *inbufp;
405255932Salfred  int rval;
406255932Salfred
407255932Salfred  if (*inbytesleftp < 2)
408255932Salfred    return EINVAL;
409255932Salfred  s  = inbuf[bigend ? 0 : 1] << 8;
410255932Salfred  s += inbuf[bigend ? 1 : 0];
411255932Salfred
412255932Salfred  /* Low surrogate without immediately preceding high surrogate is invalid.  */
413255932Salfred  if (s >= 0xDC00 && s <= 0xDFFF)
414255932Salfred    return EILSEQ;
415255932Salfred  /* High surrogate must have a following low surrogate.  */
416255932Salfred  else if (s >= 0xD800 && s <= 0xDBFF)
417255932Salfred    {
418255932Salfred      cppchar_t hi = s, lo;
419255932Salfred      if (*inbytesleftp < 4)
420255932Salfred	return EINVAL;
421255932Salfred
422255932Salfred      lo  = inbuf[bigend ? 2 : 3] << 8;
423255932Salfred      lo += inbuf[bigend ? 3 : 2];
424255932Salfred
425255932Salfred      if (lo < 0xDC00 || lo > 0xDFFF)
426255932Salfred	return EILSEQ;
427255932Salfred
428255932Salfred      s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
429255932Salfred    }
430255932Salfred
431255932Salfred  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
432255932Salfred  if (rval)
433255932Salfred    return rval;
434255932Salfred
435255932Salfred  /* Success - update the input pointers (one_cppchar_to_utf8 has done
436272407Shselasky     the output pointers for us).  */
437272407Shselasky  if (s <= 0xFFFF)
438272407Shselasky    {
439255932Salfred      *inbufp += 2;
440255932Salfred      *inbytesleftp -= 2;
441255932Salfred    }
442255932Salfred  else
443255932Salfred    {
444255932Salfred      *inbufp += 4;
445255932Salfred      *inbytesleftp -= 4;
446255932Salfred    }
447255932Salfred  return 0;
448255932Salfred}
449255932Salfred
450255932Salfred/* Helper routine for the next few functions.  The 'const' on
451255932Salfred   one_conversion means that we promise not to modify what function is
452255932Salfred   pointed to, which lets the inliner see through it.  */
453255932Salfred
454255932Salfredstatic inline bool
455255932Salfredconversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
456255932Salfred					     uchar **, size_t *),
457255932Salfred		 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
458255932Salfred{
459255932Salfred  const uchar *inbuf;
460255932Salfred  uchar *outbuf;
461255932Salfred  size_t inbytesleft, outbytesleft;
462272407Shselasky  int rval;
463272407Shselasky
464272407Shselasky  inbuf = from;
465272407Shselasky  inbytesleft = flen;
466272407Shselasky  outbuf = to->text + to->len;
467272407Shselasky  outbytesleft = to->asize - to->len;
468272407Shselasky
469272407Shselasky  for (;;)
470255932Salfred    {
471272407Shselasky      do
472272407Shselasky	rval = one_conversion (cd, &inbuf, &inbytesleft,
473272407Shselasky			       &outbuf, &outbytesleft);
474272407Shselasky      while (inbytesleft && !rval);
475272407Shselasky
476272407Shselasky      if (__builtin_expect (inbytesleft == 0, 1))
477272407Shselasky	{
478272407Shselasky	  to->len = to->asize - outbytesleft;
479272407Shselasky	  return true;
480272407Shselasky	}
481272407Shselasky      if (rval != E2BIG)
482272407Shselasky	{
483272407Shselasky	  errno = rval;
484272407Shselasky	  return false;
485272407Shselasky	}
486272407Shselasky
487272407Shselasky      outbytesleft += OUTBUF_BLOCK_SIZE;
488272407Shselasky      to->asize += OUTBUF_BLOCK_SIZE;
489272407Shselasky      to->text = XRESIZEVEC (uchar, to->text, to->asize);
490272407Shselasky      outbuf = to->text + to->asize - outbytesleft;
491255932Salfred    }
492255932Salfred}
493255932Salfred
494272407Shselasky
495272407Shselasky/* These functions convert entire strings between character sets.
496272407Shselasky   They all have the signature
497272407Shselasky
498272407Shselasky   bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
499255932Salfred
500255932Salfred   The input string FROM is converted as specified by the function
501255932Salfred   name plus the iconv descriptor CD (which may be fake), and the
502255932Salfred   result appended to TO.  On any error, false is returned, otherwise true.  */
503255932Salfred
504255932Salfred/* These four use the custom conversion code above.  */
505255932Salfredstatic bool
506255932Salfredconvert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
507255932Salfred		    struct _cpp_strbuf *to)
508255932Salfred{
509255932Salfred  return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
510255932Salfred}
511255932Salfred
512255932Salfredstatic bool
513255932Salfredconvert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
514255932Salfred		    struct _cpp_strbuf *to)
515255932Salfred{
516255932Salfred  return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
517255932Salfred}
518255932Salfred
519255932Salfredstatic bool
520255932Salfredconvert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
521255932Salfred		    struct _cpp_strbuf *to)
522255932Salfred{
523255932Salfred  return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
524255932Salfred}
525255932Salfred
526255932Salfredstatic bool
527255932Salfredconvert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
528255932Salfred		    struct _cpp_strbuf *to)
529255932Salfred{
530255932Salfred  return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
531255932Salfred}
532255932Salfred
533255932Salfred/* Identity conversion, used when we have no alternative.  */
534255932Salfredstatic bool
535255932Salfredconvert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
536255932Salfred		       const uchar *from, size_t flen, struct _cpp_strbuf *to)
537255932Salfred{
538255932Salfred  if (to->len + flen > to->asize)
539255932Salfred    {
540255932Salfred      to->asize = to->len + flen;
541255932Salfred      to->text = XRESIZEVEC (uchar, to->text, to->asize);
542255932Salfred    }
543255932Salfred  memcpy (to->text + to->len, from, flen);
544272407Shselasky  to->len += flen;
545255932Salfred  return true;
546255932Salfred}
547255932Salfred
/* Converter built on the system iconv primitive.  iconv's interface
   is a little different from that of the single-character routines
   above, so it gets its own loop.  Returns false if resetting CD
   fails, or if iconv stops short of the end of the input with errno
   set to anything other than E2BIG.  */
#if HAVE_ICONV
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
		     struct _cpp_strbuf *to)
{
  ICONV_CONST char *src = (ICONV_CONST char *)from;
  size_t src_left = flen;
  char *dst;
  size_t dst_left;

  /* Reset conversion descriptor and check that it is valid.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  dst = (char *)to->text + to->len;
  dst_left = to->asize - to->len;

  for (;;)
    {
      iconv (cd, &src, &src_left, &dst, &dst_left);
      if (__builtin_expect (src_left == 0, 1))
	break;
      if (errno != E2BIG)
	return false;

      /* Out of room: grow the buffer by one block and recompute the
	 write position, since the storage may have moved.  */
      to->asize += OUTBUF_BLOCK_SIZE;
      dst_left += OUTBUF_BLOCK_SIZE;
      to->text = XRESIZEVEC (uchar, to->text, to->asize);
      dst = (char *)to->text + to->asize - dst_left;
    }

  to->len = to->asize - dst_left;
  return true;
}
#else
#define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif
588272407Shselasky
589272407Shselasky/* Arrange for the above custom conversion logic to be used automatically
590272407Shselasky   when conversion between a suitable pair of character sets is requested.  */
591272407Shselasky
592272407Shselasky#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
593272407Shselasky   CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
594272407Shselasky
595255932Salfredstruct conversion
596255932Salfred{
597255932Salfred  const char *pair;
598272407Shselasky  convert_f func;
599272407Shselasky  iconv_t fake_cd;
600272407Shselasky};
601272407Shselaskystatic const struct conversion conversion_tab[] = {
602272407Shselasky  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
603272407Shselasky  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
604272407Shselasky  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
605272407Shselasky  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
606272407Shselasky  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
607272407Shselasky  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
608272407Shselasky  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
609272407Shselasky  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
610272407Shselasky};
611272407Shselasky
612272407Shselasky/* Subroutine of cpp_init_iconv: initialize and return a
613272407Shselasky   cset_converter structure for conversion from FROM to TO.  If
614272407Shselasky   iconv_open() fails, issue an error and return an identity
615255932Salfred   converter.  Silently return an identity converter if FROM and TO
616272407Shselasky   are identical.  */
617272407Shselaskystatic struct cset_converter
618272407Shselaskyinit_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
619272407Shselasky{
620272407Shselasky  struct cset_converter ret;
621255932Salfred  char *pair;
622272407Shselasky  size_t i;
623272407Shselasky
624272407Shselasky  if (!strcasecmp (to, from))
625272407Shselasky    {
626255932Salfred      ret.func = convert_no_conversion;
627272407Shselasky      ret.cd = (iconv_t) -1;
628272407Shselasky      return ret;
629272407Shselasky    }
630272407Shselasky
631272407Shselasky  pair = (char *) alloca(strlen(to) + strlen(from) + 2);
632272407Shselasky
633272407Shselasky  strcpy(pair, from);
634272407Shselasky  strcat(pair, "/");
635272407Shselasky  strcat(pair, to);
636272407Shselasky  for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
637272407Shselasky    if (!strcasecmp (pair, conversion_tab[i].pair))
638272407Shselasky      {
639255932Salfred	ret.func = conversion_tab[i].func;
640255932Salfred	ret.cd = conversion_tab[i].fake_cd;
641255932Salfred	return ret;
642255932Salfred      }
643255932Salfred
644255932Salfred  /* No custom converter - try iconv.  */
645255932Salfred  if (HAVE_ICONV)
646255932Salfred    {
647255932Salfred      ret.func = convert_using_iconv;
648255932Salfred      ret.cd = iconv_open (to, from);
649255932Salfred
650255932Salfred      if (ret.cd == (iconv_t) -1)
651255932Salfred	{
652255932Salfred	  if (errno == EINVAL)
653255932Salfred	    cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
654255932Salfred		       "conversion from %s to %s not supported by iconv",
655219820Sjeff		       from, to);
656219820Sjeff	  else
657219820Sjeff	    cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
658219820Sjeff
659219820Sjeff	  ret.func = convert_no_conversion;
660219820Sjeff	}
661219820Sjeff    }
662219820Sjeff  else
663219820Sjeff    {
664219820Sjeff      cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
665219820Sjeff		 "no iconv implementation, cannot convert from %s to %s",
666219820Sjeff		 from, to);
667219820Sjeff      ret.func = convert_no_conversion;
668219820Sjeff      ret.cd = (iconv_t) -1;
669255932Salfred    }
670255932Salfred  return ret;
671255932Salfred}
672255932Salfred
673219820Sjeff/* If charset conversion is requested, initialize iconv(3) descriptors
674219820Sjeff   for conversion from the source character set to the execution
675219820Sjeff   character sets.  If iconv is not present in the C library, and
676219820Sjeff   conversion is requested, issue an error.  */
677219820Sjeff
678255932Salfredvoid
679255932Salfredcpp_init_iconv (cpp_reader *pfile)
680255932Salfred{
681219820Sjeff  const char *ncset = CPP_OPTION (pfile, narrow_charset);
682219820Sjeff  const char *wcset = CPP_OPTION (pfile, wide_charset);
683219820Sjeff  const char *default_wcset;
684219820Sjeff
685219820Sjeff  bool be = CPP_OPTION (pfile, bytes_big_endian);
686219820Sjeff
687219820Sjeff  if (CPP_OPTION (pfile, wchar_precision) >= 32)
688219820Sjeff    default_wcset = be ? "UTF-32BE" : "UTF-32LE";
689255932Salfred  else if (CPP_OPTION (pfile, wchar_precision) >= 16)
690219820Sjeff    default_wcset = be ? "UTF-16BE" : "UTF-16LE";
691219820Sjeff  else
692219820Sjeff    /* This effectively means that wide strings are not supported,
693219820Sjeff       so don't do any conversion at all.  */
694279731Shselasky   default_wcset = SOURCE_CHARSET;
695279731Shselasky
696279731Shselasky  if (!ncset)
697279731Shselasky    ncset = SOURCE_CHARSET;
698219820Sjeff  if (!wcset)
699255932Salfred    wcset = default_wcset;
700219820Sjeff
701219820Sjeff  pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
702219820Sjeff  pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
703255932Salfred}
704219820Sjeff
705219820Sjeff/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary.  */
706219820Sjeffvoid
707255932Salfred_cpp_destroy_iconv (cpp_reader *pfile)
708255932Salfred{
709219820Sjeff  if (HAVE_ICONV)
710219820Sjeff    {
711219820Sjeff      if (pfile->narrow_cset_desc.func == convert_using_iconv)
712219820Sjeff	iconv_close (pfile->narrow_cset_desc.cd);
713219820Sjeff      if (pfile->wide_cset_desc.func == convert_using_iconv)
714219820Sjeff	iconv_close (pfile->wide_cset_desc.cd);
715219820Sjeff    }
716255932Salfred}
717219820Sjeff
718219820Sjeff/* Utility routine for use by a full compiler.  C is a character taken
719219820Sjeff   from the *basic* source character set, encoded in the host's
720219820Sjeff   execution encoding.  Convert it to (the target's) execution
721219820Sjeff   encoding, and return that value.
722219820Sjeff
723219820Sjeff   Issues an internal error if C's representation in the narrow
724219820Sjeff   execution character set fails to be a single-byte value (C99
725219820Sjeff   5.2.1p3: "The representation of each member of the source and
726219820Sjeff   execution character sets shall fit in a byte.")  May also issue an
727272407Shselasky   internal error if C fails to be a member of the basic source
728272407Shselasky   character set (testing this exactly is too hard, especially when
729272407Shselasky   the host character set is EBCDIC).  */
730272407Shselaskycppchar_t
731272407Shselaskycpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
732272407Shselasky{
733272407Shselasky  uchar sbuf[1];
734272407Shselasky  struct _cpp_strbuf tbuf;
735272407Shselasky
736272407Shselasky  /* This test is merely an approximation, but it suffices to catch
737272407Shselasky     the most important thing, which is that we don't get handed a
738272407Shselasky     character outside the unibyte range of the host character set.  */
739272407Shselasky  if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
740272407Shselasky    {
741272407Shselasky      cpp_error (pfile, CPP_DL_ICE,
742272407Shselasky		 "character 0x%lx is not in the basic source character set\n",
743272407Shselasky		 (unsigned long)c);
744272407Shselasky      return 0;
745272407Shselasky    }
746255932Salfred
747255932Salfred  /* Being a character in the unibyte range of the host character set,
748219820Sjeff     we can safely splat it into a one-byte buffer and trust that that
749272407Shselasky     is a well-formed string.  */
750255932Salfred  sbuf[0] = c;
751272407Shselasky
752272407Shselasky  /* This should never need to reallocate, but just in case... */
753272407Shselasky  tbuf.asize = 1;
754255932Salfred  tbuf.text = XNEWVEC (uchar, tbuf.asize);
755272407Shselasky  tbuf.len = 0;
756272407Shselasky
757272407Shselasky  if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
758255932Salfred    {
759255932Salfred      cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
760255932Salfred      return 0;
761255932Salfred    }
762255932Salfred  if (tbuf.len != 1)
763255932Salfred    {
764255932Salfred      cpp_error (pfile, CPP_DL_ICE,
765255932Salfred		 "character 0x%lx is not unibyte in execution character set",
766255932Salfred		 (unsigned long)c);
767255932Salfred      return 0;
768255932Salfred    }
769255932Salfred  c = tbuf.text[0];
770255932Salfred  free(tbuf.text);
771272407Shselasky  return c;
772272407Shselasky}
773272407Shselasky
774272407Shselasky
775272407Shselasky
776272407Shselasky/* Utility routine that computes a mask of the form 0000...111... with
777272407Shselasky   WIDTH 1-bits.  */
778272407Shselaskystatic inline size_t
779272407Shselaskywidth_to_mask (size_t width)
780272407Shselasky{
781272407Shselasky  width = MIN (width, BITS_PER_CPPCHAR_T);
782272407Shselasky  if (width >= CHAR_BIT * sizeof (size_t))
783272407Shselasky    return ~(size_t) 0;
784272407Shselasky  else
785272407Shselasky    return ((size_t) 1 << width) - 1;
786272407Shselasky}
787272407Shselasky
/* A large table of unicode character information.  The enumerators
   below are the property bits stored in each table entry's FLAGS.  */
enum {
  C99 = 1 << 0,  /* Valid in a C99 identifier?  */
  DIG = 1 << 1,  /* Valid in a C99 identifier, but not as the first
		    character?  */
  CXX = 1 << 2,  /* Valid in a C++ identifier?  */
  CID = 1 << 3,  /* NFC representation is not valid in an identifier?  */
  NFC = 1 << 4,  /* Might be valid NFC form?  */
  NKC = 1 << 5,  /* Might be valid NFKC form?  */
  CTX = 1 << 6   /* Certain preceding characters might make it not valid
		    NFC/NFKC form?  */
};
805272407Shselasky
806272407Shselaskystatic const struct {
807272407Shselasky  /* Bitmap of flags above.  */
808255932Salfred  unsigned char flags;
809255932Salfred  /* Combining class of the character.  */
810255932Salfred  unsigned char combine;
811272407Shselasky  /* Last character in the range described by this entry.  */
812255932Salfred  unsigned short end;
813272407Shselasky} ucnranges[] = {
814255932Salfred#include "ucnid.h"
815272407Shselasky};
816255932Salfred
817255932Salfred/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
818255932Salfred   the start of an identifier, and 0 if C is not valid in an
819255932Salfred   identifier.  We assume C has already gone through the checks of
820255932Salfred   _cpp_valid_ucn.  Also update NST for C if returning nonzero.  The
821255932Salfred   algorithm is a simple binary search on the table defined in
822255932Salfred   ucnid.h.  */
823255932Salfred
824255932Salfredstatic int
825255932Salfreducn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
826255932Salfred			 struct normalize_state *nst)
827255932Salfred{
828255932Salfred  int mn, mx, md;
829272407Shselasky
830272407Shselasky  if (c > 0xFFFF)
831255932Salfred    return 0;
832255932Salfred
833255932Salfred  mn = 0;
834272407Shselasky  mx = ARRAY_SIZE (ucnranges) - 1;
835255932Salfred  while (mx != mn)
836255932Salfred    {
837255932Salfred      md = (mn + mx) / 2;
838255932Salfred      if (c <= ucnranges[md].end)
839255932Salfred	mx = md;
840255932Salfred      else
841255932Salfred	mn = md + 1;
842255932Salfred    }
843255932Salfred
844255932Salfred  /* When -pedantic, we require the character to have been listed by
845255932Salfred     the standard for the current language.  Otherwise, we accept the
846255932Salfred     union of the acceptable sets for C++98 and C99.  */
847255932Salfred  if (! (ucnranges[mn].flags & (C99 | CXX)))
848255932Salfred      return 0;
849255932Salfred
850255932Salfred  if (CPP_PEDANTIC (pfile)
851255932Salfred      && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
852255932Salfred	  || (CPP_OPTION (pfile, cplusplus)
853255932Salfred	      && !(ucnranges[mn].flags & CXX))))
854255932Salfred    return 0;
855255932Salfred
856255932Salfred  /* Update NST.  */
857255932Salfred  if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
858255932Salfred    nst->level = normalized_none;
859255932Salfred  else if (ucnranges[mn].flags & CTX)
860255932Salfred    {
861255932Salfred      bool safe;
862255932Salfred      cppchar_t p = nst->previous;
863255932Salfred
864255932Salfred      /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam.  */
865255932Salfred      if (c == 0x09BE)
866255932Salfred	safe = p != 0x09C7;  /* Use 09CB instead of 09C7 09BE.  */
867255932Salfred      else if (c == 0x0B3E)
868255932Salfred	safe = p != 0x0B47;  /* Use 0B4B instead of 0B47 0B3E.  */
869255932Salfred      else if (c == 0x0BBE)
870255932Salfred	safe = p != 0x0BC6 && p != 0x0BC7;  /* Use 0BCA/0BCB instead.  */
871255932Salfred      else if (c == 0x0CC2)
872255932Salfred	safe = p != 0x0CC6;  /* Use 0CCA instead of 0CC6 0CC2.  */
873255932Salfred      else if (c == 0x0D3E)
874255932Salfred	safe = p != 0x0D46 && p != 0x0D47;  /* Use 0D4A/0D4B instead.  */
875255932Salfred      /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
876255932Salfred	 and are combined algorithmically from a sequence of the form
877255932Salfred	 1100-1112 1161-1175 11A8-11C2
878255932Salfred	 (if the third is not present, it is treated as 11A7, which is not
879255932Salfred	 really a valid character).
880255932Salfred	 Unfortunately, C99 allows (only) the NFC form, but C++ allows
881255932Salfred	 only the combining characters.  */
882255932Salfred      else if (c >= 0x1161 && c <= 0x1175)
883255932Salfred	safe = p < 0x1100 || p > 0x1112;
884255932Salfred      else if (c >= 0x11A8 && c <= 0x11C2)
885255932Salfred	safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
886255932Salfred      else
887255932Salfred	{
888255932Salfred	  /* Uh-oh, someone updated ucnid.h without updating this code.  */
889255932Salfred	  cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
890255932Salfred	  safe = true;
891255932Salfred	}
892255932Salfred      if (!safe && c < 0x1161)
893255932Salfred	nst->level = normalized_none;
894255932Salfred      else if (!safe)
895255932Salfred	nst->level = MAX (nst->level, normalized_identifier_C);
896255932Salfred    }
897255932Salfred  else if (ucnranges[mn].flags & NKC)
898255932Salfred    ;
899255932Salfred  else if (ucnranges[mn].flags & NFC)
900255932Salfred    nst->level = MAX (nst->level, normalized_C);
901255932Salfred  else if (ucnranges[mn].flags & CID)
902255932Salfred    nst->level = MAX (nst->level, normalized_identifier_C);
903255932Salfred  else
904255932Salfred    nst->level = normalized_none;
905255932Salfred  nst->previous = c;
906255932Salfred  nst->prev_class = ucnranges[mn].combine;
907255932Salfred
908255932Salfred  /* In C99, UCN digits may not begin identifiers.  */
909255932Salfred  if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
910255932Salfred    return 2;
911255932Salfred
912255932Salfred  return 1;
913255932Salfred}
914255932Salfred
915255932Salfred/* [lex.charset]: The character designated by the universal character
916255932Salfred   name \UNNNNNNNN is that character whose character short name in
917255932Salfred   ISO/IEC 10646 is NNNNNNNN; the character designated by the
918279731Shselasky   universal character name \uNNNN is that character whose character
919279731Shselasky   short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
920255932Salfred   for a universal character name is less than 0x20 or in the range
921279731Shselasky   0x7F-0x9F (inclusive), or if the universal character name
922279731Shselasky   designates a character in the basic source character set, then the
923255932Salfred   program is ill-formed.
924255932Salfred
925255932Salfred   *PSTR must be preceded by "\u" or "\U"; it is assumed that the
926255932Salfred   buffer end is delimited by a non-hex digit.  Returns zero if the
927255932Salfred   UCN has not been consumed.
928255932Salfred
929255932Salfred   Otherwise the nonzero value of the UCN, whether valid or invalid,
930255932Salfred   is returned.  Diagnostics are emitted for invalid values.  PSTR
931255932Salfred   is updated to point one beyond the UCN, or to the syntactically
932255932Salfred   invalid character.
933255932Salfred
934255932Salfred   IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
935255932Salfred   an identifier, or 2 otherwise.  */
936255932Salfred
937255932Salfredcppchar_t
938255932Salfred_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
939255932Salfred		const uchar *limit, int identifier_pos,
940255932Salfred		struct normalize_state *nst)
941255932Salfred{
942255932Salfred  cppchar_t result, c;
943255932Salfred  unsigned int length;
944255932Salfred  const uchar *str = *pstr;
945255932Salfred  const uchar *base = str - 2;
946255932Salfred
947255932Salfred  if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
948255932Salfred    cpp_error (pfile, CPP_DL_WARNING,
949255932Salfred	       "universal character names are only valid in C++ and C99");
950255932Salfred  else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
951255932Salfred    cpp_error (pfile, CPP_DL_WARNING,
952255932Salfred	       "the meaning of '\\%c' is different in traditional C",
953255932Salfred	       (int) str[-1]);
954255932Salfred
955255932Salfred  if (str[-1] == 'u')
956255932Salfred    length = 4;
957255932Salfred  else if (str[-1] == 'U')
958255932Salfred    length = 8;
959255932Salfred  else
960255932Salfred    {
961255932Salfred      cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
962255932Salfred      length = 4;
963255932Salfred    }
964255932Salfred
965255932Salfred  result = 0;
966255932Salfred  do
967255932Salfred    {
968255932Salfred      c = *str;
969255932Salfred      if (!ISXDIGIT (c))
970255932Salfred	break;
971255932Salfred      str++;
972255932Salfred      result = (result << 4) + hex_value (c);
973255932Salfred    }
974255932Salfred  while (--length && str < limit);
975255932Salfred
976255932Salfred  /* Partial UCNs are not valid in strings, but decompose into
977255932Salfred     multiple tokens in identifiers, so we can't give a helpful
978255932Salfred     error message in that case.  */
979255932Salfred  if (length && identifier_pos)
980255932Salfred    return 0;
981255932Salfred
982255932Salfred  *pstr = str;
983255932Salfred  if (length)
984255932Salfred    {
985255932Salfred      cpp_error (pfile, CPP_DL_ERROR,
986272407Shselasky		 "incomplete universal character name %.*s",
987255932Salfred		 (int) (str - base), base);
988255932Salfred      result = 1;
989255932Salfred    }
990255932Salfred  /* The standard permits $, @ and ` to be specified as UCNs.  We use
991255932Salfred     hex escapes so that this also works with EBCDIC hosts.  */
992255932Salfred  else if ((result < 0xa0
993255932Salfred	    && (result != 0x24 && result != 0x40 && result != 0x60))
994255932Salfred	   || (result & 0x80000000)
995255932Salfred	   || (result >= 0xD800 && result <= 0xDFFF))
996255932Salfred    {
997255932Salfred      cpp_error (pfile, CPP_DL_ERROR,
998255932Salfred		 "%.*s is not a valid universal character",
999255932Salfred		 (int) (str - base), base);
1000255932Salfred      result = 1;
1001255932Salfred    }
1002255932Salfred  else if (identifier_pos && result == 0x24
1003255932Salfred	   && CPP_OPTION (pfile, dollars_in_ident))
1004255932Salfred    {
1005255932Salfred      if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1006255932Salfred	{
1007255932Salfred	  CPP_OPTION (pfile, warn_dollars) = 0;
1008255932Salfred	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1009255932Salfred	}
1010255932Salfred      NORMALIZE_STATE_UPDATE_IDNUM (nst);
1011219820Sjeff    }
1012219820Sjeff  else if (identifier_pos)
1013219820Sjeff    {
1014219820Sjeff      int validity = ucn_valid_in_identifier (pfile, result, nst);
1015219820Sjeff
1016219820Sjeff      if (validity == 0)
1017219820Sjeff	cpp_error (pfile, CPP_DL_ERROR,
1018219820Sjeff		   "universal character %.*s is not valid in an identifier",
1019255932Salfred		   (int) (str - base), base);
1020255932Salfred      else if (validity == 2 && identifier_pos == 1)
1021219820Sjeff	cpp_error (pfile, CPP_DL_ERROR,
1022219820Sjeff   "universal character %.*s is not valid at the start of an identifier",
1023219820Sjeff		   (int) (str - base), base);
1024219820Sjeff    }
1025219820Sjeff
1026219820Sjeff  if (result == 0)
1027219820Sjeff    result = 1;
1028255932Salfred
1029255932Salfred  return result;
1030219820Sjeff}
1031219820Sjeff
1032219820Sjeff/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
1033219820Sjeff   it to the execution character set and write the result into TBUF.
1034255932Salfred   An advanced pointer is returned.  Issues all relevant diagnostics.  */
1035255932Salfredstatic const uchar *
1036219820Sjeffconvert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
1037255932Salfred	     struct _cpp_strbuf *tbuf, bool wide)
1038219820Sjeff{
1039219820Sjeff  cppchar_t ucn;
1040219820Sjeff  uchar buf[6];
1041219820Sjeff  uchar *bufp = buf;
1042219820Sjeff  size_t bytesleft = 6;
1043219820Sjeff  int rval;
1044219820Sjeff  struct cset_converter cvt
1045219820Sjeff    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1046219820Sjeff  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1047219820Sjeff
1048219820Sjeff  from++;  /* Skip u/U.  */
1049255932Salfred  ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
1050219820Sjeff
1051219820Sjeff  rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
1052219820Sjeff  if (rval)
1053219820Sjeff    {
1054219820Sjeff      errno = rval;
1055255932Salfred      cpp_errno (pfile, CPP_DL_ERROR,
1056219820Sjeff		 "converting UCN to source character set");
1057219820Sjeff    }
1058219820Sjeff  else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
1059219820Sjeff    cpp_errno (pfile, CPP_DL_ERROR,
1060219820Sjeff	       "converting UCN to execution character set");
1061219820Sjeff
1062219820Sjeff  return from;
1063219820Sjeff}
1064219820Sjeff
1065219820Sjeff/* Subroutine of convert_hex and convert_oct.  N is the representation
1066219820Sjeff   in the execution character set of a numeric escape; write it into the
1067219820Sjeff   string buffer TBUF and update the end-of-string pointer therein.  WIDE
1068219820Sjeff   is true if it's a wide string that's being assembled in TBUF.  This
1069219820Sjeff   function issues no diagnostics and never fails.  */
1070219820Sjeffstatic void
1071255932Salfredemit_numeric_escape (cpp_reader *pfile, cppchar_t n,
1072219820Sjeff		     struct _cpp_strbuf *tbuf, bool wide)
1073255932Salfred{
1074219820Sjeff  if (wide)
1075219820Sjeff    {
1076219820Sjeff      /* We have to render this into the target byte order, which may not
1077272407Shselasky	 be our byte order.  */
1078219820Sjeff      bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1079272407Shselasky      size_t width = CPP_OPTION (pfile, wchar_precision);
1080219820Sjeff      size_t cwidth = CPP_OPTION (pfile, char_precision);
1081255932Salfred      size_t cmask = width_to_mask (cwidth);
1082219820Sjeff      size_t nbwc = width / cwidth;
1083219820Sjeff      size_t i;
1084219820Sjeff      size_t off = tbuf->len;
1085219820Sjeff      cppchar_t c;
1086219820Sjeff
1087255932Salfred      if (tbuf->len + nbwc > tbuf->asize)
1088219820Sjeff	{
1089219820Sjeff	  tbuf->asize += OUTBUF_BLOCK_SIZE;
1090219820Sjeff	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1091272407Shselasky	}
1092255932Salfred
1093255932Salfred      for (i = 0; i < nbwc; i++)
1094255932Salfred	{
1095255932Salfred	  c = n & cmask;
1096255932Salfred	  n >>= cwidth;
1097255932Salfred	  tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
1098255932Salfred	}
1099255932Salfred      tbuf->len += nbwc;
1100255932Salfred    }
1101219820Sjeff  else
1102219820Sjeff    {
1103219820Sjeff      /* Note: this code does not handle the case where the target
1104219820Sjeff	 and host have a different number of bits in a byte.  */
1105219820Sjeff      if (tbuf->len + 1 > tbuf->asize)
1106219820Sjeff	{
1107219820Sjeff	  tbuf->asize += OUTBUF_BLOCK_SIZE;
1108272407Shselasky	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1109219820Sjeff	}
1110219820Sjeff      tbuf->text[tbuf->len++] = n;
1111219820Sjeff    }
1112219820Sjeff}
1113219820Sjeff
1114219820Sjeff/* Convert a hexadecimal escape, pointed to by FROM, to the execution
1115219820Sjeff   character set and write it into the string buffer TBUF.  Returns an
1116255932Salfred   advanced pointer, and issues diagnostics as necessary.
1117255932Salfred   No character set translation occurs; this routine always produces the
1118219820Sjeff   execution-set character with numeric value equal to the given hex
1119219820Sjeff   number.  You can, e.g. generate surrogate pairs this way.  */
1120219820Sjeffstatic const uchar *
1121219820Sjeffconvert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
1122219820Sjeff	     struct _cpp_strbuf *tbuf, bool wide)
1123219820Sjeff{
1124255932Salfred  cppchar_t c, n = 0, overflow = 0;
1125219820Sjeff  int digits_found = 0;
1126255932Salfred  size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1127255932Salfred		  : CPP_OPTION (pfile, char_precision));
1128219820Sjeff  size_t mask = width_to_mask (width);
1129219820Sjeff
1130219820Sjeff  if (CPP_WTRADITIONAL (pfile))
1131219820Sjeff    cpp_error (pfile, CPP_DL_WARNING,
1132219820Sjeff	       "the meaning of '\\x' is different in traditional C");
1133219820Sjeff
1134219820Sjeff  from++;  /* Skip 'x'.  */
1135219820Sjeff  while (from < limit)
1136255932Salfred    {
1137255932Salfred      c = *from;
1138219820Sjeff      if (! hex_p (c))
1139219820Sjeff	break;
1140219820Sjeff      from++;
1141219820Sjeff      overflow |= n ^ (n << 4 >> 4);
1142279731Shselasky      n = (n << 4) + hex_value (c);
1143279731Shselasky      digits_found = 1;
1144219820Sjeff    }
1145219820Sjeff
1146219820Sjeff  if (!digits_found)
1147219820Sjeff    {
1148272407Shselasky      cpp_error (pfile, CPP_DL_ERROR,
1149272407Shselasky		 "\\x used with no following hex digits");
1150272407Shselasky      return from;
1151272407Shselasky    }
1152255932Salfred
1153272407Shselasky  if (overflow | (n != (n & mask)))
1154272407Shselasky    {
1155272407Shselasky      cpp_error (pfile, CPP_DL_PEDWARN,
1156255932Salfred		 "hex escape sequence out of range");
1157219820Sjeff      n &= mask;
1158255932Salfred    }
1159255932Salfred
1160219820Sjeff  emit_numeric_escape (pfile, n, tbuf, wide);
1161255932Salfred
1162255932Salfred  return from;
1163219820Sjeff}
1164219820Sjeff
1165219820Sjeff/* Convert an octal escape, pointed to by FROM, to the execution
1166219820Sjeff   character set and write it into the string buffer TBUF.  Returns an
1167219820Sjeff   advanced pointer, and issues diagnostics as necessary.
1168219820Sjeff   No character set translation occurs; this routine always produces the
1169219820Sjeff   execution-set character with numeric value equal to the given octal
1170255932Salfred   number.  */
1171255932Salfredstatic const uchar *
1172255932Salfredconvert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
1173255932Salfred	     struct _cpp_strbuf *tbuf, bool wide)
1174219820Sjeff{
1175255932Salfred  size_t count = 0;
1176255932Salfred  cppchar_t c, n = 0;
1177255932Salfred  size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1178255932Salfred		  : CPP_OPTION (pfile, char_precision));
1179255932Salfred  size_t mask = width_to_mask (width);
1180219820Sjeff  bool overflow = false;
1181219820Sjeff
1182219820Sjeff  while (from < limit && count++ < 3)
1183255932Salfred    {
1184255932Salfred      c = *from;
1185255932Salfred      if (c < '0' || c > '7')
1186219820Sjeff	break;
1187219820Sjeff      from++;
1188219820Sjeff      overflow |= n ^ (n << 3 >> 3);
1189219820Sjeff      n = (n << 3) + c - '0';
1190255932Salfred    }
1191219820Sjeff
1192219820Sjeff  if (n != (n & mask))
1193219820Sjeff    {
1194219820Sjeff      cpp_error (pfile, CPP_DL_PEDWARN,
1195219820Sjeff		 "octal escape sequence out of range");
1196255932Salfred      n &= mask;
1197219820Sjeff    }
1198219820Sjeff
1199219820Sjeff  emit_numeric_escape (pfile, n, tbuf, wide);
1200219820Sjeff
1201219820Sjeff  return from;
1202219820Sjeff}
1203219820Sjeff
1204219820Sjeff/* Convert an escape sequence (pointed to by FROM) to its value on
1205219820Sjeff   the target, and to the execution character set.  Do not scan past
1206272407Shselasky   LIMIT.  Write the converted value into TBUF.  Returns an advanced
1207219820Sjeff   pointer.  Handles all relevant diagnostics.  */
1208219820Sjeffstatic const uchar *
1209219820Sjeffconvert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
1210219820Sjeff		struct _cpp_strbuf *tbuf, bool wide)
1211255932Salfred{
1212219820Sjeff  /* Values of \a \b \e \f \n \r \t \v respectively.  */
1213219820Sjeff#if HOST_CHARSET == HOST_CHARSET_ASCII
1214219820Sjeff  static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
1215219820Sjeff#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1216219820Sjeff  static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
1217255932Salfred#else
1218219820Sjeff#error "unknown host character set"
1219219820Sjeff#endif
1220219820Sjeff
1221219820Sjeff  uchar c;
1222219820Sjeff  struct cset_converter cvt
1223219820Sjeff    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1224219820Sjeff
1225219820Sjeff  c = *from;
1226272407Shselasky  switch (c)
1227219820Sjeff    {
1228219820Sjeff      /* UCNs, hex escapes, and octal escapes are processed separately.  */
1229219820Sjeff    case 'u': case 'U':
1230219820Sjeff      return convert_ucn (pfile, from, limit, tbuf, wide);
1231219820Sjeff
1232219820Sjeff    case 'x':
1233219820Sjeff      return convert_hex (pfile, from, limit, tbuf, wide);
1234219820Sjeff      break;
1235255932Salfred
1236255932Salfred    case '0':  case '1':  case '2':  case '3':
1237255932Salfred    case '4':  case '5':  case '6':  case '7':
1238255932Salfred      return convert_oct (pfile, from, limit, tbuf, wide);
1239255932Salfred
1240255932Salfred      /* Various letter escapes.  Get the appropriate host-charset
1241255932Salfred	 value into C.  */
1242255932Salfred    case '\\': case '\'': case '"': case '?': break;
1243255932Salfred
1244255932Salfred    case '(': case '{': case '[': case '%':
1245255932Salfred      /* '\(', etc, can be used at the beginning of a line in a long
1246255932Salfred	 string split onto multiple lines with \-newline, to prevent
1247255932Salfred	 Emacs or other text editors from getting confused.  '\%' can
1248255932Salfred	 be used to prevent SCCS from mangling printf format strings.  */
1249255932Salfred      if (CPP_PEDANTIC (pfile))
1250255932Salfred	goto unknown;
1251255932Salfred      break;
1252255932Salfred
1253255932Salfred    case 'b': c = charconsts[1];  break;
1254255932Salfred    case 'f': c = charconsts[3];  break;
1255272407Shselasky    case 'n': c = charconsts[4];  break;
1256255932Salfred    case 'r': c = charconsts[5];  break;
1257255932Salfred    case 't': c = charconsts[6];  break;
1258255932Salfred    case 'v': c = charconsts[7];  break;
1259255932Salfred
1260255932Salfred    case 'a':
1261255932Salfred      if (CPP_WTRADITIONAL (pfile))
1262255932Salfred	cpp_error (pfile, CPP_DL_WARNING,
1263255932Salfred		   "the meaning of '\\a' is different in traditional C");
1264255932Salfred      c = charconsts[0];
1265272407Shselasky      break;
1266272407Shselasky
1267272407Shselasky    case 'e': case 'E':
1268272407Shselasky      if (CPP_PEDANTIC (pfile))
1269255932Salfred	cpp_error (pfile, CPP_DL_PEDWARN,
1270255932Salfred		   "non-ISO-standard escape sequence, '\\%c'", (int) c);
1271255932Salfred      c = charconsts[2];
1272255932Salfred      break;
1273255932Salfred
1274255932Salfred    default:
1275255932Salfred    unknown:
1276255932Salfred      if (ISGRAPH (c))
1277255932Salfred	cpp_error (pfile, CPP_DL_PEDWARN,
1278255932Salfred		   "unknown escape sequence '\\%c'", (int) c);
1279255932Salfred      else
1280255932Salfred	{
1281255932Salfred	  /* diagnostic.c does not support "%03o".  When it does, this
1282255932Salfred	     code can use %03o directly in the diagnostic again.  */
1283255932Salfred	  char buf[32];
1284255932Salfred	  sprintf(buf, "%03o", (int) c);
1285255932Salfred	  cpp_error (pfile, CPP_DL_PEDWARN,
1286255932Salfred		     "unknown escape sequence: '\\%s'", buf);
1287255932Salfred	}
1288255932Salfred    }
1289255932Salfred
1290255932Salfred  /* Now convert what we have to the execution character set.  */
1291255932Salfred  if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1292255932Salfred    cpp_errno (pfile, CPP_DL_ERROR,
1293255932Salfred	       "converting escape sequence to execution character set");
1294255932Salfred
1295255932Salfred  return from + 1;
1296255932Salfred}
1297255932Salfred
1298255932Salfred/* FROM is an array of cpp_string structures of length COUNT.  These
1299255932Salfred   are to be converted from the source to the execution character set,
1300255932Salfred   escape sequences translated, and finally all are to be
1301255932Salfred   concatenated.  WIDE indicates whether or not to produce a wide
1302255932Salfred   string.  The result is written into TO.  Returns true for success,
1303272407Shselasky   false for failure.  */
1304255932Salfredbool
1305272407Shselaskycpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
1306272407Shselasky		      cpp_string *to, bool wide)
1307272407Shselasky{
1308272407Shselasky  struct _cpp_strbuf tbuf;
1309272407Shselasky  const uchar *p, *base, *limit;
1310272407Shselasky  size_t i;
1311272407Shselasky  struct cset_converter cvt
1312272407Shselasky    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1313272407Shselasky
1314272407Shselasky  tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1315272407Shselasky  tbuf.text = XNEWVEC (uchar, tbuf.asize);
1316272407Shselasky  tbuf.len = 0;
1317272407Shselasky
1318272407Shselasky  for (i = 0; i < count; i++)
1319272407Shselasky    {
1320272407Shselasky      p = from[i].text;
1321272407Shselasky      if (*p == 'L') p++;
1322272407Shselasky      p++; /* Skip leading quote.  */
1323272407Shselasky      limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */
1324272407Shselasky
1325272407Shselasky      for (;;)
1326272407Shselasky	{
1327272407Shselasky	  base = p;
1328272407Shselasky	  while (p < limit && *p != '\\')
1329272407Shselasky	    p++;
1330272407Shselasky	  if (p > base)
1331272407Shselasky	    {
1332255932Salfred	      /* We have a run of normal characters; these can be fed
1333255932Salfred		 directly to convert_cset.  */
1334255932Salfred	      if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1335255932Salfred		goto fail;
1336255932Salfred	    }
1337255932Salfred	  if (p == limit)
1338255932Salfred	    break;
1339255932Salfred
1340255932Salfred	  p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
1341272407Shselasky	}
1342272407Shselasky    }
1343272407Shselasky  /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1344255932Salfred     structure.  */
1345255932Salfred  emit_numeric_escape (pfile, 0, &tbuf, wide);
1346255932Salfred  tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
1347255932Salfred  to->text = tbuf.text;
1348255932Salfred  to->len = tbuf.len;
1349255932Salfred  return true;
1350255932Salfred
1351272407Shselasky fail:
1352255932Salfred  cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1353255932Salfred  free (tbuf.text);
1354255932Salfred  return false;
1355255932Salfred}
1356255932Salfred
1357255932Salfred/* Subroutine of do_line and do_linemarker.  Convert escape sequences
1358255932Salfred   in a string, but do not perform character set conversion.  */
1359255932Salfredbool
1360255932Salfredcpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
1361255932Salfred				  size_t count,	cpp_string *to, bool wide)
1362255932Salfred{
1363255932Salfred  struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
1364255932Salfred  bool retval;
1365255932Salfred
1366219820Sjeff  pfile->narrow_cset_desc.func = convert_no_conversion;
1367219820Sjeff  pfile->narrow_cset_desc.cd = (iconv_t) -1;
1368255932Salfred
1369255932Salfred  retval = cpp_interpret_string (pfile, from, count, to, wide);
1370255932Salfred
1371255932Salfred  pfile->narrow_cset_desc = save_narrow_cset_desc;
1372255932Salfred  return retval;
1373255932Salfred}
1374255932Salfred
1375255932Salfred
1376255932Salfred/* Subroutine of cpp_interpret_charconst which performs the conversion
1377255932Salfred   to a number, for narrow strings.  STR is the string structure returned
1378255932Salfred   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1379255932Salfred   cpp_interpret_charconst.  */
1380255932Salfredstatic cppchar_t
1381255932Salfrednarrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
1382255932Salfred			 unsigned int *pchars_seen, int *unsignedp)
1383255932Salfred{
1384255932Salfred  size_t width = CPP_OPTION (pfile, char_precision);
1385255932Salfred  size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
1386255932Salfred  size_t mask = width_to_mask (width);
1387255932Salfred  size_t i;
1388255932Salfred  cppchar_t result, c;
1389255932Salfred  bool unsigned_p;
1390255932Salfred
1391255932Salfred  /* The value of a multi-character character constant, or a
1392255932Salfred     single-character character constant whose representation in the
1393255932Salfred     execution character set is more than one byte long, is
1394255932Salfred     implementation defined.  This implementation defines it to be the
1395255932Salfred     number formed by interpreting the byte sequence in memory as a
1396255932Salfred     big-endian binary number.  If overflow occurs, the high bytes are
1397255932Salfred     lost, and a warning is issued.
1398255932Salfred
1399255932Salfred     We don't want to process the NUL terminator handed back by
1400255932Salfred     cpp_interpret_string.  */
1401255932Salfred  result = 0;
1402255932Salfred  for (i = 0; i < str.len - 1; i++)
1403255932Salfred    {
1404255932Salfred      c = str.text[i] & mask;
1405255932Salfred      if (width < BITS_PER_CPPCHAR_T)
1406255932Salfred	result = (result << width) | c;
1407255932Salfred      else
1408255932Salfred	result = c;
1409255932Salfred    }
1410255932Salfred
1411255932Salfred  if (i > max_chars)
1412255932Salfred    {
1413255932Salfred      i = max_chars;
1414255932Salfred      cpp_error (pfile, CPP_DL_WARNING,
1415255932Salfred		 "character constant too long for its type");
1416255932Salfred    }
1417255932Salfred  else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
1418255932Salfred    cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
1419255932Salfred
1420255932Salfred  /* Multichar constants are of type int and therefore signed.  */
1421255932Salfred  if (i > 1)
1422255932Salfred    unsigned_p = 0;
1423255932Salfred  else
1424255932Salfred    unsigned_p = CPP_OPTION (pfile, unsigned_char);
1425255932Salfred
1426255932Salfred  /* Truncate the constant to its natural width, and simultaneously
1427255932Salfred     sign- or zero-extend to the full width of cppchar_t.
1428255932Salfred     For single-character constants, the value is WIDTH bits wide.
1429255932Salfred     For multi-character constants, the value is INT_PRECISION bits wide.  */
1430255932Salfred  if (i > 1)
1431255932Salfred    width = CPP_OPTION (pfile, int_precision);
1432255932Salfred  if (width < BITS_PER_CPPCHAR_T)
1433255932Salfred    {
1434255932Salfred      mask = ((cppchar_t) 1 << width) - 1;
1435255932Salfred      if (unsigned_p || !(result & (1 << (width - 1))))
1436255932Salfred	result &= mask;
1437255932Salfred      else
1438255932Salfred	result |= ~mask;
1439255932Salfred    }
1440255932Salfred  *pchars_seen = i;
1441255932Salfred  *unsignedp = unsigned_p;
1442255932Salfred  return result;
1443255932Salfred}
1444255932Salfred
1445255932Salfred/* Subroutine of cpp_interpret_charconst which performs the conversion
1446255932Salfred   to a number, for wide strings.  STR is the string structure returned
1447255932Salfred   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1448255932Salfred   cpp_interpret_charconst.  */
1449255932Salfredstatic cppchar_t
1450255932Salfredwide_str_to_charconst (cpp_reader *pfile, cpp_string str,
1451255932Salfred		       unsigned int *pchars_seen, int *unsignedp)
1452255932Salfred{
1453255932Salfred  bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1454255932Salfred  size_t width = CPP_OPTION (pfile, wchar_precision);
1455255932Salfred  size_t cwidth = CPP_OPTION (pfile, char_precision);
1456255932Salfred  size_t mask = width_to_mask (width);
1457255932Salfred  size_t cmask = width_to_mask (cwidth);
1458255932Salfred  size_t nbwc = width / cwidth;
1459255932Salfred  size_t off, i;
1460255932Salfred  cppchar_t result = 0, c;
1461255932Salfred
1462255932Salfred  /* This is finicky because the string is in the target's byte order,
1463255932Salfred     which may not be our byte order.  Only the last character, ignoring
1464255932Salfred     the NUL terminator, is relevant.  */
1465272407Shselasky  off = str.len - (nbwc * 2);
1466272407Shselasky  result = 0;
1467255932Salfred  for (i = 0; i < nbwc; i++)
1468255932Salfred    {
1469255932Salfred      c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
1470255932Salfred      result = (result << cwidth) | (c & cmask);
1471255932Salfred    }
1472255932Salfred
1473255932Salfred  /* Wide character constants have type wchar_t, and a single
1474255932Salfred     character exactly fills a wchar_t, so a multi-character wide
1475255932Salfred     character constant is guaranteed to overflow.  */
1476255932Salfred  if (off > 0)
1477255932Salfred    cpp_error (pfile, CPP_DL_WARNING,
1478255932Salfred	       "character constant too long for its type");
1479255932Salfred
1480255932Salfred  /* Truncate the constant to its natural width, and simultaneously
1481255932Salfred     sign- or zero-extend to the full width of cppchar_t.  */
1482255932Salfred  if (width < BITS_PER_CPPCHAR_T)
1483255932Salfred    {
1484255932Salfred      if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
1485255932Salfred	result &= mask;
1486255932Salfred      else
1487255932Salfred	result |= ~mask;
1488255932Salfred    }
1489255932Salfred
1490255932Salfred  *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
1491255932Salfred  *pchars_seen = 1;
1492255932Salfred  return result;
1493255932Salfred}
1494255932Salfred
1495255932Salfred/* Interpret a (possibly wide) character constant in TOKEN.
1496255932Salfred   PCHARS_SEEN points to a variable that is filled in with the number
1497255932Salfred   of characters seen, and UNSIGNEDP to a variable that indicates
1498255932Salfred   whether the result has signed type.  */
1499255932Salfredcppchar_t
1500255932Salfredcpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
1501255932Salfred			 unsigned int *pchars_seen, int *unsignedp)
1502255932Salfred{
1503255932Salfred  cpp_string str = { 0, 0 };
1504255932Salfred  bool wide = (token->type == CPP_WCHAR);
1505255932Salfred  cppchar_t result;
1506255932Salfred
1507255932Salfred  /* an empty constant will appear as L'' or '' */
1508255932Salfred  if (token->val.str.len == (size_t) (2 + wide))
1509255932Salfred    {
1510255932Salfred      cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
1511255932Salfred      return 0;
1512255932Salfred    }
1513255932Salfred  else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
1514255932Salfred    return 0;
1515255932Salfred
1516255932Salfred  if (wide)
1517255932Salfred    result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1518255932Salfred  else
1519219820Sjeff    result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1520219820Sjeff
1521219820Sjeff  if (str.text != token->val.str.text)
1522219820Sjeff    free ((void *)str.text);
1523219820Sjeff
1524255932Salfred  return result;
1525255932Salfred}
1526255932Salfred
1527255932Salfred/* Convert an identifier denoted by ID and LEN, which might contain
1528219820Sjeff   UCN escapes, to the source character set, either UTF-8 or
1529219820Sjeff   UTF-EBCDIC.  Assumes that the identifier is actually a valid identifier.  */
1530219820Sjeffcpp_hashnode *
1531219820Sjeff_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
1532219820Sjeff{
1533219820Sjeff  /* It turns out that a UCN escape always turns into fewer characters
1534219820Sjeff     than the escape itself, so we can allocate a temporary in advance.  */
1535219820Sjeff  uchar * buf = (uchar *) alloca (len + 1);
1536219820Sjeff  uchar * bufp = buf;
1537219820Sjeff  size_t idp;
1538219820Sjeff
1539219820Sjeff  for (idp = 0; idp < len; idp++)
1540255932Salfred    if (id[idp] != '\\')
1541255932Salfred      *bufp++ = id[idp];
1542255932Salfred    else
1543219820Sjeff      {
1544	unsigned length = id[idp+1] == 'u' ? 4 : 8;
1545	cppchar_t value = 0;
1546	size_t bufleft = len - (bufp - buf);
1547	int rval;
1548
1549	idp += 2;
1550	while (length && idp < len && ISXDIGIT (id[idp]))
1551	  {
1552	    value = (value << 4) + hex_value (id[idp]);
1553	    idp++;
1554	    length--;
1555	  }
1556	idp--;
1557
1558	/* Special case for EBCDIC: if the identifier contains
1559	   a '$' specified using a UCN, translate it to EBCDIC.  */
1560	if (value == 0x24)
1561	  {
1562	    *bufp++ = '$';
1563	    continue;
1564	  }
1565
1566	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
1567	if (rval)
1568	  {
1569	    errno = rval;
1570	    cpp_errno (pfile, CPP_DL_ERROR,
1571		       "converting UCN to source character set");
1572	    break;
1573	  }
1574      }
1575
1576  return CPP_HASHNODE (ht_lookup (pfile->hash_table,
1577				  buf, bufp - buf, HT_ALLOC));
1578}
1579
1580/* Convert an input buffer (containing the complete contents of one
1581   source file) from INPUT_CHARSET to the source character set.  INPUT
1582   points to the input buffer, SIZE is its allocated size, and LEN is
1583   the length of the meaningful data within the buffer.  The
1584   translated buffer is returned, and *ST_SIZE is set to the length of
1585   the meaningful data within the translated buffer.
1586
1587   INPUT is expected to have been allocated with xmalloc.  This function
1588   will either return INPUT, or free it and return a pointer to another
1589   xmalloc-allocated block of memory.  */
1590uchar *
1591_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
1592		    uchar *input, size_t size, size_t len, off_t *st_size)
1593{
1594  struct cset_converter input_cset;
1595  struct _cpp_strbuf to;
1596
1597  input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
1598  if (input_cset.func == convert_no_conversion)
1599    {
1600      to.text = input;
1601      to.asize = size;
1602      to.len = len;
1603    }
1604  else
1605    {
1606      to.asize = MAX (65536, len);
1607      to.text = XNEWVEC (uchar, to.asize);
1608      to.len = 0;
1609
1610      if (!APPLY_CONVERSION (input_cset, input, len, &to))
1611	cpp_error (pfile, CPP_DL_ERROR,
1612		   "failure to convert %s to %s",
1613		   CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
1614
1615      free (input);
1616    }
1617
1618  /* Clean up the mess.  */
1619  if (input_cset.func == convert_using_iconv)
1620    iconv_close (input_cset.cd);
1621
1622  /* Resize buffer if we allocated substantially too much, or if we
1623     haven't enough space for the \n-terminator.  */
1624  if (to.len + 4096 < to.asize || to.len >= to.asize)
1625    to.text = XRESIZEVEC (uchar, to.text, to.len + 1);
1626
1627  /* If the file is using old-school Mac line endings (\r only),
1628     terminate with another \r, not an \n, so that we do not mistake
1629     the \r\n sequence for a single DOS line ending and erroneously
1630     issue the "No newline at end of file" diagnostic.  */
1631  /* APPLE LOCAL don't access to.text[-1] radar 6121572 */
1632  if (to.len > 0 && to.text[to.len - 1] == '\r')
1633    to.text[to.len] = '\r';
1634  else
1635    to.text[to.len] = '\n';
1636
1637  *st_size = to.len;
1638  return to.text;
1639}
1640
1641/* Decide on the default encoding to assume for input files.  */
1642const char *
1643_cpp_default_encoding (void)
1644{
1645  const char *current_encoding = NULL;
1646
1647  /* We disable this because the default codeset is 7-bit ASCII on
1648     most platforms, and this causes conversion failures on every
1649     file in GCC that happens to have one of the upper 128 characters
1650     in it -- most likely, as part of the name of a contributor.
1651     We should definitely recognize in-band markers of file encoding,
1652     like:
1653     - the appropriate Unicode byte-order mark (FE FF) to recognize
1654       UTF16 and UCS4 (in both big-endian and little-endian flavors)
1655       and UTF8
1656     - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
1657       distinguish ASCII and EBCDIC.
1658     - now we can parse something like "#pragma GCC encoding <xyz>
1659       on the first line, or even Emacs/VIM's mode line tags (there's
1660       a problem here in that VIM uses the last line, and Emacs has
1661       its more elaborate "local variables" convention).
1662     - investigate whether Java has another common convention, which
1663       would be friendly to support.
1664     (Zack Weinberg and Paolo Bonzini, May 20th 2004)  */
1665#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
1666  setlocale (LC_CTYPE, "");
1667  current_encoding = nl_langinfo (CODESET);
1668#endif
1669  if (current_encoding == NULL || *current_encoding == '\0')
1670    current_encoding = SOURCE_CHARSET;
1671
1672  return current_encoding;
1673}
1674