1/* xgettext C/C++/ObjectiveC backend.
2   Copyright (C) 1995-1998, 2000-2007 Free Software Foundation, Inc.
3
4   This file was written by Peter Miller <millerp@canb.auug.org.au>
5
6   This program is free software: you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 3 of the License, or
9   (at your option) any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
18
19#ifdef HAVE_CONFIG_H
20# include "config.h"
21#endif
22
23/* Specification.  */
24#include "x-c.h"
25
26#include <errno.h>
27#include <stdbool.h>
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31
32#include "message.h"
33#include "xgettext.h"
34#include "x-c.h"
35#include "error.h"
36#include "error-progname.h"
37#include "xalloc.h"
38#include "xvasprintf.h"
39#include "hash.h"
40#include "gettext.h"
41
/* Shorthand for looking up the translation of the message S.  */
#define _(s) gettext(s)

/* Number of elements of the array A.  Only meaningful when A really is an
   array (not a pointer).  The argument is parenthesized before it is
   indexed, so that any array-valued expression may be passed safely.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
45
46
47/* The ANSI C standard defines several phases of translation:
48
49   1. Terminate line by \n, regardless of the external representation
50      of a text line.  Stdio does this for us.
51
52   2. Convert trigraphs to their single character equivalents.
53
54   3. Concatenate each line ending in backslash (\) with the following
55      line.
56
57   4. Replace each comment with a space character.
58
   5. Parse each resulting logical line as preprocessing tokens and
      white space.
61
62   6. Recognize and carry out directives (it also expands macros on
63      non-directive lines, which we do not do here).
64
65   7. Replaces escape sequences within character strings with their
66      single character equivalents (we do this in step 5, because we
67      don't have to worry about the #include argument).
68
69   8. Concatenates adjacent string literals to form single string
70      literals (because we don't expand macros, there are a few things
71      we will miss).
72
73   9. Converts the remaining preprocessing tokens to C tokens and
74      discards any white space from the translation unit.
75
76   This lexer implements the above, and presents the scanner (in
77   xgettext.c) with a stream of C tokens.  The comments are
78   accumulated in a buffer, and given to xgettext when asked for.  */
79
80
81/* ========================= Lexer customization.  ========================= */
82
/* True if trigraph sequences are to be converted by phase2_getc().
   Off by default; switched on by x_c_trigraphs().  */
static bool trigraphs = false;

/* Enable trigraph conversion for all input lexed from now on.  */
void
x_c_trigraphs ()
{
  trigraphs = true;
}
90
91
92/* ====================== Keyword set customization.  ====================== */
93
/* If true extract all strings.  Set by x_c_extract_all().  */
static bool extract_all = false;

/* Keyword tables filled by add_keyword(): one for the C/C++ keywords
   (x_c_keyword) and one for the ObjectiveC keywords (x_objc_keyword).  */
static hash_table c_keywords;
static hash_table objc_keywords;
/* True as long as the built-in keywords still have to be installed by
   init_keywords().  Cleared by add_keyword (NULL, ...) and by
   init_keywords() itself once the defaults have been added.  */
static bool default_keywords = true;
100
101
/* Request that all strings be extracted (sets the file-level flag
   extract_all).  */
void
x_c_extract_all ()
{
  extract_all = true;
}
107
108
109static void
110add_keyword (const char *name, hash_table *keywords)
111{
112  if (name == NULL)
113    default_keywords = false;
114  else
115    {
116      const char *end;
117      struct callshape shape;
118      const char *colon;
119
120      if (keywords->table == NULL)
121	hash_init (keywords, 100);
122
123      split_keywordspec (name, &end, &shape);
124
125      /* The characters between name and end should form a valid C identifier.
126	 A colon means an invalid parse in split_keywordspec().  */
127      colon = strchr (name, ':');
128      if (colon == NULL || colon >= end)
129	insert_keyword_callshape (keywords, name, end - name, &shape);
130    }
131}
132
/* Add the keyword specification NAME to the C/C++ keyword table.
   A NULL NAME disables the default keywords (see add_keyword).  */
void
x_c_keyword (const char *name)
{
  add_keyword (name, &c_keywords);
}
138
/* Add the keyword specification NAME to the ObjectiveC keyword table.
   A NULL NAME disables the default keywords (see add_keyword).  */
void
x_objc_keyword (const char *name)
{
  add_keyword (name, &objc_keywords);
}
144
145/* Finish initializing the keywords hash tables.
146   Called after argument processing, before each file is processed.  */
147static void
148init_keywords ()
149{
150  if (default_keywords)
151    {
152      /* When adding new keywords here, also update the documentation in
153	 xgettext.texi!  */
154      x_c_keyword ("gettext");
155      x_c_keyword ("dgettext:2");
156      x_c_keyword ("dcgettext:2");
157      x_c_keyword ("ngettext:1,2");
158      x_c_keyword ("dngettext:2,3");
159      x_c_keyword ("dcngettext:2,3");
160      x_c_keyword ("gettext_noop");
161      x_c_keyword ("pgettext:1c,2");
162      x_c_keyword ("dpgettext:2c,3");
163      x_c_keyword ("dcpgettext:2c,3");
164      x_c_keyword ("npgettext:1c,2,3");
165      x_c_keyword ("dnpgettext:2c,3,4");
166      x_c_keyword ("dcnpgettext:2c,3,4");
167
168      x_objc_keyword ("gettext");
169      x_objc_keyword ("dgettext:2");
170      x_objc_keyword ("dcgettext:2");
171      x_objc_keyword ("ngettext:1,2");
172      x_objc_keyword ("dngettext:2,3");
173      x_objc_keyword ("dcngettext:2,3");
174      x_objc_keyword ("gettext_noop");
175      x_objc_keyword ("pgettext:1c,2");
176      x_objc_keyword ("dpgettext:2c,3");
177      x_objc_keyword ("dcpgettext:2c,3");
178      x_objc_keyword ("npgettext:1c,2,3");
179      x_objc_keyword ("dnpgettext:2c,3,4");
180      x_objc_keyword ("dcnpgettext:2c,3,4");
181      x_objc_keyword ("NSLocalizedString");	  /* similar to gettext */
182      x_objc_keyword ("_");			  /* similar to gettext */
183      x_objc_keyword ("NSLocalizedStaticString"); /* similar to gettext_noop */
184      x_objc_keyword ("__");			  /* similar to gettext_noop */
185
186      default_keywords = false;
187    }
188}
189
/* Record the flag FMT (a string literal such as "pass-c-format") for each
   function/argument pair of the gettext family listed below.  The
   complete "function:argnum:flag" specification is formed by string
   literal concatenation.  */
#define RECORD_C_GETTEXT_FAMILY_FLAG(fmt) \
  do \
    { \
      xgettext_record_flag ("gettext:1:" fmt); \
      xgettext_record_flag ("dgettext:2:" fmt); \
      xgettext_record_flag ("dcgettext:2:" fmt); \
      xgettext_record_flag ("ngettext:1:" fmt); \
      xgettext_record_flag ("ngettext:2:" fmt); \
      xgettext_record_flag ("dngettext:2:" fmt); \
      xgettext_record_flag ("dngettext:3:" fmt); \
      xgettext_record_flag ("dcngettext:2:" fmt); \
      xgettext_record_flag ("dcngettext:3:" fmt); \
      xgettext_record_flag ("gettext_noop:1:" fmt); \
      xgettext_record_flag ("pgettext:2:" fmt); \
      xgettext_record_flag ("dpgettext:3:" fmt); \
      xgettext_record_flag ("dcpgettext:3:" fmt); \
      xgettext_record_flag ("npgettext:2:" fmt); \
      xgettext_record_flag ("npgettext:3:" fmt); \
      xgettext_record_flag ("dnpgettext:3:" fmt); \
      xgettext_record_flag ("dnpgettext:4:" fmt); \
      xgettext_record_flag ("dcnpgettext:3:" fmt); \
      xgettext_record_flag ("dcnpgettext:4:" fmt); \
    } \
  while (0)

/* Register the format-string flags used when scanning C and C++ sources:
   the pass-through flags of the gettext family for the c, qt, kde and
   boost formats, plus the format flags of the printf family and of
   boost::format.  */
void
init_flag_table_c ()
{
  static const char *const printf_family_flags[] =
    {
      /* <stdio.h> */
      "fprintf:2:c-format",
      "vfprintf:2:c-format",
      "printf:1:c-format",
      "vprintf:1:c-format",
      "sprintf:2:c-format",
      "vsprintf:2:c-format",
      "snprintf:3:c-format",
      "vsnprintf:3:c-format",
#if 0 /* These functions are not standard.  */
      /* <stdio.h> */
      "asprintf:2:c-format",
      "vasprintf:2:c-format",
      "dprintf:2:c-format",
      "vdprintf:2:c-format",
      "obstack_printf:2:c-format",
      "obstack_vprintf:2:c-format",
      /* <error.h> */
      "error:3:c-format",
      "error_at_line:5:c-format",
      /* <argp.h> */
      "argp_error:2:c-format",
      "argp_failure:2:c-format",
#endif
    };
  size_t i;

  RECORD_C_GETTEXT_FAMILY_FLAG ("pass-c-format");

  for (i = 0; i < SIZEOF (printf_family_flags); i++)
    xgettext_record_flag (printf_family_flags[i]);

  RECORD_C_GETTEXT_FAMILY_FLAG ("pass-qt-format");

  RECORD_C_GETTEXT_FAMILY_FLAG ("pass-kde-format");

  RECORD_C_GETTEXT_FAMILY_FLAG ("pass-boost-format");

  /* <boost/format.hpp> */
  xgettext_record_flag ("format:1:boost-format");
}

#undef RECORD_C_GETTEXT_FAMILY_FLAG
301
/* Record the flag FMT (a string literal) for each function/argument pair
   of the gettext family listed below.  The complete
   "function:argnum:flag" specification is formed by string literal
   concatenation.  */
#define RECORD_OBJC_GETTEXT_FAMILY_FLAG(fmt) \
  do \
    { \
      xgettext_record_flag ("gettext:1:" fmt); \
      xgettext_record_flag ("dgettext:2:" fmt); \
      xgettext_record_flag ("dcgettext:2:" fmt); \
      xgettext_record_flag ("ngettext:1:" fmt); \
      xgettext_record_flag ("ngettext:2:" fmt); \
      xgettext_record_flag ("dngettext:2:" fmt); \
      xgettext_record_flag ("dngettext:3:" fmt); \
      xgettext_record_flag ("dcngettext:2:" fmt); \
      xgettext_record_flag ("dcngettext:3:" fmt); \
      xgettext_record_flag ("gettext_noop:1:" fmt); \
      xgettext_record_flag ("pgettext:2:" fmt); \
      xgettext_record_flag ("dpgettext:3:" fmt); \
      xgettext_record_flag ("dcpgettext:3:" fmt); \
      xgettext_record_flag ("npgettext:2:" fmt); \
      xgettext_record_flag ("npgettext:3:" fmt); \
      xgettext_record_flag ("dnpgettext:3:" fmt); \
      xgettext_record_flag ("dnpgettext:4:" fmt); \
      xgettext_record_flag ("dcnpgettext:3:" fmt); \
      xgettext_record_flag ("dcnpgettext:4:" fmt); \
    } \
  while (0)

/* Register the format-string flags that are specific to ObjectiveC
   sources.  */
void
init_flag_table_objc ()
{
  /* Flags for the ObjectiveC-only localization macros and for the
     NSString formatting methods.  */
  static const char *const objc_specific_flags[] =
    {
      "NSLocalizedString:1:pass-c-format",
      "NSLocalizedString:1:pass-objc-format",
      "_:1:pass-c-format",
      "_:1:pass-objc-format",
      "stringWithFormat::1:objc-format",
      "initWithFormat::1:objc-format",
      "stringByAppendingFormat::1:objc-format",
      "localizedStringWithFormat::1:objc-format",
      "appendFormat::1:objc-format"
    };
  size_t i;

  /* Since the settings done in init_flag_table_c() also have an effect for
     the ObjectiveC parser, we don't have to repeat them here.  */
  RECORD_OBJC_GETTEXT_FAMILY_FLAG ("pass-objc-format");

  for (i = 0; i < SIZEOF (objc_specific_flags); i++)
    xgettext_record_flag (objc_specific_flags[i]);
}

#undef RECORD_OBJC_GETTEXT_FAMILY_FLAG
336
/* Record the flag FMT (a string literal) for each function/argument pair
   of the gettext family listed below.  The complete
   "function:argnum:flag" specification is formed by string literal
   concatenation.  */
#define RECORD_GCC_GETTEXT_FAMILY_FLAG(fmt) \
  do \
    { \
      xgettext_record_flag ("gettext:1:" fmt); \
      xgettext_record_flag ("dgettext:2:" fmt); \
      xgettext_record_flag ("dcgettext:2:" fmt); \
      xgettext_record_flag ("ngettext:1:" fmt); \
      xgettext_record_flag ("ngettext:2:" fmt); \
      xgettext_record_flag ("dngettext:2:" fmt); \
      xgettext_record_flag ("dngettext:3:" fmt); \
      xgettext_record_flag ("dcngettext:2:" fmt); \
      xgettext_record_flag ("dcngettext:3:" fmt); \
      xgettext_record_flag ("gettext_noop:1:" fmt); \
      xgettext_record_flag ("pgettext:2:" fmt); \
      xgettext_record_flag ("dpgettext:3:" fmt); \
      xgettext_record_flag ("dcpgettext:3:" fmt); \
      xgettext_record_flag ("npgettext:2:" fmt); \
      xgettext_record_flag ("npgettext:3:" fmt); \
      xgettext_record_flag ("dnpgettext:3:" fmt); \
      xgettext_record_flag ("dnpgettext:4:" fmt); \
      xgettext_record_flag ("dcnpgettext:3:" fmt); \
      xgettext_record_flag ("dcnpgettext:4:" fmt); \
    } \
  while (0)

/* Register the pass-through flags of the gettext family for the
   gcc-internal format.  The flags of GCC's own diagnostic functions are
   kept below for reference only; they are compiled out.  */
void
init_flag_table_gcc_internal ()
{
  RECORD_GCC_GETTEXT_FAMILY_FLAG ("pass-gcc-internal-format");
#if 0 /* This should better be done inside GCC.  */
  /* grepping for ATTRIBUTE_PRINTF in gcc-3.3/gcc/?*.h */
  /* c-format.c */
  xgettext_record_flag ("status_warning:2:gcc-internal-format");
  /* c-tree.h */
  xgettext_record_flag ("pedwarn_c99:1:pass-gcc-internal-format");
  /* collect2.h */
  //xgettext_record_flag ("error:1:c-format"); // 3 different versions
  xgettext_record_flag ("notice:1:c-format");
  //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions
  xgettext_record_flag ("fatal_perror:1:c-format");
  /* cpplib.h */
  xgettext_record_flag ("cpp_error:3:c-format");
  xgettext_record_flag ("cpp_error_with_line:5:c-format");
  /* diagnostic.h */
  xgettext_record_flag ("diagnostic_set_info:2:pass-gcc-internal-format");
  xgettext_record_flag ("output_printf:2:gcc-internal-format");
  xgettext_record_flag ("output_verbatim:2:pass-gcc-internal-format");
  xgettext_record_flag ("verbatim:1:gcc-internal-format");
  xgettext_record_flag ("inform:1:pass-gcc-internal-format");
  /* gcc.h */
  //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions
  //xgettext_record_flag ("error:1:c-format"); // 3 different versions
  /* genattrtab.h */
  xgettext_record_flag ("attr_printf:2:pass-c-format");
  /* gengtype.h */
  xgettext_record_flag ("error_at_line:2:pass-c-format");
  xgettext_record_flag ("xvasprintf:2:pass-c-format");
  xgettext_record_flag ("xasprintf:1:pass-c-format");
  xgettext_record_flag ("oprintf:2:pass-c-format");
  /* gensupport.h */
  xgettext_record_flag ("message_with_line:2:pass-c-format");
  /* output.h */
  xgettext_record_flag ("output_operand_lossage:1:c-format");
  /* ra.h */
  xgettext_record_flag ("ra_debug_msg:2:pass-c-format");
  /* toplev.h */
  xgettext_record_flag ("fnotice:2:c-format");
  xgettext_record_flag ("fatal_io_error:2:gcc-internal-format");
  xgettext_record_flag ("error_for_asm:2:pass-gcc-internal-format");
  xgettext_record_flag ("warning_for_asm:2:pass-gcc-internal-format");
  xgettext_record_flag ("error_with_file_and_line:3:pass-gcc-internal-format");
  xgettext_record_flag ("error_with_decl:2:pass-gcc-internal-format");
  xgettext_record_flag ("pedwarn:1:gcc-internal-format");
  xgettext_record_flag ("pedwarn_with_file_and_line:3:gcc-internal-format");
  xgettext_record_flag ("pedwarn_with_decl:2:gcc-internal-format");
  xgettext_record_flag ("sorry:1:gcc-internal-format");
  xgettext_record_flag ("error:1:pass-gcc-internal-format");
  xgettext_record_flag ("fatal_error:1:pass-gcc-internal-format");
  xgettext_record_flag ("internal_error:1:pass-gcc-internal-format");
  xgettext_record_flag ("warning:1:pass-gcc-internal-format");
  xgettext_record_flag ("warning_with_file_and_line:3:pass-gcc-internal-format");
  xgettext_record_flag ("warning_with_decl:2:pass-gcc-internal-format");
  /* f/com.h */
  xgettext_record_flag ("ffecom_get_invented_identifier:1:pass-c-format");
  /* f/sts.h */
  xgettext_record_flag ("ffests_printf:2:pass-c-format");
  /* java/java-tree.h */
  xgettext_record_flag ("parse_error_context:2:pass-c-format");
#endif
}

#undef RECORD_GCC_GETTEXT_FAMILY_FLAG
420
421
422/* ======================== Reading of characters.  ======================== */
423
/* Real filename, used in error messages about the input file (see the
   read-error report in phase0_getc).  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number is kept up to date by phase1_getc() and phase1_ungetc().  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  All raw reads and pushbacks of phase 0 go
   through it.  */
static FILE *fp;
433
434
435/* 0. Terminate line by \n, regardless whether the external representation of
436   a line terminator is LF (Unix), CR (Mac) or CR/LF (DOS/Windows).
437   It is debatable whether supporting CR/LF line terminators in C sources
438   on Unix is ISO C or POSIX compliant, but since GCC 3.3 now supports it
439   unconditionally, it must be OK.
440   The so-called "text mode" in stdio on DOS/Windows translates CR/LF to \n
441   automatically, but here we also need this conversion on Unix.  As a side
442   effect, on DOS/Windows we also parse CR/CR/LF into a single \n, but this
443   is not a problem.  */
444
445
446static int
447phase0_getc ()
448{
449  int c;
450
451  c = getc (fp);
452  if (c == EOF)
453    {
454      if (ferror (fp))
455	error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
456	       real_file_name);
457      return EOF;
458    }
459
460  if (c == '\r')
461    {
462      int c1 = getc (fp);
463
464      if (c1 != EOF && c1 != '\n')
465	ungetc (c1, fp);
466
467      /* Seen line terminator CR or CR/LF.  */
468      return '\n';
469    }
470
471  return c;
472}
473
474
475/* Supports only one pushback character, and not '\n'.  */
476static inline void
477phase0_ungetc (int c)
478{
479  if (c != EOF)
480    ungetc (c, fp);
481}
482
483
484/* 1. line_number handling.  Combine backslash-newline to nothing.  */
485
486static unsigned char phase1_pushback[2];
487static int phase1_pushback_length;
488
489
490static int
491phase1_getc ()
492{
493  int c;
494
495  if (phase1_pushback_length)
496    {
497      c = phase1_pushback[--phase1_pushback_length];
498      if (c == '\n')
499	++line_number;
500      return c;
501    }
502  for (;;)
503    {
504      c = phase0_getc ();
505      switch (c)
506	{
507	case '\n':
508	  ++line_number;
509	  return '\n';
510
511	case '\\':
512	  c = phase0_getc ();
513	  if (c != '\n')
514	    {
515	      phase0_ungetc (c);
516	      return '\\';
517	    }
518	  ++line_number;
519	  break;
520
521	default:
522	  return c;
523	}
524    }
525}
526
527
528/* Supports 2 characters of pushback.  */
529static void
530phase1_ungetc (int c)
531{
532  switch (c)
533    {
534    case EOF:
535      break;
536
537    case '\n':
538      --line_number;
539      /* FALLTHROUGH */
540
541    default:
542      if (phase1_pushback_length == SIZEOF (phase1_pushback))
543	abort ();
544      phase1_pushback[phase1_pushback_length++] = c;
545      break;
546    }
547}
548
549
550/* 2. Convert trigraphs to their single character equivalents.  Most
551   sane human beings vomit copiously at the mention of trigraphs, which
552   is why they are an option.  */
553
554static unsigned char phase2_pushback[1];
555static int phase2_pushback_length;
556
557
558static int
559phase2_getc ()
560{
561  int c;
562
563  if (phase2_pushback_length)
564    return phase2_pushback[--phase2_pushback_length];
565  if (!trigraphs)
566    return phase1_getc ();
567
568  c = phase1_getc ();
569  if (c != '?')
570    return c;
571  c = phase1_getc ();
572  if (c != '?')
573    {
574      phase1_ungetc (c);
575      return '?';
576    }
577  c = phase1_getc ();
578  switch (c)
579    {
580    case '(':
581      return '[';
582    case '/':
583      return '\\';
584    case ')':
585      return ']';
586    case '\'':
587      return '^';
588    case '<':
589      return '{';
590    case '!':
591      return '|';
592    case '>':
593      return '}';
594    case '-':
595      return '~';
596    case '#':
597      return '=';
598    }
599  phase1_ungetc (c);
600  phase1_ungetc ('?');
601  return '?';
602}
603
604
605/* Supports only one pushback character.  */
606static void
607phase2_ungetc (int c)
608{
609  if (c != EOF)
610    {
611      if (phase2_pushback_length == SIZEOF (phase2_pushback))
612	abort ();
613      phase2_pushback[phase2_pushback_length++] = c;
614    }
615}
616
617
618/* 3. Concatenate each line ending in backslash (\) with the following
619   line.  Basically, all you need to do is elide "\\\n" sequences from
620   the input.  */
621
/* Pushback stack of phase 3 and the number of characters it holds.  */
static unsigned char phase3_pushback[2];
static int phase3_pushback_length;


/* Return the next character of phase 2, skipping every backslash that is
   immediately followed by a newline.  Pushed-back characters are returned
   first, most recent one first.  */
static int
phase3_getc ()
{
  int c;
  int next;

  if (phase3_pushback_length != 0)
    {
      phase3_pushback_length--;
      return phase3_pushback[phase3_pushback_length];
    }

  /* Keep reading as long as we see backslash-newline pairs.  */
  do
    {
      c = phase2_getc ();
      if (c != '\\')
        return c;
      next = phase2_getc ();
    }
  while (next == '\n');

  /* A backslash that does not continue the line: keep it, and give the
     character after it back to phase 2.  */
  phase2_ungetc (next);
  return '\\';
}
644
645
646/* Supports 2 characters of pushback.  */
647static void
648phase3_ungetc (int c)
649{
650  if (c != EOF)
651    {
652      if (phase3_pushback_length == SIZEOF (phase3_pushback))
653	abort ();
654      phase3_pushback[phase3_pushback_length++] = c;
655    }
656}
657
658
659/* Accumulating comments.  */
660
/* Growable buffer that collects the text of the comment line currently
   being read: BUFFER points to BUFMAX allocated bytes, of which the first
   BUFLEN are in use.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;

/* Begin a new comment line: discard whatever is in the buffer.  The
   allocation itself is kept for reuse.  */
static inline void
comment_start ()
{
  buflen = 0;
}
670
671static inline void
672comment_add (int c)
673{
674  if (buflen >= bufmax)
675    {
676      bufmax = 2 * bufmax + 10;
677      buffer = xrealloc (buffer, bufmax);
678    }
679  buffer[buflen++] = c;
680}
681
682static inline void
683comment_line_end (size_t chars_to_remove)
684{
685  buflen -= chars_to_remove;
686  while (buflen >= 1
687	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
688    --buflen;
689  if (chars_to_remove == 0 && buflen >= bufmax)
690    {
691      bufmax = 2 * bufmax + 10;
692      buffer = xrealloc (buffer, bufmax);
693    }
694  buffer[buflen] = '\0';
695  savable_comment_add (buffer);
696}
697
698
/* These are for tracking whether comments count as immediately before
   keyword.
   last_comment_line is set by phase4_getc() to the value of newline_count
   at the end of each comment.
   NOTE(review): last_non_comment_line and newline_count are maintained
   outside the code shown here; presumably newline_count counts the end of
   line tokens seen so far and last_non_comment_line is its value at the
   most recent non-comment token -- confirm against the token reader.  */
static int last_comment_line;
static int last_non_comment_line;
static int newline_count;
704
705
706/* 4. Replace each comment that is not inside a character constant or
707   string literal with a space character.  We need to remember the
708   comment for later, because it may be attached to a keyword string.
709   We also optionally understand C++ comments.  */
710
711static int
712phase4_getc ()
713{
714  int c;
715  bool last_was_star;
716
717  c = phase3_getc ();
718  if (c != '/')
719    return c;
720  c = phase3_getc ();
721  switch (c)
722    {
723    default:
724      phase3_ungetc (c);
725      return '/';
726
727    case '*':
728      /* C comment.  */
729      comment_start ();
730      last_was_star = false;
731      for (;;)
732	{
733	  c = phase3_getc ();
734	  if (c == EOF)
735	    break;
736	  /* We skip all leading white space, but not EOLs.  */
737	  if (!(buflen == 0 && (c == ' ' || c == '\t')))
738	    comment_add (c);
739	  switch (c)
740	    {
741	    case '\n':
742	      comment_line_end (1);
743	      comment_start ();
744	      last_was_star = false;
745	      continue;
746
747	    case '*':
748	      last_was_star = true;
749	      continue;
750
751	    case '/':
752	      if (last_was_star)
753		{
754		  comment_line_end (2);
755		  break;
756		}
757	      /* FALLTHROUGH */
758
759	    default:
760	      last_was_star = false;
761	      continue;
762	    }
763	  break;
764	}
765      last_comment_line = newline_count;
766      return ' ';
767
768    case '/':
769      /* C++ or ISO C 99 comment.  */
770      comment_start ();
771      for (;;)
772	{
773	  c = phase3_getc ();
774	  if (c == '\n' || c == EOF)
775	    break;
776	  /* We skip all leading white space, but not EOLs.  */
777	  if (!(buflen == 0 && (c == ' ' || c == '\t')))
778	    comment_add (c);
779	}
780      comment_line_end (0);
781      last_comment_line = newline_count;
782      return '\n';
783    }
784}
785
786
/* Push the character C back for phase 4.  Phase 4 has no pushback buffer
   of its own; the character goes to the phase 3 pushback, so it is not
   scanned for comments again.
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
793
794
795/* ========================== Reading of tokens.  ========================== */
796
797
/* True if ObjectiveC extensions are recognized.
   NOTE(review): not set anywhere in the code shown here; presumably chosen
   per input file by the extraction entry points -- confirm.  */
static bool objc_extensions;
800
/* The kinds of token produced by the lexer.  The comment next to each
   enumerator shows an example of the source text it stands for.  */
enum token_type_ty
{
  token_type_character_constant,	/* 'x' */
  token_type_eof,
  token_type_eoln,
  token_type_hash,			/* # */
  token_type_lparen,			/* ( */
  token_type_rparen,			/* ) */
  token_type_comma,			/* , */
  token_type_colon,			/* : */
  token_type_name,			/* abc */
  token_type_number,			/* 2.7 */
  token_type_string_literal,		/* "abc" */
  token_type_symbol,			/* < > = etc. */
  token_type_objc_special,		/* @ */
  token_type_white_space
};
typedef enum token_type_ty token_type_ty;
819
/* A token together with its attributes.  Which of the members besides
   TYPE are meaningful depends on TYPE; free_token() releases STRING and
   COMMENT for exactly the token types named below.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;		/* for token_type_name, token_type_string_literal */
  refcounted_string_list_ty *comment;	/* for token_type_string_literal,
					   token_type_objc_special */
  /* Reset to 0 by phase5_get() for every new token.
     NOTE(review): presumably the value of a token_type_number token --
     confirm.  */
  long number;
  /* Value of the global line_number when phase5_get() started to read
     the token.  */
  int line_number;
};
830
831
832/* 7. Replace escape sequences within character strings with their
833   single character equivalents.  This is called from phase 5, because
834   we don't have to worry about the #include argument.  There are
835   pathological cases which could bite us (like the DOS directory
836   separator), but just pretend it can't happen.  */
837
/* Out-of-band return values of phase7_getc() for an unescaped double
   quote, single quote and newline.  They are offset by 1000 so that they
   differ from the plain character codes, which is how an escaped quote or
   newline is returned.  */
#define P7_QUOTES (1000 + '"')
#define P7_QUOTE (1000 + '\'')
#define P7_NEWLINE (1000 + '\n')

/* Return the next character of a string literal or character constant,
   with escape sequences replaced by the character they denote.
   An unescaped '"', '\'' or newline is returned as P7_QUOTES, P7_QUOTE or
   P7_NEWLINE respectively; any other unescaped character, including EOF,
   is returned unchanged.  A backslash that does not start a recognized
   escape sequence is returned as is, and the characters after it are
   pushed back.
   NOTE(review): the value of a hexadecimal escape is not limited in
   range, so a long one can yield a value equal to one of the P7_*
   indicators (or overflow an int) -- confirm whether callers need to
   care.  */
static int
phase7_getc ()
{
  int c, n, j;

  /* Use phase 3, because phase 4 elides comments.  */
  c = phase3_getc ();

  /* Return a magic newline indicator, so that we can distinguish
     between the user requesting a newline in the string (e.g. using
     "\n" or "\012") from the user failing to terminate the string or
     character constant.  The ANSI C standard says: 3.1.3.4 Character
     Constants contain ``any character except single quote, backslash or
     newline; or an escape sequence'' and 3.1.4 String Literals contain
     ``any character except double quote, backslash or newline; or an
     escape sequence''.

     Most compilers give a fatal error in this case, however gcc is
     stupidly silent, even though this is a very common typo.  OK, so
     gcc --pedantic will tell me, but that gripes about too much other
     stuff.  Could I have a ``gcc -Wnewline-in-string'' option, or
     better yet a ``gcc -fno-newline-in-string'' option, please?  Gcc is
     also inconsistent between string literals and character constants:
     you may not embed newlines in character constants; try it, you get
     a useful diagnostic.  --PMiller  */
  if (c == '\n')
    return P7_NEWLINE;

  if (c == '"')
    return P7_QUOTES;
  if (c == '\'')
    return P7_QUOTE;
  if (c != '\\')
    return c;
  /* A backslash: decode the escape sequence it introduces.  */
  c = phase3_getc ();
  switch (c)
    {
    default:
      /* Unknown escape sequences really should be an error, but just
	 ignore them, and let the real compiler complain.  */
      phase3_ungetc (c);
      return '\\';

    case '"':
    case '\'':
    case '?':
    case '\\':
      return c;

    case 'a':
      return '\a';
    case 'b':
      return '\b';

      /* The \e escape is peculiar to gcc, and assumes an ASCII
	 character set (or superset).  We don't provide support for it
	 here.  */

    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case 'v':
      return '\v';

    case 'x':
      /* Hexadecimal escape: at least one hex digit must follow.  */
      c = phase3_getc ();
      switch (c)
	{
	default:
	  /* No digit: undo both reads (this uses both phase 3 pushback
	     slots) and return the backslash alone.  */
	  phase3_ungetc (c);
	  phase3_ungetc ('x');
	  return '\\';

	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
	  break;
	}
      /* Accumulate hex digits until the first character that is not
	 one; that character is pushed back.  */
      n = 0;
      for (;;)
	{
	  switch (c)
	    {
	    default:
	      phase3_ungetc (c);
	      return n;

	    case '0': case '1': case '2': case '3': case '4':
	    case '5': case '6': case '7': case '8': case '9':
	      n = n * 16 + c - '0';
	      break;

	    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
	      n = n * 16 + 10 + c - 'A';
	      break;

	    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
	      n = n * 16 + 10 + c - 'a';
	      break;
	    }
	  c = phase3_getc ();
	}
      return n;

    case '0': case '1': case '2': case '3':
    case '4': case '5': case '6': case '7':
      /* Octal escape: one to three octal digits.  After the loop, C is
	 the first character that does not belong to the escape; it is
	 pushed back.  */
      n = 0;
      for (j = 0; j < 3; ++j)
	{
	  n = n * 8 + c - '0';
	  c = phase3_getc ();
	  switch (c)
	    {
	    default:
	      break;

	    case '0': case '1': case '2': case '3':
	    case '4': case '5': case '6': case '7':
	      continue;
	    }
	  break;
	}
      phase3_ungetc (c);
      return n;
    }
}
974
975
/* Push the character C back for phase 7.  The character goes to the
   phase 3 pushback, which is where phase7_getc() reads from, so it is
   decoded again by the next phase7_getc() call.  */
static void
phase7_ungetc (int c)
{
  phase3_ungetc (c);
}
981
982
983/* Free the memory pointed to by a 'struct token_ty'.  */
984static inline void
985free_token (token_ty *tp)
986{
987  if (tp->type == token_type_name || tp->type == token_type_string_literal)
988    free (tp->string);
989  if (tp->type == token_type_string_literal
990      || tp->type == token_type_objc_special)
991    drop_reference (tp->comment);
992}
993
994
/* 5. Parse each resulting logical line as preprocessing tokens and
   white space.  Preprocessing tokens and C tokens don't always match.  */

/* Tokens handed back through phase5_unget, waiting to be returned again
   by phase5_get.  The array has room for a single token.  */
static token_ty phase5_pushback[1];
/* Number of tokens currently stored in phase5_pushback.  */
static int phase5_pushback_length;
1000
1001
1002static void
1003phase5_get (token_ty *tp)
1004{
1005  static char *buffer;
1006  static int bufmax;
1007  int bufpos;
1008  int c;
1009
1010  if (phase5_pushback_length)
1011    {
1012      *tp = phase5_pushback[--phase5_pushback_length];
1013      return;
1014    }
1015  tp->string = NULL;
1016  tp->number = 0;
1017  tp->line_number = line_number;
1018  c = phase4_getc ();
1019  switch (c)
1020    {
1021    case EOF:
1022      tp->type = token_type_eof;
1023      return;
1024
1025    case '\n':
1026      tp->type = token_type_eoln;
1027      return;
1028
1029    case ' ':
1030    case '\f':
1031    case '\t':
1032      for (;;)
1033	{
1034	  c = phase4_getc ();
1035	  switch (c)
1036	    {
1037	    case ' ':
1038	    case '\f':
1039	    case '\t':
1040	      continue;
1041
1042	    default:
1043	      phase4_ungetc (c);
1044	      break;
1045	    }
1046	  break;
1047	}
1048      tp->type = token_type_white_space;
1049      return;
1050
1051    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1052    case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
1053    case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1054    case 'V': case 'W': case 'X': case 'Y': case 'Z':
1055    case '_':
1056    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1057    case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1058    case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1059    case 'v': case 'w': case 'x': case 'y': case 'z':
1060      bufpos = 0;
1061      for (;;)
1062	{
1063	  if (bufpos >= bufmax)
1064	    {
1065	      bufmax = 2 * bufmax + 10;
1066	      buffer = xrealloc (buffer, bufmax);
1067	    }
1068	  buffer[bufpos++] = c;
1069	  c = phase4_getc ();
1070	  switch (c)
1071	    {
1072	    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1073	    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1074	    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1075	    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1076	    case 'Y': case 'Z':
1077	    case '_':
1078	    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1079	    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1080	    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1081	    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1082	    case 'y': case 'z':
1083	    case '0': case '1': case '2': case '3': case '4':
1084	    case '5': case '6': case '7': case '8': case '9':
1085	      continue;
1086
1087	    default:
1088	      phase4_ungetc (c);
1089	      break;
1090	    }
1091	  break;
1092	}
1093      if (bufpos >= bufmax)
1094	{
1095	  bufmax = 2 * bufmax + 10;
1096	  buffer = xrealloc (buffer, bufmax);
1097	}
1098      buffer[bufpos] = 0;
1099      tp->string = xstrdup (buffer);
1100      tp->type = token_type_name;
1101      return;
1102
1103    case '.':
1104      c = phase4_getc ();
1105      phase4_ungetc (c);
1106      switch (c)
1107	{
1108	default:
1109	  tp->type = token_type_symbol;
1110	  return;
1111
1112	case '0': case '1': case '2': case '3': case '4':
1113	case '5': case '6': case '7': case '8': case '9':
1114	  c = '.';
1115	  break;
1116	}
1117      /* FALLTHROUGH */
1118
1119    case '0': case '1': case '2': case '3': case '4':
1120    case '5': case '6': case '7': case '8': case '9':
1121      /* The preprocessing number token is more "generous" than the C
1122	 number tokens.  This is mostly due to token pasting (another
1123	 thing we can ignore here).  */
1124      bufpos = 0;
1125      for (;;)
1126	{
1127	  if (bufpos >= bufmax)
1128	    {
1129	      bufmax = 2 * bufmax + 10;
1130	      buffer = xrealloc (buffer, bufmax);
1131	    }
1132	  buffer[bufpos++] = c;
1133	  c = phase4_getc ();
1134	  switch (c)
1135	    {
1136	    case 'e':
1137	    case 'E':
1138	      if (bufpos >= bufmax)
1139		{
1140		  bufmax = 2 * bufmax + 10;
1141		  buffer = xrealloc (buffer, bufmax);
1142		}
1143	      buffer[bufpos++] = c;
1144	      c = phase4_getc ();
1145	      if (c != '+' || c != '-')
1146		{
1147		  phase4_ungetc (c);
1148		  break;
1149		}
1150	      continue;
1151
1152	    case 'A': case 'B': case 'C': case 'D':           case 'F':
1153	    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1154	    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1155	    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1156	    case 'Y': case 'Z':
1157	    case 'a': case 'b': case 'c': case 'd':           case 'f':
1158	    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1159	    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1160	    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1161	    case 'y': case 'z':
1162	    case '0': case '1': case '2': case '3': case '4':
1163	    case '5': case '6': case '7': case '8': case '9':
1164	    case '.':
1165	      continue;
1166
1167	    default:
1168	      phase4_ungetc (c);
1169	      break;
1170	    }
1171	  break;
1172	}
1173      if (bufpos >= bufmax)
1174	{
1175	  bufmax = 2 * bufmax + 10;
1176	  buffer = xrealloc (buffer, bufmax);
1177	}
1178      buffer[bufpos] = 0;
1179      tp->type = token_type_number;
1180      tp->number = atol (buffer);
1181      return;
1182
1183    case '\'':
1184      /* We could worry about the 'L' before wide character constants,
1185	 but ignoring it has no effect unless one of the keywords is
1186	 "L".  Just pretend it won't happen.  Also, we don't need to
1187	 remember the character constant.  */
1188      for (;;)
1189	{
1190	  c = phase7_getc ();
1191	  if (c == P7_NEWLINE)
1192	    {
1193	      error_with_progname = false;
1194	      error (0, 0, _("%s:%d: warning: unterminated character constant"),
1195		     logical_file_name, line_number - 1);
1196	      error_with_progname = true;
1197	      phase7_ungetc ('\n');
1198	      break;
1199	    }
1200	  if (c == EOF || c == P7_QUOTE)
1201	    break;
1202	}
1203      tp->type = token_type_character_constant;
1204      return;
1205
1206    case '"':
1207      /* We could worry about the 'L' before wide string constants,
1208	 but since gettext's argument is not a wide character string,
1209	 let the compiler complain about the argument not matching the
1210	 prototype.  Just pretend it won't happen.  */
1211      bufpos = 0;
1212      for (;;)
1213	{
1214	  c = phase7_getc ();
1215	  if (c == P7_NEWLINE)
1216	    {
1217	      error_with_progname = false;
1218	      error (0, 0, _("%s:%d: warning: unterminated string literal"),
1219		     logical_file_name, line_number - 1);
1220	      error_with_progname = true;
1221	      phase7_ungetc ('\n');
1222	      break;
1223	    }
1224	  if (c == EOF || c == P7_QUOTES)
1225	    break;
1226	  if (c == P7_QUOTE)
1227	    c = '\'';
1228	  if (bufpos >= bufmax)
1229	    {
1230	      bufmax = 2 * bufmax + 10;
1231	      buffer = xrealloc (buffer, bufmax);
1232	    }
1233	  buffer[bufpos++] = c;
1234	}
1235      if (bufpos >= bufmax)
1236	{
1237	  bufmax = 2 * bufmax + 10;
1238	  buffer = xrealloc (buffer, bufmax);
1239	}
1240      buffer[bufpos] = 0;
1241      tp->type = token_type_string_literal;
1242      tp->string = xstrdup (buffer);
1243      tp->comment = add_reference (savable_comment);
1244      return;
1245
1246    case '(':
1247      tp->type = token_type_lparen;
1248      return;
1249
1250    case ')':
1251      tp->type = token_type_rparen;
1252      return;
1253
1254    case ',':
1255      tp->type = token_type_comma;
1256      return;
1257
1258    case '#':
1259      tp->type = token_type_hash;
1260      return;
1261
1262    case ':':
1263      tp->type = token_type_colon;
1264      return;
1265
1266    case '@':
1267      if (objc_extensions)
1268	{
1269	  tp->type = token_type_objc_special;
1270	  tp->comment = add_reference (savable_comment);
1271	  return;
1272	}
1273      /* FALLTHROUGH */
1274
1275    default:
1276      /* We could carefully recognize each of the 2 and 3 character
1277	operators, but it is not necessary, as we only need to recognize
1278	gettext invocations.  Don't bother.  */
1279      tp->type = token_type_symbol;
1280      return;
1281    }
1282}
1283
1284
1285/* Supports only one pushback token.  */
1286static void
1287phase5_unget (token_ty *tp)
1288{
1289  if (tp->type != token_type_eof)
1290    {
1291      if (phase5_pushback_length == SIZEOF (phase5_pushback))
1292	abort ();
1293      phase5_pushback[phase5_pushback_length++] = *tp;
1294    }
1295}
1296
1297
1298/* X. Recognize a leading # symbol.  Leave leading hash as a hash, but
1299   turn hash in the middle of a line into a plain symbol token.  This
1300   makes the phase 6 easier.  */
1301
1302static void
1303phaseX_get (token_ty *tp)
1304{
1305  static bool middle;	/* false at the beginning of a line, true otherwise.  */
1306
1307  phase5_get (tp);
1308
1309  if (tp->type == token_type_eoln || tp->type == token_type_eof)
1310    middle = false;
1311  else
1312    {
1313      if (middle)
1314	{
1315	  /* Turn hash in the middle of a line into a plain symbol token.  */
1316	  if (tp->type == token_type_hash)
1317	    tp->type = token_type_symbol;
1318	}
1319      else
1320	{
1321	  /* When we see leading whitespace followed by a hash sign,
1322	     discard the leading white space token.  The hash is all
1323	     phase 6 is interested in.  */
1324	  if (tp->type == token_type_white_space)
1325	    {
1326	      token_ty next;
1327
1328	      phase5_get (&next);
1329	      if (next.type == token_type_hash)
1330		*tp = next;
1331	      else
1332		phase5_unget (&next);
1333	    }
1334	  middle = true;
1335	}
1336    }
1337}
1338
1339
/* 6. Recognize and carry out directives (it also expands macros on
   non-directive lines, which we do not do here).  The only directives
   we care about are the #line and #define directives.  We throw all
   the others away.  */

/* Tokens handed back through phase6_unget, waiting to be returned again
   by phase6_get.  The array has room for two tokens.  */
static token_ty phase6_pushback[2];
/* Number of tokens currently stored in phase6_pushback.  */
static int phase6_pushback_length;
1347
1348
1349static void
1350phase6_get (token_ty *tp)
1351{
1352  static token_ty *buf;
1353  static int bufmax;
1354  int bufpos;
1355  int j;
1356
1357  if (phase6_pushback_length)
1358    {
1359      *tp = phase6_pushback[--phase6_pushback_length];
1360      return;
1361    }
1362  for (;;)
1363    {
1364      /* Get the next token.  If it is not a '#' at the beginning of a
1365	 line (ignoring whitespace), return immediately.  */
1366      phaseX_get (tp);
1367      if (tp->type != token_type_hash)
1368	return;
1369
1370      /* Accumulate the rest of the directive in a buffer, until the
1371	 "define" keyword is seen or until end of line.  */
1372      bufpos = 0;
1373      for (;;)
1374	{
1375	  phaseX_get (tp);
1376	  if (tp->type == token_type_eoln || tp->type == token_type_eof)
1377	    break;
1378
1379	  /* Before the "define" keyword and inside other directives
1380	     white space is irrelevant.  So just throw it away.  */
1381	  if (tp->type != token_type_white_space)
1382	    {
1383	      /* If it is a #define directive, return immediately,
1384		 thus treating the body of the #define directive like
1385		 normal input.  */
1386	      if (bufpos == 0
1387		  && tp->type == token_type_name
1388		  && strcmp (tp->string, "define") == 0)
1389		return;
1390
1391	      /* Accumulate.  */
1392	      if (bufpos >= bufmax)
1393		{
1394		  bufmax = 2 * bufmax + 10;
1395		  buf = xrealloc (buf, bufmax * sizeof (buf[0]));
1396		}
1397	      buf[bufpos++] = *tp;
1398	    }
1399	}
1400
1401      /* If it is a #line directive, with no macros to expand, act on
1402	 it.  Ignore all other directives.  */
1403      if (bufpos >= 3 && buf[0].type == token_type_name
1404	  && strcmp (buf[0].string, "line") == 0
1405	  && buf[1].type == token_type_number
1406	  && buf[2].type == token_type_string_literal)
1407	{
1408	  logical_file_name = xstrdup (buf[2].string);
1409	  line_number = buf[1].number;
1410	}
1411      if (bufpos >= 2 && buf[0].type == token_type_number
1412	  && buf[1].type == token_type_string_literal)
1413	{
1414	  logical_file_name = xstrdup (buf[1].string);
1415	  line_number = buf[0].number;
1416	}
1417
1418      /* Release the storage held by the directive.  */
1419      for (j = 0; j < bufpos; ++j)
1420	free_token (&buf[j]);
1421
1422      /* We must reset the selected comments.  */
1423      savable_comment_reset ();
1424    }
1425}
1426
1427
1428/* Supports 2 tokens of pushback.  */
1429static void
1430phase6_unget (token_ty *tp)
1431{
1432  if (tp->type != token_type_eof)
1433    {
1434      if (phase6_pushback_length == SIZEOF (phase6_pushback))
1435	abort ();
1436      phase6_pushback[phase6_pushback_length++] = *tp;
1437    }
1438}
1439
1440
1441/* 8a. Convert ISO C 99 section 7.8.1 format string directives to string
1442   literal placeholders.  */
1443
/* Test whether NAME is the name of an ISO C 99 section 7.8.1 format
   string directive macro.
   Syntax:
     P R I { d | i | o | u | x | X }
     { { | LEAST | FAST } { 8 | 16 | 32 | 64 } | MAX | PTR }
   Return true if NAME matches this syntax exactly, false otherwise.  */
static bool
is_inttypes_macro (const char *name)
{
  static const char *const widths[] = { "8", "16", "32", "64" };
  const char *rest;
  size_t i;

  /* The fixed "PRI" prefix.  */
  if (strncmp (name, "PRI", 3) != 0)
    return false;

  /* The conversion letter.  strchr would also match the terminating
     NUL, so the end of the string is excluded explicitly.  */
  if (name[3] == '\0' || strchr ("diouxX", name[3]) == NULL)
    return false;
  rest = name + 4;

  /* The two suffixes that take no width.  */
  if (strcmp (rest, "MAX") == 0 || strcmp (rest, "PTR") == 0)
    return true;

  /* An optional LEAST or FAST qualifier ...  */
  if (strncmp (rest, "LEAST", 5) == 0)
    rest += 5;
  else if (strncmp (rest, "FAST", 4) == 0)
    rest += 4;

  /* ... followed by one of the supported widths and nothing else.  */
  for (i = 0; i < sizeof (widths) / sizeof (widths[0]); i++)
    if (strcmp (rest, widths[i]) == 0)
      return true;

  return false;
}
1482
1483static void
1484phase8a_get (token_ty *tp)
1485{
1486  phase6_get (tp);
1487  if (tp->type == token_type_name && is_inttypes_macro (tp->string))
1488    {
1489      /* Turn PRIdXXX into "<PRIdXXX>".  */
1490      char *new_string = xasprintf ("<%s>", tp->string);
1491      free (tp->string);
1492      tp->string = new_string;
1493      tp->comment = add_reference (savable_comment);
1494      tp->type = token_type_string_literal;
1495    }
1496}
1497
/* Hand the token *TP back to phase 8a.  Supports 2 tokens of pushback.
   This forwards directly to phase6_unget, so the token is returned by
   the next phase6_get without passing through the phase 8a conversion
   again.  */
static inline void
phase8a_unget (token_ty *tp)
{
  phase6_unget (tp);
}
1504
1505
1506/* 8b. Drop whitespace.  */
1507static void
1508phase8b_get (token_ty *tp)
1509{
1510  for (;;)
1511    {
1512      phase8a_get (tp);
1513
1514      if (tp->type == token_type_white_space)
1515	continue;
1516      if (tp->type == token_type_eoln)
1517	{
1518	  /* We have to track the last occurrence of a string.  One
1519	     mode of xgettext allows to group an extracted message
1520	     with a comment for documentation.  The rule which states
1521	     which comment is assumed to be grouped with the message
1522	     says it should immediately precede it.  Our
1523	     interpretation: between the last line of the comment and
1524	     the line in which the keyword is found must be no line
1525	     with non-white space tokens.  */
1526	  ++newline_count;
1527	  if (last_non_comment_line > last_comment_line)
1528	    savable_comment_reset ();
1529	  continue;
1530	}
1531      break;
1532    }
1533}
1534
/* Hand the token *TP back to phase 8b.  Supports 2 tokens of pushback.
   This forwards directly to phase8a_unget.  */
static inline void
phase8b_unget (token_ty *tp)
{
  phase8a_unget (tp);
}
1541
1542
1543/* 8c. In ObjectiveC mode, drop '@' before a literal string.  We need to
1544   do this before performing concatenation of adjacent string literals.  */
1545static void
1546phase8c_get (token_ty *tp)
1547{
1548  token_ty tmp;
1549
1550  phase8b_get (tp);
1551  if (tp->type != token_type_objc_special)
1552    return;
1553  phase8b_get (&tmp);
1554  if (tmp.type != token_type_string_literal)
1555    {
1556      phase8b_unget (&tmp);
1557      return;
1558    }
1559  /* Drop the '@' token and return immediately the following string.  */
1560  drop_reference (tmp.comment);
1561  tmp.comment = tp->comment;
1562  *tp = tmp;
1563}
1564
/* Hand the token *TP back to phase 8c.  Supports only one pushback
   token.  This forwards directly to phase8b_unget; note that
   phase8c_get may itself hand a lookahead token back through
   phase8b_unget.  */
static inline void
phase8c_unget (token_ty *tp)
{
  phase8b_unget (tp);
}
1571
1572
1573/* 8. Concatenate adjacent string literals to form single string
1574   literals (because we don't expand macros, there are a few things we
1575   will miss).  */
1576
1577static void
1578phase8_get (token_ty *tp)
1579{
1580  phase8c_get (tp);
1581  if (tp->type != token_type_string_literal)
1582    return;
1583  for (;;)
1584    {
1585      token_ty tmp;
1586      size_t len;
1587
1588      phase8c_get (&tmp);
1589      if (tmp.type != token_type_string_literal)
1590	{
1591	  phase8c_unget (&tmp);
1592	  return;
1593	}
1594      len = strlen (tp->string);
1595      tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
1596      strcpy (tp->string + len, tmp.string);
1597      free_token (&tmp);
1598    }
1599}
1600
1601
1602/* ===================== Reading of high-level tokens.  ==================== */
1603
1604
/* Kinds of high-level tokens delivered by x_c_lex.  */
enum xgettext_token_type_ty
{
  xgettext_token_type_eof,		/* end of input */
  xgettext_token_type_keyword,		/* name found in the keyword table */
  xgettext_token_type_symbol,		/* any other name */
  xgettext_token_type_lparen,		/* '(' */
  xgettext_token_type_rparen,		/* ')' */
  xgettext_token_type_comma,		/* ',' */
  xgettext_token_type_colon,		/* ':' */
  xgettext_token_type_string_literal,	/* string literal */
  xgettext_token_type_other		/* every other token */
};
typedef enum xgettext_token_type_ty xgettext_token_type_ty;
1618
/* A high-level token as delivered by x_c_lex.  Which fields carry a
   meaningful value depends on the token type, as noted on each field.  */
typedef struct xgettext_token_ty xgettext_token_ty;
struct xgettext_token_ty
{
  xgettext_token_type_ty type;

  /* This field is used only for xgettext_token_type_keyword.  */
  const struct callshapes *shapes;

  /* This field is used only for xgettext_token_type_string_literal,
     xgettext_token_type_keyword, xgettext_token_type_symbol.  */
  char *string;

  /* This field is used only for xgettext_token_type_string_literal.  */
  refcounted_string_list_ty *comment;

  /* These fields are only for
       xgettext_token_type_keyword,
       xgettext_token_type_string_literal.  */
  lex_pos_ty pos;
};
1639
1640
1641/* 9. Convert the remaining preprocessing tokens to C tokens and
1642   discards any white space from the translation unit.  */
1643
1644static void
1645x_c_lex (xgettext_token_ty *tp)
1646{
1647  for (;;)
1648    {
1649      token_ty token;
1650      void *keyword_value;
1651
1652      phase8_get (&token);
1653      switch (token.type)
1654	{
1655	case token_type_eof:
1656	  tp->type = xgettext_token_type_eof;
1657	  return;
1658
1659	case token_type_name:
1660	  last_non_comment_line = newline_count;
1661
1662	  if (hash_find_entry (objc_extensions ? &objc_keywords : &c_keywords,
1663			       token.string, strlen (token.string),
1664			       &keyword_value)
1665	      == 0)
1666	    {
1667	      tp->type = xgettext_token_type_keyword;
1668	      tp->shapes = (const struct callshapes *) keyword_value;
1669	      tp->pos.file_name = logical_file_name;
1670	      tp->pos.line_number = token.line_number;
1671	    }
1672	  else
1673	    tp->type = xgettext_token_type_symbol;
1674	  tp->string = token.string;
1675	  return;
1676
1677	case token_type_lparen:
1678	  last_non_comment_line = newline_count;
1679
1680	  tp->type = xgettext_token_type_lparen;
1681	  return;
1682
1683	case token_type_rparen:
1684	  last_non_comment_line = newline_count;
1685
1686	  tp->type = xgettext_token_type_rparen;
1687	  return;
1688
1689	case token_type_comma:
1690	  last_non_comment_line = newline_count;
1691
1692	  tp->type = xgettext_token_type_comma;
1693	  return;
1694
1695	case token_type_colon:
1696	  last_non_comment_line = newline_count;
1697
1698	  tp->type = xgettext_token_type_colon;
1699	  return;
1700
1701	case token_type_string_literal:
1702	  last_non_comment_line = newline_count;
1703
1704	  tp->type = xgettext_token_type_string_literal;
1705	  tp->string = token.string;
1706	  tp->comment = token.comment;
1707	  tp->pos.file_name = logical_file_name;
1708	  tp->pos.line_number = token.line_number;
1709	  return;
1710
1711	case token_type_objc_special:
1712	  drop_reference (token.comment);
1713	  /* FALLTHROUGH */
1714
1715	default:
1716	  last_non_comment_line = newline_count;
1717
1718	  tp->type = xgettext_token_type_other;
1719	  return;
1720	}
1721    }
1722}
1723
1724
1725/* ========================= Extracting strings.  ========================== */
1726
1727
/* Context lookup table.  Set by extract_whole_file from its flag_table
   argument and consulted by extract_parenthesized for each keyword or
   symbol name.  */
static flag_context_list_table_ty *flag_context_list_table;
1730
1731
1732/* The file is broken into tokens.  Scan the token stream, looking for
1733   a keyword, followed by a left paren, followed by a string.  When we
1734   see this sequence, we have something to remember.  We assume we are
1735   looking at a valid C or C++ program, and leave the complaints about
1736   the grammar to the compiler.
1737
1738     Normal handling: Look for
1739       keyword ( ... msgid ... )
1740     Plural handling: Look for
1741       keyword ( ... msgid ... msgid_plural ... )
1742
1743   We use recursion because the arguments before msgid or between msgid
1744   and msgid_plural can contain subexpressions of the same form.  */
1745
1746
1747/* Extract messages until the next balanced closing parenthesis.
1748   Extracted messages are added to MLP.
1749   Return true upon eof, false upon closing parenthesis.  */
1750static bool
1751extract_parenthesized (message_list_ty *mlp,
1752		       flag_context_ty outer_context,
1753		       flag_context_list_iterator_ty context_iter,
1754		       struct arglist_parser *argparser)
1755{
1756  /* Current argument number.  */
1757  int arg = 1;
1758  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1759  int state;
1760  /* Parameters of the keyword just seen.  Defined only in state 1.  */
1761  const struct callshapes *next_shapes = NULL;
1762  /* Context iterator that will be used if the next token is a '('.  */
1763  flag_context_list_iterator_ty next_context_iter =
1764    passthrough_context_list_iterator;
1765  /* Context iterator that will be used if the next token is a ':'.
1766     (Objective C selector syntax.)  */
1767  flag_context_list_iterator_ty selectorcall_context_iter =
1768    passthrough_context_list_iterator;
1769  /* Current context.  */
1770  flag_context_ty inner_context =
1771    inherited_context (outer_context,
1772		       flag_context_list_iterator_advance (&context_iter));
1773
1774  /* Start state is 0.  */
1775  state = 0;
1776
1777  for (;;)
1778    {
1779      xgettext_token_ty token;
1780
1781      x_c_lex (&token);
1782      switch (token.type)
1783	{
1784	case xgettext_token_type_keyword:
1785	  next_shapes = token.shapes;
1786	  state = 1;
1787	  goto keyword_or_symbol;
1788
1789	case xgettext_token_type_symbol:
1790	  state = 0;
1791	keyword_or_symbol:
1792	  next_context_iter =
1793	    flag_context_list_iterator (
1794	      flag_context_list_table_lookup (
1795		flag_context_list_table,
1796		token.string, strlen (token.string)));
1797	  if (objc_extensions)
1798	    {
1799	      size_t token_string_len = strlen (token.string);
1800	      token.string = xrealloc (token.string, token_string_len + 2);
1801	      token.string[token_string_len] = ':';
1802	      token.string[token_string_len + 1] = '\0';
1803	      selectorcall_context_iter =
1804		flag_context_list_iterator (
1805		  flag_context_list_table_lookup (
1806		    flag_context_list_table,
1807		    token.string, token_string_len + 1));
1808	    }
1809	  free (token.string);
1810	  continue;
1811
1812	case xgettext_token_type_lparen:
1813	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
1814				     arglist_parser_alloc (mlp,
1815							   state ? next_shapes : NULL)))
1816	    {
1817	      arglist_parser_done (argparser, arg);
1818	      return true;
1819	    }
1820	  next_context_iter = null_context_list_iterator;
1821	  selectorcall_context_iter = null_context_list_iterator;
1822	  state = 0;
1823	  continue;
1824
1825	case xgettext_token_type_rparen:
1826	  arglist_parser_done (argparser, arg);
1827	  return false;
1828
1829	case xgettext_token_type_comma:
1830	  arg++;
1831	  inner_context =
1832	    inherited_context (outer_context,
1833			       flag_context_list_iterator_advance (
1834				 &context_iter));
1835	  next_context_iter = passthrough_context_list_iterator;
1836	  selectorcall_context_iter = passthrough_context_list_iterator;
1837	  state = 0;
1838	  continue;
1839
1840	case xgettext_token_type_colon:
1841	  if (objc_extensions)
1842	    {
1843	      context_iter = selectorcall_context_iter;
1844	      inner_context =
1845		inherited_context (inner_context,
1846				   flag_context_list_iterator_advance (
1847				     &context_iter));
1848	      next_context_iter = passthrough_context_list_iterator;
1849	      selectorcall_context_iter = passthrough_context_list_iterator;
1850	    }
1851	  else
1852	    {
1853	      next_context_iter = null_context_list_iterator;
1854	      selectorcall_context_iter = null_context_list_iterator;
1855	    }
1856	  state = 0;
1857	  continue;
1858
1859	case xgettext_token_type_string_literal:
1860	  if (extract_all)
1861	    remember_a_message (mlp, NULL, token.string, inner_context,
1862				&token.pos, token.comment);
1863	  else
1864	    arglist_parser_remember (argparser, arg, token.string,
1865				     inner_context,
1866				     token.pos.file_name, token.pos.line_number,
1867				     token.comment);
1868	  drop_reference (token.comment);
1869	  next_context_iter = null_context_list_iterator;
1870	  selectorcall_context_iter = null_context_list_iterator;
1871	  state = 0;
1872	  continue;
1873
1874	case xgettext_token_type_other:
1875	  next_context_iter = null_context_list_iterator;
1876	  selectorcall_context_iter = null_context_list_iterator;
1877	  state = 0;
1878	  continue;
1879
1880	case xgettext_token_type_eof:
1881	  arglist_parser_done (argparser, arg);
1882	  return true;
1883
1884	default:
1885	  abort ();
1886	}
1887    }
1888}
1889
1890
1891static void
1892extract_whole_file (FILE *f,
1893		    const char *real_filename, const char *logical_filename,
1894		    flag_context_list_table_ty *flag_table,
1895		    msgdomain_list_ty *mdlp)
1896{
1897  message_list_ty *mlp = mdlp->item[0]->messages;
1898
1899  fp = f;
1900  real_file_name = real_filename;
1901  logical_file_name = xstrdup (logical_filename);
1902  line_number = 1;
1903
1904  newline_count = 0;
1905  last_comment_line = -1;
1906  last_non_comment_line = -1;
1907
1908  flag_context_list_table = flag_table;
1909
1910  init_keywords ();
1911
1912  /* Eat tokens until eof is seen.  When extract_parenthesized returns
1913     due to an unbalanced closing parenthesis, just restart it.  */
1914  while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1915				 arglist_parser_alloc (mlp, NULL)))
1916    ;
1917
1918  /* Close scanner.  */
1919  fp = NULL;
1920  real_file_name = NULL;
1921  logical_file_name = NULL;
1922  line_number = 0;
1923}
1924
1925
/* Extract messages from the stream F, scanning it with the ObjectiveC
   extensions disabled.  All arguments are passed unchanged to
   extract_whole_file.  */
void
extract_c (FILE *f,
	   const char *real_filename, const char *logical_filename,
	   flag_context_list_table_ty *flag_table,
	   msgdomain_list_ty *mdlp)
{
  objc_extensions = false;
  extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
}
1935
/* Extract messages from the stream F, scanning it with the ObjectiveC
   extensions enabled.  All arguments are passed unchanged to
   extract_whole_file.  */
void
extract_objc (FILE *f,
	      const char *real_filename, const char *logical_filename,
	      flag_context_list_table_ty *flag_table,
	      msgdomain_list_ty *mdlp)
{
  objc_extensions = true;
  extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
}
1945