1/* xgettext Lisp backend.
2   Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.
3
4   This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program; if not, write to the Free Software Foundation,
18   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19
20#ifdef HAVE_CONFIG_H
21# include "config.h"
22#endif
23
24#include <errno.h>
25#include <stdbool.h>
26#include <stdio.h>
27#include <stdlib.h>
28#include <string.h>
29
30#include "message.h"
31#include "xgettext.h"
32#include "x-lisp.h"
33#include "error.h"
34#include "xalloc.h"
35#include "exit.h"
36#include "hash.h"
37#include "gettext.h"
38
39#define _(s) gettext(s)
40
41
42/* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2.
43   Since we are interested only in strings and in forms similar to
44        (gettext msgid ...)
45   or   (ngettext msgid msgid_plural ...)
46   we make the following simplifications:
47
48   - Assume the keywords and strings are in an ASCII compatible encoding.
49     This means we can read the input file one byte at a time, instead of
50     one character at a time.  No need to worry about multibyte characters:
51     If they occur as part of identifiers, they most probably act as
52     constituent characters, and the byte based approach will do the same.
53
54   - Assume the read table is the standard Common Lisp read table.
55     Non-standard read tables are mostly used to read data, not programs.
56
57   - Assume the read table case is :UPCASE, and *READ-BASE* is 10.
58
59   - Don't interpret #n= and #n#, they usually don't appear in programs.
60
61   - Don't interpret #+, #-, they are unlikely to appear in a gettext form.
62
63   The remaining syntax rules are:
64
65   - The syntax code assigned to each character, and how tokens are built
66     up from characters (single escape, multiple escape etc.).
67
68   - Comment syntax: ';' and '#| ... |#'.
69
70   - String syntax: "..." with single escapes.
71
72   - Read macros and dispatch macro character '#'.  Needed to be able to
73     tell which is the n-th argument of a function call.
74
75 */
76
77
78/* ========================= Lexer customization.  ========================= */
79
80/* 'readtable_case' is the case conversion that is applied to non-escaped
81    parts of symbol tokens.  In Common Lisp: (readtable-case *readtable*).  */
82
83enum rtcase
84{
85  case_upcase,
86  case_downcase,
87  case_preserve,
88  case_invert
89};
90
91static enum rtcase readtable_case = case_upcase;
92
93/* 'read_base' is the assumed radix of integers and rational numbers.
94   In Common Lisp: *read-base*.  */
95static int read_base = 10;
96
97/* 'read_preserve_whitespace' specifies whether a whitespace character
98   that terminates a token must be pushed back on the input stream.
99   We set it to true, because the special newline side effect in read_object()
100   requires that read_object() sees every newline not inside a token.  */
101static bool read_preserve_whitespace = true;
102
103
104/* ====================== Keyword set customization.  ====================== */
105
106/* If true extract all strings.  */
107static bool extract_all = false;
108
109static hash_table keywords;
110static bool default_keywords = true;
111
112
113void
114x_lisp_extract_all ()
115{
116  extract_all = true;
117}
118
119
120void
121x_lisp_keyword (const char *name)
122{
123  if (name == NULL)
124    default_keywords = false;
125  else
126    {
127      const char *end;
128      struct callshape shape;
129      const char *colon;
130      size_t len;
131      char *symname;
132      size_t i;
133
134      if (keywords.table == NULL)
135	hash_init (&keywords, 100);
136
137      split_keywordspec (name, &end, &shape);
138
139      /* The characters between name and end should form a valid Lisp symbol.
140	 Extract the symbol name part.  */
141      colon = strchr (name, ':');
142      if (colon != NULL && colon < end)
143	{
144	  name = colon + 1;
145	  if (name < end && *name == ':')
146	    name++;
147	  colon = strchr (name, ':');
148	  if (colon != NULL && colon < end)
149	    return;
150	}
151
152      /* Uppercase it.  */
153      len = end - name;
154      symname = (char *) xmalloc (len);
155      for (i = 0; i < len; i++)
156	symname[i] =
157	  (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]);
158
159      insert_keyword_callshape (&keywords, symname, len, &shape);
160    }
161}
162
163/* Finish initializing the keywords hash table.
164   Called after argument processing, before each file is processed.  */
165static void
166init_keywords ()
167{
168  if (default_keywords)
169    {
170      /* When adding new keywords here, also update the documentation in
171	 xgettext.texi!  */
172      x_lisp_keyword ("gettext");	/* I18N:GETTEXT */
173      x_lisp_keyword ("ngettext:1,2");	/* I18N:NGETTEXT */
174      x_lisp_keyword ("gettext-noop");
175      default_keywords = false;
176    }
177}
178
179void
180init_flag_table_lisp ()
181{
182  xgettext_record_flag ("gettext:1:pass-lisp-format");
183  xgettext_record_flag ("ngettext:1:pass-lisp-format");
184  xgettext_record_flag ("ngettext:2:pass-lisp-format");
185  xgettext_record_flag ("gettext-noop:1:pass-lisp-format");
186  xgettext_record_flag ("format:2:lisp-format");
187}
188
189
190/* ======================== Reading of characters.  ======================== */
191
192/* Real filename, used in error messages about the input file.  */
193static const char *real_file_name;
194
195/* Logical filename and line number, used to label the extracted messages.  */
196static char *logical_file_name;
197static int line_number;
198
199/* The input file stream.  */
200static FILE *fp;
201
202
203/* Fetch the next character from the input file.  */
204static int
205do_getc ()
206{
207  int c = getc (fp);
208
209  if (c == EOF)
210    {
211      if (ferror (fp))
212	error (EXIT_FAILURE, errno, _("\
213error while reading \"%s\""), real_file_name);
214    }
215  else if (c == '\n')
216   line_number++;
217
218  return c;
219}
220
221/* Put back the last fetched character, not EOF.  */
222static void
223do_ungetc (int c)
224{
225  if (c == '\n')
226    line_number--;
227  ungetc (c, fp);
228}
229
230
231/* ========= Reading of tokens.  See CLHS 2.2 "Reader Algorithm".  ========= */
232
233
234/* Syntax code.  See CLHS 2.1.4 "Character Syntax Types".  */
235
236enum syntax_code
237{
238  syntax_illegal,	/* non-printable, except whitespace	*/
239  syntax_single_esc,	/* '\' (single escape)			*/
240  syntax_multi_esc,	/* '|' (multiple escape)		*/
241  syntax_constituent,	/* everything else (constituent)	*/
242  syntax_whitespace,	/* TAB,LF,FF,CR,' ' (whitespace)	*/
243  syntax_eof,		/* EOF					*/
244  syntax_t_macro,	/* '()'"' (terminating macro)		*/
245  syntax_nt_macro	/* '#' (non-terminating macro)		*/
246};
247
248/* Returns the syntax code of a character.  */
249static enum syntax_code
250syntax_code_of (unsigned char c)
251{
252  switch (c)
253    {
254    case '\\':
255      return syntax_single_esc;
256    case '|':
257      return syntax_multi_esc;
258    case '\t': case '\n': case '\f': case '\r': case ' ':
259      return syntax_whitespace;
260    case '(': case ')': case '\'': case '"': case ',': case ';': case '`':
261      return syntax_t_macro;
262    case '#':
263      return syntax_nt_macro;
264    default:
265      if (c < ' ' && c != '\b')
266	return syntax_illegal;
267      else
268	return syntax_constituent;
269    }
270}
271
272struct char_syntax
273{
274  int ch;			/* character */
275  enum syntax_code scode;	/* syntax code */
276};
277
278/* Returns the next character and its syntax code.  */
279static void
280read_char_syntax (struct char_syntax *p)
281{
282  int c = do_getc ();
283
284  p->ch = c;
285  p->scode = (c == EOF ? syntax_eof : syntax_code_of (c));
286}
287
288/* Every character in a token has an attribute assigned.  The attributes
289   help during interpretation of the token.  See
290   CLHS 2.3 "Interpretation of Tokens" for the possible interpretations,
291   and CLHS 2.1.4.2 "Constituent Traits".  */
292
293enum attribute
294{
295  a_illg,	/* invalid constituent */
296  a_pack_m,	/* ':' package marker */
297  a_alpha,	/* normal alphabetic */
298  a_escaped,	/* alphabetic but not subject to case conversion */
299  a_ratio,	/* '/' */
300  a_dot,	/* '.' */
301  a_sign,	/* '+-' */
302  a_extens,	/* '_^' extension characters */
303  a_digit,	/* '0123456789' */
304  a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */
305  a_expodigit,	/* 'esfdlESFDL' below base */
306  a_letter,	/* 'A'-'Z','a'-'z', except 'esfdlESFDL' */
307  a_expo	/* 'esfdlESFDL' */
308};
309
310#define is_letter_attribute(a) ((a) >= a_letter)
311#define is_number_attribute(a) ((a) >= a_ratio)
312
313/* Returns the attribute of a character, assuming base 10.  */
314static enum attribute
315attribute_of (unsigned char c)
316{
317  switch (c)
318    {
319    case ':':
320      return a_pack_m;
321    case '/':
322      return a_ratio;
323    case '.':
324      return a_dot;
325    case '+': case '-':
326      return a_sign;
327    case '_': case '^':
328      return a_extens;
329    case '0': case '1': case '2': case '3': case '4':
330    case '5': case '6': case '7': case '8': case '9':
331      return a_digit;
332    case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j':
333    case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
334    case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
335    case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J':
336    case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
337    case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
338      return a_letter;
339    case 'e': case 's': case 'd': case 'f': case 'l':
340    case 'E': case 'S': case 'D': case 'F': case 'L':
341      return a_expo;
342    default:
343      /* Treat everything as valid.  Never return a_illg.  */
344      return a_alpha;
345    }
346}
347
348struct token_char
349{
350  unsigned char ch;		/* character */
351  unsigned char attribute;	/* attribute */
352};
353
354/* A token consists of a sequence of characters with associated attribute.  */
355struct token
356{
357  int allocated;		/* number of allocated 'token_char's */
358  int charcount;		/* number of used 'token_char's */
359  struct token_char *chars;	/* the token's constituents */
360  bool with_escape;		/* whether single-escape or multiple escape occurs */
361};
362
363/* Initialize a 'struct token'.  */
364static inline void
365init_token (struct token *tp)
366{
367  tp->allocated = 10;
368  tp->chars =
369    (struct token_char *) xmalloc (tp->allocated * sizeof (struct token_char));
370  tp->charcount = 0;
371}
372
373/* Free the memory pointed to by a 'struct token'.  */
374static inline void
375free_token (struct token *tp)
376{
377  free (tp->chars);
378}
379
380/* Ensure there is enough room in the token for one more character.  */
381static inline void
382grow_token (struct token *tp)
383{
384  if (tp->charcount == tp->allocated)
385    {
386      tp->allocated *= 2;
387      tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char));
388    }
389}
390
391/* Read the next token.  If 'first' is given, it points to the first
392   character, which has already been read.
393   The algorithm follows CLHS 2.2 "Reader Algorithm".  */
394static void
395read_token (struct token *tp, const struct char_syntax *first)
396{
397  bool multiple_escape_flag;
398  struct char_syntax curr;
399
400  init_token (tp);
401  tp->with_escape = false;
402
403  multiple_escape_flag = false;
404  if (first)
405    curr = *first;
406  else
407    read_char_syntax (&curr);
408
409  for (;; read_char_syntax (&curr))
410    {
411      switch (curr.scode)
412	{
413	case syntax_illegal:
414	  /* Invalid input.  Be tolerant, no error message.  */
415	  do_ungetc (curr.ch);
416	  return;
417
418	case syntax_single_esc:
419	  tp->with_escape = true;
420	  read_char_syntax (&curr);
421	  if (curr.scode == syntax_eof)
422	    /* Invalid input.  Be tolerant, no error message.  */
423	    return;
424	  grow_token (tp);
425	  tp->chars[tp->charcount].ch = curr.ch;
426	  tp->chars[tp->charcount].attribute = a_escaped;
427	  tp->charcount++;
428	  break;
429
430	case syntax_multi_esc:
431	  multiple_escape_flag = !multiple_escape_flag;
432	  tp->with_escape = true;
433	  break;
434
435	case syntax_constituent:
436	case syntax_nt_macro:
437	  grow_token (tp);
438	  if (multiple_escape_flag)
439	    {
440	      tp->chars[tp->charcount].ch = curr.ch;
441	      tp->chars[tp->charcount].attribute = a_escaped;
442	      tp->charcount++;
443	    }
444	  else
445	    {
446	      tp->chars[tp->charcount].ch = curr.ch;
447	      tp->chars[tp->charcount].attribute = attribute_of (curr.ch);
448	      tp->charcount++;
449	    }
450	  break;
451
452	case syntax_whitespace:
453	case syntax_t_macro:
454	  if (multiple_escape_flag)
455	    {
456	      grow_token (tp);
457	      tp->chars[tp->charcount].ch = curr.ch;
458	      tp->chars[tp->charcount].attribute = a_escaped;
459	      tp->charcount++;
460	    }
461	  else
462	    {
463	      if (curr.scode != syntax_whitespace || read_preserve_whitespace)
464		do_ungetc (curr.ch);
465	      return;
466	    }
467	  break;
468
469	case syntax_eof:
470	  if (multiple_escape_flag)
471	    /* Invalid input.  Be tolerant, no error message.  */
472	    ;
473	  return;
474	}
475    }
476}
477
478/* A potential number is a token which
479   1. consists only of digits, '+','-','/','^','_','.' and number markers.
480      The base for digits is context dependent, but always 10 if a dot '.'
481      occurs. A number marker is a non-digit letter which is not adjacent
482      to a non-digit letter.
483   2. has at least one digit.
484   3. starts with a digit, '+','-','.','^' or '_'.
485   4. does not end with '+' or '-'.
486   See CLHS 2.3.1.1 "Potential Numbers as Tokens".
487 */
488
489static inline bool
490has_a_dot (const struct token *tp)
491{
492  int n = tp->charcount;
493  int i;
494
495  for (i = 0; i < n; i++)
496    if (tp->chars[i].attribute == a_dot)
497      return true;
498  return false;
499}
500
501static inline bool
502all_a_number (const struct token *tp)
503{
504  int n = tp->charcount;
505  int i;
506
507  for (i = 0; i < n; i++)
508    if (!is_number_attribute (tp->chars[i].attribute))
509      return false;
510  return true;
511}
512
513static inline void
514a_letter_to_digit (const struct token *tp, int base)
515{
516  int n = tp->charcount;
517  int i;
518
519  for (i = 0; i < n; i++)
520    if (is_letter_attribute (tp->chars[i].attribute))
521      {
522	int c = tp->chars[i].ch;
523
524	if (c >= 'a')
525	  c -= 'a' - 'A';
526	if (c - 'A' + 10 < base)
527	  tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit,
528					  a_expo -> a_expodigit */
529      }
530}
531
532static inline bool
533has_a_digit (const struct token *tp)
534{
535  int n = tp->charcount;
536  int i;
537
538  for (i = 0; i < n; i++)
539    if (tp->chars[i].attribute == a_digit
540	|| tp->chars[i].attribute == a_letterdigit
541	|| tp->chars[i].attribute == a_expodigit)
542      return true;
543  return false;
544}
545
546static inline bool
547has_adjacent_letters (const struct token *tp)
548{
549  int n = tp->charcount;
550  int i;
551
552  for (i = 1; i < n; i++)
553    if (is_letter_attribute (tp->chars[i-1].attribute)
554	&& is_letter_attribute (tp->chars[i].attribute))
555      return true;
556  return false;
557}
558
559static bool
560is_potential_number (const struct token *tp, int *basep)
561{
562  /* CLHS 2.3.1.1.1:
563     "A potential number cannot contain any escape characters."  */
564  if (tp->with_escape)
565    return false;
566
567  if (has_a_dot (tp))
568    *basep = 10;
569
570  if (!all_a_number (tp))
571    return false;
572
573  a_letter_to_digit (tp, *basep);
574
575  if (!has_a_digit (tp))
576    return false;
577
578  if (has_adjacent_letters (tp))
579    return false;
580
581  if (!(tp->chars[0].attribute >= a_dot
582	&& tp->chars[0].attribute <= a_expodigit))
583    return false;
584
585  if (tp->chars[tp->charcount - 1].attribute == a_sign)
586    return false;
587
588  return true;
589}
590
591/* A number is one of integer, ratio, float.  Each has a particular syntax.
592   See CLHS 2.3.1 "Numbers as Tokens".
593   But note a mistake: The exponent rule should read:
594       exponent ::= exponent-marker [sign] {decimal-digit}+
595   (see 22.1.3.1.3 "Printing Floats").  */
596
597enum number_type
598{
599  n_none,
600  n_integer,
601  n_ratio,
602  n_float
603};
604
605static enum number_type
606is_number (const struct token *tp, int *basep)
607{
608  struct token_char *ptr_limit;
609  struct token_char *ptr1;
610
611  if (!is_potential_number (tp, basep))
612    return n_none;
613
614  /* is_potential_number guarantees
615     - all attributes are >= a_ratio,
616     - there is at least one a_digit or a_letterdigit or a_expodigit, and
617     - if there is an a_dot, then *basep = 10.  */
618
619  ptr1 = &tp->chars[0];
620  ptr_limit = &tp->chars[tp->charcount];
621
622  if (ptr1->attribute == a_sign)
623    ptr1++;
624
625  /* Test for syntax
626   * { a_sign | }
627   * { a_digit < base }+ { a_ratio { a_digit < base }+ | }
628   */
629  {
630    bool seen_a_ratio = false;
631    bool seen_a_digit = false;	/* seen a digit in last digit block? */
632    struct token_char *ptr;
633
634    for (ptr = ptr1;; ptr++)
635      {
636	if (ptr >= ptr_limit)
637	  {
638	    if (!seen_a_digit)
639	      break;
640	    if (seen_a_ratio)
641	      return n_ratio;
642	    else
643	      return n_integer;
644	  }
645	if (ptr->attribute == a_digit
646	    || ptr->attribute == a_letterdigit
647	    || ptr->attribute == a_expodigit)
648	  {
649	    int c = ptr->ch;
650
651	    c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10);
652	    if (c >= *basep)
653	      break;
654	    seen_a_digit = true;
655	  }
656	else if (ptr->attribute == a_ratio)
657	  {
658	    if (seen_a_ratio || !seen_a_digit)
659	      break;
660	    seen_a_ratio = true;
661	    seen_a_digit = false;
662	  }
663	else
664	  break;
665      }
666  }
667
668  /* Test for syntax
669   * { a_sign | }
670   * { a_digit }* { a_dot { a_digit }* | }
671   * { a_expo { a_sign | } { a_digit }+ | }
672   *
673   * If there is an exponent part, there must be digits before the dot or
674   * after the dot. The result is a float.
675   * If there is no exponen:
676   *   If there is no dot, it would an integer in base 10, but is has already
677   *   been verified to not be an integer in the current base.
678   *   If there is a dot:
679   *     If there are digits after the dot, it's a float.
680   *     Otherwise, if there are digits before the dot, it's an integer.
681   */
682  *basep = 10;
683  {
684    bool seen_a_dot = false;
685    bool seen_a_dot_with_leading_digits = false;
686    bool seen_a_digit = false;	/* seen a digit in last digit block? */
687    struct token_char *ptr;
688
689    for (ptr = ptr1;; ptr++)
690      {
691	if (ptr >= ptr_limit)
692	  {
693	    /* no exponent */
694	    if (!seen_a_dot)
695	      return n_none;
696	    if (seen_a_digit)
697	      return n_float;
698	    if (seen_a_dot_with_leading_digits)
699	      return n_integer;
700	    else
701	      return n_none;
702	  }
703	if (ptr->attribute == a_digit)
704	  {
705	    seen_a_digit = true;
706	  }
707	else if (ptr->attribute == a_dot)
708	  {
709	    if (seen_a_dot)
710	      return n_none;
711	    seen_a_dot = true;
712	    if (seen_a_digit)
713	      seen_a_dot_with_leading_digits = true;
714	    seen_a_digit = false;
715	  }
716	else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit)
717	  break;
718	else
719	  return n_none;
720      }
721    ptr++;
722    if (!seen_a_dot_with_leading_digits || !seen_a_digit)
723      return n_none;
724    if (ptr >= ptr_limit)
725      return n_none;
726    if (ptr->attribute == a_sign)
727      ptr++;
728    seen_a_digit = false;
729    for (;; ptr++)
730      {
731	if (ptr >= ptr_limit)
732	  break;
733	if (ptr->attribute != a_digit)
734	  return n_none;
735	seen_a_digit = true;
736      }
737    if (!seen_a_digit)
738      return n_none;
739    return n_float;
740  }
741}
742
743/* A token representing a symbol must be case converted.
744   For portability, we convert only ASCII characters here.  */
745
746static void
747upcase_token (struct token *tp)
748{
749  int n = tp->charcount;
750  int i;
751
752  for (i = 0; i < n; i++)
753    if (tp->chars[i].attribute != a_escaped)
754      {
755	unsigned char c = tp->chars[i].ch;
756	if (c >= 'a' && c <= 'z')
757	  tp->chars[i].ch = c - 'a' + 'A';
758      }
759}
760
761static void
762downcase_token (struct token *tp)
763{
764  int n = tp->charcount;
765  int i;
766
767  for (i = 0; i < n; i++)
768    if (tp->chars[i].attribute != a_escaped)
769      {
770	unsigned char c = tp->chars[i].ch;
771	if (c >= 'A' && c <= 'Z')
772	  tp->chars[i].ch = c - 'A' + 'a';
773      }
774}
775
776static void
777case_convert_token (struct token *tp)
778{
779  int n = tp->charcount;
780  int i;
781
782  switch (readtable_case)
783    {
784    case case_upcase:
785      upcase_token (tp);
786      break;
787
788    case case_downcase:
789      downcase_token (tp);
790      break;
791
792    case case_preserve:
793      break;
794
795    case case_invert:
796      {
797	bool seen_uppercase = false;
798	bool seen_lowercase = false;
799	for (i = 0; i < n; i++)
800	  if (tp->chars[i].attribute != a_escaped)
801	    {
802	      unsigned char c = tp->chars[i].ch;
803	      if (c >= 'a' && c <= 'z')
804		seen_lowercase = true;
805	      if (c >= 'A' && c <= 'Z')
806		seen_uppercase = true;
807	    }
808	if (seen_uppercase)
809	  {
810	    if (!seen_lowercase)
811	      downcase_token (tp);
812	  }
813	else
814	  {
815	    if (seen_lowercase)
816	      upcase_token (tp);
817	  }
818      }
819      break;
820    }
821}
822
823
824/* ========================= Accumulating comments ========================= */
825
826
827static char *buffer;
828static size_t bufmax;
829static size_t buflen;
830
831static inline void
832comment_start ()
833{
834  buflen = 0;
835}
836
837static inline void
838comment_add (int c)
839{
840  if (buflen >= bufmax)
841    {
842      bufmax = 2 * bufmax + 10;
843      buffer = xrealloc (buffer, bufmax);
844    }
845  buffer[buflen++] = c;
846}
847
848static inline void
849comment_line_end (size_t chars_to_remove)
850{
851  buflen -= chars_to_remove;
852  while (buflen >= 1
853	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
854    --buflen;
855  if (chars_to_remove == 0 && buflen >= bufmax)
856    {
857      bufmax = 2 * bufmax + 10;
858      buffer = xrealloc (buffer, bufmax);
859    }
860  buffer[buflen] = '\0';
861  savable_comment_add (buffer);
862}
863
864
865/* These are for tracking whether comments count as immediately before
866   keyword.  */
867static int last_comment_line;
868static int last_non_comment_line;
869
870
871/* ========================= Accumulating messages ========================= */
872
873
874static message_list_ty *mlp;
875
876
877/* ============== Reading of objects.  See CLHS 2 "Syntax".  ============== */
878
879
880/* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
881   Other objects need not to be represented precisely.  */
882enum object_type
883{
884  t_symbol,	/* symbol */
885  t_string,	/* string */
886  t_other,	/* other kind of real object */
887  t_dot,	/* '.' pseudo object */
888  t_close,	/* ')' pseudo object */
889  t_eof		/* EOF marker */
890};
891
892struct object
893{
894  enum object_type type;
895  struct token *token;		/* for t_symbol and t_string */
896  int line_number_at_start;	/* for t_string */
897};
898
899/* Free the memory pointed to by a 'struct object'.  */
900static inline void
901free_object (struct object *op)
902{
903  if (op->type == t_symbol || op->type == t_string)
904    {
905      free_token (op->token);
906      free (op->token);
907    }
908}
909
910/* Convert a t_symbol/t_string token to a char*.  */
911static char *
912string_of_object (const struct object *op)
913{
914  char *str;
915  const struct token_char *p;
916  char *q;
917  int n;
918
919  if (!(op->type == t_symbol || op->type == t_string))
920    abort ();
921  n = op->token->charcount;
922  str = (char *) xmalloc (n + 1);
923  q = str;
924  for (p = op->token->chars; n > 0; p++, n--)
925    *q++ = p->ch;
926  *q = '\0';
927  return str;
928}
929
930/* Context lookup table.  */
931static flag_context_list_table_ty *flag_context_list_table;
932
933/* Read the next object.  */
934static void
935read_object (struct object *op, flag_context_ty outer_context)
936{
937  for (;;)
938    {
939      struct char_syntax curr;
940
941      read_char_syntax (&curr);
942
943      switch (curr.scode)
944	{
945	case syntax_eof:
946	  op->type = t_eof;
947	  return;
948
949	case syntax_whitespace:
950	  if (curr.ch == '\n')
951	    /* Comments assumed to be grouped with a message must immediately
952	       precede it, with no non-whitespace token on a line between
953	       both.  */
954	    if (last_non_comment_line > last_comment_line)
955	      savable_comment_reset ();
956	  continue;
957
958	case syntax_illegal:
959	  op->type = t_other;
960	  return;
961
962	case syntax_single_esc:
963	case syntax_multi_esc:
964	case syntax_constituent:
965	  /* Start reading a token.  */
966	  op->token = (struct token *) xmalloc (sizeof (struct token));
967	  read_token (op->token, &curr);
968	  last_non_comment_line = line_number;
969
970	  /* Interpret the token.  */
971
972	  /* Dots.  */
973	  if (!op->token->with_escape
974	      && op->token->charcount == 1
975	      && op->token->chars[0].attribute == a_dot)
976	    {
977	      free_token (op->token);
978	      free (op->token);
979	      op->type = t_dot;
980	      return;
981	    }
982	  /* Tokens consisting entirely of dots are illegal, but be tolerant
983	     here.  */
984
985	  /* Number.  */
986	  {
987	    int base = read_base;
988
989	    if (is_number (op->token, &base) != n_none)
990	      {
991		free_token (op->token);
992		free (op->token);
993		op->type = t_other;
994		return;
995	      }
996	  }
997
998	  /* We interpret all other tokens as symbols (including 'reserved
999	     tokens', i.e. potential numbers which are not numbers).  */
1000	  case_convert_token (op->token);
1001	  op->type = t_symbol;
1002	  return;
1003
1004	case syntax_t_macro:
1005	case syntax_nt_macro:
1006	  /* Read a macro.  */
1007	  switch (curr.ch)
1008	    {
1009	    case '(':
1010	      {
1011		int arg = 0;		/* Current argument number.  */
1012		flag_context_list_iterator_ty context_iter;
1013		const struct callshapes *shapes = NULL;
1014		struct arglist_parser *argparser = NULL;
1015
1016		for (;; arg++)
1017		  {
1018		    struct object inner;
1019		    flag_context_ty inner_context;
1020
1021		    if (arg == 0)
1022		      inner_context = null_context;
1023		    else
1024		      inner_context =
1025			inherited_context (outer_context,
1026					   flag_context_list_iterator_advance (
1027					     &context_iter));
1028
1029		    read_object (&inner, inner_context);
1030
1031		    /* Recognize end of list.  */
1032		    if (inner.type == t_close)
1033		      {
1034			op->type = t_other;
1035			/* Don't bother converting "()" to "NIL".  */
1036			last_non_comment_line = line_number;
1037			if (argparser != NULL)
1038			  arglist_parser_done (argparser, arg);
1039			return;
1040		      }
1041
1042		    /* Dots are not allowed in every position.
1043		       But be tolerant.  */
1044
1045		    /* EOF inside list is illegal.
1046		       But be tolerant.  */
1047		    if (inner.type == t_eof)
1048		      break;
1049
1050		    if (arg == 0)
1051		      {
1052			/* This is the function position.  */
1053			if (inner.type == t_symbol)
1054			  {
1055			    char *symbol_name = string_of_object (&inner);
1056			    int i;
1057			    int prefix_len;
1058			    void *keyword_value;
1059
1060			    /* Omit any package name.  */
1061			    i = inner.token->charcount;
1062			    while (i > 0
1063				   && inner.token->chars[i-1].attribute != a_pack_m)
1064			      i--;
1065			    prefix_len = i;
1066
1067			    if (hash_find_entry (&keywords,
1068						 symbol_name + prefix_len,
1069						 strlen (symbol_name + prefix_len),
1070						 &keyword_value)
1071				== 0)
1072			      shapes = (const struct callshapes *) keyword_value;
1073
1074			    argparser = arglist_parser_alloc (mlp, shapes);
1075
1076			    context_iter =
1077			      flag_context_list_iterator (
1078				flag_context_list_table_lookup (
1079				  flag_context_list_table,
1080				  symbol_name, strlen (symbol_name)));
1081
1082			    free (symbol_name);
1083			  }
1084			else
1085			  context_iter = null_context_list_iterator;
1086		      }
1087		    else
1088		      {
1089			/* These are the argument positions.  */
1090			if (argparser != NULL && inner.type == t_string)
1091			  arglist_parser_remember (argparser, arg,
1092						   string_of_object (&inner),
1093						   inner_context,
1094						   logical_file_name,
1095						   inner.line_number_at_start,
1096						   savable_comment);
1097		      }
1098
1099		    free_object (&inner);
1100		  }
1101
1102		if (argparser != NULL)
1103		  arglist_parser_done (argparser, arg);
1104	      }
1105	      op->type = t_other;
1106	      last_non_comment_line = line_number;
1107	      return;
1108
1109	    case ')':
1110	      /* Tell the caller about the end of list.
1111		 Unmatched closing parenthesis is illegal.
1112		 But be tolerant.  */
1113	      op->type = t_close;
1114	      last_non_comment_line = line_number;
1115	      return;
1116
1117	    case ',':
1118	      {
1119		int c = do_getc ();
1120		/* The ,@ handling inside lists is wrong anyway, because
1121		   ,@form expands to an unknown number of elements.  */
1122		if (c != EOF && c != '@' && c != '.')
1123		  do_ungetc (c);
1124	      }
1125	      /*FALLTHROUGH*/
1126	    case '\'':
1127	    case '`':
1128	      {
1129		struct object inner;
1130
1131		read_object (&inner, null_context);
1132
1133		/* Dots and EOF are not allowed here.  But be tolerant.  */
1134
1135		free_object (&inner);
1136
1137		op->type = t_other;
1138		last_non_comment_line = line_number;
1139		return;
1140	      }
1141
1142	    case ';':
1143	      {
1144		bool all_semicolons = true;
1145
1146		last_comment_line = line_number;
1147		comment_start ();
1148		for (;;)
1149		  {
1150		    int c = do_getc ();
1151		    if (c == EOF || c == '\n')
1152		      break;
1153		    if (c != ';')
1154		      all_semicolons = false;
1155		    if (!all_semicolons)
1156		      {
1157			/* We skip all leading white space, but not EOLs.  */
1158			if (!(buflen == 0 && (c == ' ' || c == '\t')))
1159			  comment_add (c);
1160		      }
1161		  }
1162		comment_line_end (0);
1163		continue;
1164	      }
1165
1166	    case '"':
1167	      {
1168		op->token = (struct token *) xmalloc (sizeof (struct token));
1169		init_token (op->token);
1170		op->line_number_at_start = line_number;
1171		for (;;)
1172		  {
1173		    int c = do_getc ();
1174		    if (c == EOF)
1175		      /* Invalid input.  Be tolerant, no error message.  */
1176		      break;
1177		    if (c == '"')
1178		      break;
1179		    if (c == '\\') /* syntax_single_esc */
1180		      {
1181			c = do_getc ();
1182			if (c == EOF)
1183			  /* Invalid input.  Be tolerant, no error message.  */
1184			  break;
1185		      }
1186		    grow_token (op->token);
1187		    op->token->chars[op->token->charcount++].ch = c;
1188		  }
1189		op->type = t_string;
1190
1191		if (extract_all)
1192		  {
1193		    lex_pos_ty pos;
1194
1195		    pos.file_name = logical_file_name;
1196		    pos.line_number = op->line_number_at_start;
1197		    remember_a_message (mlp, NULL, string_of_object (op),
1198					null_context, &pos, savable_comment);
1199		  }
1200		last_non_comment_line = line_number;
1201		return;
1202	      }
1203
1204	    case '#':
1205	      /* Dispatch macro handling.  */
1206	      {
1207		int c;
1208
1209		for (;;)
1210		  {
1211		    c = do_getc ();
1212		    if (c == EOF)
1213		      /* Invalid input.  Be tolerant, no error message.  */
1214		      {
1215			op->type = t_other;
1216			return;
1217		      }
1218		    if (!(c >= '0' && c <= '9'))
1219		      break;
1220		  }
1221
1222		switch (c)
1223		  {
1224		  case '(':
1225		  case '"':
1226		    do_ungetc (c);
1227		    /*FALLTHROUGH*/
1228		  case '\'':
1229		  case ':':
1230		  case '.':
1231		  case ',':
1232		  case 'A': case 'a':
1233		  case 'C': case 'c':
1234		  case 'P': case 'p':
1235		  case 'S': case 's':
1236		    {
1237		      struct object inner;
1238		      read_object (&inner, null_context);
1239		      /* Dots and EOF are not allowed here.
1240			 But be tolerant.  */
1241		      free_object (&inner);
1242		      op->type = t_other;
1243		      last_non_comment_line = line_number;
1244		      return;
1245		    }
1246
1247		  case '|':
1248		    {
1249		      int depth = 0;
1250		      int c;
1251
1252		      comment_start ();
1253		      c = do_getc ();
1254		      for (;;)
1255			{
1256			  if (c == EOF)
1257			    break;
1258			  if (c == '|')
1259			    {
1260			      c = do_getc ();
1261			      if (c == EOF)
1262				break;
1263			      if (c == '#')
1264				{
1265				  if (depth == 0)
1266				    {
1267				      comment_line_end (0);
1268				      break;
1269				    }
1270				  depth--;
1271				  comment_add ('|');
1272				  comment_add ('#');
1273				  c = do_getc ();
1274				}
1275			      else
1276				comment_add ('|');
1277			    }
1278			  else if (c == '#')
1279			    {
1280			      c = do_getc ();
1281			      if (c == EOF)
1282				break;
1283			      comment_add ('#');
1284			      if (c == '|')
1285				{
1286				  depth++;
1287				  comment_add ('|');
1288				  c = do_getc ();
1289				}
1290			    }
1291			  else
1292			    {
1293			      /* We skip all leading white space.  */
1294			      if (!(buflen == 0 && (c == ' ' || c == '\t')))
1295				comment_add (c);
1296			      if (c == '\n')
1297				{
1298				  comment_line_end (1);
1299				  comment_start ();
1300				}
1301			      c = do_getc ();
1302			    }
1303			}
1304		      if (c == EOF)
1305			{
1306			  /* EOF not allowed here.  But be tolerant.  */
1307			  op->type = t_eof;
1308			  return;
1309			}
1310		      last_comment_line = line_number;
1311		      continue;
1312		    }
1313
1314		  case '\\':
1315		    {
1316		      struct token token;
1317		      struct char_syntax first;
1318		      first.ch = '\\';
1319		      first.scode = syntax_single_esc;
1320		      read_token (&token, &first);
1321		      free_token (&token);
1322		      op->type = t_other;
1323		      last_non_comment_line = line_number;
1324		      return;
1325		    }
1326
1327		  case 'B': case 'b':
1328		  case 'O': case 'o':
1329		  case 'X': case 'x':
1330		  case 'R': case 'r':
1331		  case '*':
1332		    {
1333		      struct token token;
1334		      read_token (&token, NULL);
1335		      free_token (&token);
1336		      op->type = t_other;
1337		      last_non_comment_line = line_number;
1338		      return;
1339		    }
1340
1341		  case '=':
1342		    /* Ignore read labels.  */
1343		    continue;
1344
1345		  case '#':
1346		    /* Don't bother looking up the corresponding object.  */
1347		    op->type = t_other;
1348		    last_non_comment_line = line_number;
1349		    return;
1350
1351		  case '+':
1352		  case '-':
1353		    /* Simply assume every feature expression is true.  */
1354		    {
1355		      struct object inner;
1356		      read_object (&inner, null_context);
1357		      /* Dots and EOF are not allowed here.
1358			 But be tolerant.  */
1359		      free_object (&inner);
1360		      continue;
1361		    }
1362
1363		  default:
1364		    op->type = t_other;
1365		    last_non_comment_line = line_number;
1366		    return;
1367		  }
1368		/*NOTREACHED*/
1369		abort ();
1370	      }
1371
1372	    default:
1373	      /*NOTREACHED*/
1374	      abort ();
1375	    }
1376
1377	default:
1378	  /*NOTREACHED*/
1379	  abort ();
1380	}
1381    }
1382}
1383
1384
1385void
1386extract_lisp (FILE *f,
1387	      const char *real_filename, const char *logical_filename,
1388	      flag_context_list_table_ty *flag_table,
1389	      msgdomain_list_ty *mdlp)
1390{
1391  mlp = mdlp->item[0]->messages;
1392
1393  fp = f;
1394  real_file_name = real_filename;
1395  logical_file_name = xstrdup (logical_filename);
1396  line_number = 1;
1397
1398  last_comment_line = -1;
1399  last_non_comment_line = -1;
1400
1401  flag_context_list_table = flag_table;
1402
1403  init_keywords ();
1404
1405  /* Eat tokens until eof is seen.  When read_object returns
1406     due to an unbalanced closing parenthesis, just restart it.  */
1407  do
1408    {
1409      struct object toplevel_object;
1410
1411      read_object (&toplevel_object, null_context);
1412
1413      if (toplevel_object.type == t_eof)
1414	break;
1415
1416      free_object (&toplevel_object);
1417    }
1418  while (!feof (fp));
1419
1420  /* Close scanner.  */
1421  fp = NULL;
1422  real_file_name = NULL;
1423  logical_file_name = NULL;
1424  line_number = 0;
1425}
1426