1/* xgettext PHP backend.
2   Copyright (C) 2001-2003, 2005-2007 Free Software Foundation, Inc.
3
4   This file was written by Bruno Haible <bruno@clisp.org>, 2002.
5
6   This program is free software: you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 3 of the License, or
9   (at your option) any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
18
19#ifdef HAVE_CONFIG_H
20# include "config.h"
21#endif
22
23/* Specification.  */
24#include "x-php.h"
25
26#include <errno.h>
27#include <stdbool.h>
28#include <stdio.h>
29#include <stdlib.h>
30
31#include "message.h"
32#include "xgettext.h"
33#include "x-php.h"
34#include "error.h"
35#include "xalloc.h"
36#include "gettext.h"
37
38#define _(s) gettext(s)
39
40#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
41
42
43/* The PHP syntax is defined in phpdoc/manual/langref.html.
44   See also php-4.1.0/Zend/zend_language_scanner.l
45   and      php-4.1.0/Zend/zend_language_parser.y.
46   Note that variable and function names can contain bytes in the range
47   0x7f..0xff; see
48     http://www.php.net/manual/en/language.variables.php
49     http://www.php.net/manual/en/language.functions.php  */
50
51
52/* ====================== Keyword set customization.  ====================== */
53
54/* If true extract all strings.  */
55static bool extract_all = false;
56
/* Table of the keywords registered through x_php_keyword(), which creates
   it on first use and stores each keyword together with its call shape.  */
static hash_table keywords;
/* True as long as init_keywords() still has to register the built-in
   keywords.  Cleared by x_php_keyword (NULL) and by init_keywords()
   itself.  */
static bool default_keywords = true;
59
60
/* Turn on "extract all strings" mode by setting the file-level
   extract_all flag.  */
void
x_php_extract_all ()
{
  extract_all = true;
}
66
67
68void
69x_php_keyword (const char *name)
70{
71  if (name == NULL)
72    default_keywords = false;
73  else
74    {
75      const char *end;
76      struct callshape shape;
77      const char *colon;
78
79      if (keywords.table == NULL)
80	hash_init (&keywords, 100);
81
82      split_keywordspec (name, &end, &shape);
83
84      /* The characters between name and end should form a valid C identifier.
85	 A colon means an invalid parse in split_keywordspec().  */
86      colon = strchr (name, ':');
87      if (colon == NULL || colon >= end)
88	insert_keyword_callshape (&keywords, name, end - name, &shape);
89    }
90}
91
92/* Finish initializing the keywords hash table.
93   Called after argument processing, before each file is processed.  */
94static void
95init_keywords ()
96{
97  if (default_keywords)
98    {
99      /* When adding new keywords here, also update the documentation in
100	 xgettext.texi!  */
101      x_php_keyword ("_");
102      x_php_keyword ("gettext");
103      x_php_keyword ("dgettext:2");
104      x_php_keyword ("dcgettext:2");
105      /* The following were added in PHP 4.2.0.  */
106      x_php_keyword ("ngettext:1,2");
107      x_php_keyword ("dngettext:2,3");
108      x_php_keyword ("dcngettext:2,3");
109      default_keywords = false;
110    }
111}
112
/* Record the format string flags of the PHP gettext functions and of
   sprintf/printf by passing each specification to xgettext_record_flag().  */
void
init_flag_table_php ()
{
  static const char *const flag_specs[] =
    {
      "_:1:pass-php-format",
      "gettext:1:pass-php-format",
      "dgettext:2:pass-php-format",
      "dcgettext:2:pass-php-format",
      "ngettext:1:pass-php-format",
      "ngettext:2:pass-php-format",
      "dngettext:2:pass-php-format",
      "dngettext:3:pass-php-format",
      "dcngettext:2:pass-php-format",
      "dcngettext:3:pass-php-format",
      "sprintf:1:php-format",
      "printf:1:php-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
129
130
131/* ======================== Reading of characters.  ======================== */
132
133
134/* Real filename, used in error messages about the input file.  */
135static const char *real_file_name;
136
137/* Logical filename and line number, used to label the extracted messages.  */
138static char *logical_file_name;
139static int line_number;
140
141/* The input file stream.  */
142static FILE *fp;
143
144
145/* 1. line_number handling.  */
146
/* Pushback stack of phase 1.  phase1_ungetc() pushes onto it and
   phase1_getc() pops the most recently pushed character first.  */
static unsigned char phase1_pushback[2];
static int phase1_pushback_length;
149
150static int
151phase1_getc ()
152{
153  int c;
154
155  if (phase1_pushback_length)
156    c = phase1_pushback[--phase1_pushback_length];
157  else
158    {
159      c = getc (fp);
160
161      if (c == EOF)
162	{
163	  if (ferror (fp))
164	    error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
165		   real_file_name);
166	  return EOF;
167	}
168    }
169
170  if (c == '\n')
171    line_number++;
172
173  return c;
174}
175
176/* Supports 2 characters of pushback.  */
177static void
178phase1_ungetc (int c)
179{
180  if (c != EOF)
181    {
182      if (c == '\n')
183	--line_number;
184
185      if (phase1_pushback_length == SIZEOF (phase1_pushback))
186	abort ();
187      phase1_pushback[phase1_pushback_length++] = c;
188    }
189}
190
191
192/* 2. Ignore HTML sections.  They are equivalent to PHP echo commands and
193   therefore don't contain translatable strings.  */
194
/* Tell whether C is one of the white space characters that may separate
   the parts of a <script language=php> tag.  */
static bool
html_is_space (int c)
{
  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

/* Starting with the already read character C, read on until a character
   that is not white space turns up, and return that character.  */
static int
html_skip_space (int c)
{
  while (html_is_space (c))
    c = phase1_getc ();
  return c;
}

/* Match the already read character C and the input following it against
   a word.  LOWER and UPPER are two spellings of the word, of equal
   length; at each position the character of either spelling is accepted.
   Return true once the whole word has been read.  Otherwise push back the
   first character that did not fit and return false.  */
static bool
html_match (int c, const char *lower, const char *upper)
{
  size_t i;

  for (i = 0; ; i++)
    {
      if (c != (unsigned char) lower[i] && c != (unsigned char) upper[i])
        {
          phase1_ungetc (c);
          return false;
        }
      if (lower[i + 1] == '\0')
        return true;
      c = phase1_getc ();
    }
}

/* Consume input in HTML mode up to and including the next tag that enters
   PHP mode, or up to the end of the input.  */
static void
skip_html ()
{
  for (;;)
    {
      int ch = phase1_getc ();

      if (ch == EOF)
        return;
      if (ch != '<')
        continue;

      ch = phase1_getc ();
      if (ch == EOF)
        return;

      if (ch == '?' || ch == '%')
        {
          /* <?php is the normal way to enter PHP mode.  <? and <?= as well
             as <% and <%= are recognized by PHP depending on a
             configuration setting.  An '=' directly after the opener
             belongs to the tag; anything else is left for the caller.  */
          int after = phase1_getc ();

          if (after != '=')
            phase1_ungetc (after);
          return;
        }

      if (ch == '<')
        {
          /* This '<' may itself open a tag: look at it again.  */
          phase1_ungetc (ch);
          continue;
        }

      /* < script language = php >
         < script language = "php" >
         < script language = 'php' >
         are always recognized.  The words "script" and "language" are
         matched without regard to case, the value "php" exactly.  */
      ch = html_skip_space (ch);
      if (!html_match (ch, "script", "SCRIPT"))
        continue;

      /* At least one white space character must follow "script".  */
      ch = phase1_getc ();
      if (!html_is_space (ch))
        {
          phase1_ungetc (ch);
          continue;
        }
      ch = html_skip_space (ch);
      if (!html_match (ch, "language", "LANGUAGE"))
        continue;

      ch = html_skip_space (phase1_getc ());
      if (ch != '=')
        {
          phase1_ungetc (ch);
          continue;
        }

      ch = html_skip_space (phase1_getc ());
      if (ch == '"')
        {
          if (!html_match (phase1_getc (), "php\"", "php\""))
            continue;
        }
      else if (ch == '\'')
        {
          if (!html_match (phase1_getc (), "php'", "php'"))
            continue;
        }
      else
        {
          if (!html_match (ch, "php", "php"))
            continue;
        }

      ch = html_skip_space (phase1_getc ());
      if (ch != '>')
        {
          phase1_ungetc (ch);
          continue;
        }
      return;
    }
}
436
/* The separate phase 2 below, which would leave PHP mode at ?>, %> and
   </script>, is compiled out.  phase4_get() performs these checks
   itself.  */
#if 0

static unsigned char phase2_pushback[1];
static int phase2_pushback_length;
441
442static int
443phase2_getc ()
444{
445  int c;
446
447  if (phase2_pushback_length)
448    return phase2_pushback[--phase2_pushback_length];
449
450  c = phase1_getc ();
451  switch (c)
452    {
453    case '?':
454    case '%':
455      {
456	int c2 = phase1_getc ();
457	if (c2 == '>')
458	  {
459	    /* ?> and %> terminate PHP mode and switch back to HTML mode.  */
460	    skip_html ();
461	    return ' ';
462	  }
463	phase1_ungetc (c2);
464      }
465      break;
466
467    case '<':
468      {
469	int c2 = phase1_getc ();
470
471	/* < / script > terminates PHP mode and switches back to HTML mode.  */
472	while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
473	  c2 = phase1_getc ();
474	if (c2 == '/')
475	  {
476	    do
477	      c2 = phase1_getc ();
478	    while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
479	    if (c2 == 's' || c2 == 'S')
480	      {
481		c2 = phase1_getc ();
482		if (c2 == 'c' || c2 == 'C')
483		  {
484		    c2 = phase1_getc ();
485		    if (c2 == 'r' || c2 == 'R')
486		      {
487			c2 = phase1_getc ();
488			if (c2 == 'i' || c2 == 'I')
489			  {
490			    c2 = phase1_getc ();
491			    if (c2 == 'p' || c2 == 'P')
492			      {
493				c2 = phase1_getc ();
494				if (c2 == 't' || c2 == 'T')
495				  {
496				    do
497				      c2 = phase1_getc ();
498				    while (c2 == ' ' || c2 == '\t'
499					   || c2 == '\n' || c2 == '\r');
500				    if (c2 == '>')
501				      {
502					skip_html ();
503					return ' ';
504				      }
505				  }
506			      }
507			  }
508		      }
509		  }
510	      }
511	  }
512	phase1_ungetc (c2);
513      }
514      break;
515    }
516
517  return c;
518}
519
520static void
521phase2_ungetc (int c)
522{
523  if (c != EOF)
524    {
525      if (phase2_pushback_length == SIZEOF (phase2_pushback))
526	abort ();
527      phase2_pushback[phase2_pushback_length++] = c;
528    }
529}
530
531#endif
532
533
534/* Accumulating comments.  */
535
/* The comment line being accumulated: its text, the allocated size of the
   text buffer, and the number of characters currently stored.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;
539
/* Begin a new comment line by emptying the comment buffer.  The allocated
   storage is kept for reuse.  */
static inline void
comment_start ()
{
  buflen = 0;
}
545
546static inline void
547comment_add (int c)
548{
549  if (buflen >= bufmax)
550    {
551      bufmax = 2 * bufmax + 10;
552      buffer = xrealloc (buffer, bufmax);
553    }
554  buffer[buflen++] = c;
555}
556
557static inline void
558comment_line_end (size_t chars_to_remove)
559{
560  buflen -= chars_to_remove;
561  while (buflen >= 1
562	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
563    --buflen;
564  if (chars_to_remove == 0 && buflen >= bufmax)
565    {
566      bufmax = 2 * bufmax + 10;
567      buffer = xrealloc (buffer, bufmax);
568    }
569  buffer[buflen] = '\0';
570  savable_comment_add (buffer);
571}
572
573
574/* 3. Replace each comment that is not inside a string literal with a
575   space character.  We need to remember the comment for later, because
576   it may be attached to a keyword string.  */
577
578/* These are for tracking whether comments count as immediately before
579   keyword.  */
580static int last_comment_line;
581static int last_non_comment_line;
582
/* Pushback stack of phase 3; phase3_getc() pops from it before reading
   from phase 1.  */
static unsigned char phase3_pushback[1];
static int phase3_pushback_length;
585
586static int
587phase3_getc ()
588{
589  int lineno;
590  int c;
591
592  if (phase3_pushback_length)
593    return phase3_pushback[--phase3_pushback_length];
594
595  c = phase1_getc ();
596
597  if (c == '#')
598    {
599      /* sh comment.  */
600      bool last_was_qmark = false;
601
602      comment_start ();
603      lineno = line_number;
604      for (;;)
605	{
606	  c = phase1_getc ();
607	  if (c == '\n' || c == EOF)
608	    {
609	      comment_line_end (0);
610	      break;
611	    }
612	  if (last_was_qmark && c == '>')
613	    {
614	      comment_line_end (1);
615	      skip_html ();
616	      break;
617	    }
618	  /* We skip all leading white space, but not EOLs.  */
619	  if (!(buflen == 0 && (c == ' ' || c == '\t')))
620	    comment_add (c);
621	  last_was_qmark = (c == '?' || c == '%');
622	}
623      last_comment_line = lineno;
624      return '\n';
625    }
626  else if (c == '/')
627    {
628      c = phase1_getc ();
629
630      switch (c)
631	{
632	default:
633	  phase1_ungetc (c);
634	  return '/';
635
636	case '*':
637	  {
638	    /* C comment.  */
639	    bool last_was_star;
640
641	    comment_start ();
642	    lineno = line_number;
643	    last_was_star = false;
644	    for (;;)
645	      {
646		c = phase1_getc ();
647		if (c == EOF)
648		  break;
649		/* We skip all leading white space, but not EOLs.  */
650		if (buflen == 0 && (c == ' ' || c == '\t'))
651		  continue;
652		comment_add (c);
653		switch (c)
654		  {
655		  case '\n':
656		    comment_line_end (1);
657		    comment_start ();
658		    lineno = line_number;
659		    last_was_star = false;
660		    continue;
661
662		  case '*':
663		    last_was_star = true;
664		    continue;
665
666		  case '/':
667		    if (last_was_star)
668		      {
669			comment_line_end (2);
670			break;
671		      }
672		    /* FALLTHROUGH */
673
674		  default:
675		    last_was_star = false;
676		    continue;
677		  }
678		break;
679	      }
680	    last_comment_line = lineno;
681	    return ' ';
682	  }
683
684	case '/':
685	  {
686	    /* C++ comment.  */
687	    bool last_was_qmark = false;
688
689	    comment_start ();
690	    lineno = line_number;
691	    for (;;)
692	      {
693		c = phase1_getc ();
694		if (c == '\n' || c == EOF)
695		  {
696		    comment_line_end (0);
697		    break;
698		  }
699		if (last_was_qmark && c == '>')
700		  {
701		    comment_line_end (1);
702		    skip_html ();
703		    break;
704		  }
705		/* We skip all leading white space, but not EOLs.  */
706		if (!(buflen == 0 && (c == ' ' || c == '\t')))
707		  comment_add (c);
708		last_was_qmark = (c == '?' || c == '%');
709	      }
710	    last_comment_line = lineno;
711	    return '\n';
712	  }
713	}
714    }
715  else
716    return c;
717}
718
#ifdef unused
/* Push C back so that the next phase3_getc() returns it.  EOF is ignored.
   Supports 1 character of pushback; a second one aborts.  */
static void
phase3_ungetc (int c)
{
  if (c == EOF)
    return;

  if (phase3_pushback_length == SIZEOF (phase3_pushback))
    abort ();
  phase3_pushback[phase3_pushback_length] = c;
  phase3_pushback_length += 1;
}
#endif
731
732
733/* ========================== Reading of tokens.  ========================== */
734
735
/* The kinds of tokens that phase4_get() distinguishes.  */
enum token_type_ty
{
  token_type_eof,
  token_type_lparen,		/* ( */
  token_type_rparen,		/* ) */
  token_type_comma,		/* , */
  token_type_lbracket,		/* [ */
  token_type_rbracket,		/* ] */
  token_type_dot,		/* . */
  token_type_operator1,		/* * / % ++ -- */
  token_type_operator2,		/* + - ! ~ @ */
  token_type_string_literal,	/* "abc" */
  token_type_symbol,		/* name starting with a letter, '_' or a
				   byte >= 0x7f; a leading digit yields
				   token_type_other instead */
  token_type_other		/* anything else: misc. operator, digit,
				   string with variables, here document */
};
typedef enum token_type_ty token_type_ty;
752
/* A token as delivered by phase4_get().  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;		/* for token_type_string_literal, token_type_symbol */
  refcounted_string_list_ty *comment;	/* for token_type_string_literal */
  int line_number;	/* value of the global line_number just before
			   phase4_get() read the token's first character */
};
761
762
763/* Free the memory pointed to by a 'struct token_ty'.  */
764static inline void
765free_token (token_ty *tp)
766{
767  if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
768    free (tp->string);
769  if (tp->type == token_type_string_literal)
770    drop_reference (tp->comment);
771}
772
773
774/* 4. Combine characters into tokens.  Discard whitespace.  */
775
776static token_ty phase4_pushback[3];
777static int phase4_pushback_length;
778
/* Read the next token into *TP.
   A token pushed back with phase4_unget() is returned first.  Otherwise
   white space is skipped (comments arrive as white space from
   phase3_getc) and one token is assembled.
   For token_type_symbol and token_type_string_literal, TP->string is a
   fresh xstrdup() copy of the token's text.  For string literals,
   TP->comment additionally holds a reference to savable_comment.  For all
   other freshly read tokens TP->string is NULL.
   TP->line_number is the value of line_number just before the token's
   first character was read.  */
static void
phase4_get (token_ty *tp)
{
  /* Scratch buffer local to this function.  These two names shadow the
     file-level comment buffer variables.  */
  static char *buffer;
  static int bufmax;
  int bufpos;
  int c;

  if (phase4_pushback_length)
    {
      *tp = phase4_pushback[--phase4_pushback_length];
      return;
    }
  tp->string = NULL;

  for (;;)
    {
      tp->line_number = line_number;
      c = phase3_getc ();
      switch (c)
	{
	case EOF:
	  tp->type = token_type_eof;
	  return;

	case '\n':
	  /* Forget the saved comments at a line end if the last comment
	     line lies before the last line that contained a token.  */
	  if (last_non_comment_line > last_comment_line)
	    savable_comment_reset ();
	  /* FALLTHROUGH */
	case ' ':
	case '\t':
	case '\r':
	  /* Ignore whitespace.  */
	  continue;
	}

      last_non_comment_line = tp->line_number;

      switch (c)
	{
	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
	case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
	case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
	case 'V': case 'W': case 'X': case 'Y': case 'Z':
	case '_':
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
	case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
	case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
	case 'v': case 'w': case 'x': case 'y': case 'z':
	case 127: case 128: case 129: case 130: case 131: case 132: case 133:
	case 134: case 135: case 136: case 137: case 138: case 139: case 140:
	case 141: case 142: case 143: case 144: case 145: case 146: case 147:
	case 148: case 149: case 150: case 151: case 152: case 153: case 154:
	case 155: case 156: case 157: case 158: case 159: case 160: case 161:
	case 162: case 163: case 164: case 165: case 166: case 167: case 168:
	case 169: case 170: case 171: case 172: case 173: case 174: case 175:
	case 176: case 177: case 178: case 179: case 180: case 181: case 182:
	case 183: case 184: case 185: case 186: case 187: case 188: case 189:
	case 190: case 191: case 192: case 193: case 194: case 195: case 196:
	case 197: case 198: case 199: case 200: case 201: case 202: case 203:
	case 204: case 205: case 206: case 207: case 208: case 209: case 210:
	case 211: case 212: case 213: case 214: case 215: case 216: case 217:
	case 218: case 219: case 220: case 221: case 222: case 223: case 224:
	case 225: case 226: case 227: case 228: case 229: case 230: case 231:
	case 232: case 233: case 234: case 235: case 236: case 237: case 238:
	case 239: case 240: case 241: case 242: case 243: case 244: case 245:
	case 246: case 247: case 248: case 249: case 250: case 251: case 252:
	case 253: case 254: case 255:
	  /* Symbol.  Its first character may be a letter, '_' or a byte in
	     the range 0x7f..0xff; the following characters may also be
	     digits.  They are read with phase1_getc, that is, without
	     comment recognition.  */
	  bufpos = 0;
	  for (;;)
	    {
	      if (bufpos >= bufmax)
		{
		  bufmax = 2 * bufmax + 10;
		  buffer = xrealloc (buffer, bufmax);
		}
	      buffer[bufpos++] = c;
	      c = phase1_getc ();
	      switch (c)
		{
		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
		case 'Y': case 'Z':
		case '_':
		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
		case 'y': case 'z':
		case '0': case '1': case '2': case '3': case '4':
		case '5': case '6': case '7': case '8': case '9':
		case 127: case 128: case 129: case 130: case 131: case 132:
		case 133: case 134: case 135: case 136: case 137: case 138:
		case 139: case 140: case 141: case 142: case 143: case 144:
		case 145: case 146: case 147: case 148: case 149: case 150:
		case 151: case 152: case 153: case 154: case 155: case 156:
		case 157: case 158: case 159: case 160: case 161: case 162:
		case 163: case 164: case 165: case 166: case 167: case 168:
		case 169: case 170: case 171: case 172: case 173: case 174:
		case 175: case 176: case 177: case 178: case 179: case 180:
		case 181: case 182: case 183: case 184: case 185: case 186:
		case 187: case 188: case 189: case 190: case 191: case 192:
		case 193: case 194: case 195: case 196: case 197: case 198:
		case 199: case 200: case 201: case 202: case 203: case 204:
		case 205: case 206: case 207: case 208: case 209: case 210:
		case 211: case 212: case 213: case 214: case 215: case 216:
		case 217: case 218: case 219: case 220: case 221: case 222:
		case 223: case 224: case 225: case 226: case 227: case 228:
		case 229: case 230: case 231: case 232: case 233: case 234:
		case 235: case 236: case 237: case 238: case 239: case 240:
		case 241: case 242: case 243: case 244: case 245: case 246:
		case 247: case 248: case 249: case 250: case 251: case 252:
		case 253: case 254: case 255:
		  continue;

		default:
		  phase1_ungetc (c);
		  break;
		}
	      break;
	    }
	  if (bufpos >= bufmax)
	    {
	      bufmax = 2 * bufmax + 10;
	      buffer = xrealloc (buffer, bufmax);
	    }
	  buffer[bufpos] = 0;
	  tp->string = xstrdup (buffer);
	  tp->type = token_type_symbol;
	  return;

	case '\'':
	  /* Single-quoted string literal.
	     Only \\ and \' are escape sequences here; any other backslash
	     is stored as is.  An unterminated string ends at EOF.  */
	  bufpos = 0;
	  for (;;)
	    {
	      c = phase1_getc ();
	      if (c == EOF || c == '\'')
		break;
	      if (c == '\\')
		{
		  c = phase1_getc ();
		  if (c != '\\' && c != '\'')
		    {
		      phase1_ungetc (c);
		      c = '\\';
		    }
		}
	      if (bufpos >= bufmax)
		{
		  bufmax = 2 * bufmax + 10;
		  buffer = xrealloc (buffer, bufmax);
		}
	      buffer[bufpos++] = c;
	    }
	  if (bufpos >= bufmax)
	    {
	      bufmax = 2 * bufmax + 10;
	      buffer = xrealloc (buffer, bufmax);
	    }
	  buffer[bufpos] = 0;
	  tp->type = token_type_string_literal;
	  tp->string = xstrdup (buffer);
	  tp->comment = add_reference (savable_comment);
	  return;

	case '"':
	  /* Double-quoted string literal.
	     A string that interpolates a variable or an expression is
	     still consumed up to its closing quote, but the token then
	     becomes token_type_other and no string is returned.  */
	  tp->type = token_type_string_literal;
	  bufpos = 0;
	  for (;;)
	    {
	      c = phase1_getc ();
	      if (c == EOF || c == '"')
		break;
	      if (c == '$')
		{
		  c = phase1_getc ();
		  if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
		      || c == '_' || c == '{' || c >= 0x7f)
		    {
		      /* String with variables.  */
		      tp->type = token_type_other;
		      continue;
		    }
		  phase1_ungetc (c);
		  c = '$';
		}
	      if (c == '{')
		{
		  c = phase1_getc ();
		  if (c == '$')
		    {
		      /* String with expressions.  */
		      tp->type = token_type_other;
		      continue;
		    }
		  phase1_ungetc (c);
		  c = '{';
		}
	      if (c == '\\')
		{
		  int n, j;

		  c = phase1_getc ();
		  switch (c)
		    {
		    case '"':
		    case '\\':
		    case '$':
		      break;

		    case '0': case '1': case '2': case '3':
		    case '4': case '5': case '6': case '7':
		      /* Octal escape of up to three digits.  The character
			 read after the last digit is pushed back.  */
		      n = 0;
		      for (j = 0; j < 3; ++j)
			{
			  n = n * 8 + c - '0';
			  c = phase1_getc ();
			  switch (c)
			    {
			    default:
			      break;

			    case '0': case '1': case '2': case '3':
			    case '4': case '5': case '6': case '7':
			      continue;
			    }
			  break;
			}
		      phase1_ungetc (c);
		      c = n;
		      break;

		    case 'x':
		      /* Hexadecimal escape of up to two digits.  c == 0
			 marks that a non-digit ended the sequence.  */
		      n = 0;
		      for (j = 0; j < 2; ++j)
			{
			  c = phase1_getc ();
			  switch (c)
			    {
			    case '0': case '1': case '2': case '3': case '4':
			    case '5': case '6': case '7': case '8': case '9':
			      n = n * 16 + c - '0';
			      break;
			    case 'A': case 'B': case 'C': case 'D': case 'E':
			    case 'F':
			      n = n * 16 + 10 + c - 'A';
			      break;
			    case 'a': case 'b': case 'c': case 'd': case 'e':
			    case 'f':
			      n = n * 16 + 10 + c - 'a';
			      break;
			    default:
			      phase1_ungetc (c);
			      c = 0;
			      break;
			    }
			  if (c == 0)
			    break;
			}
		      if (j == 0)
			{
			  /* No hex digit followed "\x": keep the backslash
			     and let the 'x' be read again as an ordinary
			     character.  Together with the character pushed
			     back above, this can use both phase 1 pushback
			     slots.  */
			  phase1_ungetc ('x');
			  c = '\\';
			}
		      else
			c = n;
		      break;

		    case 'n':
		      c = '\n';
		      break;
		    case 't':
		      c = '\t';
		      break;
		    case 'r':
		      c = '\r';
		      break;

		    default:
		      /* Not an escape sequence: keep the backslash and read
			 the following character again normally.  */
		      phase1_ungetc (c);
		      c = '\\';
		      break;
		    }
		}
	      if (bufpos >= bufmax)
		{
		  bufmax = 2 * bufmax + 10;
		  buffer = xrealloc (buffer, bufmax);
		}
	      buffer[bufpos++] = c;
	    }
	  if (bufpos >= bufmax)
	    {
	      bufmax = 2 * bufmax + 10;
	      buffer = xrealloc (buffer, bufmax);
	    }
	  buffer[bufpos] = 0;
	  if (tp->type == token_type_string_literal)
	    {
	      tp->string = xstrdup (buffer);
	      tp->comment = add_reference (savable_comment);
	    }
	  return;

	case '?':
	case '%':
	  {
	    int c2 = phase1_getc ();
	    if (c2 == '>')
	      {
		/* ?> and %> terminate PHP mode and switch back to HTML
		   mode.  */
		skip_html ();
		tp->type = token_type_other;
	      }
	    else
	      {
		phase1_ungetc (c2);
		tp->type = (c == '%' ? token_type_operator1 : token_type_other);
	      }
	    return;
	  }

	case '(':
	  tp->type = token_type_lparen;
	  return;

	case ')':
	  tp->type = token_type_rparen;
	  return;

	case ',':
	  tp->type = token_type_comma;
	  return;

	case '[':
	  tp->type = token_type_lbracket;
	  return;

	case ']':
	  tp->type = token_type_rbracket;
	  return;

	case '.':
	  tp->type = token_type_dot;
	  return;

	case '*':
	case '/':
	  tp->type = token_type_operator1;
	  return;

	case '+':
	case '-':
	  {
	    int c2 = phase1_getc ();
	    if (c2 == c)
	      /* ++ or -- */
	      tp->type = token_type_operator1;
	    else
	      /* + or - */
	      {
		phase1_ungetc (c2);
		tp->type = token_type_operator2;
	      }
	    return;
	  }

	case '!':
	case '~':
	case '@':
	  tp->type = token_type_operator2;
	  return;

	case '<':
	  /* Either the start of a here document (<<<), the tag
	     </script> that leaves PHP mode, or an ordinary operator.  */
	  {
	    int c2 = phase1_getc ();
	    if (c2 == '<')
	      {
		int c3 = phase1_getc ();
		if (c3 == '<')
		  {
		    /* Start of here document.
		       Parse whitespace, then label, then newline.  */
		    do
		      c = phase3_getc ();
		    while (c == ' ' || c == '\t' || c == '\n' || c == '\r');

		    bufpos = 0;
		    do
		      {
			if (bufpos >= bufmax)
			  {
			    bufmax = 2 * bufmax + 10;
			    buffer = xrealloc (buffer, bufmax);
			  }
			buffer[bufpos++] = c;
			c = phase3_getc ();
		      }
		    while (c != EOF && c != '\n' && c != '\r');
		    /* buffer[0..bufpos-1] now contains the label.  */

		    /* Now skip the here document.
		       After every line break, try to match the label.  The
		       document ends where the label is followed by an
		       optional ';' and a line break, or at EOF.  */
		    for (;;)
		      {
			c = phase1_getc ();
			if (c == EOF)
			  break;
			if (c == '\n' || c == '\r')
			  {
			    int bufidx = 0;

			    while (bufidx < bufpos)
			      {
				c = phase1_getc ();
				if (c == EOF)
				  break;
				if (c != buffer[bufidx])
				  {
				    phase1_ungetc (c);
				    break;
				  }
				bufidx++;
			      }
			    if (bufidx == bufpos)
			      {
				c = phase1_getc ();
				if (c != ';')
				  phase1_ungetc (c);
				c = phase1_getc ();
				if (c == '\n' || c == '\r')
				  break;
			      }
			  }
		      }

		    /* FIXME: Ideally we should turn the here document into a
		       string literal if it didn't contain $ substitution.  And
		       we should also respect backslash escape sequences like
		       in double-quoted strings.  */
		    tp->type = token_type_other;
		    return;
		  }
		/* Only "<<": push back c3 here.  The second '<' (c2) is
		   pushed back by the final else branch below.  */
		phase1_ungetc (c3);
	      }

	    /* < / script > terminates PHP mode and switches back to HTML
	       mode.  */
	    /* NOTE(review): when the tag matches only partially, just the
	       last character read is pushed back; the characters consumed
	       before it are dropped.  */
	    while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
	      c2 = phase1_getc ();
	    if (c2 == '/')
	      {
		do
		  c2 = phase1_getc ();
		while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
		if (c2 == 's' || c2 == 'S')
		  {
		    c2 = phase1_getc ();
		    if (c2 == 'c' || c2 == 'C')
		      {
			c2 = phase1_getc ();
			if (c2 == 'r' || c2 == 'R')
			  {
			    c2 = phase1_getc ();
			    if (c2 == 'i' || c2 == 'I')
			      {
				c2 = phase1_getc ();
				if (c2 == 'p' || c2 == 'P')
				  {
				    c2 = phase1_getc ();
				    if (c2 == 't' || c2 == 'T')
				      {
					do
					  c2 = phase1_getc ();
					while (c2 == ' ' || c2 == '\t'
					       || c2 == '\n' || c2 == '\r');
					if (c2 == '>')
					  {
					    skip_html ();
					  }
					else
					  phase1_ungetc (c2);
				      }
				    else
				      phase1_ungetc (c2);
				  }
				else
				  phase1_ungetc (c2);
			      }
			    else
			      phase1_ungetc (c2);
			  }
			else
			  phase1_ungetc (c2);
		      }
		    else
		      phase1_ungetc (c2);
		  }
		else
		  phase1_ungetc (c2);
	      }
	    else
	      phase1_ungetc (c2);

	    tp->type = token_type_other;
	    return;
	  }

	case '`':
	  /* Execution operator.  */
	default:
	  /* We could carefully recognize each of the 2 and 3 character
	     operators, but it is not necessary, as we only need to recognize
	     gettext invocations.  Don't bother.  */
	  tp->type = token_type_other;
	  return;
	}
    }
}
1302
1303/* Supports 3 tokens of pushback.  */
1304static void
1305phase4_unget (token_ty *tp)
1306{
1307  if (tp->type != token_type_eof)
1308    {
1309      if (phase4_pushback_length == SIZEOF (phase4_pushback))
1310	abort ();
1311      phase4_pushback[phase4_pushback_length++] = *tp;
1312    }
1313}
1314
1315
1316/* 5. Compile-time optimization of string literal concatenation.
1317   Combine "string1" . ... . "stringN" to the concatenated string if
1318     - the token before this expression is none of
1319       '+' '-' '.' '*' '/' '%' '!' '~' '++' '--' ')' '@'
1320       (because then the first string could be part of an expression with
1321       the same or higher precedence as '.', such as an additive,
1322       multiplicative, negation, preincrement, or cast expression),
1323     - the token after this expression is none of
1324       '*' '/' '%' '++' '--'
1325       (because then the last string could be part of an expression with
1326       higher precedence as '.', such as a multiplicative or postincrement
1327       expression).  */
1328
1329static token_type_ty phase5_last;
1330
1331static void
1332x_php_lex (token_ty *tp)
1333{
1334  phase4_get (tp);
1335  if (tp->type == token_type_string_literal
1336      && !(phase5_last == token_type_dot
1337	   || phase5_last == token_type_operator1
1338	   || phase5_last == token_type_operator2
1339	   || phase5_last == token_type_rparen))
1340    {
1341      char *sum = tp->string;
1342      size_t sum_len = strlen (sum);
1343
1344      for (;;)
1345	{
1346	  token_ty token2;
1347
1348	  phase4_get (&token2);
1349	  if (token2.type == token_type_dot)
1350	    {
1351	      token_ty token3;
1352
1353	      phase4_get (&token3);
1354	      if (token3.type == token_type_string_literal)
1355		{
1356		  token_ty token_after;
1357
1358		  phase4_get (&token_after);
1359		  if (token_after.type != token_type_operator1)
1360		    {
1361		      char *addend = token3.string;
1362		      size_t addend_len = strlen (addend);
1363
1364		      sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1365		      memcpy (sum + sum_len, addend, addend_len + 1);
1366		      sum_len += addend_len;
1367
1368		      phase4_unget (&token_after);
1369		      free_token (&token3);
1370		      free_token (&token2);
1371		      continue;
1372		    }
1373		  phase4_unget (&token_after);
1374		}
1375	      phase4_unget (&token3);
1376	    }
1377	  phase4_unget (&token2);
1378	  break;
1379	}
1380      tp->string = sum;
1381    }
1382  phase5_last = tp->type;
1383}
1384
1385
1386/* ========================= Extracting strings.  ========================== */
1387
1388
1389/* Context lookup table.  */
1390static flag_context_list_table_ty *flag_context_list_table;
1391
1392
1393/* The file is broken into tokens.  Scan the token stream, looking for
1394   a keyword, followed by a left paren, followed by a string.  When we
1395   see this sequence, we have something to remember.  We assume we are
1396   looking at a valid C or C++ program, and leave the complaints about
1397   the grammar to the compiler.
1398
1399     Normal handling: Look for
1400       keyword ( ... msgid ... )
1401     Plural handling: Look for
1402       keyword ( ... msgid ... msgid_plural ... )
1403
1404   We use recursion because the arguments before msgid or between msgid
1405   and msgid_plural can contain subexpressions of the same form.  */
1406
1407
1408/* Extract messages until the next balanced closing parenthesis or bracket.
1409   Extracted messages are added to MLP.
1410   DELIM can be either token_type_rparen or token_type_rbracket, or
1411   token_type_eof to accept both.
1412   Return true upon eof, false upon closing parenthesis.  */
1413static bool
1414extract_balanced (message_list_ty *mlp,
1415		  token_type_ty delim,
1416		  flag_context_ty outer_context,
1417		  flag_context_list_iterator_ty context_iter,
1418		  struct arglist_parser *argparser)
1419{
1420  /* Current argument number.  */
1421  int arg = 1;
1422  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1423  int state;
1424  /* Parameters of the keyword just seen.  Defined only in state 1.  */
1425  const struct callshapes *next_shapes = NULL;
1426  /* Context iterator that will be used if the next token is a '('.  */
1427  flag_context_list_iterator_ty next_context_iter =
1428    passthrough_context_list_iterator;
1429  /* Current context.  */
1430  flag_context_ty inner_context =
1431    inherited_context (outer_context,
1432		       flag_context_list_iterator_advance (&context_iter));
1433
1434  /* Start state is 0.  */
1435  state = 0;
1436
1437  for (;;)
1438    {
1439      token_ty token;
1440
1441      x_php_lex (&token);
1442      switch (token.type)
1443	{
1444	case token_type_symbol:
1445	  {
1446	    void *keyword_value;
1447
1448	    if (hash_find_entry (&keywords, token.string, strlen (token.string),
1449				 &keyword_value)
1450		== 0)
1451	      {
1452		next_shapes = (const struct callshapes *) keyword_value;
1453		state = 1;
1454	      }
1455	    else
1456	      state = 0;
1457	  }
1458	  next_context_iter =
1459	    flag_context_list_iterator (
1460	      flag_context_list_table_lookup (
1461		flag_context_list_table,
1462		token.string, strlen (token.string)));
1463	  free (token.string);
1464	  continue;
1465
1466	case token_type_lparen:
1467	  if (extract_balanced (mlp, token_type_rparen,
1468				inner_context, next_context_iter,
1469				arglist_parser_alloc (mlp,
1470						      state ? next_shapes : NULL)))
1471	    {
1472	      arglist_parser_done (argparser, arg);
1473	      return true;
1474	    }
1475	  next_context_iter = null_context_list_iterator;
1476	  state = 0;
1477	  continue;
1478
1479	case token_type_rparen:
1480	  if (delim == token_type_rparen || delim == token_type_eof)
1481	    {
1482	      arglist_parser_done (argparser, arg);
1483	      return false;
1484	    }
1485	  next_context_iter = null_context_list_iterator;
1486	  state = 0;
1487	  continue;
1488
1489	case token_type_comma:
1490	  arg++;
1491	  inner_context =
1492	    inherited_context (outer_context,
1493			       flag_context_list_iterator_advance (
1494				 &context_iter));
1495	  next_context_iter = passthrough_context_list_iterator;
1496	  state = 0;
1497	  continue;
1498
1499	case token_type_lbracket:
1500	  if (extract_balanced (mlp, token_type_rbracket,
1501				null_context, null_context_list_iterator,
1502				arglist_parser_alloc (mlp, NULL)))
1503	    {
1504	      arglist_parser_done (argparser, arg);
1505	      return true;
1506	    }
1507
1508	case token_type_rbracket:
1509	  if (delim == token_type_rbracket || delim == token_type_eof)
1510	    {
1511	      arglist_parser_done (argparser, arg);
1512	      return false;
1513	    }
1514	  next_context_iter = null_context_list_iterator;
1515	  state = 0;
1516	  continue;
1517
1518	case token_type_string_literal:
1519	  {
1520	    lex_pos_ty pos;
1521	    pos.file_name = logical_file_name;
1522	    pos.line_number = token.line_number;
1523
1524	    if (extract_all)
1525	      remember_a_message (mlp, NULL, token.string, inner_context,
1526				  &pos, token.comment);
1527	    else
1528	      arglist_parser_remember (argparser, arg, token.string,
1529				       inner_context,
1530				       pos.file_name, pos.line_number,
1531				       token.comment);
1532	    drop_reference (token.comment);
1533	  }
1534	  next_context_iter = null_context_list_iterator;
1535	  state = 0;
1536	  continue;
1537
1538	case token_type_dot:
1539	case token_type_operator1:
1540	case token_type_operator2:
1541	case token_type_other:
1542	  next_context_iter = null_context_list_iterator;
1543	  state = 0;
1544	  continue;
1545
1546	case token_type_eof:
1547	  arglist_parser_done (argparser, arg);
1548	  return true;
1549
1550	default:
1551	  abort ();
1552	}
1553    }
1554}
1555
1556
1557void
1558extract_php (FILE *f,
1559	     const char *real_filename, const char *logical_filename,
1560	     flag_context_list_table_ty *flag_table,
1561	     msgdomain_list_ty *mdlp)
1562{
1563  message_list_ty *mlp = mdlp->item[0]->messages;
1564
1565  fp = f;
1566  real_file_name = real_filename;
1567  logical_file_name = xstrdup (logical_filename);
1568  line_number = 1;
1569
1570  last_comment_line = -1;
1571  last_non_comment_line = -1;
1572
1573  phase5_last = token_type_eof;
1574
1575  flag_context_list_table = flag_table;
1576
1577  init_keywords ();
1578
1579  /* Initial mode is HTML mode, not PHP mode.  */
1580  skip_html ();
1581
1582  /* Eat tokens until eof is seen.  When extract_balanced returns
1583     due to an unbalanced closing parenthesis, just restart it.  */
1584  while (!extract_balanced (mlp, token_type_eof,
1585			    null_context, null_context_list_iterator,
1586			    arglist_parser_alloc (mlp, NULL)))
1587    ;
1588
1589  /* Close scanner.  */
1590  fp = NULL;
1591  real_file_name = NULL;
1592  logical_file_name = NULL;
1593  line_number = 0;
1594}
1595