1/* xgettext YCP backend.
2   Copyright (C) 2001-2003 Free Software Foundation, Inc.
3
4   This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program; if not, write to the Free Software Foundation,
18   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
19
20#ifdef HAVE_CONFIG_H
21# include "config.h"
22#endif
23
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
30
31#include "message.h"
32#include "xgettext.h"
33#include "x-ycp.h"
34#include "error.h"
35#include "xalloc.h"
36#include "exit.h"
37#include "gettext.h"
38
39#define _(s) gettext(s)
40
41#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
42
43
44/* The YCP syntax is defined in libycp/doc/syntax.html.
45   See also libycp/src/scanner.ll.  */
46
47
/* Register the built-in YCP flag specifications with xgettext.
   Each entry is handed unchanged to xgettext_record_flag; the strings
   have the shape NAME:ARGNUM:FLAG (presumably: argument ARGNUM of
   function NAME is a ycp-format string -- see xgettext_record_flag).  */
void
init_flag_table_ycp ()
{
  static const char *const ycp_format_flags[] =
    {
      "sformat:1:ycp-format",
      "y2debug:1:ycp-format",
      "y2milestone:1:ycp-format",
      "y2warning:1:ycp-format",
      "y2error:1:ycp-format",
      "y2security:1:ycp-format",
      "y2internal:1:ycp-format"
    };
  size_t i;

  for (i = 0; i < sizeof ycp_format_flags / sizeof ycp_format_flags[0]; i++)
    xgettext_record_flag (ycp_format_flags[i]);
}
59
60
61/* ======================== Reading of characters.  ======================== */
62
63
/* The input file stream.  */
static FILE *fp;

/* Name of the file actually being read; it appears in error messages
   about the input file.  */
static const char *real_file_name;

/* File name and line number used to label the extracted messages.  */
static char *logical_file_name;
static int line_number;

/* Count of characters read so far on the current line.  Zero means the
   next character read is the first one of a line.  */
static int char_in_line;

/* Lines of the most recent comment and of the most recent non-comment
   token.  They decide whether a comment counts as standing immediately
   before a keyword.  */
static int last_comment_line, last_non_comment_line;
79
80
81/* 1. line_number handling.  */
82
83static int
84phase1_getc ()
85{
86  int c = getc (fp);
87
88  if (c == EOF)
89    {
90      if (ferror (fp))
91	error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
92	       real_file_name);
93      return EOF;
94    }
95
96  if (c == '\n')
97    {
98      line_number++;
99      char_in_line = 0;
100    }
101  else
102    char_in_line++;
103
104  return c;
105}
106
107/* Supports only one pushback character.  */
108static void
109phase1_ungetc (int c)
110{
111  if (c != EOF)
112    {
113      if (c == '\n')
114	{
115	  --line_number;
116	  char_in_line = INT_MAX;
117	}
118      else
119	--char_in_line;
120
121      ungetc (c, fp);
122    }
123}
124
125
126/* 2. Replace each comment that is not inside a character constant or
127   string literal with a space character.  We need to remember the
128   comment for later, because it may be attached to a keyword string.
129   YCP comments can be in C comment syntax, C++ comment syntax or sh
130   comment syntax.  */
131
132static unsigned char phase2_pushback[1];
133static int phase2_pushback_length;
134
135static int
136phase2_getc ()
137{
138  static char *buffer;
139  static size_t bufmax;
140  size_t buflen;
141  int lineno;
142  int c;
143  bool last_was_star;
144
145  if (phase2_pushback_length)
146    return phase2_pushback[--phase2_pushback_length];
147
148  if (char_in_line == 0)
149    {
150      /* Eat whitespace, to recognize ^[\t ]*# pattern.  */
151      do
152	c = phase1_getc ();
153      while (c == '\t' || c == ' ');
154
155      if (c == '#')
156	{
157	  /* sh comment.  */
158	  buflen = 0;
159	  lineno = line_number;
160	  for (;;)
161	    {
162	      c = phase1_getc ();
163	      if (c == '\n' || c == EOF)
164		break;
165	      /* We skip all leading white space, but not EOLs.  */
166	      if (!(buflen == 0 && (c == ' ' || c == '\t')))
167		{
168		  if (buflen >= bufmax)
169		    {
170		      bufmax = 2 * bufmax + 10;
171		      buffer = xrealloc (buffer, bufmax);
172		    }
173		  buffer[buflen++] = c;
174		}
175	    }
176	  if (buflen >= bufmax)
177	    {
178	      bufmax = 2 * bufmax + 10;
179	      buffer = xrealloc (buffer, bufmax);
180	    }
181	  buffer[buflen] = '\0';
182	  xgettext_comment_add (buffer);
183	  last_comment_line = lineno;
184	  return '\n';
185	}
186    }
187  else
188    c = phase1_getc ();
189
190  if (c == '/')
191    {
192      c = phase1_getc ();
193
194      switch (c)
195	{
196	default:
197	  phase1_ungetc (c);
198	  return '/';
199
200	case '*':
201	  /* C comment.  */
202	  buflen = 0;
203	  lineno = line_number;
204	  last_was_star = false;
205	  for (;;)
206	    {
207	      c = phase1_getc ();
208	      if (c == EOF)
209		break;
210	      /* We skip all leading white space, but not EOLs.  */
211	      if (buflen == 0 && (c == ' ' || c == '\t'))
212		continue;
213	      if (buflen >= bufmax)
214		{
215		  bufmax = 2 * bufmax + 10;
216		  buffer = xrealloc (buffer, bufmax);
217	        }
218	      buffer[buflen++] = c;
219	      switch (c)
220		{
221		case '\n':
222		  --buflen;
223		  while (buflen >= 1
224			 && (buffer[buflen - 1] == ' '
225			     || buffer[buflen - 1] == '\t'))
226		    --buflen;
227		  buffer[buflen] = '\0';
228		  xgettext_comment_add (buffer);
229		  buflen = 0;
230		  lineno = line_number;
231		  last_was_star = false;
232		  continue;
233
234		case '*':
235		  last_was_star = true;
236		  continue;
237
238		case '/':
239		  if (last_was_star)
240		    {
241		      buflen -= 2;
242		      while (buflen >= 1
243			     && (buffer[buflen - 1] == ' '
244				 || buffer[buflen - 1] == '\t'))
245			--buflen;
246		      buffer[buflen] = '\0';
247		      xgettext_comment_add (buffer);
248		      break;
249		    }
250		  /* FALLTHROUGH */
251
252		default:
253		  last_was_star = false;
254		  continue;
255		}
256	      break;
257	    }
258	  last_comment_line = lineno;
259	  return ' ';
260
261	case '/':
262	  /* C++ comment.  */
263	  buflen = 0;
264	  lineno = line_number;
265	  for (;;)
266	    {
267	      c = phase1_getc ();
268	      if (c == '\n' || c == EOF)
269		break;
270	      /* We skip all leading white space, but not EOLs.  */
271	      if (!(buflen == 0 && (c == ' ' || c == '\t')))
272		{
273		  if (buflen >= bufmax)
274		    {
275		      bufmax = 2 * bufmax + 10;
276		      buffer = xrealloc (buffer, bufmax);
277		    }
278		  buffer[buflen++] = c;
279		}
280	    }
281	  if (buflen >= bufmax)
282	    {
283	      bufmax = 2 * bufmax + 10;
284	      buffer = xrealloc (buffer, bufmax);
285	    }
286	  buffer[buflen] = '\0';
287	  xgettext_comment_add (buffer);
288	  last_comment_line = lineno;
289	  return '\n';
290	}
291    }
292  else
293    return c;
294}
295
296/* Supports only one pushback character.  */
297static void
298phase2_ungetc (int c)
299{
300  if (c != EOF)
301    {
302      if (phase2_pushback_length == SIZEOF (phase2_pushback))
303	abort ();
304      phase2_pushback[phase2_pushback_length++] = c;
305    }
306}
307
308
309/* ========================== Reading of tokens.  ========================== */
310
311
/* The kinds of token the lexer distinguishes.  */
typedef enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_i18n,              /* _( */
  token_type_string_literal,    /* "abc" */
  token_type_symbol,            /* symbol, number */
  token_type_other              /* misc. operator */
} token_type_ty;

/* One token as delivered by the lexer.  */
typedef struct token_ty
{
  token_type_ty type;
  /* Text of the token; used for token_type_string_literal and
     token_type_symbol.  */
  char *string;
  /* Line on which the token was read.  */
  int line_number;
} token_ty;
332
333
/* 7. Replace escape sequences within character strings with their
   single character equivalents.  */

/* Value returned for an unescaped double quote.  It lies outside the
   character range, so it cannot be confused with an escaped quote, which
   is returned as a plain '"'.  */
#define P7_QUOTES (1000 + '"')

/* Return the next character of a string literal with escapes resolved,
   P7_QUOTES for the closing quote, or EOF.  A backslash followed by a
   newline yields nothing: both characters are dropped.  An unrecognized
   escape yields the character after the backslash.  */
static int
phase7_getc ()
{
  for (;;)
    {
      /* Use phase 1, because phase 2 elides comments.  */
      int c = phase1_getc ();

      if (c == '"')
        return P7_QUOTES;
      if (c != '\\')
        return c;

      /* Look at the character after the backslash.  */
      c = phase1_getc ();
      if (c == '\n')
        continue;

      switch (c)
        {
        case 'b':
          return '\b';
        case 'f':
          return '\f';
        case 'n':
          return '\n';
        case 'r':
          return '\r';
        case 't':
          return '\t';

        /* FIXME: What is the octal escape syntax?
           syntax.html says: [0] [0-7]+
           scanner.ll says:  [0-7] [0-7] [0-7]
         */
#if 0
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
          {
            int n, j;

            n = 0;
            for (j = 0; j < 3; ++j)
              {
                n = n * 8 + c - '0';
                c = phase1_getc ();
                switch (c)
                  {
                  default:
                    break;

                  case '0': case '1': case '2': case '3':
                  case '4': case '5': case '6': case '7':
                    continue;
                  }
                break;
              }
            phase1_ungetc (c);
            return n;
          }
#endif

        default:
          return c;
        }
    }
}
404
405
406/* Combine characters into tokens.  Discard whitespace.  */
407
408static void
409x_ycp_lex (token_ty *tp)
410{
411  static char *buffer;
412  static int bufmax;
413  int bufpos;
414  int c;
415
416  for (;;)
417    {
418      tp->line_number = line_number;
419      c = phase2_getc ();
420
421      switch (c)
422	{
423	case EOF:
424	  tp->type = token_type_eof;
425	  return;
426
427	case '\n':
428	  if (last_non_comment_line > last_comment_line)
429	    xgettext_comment_reset ();
430	  /* FALLTHROUGH */
431	case '\r':
432	case '\t':
433	case ' ':
434	  /* Ignore whitespace and comments.  */
435	  continue;
436	}
437
438      last_non_comment_line = tp->line_number;
439
440      switch (c)
441	{
442	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
443	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
444	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
445	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
446	case 'Y': case 'Z':
447	case '_':
448	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
449	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
450	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
451	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
452	case 'y': case 'z':
453	case '0': case '1': case '2': case '3': case '4':
454	case '5': case '6': case '7': case '8': case '9':
455	  /* Symbol, or part of a number.  */
456	  bufpos = 0;
457	  for (;;)
458	    {
459	      if (bufpos >= bufmax)
460		{
461		  bufmax = 2 * bufmax + 10;
462		  buffer = xrealloc (buffer, bufmax);
463		}
464	      buffer[bufpos++] = c;
465	      c = phase2_getc ();
466	      switch (c)
467		{
468		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
469		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
470		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
471		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
472		case 'Y': case 'Z':
473		case '_':
474		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
475		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
476		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
477		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
478		case 'y': case 'z':
479		case '0': case '1': case '2': case '3': case '4':
480		case '5': case '6': case '7': case '8': case '9':
481		  continue;
482		default:
483		  if (bufpos == 1 && buffer[0] == '_' && c == '(')
484		    {
485		      tp->type = token_type_i18n;
486		      return;
487		    }
488		  phase2_ungetc (c);
489		  break;
490		}
491	      break;
492	    }
493	  if (bufpos >= bufmax)
494	    {
495	      bufmax = 2 * bufmax + 10;
496	      buffer = xrealloc (buffer, bufmax);
497	    }
498	  buffer[bufpos] = '\0';
499	  tp->string = xstrdup (buffer);
500	  tp->type = token_type_symbol;
501	  return;
502
503	case '"':
504	  bufpos = 0;
505	  for (;;)
506	    {
507	      c = phase7_getc ();
508	      if (c == EOF || c == P7_QUOTES)
509		break;
510	      if (bufpos >= bufmax)
511		{
512		  bufmax = 2 * bufmax + 10;
513		  buffer = xrealloc (buffer, bufmax);
514		}
515	      buffer[bufpos++] = c;
516	    }
517	  if (bufpos >= bufmax)
518	    {
519	      bufmax = 2 * bufmax + 10;
520	      buffer = xrealloc (buffer, bufmax);
521	    }
522	  buffer[bufpos] = '\0';
523	  tp->string = xstrdup (buffer);
524	  tp->type = token_type_string_literal;
525	  return;
526
527	case '(':
528	  tp->type = token_type_lparen;
529	  return;
530
531	case ')':
532	  tp->type = token_type_rparen;
533	  return;
534
535	case ',':
536	  tp->type = token_type_comma;
537	  return;
538
539	default:
540	  /* We could carefully recognize each of the 2 and 3 character
541	     operators, but it is not necessary, as we only need to recognize
542	     gettext invocations.  Don't bother.  */
543	  tp->type = token_type_other;
544	  return;
545	}
546    }
547}
548
549
550/* ========================= Extracting strings.  ========================== */
551
552
553/* Context lookup table.  */
554static flag_context_list_table_ty *flag_context_list_table;
555
556
557/* The file is broken into tokens.
558
559     Normal handling: Look for
560       [A] _( [B] msgid ... )
561     Plural handling: Look for
562       [A] _( [B] msgid [C] , [D] msgid_plural ... )
563     At point [A]: state == 0.
564     At point [B]: state == 1, plural_mp == NULL.
565     At point [C]: state == 2, plural_mp != NULL.
566     At point [D]: state == 1, plural_mp != NULL.
567
568   We use recursion because we have to set the context according to the given
569   flags.  */
570
571
572/* Extract messages until the next balanced closing parenthesis.
573   Extracted messages are added to MLP.
574   Return true upon eof, false upon closing parenthesis.  */
575static bool
576extract_parenthesized (message_list_ty *mlp,
577		       flag_context_ty outer_context,
578		       flag_context_list_iterator_ty context_iter,
579		       bool in_i18n)
580{
581  int state; /* 1 or 2 inside _( ... ), otherwise 0 */
582  message_ty *plural_mp = NULL;	/* defined only when in states 1 and 2 */
583  /* Context iterator that will be used if the next token is a '('.  */
584  flag_context_list_iterator_ty next_context_iter =
585    passthrough_context_list_iterator;
586  /* Current context.  */
587  flag_context_ty inner_context =
588    inherited_context (outer_context,
589		       flag_context_list_iterator_advance (&context_iter));
590
591  /* Start state is 0 or 1.  */
592  state = (in_i18n ? 1 : 0);
593
594  for (;;)
595    {
596      token_ty token;
597
598      x_ycp_lex (&token);
599      switch (token.type)
600	{
601	case token_type_i18n:
602	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
603				     true))
604	    return true;
605	  next_context_iter = null_context_list_iterator;
606	  state = 0;
607	  continue;
608
609	case token_type_string_literal:
610	  if (state == 1)
611	    {
612	      lex_pos_ty pos;
613	      pos.file_name = logical_file_name;
614	      pos.line_number = token.line_number;
615
616	      if (plural_mp == NULL)
617		{
618		  /* Seen an msgid.  */
619		  plural_mp = remember_a_message (mlp, token.string,
620						  inner_context, &pos);
621		  state = 2;
622		}
623	      else
624		{
625		  /* Seen an msgid_plural.  */
626		  remember_a_message_plural (plural_mp, token.string,
627					     inner_context, &pos);
628		  state = 0;
629		}
630	    }
631	  else
632	    {
633	      free (token.string);
634	      state = 0;
635	    }
636	  next_context_iter = null_context_list_iterator;
637	  continue;
638
639	case token_type_symbol:
640	  next_context_iter =
641	    flag_context_list_iterator (
642	      flag_context_list_table_lookup (
643		flag_context_list_table,
644		token.string, strlen (token.string)));
645	  free (token.string);
646	  state = 0;
647	  continue;
648
649	case token_type_lparen:
650	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
651				     false))
652	    return true;
653	  next_context_iter = null_context_list_iterator;
654	  state = 0;
655	  continue;
656
657	case token_type_rparen:
658	  return false;
659
660	case token_type_comma:
661	  if (state == 2)
662	    state = 1;
663	  else
664	    state = 0;
665	  inner_context =
666	    inherited_context (outer_context,
667			       flag_context_list_iterator_advance (
668				 &context_iter));
669	  next_context_iter = passthrough_context_list_iterator;
670	  continue;
671
672	case token_type_other:
673	  next_context_iter = null_context_list_iterator;
674	  state = 0;
675	  continue;
676
677	case token_type_eof:
678	  return true;
679
680	default:
681	  abort ();
682	}
683    }
684}
685
686
687void
688extract_ycp (FILE *f,
689	     const char *real_filename, const char *logical_filename,
690	     flag_context_list_table_ty *flag_table,
691	     msgdomain_list_ty *mdlp)
692{
693  message_list_ty *mlp = mdlp->item[0]->messages;
694
695  fp = f;
696  real_file_name = real_filename;
697  logical_file_name = xstrdup (logical_filename);
698  line_number = 1;
699  char_in_line = 0;
700
701  last_comment_line = -1;
702  last_non_comment_line = -1;
703
704  flag_context_list_table = flag_table;
705
706  /* Eat tokens until eof is seen.  When extract_parenthesized returns
707     due to an unbalanced closing parenthesis, just restart it.  */
708  while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
709				 false))
710    ;
711
712  fp = NULL;
713  real_file_name = NULL;
714  logical_file_name = NULL;
715  line_number = 0;
716  char_in_line = 0;
717}
718