1/* xgettext YCP backend.
2   Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.
3
4   This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
5
6   This program is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program; if not, write to the Free Software Foundation,
18   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19
20#ifdef HAVE_CONFIG_H
21# include "config.h"
22#endif
23
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
29
30#include "message.h"
31#include "xgettext.h"
32#include "x-ycp.h"
33#include "error.h"
34#include "xalloc.h"
35#include "exit.h"
36#include "gettext.h"
37
38#define _(s) gettext(s)
39
/* Number of elements of the array A.  A must be a real array, not a pointer.
   The argument is parenthesized so that an arbitrary array-valued expression
   (for example a cast-and-dereference) is indexed as a whole.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
41
42
43/* The YCP syntax is defined in libycp/doc/syntax.html.
44   See also libycp/src/scanner.ll.
45   Both are part of the yast2-core package in SuSE Linux distributions.  */
46
47
/* Hand each of the flag specifications below to xgettext_record_flag.
   Every specification names a YCP function, the argument position 1 and
   the flag "ycp-format".  */
void
init_flag_table_ycp ()
{
  static const char *const flag_specs[] =
    {
      "sformat:1:ycp-format",
      "y2debug:1:ycp-format",
      "y2milestone:1:ycp-format",
      "y2warning:1:ycp-format",
      "y2error:1:ycp-format",
      "y2security:1:ycp-format",
      "y2internal:1:ycp-format"
    };
  size_t i;

  /* The specifications are registered in the order listed above.  */
  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
59
60
61/* ======================== Reading of characters.  ======================== */
62
63
/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number is incremented by phase1_getc for every newline read and
   decremented again by phase1_ungetc when a newline is pushed back.  */
static char *logical_file_name;
static int line_number;
/* Number of characters read so far on the current line.  phase1_getc resets
   it to 0 after a newline, so 0 means that the next character read is the
   first one of a line; phase2_getc relies on this to recognize sh comments.
   phase1_ungetc sets it to INT_MAX when a newline is pushed back.  */
static int char_in_line;

/* The input file stream.  */
static FILE *fp;

/* These are for tracking whether comments count as immediately before
   keyword.  last_comment_line is updated by phase2_getc whenever it has read
   a comment, last_non_comment_line by phase5_get whenever it has read a
   character that starts a token.  */
static int last_comment_line;
static int last_non_comment_line;
79
80
81/* 1. line_number handling.  */
82
83static int
84phase1_getc ()
85{
86  int c = getc (fp);
87
88  if (c == EOF)
89    {
90      if (ferror (fp))
91	error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
92	       real_file_name);
93      return EOF;
94    }
95
96  if (c == '\n')
97    {
98      line_number++;
99      char_in_line = 0;
100    }
101  else
102    char_in_line++;
103
104  return c;
105}
106
107/* Supports only one pushback character.  */
108static void
109phase1_ungetc (int c)
110{
111  if (c != EOF)
112    {
113      if (c == '\n')
114	{
115	  --line_number;
116	  char_in_line = INT_MAX;
117	}
118      else
119	--char_in_line;
120
121      ungetc (c, fp);
122    }
123}
124
125
126/* 2. Replace each comment that is not inside a character constant or
127   string literal with a space character.  We need to remember the
128   comment for later, because it may be attached to a keyword string.
129   YCP comments can be in C comment syntax, C++ comment syntax or sh
130   comment syntax.  */
131
132static unsigned char phase2_pushback[1];
133static int phase2_pushback_length;
134
135static int
136phase2_getc ()
137{
138  static char *buffer;
139  static size_t bufmax;
140  size_t buflen;
141  int lineno;
142  int c;
143  bool last_was_star;
144
145  if (phase2_pushback_length)
146    return phase2_pushback[--phase2_pushback_length];
147
148  if (char_in_line == 0)
149    {
150      /* Eat whitespace, to recognize ^[\t ]*# pattern.  */
151      do
152	c = phase1_getc ();
153      while (c == '\t' || c == ' ');
154
155      if (c == '#')
156	{
157	  /* sh comment.  */
158	  buflen = 0;
159	  lineno = line_number;
160	  for (;;)
161	    {
162	      c = phase1_getc ();
163	      if (c == '\n' || c == EOF)
164		break;
165	      /* We skip all leading white space, but not EOLs.  */
166	      if (!(buflen == 0 && (c == ' ' || c == '\t')))
167		{
168		  if (buflen >= bufmax)
169		    {
170		      bufmax = 2 * bufmax + 10;
171		      buffer = xrealloc (buffer, bufmax);
172		    }
173		  buffer[buflen++] = c;
174		}
175	    }
176	  if (buflen >= bufmax)
177	    {
178	      bufmax = 2 * bufmax + 10;
179	      buffer = xrealloc (buffer, bufmax);
180	    }
181	  buffer[buflen] = '\0';
182	  savable_comment_add (buffer);
183	  last_comment_line = lineno;
184	  return '\n';
185	}
186    }
187  else
188    c = phase1_getc ();
189
190  if (c == '/')
191    {
192      c = phase1_getc ();
193
194      switch (c)
195	{
196	default:
197	  phase1_ungetc (c);
198	  return '/';
199
200	case '*':
201	  /* C comment.  */
202	  buflen = 0;
203	  lineno = line_number;
204	  last_was_star = false;
205	  for (;;)
206	    {
207	      c = phase1_getc ();
208	      if (c == EOF)
209		break;
210	      /* We skip all leading white space, but not EOLs.  */
211	      if (buflen == 0 && (c == ' ' || c == '\t'))
212		continue;
213	      if (buflen >= bufmax)
214		{
215		  bufmax = 2 * bufmax + 10;
216		  buffer = xrealloc (buffer, bufmax);
217	        }
218	      buffer[buflen++] = c;
219	      switch (c)
220		{
221		case '\n':
222		  --buflen;
223		  while (buflen >= 1
224			 && (buffer[buflen - 1] == ' '
225			     || buffer[buflen - 1] == '\t'))
226		    --buflen;
227		  buffer[buflen] = '\0';
228		  savable_comment_add (buffer);
229		  buflen = 0;
230		  lineno = line_number;
231		  last_was_star = false;
232		  continue;
233
234		case '*':
235		  last_was_star = true;
236		  continue;
237
238		case '/':
239		  if (last_was_star)
240		    {
241		      buflen -= 2;
242		      while (buflen >= 1
243			     && (buffer[buflen - 1] == ' '
244				 || buffer[buflen - 1] == '\t'))
245			--buflen;
246		      buffer[buflen] = '\0';
247		      savable_comment_add (buffer);
248		      break;
249		    }
250		  /* FALLTHROUGH */
251
252		default:
253		  last_was_star = false;
254		  continue;
255		}
256	      break;
257	    }
258	  last_comment_line = lineno;
259	  return ' ';
260
261	case '/':
262	  /* C++ comment.  */
263	  buflen = 0;
264	  lineno = line_number;
265	  for (;;)
266	    {
267	      c = phase1_getc ();
268	      if (c == '\n' || c == EOF)
269		break;
270	      /* We skip all leading white space, but not EOLs.  */
271	      if (!(buflen == 0 && (c == ' ' || c == '\t')))
272		{
273		  if (buflen >= bufmax)
274		    {
275		      bufmax = 2 * bufmax + 10;
276		      buffer = xrealloc (buffer, bufmax);
277		    }
278		  buffer[buflen++] = c;
279		}
280	    }
281	  if (buflen >= bufmax)
282	    {
283	      bufmax = 2 * bufmax + 10;
284	      buffer = xrealloc (buffer, bufmax);
285	    }
286	  buffer[buflen] = '\0';
287	  savable_comment_add (buffer);
288	  last_comment_line = lineno;
289	  return '\n';
290	}
291    }
292  else
293    return c;
294}
295
296/* Supports only one pushback character.  */
297static void
298phase2_ungetc (int c)
299{
300  if (c != EOF)
301    {
302      if (phase2_pushback_length == SIZEOF (phase2_pushback))
303	abort ();
304      phase2_pushback[phase2_pushback_length++] = c;
305    }
306}
307
308
309/* ========================== Reading of tokens.  ========================== */
310
311
/* The kinds of tokens that the tokenizer distinguishes.  */
typedef enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_i18n,              /* _( */
  token_type_string_literal,    /* "abc" */
  token_type_symbol,            /* symbol, number */
  token_type_other              /* misc. operator */
} token_type_ty;

/* A token.  */
typedef struct token_ty
{
  token_type_ty type;
  /* Text of the token.  Only meaningful for token_type_string_literal and
     token_type_symbol.  */
  char *string;
  /* Value of the scanner's line counter when the token was started.  */
  int line_number;
} token_ty;
332
333
334/* 7. Replace escape sequences within character strings with their
335   single character equivalents.  */
336
337#define P7_QUOTES (1000 + '"')
338
339static int
340phase7_getc ()
341{
342  int c;
343
344  for (;;)
345    {
346      /* Use phase 1, because phase 2 elides comments.  */
347      c = phase1_getc ();
348
349      if (c == '"')
350	return P7_QUOTES;
351      if (c != '\\')
352	return c;
353      c = phase1_getc ();
354      if (c != '\n')
355	switch (c)
356	  {
357	  case 'b':
358	    return '\b';
359	  case 'f':
360	    return '\f';
361	  case 'n':
362	    return '\n';
363	  case 'r':
364	    return '\r';
365	  case 't':
366	    return '\t';
367
368	  /* FIXME: What is the octal escape syntax?
369	     syntax.html says: [0] [0-7]+
370	     scanner.ll says:  [0-7] [0-7] [0-7]
371	   */
372#if 0
373	  case '0': case '1': case '2': case '3':
374	  case '4': case '5': case '6': case '7':
375	    {
376	      int n, j;
377
378	      n = 0;
379	      for (j = 0; j < 3; ++j)
380		{
381		  n = n * 8 + c - '0';
382		  c = phase1_getc ();
383		  switch (c)
384		    {
385		    default:
386		      break;
387
388		    case '0': case '1': case '2': case '3':
389		    case '4': case '5': case '6': case '7':
390		      continue;
391		    }
392		  break;
393		}
394	      phase1_ungetc (c);
395	      return n;
396	    }
397#endif
398
399	  default:
400	    return c;
401	  }
402    }
403}
404
405
406/* Combine characters into tokens.  Discard whitespace.  */
407
408static token_ty phase5_pushback[1];
409static int phase5_pushback_length;
410
411static void
412phase5_get (token_ty *tp)
413{
414  static char *buffer;
415  static int bufmax;
416  int bufpos;
417  int c;
418
419  if (phase5_pushback_length)
420    {
421      *tp = phase5_pushback[--phase5_pushback_length];
422      return;
423    }
424  for (;;)
425    {
426      tp->line_number = line_number;
427      c = phase2_getc ();
428
429      switch (c)
430	{
431	case EOF:
432	  tp->type = token_type_eof;
433	  return;
434
435	case '\n':
436	  if (last_non_comment_line > last_comment_line)
437	    savable_comment_reset ();
438	  /* FALLTHROUGH */
439	case '\r':
440	case '\t':
441	case ' ':
442	  /* Ignore whitespace and comments.  */
443	  continue;
444	}
445
446      last_non_comment_line = tp->line_number;
447
448      switch (c)
449	{
450	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
451	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
452	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
453	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
454	case 'Y': case 'Z':
455	case '_':
456	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
457	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
458	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
459	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
460	case 'y': case 'z':
461	case '0': case '1': case '2': case '3': case '4':
462	case '5': case '6': case '7': case '8': case '9':
463	  /* Symbol, or part of a number.  */
464	  bufpos = 0;
465	  for (;;)
466	    {
467	      if (bufpos >= bufmax)
468		{
469		  bufmax = 2 * bufmax + 10;
470		  buffer = xrealloc (buffer, bufmax);
471		}
472	      buffer[bufpos++] = c;
473	      c = phase2_getc ();
474	      switch (c)
475		{
476		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
477		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
478		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
479		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
480		case 'Y': case 'Z':
481		case '_':
482		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
483		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
484		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
485		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
486		case 'y': case 'z':
487		case '0': case '1': case '2': case '3': case '4':
488		case '5': case '6': case '7': case '8': case '9':
489		  continue;
490		default:
491		  if (bufpos == 1 && buffer[0] == '_' && c == '(')
492		    {
493		      tp->type = token_type_i18n;
494		      return;
495		    }
496		  phase2_ungetc (c);
497		  break;
498		}
499	      break;
500	    }
501	  if (bufpos >= bufmax)
502	    {
503	      bufmax = 2 * bufmax + 10;
504	      buffer = xrealloc (buffer, bufmax);
505	    }
506	  buffer[bufpos] = '\0';
507	  tp->string = xstrdup (buffer);
508	  tp->type = token_type_symbol;
509	  return;
510
511	case '"':
512	  bufpos = 0;
513	  for (;;)
514	    {
515	      c = phase7_getc ();
516	      if (c == EOF || c == P7_QUOTES)
517		break;
518	      if (bufpos >= bufmax)
519		{
520		  bufmax = 2 * bufmax + 10;
521		  buffer = xrealloc (buffer, bufmax);
522		}
523	      buffer[bufpos++] = c;
524	    }
525	  if (bufpos >= bufmax)
526	    {
527	      bufmax = 2 * bufmax + 10;
528	      buffer = xrealloc (buffer, bufmax);
529	    }
530	  buffer[bufpos] = '\0';
531	  tp->string = xstrdup (buffer);
532	  tp->type = token_type_string_literal;
533	  return;
534
535	case '(':
536	  tp->type = token_type_lparen;
537	  return;
538
539	case ')':
540	  tp->type = token_type_rparen;
541	  return;
542
543	case ',':
544	  tp->type = token_type_comma;
545	  return;
546
547	default:
548	  /* We could carefully recognize each of the 2 and 3 character
549	     operators, but it is not necessary, as we only need to recognize
550	     gettext invocations.  Don't bother.  */
551	  tp->type = token_type_other;
552	  return;
553	}
554    }
555}
556
557/* Supports only one pushback token.  */
558static void
559phase5_unget (token_ty *tp)
560{
561  if (tp->type != token_type_eof)
562    {
563      if (phase5_pushback_length == SIZEOF (phase5_pushback))
564	abort ();
565      phase5_pushback[phase5_pushback_length++] = *tp;
566    }
567}
568
569
570/* Concatenate adjacent string literals to form single string literals.
571   (See libycp/src/parser.yy, rule 'string' vs. terminal 'STRING'.)  */
572
573static void
574phase8_get (token_ty *tp)
575{
576  phase5_get (tp);
577  if (tp->type != token_type_string_literal)
578    return;
579  for (;;)
580    {
581      token_ty tmp;
582      size_t len;
583
584      phase5_get (&tmp);
585      if (tmp.type != token_type_string_literal)
586	{
587	  phase5_unget (&tmp);
588	  return;
589	}
590      len = strlen (tp->string);
591      tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
592      strcpy (tp->string + len, tmp.string);
593      free (tmp.string);
594    }
595}
596
597
598/* ========================= Extracting strings.  ========================== */
599
600
/* Context lookup table.  Set by extract_ycp from its FLAG_TABLE argument and
   looked up by extract_parenthesized for every symbol token.  */
static flag_context_list_table_ty *flag_context_list_table;
603
604
605/* The file is broken into tokens.
606
607     Normal handling: Look for
608       [A] _( [B] msgid ... )
609     Plural handling: Look for
610       [A] _( [B] msgid [C] , [D] msgid_plural ... )
611     At point [A]: state == 0.
612     At point [B]: state == 1, plural_mp == NULL.
613     At point [C]: state == 2, plural_mp != NULL.
614     At point [D]: state == 1, plural_mp != NULL.
615
616   We use recursion because we have to set the context according to the given
617   flags.  */
618
619
620/* Extract messages until the next balanced closing parenthesis.
621   Extracted messages are added to MLP.
622   Return true upon eof, false upon closing parenthesis.  */
623static bool
624extract_parenthesized (message_list_ty *mlp,
625		       flag_context_ty outer_context,
626		       flag_context_list_iterator_ty context_iter,
627		       bool in_i18n)
628{
629  int state; /* 1 or 2 inside _( ... ), otherwise 0 */
630  message_ty *plural_mp = NULL;	/* defined only when in states 1 and 2 */
631  /* Context iterator that will be used if the next token is a '('.  */
632  flag_context_list_iterator_ty next_context_iter =
633    passthrough_context_list_iterator;
634  /* Current context.  */
635  flag_context_ty inner_context =
636    inherited_context (outer_context,
637		       flag_context_list_iterator_advance (&context_iter));
638
639  /* Start state is 0 or 1.  */
640  state = (in_i18n ? 1 : 0);
641
642  for (;;)
643    {
644      token_ty token;
645
646      if (in_i18n)
647	phase8_get (&token);
648      else
649	phase5_get (&token);
650
651      switch (token.type)
652	{
653	case token_type_i18n:
654	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
655				     true))
656	    return true;
657	  next_context_iter = null_context_list_iterator;
658	  state = 0;
659	  continue;
660
661	case token_type_string_literal:
662	  if (state == 1)
663	    {
664	      lex_pos_ty pos;
665	      pos.file_name = logical_file_name;
666	      pos.line_number = token.line_number;
667
668	      if (plural_mp == NULL)
669		{
670		  /* Seen an msgid.  */
671		  plural_mp = remember_a_message (mlp, NULL, token.string,
672						  inner_context, &pos,
673						  savable_comment);
674		  state = 2;
675		}
676	      else
677		{
678		  /* Seen an msgid_plural.  */
679		  remember_a_message_plural (plural_mp, token.string,
680					     inner_context, &pos,
681					     savable_comment);
682		  state = 0;
683		}
684	    }
685	  else
686	    {
687	      free (token.string);
688	      state = 0;
689	    }
690	  next_context_iter = null_context_list_iterator;
691	  continue;
692
693	case token_type_symbol:
694	  next_context_iter =
695	    flag_context_list_iterator (
696	      flag_context_list_table_lookup (
697		flag_context_list_table,
698		token.string, strlen (token.string)));
699	  free (token.string);
700	  state = 0;
701	  continue;
702
703	case token_type_lparen:
704	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
705				     false))
706	    return true;
707	  next_context_iter = null_context_list_iterator;
708	  state = 0;
709	  continue;
710
711	case token_type_rparen:
712	  return false;
713
714	case token_type_comma:
715	  if (state == 2)
716	    state = 1;
717	  else
718	    state = 0;
719	  inner_context =
720	    inherited_context (outer_context,
721			       flag_context_list_iterator_advance (
722				 &context_iter));
723	  next_context_iter = passthrough_context_list_iterator;
724	  continue;
725
726	case token_type_other:
727	  next_context_iter = null_context_list_iterator;
728	  state = 0;
729	  continue;
730
731	case token_type_eof:
732	  return true;
733
734	default:
735	  abort ();
736	}
737    }
738}
739
740
741void
742extract_ycp (FILE *f,
743	     const char *real_filename, const char *logical_filename,
744	     flag_context_list_table_ty *flag_table,
745	     msgdomain_list_ty *mdlp)
746{
747  message_list_ty *mlp = mdlp->item[0]->messages;
748
749  fp = f;
750  real_file_name = real_filename;
751  logical_file_name = xstrdup (logical_filename);
752  line_number = 1;
753  char_in_line = 0;
754
755  last_comment_line = -1;
756  last_non_comment_line = -1;
757
758  flag_context_list_table = flag_table;
759
760  /* Eat tokens until eof is seen.  When extract_parenthesized returns
761     due to an unbalanced closing parenthesis, just restart it.  */
762  while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
763				 false))
764    ;
765
766  fp = NULL;
767  real_file_name = NULL;
768  logical_file_name = NULL;
769  line_number = 0;
770  char_in_line = 0;
771}
772