1/* regexprops.c -- document the properties of the regular expressions
2                   understood by gnulib.
3
4   Copyright 2005, 2007 Free Software Foundation, Inc.
5
6   This program is free software: you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation, either version 3 of the License, or
9   (at your option) any later version.
10
11   This program is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   GNU General Public License for more details.
15
16   You should have received a copy of the GNU General Public License
17   along with this program.  If not, see <http://www.gnu.org/licenses/>.
18*/
19
20/* Written by James Youngman, <jay@gnu.org>. */
21
22#if HAVE_CONFIG_H
23# include <config.h>
24#endif
25
26#include <stdio.h>
27#include <unistd.h>
28#include <errno.h>
29
30#include "regex.h"
31#include "regextype.h"
32
33
/* Name this program was run with; main() stores argv[0] here. */
char *program_name;
36
/* Write the string S to standard output.

   ESCAPE is supplied by the callers to distinguish running text (nonzero)
   from markup (zero), but this implementation does not act on it: S is
   always written verbatim.  */
static void
output (const char *s, int escape)
{
  (void) escape;		/* Accepted but deliberately unused.  */
  fputs (s, stdout);
}
41
42
/* Emit a single line break, passed through output() as markup.  */
static void
newline (void)
{
  static const char line_break[] = "\n";

  output (line_break, 0);
}
47
/* Emit S as running text: it is handed to output() with the escape
   flag set.  */
static void
content (const char *s)
{
  const int escape = 1;

  output (s, escape);
}
52
/* Emit S exactly as given: it is handed to output() with the escape
   flag clear.  */
static void
literal (const char *s)
{
  const int escape = 0;

  output (s, escape);
}
57
/* Emit the Texinfo directive text S; like literal(), it is handed to
   output() with the escape flag clear.  */
static void
directive (const char *s)
{
  const int escape = 0;

  output (s, escape);
}
62
/* Emit an "@item S" line, with a line break before and after it.
   S is written with literal(), so any markup it contains is kept.  */
static void
enum_item (const char *s)
{
  newline ();

  directive ("@item ");
  literal (s);

  newline ();
}
/* Emit a bare "@item" line followed by S on a line of its own.
   S is written with content().  */
static void
table_item (const char *s)
{
  directive ("@item");
  newline ();

  content (s);
  newline ();
}
77
/* Emit S wrapped in "@code{" and "}".  The wrapper is written as a
   directive and S itself with content().  */
static void
code (const char *s)
{
  directive ("@code{");
  content (s);
  directive ("}");
}
84
/* Start the subsection describing the regular expression syntax NAME.

   Two lines are emitted after a leading line break:
     @node NAME regular expression syntax
     @subsection @samp{NAME} regular expression syntax

   NEXT, PREV and UP are accepted but are not written to the output;
   only the node name appears on the "@node" line.  */
static void
begin_subsection (const char *name,
		  const char *next,
		  const char *prev,
		  const char *up)
{
  static const char title_suffix[] = " regular expression syntax";

  (void) next;
  (void) prev;
  (void) up;

  newline ();

  directive ("@node ");
  content (name);
  content (title_suffix);
  newline ();

  directive ("@subsection ");
  literal ("@samp{");
  content (name);
  literal ("}");
  content (title_suffix);
  newline ();
}
104
/* Open a "@table @asis" block, surrounded by line breaks.
   Takes no arguments; declared with (void) so that the definition
   also serves as a prototype.  */
static void
begintable_asis (void)
{
  newline ();
  directive ("@table @asis");
  newline ();
}
111
/* Open a "@table MARKUP" block, surrounded by line breaks.  MARKUP is
   the formatting command applied to the table items (the caller in this
   file passes "@samp") and is written verbatim.  */
static void
begintable_markup (char const *markup)
{
  newline ();

  directive ("@table ");
  literal (markup);

  newline ();
}
119
/* Close a table with "@end table", surrounded by line breaks.
   Takes no arguments; declared with (void) so that the definition
   also serves as a prototype.  */
static void
endtable (void)
{
  newline ();
  directive ("@end table");
  newline ();
}
126
/* Open an "@enumerate" block, surrounded by line breaks.
   Takes no arguments; declared with (void) so that the definition
   also serves as a prototype.  */
static void
beginenum (void)
{
  newline ();
  directive ("@enumerate");
  newline ();
}
133
/* Close an enumeration with "@end enumerate", surrounded by line breaks.
   Takes no arguments; declared with (void) so that the definition
   also serves as a prototype.  */
static void
endenum (void)
{
  newline ();
  directive ("@end enumerate");
  newline ();
}
140
/* Separate paragraphs by emitting two consecutive line breaks.
   Takes no arguments; declared with (void) so that the definition
   also serves as a prototype.  */
static void
newpara (void)
{
  content ("\n\n");
}
145
146
147static int describe_regex_syntax(int options)
148{
149  newpara();
150  content("The character @samp{.} matches any single character");
151  if ( (options & RE_DOT_NEWLINE)  == 0 )
152    {
153      content(" except newline");
154    }
155  if (options & RE_DOT_NOT_NULL)
156    {
157      if ( (options & RE_DOT_NEWLINE)  == 0 )
158	content(" and");
159      else
160	content(" except");
161
162      content(" the null character");
163    }
164  content(".  ");
165  newpara();
166
167  if (!(options & RE_LIMITED_OPS))
168    {
169      begintable_markup("@samp");
170      if (options & RE_BK_PLUS_QM)
171	{
172	  enum_item("\\+");
173	  content("indicates that the regular expression should match one"
174		  " or more occurrences of the previous atom or regexp.  ");
175	  enum_item("\\?");
176	  content("indicates that the regular expression should match zero"
177		  " or one occurrence of the previous atom or regexp.  ");
178	  enum_item("+ and ? ");
179	  content("match themselves.  ");
180	}
181      else
182	{
183	  enum_item("+");
184	  content("indicates that the regular expression should match one"
185		  " or more occurrences of the previous atom or regexp.  ");
186	  enum_item("?");
187	  content("indicates that the regular expression should match zero"
188		  " or one occurrence of the previous atom or regexp.  ");
189	  enum_item("\\+");
190	  literal("matches a @samp{+}");
191	  enum_item("\\?");
192	  literal("matches a @samp{?}.  ");
193	}
194      endtable();
195    }
196
197  newpara();
198
199  content("Bracket expressions are used to match ranges of characters.  ");
200  literal("Bracket expressions where the range is backward, for example @samp{[z-a]}, are ");
201  if (options & RE_NO_EMPTY_RANGES)
202    content("invalid");
203  else
204    content("ignored");
205  content(".  ");
206
207  if (options &  RE_BACKSLASH_ESCAPE_IN_LISTS)
208    literal("Within square brackets, @samp{\\} can be used to quote "
209	    "the following character.  ");
210  else
211    literal("Within square brackets, @samp{\\} is taken literally.  ");
212
213  if (options & RE_CHAR_CLASSES)
214    content("Character classes are supported; for example "
215	    "@samp{[[:digit:]]} will match a single decimal digit.  ");
216  else
217    literal("Character classes are not supported, so for example "
218	    "you would need to use @samp{[0-9]} "
219	    "instead of @samp{[[:digit:]]}.  ");
220
221  if (options & RE_HAT_LISTS_NOT_NEWLINE)
222    {
223      literal("Non-matching lists @samp{[^@dots{}]} do not ever match newline.  ");
224    }
225  newpara();
226  if (options & RE_NO_GNU_OPS)
227    {
228      content("GNU extensions are not supported and so "
229	      "@samp{\\w}, @samp{\\W}, @samp{\\<}, @samp{\\>}, @samp{\\b}, @samp{\\B}, @samp{\\`}, and @samp{\\'} "
230	      "match "
231	      "@samp{w}, @samp{W}, @samp{<}, @samp{>}, @samp{b}, @samp{B}, @samp{`}, and @samp{'} respectively.  ");
232    }
233  else
234    {
235      content("GNU extensions are supported:");
236      beginenum();
237      enum_item("@samp{\\w} matches a character within a word");
238      enum_item("@samp{\\W} matches a character which is not within a word");
239      enum_item("@samp{\\<} matches the beginning of a word");
240      enum_item("@samp{\\>} matches the end of a word");
241      enum_item("@samp{\\b} matches a word boundary");
242      enum_item("@samp{\\B} matches characters which are not a word boundary");
243      enum_item("@samp{\\`} matches the beginning of the whole input");
244      enum_item("@samp{\\'} matches the end of the whole input");
245      endenum();
246    }
247
248  newpara();
249
250
251  if (options & RE_NO_BK_PARENS)
252    {
253      literal("Grouping is performed with parentheses @samp{()}.  ");
254
255      if (options & RE_UNMATCHED_RIGHT_PAREN_ORD)
256	literal("An unmatched @samp{)} matches just itself.  ");
257    }
258  else
259    {
260      literal("Grouping is performed with backslashes followed by parentheses @samp{\\(}, @samp{\\)}.  ");
261    }
262
263  if (options & RE_NO_BK_REFS)
264    {
265      content("A backslash followed by a digit matches that digit.  ");
266    }
267  else
268    {
269      literal("A backslash followed by a digit acts as a back-reference and matches the same thing as the previous grouped expression indicated by that number.  For example @samp{\\2} matches the second group expression.  The order of group expressions is determined by the position of their opening parenthesis ");
270      if (options & RE_NO_BK_PARENS)
271	  literal("@samp{(}");
272      else
273	literal("@samp{\\(}");
274      content(".  ");
275    }
276
277
278  newpara();
279  if (!(options & RE_LIMITED_OPS))
280    {
281      if (options & RE_NO_BK_VBAR)
282	literal("The alternation operator is @samp{|}.  ");
283      else
284	literal("The alternation operator is @samp{\\|}. ");
285    }
286  newpara();
287
288  if (options & RE_CONTEXT_INDEP_ANCHORS)
289    {
290      literal("The characters @samp{^} and @samp{$} always represent the beginning and end of a string respectively, except within square brackets.  Within brackets, @samp{^} can be used to invert the membership of the character class being specified.  ");
291    }
292  else
293    {
294      literal("The character @samp{^} only represents the beginning of a string when it appears:");
295      beginenum();
296      enum_item("\nAt the beginning of a regular expression");
297      enum_item("After an open-group, signified by ");
298      if (options & RE_NO_BK_PARENS)
299	{
300	  literal("@samp{(}");
301	}
302      else
303	{
304	  literal("@samp{\\(}");
305	}
306      newline();
307      if (!(options & RE_LIMITED_OPS))
308	{
309	  if (options & RE_NEWLINE_ALT)
310	    enum_item("After a newline");
311
312	  if (options & RE_NO_BK_VBAR )
313	    enum_item("After the alternation operator @samp{|}");
314	  else
315	    enum_item("After the alternation operator @samp{\\|}");
316	}
317      endenum();
318
319      newpara();
320      literal("The character @samp{$} only represents the end of a string when it appears:");
321      beginenum();
322      enum_item("At the end of a regular expression");
323      enum_item("Before a close-group, signified by ");
324      if (options & RE_NO_BK_PARENS)
325	{
326	  literal("@samp{)}");
327	}
328      else
329	{
330	  literal("@samp{\\)}");
331	}
332      if (!(options & RE_LIMITED_OPS))
333	{
334	  if (options & RE_NEWLINE_ALT)
335	    enum_item("Before a newline");
336
337	  if (options & RE_NO_BK_VBAR)
338	    enum_item("Before the alternation operator @samp{|}");
339	  else
340	    enum_item("Before the alternation operator @samp{\\|}");
341	}
342      endenum();
343    }
344  newpara();
345  if (!(options & RE_LIMITED_OPS) )
346    {
347      if ((options & RE_CONTEXT_INDEP_OPS)
348	  && !(options & RE_CONTEXT_INVALID_OPS))
349	{
350	  literal("The characters @samp{*}, @samp{+} and @samp{?} are special anywhere in a regular expression.  ");
351	}
352      else
353	{
354	  if (options & RE_BK_PLUS_QM)
355	    literal("@samp{\\*}, @samp{\\+} and @samp{\\?} ");
356	  else
357	    literal("@samp{*}, @samp{+} and @samp{?} ");
358
359	  if (options & RE_CONTEXT_INVALID_OPS)
360	    {
361	      content("are special at any point in a regular expression except the following places, where they are not allowed:");
362	    }
363	  else
364	    {
365	      content("are special at any point in a regular expression except:");
366	    }
367
368	  beginenum();
369	  enum_item("At the beginning of a regular expression");
370	  enum_item("After an open-group, signified by ");
371	  if (options & RE_NO_BK_PARENS)
372	    {
373	      literal("@samp{(}");
374	    }
375	  else
376	    {
377	      literal("@samp{\\(}");
378	    }
379	  if (!(options & RE_LIMITED_OPS))
380	    {
381	      if (options & RE_NEWLINE_ALT)
382		enum_item("After a newline");
383
384	      if (options & RE_NO_BK_VBAR)
385		enum_item("After the alternation operator @samp{|}");
386	      else
387		enum_item("After the alternation operator @samp{\\|}");
388	    }
389	  endenum();
390	}
391    }
392
393
394  newpara();
395  if (options & RE_INTERVALS)
396    {
397      if (options & RE_NO_BK_BRACES)
398	{
399	  literal("Intervals are specified by @samp{@{} and @samp{@}}.  ");
400	  if (options & RE_INVALID_INTERVAL_ORD)
401	    {
402	      literal("Invalid intervals are treated as literals, for example @samp{a@{1} is treated as @samp{a\\@{1}");
403	    }
404	  else
405	    {
406	      literal("Invalid intervals such as @samp{a@{1z} are not accepted.  ");
407	    }
408	}
409      else
410	{
411	  literal("Intervals are specified by @samp{\\@{} and @samp{\\@}}.  ");
412	  if (options & RE_INVALID_INTERVAL_ORD)
413	    {
414	      literal("Invalid intervals are treated as literals, for example @samp{a\\@{1} is treated as @samp{a@{1}");
415	    }
416	  else
417	    {
418	      literal("Invalid intervals such as @samp{a\\@{1z} are not accepted.  ");
419	    }
420	}
421
422    }
423
424  newpara();
425  if (options & RE_NO_POSIX_BACKTRACKING)
426    {
427      content("Matching succeeds as soon as the whole pattern is matched, meaning that the result may not be the longest possible match.  ");
428    }
429  else
430    {
431      content("The longest possible match is returned; this applies to the regular expression as a whole and (subject to this constraint) to subexpressions within groups.  ");
432    }
433  newpara();
434}
435
436
437
/* Write a Texinfo "@menu" block with one entry per regular expression
   type.  Types are enumerated by calling get_regex_type_name() with
   indices 0, 1, 2, ... until it returns a null pointer.  Each entry has
   the form "* NAME regular expression syntax::", which is the node name
   that begin_subsection() emits.

   Always returns 0; the caller in this file ignores the result.  */
static int
menu (void)
{
  const char *name;
  int i;

  output ("@menu\n", 0);
  for (i = 0; (name = get_regex_type_name (i)) != NULL; ++i)
    {
      output ("* ", 0);
      output (name, 0);
      content (" regular expression syntax");
      output ("::", 0);
      newline ();
    }
  output ("@end menu\n", 0);

  return 0;
}
457
458
/* Write the complete document: the menu, then one subsection per
   regular expression type.

   Types are enumerated by index until get_regex_type_name() returns a
   null pointer.  When get_regex_type_synonym() reports a non-negative
   index for a type, the subsection only names the type it is a synonym
   for; otherwise the syntax selected by the type's flags is described
   in full.

   UP is handed on to begin_subsection() together with the names of the
   neighbouring types (an empty string where there is none).

   Always returns 0; the caller in this file ignores the result.  */
static int
describe_all (const char *up)
{
  const char *name;
  const char *next;
  const char *previous = "";
  int options;
  int parent;
  int i;

  menu ();

  for (i = 0;
       options = get_regex_type_flags (i),
         (name = get_regex_type_name (i)) != NULL;
       ++i)
    {
      next = get_regex_type_name (i + 1);
      if (next == NULL)
        next = "";		/* Last type: there is no successor.  */

      begin_subsection (name, next, previous, up);

      parent = get_regex_type_synonym (i);
      if (parent >= 0)
        {
          content ("This is a synonym for ");
          content (get_regex_type_name (parent));
          content (".");
        }
      else
        {
          describe_regex_syntax (options);
        }

      previous = name;
    }

  return 0;
}
492
493
494
495int main (int argc, char *argv[])
496{
497  const char *up = "";
498  program_name = argv[0];
499
500  if (argc > 1)
501    up = argv[1];
502
503  describe_all(up);
504  return 0;
505}
506