1/* regexprops.c -- document the properties of the regular expressions 2 understood by gnulib. 3 4 Copyright 2005, 2007 Free Software Foundation, Inc. 5 6 This program is free software: you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation, either version 3 of the License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. 18*/ 19 20/* Written by James Youngman, <jay@gnu.org>. */ 21 22#if HAVE_CONFIG_H 23# include <config.h> 24#endif 25 26#include <stdio.h> 27#include <unistd.h> 28#include <errno.h> 29 30#include "regex.h" 31#include "regextype.h" 32 33 34/* Name this program was run with. */ 35char *program_name; 36 37static void output(const char *s, int escape) 38{ 39 fputs(s, stdout); 40} 41 42 43static void newline(void) 44{ 45 output("\n", 0); 46} 47 48static void content(const char *s) 49{ 50 output(s, 1); 51} 52 53static void literal(const char *s) 54{ 55 output(s, 0); 56} 57 58static void directive(const char *s) 59{ 60 output(s, 0); 61} 62 63static void enum_item(const char *s) 64{ 65 newline(); 66 directive("@item "); 67 literal(s); 68 newline(); 69} 70static void table_item(const char *s) 71{ 72 directive("@item"); 73 newline(); 74 content(s); 75 newline(); 76} 77 78static void code(const char *s) 79{ 80 directive("@code{"); 81 content(s); 82 directive("}"); 83} 84 85static void begin_subsection(const char *name, 86 const char *next, 87 const char *prev, 88 const char *up) 89{ 90 newline(); 91 92 directive("@node "); 93 content(name); 94 content(" regular expression syntax"); 95 newline(); 96 97 directive("@subsection "); 98 output("@samp{", 0); 99 content(name); 100 output("}", 0); 101 content(" regular expression syntax"); 102 newline(); 103} 104 105static void begintable_asis() 106{ 107 newline(); 108 directive("@table @asis"); 109 newline(); 110} 111 112static void begintable_markup(char const *markup) 113{ 114 newline(); 115 directive("@table "); 116 literal(markup); 117 newline(); 118} 119 120static void endtable() 121{ 122 newline(); 123 directive("@end table"); 124 newline(); 125} 126 127static void beginenum() 128{ 129 newline(); 130 directive("@enumerate"); 131 newline(); 132} 133 134static void endenum() 135{ 136 newline(); 137 directive("@end enumerate"); 138 newline(); 139} 140 141static void newpara() 142{ 143 content("\n\n"); 144} 145 146 147static int describe_regex_syntax(int options) 148{ 149 newpara(); 150 content("The character @samp{.} matches any single character"); 151 if ( (options & RE_DOT_NEWLINE) == 0 ) 152 { 153 content(" except newline"); 154 } 155 if (options & RE_DOT_NOT_NULL) 156 { 157 if ( (options & RE_DOT_NEWLINE) == 0 ) 158 content(" and"); 159 else 160 content(" except"); 161 162 content(" the null character"); 163 } 164 content(". "); 165 newpara(); 166 167 if (!(options & RE_LIMITED_OPS)) 168 { 169 begintable_markup("@samp"); 170 if (options & RE_BK_PLUS_QM) 171 { 172 enum_item("\\+"); 173 content("indicates that the regular expression should match one" 174 " or more occurrences of the previous atom or regexp. "); 175 enum_item("\\?"); 176 content("indicates that the regular expression should match zero" 177 " or one occurrence of the previous atom or regexp. "); 178 enum_item("+ and ? "); 179 content("match themselves. "); 180 } 181 else 182 { 183 enum_item("+"); 184 content("indicates that the regular expression should match one" 185 " or more occurrences of the previous atom or regexp. "); 186 enum_item("?"); 187 content("indicates that the regular expression should match zero" 188 " or one occurrence of the previous atom or regexp. "); 189 enum_item("\\+"); 190 literal("matches a @samp{+}"); 191 enum_item("\\?"); 192 literal("matches a @samp{?}. "); 193 } 194 endtable(); 195 } 196 197 newpara(); 198 199 content("Bracket expressions are used to match ranges of characters. "); 200 literal("Bracket expressions where the range is backward, for example @samp{[z-a]}, are "); 201 if (options & RE_NO_EMPTY_RANGES) 202 content("invalid"); 203 else 204 content("ignored"); 205 content(". "); 206 207 if (options & RE_BACKSLASH_ESCAPE_IN_LISTS) 208 literal("Within square brackets, @samp{\\} can be used to quote " 209 "the following character. "); 210 else 211 literal("Within square brackets, @samp{\\} is taken literally. "); 212 213 if (options & RE_CHAR_CLASSES) 214 content("Character classes are supported; for example " 215 "@samp{[[:digit:]]} will match a single decimal digit. "); 216 else 217 literal("Character classes are not supported, so for example " 218 "you would need to use @samp{[0-9]} " 219 "instead of @samp{[[:digit:]]}. "); 220 221 if (options & RE_HAT_LISTS_NOT_NEWLINE) 222 { 223 literal("Non-matching lists @samp{[^@dots{}]} do not ever match newline. "); 224 } 225 newpara(); 226 if (options & RE_NO_GNU_OPS) 227 { 228 content("GNU extensions are not supported and so " 229 "@samp{\\w}, @samp{\\W}, @samp{\\<}, @samp{\\>}, @samp{\\b}, @samp{\\B}, @samp{\\`}, and @samp{\\'} " 230 "match " 231 "@samp{w}, @samp{W}, @samp{<}, @samp{>}, @samp{b}, @samp{B}, @samp{`}, and @samp{'} respectively. "); 232 } 233 else 234 { 235 content("GNU extensions are supported:"); 236 beginenum(); 237 enum_item("@samp{\\w} matches a character within a word"); 238 enum_item("@samp{\\W} matches a character which is not within a word"); 239 enum_item("@samp{\\<} matches the beginning of a word"); 240 enum_item("@samp{\\>} matches the end of a word"); 241 enum_item("@samp{\\b} matches a word boundary"); 242 enum_item("@samp{\\B} matches characters which are not a word boundary"); 243 enum_item("@samp{\\`} matches the beginning of the whole input"); 244 enum_item("@samp{\\'} matches the end of the whole input"); 245 endenum(); 246 } 247 248 newpara(); 249 250 251 if (options & RE_NO_BK_PARENS) 252 { 253 literal("Grouping is performed with parentheses @samp{()}. "); 254 255 if (options & RE_UNMATCHED_RIGHT_PAREN_ORD) 256 literal("An unmatched @samp{)} matches just itself. "); 257 } 258 else 259 { 260 literal("Grouping is performed with backslashes followed by parentheses @samp{\\(}, @samp{\\)}. "); 261 } 262 263 if (options & RE_NO_BK_REFS) 264 { 265 content("A backslash followed by a digit matches that digit. "); 266 } 267 else 268 { 269 literal("A backslash followed by a digit acts as a back-reference and matches the same thing as the previous grouped expression indicated by that number. For example @samp{\\2} matches the second group expression. The order of group expressions is determined by the position of their opening parenthesis "); 270 if (options & RE_NO_BK_PARENS) 271 literal("@samp{(}"); 272 else 273 literal("@samp{\\(}"); 274 content(". "); 275 } 276 277 278 newpara(); 279 if (!(options & RE_LIMITED_OPS)) 280 { 281 if (options & RE_NO_BK_VBAR) 282 literal("The alternation operator is @samp{|}. "); 283 else 284 literal("The alternation operator is @samp{\\|}. "); 285 } 286 newpara(); 287 288 if (options & RE_CONTEXT_INDEP_ANCHORS) 289 { 290 literal("The characters @samp{^} and @samp{$} always represent the beginning and end of a string respectively, except within square brackets. Within brackets, @samp{^} can be used to invert the membership of the character class being specified. "); 291 } 292 else 293 { 294 literal("The character @samp{^} only represents the beginning of a string when it appears:"); 295 beginenum(); 296 enum_item("\nAt the beginning of a regular expression"); 297 enum_item("After an open-group, signified by "); 298 if (options & RE_NO_BK_PARENS) 299 { 300 literal("@samp{(}"); 301 } 302 else 303 { 304 literal("@samp{\\(}"); 305 } 306 newline(); 307 if (!(options & RE_LIMITED_OPS)) 308 { 309 if (options & RE_NEWLINE_ALT) 310 enum_item("After a newline"); 311 312 if (options & RE_NO_BK_VBAR ) 313 enum_item("After the alternation operator @samp{|}"); 314 else 315 enum_item("After the alternation operator @samp{\\|}"); 316 } 317 endenum(); 318 319 newpara(); 320 literal("The character @samp{$} only represents the end of a string when it appears:"); 321 beginenum(); 322 enum_item("At the end of a regular expression"); 323 enum_item("Before a close-group, signified by "); 324 if (options & RE_NO_BK_PARENS) 325 { 326 literal("@samp{)}"); 327 } 328 else 329 { 330 literal("@samp{\\)}"); 331 } 332 if (!(options & RE_LIMITED_OPS)) 333 { 334 if (options & RE_NEWLINE_ALT) 335 enum_item("Before a newline"); 336 337 if (options & RE_NO_BK_VBAR) 338 enum_item("Before the alternation operator @samp{|}"); 339 else 340 enum_item("Before the alternation operator @samp{\\|}"); 341 } 342 endenum(); 343 } 344 newpara(); 345 if (!(options & RE_LIMITED_OPS) ) 346 { 347 if ((options & RE_CONTEXT_INDEP_OPS) 348 && !(options & RE_CONTEXT_INVALID_OPS)) 349 { 350 literal("The characters @samp{*}, @samp{+} and @samp{?} are special anywhere in a regular expression. "); 351 } 352 else 353 { 354 if (options & RE_BK_PLUS_QM) 355 literal("@samp{\\*}, @samp{\\+} and @samp{\\?} "); 356 else 357 literal("@samp{*}, @samp{+} and @samp{?} "); 358 359 if (options & RE_CONTEXT_INVALID_OPS) 360 { 361 content("are special at any point in a regular expression except the following places, where they are not allowed:"); 362 } 363 else 364 { 365 content("are special at any point in a regular expression except:"); 366 } 367 368 beginenum(); 369 enum_item("At the beginning of a regular expression"); 370 enum_item("After an open-group, signified by "); 371 if (options & RE_NO_BK_PARENS) 372 { 373 literal("@samp{(}"); 374 } 375 else 376 { 377 literal("@samp{\\(}"); 378 } 379 if (!(options & RE_LIMITED_OPS)) 380 { 381 if (options & RE_NEWLINE_ALT) 382 enum_item("After a newline"); 383 384 if (options & RE_NO_BK_VBAR) 385 enum_item("After the alternation operator @samp{|}"); 386 else 387 enum_item("After the alternation operator @samp{\\|}"); 388 } 389 endenum(); 390 } 391 } 392 393 394 newpara(); 395 if (options & RE_INTERVALS) 396 { 397 if (options & RE_NO_BK_BRACES) 398 { 399 literal("Intervals are specified by @samp{@{} and @samp{@}}. "); 400 if (options & RE_INVALID_INTERVAL_ORD) 401 { 402 literal("Invalid intervals are treated as literals, for example @samp{a@{1} is treated as @samp{a\\@{1}"); 403 } 404 else 405 { 406 literal("Invalid intervals such as @samp{a@{1z} are not accepted. "); 407 } 408 } 409 else 410 { 411 literal("Intervals are specified by @samp{\\@{} and @samp{\\@}}. "); 412 if (options & RE_INVALID_INTERVAL_ORD) 413 { 414 literal("Invalid intervals are treated as literals, for example @samp{a\\@{1} is treated as @samp{a@{1}"); 415 } 416 else 417 { 418 literal("Invalid intervals such as @samp{a\\@{1z} are not accepted. "); 419 } 420 } 421 422 } 423 424 newpara(); 425 if (options & RE_NO_POSIX_BACKTRACKING) 426 { 427 content("Matching succeeds as soon as the whole pattern is matched, meaning that the result may not be the longest possible match. "); 428 } 429 else 430 { 431 content("The longest possible match is returned; this applies to the regular expression as a whole and (subject to this constraint) to subexpressions within groups. "); 432 } 433 newpara(); 434} 435 436 437 438static int menu() 439{ 440 int i, options; 441 const char *name; 442 443 output("@menu\n", 0); 444 for (i=0; 445 options = get_regex_type_flags(i), 446 name=get_regex_type_name(i); 447 ++i) 448 { 449 output("* ", 0); 450 output(name, 0); 451 content(" regular expression syntax"); 452 output("::", 0); 453 newline(); 454 } 455 output("@end menu\n", 0); 456} 457 458 459static int describe_all(const char *up) 460{ 461 const char *name, *next, *previous; 462 int options; 463 int i, parent; 464 465 menu(); 466 467 previous = ""; 468 469 for (i=0; 470 options = get_regex_type_flags(i), 471 name=get_regex_type_name(i); 472 ++i) 473 { 474 next = get_regex_type_name(i+1); 475 if (NULL == next) 476 next = ""; 477 begin_subsection(name, next, previous, up); 478 parent = get_regex_type_synonym(i); 479 if (parent >= 0) 480 { 481 content("This is a synonym for "); 482 content(get_regex_type_name(parent)); 483 content("."); 484 } 485 else 486 { 487 describe_regex_syntax(options); 488 } 489 previous = name; 490 } 491} 492 493 494 495int main (int argc, char *argv[]) 496{ 497 const char *up = ""; 498 program_name = argv[0]; 499 500 if (argc > 1) 501 up = argv[1]; 502 503 describe_all(up); 504 return 0; 505} 506