1;;; sregex.el --- symbolic regular expressions 2 3;; Copyright (C) 1997, 1998, 2000, 2001, 2002, 2003, 2004, 4;; 2005, 2006, 2007 Free Software Foundation, Inc. 5 6;; Author: Bob Glickstein <bobg+sregex@zanshin.com> 7;; Maintainer: Bob Glickstein <bobg+sregex@zanshin.com> 8;; Keywords: extensions 9 10;; This file is part of GNU Emacs. 11 12;; GNU Emacs is free software; you can redistribute it and/or modify 13;; it under the terms of the GNU General Public License as published by 14;; the Free Software Foundation; either version 2, or (at your option) 15;; any later version. 16 17;; GNU Emacs is distributed in the hope that it will be useful, 18;; but WITHOUT ANY WARRANTY; without even the implied warranty of 19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20;; GNU General Public License for more details. 21 22;; You should have received a copy of the GNU General Public License 23;; along with GNU Emacs; see the file COPYING. If not, write to the 24;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 25;; Boston, MA 02110-1301, USA. 26 27;;; Commentary: 28 29;; This package allows you to write regular expressions using a 30;; totally new, Lisp-like syntax. 31 32;; A "symbolic regular expression" (sregex for short) is a Lisp form 33;; that, when evaluated, produces the string form of the specified 34;; regular expression. Here's a simple example: 35 36;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert" 37 38;; As you can see, an sregex is specified by placing one or more 39;; special clauses in a call to `sregexq'. The clause in this case is 40;; the `or' of two strings (not to be confused with the Lisp function 41;; `or'). The list of allowable clauses appears below. 42 43;; With sregex, it is never necessary to "escape" magic characters 44;; that are meant to be taken literally; that happens automatically. 
45;; For example: 46 47;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H" 48 49;; It is also unnecessary to "group" parts of the expression together 50;; to overcome operator precedence; that also happens automatically. 51;; For example: 52 53;; (sregexq (opt (or "Bob" "Robert"))) => "\\(?:Bob\\|Robert\\)?" 54 55;; It *is* possible to group parts of the expression in order to refer 56;; to them with numbered backreferences: 57 58;; (sregexq (group (or "Go" "Run")) 59;; ", Spot, " 60;; (backref 1)) => "\\(Go\\|Run\\), Spot, \\1" 61 62;; `sregexq' is a macro. Each time it is used, it constructs a simple 63;; Lisp expression that then invokes a moderately complex engine to 64;; interpret the sregex and render the string form. Because of this, 65;; I don't recommend sprinkling calls to `sregexq' throughout your 66;; code, the way one normally does with string regexes (which are 67;; cheap to evaluate). Instead, it's wiser to precompute the regexes 68;; you need wherever possible instead of repeatedly constructing the 69;; same ones over and over. Example: 70 71;; (let ((field-regex (sregexq (opt "resent-") 72;; (or "to" "cc" "bcc")))) 73;; ... 74;; (while ... 75;; ... 76;; (re-search-forward field-regex ...) 77;; ...)) 78 79;; The arguments to `sregexq' are automatically quoted, but the 80;; flipside of this is that it is not straightforward to include 81;; computed (i.e., non-constant) values in `sregexq' expressions. So 82;; `sregex' is a function that is like `sregexq' but which does not 83;; automatically quote its values. Literal sregex clauses must be 84;; explicitly quoted like so: 85 86;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert" 87 88;; but computed clauses can be included easily, allowing for the reuse 89;; of common clauses: 90 91;; (let ((dotstar '(0+ any)) 92;; (whitespace '(1+ (syntax ?-))) 93;; (digits '(1+ (char (?0 . 
?9))))) 94;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+" 95 96;; To use this package in a Lisp program, simply (require 'sregex). 97 98;; Here are the clauses allowed in an `sregex' or `sregexq' 99;; expression: 100 101;; - a string 102;; This stands for the literal string. If it contains 103;; metacharacters, they will be escaped in the resulting regex 104;; (using `regexp-quote'). 105 106;; - the symbol `any' 107;; This stands for ".", a regex matching any character except 108;; newline. 109 110;; - the symbol `bol' 111;; Stands for "^", matching the empty string at the beginning of a line 112 113;; - the symbol `eol' 114;; Stands for "$", matching the empty string at the end of a line 115 116;; - (group CLAUSE ...) 117;; Groups the given CLAUSEs using "\\(" and "\\)". 118 119;; - (sequence CLAUSE ...) 120 121;; Groups the given CLAUSEs; may or may not use "\\(?:" and "\\)". 122;; Clauses grouped by `sequence' do not count for purposes of 123;; numbering backreferences. Use `sequence' in situations like 124;; this: 125 126;; (sregexq (or "dog" "cat" 127;; (sequence (opt "sea ") "monkey"))) 128;; => "dog\\|cat\\|\\(?:sea \\)?monkey" 129 130;; where a single `or' alternate needs to contain multiple 131;; subclauses. 132 133;; - (backref N) 134;; Matches the same string previously matched by the Nth "group" in 135;; the same sregex. N is a positive integer. 136 137;; - (or CLAUSE ...) 138;; Matches any one of the CLAUSEs by separating them with "\\|". 139 140;; - (0+ CLAUSE ...) 141;; Concatenates the given CLAUSEs and matches zero or more 142;; occurrences by appending "*". 143 144;; - (1+ CLAUSE ...) 145;; Concatenates the given CLAUSEs and matches one or more 146;; occurrences by appending "+". 147 148;; - (opt CLAUSE ...) 149;; Concatenates the given CLAUSEs and matches zero or one occurrence 150;; by appending "?". 151 152;; - (repeat MIN MAX CLAUSE ...) 
153;; Concatenates the given CLAUSEs and constructs a regex matching at 154;; least MIN occurrences and at most MAX occurrences. MIN must be a 155;; non-negative integer. MAX must be a non-negative integer greater 156;; than or equal to MIN; or MAX can be nil to mean "infinity." 157 158;; - (char CHAR-CLAUSE ...) 159;; Creates a "character class" matching one character from the given 160;; set. See below for how to construct a CHAR-CLAUSE. 161 162;; - (not-char CHAR-CLAUSE ...) 163;; Creates a "character class" matching any one character not in the 164;; given set. See below for how to construct a CHAR-CLAUSE. 165 166;; - the symbol `bot' 167;; Stands for "\\`", matching the empty string at the beginning of 168;; text (beginning of a string or of a buffer). 169 170;; - the symbol `eot' 171;; Stands for "\\'", matching the empty string at the end of text. 172 173;; - the symbol `point' 174;; Stands for "\\=", matching the empty string at point. 175 176;; - the symbol `word-boundary' 177;; Stands for "\\b", matching the empty string at the beginning or 178;; end of a word. 179 180;; - the symbol `not-word-boundary' 181;; Stands for "\\B", matching the empty string not at the beginning 182;; or end of a word. 183 184;; - the symbol `bow' 185;; Stands for "\\<", matching the empty string at the beginning of a 186;; word. 187 188;; - the symbol `eow' 189;; Stands for "\\>", matching the empty string at the end of a word. 190 191;; - the symbol `wordchar' 192;; Stands for the regex "\\w", matching a word-constituent character 193;; (as determined by the current syntax table) 194 195;; - the symbol `not-wordchar' 196;; Stands for the regex "\\W", matching a non-word-constituent 197;; character. 198 199;; - (syntax CODE) 200;; Stands for the regex "\\sCODE", where CODE is a syntax table code 201;; (a single character). Matches any character with the requested 202;; syntax. 

;; - (not-syntax CODE)
;;   Stands for the regex "\\SCODE", where CODE is a syntax table code
;;   (a single character).  Matches any character without the
;;   requested syntax.

;; - (regex REGEX)
;;   This is a "trapdoor" for including ordinary regular expression
;;   strings in the result.  Some regular expressions are clearer when
;;   written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for
;;   instance.  However, see the note under "Bugs," below.

;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
;; has one of the following forms:

;; - a character
;;   Adds that character to the set.

;; - a string
;;   Adds all the characters in the string to the set.

;; - A pair (MIN . MAX)
;;   Where MIN and MAX are characters, adds the range of characters
;;   from MIN through MAX to the set.

;;; To do:

;; An earlier version of this package could optionally translate the
;; symbolic regex into other languages' syntaxes, e.g. Perl.  For
;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would
;; yield "ab|cd" instead of "ab\\|cd".  It might be useful to restore
;; such a facility.

;; - handle multibyte chars in sregex--char-aux
;; - add support for character classes ([:blank:], ...)
;; - add support for non-greedy operators *? and +?

;;; Bugs:

;; - REGEX in a (regex REGEX) clause is inserted verbatim, so any
;;   capturing groups written inside it are ordinary regexp groups and
;;   count when numbering the groups that (backref N) refers to.
;; - (char ...) and (not-char ...) only accept characters whose codes
;;   are below 256; see `sregex--char-aux'.

;;; Code:

;; Kept from the original file; the code below no longer uses any macro
;; from `cl'.
(eval-when-compile (require 'cl))

;; Compatibility code for when we didn't have shy-groups.
;; Every function below simply forwards to the corresponding standard
;; match-data function and ignores its trailing compatibility argument.
(defvar sregex--current-sregex nil
  "Compatibility variable; not used by the code in this file.")

(defun sregex-info ()
  "Compatibility function; always return nil."
  nil)

(defmacro sregex-save-match-data (&rest forms)
  "Evaluate FORMS inside `save-match-data'."
  (cons 'save-match-data forms))

(defun sregex-replace-match (r &optional f l str subexp x)
  "Call `replace-match' with R, F, L, STR and SUBEXP.  X is ignored."
  (replace-match r f l str subexp))

(defun sregex-match-string (c &optional i x)
  "Call `match-string' with C and I.  X is ignored."
  (match-string c i))

(defun sregex-match-string-no-properties (count &optional in-string sregex)
  "Call `match-string-no-properties' with COUNT and IN-STRING.
SREGEX is ignored."
  (match-string-no-properties count in-string))

(defun sregex-match-beginning (count &optional sregex)
  "Call `match-beginning' with COUNT.  SREGEX is ignored."
  (match-beginning count))

(defun sregex-match-end (count &optional sregex)
  "Call `match-end' with COUNT.  SREGEX is ignored."
  (match-end count))

(defun sregex-match-data (&optional sregex)
  "Call `match-data'.  SREGEX is ignored."
  (match-data))

(defun sregex-backref-num (n &optional sregex)
  "Return N unchanged.  SREGEX is ignored."
  n)


(defun sregex (&rest exps)
  "Symbolic regular expression interpreter.
This is exactly like `sregexq' (q.v.) except that it evaluates all its
arguments, so literal sregex clauses must be quoted.  For example:

  (sregex '(or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

An argument-evaluating sregex interpreter lets you reuse sregex
subexpressions:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits))  =>  \"^.*:\\\\s-+[0-9]+\""
  (sregex--sequence exps nil))

(defmacro sregexq (&rest exps)
  "Symbolic regular expression interpreter.
This macro allows you to specify a regular expression (regexp) in
symbolic form, and converts it into the string form required by Emacs's
regex functions such as `re-search-forward' and `looking-at'.  Here is
a simple example:

  (sregexq (or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

As you can see, an sregex is specified by placing one or more special
clauses in a call to `sregexq'.  The clause in this case is the `or'
of two strings (not to be confused with the Lisp function `or').  The
list of allowable clauses appears below.

With `sregex', it is never necessary to \"escape\" magic characters
that are meant to be taken literally; that happens automatically.
For example:

  (sregexq \"M*A*S*H\")  =>  \"M\\\\*A\\\\*S\\\\*H\"

It is also unnecessary to \"group\" parts of the expression together
to overcome operator precedence; that also happens automatically.
For example:

  (sregexq (opt (or \"Bob\" \"Robert\")))  =>  \"\\\\(?:Bob\\\\|Robert\\\\)?\"

It *is* possible to group parts of the expression in order to refer
to them with numbered backreferences:

  (sregexq (group (or \"Go\" \"Run\"))
           \", Spot, \"
           (backref 1))  =>  \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\"

The grouping parentheses that `sregexq' introduces on its own are
non-capturing, so they never change the numbering of your
backreferences:

  (sregexq (opt \"resent-\")
           (group (or \"to\" \"cc\" \"bcc\"))
           \": \"
           (backref 1))  =>  \"\\\\(?:resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\1\"

`sregexq' is a macro.  Each time it is used, it constructs a simple
Lisp expression that then invokes a moderately complex engine to
interpret the sregex and render the string form.  Because of this, I
don't recommend sprinkling calls to `sregexq' throughout your code,
the way one normally does with string regexes (which are cheap to
evaluate).  Instead, it's wiser to precompute the regexes you need
wherever possible instead of repeatedly constructing the same ones
over and over.  Example:

  (let ((field-regex (sregexq (opt \"resent-\")
                              (or \"to\" \"cc\" \"bcc\"))))
    ...
    (while ...
      ...
      (re-search-forward field-regex ...)
      ...))

The arguments to `sregexq' are automatically quoted, but the
flipside of this is that it is not straightforward to include
computed (i.e., non-constant) values in `sregexq' expressions.  So
`sregex' is a function that is like `sregexq' but which does not
automatically quote its values.  Literal sregex clauses must be
explicitly quoted like so:

  (sregex '(or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

but computed clauses can be included easily, allowing for the reuse
of common clauses:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits))  =>  \"^.*:\\\\s-+[0-9]+\"

Here are the clauses allowed in an `sregex' or `sregexq' expression:

- a string
  This stands for the literal string.  If it contains
  metacharacters, they will be escaped in the resulting regex
  (using `regexp-quote').

- the symbol `any'
  This stands for \".\", a regex matching any character except
  newline.

- the symbol `bol'
  Stands for \"^\", matching the empty string at the beginning of a line

- the symbol `eol'
  Stands for \"$\", matching the empty string at the end of a line

- (group CLAUSE ...)
  Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\".

- (sequence CLAUSE ...)
  Groups the given CLAUSEs; may or may not use \"\\\\(?:\" and \"\\\\)\".
  Clauses grouped by `sequence' do not count for purposes of
  numbering backreferences.  Use `sequence' in situations like
  this:

    (sregexq (or \"dog\" \"cat\"
                 (sequence (opt \"sea \") \"monkey\")))
                                 =>  \"dog\\\\|cat\\\\|\\\\(?:sea \\\\)?monkey\"

  where a single `or' alternate needs to contain multiple
  subclauses.

- (backref N)
  Matches the same string previously matched by the Nth \"group\" in
  the same sregex.  N is a positive integer.

- (or CLAUSE ...)
  Matches any one of the CLAUSEs by separating them with \"\\\\|\".

- (0+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or more
  occurrences by appending \"*\".

- (1+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches one or more
  occurrences by appending \"+\".

- (opt CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or one occurrence
  by appending \"?\".

- (repeat MIN MAX CLAUSE ...)
  Concatenates the given CLAUSEs and constructs a regex matching at
  least MIN occurrences and at most MAX occurrences.  MIN must be a
  non-negative integer.  MAX must be a non-negative integer greater
  than or equal to MIN; or MAX can be nil to mean \"infinity.\"

- (char CHAR-CLAUSE ...)
  Creates a \"character class\" matching one character from the given
  set.  See below for how to construct a CHAR-CLAUSE.

- (not-char CHAR-CLAUSE ...)
  Creates a \"character class\" matching any one character not in the
  given set.  See below for how to construct a CHAR-CLAUSE.

- the symbol `bot'
  Stands for \"\\\\`\", matching the empty string at the beginning of
  text (beginning of a string or of a buffer).

- the symbol `eot'
  Stands for \"\\\\'\", matching the empty string at the end of text.

- the symbol `point'
  Stands for \"\\\\=\\=\", matching the empty string at point.

- the symbol `word-boundary'
  Stands for \"\\\\b\", matching the empty string at the beginning or
  end of a word.

- the symbol `not-word-boundary'
  Stands for \"\\\\B\", matching the empty string not at the beginning
  or end of a word.

- the symbol `bow'
  Stands for \"\\\\\\=<\", matching the empty string at the beginning of a
  word.

- the symbol `eow'
  Stands for \"\\\\\\=>\", matching the empty string at the end of a word.

- the symbol `wordchar'
  Stands for the regex \"\\\\w\", matching a word-constituent character
  (as determined by the current syntax table)

- the symbol `not-wordchar'
  Stands for the regex \"\\\\W\", matching a non-word-constituent
  character.

- (syntax CODE)
  Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code
  (a single character).  Matches any character with the requested
  syntax.

- (not-syntax CODE)
  Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code
  (a single character).  Matches any character without the
  requested syntax.

- (regex REGEX)
  This is a \"trapdoor\" for including ordinary regular expression
  strings in the result.  Some regular expressions are clearer when
  written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for
  instance.

Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
has one of the following forms:

- a character
  Adds that character to the set.

- a string
  Adds all the characters in the string to the set.

- A pair (MIN . MAX)
  Where MIN and MAX are characters, adds the range of characters
  from MIN through MAX to the set."
  `(apply 'sregex ',exps))

(defconst sregex--symbol-alist
  '((any . ".")
    (bol . "^")
    (eol . "$")
    (wordchar . "\\w")
    (not-wordchar . "\\W")
    (bot . "\\`")
    (eot . "\\'")
    (point . "\\=")
    (word-boundary . "\\b")
    (not-word-boundary . "\\B")
    (bow . "\\<")
    (eow . "\\>"))
  "Alist mapping each symbol clause to the regexp string it stands for.")

(defun sregex--engine (exp combine)
  "Translate the single sregex clause EXP into a regexp string.
COMBINE tells how the caller will use the result, so that a
non-capturing group is added only where operator precedence needs it:
  nil     the result stands alone (top level, or directly inside
          the parentheses of a `group');
  concat  the result is concatenated with neighbouring regexps;
  or      the result is one alternative of an `or';
  suffix  a postfix operator is appended to the result.
Signal an error if EXP is not a valid clause."
  (cond
   ((stringp exp)
    (let ((quoted (regexp-quote exp)))
      ;; A postfix operator binds to the last character only, so any
      ;; literal that is not exactly one character long needs a group.
      (if (and (eq combine 'suffix) (/= (length exp) 1))
          (concat "\\(?:" quoted "\\)")
        quoted)))
   ((symbolp exp)
    (let ((entry (assq exp sregex--symbol-alist)))
      (unless entry
        (error "Invalid expression: %s" exp))
      (cdr entry)))
   ((consp exp)
    ;; A clause (NAME ARG ...) is handled by the function `sregex--NAME',
    ;; which is called with the list of ARGs and COMBINE.
    (let ((handler (and (symbolp (car exp))
                        (intern-soft (concat "sregex--"
                                             (symbol-name (car exp)))))))
      (unless (fboundp handler)
        (error "Invalid expression: %s" exp))
      (funcall handler (cdr exp) combine)))
   (t (error "Invalid expression: %s" exp))))

(defun sregex--sequence (exps combine)
  "Return the regexp matching the clauses EXPS one after another.
A single clause is translated directly in the context COMBINE.
Several clauses are concatenated, and the result is wrapped in a
non-capturing group only when COMBINE is `suffix'."
  (if (= (length exps) 1)
      (sregex--engine (car exps) combine)
    (let ((re (mapconcat (lambda (e) (sregex--engine e 'concat))
                         exps "")))
      (if (eq combine 'suffix)
          (concat "\\(?:" re "\\)")
        re))))

(defun sregex--or (exps combine)
  "Return the regexp matching any one of the clauses EXPS.
A single clause is translated directly in the context COMBINE.
Several alternatives are joined with \"\\\\|\".  The result is wrapped
in a non-capturing group unless COMBINE is nil (nothing surrounds
the result) or `or' (the result is spliced into an enclosing
alternation)."
  (if (= (length exps) 1)
      (sregex--engine (car exps) combine)
    (let ((re (mapconcat (lambda (e) (sregex--engine e 'or))
                         exps "\\|")))
      (if (memq combine '(nil or))
          re
        (concat "\\(?:" re "\\)")))))

(defun sregex--group (exps combine)
  "Return the sequence EXPS wrapped in a capturing group.
COMBINE is ignored: the group already delimits its contents."
  (concat "\\(" (sregex--sequence exps nil) "\\)"))

(defun sregex--backref (exps combine)
  "Return a backreference to the group whose number is the car of EXPS.
COMBINE is ignored."
  (concat "\\" (number-to-string (car exps))))

(defun sregex--postfix (exps combine operator)
  "Return the sequence EXPS followed by the postfix string OPERATOR.
When COMBINE is `suffix' the result itself receives another postfix
operator, so it is wrapped in a non-capturing group; otherwise the
two operators would be adjacent and, e.g., \"a??\" would be read as a
non-greedy \"a?\"."
  (let ((re (concat (sregex--sequence exps 'suffix) operator)))
    (if (eq combine 'suffix)
        (concat "\\(?:" re "\\)")
      re)))

(defun sregex--opt (exps combine)
  "Return the regexp matching zero or one occurrence of the sequence EXPS.
COMBINE is as for `sregex--postfix'."
  (sregex--postfix exps combine "?"))

(defun sregex--0+ (exps combine)
  "Return the regexp matching zero or more occurrences of the sequence EXPS.
COMBINE is as for `sregex--postfix'."
  (sregex--postfix exps combine "*"))

(defun sregex--1+ (exps combine)
  "Return the regexp matching one or more occurrences of the sequence EXPS.
COMBINE is as for `sregex--postfix'."
  (sregex--postfix exps combine "+"))

(defun sregex--char (exps combine)
  "Return a regexp matching one character from the char clauses EXPS.
COMBINE is ignored."
  (sregex--char-aux nil exps))

(defun sregex--not-char (exps combine)
  "Return a regexp matching one character not in the char clauses EXPS.
COMBINE is ignored."
  (sregex--char-aux t exps))

(defun sregex--syntax (exps combine)
  "Return \"\\\\sC\", where C is the character that is the car of EXPS.
COMBINE is ignored."
  (format "\\s%c" (car exps)))

(defun sregex--not-syntax (exps combine)
  "Return \"\\\\SC\", where C is the character that is the car of EXPS.
COMBINE is ignored."
  (format "\\S%c" (car exps)))

(defun sregex--regex (exps combine)
  "Return the raw regexp string that is the car of EXPS.
The string is wrapped in a non-capturing group whenever COMBINE is
non-nil, because its contents are not analysed."
  (if combine
      (concat "\\(?:" (car exps) "\\)")
    (car exps)))

(defun sregex--repeat (exps combine)
  "Return the regexp for a (repeat MIN MAX CLAUSE ...) clause.
EXPS is (MIN MAX CLAUSE ...).  A nil MIN is treated as 0, and a nil
MAX leaves the upper bound of the interval empty.  COMBINE is as for
`sregex--postfix'."
  (let* ((min (or (pop exps) 0))
         (max (pop exps)))
    (sregex--postfix exps combine
                     (concat "\\{" (number-to-string min) ","
                             (when max (number-to-string max))
                             "\\}"))))

(defun sregex--char-range (start end)
  "Return bracket-expression text for the characters START through END.
A run of four or more characters is written as \"START-END\"; shorter
runs are spelled out character by character."
  (let ((startc (char-to-string start))
        (endc (char-to-string end)))
    (cond
     ((> end (+ start 2)) (concat startc "-" endc))
     ((> end (+ start 1)) (concat startc (char-to-string (1+ start)) endc))
     ((> end start) (concat startc endc))
     (t startc))))

(defun sregex--char-aux (complement args)
  "Build the regexp for a `char' or `not-char' clause.
ARGS is the list of char clauses: characters, strings, and pairs
\(MIN . MAX); elements of any other type are ignored.  A non-nil
COMPLEMENT produces a complemented class.  Only character codes
below 256 fit in the membership table."
  ;; regexp-opt does the same, we should join effort.
  (let ((chars (make-bool-vector 256 nil)))
    (dolist (arg args)
      (cond ((integerp arg) (aset chars arg t))
            ((stringp arg) (mapc (lambda (c) (aset chars c t)) arg))
            ((consp arg)
             ;; Accept the pair in either order.
             (let ((i (min (car arg) (cdr arg)))
                   (end (max (car arg) (cdr arg))))
               (while (<= i end)
                 (aset chars i t)
                 (setq i (1+ i)))))))
    ;; Now CHARS is a map of the characters in the class.  "]", "^" and
    ;; "-" are special inside brackets, so they are taken out of the map
    ;; and placed by hand: "]" first, "^" anywhere but first, "-" last.
    (let ((caret (aref chars ?^))
          (dash (aref chars ?-))
          (class (if (aref chars ?\]) "]" "")))
      (aset chars ?^ nil)
      (aset chars ?- nil)
      (aset chars ?\] nil)

      ;; Collect each maximal run of consecutive members.
      (let (start end)
        (dotimes (i 256)
          (if (aref chars i)
              (progn
                (unless start (setq start i))
                (setq end i))
            (when start
              (setq class (concat class (sregex--char-range start end)))
              (setq start nil))))
        (if start
            (setq class (concat class (sregex--char-range start end)))))

      (if (> (length class) 0)
          (setq class (concat class (if caret "^") (if dash "-")))
        ;; Nothing precedes them, so "-" must come before "^".
        (setq class (concat class (if dash "-") (if caret "^"))))
      ;; A one-character, non-complemented class needs no brackets.
      (if (and (not complement) (= (length class) 1))
          (regexp-quote class)
        (concat "[" (if complement "^") class "]")))))

(provide 'sregex)

;;; arch-tag: 460c1f5a-eb6e-42ec-a451-ffac78bdf492
;;; sregex.el ends here