;;; rx.el --- sexp notation for regular expressions

;; Copyright (C) 2001, 2002, 2003, 2004, 2005,
;;   2006, 2007  Free Software Foundation, Inc.

;; Author: Gerd Moellmann <gerd@gnu.org>
;; Maintainer: FSF
;; Keywords: strings, regexps, extensions

;; This file is part of GNU Emacs.

;; GNU Emacs is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING.  If not, write to the
;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.

;;; Commentary:

;; This is another implementation of sexp-form regular expressions.
;; It was unfortunately written without being aware of the Sregex
;; package coming with Emacs, but as things stand, Rx completely
;; covers all regexp features, which Sregex doesn't, doesn't suffer
;; from the bugs mentioned in the commentary section of Sregex, and
;; uses a nicer syntax (IMHO, of course :-).

;; This significantly extended version of the original is almost
;; compatible with Sregex.  The only incompatibility I (fx) know of is
;; that the `repeat' form can't have multiple regexp args.

;; Now alternative forms are provided for a degree of compatibility
;; with Shivers' attempted definitive SRE notation
;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>.  SRE forms not
;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
;; ,<exp>, (word ...), word+, posix-string, and character class forms.
;; Some forms are inconsistent with SRE, either for historical reasons
;; or because of the implementation -- simple translation into Emacs
;; regexp strings.  These include: any, word.  Also, case-sensitivity
;; and greediness are controlled by variables external to the regexp,
;; and you need to feed the forms to the `posix-' functions to get
;; SRE's POSIX semantics.  There are probably more difficulties.

;; Rx translates a sexp notation for regular expressions into the
;; usual string notation.  The translation can be done at compile-time
;; by using the `rx' macro.  It can be done at run-time by calling
;; function `rx-to-string'.  See the documentation of `rx' for a
;; complete description of the sexp notation.
;;
;; Some examples of string regexps and their sexp counterparts:
;;
;; "^[a-z]*"
;; (rx (and line-start (0+ (in "a-z"))))
;;
;; "\n[^ \t]"
;; (rx (and "\n" (not blank))), or
;; (rx (and "\n" (not (any " \t"))))
;;
;; "\\*\\*\\* EOOH \\*\\*\\*\n"
;; (rx "*** EOOH ***\n")
;;
;; "\\<\\(catch\\|finally\\)\\>[^_]"
;; (rx (and word-start (submatch (or "catch" "finally")) word-end
;;          (not (any ?_))))
;;
;; "[ \t\n]*:\\([^:]+\\|$\\)"
;; (rx (and (zero-or-more (in " \t\n")) ":"
;;          (submatch (or line-end (one-or-more (not (any ?:)))))))
;;
;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
;; (rx (and line-start
;;          "content-transfer-encoding:"
;;          (+ (? ?\n)) blank
;;          "quoted-printable"
;;          (+ (? ?\n)) blank))
;;
;; (concat "^\\(?:" something-else "\\)")
;; (rx (and line-start (eval something-else))), statically or
;; (rx-to-string `(and line-start ,something-else)), dynamically.
;;
;; (regexp-opt '(STRING1 STRING2 ...))
;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
;; calls `regexp-opt' as needed.
;;
;; "^;;\\s-*\n\\|^\n"
;; (rx (or (and line-start ";;" (0+ space) ?\n)
;;         (and line-start ?\n)))
;;
;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
;; (rx (and "$Id: "
;;          (1+ (not (in " ")))
;;          " "
;;          (submatch (1+ (not (in " "))))
;;          " "))
;;
;; "\\\\\\\\\\[\\w+"
;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
;;
;; etc.

;;; History:
;;

;;; Code:

(defconst rx-constituents
  '((and                . (rx-and 1 nil))
    (seq                . and)          ; SRE
    (:                  . and)          ; SRE
    (sequence           . and)          ; sregex
    (or                 . (rx-or 1 nil))
    (|                  . or)           ; SRE
    (not-newline        . ".")
    (nonl               . not-newline)  ; SRE
    (anything           . ".\\|\n")
    (any                . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
    (in                 . any)
    (char               . any)          ; sregex
    (not-char           . (rx-not-char 1 nil rx-check-any)) ; sregex
    (not                . (rx-not 1 1 rx-check-not))
    ;; Partially consistent with sregex, whose `repeat' is like our
    ;; `**'.  (`repeat' with optional max arg and multiple sexp forms
    ;; is ambiguous.)
    (repeat             . (rx-repeat 2 3))
    (=                  . (rx-= 2 nil))    ; SRE
    (>=                 . (rx->= 2 nil))   ; SRE
    (**                 . (rx-** 2 nil))   ; SRE
    (submatch           . (rx-submatch 1 nil)) ; SRE
    (group              . submatch)
    (zero-or-more       . (rx-kleene 1 nil))
    (one-or-more        . (rx-kleene 1 nil))
    (zero-or-one        . (rx-kleene 1 nil))
    (\?                 . zero-or-one)  ; SRE
    (\??                . zero-or-one)
    (*                  . zero-or-more) ; SRE
    (*?                 . zero-or-more)
    (0+                 . zero-or-more)
    (+                  . one-or-more)  ; SRE
    (+?                 . one-or-more)
    (1+                 . one-or-more)
    (optional           . zero-or-one)
    (opt                . zero-or-one)  ; sregex
    (minimal-match      . (rx-greedy 1 1))
    (maximal-match      . (rx-greedy 1 1))
    (backref            . (rx-backref 1 1 rx-check-backref))
    (line-start         . "^")
    (bol                . line-start)   ; SRE
    (line-end           . "$")
    (eol                . line-end)     ; SRE
    (string-start       . "\\`")
    (bos                . string-start) ; SRE
    (bot                . string-start) ; sregex
    (string-end         . "\\'")
    (eos                . string-end)   ; SRE
    (eot                . string-end)   ; sregex
    (buffer-start       . "\\`")
    (buffer-end         . "\\'")
    (point              . "\\=")
    (word-start         . "\\<")
    (bow                . word-start)   ; SRE
    (word-end           . "\\>")
    (eow                . word-end)     ; SRE
    (word-boundary      . "\\b")
    (not-word-boundary  . "\\B")        ; sregex
    (symbol-start       . "\\_<")
    (symbol-end         . "\\_>")
    (syntax             . (rx-syntax 1 1))
    (not-syntax         . (rx-not-syntax 1 1)) ; sregex
    (category           . (rx-category 1 1 rx-check-category))
    (eval               . (rx-eval 1 1))
    (regexp             . (rx-regexp 1 1 stringp))
    (digit              . "[[:digit:]]")
    (numeric            . digit)        ; SRE
    (num                . digit)        ; SRE
    (control            . "[[:cntrl:]]") ; SRE
    (cntrl              . control)      ; SRE
    (hex-digit          . "[[:xdigit:]]") ; SRE
    (hex                . hex-digit)    ; SRE
    (xdigit             . hex-digit)    ; SRE
    (blank              . "[[:blank:]]") ; SRE
    (graphic            . "[[:graph:]]") ; SRE
    (graph              . graphic)      ; SRE
    (printing           . "[[:print:]]") ; SRE
    (print              . printing)     ; SRE
    (alphanumeric       . "[[:alnum:]]") ; SRE
    (alnum              . alphanumeric) ; SRE
    (letter             . "[[:alpha:]]")
    (alphabetic         . letter)       ; SRE
    (alpha              . letter)       ; SRE
    (ascii              . "[[:ascii:]]") ; SRE
    (nonascii           . "[[:nonascii:]]")
    (lower              . "[[:lower:]]") ; SRE
    (lower-case         . lower)        ; SRE
    (punctuation        . "[[:punct:]]") ; SRE
    (punct              . punctuation)  ; SRE
    (space              . "[[:space:]]") ; SRE
    (whitespace         . space)        ; SRE
    (white              . space)        ; SRE
    (upper              . "[[:upper:]]") ; SRE
    (upper-case         . upper)        ; SRE
    (word               . "[[:word:]]") ; inconsistent with SRE
    (wordchar           . word)         ; sregex
    (not-wordchar       . "[^[:word:]]") ; sregex (use \\W?)
    )
  "Alist of sexp form regexp constituents.
Each element of the alist has the form (SYMBOL . DEFN).
SYMBOL is a valid constituent of sexp regular expressions.
If DEFN is a string, SYMBOL is translated into DEFN.
If DEFN is a symbol, use the definition of DEFN, recursively.
Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
FUNCTION is used to produce code for SYMBOL.  MIN-ARGS and MAX-ARGS
are the minimum and maximum number of arguments the function-form
sexp constituent SYMBOL may have in sexp regular expressions.
MAX-ARGS nil means no limit.  PREDICATE, if specified, means that
all arguments must satisfy PREDICATE.")


(defconst rx-syntax
  '((whitespace         . ?-)
    (punctuation        . ?.)
    (word               . ?w)
    (symbol             . ?_)
    (open-parenthesis   . ?\()
    (close-parenthesis  . ?\))
    (expression-prefix  . ?\')
    (string-quote       . ?\")
    (paired-delimiter   . ?$)
    (escape             . ?\\)
    (character-quote    . ?/)
    (comment-start      . ?<)
    (comment-end        . ?>)
    (string-delimiter   . ?|)
    (comment-delimiter  . ?!))
  "Alist mapping Rx syntax symbols to syntax characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(syntax SYMBOL)', and CHAR is the syntax character
corresponding to SYMBOL, as it would be used with \\s or \\S in
regular expressions.")


(defconst rx-categories
  '((consonant                          . ?0)
    (base-vowel                         . ?1)
    (upper-diacritical-mark             . ?2)
    (lower-diacritical-mark             . ?3)
    (tone-mark                          . ?4)
    (symbol                             . ?5)
    (digit                              . ?6)
    (vowel-modifying-diacritical-mark   . ?7)
    (vowel-sign                         . ?8)
    (semivowel-lower                    . ?9)
    (not-at-end-of-line                 . ?<)
    (not-at-beginning-of-line           . ?>)
    (alpha-numeric-two-byte             . ?A)
    (chinese-two-byte                   . ?C)
    (chinse-two-byte                    . ?C) ; obsolete misspelling, kept for compatibility
    (greek-two-byte                     . ?G)
    (japanese-hiragana-two-byte         . ?H)
    (indian-two-byte                    . ?I)
    (japanese-katakana-two-byte         . ?K)
    (korean-hangul-two-byte             . ?N)
    (cyrillic-two-byte                  . ?Y)
    (combining-diacritic                . ?^)
    (ascii                              . ?a)
    (arabic                             . ?b)
    (chinese                            . ?c)
    (ethiopic                           . ?e)
    (greek                              . ?g)
    (korean                             . ?h)
    (indian                             . ?i)
    (japanese                           . ?j)
    (japanese-katakana                  . ?k)
    (latin                              . ?l)
    (lao                                . ?o)
    (tibetan                            . ?q)
    (japanese-roman                     . ?r)
    (thai                               . ?t)
    (vietnamese                         . ?v)
    (hebrew                             . ?w)
    (cyrillic                           . ?y)
    (can-break                          . ?|))
  "Alist mapping symbols to category characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(category SYMBOL)', and CHAR is the category character
corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
regular expression strings.")


(defvar rx-greedy-flag t
  "Non-nil means produce greedy regular expressions for `zero-or-one',
`zero-or-more', and `one-or-more'.  Dynamically bound.")


(defun rx-info (op)
  "Return parsing/code generation info for OP.
If OP is the space character ASCII 32, return info for the symbol `?'.
If OP is the character `?', return info for the symbol `??'.
See also `rx-constituents'."
  ;; The Lisp reader turns the head of `(? X)' into the space
  ;; character and the head of `(?? X)' into the character `?', so
  ;; map those characters back to the symbols used as alist keys.
  (cond ((eq op ?\s) (setq op '\?))
        ((eq op ??) (setq op '\??)))
  ;; Follow symbol aliases until reaching a string, a list, or nil.
  (while (and (not (null op)) (symbolp op))
    (setq op (cdr (assq op rx-constituents))))
  op)


(defun rx-check (form)
  "Check FORM according to its car's parsing info.
Signal an error if FORM is not a list, if it has fewer or more
arguments than its `rx-constituents' entry allows, or if an
argument fails the entry's predicate."
  (unless (listp form)
    (error "rx `%s' needs argument(s)" form))
  (let* ((rx (rx-info (car form)))
         (nargs (1- (length form)))
         (min-args (nth 1 rx))
         (max-args (nth 2 rx))
         (type-pred (nth 3 rx)))
    (when (and (not (null min-args))
               (< nargs min-args))
      (error "rx form `%s' requires at least %d args"
             (car form) min-args))
    (when (and (not (null max-args))
               (> nargs max-args))
      (error "rx form `%s' accepts at most %d args"
             (car form) max-args))
    (when (not (null type-pred))
      (dolist (sub-form (cdr form))
        (unless (funcall type-pred sub-form)
          (error "rx form `%s' requires args satisfying `%s'"
                 (car form) type-pred))))))


(defun rx-and (form)
  "Parse and produce code from FORM.
FORM is of the form `(and FORM1 ...)'."
  (rx-check form)
  (concat "\\(?:"
          (mapconcat
           (function (lambda (x) (rx-to-string x 'no-group)))
           (cdr form) nil)
          "\\)"))


(defun rx-or (form)
  "Parse and produce code from FORM, which is `(or FORM1 ...)'.
If every argument is a string, the alternatives are optimized
with `regexp-opt'."
  (rx-check form)
  (let ((all-args-strings t))
    (dolist (arg (cdr form))
      (unless (stringp arg)
        (setq all-args-strings nil)))
    (concat "\\(?:"
            (if all-args-strings
                (regexp-opt (cdr form))
              (mapconcat #'rx-to-string (cdr form) "\\|"))
            "\\)")))


(defvar rx-bracket)                    ; dynamically bound in `rx-any'

(defun rx-check-any (arg)
  "Check arg ARG for Rx `any'.
Return ARG converted to the string to splice into the bracket
expression.  Sets `rx-bracket' when ARG contained a `]'."
  (if (integerp arg)
      (setq arg (string arg)))
  (when (stringp arg)
    (if (zerop (length arg))
        (error "String arg for Rx `any' must not be empty"))
    ;; Quote ^ at start; don't bother to check whether this is first arg.
    ;; NOTE(review): inside a bracket expression a backslash is an
    ;; ordinary character in Emacs regexps, so this also makes the
    ;; resulting set match `\\' -- confirm whether that is acceptable.
    (if (eq ?^ (aref arg 0))
        (setq arg (concat "\\" arg)))
    ;; Remove ] and set flag for adding it to start of overall result.
    (when (string-match "\\]" arg)
      (setq arg (replace-regexp-in-string "\\]" "" arg)
            rx-bracket "]")))
  (when (symbolp arg)
    (let ((translation (condition-case nil
                           (rx-to-string arg 'no-group)
                         (error nil))))
      (unless translation (error "Invalid char class `%s' in Rx `any'" arg))
      (setq arg (substring translation 1 -1)))) ; strip outer brackets
  ;; sregex compatibility
  (when (and (integerp (car-safe arg))
             (integerp (cdr-safe arg)))
    (setq arg (string (car arg) ?- (cdr arg))))
  (unless (stringp arg)
    (error "rx `any' requires string, character, char pair or char class args"))
  arg)

(defun rx-any (form)
  "Parse and produce code from FORM, which is `(any ARG ...)'.
ARG is optional."
  (rx-check form)
  (let* ((rx-bracket nil)
         (args (mapcar #'rx-check-any (cdr form)))) ; side-effects `rx-bracket'
    ;; If there was a ?- in the form, move it to the front to avoid
    ;; accidental range.
    (if (member "-" args)
        (setq args (cons "-" (delete "-" args))))
    (apply #'concat "[" rx-bracket (append args '("]")))))


(defun rx-check-not (arg)
  "Check arg ARG for Rx `not'.
ARG must be a symbol translating to a character class such as
`[[:digit:]]', the symbol `word-boundary', or a list whose car
is one of `not', `any', `in', `syntax' or `category'."
  (unless (or (and (symbolp arg)
                   ;; `+' is required: class names such as `digit' or
                   ;; `blank' are longer than one character.
                   (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'"
                                 (condition-case nil
                                     (rx-to-string arg 'no-group)
                                   (error ""))))
              (eq arg 'word-boundary)
              (and (consp arg)
                   (memq (car arg) '(not any in syntax category))))
    (error "rx `not' syntax error: %s" arg))
  t)


(defun rx-not (form)
  "Parse and produce code from FORM.  FORM is `(not ...)'."
  (rx-check form)
  (let ((result (rx-to-string (cadr form) 'no-group))
        case-fold-search)
    (cond ((string-match "\\`\\[^" result)
           ;; Negating an already negated set.
           (if (= (length result) 4)
               ;; `[^C]' becomes the single character C; quote it so
               ;; that a special character such as `.' stays literal.
               (regexp-quote (substring result 2 3))
             (concat "[" (substring result 2))))
          ((eq ?\[ (aref result 0))
           (concat "[^" (substring result 1)))
          ((string-match "\\`\\\\[scb]" result)
           ;; \\s, \\c and \\b are negated by upcasing the letter.
           (concat (capitalize (substring result 0 2)) (substring result 2)))
          (t
           (concat "[^" result "]")))))


(defun rx-not-char (form)
  "Parse and produce code from FORM.  FORM is `(not-char ...)'."
  (rx-check form)
  (rx-not `(not (in ,@(cdr form)))))


(defun rx-not-syntax (form)
  "Parse and produce code from FORM.  FORM is `(not-syntax SYNTAX)'."
  (rx-check form)
  (rx-not `(not (syntax ,@(cdr form)))))


(defun rx-trans-forms (form &optional skip)
  "If FORM's length is greater than two, transform it to length two.
A form (HEAD REST ...) becomes (HEAD (and REST ...)).
If SKIP is non-nil, allow that number of items after the head, i.e.
`(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1."
  (unless skip (setq skip 0))
  (let ((tail (nthcdr (1+ skip) form)))
    (if (= (length tail) 1)
        form
      ;; Work on a copy so the caller's list is not modified.
      (let ((form (copy-sequence form)))
        (setcdr (nthcdr skip form) (list (cons 'and tail)))
        form))))


(defun rx-= (form)
  "Parse and produce code from FORM `(= N ...)'."
  (rx-check form)
  (setq form (rx-trans-forms form 1))
  (unless (and (integerp (nth 1 form))
               (> (nth 1 form) 0))
    (error "rx `=' requires positive integer first arg"))
  (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) (nth 1 form)))


(defun rx->= (form)
  "Parse and produce code from FORM `(>= N ...)'."
  (rx-check form)
  (setq form (rx-trans-forms form 1))
  (unless (and (integerp (nth 1 form))
               (> (nth 1 form) 0))
    (error "rx `>=' requires positive integer first arg"))
  (format "%s\\{%d,\\}" (rx-to-string (nth 2 form)) (nth 1 form)))


(defun rx-** (form)
  "Parse and produce code from FORM `(** N M ...)'.
The trailing forms are wrapped in `and' and the result is handed
to `repeat'."
  (rx-check form)
  (setq form (cons 'repeat (cdr (rx-trans-forms form 2))))
  (rx-to-string form))


(defun rx-repeat (form)
  "Parse and produce code from FORM.
FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'."
  (rx-check form)
  (cond ((= (length form) 3)
         (unless (and (integerp (nth 1 form))
                      (> (nth 1 form) 0))
           (error "rx `repeat' requires positive integer first arg"))
         (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) (nth 1 form)))
        ((or (not (integerp (nth 2 form)))
             (< (nth 2 form) 0)
             (not (integerp (nth 1 form)))
             (< (nth 1 form) 0)
             (< (nth 2 form) (nth 1 form)))
         (error "rx `repeat' range error"))
        (t
         (format "%s\\{%d,%d\\}" (rx-to-string (nth 3 form))
                 (nth 1 form) (nth 2 form)))))


(defun rx-submatch (form)
  "Parse and produce code from FORM, which is `(submatch ...)'."
  (concat "\\("
          (mapconcat (function (lambda (x) (rx-to-string x 'no-group)))
                     (cdr form) nil)
          "\\)"))

(defun rx-backref (form)
  "Parse and produce code from FORM, which is `(backref N)'."
  (rx-check form)
  (format "\\%d" (nth 1 form)))

(defun rx-check-backref (arg)
  "Check arg ARG for Rx `backref'."
  (or (and (integerp arg) (>= arg 1) (<= arg 9))
      (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))

(defun rx-kleene (form)
  "Parse and produce code from FORM.
FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
`zero-or-more' etc.  operators.
If OP is one of `*', `+', `?', produce a greedy regexp.
If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
is non-nil."
  (rx-check form)
  (setq form (rx-trans-forms form))
  ;; The reader yields the space character for the head of `(? X)'
  ;; and the character `?' for the head of `(?? X)', hence the
  ;; character entries in the lists below.
  (let ((suffix (cond ((memq (car form) '(* + ?\s)) "")
                      ((memq (car form) '(*? +? ??)) "?")
                      (rx-greedy-flag "")
                      (t "?")))
        (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
                  ((memq (car form) '(+ +? 1+ one-or-more)) "+")
                  (t "?")))
        (result (rx-to-string (cadr form) 'no-group)))
    (if (not (rx-atomic-p result))
        (setq result (concat "\\(?:" result "\\)")))
    (concat result op suffix)))

(defun rx-atomic-p (r)
  "Return non-nil if regexp string R is atomic.
An atomic regexp R is one such that a suffix operator
appended to R will apply to all of R.  For example, \"a\"
\"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
\"[ab]c\", and \"ab\\|ab*c\" are not atomic.

This function may return false negatives, but it will not
return false positives.  It is nevertheless useful in
situations where an efficiency shortcut can be taken iff a
regexp is atomic.  The function can be improved to detect
more cases of atomic regexps.  Presently, this function
detects the following categories of atomic regexp;

  a group or shy group:  \\(...\\)
  a character class:     [...]
  a single character:    a

On the other hand, false negatives will be returned for
regexps that are atomic but end in operators, such as
\"a+\".  I think these are rare.  Probably such cases could
be detected without much effort.  A guarantee of no false
negatives would require a theoretic specification of the set
of all atomic regexps."
  ;; NOTE(review): only the first and last characters of R are
  ;; inspected, so a string such as "[a]b[c]" also passes the test
  ;; below, contrary to the "no false positives" claim above.
  (let ((l (length r)))
    (or (equal l 1)
        (and (>= l 6)
             (equal (substring r 0 2) "\\(")
             (equal (substring r -2) "\\)"))
        (and (>= l 2)
             (equal (substring r 0 1) "[")
             (equal (substring r -1) "]")))))


(defun rx-syntax (form)
  "Parse and produce code from FORM, which is `(syntax SYMBOL)'."
  (rx-check form)
  (let* ((sym (cadr form))
         (syntax (assq sym rx-syntax)))
    (unless syntax
      ;; Try sregex compatibility.
      (let ((name (symbol-name sym)))
        (if (= 1 (length name))
            (setq syntax (rassq (aref name 0) rx-syntax))))
      (unless syntax
        (error "Unknown rx syntax `%s'" (cadr form))))
    (format "\\s%c" (cdr syntax))))


(defun rx-check-category (form)
  "Check the argument FORM of a `(category FORM)'."
  (unless (or (integerp form)
              (cdr (assq form rx-categories)))
    (error "Unknown category `%s'" form))
  t)


(defun rx-category (form)
  "Parse and produce code from FORM, which is `(category SYMBOL)'."
  (rx-check form)
  (let ((char (if (integerp (cadr form))
                  (cadr form)
                (cdr (assq (cadr form) rx-categories)))))
    (format "\\c%c" char)))


(defun rx-eval (form)
  "Parse and produce code from FORM, which is `(eval FORM)'."
  (rx-check form)
  (rx-to-string (eval (cadr form))))


(defun rx-greedy (form)
  "Parse and produce code from FORM.
If FORM is '(minimal-match FORM1)', non-greedy versions of `*',
`+', and `?' operators will be used in FORM1.  If FORM is
'(maximal-match FORM1)', greedy operators will be used."
  (rx-check form)
  (let ((rx-greedy-flag (eq (car form) 'maximal-match)))
    (rx-to-string (cadr form))))


(defun rx-regexp (form)
  "Parse and produce code from FORM, which is `(regexp STRING)'."
  (rx-check form)
  (concat "\\(?:" (cadr form) "\\)"))


;;;###autoload
(defun rx-to-string (form &optional no-group)
  "Parse and produce code for regular expression FORM.
FORM is a regular expression in sexp form.
NO-GROUP non-nil means don't put shy groups around the result."
  (cond ((stringp form)
         (regexp-quote form))
        ((integerp form)
         (regexp-quote (char-to-string form)))
        ((symbolp form)
         (let ((info (rx-info form)))
           (cond ((stringp info)
                  info)
                 ((null info)
                  (error "Unknown rx form `%s'" form))
                 (t
                  (funcall (nth 0 info) form)))))
        ((consp form)
         (let ((info (rx-info (car form))))
           (unless (consp info)
             (error "Unknown rx form `%s'" (car form)))
           (let ((result (funcall (nth 0 info) form)))
             ;; A result that already starts with a group needs no
             ;; further shy group around it.
             (if (or no-group (string-match "\\`\\\\[(]" result))
                 result
               (concat "\\(?:" result "\\)")))))
        (t
         (error "rx syntax error at `%s'" form))))


;;;###autoload
(defmacro rx (&rest regexps)
  "Translate regular expressions REGEXPS in sexp form to a regexp string.
REGEXPS is a non-empty sequence of forms of the sort listed below.
See also `rx-to-string' for how to do such a translation at run-time.

The following are valid subforms of regular expressions in sexp
notation.

STRING
     matches string STRING literally.

CHAR
     matches character CHAR literally.

`not-newline', `nonl'
     matches any character except a newline.

`anything'
     matches any character

`(any SET ...)'
`(in SET ...)'
`(char SET ...)'
     matches any character in SET ....  SET may be a character or string.
     Ranges of characters can be specified as `A-Z' in strings.
     Ranges may also be specified as conses like `(?A . ?Z)'.

     SET may also be the name of a character class: `digit',
     `control', `hex-digit', `blank', `graph', `print', `alnum',
     `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
     `word', or one of their synonyms.

`(not (any SET ...))'
     matches any character not in SET ...

`line-start', `bol'
     matches the empty string, but only at the beginning of a line
     in the text being matched

`line-end', `eol'
     is similar to `line-start' but matches only at the end of a line

`string-start', `bos', `bot'
     matches the empty string, but only at the beginning of the
     string being matched against.

`string-end', `eos', `eot'
     matches the empty string, but only at the end of the
     string being matched against.

`buffer-start'
     matches the empty string, but only at the beginning of the
     buffer being matched against.  Actually equivalent to `string-start'.

`buffer-end'
     matches the empty string, but only at the end of the
     buffer being matched against.  Actually equivalent to `string-end'.

`point'
     matches the empty string, but only at point.

`word-start', `bow'
     matches the empty string, but only at the beginning of a word.

`word-end', `eow'
     matches the empty string, but only at the end of a word.

`word-boundary'
     matches the empty string, but only at the beginning or end of a
     word.

`(not word-boundary)'
`not-word-boundary'
     matches the empty string, but not at the beginning or end of a
     word.

`symbol-start'
     matches the empty string, but only at the beginning of a symbol.

`symbol-end'
     matches the empty string, but only at the end of a symbol.

`digit', `numeric', `num'
     matches 0 through 9.

`control', `cntrl'
     matches ASCII control characters.

`hex-digit', `hex', `xdigit'
     matches 0 through 9, a through f and A through F.

`blank'
     matches space and tab only.

`graphic', `graph'
     matches graphic characters--everything except ASCII control chars,
     space, and DEL.

`printing', `print'
     matches printing characters--everything except ASCII control chars
     and DEL.

`alphanumeric', `alnum'
     matches letters and digits.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`letter', `alphabetic', `alpha'
     matches letters.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`ascii'
     matches ASCII (unibyte) characters.

`nonascii'
     matches non-ASCII (multibyte) characters.

`lower', `lower-case'
     matches anything lower-case.

`upper', `upper-case'
     matches anything upper-case.

`punctuation', `punct'
     matches punctuation.  (But at present, for multibyte characters,
     it matches anything that has non-word syntax.)

`space', `whitespace', `white'
     matches anything that has whitespace syntax.

`word', `wordchar'
     matches anything that has word syntax.

`not-wordchar'
     matches anything that has non-word syntax.

`(syntax SYNTAX)'
     matches a character with syntax SYNTAX.  SYNTAX must be one
     of the following symbols, or a symbol corresponding to the syntax
     character, e.g. `\\.' for `\\s.'.

     `whitespace'               (\\s- in string notation)
     `punctuation'              (\\s.)
     `word'                     (\\sw)
     `symbol'                   (\\s_)
     `open-parenthesis'         (\\s()
     `close-parenthesis'        (\\s))
     `expression-prefix'        (\\s')
     `string-quote'             (\\s\")
     `paired-delimiter'         (\\s$)
     `escape'                   (\\s\\)
     `character-quote'          (\\s/)
     `comment-start'            (\\s<)
     `comment-end'              (\\s>)
     `string-delimiter'         (\\s|)
     `comment-delimiter'        (\\s!)

`(not (syntax SYNTAX))'
     matches a character that doesn't have syntax SYNTAX.

`(category CATEGORY)'
     matches a character with category CATEGORY.  CATEGORY must be
     either a character to use for C, or one of the following symbols.

     `consonant'                        (\\c0 in string notation)
     `base-vowel'                       (\\c1)
     `upper-diacritical-mark'           (\\c2)
     `lower-diacritical-mark'           (\\c3)
     `tone-mark'                        (\\c4)
     `symbol'                           (\\c5)
     `digit'                            (\\c6)
     `vowel-modifying-diacritical-mark' (\\c7)
     `vowel-sign'                       (\\c8)
     `semivowel-lower'                  (\\c9)
     `not-at-end-of-line'               (\\c<)
     `not-at-beginning-of-line'         (\\c>)
     `alpha-numeric-two-byte'           (\\cA)
     `chinese-two-byte'                 (\\cC)
     `greek-two-byte'                   (\\cG)
     `japanese-hiragana-two-byte'       (\\cH)
     `indian-two-byte'                  (\\cI)
     `japanese-katakana-two-byte'       (\\cK)
     `korean-hangul-two-byte'           (\\cN)
     `cyrillic-two-byte'                (\\cY)
     `combining-diacritic'              (\\c^)
     `ascii'                            (\\ca)
     `arabic'                           (\\cb)
     `chinese'                          (\\cc)
     `ethiopic'                         (\\ce)
     `greek'                            (\\cg)
     `korean'                           (\\ch)
     `indian'                           (\\ci)
     `japanese'                         (\\cj)
     `japanese-katakana'                (\\ck)
     `latin'                            (\\cl)
     `lao'                              (\\co)
     `tibetan'                          (\\cq)
     `japanese-roman'                   (\\cr)
     `thai'                             (\\ct)
     `vietnamese'                       (\\cv)
     `hebrew'                           (\\cw)
     `cyrillic'                         (\\cy)
     `can-break'                        (\\c|)

`(not (category CATEGORY))'
     matches a character that doesn't have category CATEGORY.

`(and SEXP1 SEXP2 ...)'
`(: SEXP1 SEXP2 ...)'
`(seq SEXP1 SEXP2 ...)'
`(sequence SEXP1 SEXP2 ...)'
     matches what SEXP1 matches, followed by what SEXP2 matches, etc.

`(submatch SEXP1 SEXP2 ...)'
`(group SEXP1 SEXP2 ...)'
     like `and', but makes the match accessible with `match-end',
     `match-beginning', and `match-string'.

`(or SEXP1 SEXP2 ...)'
`(| SEXP1 SEXP2 ...)'
     matches anything that matches SEXP1 or SEXP2, etc.  If all
     args are strings, use `regexp-opt' to optimize the resulting
     regular expression.

`(minimal-match SEXP)'
     produce a non-greedy regexp for SEXP.  Normally, regexps matching
     zero or more occurrences of something are \"greedy\" in that they
     match as much as they can, as long as the overall regexp can
     still match.  A non-greedy regexp matches as little as possible.

`(maximal-match SEXP)'
     produce a greedy regexp for SEXP.  This is the default.

Below, `SEXP ...' represents a sequence of regexp forms, treated as if
enclosed in `(and ...)'.

`(zero-or-more SEXP ...)'
`(0+ SEXP ...)'
     matches zero or more occurrences of what SEXP ... matches.

`(* SEXP ...)'
     like `zero-or-more', but always produces a greedy regexp, independent
     of `rx-greedy-flag'.

`(*? SEXP ...)'
     like `zero-or-more', but always produces a non-greedy regexp,
     independent of `rx-greedy-flag'.

`(one-or-more SEXP ...)'
`(1+ SEXP ...)'
     matches one or more occurrences of SEXP ...

`(+ SEXP ...)'
     like `one-or-more', but always produces a greedy regexp.

`(+? SEXP ...)'
     like `one-or-more', but always produces a non-greedy regexp.

`(zero-or-one SEXP ...)'
`(optional SEXP ...)'
`(opt SEXP ...)'
     matches zero or one occurrences of SEXP ...

`(? SEXP ...)'
     like `zero-or-one', but always produces a greedy regexp.

`(?? SEXP ...)'
     like `zero-or-one', but always produces a non-greedy regexp.

`(repeat N SEXP)'
`(= N SEXP ...)'
     matches N occurrences.

`(>= N SEXP ...)'
     matches N or more occurrences.

`(repeat N M SEXP)'
`(** N M SEXP ...)'
     matches N to M occurrences.

`(backref N)'
    matches what was matched previously by submatch N.

`(eval FORM)'
     evaluate FORM and insert result.  If result is a string,
     `regexp-quote' it.

`(regexp REGEXP)'
     include REGEXP in string notation in the result."
  (cond ((null regexps)
         (error "No regexp"))
        ((cdr regexps)
         ;; Several top-level forms are treated as an implicit `and'.
         (rx-to-string `(and ,@regexps) t))
        (t
         (rx-to-string (car regexps) t))))

;; ;; sregex.el replacement

;; ;;;###autoload (provide 'sregex)
;; ;;;###autoload (autoload 'sregex "rx")
;; (defalias 'sregex 'rx-to-string)
;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
;; (defalias 'sregexq 'rx)

(provide 'rx)

;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b
;;; rx.el ends here