;;; regexp-opt.el --- generate efficient regexps to match strings

;; Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
;;   2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.

;; Author: Simon Marshall <simon@gnu.org>
;; Maintainer: FSF
;; Keywords: strings, regexps, extensions

;; This file is part of GNU Emacs.

;; GNU Emacs is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING.  If not, write to the
;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.

;;; Commentary:

;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i[sz]e\\)".
;;
;; This package generates a regexp from a given list of strings (which matches
;; one of those strings) so that the regexp generated by:
;;
;; (regexp-opt strings)
;;
;; is equivalent to, but more efficient than, the regexp generated by:
;;
;; (mapconcat 'regexp-quote strings "\\|")
;;
;; For example:
;;
;; (let ((strings '("cond" "if" "when" "unless" "while"
;;                  "let" "let*" "progn" "prog1" "prog2"
;;                  "save-restriction" "save-excursion" "save-window-excursion"
;;                  "save-current-buffer" "save-match-data"
;;                  "catch" "throw" "unwind-protect" "condition-case")))
;;   (concat "(" (regexp-opt strings t) "\\>"))
;;  => "(\\(c\\(?:atch\\|ond\\(?:ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(?:current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(?:less\\|wind-protect\\)\\|wh\\(?:en\\|ile\\)\\)\\>"
;;
;; Searching using the above example `regexp-opt' regexp takes approximately
;; two-thirds of the time taken using the equivalent `mapconcat' regexp.

;; Since this package was written to produce efficient regexps, not regexps
;; efficiently, it is probably not a good idea to in-line too many calls in
;; your code, unless you use the following trick with `eval-when-compile':
;;
;; (defvar definition-regexp
;;   (eval-when-compile
;;     (concat "^("
;;             (regexp-opt '("defun" "defsubst" "defmacro" "defalias"
;;                           "defvar" "defconst") t)
;;             "\\>")))
;;
;; The `byte-compile' code will be as if you had defined the variable thus:
;;
;; (defvar definition-regexp
;;   "^(\\(def\\(?:alias\\|const\\|macro\\|subst\\|un\\|var\\)\\)\\>")
;;
;; Note that if you use this trick for all instances of `regexp-opt' and
;; `regexp-opt-depth' in your code, regexp-opt.el would only have to be loaded
;; at compile time.
;; But note also that using this trick means that should
;; regexp-opt.el be changed, perhaps to fix a bug or to add a feature to
;; improve the efficiency of `regexp-opt' regexps, you would have to recompile
;; your code for such changes to have effect in your code.

;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with
;; thanks for ideas also to Michael Ernst, Bob Glickstein, Dan Nicolaescu and
;; Stefan Monnier.
;; No doubt `regexp-opt' doesn't always produce optimal regexps, so code, ideas
;; or any other information to improve things are welcome.
;;
;; One possible improvement would be to compile '("aa" "ab" "ba" "bb")
;; into "[ab][ab]" rather than "a[ab]\\|b[ab]".  I'm not sure it's worth
;; it but if someone knows how to do it without going through too many
;; contortions, I'm all ears.

;;; Code:

;;;###autoload
(defun regexp-opt (strings &optional paren)
  "Return a regexp to match a string in the list STRINGS.
Each string should be unique in STRINGS and should not contain any regexps,
quoted or not.  If optional PAREN is non-nil, ensure that the returned regexp
is enclosed by at least one regexp grouping construct.
The returned regexp is typically more efficient than the equivalent regexp:

 (let ((open (if PAREN \"\\\\(\" \"\")) (close (if PAREN \"\\\\)\" \"\")))
   (concat open (mapconcat 'regexp-quote STRINGS \"\\\\|\") close))

If PAREN is `words', then the resulting regexp is additionally surrounded
by \\=\\< and \\>.
If PAREN is a string, that string is used as the opening of the
enclosing group.

STRINGS itself is not modified: a sorted copy with duplicates removed
is what gets compiled.  If STRINGS is empty the group-free result is
the empty string."
  (save-match-data
    ;; Recurse on the sorted list.
    (let* (;; The recursion in `regexp-opt-group' can get deep for long
           ;; lists, so raise the evaluator limits for the duration.
           (max-lisp-eval-depth (* 1024 1024))
           (max-specpdl-size (* 1024 1024))
           ;; `regexp-opt-group' relies on `try-completion' and
           ;; `all-completions'; keep them case-sensitive and unfiltered.
           (completion-ignore-case nil)
           (completion-regexp-list nil)
           (words (eq paren 'words))
           ;; Outermost group opener: PAREN itself when it is a string,
           ;; a capturing group for any other non-nil PAREN, else nil.
           (open (cond ((stringp paren) paren) (paren "\\(")))
           (sorted-strings (delete-dups
                            (sort (copy-sequence strings) 'string-lessp)))
           (re (regexp-opt-group sorted-strings open)))
      (if words (concat "\\<" re "\\>") re))))

;;;###autoload
(defun regexp-opt-depth (regexp)
  "Return the depth of REGEXP.
This means the number of non-shy regexp grouping constructs
\(parenthesized expressions) in REGEXP."
  (save-match-data
    ;; Hack to signal an error if REGEXP does not have balanced parentheses.
    (string-match regexp "")
    ;; Count the number of open parentheses in REGEXP.
    (let ((count 0) start last)
      (while (string-match "\\\\(\\(\\?:\\)?" regexp start)
        (setq start (match-end 0))      ; Start of next search.
        (when (and (not (match-beginning 1))
                   (subregexp-context-p regexp (match-beginning 0) last))
          ;; It's not a shy group and it's not inside brackets or after
          ;; a backslash: it's really a group-open marker.
          (setq last start)       ; Speed up next `subregexp-context-p'.
          (setq count (1+ count))))
      count)))

;;; Workhorse functions.

;; `case' below is a cl macro; it is only needed at compile time.
(eval-when-compile
  (require 'cl))

(defun regexp-opt-group (strings &optional paren lax)
  ;; Return a regexp to match a string in the sorted list STRINGS.
  ;; If PAREN non-nil, output regexp parentheses around returned regexp.
  ;; (A string PAREN is used verbatim as the group opener; any other
  ;; non-nil value produces a shy group.)
  ;; If LAX non-nil, don't output parentheses if it doesn't require them.
  ;; Merges keywords to avoid backtracking in Emacs' regexp matcher.

  ;; The basic idea is to find the shortest common prefix or suffix, remove it
  ;; and recurse.  If there is no prefix, we divide the list into two so that
  ;; \(at least) one half will have at least a one-character common prefix.

  ;; Also we delay the addition of grouping parenthesis as long as possible
  ;; until we're sure we need them, and try to remove one-character sequences
  ;; so we can use character sets rather than grouping parenthesis.
  (let* ((open-group (cond ((stringp paren) paren) (paren "\\(?:") (t "")))
         (close-group (if paren "\\)" ""))
         ;; Delimiters for results that are a single atom (one character
         ;; or one bracket expression): omitted entirely when LAX.
         (open-charset (if lax "" open-group))
         (close-charset (if lax "" close-group)))
    (cond
     ;;
     ;; If there are no strings, just return the empty string.
     ((= (length strings) 0)
      "")
     ;;
     ;; If there is only one string, just return it.
     ((= (length strings) 1)
      (if (= (length (car strings)) 1)
          (concat open-charset (regexp-quote (car strings)) close-charset)
        (concat open-group (regexp-quote (car strings)) close-group)))
     ;;
     ;; If there is an empty string, remove it and recurse on the rest.
     ;; (STRINGS is sorted, so an empty string can only be first.)
     ((= (length (car strings)) 0)
      (concat open-charset
              (regexp-opt-group (cdr strings) t t) "?"
              close-charset))
     ;;
     ;; If there are several one-char strings, use charsets
     ((and (= (length (car strings)) 1)
           (let ((strs (cdr strings)))
             (while (and strs (/= (length (car strs)) 1))
               (pop strs))
             strs))
      (let (letters rest)
        ;; Collect one-char strings
        (dolist (s strings)
          (if (= (length s) 1)
              (push (string-to-char s) letters)
            (push s rest)))

        (if rest
            ;; several one-char strings: take them and recurse
            ;; on the rest (first so as to match the longest).
            (concat open-group
                    (regexp-opt-group (nreverse rest))
                    "\\|" (regexp-opt-charset letters)
                    close-group)
          ;; all are one-char strings: just return a character set.
          (concat open-charset
                  (regexp-opt-charset letters)
                  close-charset))))
     ;;
     ;; We have a list of different length strings.
     (t
      (let ((prefix (try-completion "" strings)))
        (if (> (length prefix) 0)
            ;; common prefix: take it and recurse on the suffixes.
            (let* ((n (length prefix))
                   (suffixes (mapcar (lambda (s) (substring s n)) strings)))
              (concat open-group
                      (regexp-quote prefix)
                      (regexp-opt-group suffixes t t)
                      close-group))

          ;; Look for a common suffix by completing on the reversed strings.
          (let* ((sgnirts (mapcar (lambda (s)
                                    (concat (nreverse (string-to-list s))))
                                  strings))
                 (xiffus (try-completion "" sgnirts)))
            (if (> (length xiffus) 0)
                ;; common suffix: take it and recurse on the prefixes.
                (let* ((n (- (length xiffus)))
                       (prefixes
                        ;; Sorting is necessary in cases such as ("ad" "d").
                        (sort (mapcar (lambda (s) (substring s 0 n)) strings)
                              'string-lessp)))
                  (concat open-group
                          (regexp-opt-group prefixes t t)
                          (regexp-quote
                           (concat (nreverse (string-to-list xiffus))))
                          close-group))

              ;; Otherwise, divide the list into those that start with a
              ;; particular letter and those that do not, and recurse on them.
              (let* ((char (char-to-string (string-to-char (car strings))))
                     (half1 (all-completions char strings))
                     (half2 (nthcdr (length half1) strings)))
                (concat open-group
                        (regexp-opt-group half1)
                        "\\|" (regexp-opt-group half2)
                        close-group))))))))))


(defun regexp-opt-charset-run (start end)
  ;;
  ;; Return the text that represents the consecutive characters START..END
  ;; (inclusive) inside a bracket expression: the range "S-E" when the run
  ;; holds more than three characters, otherwise the characters themselves.
  ;;
  (if (> end (+ start 2))
      (format "%c-%c" start end)
    (let ((run ""))
      (while (<= start end)
        (setq run (format "%s%c" run start)
              start (1+ start)))
      run)))

(defun regexp-opt-charset (chars)
  ;;
  ;; Return a regexp to match a character in CHARS.
  ;;
  ;; The basic idea is to find character ranges.  Also we take care in the
  ;; position of character set meta characters in the character set regexp.
  ;;
  (let ((charset "")
        (bracket "") (dash "") (caret "")
        plain)
    ;;
    ;; Set the character set meta characters aside; collect everything else.
    (dolist (char chars)
      (case char
        (?\]
         (setq bracket "]"))
        (?^
         (setq caret "^"))
        (?-
         (setq dash "-"))
        (otherwise
         (push char plain))))
    ;;
    ;; Walk the remaining characters in ascending order, emitting one run of
    ;; consecutive codes at a time.  PLAIN is a fresh list, so sorting and
    ;; de-duplicating it destructively leaves CHARS untouched.
    (setq plain (sort (delete-dups plain) '<))
    (while plain
      (let* ((start (car plain))
             (end start))
        (setq plain (cdr plain))
        (while (and plain (= (car plain) (1+ end)))
          (setq end (car plain)
                plain (cdr plain)))
        (setq charset (concat charset (regexp-opt-charset-run start end)))))
    ;;
    ;; Make sure that ] is first, a caret is not first and a dash is first
    ;; or last.
    (cond
     ((and (string-equal charset "") (string-equal bracket ""))
      (if (and (string-equal dash "") (string-equal caret "^"))
          ;; A lone caret: "[^]" would open a negated set instead of
          ;; matching "^", so return the quoted character.
          "\\^"
        (concat "[" dash caret "]")))
     (t
      (concat "[" bracket charset caret dash "]")))))

(provide 'regexp-opt)

;; arch-tag: 6c5a66f4-29af-4fd6-8c3b-4b554d5b4370
;;; regexp-opt.el ends here