1(*---------------------------------------------------------------------------*) 2(* Brzozowski-style regexp compilation with extension to character classes *) 3(* and other improvements from a paper by Owens,Reppy, and Turon. *) 4(*---------------------------------------------------------------------------*) 5 6structure regexpMatch :> regexpMatch = 7struct 8 9exception regexpErr of string * string 10 11fun ERR s1 s2 = regexpErr(s1,s2) 12fun pair x y = (x,y) 13fun K x y = x 14 15val debug = ref false 16 17val print = fn s => if !debug then print s else () 18 19(*---------------------------------------------------------------------------*) 20(* Regular expressions have character sets at the leaves *) 21(*---------------------------------------------------------------------------*) 22 23type charset = Char.char Binaryset.set; 24 25val Empty_Charset = Binaryset.empty Char.compare; 26val empty_cset = Empty_Charset 27fun isEmpty_Charset cset = Binaryset.isEmpty cset; 28 29(* ---------------------------------------------------------------------- 30 We use full range of 8-bit characters, but this can be changed. 31 ---------------------------------------------------------------------- *) 32 33val MAX_ORD = 255; (* Char.maxOrd is 255 *) 34val alphabet = List.tabulate(MAX_ORD + 1, Char.chr) 35val ALPHABET_SIZE = MAX_ORD + 1; 36val allchars = Binaryset.addList(Empty_Charset, alphabet); 37val univ_cset = allchars 38 39fun charset_compare(cset1,cset2) = 40 let open Binaryset 41 fun compare [] [] = EQUAL 42 | compare [] _ = LESS 43 | compare _ [] = GREATER 44 | compare ((h1:char)::t1) (h2::t2) = 45 if h1 < h2 then LESS else 46 if h1 > h2 then GREATER 47 else compare t1 t2 48 in 49 if Systeml.pointer_eq(cset1,cset2) then EQUAL 50 else compare (listItems cset1) (listItems cset2) 51 end; 52 53(*---------------------------------------------------------------------------*) 54(* The type of regular expressions *) 55(*---------------------------------------------------------------------------*) 56 57datatype regexp 58 = Epsilon 59 | Symbs of charset 60 | Not of regexp 61 | Sum of regexp * regexp 62 | And of regexp * regexp 63 | Dot of regexp * regexp 64 | Star of regexp; 65 66val Empty_Regexp = Symbs Empty_Charset; 67 68(*---------------------------------------------------------------------------*) 69(* Total order on regexp. *) 70(*---------------------------------------------------------------------------*) 71 72fun regexp_compare (r1,r2) = 73 if Systeml.pointer_eq(r1,r2) then EQUAL 74 else 75 case (r1,r2) 76 of (Epsilon, Epsilon) => EQUAL 77 | (Epsilon, _) => LESS 78 79 | (Symbs _, Epsilon) => GREATER 80 | (Symbs a, Symbs b) => charset_compare(a,b) 81 | (Symbs _, _) => LESS 82 83 | (Not _, Epsilon) => GREATER 84 | (Not _, Symbs _) => GREATER 85 | (Not r1, Not r2) => regexp_compare(r1,r2) 86 | (Not r1, _) => LESS 87 88 | (Sum _, Epsilon) => GREATER 89 | (Sum _, Symbs _) => GREATER 90 | (Sum _, Not _) => GREATER 91 | (Sum p1, Sum p2) => pair_regexp_compare (p1,p2) 92 | (Sum _, _) => LESS 93 94 | (And _, Epsilon) => GREATER 95 | (And _, Symbs _) => GREATER 96 | (And _, Not _) => GREATER 97 | (And _, Sum _) => GREATER 98 | (And p1, And p2) => pair_regexp_compare (p1,p2) 99 | (And _, _) => LESS 100 101 | (Dot _, Epsilon) => GREATER 102 | (Dot _, Symbs _) => GREATER 103 | (Dot _, Not _) => GREATER 104 | (Dot _, Sum _) => GREATER 105 | (Dot _, And _) => GREATER 106 | (Dot p1, Dot p2) => pair_regexp_compare (p1,p2) 107 | (Dot _, _) => LESS 108 109 | (Star _, Epsilon) => GREATER 110 | (Star _, Symbs _) => GREATER 111 | (Star _, Not _) => GREATER 112 | (Star _, Sum _) => GREATER 113 | (Star _, And _) => GREATER 114 | (Star _, Dot _) => GREATER 115 | (Star r1, Star r2) => regexp_compare (r1,r2) 116and 117 pair_regexp_compare ((r1,r2), (r3,r4)) = 118 case regexp_compare (r1,r3) 119 of EQUAL => regexp_compare (r2,r4) 120 | other => other; 121 122fun regexpEqual r1 r2 = (regexp_compare(r1,r2) = EQUAL); 123 124(*---------------------------------------------------------------------------*) 125(* Is Epsilon in the language of a regular expression? *) 126(*---------------------------------------------------------------------------*) 127 128fun hasEpsilon Epsilon = true 129 | hasEpsilon (Symbs _) = false 130 | hasEpsilon (Not r) = not(hasEpsilon r) 131 | hasEpsilon (Sum(r1,r2)) = hasEpsilon r1 orelse hasEpsilon r2 132 | hasEpsilon (And(r1,r2)) = hasEpsilon r1 andalso hasEpsilon r2 133 | hasEpsilon (Dot(r1,r2)) = hasEpsilon r1 andalso hasEpsilon r2 134 | hasEpsilon (Star _) = true; 135 136(*---------------------------------------------------------------------------*) 137(* Translation to DFA. *) 138(*---------------------------------------------------------------------------*) 139 140(*---------------------------------------------------------------------------*) 141(* Two regexps are equivalent if they have the same language. This is *) 142(* expensive to decide, however Brzozowski found that a weaker notion of *) 143(* equivalence sufficed in order to build DFAs from regexps. *) 144(*---------------------------------------------------------------------------*) 145 146fun normNot (Not(Not r)) = r 147 | normNot other = other; 148 149fun normStar (Star Epsilon) = Epsilon 150 | normStar (r as Star(Symbs cset)) = if isEmpty_Charset cset then Epsilon else r 151 | normStar (Star (r as Star _)) = r 152 | normStar otherwise = otherwise; 153 154fun normDot (Dot(Epsilon,r)) = r 155 | normDot (Dot(r,Epsilon)) = r 156 | normDot (r as Dot(r1 as Star a,Star b)) = if regexpEqual a b then r1 else r 157 | normDot (r as Dot(Dot(r1,r2),r3)) = 158 let fun linear (Dot(Dot(r1,r2),r3)) = Dot(r1,linear(Dot(r2,r3))) 159 | linear x = x 160 in linear r 161 end 162 | normDot (r as Dot(Symbs cset1,Symbs cset2)) = 163 if isEmpty_Charset cset1 orelse isEmpty_Charset cset2 164 then Empty_Regexp else r 165 | normDot (r as Dot(Symbs cset,_)) = if isEmpty_Charset cset then Empty_Regexp else r 166 | normDot (r as Dot(_,Symbs cset)) = if isEmpty_Charset cset then Empty_Regexp else r 167 | normDot otherwise = otherwise; 168 169fun normAnd (r as And(Symbs a,Symbs b)) = Symbs (Binaryset.intersection(a,b)) 170 | normAnd (r as And(Symbs cset,_)) = if isEmpty_Charset cset then Empty_Regexp else r 171 | normAnd (r as And(_,Symbs cset)) = if isEmpty_Charset cset then Empty_Regexp else r 172 | normAnd (And(And(r1,r2),r3)) = normAnd(And(r1,normAnd(And(r2,r3)))) 173 | normAnd (r as And(r1,s as And(r2,r3))) = 174 (case regexp_compare(r1,r2) 175 of EQUAL => s 176 | LESS => r 177 | GREATER => And(r2,normAnd(And(r1,r3)))) 178 | normAnd (r as And(r1,r2)) = 179 (case regexp_compare(r1,r2) 180 of EQUAL => r1 181 | LESS => r 182 | GREATER => And(r2,r1)) 183 | normAnd r = r; 184 185fun normSum (Sum(Symbs a, Symbs b)) = Symbs(Binaryset.union(a,b)) 186 | normSum (t as Sum(Symbs a,r)) = if isEmpty_Charset a then r else t 187 | normSum (t as Sum(r, Symbs a)) = if isEmpty_Charset a then r else t 188 | normSum (Sum(Sum(r1,r2),r3)) = normSum(Sum(r1,normSum(Sum(r2,r3)))) 189 | normSum (r as Sum(r1,s as Sum(r2,r3))) = 190 (case regexp_compare(r1,r2) 191 of EQUAL => s 192 | LESS => r 193 | GREATER => Sum(r2,normSum(Sum(r1,r3)))) 194 | normSum (r as Sum(r1,r2)) = 195 (case regexp_compare(r1,r2) 196 of EQUAL => r1 197 | LESS => r 198 | GREATER => Sum(r2,r1)) 199 | normSum r = r; 200 201fun norm (Not P) = normNot(Not (norm P)) 202 | norm (Sum(P,Q)) = normSum (Sum (norm P, norm Q)) 203 | norm (And(P,Q)) = normAnd(And (norm P, norm Q)) 204 | norm (Dot(P,Q)) = normDot(Dot (norm P, norm Q)) 205 | norm (Star P) = normStar (Star (norm P)) 206 | norm r = r; 207 208fun simpDeriv x Epsilon = Empty_Regexp 209 | simpDeriv x (Symbs cs) = if Binaryset.member(cs,x) then Epsilon else Empty_Regexp 210 | simpDeriv x (Not P) = normNot(Not (simpDeriv x P)) 211 | simpDeriv x (Sum(P,Q)) = normSum (Sum (simpDeriv x P,simpDeriv x Q)) 212 | simpDeriv x (And(P,Q)) = normAnd(And (simpDeriv x P, simpDeriv x Q)) 213 | simpDeriv x (Dot(P,Q)) = 214 let val pq = normDot(Dot (simpDeriv x P, Q)) 215 in if hasEpsilon P 216 then normSum(Sum(pq,simpDeriv x Q)) 217 else pq 218 end 219 | simpDeriv x (Star P) = normDot (Dot(simpDeriv x P, normStar(Star P))); 220 221 222(*---------------------------------------------------------------------------*) 223(* Approximate character classes. A character class is a set of charsets. *) 224(*---------------------------------------------------------------------------*) 225 226val empty_class = Binaryset.empty charset_compare; 227fun classList chlist = Binaryset.addList(empty_class, chlist) 228val allchar_class = classList [allchars]; 229 230fun cross [] l2 = [] 231 | cross (h::t) l2 = map (pair h) l2 @ cross t l2; 232 233fun moodge class1 class2 = 234 let open Binaryset 235 val pairs = cross (listItems class1) (listItems class2) 236 in 237 List.foldl (fn ((p:char set * char set), acc) => 238 add (acc,Binaryset.intersection p)) 239 empty_class 240 pairs 241 end; 242 243fun approxClasses Epsilon = allchar_class 244 | approxClasses (Symbs a) = classList [a, Binaryset.difference(allchars,a)] 245 | approxClasses (Not r) = approxClasses r 246 | approxClasses (Star r) = approxClasses r 247 | approxClasses (Sum(r1,r2)) = moodge (approxClasses r1) (approxClasses r2) 248 | approxClasses (And(r1,r2)) = moodge (approxClasses r1) (approxClasses r2) 249 | approxClasses (Dot(r1,r2)) = 250 if hasEpsilon r1 251 then moodge (approxClasses r1) (approxClasses r2) 252 else approxClasses r1; 253 254val Qempty = Binaryset.empty regexp_compare 255fun pair_compare (f1, f2) ((x1,y1), (x2,y2)) = 256 case f1 (x1, x2) of 257 EQUAL => f2 (y1, y2) 258 | x => x 259 260val deltaMap_key_compare = pair_compare (regexp_compare, charset_compare) 261val deltaMap_initial = 262 Binarymap.mkDict deltaMap_key_compare :(regexp*charset,regexp)Binarymap.dict 263fun insert_deltaMap ((d,r), deltaMap) = Binarymap.insert(deltaMap,d,r); 264 265fun pick class = Option.valOf(Binaryset.find (K true) class); 266 267(*---------------------------------------------------------------------------*) 268(* Map a regular expression to a DFA representation. *) 269(*---------------------------------------------------------------------------*) 270 271fun regexp_to_dfa r = 272 let val counter = ref 0 273 fun work [] (Q,deltaMap) = (Q,deltaMap) 274 | work (r::t) (Q,deltaMap) = 275 if Binaryset.member(Q,r) (* state r already seen? *) 276 then (print "re-encountering state.\n"; 277 work t (Q,deltaMap)) 278 else let 279 val _ = print ("computing outarcs for state :"^ 280 Int.toString(!counter)^".\n") 281 val _ = counter := !counter + 1 282 val charclasses = List.filter (not o Binaryset.isEmpty) 283 (Binaryset.listItems(approxClasses r)) 284 val _ = print ("Number of arcs: "^ 285 Int.toString(List.length charclasses)^"\n") 286 val outarcs = 287 List.map (fn c => ((r,c),simpDeriv (pick c) r)) charclasses 288 val targets = List.map #2 outarcs 289 val deltaMap' = List.foldl insert_deltaMap deltaMap outarcs 290 val Q' = Binaryset.add(Q,r) 291 in 292 work (targets@t) (Q',deltaMap') 293 end 294 val initialState = norm r 295 val (Q,deltaMap) = work [initialState] (Qempty,deltaMap_initial) 296 val finalStates = Binaryset.foldl (fn (q,acc) => 297 if hasEpsilon q then Binaryset.add(acc,q) else acc) 298 Qempty Q 299 in 300 (Q,initialState,deltaMap,finalStates) 301 end 302 303fun dfa_to_arrays (Q,q0,deltaMap,F) = 304 let val _ = print "Mapping to array representation.\n" 305 val states = Binaryset.listItems Q 306 val nstates = List.length states 307 val Qmap = Vector.fromList states 308 fun Qindex i = Vector.sub(Qmap,i) (* int -> regexp *) 309 val QindexInv = Vector.foldli(fn(i,r,m) => Binarymap.insert(m,r,i)) 310 (Binarymap.mkDict regexp_compare) Qmap 311 fun QindexInvFn r = Binarymap.find(QindexInv,r) (* regexp -> int *) 312 fun row i = 313 let val q = Qindex i 314 val outarcs = Binarymap.foldl(fn((regexp,charset),regexp',arcs) 315 => if regexpEqual regexp q 316 then (charset,regexp')::arcs 317 else arcs) [] deltaMap 318 fun flatarc (charset,regexp) = 319 List.map (fn c => (Char.ord(c),regexp)) 320 (Binaryset.listItems charset) 321 val flatarcs = List.concat (List.map flatarc outarcs) 322 val sortarcs = 323 Listsort.sort (fn ((a,b), (c,d)) => Int.compare(a,c)) flatarcs 324 val sortarcs' = List.map (fn (i,q) => QindexInvFn q) sortarcs 325 val _ = if List.length sortarcs' <> ALPHABET_SIZE 326 then raise ERR 327 "dfa_to_arrays" 328 "total partition elements differs from \ 329 \size of alphabet" 330 else () 331 in 332 Vector.fromList sortarcs' 333 end 334 in 335 {delta = Vector.tabulate(nstates, row), 336 start = QindexInvFn q0, 337 final = Vector.tabulate(nstates, fn i => Binaryset.member(F,Qindex i)) 338 } 339 end; 340 341val regexp_to_dfa_arrays = dfa_to_arrays o regexp_to_dfa; 342 343fun match r = 344 let val {delta,start,final} = regexp_to_dfa_arrays r 345 val _ = print (String.concat["DFA states: ", 346 Int.toString(Vector.length delta),".\n"]) 347 fun step (a,q) = Vector.sub(Vector.sub(delta,q), Char.ord a) 348 fun exec ss = Substring.foldl step start ss 349 in 350 fn s => Vector.sub(final,exec (Substring.full s)) 351 end; 352 353fun pred_to_set P = 354 Binaryset.foldl (fn (c, acc) => if P c then Binaryset.add(acc, c) else acc) 355 empty_cset 356 allchars 357 358fun fromList cs = Binaryset.addList (empty_cset, cs) 359 360structure POSIX = struct 361 val alnum_set = pred_to_set Char.isAlphaNum 362 val alpha_set = pred_to_set Char.isAlpha 363 val ascii_set = pred_to_set Char.isAscii 364 val blank_set = fromList [#" ", #"\t"] 365 val cntrl_set = pred_to_set Char.isCntrl 366 val digit_set = pred_to_set Char.isDigit 367 val graph_set = pred_to_set Char.isGraph 368 val lower_set = pred_to_set Char.isLower 369 val print_set = pred_to_set Char.isPrint 370 val punct_set = pred_to_set Char.isPunct 371 val space_set = pred_to_set Char.isSpace 372 val upper_set = pred_to_set Char.isUpper 373 val xdigit_set = pred_to_set Char.isHexDigit 374 val word_set = Binaryset.add(alnum_set, #"_") 375 376end 377 378end 379