1structure Regexp_Match :> Regexp_Match = 2struct 3 4open Lib Feedback Regexp_Type; 5 6val ERR = mk_HOL_ERR "Regexp_Match"; 7 8type regexp = Regexp_Type.regexp; 9 10fun regexp_compare r1 r2 = Regexp_Type.regexp_compare(r1,r2); 11 12fun regexp_leq r s = 13 case regexp_compare r s 14 of LESS => true 15 | EQUAL => true 16 | GREATER => false 17; 18 19(*---------------------------------------------------------------------------*) 20(* Taken from cakeML version, itself derived from the HOL model *) 21(*---------------------------------------------------------------------------*) 22 23val the = Option.valOf; 24 25fun inv_image rel f x y = rel (f x) (f y); 26 27fun el n list = if n = 0 then hd list else el (n-1) (tl list); 28 29fun upto b t = if b <= t then b::upto (b+1) t else [];; 30 31fun genlist f n = if n<0 then [] else map f (upto 0 (n-1)); 32 33fun drop n l = if n<=0 then l else drop (n-1) (tl l); 34 35(*---------------------------------------------------------------------------*) 36(* merge sort *) 37(*---------------------------------------------------------------------------*) 38 39local 40fun merge rel l1 l2 = 41 case (l1,l2) 42 of ([],[]) => [] 43 | ([],_) => l2 44 | (_,[]) => l1 45 | (x::t1,y::t2) => 46 if rel x y then 47 x::merge rel t1 l2 else 48 y::merge rel l1 t2; 49 50fun sort2 rel x y = if rel x y then [x, y] else [y, x]; 51 52fun sort3 rel x y z = 53 if rel x y then 54 if rel y z then [x, y, z] 55 else if rel x z then [x, z, y] 56 else [z, x, y] 57 else if rel y z then if rel x z then [y, x, z] else [y, z, x] 58 else [z, y, x]; 59 60fun mergesortN rel n l = 61 if n = 0 then [] else 62 if n = 1 then (if null l then [] else [hd l]) else 63 if n = 2 64 then (case l 65 of (x::y::t) => sort2 rel x y 66 | other => other) else 67 if n = 3 68 then (case l 69 of (x::y::z::t) => sort3 rel x y z 70 | [x,y] => sort2 rel x y 71 | other => other) 72 else 73 let val len1 = n div 2 74 in merge rel (mergesortN rel len1 l) 75 (mergesortN rel (n - len1) (drop len1 l)) 76 end; 77 78in 79 fun mergesort rel l = mergesortN rel (length l) l 80end 81 82(*---------------------------------------------------------------------------*) 83(* Finite maps, from examples/balanced_mapTheory. *) 84(*---------------------------------------------------------------------------*) 85 86structure Finite_Map = 87struct 88 89datatype ('k,'v) balanced_map 90 = Tip 91 | Bin of int * 'k * 'v * ('k,'v)balanced_map * ('k,'v) balanced_map; 92 93val ratio = 2; 94val delta = 3; 95 96val empty = Tip; 97 98fun singleton k x = Bin(1,k,x,Tip,Tip); 99 100fun size fmap = 101 case fmap 102 of Tip => 0 103 | Bin (s,k,v,l,r) => s; 104 105fun member cmp k fmap = 106 case fmap 107 of Tip => false 108 | Bin (s,k', v, l, r) => 109 case cmp k k' 110 of LESS => member cmp k l 111 | GREATER => member cmp k r 112 | EQUAL => true; 113 114fun lookup cmp k fmap = 115 case fmap 116 of Tip => NONE 117 | Bin(s, k', v, l, r) => 118 case cmp k k' 119 of LESS => lookup cmp k l 120 | GREATER => lookup cmp k r 121 | EQUAL => SOME v; 122 123fun foldrWithKey f z fmap = 124 case fmap 125 of Tip => z 126 | Bin(u, kx, x, l, r) => 127 foldrWithKey f (f kx x (foldrWithKey f z r)) l; 128 129fun balanceL k x fmap1 fmap2 = 130 case (fmap1,fmap2) 131 of (Tip,Tip) 132 => Bin (1, k, x, Tip, Tip) 133 | (Bin (s', k', v', Tip, Tip), Tip) 134 => Bin (2, k, x, Bin (s', k', v', Tip, Tip), Tip) 135 | (Bin (u1, lk, lx, Tip, Bin(u2, lrk, lrx, u3, u4)), Tip) 136 => Bin(3, lrk, lrx, Bin(1, lk, lx, Tip, Tip), Bin(1,k, x, Tip, Tip)) 137 | (Bin(u5, lk, lx, Bin (s', k', v', l', r'), Tip), Tip) 138 => Bin(3, lk, lx, Bin(s', k', v', l', r'), Bin(1, k, x, Tip, Tip)) 139 | (Bin(ls, lk, lx, Bin(lls, k', v', l', r'), Bin(lrs, lrk, lrx, lrl, lrr)), Tip) 140 => if lrs < ratio*lls 141 then 142 Bin (1+ls, lk, lx, Bin(lls, k', v', l', r'), 143 Bin (1+lrs, k, x, Bin (lrs, lrk, lrx, lrl, lrr), Tip)) 144 else 145 Bin (1+ls, lrk, lrx, 146 Bin (1+lls+size lrl, lk, lx, Bin(lls, k', v', l', r'), lrl), 147 Bin (1+size lrr, k, x, lrr, Tip)) 148 | (Tip, Bin(rs, k', v', l', r')) 149 => Bin (1+rs, k, x, Tip, Bin(rs, k', v', l', r')) 150 | (Bin(ls, lk, lx, ll, lr), Bin(rs, k', v', l', r')) 151 => if ls > delta*rs 152 then (case (ll, lr) 153 of (Bin (lls, u6, u7, u8, u9), Bin(lrs, lrk, lrx, lrl, lrr)) 154 => if lrs < ratio*lls then 155 Bin (1+ls+rs, lk, lx, ll, 156 Bin (1+rs+lrs, k, x, lr, Bin(rs, k', v', l', r'))) 157 else 158 Bin (1+ls+rs, lrk, lrx, 159 Bin (1+lls+size lrl, lk, lx, ll, lrl), 160 Bin (1+rs+size lrr, k, x, lrr, Bin (rs, k', v', l', r'))) 161 | (u10, u11) => Tip (* error "Failure in Data.Map.balanceL" *) 162 ) 163 else 164 Bin (1+ls+rs, k, x, Bin (ls, lk, lx, ll, lr), Bin(rs, k', v', l', r')); 165 166fun balanceR k x fmap1 fmap2 = 167 case (fmap1,fmap2) 168 of (Tip,Tip) 169 => Bin(1, k, x, Tip, Tip) 170 | (Tip, Bin(s', k', v', Tip, Tip)) 171 => Bin(2, k, x, Tip, Bin(s', k', v', Tip, Tip)) 172 | (Tip,Bin(u1, rk, rx, Tip, Bin(s', k', v', l', r'))) 173 => Bin(3, rk, rx, Bin(1, k, x, Tip, Tip), Bin(s', k', v', l', r')) 174 | (Tip, Bin(u2, rk, rx, Bin(u3, rlk, rlx, u4, u5), Tip)) 175 => Bin(3, rlk, rlx, 176 Bin(1, k, x, Tip, Tip), 177 Bin(1, rk, rx, Tip, Tip)) 178 | (Tip, Bin(rs, rk, rx, Bin(rls, rlk, rlx, rll, rlr), 179 Bin(rrs, k', v', l', r'))) 180 => if rls < ratio*rrs 181 then 182 Bin (1+rs, rk, rx, 183 Bin (1+rls,k, x, Tip, Bin(rls, rlk, rlx, rll, rlr)), 184 Bin(rrs, k', v', l', r')) 185 else 186 Bin (1+rs, rlk, rlx, 187 Bin (1+size rll, k, x, Tip, rll), 188 Bin (1+rrs+size rlr, rk, rx, rlr, Bin(rrs, k', v', l', r'))) 189 | (Bin(ls, k', v', l', r'), Tip) 190 => Bin (1+ls, k, x, Bin(ls, k', v', l', r'), Tip) 191 | (Bin(ls, k', v', l', r'), Bin(rs, rk, rx, rl, rr)) 192 => if rs > delta*ls 193 then 194 (case (rl, rr) 195 of (Bin(rls, rlk, rlx, rll, rlr), Bin(rrs, u6, u7,u8,u9)) 196 => if rls < ratio*rrs 197 then Bin (1+ls+rs, rk, rx, 198 Bin (1+ls+rls, k, x, Bin(ls, k', v', l', r'), rl), 199 rr) 200 else Bin (1+ls+rs, rlk, rlx, 201 Bin (1+ls+size rll, k, x, Bin(ls, k', v', l', r'), rll), 202 Bin (1+rrs+size rlr, rk, rx, rlr, rr)) 203 | (u10,u11) => Tip (* error "Failure in Data.Map.balanceR" *) 204 ) 205 else 206 Bin (1+ls+rs, k, x, 207 Bin(ls, k', v', l', r'), Bin(rs, rk,rx,rl, rr)); 208 209fun insert cmp k v fmap = 210 case fmap 211 of Tip => singleton k v 212 | Bin(s, k', v', l, r) => 213 case cmp k k' 214 of LESS => balanceL k' v' (insert cmp k v l) r 215 | GREATER => balanceR k' v' l (insert cmp k v r) 216 | EQUAL => Bin(s,k,v,l,r); 217 218end; (* structure Finite_Map *) 219 220(*---------------------------------------------------------------------------*) 221(* Support for smart derivatives *) 222(*---------------------------------------------------------------------------*) 223 224fun build_char_set cs = Chset cs; 225 226fun assoc_cat r s = 227 case r 228 of Cat(r1,r2) => Cat(r1,assoc_cat r2 s) 229 | otherwise => Cat(r,s); 230 231fun build_cat r s = 232 if (r = EMPTY) orelse (s = EMPTY) then EMPTY else 233 if r = EPSILON then s else 234 if s = EPSILON then r 235 else assoc_cat r s; 236 237fun build_neg r = 238 case r 239 of Neg s => s 240 | otherwise => Neg r; 241 242fun build_star r = 243 case r 244 of Star s => r 245 | otherwise => Star r; 246 247fun flatten_or rlist = 248 case rlist 249 of [] => [] 250 | Or rs::t => rs @ flatten_or t 251 | r::rs => r :: flatten_or rs; 252 253(*---------------------------------------------------------------------------*) 254(* Requires all charsets to be leftmost in rs. Mergesort takes care of that. *) 255(*---------------------------------------------------------------------------*) 256 257fun merge_charsets list = 258 case list 259 of (Chset a::Chset b::t) => merge_charsets (Chset(charset_union a b)::t) 260 | otherwise => list; 261 262fun remove_dups rlist = 263 case rlist 264 of [] => [] 265 | [r] => [r] 266 | r::s::t => 267 if regexp_compare r s = EQUAL then 268 remove_dups (s::t) 269 else 270 r::remove_dups (s::t); 271 272fun build_or rlist = 273 let val rlist' = 274 remove_dups 275 (merge_charsets 276 (mergesort regexp_leq 277 (flatten_or rlist))) 278 in 279 if mem (Neg EMPTY) rlist' then Neg EMPTY else 280 if mem (Star SIGMA) rlist' then Neg EMPTY 281 else 282 case rlist' 283 of [] => EMPTY 284 | [r] => r 285 | [Chset cs, r] => (if r = Neg (Chset cs) then Neg EMPTY else 286 if cs = charset_empty then r 287 else Or rlist') 288 | Chset cs :: t => 289 (if mem (Neg (Chset cs)) t then Neg EMPTY else 290 if cs = charset_empty then Or t 291 else Or rlist') 292 | _ => Or rlist' 293end; 294 295fun normalize r = 296 case r 297 of Chset cs => build_char_set cs 298 | Cat(s,t) => build_cat (normalize s) (normalize t) 299 | Star s => build_star (normalize s) 300 | Or rs => build_or (List.map normalize rs) 301 | Neg s => build_neg (normalize s); 302 303(*---------------------------------------------------------------------------*) 304(* Does a regexp admit EPSILON (the empty string) *) 305(*---------------------------------------------------------------------------*) 306 307fun nullable r = 308 case r 309 of Chset cs => false 310 | Star s => true 311 | Cat(s,t) => nullable s andalso nullable t 312 | Or rs => List.exists nullable (List.rev rs) 313 | Neg s => not(nullable s); 314 315(*---------------------------------------------------------------------------*) 316(* Take derivative, then apply smart constructors. *) 317(*---------------------------------------------------------------------------*) 318 319fun smart_deriv c r = 320 case r 321 of Chset cs => if charset_mem c cs then EPSILON else EMPTY 322 | Cat(Chset cs,t) => if charset_mem c cs then t else EMPTY 323 | Cat(s,t) => 324 let val dr = build_cat (smart_deriv c s) t 325 in if nullable s 326 then build_or [dr, smart_deriv c t] 327 else dr 328 end 329 | Star s => build_cat (smart_deriv c s) (build_star s) 330 | Or rs => build_or (map (smart_deriv c) rs) 331 | Neg s => build_neg (smart_deriv c s); 332 333(*---------------------------------------------------------------------------*) 334(* Cache applications of smart_deriv. Unhelpful right now ... might be too *) 335(* fine-grained. *) 336(*---------------------------------------------------------------------------*) 337(* 338type table = (char * regexp,regexp) Redblackmap.dict 339type cache = table ref 340 341fun regexp_cmp (r1,r2) = regexp_compare r1 r2; 342 343fun new_table() = 344 Redblackmap.mkDict (pair_compare (Char.compare,regexp_cmp)) : table 345 346 347local 348 val cache = ref (new_table()) : cache 349 fun lookup x = Redblackmap.peek (!cache, x) 350 fun store (x,y) = (cache := Redblackmap.insert(!cache,x,y)) 351 fun storable (Chset _) = false 352 | storable other = true 353in 354fun clear_cache () = (cache := new_table()) 355fun cached_values () = Redblackmap.listItems (!cache) 356 357fun smart_deriv c r = 358 if storable r then 359 (case lookup (c,r) 360 of SOME y => y 361 | NONE => 362 let val r' = 363 (case r 364 of Chset cs => if charset_mem c cs then EPSILON else EMPTY 365 | Cat(Chset cs,t) => if charset_mem c cs then t else EMPTY 366 | Cat(s,t) => 367 let val dr = build_cat (smart_deriv c s) t 368 in if nullable s 369 then build_or [dr, smart_deriv c t] 370 else dr 371 end 372 | Star s => build_cat (smart_deriv c s) (build_star s) 373 | Or rs => build_or (map (smart_deriv c) rs) 374 | Neg s => build_neg (smart_deriv c s)) 375 in 376 store((c,r),r') 377 ; r' 378 end) 379 else 380 (case r 381 of Chset cs => if charset_mem c cs then EPSILON else EMPTY 382 | Cat(Chset cs,t) => if charset_mem c cs then t else EMPTY 383 | Cat(s,t) => 384 let val dr = build_cat (smart_deriv c s) t 385 in if nullable s 386 then build_or [dr, smart_deriv c t] 387 else dr 388 end 389 | Star s => build_cat (smart_deriv c s) (build_star s) 390 | Or rs => build_or (map (smart_deriv c) rs) 391 | Neg s => build_neg (smart_deriv c s)) 392end 393*) 394 395(*---------------------------------------------------------------------------*) 396(* Support for the core regexp compiler. *) 397(*---------------------------------------------------------------------------*) 398 399fun transitions r = List.map (fn c => (c,smart_deriv c r)) alphabet; 400 401fun extend_states next_state state_map trans translist = 402 case translist 403 of [] => (next_state,state_map,trans) 404 | (c,s)::t => 405 case Finite_Map.lookup regexp_compare s state_map 406 of SOME n => extend_states next_state state_map ((c,n)::trans) t 407 | NONE => extend_states (next_state + 1) 408 (Finite_Map.insert regexp_compare 409 s next_state state_map) 410 ((c,next_state)::trans) 411 t; 412 413fun build_table translist r arg = 414 case arg of 415 (next_state,state_map,table) => 416 case extend_states next_state state_map [] translist of 417 (next_state,state_map,trans) => 418 case Finite_Map.lookup regexp_compare r state_map 419 of SOME n => (next_state, state_map, (n,trans)::table) 420 | NONE => (next_state + 1, 421 Finite_Map.insert regexp_compare r next_state state_map, 422 (next_state,trans)::table); 423 424fun insert_regexp r seen = Finite_Map.insert regexp_compare r () seen; 425fun mem_regexp r seen = Finite_Map.member regexp_compare r seen; 426 427(*---------------------------------------------------------------------------*) 428(* Core regexp compiler. The seen argument holds the already-seen regexps, *) 429(* which become states in the final DFA. The n argument is a step-index used *) 430(* to ensure that the function terminates. *) 431(*---------------------------------------------------------------------------*) 432 433fun brzozo seen worklist acc n = 434 case worklist 435 of [] => SOME(seen,acc) 436 | r::t => 437 if n <= 0 then NONE else 438 if mem_regexp r seen 439 then brzozo seen t acc (n-1) 440 else let val translist = transitions r 441 in brzozo 442 (insert_regexp r seen) 443 (remove_dups (map snd translist) @ t) 444 (build_table translist r acc) 445 (n-1) 446 end; 447 448val bigIndex = 2147483647; 449 450fun get_accepts fmap = 451 mergesort (fn x => fn y => x <= y) 452 (Finite_Map.foldrWithKey 453 (fn r => fn n => fn nlist => if nullable r then n::nlist else nlist) 454 [] fmap); 455 456fun alist_to_vec list default n max = 457 if n=0 then [] else 458 case list 459 of [] => default :: alist_to_vec [] default (n-1) max 460 | ((x,y)::t) => 461 if n <= max then 462 (if x = max - n then y :: alist_to_vec t default (n-1) max 463 else if x < max - n then alist_to_vec t default n max 464 else default :: alist_to_vec list default (n-1) max) 465 else []; 466 467fun accepts_to_final_vector accept_states max_state = 468 alist_to_vec (map (fn x => (x,true)) accept_states) 469 false max_state max_state; 470 471fun table_to_vectors table = 472 map (fn p => case p of (k,table2) => 473 alist_to_vec 474 (mergesort (inv_image (fn x => fn y => x <= y) fst) 475 (map (fn p1 => case p1 of (c,n) => (Char.ord c,SOME n)) table2)) 476 NONE alphabet_size alphabet_size) 477 (mergesort (inv_image (fn x => fn y => x <= y) fst) table); 478 479fun compile_regexp r = 480 let val s = normalize r 481 in case the (brzozo Finite_Map.empty [s] 482 (1, Finite_Map.singleton s 0, []) bigIndex) 483 of (states,(last_state,state_numbering,table)) 484 => let val delta_vecs = table_to_vectors table 485 val final = accepts_to_final_vector 486 (get_accepts state_numbering) last_state 487 in (state_numbering, delta_vecs, final) 488 end 489 end; 490 491fun exec_dfa final_states table n string = 492 case string 493 of [] => el n final_states 494 | c::t => 495 case el (Char.ord c) (el n table) 496 of NONE => false 497 | SOME k => exec_dfa final_states table k t; 498 499(*---------------------------------------------------------------------------*) 500(* Returns a function of type :string -> bool *) 501(*---------------------------------------------------------------------------*) 502 503fun matcher r = 504 case compile_regexp r 505 of (state_numbering,delta,final) 506 => let val start_state = the (Finite_Map.lookup regexp_compare 507 (normalize r) state_numbering) 508 val match_string = exec_dfa final delta start_state 509 in 510 {table = map (map Option.valOf) delta, 511 start = start_state, 512 final = final, 513 matchfn = fn s => match_string (String.explode s)} 514 end; 515 516 517(*---------------------------------------------------------------------------*) 518(* Vector-based DFA instead of list-based. *) 519(*---------------------------------------------------------------------------*) 520 521fun vector_matcher r = 522 let val {table,start,final,matchfn} = matcher r 523 val vfinal = Vector.fromList final 524 val vtable = Vector.fromList (List.map Vector.fromList table) 525 fun run n [] = Vector.sub(vfinal,n) 526 | run n (c::t) = run (Vector.sub(Vector.sub(vtable,n), Char.ord c)) t 527 val match_string = run start 528 in {start = start, 529 table = vtable, 530 final = vfinal, 531 matchfn = fn s => match_string (String.explode s)} 532 end 533 534(*---------------------------------------------------------------------------*) 535(* Just compute the states; no tracking of info needed to construct the rest *) 536(* of the DFA. *) 537(*---------------------------------------------------------------------------*) 538 539(*---------------------------------------------------------------------------*) 540(* Optimization of certain annoying (because slow to compile) Or regexps. *) 541(* *) 542(* The following code asserts that *) 543(* *) 544(* smart_deriv c (Or [w, wr_1, ..., wr_n, EPSILON]) *) 545(* *) 546(* is equal to *) 547(* *) 548(* Or [r_1, ..., r_n, EPSILON] OR EMPTY *) 549(* *) 550(* (where w is a charset) depending on whether c is in the charset or not. *) 551(* *) 552(* Assumption: the input Or regexp is in normal form. That would mean that *) 553(* EPSILON comes after all the charsets and Cats, so should be at the end. *) 554(* Also, if args is the list constructed by Or, len(args) > 1 so there is a *) 555(* last element x of args, and a separate head element w. Also, w can't be *) 556(* the empty charset. If w is the full charset then EMPTY is not returned. *) 557(*---------------------------------------------------------------------------*) 558 559val tracing = ref true; 560 561fun kprint s = if !tracing then print s else (); 562 563val _ = Feedback.register_btrace("regexp-compiler",tracing); 564 565fun drop w (Cat(a,b)) = 566 (case regexp_compare w a 567 of EQUAL => b 568 | otherwise => raise ERR "drop" "") 569 | drop r _ = raise ERR "drop" ""; 570 571fun lift_cset_from_or rlist = 572 case rlist 573 of (w as Chset _) :: t => 574 let val (t',x) = Lib.front_last t 575 val t'' = mapfilter (drop w) t' 576 in 577 if x = EPSILON andalso length t' = length t'' 578 then let val _ = kprint "\n\n Optimization SUCCEEDS!!\n\n" 579 val trimmed = build_or (t'' @ [EPSILON]) 580 in if w = SIGMA 581 then SOME [trimmed] 582 else SOME [EMPTY,trimmed] 583 end 584 else NONE 585 end 586 | otherwise => NONE 587 588fun uniq rlist = remove_dups (mergesort regexp_leq rlist); 589 590fun transitions r = 591 case r of 592 Or rlist => 593 (case lift_cset_from_or rlist 594 of SOME rlist' => rlist' 595 | NONE => uniq(List.map (fn c => smart_deriv c r) alphabet) 596 ) 597 | otherwise => 598 uniq(List.map (fn c => smart_deriv c r) alphabet); 599 600fun dom_brzozo seen [] = seen 601 | dom_brzozo seen (r::t) = 602 (kprint ("Worklist length: "^Int.toString(length t + 1)^". Head:\n ") 603 ; if !tracing then print_regexp r else () 604 ; kprint "\n" 605 ; 606 if mem_regexp r seen 607 then (kprint "already seen; trying next element.\n\n" 608 ; 609 dom_brzozo seen t 610 ) 611 else let val _ = kprint ("will be a new state (size: "^Int.toString (PolyML.objSize r)^"). ") 612 val prospects = transitions r 613 val _ = kprint ("Successors: "^Int.toString (length prospects)^"\n\n") 614 in dom_brzozo (insert_regexp r seen) 615 (prospects @ t) 616 end 617 ); 618 619fun domBrz r = 620 let open regexpMisc 621 val _ = stdErr_print "Starting Brzozowski.\n" 622 val states = time (dom_brzozo Finite_Map.empty) [normalize r] 623 val _ = stdErr_print ("states: "^Int.toString (Finite_Map.size states)^"\n"); 624 in 625 () 626 end; 627 628end 629