1structure Regexp_Match :> Regexp_Match =
2struct
3
4open Lib Feedback Regexp_Type;
5
6val ERR = mk_HOL_ERR "Regexp_Match";
7
8type regexp = Regexp_Type.regexp;
9
10fun regexp_compare r1 r2 = Regexp_Type.regexp_compare(r1,r2);
11
12fun regexp_leq r s =
13  case regexp_compare r s
14   of LESS => true
15    | EQUAL => true
16    | GREATER => false
17;
18
19(*---------------------------------------------------------------------------*)
20(* Taken from cakeML version, itself derived from the HOL model              *)
21(*---------------------------------------------------------------------------*)
22
23val the = Option.valOf;
24
25fun inv_image rel f x y = rel (f x) (f y);
26
27fun el n list = if n = 0 then hd list else el (n-1) (tl list);
28
29fun upto b t = if b <= t then b::upto (b+1) t else [];;
30
31fun genlist f n = if n<0 then [] else map f (upto 0 (n-1));
32
33fun drop n l = if n<=0 then l else drop (n-1) (tl l);
34
35(*---------------------------------------------------------------------------*)
36(* merge sort                                                                *)
37(*---------------------------------------------------------------------------*)
38
39local
40fun merge rel l1 l2 =
41 case (l1,l2)
42  of ([],[]) => []
43   | ([],_) => l2
44   | (_,[]) => l1
45   | (x::t1,y::t2) =>
46       if rel x y then
47          x::merge rel t1 l2 else
48          y::merge rel l1 t2;
49
50fun sort2 rel x y = if rel x y then [x, y] else [y, x];
51
52fun sort3 rel x y z =
53 if rel x y then
54     if rel y z then [x, y, z]
55     else if rel x z then [x, z, y]
56     else [z, x, y]
57 else if rel y z then if rel x z then [y, x, z] else [y, z, x]
58      else [z, y, x];
59
60fun mergesortN rel n l =
61 if n = 0 then [] else
62 if n = 1 then (if null l then [] else [hd l]) else
63 if n = 2
64   then (case l
65          of (x::y::t) => sort2 rel x y
66           | other => other) else
67 if n = 3
68   then (case l
69          of (x::y::z::t) => sort3 rel x y z
70           | [x,y] => sort2 rel x y
71           | other => other)
72 else
73 let val len1 = n div 2
74 in merge rel (mergesortN rel len1 l)
75              (mergesortN rel (n - len1) (drop len1 l))
76 end;
77
78in
79  fun mergesort rel l = mergesortN rel (length l) l
80end
81
82(*---------------------------------------------------------------------------*)
83(* Finite maps, from examples/balanced_mapTheory.                            *)
84(*---------------------------------------------------------------------------*)
85
86structure Finite_Map =
87struct
88
89datatype ('k,'v) balanced_map
90  = Tip
91  | Bin of int * 'k * 'v * ('k,'v)balanced_map * ('k,'v) balanced_map;
92
93val ratio = 2;
94val delta  = 3;
95
96val empty = Tip;
97
98fun singleton k x = Bin(1,k,x,Tip,Tip);
99
100fun size fmap =
101 case fmap
102  of Tip => 0
103   | Bin (s,k,v,l,r) => s;
104
105fun member cmp k fmap =
106 case fmap
107  of Tip => false
108   | Bin (s,k', v, l, r) =>
109     case cmp k k'
110      of LESS => member cmp k l
111       | GREATER => member cmp k r
112       | EQUAL => true;
113
114fun lookup cmp k fmap =
115 case fmap
116  of Tip => NONE
117   | Bin(s, k', v, l, r) =>
118      case cmp k k'
119       of LESS => lookup cmp k l
120        | GREATER => lookup cmp k r
121        | EQUAL => SOME v;
122
123fun foldrWithKey f z fmap =
124 case fmap
125  of Tip => z
126   | Bin(u, kx, x, l, r) =>
127        foldrWithKey f (f kx x (foldrWithKey f z r)) l;
128
129fun balanceL k x fmap1 fmap2 =
130 case (fmap1,fmap2)
131  of (Tip,Tip)
132      => Bin (1, k, x, Tip, Tip)
133   | (Bin (s', k', v', Tip, Tip), Tip)
134      => Bin (2, k, x, Bin (s', k', v', Tip, Tip), Tip)
135   | (Bin (u1, lk, lx, Tip, Bin(u2, lrk, lrx, u3, u4)), Tip)
136      => Bin(3, lrk, lrx, Bin(1, lk, lx, Tip, Tip), Bin(1,k, x, Tip, Tip))
137   | (Bin(u5, lk, lx, Bin (s', k', v', l', r'), Tip), Tip)
138      => Bin(3, lk, lx, Bin(s', k', v', l', r'), Bin(1, k, x, Tip, Tip))
139   | (Bin(ls, lk, lx, Bin(lls, k', v', l', r'), Bin(lrs, lrk, lrx, lrl, lrr)), Tip)
140      => if lrs < ratio*lls
141          then
142            Bin (1+ls, lk, lx, Bin(lls, k', v', l', r'),
143                 Bin (1+lrs, k, x, Bin (lrs, lrk, lrx, lrl, lrr), Tip))
144          else
145            Bin (1+ls, lrk, lrx,
146                 Bin (1+lls+size lrl, lk, lx, Bin(lls, k', v', l', r'), lrl),
147                 Bin (1+size lrr, k, x, lrr, Tip))
148   | (Tip, Bin(rs, k', v', l', r'))
149      => Bin (1+rs, k, x, Tip, Bin(rs, k', v', l', r'))
150   | (Bin(ls, lk, lx, ll, lr), Bin(rs, k', v', l', r'))
151      => if ls > delta*rs
152          then (case (ll, lr)
153                 of (Bin (lls, u6, u7, u8, u9), Bin(lrs, lrk, lrx, lrl, lrr))
154                    => if lrs < ratio*lls then
155                        Bin (1+ls+rs, lk, lx, ll,
156                             Bin (1+rs+lrs, k, x, lr, Bin(rs, k', v', l', r')))
157                      else
158                        Bin (1+ls+rs, lrk, lrx,
159                             Bin (1+lls+size lrl, lk, lx, ll, lrl),
160                             Bin (1+rs+size lrr, k, x, lrr, Bin (rs, k', v', l', r')))
161                  | (u10, u11) => Tip (* error "Failure in Data.Map.balanceL" *)
162               )
163         else
164            Bin (1+ls+rs, k, x, Bin (ls, lk, lx, ll, lr), Bin(rs, k', v', l', r'));
165
166fun balanceR k x fmap1 fmap2 =
167 case (fmap1,fmap2)
168  of (Tip,Tip)
169      => Bin(1, k, x, Tip, Tip)
170   | (Tip, Bin(s', k', v', Tip, Tip))
171     => Bin(2, k, x, Tip, Bin(s', k', v', Tip, Tip))
172   | (Tip,Bin(u1, rk, rx, Tip, Bin(s', k', v', l', r')))
173     => Bin(3, rk, rx, Bin(1, k, x, Tip, Tip), Bin(s', k', v', l', r'))
174   | (Tip, Bin(u2, rk, rx, Bin(u3, rlk, rlx, u4, u5), Tip))
175      => Bin(3, rlk, rlx,
176             Bin(1, k, x, Tip, Tip),
177             Bin(1, rk, rx, Tip, Tip))
178   | (Tip, Bin(rs, rk, rx, Bin(rls, rlk, rlx, rll, rlr),
179                           Bin(rrs, k', v', l', r')))
180     => if rls < ratio*rrs
181        then
182          Bin (1+rs, rk, rx,
183               Bin (1+rls,k, x, Tip, Bin(rls, rlk, rlx, rll, rlr)),
184               Bin(rrs, k', v', l', r'))
185        else
186          Bin (1+rs, rlk, rlx,
187               Bin (1+size rll, k, x, Tip, rll),
188               Bin (1+rrs+size rlr, rk, rx, rlr, Bin(rrs, k', v', l', r')))
189   | (Bin(ls, k', v', l', r'), Tip)
190     =>  Bin (1+ls, k, x, Bin(ls, k', v', l', r'), Tip)
191   | (Bin(ls, k', v', l', r'), Bin(rs, rk, rx, rl, rr))
192     => if rs > delta*ls
193        then
194         (case (rl, rr)
195           of (Bin(rls, rlk, rlx, rll, rlr), Bin(rrs, u6, u7,u8,u9))
196               => if rls < ratio*rrs
197                  then Bin (1+ls+rs, rk, rx,
198                            Bin (1+ls+rls, k, x, Bin(ls, k', v', l', r'), rl),
199                            rr)
200                  else Bin (1+ls+rs, rlk, rlx,
201                            Bin (1+ls+size rll, k, x, Bin(ls, k', v', l', r'), rll),
202                            Bin (1+rrs+size rlr, rk, rx, rlr, rr))
203            | (u10,u11) => Tip (* error "Failure in Data.Map.balanceR" *)
204          )
205        else
206           Bin (1+ls+rs, k, x,
207                Bin(ls, k', v', l', r'), Bin(rs, rk,rx,rl, rr));
208
209fun insert cmp k v fmap =
210 case fmap
211  of Tip => singleton k v
212   | Bin(s, k', v', l, r) =>
213       case cmp k k'
214        of LESS => balanceL k' v' (insert cmp k v l) r
215         | GREATER => balanceR k' v' l (insert cmp k v r)
216         | EQUAL => Bin(s,k,v,l,r);
217
218end; (* structure Finite_Map *)
219
220(*---------------------------------------------------------------------------*)
221(* Support for smart derivatives                                             *)
222(*---------------------------------------------------------------------------*)
223
224fun build_char_set cs = Chset cs;
225
226fun assoc_cat r s =
227  case r
228   of Cat(r1,r2) => Cat(r1,assoc_cat r2 s)
229    | otherwise => Cat(r,s);
230
231fun build_cat r s =
232   if (r = EMPTY) orelse (s = EMPTY) then EMPTY else
233   if r = EPSILON then s else
234   if s = EPSILON then r
235   else assoc_cat r s;
236
237fun build_neg r =
238  case r
239   of Neg s => s
240    | otherwise => Neg r;
241
242fun build_star r =
243  case r
244   of Star s => r
245    | otherwise => Star r;
246
247fun flatten_or rlist =
248 case rlist
249  of [] => []
250   | Or rs::t => rs @ flatten_or t
251   | r::rs => r :: flatten_or rs;
252
253(*---------------------------------------------------------------------------*)
254(* Requires all charsets to be leftmost in rs. Mergesort takes care of that. *)
255(*---------------------------------------------------------------------------*)
256
257fun merge_charsets list =
258 case list
259  of (Chset a::Chset b::t) => merge_charsets (Chset(charset_union a b)::t)
260   | otherwise => list;
261
262fun remove_dups rlist =
263 case rlist
264  of [] => []
265   | [r] => [r]
266   | r::s::t =>
267      if regexp_compare r s = EQUAL then
268        remove_dups (s::t)
269      else
270        r::remove_dups (s::t);
271
272fun build_or rlist =
273 let val rlist' =
274         remove_dups
275           (merge_charsets
276              (mergesort regexp_leq
277                 (flatten_or rlist)))
278 in
279  if mem (Neg EMPTY) rlist' then Neg EMPTY else
280  if mem (Star SIGMA) rlist' then Neg EMPTY
281  else
282  case rlist'
283   of [] => EMPTY
284    | [r] => r
285    | [Chset cs, r] => (if r = Neg (Chset cs) then Neg EMPTY else
286                        if cs = charset_empty then r
287                        else Or rlist')
288    | Chset cs :: t =>
289       (if mem (Neg (Chset cs)) t then Neg EMPTY else
290        if cs = charset_empty then Or t
291        else Or rlist')
292    | _ => Or rlist'
293end;
294
295fun normalize r =
296 case r
297  of Chset cs => build_char_set cs
298   | Cat(s,t) => build_cat (normalize s) (normalize t)
299   | Star s   => build_star (normalize s)
300   | Or rs    => build_or (List.map normalize rs)
301   | Neg s    => build_neg (normalize s);
302
303(*---------------------------------------------------------------------------*)
304(* Does a regexp admit EPSILON (the empty string)                            *)
305(*---------------------------------------------------------------------------*)
306
307fun nullable r =
308 case r
309  of Chset cs => false
310   | Star s   => true
311   | Cat(s,t) => nullable s andalso nullable t
312   | Or rs    => List.exists nullable (List.rev rs)
313   | Neg s    => not(nullable s);
314
315(*---------------------------------------------------------------------------*)
316(* Take derivative, then apply smart constructors.                           *)
317(*---------------------------------------------------------------------------*)
318
319fun smart_deriv c r =
320 case r
321  of Chset cs => if charset_mem c cs then EPSILON else EMPTY
322   | Cat(Chset cs,t) => if charset_mem c cs then t else EMPTY
323   | Cat(s,t) =>
324      let val dr = build_cat (smart_deriv c s) t
325      in if nullable s
326          then build_or [dr, smart_deriv c t]
327          else dr
328      end
329   | Star s => build_cat (smart_deriv c s) (build_star s)
330   | Or rs  => build_or (map (smart_deriv c) rs)
331   | Neg s  => build_neg (smart_deriv c s);
332
333(*---------------------------------------------------------------------------*)
334(* Cache applications of smart_deriv. Unhelpful right now ... might be too   *)
335(* fine-grained.                                                             *)
336(*---------------------------------------------------------------------------*)
337(*
338type table = (char * regexp,regexp) Redblackmap.dict
339type cache = table ref
340
341fun regexp_cmp (r1,r2) = regexp_compare r1 r2;
342
343fun new_table() =
344  Redblackmap.mkDict (pair_compare (Char.compare,regexp_cmp)) : table
345
346
347local
348  val cache = ref (new_table()) : cache
349  fun lookup x = Redblackmap.peek (!cache, x)
350  fun store (x,y) = (cache := Redblackmap.insert(!cache,x,y))
351  fun storable (Chset _) = false
352    | storable other = true
353in
354fun clear_cache () = (cache := new_table())
355fun cached_values () = Redblackmap.listItems (!cache)
356
357fun smart_deriv c r =
358 if storable r then
359  (case lookup (c,r)
360    of SOME y => y
361     | NONE =>
362       let val r' =
363           (case r
364             of Chset cs => if charset_mem c cs then EPSILON else EMPTY
365              | Cat(Chset cs,t) => if charset_mem c cs then t else EMPTY
366              | Cat(s,t) =>
367                let val dr = build_cat (smart_deriv c s) t
368                in if nullable s
369                    then build_or [dr, smart_deriv c t]
370                    else dr
371                end
372             | Star s => build_cat (smart_deriv c s) (build_star s)
373             | Or rs  => build_or (map (smart_deriv c) rs)
374             | Neg s  => build_neg (smart_deriv c s))
375      in
376         store((c,r),r')
377       ; r'
378      end)
379  else
380   (case r
381     of Chset cs => if charset_mem c cs then EPSILON else EMPTY
382      | Cat(Chset cs,t) => if charset_mem c cs then t else EMPTY
383      | Cat(s,t) =>
384        let val dr = build_cat (smart_deriv c s) t
385        in if nullable s
386           then build_or [dr, smart_deriv c t]
387           else dr
388        end
389      | Star s => build_cat (smart_deriv c s) (build_star s)
390      | Or rs  => build_or (map (smart_deriv c) rs)
391      | Neg s  => build_neg (smart_deriv c s))
392end
393*)
394
395(*---------------------------------------------------------------------------*)
396(* Support for the core regexp compiler.                                     *)
397(*---------------------------------------------------------------------------*)
398
399fun transitions r = List.map (fn c => (c,smart_deriv c r)) alphabet;
400
401fun extend_states next_state state_map trans translist =
402 case translist
403  of [] => (next_state,state_map,trans)
404   | (c,s)::t =>
405       case Finite_Map.lookup regexp_compare s state_map
406        of SOME n => extend_states next_state state_map ((c,n)::trans) t
407         | NONE   => extend_states (next_state + 1)
408                        (Finite_Map.insert regexp_compare
409                                   s next_state state_map)
410                        ((c,next_state)::trans)
411                        t;
412
413fun build_table translist r arg =
414 case arg of
415      (next_state,state_map,table) =>
416 case extend_states next_state state_map [] translist of
417      (next_state,state_map,trans) =>
418 case Finite_Map.lookup regexp_compare r state_map
419  of SOME n => (next_state, state_map, (n,trans)::table)
420   | NONE   => (next_state + 1,
421                Finite_Map.insert regexp_compare r next_state state_map,
422                (next_state,trans)::table);
423
424fun insert_regexp r seen = Finite_Map.insert regexp_compare r () seen;
425fun mem_regexp r seen    = Finite_Map.member regexp_compare r seen;
426
427(*---------------------------------------------------------------------------*)
428(* Core regexp compiler. The seen argument holds the already-seen regexps,   *)
429(* which become states in the final DFA. The n argument is a step-index used *)
430(* to ensure that the function terminates.                                   *)
431(*---------------------------------------------------------------------------*)
432
433fun brzozo seen worklist acc n =
434 case worklist
435  of [] => SOME(seen,acc)
436   | r::t =>
437     if n <= 0 then NONE else
438     if mem_regexp r seen
439       then brzozo seen t acc (n-1)
440        else let val translist = transitions r
441             in brzozo
442                  (insert_regexp r seen)
443                  (remove_dups (map snd translist) @ t)
444                  (build_table translist r acc)
445                  (n-1)
446             end;
447
448val bigIndex = 2147483647;
449
450fun get_accepts fmap =
451 mergesort (fn x => fn y => x <= y)
452  (Finite_Map.foldrWithKey
453      (fn r => fn n => fn nlist => if nullable r then n::nlist else nlist)
454      [] fmap);
455
456fun alist_to_vec list default n max =
457 if n=0 then [] else
458 case list
459  of [] => default :: alist_to_vec [] default (n-1) max
460   | ((x,y)::t) =>
461      if n <= max then
462        (if x = max - n then y :: alist_to_vec t default (n-1) max
463         else if x < max - n then alist_to_vec t default n max
464         else default :: alist_to_vec list default (n-1) max)
465      else [];
466
467fun accepts_to_final_vector accept_states max_state =
468  alist_to_vec (map (fn x => (x,true)) accept_states)
469               false max_state max_state;
470
471fun table_to_vectors table =
472 map (fn p => case p of (k,table2) =>
473       alist_to_vec
474           (mergesort (inv_image (fn x => fn y => x <= y) fst)
475              (map (fn p1 => case p1 of (c,n) => (Char.ord c,SOME n)) table2))
476           NONE alphabet_size alphabet_size)
477     (mergesort (inv_image (fn x => fn y => x <= y) fst) table);
478
479fun compile_regexp r =
480 let val s = normalize r
481 in case the (brzozo Finite_Map.empty [s]
482                     (1, Finite_Map.singleton s 0, []) bigIndex)
483    of (states,(last_state,state_numbering,table))
484    => let val delta_vecs = table_to_vectors table
485           val final = accepts_to_final_vector
486                            (get_accepts state_numbering) last_state
487       in (state_numbering, delta_vecs, final)
488       end
489 end;
490
491fun exec_dfa final_states table n string =
492 case string
493  of [] => el n final_states
494   | c::t =>
495       case el (Char.ord c) (el n table)
496        of NONE => false
497         | SOME k => exec_dfa final_states table k t;
498
499(*---------------------------------------------------------------------------*)
500(* Returns a function of type :string -> bool                                *)
501(*---------------------------------------------------------------------------*)
502
503fun matcher r =
504 case compile_regexp r
505  of (state_numbering,delta,final)
506  => let val start_state = the (Finite_Map.lookup regexp_compare
507                                      (normalize r) state_numbering)
508         val match_string = exec_dfa final delta start_state
509     in
510        {table = map (map Option.valOf) delta,
511         start = start_state,
512         final = final,
513         matchfn = fn s =>  match_string (String.explode s)}
514     end;
515
516
517(*---------------------------------------------------------------------------*)
518(* Vector-based DFA instead of list-based.                                   *)
519(*---------------------------------------------------------------------------*)
520
521fun vector_matcher r =
522 let val {table,start,final,matchfn} = matcher r
523     val vfinal  = Vector.fromList final
524     val vtable = Vector.fromList (List.map Vector.fromList table)
525     fun run n [] = Vector.sub(vfinal,n)
526       | run n (c::t) = run (Vector.sub(Vector.sub(vtable,n), Char.ord c)) t
527     val match_string = run start
528 in {start = start,
529     table = vtable,
530     final = vfinal,
531     matchfn = fn s => match_string (String.explode s)}
532 end
533
534(*---------------------------------------------------------------------------*)
535(* Just compute the states; no tracking of info needed to construct the rest *)
536(* of the DFA.                                                               *)
537(*---------------------------------------------------------------------------*)
538
539(*---------------------------------------------------------------------------*)
540(* Optimization of certain annoying (because slow to compile) Or regexps.    *)
541(*                                                                           *)
542(* The following code asserts that                                           *)
543(*                                                                           *)
544(*  smart_deriv c (Or [w, wr_1, ..., wr_n, EPSILON])                         *)
545(*                                                                           *)
546(* is equal to                                                               *)
547(*                                                                           *)
548(*  Or [r_1, ..., r_n, EPSILON] OR EMPTY                                     *)
549(*                                                                           *)
550(* (where w is a charset) depending on whether c is in the charset or not.   *)
551(*                                                                           *)
552(* Assumption: the input Or regexp is in normal form. That would mean that   *)
553(* EPSILON comes after all the charsets and Cats, so should be at the end.   *)
554(* Also, if args is the list constructed by Or, len(args) > 1 so there is a  *)
555(* last element x of args, and a separate head element w. Also, w can't be   *)
556(* the empty charset. If w is the full charset then EMPTY is not returned.   *)
557(*---------------------------------------------------------------------------*)
558
559val tracing = ref true;
560
561fun kprint s = if !tracing then print s else ();
562
563val _ = Feedback.register_btrace("regexp-compiler",tracing);
564
565fun drop w (Cat(a,b)) =
566     (case regexp_compare w a
567       of EQUAL => b
568        | otherwise => raise ERR "drop" "")
569  | drop r _ = raise ERR "drop" "";
570
571fun lift_cset_from_or rlist =
572 case rlist
573  of (w as Chset _) :: t =>
574      let val (t',x) = Lib.front_last t
575          val t'' = mapfilter (drop w) t'
576      in
577        if x = EPSILON andalso length t' = length t''
578        then let val _ = kprint "\n\n Optimization SUCCEEDS!!\n\n"
579                 val trimmed = build_or (t'' @ [EPSILON])
580             in if w = SIGMA
581                then SOME [trimmed]
582                else SOME [EMPTY,trimmed]
583             end
584        else NONE
585      end
586   | otherwise => NONE
587
588fun uniq rlist = remove_dups (mergesort regexp_leq rlist);
589
590fun transitions r =
591 case r of
592  Or rlist =>
593     (case lift_cset_from_or rlist
594       of SOME rlist' => rlist'
595        | NONE => uniq(List.map (fn c => smart_deriv c r) alphabet)
596     )
597   | otherwise =>
598      uniq(List.map (fn c => smart_deriv c r) alphabet);
599
600fun dom_brzozo seen [] = seen
601  | dom_brzozo seen (r::t) =
602      (kprint ("Worklist length: "^Int.toString(length t + 1)^". Head:\n  ")
603       ; if !tracing then print_regexp r else ()
604       ; kprint "\n"
605       ;
606        if mem_regexp r seen
607         then (kprint "already seen; trying next element.\n\n"
608               ;
609               dom_brzozo seen t
610              )
611         else let val _ = kprint ("will be a new state (size: "^Int.toString (PolyML.objSize r)^"). ")
612                  val prospects = transitions r
613                  val _ = kprint ("Successors: "^Int.toString (length prospects)^"\n\n")
614              in dom_brzozo (insert_regexp r seen)
615                            (prospects @ t)
616              end
617        );
618
619fun domBrz r =
620 let open regexpMisc
621     val _ = stdErr_print "Starting Brzozowski.\n"
622     val states = time (dom_brzozo Finite_Map.empty) [normalize r]
623     val _ = stdErr_print ("states: "^Int.toString (Finite_Map.size states)^"\n");
624 in
625   ()
626 end;
627
628end
629