1(*===========================================================================*)
2(* Regular expression matcher                                                *)
3(* Author: Scott Owens                                                       *)
4(*===========================================================================*)
5
6(*---------------------------------------------------------------------------*)
7(* SML code                                                                  *)
8(*                                                                           *)
9(*  datatype ''a re                                                          *)
10(*        = Epsilon                                                          *)
11(*           | Charset of ''a list                                           *)
12(*           | Or      of ''a re * ''a re                                    *)
13(*           | Then    of ''a re * ''a re                                    *)
14(*           | Repeat  of ''a re;                                            *)
15(*                                                                           *)
16(* Match: ''a re list -> ''a list -> ''a re list option -> bool              *)
17(*                                                                           *)
18(* fun Match [] [] _ = true                                                  *)
19(*   | Match [] _  _ = false                                                 *)
20(*   | Match (Epsilon::t) w s = Match t w s                                  *)
21(*   | Match (Charset c::t) [] s = false                                     *)
22(*   | Match (Charset c::t) (d::w) s = c d andalso Match t w NONE            *)
23(*   | Match (Or(r1,r2)::t) w s = Match (r1::t) w s orelse Match (r2::t) w s *)
24(*   | Match (Then(r1,r2)::t) w s = Match (r1::r2::t) w s                    *)
25(*   | Match (Repeat r::t) w s =                                             *)
26(*       let val R = SOME (Repeat r::t)                                      *)
27(*       in if s = R then false                                              *)
28(*          else Match t w s orelse Match (r::Repeat r::t) w R               *)
29(*       end                                                                 *)
30(*                                                                           *)
31(*---------------------------------------------------------------------------*)
32
33
34(* ------------------------------------------------------------------------- *)
35(*  HOL Preliminaries                                                        *)
36(* ------------------------------------------------------------------------- *)
37
38quietdec := true;
39app load ["stringLib"]; 
40open pairTheory arithmeticTheory listTheory optionTheory;
41quietdec := false;
42
43val thm_counter = Count.mk_meter();
44
45(*---------------------------------------------------------------------------*)
46(* Change free variable names to desired ones. Takes a list of (old,new)     *)
47(* pairs (strings).                                                          *)
48(*---------------------------------------------------------------------------*)
49
50fun RENAME_FREES_TAC [] = ALL_TAC
51  | RENAME_FREES_TAC ((old,new)::t) = 
52     Q.SPEC_TAC([QUOTE old],[QUOTE new]) THEN GEN_TAC THEN RENAME_FREES_TAC t;
53
54(*---------------------------------------------------------------------------*)
55(* Miscellaneous lemma; should already exist in HOL                          *)
56(*---------------------------------------------------------------------------*)
57
58val MEM_EXISTS_LEM = Q.prove
59(`!x l. MEM x l = EXISTS (\y. y = x) l`,
60 Induct_on `l` THEN EVAL_TAC THEN METIS_TAC []);
61
62val _ = new_theory "regexp";
63
64(*--------------------------------------------------------------------------*)
65(* Datatype of regular expressions. Note that by having Charset take a      *)
66(* characteristic function means that regexp is not computably testable for *)
67(* equality. Alternative: have Charset be a list or a finite map ...        *)
68(*--------------------------------------------------------------------------*)
69
70Hol_datatype 
71 `regexp = Epsilon                   (* Empty string *)
72         | Charset of 'a -> bool     (* Character set *)
73         | Or of regexp => regexp    (* Union *)
74         | Then of regexp => regexp  (* Concatenation *)
75         | Repeat of regexp`;        (* Iterated concat, >= 0 *)
76
77(*---------------------------------------------------------------------------*)
78(* Parser fiddling to get | and # as infixes - we have to first get rid      *)
79(* of their pre-defined behaviour.                                           *)
80(*---------------------------------------------------------------------------*)
81
82val _ = overload_on ("+", Term`$Or`);
83val _ = overload_on ("#", Term`$Then`);
84
85val _ = set_fixity "+" (Infixr 501);
86val _ = set_fixity "#" (Infixr 601);
87
88val regexp_cases = TypeBase.nchotomy_of ``:'a regexp``;
89
90(*---------------------------------------------------------------------------*)
91(* Semantics of regular expressions. sem r w means that word w (represented  *)
92(* as a list) is in the language of regular expression r.                    *)
93(*---------------------------------------------------------------------------*)
94
95val sem_def = 
96 Define
97  `(sem Epsilon w     = (w = []))                                         /\
98   (sem (Charset C) w = (?x. (w = [x]) /\ C x))                           /\
99   (sem (r1 + r2) w   = sem r1 w \/ sem r2 w)                             /\
100   (sem (r1 # r2) w   = ?w1 w2. (w = w1 ++ w2) /\ sem r1 w1 /\ sem r2 w2) /\
101   (sem (Repeat r) w  = ?wlist. (w = FLAT wlist) /\ EVERY (sem r) wlist)`;
102
103(*---------------------------------------------------------------------------*)
104(* Some basic equivalences for regular expressions.                          *)
105(*---------------------------------------------------------------------------*)
106
107val regexp_repeat_thm = Q.prove
108(`!r w.sem (Repeat r) w = sem (Epsilon + (r # (Repeat r))) w`,
109 RW_TAC std_ss [] THEN EQ_TAC THEN RW_TAC list_ss [sem_def, FLAT] 
110 THENL [Cases_on `wlist` THEN RW_TAC list_ss [FLAT], ALL_TAC, ALL_TAC]
111 THEN METIS_TAC [FLAT,EVERY_DEF]);
112
113val repeat_to_ncat_thm = Q.prove
114(`!r w.sem (Repeat r) w ==> ?n. sem (FUNPOW ($# r) n Epsilon) w`,
115 RW_TAC list_ss [sem_def, FLAT, FUNPOW_SUC] 
116   THEN Q.EXISTS_TAC `LENGTH wlist` 
117   THEN Induct_on `wlist` 
118   THEN RW_TAC list_ss [sem_def, FLAT, FUNPOW_SUC]
119   THEN EVAL_TAC THEN METIS_TAC []);
120
121val ncat_to_repeat_thm = Q.prove
122(`!r w n. sem (FUNPOW ($# r) n Epsilon) w ==> sem (Repeat r) w`,
123 Induct_on `n` THENL [ALL_TAC, POP_ASSUM MP_TAC] THEN EVAL_TAC
124   THEN RW_TAC list_ss [sem_def,FUNPOW_SUC] THENL
125   [METIS_TAC [FLAT, EVERY_DEF],
126    RES_TAC THEN Q.EXISTS_TAC `w1::wlist` THEN METIS_TAC [EVERY_DEF,FLAT]]);
127
128val repeat_equals_ncat_thm = Q.prove
129(`!r w. sem (Repeat r) w = ?n. sem (FUNPOW ($# r) n Epsilon) w`,
130 METIS_TAC [ncat_to_repeat_thm, repeat_to_ncat_thm]);
131
132
133(* ------------------------------------------------------------------------- *)
134(* Regular expression matching algorithm                                     *)
135(* ------------------------------------------------------------------------- *)
136
137val match_defn = 
138 Hol_defn 
139   "match"
140  `(match [] [] _                 = T)
141/\ (match [] _  _                 = F)
142/\ (match (Epsilon::t) w s        = match t w s)
143/\ (match (Charset C::t) [] s     = F)
144/\ (match (Charset C::t) (d::w) s = C d /\ match t w NONE) 
145/\ (match ((r1 + r2)::t) w s      = match (r1::t) w s \/ match (r2::t) w s)
146/\ (match ((r1 # r2)::t) w s      = match (r1::r2::t) w s)
147/\ (match (Repeat r::t) w s       =
148       if s = SOME (Repeat r::t) then
149         F
150       else
151         match t w s \/ match (r::Repeat r::t) w (SOME (Repeat r::t)))`;
152
153(*---------------------------------------------------------------------------*)
154(* Termination proof                                                         *)
155(*---------------------------------------------------------------------------*)
156
157(*---------------------------------------------------------------------------*)
158(* Size of a regular expression, needed because the default metric gives     *)
159(* Epsilon size 0 which won't work for the matcher's termination proof.      *)
160(*---------------------------------------------------------------------------*)
161
162val my_regexp_size_def = 
163 Define
164  `(my_regexp_size Epsilon     = 1) /\
165   (my_regexp_size (Charset _) = 1) /\
166   (my_regexp_size (r1 + r2)   = 1 + my_regexp_size r1 + my_regexp_size r2) /\
167   (my_regexp_size (r1 # r2)   = 1 + my_regexp_size r1 + my_regexp_size r2) /\
168   (my_regexp_size (Repeat r)  = 1 + my_regexp_size r)`;
169
170(*---------------------------------------------------------------------------*)
171(* Star height of a regular expression                                       *)
172(*---------------------------------------------------------------------------*)
173
174val starHeight_def = 
175 Define
176  `(starHeight Epsilon     = 0) /\
177   (starHeight (Charset _) = 0) /\
178   (starHeight (r1 + r2)   = MAX (starHeight r1) (starHeight r2)) /\
179   (starHeight (r1 # r2)   = MAX (starHeight r1) (starHeight r2)) /\
180   (starHeight (Repeat r)  = 1 + starHeight r)`;
181
182val front_starHeight_def = 
183 Define
184  `(front_starHeight [] sl = 0) /\
185   (front_starHeight (h::t) sl =
186      if (sl = SOME (h::t)) /\ ?r. h = Repeat r 
187       then 0
188       else MAX (starHeight h) (front_starHeight t sl))`;
189
190(*---------------------------------------------------------------------------*)
191(* SUM (MAP my_regexp_size res) doesn't count the conses in the list like    *)
192(* the builtin list measure does.  This is important for termination proof.  *)
193(*---------------------------------------------------------------------------*)
194
195val term_meas_def = 
196 Define
197   `term_meas (res, str, stop) =
198     (LENGTH str,
199      front_starHeight res stop,
200      SUM (MAP my_regexp_size res))`;
201
202val (match_def, match_ind) = Defn.tprove
203(match_defn,
204 WF_REL_TAC `inv_image ($< LEX $< LEX $<) term_meas` THEN
205   RW_TAC list_ss [term_meas_def, my_regexp_size_def] THEN
206   PURE_ONCE_REWRITE_TAC [GSYM arithmeticTheory.LESS_OR_EQ] THEN
207   RW_TAC list_ss [front_starHeight_def, starHeight_def]);
208
209
210(* ------------------------------------------------------------------------- *)
211(* Correctness Proof                                                         *)
212(* ------------------------------------------------------------------------- *)
213
214(*---------------------------------------------------------------------------*)
215(* Match implies the semantics                                               *)
216(*---------------------------------------------------------------------------*)
217
218val match_implies_sem = Q.prove
219(`!rl w s. match rl w s ==> sem (FOLDR $# Epsilon rl) w`,
220 recInduct match_ind THEN RW_TAC list_ss [match_def, sem_def] THENL
221 [METIS_TAC [APPEND],
222  METIS_TAC [],
223  METIS_TAC [],
224  FULL_SIMP_TAC list_ss [] THEN METIS_TAC [],
225  MAP_EVERY Q.EXISTS_TAC [`[]`, `w`]
226    THEN RW_TAC list_ss [] THEN METIS_TAC [FLAT,EVERY_DEF],
227  FULL_SIMP_TAC list_ss [] 
228    THEN MAP_EVERY Q.EXISTS_TAC [`w1 ++ FLAT wlist`, `w2'`] 
229    THEN RW_TAC list_ss []
230    THEN METIS_TAC [FLAT,EVERY_DEF]]);
231
232(*---------------------------------------------------------------------------*)
233(* Semantics imply match                                                     *)
234(*---------------------------------------------------------------------------*)
235
236(*---------------------------------------------------------------------------*)
237(* A match sequence is a witness to a path through the match algorithm that  *)
238(* ends with T.                                                              *)
239(*---------------------------------------------------------------------------*)
240
241val m_def = 
242 Define
243  `(m (Epsilon::t,w,s)  x     = (x = (t,w,s))) /\ 
244   (m (Charset C::t,d::w,s) x = (x = (t,w,NONE)) /\ C d) /\
245   (m ((r1 + r2)::t, w, s) x  = (x = (r1::t,w,s)) \/ (x = (r2::t,w,s))) /\
246   (m ((r1 # r2)::t, w, s) x  = (x = (r1::r2::t,w,s))) /\
247   (m (Repeat r::t, w,s) x    = (x = (t, w, s)) \/ 
248                                (x = (r::Repeat r::t, w, SOME(Repeat r::t))))/\
249   (m _ _ = F)`;
250
251
252val match_seq_def = 
253 Define
254  `(match_seq [] = T) /\
255   (match_seq [a] = (FST a = []) /\ (FST (SND a) = [])) /\
256   (match_seq (a::b::rest) = m a b /\ match_seq (b::rest))`;
257
258(*---------------------------------------------------------------------------*)
259(* Induction and case analysis theorems for match sequences                  *)
260(*---------------------------------------------------------------------------*)
261
262val m_ind  = fetch "-" "m_ind";
263val ms_ind = fetch "-" "match_seq_ind";
264
265val m_ind_thm = Q.prove 
266(`!P:'a regexp list # 'a list # 'a regexp list option ->
267     'a regexp list # 'a list # 'a regexp list option -> bool.
268  (!t w s x. P (Epsilon::t,w,s) x) /\
269  (!c t d w s x. P (Charset c::t,d::w,s) x) /\
270  (!r1 r2 t w s x. P ((r1 + r2)::t,w,s) x) /\
271  (!r1 r2 t w s x. P ((r1 # r2)::t,w,s) x) /\
272  (!r t w s x. P (Repeat r::t,w,s) x) /\
273  (!c t s x. P (Charset c::t,[],s) x) /\
274  (!w s. P ([],w) s) ==>
275  !v v1 v2 v3 v4 v5. P (v,v1,v2) (v3,v4,v5)`,
276 METIS_TAC [ABS_PAIR_THM,m_ind]);
277
278val m_thm = SIMP_RULE list_ss [] (Q.prove
279(`!rl w s rl' w' s' r l. 
280      ~(s = SOME (Repeat r::l))   ==>
281       (s' =  SOME (Repeat r::l)) ==>
282       m (rl, w, s) (rl', w', s') ==> 
283      ((rl = Repeat r::l) /\ (w = w'))`,
284 recInduct m_ind_thm THEN RW_TAC list_ss [m_def]));
285
286val m_thm2 = Q.prove
287(`!rl w s rl' w' s'. ~(w = w') ==> m (rl, w, s) (rl',w',s') ==> (s' = NONE)`,
288 recInduct m_ind_thm THEN RW_TAC list_ss [m_def]);
289
290val cases_lemma = Q.prove
291(`!a: 'a regexp list # 'a list # 'a regexp list option.
292     (?x. a = ([],[],x)) \/ (?h t x. a = ([],h::t,x)) \/
293     (?t w s. a = (Epsilon::t,w,s)) \/
294     (?c t w s. a = (Charset c::t,[],s)) \/
295     (?c d t w s. a = (Charset c::t,d::w,s)) \/
296     (?r1 r2 t w s. a = ((r1+r2)::t,w,s)) \/
297     (?r1 r2 t w s. a = ((r1#r2)::t,w,s)) \/
298     (?r t w s. a = (Repeat r::t,w,s))`,
299 METIS_TAC [list_CASES,ABS_PAIR_THM,regexp_cases]);
300
301val ms_ind_thm = Q.prove
302(`!P:('a regexp list # 'a list # 'a regexp list option) list -> bool.
303    P [] /\ 
304    (!x. P [([],[],x)]) /\
305    (!r t w s. P [(Repeat r::t,w,s)]) /\
306    (!r1 r2 t w s. P [((r1 # r2)::t,w,s)]) /\
307    (!r1 r2 t w s. P [((r1 + r2)::t,w,s)]) /\
308    (!c d t w s. P [(Charset c::t,d::w,s)]) /\
309    (!c t s. P [(Charset c::t,[],s)]) /\
310    (!t w s. P [(Epsilon::t,w,s)]) /\
311    (!h t s. P [([],h::t,s)]) /\
312    (!t w s t' w' s' rst.
313        P ((t',w',s')::rst) ==> P((Epsilon::t,w,s)::(t',w',s')::rst)) /\
314    (!c t d w s t' w' s' rst.
315        P ((t',w',s')::rst) ==> P((Charset c::t,d::w,s)::(t',w',s')::rst)) /\
316    (!r1 r2 t w s t' w' s' rst.
317        P ((t',w',s')::rst) ==> P(((r1 + r2)::t,w,s)::(t',w',s')::rst)) /\
318    (!r1 r2 t w s t' w' s' rst.
319        P ((t',w',s')::rst) ==> P(((r1 # r2)::t,w,s)::(t',w',s')::rst)) /\
320    (!r t w s t' w' s' rst.
321        P ((t',w',s')::rst) ==> P((Repeat r::t,w,s)::(t',w',s')::rst)) /\
322    (!c t s h1 t1. P ((Charset c::t,[],s)::h1::t1)) /\
323    (!c w s h1 t1. P (([],c::w,s)::h1::t1)) /\
324    (!s h1 t1. P (([],[],s)::h1::t1)) 
325   ==>
326       !v. P v`,
327 GEN_TAC THEN STRIP_TAC THEN MATCH_MP_TAC ms_ind THEN
328 METIS_TAC [ABS_PAIR_THM, cases_lemma]);
329
330val match_seq_down_closed = Q.prove
331(`!L1 L2. match_seq (L1 ++ L2) ==> match_seq L2`,
332 Induct THEN RW_TAC list_ss [match_seq_def]
333   THEN Cases_on `L1++L2` 
334   THEN FULL_SIMP_TAC list_ss [match_seq_def]);
335
336
337(*---------------------------------------------------------------------------*)
338(* Concatenation of match sequences requires the "stopper" to be changed.    *)
339(* change_stop_on_prefix sets the stop argument in the first element in a    *)
340(* match sequence to the specified value.  This requires changing the stop   *)
341(* argument in subsequent entries in the match sequence until a entry is     *)
342(* reached that sets a new stop argument instead of just propogating the     *)
343(* previous one.                                                             *)
344(*---------------------------------------------------------------------------*)
345
346val change_stop_on_prefix_def = 
347 Define
348  `(change_stop_on_prefix ([] : ('a regexp list # 'a list #
349                                 'a regexp list option) list) ns = []) /\
350   (change_stop_on_prefix ((Repeat r::t,w,s)::b::c) ns = 
351     if b = (r::Repeat r::t, w, SOME (Repeat r::t)) 
352      then(Repeat r::t,w,ns)::b::c
353      else (Repeat r::t,w,ns)::change_stop_on_prefix (b::c) ns) /\
354   (change_stop_on_prefix ((Charset c1::t,w,s)::c) ns 
355       = (Charset c1::t,w,ns)::c) /\
356   (change_stop_on_prefix ((rl,w,s)::c) ns 
357       = (rl,w,ns)::change_stop_on_prefix c ns)`;
358
359val csp_def = change_stop_on_prefix_def;
360val csp_ind = fetch "-" "change_stop_on_prefix_ind";
361
362val csp_step_lem = Q.prove
363(`!alist stop rl w s x. 
364    (alist = ((rl,w,s)::x)) ==>
365    ?t. change_stop_on_prefix alist stop = (rl,w,stop)::t`,
366 recInduct csp_ind THEN RW_TAC list_ss [csp_def]);
367
368val csp_step = SIMP_RULE list_ss [] csp_step_lem;
369
370val change_stop_on_prefix_thm = Q.prove
371(`!L stop. match_seq L ==> match_seq (change_stop_on_prefix L stop)`,
372 recInduct ms_ind_thm THEN RW_TAC list_ss [match_seq_def,m_def,csp_def] THEN
373 METIS_TAC [match_seq_def, m_def, csp_step]);
374
375val LENGTH_CSP = Q.prove
376(`!l s. LENGTH (change_stop_on_prefix l s) = LENGTH l`,
377 recInduct csp_ind THEN RW_TAC list_ss [csp_def]);
378
379(*---------------------------------------------------------------------------*)
380(* If match sequences m1 and m2 represent matching rl1 against w1 and rl2    *)
381(* against w2 respectively, then the composing m1 and m2 should represent    *)
382(* matching rl1++rl2 against w1++w2.  Thus the first element of m2           *)
383(* (rl2, w2, s2) should be appended to the end of all the elements in m1,    *)
384(* and s2 should be changed to the last stop argument of m1.  Then the       *)
385(* sequences can be joined.                                                  *)
386(*---------------------------------------------------------------------------*)
387
388val compose_m_seq_def = 
389 Define
390  `(compose_m_seq [] x2 = x2) /\
391   (compose_m_seq x1 [] = x1) /\
392   (compose_m_seq x1 ((r1,w1,s1)::x2) =
393     let x1 = MAP (\(r,w,s). (r++r1, w++w1, OPTION_MAP (\e. e++r1) s)) x1 in
394     let x2 = change_stop_on_prefix ((r1,w1,s1)::x2) (SND (SND (LAST x1))) 
395     in
396       x1 ++ TL x2)`;
397
398val compose_m_seq_null = Q.prove
399(`(!x. compose_m_seq [] x = x) /\ (!x. compose_m_seq x [] = x)`,
400 CONJ_TAC THEN Cases THEN RW_TAC list_ss [compose_m_seq_def] THEN 
401 Q.ID_SPEC_TAC `h` THEN SIMP_TAC list_ss [FORALL_PROD,compose_m_seq_def]);
402
403val lem = Q.prove
404(`!mseq r w s r1 w1 s1 s2 x.
405     (mseq = ((r1,w1,s1)::x)) ==>
406      match_seq mseq ==> 
407      m (r,w,s) (r1,w1, s2) ==>
408      match_seq ((r,w,s)::change_stop_on_prefix ((r1,w1,s1)::x) s2)`,
409 recInduct ms_ind_thm
410  THEN NTAC 2 (RW_TAC list_ss 
411         [csp_step, change_stop_on_prefix_def, match_seq_def, m_def])
412  THEN METIS_TAC [CONS_ACYCLIC,list_CASES, CONS_11]);
413
414val compose_m_seq_thm = Q.prove
415(`!x1 x2. match_seq x1 ==> match_seq x2 ==> match_seq (compose_m_seq x1 x2)`,
416 REPEAT STRIP_TAC 
417   THEN Induct_on `x1`
418   THEN RW_TAC list_ss []
419   THEN `(?rl w s. h = (rl,w,s)) /\ 
420         ((x1 = []) \/ ?g t1. x1 = g::t1) /\
421         ((x2 = []) \/ ?rl' w' s' t2 . x2 = (rl',w',s')::t2)` 
422      by METIS_TAC [list_CASES,ABS_PAIR_THM]
423   THEN RW_TAC list_ss [compose_m_seq_null] THENL 
424   [(*Base Case*)
425      `(rl' = []) \/ ?h t. rl' = h::t` by METIS_TAC [list_CASES] 
426      THEN `(t2 = []) \/ ?a z. t2 = a::z` by METIS_TAC [list_CASES] 
427      THEN FULL_SIMP_TAC list_ss [match_seq_def]
428      THEN `(h = Epsilon) \/ (?c. h = Charset c) \/
429            (?r1 r2. h = r1+r2) \/ (?r1 r2. h = r1#r2) \/ 
430            (?r. h = Repeat r)` by METIS_TAC [regexp_cases]
431      THEN RW_TAC list_ss [] THEN EVAL_TAC THEN FULL_SIMP_TAC list_ss [m_def]
432      THEN RW_TAC list_ss [m_def, match_seq_def, SIMP_RULE std_ss [] lem]
433      THEN METIS_TAC [list_CASES, m_def, CONS_ACYCLIC, CONS_11],
434    (*Step Case*)
435    `?rlist w1 s1. g = (rlist,w1,s1)` by METIS_TAC [ABS_PAIR_THM]
436       THEN `((rl = []) \/ ?h t. rl = h::t) /\
437             ((w = []) \/ ?b y. w = b::y)` by METIS_TAC [list_CASES] 
438       THEN `(h = Epsilon) \/ (?c. h = Charset c) \/
439             (?r1 r2. h = r1+r2) \/ (?r1 r2. h = r1#r2) \/ 
440             (?r. h = Repeat r)` by METIS_TAC [regexp_cases]
441       THEN NTAC 2 (RW_TAC list_ss [] THEN
442                    FULL_SIMP_TAC list_ss [LET_THM, m_def, match_seq_def,
443		                           compose_m_seq_def])]);
444
445(*---------------------------------------------------------------------------*)
446(* Match sequence suffixes can be swapped out.                               *)
447(*---------------------------------------------------------------------------*)
448
449val match_seq_lemma = Q.prove
450(`!a x y z. match_seq (x++[a]++z) /\ match_seq ([a]++y)
451            ==> match_seq (x++[a]++y)`,
452 Induct_on `x` 
453   THEN RW_TAC list_ss []
454   THEN Cases_on `x`
455   THEN METIS_TAC [APPEND, match_seq_def]);
456 
457
458(*---------------------------------------------------------------------------*)
459(* If r matches w, then we can inductively build up a match sequence that    *)
460(* starts with ([r], w, NONE).                                               *)
461(*---------------------------------------------------------------------------*)
462
463val lem = Q.prove
464(`!(r: 'a regexp list) (w:'a list) s r' w' s' x x'. 
465  ?t. compose_m_seq ((r, w, s)::x) ((r', w', s')::x') 
466        =
467      (r++r', w++w', OPTION_MAP (\e.e++r') s)::t`,
468 RW_TAC list_ss [compose_m_seq_def, LET_THM]);
469
470val sem_implies_match_seq_exists = Q.prove
471(`!r w. sem r w ==> ?x. match_seq (([r], w, NONE)::x)`,
472 Induct THENL 
473 [RW_TAC list_ss [sem_def] 
474    THEN Q.EXISTS_TAC `[([],[],NONE)]`
475    THEN RW_TAC list_ss [match_seq_def, m_def],
476  RW_TAC list_ss [sem_def]
477    THEN Q.EXISTS_TAC `[([],[],NONE)]` 
478    THEN RW_TAC list_ss [match_seq_def, m_def], 
479  RW_TAC list_ss [sem_def] THEN RES_TAC THENL
480    [Q.EXISTS_TAC `([r],w,NONE)::x` THEN RW_TAC list_ss [match_seq_def, m_def],
481     Q.EXISTS_TAC `([r'],w,NONE)::x` 
482       THEN RW_TAC list_ss [match_seq_def, m_def]],
483  RW_TAC list_ss [sem_def] THEN RES_TAC
484    THEN Q.EXISTS_TAC `compose_m_seq (([r],w1,NONE)::x') (([r'],w2,NONE)::x)` 
485    THEN STRIP_ASSUME_TAC 
486           (Q.SPECL [`[r]`, `w1`, `NONE`, `[r']`, `w2`,`NONE`, `x'`, `x`] lem)
487    THEN FULL_SIMP_TAC list_ss [match_seq_def]
488    THEN METIS_TAC [m_def, compose_m_seq_thm],
489  SIMP_TAC list_ss [repeat_equals_ncat_thm, GSYM LEFT_FORALL_IMP_THM]
490    THEN Induct_on `n` THENL
491    [RW_TAC list_ss [FUNPOW] 
492       THEN FULL_SIMP_TAC list_ss [sem_def]
493       THEN Q.EXISTS_TAC `[([],[],NONE)]`
494       THEN RW_TAC list_ss [match_seq_def, m_def],
495     FULL_SIMP_TAC list_ss [FUNPOW_SUC] 
496       THEN RW_TAC list_ss [sem_def]
497       THEN `?x1 x2. match_seq (([r],w1,NONE)::x1) /\
498                     match_seq (([Repeat r],w2,NONE)::x2)` by METIS_TAC []
499       THEN Q.EXISTS_TAC `change_stop_on_prefix 
500                              (compose_m_seq (([r],w1,NONE)::x1)
501                              (([Repeat r],w2,NONE)::x2)) (SOME [Repeat r])`
502       THEN STRIP_ASSUME_TAC 
503               (Q.SPECL [`[r]`, `w1`, `NONE`, `[Repeat r]`, 
504                         `w2`,`NONE`, `x1`, `x2`] lem)
505       THEN STRIP_ASSUME_TAC 
506               (Q.SPECL [`SOME[Repeat r]`, `[r]++[Repeat r]`, 
507                         `w1++w2`, `NONE`, `t`] csp_step)
508       THEN FULL_SIMP_TAC list_ss [match_seq_def]
509       THEN METIS_TAC [m_def, compose_m_seq_thm, change_stop_on_prefix_thm]]]);
510
511
512(*---------------------------------------------------------------------------*)
513(* Given a match sequence starting with (rl,w,s), then (match rl w s) holds  *)
514(* as long as the match sequence avoids cycles in the matcher.               *)
515(*---------------------------------------------------------------------------*)
516
517val witness_implies_match_thm = 
518let val thms = [match_def, match_seq_def, m_def] in
519Q.prove
520(`!rl w s x y. 
521    (((rl,w,s)::y) = x) 
522    ==> match_seq x /\ 
523        (!r1 t1 w1. ~MEM (Repeat r1::t1, w1, SOME (Repeat r1::t1)) x)
524    ==> match rl w s`,
525 Induct_on `x` THEN RW_TAC list_ss [] THEN
526 `((x=[])  \/ ?h t. x = h::t) /\
527  ((rl=[]) \/ (?rst. rl = Epsilon::rst)  
528           \/ (?c rst. rl = Charset c::rst) 
529           \/ (?r1 r2 rst. rl = (r1+r2)::rst) 
530           \/ (?r1 r2 rst. rl = (r1#r2)::rst) 
531           \/ (?r2 rst. rl = Repeat r2::rst))`
532      by METIS_TAC [list_CASES,regexp_cases] 
533   THEN RW_TAC list_ss [] 
534   THEN FULL_SIMP_TAC list_ss thms THENL
535   [Cases_on `w` THEN FULL_SIMP_TAC list_ss thms,
536    RW_TAC list_ss [] THEN METIS_TAC thms,
537    RW_TAC list_ss [] THEN METIS_TAC thms])
538end;
539
540(*---------------------------------------------------------------------------*)
541(* Split a list in two around the first point where predicate P holds.       *)
542(*---------------------------------------------------------------------------*)
543
544val split_def = 
545 Define
546  `(split P [] acc = NONE) /\
547   (split P (h::t) acc = if P h then SOME (acc, h, t)
548                                else split P t (acc++[h]))`;
549
550val split_thm = Q.prove
551(`!P l acc l1 v l2. 
552     (split P l acc = (SOME (l1, v, l2))) ==> (l1 ++ [v] ++ l2 = acc ++ l)`,
553 Induct_on `l` 
554   THEN RW_TAC list_ss [split_def] 
555   THEN METIS_TAC [APPEND, APPEND_ASSOC]);
556
557val split_thm2 = Q.prove
558(`!P l acc l1 v l2. 
559     (split P l acc = (SOME (l1, v, l2))) ==> ?l3. (l1 = acc++l3)`,
560 Induct_on `l` 
561   THEN RW_TAC list_ss [split_def] THENL
562   [Q.EXISTS_TAC `[]` THEN RW_TAC list_ss [],
563    RES_TAC THEN Q.EXISTS_TAC `[h]++l3` THEN RW_TAC list_ss []]);
564
565val split_thm3 = Q.prove
566(`!P l acc l1 v l2 l3.
567    (split P l (l3 ++ acc) = SOME (l3++l1, v, l2)) 
568       = 
569    (split P l acc = (SOME (l1, v, l2)))`,
570 Induct_on `l` 
571    THEN RW_TAC list_ss [split_def] 
572    THEN METIS_TAC [APPEND_ASSOC]);
573
574val split_thm4 = Q.prove
575(`!P l acc. (split P l acc = NONE) ==> ~EXISTS P l`,
576 Induct_on `l` 
577   THEN RW_TAC list_ss [split_def]
578   THEN FULL_SIMP_TAC list_ss []
579   THEN METIS_TAC []);
580
581val split_thm5 = Q.prove
582(`!P l acc acc2. (split P l acc = NONE) ==> (split P l acc2 = NONE)`,
583 Induct_on `l` 
584   THEN RW_TAC list_ss [split_def] 
585   THEN METIS_TAC [split_def]);
586
587val split_prop_thm = Q.prove
588(`!L P a b c acc. (split P L acc = SOME(a,b,c)) ==> P b`,
589 Induct 
590   THEN RW_TAC list_ss [split_def] 
591   THEN METIS_TAC []);
592
593val split_cong_thm = Q.prove
594(`!l P acc P' l' acc'. 
595     (l=l') /\ (!x. MEM x l ==> (P x = P' x)) /\ (acc=acc') 
596       ==> (split P l acc = split P' l' acc')`,
597 Induct_on `l` 
598   THEN RW_TAC list_ss [split_def] 
599   THEN FULL_SIMP_TAC list_ss [split_def]);
600
601(*---------------------------------------------------------------------------*)
602(* If a cycle exists in a match sequence, then the part of the sequence      *)
603(* before the cycle contains the Repeat expressions that lead to the cycle.  *)
604(*---------------------------------------------------------------------------*)
605
606val match_seq_find_lemma = Q.prove
607(`!ms l1 r l w z. 
608  match_seq ms 
609   ==> (split (\x. ?r l w. x = (Repeat r::l, w, SOME (Repeat r::l))) ms [] =
610        SOME (l1,(Repeat r::l,w,SOME(Repeat r::l)), z))
611   ==> (~((SND(SND(HD ms))) = SOME (Repeat r::l)) \/ ~((FST(SND(HD ms))) = w)) 
612   ==> ?s. MEM (Repeat r::l, w, s) l1`,
613 Induct 
614    THEN FULL_SIMP_TAC list_ss [split_def, match_seq_def, m_def]
615    THEN REPEAT GEN_TAC 
616    THEN `(l1 = []) \/ ?h1 t1. l1 = h1::t1` by METIS_TAC [list_CASES]
617    THEN RW_TAC list_ss [split_def, match_seq_def, m_def]
618    THEN TRY (IMP_RES_TAC split_thm2 THEN FULL_SIMP_TAC list_ss [] THEN NO_TAC)
619    (* 2 cases left, almost identical; do both cases with same tactic *)
620    THEN 
621    (Q.PAT_ASSUM `$! M `(MP_TAC o Q.SPECL [`t1`, `r`, `l`, `w`, `z`])
622    THEN RW_TAC list_ss [split_def, match_seq_def, m_def]
623    THEN `match_seq ms` by METIS_TAC [match_seq_down_closed, APPEND]
624    THEN FULL_SIMP_TAC list_ss [split_def, match_seq_def, m_def]
625    THEN IMP_RES_TAC split_thm2
626    THEN FULL_SIMP_TAC list_ss [] THEN RW_TAC list_ss []
627    THEN ASSUME_TAC (Q.SPECL 
628             [`\x. ?r l w. x = (Repeat r::l,w,SOME (Repeat r::l))`,
629              `ms`, `[]`, `l3`, `(Repeat r::l,w,SOME (Repeat r::l))`, `z`,
630	      `[h]`]
631     (INST_TYPE [alpha |-> Type`:'a regexp list#'a list#'a regexp list option`]
632                   split_thm3))
633   THEN FULL_SIMP_TAC list_ss []
634   THEN RES_TAC THEN FULL_SIMP_TAC list_ss [] THEN RW_TAC list_ss []
635   THEN Cases_on `~(SND (SND (HD ms)) = SOME (Repeat r::l))`
636   THENL [METIS_TAC [],FULL_SIMP_TAC list_ss [] THEN RW_TAC list_ss []]
637   THEN Cases_on `~(FST (SND (HD ms)) = w)`
638   THENL [METIS_TAC [],FULL_SIMP_TAC list_ss [] THEN RW_TAC list_ss []]
639   THEN `(ms = []) \/ ?a b c mst. ms = (a,b,c)::mst` 
640        by METIS_TAC [list_CASES,ABS_PAIR_THM]
641   THENL [FULL_SIMP_TAC list_ss [split_def, match_seq_def, m_def],
642          FULL_SIMP_TAC list_ss [] THEN RW_TAC list_ss [] 
643            THEN `?a1 b1 c1. h = (a1,b1,c1)` by METIS_TAC [ABS_PAIR_THM]
644            THEN RW_TAC list_ss [] THEN FULL_SIMP_TAC list_ss [match_seq_def]
645            THEN METIS_TAC [m_thm,m_thm2,NOT_SOME_NONE]]));
646
647
648(*---------------------------------------------------------------------------*)
649(* Normalize a match sequence to cut the cycles out of it.                   *)
650(* To remove a cycle, find it and the regexp that started the cycle and cut  *)
651(* out the intermediate parts of the match sequence.                         *)
652(*---------------------------------------------------------------------------*)
653
654val NS_DEF = 
655 tDefine
656  "NS"
657  `NS ms =
658     case split (\x. ?r l w. x = (Repeat r::l, w, SOME (Repeat r::l))) ms [] of
659         NONE -> ms
660      || SOME (l1, (bad_r, bad_w, bad_s), z) ->
661           let opt = split (\x. ?s. x = (bad_r, bad_w, s)) l1 []
662           in if opt = NONE then []
663              else
664                let (x1, (rl, w1, s1), y) = THE opt in
665                if (?ra L. (rl = Repeat ra::L) /\ (FST(HD z) = L))
666                  then NS (x1 ++ [(rl,w1,s1)] ++ change_stop_on_prefix z s1)
667                  else NS (x1 ++ [(rl,w1,s1)] ++ z)`
668(WF_REL_TAC `measure LENGTH`
669  THEN RW_TAC list_ss [LENGTH_CSP]
670  THEN IMP_RES_TAC split_thm THEN FULL_SIMP_TAC list_ss []
671  THEN RW_TAC list_ss []
672  THEN Cases_on `split (\(p1,p1',p2). (p1 = bad_r) /\ (p1' = bad_w)) l1 []`
673  THEN RW_TAC list_ss [] 
674  THEN FULL_SIMP_TAC list_ss []
675  THEN Cases_on `x` THEN Cases_on `r`
676  THEN IMP_RES_TAC split_thm THEN FULL_SIMP_TAC list_ss []
677  THEN RW_TAC list_ss []);
678
679val NS_IND = fetch "-" "NS_ind";
680
681(*---------------------------------------------------------------------------*)
682(* One step of NS preserves match sequences                                  *)
683(*---------------------------------------------------------------------------*)
684
685val match_seq_surg_thm = Q.prove
686(`!ms l1 bad_r bad_w z x1 rl w1 s1 y.
687        match_seq ms ==>
688        (split (\x. ?r l w. x = (Repeat r::l, w, SOME (Repeat r::l))) ms [] =
689         SOME (l1, (bad_r, bad_w, bad_s), z)) ==>
690        (split (\x. ?s. x = (bad_r, bad_w, s)) l1 [] =
691         SOME (x1, (rl, w1, s1), y)) ==>
692        if (?ra L. (rl = Repeat ra::L) /\ (FST(HD z) = L)) then
693          match_seq (x1 ++ [(rl, w1, s1)] ++ change_stop_on_prefix z s1)
694        else
695          match_seq (x1 ++ [(rl, w1, s1)] ++ z)`,
696 Induct THENL
697 [RW_TAC list_ss [] 
698    THEN IMP_RES_TAC split_thm 
699    THEN FULL_SIMP_TAC list_ss [],
700  REPEAT STRIP_TAC
701    THEN `(bad_r = rl) /\ (bad_w = w1)` 
702         by (IMP_RES_TAC split_prop_thm THEN POP_ASSUM (K ALL_TAC)
703             THEN FULL_SIMP_TAC list_ss [LAMBDA_PROD])
704    THEN IMP_RES_TAC split_thm THEN FULL_SIMP_TAC list_ss []
705    THEN `match_seq (x1 ++ [(rl,w1,s1)] ++ y ++ [(rl,w1,bad_s)] ++ z)` 
706         by METIS_TAC []
707    THEN `(z=[]) \/ ?a b c t1. z = (a,b,c)::t1` 
708         by METIS_TAC [list_CASES,ABS_PAIR_THM] THENL
709    [FULL_SIMP_TAC list_ss [match_seq_def]
710       THEN `match_seq [(bad_r,bad_w,bad_s)]` 
711            by METIS_TAC [match_seq_down_closed]
712       THEN FULL_SIMP_TAC list_ss [match_seq_def]
713       THEN RW_TAC list_ss [change_stop_on_prefix_def, match_seq_def]
714       THEN `match_seq ([([],[],s1)] ++ y ++ [([],[],bad_s)])` 
715             by METIS_TAC [match_seq_down_closed,APPEND_ASSOC]
716       THEN Cases_on `y`  THEN FULL_SIMP_TAC list_ss [match_seq_def,m_def],
717     RW_TAC list_ss [] THENL
718     [`match_seq (change_stop_on_prefix ((a,b,c)::t1) s1)` 
719          by METIS_TAC [match_seq_down_closed,change_stop_on_prefix_thm]
720        THEN ASSUME_TAC (Q.SPECL 
721              [`(Repeat ra::a,bad_w,s1)`, `x1`, 
722               `change_stop_on_prefix ((a,b,c)::t1) s1`, 
723               `y ++ [(Repeat ra::a,bad_w,bad_s)] ++ ((a,b,c)::t1)`] 
724              match_seq_lemma)
725        THEN FIRST_ASSUM MATCH_MP_TAC THEN RW_TAC list_ss []
726        THEN `?t2. change_stop_on_prefix ((a,b,c)::t1) s1 = (a,b,s1)::t2` 
727             by METIS_TAC[csp_step]
728        THEN POP_ASSUM SUBST_ALL_TAC
729        THEN RW_TAC list_ss [match_seq_def]
730        THEN `m (Repeat ra::a,bad_w,bad_s) (a,b,c)` 
731             by METIS_TAC [match_seq_down_closed,APPEND,APPEND_ASSOC,
732                           match_seq_def]
733        THEN FULL_SIMP_TAC list_ss [m_def],
734      ASSUME_TAC (Q.SPECL 
735         [`(bad_r,bad_w,s1)`, `x1`, `(a,b,c)::t1`, 
736          `y ++ [(bad_r,bad_w,bad_s)] ++ ((a,b,c)::t1)`] match_seq_lemma)
737        THEN FIRST_ASSUM MATCH_MP_TAC THEN RW_TAC list_ss [match_seq_def]
738        THEN `m (bad_r,bad_w,bad_s) (a,b,c)` 
739             by METIS_TAC [match_seq_down_closed,APPEND,APPEND_ASSOC,
740                           match_seq_def]
741        THEN `?r l. bad_r = Repeat r::l` by IMP_RES_TAC split_prop_thm 
742        THEN POP_ASSUM MP_TAC THEN EVAL_TAC THEN RW_TAC list_ss []
743        THENL [FULL_SIMP_TAC list_ss [m_def],
744               METIS_TAC [match_seq_down_closed]]]]]);
745
746val NS_match_thm = Q.prove
747(`!ms. match_seq ms ==> match_seq (NS ms)`,
748 recInduct NS_IND THEN RW_TAC list_ss []
749   THEN ONCE_REWRITE_TAC [NS_DEF] THEN CASE_TAC THENL
750   [METIS_TAC [],
751    REPEAT CASE_TAC 
752     THEN RW_TAC list_ss [match_seq_def, LET_THM]
753     THEN REPEAT (POP_ASSUM MP_TAC) 
754     THEN RENAME_FREES_TAC [("q","l1"), ("r'","l2"), 
755                            ("q''","a"),("q'","b"),("r''","c")]
756     THEN REPEAT STRIP_TAC
757     THEN `(split (\x. ?s. x = (a,b,s)) l1 [] = NONE) \/
758           ?a1 b1a b1b b1c c1. 
759              split (\x. ?s. x = (a,b,s)) l1 [] = SOME (a1,(b1a,b1b,b1c),c1)`
760          by METIS_TAC [optionTheory.option_nchotomy,ABS_PAIR_THM]
761     THEN RW_TAC list_ss [match_seq_def]
762     THEN IMP_RES_TAC match_seq_surg_thm
763     THEN FULL_SIMP_TAC list_ss [] 
764     THEN RW_TAC list_ss [] THEN METIS_TAC []]);
765
766val NS_is_normal_thm = Q.prove
767(`!ms r w t. ~MEM (Repeat r::t, w, SOME (Repeat r::t)) (NS ms)`,
768 recInduct NS_IND THEN RW_TAC list_ss []
769   THEN ONCE_REWRITE_TAC [NS_DEF]
770   THEN CASE_TAC THENL
771   [IMP_RES_TAC split_thm4
772      THEN FULL_SIMP_TAC list_ss [] THEN RW_TAC list_ss []
773      THEN Induct_on `ms` 
774      THEN RW_TAC list_ss []
775      THEN FULL_SIMP_TAC list_ss [split_def] 
776      THEN METIS_TAC [split_thm5],
777    REPEAT CASE_TAC THEN RW_TAC list_ss [LET_THM]
778      THEN REPEAT (POP_ASSUM MP_TAC) 
779      THEN RENAME_FREES_TAC [("q","l1"), ("r''","l2"), 
780                             ("q''","a"),("q'","b"),("r'''","c")]
781      THEN NTAC 4 STRIP_TAC
782      THEN `(split (\x. ?s. x = (a,b,s)) l1 [] = NONE) \/
783            ?a1 b1a b1b b1c c1. 
784               split (\x. ?s. x = (a,b,s)) l1 [] = SOME (a1,(b1a,b1b,b1c),c1)`
785           by METIS_TAC [optionTheory.option_nchotomy,ABS_PAIR_THM]
786      THEN FULL_SIMP_TAC list_ss [] 
787      THEN RW_TAC list_ss []
788      THEN FULL_SIMP_TAC list_ss [] 
789      THEN METIS_TAC []]);   
790
791val lem = Q.prove
792(`EXISTS (\y. y = (a,b,c)) L ==> EXISTS (\y. ?c. y = (a,b,c)) L`,
793Induct_on `L` THEN EVAL_TAC THEN RW_TAC list_ss [] THEN METIS_TAC[]);
794
795(*---------------------------------------------------------------------------*)
796(* Normalizing a match sequence doesn't change the first element             *)
797(*---------------------------------------------------------------------------*)
798
799val  NS_HD_THM = Q.prove
800(`!t t' r w. (t = (r,w,NONE)::t') ==> 
801              match_seq t ==> 
802             ?t''. (NS t = (r, w, NONE)::t'')`,
803 recInduct NS_IND THEN RW_TAC list_ss []
804   THEN ONCE_REWRITE_TAC [NS_DEF]
805   THEN REPEAT CASE_TAC
806   THEN REPEAT (POP_ASSUM MP_TAC) 
807   THEN RENAME_FREES_TAC [("q","l1"), ("r''","l2"), 
808                          ("q''","a"),("q'","b"),("r'''","c")]
809   THEN NTAC 4 STRIP_TAC
810   THEN RW_TAC list_ss [LET_THM] THENL
811   [IMP_RES_TAC split_prop_thm THEN FULL_SIMP_TAC list_ss [] 
812     THEN IMP_RES_TAC match_seq_find_lemma THEN FULL_SIMP_TAC list_ss [] 
813     THEN IMP_RES_TAC split_thm4 
814     THEN `(a=[]) \/ ?d u. a = d::u` by METIS_TAC [list_CASES]
815     THEN RW_TAC list_ss [] THEN METIS_TAC [MEM_EXISTS_LEM,lem],
816    `?a1 b1a b1b b1c c1. 
817         split (\x. ?s. x = (a,b,s)) l1 [] = SOME (a1,(b1a,b1b,b1c),c1)`
818         by METIS_TAC [optionTheory.option_nchotomy,ABS_PAIR_THM]
819     THEN Cases_on `l1` THEN FULL_SIMP_TAC list_ss []
820     THENL [FULL_SIMP_TAC list_ss [split_def], ALL_TAC]
821     THEN `h = (r,w,NONE)` by IMP_RES_TAC split_thm THEN FULL_SIMP_TAC list_ss []
822     THEN IMP_RES_TAC match_seq_surg_thm
823     THEN Cases_on `a1` THEN RW_TAC list_ss [match_seq_def] 
824     THEN IMP_RES_TAC split_thm THEN FULL_SIMP_TAC list_ss []]);
825
826(*---------------------------------------------------------------------------*)
827(* The semantics imply match.                                                *)
828(*---------------------------------------------------------------------------*)
829
830val sem_implies_match = Q.prove
831(`!r w. sem r w ==> match [r] w NONE`,
832 METIS_TAC [sem_implies_match_seq_exists, NS_match_thm,
833            NS_is_normal_thm, NS_HD_THM, witness_implies_match_thm]);
834
835(*---------------------------------------------------------------------------*)
836(* match correctly implements the semantics.                                 *)
837(*---------------------------------------------------------------------------*)
838
839val match_is_correct = Q.prove
840(`!r w. sem r w = match [r] w NONE`,
841 REPEAT (STRIP_TAC ORELSE EQ_TAC) THENL
842   [RW_TAC list_ss [sem_implies_match],
843    IMP_RES_TAC match_implies_sem THEN FULL_SIMP_TAC list_ss [FOLDR,sem_def]]);
844
845val _ = Count.report (Count.read thm_counter);
846
847