1structure regexpLib :> regexpLib =
2struct
3
4open HolKernel boolLib bossLib;
5open pairLib optionLib pred_setLib listLib stringLib wordsLib;
6open listTheory stringTheory arithmeticTheory pred_setTheory
7     sortingTheory mergesortTheory comparisonTheory balanced_mapTheory vec_mapTheory
8     charsetTheory regexpTheory regexp_compilerTheory regexp_parserTheory;
9
10open Regexp_Type regexpSyntax regexpMisc;
11
12local open Regexp_Numerics DFA_Codegen in end;
13
14val ERR = mk_HOL_ERR "regexpLib";
15
16fun gen_sml_dfa r =
17 let val {matchfn,start,table,final} = Regexp_Match.vector_matcher r
18     val _ = stdErr_print (Int.toString(Vector.length final)^" states.\n")
19 in {certificate = NONE:thm option,
20     aux = NONE,
21     matchfn = matchfn,
22     table = table,
23     start = start,
24     final = final}
25 end;
26
27val numeral_cmp_thm = Q.prove
28(`(num_cmp (NUMERAL x) (NUMERAL y) = num_cmp x y) /\
29  (num_cmp (NUMERAL x) y = num_cmp x y) /\
30  (num_cmp x (NUMERAL y) = num_cmp x y) /\
31  (num_cmp 0 n = num_cmp ZERO n) /\
32  (num_cmp n 0 = num_cmp n ZERO) /\
33  (num_cmp ZERO ZERO = Equal) /\
34  (num_cmp ZERO (BIT1 y) = Less) /\
35  (num_cmp ZERO (BIT2 y) = Less) /\
36  (num_cmp (BIT1 x) ZERO = Greater) /\
37  (num_cmp (BIT2 x) ZERO = Greater) /\
38  (num_cmp (BIT1 x) (BIT1 y) = num_cmp x y) /\
39  (num_cmp (BIT2 x) (BIT2 y) = num_cmp x y) /\
40  (num_cmp (BIT1 x) (BIT2 y) = case num_cmp x y of Greater => Greater | _ => Less) /\
41  (num_cmp (BIT2 x) (BIT1 y) = case num_cmp x y of Less => Less | _ => Greater)`,
42 METIS_TAC [arithmeticTheory.NUMERAL_DEF,comparisonTheory.num_cmp_numOrd,
43            totoTheory.numeralOrd,arithmeticTheory.ALT_ZERO]);
44
45val vector_defs =
46 let (* open ml_translatorTheory *)
47 in [fromList_def, sub_def, length_def]
48 end;
49
50
51val exec_dfa_thms = exec_dfa_def :: vector_defs;
52
53val base_compute_thms =
54  vector_defs
55  @
56  [ALPHABET_def, alphabet_size_def, And_def,
57   charset_empty_def, charset_full_def, charset_sing_def,
58   merge_charsets_def, charset_cmp_def, numeral_cmp_thm, len_cmp_def,
59   regexp_compare_def, regexp_leq_def, regexp_compareW_compute,
60
61   build_char_set_def, DOT_def, Empty_def, Epsilon_def, catstring_def,
62   assoc_cat_def, build_cat_def, build_neg_def, build_star_def,
63   flatten_or_def, remove_dups_def, build_or_def,
64   nullable_def, nullableW_def, smart_deriv_thm, normalize_def,
65   build_table_def, extend_states_def,
66   insert_regexp_def, mem_regexp_def, relationTheory.inv_image_def,
67
68   get_accepts_def, numLib.SUC_RULE (vec_mapTheory.alist_to_vec_def),
69   accepts_to_vector_def, table_to_vectors_def,
70
71   mergesort_def, mergesortN_def,sort2_def,sort3_def,merge_def,
72   balanced_mapTheory.null_def,
73   balanced_mapTheory.empty_def,
74   balanced_mapTheory.member_def,
75   balanced_mapTheory.insert_def,
76   balanced_mapTheory.lookup_def,
77   balanced_mapTheory.singleton_def,
78   balanced_mapTheory.balanceL_def,
79   balanced_mapTheory.balanceR_def,
80   balanced_mapTheory.ratio_def,
81   balanced_mapTheory.size_def,
82   balanced_mapTheory.delta_def,
83   balanced_mapTheory.foldrWithKey_def,
84   balanced_mapTheory.deleteFindMin_def,
85   balanced_mapTheory.fromList_def,
86
87   (* adding stuff revealed by computeLib.unmapped *)
88   combinTheory.o_DEF, combinTheory.I_THM, combinTheory.C_DEF,
89   combinTheory.FAIL_DEF, rich_listTheory.SPLITP_AUX_def,
90   rich_listTheory.SPLITP_compute,rich_listTheory.SEG_compute
91 ];
92
93fun chsets_of regexp =
94 let open HOLset
95     fun trav (x as Chset _) set = add(set,x)
96       | trav (Star r) set   = trav r set
97       | trav (Neg r) set    = trav r set
98       | trav (Cat(r,s)) set = trav r (trav s set)
99       | trav (Or rlist) set = itlist trav rlist set
100 in
101   listItems (trav regexp (empty regexp_compare))
102 end
103
104val alphabet_tms = map numSyntax.term_of_int (upto 0 255)
105
106val charset_mem_tm = prim_mk_const{Name="charset_mem",Thy="charset"};
107val smart_deriv_tm = prim_mk_const{Name="smart_deriv",Thy="regexp"};
108val build_or_tm    = prim_mk_const{Name="build_or",   Thy="regexp"};
109val transitions_tm = prim_mk_const{Name="transitions",Thy="regexp_compiler"};
110
111val Empty_charset =
112   Empty_def |> SIMP_RULE std_ss [charset_empty_def] |> concl |> rhs |> rand
113
114val DOT_charset = DOT_def |> concl |> rhs |> EVAL |> concl |> rhs |> rand
115
116val charset_mem_conv =
117 let val cs_memEval =
118      let open computeLib
119          val compset = listLib.list_compset()
120      in
121          wordsLib.add_words_compset true compset
122        ; add_datatype_info compset (valOf(TypeBase.fetch ``:charset``))
123        ; add_thms [alphabet_size_def, words4_bit_def, charset_mem_def] compset
124        ; computeLib.CBV_CONV compset
125      end
126     val pairs = cross alphabet_tms [Empty_charset, DOT_charset]
127     val probs = map (fn (x,y) => list_mk_comb(charset_mem_tm, [x,y])) pairs
128     val table = List.map (fn x => (x,cs_memEval x)) probs
129     val init_Map = Redblackmap.fromList Term.compare table
130 in fn () =>
131    let val theMap = ref init_Map
132    in fn tm =>
133       Redblackmap.find (!theMap,tm)
134       handle NotFound =>
135       let val _ = print "previously unseen charset ... "
136           val cset_tm = rand tm
137           val pairs = cross alphabet_tms [cset_tm]
138           val probs = map (fn (x,y) => list_mk_comb(charset_mem_tm, [x,y])) pairs
139           val table = List.map (fn x => (x,cs_memEval x)) probs
140           val _ = (theMap := Redblackmap.insertList (!theMap, table))
141           val _ = print "tabulated.\n"
142       in
143          Redblackmap.find (!theMap,tm)
144       end
145    end
146 end;
147
148val charset_union_conv =
149 let val charset_unionEval =
150        let open computeLib
151            val compset = listLib.list_compset()
152        in wordsLib.add_words_compset true compset
153         ; add_datatype_info compset (valOf(TypeBase.fetch ``:charset``))
154         ; add_thms [charset_union_def, charset_empty_def] compset
155         ; CBV_CONV compset
156        end
157 in
158   fn () =>
159   let val theMap = ref (Redblackmap.mkDict Term.compare)
160   in fn tm =>
161      Redblackmap.find (!theMap,tm)
162      handle NotFound =>
163        let val thm = charset_unionEval tm
164            val _ = (theMap := Redblackmap.insert(!theMap, tm,thm))
165        in
166           thm
167        end
168   end
169end;
170
171
172(*---------------------------------------------------------------------------*)
173(* Assumes that input tm has form "transitions r"                            *)
174(*---------------------------------------------------------------------------*)
175
176fun transitions_conv compset =
177 let open regexpSyntax listSyntax
178     val smart_deriv_or = el 5 (CONJUNCTS smart_deriv_thm)
179     val sdi_tms = map (curry mk_comb smart_deriv_tm) alphabet_tms
180     val RW_MAP = PURE_REWRITE_CONV [listTheory.MAP]
181     val RW_SD_OR = PURE_REWRITE_RULE [GSYM smart_deriv_or]
182     val transitions_CONV1 =
183       PURE_REWRITE_CONV [transitions_def,listTheory.MAP,ALPHABET_def]
184       THENC DEPTH_CONV BETA_CONV
185     val Eval = computeLib.CBV_CONV compset
186     val theMap = ref (Redblackmap.mkDict Term.compare)
187 in
188 fn tm =>
189   Redblackmap.find (!theMap,tm)
190   handle NotFound =>
191   let val r = rand tm
192   in if not (is_or r) then
193        let val thm = (REWR_CONV transitions_def THENC Eval) tm
194            val _ = (theMap := Redblackmap.insert(!theMap,tm,thm))
195        in thm
196        end
197      else
198      let val rlist = dest_or r
199          fun sdtms r = map (C (curry mk_comb) r) sdi_tms
200          val sdlists = map sdtms rlist
201          val thmlists = map (map Eval) sdlists
202          val orlists = transpose thmlists
203          fun map_sd n = ``MAP (smart_deriv ^n) ^(rand r)``
204          val map_tms = map map_sd alphabet_tms
205          val map_thms = map RW_MAP map_tms
206          val map_thms' = map2 PURE_REWRITE_RULE orlists map_thms
207          val map_thms'' = map (AP_TERM build_or_tm) map_thms'
208          val build_or_thms = map (CONV_RULE (RHS_CONV Eval)) map_thms''
209          val smart_deriv_thms = map RW_SD_OR build_or_thms
210          val thm =
211             (transitions_CONV1 THENC PURE_REWRITE_CONV smart_deriv_thms) tm
212      in
213        theMap := Redblackmap.insert(!theMap,tm,thm)
214      ; thm
215      end
216   end
217 end;
218
219(*---------------------------------------------------------------------------*)
220(* The base compset                                                          *)
221(*---------------------------------------------------------------------------*)
222
223fun base_compset() =
224 let open computeLib
225     val compset = listLib.list_compset()
226 in
227     optionLib.OPTION_rws compset
228   ; pairLib.add_pair_compset compset
229   ; pred_setLib.add_pred_set_compset compset
230   ; wordsLib.add_words_compset true compset
231   ; stringLib.add_string_compset compset
232   ; add_datatype_info compset (valOf(TypeBase.fetch ``:ordering``))
233   ; add_datatype_info compset (valOf(TypeBase.fetch ``:('a,'b)balanced_map``))
234   ; add_datatype_info compset (valOf(TypeBase.fetch ``:charset``))
235   ; add_datatype_info compset (valOf(TypeBase.fetch ``:regexp``))
236   ; add_thms base_compute_thms compset
237   ; add_conv(``charset_mem``, 2, charset_mem_conv ()) compset
238   ; add_conv(``charset_union``, 2, charset_union_conv()) compset
239   ; add_conv(``transitions``, 1, transitions_conv compset) compset
240   ; compset
241 end;
242
243fun gen_dfa_conv r =
244 let val regexp_tm = regexp_to_term r
245     val compset = base_compset()
246     val baseEval = computeLib.CBV_CONV compset
247     val dom_Brz_alt_conv = REPEATC (time (REWR_CONV dom_Brz_alt_eqns THENC baseEval))
248     val _ = print "\nProving domain property ...\n"
249     val dom_Brz_thm = EQT_ELIM (Count.apply dom_Brz_alt_conv
250                         ``dom_Brz_alt empty (singleton (normalize ^regexp_tm) ())``)
251     val _ = print "---> done.\n"
252
253     val _ = print "\nCompiling regexp ...\n"
254     val exec_Brz_conv = REPEATC (time (REWR_CONV exec_Brz_def THENC baseEval))
255     fun compile_regexp_conv regexp_tm =
256       let val th1 = baseEval ``normalize ^regexp_tm``
257           val nr = th1 |> concl |> rhs
258           val th2 = (PURE_REWRITE_CONV [Brzozowski_exec_Brz, MAXNUM_32_def]
259                      THENC exec_Brz_conv)
260                ``Brzozowski empty (singleton ^nr ()) (1,singleton ^nr 0,[])``
261           val _ = print "\nState transitions computed; now building DFA.\n\n"
262       in
263           compile_regexp_def
264             |> SPEC regexp_tm
265             |> PURE_ONCE_REWRITE_RULE [th1]
266             |> CONV_RULE (ONCE_DEPTH_CONV let_CONV)
267             |> PURE_ONCE_REWRITE_RULE [th2]
268             |> CONV_RULE (RHS_CONV baseEval)
269       end
270     val compile_thm = Count.apply compile_regexp_conv regexp_tm
271     val _ = print "---> done.\n"
272     val triple = rhs (concl compile_thm)
273     val [t1,t2,t3] = strip_pair triple
274     val start_state_thm = baseEval ``lookup regexp_compare (normalize ^regexp_tm) ^t1``
275     val hyps_thm = LIST_CONJ [compile_thm, start_state_thm,dom_Brz_thm]
276     val thm = SIMP_RULE list_ss [fromList_Vector,ORD_BOUND,alphabet_size_def]
277                       (SPEC regexp_tm Brzozowski_partial_eval_conseq)
278     val dfa_thm = MATCH_MP thm hyps_thm
279 in
280   (dfa_thm,hyps_thm)
281 end
282
283fun exec_dfa_compset() =
284 let open computeLib
285     val compset = listLib.list_compset()
286 in
287     optionLib.OPTION_rws compset
288   ; stringLib.add_string_compset compset
289   ; add_thms exec_dfa_thms compset
290   ; compset
291 end;
292
293val exec_dfa_conv = computeLib.CBV_CONV (exec_dfa_compset());
294
295val Brzozowski_partial_eval_alt =
296   SIMP_RULE bool_ss [dom_Brz_alt_equal] Brzozowski_partial_eval;
297
298
299fun gen_hol_dfa r =
300 let open listSyntax regexpSyntax
301     val _ = stdErr_print "Compiling regexp to DFA by deduction (can be slow) :\n"
302     val (dfa_thm, hyps_thm) = gen_dfa_conv r
303     fun match_string_by_proof s =
304       let val stm = stringLib.fromMLstring s
305           val dfa_thm1 = SPEC stm dfa_thm
306           val _ = stdErr_print ("Running DFA on string: "^Lib.quote s^" ... ")
307           val dfa_exec_thm = exec_dfa_conv (lhs(concl dfa_thm1))
308           val verdict = Teq (rhs(concl dfa_exec_thm))
309           val _ = if verdict then stdErr_print "accepted.\n"
310                   else stdErr_print "rejected.\n"
311       in
312         verdict
313       end
314     val eq_tm = snd(strip_forall (concl dfa_thm))
315     val (_,[final,table,start,_]) = strip_comb(boolSyntax.lhs eq_tm)
316     val ifinal = List.map (aconv boolSyntax.T)
317                           (fst(listSyntax.dest_list (dest_vector final)))
318     val istart = numSyntax.int_of_term start
319     val rows1 = dest_vector table
320     fun dest_row row =
321        let val opts = fst (listSyntax.dest_list row)
322        in List.map (numSyntax.int_of_term o optionSyntax.dest_some) opts
323        end
324     val rows2 = fst(listSyntax.dest_list(snd(listSyntax.dest_map rows1)))
325     val itable = List.map dest_row rows2
326     val len = length ifinal
327     val _ = stdErr_print (Int.toString len^" states.\n")
328 in
329     {certificate = SOME dfa_thm,
330      (* the aux field is used by the CakeML translator, see
331         <cakemldir>/examples/filterProgScript.sml *)
332      aux = SOME hyps_thm,
333      matchfn = match_string_by_proof,
334      table = Vector.fromList (map Vector.fromList itable),
335      start = istart,
336      final = Vector.fromList ifinal}
337 end;
338
339datatype evaluator = HOL | SML ;
340
341fun gen_dfa HOL = gen_hol_dfa
342  | gen_dfa SML = gen_sml_dfa
343
344fun dfa_by_proof (name,r) =
345 let val name = if Lexis.ok_identifier name then name
346               else (HOL_MESG (Lib.quote name^
347                       " is not a suitable identifier, using \"foo\" instead");
348                     "foo")
349     val {certificate,aux,start,table,final,matchfn} = gen_hol_dfa r
350     val SOME thm = certificate
351     val eqn = snd(dest_forall(concl thm))
352     val (exec_dfa,[finals,table,start,s]) = strip_comb(lhs eqn)
353     val finals_var = mk_var(name^"_finals",type_of finals)
354     val table_var  = mk_var(name^"_table",type_of table)
355     val start_var  = mk_var(name^"_start",type_of start)
356     val finals_def = Define `^finals_var = ^finals`
357     val table_def  = Define `^table_var = ^table`
358     val start_def  = Define `^start_var = ^start`
359     val thm' = CONV_RULE (BINDER_CONV
360                  (LHS_CONV (REWRITE_CONV [GSYM finals_def, GSYM table_def, GSYM start_def])))
361                  thm
362     val thm'' = LIST_CONJ [thm',table_def, finals_def, start_def]
363 in
364   save_thm(name^"_regexp_compilation",thm'')
365 end;
366
367(*---------------------------------------------------------------------------*)
368(* Reasoner for character sets. charset_conv converts terms of the form      *)
369(*                                                                           *)
370(*   regexp_lang (Chset cs)                                                  *)
371(*                                                                           *)
372(* into theorems of the form                                                 *)
373(*                                                                           *)
374(*   |- regexp_lang (Chset cs) = {#"c1"; ...; #"cn"}                         *)
375(*                                                                           *)
376(* where c1 ... cn are the elements of cs.                                   *)
377(*---------------------------------------------------------------------------*)
378
379fun charset_term_elts (cs:term) =
380  Regexp_Type.charset_elts (regexpSyntax.term_to_charset cs);
381
382val csvar = mk_var("cs",regexpSyntax.charset_ty);
383val regexp_chset_pat = ``regexp$regexp_lang ^(regexpSyntax.mk_chset csvar)``;
384
385fun char_tac (asl,c) =
386    let val ctm = fst(dest_eq (last (strip_conj (snd (dest_exists c)))))
387    in Q.EXISTS_TAC `ORD ^ctm` >> EVAL_TAC
388    end
389
390val tactic =
391   RW_TAC (list_ss ++ pred_setLib.PRED_SET_ss)
392          [pred_setTheory.EXTENSION,
393           regexpTheory.regexp_lang_def,
394           charsetTheory.charset_mem_def,
395           charsetTheory.alphabet_size_def,EQ_IMP_THM]
396    >> TRY (ntac 2 (pop_assum mp_tac)
397            >> Q.ID_SPEC_TAC `c`
398            >> REPEAT (CONV_TAC (numLib.BOUNDED_FORALL_CONV EVAL))
399            >> rw_tac bool_ss []
400            >> NO_TAC)
401    >> W char_tac;
402
403fun charset_conv tm =
404 case total (match_term regexp_chset_pat) tm
405  of NONE => raise ERR "charset_conv"
406                    "expected ``regexp_lang (Chset cs)`` term"
407  | SOME (theta, _) =>
408     let open pred_setSyntax
409         val chars = charset_term_elts (subst theta csvar)
410         val char_tms = map fromMLchar chars
411         val string_tms = map (fromMLstring o String.str) chars
412         val the_goal = mk_eq(tm, mk_set string_tms)
413  in
414     prove(the_goal,tactic)
415  end
416
417val charset_conv_ss =
418  simpLib.std_conv_ss
419    {name="charset_conv",
420     conv = charset_conv,
421     pats = [regexp_chset_pat]}
422
423
424(*---------------------------------------------------------------------------*)
425(* Set up default generator for interval regexps                             *)
426(*---------------------------------------------------------------------------*)
427
428val _ =
429 let open Regexp_Numerics
430     fun iFn p =
431        twos_comp_interval LSB (interval_width Twos_comp p) p
432 in
433    Regexp_Type.set_intervalFn iFn
434 end
435
436end (* regexpLib *)
437