1structure regexpLib :> regexpLib = 2struct 3 4open HolKernel boolLib bossLib; 5open pairLib optionLib pred_setLib listLib stringLib wordsLib; 6open listTheory stringTheory arithmeticTheory pred_setTheory 7 sortingTheory mergesortTheory comparisonTheory balanced_mapTheory vec_mapTheory 8 charsetTheory regexpTheory regexp_compilerTheory regexp_parserTheory; 9 10open Regexp_Type regexpSyntax regexpMisc; 11 12local open Regexp_Numerics DFA_Codegen in end; 13 14val ERR = mk_HOL_ERR "regexpLib"; 15 16fun gen_sml_dfa r = 17 let val {matchfn,start,table,final} = Regexp_Match.vector_matcher r 18 val _ = stdErr_print (Int.toString(Vector.length final)^" states.\n") 19 in {certificate = NONE:thm option, 20 aux = NONE, 21 matchfn = matchfn, 22 table = table, 23 start = start, 24 final = final} 25 end; 26 27val numeral_cmp_thm = Q.prove 28(`(num_cmp (NUMERAL x) (NUMERAL y) = num_cmp x y) /\ 29 (num_cmp (NUMERAL x) y = num_cmp x y) /\ 30 (num_cmp x (NUMERAL y) = num_cmp x y) /\ 31 (num_cmp 0 n = num_cmp ZERO n) /\ 32 (num_cmp n 0 = num_cmp n ZERO) /\ 33 (num_cmp ZERO ZERO = Equal) /\ 34 (num_cmp ZERO (BIT1 y) = Less) /\ 35 (num_cmp ZERO (BIT2 y) = Less) /\ 36 (num_cmp (BIT1 x) ZERO = Greater) /\ 37 (num_cmp (BIT2 x) ZERO = Greater) /\ 38 (num_cmp (BIT1 x) (BIT1 y) = num_cmp x y) /\ 39 (num_cmp (BIT2 x) (BIT2 y) = num_cmp x y) /\ 40 (num_cmp (BIT1 x) (BIT2 y) = case num_cmp x y of Greater => Greater | _ => Less) /\ 41 (num_cmp (BIT2 x) (BIT1 y) = case num_cmp x y of Less => Less | _ => Greater)`, 42 METIS_TAC [arithmeticTheory.NUMERAL_DEF,comparisonTheory.num_cmp_numOrd, 43 totoTheory.numeralOrd,arithmeticTheory.ALT_ZERO]); 44 45val vector_defs = 46 let (* open ml_translatorTheory *) 47 in [fromList_def, sub_def, length_def] 48 end; 49 50 51val exec_dfa_thms = exec_dfa_def :: vector_defs; 52 53val base_compute_thms = 54 vector_defs 55 @ 56 [ALPHABET_def, alphabet_size_def, And_def, 57 charset_empty_def, charset_full_def, charset_sing_def, 58 merge_charsets_def, charset_cmp_def, numeral_cmp_thm, len_cmp_def, 59 regexp_compare_def, regexp_leq_def, regexp_compareW_compute, 60 61 build_char_set_def, DOT_def, Empty_def, Epsilon_def, catstring_def, 62 assoc_cat_def, build_cat_def, build_neg_def, build_star_def, 63 flatten_or_def, remove_dups_def, build_or_def, 64 nullable_def, nullableW_def, smart_deriv_thm, normalize_def, 65 build_table_def, extend_states_def, 66 insert_regexp_def, mem_regexp_def, relationTheory.inv_image_def, 67 68 get_accepts_def, numLib.SUC_RULE (vec_mapTheory.alist_to_vec_def), 69 accepts_to_vector_def, table_to_vectors_def, 70 71 mergesort_def, mergesortN_def,sort2_def,sort3_def,merge_def, 72 balanced_mapTheory.null_def, 73 balanced_mapTheory.empty_def, 74 balanced_mapTheory.member_def, 75 balanced_mapTheory.insert_def, 76 balanced_mapTheory.lookup_def, 77 balanced_mapTheory.singleton_def, 78 balanced_mapTheory.balanceL_def, 79 balanced_mapTheory.balanceR_def, 80 balanced_mapTheory.ratio_def, 81 balanced_mapTheory.size_def, 82 balanced_mapTheory.delta_def, 83 balanced_mapTheory.foldrWithKey_def, 84 balanced_mapTheory.deleteFindMin_def, 85 balanced_mapTheory.fromList_def, 86 87 (* adding stuff revealed by computeLib.unmapped *) 88 combinTheory.o_DEF, combinTheory.I_THM, combinTheory.C_DEF, 89 combinTheory.FAIL_DEF, rich_listTheory.SPLITP_AUX_def, 90 rich_listTheory.SPLITP_compute,rich_listTheory.SEG_compute 91 ]; 92 93fun chsets_of regexp = 94 let open HOLset 95 fun trav (x as Chset _) set = add(set,x) 96 | trav (Star r) set = trav r set 97 | trav (Neg r) set = trav r set 98 | trav (Cat(r,s)) set = trav r (trav s set) 99 | trav (Or rlist) set = itlist trav rlist set 100 in 101 listItems (trav regexp (empty regexp_compare)) 102 end 103 104val alphabet_tms = map numSyntax.term_of_int (upto 0 255) 105 106val charset_mem_tm = prim_mk_const{Name="charset_mem",Thy="charset"}; 107val smart_deriv_tm = prim_mk_const{Name="smart_deriv",Thy="regexp"}; 108val build_or_tm = prim_mk_const{Name="build_or", Thy="regexp"}; 109val transitions_tm = prim_mk_const{Name="transitions",Thy="regexp_compiler"}; 110 111val Empty_charset = 112 Empty_def |> SIMP_RULE std_ss [charset_empty_def] |> concl |> rhs |> rand 113 114val DOT_charset = DOT_def |> concl |> rhs |> EVAL |> concl |> rhs |> rand 115 116val charset_mem_conv = 117 let val cs_memEval = 118 let open computeLib 119 val compset = listLib.list_compset() 120 in 121 wordsLib.add_words_compset true compset 122 ; add_datatype_info compset (valOf(TypeBase.fetch ``:charset``)) 123 ; add_thms [alphabet_size_def, words4_bit_def, charset_mem_def] compset 124 ; computeLib.CBV_CONV compset 125 end 126 val pairs = cross alphabet_tms [Empty_charset, DOT_charset] 127 val probs = map (fn (x,y) => list_mk_comb(charset_mem_tm, [x,y])) pairs 128 val table = List.map (fn x => (x,cs_memEval x)) probs 129 val init_Map = Redblackmap.fromList Term.compare table 130 in fn () => 131 let val theMap = ref init_Map 132 in fn tm => 133 Redblackmap.find (!theMap,tm) 134 handle NotFound => 135 let val _ = print "previously unseen charset ... " 136 val cset_tm = rand tm 137 val pairs = cross alphabet_tms [cset_tm] 138 val probs = map (fn (x,y) => list_mk_comb(charset_mem_tm, [x,y])) pairs 139 val table = List.map (fn x => (x,cs_memEval x)) probs 140 val _ = (theMap := Redblackmap.insertList (!theMap, table)) 141 val _ = print "tabulated.\n" 142 in 143 Redblackmap.find (!theMap,tm) 144 end 145 end 146 end; 147 148val charset_union_conv = 149 let val charset_unionEval = 150 let open computeLib 151 val compset = listLib.list_compset() 152 in wordsLib.add_words_compset true compset 153 ; add_datatype_info compset (valOf(TypeBase.fetch ``:charset``)) 154 ; add_thms [charset_union_def, charset_empty_def] compset 155 ; CBV_CONV compset 156 end 157 in 158 fn () => 159 let val theMap = ref (Redblackmap.mkDict Term.compare) 160 in fn tm => 161 Redblackmap.find (!theMap,tm) 162 handle NotFound => 163 let val thm = charset_unionEval tm 164 val _ = (theMap := Redblackmap.insert(!theMap, tm,thm)) 165 in 166 thm 167 end 168 end 169end; 170 171 172(*---------------------------------------------------------------------------*) 173(* Assumes that input tm has form "transitions r" *) 174(*---------------------------------------------------------------------------*) 175 176fun transitions_conv compset = 177 let open regexpSyntax listSyntax 178 val smart_deriv_or = el 5 (CONJUNCTS smart_deriv_thm) 179 val sdi_tms = map (curry mk_comb smart_deriv_tm) alphabet_tms 180 val RW_MAP = PURE_REWRITE_CONV [listTheory.MAP] 181 val RW_SD_OR = PURE_REWRITE_RULE [GSYM smart_deriv_or] 182 val transitions_CONV1 = 183 PURE_REWRITE_CONV [transitions_def,listTheory.MAP,ALPHABET_def] 184 THENC DEPTH_CONV BETA_CONV 185 val Eval = computeLib.CBV_CONV compset 186 val theMap = ref (Redblackmap.mkDict Term.compare) 187 in 188 fn tm => 189 Redblackmap.find (!theMap,tm) 190 handle NotFound => 191 let val r = rand tm 192 in if not (is_or r) then 193 let val thm = (REWR_CONV transitions_def THENC Eval) tm 194 val _ = (theMap := Redblackmap.insert(!theMap,tm,thm)) 195 in thm 196 end 197 else 198 let val rlist = dest_or r 199 fun sdtms r = map (C (curry mk_comb) r) sdi_tms 200 val sdlists = map sdtms rlist 201 val thmlists = map (map Eval) sdlists 202 val orlists = transpose thmlists 203 fun map_sd n = ``MAP (smart_deriv ^n) ^(rand r)`` 204 val map_tms = map map_sd alphabet_tms 205 val map_thms = map RW_MAP map_tms 206 val map_thms' = map2 PURE_REWRITE_RULE orlists map_thms 207 val map_thms'' = map (AP_TERM build_or_tm) map_thms' 208 val build_or_thms = map (CONV_RULE (RHS_CONV Eval)) map_thms'' 209 val smart_deriv_thms = map RW_SD_OR build_or_thms 210 val thm = 211 (transitions_CONV1 THENC PURE_REWRITE_CONV smart_deriv_thms) tm 212 in 213 theMap := Redblackmap.insert(!theMap,tm,thm) 214 ; thm 215 end 216 end 217 end; 218 219(*---------------------------------------------------------------------------*) 220(* The base compset *) 221(*---------------------------------------------------------------------------*) 222 223fun base_compset() = 224 let open computeLib 225 val compset = listLib.list_compset() 226 in 227 optionLib.OPTION_rws compset 228 ; pairLib.add_pair_compset compset 229 ; pred_setLib.add_pred_set_compset compset 230 ; wordsLib.add_words_compset true compset 231 ; stringLib.add_string_compset compset 232 ; add_datatype_info compset (valOf(TypeBase.fetch ``:ordering``)) 233 ; add_datatype_info compset (valOf(TypeBase.fetch ``:('a,'b)balanced_map``)) 234 ; add_datatype_info compset (valOf(TypeBase.fetch ``:charset``)) 235 ; add_datatype_info compset (valOf(TypeBase.fetch ``:regexp``)) 236 ; add_thms base_compute_thms compset 237 ; add_conv(``charset_mem``, 2, charset_mem_conv ()) compset 238 ; add_conv(``charset_union``, 2, charset_union_conv()) compset 239 ; add_conv(``transitions``, 1, transitions_conv compset) compset 240 ; compset 241 end; 242 243fun gen_dfa_conv r = 244 let val regexp_tm = regexp_to_term r 245 val compset = base_compset() 246 val baseEval = computeLib.CBV_CONV compset 247 val dom_Brz_alt_conv = REPEATC (time (REWR_CONV dom_Brz_alt_eqns THENC baseEval)) 248 val _ = print "\nProving domain property ...\n" 249 val dom_Brz_thm = EQT_ELIM (Count.apply dom_Brz_alt_conv 250 ``dom_Brz_alt empty (singleton (normalize ^regexp_tm) ())``) 251 val _ = print "---> done.\n" 252 253 val _ = print "\nCompiling regexp ...\n" 254 val exec_Brz_conv = REPEATC (time (REWR_CONV exec_Brz_def THENC baseEval)) 255 fun compile_regexp_conv regexp_tm = 256 let val th1 = baseEval ``normalize ^regexp_tm`` 257 val nr = th1 |> concl |> rhs 258 val th2 = (PURE_REWRITE_CONV [Brzozowski_exec_Brz, MAXNUM_32_def] 259 THENC exec_Brz_conv) 260 ``Brzozowski empty (singleton ^nr ()) (1,singleton ^nr 0,[])`` 261 val _ = print "\nState transitions computed; now building DFA.\n\n" 262 in 263 compile_regexp_def 264 |> SPEC regexp_tm 265 |> PURE_ONCE_REWRITE_RULE [th1] 266 |> CONV_RULE (ONCE_DEPTH_CONV let_CONV) 267 |> PURE_ONCE_REWRITE_RULE [th2] 268 |> CONV_RULE (RHS_CONV baseEval) 269 end 270 val compile_thm = Count.apply compile_regexp_conv regexp_tm 271 val _ = print "---> done.\n" 272 val triple = rhs (concl compile_thm) 273 val [t1,t2,t3] = strip_pair triple 274 val start_state_thm = baseEval ``lookup regexp_compare (normalize ^regexp_tm) ^t1`` 275 val hyps_thm = LIST_CONJ [compile_thm, start_state_thm,dom_Brz_thm] 276 val thm = SIMP_RULE list_ss [fromList_Vector,ORD_BOUND,alphabet_size_def] 277 (SPEC regexp_tm Brzozowski_partial_eval_conseq) 278 val dfa_thm = MATCH_MP thm hyps_thm 279 in 280 (dfa_thm,hyps_thm) 281 end 282 283fun exec_dfa_compset() = 284 let open computeLib 285 val compset = listLib.list_compset() 286 in 287 optionLib.OPTION_rws compset 288 ; stringLib.add_string_compset compset 289 ; add_thms exec_dfa_thms compset 290 ; compset 291 end; 292 293val exec_dfa_conv = computeLib.CBV_CONV (exec_dfa_compset()); 294 295val Brzozowski_partial_eval_alt = 296 SIMP_RULE bool_ss [dom_Brz_alt_equal] Brzozowski_partial_eval; 297 298 299fun gen_hol_dfa r = 300 let open listSyntax regexpSyntax 301 val _ = stdErr_print "Compiling regexp to DFA by deduction (can be slow) :\n" 302 val (dfa_thm, hyps_thm) = gen_dfa_conv r 303 fun match_string_by_proof s = 304 let val stm = stringLib.fromMLstring s 305 val dfa_thm1 = SPEC stm dfa_thm 306 val _ = stdErr_print ("Running DFA on string: "^Lib.quote s^" ... ") 307 val dfa_exec_thm = exec_dfa_conv (lhs(concl dfa_thm1)) 308 val verdict = Teq (rhs(concl dfa_exec_thm)) 309 val _ = if verdict then stdErr_print "accepted.\n" 310 else stdErr_print "rejected.\n" 311 in 312 verdict 313 end 314 val eq_tm = snd(strip_forall (concl dfa_thm)) 315 val (_,[final,table,start,_]) = strip_comb(boolSyntax.lhs eq_tm) 316 val ifinal = List.map (aconv boolSyntax.T) 317 (fst(listSyntax.dest_list (dest_vector final))) 318 val istart = numSyntax.int_of_term start 319 val rows1 = dest_vector table 320 fun dest_row row = 321 let val opts = fst (listSyntax.dest_list row) 322 in List.map (numSyntax.int_of_term o optionSyntax.dest_some) opts 323 end 324 val rows2 = fst(listSyntax.dest_list(snd(listSyntax.dest_map rows1))) 325 val itable = List.map dest_row rows2 326 val len = length ifinal 327 val _ = stdErr_print (Int.toString len^" states.\n") 328 in 329 {certificate = SOME dfa_thm, 330 (* the aux field is used by the CakeML translator, see 331 <cakemldir>/examples/filterProgScript.sml *) 332 aux = SOME hyps_thm, 333 matchfn = match_string_by_proof, 334 table = Vector.fromList (map Vector.fromList itable), 335 start = istart, 336 final = Vector.fromList ifinal} 337 end; 338 339datatype evaluator = HOL | SML ; 340 341fun gen_dfa HOL = gen_hol_dfa 342 | gen_dfa SML = gen_sml_dfa 343 344fun dfa_by_proof (name,r) = 345 let val name = if Lexis.ok_identifier name then name 346 else (HOL_MESG (Lib.quote name^ 347 " is not a suitable identifier, using \"foo\" instead"); 348 "foo") 349 val {certificate,aux,start,table,final,matchfn} = gen_hol_dfa r 350 val SOME thm = certificate 351 val eqn = snd(dest_forall(concl thm)) 352 val (exec_dfa,[finals,table,start,s]) = strip_comb(lhs eqn) 353 val finals_var = mk_var(name^"_finals",type_of finals) 354 val table_var = mk_var(name^"_table",type_of table) 355 val start_var = mk_var(name^"_start",type_of start) 356 val finals_def = Define `^finals_var = ^finals` 357 val table_def = Define `^table_var = ^table` 358 val start_def = Define `^start_var = ^start` 359 val thm' = CONV_RULE (BINDER_CONV 360 (LHS_CONV (REWRITE_CONV [GSYM finals_def, GSYM table_def, GSYM start_def]))) 361 thm 362 val thm'' = LIST_CONJ [thm',table_def, finals_def, start_def] 363 in 364 save_thm(name^"_regexp_compilation",thm'') 365 end; 366 367(*---------------------------------------------------------------------------*) 368(* Reasoner for character sets. charset_conv converts terms of the form *) 369(* *) 370(* regexp_lang (Chset cs) *) 371(* *) 372(* into theorems of the form *) 373(* *) 374(* |- regexp_lang (Chset cs) = {#"c1"; ...; #"cn"} *) 375(* *) 376(* where c1 ... cn are the elements of cs. *) 377(*---------------------------------------------------------------------------*) 378 379fun charset_term_elts (cs:term) = 380 Regexp_Type.charset_elts (regexpSyntax.term_to_charset cs); 381 382val csvar = mk_var("cs",regexpSyntax.charset_ty); 383val regexp_chset_pat = ``regexp$regexp_lang ^(regexpSyntax.mk_chset csvar)``; 384 385fun char_tac (asl,c) = 386 let val ctm = fst(dest_eq (last (strip_conj (snd (dest_exists c))))) 387 in Q.EXISTS_TAC `ORD ^ctm` >> EVAL_TAC 388 end 389 390val tactic = 391 RW_TAC (list_ss ++ pred_setLib.PRED_SET_ss) 392 [pred_setTheory.EXTENSION, 393 regexpTheory.regexp_lang_def, 394 charsetTheory.charset_mem_def, 395 charsetTheory.alphabet_size_def,EQ_IMP_THM] 396 >> TRY (ntac 2 (pop_assum mp_tac) 397 >> Q.ID_SPEC_TAC `c` 398 >> REPEAT (CONV_TAC (numLib.BOUNDED_FORALL_CONV EVAL)) 399 >> rw_tac bool_ss [] 400 >> NO_TAC) 401 >> W char_tac; 402 403fun charset_conv tm = 404 case total (match_term regexp_chset_pat) tm 405 of NONE => raise ERR "charset_conv" 406 "expected ``regexp_lang (Chset cs)`` term" 407 | SOME (theta, _) => 408 let open pred_setSyntax 409 val chars = charset_term_elts (subst theta csvar) 410 val char_tms = map fromMLchar chars 411 val string_tms = map (fromMLstring o String.str) chars 412 val the_goal = mk_eq(tm, mk_set string_tms) 413 in 414 prove(the_goal,tactic) 415 end 416 417val charset_conv_ss = 418 simpLib.std_conv_ss 419 {name="charset_conv", 420 conv = charset_conv, 421 pats = [regexp_chset_pat]} 422 423 424(*---------------------------------------------------------------------------*) 425(* Set up default generator for interval regexps *) 426(*---------------------------------------------------------------------------*) 427 428val _ = 429 let open Regexp_Numerics 430 fun iFn p = 431 twos_comp_interval LSB (interval_width Twos_comp p) p 432 in 433 Regexp_Type.set_intervalFn iFn 434 end 435 436end (* regexpLib *) 437