1open HolKernel boolLib bossLib Parse; 2 3open boolTheory; 4open pred_setTheory; 5open pairTheory; 6 7open arithmeticTheory; 8open numTheory; 9 10open prim_recTheory; 11 12open listTheory; 13open rich_listTheory; 14 15open stringTheory; 16 17open set_lemmasTheory; 18open helper_funcsTheory; 19open boyer_moore_specTheory; 20 21val _ = new_theory"boyer_moore"; 22 23(* -- IMPLICIT CHARACTER MISMATCH TABLE CONSTRUCTION -- *) 24(* Assess potential shift based on character mismatch rule *) 25val checkDeltaC_def = 26 Define 27 ` 28 checkDeltaC pat all_chars j a d = 29 ((d = j+1) \/ (EL (j-d) pat = EL a all_chars)) 30 `; 31 32(* Relationship between checkDeltaC function 33 and valid_cha_shift specification *) 34val CHECK_DELTAC_THM = store_thm( 35 "CHECK_DELTAC_THM", 36 ``!pat all_chars j a d . d IN valid_cha_shifts pat all_chars j a 37 <=> (1 <= d /\ d <= j+1 /\ checkDeltaC pat all_chars j a d)``, 38 rw[valid_cha_shifts_def,checkDeltaC_def] 39 >> Cases_on `d=j+1` 40 >- simp[] 41 >- (`(d <= j) <=> (d <= j + 1)` 42 suffices_by metis_tac[] 43 >> simp[]) 44 ); 45 46(* Check Delta C in terms of set comprehensions *) 47val CHECK_DELTAC_SET = store_thm( 48 "CHECK_DELTAC_SET", 49 ``valid_cha_shifts pat all_chars j a = {d | 1 <= d /\ d <= j+1 50 /\ checkDeltaC pat all_chars j a d}``, 51 rw [CHECK_DELTAC_THM, EXTENSION] 52 ); 53 54(* Find minimum valid character mismatch based shift *) 55val cmRecur_def = 56 tDefine "cmRecur" 57 ` 58 cmRecur pat all_chars j a d = 59 if (j+1 < d) then d 60 else if (checkDeltaC pat all_chars j a d) then d 61 else (cmRecur pat all_chars j a (d+1)) 62 ` 63 (WF_REL_TAC `measure (\(p, c, j, a, d). SUC j - d)` 64 >> fs[ADD1,checkDeltaC_def]); 65 66(* Intermediate lemmas to reason about recursive function bounds *) 67val CMRECUR_LEM = store_thm( 68 "CMRECUR_LEM", 69 ``!pat all_chars j a d. d <= cmRecur pat all_chars j a d``, 70 Induct_on `j + 2 - d` 71 >> rpt STRIP_TAC 72 >> ONCE_REWRITE_TAC[cmRecur_def] 73 >> fs[] 74 >> `v = j + 2 - (d+1)` by fs[ADD_CLAUSES] 75 >> `(d+1) <= cmRecur pat all_chars j a (d+1)` by metis_tac[] 76 >> Cases_on `checkDeltaC pat all_chars j a d` 77 >> fs[]); 78 79val CMRECUR_BND_THM = store_thm( 80 "CMRECUR_BND_THM", 81 ``!pat all_chars j a d . d IN valid_cha_shifts pat all_chars j a 82 ==> (!x. x <= d ==> cmRecur pat all_chars j a x <= d)``, 83 rpt STRIP_TAC 84 >> Induct_on `d - x` 85 >- (fs[] 86 >> rpt STRIP_TAC 87 >> `x=d` by rw[] 88 >> fs[valid_cha_shifts_def] 89 >> ONCE_REWRITE_TAC [cmRecur_def] 90 >> simp[checkDeltaC_def]) 91 >- (fs[] 92 >> rpt STRIP_TAC 93 >> `cmRecur pat all_chars j a (SUC x) <= d` by fs[ADD_CLAUSES] 94 >> `cmRecur pat all_chars j a x <= cmRecur pat all_chars j a (SUC x)` suffices_by fs[] 95 >> qabbrev_tac `L = cmRecur pat all_chars j a (SUC x)` 96 >> Cases_on `j + 1 < x` 97 >- (`cmRecur pat all_chars j a x = x` by (ONCE_REWRITE_TAC[cmRecur_def] >> fs[]) 98 >> simp[] 99 >> `(SUC x) <= cmRecur pat all_chars j a (SUC x)` suffices_by simp[Abbr `L`] 100 >> metis_tac[CMRECUR_LEM]) 101 >- (Cases_on `checkDeltaC pat all_chars j a x` 102 >> ONCE_REWRITE_TAC[cmRecur_def] 103 >> simp[ADD1,Abbr `L`] 104 >> `(x + 1) <= cmRecur pat all_chars j a (x + 1)` by metis_tac[CMRECUR_LEM] 105 >> fs[]) 106 ) 107 ); 108 109(* Confirming we only get valid values *) 110val CMRECUR_COR_THM = store_thm( 111 "CMRECUR_COR_THM", 112 ``! pat all_chars j a d . 1 <= cmRecur pat all_chars j a d 113 /\ cmRecur pat all_chars j a d <= j + 1 114 ==> cmRecur pat all_chars j a d 115 IN valid_cha_shifts pat all_chars j a``, 116 rpt STRIP_TAC 117 >> `d <= j + 1` 118 by (CCONTR_TAC 119 >> `cmRecur pat all_chars j a d > j + 1` suffices_by simp[] 120 >> ONCE_REWRITE_TAC [cmRecur_def] 121 >> fs[]) 122 >> Induct_on `j + 1 - d` 123 >- (rpt STRIP_TAC 124 >> ONCE_REWRITE_TAC[cmRecur_def] 125 >> fs[checkDeltaC_def,valid_cha_shifts_def]) 126 >- (rpt STRIP_TAC 127 >> `v = j + 1 - SUC d` by fs[] 128 >> `SUC d <= j + 1` by fs[] 129 >> `SUC d <= cmRecur pat all_chars j a (SUC d)` 130 by metis_tac[CMRECUR_LEM,LESS_EQ_TRANS] 131 >> `cmRecur pat all_chars j a (SUC d) <= j + 1` 132 by (`j + 1 IN valid_cha_shifts pat all_chars j a` by fs[valid_cha_shifts_def] 133 >> metis_tac[CMRECUR_BND_THM]) 134 >> `cmRecur pat all_chars j a (SUC d) IN valid_cha_shifts pat all_chars j a` 135 by (first_x_assum (qspec_then `j` (qspec_then `SUC d` mp_tac)) >> fs[]) 136 >> ONCE_REWRITE_TAC[cmRecur_def] 137 >> Cases_on `j + 1 < d` 138 >> fs[] 139 >> `checkDeltaC pat all_chars j a d ==> d IN valid_cha_shifts pat all_chars j a` 140 suffices_by (Cases_on `checkDeltaC pat all_chars j a d` >> fs[ADD1]) 141 >> `checkDeltaC pat all_chars j a d ==> 1 <= d` 142 suffices_by simp[checkDeltaC_def, valid_cha_shifts_def] 143 >> STRIP_TAC 144 >> `cmRecur pat all_chars j a d = d` 145 by (ONCE_REWRITE_TAC[cmRecur_def] >> fs[]) 146 >> fs[]) 147 ); 148 149(* initiate recursion *) 150val cmVal_def = 151 Define 152 ` 153 cmVal pat all_chars j a = cmRecur pat all_chars j a 1 154 `; 155 156(* Relationship between cmVal function and valid_cha_shift specification *) 157val CMVAL_THM = store_thm( 158 "CMVAL_THM", 159 ``cmVal pat all_chars j a = MIN_SET (valid_cha_shifts pat all_chars j a)``, 160 simp[cmVal_def] 161 >> `! d. d IN valid_cha_shifts pat all_chars j a ==> cmRecur pat all_chars j a 1 <= d` 162 by (rpt STRIP_TAC 163 >> `1 <= d` by fs[valid_cha_shifts_def] 164 >> simp[CMRECUR_BND_THM]) 165 >> `cmRecur pat all_chars j a 1 IN valid_cha_shifts pat all_chars j a` 166 by (`1 <= cmRecur pat all_chars j a 1` by simp[CMRECUR_LEM] 167 >> `cmRecur pat all_chars j a 1 <= j + 1` by fs[valid_cha_shifts_def] 168 >> simp[CMRECUR_COR_THM]) 169 >> fs[LOWER_BOUND_IS_MIN_SET] 170 ); 171 172(* Proving bounds on cmVal function with valid input *) 173val CMVAL_BND = store_thm( 174 "CMVAL_BND", 175 ``!j a. j < LENGTH pat ��� a < LENGTH all_chars 176 ==> 0 < cmVal pat all_char j a 177 /\ cmVal pat all_char j a <= LENGTH pat``, 178 simp[CMVAL_THM] 179 >> REPEAT strip_tac 180 >- (`0 <> MIN_SET (valid_cha_shifts pat all_char j a)` 181 suffices_by rw[] 182 >> `~(0 IN valid_cha_shifts pat all_char j a)` 183 suffices_by metis_tac[CHA_SHIFT_EXISTS_THM,MIN_SET_LEM] 184 >> simp[valid_cha_shifts_def]) 185 >- (`?x. (x IN valid_cha_shifts pat all_char j a) /\ (x <= LENGTH pat)` 186 suffices_by metis_tac[MIN_SET_LEM, LESS_EQ_TRANS, MEMBER_NOT_EMPTY] 187 >> simp[valid_cha_shifts_def] 188 >> qexists_tac `j+1` 189 >> simp[]) 190 ); 191 192(* -- IMPLICIT SUFFIX MATCH TABLE CONSTRUCTION -- *) 193(* Assess potential shift based on suffix mismatch rule *) 194val checkDeltaS_def = 195 Define 196 ` 197 checkDeltaS pat j d <=> 198 ((d >= SUC j) \/ ~(EL (j-d) pat = EL j pat)) /\ 199 (extract (MAX (SUC j) d,LENGTH pat) pat 200 = extract ((MAX (SUC j) d) - d,LENGTH pat - d) pat) 201 `; 202 203(* Relationship between checkDeltaS function 204 and valid_suf_shift specification *) 205val CHECK_DELTAS_THM = store_thm( 206 "CHECK_DELTAS_THM", 207 ``! pat j d . d IN valid_suf_shifts pat j 208 <=> 1 <= d /\ d <= LENGTH pat /\ checkDeltaS pat j d``, 209 ONCE_REWRITE_TAC [EQ_SYM_EQ] 210 >> simp[valid_suf_shifts_def,checkDeltaS_def,EXTRACT_THM,LIST_EQ_REWRITE] 211 >> rw[EQ_IMP_THM] 212 >> simp[] 213 >- (`MAX (SUC j) d = d` 214 by simp[MAX_DEF] 215 >> fs[] 216 >> first_x_assum(qspec_then `i-d` mp_tac) 217 >> simp[]) 218 >- (qabbrev_tac `M = MAX (SUC j) d` 219 >> `d <= M` 220 by rw[MAX_DEF,Abbr `M`] 221 >> Q.UNDISCH_THEN `d<= LENGTH pat` assume_tac 222 >> fs[] 223 >> first_x_assum(qspec_then `i-M` mp_tac) 224 >> simp[] 225 >> `M <= i` 226 by simp[MAX_DEF, Abbr `M`] 227 >> simp[]) 228 >- (simp[MAX_DEF]) 229 >- (qabbrev_tac `M = MAX (SUC j) d` 230 >> `d <= M` 231 by rw[MAX_DEF,Abbr `M`] 232 >> simp[]) 233 >- (simp[MAX_DEF]) 234 >- (qabbrev_tac `M = MAX (SUC j) d` 235 >> `d <= M` 236 by rw[MAX_DEF,Abbr `M`] 237 >> simp[] 238 >> `(j + 1) <= (M + x)` 239 by rw[MAX_DEF,Abbr `M`] 240 >> `d <= (M + x)` 241 by rw[MAX_DEF,Abbr `M`] 242 >> first_x_assum(qspec_then `M + x` mp_tac) 243 >> simp[]) 244 ); 245 246(* Check Delta S in terms of set comprehensions *) 247val CHECK_DELTAS_SET = store_thm( 248 "CHECK_DELTAS_SET", 249 ``valid_suf_shifts pat j = { d | 1 <= d 250 /\ d <= LENGTH pat 251 /\ checkDeltaS pat j d}``, 252 rw [CHECK_DELTAS_THM, EXTENSION] 253 ); 254 255val smRecur_def = 256 tDefine "smRecur" 257 ` 258 smRecur pat j d = 259 if (LENGTH pat < d) then d 260 else if (checkDeltaS pat j d) then d 261 else (smRecur pat j (d+1)) 262 ` 263 (WF_REL_TAC `measure (\(p, j, d). SUC (LENGTH p) - d)`); 264 265(* Find minimum valid suffix mismatch based shift *) 266val smVal_def = 267 Define 268 ` 269 smVal pat j = smRecur pat j 1 270 `; 271 272(* Intermediate lemmas to reason about recursive function bounds *) 273val SMRECUR_LEM = store_thm( 274 "SMRECUR_LEM", 275 ``!pat j d. d <= smRecur pat j d``, 276 Induct_on `LENGTH pat + 1 - d` 277 >> rpt STRIP_TAC 278 >> ONCE_REWRITE_TAC[smRecur_def] 279 >> fs[] 280 >> `v = LENGTH pat + 1 - (d+1)` by fs[ADD_CLAUSES] 281 >> `(d+1) <= smRecur pat j (d+1)` by metis_tac[] 282 >> Cases_on `checkDeltaS pat j d` 283 >> fs[]); 284 285val SMRECUR_BND_THM = store_thm( 286 "SMRECUR_BND_THM", 287 ``!pat j d . d IN valid_suf_shifts pat j 288 ==> (!x. x <= d ==> smRecur pat j x <= d)``, 289 rpt STRIP_TAC 290 >> Induct_on `d - x` 291 >- (fs[] 292 >> rpt STRIP_TAC 293 >> `x=d` by rw[] 294 >> ONCE_REWRITE_TAC [smRecur_def] 295 >> fs[CHECK_DELTAS_SET]) 296 >- (fs[] 297 >> rpt STRIP_TAC 298 >> `smRecur pat j (SUC x) <= d` by fs[ADD_CLAUSES] 299 >> `smRecur pat j x <= smRecur pat j (SUC x)` suffices_by fs[] 300 >> qabbrev_tac `L = smRecur pat j (SUC x)` 301 >> Cases_on `LENGTH pat < x` 302 >- (`smRecur pat j x = x` by (ONCE_REWRITE_TAC[smRecur_def] >> fs[]) 303 >> simp[] 304 >> `(SUC x) <= smRecur pat j (SUC x)` suffices_by simp[Abbr `L`] 305 >> metis_tac[SMRECUR_LEM]) 306 >- (Cases_on `checkDeltaS pat j x` 307 >> ONCE_REWRITE_TAC[smRecur_def] 308 >> simp[ADD1,Abbr `L`] 309 >> `(x + 1) <= smRecur pat j (x + 1)` by metis_tac[SMRECUR_LEM] 310 >> fs[]) 311 ) 312 ); 313 314(* Confirming we only get valid values *) 315val SMRECUR_COR_THM = store_thm( 316 "SMRECUR_COR_THM", 317 ``! pat all_chars j a d . pat <> [] 318 /\ j < LENGTH pat 319 /\ 1 <= smRecur pat j d 320 /\ smRecur pat j d <= LENGTH pat 321 ==> smRecur pat j d 322 IN valid_suf_shifts pat j``, 323 rpt STRIP_TAC 324 >> `d <= LENGTH pat` 325 by (CCONTR_TAC 326 >> `d <= smRecur pat j d` by metis_tac[SMRECUR_LEM] 327 >> fs[]) 328 >> Induct_on `LENGTH pat - d` 329 >- (rpt STRIP_TAC 330 >> `d = LENGTH pat` by simp[] 331 >> `LENGTH pat = smRecur pat j d` by fs[SMRECUR_LEM, LESS_EQUAL_ANTISYM] 332 >> `LENGTH pat IN valid_suf_shifts pat j` suffices_by fs[] 333 >> `checkDeltaS pat j (LENGTH pat)` 334 by (qpat_x_assum `LENGTH pat = smRecur pat j d` mp_tac 335 >> ONCE_REWRITE_TAC[smRecur_def] 336 >> simp[] 337 >> rpt STRIP_TAC 338 >> CCONTR_TAC 339 >> fs[] 340 >> `LENGTH pat + 1 <= smRecur pat j (LENGTH pat + 1)` by simp[SMRECUR_LEM] 341 >> `LENGTH pat <> smRecur pat j (LENGHT pat + 1)` by simp[] 342 >> fs[]) 343 >> ONCE_REWRITE_TAC[CHECK_DELTAS_THM] 344 >> fs[]) 345 >- (rpt STRIP_TAC 346 >> `v = LENGTH pat - SUC d` by fs[] 347 >> `SUC d <= LENGTH pat` by fs[] 348 >> `SUC d <= smRecur pat j (SUC d)` 349 by metis_tac[SMRECUR_LEM,LESS_EQ_TRANS] 350 >> `smRecur pat j (SUC d) <= LENGTH pat` 351 by (`LENGTH pat IN valid_suf_shifts pat j` 352 by rw[valid_suf_shifts_def] 353 >> metis_tac[SMRECUR_BND_THM]) 354 >> `smRecur pat j (SUC d) IN valid_suf_shifts pat j` 355 by (first_x_assum (qspec_then `pat` (qspec_then `SUC d` mp_tac)) >> fs[]) 356 >> ONCE_REWRITE_TAC[smRecur_def] 357 >> Cases_on `LENGTH pat < d` 358 >> fs[] 359 >> `checkDeltaS pat j d ==> d IN valid_suf_shifts pat j` 360 suffices_by (Cases_on `checkDeltaS pat j d` >> fs[ADD1]) 361 >> `checkDeltaS pat j d ==> 1 <= d` 362 suffices_by (ONCE_REWRITE_TAC[CHECK_DELTAS_THM] >> simp[]) 363 >> STRIP_TAC 364 >> `smRecur pat j d = d` 365 by (ONCE_REWRITE_TAC[smRecur_def] >> fs[]) 366 >> fs[]) 367 ); 368 369(* Relationship between smVal function and valid_suf_shift specification *) 370val SMVAL_THM = store_thm( 371 "SMVAL_THM", 372 ``(pat <> []) /\ (j < LENGTH pat) 373 ==> (smVal pat j = MIN_SET (valid_suf_shifts pat j))``, 374 STRIP_TAC 375 >> simp[smVal_def] 376 >> `! d. d IN valid_suf_shifts pat j ==> smRecur pat j 1 <= d` 377 by (rpt STRIP_TAC 378 >> `1 <= d` 379 by (first_assum mp_tac 380 >> ONCE_REWRITE_TAC[CHECK_DELTAS_THM] 381 >> simp[]) 382 >> simp[SMRECUR_BND_THM]) 383 >> `smRecur pat j 1 IN valid_suf_shifts pat j` 384 by (`1 <= smRecur pat j 1` by simp[SMRECUR_LEM] 385 >> `smRecur pat j 1 <= LENGTH pat` by fs[valid_suf_shifts_def] 386 >> simp[SMRECUR_COR_THM]) 387 >> fs[LOWER_BOUND_IS_MIN_SET] 388 ); 389 390(* Proving bounds on smVal function with valid input *) 391val SMVAL_BND = store_thm( 392 "SMVAL_BND", 393 ``!j . j < LENGTH pat 394 ==> 0 < smVal pat j 395 /\ smVal pat j <= LENGTH pat``, 396 Cases_on `pat = []` 397 >- simp[LENGTH] 398 >- (simp[SMVAL_THM] 399 >> REPEAT strip_tac 400 >- (`0 <> MIN_SET (valid_suf_shifts pat j)` 401 suffices_by rw[] 402 >> `~(0 IN valid_suf_shifts pat j)` 403 suffices_by metis_tac[SUF_SHIFT_EXISTS_THM,MIN_SET_LEM] 404 >> simp[valid_suf_shifts_def]) 405 >- (`?x. (x IN valid_suf_shifts pat j) /\ (x <= LENGTH pat)` 406 suffices_by metis_tac[MIN_SET_LEM, LESS_EQ_TRANS, 407 MEMBER_NOT_EMPTY] 408 >> simp[valid_suf_shifts_def] 409 >> qexists_tac `LENGTH pat` 410 >> simp[]) 411 ) 412 ); 413 414 415(* -- ACTUAL MISMATCH TABLE CONSTRUCTION -- *) 416(* Find mismatch table value at particular point based 417 on best shift available between suffix mismatch table 418 and character mismatch table *) 419val mVal_def = 420 Define 421 ` 422 mVal calc_smVal pat all_chars j a = 423 MAX calc_smVal (cmVal pat all_chars j a) 424 `; 425 426(* Generate a row of mismatch table *) 427val mSubTab_def = 428 Define 429 ` 430 mSubTab pat all_chars j = 431 GENLIST (mVal (smVal pat j) pat all_chars j) (LENGTH all_chars) 432 `; 433 434(* Generate mismatch table *) 435val mTab_def = 436 Define 437 ` 438 mTab pat all_chars = 439 GENLIST (mSubTab pat all_chars) (LENGTH pat) 440 `; 441 442(* Dimensional properties of mTab *) 443val MTAB_DIM = store_thm( 444 "MTAB_DIM", 445 ``(LENGTH (mTab pat all_chars) = LENGTH pat) 446 /\ (!j. j < LENGTH pat 447 ==> (LENGTH (EL j (mTab pat all_chars)) 448 = LENGTH all_chars))``, 449 simp[mTab_def,mSubTab_def] 450 ); 451 452(* Bounds on mTab values for valid inputs *) 453val MTAB_BND = store_thm( 454 "MTAB_BND", 455 ``!a j. (j < LENGTH pat) /\ (a < LENGTH all_chars) 456 ==> 0 < EL a (EL j (mTab pat all_chars)) 457 /\ EL a (EL j (mTab pat all_chars)) <= LENGTH pat``, 458 simp[mTab_def,mSubTab_def,mVal_def] 459 >> metis_tac[SMVAL_BND,CMVAL_BND] 460 ); 461 462(* Important solution skipping capacity of mTab *) 463Theorem MTAB_THM: 464 !search j k a. 465 pat <> [] 466 /\ (k <= LENGTH search - LENGTH pat) 467 /\ (j < LENGTH pat) 468 /\ (!i. (j<i /\ i < LENGTH pat) 469 ==> (EL i pat = EL (k+i) search)) 470 /\ (EL j pat <> EL (k+j) search) 471 /\ (a < LENGTH all_chars) 472 /\ (EL (k+j) search = EL a all_chars) 473 ==> 474 !d. d < EL a (EL j (mTab pat all_chars)) 475 ==> (k+d) NOTIN solutions pat search 476Proof 477 strip_tac 478 >> strip_tac 479 >> strip_tac 480 >> strip_tac 481 >> strip_tac 482 >> `(EL a (EL j (mTab pat all_chars)) = (smVal pat j)) \/ 483 (EL a (EL j (mTab pat all_chars)) = (cmVal pat all_chars j a))` 484 by rw[mTab_def,mSubTab_def,mVal_def,EL_GENLIST,MAX_DEF] 485 >- (fs[] 486 >> fs[SMVAL_THM] 487 >> `EL j pat <> EL (k+j) search` 488 by rw[EQ_SYM_EQ] 489 >> FREEZE_THEN drule SUF_SKIP_NOT_SOL 490 >> fs[]) 491 >- (fs[] 492 >> fs[CMVAL_THM] 493 >> `EL j pat <> EL (k+j) search` 494 by rw[EQ_SYM_EQ] 495 >> FREEZE_THEN drule CHA_SKIP_NOT_SOL 496 >> fs[]) 497QED 498 499(* -- BOYER-MOORE SEARCH IMPLEMENTATION -- *) 500(* Checks to see if pat is prefix of search and skips verified bad alignments 501 based on patTab and all_chars if not to recursively find the minimum 502 solution. Returning LENGTH search indicates no substrings, returning 503 LENGTH search + 1 indicates there's been an error likely due to a malformed 504 patTab *) 505val bmRecur_def = 506 tDefine "bmRecur" 507 ` 508 (bmRecur [] _ _ _ = 0) /\ 509 (bmRecur _ _ _ [] = 0) /\ 510 (bmRecur pat patTab all_chars search = 511 if 512 ~(LENGTH pat <= LENGTH search) 513 then 514 (LENGTH search) 515 else 516 let 517 (j = checkPrefixRL pat search) 518 in 519 if 520 ~(j < LENGTH patTab) 521 then 522 0 523 else 524 let 525 (a = findElem all_chars (EL j search)) 526 in 527 if 528 ~(a < LENGTH (EL j patTab)) 529 then 530 (LENGTH search + 1) 531 else 532 let 533 (d = EL a (EL j patTab)) 534 in 535 if 536 ~(0 < d) 537 then 538 (LENGTH search + 1) 539 else 540 (d + (bmRecur pat patTab 541 all_chars (DROP d search))) 542 ) 543 ` 544 (WF_REL_TAC `measure (LENGTH ��� SND ��� SND ��� SND)` 545 >> rw[DROP]); 546 547(* Simple theorem for cleaness enforcing one definition 548 of bmRecur for non-null lists *) 549val BMRECUR_NON_EMPTY_DEF = store_thm( 550 "BMRECUR_NON_EMPTY_DEF", 551 ``(pat <> []) /\ (search <> []) 552 ==> 553 (bmRecur pat patTab all_chars search = 554 if ��(LENGTH pat ��� LENGTH search) then LENGTH search 555 else 556 (let 557 j = checkPrefixRL pat search 558 in 559 if ��(j < LENGTH patTab) then 0 560 else 561 (let 562 a = findElem all_chars (EL j search) 563 in 564 if ��(a < LENGTH (EL j patTab)) then LENGTH search + 1 565 else 566 (let 567 d = EL a (EL j patTab) 568 in 569 if ��(0 < d) then LENGTH search + 1 570 else 571 d + 572 bmRecur pat patTab all_chars 573 (DROP d search)))) 574 )``, 575 Cases_on `pat` 576 >> Cases_on `search` 577 >> fs[bmRecur_def] 578 ); 579 580(* Proves that bmRecur returns correct solutions with valid inputs 581 for patTab and all_chars *) 582val BMRECUR_THM = store_thm( 583 "BMRECUR_THM", 584 ``(LENGTH patTab = LENGTH pat) 585 /\ (!j. j < LENGTH pat 586 ==> (LENGTH (EL j patTab) = LENGTH all_chars)) 587 /\ (!a j. j < LENGTH pat /\ a < LENGTH all_chars 588 ==> 0 < EL a (EL j patTab) 589 /\ EL a (EL j patTab) <= LENGTH pat) 590 /\ (!search j k a. (pat <> []) 591 /\ k <= LENGTH search - LENGTH pat 592 /\ j < LENGTH pat 593 /\ (!i. j < i /\ i < LENGTH pat 594 ==> (EL i pat = EL (k+i) search)) 595 /\ (EL j pat <> EL (k+j) search) 596 /\ a < LENGTH all_chars 597 /\ (EL (k+j) search = EL a all_chars) 598 ==> !d. d < EL a (EL j patTab) 599 ==> ~((k+d) IN solutions pat search)) 600 ==> (!j. j < LENGTH search ==> MEM (EL j search) all_chars) 601 ==> (bmRecur pat patTab all_chars search = solution pat search)``, 602 strip_tac 603 >> completeInduct_on `LENGTH search` 604 >> fs[PULL_FORALL] 605 >> rw[] 606 >> Cases_on `pat =[]` 607 >- (fs[bmRecur_def,solution_def,solutions_def] 608 >> rw[IN_INSERT, MIN_SET_LEM,DECIDE ``(a <= 0) ==> (a = 0)``]) 609 >- (Cases_on `search = []` 610 >- (Cases_on `pat` 611 >> fs[bmRecur_def,solution_def,solutions_def,MIN_SET_THM]) 612 >- (rw[BMRECUR_NON_EMPTY_DEF] 613 >> qabbrev_tac `j_i = checkPrefixRL pat search` 614 >> qabbrev_tac `a_i = findElem all_chars (EL j_i search)` 615 >> qabbrev_tac `d_i = EL a_i (EL j_i patTab)` 616 >- (rename [`~(LENGTH pat <= LENGTH search)`] 617 >> rw[solution_def,solutions_def,MIN_SET_THM]) 618 >- (rename [`~(j_i < LENGTH pat)`] 619 >> `j_i <= LENGTH pat` 620 by rw[Abbr `j_i`, CHECK_PREFIX_RL_BND] 621 >> `pat = TAKE (LENGTH pat) search` 622 by metis_tac[DECIDE ``(x<=y) /\ ~(x<y) ==> (x=y)``, 623 CHECK_PREFIX_RL_MATCH] 624 >> pop_assum mp_tac 625 >> simp[LIST_EQ_REWRITE,EL_TAKE] 626 >> strip_tac 627 >> simp[solution_def] 628 >> `0 IN solutions pat search` 629 suffices_by metis_tac[DECIDE ``(a <= 0) ==> (a=0)``, 630 MEMBER_NOT_EMPTY, MIN_SET_LEM, 631 IN_INSERT] 632 >> simp[solutions_def]) 633 >- (rename [`~(a_i < LENGTH all_chars)`] 634 >> `MEM (EL j_i search) all_chars` 635 by rw[] 636 >> `a_i = LENGTH all_chars` 637 by rw[Abbr `a_i`, 638 DECIDE ``(x<=y) /\ ~(x<y) ==> (x=y)``, 639 FIND_ELEM_BND] 640 >> metis_tac[FIND_ELEM_NO_MATCH]) 641 >- (rename [`d_i = 0`] 642 >> `0 < d_i` 643 by rw[Abbr `d_i`] 644 >> fs[]) 645 >- (rename [`d_i <> 0`] 646 >> `bmRecur pat patTab all_chars (DROP d_i search) 647 = solution pat (DROP d_i search)` 648 by fs[LENGTH_DROP,EL_DROP] 649 >> simp[] 650 >> `(LENGTH pat <= LENGTH search) /\ (d_i <= LENGTH search) 651 /\ (!x. x < d_i ==> ~(x IN solutions pat search))` 652 suffices_by metis_tac[SOL_SHF_THM] 653 >> simp[] 654 >> strip_tac 655 >- (first_x_assum (qspecl_then [`a_i`,`j_i`] mp_tac) 656 >> fs[LESS_EQ_TRANS]) 657 >- (strip_tac 658 >> first_x_assum (qspecl_then [`search`,`j_i`,`0`,`a_i`,`x`] 659 mp_tac) 660 >> fs[Abbr `a_i`, Abbr `j_i`,Abbr `d_i`, 661 CHECK_PREFIX_RL_THM,FIND_ELEM_THM]) 662 ) 663 ) 664 ) 665 ); 666 667(* Calculates lookup table and all_chars to call bmRecur for the first time. 668 That is: this implements the boyer-moore substring search algorithm to look 669 for the first appearance of a substring in a string *) 670val bmSearch_def = 671 Define 672 ` 673 bmSearch pat search = 674 let 675 all_chars = uniqueElems search 676 in 677 bmRecur pat (mTab pat all_chars) all_chars search 678 `; 679 680(* Final proof that the bmSearch function correctly searches 681 for the first substring *) 682val BMSEARCH_THM = store_thm( 683 "BMSEARCH_THM", 684 ``bmSearch pat search = solution pat search``, 685 simp[bmSearch_def] 686 >> irule BMRECUR_THM 687 >> rpt conj_tac 688 >- metis_tac [MTAB_THM] 689 >- rw [MTAB_BND] 690 >- rw [MTAB_DIM] 691 >- rw [UNIQUE_ELEMS_THM] 692 >- rw [MTAB_DIM] 693 ); 694 695(* STRING SPECIALISATION *) 696 697(* Generate an alphabet with all the characters *) 698val alphabet_def = 699 Define 700 ` 701 alphabet = GENLIST CHR 256 702 `; 703 704val ALPHABET_THM = store_thm( 705 "ALPHABET_THM", 706 ``! c:char. MEM c alphabet``, 707 STRIP_TAC 708 >> `? n. (c = CHR n) /\ (n < 256)` 709 by rw[ranged_char_nchotomy] 710 >> rw[] 711 >> ONCE_REWRITE_TAC[alphabet_def] 712 >> rw[MEM_GENLIST] 713 >> qexists_tac `n` 714 >> simp[]); 715 716val ALPHABET_FIND_THM = store_thm( 717 "ALPHABET_FIND_THM", 718 ``!c. findElem alphabet c = ORD c``, 719 STRIP_TAC 720 >> `? n. (c = CHR n) /\ (n < 256)` 721 by rw[ranged_char_nchotomy] 722 >> `MEM c alphabet` 723 by (`MEM (EL (ORD c) alphabet) alphabet` 724 suffices_by rw[ALPHABET_THM] 725 >> `ORD c < LENGTH alphabet` 726 suffices_by rw[EL_MEM] 727 >> `LENGTH alphabet = 256` 728 by (rw[alphabet_def,LENGTH] 729 >> EVAL_TAC) 730 >> rw[ORD_CHR]) 731 >> `findElem alphabet c < LENGTH alphabet` 732 by (`findElem alphabet c <> LENGTH alphabet` 733 suffices_by rw[FIND_ELEM_BND, DECIDE ``a <= b /\ a <> b ==> a < b``] 734 >> CCONTR_TAC 735 >> rw[FIND_ELEM_NO_MATCH]) 736 >> `EL (findElem alphabet c) alphabet = c` 737 by rw[FIND_ELEM_THM] 738 >> `EL (findElem alphabet c) alphabet = CHR (findElem alphabet c)` 739 by (ONCE_REWRITE_TAC[alphabet_def] 740 >> `findElem (GENLIST CHR 256) c < LENGTH (GENLIST CHR 256)` 741 suffices_by rw[EL_GENLIST] 742 >> qpat_assum `findElem a c < STRLEN a` mp_tac 743 >> ONCE_REWRITE_TAC[alphabet_def] 744 >> simp[]) 745 >> `c = CHR (findElem alphabet c)` 746 by fs[] 747 >> `ORD (CHR (findElem alphabet c)) = findElem alphabet c` 748 by (`findElem alphabet c < 256` 749 suffices_by rw[ORD_CHR_RWT] 750 >> `STRLEN alphabet = 256` 751 suffices_by rw[] 752 >> ONCE_REWRITE_TAC[alphabet_def] 753 >> rw[LENGTH_GENLIST]) 754 >> `ORD (CHR (findElem alphabet c)) = ORD c` 755 suffices_by rw[] 756 >> `CHR (findElem alphabet c) = c` 757 suffices_by EVAL_TAC 758 >> fs[]); 759 760(* Build up a bmRecur specialised to strings *) 761val bmRecurString_def = 762 tDefine "bmRecurString" 763 ` 764 (bmRecurString [] _ _ = 0) /\ 765 (bmRecurString _ _ [] = 0) /\ 766 (bmRecurString pat patTab search = 767 if 768 ~(LENGTH pat <= LENGTH search) 769 then 770 (LENGTH search) 771 else 772 let 773 (j = checkPrefixRL pat search) 774 in 775 if 776 ~(j < LENGTH patTab) 777 then 778 0 779 else 780 let 781 (a = ORD (EL j search)) 782 in 783 if 784 ~(a < LENGTH (EL j patTab)) 785 then 786 (LENGTH search + 1) 787 else 788 let 789 (d = EL a (EL j patTab)) 790 in 791 if 792 ~(0 < d) 793 then 794 (LENGTH search + 1) 795 else 796 (d + (bmRecurString pat patTab (DROP d search))) 797 ) 798 ` 799 (WF_REL_TAC `measure (\(p, t, s). LENGTH s)` 800 >> rw[DROP]); 801 802val BMRECUR_STRING_THM = store_thm( 803 "BMRECUR_STRING_THM", 804 ``bmRecur pat patTab alphabet search = bmRecurString pat patTab search``, 805 Cases_on `pat` 806 >> ONCE_REWRITE_TAC[bmRecur_def, bmRecurString_def] 807 >> simp[] 808 >> completeInduct_on `LENGTH search` 809 >> rpt STRIP_TAC 810 >> Cases_on `search` 811 >> ONCE_REWRITE_TAC[bmRecur_def, bmRecurString_def] 812 >> simp[ALPHABET_FIND_THM]); 813 814(* Calls bmRecurString instead of bmRecur. 815 That is: this implements the boyer-moore substring search algorithm to look 816 for the first appearance of a substring in a string - but in ACTUAL string 817 types *) 818val bmSearchString_def = 819 Define 820 ` 821 bmSearchString (pat : string) (search : string) = 822 bmRecurString pat (mTab pat alphabet) search 823 `; 824 825(* Final proof that the bmSearchString function correctly searches 826 for the first substring *) 827val BMSEARCH_STRING_THM = store_thm( 828 "BMSEARCH_STRING_THM", 829 ``bmSearchString pat search = solution pat search``, 830 simp[bmSearchString_def] 831 >> `bmRecur pat (mTab pat alphabet) alphabet search = solution pat search` 832 suffices_by rw[BMRECUR_STRING_THM] 833 >> irule BMRECUR_THM 834 >> rpt conj_tac 835 >- metis_tac[MTAB_THM] 836 >- rw[MTAB_BND] 837 >- rw[MTAB_DIM] 838 >- rw[ALPHABET_THM] 839 >- rw[MTAB_DIM] 840 ); 841 842val _ = export_theory(); 843