1open HolKernel boolLib bossLib Parse;
2
3open boolTheory;
4open pred_setTheory;
5open pairTheory;
6
7open arithmeticTheory;
8open numTheory;
9
10open prim_recTheory;
11
12open listTheory;
13open rich_listTheory;
14
15open stringTheory;
16
17open set_lemmasTheory;
18open helper_funcsTheory;
19open boyer_moore_specTheory;
20
21val _ = new_theory"boyer_moore";
22
23(* -- IMPLICIT CHARACTER MISMATCH TABLE CONSTRUCTION -- *)
24(* Assess potential shift based on character mismatch rule *)
25val checkDeltaC_def =
26    Define
27    `
28    checkDeltaC pat all_chars j a d =
29        ((d = j+1) \/ (EL (j-d) pat = EL a all_chars))
30    `;
31
32(* Relationship between checkDeltaC function
33   and valid_cha_shift specification *)
34val CHECK_DELTAC_THM = store_thm(
35    "CHECK_DELTAC_THM",
36    ``!pat all_chars j a d . d IN valid_cha_shifts pat all_chars j a
37                 <=> (1 <= d /\ d <= j+1 /\ checkDeltaC pat all_chars j a d)``,
38    rw[valid_cha_shifts_def,checkDeltaC_def]
39    >> Cases_on `d=j+1`
40    >- simp[]
41    >- (`(d <= j) <=> (d <= j + 1)`
42            suffices_by metis_tac[]
43        >> simp[])
44    );
45
46(* Check Delta C in terms of set comprehensions *)
47val CHECK_DELTAC_SET = store_thm(
48    "CHECK_DELTAC_SET",
49    ``valid_cha_shifts pat all_chars j a = {d | 1 <= d /\ d <= j+1
50                        /\ checkDeltaC pat all_chars j a d}``,
51    rw [CHECK_DELTAC_THM, EXTENSION]
52    );
53
54(* Find minimum valid character mismatch based shift *)
55val cmRecur_def =
56    tDefine "cmRecur"
57    `
58    cmRecur pat all_chars j a d =
59        if (j+1 < d) then d
60        else if (checkDeltaC pat all_chars j a d) then d
61        else (cmRecur pat all_chars j a (d+1))
62    `
63    (WF_REL_TAC `measure (\(p, c, j, a, d). SUC j - d)`
64    >> fs[ADD1,checkDeltaC_def]);
65
66(* Intermediate lemmas to reason about recursive function bounds *)
67val CMRECUR_LEM = store_thm(
68    "CMRECUR_LEM",
69    ``!pat all_chars j a d. d <= cmRecur pat all_chars j a d``,
70    Induct_on `j + 2 - d`
71    >> rpt STRIP_TAC
72    >> ONCE_REWRITE_TAC[cmRecur_def]
73    >> fs[]
74    >> `v = j + 2 - (d+1)` by fs[ADD_CLAUSES]
75    >> `(d+1) <= cmRecur pat all_chars j a (d+1)` by metis_tac[]
76    >> Cases_on `checkDeltaC pat all_chars j a d`
77    >> fs[]);
78
79val CMRECUR_BND_THM = store_thm(
80    "CMRECUR_BND_THM",
81    ``!pat all_chars j a d . d IN valid_cha_shifts pat all_chars j a
82        ==> (!x. x <= d ==> cmRecur pat all_chars j a x <= d)``,
83    rpt STRIP_TAC
84    >> Induct_on `d - x`
85    >- (fs[]
86        >> rpt STRIP_TAC
87        >> `x=d` by rw[]
88        >> fs[valid_cha_shifts_def]
89        >> ONCE_REWRITE_TAC [cmRecur_def]
90        >> simp[checkDeltaC_def])
91    >- (fs[]
92        >> rpt STRIP_TAC
93        >> `cmRecur pat all_chars j a (SUC x) <= d` by fs[ADD_CLAUSES]
94        >> `cmRecur pat all_chars j a x <= cmRecur pat all_chars j a (SUC x)` suffices_by fs[]
95        >> qabbrev_tac `L = cmRecur pat all_chars j a (SUC x)`
96        >> Cases_on `j + 1 < x`
97        >- (`cmRecur pat all_chars j a x = x` by (ONCE_REWRITE_TAC[cmRecur_def] >> fs[])
98            >> simp[]
99            >> `(SUC x) <= cmRecur pat all_chars j a (SUC x)` suffices_by simp[Abbr `L`]
100            >> metis_tac[CMRECUR_LEM])
101        >- (Cases_on `checkDeltaC pat all_chars j a x`
102            >> ONCE_REWRITE_TAC[cmRecur_def]
103            >> simp[ADD1,Abbr `L`]
104            >> `(x + 1) <= cmRecur pat all_chars j a (x + 1)` by metis_tac[CMRECUR_LEM]
105            >> fs[])
106        )
107    );
108
109(* Confirming we only get valid values *)
110val CMRECUR_COR_THM = store_thm(
111    "CMRECUR_COR_THM",
112    ``! pat all_chars j a d . 1 <= cmRecur pat all_chars j a d
113                             /\ cmRecur pat all_chars j a d <= j + 1
114                                ==> cmRecur pat all_chars j a d
115                                        IN valid_cha_shifts pat all_chars j a``,
116    rpt STRIP_TAC
117    >> `d <= j + 1`
118        by (CCONTR_TAC
119            >> `cmRecur pat all_chars j a d > j + 1` suffices_by simp[]
120            >> ONCE_REWRITE_TAC [cmRecur_def]
121            >> fs[])
122    >> Induct_on `j + 1 - d`
123    >- (rpt STRIP_TAC
124        >> ONCE_REWRITE_TAC[cmRecur_def]
125        >> fs[checkDeltaC_def,valid_cha_shifts_def])
126    >- (rpt STRIP_TAC
127        >> `v = j + 1 - SUC d` by fs[]
128        >> `SUC d <= j + 1` by fs[]
129        >> `SUC d <= cmRecur pat all_chars j a (SUC d)`
130                by metis_tac[CMRECUR_LEM,LESS_EQ_TRANS]
131        >> `cmRecur pat all_chars j a (SUC d) <= j + 1`
132                by (`j + 1 IN valid_cha_shifts pat all_chars j a` by fs[valid_cha_shifts_def]
133                    >> metis_tac[CMRECUR_BND_THM])
134        >> `cmRecur pat all_chars j a (SUC d) IN valid_cha_shifts pat all_chars j a`
135                by (first_x_assum (qspec_then `j` (qspec_then `SUC d` mp_tac)) >> fs[])
136        >> ONCE_REWRITE_TAC[cmRecur_def]
137        >> Cases_on `j + 1 < d`
138        >> fs[]
139        >> `checkDeltaC pat all_chars j a d ==> d IN valid_cha_shifts pat all_chars j a`
140                suffices_by (Cases_on `checkDeltaC pat all_chars j a d` >> fs[ADD1])
141        >> `checkDeltaC pat all_chars j a d ==> 1 <= d`
142                suffices_by simp[checkDeltaC_def, valid_cha_shifts_def]
143        >> STRIP_TAC
144        >> `cmRecur pat all_chars j a d = d`
145                by (ONCE_REWRITE_TAC[cmRecur_def] >> fs[])
146        >> fs[])
147    );
148
149(* initiate recursion *)
150val cmVal_def =
151    Define
152    `
153    cmVal pat all_chars j a = cmRecur pat all_chars j a 1
154    `;
155
156(* Relationship between cmVal function and valid_cha_shift specification *)
157val CMVAL_THM = store_thm(
158    "CMVAL_THM",
159    ``cmVal pat all_chars j a = MIN_SET (valid_cha_shifts pat all_chars j a)``,
160    simp[cmVal_def]
161    >> `! d. d IN valid_cha_shifts pat all_chars j a ==> cmRecur pat all_chars j a 1 <= d`
162            by (rpt STRIP_TAC
163                >> `1 <= d` by fs[valid_cha_shifts_def]
164                >> simp[CMRECUR_BND_THM])
165    >> `cmRecur pat all_chars j a 1 IN valid_cha_shifts pat all_chars j a`
166            by (`1 <= cmRecur pat all_chars j a 1` by simp[CMRECUR_LEM]
167                >> `cmRecur pat all_chars j a 1 <= j + 1` by fs[valid_cha_shifts_def]
168                >> simp[CMRECUR_COR_THM])
169    >> fs[LOWER_BOUND_IS_MIN_SET]
170    );
171
172(* Proving bounds on cmVal function with valid input *)
173val CMVAL_BND = store_thm(
174    "CMVAL_BND",
175    ``!j a. j < LENGTH pat ��� a < LENGTH all_chars
176           ==> 0 < cmVal pat all_char j a
177               /\ cmVal pat all_char j a <= LENGTH pat``,
178    simp[CMVAL_THM]
179    >> REPEAT strip_tac
180    >- (`0 <> MIN_SET (valid_cha_shifts pat all_char j a)`
181            suffices_by rw[]
182        >> `~(0 IN valid_cha_shifts pat all_char j a)`
183            suffices_by metis_tac[CHA_SHIFT_EXISTS_THM,MIN_SET_LEM]
184        >> simp[valid_cha_shifts_def])
185    >- (`?x. (x IN valid_cha_shifts pat all_char j a) /\ (x <= LENGTH pat)`
186            suffices_by metis_tac[MIN_SET_LEM, LESS_EQ_TRANS, MEMBER_NOT_EMPTY]
187        >> simp[valid_cha_shifts_def]
188        >> qexists_tac `j+1`
189        >> simp[])
190    );
191
192(* -- IMPLICIT SUFFIX MATCH TABLE CONSTRUCTION -- *)
193(* Assess potential shift based on suffix mismatch rule *)
194val checkDeltaS_def =
195    Define
196    `
197    checkDeltaS pat j d <=>
198        ((d >= SUC j) \/ ~(EL (j-d) pat = EL j pat)) /\
199        (extract (MAX (SUC j) d,LENGTH pat) pat
200            = extract ((MAX (SUC j) d) - d,LENGTH pat - d) pat)
201    `;
202
203(* Relationship between checkDeltaS function
204   and valid_suf_shift specification *)
205val CHECK_DELTAS_THM = store_thm(
206    "CHECK_DELTAS_THM",
207    ``! pat j d . d IN valid_suf_shifts pat j
208                 <=> 1 <= d /\ d <= LENGTH pat /\ checkDeltaS pat j d``,
209    ONCE_REWRITE_TAC [EQ_SYM_EQ]
210    >> simp[valid_suf_shifts_def,checkDeltaS_def,EXTRACT_THM,LIST_EQ_REWRITE]
211    >> rw[EQ_IMP_THM]
212    >> simp[]
213    >- (`MAX (SUC j) d = d`
214            by simp[MAX_DEF]
215        >> fs[]
216        >> first_x_assum(qspec_then `i-d` mp_tac)
217        >> simp[])
218    >- (qabbrev_tac `M = MAX (SUC j) d`
219        >> `d <= M`
220                by rw[MAX_DEF,Abbr `M`]
221        >> Q.UNDISCH_THEN `d<= LENGTH pat` assume_tac
222        >> fs[]
223        >> first_x_assum(qspec_then `i-M` mp_tac)
224        >> simp[]
225        >> `M <= i`
226                by simp[MAX_DEF, Abbr `M`]
227        >> simp[])
228    >- (simp[MAX_DEF])
229    >- (qabbrev_tac `M = MAX (SUC j) d`
230        >> `d <= M`
231                by rw[MAX_DEF,Abbr `M`]
232        >> simp[])
233    >- (simp[MAX_DEF])
234    >- (qabbrev_tac `M = MAX (SUC j) d`
235        >> `d <= M`
236                by rw[MAX_DEF,Abbr `M`]
237        >> simp[]
238        >> `(j + 1) <= (M + x)`
239                by rw[MAX_DEF,Abbr `M`]
240        >> `d <= (M + x)`
241                by rw[MAX_DEF,Abbr `M`]
242        >> first_x_assum(qspec_then `M + x` mp_tac)
243        >> simp[])
244    );
245
246(* Check Delta S in terms of set comprehensions *)
247val CHECK_DELTAS_SET = store_thm(
248    "CHECK_DELTAS_SET",
249    ``valid_suf_shifts pat j = { d | 1 <= d
250                                    /\ d <= LENGTH pat
251                                    /\ checkDeltaS pat j d}``,
252    rw [CHECK_DELTAS_THM, EXTENSION]
253    );
254
255val smRecur_def =
256    tDefine "smRecur"
257    `
258    smRecur pat j d =
259        if (LENGTH pat < d) then d
260        else if (checkDeltaS pat j d) then d
261        else (smRecur pat j (d+1))
262    `
263    (WF_REL_TAC `measure (\(p, j, d). SUC (LENGTH p) - d)`);
264
265(* Find minimum valid suffix mismatch based shift *)
266val smVal_def =
267    Define
268    `
269    smVal pat j = smRecur pat j 1
270    `;
271
272(* Intermediate lemmas to reason about recursive function bounds *)
273val SMRECUR_LEM = store_thm(
274    "SMRECUR_LEM",
275    ``!pat j d. d <= smRecur pat j d``,
276    Induct_on `LENGTH pat + 1 - d`
277    >> rpt STRIP_TAC
278    >> ONCE_REWRITE_TAC[smRecur_def]
279    >> fs[]
280    >> `v = LENGTH pat + 1 - (d+1)` by fs[ADD_CLAUSES]
281    >> `(d+1) <= smRecur pat j (d+1)` by metis_tac[]
282    >> Cases_on `checkDeltaS pat j d`
283    >> fs[]);
284
285val SMRECUR_BND_THM = store_thm(
286    "SMRECUR_BND_THM",
287    ``!pat j d . d IN valid_suf_shifts pat j
288        ==> (!x. x <= d ==> smRecur pat j x <= d)``,
289    rpt STRIP_TAC
290    >> Induct_on `d - x`
291    >- (fs[]
292        >> rpt STRIP_TAC
293        >> `x=d` by rw[]
294        >> ONCE_REWRITE_TAC [smRecur_def]
295        >> fs[CHECK_DELTAS_SET])
296    >- (fs[]
297        >> rpt STRIP_TAC
298        >> `smRecur pat j (SUC x) <= d` by fs[ADD_CLAUSES]
299        >> `smRecur pat j x <= smRecur pat j (SUC x)` suffices_by fs[]
300        >> qabbrev_tac `L = smRecur pat j (SUC x)`
301        >> Cases_on `LENGTH pat < x`
302        >- (`smRecur pat j x = x` by (ONCE_REWRITE_TAC[smRecur_def] >> fs[])
303            >> simp[]
304            >> `(SUC x) <= smRecur pat j (SUC x)` suffices_by simp[Abbr `L`]
305            >> metis_tac[SMRECUR_LEM])
306        >- (Cases_on `checkDeltaS pat j x`
307            >> ONCE_REWRITE_TAC[smRecur_def]
308            >> simp[ADD1,Abbr `L`]
309            >> `(x + 1) <= smRecur pat j (x + 1)` by metis_tac[SMRECUR_LEM]
310            >> fs[])
311        )
312    );
313
314(* Confirming we only get valid values *)
315val SMRECUR_COR_THM = store_thm(
316    "SMRECUR_COR_THM",
317    ``! pat all_chars j a d . pat <> []
318                             /\ j < LENGTH pat
319                             /\ 1 <= smRecur pat j d
320                             /\ smRecur pat j d <= LENGTH pat
321                                ==> smRecur pat j d
322                                        IN valid_suf_shifts pat j``,
323    rpt STRIP_TAC
324    >> `d <= LENGTH pat`
325        by (CCONTR_TAC
326            >> `d <= smRecur pat j d` by metis_tac[SMRECUR_LEM]
327            >> fs[])
328    >> Induct_on `LENGTH pat - d`
329    >- (rpt STRIP_TAC
330        >> `d = LENGTH pat` by simp[]
331        >> `LENGTH pat = smRecur pat j d` by fs[SMRECUR_LEM, LESS_EQUAL_ANTISYM]
332        >> `LENGTH pat IN valid_suf_shifts pat j` suffices_by fs[]
333        >> `checkDeltaS pat j (LENGTH pat)`
334                by (qpat_x_assum `LENGTH pat = smRecur pat j d` mp_tac
335                    >> ONCE_REWRITE_TAC[smRecur_def]
336                    >> simp[]
337                    >> rpt STRIP_TAC
338                    >> CCONTR_TAC
339                    >> fs[]
340                    >> `LENGTH pat + 1 <= smRecur pat j (LENGTH pat + 1)` by simp[SMRECUR_LEM]
341                    >> `LENGTH pat <> smRecur pat j (LENGHT pat + 1)` by simp[]
342                    >> fs[])
343        >> ONCE_REWRITE_TAC[CHECK_DELTAS_THM]
344        >> fs[])
345    >- (rpt STRIP_TAC
346        >> `v = LENGTH pat - SUC d` by fs[]
347        >> `SUC d <= LENGTH pat` by fs[]
348        >> `SUC d <= smRecur pat j (SUC d)`
349                by metis_tac[SMRECUR_LEM,LESS_EQ_TRANS]
350        >> `smRecur pat j (SUC d) <= LENGTH pat`
351                by (`LENGTH pat IN valid_suf_shifts pat j`
352                        by rw[valid_suf_shifts_def]
353                    >> metis_tac[SMRECUR_BND_THM])
354        >> `smRecur pat j (SUC d) IN valid_suf_shifts pat j`
355                by (first_x_assum (qspec_then `pat` (qspec_then `SUC d` mp_tac)) >> fs[])
356        >> ONCE_REWRITE_TAC[smRecur_def]
357        >> Cases_on `LENGTH pat < d`
358        >> fs[]
359        >> `checkDeltaS pat j d ==> d IN valid_suf_shifts pat j`
360                suffices_by (Cases_on `checkDeltaS pat j d` >> fs[ADD1])
361        >> `checkDeltaS pat j d ==> 1 <= d`
362                suffices_by (ONCE_REWRITE_TAC[CHECK_DELTAS_THM] >> simp[])
363        >> STRIP_TAC
364        >> `smRecur pat j d = d`
365                by (ONCE_REWRITE_TAC[smRecur_def] >> fs[])
366        >> fs[])
367    );
368
369(* Relationship between smVal function and valid_suf_shift specification *)
370val SMVAL_THM = store_thm(
371    "SMVAL_THM",
372    ``(pat <> []) /\ (j < LENGTH pat)
373     ==> (smVal pat j = MIN_SET (valid_suf_shifts pat j))``,
374    STRIP_TAC
375    >> simp[smVal_def]
376    >> `! d. d IN valid_suf_shifts pat j ==> smRecur pat j 1 <= d`
377            by (rpt STRIP_TAC
378                >> `1 <= d`
379                    by (first_assum mp_tac
380                        >> ONCE_REWRITE_TAC[CHECK_DELTAS_THM]
381                        >> simp[])
382                >> simp[SMRECUR_BND_THM])
383    >> `smRecur pat j 1 IN valid_suf_shifts pat j`
384            by (`1 <= smRecur pat j 1` by simp[SMRECUR_LEM]
385                >> `smRecur pat j 1 <= LENGTH pat` by fs[valid_suf_shifts_def]
386                >> simp[SMRECUR_COR_THM])
387    >> fs[LOWER_BOUND_IS_MIN_SET]
388    );
389
390(* Proving bounds on smVal function with valid input *)
391val SMVAL_BND = store_thm(
392    "SMVAL_BND",
393    ``!j . j < LENGTH pat
394            ==> 0 < smVal pat j
395                /\ smVal pat j <= LENGTH pat``,
396    Cases_on `pat = []`
397    >- simp[LENGTH]
398    >- (simp[SMVAL_THM]
399        >> REPEAT strip_tac
400        >- (`0 <> MIN_SET (valid_suf_shifts pat j)`
401                    suffices_by rw[]
402            >> `~(0 IN valid_suf_shifts pat j)`
403                    suffices_by metis_tac[SUF_SHIFT_EXISTS_THM,MIN_SET_LEM]
404            >> simp[valid_suf_shifts_def])
405        >- (`?x. (x IN valid_suf_shifts pat j) /\ (x <= LENGTH pat)`
406                suffices_by metis_tac[MIN_SET_LEM, LESS_EQ_TRANS,
407                                      MEMBER_NOT_EMPTY]
408            >> simp[valid_suf_shifts_def]
409            >> qexists_tac `LENGTH pat`
410            >> simp[])
411        )
412    );
413
414
415(* -- ACTUAL MISMATCH TABLE CONSTRUCTION --  *)
416(* Find mismatch table value at particular point based
417   on best shift available between suffix mismatch table
418   and character mismatch table *)
419val mVal_def =
420    Define
421    `
422    mVal calc_smVal pat all_chars j a =
423        MAX calc_smVal (cmVal pat all_chars j a)
424    `;
425
426(* Generate a row of mismatch table *)
427val mSubTab_def =
428    Define
429    `
430    mSubTab pat all_chars j =
431        GENLIST (mVal (smVal pat j) pat all_chars j) (LENGTH all_chars)
432    `;
433
434(* Generate mismatch table *)
435val mTab_def =
436    Define
437    `
438    mTab pat all_chars =
439        GENLIST (mSubTab pat all_chars) (LENGTH pat)
440    `;
441
442(* Dimensional properties of mTab *)
443val MTAB_DIM = store_thm(
444    "MTAB_DIM",
445    ``(LENGTH (mTab pat all_chars) = LENGTH pat)
446     /\ (!j. j < LENGTH pat
447             ==> (LENGTH (EL j (mTab pat all_chars))
448                  = LENGTH all_chars))``,
449    simp[mTab_def,mSubTab_def]
450    );
451
452(* Bounds on mTab values for valid inputs *)
453val MTAB_BND = store_thm(
454    "MTAB_BND",
455    ``!a j.  (j < LENGTH pat) /\ (a < LENGTH all_chars)
456            ==> 0 < EL a (EL j (mTab pat all_chars))
457                /\ EL a (EL j (mTab pat all_chars)) <= LENGTH pat``,
458    simp[mTab_def,mSubTab_def,mVal_def]
459    >> metis_tac[SMVAL_BND,CMVAL_BND]
460    );
461
462(* Important solution skipping capacity of mTab *)
463Theorem MTAB_THM:
464  !search j k a.
465     pat <> []
466      /\ (k <= LENGTH search - LENGTH pat)
467      /\ (j < LENGTH pat)
468      /\ (!i. (j<i /\ i < LENGTH pat)
469              ==> (EL i pat = EL (k+i) search))
470      /\ (EL j pat <> EL (k+j) search)
471      /\ (a < LENGTH all_chars)
472      /\ (EL (k+j) search = EL a all_chars)
473    ==>
474      !d. d < EL a (EL j (mTab pat all_chars))
475              ==> (k+d) NOTIN solutions pat search
476Proof
477    strip_tac
478    >> strip_tac
479    >> strip_tac
480    >> strip_tac
481    >> strip_tac
482    >> `(EL a (EL j (mTab pat all_chars)) = (smVal pat j)) \/
483        (EL a (EL j (mTab pat all_chars)) = (cmVal pat all_chars j a))`
484            by rw[mTab_def,mSubTab_def,mVal_def,EL_GENLIST,MAX_DEF]
485    >- (fs[]
486        >> fs[SMVAL_THM]
487        >> `EL j pat <> EL (k+j) search`
488                by rw[EQ_SYM_EQ]
489        >> FREEZE_THEN drule SUF_SKIP_NOT_SOL
490        >> fs[])
491    >- (fs[]
492        >> fs[CMVAL_THM]
493        >> `EL j pat <> EL (k+j) search`
494                by rw[EQ_SYM_EQ]
495        >> FREEZE_THEN drule CHA_SKIP_NOT_SOL
496        >> fs[])
497QED
498
499(* -- BOYER-MOORE SEARCH IMPLEMENTATION -- *)
500(* Checks to see if pat is prefix of search and skips verified bad alignments
501   based on patTab and all_chars if not to recursively find the minimum
502   solution. Returning LENGTH search indicates no substrings, returning
503   LENGTH search + 1 indicates there's been an error likely due to a malformed
504   patTab *)
505val bmRecur_def =
506    tDefine "bmRecur"
507    `
508    (bmRecur [] _ _ _ = 0) /\
509    (bmRecur _ _ _ [] = 0) /\
510    (bmRecur pat patTab all_chars search =
511        if
512            ~(LENGTH pat <= LENGTH search)
513        then
514            (LENGTH search)
515        else
516            let
517                (j = checkPrefixRL pat search)
518            in
519                if
520                    ~(j < LENGTH patTab)
521                then
522                    0
523                else
524                    let
525                        (a = findElem all_chars (EL j search))
526                    in
527                        if
528                            ~(a < LENGTH (EL j patTab))
529                        then
530                            (LENGTH search + 1)
531                        else
532                            let
533                                (d = EL a (EL j patTab))
534                            in
535                                if
536                                    ~(0 < d)
537                                then
538                                    (LENGTH search + 1)
539                                else
540                                    (d + (bmRecur pat patTab
541                                            all_chars (DROP d search)))
542    )
543    `
544    (WF_REL_TAC `measure (LENGTH ��� SND ��� SND ��� SND)`
545    >> rw[DROP]);
546
547(* Simple theorem for cleaness enforcing one definition
548   of bmRecur for non-null lists *)
549val BMRECUR_NON_EMPTY_DEF = store_thm(
550    "BMRECUR_NON_EMPTY_DEF",
551    ``(pat <> []) /\ (search <> [])
552    ==>
553    (bmRecur pat patTab all_chars search =
554     if ��(LENGTH pat ��� LENGTH search) then LENGTH search
555     else
556       (let
557          j = checkPrefixRL pat search
558        in
559          if ��(j < LENGTH patTab) then 0
560          else
561            (let
562               a = findElem all_chars (EL j search)
563             in
564               if ��(a < LENGTH (EL j patTab)) then LENGTH search + 1
565               else
566                 (let
567                    d = EL a (EL j patTab)
568                  in
569                    if ��(0 < d) then LENGTH search + 1
570                    else
571                      d +
572                      bmRecur pat patTab all_chars
573                        (DROP d search))))
574    )``,
575    Cases_on `pat`
576    >> Cases_on `search`
577    >> fs[bmRecur_def]
578    );
579
580(* Proves that bmRecur returns correct solutions with valid inputs
581   for patTab and all_chars *)
582val BMRECUR_THM = store_thm(
583    "BMRECUR_THM",
584    ``(LENGTH patTab = LENGTH pat)
585     /\ (!j. j < LENGTH pat
586             ==> (LENGTH (EL j patTab) = LENGTH all_chars))
587     /\ (!a j. j < LENGTH pat /\ a < LENGTH all_chars
588               ==> 0 < EL a (EL j patTab)
589                   /\ EL a (EL j patTab) <= LENGTH pat)
590     /\ (!search j k a. (pat <> [])
591                         /\ k <= LENGTH search - LENGTH pat
592                         /\ j < LENGTH pat
593                         /\ (!i. j < i /\ i < LENGTH pat
594                                 ==> (EL i pat = EL (k+i) search))
595                         /\ (EL j pat <> EL (k+j) search)
596                         /\ a < LENGTH all_chars
597                         /\ (EL (k+j) search = EL a all_chars)
598                         ==> !d. d < EL a (EL j patTab)
599                                 ==> ~((k+d) IN solutions pat search))
600     ==> (!j. j < LENGTH search ==> MEM (EL j search) all_chars)
601         ==> (bmRecur pat patTab all_chars search = solution pat search)``,
602    strip_tac
603    >> completeInduct_on `LENGTH search`
604    >> fs[PULL_FORALL]
605    >> rw[]
606    >> Cases_on `pat =[]`
607    >- (fs[bmRecur_def,solution_def,solutions_def]
608        >> rw[IN_INSERT, MIN_SET_LEM,DECIDE ``(a <= 0) ==> (a = 0)``])
609    >- (Cases_on `search = []`
610        >- (Cases_on `pat`
611            >> fs[bmRecur_def,solution_def,solutions_def,MIN_SET_THM])
612        >- (rw[BMRECUR_NON_EMPTY_DEF]
613            >> qabbrev_tac `j_i = checkPrefixRL pat search`
614            >> qabbrev_tac `a_i = findElem all_chars (EL j_i search)`
615            >> qabbrev_tac `d_i = EL a_i (EL j_i patTab)`
616            >- (rename [`~(LENGTH pat <= LENGTH search)`]
617                >> rw[solution_def,solutions_def,MIN_SET_THM])
618            >- (rename [`~(j_i < LENGTH pat)`]
619                >> `j_i <= LENGTH pat`
620                        by rw[Abbr `j_i`, CHECK_PREFIX_RL_BND]
621                >> `pat = TAKE (LENGTH pat) search`
622                        by metis_tac[DECIDE ``(x<=y) /\ ~(x<y) ==> (x=y)``,
623                                     CHECK_PREFIX_RL_MATCH]
624                >> pop_assum mp_tac
625                >> simp[LIST_EQ_REWRITE,EL_TAKE]
626                >> strip_tac
627                >> simp[solution_def]
628                >> `0 IN solutions pat search`
629                        suffices_by metis_tac[DECIDE ``(a <= 0) ==> (a=0)``,
630                                              MEMBER_NOT_EMPTY, MIN_SET_LEM,
631                                              IN_INSERT]
632                >> simp[solutions_def])
633            >- (rename [`~(a_i < LENGTH all_chars)`]
634                >> `MEM (EL j_i search) all_chars`
635                        by rw[]
636                >> `a_i = LENGTH all_chars`
637                        by rw[Abbr `a_i`,
638                              DECIDE ``(x<=y) /\ ~(x<y) ==> (x=y)``,
639                              FIND_ELEM_BND]
640                >> metis_tac[FIND_ELEM_NO_MATCH])
641            >- (rename [`d_i = 0`]
642                >> `0 < d_i`
643                        by rw[Abbr `d_i`]
644                >> fs[])
645            >- (rename [`d_i <> 0`]
646                >> `bmRecur pat patTab all_chars (DROP d_i search)
647                    = solution pat (DROP d_i search)`
648                        by fs[LENGTH_DROP,EL_DROP]
649                >> simp[]
650                >> `(LENGTH pat <= LENGTH search) /\ (d_i <= LENGTH search)
651                    /\ (!x. x < d_i ==> ~(x IN solutions pat search))`
652                        suffices_by metis_tac[SOL_SHF_THM]
653                >> simp[]
654                >> strip_tac
655                >- (first_x_assum (qspecl_then [`a_i`,`j_i`] mp_tac)
656                    >> fs[LESS_EQ_TRANS])
657                >- (strip_tac
658                    >> first_x_assum (qspecl_then [`search`,`j_i`,`0`,`a_i`,`x`]
659                                                  mp_tac)
660                    >> fs[Abbr `a_i`, Abbr `j_i`,Abbr `d_i`,
661                          CHECK_PREFIX_RL_THM,FIND_ELEM_THM])
662                )
663            )
664        )
665    );
666
667(* Calculates lookup table and all_chars to call bmRecur for the first time.
668   That is: this implements the boyer-moore substring search algorithm to look
669   for the first appearance of a substring in a string *)
670val bmSearch_def =
671    Define
672    `
673    bmSearch pat search =
674        let
675            all_chars = uniqueElems search
676        in
677            bmRecur pat (mTab pat all_chars) all_chars search
678    `;
679
680(* Final proof that the bmSearch function correctly searches
681   for the first substring *)
682val BMSEARCH_THM = store_thm(
683    "BMSEARCH_THM",
684    ``bmSearch pat search = solution pat search``,
685    simp[bmSearch_def]
686    >> irule BMRECUR_THM
687    >> rpt conj_tac
688    >- metis_tac [MTAB_THM]
689    >- rw [MTAB_BND]
690    >- rw [MTAB_DIM]
691    >- rw [UNIQUE_ELEMS_THM]
692    >- rw [MTAB_DIM]
693    );
694
695(* STRING SPECIALISATION *)
696
697(* Generate an alphabet with all the characters *)
698val alphabet_def =
699    Define
700    `
701    alphabet = GENLIST CHR 256
702    `;
703
704val ALPHABET_THM = store_thm(
705    "ALPHABET_THM",
706    ``! c:char. MEM c alphabet``,
707    STRIP_TAC
708    >> `? n. (c = CHR n) /\ (n < 256)`
709            by rw[ranged_char_nchotomy]
710    >> rw[]
711    >> ONCE_REWRITE_TAC[alphabet_def]
712    >> rw[MEM_GENLIST]
713    >> qexists_tac `n`
714    >> simp[]);
715
716val ALPHABET_FIND_THM = store_thm(
717    "ALPHABET_FIND_THM",
718    ``!c. findElem alphabet c = ORD c``,
719    STRIP_TAC
720    >> `? n. (c = CHR n) /\ (n < 256)`
721            by rw[ranged_char_nchotomy]
722    >> `MEM c alphabet`
723            by (`MEM (EL (ORD c) alphabet) alphabet`
724                    suffices_by rw[ALPHABET_THM]
725                >> `ORD c < LENGTH alphabet`
726                    suffices_by rw[EL_MEM]
727                >> `LENGTH alphabet = 256`
728                        by (rw[alphabet_def,LENGTH]
729                            >> EVAL_TAC)
730                >> rw[ORD_CHR])
731    >> `findElem alphabet c < LENGTH alphabet`
732            by (`findElem alphabet c <> LENGTH alphabet`
733                    suffices_by rw[FIND_ELEM_BND, DECIDE ``a <= b /\ a <> b ==> a < b``]
734                >> CCONTR_TAC
735                >> rw[FIND_ELEM_NO_MATCH])
736    >> `EL (findElem alphabet c) alphabet = c`
737            by rw[FIND_ELEM_THM]
738    >> `EL (findElem alphabet c) alphabet = CHR (findElem alphabet c)`
739            by (ONCE_REWRITE_TAC[alphabet_def]
740                >> `findElem (GENLIST CHR 256) c < LENGTH (GENLIST CHR 256)`
741                        suffices_by rw[EL_GENLIST]
742                >> qpat_assum `findElem a c < STRLEN a` mp_tac
743                >> ONCE_REWRITE_TAC[alphabet_def]
744                >> simp[])
745    >> `c = CHR (findElem alphabet c)`
746            by fs[]
747    >> `ORD (CHR (findElem alphabet c)) = findElem alphabet c`
748            by (`findElem alphabet c < 256`
749                    suffices_by rw[ORD_CHR_RWT]
750                >> `STRLEN alphabet = 256`
751                        suffices_by rw[]
752                >> ONCE_REWRITE_TAC[alphabet_def]
753                >> rw[LENGTH_GENLIST])
754    >> `ORD (CHR (findElem alphabet c)) = ORD c`
755            suffices_by rw[]
756    >> `CHR (findElem alphabet c) = c`
757            suffices_by EVAL_TAC
758    >> fs[]);
759
760(* Build up a bmRecur specialised to strings *)
761val bmRecurString_def =
762    tDefine "bmRecurString"
763    `
764    (bmRecurString [] _ _ = 0) /\
765    (bmRecurString  _ _ [] = 0) /\
766    (bmRecurString pat patTab search =
767        if
768            ~(LENGTH pat <= LENGTH search)
769        then
770            (LENGTH search)
771        else
772            let
773                (j = checkPrefixRL pat search)
774            in
775                if
776                    ~(j < LENGTH patTab)
777                then
778                    0
779                else
780                    let
781                        (a = ORD (EL j search))
782                    in
783                        if
784                            ~(a < LENGTH (EL j patTab))
785                        then
786                            (LENGTH search + 1)
787                        else
788                            let
789                                (d = EL a (EL j patTab))
790                            in
791                                if
792                                    ~(0 < d)
793                                then
794                                    (LENGTH search + 1)
795                                else
796                                    (d + (bmRecurString pat patTab (DROP d search)))
797    )
798    `
799    (WF_REL_TAC `measure (\(p, t, s). LENGTH s)`
800    >> rw[DROP]);
801
802val BMRECUR_STRING_THM = store_thm(
803    "BMRECUR_STRING_THM",
804    ``bmRecur pat patTab alphabet search = bmRecurString pat patTab search``,
805    Cases_on `pat`
806    >> ONCE_REWRITE_TAC[bmRecur_def, bmRecurString_def]
807    >> simp[]
808    >> completeInduct_on `LENGTH search`
809    >> rpt STRIP_TAC
810    >> Cases_on `search`
811    >> ONCE_REWRITE_TAC[bmRecur_def, bmRecurString_def]
812    >> simp[ALPHABET_FIND_THM]);
813
814(* Calls bmRecurString instead of bmRecur.
815   That is: this implements the boyer-moore substring search algorithm to look
816   for the first appearance of a substring in a string - but in ACTUAL string
817   types *)
818val bmSearchString_def =
819    Define
820    `
821    bmSearchString (pat : string) (search : string) =
822        bmRecurString pat (mTab pat alphabet) search
823    `;
824
825(* Final proof that the bmSearchString function correctly searches
826   for the first substring *)
827val BMSEARCH_STRING_THM = store_thm(
828    "BMSEARCH_STRING_THM",
829    ``bmSearchString pat search = solution pat search``,
830    simp[bmSearchString_def]
831    >> `bmRecur pat (mTab pat alphabet) alphabet search = solution pat search`
832            suffices_by rw[BMRECUR_STRING_THM]
833    >> irule BMRECUR_THM
834    >> rpt conj_tac
835    >- metis_tac[MTAB_THM]
836    >- rw[MTAB_BND]
837    >- rw[MTAB_DIM]
838    >- rw[ALPHABET_THM]
839    >- rw[MTAB_DIM]
840    );
841
842val _ = export_theory();
843