1(*---------------------------------------------------------------------------*)
2(* Brzozowski-style regexp compilation with extension to character classes   *)
3(* and other improvements from a paper by Owens,Reppy, and Turon.            *)
4(*---------------------------------------------------------------------------*)
5
6structure regexpMatch :> regexpMatch =
7struct
8
9exception regexpErr of string * string
10
11fun ERR s1 s2 = regexpErr(s1,s2)
12fun pair x y = (x,y)
13fun K x y = x
14
15val debug = ref false
16
17val print = fn s => if !debug then print s else ()
18
19(*---------------------------------------------------------------------------*)
20(* Regular expressions have character sets at the leaves                     *)
21(*---------------------------------------------------------------------------*)
22
23type charset = Char.char Binaryset.set;
24
25val Empty_Charset = Binaryset.empty Char.compare;
26val empty_cset = Empty_Charset
27fun isEmpty_Charset cset = Binaryset.isEmpty cset;
28
29(* ----------------------------------------------------------------------
30    We use full range of 8-bit characters, but this can be changed.
31   ---------------------------------------------------------------------- *)
32
33val MAX_ORD = 255;  (* Char.maxOrd is 255 *)
34val alphabet = List.tabulate(MAX_ORD + 1, Char.chr)
35val ALPHABET_SIZE = MAX_ORD + 1;
36val allchars = Binaryset.addList(Empty_Charset, alphabet);
37val univ_cset = allchars
38
39fun charset_compare(cset1,cset2) =
40 let open Binaryset
41     fun compare [] [] = EQUAL
42       | compare [] _  = LESS
43       | compare _ []  = GREATER
44       | compare ((h1:char)::t1) (h2::t2) =
45           if h1 < h2 then LESS else
46           if h1 > h2 then GREATER
47           else compare t1 t2
48 in
49    if Systeml.pointer_eq(cset1,cset2) then EQUAL
50    else compare (listItems cset1) (listItems cset2)
51 end;
52
53(*---------------------------------------------------------------------------*)
54(* The type of regular expressions                                           *)
55(*---------------------------------------------------------------------------*)
56
57datatype regexp
58   = Epsilon
59   | Symbs of charset
60   | Not of regexp
61   | Sum of regexp * regexp
62   | And of regexp * regexp
63   | Dot of regexp * regexp
64   | Star of regexp;
65
66val Empty_Regexp = Symbs Empty_Charset;
67
68(*---------------------------------------------------------------------------*)
69(* Total order on regexp.                                                     *)
70(*---------------------------------------------------------------------------*)
71
72fun regexp_compare (r1,r2) =
73 if Systeml.pointer_eq(r1,r2) then EQUAL
74 else
75 case (r1,r2)
76  of (Epsilon, Epsilon) => EQUAL
77   | (Epsilon, _)       => LESS
78
79   | (Symbs _, Epsilon) => GREATER
80   | (Symbs a, Symbs b) => charset_compare(a,b)
81   | (Symbs _, _)       => LESS
82
83   | (Not _, Epsilon)   => GREATER
84   | (Not _, Symbs _)   => GREATER
85   | (Not r1, Not r2)   => regexp_compare(r1,r2)
86   | (Not r1, _)        => LESS
87
88   | (Sum _, Epsilon)    => GREATER
89   | (Sum _, Symbs _)    => GREATER
90   | (Sum _, Not _)      => GREATER
91   | (Sum p1, Sum p2)     => pair_regexp_compare (p1,p2)
92   | (Sum _, _)          => LESS
93
94   | (And _, Epsilon)   => GREATER
95   | (And _, Symbs _)   => GREATER
96   | (And _, Not _)     => GREATER
97   | (And _, Sum _)      => GREATER
98   | (And p1, And p2)   => pair_regexp_compare (p1,p2)
99   | (And _, _)         => LESS
100
101   | (Dot _, Epsilon)   => GREATER
102   | (Dot _, Symbs _)   => GREATER
103   | (Dot _, Not _)     => GREATER
104   | (Dot _, Sum _)      => GREATER
105   | (Dot _, And _)     => GREATER
106   | (Dot p1, Dot p2)   => pair_regexp_compare (p1,p2)
107   | (Dot _, _)         => LESS
108
109   | (Star _, Epsilon)  => GREATER
110   | (Star _, Symbs _)  => GREATER
111   | (Star _, Not _)    => GREATER
112   | (Star _, Sum _)    => GREATER
113   | (Star _, And _)    => GREATER
114   | (Star _, Dot _)    => GREATER
115   | (Star r1, Star r2) => regexp_compare (r1,r2)
116and
117 pair_regexp_compare ((r1,r2), (r3,r4)) =
118    case regexp_compare (r1,r3)
119     of EQUAL => regexp_compare (r2,r4)
120      | other => other;
121
122fun regexpEqual r1 r2 = (regexp_compare(r1,r2) = EQUAL);
123
124(*---------------------------------------------------------------------------*)
125(* Is Epsilon in the language of a regular expression?                       *)
126(*---------------------------------------------------------------------------*)
127
128fun hasEpsilon Epsilon      = true
129  | hasEpsilon (Symbs _)    = false
130  | hasEpsilon (Not r)      = not(hasEpsilon r)
131  | hasEpsilon (Sum(r1,r2))  = hasEpsilon r1 orelse hasEpsilon r2
132  | hasEpsilon (And(r1,r2)) = hasEpsilon r1 andalso hasEpsilon r2
133  | hasEpsilon (Dot(r1,r2)) = hasEpsilon r1 andalso hasEpsilon r2
134  | hasEpsilon (Star _)     = true;
135
136(*---------------------------------------------------------------------------*)
137(* Translation to DFA.                                                       *)
138(*---------------------------------------------------------------------------*)
139
140(*---------------------------------------------------------------------------*)
141(* Two regexps are equivalent if they have the same language. This is         *)
142(* expensive to decide, however Brzozowski found that a weaker notion of     *)
143(* equivalence sufficed in order to build DFAs from regexps.                  *)
144(*---------------------------------------------------------------------------*)
145
146fun normNot (Not(Not r)) = r
147  | normNot other = other;
148
149fun normStar (Star Epsilon) = Epsilon
150  | normStar (r as Star(Symbs cset)) = if isEmpty_Charset cset then Epsilon else r
151  | normStar (Star (r as Star _)) = r
152  | normStar otherwise = otherwise;
153
154fun normDot (Dot(Epsilon,r)) = r
155  | normDot (Dot(r,Epsilon)) = r
156  | normDot (r as Dot(r1 as Star a,Star b)) = if regexpEqual a b then r1 else r
157  | normDot (r as Dot(Dot(r1,r2),r3)) =
158    let fun linear (Dot(Dot(r1,r2),r3)) = Dot(r1,linear(Dot(r2,r3)))
159          | linear x = x
160    in linear r
161    end
162  | normDot (r as Dot(Symbs cset1,Symbs cset2)) =
163     if isEmpty_Charset cset1 orelse isEmpty_Charset cset2
164     then Empty_Regexp else r
165  | normDot (r as Dot(Symbs cset,_)) = if isEmpty_Charset cset then Empty_Regexp else r
166  | normDot (r as Dot(_,Symbs cset)) = if isEmpty_Charset cset then Empty_Regexp else r
167  | normDot otherwise = otherwise;
168
169fun normAnd (r as And(Symbs a,Symbs b)) = Symbs (Binaryset.intersection(a,b))
170  | normAnd (r as And(Symbs cset,_)) = if isEmpty_Charset cset then Empty_Regexp else r
171  | normAnd (r as And(_,Symbs cset)) = if isEmpty_Charset cset then Empty_Regexp else r
172  | normAnd (And(And(r1,r2),r3)) = normAnd(And(r1,normAnd(And(r2,r3))))
173  | normAnd (r as And(r1,s as And(r2,r3))) =
174     (case regexp_compare(r1,r2)
175       of EQUAL => s
176        | LESS => r
177        | GREATER => And(r2,normAnd(And(r1,r3))))
178  | normAnd (r as And(r1,r2)) =
179     (case regexp_compare(r1,r2)
180       of EQUAL => r1
181        | LESS => r
182        | GREATER => And(r2,r1))
183  | normAnd r = r;
184
185fun normSum (Sum(Symbs a, Symbs b)) = Symbs(Binaryset.union(a,b))
186  | normSum (t as Sum(Symbs a,r))   = if isEmpty_Charset a then r else t
187  | normSum (t as Sum(r, Symbs a))  = if isEmpty_Charset a then r else t
188  | normSum (Sum(Sum(r1,r2),r3)) = normSum(Sum(r1,normSum(Sum(r2,r3))))
189  | normSum (r as Sum(r1,s as Sum(r2,r3))) =
190     (case regexp_compare(r1,r2)
191       of EQUAL => s
192        | LESS => r
193        | GREATER => Sum(r2,normSum(Sum(r1,r3))))
194  | normSum (r as Sum(r1,r2)) =
195     (case regexp_compare(r1,r2)
196       of EQUAL => r1
197        | LESS => r
198        | GREATER => Sum(r2,r1))
199  | normSum r = r;
200
201fun norm (Not P)    = normNot(Not (norm P))
202  | norm (Sum(P,Q))  = normSum (Sum  (norm P, norm Q))
203  | norm (And(P,Q)) = normAnd(And (norm P, norm Q))
204  | norm (Dot(P,Q)) = normDot(Dot (norm P, norm Q))
205  | norm (Star P)   = normStar (Star (norm P))
206  | norm r = r;
207
208fun simpDeriv x Epsilon    = Empty_Regexp
209  | simpDeriv x (Symbs cs) = if Binaryset.member(cs,x) then Epsilon else Empty_Regexp
210  | simpDeriv x (Not P)    = normNot(Not (simpDeriv x P))
211  | simpDeriv x (Sum(P,Q))  = normSum (Sum  (simpDeriv x P,simpDeriv x Q))
212  | simpDeriv x (And(P,Q)) = normAnd(And (simpDeriv x P, simpDeriv x Q))
213  | simpDeriv x (Dot(P,Q)) =
214      let val pq = normDot(Dot (simpDeriv x P, Q))
215      in if hasEpsilon P
216           then normSum(Sum(pq,simpDeriv x Q))
217           else pq
218      end
219  | simpDeriv x (Star P) = normDot (Dot(simpDeriv x P, normStar(Star P)));
220
221
222(*---------------------------------------------------------------------------*)
223(* Approximate character classes. A character class is a set of charsets.    *)
224(*---------------------------------------------------------------------------*)
225
226val empty_class = Binaryset.empty charset_compare;
227fun classList chlist = Binaryset.addList(empty_class, chlist)
228val allchar_class = classList [allchars];
229
230fun cross [] l2 = []
231  | cross (h::t) l2 = map (pair h) l2 @ cross t l2;
232
233fun moodge class1 class2 =
234 let open Binaryset
235     val pairs = cross (listItems class1) (listItems class2)
236 in
237   List.foldl (fn ((p:char set * char set), acc) =>
238                  add (acc,Binaryset.intersection p))
239              empty_class
240              pairs
241 end;
242
243fun approxClasses Epsilon = allchar_class
244  | approxClasses (Symbs a) = classList [a, Binaryset.difference(allchars,a)]
245  | approxClasses (Not r) = approxClasses r
246  | approxClasses (Star r) = approxClasses r
247  | approxClasses (Sum(r1,r2)) = moodge (approxClasses r1) (approxClasses r2)
248  | approxClasses (And(r1,r2)) = moodge (approxClasses r1) (approxClasses r2)
249  | approxClasses (Dot(r1,r2)) =
250       if hasEpsilon r1
251         then moodge (approxClasses r1) (approxClasses r2)
252         else approxClasses r1;
253
254val Qempty = Binaryset.empty regexp_compare
255fun pair_compare (f1, f2) ((x1,y1), (x2,y2)) =
256    case f1 (x1, x2) of
257        EQUAL => f2 (y1, y2)
258      | x => x
259
260val deltaMap_key_compare = pair_compare (regexp_compare, charset_compare)
261val deltaMap_initial =
262      Binarymap.mkDict deltaMap_key_compare :(regexp*charset,regexp)Binarymap.dict
263fun insert_deltaMap ((d,r), deltaMap) = Binarymap.insert(deltaMap,d,r);
264
265fun pick class = Option.valOf(Binaryset.find (K true) class);
266
267(*---------------------------------------------------------------------------*)
268(* Map a regular expression to a DFA representation.                         *)
269(*---------------------------------------------------------------------------*)
270
271fun regexp_to_dfa r =
272 let val counter = ref 0
273     fun work [] (Q,deltaMap) = (Q,deltaMap)
274       | work (r::t) (Q,deltaMap) =
275          if Binaryset.member(Q,r)  (* state r already seen? *)
276            then (print "re-encountering state.\n";
277                  work t (Q,deltaMap))
278          else let
279            val _ = print ("computing outarcs for state :"^
280                           Int.toString(!counter)^".\n")
281            val _ = counter := !counter + 1
282            val charclasses = List.filter (not o Binaryset.isEmpty)
283                                          (Binaryset.listItems(approxClasses r))
284            val _ = print ("Number of arcs: "^
285                           Int.toString(List.length charclasses)^"\n")
286            val outarcs =
287                List.map (fn c => ((r,c),simpDeriv (pick c) r)) charclasses
288            val targets = List.map #2 outarcs
289            val deltaMap' = List.foldl insert_deltaMap deltaMap outarcs
290            val Q' = Binaryset.add(Q,r)
291          in
292             work (targets@t) (Q',deltaMap')
293          end
294     val initialState = norm r
295     val (Q,deltaMap) = work [initialState] (Qempty,deltaMap_initial)
296     val finalStates = Binaryset.foldl (fn (q,acc) =>
297               if hasEpsilon q then Binaryset.add(acc,q) else acc)
298               Qempty Q
299 in
300   (Q,initialState,deltaMap,finalStates)
301 end
302
303fun dfa_to_arrays (Q,q0,deltaMap,F) =
304 let val _ = print "Mapping to array representation.\n"
305     val states = Binaryset.listItems Q
306     val nstates = List.length states
307     val Qmap = Vector.fromList states
308     fun Qindex i = Vector.sub(Qmap,i)  (* int -> regexp *)
309     val QindexInv = Vector.foldli(fn(i,r,m) => Binarymap.insert(m,r,i))
310                       (Binarymap.mkDict regexp_compare) Qmap
311     fun QindexInvFn r = Binarymap.find(QindexInv,r) (* regexp -> int *)
312     fun row i =
313      let val q = Qindex i
314          val outarcs = Binarymap.foldl(fn((regexp,charset),regexp',arcs)
315                         => if regexpEqual regexp q
316                             then (charset,regexp')::arcs
317                             else arcs) [] deltaMap
318          fun flatarc (charset,regexp) =
319                 List.map (fn c => (Char.ord(c),regexp))
320                          (Binaryset.listItems charset)
321          val flatarcs = List.concat (List.map flatarc outarcs)
322          val sortarcs =
323              Listsort.sort (fn ((a,b), (c,d)) => Int.compare(a,c)) flatarcs
324          val sortarcs' = List.map (fn (i,q) => QindexInvFn q) sortarcs
325          val _ = if List.length sortarcs' <> ALPHABET_SIZE
326                   then raise ERR
327                              "dfa_to_arrays"
328                              "total partition elements differs from \
329                              \size of alphabet"
330                   else ()
331      in
332          Vector.fromList sortarcs'
333      end
334 in
335   {delta = Vector.tabulate(nstates, row),
336    start = QindexInvFn q0,
337    final = Vector.tabulate(nstates, fn i => Binaryset.member(F,Qindex i))
338   }
339 end;
340
341val regexp_to_dfa_arrays = dfa_to_arrays o regexp_to_dfa;
342
343fun match r =
344 let val {delta,start,final} = regexp_to_dfa_arrays r
345     val _ = print (String.concat["DFA states: ",
346                    Int.toString(Vector.length delta),".\n"])
347     fun step (a,q) = Vector.sub(Vector.sub(delta,q), Char.ord a)
348     fun exec ss = Substring.foldl step start ss
349 in
350   fn s => Vector.sub(final,exec (Substring.full s))
351 end;
352
353fun pred_to_set P =
354    Binaryset.foldl (fn (c, acc) => if P c then Binaryset.add(acc, c) else acc)
355                    empty_cset
356                    allchars
357
358fun fromList cs = Binaryset.addList (empty_cset, cs)
359
360structure POSIX = struct
361  val alnum_set = pred_to_set Char.isAlphaNum
362  val alpha_set = pred_to_set Char.isAlpha
363  val ascii_set = pred_to_set Char.isAscii
364  val blank_set = fromList [#" ", #"\t"]
365  val cntrl_set = pred_to_set Char.isCntrl
366  val digit_set = pred_to_set Char.isDigit
367  val graph_set = pred_to_set Char.isGraph
368  val lower_set = pred_to_set Char.isLower
369  val print_set = pred_to_set Char.isPrint
370  val punct_set = pred_to_set Char.isPunct
371  val space_set = pred_to_set Char.isSpace
372  val upper_set = pred_to_set Char.isUpper
373  val xdigit_set = pred_to_set Char.isHexDigit
374  val word_set = Binaryset.add(alnum_set, #"_")
375
376end
377
378end
379