1(*---------------------------------------------------------------------------*)
2(* ML type of regexps                                                        *)
3(*---------------------------------------------------------------------------*)
4
5structure Regexp_Type :> Regexp_Type =
6struct
7
8open Lib Feedback regexpMisc;
9
10val ERR = mk_HOL_ERR "Regexp_Type";
11
12fun copy x n = List.tabulate (n,K x) handle _ => [];
13
14(*---------------------------------------------------------------------------*)
15(* Alphabet                                                                  *)
16(*---------------------------------------------------------------------------*)
17
18val alphabet_size = 256;
19val alphabetN = List.tabulate (alphabet_size,I)
20val alphabet = map Char.chr alphabetN;
21
22(*---------------------------------------------------------------------------*)
23(* Charsets                                                                  *)
24(*---------------------------------------------------------------------------*)
25
26type w64 = Word64.word;
27type charset = w64 * w64 * w64 * w64;
28
29val charset_empty : charset = (0wx0,0wx0,0wx0,0wx0);
30
31fun charset_compl (u1,u2,u3,u4) : charset =
32 let open Word64
33 in (notb u1, notb u2, notb u3, notb u4)
34 end;
35
36val charset_full : charset = charset_compl charset_empty;
37
38fun charset_union (u1,u2,u3,u4) (v1,v2,v3,v4) : charset =
39 let open Word64
40 in (orb (u1,v1), orb (u2,v2), orb (u3,v3), orb (u4,v4))
41 end;
42
43fun charset_inter (u1,u2,u3,u4) (v1,v2,v3,v4) =
44 let open Word64
45 in (andb (u1,v1), andb (u2,v2), andb (u3,v3), andb (u4,v4))
46 end;
47
48fun charset_diff cs1 cs2 = charset_inter cs1 (charset_compl cs2);
49
50val sing_charsets =
51 let fun setbit n = Word64.<< (0wx1,Word.fromInt n)
52     val sing64 = List.map setbit (upto 0 63)
53 in Vector.fromList
54     (List.map (fn w => (w,0wx0,0wx0,0wx0):charset) sing64 @
55      List.map (fn w => (0wx0,w,0wx0,0wx0):charset) sing64 @
56      List.map (fn w => (0wx0,0wx0,w,0wx0):charset) sing64 @
57      List.map (fn w => (0wx0,0wx0,0wx0,w):charset) sing64)
58 end;
59
60fun charset_mem c ((w1,w2,w3,w4):charset) =
61 let val i = Char.ord c
62     val (s1,s2,s3,s4) = Vector.sub(sing_charsets,i)
63     val result =
64        Word64.andb
65          (if i < 64  then (w1,s1) else
66           if i < 128 then (w2,s2) else
67           if i < 192 then (w3,s3) else (w4,s4))
68 in
69   result <> 0wx0
70 end
71
72fun charset_insert c cset =
73  charset_union (Vector.sub(sing_charsets,Char.ord c)) cset;
74
75fun charset_elts cs = filter (C charset_mem cs) alphabet;
76
77fun charset_of clist = itlist charset_insert clist charset_empty;
78
79fun charset_sing c = charset_of [c];
80
81fun charset_span b t =
82 if 0 <= b andalso b <= 255 andalso
83    0 <= t andalso t <= 255
84 then
85   charset_of (map Char.chr (upto b t))
86 else raise ERR "charset_span" "";
87
88fun charset_compare ((u1,u2,u3,u4),(v1,v2,v3,v4)) =
89 let open Word64
90 in case compare(u1,v1)
91     of LESS => LESS
92      | GREATER => GREATER
93      | EQUAL =>
94    case compare (u2,v2)
95     of LESS => LESS
96      | GREATER => GREATER
97      | EQUAL =>
98    case compare (u3,v3)
99     of LESS => LESS
100      | GREATER => GREATER
101      | EQUAL => compare (u4,v4)
102 end;
103
104val charset_digit = charset_of (String.explode"0123456789");
105
106val charset_alpha = charset_of
107  (String.explode "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
108
109val charset_alphanum =
110  charset_insert #"_"
111      (charset_union charset_digit charset_alpha);
112
113val charset_whitespace = charset_of (String.explode" \n\r\t\f");
114
115
116(*---------------------------------------------------------------------------*)
117(* regexp datatype                                                           *)
118(*---------------------------------------------------------------------------*)
119
120datatype regexp
121   = Chset of charset
122   | Cat of regexp * regexp
123   | Star of regexp
124   | Or of regexp list
125   | Neg of regexp;
126
127fun And (r1,r2) = Neg(Or[Neg r1,Neg r2]);
128fun Diff (r1,r2) = And(r1,Neg r2);
129
130val WHITESPACE = Chset charset_whitespace
131val DIGIT      = Chset charset_digit
132val ALPHA      = Chset charset_alpha
133val ALPHANUM   = Chset charset_alphanum
134val EMPTY      = Chset charset_empty
135val SIGMA      = Chset charset_full
136val DOT        = SIGMA;
137val EPSILON    = Star EMPTY;
138val SIGMASTAR  = Star SIGMA;
139
140fun mk_or [] = EMPTY
141  | mk_or [x] = x
142  | mk_or list = Or list;
143
144fun catlist [] = EPSILON
145  | catlist [x] = x
146  | catlist (h::t) = Cat (h,catlist t);
147
148fun strip_cat r =
149 case r
150  of Cat(a,b) => a::strip_cat b
151   | otherwise => [r]
152
153val dots = copy DOT;
154
155fun replicate x (n:int) = catlist (copy x n);
156
157fun ranged r i j =
158 if j < i then EMPTY
159 else Or (map (replicate r) (upto i j));
160
161(*---------------------------------------------------------------------------*)
162(* compressed version of ranged. Not used since it is hard for derivative    *)
163(* taker to do well with the nesting.                                        *)
164(*---------------------------------------------------------------------------*)
165
166fun ranged_alt r i j =
167 if j < i then EMPTY
168 else Cat (replicate r i,replicate (Or [EPSILON,r]) (j-i))
169
170(*---------------------------------------------------------------------------*)
171(* Prettyprinting                                                            *)
172(*---------------------------------------------------------------------------*)
173
174fun charset_string cset =
175 if cset = charset_full
176  then "." else
177 if cset = charset_empty
178  then "[]" else
179 if cset = charset_digit
180  then "\\d" else
181 if cset = charset_alphanum
182  then "\\w" else
183 if cset = charset_whitespace
184  then "\\s"
185 else
186 let fun prchar ch =
187      if mem ch [#"[", #"]"] then
188         "\\" ^ String.str ch
189      else
190      if Char.isGraph ch then
191         String.str ch
192      else
193      let val i = Char.ord ch
194      in String.concat
195               ["\\", (if i <= 9 then "00" else
196                      if i <= 100 then "0" else ""),
197                Int.toString i]
198      end
199     fun printerval (i,j) =
200      if i=j then prchar (Char.chr i)
201      else String.concat [prchar (Char.chr i),"-", prchar(Char.chr j)]
202     val ords = List.map Char.ord (charset_elts cset)
203     val interval_strings = String.concat (map printerval (intervals ords))
204 in
205   String.concat ["[", interval_strings, "]"]
206 end;
207
208
209(*---------------------------------------------------------------------------*)
210(* precedence: Or = 1                                                        *)
211(*            Cat = 2                                                        *)
212(*            Neg = 3                                                        *)
213(*           Star = 4                                                        *)
214(*        charset = 5                                                        *)
215(*---------------------------------------------------------------------------*)
216
217val pp_regexp =
218 let open Portable PP
219     fun paren i j s1 s2 ps =
220       if i < j then block CONSISTENT 0 ps
221       else block INCONSISTENT (size s1) (add_string s1 :: ps @ [add_string s2])
222     fun pp i regexp =
223      case regexp
224       of Chset cs => add_string (charset_string cs)
225        | Cat(s,t) =>
226            let val rlist = strip_cat regexp
227            in
228               paren i 2 "(" ")" (
229                 pr_list (pp 2) [add_break(0,0)] rlist
230               )
231            end
232        | Or rlist =>
233               paren i 1 "(" ")" (
234                 pr_list (pp 1) [add_string " |", add_break(1,0)] rlist
235               )
236        | Neg r =>
237               paren i 3 "(" ")" [
238                  add_string "~",
239                  pp 3 r
240                ]
241        | Star r =>
242               paren i 4 "(" ")" [
243                 pp 4 r,
244                 add_string "*"
245               ]
246 in
247   pp 0
248 end;
249
250val _ = PolyML.addPrettyPrinter (fn d => fn _ => fn r => pp_regexp r);
251
252(*---------------------------------------------------------------------------*)
253(* print regexp as string to stdOut                                          *)
254(*---------------------------------------------------------------------------*)
255
256val print_regexp = Portable.pprint pp_regexp;
257fun println_regexp r = (print_regexp r; print "\n");
258
259(*---------------------------------------------------------------------------*)
260(* regexp comparison                                                         *)
261(*---------------------------------------------------------------------------*)
262
263fun list_cmp cmp pair =
264 case pair
265  of ([],[]) => EQUAL
266   | ([],_)  => LESS
267   | (_,[])  => GREATER
268   | (h1::t1,h2::t2) =>
269      (case cmp (h1,h2)
270        of EQUAL => list_cmp cmp (t1,t2)
271         | verdict => verdict);
272
273fun len_cmp l1 l2 =
274 if null l1
275   then (if null l2 then EQUAL else LESS)
276   else (if null l2 then GREATER else len_cmp(tl l1) (tl l2));
277
278fun list_compare cmp l1 l2 =
279  case len_cmp l1 l2
280   of EQUAL => list_cmp cmp (l1,l2)
281    | strict => strict;
282
283fun regexp_compare pair =
284 if PolyML.pointerEq pair
285 then EQUAL
286 else
287 case pair
288  of (Chset cs1, Chset cs2) => charset_compare(cs1,cs2)
289   | (Chset cs, r) => LESS
290   | (Cat (r,s), Chset _) => GREATER
291   | (Cat(r1,r2),Cat(r3,r4)) =>
292         (case regexp_compare (r1,r3)
293           of LESS => LESS
294            | GREATER => GREATER
295            | EQUAL => regexp_compare (r2,r4))
296   | (Cat(r,s), _) => LESS
297   | (Star r, Chset _) => GREATER
298   | (Star r, Cat (s,t)) => GREATER
299   | (Star r,Star s) => regexp_compare(r,s)
300   | (Star r, _) => LESS
301   | (Or rs,Chset _) => GREATER
302   | (Or rs,Cat (r,s)) => GREATER
303   | (Or rs,Star _) => GREATER
304   | (Or rs1,Or rs2) => list_compare regexp_compare (List.rev rs1) (List.rev rs2)
305   | (Or rs, _) => LESS
306   | (Neg r,Neg s) =>  regexp_compare (r,s)
307   | (Neg r,_) => GREATER
308;
309
310
311(*---------------------------------------------------------------------------*)
312(* Lexer and parser for Python-style regexp syntax.                          *)
313(*---------------------------------------------------------------------------*)
314
315datatype lexeme
316   = lparen
317   | rparen
318   | dot
319   | digit
320   | alphanum
321   | whitespace
322   | interval
323   | char of Char.char
324   | chars of charset
325   | preFix of Char.char  (* ~ *)
326   | inFix of Char.char   (* &| *)
327   | postFix of Char.char (* *+? *)
328   | power of IntInf.int
329   | range of IntInf.int option * IntInf.int option  (* range + direction *)
330   | const of IntInf.int
331;
332
333fun lexeme_equal lparen lparen = true
334  | lexeme_equal rparen rparen = true
335  | lexeme_equal dot dot = true
336  | lexeme_equal digit digit = true
337  | lexeme_equal alphanum alphanum = true
338  | lexeme_equal whitespace whitespace = true
339  | lexeme_equal interval interval = true
340  | lexeme_equal (const cd1) (const cd2) = (cd1=cd2)
341  | lexeme_equal (char c1) (char c2) = (c1=c2)
342  | lexeme_equal (chars c1) (chars c2) = (charset_compare(c1,c2) = EQUAL)
343  | lexeme_equal (preFix c1) (preFix c2) = (c1=c2)
344  | lexeme_equal (inFix c1) (inFix c2) = (c1=c2)
345  | lexeme_equal (postFix c1) (postFix c2) = (c1=c2)
346  | lexeme_equal (power i1) (power i2) = (i1=i2)
347  | lexeme_equal (range io1) (range io2) = (io1=io2)
348  | lexeme_equal x y = false
349
350(*---------------------------------------------------------------------------*)
351(* Following special characters need to be back-slashed to avoid             *)
352(* interpretation.                                                           *)
353(*                                                                           *)
354(*    val specialChar = Char.contains ".^$*+?{}[]\\|()"                      *)
355(*                                                                           *)
356(*---------------------------------------------------------------------------*)
357
358fun opt2string (lowOpt,hiOpt) = String.concat
359 ["{",
360     case (lowOpt,hiOpt)
361      of (SOME i,SOME j) => (IntInf.toString i^","^ IntInf.toString j)
362       | (SOME i, NONE)  => (IntInf.toString i^",")
363       | (NONE, SOME j)  => (","^IntInf.toString j)
364       | otherwise => raise ERR "lexeme2string"
365                                "opt2string: unexpected format",
366  "}"]
367
368fun const2string i =
369 String.concat
370      ["\\k{", IntInf.toString i, "}"]
371
372fun bstring b = if b then "1" else "0";
373
374fun lexeme2string lparen    = "("
375  | lexeme2string rparen    = ")"
376  | lexeme2string dot       = "."
377  | lexeme2string digit     = "\\d"
378  | lexeme2string alphanum  = "\\w"
379  | lexeme2string whitespace = "\\s"
380  | lexeme2string interval   = "\\i"
381  | lexeme2string (char c)   = Char.toString c
382  | lexeme2string (chars s)  = "<charset>"
383  | lexeme2string (preFix c) = Char.toString c
384  | lexeme2string (inFix c)  = Char.toString c
385  | lexeme2string (postFix c) = Char.toString c
386  | lexeme2string (power n)   = "{"^IntInf.toString n^"}"
387  | lexeme2string (range t)   = opt2string t
388  | lexeme2string (const c)   = const2string c
389
390
391(*---------------------------------------------------------------------------*)
392(* Lexing support                                                            *)
393(*---------------------------------------------------------------------------*)
394
395fun takeWhile P ss =
396  let open Substring
397      val (ls, ss') = splitl P ss
398  in if isEmpty ls
399      then NONE
400      else SOME (string ls, ss')
401  end
402
403fun compose f NONE = NONE
404  | compose f (SOME (x,y)) = f x y;
405
406fun try_alt f1 f2 strm =
407  case f1 strm
408   of NONE => f2 strm
409    | other => other
410
411fun chomp ch =
412 let open Substring
413 in compose (fn l => fn strm =>
414      case getc strm
415       of SOME(c,strm') => if ch=c then SOME(l,strm') else NONE
416        | NONE => NONE)
417 end;
418
419fun eat_then ch f strm =
420 case Substring.getc strm
421  of NONE => NONE
422   | SOME(ch1,strm) => if ch=ch1 then f strm else NONE;
423
424fun is_tilde ch = (ch = #"~");
425
426fun getNum ss =
427 compose (fn s => fn ss' =>
428    case IntInf.fromString s
429     of NONE => NONE
430      | SOME i => SOME(i,ss'))
431  (takeWhile (fn c => Char.isDigit c orelse is_tilde c) ss);
432
433fun mk_right_range x = compose
434 (fn i => fn strm => SOME(range(NONE,SOME i),strm)) x;
435
436fun mk_left_range i x = compose
437 (fn _ => fn strm => SOME(range(SOME i,NONE),strm)) x;
438
439(*---------------------------------------------------------------------------*)
440(* Lexing powers and ranges: we've seen a "{": now get the rest : one of     *)
441(*                                                                           *)
442(*    "<d>}"                                                                 *)
443(*    "<d>,}"                                                                *)
444(*    ",<d>}"                                                                *)
445(*    "<d>,<d>}"                                                             *)
446(*                                                                           *)
447(*---------------------------------------------------------------------------*)
448
449val comma = #",";
450
451fun get_range strm =
452 let open Substring
453 in case getc strm
454     of NONE => NONE
455      | SOME(#",",strm') => mk_right_range(chomp #"}" (getNum strm'))
456      | otherwise =>
457    case getNum strm
458     of NONE => NONE
459      | SOME(i,strm') =>
460    case getc strm'
461     of NONE => NONE
462      | SOME(#"}",strm'') => SOME(power i,strm'')
463      | SOME(c,strm'') => if c <> comma then NONE else
464    case getc strm''
465     of NONE => NONE
466      | SOME(#"}",strm''') => SOME(range(SOME i,NONE),strm''')
467      | otherwise =>
468    case getNum strm''
469     of NONE => NONE
470      | SOME(j,strm''') =>
471    case getc strm'''
472     of NONE => NONE
473      | SOME(#"}",strm1) => SOME(range(SOME i,SOME j),strm1)
474      | otherwise => NONE
475 end
476
477
478fun get_const strm =
479 let fun get strm =
480      case getNum strm
481       of NONE =>  NONE
482        | SOME(i,strm) =>
483             (eat_then #"}" (fn strm => SOME (const(i), strm)))
484             strm
485 in
486  eat_then #"{" get strm
487 end
488
489(*---------------------------------------------------------------------------*)
490(* Lexing a character set, which has the form                                *)
491(*                                                                           *)
492(*    [ ... ]  or [^ ... ]                                                   *)
493(*                                                                           *)
494(* where the latter takes the complement of the following charset.  Ranges   *)
495(* of the form c1-c2 are supported, provided they make sense. Backslashing   *)
496(* characters is also allowed. Inside a charset, an explicit number denoting *)
497(* a char can be written as a backslash followed by three decimal digits,    *)
498(* i.e., \ddd.                                                               *)
499(*---------------------------------------------------------------------------*)
500
501fun get_charset strm =
502 let fun compl(chars cs,strm) = (chars(charset_diff charset_full cs),strm)
503       | compl otherwise = raise ERR "get_charset" "compl"
504     fun elim_decimal_chars list =
505       let in
506         case list
507          of #"\\"::d1::d2::d3::t =>
508            if Char.isDigit d1 andalso Char.isDigit d2 andalso Char.isDigit d3
509              then (case Int.fromString (String.implode [d1,d2,d3])
510                    of SOME i =>
511                        if Int.<(i,256) then chr(i)::elim_decimal_chars t
512                        else raise ERR "elim_decimal_chars"
513                              "malformed charset element: looking for \\ddd less than 256"
514                     | NONE => raise ERR "elim_decimal_chars"
515                              "number from \\ddd is too big"
516                   )
517              else List.hd list :: elim_decimal_chars (List.tl list)
518          | ch::t => ch :: elim_decimal_chars t
519          | [] => []
520      end
521     fun mk_chars (ch1 :: #"-" :: ch2 :: t) =
522         if Char.<=(ch1,ch2)
523          then let val clist = map Char.chr (upto (Char.ord ch1) (Char.ord ch2))
524               in clist @ mk_chars t
525               end
526          else raise ERR "lex.get_charset.mk_chars" "bad range"
527       | mk_chars (#"\\"::ch::t) = ch::mk_chars t
528       | mk_chars (ch::t) = ch::mk_chars t
529       | mk_chars [] = []
530     fun get_charset_body strm acc =
531        case Substring.getc strm
532         of SOME(#"]",strm') => (rev acc,strm')
533          | SOME(ch,strm') => get_charset_body strm' (ch::acc)
534          | NONE => raise ERR "lex.get_charset.get_charset_body"
535                     "end of input before getting to end of charset"
536     fun mk_cset (clist,strm) =
537        (chars (charset_of (mk_chars (elim_decimal_chars clist))),
538         strm)
539 in case Substring.getc strm
540     of NONE => NONE
541      | SOME(#"]",strm') => SOME(chars charset_empty, strm')
542      | SOME(#"^",strm') => SOME(compl (mk_cset (get_charset_body strm' [])))
543      | SOME(ch, strm')  => SOME(mk_cset(get_charset_body strm' [ch]))
544 end
545
546fun lex strm =
547 let open Substring
548 in case getc strm
549     of NONE => NONE
550      | SOME (#"(",strm') => SOME(lparen,strm')
551      | SOME (#")",strm') => SOME(rparen,strm')
552      | SOME (#"[",strm') => get_charset strm'
553      | SOME (#"{",strm') => get_range strm'
554      | SOME (#".",strm') => SOME(dot,strm')
555      | SOME (#"~",strm') => SOME(preFix #"~",strm')
556      | SOME (#"|",strm') => SOME(inFix #"|",strm')
557      | SOME (#"&",strm') => SOME(inFix #"&",strm')
558      | SOME (#"*",strm') => SOME(postFix #"*",strm')
559      | SOME (#"+",strm') => SOME(postFix #"+",strm')
560      | SOME (#"?",strm') => SOME(postFix #"?",strm')
561      | SOME (#"\\",strm') =>
562        (case getc strm'
563          of NONE => NONE
564           | SOME(#"d",strm'') => SOME(digit,strm'')
565           | SOME(#"w",strm'') => SOME(alphanum,strm'')
566           | SOME(#"s",strm'') => SOME(whitespace,strm'')
567           | SOME(#"i",strm'') => SOME(interval,strm'')
568           | SOME(#"k",strm'') => get_const strm''
569           | SOME(ch,strm'')   => SOME(char ch,strm'')
570        )
571      | SOME (ch,strm') => SOME (char ch,strm')
572 end
573
574fun lexemes ss = case lex ss of NONE => [] | SOME(l,ss') => l::lexemes ss'
575
576(*
577val ss = Substring.full "[abc\234d]";
578val ss = Substring.full "[abc\\234d]";
579val ss = Substring.full "[\117-\234\010]";
580val [chars cset] = lexemes ss;
581*)
582
583
584(*---------------------------------------------------------------------------*)
585(* Now parsing                                                               *)
586(*---------------------------------------------------------------------------*)
587
588fun PARSE_ERR s ss =
589 let open Substring
590     val estring = String.concat
591         ["Regexp parser failed!\n   ",s,
592          "\n   Remaining input: ", string ss, ".\n"]
593 in
594   raise ERR "PARSE_ERR" estring
595 end;
596
597(*---------------------------------------------------------------------------*)
598(* We parse into tree, from which it is easy to generate regexp               *)
599(*---------------------------------------------------------------------------*)
600
601datatype tree
602   = Ident of Char.char
603   | Cset of charset
604   | Ap of string * tree list
605   | Power of tree * int
606   | Range of tree * int option * int option
607   | Interval of IntInf.int * IntInf.int
608   | Const of IntInf.int;
609
610
611fun expect lexeme (stk,ss) =
612  case lex ss
613   of SOME (lexed, ss') =>
614       if lexeme_equal lexed lexeme
615         then (stk,ss')
616         else PARSE_ERR ("Expected "^quote(lexeme2string lexeme)) ss
617    | NONE => PARSE_ERR ("Empty input when expecting "
618                        ^quote(lexeme2string lexeme)) ss;
619
620fun opr #"*" = "Star"
621  | opr #"+" = "Plus"
622  | opr #"?" = "Opt"
623  | opr #"|" = "Sum"
624  | opr #"&" = "And"
625  | opr #"~" = "Not"
626  | opr ch = raise ERR "opr" ("unknown operator: "^Char.toString ch)
627
628val isInfix = Char.contains"|&";
629
630(*---------------------------------------------------------------------------*)
631(* Essential grammar being parsed. Rules given in order of increasing        *)
632(* precedence.                                                               *)
633(*                                                                           *)
634(*  re ::= re "|" re                                                         *)
635(*      |  re "&" re                                                         *)
636(*      | re <dot> re  ;; <dot> is implicit because concat. is invisible     *)
637(*      | ~re                                                                *)
638(*      | re<p>        ;; p in {+,*,?} or a power or a range                 *)
639(*      | "(" re ")"                                                         *)
640(*      | charset                                                            *)
641(*                                                                           *)
642(*---------------------------------------------------------------------------*)
643
644fun parse ss = Disj ([],ss)
645and
646 Disj (stk,ss) =
647  let val (stk',ss') = Conj (stk,ss)
648  in case lex ss'
649      of SOME (inFix #"|",ss'') =>
650          (case Disj(stk',ss'')
651           of (e1::e2::t,ss''') => (Ap("Sum",[e2,e1])::t,ss''')
652            | otherwise => PARSE_ERR "expected right part of | expression" ss'')
653       | other => (stk',ss')
654  end
655and
656 Conj (stk,ss) =
657  let val (stk',ss') = Juxt (stk,ss)
658  in case lex ss'
659      of SOME (inFix #"&",ss'') =>
660          (case Conj(stk',ss'')
661           of (e1::e2::t,ss''') => (Ap("And",[e2,e1])::t,ss''')
662            | otherwise => PARSE_ERR "expected right part of & expression" ss'')
663       | other => (stk',ss')
664  end
665and
666 Juxt (stk,ss) =
667  let val (stk',ss') = Prefix (stk,ss)
668  in case lex ss'
669      of SOME(inFix _,_)   => (stk',ss')
670       | SOME(rparen,_)    => (stk',ss')
671       | SOME(postFix _,_) => (stk',ss')
672       | SOME(power _,_)   => (stk',ss')
673       | SOME(range _,_)   => (stk',ss')
674       | SOME other =>
675          (case Juxt(stk',ss')
676            of (e1::e2::t,ss'') => (Ap("Juxt",[e2,e1])::t,ss'')
677             | otherwise => PARSE_ERR "expected another regexp" ss')
678       | NONE => (stk',ss')
679  end
680and
681 Prefix (stk,ss) =
682  case lex ss
683   of SOME(preFix ch,ss') =>
684       (case Prefix(stk,ss')
685         of (h::t,ss'') => (Ap(opr ch,[h])::t,ss'')
686          | ([],_) => PARSE_ERR "missing argument to ~" ss')
687    | otherwise => Postfix(stk,ss)
688and
689 Postfix (stk,ss) =
690  let fun Postfixen (stk,ss) = (* deal with multiple post-fixes *)
691       case lex ss
692         of SOME(postFix ch,ss') =>
693             (case stk
694               of h::t => Postfixen(Ap(opr ch,[h])::t,ss')
695                | [] => PARSE_ERR "missing argument to postfix operator" ss)
696          | SOME(power i,ss') =>
697             (case stk
698               of h::t => Postfixen(Power(h,IntInf.toInt i)::t,ss')
699                | [] => PARSE_ERR "missing regexp for {-} operator" ss)
700          | SOME(range(o1,o2),ss') =>
701             (case stk
702               of Ap("interval",[])::t
703                   => (case (o1,o2)
704                        of (SOME i, SOME j) => Postfixen(Interval(i,j)::t,ss')
705                         | otherwise => PARSE_ERR "incomplete interval" ss)
706                | h::t => let val o1' = Option.map IntInf.toInt o1
707                              val o2' = Option.map IntInf.toInt o2
708                          in Postfixen(Range(h,o1',o2')::t,ss')
709                          end
710                | [] => PARSE_ERR "missing regexp for {-,-} operator" ss)
711          | otherwise => (stk,ss)
712  in Postfixen (Atom (stk,ss))
713  end
714and
715 Atom (stk,ss) =
716  case lex ss
717   of SOME(dot,ss')        => (Ap("dot",[])::stk,ss')
718    | SOME(char ch,ss')    => (Ident ch::stk,ss')
719    | SOME(chars cset,ss') => (Cset cset::stk,ss')
720    | SOME(digit,ss')      => (Ap("digit",[])::stk,ss')
721    | SOME(alphanum,ss')   => (Ap("alphanum",[])::stk,ss')
722    | SOME(whitespace,ss') => (Ap("whitespace",[])::stk,ss')
723    | SOME(interval,ss')   => (Ap("interval",[])::stk,ss')
724    | SOME(const copt,ss') => (Const copt::stk,ss')
725    | SOME(lparen,ss')     =>
726       let in
727         case expect rparen (parse ss')
728          of ([x],ss'') => (x::stk,ss'')
729           | other => PARSE_ERR "unable to complete parse inside parentheses" ss'
730       end
731    | other => (stk,ss)
732;
733
734val tree_parse = parse;
735
736fun substring_to_tree sstring =
737 case parse sstring
738  of ([t],ss) => if Substring.isEmpty ss then t
739                 else PARSE_ERR "remaining input after successful parse" ss
740   | ([],ss) => PARSE_ERR "no parse" ss
741   | (other,ss) => PARSE_ERR "multiple parses" ss;
742
743val string_to_tree = substring_to_tree o Substring.full;
744
745fun quote_to_tree input =
746 case input
747  of [QUOTE s] =>
748     let open Substring
749         val ss = full s
750         val (_,ss') = position (String.implode[#"*",#")"]) ss
751     in substring_to_tree (slice(ss',2,NONE))
752     end
753 | otherwise => raise ERR "quote_to_tree"
754               "expected a simple quotation of the form ` .... `";
755
756fun tree_to_regexp intervalFn =
757 let fun t2r tree =
758  case tree
759   of Ident ch => Chset (charset_of [ch])
760    | Cset cset => Chset cset
761    | Power(t,i) => replicate (t2r t) i
762    | Range(t,SOME i,SOME j) => ranged (t2r t) i j
763    | Range(t,NONE,SOME j) => ranged (t2r t) 0 j
764    | Range(t,SOME i,NONE) =>
765      let val r = t2r t
766      in Cat(replicate r i, Star r)
767      end
768    | Range (t,NONE,NONE) => Star (t2r t)
769    | Interval(i,j) => intervalFn (i,j)
770    | Const k => intervalFn (k,k)
771    | Ap("dot",[]) => DOT
772    | Ap("digit",[]) => DIGIT
773    | Ap("alphanum",[]) => ALPHANUM
774    | Ap("whitespace",[]) => WHITESPACE
775    | Ap("Star",[t]) => Star(t2r t)
776    | Ap("Plus",[t]) => let val r = t2r t in Cat(r,Star r) end
777    | Ap("Opt",[t]) => Or[EPSILON,t2r t]
778    | Ap("Sum",[t1,t2]) => Or[t2r t1,t2r t2]
779    | Ap("And",[t1,t2]) => Neg(Or[Neg(t2r t1),Neg(t2r t2)])
780    | Ap("Juxt",[t1,t2]) => Cat(t2r t1,t2r t2)
781    | Ap("Not",[t]) => Neg(t2r t)
782    | Ap("interval",[]) => raise ERR "tree_to_regexp" "missing interval parameters"
783    | Ap(other,_) => raise ERR "tree_to_regexp" ("unknown operator: "^other)
784 in
785   t2r
786 end
787;
788
789val the_intervalFn : (IntInf.int * IntInf.int -> regexp) ref =
790  ref (fn _ => raise ERR "tree_to_regexp" "interval regexp generator not installed")
791
792fun get_intervalFn() = !the_intervalFn
793fun set_intervalFn f = (the_intervalFn := f);
794
795fun fromSubstring sstring =
796 tree_to_regexp (get_intervalFn()) (substring_to_tree sstring)
797 handle e => raise wrap_exn "Regexp_Type" "fromSubstring" e;
798
799fun fromString s = tree_to_regexp (get_intervalFn()) (string_to_tree s)
800 handle e => raise wrap_exn "Regexp_Type" "fromString" e;
801
802fun fromQuote q = tree_to_regexp (get_intervalFn()) (quote_to_tree q)
803 handle e => raise wrap_exn "Regexp_Type" "fromQuote" e;
804
805end
806