1(*---------------------------------------------------------------------------*) 2(* ML type of regexps *) 3(*---------------------------------------------------------------------------*) 4 5structure Regexp_Type :> Regexp_Type = 6struct 7 8open Lib Feedback regexpMisc; 9 10val ERR = mk_HOL_ERR "Regexp_Type"; 11 12fun copy x n = List.tabulate (n,K x) handle _ => []; 13 14(*---------------------------------------------------------------------------*) 15(* Alphabet *) 16(*---------------------------------------------------------------------------*) 17 18val alphabet_size = 256; 19val alphabetN = List.tabulate (alphabet_size,I) 20val alphabet = map Char.chr alphabetN; 21 22(*---------------------------------------------------------------------------*) 23(* Charsets *) 24(*---------------------------------------------------------------------------*) 25 26type w64 = Word64.word; 27type charset = w64 * w64 * w64 * w64; 28 29val charset_empty : charset = (0wx0,0wx0,0wx0,0wx0); 30 31fun charset_compl (u1,u2,u3,u4) : charset = 32 let open Word64 33 in (notb u1, notb u2, notb u3, notb u4) 34 end; 35 36val charset_full : charset = charset_compl charset_empty; 37 38fun charset_union (u1,u2,u3,u4) (v1,v2,v3,v4) : charset = 39 let open Word64 40 in (orb (u1,v1), orb (u2,v2), orb (u3,v3), orb (u4,v4)) 41 end; 42 43fun charset_inter (u1,u2,u3,u4) (v1,v2,v3,v4) = 44 let open Word64 45 in (andb (u1,v1), andb (u2,v2), andb (u3,v3), andb (u4,v4)) 46 end; 47 48fun charset_diff cs1 cs2 = charset_inter cs1 (charset_compl cs2); 49 50val sing_charsets = 51 let fun setbit n = Word64.<< (0wx1,Word.fromInt n) 52 val sing64 = List.map setbit (upto 0 63) 53 in Vector.fromList 54 (List.map (fn w => (w,0wx0,0wx0,0wx0):charset) sing64 @ 55 List.map (fn w => (0wx0,w,0wx0,0wx0):charset) sing64 @ 56 List.map (fn w => (0wx0,0wx0,w,0wx0):charset) sing64 @ 57 List.map (fn w => (0wx0,0wx0,0wx0,w):charset) sing64) 58 end; 59 60fun charset_mem c ((w1,w2,w3,w4):charset) = 61 let val i = Char.ord c 62 val (s1,s2,s3,s4) = Vector.sub(sing_charsets,i) 63 val result = 64 Word64.andb 65 (if i < 64 then (w1,s1) else 66 if i < 128 then (w2,s2) else 67 if i < 192 then (w3,s3) else (w4,s4)) 68 in 69 result <> 0wx0 70 end 71 72fun charset_insert c cset = 73 charset_union (Vector.sub(sing_charsets,Char.ord c)) cset; 74 75fun charset_elts cs = filter (C charset_mem cs) alphabet; 76 77fun charset_of clist = itlist charset_insert clist charset_empty; 78 79fun charset_sing c = charset_of [c]; 80 81fun charset_span b t = 82 if 0 <= b andalso b <= 255 andalso 83 0 <= t andalso t <= 255 84 then 85 charset_of (map Char.chr (upto b t)) 86 else raise ERR "charset_span" ""; 87 88fun charset_compare ((u1,u2,u3,u4),(v1,v2,v3,v4)) = 89 let open Word64 90 in case compare(u1,v1) 91 of LESS => LESS 92 | GREATER => GREATER 93 | EQUAL => 94 case compare (u2,v2) 95 of LESS => LESS 96 | GREATER => GREATER 97 | EQUAL => 98 case compare (u3,v3) 99 of LESS => LESS 100 | GREATER => GREATER 101 | EQUAL => compare (u4,v4) 102 end; 103 104val charset_digit = charset_of (String.explode"0123456789"); 105 106val charset_alpha = charset_of 107 (String.explode "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"); 108 109val charset_alphanum = 110 charset_insert #"_" 111 (charset_union charset_digit charset_alpha); 112 113val charset_whitespace = charset_of (String.explode" \n\r\t\f"); 114 115 116(*---------------------------------------------------------------------------*) 117(* regexp datatype *) 118(*---------------------------------------------------------------------------*) 119 120datatype regexp 121 = Chset of charset 122 | Cat of regexp * regexp 123 | Star of regexp 124 | Or of regexp list 125 | Neg of regexp; 126 127fun And (r1,r2) = Neg(Or[Neg r1,Neg r2]); 128fun Diff (r1,r2) = And(r1,Neg r2); 129 130val WHITESPACE = Chset charset_whitespace 131val DIGIT = Chset charset_digit 132val ALPHA = Chset charset_alpha 133val ALPHANUM = Chset charset_alphanum 134val EMPTY = Chset charset_empty 135val SIGMA = Chset charset_full 136val DOT = SIGMA; 137val EPSILON = Star EMPTY; 138val SIGMASTAR = Star SIGMA; 139 140fun mk_or [] = EMPTY 141 | mk_or [x] = x 142 | mk_or list = Or list; 143 144fun catlist [] = EPSILON 145 | catlist [x] = x 146 | catlist (h::t) = Cat (h,catlist t); 147 148fun strip_cat r = 149 case r 150 of Cat(a,b) => a::strip_cat b 151 | otherwise => [r] 152 153val dots = copy DOT; 154 155fun replicate x (n:int) = catlist (copy x n); 156 157fun ranged r i j = 158 if j < i then EMPTY 159 else Or (map (replicate r) (upto i j)); 160 161(*---------------------------------------------------------------------------*) 162(* compressed version of ranged. Not used since it is hard for derivative *) 163(* taker to do well with the nesting. *) 164(*---------------------------------------------------------------------------*) 165 166fun ranged_alt r i j = 167 if j < i then EMPTY 168 else Cat (replicate r i,replicate (Or [EPSILON,r]) (j-i)) 169 170(*---------------------------------------------------------------------------*) 171(* Prettyprinting *) 172(*---------------------------------------------------------------------------*) 173 174fun charset_string cset = 175 if cset = charset_full 176 then "." else 177 if cset = charset_empty 178 then "[]" else 179 if cset = charset_digit 180 then "\\d" else 181 if cset = charset_alphanum 182 then "\\w" else 183 if cset = charset_whitespace 184 then "\\s" 185 else 186 let fun prchar ch = 187 if mem ch [#"[", #"]"] then 188 "\\" ^ String.str ch 189 else 190 if Char.isGraph ch then 191 String.str ch 192 else 193 let val i = Char.ord ch 194 in String.concat 195 ["\\", (if i <= 9 then "00" else 196 if i <= 100 then "0" else ""), 197 Int.toString i] 198 end 199 fun printerval (i,j) = 200 if i=j then prchar (Char.chr i) 201 else String.concat [prchar (Char.chr i),"-", prchar(Char.chr j)] 202 val ords = List.map Char.ord (charset_elts cset) 203 val interval_strings = String.concat (map printerval (intervals ords)) 204 in 205 String.concat ["[", interval_strings, "]"] 206 end; 207 208 209(*---------------------------------------------------------------------------*) 210(* precedence: Or = 1 *) 211(* Cat = 2 *) 212(* Neg = 3 *) 213(* Star = 4 *) 214(* charset = 5 *) 215(*---------------------------------------------------------------------------*) 216 217val pp_regexp = 218 let open Portable PP 219 fun paren i j s1 s2 ps = 220 if i < j then block CONSISTENT 0 ps 221 else block INCONSISTENT (size s1) (add_string s1 :: ps @ [add_string s2]) 222 fun pp i regexp = 223 case regexp 224 of Chset cs => add_string (charset_string cs) 225 | Cat(s,t) => 226 let val rlist = strip_cat regexp 227 in 228 paren i 2 "(" ")" ( 229 pr_list (pp 2) [add_break(0,0)] rlist 230 ) 231 end 232 | Or rlist => 233 paren i 1 "(" ")" ( 234 pr_list (pp 1) [add_string " |", add_break(1,0)] rlist 235 ) 236 | Neg r => 237 paren i 3 "(" ")" [ 238 add_string "~", 239 pp 3 r 240 ] 241 | Star r => 242 paren i 4 "(" ")" [ 243 pp 4 r, 244 add_string "*" 245 ] 246 in 247 pp 0 248 end; 249 250val _ = PolyML.addPrettyPrinter (fn d => fn _ => fn r => pp_regexp r); 251 252(*---------------------------------------------------------------------------*) 253(* print regexp as string to stdOut *) 254(*---------------------------------------------------------------------------*) 255 256val print_regexp = Portable.pprint pp_regexp; 257fun println_regexp r = (print_regexp r; print "\n"); 258 259(*---------------------------------------------------------------------------*) 260(* regexp comparison *) 261(*---------------------------------------------------------------------------*) 262 263fun list_cmp cmp pair = 264 case pair 265 of ([],[]) => EQUAL 266 | ([],_) => LESS 267 | (_,[]) => GREATER 268 | (h1::t1,h2::t2) => 269 (case cmp (h1,h2) 270 of EQUAL => list_cmp cmp (t1,t2) 271 | verdict => verdict); 272 273fun len_cmp l1 l2 = 274 if null l1 275 then (if null l2 then EQUAL else LESS) 276 else (if null l2 then GREATER else len_cmp(tl l1) (tl l2)); 277 278fun list_compare cmp l1 l2 = 279 case len_cmp l1 l2 280 of EQUAL => list_cmp cmp (l1,l2) 281 | strict => strict; 282 283fun regexp_compare pair = 284 if PolyML.pointerEq pair 285 then EQUAL 286 else 287 case pair 288 of (Chset cs1, Chset cs2) => charset_compare(cs1,cs2) 289 | (Chset cs, r) => LESS 290 | (Cat (r,s), Chset _) => GREATER 291 | (Cat(r1,r2),Cat(r3,r4)) => 292 (case regexp_compare (r1,r3) 293 of LESS => LESS 294 | GREATER => GREATER 295 | EQUAL => regexp_compare (r2,r4)) 296 | (Cat(r,s), _) => LESS 297 | (Star r, Chset _) => GREATER 298 | (Star r, Cat (s,t)) => GREATER 299 | (Star r,Star s) => regexp_compare(r,s) 300 | (Star r, _) => LESS 301 | (Or rs,Chset _) => GREATER 302 | (Or rs,Cat (r,s)) => GREATER 303 | (Or rs,Star _) => GREATER 304 | (Or rs1,Or rs2) => list_compare regexp_compare (List.rev rs1) (List.rev rs2) 305 | (Or rs, _) => LESS 306 | (Neg r,Neg s) => regexp_compare (r,s) 307 | (Neg r,_) => GREATER 308; 309 310 311(*---------------------------------------------------------------------------*) 312(* Lexer and parser for Python-style regexp syntax. *) 313(*---------------------------------------------------------------------------*) 314 315datatype lexeme 316 = lparen 317 | rparen 318 | dot 319 | digit 320 | alphanum 321 | whitespace 322 | interval 323 | char of Char.char 324 | chars of charset 325 | preFix of Char.char (* ~ *) 326 | inFix of Char.char (* &| *) 327 | postFix of Char.char (* *+? *) 328 | power of IntInf.int 329 | range of IntInf.int option * IntInf.int option (* range + direction *) 330 | const of IntInf.int 331; 332 333fun lexeme_equal lparen lparen = true 334 | lexeme_equal rparen rparen = true 335 | lexeme_equal dot dot = true 336 | lexeme_equal digit digit = true 337 | lexeme_equal alphanum alphanum = true 338 | lexeme_equal whitespace whitespace = true 339 | lexeme_equal interval interval = true 340 | lexeme_equal (const cd1) (const cd2) = (cd1=cd2) 341 | lexeme_equal (char c1) (char c2) = (c1=c2) 342 | lexeme_equal (chars c1) (chars c2) = (charset_compare(c1,c2) = EQUAL) 343 | lexeme_equal (preFix c1) (preFix c2) = (c1=c2) 344 | lexeme_equal (inFix c1) (inFix c2) = (c1=c2) 345 | lexeme_equal (postFix c1) (postFix c2) = (c1=c2) 346 | lexeme_equal (power i1) (power i2) = (i1=i2) 347 | lexeme_equal (range io1) (range io2) = (io1=io2) 348 | lexeme_equal x y = false 349 350(*---------------------------------------------------------------------------*) 351(* Following special characters need to be back-slashed to avoid *) 352(* interpretation. *) 353(* *) 354(* val specialChar = Char.contains ".^$*+?{}[]\\|()" *) 355(* *) 356(*---------------------------------------------------------------------------*) 357 358fun opt2string (lowOpt,hiOpt) = String.concat 359 ["{", 360 case (lowOpt,hiOpt) 361 of (SOME i,SOME j) => (IntInf.toString i^","^ IntInf.toString j) 362 | (SOME i, NONE) => (IntInf.toString i^",") 363 | (NONE, SOME j) => (","^IntInf.toString j) 364 | otherwise => raise ERR "lexeme2string" 365 "opt2string: unexpected format", 366 "}"] 367 368fun const2string i = 369 String.concat 370 ["\\k{", IntInf.toString i, "}"] 371 372fun bstring b = if b then "1" else "0"; 373 374fun lexeme2string lparen = "(" 375 | lexeme2string rparen = ")" 376 | lexeme2string dot = "." 377 | lexeme2string digit = "\\d" 378 | lexeme2string alphanum = "\\w" 379 | lexeme2string whitespace = "\\s" 380 | lexeme2string interval = "\\i" 381 | lexeme2string (char c) = Char.toString c 382 | lexeme2string (chars s) = "<charset>" 383 | lexeme2string (preFix c) = Char.toString c 384 | lexeme2string (inFix c) = Char.toString c 385 | lexeme2string (postFix c) = Char.toString c 386 | lexeme2string (power n) = "{"^IntInf.toString n^"}" 387 | lexeme2string (range t) = opt2string t 388 | lexeme2string (const c) = const2string c 389 390 391(*---------------------------------------------------------------------------*) 392(* Lexing support *) 393(*---------------------------------------------------------------------------*) 394 395fun takeWhile P ss = 396 let open Substring 397 val (ls, ss') = splitl P ss 398 in if isEmpty ls 399 then NONE 400 else SOME (string ls, ss') 401 end 402 403fun compose f NONE = NONE 404 | compose f (SOME (x,y)) = f x y; 405 406fun try_alt f1 f2 strm = 407 case f1 strm 408 of NONE => f2 strm 409 | other => other 410 411fun chomp ch = 412 let open Substring 413 in compose (fn l => fn strm => 414 case getc strm 415 of SOME(c,strm') => if ch=c then SOME(l,strm') else NONE 416 | NONE => NONE) 417 end; 418 419fun eat_then ch f strm = 420 case Substring.getc strm 421 of NONE => NONE 422 | SOME(ch1,strm) => if ch=ch1 then f strm else NONE; 423 424fun is_tilde ch = (ch = #"~"); 425 426fun getNum ss = 427 compose (fn s => fn ss' => 428 case IntInf.fromString s 429 of NONE => NONE 430 | SOME i => SOME(i,ss')) 431 (takeWhile (fn c => Char.isDigit c orelse is_tilde c) ss); 432 433fun mk_right_range x = compose 434 (fn i => fn strm => SOME(range(NONE,SOME i),strm)) x; 435 436fun mk_left_range i x = compose 437 (fn _ => fn strm => SOME(range(SOME i,NONE),strm)) x; 438 439(*---------------------------------------------------------------------------*) 440(* Lexing powers and ranges: we've seen a "{": now get the rest : one of *) 441(* *) 442(* "<d>}" *) 443(* "<d>,}" *) 444(* ",<d>}" *) 445(* "<d>,<d>}" *) 446(* *) 447(*---------------------------------------------------------------------------*) 448 449val comma = #","; 450 451fun get_range strm = 452 let open Substring 453 in case getc strm 454 of NONE => NONE 455 | SOME(#",",strm') => mk_right_range(chomp #"}" (getNum strm')) 456 | otherwise => 457 case getNum strm 458 of NONE => NONE 459 | SOME(i,strm') => 460 case getc strm' 461 of NONE => NONE 462 | SOME(#"}",strm'') => SOME(power i,strm'') 463 | SOME(c,strm'') => if c <> comma then NONE else 464 case getc strm'' 465 of NONE => NONE 466 | SOME(#"}",strm''') => SOME(range(SOME i,NONE),strm''') 467 | otherwise => 468 case getNum strm'' 469 of NONE => NONE 470 | SOME(j,strm''') => 471 case getc strm''' 472 of NONE => NONE 473 | SOME(#"}",strm1) => SOME(range(SOME i,SOME j),strm1) 474 | otherwise => NONE 475 end 476 477 478fun get_const strm = 479 let fun get strm = 480 case getNum strm 481 of NONE => NONE 482 | SOME(i,strm) => 483 (eat_then #"}" (fn strm => SOME (const(i), strm))) 484 strm 485 in 486 eat_then #"{" get strm 487 end 488 489(*---------------------------------------------------------------------------*) 490(* Lexing a character set, which has the form *) 491(* *) 492(* [ ... ] or [^ ... ] *) 493(* *) 494(* where the latter takes the complement of the following charset. Ranges *) 495(* of the form c1-c2 are supported, provided they make sense. Backslashing *) 496(* characters is also allowed. Inside a charset, an explicit number denoting *) 497(* a char can be written as a backslash followed by three decimal digits, *) 498(* i.e., \ddd. *) 499(*---------------------------------------------------------------------------*) 500 501fun get_charset strm = 502 let fun compl(chars cs,strm) = (chars(charset_diff charset_full cs),strm) 503 | compl otherwise = raise ERR "get_charset" "compl" 504 fun elim_decimal_chars list = 505 let in 506 case list 507 of #"\\"::d1::d2::d3::t => 508 if Char.isDigit d1 andalso Char.isDigit d2 andalso Char.isDigit d3 509 then (case Int.fromString (String.implode [d1,d2,d3]) 510 of SOME i => 511 if Int.<(i,256) then chr(i)::elim_decimal_chars t 512 else raise ERR "elim_decimal_chars" 513 "malformed charset element: looking for \\ddd less than 256" 514 | NONE => raise ERR "elim_decimal_chars" 515 "number from \\ddd is too big" 516 ) 517 else List.hd list :: elim_decimal_chars (List.tl list) 518 | ch::t => ch :: elim_decimal_chars t 519 | [] => [] 520 end 521 fun mk_chars (ch1 :: #"-" :: ch2 :: t) = 522 if Char.<=(ch1,ch2) 523 then let val clist = map Char.chr (upto (Char.ord ch1) (Char.ord ch2)) 524 in clist @ mk_chars t 525 end 526 else raise ERR "lex.get_charset.mk_chars" "bad range" 527 | mk_chars (#"\\"::ch::t) = ch::mk_chars t 528 | mk_chars (ch::t) = ch::mk_chars t 529 | mk_chars [] = [] 530 fun get_charset_body strm acc = 531 case Substring.getc strm 532 of SOME(#"]",strm') => (rev acc,strm') 533 | SOME(ch,strm') => get_charset_body strm' (ch::acc) 534 | NONE => raise ERR "lex.get_charset.get_charset_body" 535 "end of input before getting to end of charset" 536 fun mk_cset (clist,strm) = 537 (chars (charset_of (mk_chars (elim_decimal_chars clist))), 538 strm) 539 in case Substring.getc strm 540 of NONE => NONE 541 | SOME(#"]",strm') => SOME(chars charset_empty, strm') 542 | SOME(#"^",strm') => SOME(compl (mk_cset (get_charset_body strm' []))) 543 | SOME(ch, strm') => SOME(mk_cset(get_charset_body strm' [ch])) 544 end 545 546fun lex strm = 547 let open Substring 548 in case getc strm 549 of NONE => NONE 550 | SOME (#"(",strm') => SOME(lparen,strm') 551 | SOME (#")",strm') => SOME(rparen,strm') 552 | SOME (#"[",strm') => get_charset strm' 553 | SOME (#"{",strm') => get_range strm' 554 | SOME (#".",strm') => SOME(dot,strm') 555 | SOME (#"~",strm') => SOME(preFix #"~",strm') 556 | SOME (#"|",strm') => SOME(inFix #"|",strm') 557 | SOME (#"&",strm') => SOME(inFix #"&",strm') 558 | SOME (#"*",strm') => SOME(postFix #"*",strm') 559 | SOME (#"+",strm') => SOME(postFix #"+",strm') 560 | SOME (#"?",strm') => SOME(postFix #"?",strm') 561 | SOME (#"\\",strm') => 562 (case getc strm' 563 of NONE => NONE 564 | SOME(#"d",strm'') => SOME(digit,strm'') 565 | SOME(#"w",strm'') => SOME(alphanum,strm'') 566 | SOME(#"s",strm'') => SOME(whitespace,strm'') 567 | SOME(#"i",strm'') => SOME(interval,strm'') 568 | SOME(#"k",strm'') => get_const strm'' 569 | SOME(ch,strm'') => SOME(char ch,strm'') 570 ) 571 | SOME (ch,strm') => SOME (char ch,strm') 572 end 573 574fun lexemes ss = case lex ss of NONE => [] | SOME(l,ss') => l::lexemes ss' 575 576(* 577val ss = Substring.full "[abc\234d]"; 578val ss = Substring.full "[abc\\234d]"; 579val ss = Substring.full "[\117-\234\010]"; 580val [chars cset] = lexemes ss; 581*) 582 583 584(*---------------------------------------------------------------------------*) 585(* Now parsing *) 586(*---------------------------------------------------------------------------*) 587 588fun PARSE_ERR s ss = 589 let open Substring 590 val estring = String.concat 591 ["Regexp parser failed!\n ",s, 592 "\n Remaining input: ", string ss, ".\n"] 593 in 594 raise ERR "PARSE_ERR" estring 595 end; 596 597(*---------------------------------------------------------------------------*) 598(* We parse into tree, from which it is easy to generate regexp *) 599(*---------------------------------------------------------------------------*) 600 601datatype tree 602 = Ident of Char.char 603 | Cset of charset 604 | Ap of string * tree list 605 | Power of tree * int 606 | Range of tree * int option * int option 607 | Interval of IntInf.int * IntInf.int 608 | Const of IntInf.int; 609 610 611fun expect lexeme (stk,ss) = 612 case lex ss 613 of SOME (lexed, ss') => 614 if lexeme_equal lexed lexeme 615 then (stk,ss') 616 else PARSE_ERR ("Expected "^quote(lexeme2string lexeme)) ss 617 | NONE => PARSE_ERR ("Empty input when expecting " 618 ^quote(lexeme2string lexeme)) ss; 619 620fun opr #"*" = "Star" 621 | opr #"+" = "Plus" 622 | opr #"?" = "Opt" 623 | opr #"|" = "Sum" 624 | opr #"&" = "And" 625 | opr #"~" = "Not" 626 | opr ch = raise ERR "opr" ("unknown operator: "^Char.toString ch) 627 628val isInfix = Char.contains"|&"; 629 630(*---------------------------------------------------------------------------*) 631(* Essential grammar being parsed. Rules given in order of increasing *) 632(* precedence. *) 633(* *) 634(* re ::= re "|" re *) 635(* | re "&" re *) 636(* | re <dot> re ;; <dot> is implicit because concat. is invisible *) 637(* | ~re *) 638(* | re<p> ;; p in {+,*,?} or a power or a range *) 639(* | "(" re ")" *) 640(* | charset *) 641(* *) 642(*---------------------------------------------------------------------------*) 643 644fun parse ss = Disj ([],ss) 645and 646 Disj (stk,ss) = 647 let val (stk',ss') = Conj (stk,ss) 648 in case lex ss' 649 of SOME (inFix #"|",ss'') => 650 (case Disj(stk',ss'') 651 of (e1::e2::t,ss''') => (Ap("Sum",[e2,e1])::t,ss''') 652 | otherwise => PARSE_ERR "expected right part of | expression" ss'') 653 | other => (stk',ss') 654 end 655and 656 Conj (stk,ss) = 657 let val (stk',ss') = Juxt (stk,ss) 658 in case lex ss' 659 of SOME (inFix #"&",ss'') => 660 (case Conj(stk',ss'') 661 of (e1::e2::t,ss''') => (Ap("And",[e2,e1])::t,ss''') 662 | otherwise => PARSE_ERR "expected right part of & expression" ss'') 663 | other => (stk',ss') 664 end 665and 666 Juxt (stk,ss) = 667 let val (stk',ss') = Prefix (stk,ss) 668 in case lex ss' 669 of SOME(inFix _,_) => (stk',ss') 670 | SOME(rparen,_) => (stk',ss') 671 | SOME(postFix _,_) => (stk',ss') 672 | SOME(power _,_) => (stk',ss') 673 | SOME(range _,_) => (stk',ss') 674 | SOME other => 675 (case Juxt(stk',ss') 676 of (e1::e2::t,ss'') => (Ap("Juxt",[e2,e1])::t,ss'') 677 | otherwise => PARSE_ERR "expected another regexp" ss') 678 | NONE => (stk',ss') 679 end 680and 681 Prefix (stk,ss) = 682 case lex ss 683 of SOME(preFix ch,ss') => 684 (case Prefix(stk,ss') 685 of (h::t,ss'') => (Ap(opr ch,[h])::t,ss'') 686 | ([],_) => PARSE_ERR "missing argument to ~" ss') 687 | otherwise => Postfix(stk,ss) 688and 689 Postfix (stk,ss) = 690 let fun Postfixen (stk,ss) = (* deal with multiple post-fixes *) 691 case lex ss 692 of SOME(postFix ch,ss') => 693 (case stk 694 of h::t => Postfixen(Ap(opr ch,[h])::t,ss') 695 | [] => PARSE_ERR "missing argument to postfix operator" ss) 696 | SOME(power i,ss') => 697 (case stk 698 of h::t => Postfixen(Power(h,IntInf.toInt i)::t,ss') 699 | [] => PARSE_ERR "missing regexp for {-} operator" ss) 700 | SOME(range(o1,o2),ss') => 701 (case stk 702 of Ap("interval",[])::t 703 => (case (o1,o2) 704 of (SOME i, SOME j) => Postfixen(Interval(i,j)::t,ss') 705 | otherwise => PARSE_ERR "incomplete interval" ss) 706 | h::t => let val o1' = Option.map IntInf.toInt o1 707 val o2' = Option.map IntInf.toInt o2 708 in Postfixen(Range(h,o1',o2')::t,ss') 709 end 710 | [] => PARSE_ERR "missing regexp for {-,-} operator" ss) 711 | otherwise => (stk,ss) 712 in Postfixen (Atom (stk,ss)) 713 end 714and 715 Atom (stk,ss) = 716 case lex ss 717 of SOME(dot,ss') => (Ap("dot",[])::stk,ss') 718 | SOME(char ch,ss') => (Ident ch::stk,ss') 719 | SOME(chars cset,ss') => (Cset cset::stk,ss') 720 | SOME(digit,ss') => (Ap("digit",[])::stk,ss') 721 | SOME(alphanum,ss') => (Ap("alphanum",[])::stk,ss') 722 | SOME(whitespace,ss') => (Ap("whitespace",[])::stk,ss') 723 | SOME(interval,ss') => (Ap("interval",[])::stk,ss') 724 | SOME(const copt,ss') => (Const copt::stk,ss') 725 | SOME(lparen,ss') => 726 let in 727 case expect rparen (parse ss') 728 of ([x],ss'') => (x::stk,ss'') 729 | other => PARSE_ERR "unable to complete parse inside parentheses" ss' 730 end 731 | other => (stk,ss) 732; 733 734val tree_parse = parse; 735 736fun substring_to_tree sstring = 737 case parse sstring 738 of ([t],ss) => if Substring.isEmpty ss then t 739 else PARSE_ERR "remaining input after successful parse" ss 740 | ([],ss) => PARSE_ERR "no parse" ss 741 | (other,ss) => PARSE_ERR "multiple parses" ss; 742 743val string_to_tree = substring_to_tree o Substring.full; 744 745fun quote_to_tree input = 746 case input 747 of [QUOTE s] => 748 let open Substring 749 val ss = full s 750 val (_,ss') = position (String.implode[#"*",#")"]) ss 751 in substring_to_tree (slice(ss',2,NONE)) 752 end 753 | otherwise => raise ERR "quote_to_tree" 754 "expected a simple quotation of the form ` .... `"; 755 756fun tree_to_regexp intervalFn = 757 let fun t2r tree = 758 case tree 759 of Ident ch => Chset (charset_of [ch]) 760 | Cset cset => Chset cset 761 | Power(t,i) => replicate (t2r t) i 762 | Range(t,SOME i,SOME j) => ranged (t2r t) i j 763 | Range(t,NONE,SOME j) => ranged (t2r t) 0 j 764 | Range(t,SOME i,NONE) => 765 let val r = t2r t 766 in Cat(replicate r i, Star r) 767 end 768 | Range (t,NONE,NONE) => Star (t2r t) 769 | Interval(i,j) => intervalFn (i,j) 770 | Const k => intervalFn (k,k) 771 | Ap("dot",[]) => DOT 772 | Ap("digit",[]) => DIGIT 773 | Ap("alphanum",[]) => ALPHANUM 774 | Ap("whitespace",[]) => WHITESPACE 775 | Ap("Star",[t]) => Star(t2r t) 776 | Ap("Plus",[t]) => let val r = t2r t in Cat(r,Star r) end 777 | Ap("Opt",[t]) => Or[EPSILON,t2r t] 778 | Ap("Sum",[t1,t2]) => Or[t2r t1,t2r t2] 779 | Ap("And",[t1,t2]) => Neg(Or[Neg(t2r t1),Neg(t2r t2)]) 780 | Ap("Juxt",[t1,t2]) => Cat(t2r t1,t2r t2) 781 | Ap("Not",[t]) => Neg(t2r t) 782 | Ap("interval",[]) => raise ERR "tree_to_regexp" "missing interval parameters" 783 | Ap(other,_) => raise ERR "tree_to_regexp" ("unknown operator: "^other) 784 in 785 t2r 786 end 787; 788 789val the_intervalFn : (IntInf.int * IntInf.int -> regexp) ref = 790 ref (fn _ => raise ERR "tree_to_regexp" "interval regexp generator not installed") 791 792fun get_intervalFn() = !the_intervalFn 793fun set_intervalFn f = (the_intervalFn := f); 794 795fun fromSubstring sstring = 796 tree_to_regexp (get_intervalFn()) (substring_to_tree sstring) 797 handle e => raise wrap_exn "Regexp_Type" "fromSubstring" e; 798 799fun fromString s = tree_to_regexp (get_intervalFn()) (string_to_tree s) 800 handle e => raise wrap_exn "Regexp_Type" "fromString" e; 801 802fun fromQuote q = tree_to_regexp (get_intervalFn()) (quote_to_tree q) 803 handle e => raise wrap_exn "Regexp_Type" "fromQuote" e; 804 805end 806