1(* Title: Pure/ML/ml_lex.ML 2 Author: Makarius 3 4Lexical syntax for Isabelle/ML and Standard ML. 5*) 6 7signature ML_LEX = 8sig 9 val keywords: string list 10 datatype token_kind = 11 Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String | 12 Space | Comment of Comment.kind option | Error of string | EOF 13 eqtype token 14 val stopper: token Scan.stopper 15 val is_regular: token -> bool 16 val is_improper: token -> bool 17 val is_comment: token -> bool 18 val set_range: Position.range -> token -> token 19 val range_of: token -> Position.range 20 val pos_of: token -> Position.T 21 val end_pos_of: token -> Position.T 22 val kind_of: token -> token_kind 23 val content_of: token -> string 24 val check_content_of: token -> string 25 val flatten: token list -> string 26 val source: (Symbol.symbol, 'a) Source.source -> 27 (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source) 28 Source.source) Source.source 29 val tokenize: string -> token list 30 val read_pos: Position.T -> Symbol_Pos.text -> token Antiquote.antiquote list 31 val read: Symbol_Pos.text -> token Antiquote.antiquote list 32 val read_set_range: Position.range -> Symbol_Pos.text -> token Antiquote.antiquote list 33 val read_source: bool -> Input.source -> token Antiquote.antiquote list 34end; 35 36structure ML_Lex: ML_LEX = 37struct 38 39(** keywords **) 40 41val keywords = 42 ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>", "[", 43 "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as", "case", 44 "datatype", "do", "else", "end", "eqtype", "exception", "fn", "fun", 45 "functor", "handle", "if", "in", "include", "infix", "infixr", 46 "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", 47 "rec", "sharing", "sig", "signature", "struct", "structure", "then", 48 "type", "val", "where", "while", "with", "withtype"]; 49 50val keywords2 = 51 ["and", "case", "do", "else", "end", "if", "in", "let", "local", "of", 52 "sig", "struct", "then", "while", "with"]; 53 54val keywords3 = 55 ["handle", "open", "raise"]; 56 57val lexicon = Scan.make_lexicon (map raw_explode keywords); 58 59 60 61(** tokens **) 62 63(* datatype token *) 64 65datatype token_kind = 66 Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String | 67 Space | Comment of Comment.kind option | Error of string | EOF; 68 69datatype token = Token of Position.range * (token_kind * string); 70 71 72(* position *) 73 74fun set_range range (Token (_, x)) = Token (range, x); 75fun range_of (Token (range, _)) = range; 76 77val pos_of = #1 o range_of; 78val end_pos_of = #2 o range_of; 79 80 81(* stopper *) 82 83fun mk_eof pos = Token ((pos, Position.none), (EOF, "")); 84val eof = mk_eof Position.none; 85 86fun is_eof (Token (_, (EOF, _))) = true 87 | is_eof _ = false; 88 89val stopper = 90 Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof; 91 92 93(* token content *) 94 95fun kind_of (Token (_, (k, _))) = k; 96 97fun content_of (Token (_, (_, x))) = x; 98fun token_leq (tok, tok') = content_of tok <= content_of tok'; 99 100fun is_keyword (Token (_, (Keyword, _))) = true 101 | is_keyword _ = false; 102 103fun is_delimiter (Token (_, (Keyword, x))) = not (Symbol.is_ascii_identifier x) 104 | is_delimiter _ = false; 105 106fun is_regular (Token (_, (Error _, _))) = false 107 | is_regular (Token (_, (EOF, _))) = false 108 | is_regular _ = true; 109 110fun is_improper (Token (_, (Space, _))) = true 111 | is_improper (Token (_, (Comment _, _))) = true 112 | is_improper _ = false; 113 114fun is_comment (Token (_, (Comment _, _))) = true 115 | is_comment _ = false; 116 117fun warn tok = 118 (case tok of 119 Token (_, (Keyword, ":>")) => 120 warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\ 121 \prefer non-opaque matching (:) possibly with abstype" ^ 122 Position.here (pos_of tok)) 123 | _ => ()); 124 125fun check_content_of tok = 126 (case kind_of tok of 127 Error msg => error msg 128 | _ => content_of tok); 129 130 131(* flatten *) 132 133fun flatten_content (tok :: (toks as tok' :: _)) = 134 Symbol.escape (check_content_of tok) :: 135 (if is_improper tok orelse is_improper tok' then flatten_content toks 136 else Symbol.space :: flatten_content toks) 137 | flatten_content toks = map (Symbol.escape o check_content_of) toks; 138 139val flatten = implode o flatten_content; 140 141 142(* markup *) 143 144local 145 146fun token_kind_markup SML = 147 fn Type_Var => (Markup.ML_tvar, "") 148 | Word => (Markup.ML_numeral, "") 149 | Int => (Markup.ML_numeral, "") 150 | Real => (Markup.ML_numeral, "") 151 | Char => (Markup.ML_char, "") 152 | String => (if SML then Markup.SML_string else Markup.ML_string, "") 153 | Comment _ => (if SML then Markup.SML_comment else Markup.ML_comment, "") 154 | Error msg => (Markup.bad (), msg) 155 | _ => (Markup.empty, ""); 156 157in 158 159fun token_report SML (tok as Token ((pos, _), (kind, x))) = 160 let 161 val (markup, txt) = 162 if not (is_keyword tok) then token_kind_markup SML kind 163 else if is_delimiter tok then (Markup.ML_delimiter, "") 164 else if member (op =) keywords2 x then (Markup.ML_keyword2 |> Markup.keyword_properties, "") 165 else if member (op =) keywords3 x then (Markup.ML_keyword3 |> Markup.keyword_properties, "") 166 else (Markup.ML_keyword1 |> Markup.keyword_properties, ""); 167 in ((pos, markup), txt) end; 168 169end; 170 171 172 173(** scanners **) 174 175open Basic_Symbol_Pos; 176 177val err_prefix = "SML lexical error: "; 178 179fun !!! msg = Symbol_Pos.!!! (fn () => err_prefix ^ msg); 180 181 182(* identifiers *) 183 184local 185 186val scan_letdigs = 187 Scan.many (Symbol.is_ascii_letdig o Symbol_Pos.symbol); 188 189val scan_alphanumeric = 190 Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) ::: scan_letdigs; 191 192val scan_symbolic = 193 Scan.many1 (member (op =) (raw_explode "!#$%&*+-/:<=>?@\\^`|~") o Symbol_Pos.symbol); 194 195in 196 197val scan_ident = scan_alphanumeric || scan_symbolic; 198 199val scan_long_ident = 200 Scan.repeats1 (scan_alphanumeric @@@ $$$ ".") @@@ (scan_ident || $$$ "="); 201 202val scan_type_var = $$$ "'" @@@ scan_letdigs; 203 204end; 205 206 207(* numerals *) 208 209local 210 211val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol); 212val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol); 213val scan_sign = Scan.optional ($$$ "~") []; 214val scan_decint = scan_sign @@@ scan_dec; 215val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint; 216 217in 218 219val scan_word = 220 $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex || 221 $$$ "0" @@@ $$$ "w" @@@ scan_dec; 222 223val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec); 224 225val scan_rat = scan_decint @@@ Scan.optional ($$$ "/" @@@ scan_dec) []; 226 227val scan_real = 228 scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] || 229 scan_decint @@@ scan_exp; 230 231end; 232 233 234(* chars and strings *) 235 236val scan_blanks1 = Scan.many1 (Symbol.is_ascii_blank o Symbol_Pos.symbol); 237 238local 239 240val scan_escape = 241 Scan.one (member (op =) (raw_explode "\"\\abtnvfr") o Symbol_Pos.symbol) >> single || 242 $$$ "^" @@@ 243 (Scan.one (fn (s, _) => Char.ord #"@" <= ord s andalso ord s <= Char.ord #"_") >> single) || 244 Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) -- 245 Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) -- 246 Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]); 247 248val scan_str = 249 Scan.one (fn (s, _) => Symbol.not_eof s andalso s <> "\"" andalso s <> "\\" andalso 250 (not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single || 251 $$$ "\\" @@@ !!! "bad escape character in string" scan_escape; 252 253val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\"; 254val scan_gaps = Scan.repeats scan_gap; 255 256in 257 258val scan_char = 259 $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\""; 260 261val recover_char = 262 $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ Scan.optional (Scan.permissive scan_str @@@ scan_gaps) []; 263 264val scan_string = 265 Scan.ahead ($$ "\"") |-- 266 !!! "unclosed string literal" 267 ($$$ "\"" @@@ Scan.repeats (scan_gap || scan_str) @@@ $$$ "\""); 268 269val recover_string = 270 $$$ "\"" @@@ Scan.repeats (scan_gap || Scan.permissive scan_str); 271 272end; 273 274 275(* rat antiquotation *) 276 277val rat_name = Symbol_Pos.explode ("Pure.rat ", Position.none); 278 279val scan_rat_antiq = 280 Symbol_Pos.scan_pos -- ($$ "@" |-- Symbol_Pos.scan_pos -- scan_rat) -- Symbol_Pos.scan_pos 281 >> (fn ((pos1, (pos2, body)), pos3) => 282 {start = Position.range_position (pos1, pos2), 283 stop = Position.none, 284 range = Position.range (pos1, pos3), 285 body = rat_name @ body}); 286 287 288(* scan tokens *) 289 290local 291 292fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss)); 293 294val scan_sml = 295 scan_char >> token Char || 296 scan_string >> token String || 297 scan_blanks1 >> token Space || 298 Symbol_Pos.scan_comment err_prefix >> token (Comment NONE) || 299 Scan.max token_leq 300 (Scan.literal lexicon >> token Keyword) 301 (scan_word >> token Word || 302 scan_real >> token Real || 303 scan_int >> token Int || 304 scan_long_ident >> token Long_Ident || 305 scan_ident >> token Ident || 306 scan_type_var >> token Type_Var); 307 308val scan_ml_antiq = 309 Comment.scan >> (fn (kind, ss) => Antiquote.Text (token (Comment (SOME kind)) ss)) || 310 Antiquote.scan_control >> Antiquote.Control || 311 Antiquote.scan_antiq >> Antiquote.Antiq || 312 scan_rat_antiq >> Antiquote.Antiq || 313 scan_sml >> Antiquote.Text; 314 315fun recover msg = 316 (recover_char || 317 recover_string || 318 Symbol_Pos.recover_cartouche || 319 Symbol_Pos.recover_comment || 320 Scan.one (Symbol.not_eof o Symbol_Pos.symbol) >> single) 321 >> (single o token (Error msg)); 322 323fun gen_read SML pos text = 324 let 325 val syms = 326 Symbol_Pos.explode (text, pos) 327 |> SML ? maps (fn (s, p) => raw_explode s |> map (rpair p)); 328 329 val termination = 330 if null syms then [] 331 else 332 let 333 val pos1 = List.last syms |-> Position.advance; 334 val pos2 = Position.advance Symbol.space pos1; 335 in [Antiquote.Text (Token (Position.range (pos1, pos2), (Space, Symbol.space)))] end; 336 337 val scan = if SML then scan_sml >> Antiquote.Text else scan_ml_antiq; 338 fun check (Antiquote.Text tok) = (check_content_of tok; if SML then () else warn tok) 339 | check _ = (); 340 val input = 341 Source.of_list syms 342 |> Source.source Symbol_Pos.stopper 343 (Scan.recover (Scan.bulk (!!! "bad input" scan)) 344 (fn msg => recover msg >> map Antiquote.Text)) 345 |> Source.exhaust; 346 val _ = Position.reports (Antiquote.antiq_reports input); 347 val _ = 348 Position.reports_text (maps (fn Antiquote.Text t => [token_report SML t] | _ => []) input); 349 val _ = List.app check input; 350 in input @ termination end; 351 352in 353 354fun source src = 355 Symbol_Pos.source (Position.line 1) src 356 |> Source.source Symbol_Pos.stopper (Scan.recover (Scan.bulk (!!! "bad input" scan_sml)) recover); 357 358val tokenize = Symbol.explode #> Source.of_list #> source #> Source.exhaust; 359 360val read_pos = gen_read false; 361 362val read = read_pos Position.none; 363 364fun read_set_range range = 365 read #> map (fn Antiquote.Text tok => Antiquote.Text (set_range range tok) | antiq => antiq); 366 367fun read_source SML source = 368 let 369 val pos = Input.pos_of source; 370 val language = if SML then Markup.language_SML else Markup.language_ML; 371 val _ = 372 if Position.is_reported_range pos 373 then Position.report pos (language (Input.is_delimited source)) 374 else (); 375 in gen_read SML pos (Input.text_of source) end; 376 377end; 378 379end; 380