1(*  Title:      Pure/ML/ml_lex.ML
2    Author:     Makarius
3
4Lexical syntax for Isabelle/ML and Standard ML.
5*)
6
7signature ML_LEX =
8sig
9  val keywords: string list
10  datatype token_kind =
11    Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String |
12    Space | Comment of Comment.kind option | Error of string | EOF
13  eqtype token
14  val stopper: token Scan.stopper
15  val is_regular: token -> bool
16  val is_improper: token -> bool
17  val is_comment: token -> bool
18  val set_range: Position.range -> token -> token
19  val range_of: token -> Position.range
20  val pos_of: token -> Position.T
21  val end_pos_of: token -> Position.T
22  val kind_of: token -> token_kind
23  val content_of: token -> string
24  val check_content_of: token -> string
25  val flatten: token list -> string
26  val source: (Symbol.symbol, 'a) Source.source ->
27    (token, (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source)
28      Source.source) Source.source
29  val tokenize: string -> token list
30  val read_pos: Position.T -> Symbol_Pos.text -> token Antiquote.antiquote list
31  val read: Symbol_Pos.text -> token Antiquote.antiquote list
32  val read_set_range: Position.range -> Symbol_Pos.text -> token Antiquote.antiquote list
33  val read_source: bool -> Input.source -> token Antiquote.antiquote list
34end;
35
36structure ML_Lex: ML_LEX =
37struct
38
39(** keywords **)
40
41val keywords =
42 ["#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>", "[",
43  "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as", "case",
44  "datatype", "do", "else", "end", "eqtype", "exception", "fn", "fun",
45  "functor", "handle", "if", "in", "include", "infix", "infixr",
46  "let", "local", "nonfix", "of", "op", "open", "orelse", "raise",
47  "rec", "sharing", "sig", "signature", "struct", "structure", "then",
48  "type", "val", "where", "while", "with", "withtype"];
49
50val keywords2 =
51 ["and", "case", "do", "else", "end", "if", "in", "let", "local", "of",
52  "sig", "struct", "then", "while", "with"];
53
54val keywords3 =
55 ["handle", "open", "raise"];
56
57val lexicon = Scan.make_lexicon (map raw_explode keywords);
58
59
60
61(** tokens **)
62
63(* datatype token *)
64
65datatype token_kind =
66  Keyword | Ident | Long_Ident | Type_Var | Word | Int | Real | Char | String |
67  Space | Comment of Comment.kind option | Error of string | EOF;
68
69datatype token = Token of Position.range * (token_kind * string);
70
71
72(* position *)
73
74fun set_range range (Token (_, x)) = Token (range, x);
75fun range_of (Token (range, _)) = range;
76
77val pos_of = #1 o range_of;
78val end_pos_of = #2 o range_of;
79
80
81(* stopper *)
82
83fun mk_eof pos = Token ((pos, Position.none), (EOF, ""));
84val eof = mk_eof Position.none;
85
86fun is_eof (Token (_, (EOF, _))) = true
87  | is_eof _ = false;
88
89val stopper =
90  Scan.stopper (fn [] => eof | toks => mk_eof (end_pos_of (List.last toks))) is_eof;
91
92
93(* token content *)
94
95fun kind_of (Token (_, (k, _))) = k;
96
97fun content_of (Token (_, (_, x))) = x;
98fun token_leq (tok, tok') = content_of tok <= content_of tok';
99
100fun is_keyword (Token (_, (Keyword, _))) = true
101  | is_keyword _ = false;
102
103fun is_delimiter (Token (_, (Keyword, x))) = not (Symbol.is_ascii_identifier x)
104  | is_delimiter _ = false;
105
106fun is_regular (Token (_, (Error _, _))) = false
107  | is_regular (Token (_, (EOF, _))) = false
108  | is_regular _ = true;
109
110fun is_improper (Token (_, (Space, _))) = true
111  | is_improper (Token (_, (Comment _, _))) = true
112  | is_improper _ = false;
113
114fun is_comment (Token (_, (Comment _, _))) = true
115  | is_comment _ = false;
116
117fun warn tok =
118  (case tok of
119    Token (_, (Keyword, ":>")) =>
120      warning ("Opaque signature matching (:>) fails to work with ML pretty printing --\n\
121        \prefer non-opaque matching (:) possibly with abstype" ^
122        Position.here (pos_of tok))
123  | _ => ());
124
125fun check_content_of tok =
126  (case kind_of tok of
127    Error msg => error msg
128  | _ => content_of tok);
129
130
131(* flatten *)
132
133fun flatten_content (tok :: (toks as tok' :: _)) =
134      Symbol.escape (check_content_of tok) ::
135        (if is_improper tok orelse is_improper tok' then flatten_content toks
136         else Symbol.space :: flatten_content toks)
137  | flatten_content toks = map (Symbol.escape o check_content_of) toks;
138
139val flatten = implode o flatten_content;
140
141
142(* markup *)
143
144local
145
146fun token_kind_markup SML =
147 fn Type_Var => (Markup.ML_tvar, "")
148  | Word => (Markup.ML_numeral, "")
149  | Int => (Markup.ML_numeral, "")
150  | Real => (Markup.ML_numeral, "")
151  | Char => (Markup.ML_char, "")
152  | String => (if SML then Markup.SML_string else Markup.ML_string, "")
153  | Comment _ => (if SML then Markup.SML_comment else Markup.ML_comment, "")
154  | Error msg => (Markup.bad (), msg)
155  | _ => (Markup.empty, "");
156
157in
158
159fun token_report SML (tok as Token ((pos, _), (kind, x))) =
160  let
161    val (markup, txt) =
162      if not (is_keyword tok) then token_kind_markup SML kind
163      else if is_delimiter tok then (Markup.ML_delimiter, "")
164      else if member (op =) keywords2 x then (Markup.ML_keyword2 |> Markup.keyword_properties, "")
165      else if member (op =) keywords3 x then (Markup.ML_keyword3 |> Markup.keyword_properties, "")
166      else (Markup.ML_keyword1 |> Markup.keyword_properties, "");
167  in ((pos, markup), txt) end;
168
169end;
170
171
172
173(** scanners **)
174
175open Basic_Symbol_Pos;
176
177val err_prefix = "SML lexical error: ";
178
179fun !!! msg = Symbol_Pos.!!! (fn () => err_prefix ^ msg);
180
181
182(* identifiers *)
183
184local
185
186val scan_letdigs =
187  Scan.many (Symbol.is_ascii_letdig o Symbol_Pos.symbol);
188
189val scan_alphanumeric =
190  Scan.one (Symbol.is_ascii_letter o Symbol_Pos.symbol) ::: scan_letdigs;
191
192val scan_symbolic =
193  Scan.many1 (member (op =) (raw_explode "!#$%&*+-/:<=>?@\\^`|~") o Symbol_Pos.symbol);
194
195in
196
197val scan_ident = scan_alphanumeric || scan_symbolic;
198
199val scan_long_ident =
200  Scan.repeats1 (scan_alphanumeric @@@ $$$ ".") @@@ (scan_ident || $$$ "=");
201
202val scan_type_var = $$$ "'" @@@ scan_letdigs;
203
204end;
205
206
207(* numerals *)
208
209local
210
211val scan_dec = Scan.many1 (Symbol.is_ascii_digit o Symbol_Pos.symbol);
212val scan_hex = Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol);
213val scan_sign = Scan.optional ($$$ "~") [];
214val scan_decint = scan_sign @@@ scan_dec;
215val scan_exp = ($$$ "E" || $$$ "e") @@@ scan_decint;
216
217in
218
219val scan_word =
220  $$$ "0" @@@ $$$ "w" @@@ $$$ "x" @@@ scan_hex ||
221  $$$ "0" @@@ $$$ "w" @@@ scan_dec;
222
223val scan_int = scan_sign @@@ ($$$ "0" @@@ $$$ "x" @@@ scan_hex || scan_dec);
224
225val scan_rat = scan_decint @@@ Scan.optional ($$$ "/" @@@ scan_dec) [];
226
227val scan_real =
228  scan_decint @@@ $$$ "." @@@ scan_dec @@@ Scan.optional scan_exp [] ||
229  scan_decint @@@ scan_exp;
230
231end;
232
233
234(* chars and strings *)
235
236val scan_blanks1 = Scan.many1 (Symbol.is_ascii_blank o Symbol_Pos.symbol);
237
238local
239
240val scan_escape =
241  Scan.one (member (op =) (raw_explode "\"\\abtnvfr") o Symbol_Pos.symbol) >> single ||
242  $$$ "^" @@@
243    (Scan.one (fn (s, _) => Char.ord #"@" <= ord s andalso ord s <= Char.ord #"_") >> single) ||
244  Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
245    Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) --
246    Scan.one (Symbol.is_ascii_digit o Symbol_Pos.symbol) >> (fn ((a, b), c) => [a, b, c]);
247
248val scan_str =
249  Scan.one (fn (s, _) => Symbol.not_eof s andalso s <> "\"" andalso s <> "\\" andalso
250    (not (Symbol.is_char s) orelse Symbol.is_printable s)) >> single ||
251  $$$ "\\" @@@ !!! "bad escape character in string" scan_escape;
252
253val scan_gap = $$$ "\\" @@@ scan_blanks1 @@@ $$$ "\\";
254val scan_gaps = Scan.repeats scan_gap;
255
256in
257
258val scan_char =
259  $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ scan_str @@@ scan_gaps @@@ $$$ "\"";
260
261val recover_char =
262  $$$ "#" @@@ $$$ "\"" @@@ scan_gaps @@@ Scan.optional (Scan.permissive scan_str @@@ scan_gaps) [];
263
264val scan_string =
265  Scan.ahead ($$ "\"") |--
266    !!! "unclosed string literal"
267      ($$$ "\"" @@@ Scan.repeats (scan_gap || scan_str) @@@ $$$ "\"");
268
269val recover_string =
270  $$$ "\"" @@@ Scan.repeats (scan_gap || Scan.permissive scan_str);
271
272end;
273
274
275(* rat antiquotation *)
276
277val rat_name = Symbol_Pos.explode ("Pure.rat ", Position.none);
278
279val scan_rat_antiq =
280  Symbol_Pos.scan_pos -- ($$ "@" |-- Symbol_Pos.scan_pos -- scan_rat) -- Symbol_Pos.scan_pos
281    >> (fn ((pos1, (pos2, body)), pos3) =>
282      {start = Position.range_position (pos1, pos2),
283       stop = Position.none,
284       range = Position.range (pos1, pos3),
285       body = rat_name @ body});
286
287
288(* scan tokens *)
289
290local
291
292fun token k ss = Token (Symbol_Pos.range ss, (k, Symbol_Pos.content ss));
293
294val scan_sml =
295  scan_char >> token Char ||
296  scan_string >> token String ||
297  scan_blanks1 >> token Space ||
298  Symbol_Pos.scan_comment err_prefix >> token (Comment NONE) ||
299  Scan.max token_leq
300   (Scan.literal lexicon >> token Keyword)
301   (scan_word >> token Word ||
302    scan_real >> token Real ||
303    scan_int >> token Int ||
304    scan_long_ident >> token Long_Ident ||
305    scan_ident >> token Ident ||
306    scan_type_var >> token Type_Var);
307
308val scan_ml_antiq =
309  Comment.scan >> (fn (kind, ss) => Antiquote.Text (token (Comment (SOME kind)) ss)) ||
310  Antiquote.scan_control >> Antiquote.Control ||
311  Antiquote.scan_antiq >> Antiquote.Antiq ||
312  scan_rat_antiq >> Antiquote.Antiq ||
313  scan_sml >> Antiquote.Text;
314
315fun recover msg =
316 (recover_char ||
317  recover_string ||
318  Symbol_Pos.recover_cartouche ||
319  Symbol_Pos.recover_comment ||
320  Scan.one (Symbol.not_eof o Symbol_Pos.symbol) >> single)
321  >> (single o token (Error msg));
322
323fun gen_read SML pos text =
324  let
325    val syms =
326      Symbol_Pos.explode (text, pos)
327      |> SML ? maps (fn (s, p) => raw_explode s |> map (rpair p));
328
329    val termination =
330      if null syms then []
331      else
332        let
333          val pos1 = List.last syms |-> Position.advance;
334          val pos2 = Position.advance Symbol.space pos1;
335        in [Antiquote.Text (Token (Position.range (pos1, pos2), (Space, Symbol.space)))] end;
336
337    val scan = if SML then scan_sml >> Antiquote.Text else scan_ml_antiq;
338    fun check (Antiquote.Text tok) = (check_content_of tok; if SML then () else warn tok)
339      | check _ = ();
340    val input =
341      Source.of_list syms
342      |> Source.source Symbol_Pos.stopper
343        (Scan.recover (Scan.bulk (!!! "bad input" scan))
344          (fn msg => recover msg >> map Antiquote.Text))
345      |> Source.exhaust;
346    val _ = Position.reports (Antiquote.antiq_reports input);
347    val _ =
348      Position.reports_text (maps (fn Antiquote.Text t => [token_report SML t] | _ => []) input);
349    val _ = List.app check input;
350  in input @ termination end;
351
352in
353
354fun source src =
355  Symbol_Pos.source (Position.line 1) src
356  |> Source.source Symbol_Pos.stopper (Scan.recover (Scan.bulk (!!! "bad input" scan_sml)) recover);
357
358val tokenize = Symbol.explode #> Source.of_list #> source #> Source.exhaust;
359
360val read_pos = gen_read false;
361
362val read = read_pos Position.none;
363
364fun read_set_range range =
365  read #> map (fn Antiquote.Text tok => Antiquote.Text (set_range range tok) | antiq => antiq);
366
367fun read_source SML source =
368  let
369    val pos = Input.pos_of source;
370    val language = if SML then Markup.language_SML else Markup.language_ML;
371    val _ =
372      if Position.is_reported_range pos
373      then Position.report pos (language (Input.is_delimited source))
374      else ();
375  in gen_read SML pos (Input.text_of source) end;
376
377end;
378
379end;
380