1/*  Title:      Pure/ML/ml_lex.scala
2    Author:     Makarius
3
4Lexical syntax for Isabelle/ML and Standard ML.
5*/
6
7package isabelle
8
9
10import scala.collection.mutable
11import scala.util.parsing.input.Reader
12
13
14object ML_Lex
15{
16  /** keywords **/
17
18  val keywords: Set[String] =
19    Set("#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>",
20      "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
21      "case", "datatype", "do", "else", "end", "eqtype", "exception",
22      "fn", "fun", "functor", "handle", "if", "in", "include",
23      "infix", "infixr", "let", "local", "nonfix", "of", "op", "open",
24      "orelse", "raise", "rec", "sharing", "sig", "signature",
25      "struct", "structure", "then", "type", "val", "where", "while",
26      "with", "withtype")
27
28  val keywords2: Set[String] =
29    Set("and", "case", "do", "else", "end", "if", "in", "let", "local",
30      "of", "sig", "struct", "then", "while", "with")
31
32  val keywords3: Set[String] =
33    Set("handle", "open", "raise")
34
35  private val lexicon: Scan.Lexicon = Scan.Lexicon(keywords.toList: _*)
36
37
38
39  /** tokens **/
40
  /* Token kinds; the Value strings serve as human-readable descriptions.
     NOTE(review): scala.Enumeration is legacy style, but declaration order
     fixes the Value ids, so it is kept unchanged for compatibility. */
  object Kind extends Enumeration
  {
    val KEYWORD = Value("keyword")
    val IDENT = Value("identifier")
    val LONG_IDENT = Value("long identifier")
    val TYPE_VAR = Value("type variable")
    val WORD = Value("word")
    val INT = Value("integer")
    val REAL = Value("real")
    val CHAR = Value("character")
    val STRING = Value("quoted string")
    val SPACE = Value("white space")
    val INFORMAL_COMMENT = Value("informal comment")
    val FORMAL_COMMENT = Value("formal comment")
    val CONTROL = Value("control symbol antiquotation")
    val ANTIQ = Value("antiquotation")
    val ANTIQ_START = Value("antiquotation: start")
    val ANTIQ_STOP = Value("antiquotation: stop")
    val ANTIQ_OTHER = Value("antiquotation: other")
    val ANTIQ_STRING = Value("antiquotation: quoted string")
    val ANTIQ_ALT_STRING = Value("antiquotation: back-quoted string")
    val ANTIQ_CARTOUCHE = Value("antiquotation: text cartouche")
    val ERROR = Value("bad input")
  }
65
66  sealed case class Token(kind: Kind.Value, source: String)
67  {
68    def is_keyword: Boolean = kind == Kind.KEYWORD
69    def is_delimiter: Boolean = is_keyword && !Symbol.is_ascii_identifier(source)
70    def is_space: Boolean = kind == Kind.SPACE
71    def is_comment: Boolean = kind == Kind.INFORMAL_COMMENT || kind == Kind.FORMAL_COMMENT
72  }
73
74
75
76  /** parsers **/
77
  // line contexts for line-oriented scanning: inside an unterminated ML string,
  // or inside an "@{...}" antiquotation carrying its own nested context
  case object ML_String extends Scan.Line_Context
  case class Antiq(ctxt: Scan.Line_Context) extends Scan.Line_Context
80
  /* Combined token parsers for Isabelle/ML and Standard ML, built on the
     generic Scan/Antiquote/Comment parser layers.  NOTE(review): the
     alternation order of combinators below is semantically significant
     (e.g. "recover" before "ml_antiq", "|||" for longest match against
     keywords) — do not reorder. */
  private object Parsers extends Scan.Parsers with Antiquote.Parsers with Comment.Parsers
  {
    /* string material */

    // zero / one-or-more ASCII blanks (exact set per Symbol.is_ascii_blank)
    private val blanks = many(character(Symbol.is_ascii_blank))
    private val blanks1 = many1(character(Symbol.is_ascii_blank))

    // SML string gap "\<blanks>\", ignored inside string literals
    private val gap = "\\" ~ blanks1 ~ "\\" ^^ { case x ~ y ~ z => x + y + z }
    // gap opening that runs to end of input (for line-oriented scanning)
    private val gap_start = "\\" ~ blanks ~ """\z""".r ^^ { case x ~ y ~ _ => x + y }

    // escape after backslash: named escape, control char "^C", or 3 decimal digits
    private val escape =
      one(character("\"\\abtnvfr".contains(_))) |
      "^" ~ one(character(c => '@' <= c && c <= '_')) ^^ { case x ~ y => x + y } |
      repeated(character(Symbol.is_ascii_digit), 3, 3)

    // one string element: printable ASCII (except quote/backslash),
    // an Isabelle symbol, or a backslash escape
    private val str =
      one(character(c => c != '"' && c != '\\' && ' ' <= c && c <= '~')) |
      one(s =>
        Symbol.is_open(s) || Symbol.is_close(s) || Symbol.is_symbolic(s) || Symbol.is_control(s)) |
      "\\" ~ escape ^^ { case x ~ y => x + y }


    /* ML char -- without gaps */

    private val ml_char: Parser[Token] =
      "#\"" ~ str ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.CHAR, x + y + z) }

    // error recovery: consume a malformed char literal prefix
    private val recover_ml_char: Parser[String] =
      "#\"" ~ opt(str) ^^ { case x ~ Some(y) => x + y case x ~ None => x }


    /* ML string */

    private val ml_string_body: Parser[String] =
      rep(gap | str) ^^ (_.mkString)

    // error recovery: consume an unterminated string prefix
    private val recover_ml_string: Parser[String] =
      "\"" ~ ml_string_body ^^ { case x ~ y => x + y }

    private val ml_string: Parser[Token] =
      "\"" ~ ml_string_body ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.STRING, x + y + z) }

    // line-oriented string scanning: a string may span lines via gap_start,
    // carrying the ML_String context over to the next line
    private def ml_string_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
    {
      def result(x: String, c: Scan.Line_Context) = (Token(Kind.STRING, x), c)

      ctxt match {
        case Scan.Finished =>
          "\"" ~ ml_string_body ~ ("\"" | gap_start) ^^
            { case x ~ y ~ z => result(x + y + z, if (z == "\"") Scan.Finished else ML_String) }
        case ML_String =>
          // continuation line: leading blanks, then "\" closing the gap
          blanks ~ opt_term("\\" ~ ml_string_body ~ ("\"" | gap_start)) ^^
            { case x ~ Some(y ~ z ~ w) =>
                result(x + y + z + w, if (w == "\"") Scan.Finished else ML_String)
              case x ~ None => result(x, ML_String) }
        case _ => failure("")
      }
    }


    /* ML comment */

    private val ml_comment: Parser[Token] =
      comment ^^ (x => Token(Kind.INFORMAL_COMMENT, x))

    private def ml_comment_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      comment_line(ctxt) ^^ { case (x, c) => (Token(Kind.INFORMAL_COMMENT, x), c) }

    private val ml_formal_comment: Parser[Token] =
      comment_cartouche ^^ (x => Token(Kind.FORMAL_COMMENT, x))

    private def ml_formal_comment_line(ctxt: Scan.Line_Context)
        : Parser[(Token, Scan.Line_Context)] =
      comment_cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.FORMAL_COMMENT, x), c) }


    /* antiquotations (line-oriented) */

    def ml_cartouche_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_CARTOUCHE, x), c) }

    // "@{" opens an antiquotation, switching into the Antiq context
    def ml_antiq_start(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      ctxt match {
        case Scan.Finished => "@{" ^^ (x => (Token(Kind.ANTIQ_START, x), Antiq(Scan.Finished)))
        case _ => failure("")
      }

    // "}" closes an antiquotation, but only when its nested context is finished
    def ml_antiq_stop(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      ctxt match {
        case Antiq(Scan.Finished) => "}" ^^ (x => (Token(Kind.ANTIQ_STOP, x), Scan.Finished))
        case _ => failure("")
      }

    // material inside an antiquotation: plain text, or nested quoted/back-quoted
    // strings and cartouches with their own line contexts
    def ml_antiq_body(context: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      context match {
        case Antiq(ctxt) =>
          (if (ctxt == Scan.Finished) antiq_other ^^ (x => (Token(Kind.ANTIQ_OTHER, x), context))
           else failure("")) |
          quoted_line("\"", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_STRING, x), Antiq(c)) } |
          quoted_line("`", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_ALT_STRING, x), Antiq(c)) } |
          cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_CARTOUCHE, x), Antiq(c)) }
        case _ => failure("")
      }


    /* token */

    // all remaining token forms: identifiers, numerals, keywords, antiquotations,
    // white space, error recovery, and a one-symbol ERROR fallback (total parser)
    private def other_token: Parser[Token] =
    {
      /* identifiers */

      val letdigs = many(character(Symbol.is_ascii_letdig))

      val alphanumeric =
        one(character(Symbol.is_ascii_letter)) ~ letdigs ^^ { case x ~ y => x + y }

      val symbolic = many1(character("!#$%&*+-/:<=>?@\\^`|~".contains(_)))

      val ident = (alphanumeric | symbolic) ^^ (x => Token(Kind.IDENT, x))

      // qualified name: one or more "structure." prefixes, then a final identifier
      val long_ident =
        rep1(alphanumeric ~ "." ^^ { case x ~ y => x + y }) ~
          (alphanumeric | (symbolic | "=")) ^^
          { case x ~ y => Token(Kind.LONG_IDENT, x.mkString + y) }

      val type_var = "'" ~ letdigs ^^ { case x ~ y => Token(Kind.TYPE_VAR, x + y) }


      /* numerals */

      val dec = many1(character(Symbol.is_ascii_digit))
      val hex = many1(character(Symbol.is_ascii_hex))
      // SML negation sign is "~", optional
      val sign = opt("~") ^^ { case Some(x) => x case None => "" }
      val decint = sign ~ dec ^^ { case x ~ y => x + y }
      val exp = ("E" | "e") ~ decint ^^ { case x ~ y => x + y }

      val word =
        ("0wx" ~ hex ^^ { case x ~ y => x + y } | "0w" ~ dec ^^ { case x ~ y => x + y }) ^^
          (x => Token(Kind.WORD, x))

      val int =
        sign ~ ("0x" ~ hex ^^ { case x ~ y => x + y } | dec) ^^
          { case x ~ y => Token(Kind.INT, x + y) }

      // rational literal (used after "@" in antiquotations)
      val rat =
        decint ~ opt("/" ~ dec) ^^ { case x ~ None => x case x ~ Some(y ~ z) => x + y + z }

      val real =
        (decint ~ "." ~ dec ~ (opt(exp) ^^ { case Some(x) => x case None => "" }) ^^
          { case x ~ y ~ z ~ w => x + y + z + w } |
         decint ~ exp ^^ { case x ~ y => x + y }) ^^ (x => Token(Kind.REAL, x))


      /* main */

      val space = blanks1 ^^ (x => Token(Kind.SPACE, x))

      val keyword = literal(lexicon) ^^ (x => Token(Kind.KEYWORD, x))

      val ml_control = control ^^ (x => Token(Kind.CONTROL, x))
      val ml_antiq =
        "@" ~ rat ^^ { case x ~ y => Token(Kind.ANTIQ, x + y) } |
        antiq ^^ (x => Token(Kind.ANTIQ, x))

      // fallback: any single symbol becomes an ERROR token, making the scanner total
      val bad = one(_ => true) ^^ (x => Token(Kind.ERROR, x))

      val recover =
        (recover_ml_char | (recover_ml_string | (recover_cartouche | recover_comment))) ^^
          (x => Token(Kind.ERROR, x))

      // "|||" takes the longest match between word-like tokens and keywords
      space | (ml_control | (recover | (ml_antiq |
        (((word | (real | (int | (long_ident | (ident | type_var))))) ||| keyword) | bad))))
    }

    // complete-input token parser (not line-oriented)
    def token: Parser[Token] =
      ml_char | (ml_string | (ml_comment | (ml_formal_comment | other_token)))

    // line-oriented token parser; SML = true restricts to Standard ML syntax
    // (no formal comments, cartouches, or antiquotations)
    def token_line(SML: Boolean, ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
    {
      val other = (ml_char | other_token) ^^ (x => (x, Scan.Finished))

      if (SML) ml_string_line(ctxt) | (ml_comment_line(ctxt) | other)
      else {
        ml_string_line(ctxt) |
          (ml_comment_line(ctxt) |
            (ml_formal_comment_line(ctxt) |
              (ml_cartouche_line(ctxt) |
                (ml_antiq_start(ctxt) | (ml_antiq_stop(ctxt) | (ml_antiq_body(ctxt) | other))))))
      }
    }
  }
272
273
274  /* tokenize */
275
276  def tokenize(input: CharSequence): List[Token] =
277    Parsers.parseAll(Parsers.rep(Parsers.token), Scan.char_reader(input)) match {
278      case Parsers.Success(tokens, _) => tokens
279      case _ => error("Unexpected failure of tokenizing input:\n" + input.toString)
280    }
281
282  def tokenize_line(SML: Boolean, input: CharSequence, context: Scan.Line_Context)
283    : (List[Token], Scan.Line_Context) =
284  {
285    var in: Reader[Char] = Scan.char_reader(input)
286    val toks = new mutable.ListBuffer[Token]
287    var ctxt = context
288    while (!in.atEnd) {
289      Parsers.parse(Parsers.token_line(SML, ctxt), in) match {
290        case Parsers.Success((x, c), rest) => toks += x; ctxt = c; in = rest
291        case Parsers.NoSuccess(_, rest) =>
292          error("Unexpected failure of tokenizing input:\n" + rest.source.toString)
293      }
294    }
295    (toks.toList, ctxt)
296  }
297}
298