/*  Title:      Pure/ML/ml_lex.scala
    Author:     Makarius

Lexical syntax for Isabelle/ML and Standard ML.
*/

package isabelle


import scala.collection.mutable
import scala.util.parsing.input.Reader


object ML_Lex
{
  /** keywords **/

  val keywords: Set[String] =
    Set("#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>",
      "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
      "case", "datatype", "do", "else", "end", "eqtype", "exception",
      "fn", "fun", "functor", "handle", "if", "in", "include",
      "infix", "infixr", "let", "local", "nonfix", "of", "op", "open",
      "orelse", "raise", "rec", "sharing", "sig", "signature",
      "struct", "structure", "then", "type", "val", "where", "while",
      "with", "withtype")

  val keywords2: Set[String] =
    Set("and", "case", "do", "else", "end", "if", "in", "let", "local",
      "of", "sig", "struct", "then", "while", "with")

  val keywords3: Set[String] =
    Set("handle", "open", "raise")

  private val lexicon: Scan.Lexicon = Scan.Lexicon(keywords.toList: _*)



  /** tokens **/

  object Kind extends Enumeration
  {
    val KEYWORD = Value("keyword")
    val IDENT = Value("identifier")
    val LONG_IDENT = Value("long identifier")
    val TYPE_VAR = Value("type variable")
    val WORD = Value("word")
    val INT = Value("integer")
    val REAL = Value("real")
    val CHAR = Value("character")
    val STRING = Value("quoted string")
    val SPACE = Value("white space")
    val INFORMAL_COMMENT = Value("informal comment")
    val FORMAL_COMMENT = Value("formal comment")
    val CONTROL = Value("control symbol antiquotation")
    val ANTIQ = Value("antiquotation")
    val ANTIQ_START = Value("antiquotation: start")
    val ANTIQ_STOP = Value("antiquotation: stop")
    val ANTIQ_OTHER = Value("antiquotation: other")
    val ANTIQ_STRING = Value("antiquotation: quoted string")
    val ANTIQ_ALT_STRING = Value("antiquotation: back-quoted string")
    val ANTIQ_CARTOUCHE = Value("antiquotation: text cartouche")
    val ERROR = Value("bad input")
  }

  sealed case class Token(kind: Kind.Value, source: String)
  {
    def is_keyword: Boolean = kind == Kind.KEYWORD
    def is_delimiter: Boolean = is_keyword && !Symbol.is_ascii_identifier(source)
    def is_space: Boolean = kind == Kind.SPACE
    def is_comment: Boolean = kind == Kind.INFORMAL_COMMENT || kind == Kind.FORMAL_COMMENT
  }



  /** parsers **/

  case object ML_String extends Scan.Line_Context
  case class Antiq(ctxt: Scan.Line_Context) extends Scan.Line_Context

  private object Parsers extends Scan.Parsers with Antiquote.Parsers with Comment.Parsers
  {
    /* string material */

    private val blanks = many(character(Symbol.is_ascii_blank))
    private val blanks1 = many1(character(Symbol.is_ascii_blank))

    private val gap = "\\" ~ blanks1 ~ "\\" ^^ { case x ~ y ~ z => x + y + z }
    private val gap_start = "\\" ~ blanks ~ """\z""".r ^^ { case x ~ y ~ _ => x + y }

    private val escape =
      one(character("\"\\abtnvfr".contains(_))) |
      "^" ~ one(character(c => '@' <= c && c <= '_')) ^^ { case x ~ y => x + y } |
      repeated(character(Symbol.is_ascii_digit), 3, 3)

    private val str =
      one(character(c => c != '"' && c != '\\' && ' ' <= c && c <= '~')) |
      one(s =>
        Symbol.is_open(s) || Symbol.is_close(s) || Symbol.is_symbolic(s) || Symbol.is_control(s)) |
      "\\" ~ escape ^^ { case x ~ y => x + y }


    /* ML char -- without gaps */

    private val ml_char: Parser[Token] =
      "#\"" ~ str ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.CHAR, x + y + z) }

    private val recover_ml_char: Parser[String] =
      "#\"" ~ opt(str) ^^ { case x ~ Some(y) => x + y case x ~ None => x }


    /* ML string */

    private val ml_string_body: Parser[String] =
      rep(gap | str) ^^ (_.mkString)

    private val recover_ml_string: Parser[String] =
      "\"" ~ ml_string_body ^^ { case x ~ y => x + y }

    private val ml_string: Parser[Token] =
      "\"" ~ ml_string_body ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.STRING, x + y + z) }

    private def ml_string_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
    {
      def result(x: String, c: Scan.Line_Context) = (Token(Kind.STRING, x), c)

      ctxt match {
        case Scan.Finished =>
          "\"" ~ ml_string_body ~ ("\"" | gap_start) ^^
            { case x ~ y ~ z => result(x + y + z, if (z == "\"") Scan.Finished else ML_String) }
        case ML_String =>
          blanks ~ opt_term("\\" ~ ml_string_body ~ ("\"" | gap_start)) ^^
            { case x ~ Some(y ~ z ~ w) =>
                result(x + y + z + w, if (w == "\"") Scan.Finished else ML_String)
              case x ~ None => result(x, ML_String) }
        case _ => failure("")
      }
    }


    /* ML comment */

    private val ml_comment: Parser[Token] =
      comment ^^ (x => Token(Kind.INFORMAL_COMMENT, x))

    private def ml_comment_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      comment_line(ctxt) ^^ { case (x, c) => (Token(Kind.INFORMAL_COMMENT, x), c) }

    private val ml_formal_comment: Parser[Token] =
      comment_cartouche ^^ (x => Token(Kind.FORMAL_COMMENT, x))

    private def ml_formal_comment_line(ctxt: Scan.Line_Context)
      : Parser[(Token, Scan.Line_Context)] =
      comment_cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.FORMAL_COMMENT, x), c) }


    /* antiquotations (line-oriented) */

    def ml_cartouche_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_CARTOUCHE, x), c) }

    def ml_antiq_start(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      ctxt match {
        case Scan.Finished => "@{" ^^ (x => (Token(Kind.ANTIQ_START, x), Antiq(Scan.Finished)))
        case _ => failure("")
      }

    def ml_antiq_stop(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      ctxt match {
        case Antiq(Scan.Finished) => "}" ^^ (x => (Token(Kind.ANTIQ_STOP, x), Scan.Finished))
        case _ => failure("")
      }

    def ml_antiq_body(context: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
      context match {
        case Antiq(ctxt) =>
          (if (ctxt == Scan.Finished) antiq_other ^^ (x => (Token(Kind.ANTIQ_OTHER, x), context))
           else failure("")) |
          quoted_line("\"", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_STRING, x), Antiq(c)) } |
          quoted_line("`", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_ALT_STRING, x), Antiq(c)) } |
          cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_CARTOUCHE, x), Antiq(c)) }
        case _ => failure("")
      }


    /* token */

    private def other_token: Parser[Token] =
    {
      /* identifiers */

      val letdigs = many(character(Symbol.is_ascii_letdig))

      val alphanumeric =
        one(character(Symbol.is_ascii_letter)) ~ letdigs ^^ { case x ~ y => x + y }

      val symbolic = many1(character("!#$%&*+-/:<=>?@\\^`|~".contains(_)))

      val ident = (alphanumeric | symbolic) ^^ (x => Token(Kind.IDENT, x))

      val long_ident =
        rep1(alphanumeric ~ "." ^^ { case x ~ y => x + y }) ~
          (alphanumeric | (symbolic | "=")) ^^
          { case x ~ y => Token(Kind.LONG_IDENT, x.mkString + y) }

      val type_var = "'" ~ letdigs ^^ { case x ~ y => Token(Kind.TYPE_VAR, x + y) }


      /* numerals */

      val dec = many1(character(Symbol.is_ascii_digit))
      val hex = many1(character(Symbol.is_ascii_hex))
      val sign = opt("~") ^^ { case Some(x) => x case None => "" }
      val decint = sign ~ dec ^^ { case x ~ y => x + y }
      val exp = ("E" | "e") ~ decint ^^ { case x ~ y => x + y }

      val word =
        ("0wx" ~ hex ^^ { case x ~ y => x + y } | "0w" ~ dec ^^ { case x ~ y => x + y }) ^^
          (x => Token(Kind.WORD, x))

      val int =
        sign ~ ("0x" ~ hex ^^ { case x ~ y => x + y } | dec) ^^
          { case x ~ y => Token(Kind.INT, x + y) }

      val rat =
        decint ~ opt("/" ~ dec) ^^ { case x ~ None => x case x ~ Some(y ~ z) => x + y + z }

      val real =
        (decint ~ "." ~ dec ~ (opt(exp) ^^ { case Some(x) => x case None => "" }) ^^
          { case x ~ y ~ z ~ w => x + y + z + w } |
         decint ~ exp ^^ { case x ~ y => x + y }) ^^ (x => Token(Kind.REAL, x))


      /* main */

      val space = blanks1 ^^ (x => Token(Kind.SPACE, x))

      val keyword = literal(lexicon) ^^ (x => Token(Kind.KEYWORD, x))

      val ml_control = control ^^ (x => Token(Kind.CONTROL, x))
      val ml_antiq =
        "@" ~ rat ^^ { case x ~ y => Token(Kind.ANTIQ, x + y) } |
        antiq ^^ (x => Token(Kind.ANTIQ, x))

      val bad = one(_ => true) ^^ (x => Token(Kind.ERROR, x))

      val recover =
        (recover_ml_char | (recover_ml_string | (recover_cartouche | recover_comment))) ^^
          (x => Token(Kind.ERROR, x))

      space | (ml_control | (recover | (ml_antiq |
        (((word | (real | (int | (long_ident | (ident | type_var))))) ||| keyword) | bad))))
    }

    def token: Parser[Token] =
      ml_char | (ml_string | (ml_comment | (ml_formal_comment | other_token)))

    def token_line(SML: Boolean, ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
    {
      val other = (ml_char | other_token) ^^ (x => (x, Scan.Finished))

      if (SML) ml_string_line(ctxt) | (ml_comment_line(ctxt) | other)
      else {
        ml_string_line(ctxt) |
          (ml_comment_line(ctxt) |
            (ml_formal_comment_line(ctxt) |
              (ml_cartouche_line(ctxt) |
                (ml_antiq_start(ctxt) | (ml_antiq_stop(ctxt) | (ml_antiq_body(ctxt) | other))))))
      }
    }
  }


  /* tokenize */

  def tokenize(input: CharSequence): List[Token] =
    Parsers.parseAll(Parsers.rep(Parsers.token), Scan.char_reader(input)) match {
      case Parsers.Success(tokens, _) => tokens
      case _ => error("Unexpected failure of tokenizing input:\n" + input.toString)
    }

  def tokenize_line(SML: Boolean, input: CharSequence, context: Scan.Line_Context)
    : (List[Token], Scan.Line_Context) =
  {
    var in: Reader[Char] = Scan.char_reader(input)
    val toks = new mutable.ListBuffer[Token]
    var ctxt = context
    while (!in.atEnd) {
      Parsers.parse(Parsers.token_line(SML, ctxt), in) match {
        case Parsers.Success((x, c), rest) => { toks += x; ctxt = c; in = rest }
        case Parsers.NoSuccess(_, rest) =>
          error("Unexpected failure of tokenizing input:\n" + rest.source.toString)
      }
    }
    (toks.toList, ctxt)
  }
}
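

/* Usage sketch (hypothetical, for illustration only): drives the public
   tokenizers defined above.  The object name and the example inputs are made
   up; this assumes the surrounding Isabelle/Scala environment (package
   isabelle), so that ML_Lex and Scan resolve as above. */

object ML_Lex_Demo
{
  def main(args: Array[String]): Unit =
  {
    /* batch tokenization of a complete piece of Isabelle/ML */
    for (tok <- ML_Lex.tokenize("fn x => x + 1  (* comment *)") if !tok.is_space)
      println(tok.kind.toString + ": " + tok.source)

    /* line-oriented tokenization: the Line_Context records that a string gap
       is still open at the end of the first line and resumes it on the next */
    val lines = List("val s = \"first line\\", "  \\ second line\"")
    var ctxt: Scan.Line_Context = Scan.Finished
    for (line <- lines) {
      val (toks, ctxt1) = ML_Lex.tokenize_line(SML = false, input = line, context = ctxt)
      println(toks.map(_.kind).mkString(", "))
      ctxt = ctxt1
    }
  }
}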