1type pos = (int * int);
2type arg = int;
3open Tokens;
4type lexresult  = (svalue,pos) token
5exception Impossible of string;
6fun fatalError s = raise(Impossible s);
7
8fun mkTok f text pos line =
9  (f text, (pos - String.size text, line), (pos, line));
10
11fun mkMtTok text pos line =
12  ((pos - String.size text, line), (pos, line));
13
14
15
16(* To translate escape sequences *)
17
18(* No problem that this isn't correct for Macintosh *)
19val char_for_backslash = fn
20(* *)    #"n" => #"\010"
21(* *)  | #"r" => #"\013"
22(* *)  | #"a" => #"\007"
23(* *)  | #"b" => #"\008"
24(* *)  | #"t" => #"\009"
25(* *)  | #"v" => #"\011"
26(* *)  | #"f" => #"\012"
27(* *)  | c => c
28;
29
30(* The table of keywords *)
31
32val keyword_table =
33List.foldl (fn ((str, tok), t) => Binarymap.insert (t, str, tok))
34(Binarymap.mkDict String.compare)
35[
36  ("abstype",      ABSTYPE),
37  ("and",          AND),
38  ("andalso",      ANDALSO),
39  ("as",           AS),
40  ("case",         CASE),
41  ("datatype",     DATATYPE),
42  ("do",           DO),
43  ("else",         ELSE),
44  ("eqtype",       EQTYPE),
45  ("end",          END),
46  ("exception",    EXCEPTION),
47  ("fn",           FN),
48  ("fun",          FUN),
49  ("handle",       HANDLE),
50  ("if",           IF),
51  ("in",           IN),
52  ("include",      INCLUDE),
53  ("infix",        INFIX),
54  ("infixr",       INFIXR),
55  ("let",          LET),
56  ("local",        LOCAL),
57  ("nonfix",       NONFIX),
58  ("of",           OF),
59  ("op",           OP),
60  ("open",         OPEN),
61  ("orelse",       ORELSE),
62  ("prim_eqtype",  PRIM_EQTYPE),
63  ("prim_EQtype",  PRIM_REFTYPE),
64  ("prim_type",    PRIM_TYPE),
65  ("prim_val",     PRIM_VAL),
66  ("raise",        RAISE),
67  ("rec",          REC),
68  ("sig",          SIG),
69  ("signature",    SIGNATURE),
70  ("struct",       STRUCT),
71  ("structure",    STRUCTURE),
72  ("then",         THEN),
73  ("type",         TYPE),
74  ("val",          VAL),
75  ("while",        WHILE),
76  ("where",        WHERE),
77  ("with",         WITH),
78  ("withtype",     WITHTYPE),
79  ("#",            HASH),
80  ("->",           ARROW),
81  ("|",            BAR),
82  (":",            COLON),
83  ("=>",           DARROW),
84  ("=",            EQUALS),
85  ("*",            STAR)
86];
87(*
88local (* Make sure that strings are shared (interned); this saves space
89         when writing to disk: *)
90    val intern_table = (Hasht.new 123 : (string, string) Hasht.t);
91in
92    fun share s =
93       case Hasht.peek intern_table s of
94           NONE    => (Hasht.insert intern_table s s; s)
95         | SOME s' => s'
96end
97*)
98fun share s = s;
99
100fun mkKeyword text pos line =
101  (Binarymap.find (keyword_table, text)) (mkMtTok text pos line)
102  handle Binarymap.NotFound => ID (mkTok share text pos line);
103
104val savedLexemeStart = ref 0;
105
106fun splitQualId s =
107  let open CharVector
108      val len' = size s - 1
109      fun parse n =
110        if n = 0 then
111          ("", s)
112        else if sub(s, n) = #"." then
113          ( String.extract(s, 0, SOME n),
114            String.extract(s, n + 1, SOME(len' - n)) )
115        else
116          parse (n-1)
117  in parse len' end;
118
119fun mkQualId text pos line =
120  let val (qual, id) = splitQualId text in
121    if id = "*" then
122      QUAL_STAR (mkTok (fn x => { qual=qual, id=id }) text pos line)
123    else
124      QUAL_ID (mkTok (fn x => { qual=qual, id=id }) text pos line)
125  end
126;
127
128exception LexicalError of string * string * int (* (message, loc1, loc2) *)
129fun lexError msg text pos line =
130  raise (LexicalError (msg, text, line))
131
132fun eof commentDepth =
133  if not (commentDepth = 0) then
134    lexError "Unclosed comment" "" ~1 (!savedLexemeStart)
135  else
136    EOF ((~1, ~1), (~1, ~1));
137
138fun constTooLarge msg yytext yypos yylineno =
139(
140  lexError (msg ^ " constant is too large") yytext yypos yylineno
141);
142
143fun string_of_string s =
144  case String.fromString (String.substring (s, 1, String.size s - 2)) of
145    NONE => raise Fail ""
146  | SOME s => s;
147
148fun char_of_string s =
149  case Char.fromString (String.substring (s, 2, String.size s - 3)) of
150    NONE => raise Fail ""
151  | SOME s => s;
152
153fun int_of_string s =
154  case StringCvt.scanString (Int.scan StringCvt.DEC) s of
155    NONE => raise Fail ""
156  | SOME i => i;
157
158fun hex_of_string s =
159  case StringCvt.scanString (Int.scan StringCvt.HEX) s of
160    NONE => raise Fail ""
161  | SOME i => i;
162
163fun word_of_string s =
164  case StringCvt.scanString (Word.scan StringCvt.DEC) s of
165    NONE => raise Fail ""
166  | SOME i => i;
167
168fun word_of_hexstring s =
169  case StringCvt.scanString (Word.scan StringCvt.HEX) s of
170    NONE => raise Fail ""
171  | SOME i => i;
172
173fun real_of_string s =
174  case Real.fromString s of
175    NONE => raise Fail ""
176  | SOME r => r;
177
178%%
179%header (functor SMLLexFun(structure Tokens : SML_TOKENS));
180%s Comment;
181%full
182%arg (commentDepth);
183%count
184id=[A-Za-z][A-Za-z0-9_']* | [-!%&$#+/:<=>?@~^|*\\]+;
185stringchar=(\\["abtnvfr\\])|(\\[ \t\n\r]+\\)|(\\\^[@-_])|(\\[0-9][0-9][0-9])|[^\\\n\r\127\255\001-\026];
186
187%%
188<INITIAL>[\ \t\n]+ =>
189  ( continue () );
190<INITIAL>"(*" =>
191  ( (savedLexemeStart:=(!yylineno); YYBEGIN Comment; lex 1 ()) );
192<INITIAL>"*)" =>
193  ( lexError "unmatched comment bracket" yytext yypos (!yylineno) );
194<INITIAL>'[A-Za-z0-9_']+ =>
195  ( TYVAR (mkTok (fn x => x) yytext yypos (!yylineno)) );
196<INITIAL>0 =>
197  ( ZDIGIT (mkTok (fn x => 0) yytext yypos (!yylineno)) );
198<INITIAL>[1-9] =>
199  ( NZDIGIT (mkTok int_of_string yytext yypos (!yylineno)) );
200<INITIAL>0[0-9]+ =>
201  ( ZPOSINT2 (mkTok int_of_string yytext yypos (!yylineno))
202     handle Fail _ => constTooLarge "integer" yytext yypos (!yylineno) );
203<INITIAL>[1-9][0-9]+ =>
204  ( NZPOSINT2 (mkTok int_of_string yytext yypos (!yylineno))
205     handle Fail _ => constTooLarge "integer" yytext yypos (!yylineno) );
206<INITIAL>~[0-9]+ =>
207  ( NEGINT (mkTok int_of_string yytext yypos (!yylineno))
208     handle Fail _ => constTooLarge "integer" yytext yypos (!yylineno) );
209<INITIAL>~?0x[0-9a-fA-F]+ =>
210  ( NEGINT (mkTok hex_of_string yytext yypos (!yylineno))
211     handle Fail _ => constTooLarge "integer" yytext yypos (!yylineno) );
212<INITIAL>0w[0-9]+ =>
213  ( WORD (mkTok word_of_string yytext yypos (!yylineno))
214     handle Fail _ => constTooLarge "word" yytext yypos (!yylineno) );
215<INITIAL>0wx[0-9a-fA-F]+ =>
216  ( WORD (mkTok word_of_hexstring yytext yypos (!yylineno))
217     handle Fail _ => constTooLarge "word" yytext yypos (!yylineno) );
218<INITIAL>~?[0-9]+(\.[0-9]+)?([eE]~?[0-9]+)? =>
219  ( REAL (mkTok real_of_string yytext yypos (!yylineno))
220     handle Fail _ => constTooLarge "real" yytext yypos (!yylineno) );
221<INITIAL>"{stringchar}*" =>
222  ( STRING (mkTok string_of_string yytext yypos (!yylineno)) );
223<INITIAL>#"{stringchar}" =>
224  ( CHAR (mkTok char_of_string yytext yypos (!yylineno)) );
225<INITIAL>_ =>
226  ( UNDERBAR (mkMtTok yytext yypos (!yylineno)) );
227<INITIAL>, =>
228  ( COMMA (mkMtTok yytext yypos (!yylineno)) );
229<INITIAL>\.\.\. =>
230  ( DOTDOTDOT (mkMtTok yytext yypos (!yylineno)) );
231<INITIAL>\{ =>
232  ( LBRACE (mkMtTok yytext yypos (!yylineno)) );
233<INITIAL>} =>
234  ( RBRACE (mkMtTok yytext yypos (!yylineno)) );
235<INITIAL>\[ =>
236  ( LBRACKET (mkMtTok yytext yypos (!yylineno)) );
237<INITIAL>#\[ =>
238  ( HASHLBRACKET (mkMtTok yytext yypos (!yylineno)) );
239<INITIAL>] =>
240  ( RBRACKET (mkMtTok yytext yypos (!yylineno)) );
241<INITIAL>\( =>
242  ( LPAREN (mkMtTok yytext yypos (!yylineno)) );
243<INITIAL>\) =>
244  ( RPAREN (mkMtTok yytext yypos (!yylineno)) );
245<INITIAL>\; =>
246  ( SEMICOLON (mkMtTok yytext yypos (!yylineno)) );
247<INITIAL>{id} =>
248  ( mkKeyword yytext yypos (!yylineno) );
249<INITIAL>({id}\.)+{id} =>
250  ( mkQualId yytext yypos (!yylineno) );
251<INITIAL>. =>
252  ( lexError "ill-formed token" yytext yypos (!yylineno) );
253
254<Comment>"(*" =>
255  ( (lex (commentDepth + 1) ()) );
256<Comment>"*)" =>
257  ( (if commentDepth = 1 then YYBEGIN INITIAL else ()); lex (commentDepth - 1) ());
258<Comment>[^*()]+ =>
259  ( continue () );
260<Comment>. =>
261  ( continue () );
262
263