1structure Holdep_tokens :> Holdep_tokens =
2struct
3
4infix |>
5fun x |> f = f x
6
7exception LEX_ERROR of string
8type result = (string,int) Binarymap.dict
9
10datatype char_reader = CR of {reader : unit -> string,
11                              current : char option,
12                              maxpos : int,
13                              buffer : string,
14                              closer : unit -> unit,
15                              pos : int}
16(* invariants:
17    * buffer = "" ��� current = NONE ��� maxpos = 0 ��� pos = 0
18    * 0 ��� pos ��� maxpos
19*)
20
21fun current (CR {current = c, ...}) = c
22fun make inp close = let
23  val newbuf = inp()
24in
25  if newbuf = "" then CR {pos = 0, maxpos = 0, buffer = newbuf,
26                          reader = inp, current = NONE,
27                          closer = close}
28  else CR {pos = 0, maxpos = size newbuf - 1, buffer = newbuf,
29           reader = inp, current = SOME(String.sub(newbuf, 0)),
30           closer = close}
31end
32fun fromFile f = let
33  val is = TextIO.openIn f
34in
35  make (fn () => TextIO.input is) (fn () => TextIO.closeIn is)
36end
37fun fromStream is = make (fn () => TextIO.input is) (fn () => ())
38fun fromReader uc = make (fn () => case uc() of NONE => "" | SOME c => str c)
39                         (fn () => ())
40fun closeCR (CR {closer,...}) = closer()
41
42fun advance (c as CR {pos, buffer, maxpos, reader, current, closer}) =
43    if pos < maxpos then
44      CR { pos = pos + 1, buffer = buffer, reader = reader,
45           current = SOME(String.sub(buffer, pos + 1)),
46           maxpos = maxpos, closer = closer }
47    else if buffer = "" then c
48    else make reader closer
49
50datatype SCR = SCR of {linenum : int,
51                       filename : string,
52                       colnum : int,
53                       ids : (string,int) Binarymap.dict,
54                       cr : char_reader}
55
56
57fun SCRfromNamedCR (name, cr) =
58  SCR { linenum = 1, colnum = 0, filename = name,
59        ids = Binarymap.mkDict String.compare, cr = cr }
60
61fun makeSCR fname = SCRfromNamedCR (fname, fromFile fname)
62fun SCRfromStream (name, is) = SCRfromNamedCR (name, fromStream is)
63fun SCRfromReader (name, uc) = SCRfromNamedCR (name, fromReader uc)
64
65fun currentChar (SCR{cr,...}) = current cr
66fun closeSCR (SCR{cr,...}) = closeCR cr
67fun getIDs (SCR{ids,...}) = ids
68fun inc (SCR {linenum, filename, colnum, ids, cr}) =
69    SCR{linenum = linenum, filename = filename, colnum = colnum + 1,
70        ids = ids, cr = advance cr}
71fun newline (SCR{linenum, filename, colnum, ids, cr}) =
72    SCR{linenum = linenum + 1, filename = filename, colnum = 0,
73        ids = ids, cr = advance cr}
74fun completeID s (SCR{linenum, filename, colnum, ids, cr}) =
75    SCR{linenum = linenum, filename = filename, colnum = colnum,
76        ids = case Binarymap.peek(ids,s) of
77                  NONE => Binarymap.insert(ids, s, linenum)
78                | SOME _ => ids,
79        cr = cr}
80
81fun mem x [] = false
82  | mem x (y::ys) = x = y orelse mem x ys
83val SMLsyms = String.explode "!%&$+/:<=>?@~|#*\\-~^"
84val symbools = List.tabulate(256, fn i => mem (Char.chr i) SMLsyms)
85val symb_vec = Vector.fromList symbools
86fun isSMLSym c = Vector.sub(symb_vec, Char.ord c)
87
88fun isSMLAlphaCont c =
89    Char.isAlphaNum c orelse c = #"_" orelse c = #"'"
90
91fun Error(SCR {filename,colnum,linenum,...}, msg) =
92    raise LEX_ERROR (filename^" "^Int.toString linenum ^ "." ^
93                     Int.toString colnum ^ " " ^ msg)
94
95
96
97fun clean_open scr =
98    case currentChar scr of
99        NONE => scr
100      | SOME #"d" => openTermPFX "datatype" 1 (inc scr)
101      | SOME #"e" => opene (inc scr) (* exception, end *)
102      | SOME #"f" => openf (inc scr) (* fun, functor *)
103      | SOME #"i" => openi (inc scr) (* in, infixl, infix, infixr *)
104      | SOME #"l" => openTermPFX "local" 1 (inc scr)
105      | SOME #"n" => openTermPFX "nonfix" 1 (inc scr)
106      | SOME #"o" => modTermPFX true "open" 1 (clean_open, clean_open) (inc scr)
107      | SOME #"p" => openTermPFX "prim_val" 1 (inc scr)
108      | SOME #"s" => opens (inc scr) (* structure, signature *)
109      | SOME #"t" => openTermPFX "type" 1 (inc scr)
110      | SOME #"v" => openTermPFX "val" 1 (inc scr)
111      | SOME #";" => clean_initial (inc scr)
112      | SOME #"(" => openLPAREN (inc scr)
113      | SOME #"#" => openHASH (inc scr)
114      | SOME #"\n" => clean_open (newline scr)
115      | SOME c => if Char.isSpace c then clean_open (inc scr)
116                  else if Char.isAlpha c then
117                    modAlphaID true clean_open ("", [c]) (scr |> inc)
118                  else if isSMLSym c then
119                    modSymID true clean_open [c] (scr |> inc)
120                  else Error(scr, "Bad character >"^str c^"< after open")
121and openTermPFX kstr c scr =
122    modTermPFX true kstr c (clean_initial, clean_open) scr
123and openOnKWord kstr scr = OnKWord true kstr (clean_initial, clean_open) scr
124and opene scr =
125    case currentChar scr of
126        NONE => scr |> completeID "e"
127      | SOME #"n" => openTermPFX "end" 2 (inc scr)
128      | SOME #"x" => openTermPFX "exception" 2 (inc scr)
129      | SOME c => extend_openAlpha "e" c scr
130and openf scr =
131    case currentChar scr of
132        NONE => scr |> completeID "f"
133      | SOME #"u" => openfu (inc scr)
134      | SOME c => extend_openAlpha "f" c scr
135and openfu scr =
136    case currentChar scr of
137        NONE => scr |> completeID "fu"
138      | SOME #"n" => openfun (inc scr)
139      | SOME c => extend_openAlpha "fu" c scr
140and openfun scr =
141    case currentChar scr of
142        NONE => scr
143      | SOME #"c" => openTermPFX "functor" 4 (inc scr)
144      | SOME c => openOnKWord "fun" scr
145and openi scr =
146    case currentChar scr of
147        NONE => scr |> completeID "i"
148      | SOME #"n" => openin (inc scr)
149      | SOME c => extend_openAlpha "i" c scr
150and openin scr =
151    case currentChar scr of
152        NONE => Error(scr, "Don't expect to see 'in'-EOF")
153      | SOME #"f" => scr |> inc |> openinf
154      | SOME c => openOnKWord "in" scr
155and openinf scr =
156    case currentChar scr of
157        NONE => scr |> completeID "inf"
158      | SOME #"i" => scr |> inc |> openinfi
159      | SOME c => extend_openAlpha "inf" c scr
160and openinfi scr =
161    case currentChar scr of
162        NONE => scr |> completeID "infi"
163      | SOME #"x" => scr |> inc |> openinfix
164      | SOME c => extend_openAlpha "infi" c scr
165and openinfix scr =
166    case currentChar scr of
167        NONE => Error(scr, "Don't expect to see 'infix'-EOF")
168      | SOME #"l" => openOnKWord "infixl" (inc scr)
169      | SOME #"r" => openOnKWord "infixr" (inc scr)
170      | SOME c => openOnKWord "infix" scr
171and opens scr =
172    case currentChar scr of
173        NONE => scr |> completeID "s"
174      | SOME #"i" => openTermPFX "signature" 2 (inc scr)
175      | SOME #"t" => openTermPFX "structure" 2 (inc scr)
176      | SOME c => extend_openAlpha "s" c scr
177and extend_openAlpha pfx c scr =
178    if Char.isSpace c then clean_open (scr |> inc |> completeID pfx)
179    else if isSMLAlphaCont c then
180      modAlphaID true clean_open (pfx, [c]) (scr |> inc)
181    else if isSMLSym c then
182      modSymID true clean_open [c] (scr |> inc |> completeID pfx)
183    else Error(scr, "Bad character >"^str c^"< after 'open'")
184and openLPAREN scr =
185    case currentChar scr of
186        NONE => Error(scr, "Don't expect to see '('-EOF")
187      | SOME #"*" => COMMENT clean_open (inc scr)
188      | SOME c => Error(scr, "Don't expect to see '("^str c^"' after 'open'")
189and openHASH scr =
190    case currentChar scr of
191        NONE => Error(scr, "Don't expect to see 'open'-'#'")
192      | SOME c => if isSMLSym c then
193                    modSymID true clean_open [c,#"#"] (scr |> inc)
194                  else Error(scr, "Don't expect to see 'open'-'#'")
195and openQID0 scr = (* seen the dot *)
196    case currentChar scr of
197        NONE => Error(scr, "'.'-EOF unexpected")
198      | SOME c => if Char.isAlpha c then openQIDalpha (inc scr)
199                  else if isSMLSym c then openQIDsym (inc scr)
200                  else Error(scr, "'."^str c^" unexpected")
201and openQIDalpha scr =
202    case currentChar scr of
203        NONE => scr
204      | SOME #"." => openQID0 (inc scr)
205      | SOME c => if isSMLAlphaCont c then openQIDalpha (inc scr)
206                  else clean_open scr
207and openQIDsym scr =
208    case currentChar scr of
209        NONE => scr
210      | SOME #"." => openQID0 (inc scr)
211      | SOME c => if isSMLSym c then openQIDsym (inc scr)
212                  else clean_open scr
213and modTermPFX dotok kword numseen (seenk, notk) scr = let
214  fun get() = String.extract(kword, 0, SOME numseen)
215in
216  case currentChar scr of
217      NONE => notk (scr |> completeID (get()))
218    | SOME c => if c = String.sub(kword, numseen) then
219                  if numseen + 1 = size kword then
220                    OnKWord dotok kword (seenk,notk) (inc scr)
221                  else
222                    modTermPFX dotok kword (numseen + 1) (seenk,notk) (inc scr)
223                else if isSMLAlphaCont c then
224                  modAlphaID dotok notk (get(), [c]) (inc scr)
225                else notk (scr |> completeID (get()))
226end
227and OnKWord dotok kword (seenk,notseenk) scr =
228    case currentChar scr of
229        NONE => seenk scr
230      | SOME c => if isSMLAlphaCont c then
231                    modAlphaID dotok notseenk (kword, [c]) (inc scr)
232                  else seenk scr
233and modAlphaID dotok k (base,cs) scr =
234    case currentChar scr of
235        NONE => scr |> completeID (base ^ implode (List.rev cs))
236      | SOME #"." =>
237          if dotok then
238            openQID0 (scr |> inc |> completeID (base ^ implode (List.rev cs)))
239          else Error(scr, "Didn't expect to see qualified ident here")
240      | SOME c =>
241          if isSMLAlphaCont c then modAlphaID dotok k (base,c::cs) (inc scr)
242          else k (scr |> completeID (base ^ implode (List.rev cs)))
243and modSymID dotok k cs scr =
244    case currentChar scr of
245        NONE => scr |> completeID (implode (List.rev cs))
246      | SOME #"." =>
247          if dotok then
248            openQID0 (scr |> inc |> completeID (implode (List.rev cs)))
249          else Error(scr, "Didn't expect to see qualified ident here")
250      | SOME c => if isSMLSym c then modSymID dotok k (c::cs) (inc scr)
251                  else k (scr |> completeID (implode (List.rev cs)))
252and includeLPAR scr =
253    case currentChar scr of
254        NONE => Error(scr, "Don't expect 'include'-'('-EOF")
255      | SOME #"*" => COMMENT clean_include (inc scr)
256      | SOME c => Error(scr, "Don't expect 'include'-'('-'"^str c^"'")
257and clean_include scr =
258    case currentChar scr of
259        NONE => scr
260      | SOME #"d" =>
261          modTermPFX false "datatype" 1 (clean_initial, clean_include) (inc scr)
262      | SOME #"e" => includee (inc scr)
263      | SOME #"s" =>
264          modTermPFX false "structure" 1 (clean_initial, clean_include) (inc scr)
265      | SOME #"t" =>
266          modTermPFX false "type" 1 (clean_initial, clean_include) (inc scr)
267      | SOME #"v" =>
268          modTermPFX false "val" 1 (clean_initial, clean_include) (inc scr)
269      | SOME #"w" =>
270          modTermPFX false "where" 1 (clean_initial, clean_include) (inc scr)
271      | SOME #"\n" => clean_include (newline scr)
272      | SOME #"(" => includeLPAR (inc scr)
273      | SOME #";" => clean_initial (inc scr)
274      | SOME c => if Char.isSpace c then clean_include (inc scr)
275                  else if Char.isAlpha c then
276                    modAlphaID false clean_include ("", [c]) (inc scr)
277                  else if isSMLSym c then
278                    modSymID false clean_include [c] (inc scr)
279                  else Error(scr, "Bad character >"^str c^"< after 'include'")
280and includee scr =
281    case currentChar scr of
282        NONE => scr |> completeID "e"
283      | SOME #"\n" => clean_include (scr |> newline |> completeID "e")
284      | SOME #"n" =>
285          modTermPFX false "end" 2 (clean_initial, clean_include) (inc scr)
286      | SOME #"x" =>
287          modTermPFX false "exception" 2 (clean_initial, clean_include) (inc scr)
288      | SOME c => if Char.isSpace c then clean_include (scr |> inc |> completeID "e")
289                  else if isSMLSym c then
290                    modSymID false clean_include [c] (scr |> inc |> completeID "e")
291                  else if isSMLAlphaCont c then
292                    modAlphaID false clean_include ("e", [c]) (inc scr)
293                  else Error(scr, "Bad character >"^str c^"< after 'include'")
294and clean_initial scr =
295    case currentChar scr of
296        NONE => scr
297      | SOME #"i" => initialAlphaKWordPFX "include" 1 clean_include (inc scr)
298      | SOME #"o" => initialAlphaKWordPFX "open" 1 clean_open (inc scr)
299      | SOME #"\n" => clean_initial (newline scr)
300      | SOME #"(" => initialLPAREN (inc scr)
301      | SOME #"\"" => STRING (inc scr)
302      | SOME c =>
303        if Char.isSpace c then clean_initial (inc scr)
304        else if Char.isAlpha c then initialAlphaID("", [c]) (inc scr)
305        else if isSMLSym c then initialSymID("", [c]) (inc scr)
306        else clean_initial (inc scr)
307and initialAlphaID (pfx, cs) scr =
308    case currentChar scr of
309        NONE => scr
310      | SOME #"." =>
311          initialQID0 (scr |> inc
312                           |> completeID (pfx ^ String.implode (List.rev cs)))
313      | SOME #"\n" => clean_initial (newline scr)
314      | SOME c => if isSMLAlphaCont c then initialAlphaID (pfx, c::cs) (inc scr)
315                  else clean_initial scr
316and initialSymID (pfx, cs) scr =
317    case currentChar scr of
318        NONE => scr
319      | SOME #"." => initialQID0 (scr |> inc |> completeID (pfx ^ String.implode (List.rev cs)))
320      | SOME #"\n" => clean_initial (newline scr)
321      | SOME c => if isSMLSym c then initialSymID (pfx, c::cs) (inc scr)
322                  else clean_initial scr
323and initialLPAREN scr =
324    case currentChar scr of
325        NONE => Error(scr, "'('-EOF unexpected")
326      | SOME #"*" => COMMENT clean_initial (inc scr)
327      | SOME #"(" => initialLPAREN (inc scr)
328      | _ => clean_initial scr
329and COMMENT k scr =
330    case currentChar scr of
331        NONE => Error(scr, "Unterminated comment")
332      | SOME #"(" => COMMENTlpar k (inc scr)
333      | SOME #"*" => COMMENTast k (inc scr)
334      | SOME #"\n" => COMMENT k (newline scr)
335      | _ => COMMENT k (inc scr)
336and COMMENTlpar k scr =
337    case currentChar scr of
338        NONE => Error(scr, "Unterminated comment")
339      | SOME #"(" => COMMENTlpar k (inc scr)
340      | SOME #"*" => COMMENT (COMMENT k) (inc scr)
341      | SOME #"\n" => COMMENT k (newline scr)
342      | _ => COMMENT k (inc scr)
343and COMMENTast k scr =
344    case currentChar scr of
345        NONE => Error(scr, "Unterminated comment")
346      | SOME #"*" => COMMENTast k (inc scr)
347      | SOME #")" => k (inc scr)
348      | SOME #"\n" => COMMENT k (newline scr)
349      | _ => COMMENT k (inc scr)
350and initialQID0 scr = (* have just seen '.' *)
351    case currentChar scr of
352        NONE => Error(scr, "'.'-EOF unexpected")
353      | SOME c => if Char.isSpace c then Error(scr, "'.'-whitespace unexpected")
354                  else if Char.isAlpha c then
355                    initialAlphaQID (inc scr)
356                  else if isSMLSym c then
357                    initialSymQID (inc scr)
358                  else Error(scr, "Bad character >"^str c^"< after qualifying '.'")
359and initialAlphaQID scr = (* have seen some characters of ID *)
360    case currentChar scr of
361        NONE => scr
362      | SOME #"." => initialQID0 (inc scr)
363      | SOME c => if isSMLAlphaCont c then initialAlphaQID (inc scr)
364                  else clean_initial scr
365and initialSymQID scr = (* have seen some characters of ID *)
366    case currentChar scr of
367        NONE => scr
368      | SOME #"." => initialQID0 (inc scr)
369      | SOME c => if isSMLSym c then initialSymQID (inc scr)
370                  else clean_initial scr
371and STRING scr = (* have seen the quote *)
372    case currentChar scr of
373        NONE => Error(scr, "Unterminated string literal")
374      | SOME #"\"" => clean_initial (inc scr)
375      | SOME #"\\" => STRINGslash (inc scr)
376      | SOME #"\n" => Error(scr, "Unescaped newline in string literal")
377      | _ => STRING (inc scr)
378and STRINGcaret scr =
379    case currentChar scr of
380        NONE => Error(scr, "Unterminated string literal")
381      | SOME c => let val i = Char.ord c
382                  in
383                    if i < 64 orelse i >= 96 then
384                      Error(scr, "Illegal caret-escape in string literal")
385                    else STRING (inc scr)
386                  end
387and STRINGslash scr =
388    case currentChar scr of
389        NONE => Error(scr, "Unterminated string literal")
390      | SOME #"\n" => STRINGelidews (newline scr)
391      | SOME #"\\" => STRING (inc scr)
392      | SOME #"\"" => STRING (inc scr)
393      | SOME #"^" => STRINGcaret (inc scr)
394      | SOME #"a" => STRING (inc scr)
395      | SOME #"b" => STRING (inc scr)
396      | SOME #"f" => STRING (inc scr)
397      | SOME #"n" => STRING (inc scr)
398      | SOME #"r" => STRING (inc scr)
399      | SOME #"t" => STRING (inc scr)
400      | SOME #"v" => STRING (inc scr)
401      | SOME c => if Char.isDigit c then
402                    STRINGslashdigit 1 (inc scr)
403                  else if Char.isSpace c then
404                    STRINGelidews (inc scr)
405                  else Error(scr, "Illegal backslash escape >" ^ str c ^
406                                  "< in string literal")
407and STRINGelidews scr =
408    case currentChar scr of
409        NONE => Error(scr, "Unterminated string literal")
410      | SOME #"\\" => STRING (inc scr)
411      | SOME #"\n" => STRINGelidews (newline scr)
412      | SOME c => if Char.isSpace c then STRINGelidews (inc scr)
413                  else Error(scr, "Illegal char >" ^ str c ^
414                                  "< in string \\...\\ elision")
415and STRINGslashdigit cnt scr =
416    case currentChar scr of
417        NONE => Error(scr, "Unterminated string literal")
418      | SOME c => if Char.isDigit c then
419                    if cnt = 2 then STRING (inc scr)
420                    else STRINGslashdigit (cnt + 1) (inc scr)
421                  else Error(scr, "Illegal backslash escape in string literal")
422and initialAlphaKWordPFX kword numseen k scr =
423    case currentChar scr of
424        NONE => scr
425      | SOME c =>
426        if c = String.sub(kword, numseen) then
427          if numseen + 1 = size kword then
428            initialAlphaKWord kword k (inc scr)
429          else
430            initialAlphaKWordPFX kword (numseen + 1) k (inc scr)
431        else if isSMLAlphaCont c then
432          initialAlphaID (String.extract(kword, 0, SOME numseen), [c])
433                         (inc scr)
434        else clean_initial scr
435and initialAlphaKWord kword k scr =
436    case currentChar scr of
437        NONE => k scr
438      | SOME c => if isSMLAlphaCont c then
439                    initialAlphaID (kword, [c]) (inc scr)
440                  else k scr
441
442fun scrdeps scr =
443    getIDs (clean_initial scr) before
444    closeSCR scr
445
446fun file_deps fname = scrdeps (makeSCR fname)
447fun stream_deps p = scrdeps (SCRfromStream p)
448fun reader_deps p = scrdeps (SCRfromReader p)
449
450end (* struct *)
451