1(*  Title:      HOL/TPTP/TPTP_Parser/tptp.lex
2    Author:     Nik Sultana, Cambridge University Computer Laboratory
3
4 Notes:
5 * Omit %full in definitions to restrict alphabet to ascii.
6 * Could include %posarg to ensure that we'd start counting character positions
7   from 0, but it would punish performance.
8 * %s AF F COMMENT; -- could improve by making stateful.
9
10 Acknowledgements:
11 * Geoff Sutcliffe for help with TPTP.
12 * Timothy Bourke for his tips on getting ML-Yacc working with Poly/ML.
13 * An early version of this was ported from the specification shipped with
14   Leo-II, written by Frank Theiss.
15 * Some boilerplate bits were taken from the ml-yacc/ml-lex manual by Roger Price.
16 * Jasmin Blanchette and Makarius Wenzel for help with Isabelle integration.
17*)
18
19structure T = Tokens
20type pos = int             (* Position in file *)
21type lineNo = int
22type svalue = T.svalue
23type ('a,'b) token = ('a,'b) T.token
24type lexresult = (svalue,pos) token
25type lexarg = string
26type arg = lexarg
27val col = ref 0;
28val linep = ref 1;         (* Line pointer *)
29val eolpos = ref 0;
30
31val badCh : string * string * int * int -> unit = fn
32    (file_name, bad, line, col) =>
33    TextIO.output(TextIO.stdOut, file_name ^ "["
34          ^ Int.toString line ^ "." ^ Int.toString col
35          ^ "] Invalid character \"" ^ bad ^ "\"\n");
36
37val eof = fn file_name =>
38  let
39    val result = T.EOF (!linep,!col);
40    val _ = linep := 0;
41  in result end
42(*here could check whether file ended prematurely:
43 see if have open brackets, or if we're in some state other than INITIAL*)
44
45val count_commentlines : string -> unit = fn str =>
46  let
47    val str' = String.explode str
48    val newlines = List.filter (fn x => x = #"\n") str'
49  in linep := (!linep) + (List.length newlines) end
50
51%%
52%header (functor TPTPLexFun(structure Tokens: TPTP_TOKENS));
53%arg (file_name:string);
54
55percentage_sign           = "%";
56double_quote              = ["];
57do_char                   = ([^"\\]|[\\]["\\]);
58single_quote              = ['];
59sq_char                   = ([^'\\]|[\\]['\\]);
60sign                      = [+-];
61dot                       = [.];
62exponent                  = [Ee];
63slash                     = [/];
64zero_numeric              = [0];
65non_zero_numeric          = [1-9];
66numeric                   = [0-9];
67lower_alpha               = [a-z];
68upper_alpha               = [A-Z];
69alpha_numeric             = ({lower_alpha}|{upper_alpha}|{numeric}|_);
70dollar                    = \$;
71printable_char            = .;
72viewable_char             = [.\n];
73
74dot_decimal               = {dot}{numeric}+;
75
76ddollar                   = \$\$;
77unsigned_integer          = {numeric}+;
78divide                    = [/];
79
80signed_integer            = {sign}{unsigned_integer};
81exp_suffix                = {exponent}({signed_integer}|{unsigned_integer});
82real                      = ({signed_integer}|{unsigned_integer}){dot_decimal}{exp_suffix}?;
83rational                  = ({signed_integer}|{unsigned_integer}){divide}{unsigned_integer};
84
85lower_word                = {lower_alpha}{alpha_numeric}*;
86upper_word                = {upper_alpha}{alpha_numeric}*;
87dollar_dollar_word        = {ddollar}{lower_word};
88dollar_word               = {dollar}{lower_word};
89dollar_underscore         = {dollar}_;
90
91distinct_object           = {double_quote}{do_char}+{double_quote};
92
93ws                        = ([\ ]|[\t]);
94single_quoted             = {single_quote}{sq_char}+{single_quote};
95
96system_comment_one        = [%][\ ]*{ddollar}[_]*;
97system_comment_multi      = [/][\*][\ ]*(ddollar)([^\*]*[\*][\*]*[^/\*])*[^\*]*[\*][\*]*[/];
98system_comment            = (system_comment_one)|(system_comment_multi);
99
100comment_line              = {percentage_sign}[^\n]*;
101comment_block             = [/][\*]([^\*]*[\*]+[^/\*])*[^\*]*[\*]+[/];
102comment                   = ({comment_line}|{comment_block})+;
103
104eol                       = ("\013\010"|"\010"|"\013");
105
106%%
107
108{ws}*           => (col:=(!col)+size yytext; continue () );
109
110{eol}           => (linep:=(!linep)+1;
111                   eolpos:=yypos+size yytext; continue ());
112
113"&"             => (col:=yypos-(!eolpos); T.AMPERSAND(!linep,!col));
114
115"@+"            => (col:=yypos-(!eolpos); T.INDEF_CHOICE(!linep,!col));
116"@-"            => (col:=yypos-(!eolpos); T.DEFIN_CHOICE(!linep,!col));
117
118"!!"            => (col:=yypos-(!eolpos); T.OPERATOR_FORALL(!linep,!col));
119"??"            => (col:=yypos-(!eolpos); T.OPERATOR_EXISTS(!linep,!col));
120
121"@"             => (col:=yypos-(!eolpos); T.AT_SIGN(!linep,!col));
122"^"             => (col:=yypos-(!eolpos); T.CARET(!linep,!col));
123
124":"             => (col:=yypos-(!eolpos); T.COLON(!linep,!col));
125","             => (col:=yypos-(!eolpos); T.COMMA(!linep,!col));
126"="             => (col:=yypos-(!eolpos); T.EQUALS(!linep,!col));
127"!"             => (col:=yypos-(!eolpos); T.EXCLAMATION(!linep,!col));
128":="            => (col:=yypos-(!eolpos); T.LET(!linep,!col));
129">"             => (col:=yypos-(!eolpos); T.ARROW(!linep,!col));
130
131"<="            => (col:=yypos-(!eolpos); T.FI(!linep,!col));
132"<=>"           => (col:=yypos-(!eolpos); T.IFF(!linep,!col));
133"=>"            => (col:=yypos-(!eolpos); T.IMPLIES(!linep,!col));
134"["             => (col:=yypos-(!eolpos); T.LBRKT(!linep,!col));
135"("             => (col:=yypos-(!eolpos); T.LPAREN(!linep,!col));
136"->"            => (col:=yypos-(!eolpos); T.MAP_TO(!linep,!col));
137"--"            => (col:=yypos-(!eolpos); T.MMINUS(!linep,!col));
138"~&"            => (col:=yypos-(!eolpos); T.NAND(!linep,!col));
139"!="            => (col:=yypos-(!eolpos); T.NEQUALS(!linep,!col));
140"<~>"           => (col:=yypos-(!eolpos); T.XOR(!linep,!col));
141"~|"            => (col:=yypos-(!eolpos); T.NOR(!linep,!col));
142"."             => (col:=yypos-(!eolpos); T.PERIOD(!linep,!col));
143"++"            => (col:=yypos-(!eolpos); T.PPLUS(!linep,!col));
144"?"             => (col:=yypos-(!eolpos); T.QUESTION(!linep,!col));
145"]"             => (col:=yypos-(!eolpos); T.RBRKT(!linep,!col));
146")"             => (col:=yypos-(!eolpos); T.RPAREN(!linep,!col));
147"~"             => (col:=yypos-(!eolpos); T.TILDE(!linep,!col));
148"|"             => (col:=yypos-(!eolpos); T.VLINE(!linep,!col));
149
150{distinct_object}    => (col:=yypos-(!eolpos); T.DISTINCT_OBJECT(yytext,!linep,!col));
151{rational}           => (col:=yypos-(!eolpos); T.RATIONAL(yytext,!linep,!col));
152{real}               => (col:=yypos-(!eolpos); T.REAL(yytext,!linep,!col));
153{signed_integer}     => (col:=yypos-(!eolpos); T.SIGNED_INTEGER(yytext,!linep,!col));
154{unsigned_integer}   => (col:=yypos-(!eolpos); T.UNSIGNED_INTEGER(yytext,!linep,!col));
155{dot_decimal}        => (col:=yypos-(!eolpos); T.DOT_DECIMAL(yytext,!linep,!col));
156{single_quoted}      => (col:=yypos-(!eolpos); T.SINGLE_QUOTED(yytext,!linep,!col));
157{upper_word}         => (col:=yypos-(!eolpos); T.UPPER_WORD(yytext,!linep,!col));
158{comment}            => (col:=yypos-(!eolpos); count_commentlines yytext;T.COMMENT(yytext,!linep,!col));
159
160"thf"          => (col:=yypos-(!eolpos); T.THF(!linep,!col));
161"fof"          => (col:=yypos-(!eolpos); T.FOF(!linep,!col));
162"cnf"          => (col:=yypos-(!eolpos); T.CNF(!linep,!col));
163"tff"          => (col:=yypos-(!eolpos); T.TFF(!linep,!col));
164"include"      => (col:=yypos-(!eolpos); T.INCLUDE(!linep,!col));
165
166"$thf"          => (col:=yypos-(!eolpos); T.DTHF(!linep,!col));
167"$fof"          => (col:=yypos-(!eolpos); T.DFOF(!linep,!col));
168"$cnf"          => (col:=yypos-(!eolpos); T.DCNF(!linep,!col));
169"$fot"          => (col:=yypos-(!eolpos); T.DFOT(!linep,!col));
170"$tff"          => (col:=yypos-(!eolpos); T.DTFF(!linep,!col));
171
172"$ite_f"        => (col:=yypos-(!eolpos); T.ITE_F(!linep,!col));
173"$ite_t"        => (col:=yypos-(!eolpos); T.ITE_T(!linep,!col));
174"$let_tf"        => (col:=yypos-(!eolpos); T.LET_TF(!linep,!col));
175"$let_ff"        => (col:=yypos-(!eolpos); T.LET_FF(!linep,!col));
176"$let_ft"        => (col:=yypos-(!eolpos); T.LET_FT(!linep,!col));
177"$let_tt"        => (col:=yypos-(!eolpos); T.LET_TT(!linep,!col));
178
179{lower_word}   => (col:=yypos-(!eolpos); T.LOWER_WORD(yytext,!linep,!col));
180{dollar_word}  => (col:=yypos-(!eolpos); T.DOLLAR_WORD(yytext,!linep,!col));
181{dollar_underscore}  => (col:=yypos-(!eolpos); T.DOLLAR_WORD(yytext,!linep,!col));
182{dollar_dollar_word}  => (col:=yypos-(!eolpos); T.DOLLAR_DOLLAR_WORD(yytext,!linep,!col));
183
184"+"           => (col:=yypos-(!eolpos); T.PLUS(!linep,!col));
185"*"           => (col:=yypos-(!eolpos); T.TIMES(!linep,!col));
186"-->"         => (col:=yypos-(!eolpos); T.GENTZEN_ARROW(!linep,!col));
187"<<"          => (col:=yypos-(!eolpos); T.SUBTYPE(!linep,!col));
188"!>"          => (col:=yypos-(!eolpos); T.DEP_PROD(!linep,!col));
189"?*"          => (col:=yypos-(!eolpos); T.DEP_SUM(!linep,!col));
190
191":-"          => (col:=yypos-(!eolpos); T.LET_TERM(!linep,!col));
192