1:- module(rdtok). % SEPIA header 2 3:- local display/1. 4display(X) :- write(error, X). 5ttyput(X) :- put(error, X). 6ttynl :- nl(error). 7get0(X) :- get(X). 8 9% File : RDTOK.PL 10% Author : R.A.O'Keefe 11% Updated: 5 July 1984 12% Purpose: Tokeniser in reasonably standard Prolog. 13 14/* This tokeniser is meant to complement the library READ routine. 15 It recognises Dec-10 Prolog with the following exceptions: 16 17 %( is not accepted as an alternative to { 18 19 %) is not accepted as an alternative to ) 20 21 NOLC convention is not supported (read_name could be made to do it) 22 23 ,.. is not accepted as an alternative to | (hooray!) 24 25 large integers are not read in as xwd(Top18Bits,Bottom18Bits) 26 27 After a comma, "(" is read as ' (' rather than '('. This does the 28 parser no harm at all, and the Dec-10 tokeniser's behaviour here 29 doesn't actually buy you anything. This tokeniser guarantees never 30 to return '(' except immediately after an atom, yielding ' (' every 31 other where. 32 33 In particular, radix notation is EXACTLY as in Dec-10 Prolog version 3.53. 34 Some times might be of interest. Applied to an earlier version of this file: 35 this code took 1.66 seconds 36 the Dec-10 tokeniser took 1.28 seconds 37 A Pascal version took 0.96 seconds 38 The Dec-10 tokeniser was called via the old RDTOK interface, with 39 which this file is compatible. One reason for the difference in 40 speed is the way variables are looked up: this code uses a linear 41 list, while the Dec-10 tokeniser uses some sort of tree. The Pascal 42 version is the program WLIST which lists "words" and their frequencies. 43 It uses a hash table. Another difference is the way characters are 44 classified: the Dec-10 tokeniser and WLIST have a table which maps 45 ASCII codes to character classes, and don't do all this comparison 46 and and memberchking. We could do that without leaving standard Prolog, 47 but what do you want from one evening's work? 48*/ 49 50:- export 51 read_tokens/2. 52 53:- local 54 read_string/3, 55 read_string/4. 56 57:- mode 58 read_after_atom(+, ?, -), 59 read_digits(+, +, +, -, -), 60 read_fullstop(+, ?, -), 61 read_integer(+, -, -), 62 read_lookup(?, +), 63 read_name(+, -, -), 64 read_solidus(+, ?, -), 65 read_solidus(+, -), 66 read_string(-, +, -), 67 read_string(+, -, +, -), 68 more_string(+, +, -, -), 69 read_symbol(+, -, -), 70 read_tokens(?, ?), 71 read_tokens(+, ?, -). 72 73 74 75% read_tokens(TokenList, Dictionary) 76% returns a list of tokens. It is needed to "prime" read_tokens/2 77% with the initial blank, and to check for end of file. The 78% Dictionary is a list of AtomName=Variable pairs in no particular order. 79% The way end of file is handled is that everything else FAILS when it 80% hits character "26", sometimes printing a warning. It might have been 81% an idea to return the atom 'end_of_file' instead of the same token list 82% that you'd have got from reading "end_of_file. ", but (1) this file is 83% for compatibility, and (b) there are good practical reasons for wanting 84% this behaviour. 85 86read_tokens(TokenList, Dictionary) :- 87 read_tokens(32, Dict, ListOfTokens), 88 append(Dict, [], Dict), !, % fill in the "hole" at the end 89 Dictionary = Dict, % unify explicitly so we'll read and 90 TokenList = ListOfTokens. % then check even with filled in arguments 91read_tokens([atom(end_of_file)], []). % End Of File is all that can go wrong 92 93 94 95read_tokens(26, _, _) :- !, % 26 is the end-of-file character 96 fail. % in every standard Prolog 97read_tokens(Ch, Dict, Tokens) :- 98 Ch =< 32, % ignore layout. CR, LF, and the 99 !, % Dec-10 newline character (31) 100 get0(NextCh), % are all skipped here. 101 read_tokens(NextCh, Dict, Tokens). 102read_tokens(37, Dict, Tokens) :- !, % %comment 103 repeat, % skip characters to a line 104 get0(Ch), % terminator (should we be 105 ( Ch = 31 ; Ch = 26 ), % more thorough, e.g. ^L?) 106 !, % stop when we find one 107 Ch =\= 26, % fail on EOF 108 get0(NextCh), 109 read_tokens(NextCh, Dict, Tokens). 110read_tokens(47, Dict, Tokens) :- !, % /*comment? 111 get0(NextCh), 112 read_solidus(NextCh, Dict, Tokens). 113read_tokens(33, Dict, [atom(!)|Tokens]) :- !, % This is a special case so 114 get0(NextCh), % that !. reads as two tokens. 115 read_after_atom(NextCh, Dict, Tokens). % It could be cleverer. 116read_tokens(40, Dict, [' ('|Tokens]) :- !, % NB!!! 117 get0(NextCh), 118 read_tokens(NextCh, Dict, Tokens). 119read_tokens(41, Dict, [')'|Tokens]) :- !, 120 get0(NextCh), 121 read_tokens(NextCh, Dict, Tokens). 122read_tokens(44, Dict, [','|Tokens]) :- !, 123 get0(NextCh), 124 read_tokens(NextCh, Dict, Tokens). 125read_tokens(59, Dict, [atom((;))|Tokens]) :- !, % ; is nearly a punctuation 126 get0(NextCh), % mark but not quite (e.g. 127 read_tokens(NextCh, Dict, Tokens). % you can :-op declare it). 128read_tokens(91, Dict, ['['|Tokens]) :- !, 129 get0(NextCh), 130 read_tokens(NextCh, Dict, Tokens). 131read_tokens(93, Dict, [']'|Tokens]) :- !, 132 get0(NextCh), 133 read_tokens(NextCh, Dict, Tokens). 134read_tokens(123, Dict, ['{'|Tokens]) :- !, 135 get0(NextCh), 136 read_tokens(NextCh, Dict, Tokens). 137read_tokens(124, Dict, ['|'|Tokens]) :- !, 138 get0(NextCh), 139 read_tokens(NextCh, Dict, Tokens). 140read_tokens(125, Dict, ['}'|Tokens]) :- !, 141 get0(NextCh), 142 read_tokens(NextCh, Dict, Tokens). 143read_tokens(46, Dict, Tokens) :- !, % full stop 144 get0(NextCh), % or possibly .=. &c 145 read_fullstop(NextCh, Dict, Tokens). 146read_tokens(34, Dict, [string(S)|Tokens]) :- !, % "string" 147 read_string(S, 34, NextCh), 148 read_tokens(NextCh, Dict, Tokens). 149read_tokens(39, Dict, [atom(A)|Tokens]) :- !, % 'atom' 150 read_string(S, 39, NextCh), 151 name(A, S), % BUG: '0' = 0 unlike Dec-10 Prolog 152 read_after_atom(NextCh, Dict, Tokens). 153read_tokens(Ch, Dict, [var(Var,Name)|Tokens]) :- 154 ( Ch = 95 ; Ch >= 65, Ch =< 90 ), % _ or A..Z 155 !, % have to watch out for "_" 156 read_name(Ch, S, NextCh), 157 ( S = [0'_], Name = '_' % anonymous variable 158 ; name(Name, S), % construct name 159 read_lookup(Dict, Name=Var) % lookup/enter in dictionary 160 ), !, 161 read_tokens(NextCh, Dict, Tokens). 162read_tokens(Ch, Dict, [integer(I)|Tokens]) :- 163 Ch >= 48, Ch =< 57, 164 !, 165 read_integer(Ch, I, NextCh), 166 read_tokens(NextCh, Dict, Tokens). 167read_tokens(Ch, Dict, [atom(A)|Tokens]) :- 168 Ch >= 97, Ch =< 122, % a..z 169 !, % no corresponding _ problem 170 read_name(Ch, S, NextCh), 171 name(A, S), 172 read_after_atom(NextCh, Dict, Tokens). 173read_tokens(Ch, Dict, [atom(A)|Tokens]) :- % THIS MUST BE THE LAST CLAUSE 174 get0(AnotherCh), 175 read_symbol(AnotherCh, Chars, NextCh), % might read 0 chars 176 name(A, [Ch|Chars]), % so might be [Ch] 177 read_after_atom(NextCh, Dict, Tokens). 178 179 180 181% The only difference between read_after_atom(Ch, Dict, Tokens) and 182% read_tokens/3 is what they do when Ch is "(". read_after_atom 183% finds the token to be '(', while read_tokens finds the token to be 184% ' ('. This is how the parser can tell whether <atom> <paren> must 185% be an operator application or an ordinary function symbol application. 186% See the library file READ.PL for details. 187 188read_after_atom(40, Dict, ['('|Tokens]) :- !, 189 get0(NextCh), 190 read_tokens(NextCh, Dict, Tokens). 191read_after_atom(Ch, Dict, Tokens) :- 192 read_tokens(Ch, Dict, Tokens). 193 194 195 196 197% read_string(Chars, Quote, NextCh) 198% reads the body of a string delimited by Quote characters. 199% The result is a list of ASCII codes. There are two complications. 200% If we hit the end of the file inside the string this predicate FAILS. 201% It does not return any special structure. That is the only reason 202% it can ever fail. The other complication is that when we find a Quote 203% we have to look ahead one character in case it is doubled. Note that 204% if we find an end-of-file after the quote we *don't* fail, we return 205% a normal string and the end of file character is returned as NextCh. 206% If we were going to accept C-like escape characters, as I think we 207% should, this would need changing (as would the code for 0'x). But 208% the purpose of this module is not to present my ideal syntax but to 209% present something which will read present-day Prolog programs. 210 211read_string(Chars, Quote, NextCh) :- 212 get0(Ch), 213 read_string(Ch, Chars, Quote, NextCh). 214 215 216read_string(26, _, Quote, 26) :- 217 display('! end of file in '), ttyput(Quote), 218 display(token), ttyput(Quote), ttynl, 219 !, fail. 220read_string(Quote, Chars, Quote, NextCh) :- !, 221 get0(Ch), % closing or doubled quote 222 more_string(Ch, Quote, Chars, NextCh). 223read_string(Char, [Char|Chars], Quote, NextCh) :- 224 read_string(Chars, Quote, NextCh). % ordinary character 225 226 227more_string(Quote, Quote, [Quote|Chars], NextCh) :- !, 228 read_string(Chars, Quote, NextCh). % doubled quote 229more_string(NextCh, _, [], NextCh). % end 230 231 232 233% read_solidus(Ch, Dict, Tokens) 234% checks to see whether /Ch is a /* comment or a symbol. If the 235% former, it skips the comment. If the latter it just calls read_symbol. 236% We have to take great care with /* comments to handle end of file 237% inside a comment, which is why read_solidus/2 passes back an end of 238% file character or a (forged) blank that we can give to read_tokens. 239 240 241read_solidus(42, Dict, Tokens) :- !, 242 get0(Ch), 243 read_solidus(Ch, NextCh), 244 read_tokens(NextCh, Dict, Tokens). 245read_solidus(Ch, Dict, [atom(A)|Tokens]) :- 246 read_symbol(Ch, Chars, NextCh), % might read 0 chars 247 name(A, [47|Chars]), 248 read_tokens(NextCh, Dict, Tokens). 249 250read_solidus(26, 26) :- !, 251 display('! end of file in /*comment'), ttynl. 252read_solidus(42, LastCh) :- 253 get0(NextCh), 254 NextCh =\= 47, !, % might be ^Z or * though 255 read_solidus(NextCh, LastCh). 256read_solidus(42, 32) :- !. % the / was skipped in the previous clause 257read_solidus(_, LastCh) :- 258 get0(NextCh), 259 read_solidus(NextCh, LastCh). 260 261 262% read_name(Char, String, LastCh) 263% reads a sequence of letters, digits, and underscores, and returns 264% them as String. The first character which cannot join this sequence 265% is returned as LastCh. 266 267read_name(Char, [Char|Chars], LastCh) :- 268 ( Char >= 97, Char =< 122 % a..z 269 ; Char >= 65, Char =< 90 % A..Z 270 ; Char >= 48, Char =< 57 % 0..9 271 ; Char = 95 % _ 272 ), !, 273 get0(NextCh), 274 read_name(NextCh, Chars, LastCh). 275read_name(LastCh, [], LastCh). 276 277 278% read_symbol(Ch, String, NextCh) 279% reads the other kind of atom which needs no quoting: one which is 280% a string of "symbol" characters. Note that it may accept 0 281% characters, this happens when called from read_fullstop. 282 283read_symbol(Char, [Char|Chars], LastCh) :- 284 memberchk(Char, [0'#,0'$,0'&,0'*,0'+,0'-,0'.,0'/,0':,0'<,0'=,0'>,0'?,0'@,0'\,0'^,0'`,0'~]), 285 !, 286 get0(NextCh), 287 read_symbol(NextCh, Chars, LastCh). 288read_symbol(LastCh, [], LastCh). 289 290 291% read_fullstop(Char, Dict, Tokens) 292% looks at the next character after a full stop. There are 293% three cases: 294% (a) the next character is an end of file. We treat this 295% as an unexpected end of file. The reason for this is 296% that we HAVE to handle end of file characters in this 297% module or they are gone forever; if we failed to check 298% for end of file here and just accepted .<EOF> like .<NL> 299% the caller would have no way of detecting an end of file 300% and the next call would abort. 301% (b) the next character is a layout character. This is a 302% clause terminator. 303% (c) the next character is anything else. This is just an 304% ordinary symbol and we call read_symbol to process it. 305 306read_fullstop(26, _, _) :- !, 307 display('! end of file just after full stop'), ttynl, 308 fail. 309read_fullstop(Ch, _, []) :- 310 Ch =< 32, !. % END OF CLAUSE 311read_fullstop(Ch, Dict, [atom(A)|Tokens]) :- 312 read_symbol(Ch, S, NextCh), 313 name(A, [46|S]), 314 read_tokens(NextCh, Dict, Tokens). 315 316 317 318% read_integer is complicated by having to understand radix notation. 319% There are three forms of integer: 320% 0 ' <any character> - the ASCII code for that character 321% <digit> ' <digits> - the digits, read in that base 322% <digits> - the digits, read in base 10. 323% Note that radix 16 is not understood, because 16 is two digits, 324% and that all the decimal digits are accepted in each base (this 325% is also true of C). So 2'89 = 25. I can't say I care for this, 326% but it does no great harm, and that's what Dec-10 Prolog does. 327% The X =\= 26 tests are to make sure we don't miss an end of file 328% character. The tokeniser really should be in C, not least to 329% make handling end of file characters bearable. If we hit an end 330% of file inside an integer, read_integer will fail. 331 332read_integer(BaseChar, IntVal, NextCh) :- 333 Base is BaseChar - 48, 334 get0(Ch), 335 Ch =\= 26, 336 ( Ch =\= 39, read_digits(Ch, Base, 10, IntVal, NextCh) 337 ; Base >= 1, read_digits(0, Base, IntVal, NextCh) 338 ; get0(IntVal), IntVal =\= 26, get0(NextCh) 339 ), !. 340 341read_digits(SoFar, Base, Value, NextCh) :- 342 get0(Ch), 343 Ch =\= 26, 344 read_digits(Ch, SoFar, Base, Value, NextCh). 345 346read_digits(Digit, SoFar, Base, Value, NextCh) :- 347 Digit >= 48, Digit =< 57, 348 !, 349 Next is SoFar*Base-48+Digit, 350 read_digits(Next, Base, Value, NextCh). 351read_digits(LastCh, Value, _, Value, LastCh). 352 353 354 355% read_lookup is identical to memberchk except for argument order and 356% mode declaration. 357 358read_lookup([X|_], X) :- !. 359read_lookup([_|T], X) :- 360 read_lookup(T, X). 361 362 363