1:- module(rdtok).			% SEPIA header
2
3:- local display/1.
4display(X) :- write(error, X).
5ttyput(X) :- put(error, X).
6ttynl :- nl(error).
7get0(X) :- get(X).
8
9%   File   : RDTOK.PL
10%   Author : R.A.O'Keefe
11%   Updated: 5 July 1984
12%   Purpose: Tokeniser in reasonably standard Prolog.
13
14/*  This tokeniser is meant to complement the library READ routine.
15    It recognises Dec-10 Prolog with the following exceptions:
16
17	%( is not accepted as an alternative to {
18
19	%) is not accepted as an alternative to )
20
21	NOLC convention is not supported (read_name could be made to do it)
22
23	,.. is not accepted as an alternative to | (hooray!)
24
25	large integers are not read in as xwd(Top18Bits,Bottom18Bits)
26
27	After a comma, "(" is read as ' (' rather than '('.  This does the
28	parser no harm at all, and the Dec-10 tokeniser's behaviour here
29	doesn't actually buy you anything.  This tokeniser guarantees never
30	to return '(' except immediately after an atom, yielding ' (' every
31	other where.
32
33    In particular, radix notation is EXACTLY as in Dec-10 Prolog version 3.53.
34    Some times might be of interest.  Applied to an earlier version of this file:
35	this code took			1.66 seconds
36	the Dec-10 tokeniser took	1.28 seconds
37	A Pascal version took		0.96 seconds
38    The Dec-10 tokeniser was called via the old RDTOK interface, with
39    which this file is compatible.  One reason for the difference in
40    speed is the way variables are looked up: this code uses a linear
41    list, while the Dec-10 tokeniser uses some sort of tree.  The Pascal
42    version is the program WLIST which lists "words" and their frequencies.
43    It uses a hash table.  Another difference is the way characters are
44    classified: the Dec-10 tokeniser and WLIST have a table which maps
45    ASCII codes to character classes, and don't do all this comparison
46    and and memberchking.  We could do that without leaving standard Prolog,
47    but what do you want from one evening's work?
48*/
49
50:- export
51	read_tokens/2.
52
53:- local
54	read_string/3,
55	read_string/4.
56
57:- mode
58	read_after_atom(+, ?, -),
59	read_digits(+, +, +, -, -),
60	read_fullstop(+, ?, -),
61	read_integer(+, -, -),
62	read_lookup(?, +),
63	read_name(+, -, -),
64	read_solidus(+, ?, -),
65	read_solidus(+, -),
66	read_string(-, +, -),
67	read_string(+, -, +, -),
68	more_string(+, +, -, -),
69	read_symbol(+, -, -),
70	read_tokens(?, ?),
71	read_tokens(+, ?, -).
72
73
74
75%   read_tokens(TokenList, Dictionary)
76%   returns a list of tokens.  It is needed to "prime" read_tokens/2
77%   with the initial blank, and to check for end of file.  The
78%   Dictionary is a list of AtomName=Variable pairs in no particular order.
79%   The way end of file is handled is that everything else FAILS when it
80%   hits character "26", sometimes printing a warning.  It might have been
81%   an idea to return the atom 'end_of_file' instead of the same token list
82%   that you'd have got from reading "end_of_file. ", but (1) this file is
83%   for compatibility, and (b) there are good practical reasons for wanting
84%   this behaviour.
85
86read_tokens(TokenList, Dictionary) :-
87	read_tokens(32, Dict, ListOfTokens),
88	append(Dict, [], Dict), !,	%  fill in the "hole" at the end
89	Dictionary = Dict,		%  unify explicitly so we'll read and
90	TokenList = ListOfTokens.	%  then check even with filled in arguments
91read_tokens([atom(end_of_file)], []).	%  End Of File is all that can go wrong
92
93
94
95read_tokens(26, _, _) :- !,			%  26 is the end-of-file character
96	fail.					%  in every standard Prolog
97read_tokens(Ch, Dict, Tokens) :-
98	Ch =< 32,				%  ignore layout.  CR, LF, and the
99	!,					%  Dec-10 newline character (31)
100	get0(NextCh),				%  are all skipped here.
101	read_tokens(NextCh, Dict, Tokens).
102read_tokens(37, Dict, Tokens) :- !,		%  %comment
103	repeat,					%  skip characters to a line
104	    get0(Ch),				%  terminator (should we be
105	    ( Ch = 31 ; Ch = 26 ),		%  more thorough, e.g. ^L?)
106	!,					%  stop when we find one
107	Ch =\= 26,				%  fail on EOF
108	get0(NextCh),
109	read_tokens(NextCh, Dict, Tokens).
110read_tokens(47, Dict, Tokens) :- !,		%  /*comment?
111	get0(NextCh),
112	read_solidus(NextCh, Dict, Tokens).
113read_tokens(33, Dict, [atom(!)|Tokens]) :- !,	%  This is a special case so
114	get0(NextCh),				%  that !. reads as two tokens.
115	read_after_atom(NextCh, Dict, Tokens).	%  It could be cleverer.
116read_tokens(40, Dict, [' ('|Tokens]) :- !,	%  NB!!!
117	get0(NextCh),
118	read_tokens(NextCh, Dict, Tokens).
119read_tokens(41, Dict, [')'|Tokens]) :- !,
120	get0(NextCh),
121	read_tokens(NextCh, Dict, Tokens).
122read_tokens(44, Dict, [','|Tokens]) :- !,
123	get0(NextCh),
124	read_tokens(NextCh, Dict, Tokens).
125read_tokens(59, Dict, [atom((;))|Tokens]) :- !,	%   ; is nearly a punctuation
126	get0(NextCh),				%   mark but not quite (e.g.
127	read_tokens(NextCh, Dict, Tokens).	%   you can :-op declare it).
128read_tokens(91, Dict, ['['|Tokens]) :- !,
129	get0(NextCh),
130	read_tokens(NextCh, Dict, Tokens).
131read_tokens(93, Dict, [']'|Tokens]) :- !,
132	get0(NextCh),
133	read_tokens(NextCh, Dict, Tokens).
134read_tokens(123, Dict, ['{'|Tokens]) :- !,
135	get0(NextCh),
136	read_tokens(NextCh, Dict, Tokens).
137read_tokens(124, Dict, ['|'|Tokens]) :- !,
138	get0(NextCh),
139	read_tokens(NextCh, Dict, Tokens).
140read_tokens(125, Dict, ['}'|Tokens]) :- !,
141	get0(NextCh),
142	read_tokens(NextCh, Dict, Tokens).
143read_tokens(46, Dict, Tokens) :- !,		%  full stop
144	get0(NextCh),				%  or possibly .=. &c
145	read_fullstop(NextCh, Dict, Tokens).
146read_tokens(34, Dict, [string(S)|Tokens]) :- !,	%  "string"
147	read_string(S, 34, NextCh),
148	read_tokens(NextCh, Dict, Tokens).
149read_tokens(39, Dict, [atom(A)|Tokens]) :- !,	%  'atom'
150	read_string(S, 39, NextCh),
151	name(A, S),				%  BUG: '0' = 0 unlike Dec-10 Prolog
152	read_after_atom(NextCh, Dict, Tokens).
153read_tokens(Ch, Dict, [var(Var,Name)|Tokens]) :-
154	(  Ch = 95 ; Ch >= 65, Ch =< 90  ),	%  _ or A..Z
155	!,					%  have to watch out for "_"
156	read_name(Ch, S, NextCh),
157	(  S = [0'_], Name = '_'		%  anonymous variable
158	;  name(Name, S),			%  construct name
159	   read_lookup(Dict, Name=Var)		%  lookup/enter in dictionary
160	), !,
161	read_tokens(NextCh, Dict, Tokens).
162read_tokens(Ch, Dict, [integer(I)|Tokens]) :-
163	Ch >= 48, Ch =< 57,
164	!,
165	read_integer(Ch, I, NextCh),
166	read_tokens(NextCh, Dict, Tokens).
167read_tokens(Ch, Dict, [atom(A)|Tokens]) :-
168	Ch >= 97, Ch =< 122,			%  a..z
169	!,					%  no corresponding _ problem
170	read_name(Ch, S, NextCh),
171	name(A, S),
172	read_after_atom(NextCh, Dict, Tokens).
173read_tokens(Ch, Dict, [atom(A)|Tokens]) :-	% THIS MUST BE THE LAST CLAUSE
174	get0(AnotherCh),
175	read_symbol(AnotherCh, Chars, NextCh),	% might read 0 chars
176	name(A, [Ch|Chars]),			% so might be [Ch]
177	read_after_atom(NextCh, Dict, Tokens).
178
179
180
181%   The only difference between read_after_atom(Ch, Dict, Tokens) and
182%   read_tokens/3 is what they do when Ch is "(".  read_after_atom
183%   finds the token to be '(', while read_tokens finds the token to be
184%   ' ('.  This is how the parser can tell whether <atom> <paren> must
185%   be an operator application or an ordinary function symbol application.
186%   See the library file READ.PL for details.
187
188read_after_atom(40, Dict, ['('|Tokens]) :- !,
189	get0(NextCh),
190	read_tokens(NextCh, Dict, Tokens).
191read_after_atom(Ch, Dict, Tokens) :-
192	read_tokens(Ch, Dict, Tokens).
193
194
195
196
197%   read_string(Chars, Quote, NextCh)
198%   reads the body of a string delimited by Quote characters.
199%   The result is a list of ASCII codes.  There are two complications.
200%   If we hit the end of the file inside the string this predicate FAILS.
201%   It does not return any special structure.  That is the only reason
202%   it can ever fail.  The other complication is that when we find a Quote
203%   we have to look ahead one character in case it is doubled.  Note that
204%   if we find an end-of-file after the quote we *don't* fail, we return
205%   a normal string and the end of file character is returned as NextCh.
206%   If we were going to accept C-like escape characters, as I think we
207%   should, this would need changing (as would the code for 0'x).  But
208%   the purpose of this module is not to present my ideal syntax but to
209%   present something which will read present-day Prolog programs.
210
211read_string(Chars, Quote, NextCh) :-
212	get0(Ch),
213	read_string(Ch, Chars, Quote, NextCh).
214
215
216read_string(26, _, Quote, 26) :-
217	display('! end of file in '), ttyput(Quote),
218	display(token), ttyput(Quote), ttynl,
219	!, fail.
220read_string(Quote, Chars, Quote, NextCh) :- !,
221	get0(Ch),				% closing or doubled quote
222	more_string(Ch, Quote, Chars, NextCh).
223read_string(Char, [Char|Chars], Quote, NextCh) :-
224	read_string(Chars, Quote, NextCh).	% ordinary character
225
226
227more_string(Quote, Quote, [Quote|Chars], NextCh) :- !,
228	read_string(Chars, Quote, NextCh).	% doubled quote
229more_string(NextCh, _, [], NextCh).		% end
230
231
232
233%   read_solidus(Ch, Dict, Tokens)
234%   checks to see whether /Ch is a /* comment or a symbol.  If the
235%   former, it skips the comment.  If the latter it just calls read_symbol.
236%   We have to take great care with /* comments to handle end of file
237%   inside a comment, which is why read_solidus/2 passes back an end of
238%   file character or a (forged) blank that we can give to read_tokens.
239
240
241read_solidus(42, Dict, Tokens) :- !,
242	get0(Ch),
243	read_solidus(Ch, NextCh),
244	read_tokens(NextCh, Dict, Tokens).
245read_solidus(Ch, Dict, [atom(A)|Tokens]) :-
246	read_symbol(Ch, Chars, NextCh),		% might read 0 chars
247	name(A, [47|Chars]),
248	read_tokens(NextCh, Dict, Tokens).
249
250read_solidus(26, 26) :- !,
251	display('! end of file in /*comment'), ttynl.
252read_solidus(42, LastCh) :-
253	get0(NextCh),
254	NextCh =\= 47, !,	%  might be ^Z or * though
255	read_solidus(NextCh, LastCh).
256read_solidus(42, 32) :- !.	%  the / was skipped in the previous clause
257read_solidus(_, LastCh) :-
258	get0(NextCh),
259	read_solidus(NextCh, LastCh).
260
261
262%   read_name(Char, String, LastCh)
263%   reads a sequence of letters, digits, and underscores, and returns
264%   them as String.  The first character which cannot join this sequence
265%   is returned as LastCh.
266
267read_name(Char, [Char|Chars], LastCh) :-
268	( Char >= 97, Char =< 122	% a..z
269	; Char >= 65, Char =< 90	% A..Z
270	; Char >= 48, Char =< 57	% 0..9
271	; Char = 95			% _
272	), !,
273	get0(NextCh),
274	read_name(NextCh, Chars, LastCh).
275read_name(LastCh, [], LastCh).
276
277
278%   read_symbol(Ch, String, NextCh)
279%   reads the other kind of atom which needs no quoting: one which is
280%   a string of "symbol" characters.  Note that it may accept 0
281%   characters, this happens when called from read_fullstop.
282
283read_symbol(Char, [Char|Chars], LastCh) :-
284	memberchk(Char, [0'#,0'$,0'&,0'*,0'+,0'-,0'.,0'/,0':,0'<,0'=,0'>,0'?,0'@,0'\,0'^,0'`,0'~]),
285	!,
286	get0(NextCh),
287	read_symbol(NextCh, Chars, LastCh).
288read_symbol(LastCh, [], LastCh).
289
290
291%   read_fullstop(Char, Dict, Tokens)
292%   looks at the next character after a full stop.  There are
293%   three cases:
294%	(a) the next character is an end of file.  We treat this
295%	    as an unexpected end of file.  The reason for this is
296%	    that we HAVE to handle end of file characters in this
297%	    module or they are gone forever; if we failed to check
298%	    for end of file here and just accepted .<EOF> like .<NL>
299%	    the caller would have no way of detecting an end of file
300%	    and the next call would abort.
301%	(b) the next character is a layout character.  This is a
302%	    clause terminator.
303%	(c) the next character is anything else.  This is just an
304%	    ordinary symbol and we call read_symbol to process it.
305
306read_fullstop(26, _, _) :- !,
307	display('! end of file just after full stop'), ttynl,
308	fail.
309read_fullstop(Ch, _, []) :-
310	Ch =< 32, !.		% END OF CLAUSE
311read_fullstop(Ch, Dict, [atom(A)|Tokens]) :-
312	read_symbol(Ch, S, NextCh),
313	name(A, [46|S]),
314	read_tokens(NextCh, Dict, Tokens).
315
316
317
318%   read_integer is complicated by having to understand radix notation.
319%   There are three forms of integer:
320%	0 ' <any character>	- the ASCII code for that character
321%	<digit> ' <digits>	- the digits, read in that base
322%	<digits>		- the digits, read in base 10.
323%   Note that radix 16 is not understood, because 16 is two digits,
324%   and that all the decimal digits are accepted in each base (this
325%   is also true of C).  So 2'89 = 25.  I can't say I care for this,
326%   but it does no great harm, and that's what Dec-10 Prolog does.
327%   The X =\= 26 tests are to make sure we don't miss an end of file
328%   character.  The tokeniser really should be in C, not least to
329%   make handling end of file characters bearable.  If we hit an end
330%   of file inside an integer, read_integer will fail.
331
332read_integer(BaseChar, IntVal, NextCh) :-
333	Base is BaseChar - 48,
334	get0(Ch),
335	Ch =\= 26,
336	(   Ch =\= 39, read_digits(Ch, Base, 10, IntVal, NextCh)
337	;   Base >= 1, read_digits(0, Base, IntVal, NextCh)
338	;   get0(IntVal), IntVal =\= 26, get0(NextCh)
339	),  !.
340
341read_digits(SoFar, Base, Value, NextCh) :-
342	get0(Ch),
343	Ch =\= 26,
344	read_digits(Ch, SoFar, Base, Value, NextCh).
345
346read_digits(Digit, SoFar, Base, Value, NextCh) :-
347	Digit >= 48, Digit =< 57,
348	!,
349	Next is SoFar*Base-48+Digit,
350	read_digits(Next, Base, Value, NextCh).
351read_digits(LastCh, Value, _, Value, LastCh).
352
353
354
355%   read_lookup is identical to memberchk except for argument order and
356%   mode declaration.
357
358read_lookup([X|_], X) :- !.
359read_lookup([_|T], X) :-
360	read_lookup(T, X).
361
362
363