1/*
2 * *****************************************************************************
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 *
6 * Copyright (c) 2018-2023 Gavin D. Howard and contributors.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * * Redistributions of source code must retain the above copyright notice, this
12 *   list of conditions and the following disclaimer.
13 *
14 * * Redistributions in binary form must reproduce the above copyright notice,
15 *   this list of conditions and the following disclaimer in the documentation
16 *   and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 * *****************************************************************************
31 *
32 * Common code for the lexers.
33 *
34 */
35
36#include <assert.h>
37#include <ctype.h>
38#include <stdbool.h>
39#include <string.h>
40
41#include <lex.h>
42#include <vm.h>
43#include <bc.h>
44
45void
46bc_lex_invalidChar(BcLex* l, char c)
47{
48	l->t = BC_LEX_INVALID;
49	bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
50}
51
52void
53bc_lex_lineComment(BcLex* l)
54{
55	l->t = BC_LEX_WHITESPACE;
56	while (l->i < l->len && l->buf[l->i] != '\n')
57	{
58		l->i += 1;
59	}
60}
61
62void
63bc_lex_comment(BcLex* l)
64{
65	size_t i, nlines = 0;
66	const char* buf;
67	bool end = false, got_more;
68	char c;
69
70	l->i += 1;
71	l->t = BC_LEX_WHITESPACE;
72
73	// This loop is complex because it might need to request more data from
74	// stdin if the comment is not ended. This loop is taken until the comment
75	// is finished or we have EOF.
76	do
77	{
78		buf = l->buf;
79		got_more = false;
80
81		// If we are in stdin mode, the buffer must be the one used for stdin.
82		assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);
83
84		// Find the end of the comment.
85		for (i = l->i; !end; i += !end)
86		{
87			// While we don't have an asterisk, eat, but increment nlines.
88			for (; (c = buf[i]) && c != '*'; ++i)
89			{
90				nlines += (c == '\n');
91			}
92
93			// If this is true, we need to request more data.
94			if (BC_ERR(!c || buf[i + 1] == '\0'))
95			{
96				// Read more, if possible.
97				if (!vm->eof && l->mode != BC_MODE_FILE)
98				{
99					got_more = bc_lex_readLine(l);
100				}
101
102				break;
103			}
104
105			// If this turns true, we found the end. Yay!
106			end = (buf[i + 1] == '/');
107		}
108	}
109	while (got_more && !end);
110
111	// If we didn't find the end, barf.
112	if (!end)
113	{
114		l->i = i;
115		bc_lex_err(l, BC_ERR_PARSE_COMMENT);
116	}
117
118	l->i = i + 2;
119	l->line += nlines;
120}
121
122void
123bc_lex_whitespace(BcLex* l)
124{
125	char c;
126
127	l->t = BC_LEX_WHITESPACE;
128
129	// Eat. We don't eat newlines because they can be special.
130	for (c = l->buf[l->i]; c != '\n' && isspace(c); c = l->buf[++l->i])
131	{
132		continue;
133	}
134}
135
136void
137bc_lex_commonTokens(BcLex* l, char c)
138{
139	if (!c) l->t = BC_LEX_EOF;
140	else if (c == '\n') l->t = BC_LEX_NLINE;
141	else bc_lex_whitespace(l);
142}
143
144/**
145 * Parses a number.
146 * @param l         The lexer.
147 * @param start     The start character.
148 * @param int_only  Whether this function should only look for an integer. This
149 *                  is used to implement the exponent of scientific notation.
150 */
151static size_t
152bc_lex_num(BcLex* l, char start, bool int_only)
153{
154	const char* buf = l->buf + l->i;
155	size_t i;
156	char c;
157	bool last_pt, pt = (start == '.');
158
159	// This loop looks complex. It is not. It is asking if the character is not
160	// a nul byte and it if it a valid num character based on what we have found
161	// thus far, or whether it is a backslash followed by a newline. I can do
162	// i+1 on the buffer because the buffer must have a nul byte.
163	for (i = 0; (c = buf[i]) && (BC_LEX_NUM_CHAR(c, pt, int_only) ||
164	                             (c == '\\' && buf[i + 1] == '\n'));
165	     ++i)
166	{
167		// I don't need to test that the next character is a newline because
168		// the loop condition above ensures that.
169		if (c == '\\')
170		{
171			i += 2;
172
173			// Make sure to eat whitespace at the beginning of the line.
174			while (isspace(buf[i]) && buf[i] != '\n')
175			{
176				i += 1;
177			}
178
179			c = buf[i];
180
181			// If the next character is not a number character, bail.
182			if (!BC_LEX_NUM_CHAR(c, pt, int_only)) break;
183		}
184
185		// Did we find the radix point?
186		last_pt = (c == '.');
187
188		// If we did, and we already have one, then break because it's not part
189		// of this number.
190		if (pt && last_pt) break;
191
192		// Set whether we have found a radix point.
193		pt = pt || last_pt;
194
195		bc_vec_push(&l->str, &c);
196	}
197
198	return i;
199}
200
201void
202bc_lex_number(BcLex* l, char start)
203{
204	l->t = BC_LEX_NUMBER;
205
206	// Make sure the string is clear.
207	bc_vec_popAll(&l->str);
208	bc_vec_push(&l->str, &start);
209
210	// Parse the number.
211	l->i += bc_lex_num(l, start, false);
212
213#if BC_ENABLE_EXTRA_MATH
214	{
215		char c = l->buf[l->i];
216
217		// Do we have a number in scientific notation?
218		if (c == 'e')
219		{
220#if BC_ENABLED
221			// Barf for POSIX.
222			if (BC_IS_POSIX) bc_lex_err(l, BC_ERR_POSIX_EXP_NUM);
223#endif // BC_ENABLED
224
225			// Push the e.
226			bc_vec_push(&l->str, &c);
227			l->i += 1;
228			c = l->buf[l->i];
229
230			// Check for negative specifically because bc_lex_num() does not.
231			if (c == BC_LEX_NEG_CHAR)
232			{
233				bc_vec_push(&l->str, &c);
234				l->i += 1;
235				c = l->buf[l->i];
236			}
237
238			// We must have a number character, so barf if not.
239			if (BC_ERR(!BC_LEX_NUM_CHAR(c, false, true)))
240			{
241				bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
242			}
243
244			// Parse the exponent.
245			l->i += bc_lex_num(l, 0, true);
246		}
247	}
248#endif // BC_ENABLE_EXTRA_MATH
249
250	bc_vec_pushByte(&l->str, '\0');
251}
252
253void
254bc_lex_name(BcLex* l)
255{
256	size_t i = 0;
257	const char* buf = l->buf + l->i - 1;
258	char c = buf[i];
259
260	l->t = BC_LEX_NAME;
261
262	// Should be obvious. It's looking for valid characters.
263	while ((c >= 'a' && c <= 'z') || isdigit(c) || c == '_')
264	{
265		c = buf[++i];
266	}
267
268	// Set the string to the identifier.
269	bc_vec_string(&l->str, i, buf);
270
271	// Increment the index. We minus 1 because it has already been incremented.
272	l->i += i - 1;
273}
274
275void
276bc_lex_init(BcLex* l)
277{
278	BC_SIG_ASSERT_LOCKED;
279	assert(l != NULL);
280	bc_vec_init(&l->str, sizeof(char), BC_DTOR_NONE);
281}
282
283void
284bc_lex_free(BcLex* l)
285{
286	BC_SIG_ASSERT_LOCKED;
287	assert(l != NULL);
288	bc_vec_free(&l->str);
289}
290
291void
292bc_lex_file(BcLex* l, const char* file)
293{
294	assert(l != NULL && file != NULL);
295	l->line = 1;
296	vm->file = file;
297}
298
299void
300bc_lex_next(BcLex* l)
301{
302	BC_SIG_ASSERT_LOCKED;
303
304	assert(l != NULL);
305
306	l->last = l->t;
307
308	// If this wasn't here, the line number would be off.
309	l->line += (l->i != 0 && l->buf[l->i - 1] == '\n');
310
311	// If the last token was EOF, someone called this one too many times.
312	if (BC_ERR(l->last == BC_LEX_EOF)) bc_lex_err(l, BC_ERR_PARSE_EOF);
313
314	l->t = BC_LEX_EOF;
315
316	// We are done if this is true.
317	if (l->i == l->len) return;
318
319	// Loop until failure or we don't have whitespace. This
320	// is so the parser doesn't get inundated with whitespace.
321	do
322	{
323		vm->next(l);
324	}
325	while (l->t == BC_LEX_WHITESPACE);
326}
327
328/**
329 * Updates the buffer and len so that they are not invalidated when the stdin
330 * buffer grows.
331 * @param l     The lexer.
332 * @param text  The text.
333 * @param len   The length of the text.
334 */
335static void
336bc_lex_fixText(BcLex* l, const char* text, size_t len)
337{
338	l->buf = text;
339	l->len = len;
340}
341
342bool
343bc_lex_readLine(BcLex* l)
344{
345	bool good;
346
347	// These are reversed because they should be already locked, but
348	// bc_vm_readLine() needs them to be unlocked.
349	BC_SIG_UNLOCK;
350
351	// Make sure we read from the appropriate place.
352	switch (l->mode)
353	{
354		case BC_MODE_EXPRS:
355		{
356			good = bc_vm_readBuf(false);
357			break;
358		}
359
360		case BC_MODE_FILE:
361		{
362			good = false;
363			break;
364		}
365
366		case BC_MODE_STDIN:
367		{
368			good = bc_vm_readLine(false);
369			break;
370		}
371
372#ifdef __GNUC__
373#ifndef __clang__
374		default:
375		{
376			// We should never get here.
377			abort();
378		}
379#endif // __clang__
380#endif // __GNUC__
381	}
382
383	BC_SIG_LOCK;
384
385	bc_lex_fixText(l, vm->buffer.v, vm->buffer.len - 1);
386
387	return good;
388}
389
390void
391bc_lex_text(BcLex* l, const char* text, BcMode mode)
392{
393	BC_SIG_ASSERT_LOCKED;
394
395	assert(l != NULL && text != NULL);
396
397	bc_lex_fixText(l, text, strlen(text));
398	l->i = 0;
399	l->t = l->last = BC_LEX_INVALID;
400	l->mode = mode;
401
402	bc_lex_next(l);
403}
404