/*
 * *****************************************************************************
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2018-2023 Gavin D. Howard and contributors.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * *****************************************************************************
 *
 * Definitions for bc's lexer.
 *
 */
35
36#ifndef BC_LEX_H
37#define BC_LEX_H
38
#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>

#include <status.h>
#include <vector.h>
#include <lang.h>
45
/**
 * A convenience macro for throwing errors in lex code. It forwards the error
 * to bc_vm_handleError() and supplies the line the lexer is currently on, so
 * call sites do not have to. When BC_DEBUG is nonzero, the source file and
 * line of the call site (__FILE__ and __LINE__) are passed as well.
 * @param l  The lexer (a pointer); its @a line field is read.
 * @param e  The error.
 */
#if BC_DEBUG
#define bc_lex_err(l, e) (bc_vm_handleError((e), __FILE__, __LINE__, (l)->line))
#else // BC_DEBUG
#define bc_lex_err(l, e) (bc_vm_handleError((e), (l)->line))
#endif // BC_DEBUG
57
/**
 * The variadic counterpart of bc_lex_err(): a convenience macro for throwing
 * errors in lex code when extra arguments must be passed along with the error.
 * It forwards the error, the line the lexer is currently on, and the extra
 * arguments to bc_vm_handleError(). When BC_DEBUG is nonzero, the source file
 * and line of the call site (__FILE__ and __LINE__) are passed as well.
 * @param l    The lexer (a pointer); its @a line field is read.
 * @param e    The error.
 * @param ...  Extra arguments, passed through unchanged after the line. At
 *             least one must be given, because the expansion always places a
 *             comma before __VA_ARGS__.
 */
#if BC_DEBUG
#define bc_lex_verr(l, e, ...) \
	(bc_vm_handleError((e), __FILE__, __LINE__, (l)->line, __VA_ARGS__))
#else // BC_DEBUG
#define bc_lex_verr(l, e, ...) (bc_vm_handleError((e), (l)->line, __VA_ARGS__))
#endif // BC_DEBUG
70
// BC_LEX_NEG_CHAR returns the char that corresponds to negative for the
// current calculator: '-' for bc and '_' for dc.
//
// BC_LEX_LAST_NUM_CHAR returns the char that corresponds to the last valid
// char for numbers. In bc and dc, capital letters are part of numbers, to a
// point. (dc only goes up to hex, so its last valid char is 'F'; bc goes up to
// 'Z'.)
//
// Which definitions are used depends on the build:
// * bc and dc both enabled: the choice is made at runtime with BC_IS_BC.
// * only bc enabled:        the bc values are compile-time constants.
// * bc disabled:            the dc values are compile-time constants.
#if BC_ENABLED

#if DC_ENABLED
#define BC_LEX_NEG_CHAR (BC_IS_BC ? '-' : '_')
#define BC_LEX_LAST_NUM_CHAR (BC_IS_BC ? 'Z' : 'F')
#else // DC_ENABLED
#define BC_LEX_NEG_CHAR ('-')
#define BC_LEX_LAST_NUM_CHAR ('Z')
#endif // DC_ENABLED

#else // BC_ENABLED

#define BC_LEX_NEG_CHAR ('_')
#define BC_LEX_LAST_NUM_CHAR ('F')

#endif // BC_ENABLED
93
/**
 * Returns true if c is a valid number character: a decimal digit, a capital
 * letter from 'A' through BC_LEX_LAST_NUM_CHAR, or a '.' when no decimal point
 * has been seen yet and non-integers are allowed.
 *
 * The expansion calls isdigit(), so <ctype.h> must be in scope where the macro
 * is used. @a c is converted to unsigned char before it is handed to
 * isdigit(), because passing a negative value other than EOF to the <ctype.h>
 * functions is undefined behavior and plain char may be signed.
 *
 * @a c is evaluated more than once; do not pass an expression with side
 * effects.
 * @param c         The char to check.
 * @param pt        If a decimal point has already been seen.
 * @param int_only  True if the number is expected to be an int only, false if
 *                  non-integers are allowed.
 * @return          True if @a c is a valid number character.
 */
#define BC_LEX_NUM_CHAR(c, pt, int_only)                \
	(isdigit((unsigned char) (c)) != 0 ||               \
	 ((c) >= 'A' && (c) <= BC_LEX_LAST_NUM_CHAR) ||     \
	 ((c) == '.' && !(pt) && !(int_only)))
105
/// An enum of lex token types.
///
/// Several groups of tokens are only present when BC_ENABLED, DC_ENABLED, or
/// BC_ENABLE_EXTRA_MATH is nonzero, so the numeric value of a given token
/// depends on the build options. The order of the tokens matters; see the note
/// before the keywords.
typedef enum BcLexType
{
	/// End of file.
	BC_LEX_EOF,

	/// Marker for invalid tokens, used by bc and dc for const data.
	BC_LEX_INVALID,

#if BC_ENABLED

	/// Increment operator.
	BC_LEX_OP_INC,

	/// Decrement operator.
	BC_LEX_OP_DEC,

#endif // BC_ENABLED

	/// BC_LEX_NEG is not used in lexing; it is only for parsing. The lexer
	/// marks all '-' characters as BC_LEX_OP_MINUS, but the parser needs to be
	/// able to distinguish them.
	BC_LEX_NEG,

	/// Boolean not.
	BC_LEX_OP_BOOL_NOT,

#if BC_ENABLE_EXTRA_MATH

	/// Truncation operator.
	BC_LEX_OP_TRUNC,

#endif // BC_ENABLE_EXTRA_MATH

	/// Power operator.
	BC_LEX_OP_POWER,

	/// Multiplication operator.
	BC_LEX_OP_MULTIPLY,

	/// Division operator.
	BC_LEX_OP_DIVIDE,

	/// Modulus operator.
	BC_LEX_OP_MODULUS,

	/// Addition operator.
	BC_LEX_OP_PLUS,

	/// Subtraction operator.
	BC_LEX_OP_MINUS,

#if BC_ENABLE_EXTRA_MATH

	/// Places (truncate or extend) operator.
	BC_LEX_OP_PLACES,

	/// Left (decimal) shift operator.
	BC_LEX_OP_LSHIFT,

	/// Right (decimal) shift operator.
	BC_LEX_OP_RSHIFT,

#endif // BC_ENABLE_EXTRA_MATH

	/// Equal operator.
	BC_LEX_OP_REL_EQ,

	/// Less than or equal operator.
	BC_LEX_OP_REL_LE,

	/// Greater than or equal operator.
	BC_LEX_OP_REL_GE,

	/// Not equal operator.
	BC_LEX_OP_REL_NE,

	/// Less than operator.
	BC_LEX_OP_REL_LT,

	/// Greater than operator.
	BC_LEX_OP_REL_GT,

	/// Boolean or operator.
	BC_LEX_OP_BOOL_OR,

	/// Boolean and operator.
	BC_LEX_OP_BOOL_AND,

#if BC_ENABLED

	/// Power assignment operator.
	BC_LEX_OP_ASSIGN_POWER,

	/// Multiplication assignment operator.
	BC_LEX_OP_ASSIGN_MULTIPLY,

	/// Division assignment operator.
	BC_LEX_OP_ASSIGN_DIVIDE,

	/// Modulus assignment operator.
	BC_LEX_OP_ASSIGN_MODULUS,

	/// Addition assignment operator.
	BC_LEX_OP_ASSIGN_PLUS,

	/// Subtraction assignment operator.
	BC_LEX_OP_ASSIGN_MINUS,

#if BC_ENABLE_EXTRA_MATH

	/// Places (truncate or extend) assignment operator.
	BC_LEX_OP_ASSIGN_PLACES,

	/// Left (decimal) shift assignment operator.
	BC_LEX_OP_ASSIGN_LSHIFT,

	/// Right (decimal) shift assignment operator.
	BC_LEX_OP_ASSIGN_RSHIFT,

#endif // BC_ENABLE_EXTRA_MATH
#endif // BC_ENABLED

	/// Assignment operator.
	BC_LEX_OP_ASSIGN,

	/// Newline.
	BC_LEX_NLINE,

	/// Whitespace.
	BC_LEX_WHITESPACE,

	/// Left parenthesis.
	BC_LEX_LPAREN,

	/// Right parenthesis.
	BC_LEX_RPAREN,

	/// Left bracket.
	BC_LEX_LBRACKET,

	/// Comma.
	BC_LEX_COMMA,

	/// Right bracket.
	BC_LEX_RBRACKET,

	/// Left brace.
	BC_LEX_LBRACE,

	/// Semicolon.
	BC_LEX_SCOLON,

	/// Right brace.
	BC_LEX_RBRACE,

	/// String.
	BC_LEX_STR,

	/// Identifier/name.
	BC_LEX_NAME,

	/// Constant number.
	BC_LEX_NUMBER,

	// These keywords are in the order they are in for a reason. Don't change
	// the order unless you want a bunch of weird failures in the test suite.
	// In fact, almost all of these tokens are in a specific order for a reason.

#if BC_ENABLED

	/// bc auto keyword.
	BC_LEX_KW_AUTO,

	/// bc break keyword.
	BC_LEX_KW_BREAK,

	/// bc continue keyword.
	BC_LEX_KW_CONTINUE,

	/// bc define keyword.
	BC_LEX_KW_DEFINE,

	/// bc for keyword.
	BC_LEX_KW_FOR,

	/// bc if keyword.
	BC_LEX_KW_IF,

	/// bc limits keyword.
	BC_LEX_KW_LIMITS,

	/// bc return keyword.
	BC_LEX_KW_RETURN,

	/// bc while keyword.
	BC_LEX_KW_WHILE,

	/// bc halt keyword.
	BC_LEX_KW_HALT,

	/// bc last keyword.
	BC_LEX_KW_LAST,

#endif // BC_ENABLED

	/// bc ibase keyword.
	BC_LEX_KW_IBASE,

	/// bc obase keyword.
	BC_LEX_KW_OBASE,

	/// bc scale keyword.
	BC_LEX_KW_SCALE,

#if BC_ENABLE_EXTRA_MATH

	/// bc seed keyword.
	BC_LEX_KW_SEED,

#endif // BC_ENABLE_EXTRA_MATH

	/// bc length keyword.
	BC_LEX_KW_LENGTH,

	/// bc print keyword.
	BC_LEX_KW_PRINT,

	/// bc sqrt keyword.
	BC_LEX_KW_SQRT,

	/// bc abs keyword.
	BC_LEX_KW_ABS,

	/// bc is_number keyword.
	BC_LEX_KW_IS_NUMBER,

	/// bc is_string keyword.
	BC_LEX_KW_IS_STRING,

#if BC_ENABLE_EXTRA_MATH

	/// bc irand keyword.
	BC_LEX_KW_IRAND,

#endif // BC_ENABLE_EXTRA_MATH

	/// bc asciify keyword.
	BC_LEX_KW_ASCIIFY,

	/// bc modexp keyword.
	BC_LEX_KW_MODEXP,

	/// bc divmod keyword.
	BC_LEX_KW_DIVMOD,

	/// bc quit keyword.
	BC_LEX_KW_QUIT,

	/// bc read keyword.
	BC_LEX_KW_READ,

#if BC_ENABLE_EXTRA_MATH

	/// bc rand keyword.
	BC_LEX_KW_RAND,

#endif // BC_ENABLE_EXTRA_MATH

	/// bc maxibase keyword.
	BC_LEX_KW_MAXIBASE,

	/// bc maxobase keyword.
	BC_LEX_KW_MAXOBASE,

	/// bc maxscale keyword.
	BC_LEX_KW_MAXSCALE,

#if BC_ENABLE_EXTRA_MATH

	/// bc maxrand keyword.
	BC_LEX_KW_MAXRAND,

#endif // BC_ENABLE_EXTRA_MATH

	/// bc line_length keyword.
	BC_LEX_KW_LINE_LENGTH,

#if BC_ENABLED

	/// bc global_stacks keyword.
	BC_LEX_KW_GLOBAL_STACKS,

#endif // BC_ENABLED

	/// bc leading_zero keyword.
	BC_LEX_KW_LEADING_ZERO,

	/// bc stream keyword.
	BC_LEX_KW_STREAM,

	/// bc else keyword.
	BC_LEX_KW_ELSE,

#if DC_ENABLED

	/// dc extended registers keyword.
	BC_LEX_EXTENDED_REGISTERS,

	/// A special token for dc to calculate equal without a register.
	BC_LEX_EQ_NO_REG,

	/// Colon (array) operator.
	BC_LEX_COLON,

	/// Execute command.
	BC_LEX_EXECUTE,

	/// Print stack command.
	BC_LEX_PRINT_STACK,

	/// Clear stack command.
	BC_LEX_CLEAR_STACK,

	/// Register stack level command.
	BC_LEX_REG_STACK_LEVEL,

	/// Main stack level command.
	BC_LEX_STACK_LEVEL,

	/// Duplicate command.
	BC_LEX_DUPLICATE,

	/// Swap (reverse) command.
	BC_LEX_SWAP,

	/// Pop (remove) command.
	BC_LEX_POP,

	/// Store ibase command.
	BC_LEX_STORE_IBASE,

	/// Store obase command.
	BC_LEX_STORE_OBASE,

	/// Store scale command.
	BC_LEX_STORE_SCALE,

#if BC_ENABLE_EXTRA_MATH

	/// Store seed command.
	BC_LEX_STORE_SEED,

#endif // BC_ENABLE_EXTRA_MATH

	/// Load variable onto stack command.
	BC_LEX_LOAD,

	/// Pop off of variable stack onto results stack command.
	BC_LEX_LOAD_POP,

	/// Push onto variable stack command.
	BC_LEX_STORE_PUSH,

	/// Print with pop command.
	BC_LEX_PRINT_POP,

	/// Parameterized quit command.
	BC_LEX_NQUIT,

	/// Execution stack depth command.
	BC_LEX_EXEC_STACK_LENGTH,

	/// Scale of number command. This is needed specifically for dc because bc
	/// parses the scale function in parts.
	BC_LEX_SCALE_FACTOR,

	/// Array length command. This is needed specifically for dc because bc
	/// just reuses its length keyword.
	BC_LEX_ARRAY_LENGTH,

#endif // DC_ENABLED

} BcLexType;
490
// Forward declaration, so that BcLexNext can refer to the lexer before the
// struct itself is defined.
struct BcLex;

/**
 * A function pointer to call when another token is needed. Mostly called by the
 * parser.
 * @param l  The lexer.
 */
typedef void (*BcLexNext)(struct BcLex* l);
499
500/// The lexer.
501typedef struct BcLex
502{
503	/// A pointer to the text to lex.
504	const char* buf;
505
506	/// The current index into buf.
507	size_t i;
508
509	/// The current line.
510	size_t line;
511
512	/// The length of buf.
513	size_t len;
514
515	/// The current token.
516	BcLexType t;
517
518	/// The previous token.
519	BcLexType last;
520
521	/// A string to store extra data for tokens. For example, the @a BC_LEX_STR
522	/// token really needs to store the actual string, and numbers also need the
523	/// string.
524	BcVec str;
525
526	/// The mode the lexer is in.
527	BcMode mode;
528
529} BcLex;
530
531/**
532 * Initializes a lexer.
533 * @param l  The lexer to initialize.
534 */
535void
536bc_lex_init(BcLex* l);
537
538/**
539 * Frees a lexer. This is not guarded by #if BC_DEBUG because a separate
540 * parser is created at runtime to parse read() expressions and dc strings, and
541 * that parser needs a lexer.
542 * @param l  The lexer to free.
543 */
544void
545bc_lex_free(BcLex* l);
546
547/**
548 * Sets the filename that the lexer will be lexing.
549 * @param l     The lexer.
550 * @param file  The filename that the lexer will lex.
551 */
552void
553bc_lex_file(BcLex* l, const char* file);
554
555/**
556 * Sets the text the lexer will lex.
557 * @param l     The lexer.
558 * @param text  The text to lex.
559 * @param mode  The mode to lex in.
560 */
561void
562bc_lex_text(BcLex* l, const char* text, BcMode mode);
563
564/**
565 * Generic next function for the parser to call. It takes care of calling the
566 * correct @a BcLexNext function and consuming whitespace.
567 * @param l  The lexer.
568 */
569void
570bc_lex_next(BcLex* l);
571
572/**
573 * Lexes a line comment (one beginning with '#' and going to a newline).
574 * @param l  The lexer.
575 */
576void
577bc_lex_lineComment(BcLex* l);
578
579/**
580 * Lexes a general comment (C-style comment).
581 * @param l  The lexer.
582 */
583void
584bc_lex_comment(BcLex* l);
585
586/**
587 * Lexes whitespace, finding as much as possible.
588 * @param l  The lexer.
589 */
590void
591bc_lex_whitespace(BcLex* l);
592
593/**
594 * Lexes a number that begins with char @a start. This takes care of parsing
595 * numbers in scientific and engineering notations.
596 * @param l      The lexer.
597 * @param start  The starting char of the number. To detect a number and call
598 *               this function, the lexer had to eat the first char. It fixes
599 *               that by passing it in.
600 */
601void
602bc_lex_number(BcLex* l, char start);
603
604/**
605 * Lexes a name/identifier.
606 * @param l  The lexer.
607 */
608void
609bc_lex_name(BcLex* l);
610
611/**
612 * Lexes common whitespace characters.
613 * @param l  The lexer.
614 * @param c  The character to lex.
615 */
616void
617bc_lex_commonTokens(BcLex* l, char c);
618
619/**
620 * Throws a parse error because char @a c was invalid.
621 * @param l  The lexer.
622 * @param c  The problem character.
623 */
624void
625bc_lex_invalidChar(BcLex* l, char c);
626
627/**
628 * Reads a line from stdin and puts it into the lexer's buffer.
629 * @param l  The lexer.
630 */
631bool
632bc_lex_readLine(BcLex* l);
633
634#endif // BC_LEX_H
635