1/*	$NetBSD: lex.h,v 1.7 2024/02/21 22:52:30 christos Exp $	*/
2
3/*
4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5 *
6 * SPDX-License-Identifier: MPL-2.0
7 *
8 * This Source Code Form is subject to the terms of the Mozilla Public
9 * License, v. 2.0. If a copy of the MPL was not distributed with this
10 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11 *
12 * See the COPYRIGHT file distributed with this work for additional
13 * information regarding copyright ownership.
14 */
15
16#pragma once
17
18/*****
19***** Module Info
20*****/
21
22/*! \file isc/lex.h
23 * \brief The "lex" module provides a lightweight tokenizer.  It can operate
24 * on files or buffers, and can handle "include".  It is designed for
25 * parsing of DNS master files and the BIND configuration file, but
26 * should be general enough to tokenize other things, e.g. HTTP.
27 *
28 * \li MP:
29 *	No synchronization is provided.  Clients must ensure exclusive
30 *	access.
31 *
32 * \li Reliability:
33 *	No anticipated impact.
34 *
35 * \li Resources:
36 *	TBS
37 *
38 * \li Security:
39 *	No anticipated impact.
40 *
41 * \li Standards:
42 * 	None.
43 */
44
45/***
46 *** Imports
47 ***/
48
49#include <stdbool.h>
50#include <stdio.h>
51
52#include <isc/lang.h>
53#include <isc/region.h>
54#include <isc/types.h>
55
56ISC_LANG_BEGINDECLS
57
58/***
59 *** Options
60 ***/
61
62/*@{*/
63/*!
64 * Option flags (one bit each) for isc_lex_gettoken()'s 'options' argument.
65 */
66
67#define ISC_LEXOPT_EOL	     0x0001 /*%< Want end-of-line token. */
68#define ISC_LEXOPT_EOF	     0x0002 /*%< Want end-of-file token. */
69#define ISC_LEXOPT_INITIALWS 0x0004 /*%< Want initial whitespace. */
70#define ISC_LEXOPT_NUMBER    0x0008 /*%< Recognize numbers. */
71#define ISC_LEXOPT_QSTRING   0x0010 /*%< Recognize qstrings. */
72/*@}*/
73
74/*@{*/
75/*!
76 * The ISC_LEXOPT_DNSMULTILINE option handles the processing of '(' and ')' in
77 * the DNS master file format.  If this option is set, then the
78 * ISC_LEXOPT_INITIALWS and ISC_LEXOPT_EOL options will be ignored when
79 * the paren count is > 0.  To use this option, '(' and ')' must be special
80 * characters.
81 */
82#define ISC_LEXOPT_DNSMULTILINE 0x0020 /*%< Handle '(' and ')'. */
83#define ISC_LEXOPT_NOMORE	0x0040 /*%< Want "no more" token. */
84
85#define ISC_LEXOPT_CNUMBER	    0x0080 /*%< Recognize octal and hex. */
86#define ISC_LEXOPT_ESCAPE	    0x0100 /*%< Recognize escapes. */
87#define ISC_LEXOPT_QSTRINGMULTILINE 0x0200 /*%< Allow multiline "" strings. */
88#define ISC_LEXOPT_OCTAL	    0x0400 /*%< Expect an octal number. */
89#define ISC_LEXOPT_BTEXT	    0x0800 /*%< Bracketed text. */
90#define ISC_LEXOPT_VPAIR	    0x1000 /*%< Recognize value pair. */
91#define ISC_LEXOPT_QVPAIR	    0x2000 /*%< Recognize quoted value pair. */
92/*@}*/
93/*@{*/
94/*!
95 * Commenting style flags (one bit each), which may be changed at any time
96 * with isc_lex_setcomments() and queried with isc_lex_getcomments().
97 */
98
99#define ISC_LEXCOMMENT_C	     0x01
100#define ISC_LEXCOMMENT_CPLUSPLUS     0x02
101#define ISC_LEXCOMMENT_SHELL	     0x04
102#define ISC_LEXCOMMENT_DNSMASTERFILE 0x08
103/*@}*/
104
105/***
106 *** Types
107 ***/
108
109/*! Lex: 256-entry table of special characters (see isc_lex_setspecials()). */
110
111typedef char isc_lexspecials_t[256];
112
113/* Tokens: the kinds of token, as stored in the 'type' member of isc_token_t. */
114
115typedef enum {
116	isc_tokentype_unknown = 0,
117	isc_tokentype_string = 1,
118	isc_tokentype_number = 2,
119	isc_tokentype_qstring = 3,
120	isc_tokentype_eol = 4,
121	isc_tokentype_eof = 5,
122	isc_tokentype_initialws = 6,
123	isc_tokentype_special = 7,
124	isc_tokentype_nomore = 8,
125	isc_tokentype_btext = 9,
126	isc_tokentype_vpair = 10,
127	isc_tokentype_qvpair = 11,
128} isc_tokentype_t;
129
130typedef union { /* Token payload; NOTE(review): which member is valid for which token type is not shown here -- confirm in lex.c. */
131	char		 as_char;
132	unsigned long	 as_ulong;
133	isc_region_t	 as_region;
134	isc_textregion_t as_textregion;
135	void		*as_pointer;
136} isc_tokenvalue_t;
137
138typedef struct isc_token {
139	isc_tokentype_t	 type;  /*%< Kind of token. */
140	isc_tokenvalue_t value; /*%< Value associated with the token. */
141} isc_token_t;
142
143/***
144 *** Functions
145 ***/
146
147isc_result_t
148isc_lex_create(isc_mem_t *mctx, size_t max_token, isc_lex_t **lexp);
149/*%<
150 * Create a lexer.
151 *
152 * 'max_token' is a hint of the number of bytes in the largest token.
153 *
154 * Requires:
155 *\li	'lexp' is a valid pointer and '*lexp' is NULL.
156 *
157 * Ensures:
158 *\li	On success, *lexp is attached to the newly created lexer.
159 *
160 * Returns:
161 *\li	#ISC_R_SUCCESS
162 *\li	#ISC_R_NOMEMORY
163 */
164
165void
166isc_lex_destroy(isc_lex_t **lexp);
167/*%<
168 * Destroy the lexer '*lexp'.
169 *
170 * Requires:
171 *\li	'*lexp' is a valid lexer.
172 *
173 * Ensures:
174 *\li	*lexp == NULL
175 */
176
177unsigned int
178isc_lex_getcomments(isc_lex_t *lex);
179/*%<
180 * Return the current lexer commenting styles.
181 *
182 * Requires:
183 *\li	'lex' is a valid lexer.
184 *
185 * Returns:
186 *\li	The commenting styles (ISC_LEXCOMMENT_* flags) currently allowed.
187 */
188
189void
190isc_lex_setcomments(isc_lex_t *lex, unsigned int comments);
191/*%<
192 * Set allowed lexer commenting styles.
193 *
194 * Requires:
195 *\li	'lex' is a valid lexer.
196 *
197 *\li	'comments' has meaningful values (ISC_LEXCOMMENT_* flags).
198 */
199
200void
201isc_lex_getspecials(isc_lex_t *lex, isc_lexspecials_t specials);
202/*%<
203 * Put the lexer's current list of special characters into 'specials'.
204 *
205 * Requires:
206 *\li	'lex' is a valid lexer.
207 */
208
209void
210isc_lex_setspecials(isc_lex_t *lex, isc_lexspecials_t specials);
211/*%<
212 * The characters in 'specials' are returned as tokens.  Along with
213 * whitespace, they delimit strings and numbers.
214 *
215 * Note:
216 *\li	Comment processing takes precedence over special character
217 *	recognition.
218 *
219 * Requires:
220 *\li	'lex' is a valid lexer.
221 */
222
223isc_result_t
224isc_lex_openfile(isc_lex_t *lex, const char *filename);
225/*%<
226 * Open 'filename' and make it the current input source for 'lex'
227 * (see isc_lex_close()).
228 * Requires:
229 *\li	'lex' is a valid lexer.
230 *
231 *\li	'filename' is a valid C string.
232 *
233 * Returns:
234 *\li	#ISC_R_SUCCESS
235 *\li	#ISC_R_NOMEMORY			Out of memory
236 *\li	#ISC_R_NOTFOUND			File not found
237 *\li	#ISC_R_NOPERM			No permission to open file
238 *\li	#ISC_R_FAILURE			Couldn't open file, not sure why
239 *\li	#ISC_R_UNEXPECTED
240 */
241
242isc_result_t
243isc_lex_openstream(isc_lex_t *lex, FILE *stream);
244/*%<
245 * Make 'stream' the current input source for 'lex' (see isc_lex_close()).
246 *
247 * Requires:
248 *\li	'lex' is a valid lexer.
249 *
250 *\li	'stream' is a valid C stream.
251 *
252 * Returns:
253 *\li	#ISC_R_SUCCESS
254 *\li	#ISC_R_NOMEMORY			Out of memory
255 */
256
257isc_result_t
258isc_lex_openbuffer(isc_lex_t *lex, isc_buffer_t *buffer);
259/*%<
260 * Make 'buffer' the current input source for 'lex' (see isc_lex_close()).
261 *
262 * Requires:
263 *\li	'lex' is a valid lexer.
264 *
265 *\li	'buffer' is a valid buffer.
266 *
267 * Returns:
268 *\li	#ISC_R_SUCCESS
269 *\li	#ISC_R_NOMEMORY			Out of memory
270 */
271
272isc_result_t
273isc_lex_close(isc_lex_t *lex);
274/*%<
275 * Close the most recently opened object (i.e. file, stream or buffer).
276 *
277 * Returns:
278 *\li	#ISC_R_SUCCESS
279 *\li	#ISC_R_NOMORE			No more input sources
280 */
281
282isc_result_t
283isc_lex_gettoken(isc_lex_t *lex, unsigned int options, isc_token_t *tokenp);
284/*%<
285 * Get the next token.
286 *
287 * Requires:
288 *\li	'lex' is a valid lexer.
289 *
290 *\li	'lex' has an input source.
291 *
292 *\li	'options' contains valid options.
293 *
294 *\li	'tokenp' is a valid pointer.
295 *
296 * Returns:
297 *\li	#ISC_R_SUCCESS
298 *\li	#ISC_R_UNEXPECTEDEND
299 *\li	#ISC_R_NOMEMORY
300 *
301 *	The following two results are returned only if their corresponding
302 *	lexer options (ISC_LEXOPT_EOF, ISC_LEXOPT_NOMORE) are not set.
303 *
304 *\li	#ISC_R_EOF			End of input source
305 *\li	#ISC_R_NOMORE			No more input sources
306 */
307
308isc_result_t
309isc_lex_getmastertoken(isc_lex_t *lex, isc_token_t *token,
310		       isc_tokentype_t expect, bool eol);
311/*%<
312 * Get the next token from a DNS master file type stream.  This is a
313 * convenience function that sets appropriate options and handles quoted
314 * strings and end of line correctly for master files.  It also ungets
315 * unexpected tokens.  If `eol` is set then expect end-of-line, otherwise
316 * end-of-line is an error.
317 *
318 * Requires:
319 *\li	'lex' is a valid lexer.
320 *
321 *\li	'token' is a valid pointer
322 *
323 * Returns:
324 *
325 * \li	any return code from isc_lex_gettoken().
326 */
327
328isc_result_t
329isc_lex_getoctaltoken(isc_lex_t *lex, isc_token_t *token, bool eol);
330/*%<
331 * Get the next token from a DNS master file type stream.  This is a
332 * convenience function that sets appropriate options and handles end
333 * of line correctly for master files.  It also ungets unexpected tokens.
334 * If `eol` is set then expect end-of-line, otherwise end-of-line is an error.
335 *
336 * Requires:
337 *\li	'lex' is a valid lexer.
338 *
339 *\li	'token' is a valid pointer
340 *
341 * Returns:
342 *
343 * \li	any return code from isc_lex_gettoken().
344 */
345
346void
347isc_lex_ungettoken(isc_lex_t *lex, isc_token_t *tokenp);
348/*%<
349 * Unget the current token (at most one token may be ungotten at a time).
350 *
351 * Requires:
352 *\li	'lex' is a valid lexer.
353 *
354 *\li	'lex' has an input source.
355 *
356 *\li	'tokenp' points to a valid token.
357 *
358 *\li	There is no ungotten token already.
359 */
360
361void
362isc_lex_getlasttokentext(isc_lex_t *lex, isc_token_t *tokenp, isc_region_t *r);
363/*%<
364 * Sets '*r' to a region containing the text of the last token returned.
365 *
366 * Requires:
367 *\li	'lex' is a valid lexer.
368 *
369 *\li	'lex' has an input source.
370 *
371 *\li	'tokenp' points to a valid token.
372 *
373 *\li	A token has been gotten and not ungotten.
374 */
375
376char *
377isc_lex_getsourcename(isc_lex_t *lex);
378/*%<
379 * Return the input source name.
380 *
381 * Requires:
382 *\li	'lex' is a valid lexer.
383 *
384 * Returns:
385 *\li	Source name or NULL if no current source.
386 *\li	Result valid while current input source exists.
387 */
388
389unsigned long
390isc_lex_getsourceline(isc_lex_t *lex);
391/*%<
392 * Return the input source line number.
393 *
394 * Requires:
395 *\li	'lex' is a valid lexer.
396 *
397 * Returns:
398 *\li	Current line number or 0 if no current source.
399 */
400
401isc_result_t
402isc_lex_setsourcename(isc_lex_t *lex, const char *name);
403/*%<
404 * Assigns a new name to the input source.
405 *
406 * Requires:
407 *
408 *\li	'lex' is a valid lexer.
409 *
410 * Returns:
411 *\li	#ISC_R_SUCCESS
412 *\li	#ISC_R_NOMEMORY
413 *\li	#ISC_R_NOTFOUND - there are no sources.
414 */
415
416isc_result_t
417isc_lex_setsourceline(isc_lex_t *lex, unsigned long line);
418/*%<
419 * Assigns a new line number to the input source. This can be used
420 * when parsing a buffer that's been excerpted from the middle of a file,
421 * allowing logged messages to display the correct line number,
422 * rather than the line number within the buffer.
423 *
424 * Requires:
425 *
426 *\li	'lex' is a valid lexer.
427 *
428 * Returns:
429 *\li	#ISC_R_SUCCESS
430 *\li	#ISC_R_NOTFOUND - there are no sources.
431 */
432
433bool
434isc_lex_isfile(isc_lex_t *lex);
435/*%<
436 * Return whether the current input source is a file.
437 *
438 * Requires:
439 *\li	'lex' is a valid lexer.
440 *
441 * Returns:
442 *\li	#true if the current input is a file,
443 *\li	#false otherwise.
444 */
445
446ISC_LANG_ENDDECLS
447