1/* $NetBSD: lex.h,v 1.7 2024/02/21 22:52:30 christos Exp $ */ 2 3/* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * SPDX-License-Identifier: MPL-2.0 7 * 8 * This Source Code Form is subject to the terms of the Mozilla Public 9 * License, v. 2.0. If a copy of the MPL was not distributed with this 10 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 * 12 * See the COPYRIGHT file distributed with this work for additional 13 * information regarding copyright ownership. 14 */ 15 16#pragma once 17 18/***** 19***** Module Info 20*****/ 21 22/*! \file isc/lex.h 23 * \brief The "lex" module provides a lightweight tokenizer. It can operate 24 * on files or buffers, and can handle "include". It is designed for 25 * parsing of DNS master files and the BIND configuration file, but 26 * should be general enough to tokenize other things, e.g. HTTP. 27 * 28 * \li MP: 29 * No synchronization is provided. Clients must ensure exclusive 30 * access. 31 * 32 * \li Reliability: 33 * No anticipated impact. 34 * 35 * \li Resources: 36 * TBS 37 * 38 * \li Security: 39 * No anticipated impact. 40 * 41 * \li Standards: 42 * None. 43 */ 44 45/*** 46 *** Imports 47 ***/ 48 49#include <stdbool.h> 50#include <stdio.h> 51 52#include <isc/lang.h> 53#include <isc/region.h> 54#include <isc/types.h> 55 56ISC_LANG_BEGINDECLS 57 58/*** 59 *** Options 60 ***/ 61 62/*@{*/ 63/*! 64 * Various options for isc_lex_gettoken(). 65 */ 66 67#define ISC_LEXOPT_EOL 0x0001 /*%< Want end-of-line token. */ 68#define ISC_LEXOPT_EOF 0x0002 /*%< Want end-of-file token. */ 69#define ISC_LEXOPT_INITIALWS 0x0004 /*%< Want initial whitespace. */ 70#define ISC_LEXOPT_NUMBER 0x0008 /*%< Recognize numbers. */ 71#define ISC_LEXOPT_QSTRING 0x0010 /*%< Recognize qstrings. */ 72/*@}*/ 73 74/*@{*/ 75/*! 76 * The ISC_LEXOPT_DNSMULTILINE option handles the processing of '(' and ')' in 77 * the DNS master file format. If this option is set, then the 78 * ISC_LEXOPT_INITIALWS and ISC_LEXOPT_EOL options will be ignored when 79 * the paren count is > 0. To use this option, '(' and ')' must be special 80 * characters. 81 */ 82#define ISC_LEXOPT_DNSMULTILINE 0x0020 /*%< Handle '(' and ')'. */ 83#define ISC_LEXOPT_NOMORE 0x0040 /*%< Want "no more" token. */ 84 85#define ISC_LEXOPT_CNUMBER 0x0080 /*%< Recognize octal and hex. */ 86#define ISC_LEXOPT_ESCAPE 0x0100 /*%< Recognize escapes. */ 87#define ISC_LEXOPT_QSTRINGMULTILINE 0x0200 /*%< Allow multiline "" strings */ 88#define ISC_LEXOPT_OCTAL 0x0400 /*%< Expect a octal number. */ 89#define ISC_LEXOPT_BTEXT 0x0800 /*%< Bracketed text. */ 90#define ISC_LEXOPT_VPAIR 0x1000 /*%< Recognize value pair. */ 91#define ISC_LEXOPT_QVPAIR 0x2000 /*%< Recognize quoted value pair. */ 92/*@}*/ 93/*@{*/ 94/*! 95 * Various commenting styles, which may be changed at any time with 96 * isc_lex_setcomments(). 97 */ 98 99#define ISC_LEXCOMMENT_C 0x01 100#define ISC_LEXCOMMENT_CPLUSPLUS 0x02 101#define ISC_LEXCOMMENT_SHELL 0x04 102#define ISC_LEXCOMMENT_DNSMASTERFILE 0x08 103/*@}*/ 104 105/*** 106 *** Types 107 ***/ 108 109/*! Lex */ 110 111typedef char isc_lexspecials_t[256]; 112 113/* Tokens */ 114 115typedef enum { 116 isc_tokentype_unknown = 0, 117 isc_tokentype_string = 1, 118 isc_tokentype_number = 2, 119 isc_tokentype_qstring = 3, 120 isc_tokentype_eol = 4, 121 isc_tokentype_eof = 5, 122 isc_tokentype_initialws = 6, 123 isc_tokentype_special = 7, 124 isc_tokentype_nomore = 8, 125 isc_tokentype_btext = 9, 126 isc_tokentype_vpair = 10, 127 isc_tokentype_qvpair = 11, 128} isc_tokentype_t; 129 130typedef union { 131 char as_char; 132 unsigned long as_ulong; 133 isc_region_t as_region; 134 isc_textregion_t as_textregion; 135 void *as_pointer; 136} isc_tokenvalue_t; 137 138typedef struct isc_token { 139 isc_tokentype_t type; 140 isc_tokenvalue_t value; 141} isc_token_t; 142 143/*** 144 *** Functions 145 ***/ 146 147isc_result_t 148isc_lex_create(isc_mem_t *mctx, size_t max_token, isc_lex_t **lexp); 149/*%< 150 * Create a lexer. 151 * 152 * 'max_token' is a hint of the number of bytes in the largest token. 153 * 154 * Requires: 155 *\li '*lexp' is a valid lexer. 156 * 157 * Ensures: 158 *\li On success, *lexp is attached to the newly created lexer. 159 * 160 * Returns: 161 *\li #ISC_R_SUCCESS 162 *\li #ISC_R_NOMEMORY 163 */ 164 165void 166isc_lex_destroy(isc_lex_t **lexp); 167/*%< 168 * Destroy the lexer. 169 * 170 * Requires: 171 *\li '*lexp' is a valid lexer. 172 * 173 * Ensures: 174 *\li *lexp == NULL 175 */ 176 177unsigned int 178isc_lex_getcomments(isc_lex_t *lex); 179/*%< 180 * Return the current lexer commenting styles. 181 * 182 * Requires: 183 *\li 'lex' is a valid lexer. 184 * 185 * Returns: 186 *\li The commenting styles which are currently allowed. 187 */ 188 189void 190isc_lex_setcomments(isc_lex_t *lex, unsigned int comments); 191/*%< 192 * Set allowed lexer commenting styles. 193 * 194 * Requires: 195 *\li 'lex' is a valid lexer. 196 * 197 *\li 'comments' has meaningful values. 198 */ 199 200void 201isc_lex_getspecials(isc_lex_t *lex, isc_lexspecials_t specials); 202/*%< 203 * Put the current list of specials into 'specials'. 204 * 205 * Requires: 206 *\li 'lex' is a valid lexer. 207 */ 208 209void 210isc_lex_setspecials(isc_lex_t *lex, isc_lexspecials_t specials); 211/*!< 212 * The characters in 'specials' are returned as tokens. Along with 213 * whitespace, they delimit strings and numbers. 214 * 215 * Note: 216 *\li Comment processing takes precedence over special character 217 * recognition. 218 * 219 * Requires: 220 *\li 'lex' is a valid lexer. 221 */ 222 223isc_result_t 224isc_lex_openfile(isc_lex_t *lex, const char *filename); 225/*%< 226 * Open 'filename' and make it the current input source for 'lex'. 227 * 228 * Requires: 229 *\li 'lex' is a valid lexer. 230 * 231 *\li filename is a valid C string. 232 * 233 * Returns: 234 *\li #ISC_R_SUCCESS 235 *\li #ISC_R_NOMEMORY Out of memory 236 *\li #ISC_R_NOTFOUND File not found 237 *\li #ISC_R_NOPERM No permission to open file 238 *\li #ISC_R_FAILURE Couldn't open file, not sure why 239 *\li #ISC_R_UNEXPECTED 240 */ 241 242isc_result_t 243isc_lex_openstream(isc_lex_t *lex, FILE *stream); 244/*%< 245 * Make 'stream' the current input source for 'lex'. 246 * 247 * Requires: 248 *\li 'lex' is a valid lexer. 249 * 250 *\li 'stream' is a valid C stream. 251 * 252 * Returns: 253 *\li #ISC_R_SUCCESS 254 *\li #ISC_R_NOMEMORY Out of memory 255 */ 256 257isc_result_t 258isc_lex_openbuffer(isc_lex_t *lex, isc_buffer_t *buffer); 259/*%< 260 * Make 'buffer' the current input source for 'lex'. 261 * 262 * Requires: 263 *\li 'lex' is a valid lexer. 264 * 265 *\li 'buffer' is a valid buffer. 266 * 267 * Returns: 268 *\li #ISC_R_SUCCESS 269 *\li #ISC_R_NOMEMORY Out of memory 270 */ 271 272isc_result_t 273isc_lex_close(isc_lex_t *lex); 274/*%< 275 * Close the most recently opened object (i.e. file or buffer). 276 * 277 * Returns: 278 *\li #ISC_R_SUCCESS 279 *\li #ISC_R_NOMORE No more input sources 280 */ 281 282isc_result_t 283isc_lex_gettoken(isc_lex_t *lex, unsigned int options, isc_token_t *tokenp); 284/*%< 285 * Get the next token. 286 * 287 * Requires: 288 *\li 'lex' is a valid lexer. 289 * 290 *\li 'lex' has an input source. 291 * 292 *\li 'options' contains valid options. 293 * 294 *\li '*tokenp' is a valid pointer. 295 * 296 * Returns: 297 *\li #ISC_R_SUCCESS 298 *\li #ISC_R_UNEXPECTEDEND 299 *\li #ISC_R_NOMEMORY 300 * 301 * These two results are returned only if their corresponding lexer 302 * options are not set. 303 * 304 *\li #ISC_R_EOF End of input source 305 *\li #ISC_R_NOMORE No more input sources 306 */ 307 308isc_result_t 309isc_lex_getmastertoken(isc_lex_t *lex, isc_token_t *token, 310 isc_tokentype_t expect, bool eol); 311/*%< 312 * Get the next token from a DNS master file type stream. This is a 313 * convenience function that sets appropriate options and handles quoted 314 * strings and end of line correctly for master files. It also ungets 315 * unexpected tokens. If `eol` is set then expect end-of-line otherwise 316 * eol is a error. 317 * 318 * Requires: 319 *\li 'lex' is a valid lexer. 320 * 321 *\li 'token' is a valid pointer 322 * 323 * Returns: 324 * 325 * \li any return code from isc_lex_gettoken(). 326 */ 327 328isc_result_t 329isc_lex_getoctaltoken(isc_lex_t *lex, isc_token_t *token, bool eol); 330/*%< 331 * Get the next token from a DNS master file type stream. This is a 332 * convenience function that sets appropriate options and handles end 333 * of line correctly for master files. It also ungets unexpected tokens. 334 * If `eol` is set then expect end-of-line otherwise eol is a error. 335 * 336 * Requires: 337 *\li 'lex' is a valid lexer. 338 * 339 *\li 'token' is a valid pointer 340 * 341 * Returns: 342 * 343 * \li any return code from isc_lex_gettoken(). 344 */ 345 346void 347isc_lex_ungettoken(isc_lex_t *lex, isc_token_t *tokenp); 348/*%< 349 * Unget the current token. 350 * 351 * Requires: 352 *\li 'lex' is a valid lexer. 353 * 354 *\li 'lex' has an input source. 355 * 356 *\li 'tokenp' points to a valid token. 357 * 358 *\li There is no ungotten token already. 359 */ 360 361void 362isc_lex_getlasttokentext(isc_lex_t *lex, isc_token_t *tokenp, isc_region_t *r); 363/*%< 364 * Returns a region containing the text of the last token returned. 365 * 366 * Requires: 367 *\li 'lex' is a valid lexer. 368 * 369 *\li 'lex' has an input source. 370 * 371 *\li 'tokenp' points to a valid token. 372 * 373 *\li A token has been gotten and not ungotten. 374 */ 375 376char * 377isc_lex_getsourcename(isc_lex_t *lex); 378/*%< 379 * Return the input source name. 380 * 381 * Requires: 382 *\li 'lex' is a valid lexer. 383 * 384 * Returns: 385 * \li source name or NULL if no current source. 386 *\li result valid while current input source exists. 387 */ 388 389unsigned long 390isc_lex_getsourceline(isc_lex_t *lex); 391/*%< 392 * Return the input source line number. 393 * 394 * Requires: 395 *\li 'lex' is a valid lexer. 396 * 397 * Returns: 398 *\li Current line number or 0 if no current source. 399 */ 400 401isc_result_t 402isc_lex_setsourcename(isc_lex_t *lex, const char *name); 403/*%< 404 * Assigns a new name to the input source. 405 * 406 * Requires: 407 * 408 * \li 'lex' is a valid lexer. 409 * 410 * Returns: 411 * \li #ISC_R_SUCCESS 412 * \li #ISC_R_NOMEMORY 413 * \li #ISC_R_NOTFOUND - there are no sources. 414 */ 415 416isc_result_t 417isc_lex_setsourceline(isc_lex_t *lex, unsigned long line); 418/*%< 419 * Assigns a new line number to the input source. This can be used 420 * when parsing a buffer that's been excerpted from the middle a file, 421 * allowing logged messages to display the correct line number, 422 * rather than the line number within the buffer. 423 * 424 * Requires: 425 * 426 * \li 'lex' is a valid lexer. 427 * 428 * Returns: 429 * \li #ISC_R_SUCCESS 430 * \li #ISC_R_NOTFOUND - there are no sources. 431 */ 432 433bool 434isc_lex_isfile(isc_lex_t *lex); 435/*%< 436 * Return whether the current input source is a file. 437 * 438 * Requires: 439 *\li 'lex' is a valid lexer. 440 * 441 * Returns: 442 * \li #true if the current input is a file, 443 *\li #false otherwise. 444 */ 445 446ISC_LANG_ENDDECLS 447