1# SPDX-License-Identifier: BSD-3-Clause 2 3# ----------------------------------------------------------------------------- 4# ply: lex.py 5# 6# Copyright (C) 2001-2009, 7# David M. Beazley (Dabeaz LLC) 8# All rights reserved. 9# 10# Redistribution and use in source and binary forms, with or without 11# modification, are permitted provided that the following conditions are 12# met: 13# 14# * Redistributions of source code must retain the above copyright notice, 15# this list of conditions and the following disclaimer. 16# * Redistributions in binary form must reproduce the above copyright notice, 17# this list of conditions and the following disclaimer in the documentation 18# and/or other materials provided with the distribution. 19# * Neither the name of the David Beazley or Dabeaz LLC may be used to 20# endorse or promote products derived from this software without 21# specific prior written permission. 22# 23# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 27# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 28# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 29# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 30# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 31# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 32# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 33# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34# ----------------------------------------------------------------------------- 35 36__version__ = "3.2" 37__tabversion__ = "3.2" # Version of table file used 38 39from past.builtins import cmp 40import re 41import sys 42import types 43import copy 44import os 45 46# Python3 doesn't have a build-in cmp function. 47# We need to import it here, even though it isn't called in this file 48# when interpreted by python3, to prevent pylint from treating it as 49# an error. 50 51# This tuple contains known string types 52try: 53 # Python 2.6 54 StringTypes = (types.StringType, types.UnicodeType) 55except AttributeError: 56 # Python 3.0 57 StringTypes = (str, bytes) 58 59# Extract the code attribute of a function. Different implementations 60# are for Python 2/3 compatibility. 61 62if sys.version_info[0] < 3: 63 def func_code(f): 64 return f.func_code 65else: 66 def func_code(f): 67 return f.__code__ 68 69# This regular expression is used to match valid token names 70_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$') 71 72# Exception thrown when invalid token encountered and no default error 73# handler is defined. 74 75 76class LexError(Exception): 77 def __init__(self, message, s): 78 self.args = (message,) 79 self.text = s 80 81# Token class. This class is used to represent the tokens produced. 82 83 84class LexToken(object): 85 def __str__(self): 86 return "LexToken(%s,%r,%d,%d)" % (self.type, self.value, self.lineno, self.lexpos) 87 88 def __repr__(self): 89 return str(self) 90 91# This object is a stand-in for a logging object created by the 92# logging module. 93 94 95class PlyLogger(object): 96 def __init__(self, f): 97 self.f = f 98 99 def critical(self, msg, *args, **kwargs): 100 self.f.write((msg % args) + "\n") 101 102 def warning(self, msg, *args, **kwargs): 103 self.f.write("WARNING: " + (msg % args) + "\n") 104 105 def error(self, msg, *args, **kwargs): 106 self.f.write("ERROR: " + (msg % args) + "\n") 107 108 info = critical 109 debug = critical 110 111# Null logger is used when no output is generated. Does nothing. 112 113 114class NullLogger(object): 115 def __getattribute__(self, name): 116 return self 117 118 def __call__(self, *args, **kwargs): 119 return self 120 121# ----------------------------------------------------------------------------- 122# === Lexing Engine === 123# 124# The following Lexer class implements the lexer runtime. There are only 125# a few public methods and attributes: 126# 127# input() - Store a new string in the lexer 128# token() - Get the next token 129# clone() - Clone the lexer 130# 131# lineno - Current line number 132# lexpos - Current position in the input string 133# ----------------------------------------------------------------------------- 134 135 136class Lexer: 137 def __init__(self): 138 self.lexre = None # Master regular expression. This is a list of 139 # tuples (re,findex) where re is a compiled 140 # regular expression and findex is a list 141 # mapping regex group numbers to rules 142 self.lexretext = None # Current regular expression strings 143 self.lexstatere = {} # Dictionary mapping lexer states to master regexs 144 self.lexstateretext = {} # Dictionary mapping lexer states to regex strings 145 self.lexstaterenames = {} # Dictionary mapping lexer states to symbol names 146 self.lexstate = "INITIAL" # Current lexer state 147 self.lexstatestack = [] # Stack of lexer states 148 self.lexstateinfo = None # State information 149 self.lexstateignore = {} # Dictionary of ignored characters for each state 150 self.lexstateerrorf = {} # Dictionary of error functions for each state 151 self.lexreflags = 0 # Optional re compile flags 152 self.lexdata = None # Actual input data (as a string) 153 self.lexpos = 0 # Current position in input text 154 self.lexlen = 0 # Length of the input text 155 self.lexerrorf = None # Error rule (if any) 156 self.lextokens = None # List of valid tokens 157 self.lexignore = "" # Ignored characters 158 self.lexliterals = "" # Literal characters that can be passed through 159 self.lexmodule = None # Module 160 self.lineno = 1 # Current line number 161 self.lexoptimize = 0 # Optimized mode 162 163 def clone(self, object=None): 164 c = copy.copy(self) 165 166 # If the object parameter has been supplied, it means we are attaching the 167 # lexer to a new object. In this case, we have to rebind all methods in 168 # the lexstatere and lexstateerrorf tables. 169 170 if object: 171 newtab = {} 172 for key, ritem in self.lexstatere.items(): 173 newre = [] 174 for cre, findex in ritem: 175 newfindex = [] 176 for f in findex: 177 if not f or not f[0]: 178 newfindex.append(f) 179 continue 180 newfindex.append((getattr(object, f[0].__name__), f[1])) 181 newre.append((cre, newfindex)) 182 newtab[key] = newre 183 c.lexstatere = newtab 184 c.lexstateerrorf = {} 185 for key, ef in self.lexstateerrorf.items(): 186 c.lexstateerrorf[key] = getattr(object, ef.__name__) 187 c.lexmodule = object 188 return c 189 190 # ------------------------------------------------------------ 191 # writetab() - Write lexer information to a table file 192 # ------------------------------------------------------------ 193 def writetab(self, tabfile, outputdir=""): 194 if isinstance(tabfile, types.ModuleType): 195 return 196 basetabfilename = tabfile.split(".")[-1] 197 filename = os.path.join(outputdir, basetabfilename)+".py" 198 tf = open(filename, "w") 199 tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % 200 (tabfile, __version__)) 201 tf.write("_tabversion = %s\n" % repr(__version__)) 202 tf.write("_lextokens = %s\n" % repr(self.lextokens)) 203 tf.write("_lexreflags = %s\n" % repr(self.lexreflags)) 204 tf.write("_lexliterals = %s\n" % repr(self.lexliterals)) 205 tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo)) 206 207 tabre = {} 208 # Collect all functions in the initial state 209 initial = self.lexstatere["INITIAL"] 210 initialfuncs = [] 211 for part in initial: 212 for f in part[1]: 213 if f and f[0]: 214 initialfuncs.append(f) 215 216 for key, lre in self.lexstatere.items(): 217 titem = [] 218 for i in range(len(lre)): 219 titem.append((self.lexstateretext[key][i], _funcs_to_names( 220 lre[i][1], self.lexstaterenames[key][i]))) 221 tabre[key] = titem 222 223 tf.write("_lexstatere = %s\n" % repr(tabre)) 224 tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore)) 225 226 taberr = {} 227 for key, ef in self.lexstateerrorf.items(): 228 if ef: 229 taberr[key] = ef.__name__ 230 else: 231 taberr[key] = None 232 tf.write("_lexstateerrorf = %s\n" % repr(taberr)) 233 tf.close() 234 235 # ------------------------------------------------------------ 236 # readtab() - Read lexer information from a tab file 237 # ------------------------------------------------------------ 238 def readtab(self, tabfile, fdict): 239 if isinstance(tabfile, types.ModuleType): 240 lextab = tabfile 241 else: 242 if sys.version_info[0] < 3: 243 exec("import %s as lextab" % tabfile) 244 else: 245 env = {} 246 exec("import %s as lextab" % tabfile, env, env) 247 lextab = env['lextab'] 248 249 if getattr(lextab, "_tabversion", "0.0") != __version__: 250 raise ImportError("Inconsistent PLY version") 251 252 self.lextokens = lextab._lextokens 253 self.lexreflags = lextab._lexreflags 254 self.lexliterals = lextab._lexliterals 255 self.lexstateinfo = lextab._lexstateinfo 256 self.lexstateignore = lextab._lexstateignore 257 self.lexstatere = {} 258 self.lexstateretext = {} 259 for key, lre in lextab._lexstatere.items(): 260 titem = [] 261 txtitem = [] 262 for i in range(len(lre)): 263 titem.append((re.compile(lre[i][0], lextab._lexreflags), 264 _names_to_funcs(lre[i][1], fdict))) 265 txtitem.append(lre[i][0]) 266 self.lexstatere[key] = titem 267 self.lexstateretext[key] = txtitem 268 self.lexstateerrorf = {} 269 for key, ef in lextab._lexstateerrorf.items(): 270 self.lexstateerrorf[key] = fdict[ef] 271 self.begin('INITIAL') 272 273 # ------------------------------------------------------------ 274 # input() - Push a new string into the lexer 275 # ------------------------------------------------------------ 276 def input(self, s): 277 # Pull off the first character to see if s looks like a string 278 c = s[:1] 279 if not isinstance(c, StringTypes): 280 raise ValueError("Expected a string") 281 self.lexdata = s 282 self.lexpos = 0 283 self.lexlen = len(s) 284 285 # ------------------------------------------------------------ 286 # begin() - Changes the lexing state 287 # ------------------------------------------------------------ 288 def begin(self, state): 289 if not state in self.lexstatere: 290 raise ValueError("Undefined state") 291 self.lexre = self.lexstatere[state] 292 self.lexretext = self.lexstateretext[state] 293 self.lexignore = self.lexstateignore.get(state, "") 294 self.lexerrorf = self.lexstateerrorf.get(state, None) 295 self.lexstate = state 296 297 # ------------------------------------------------------------ 298 # push_state() - Changes the lexing state and saves old on stack 299 # ------------------------------------------------------------ 300 def push_state(self, state): 301 self.lexstatestack.append(self.lexstate) 302 self.begin(state) 303 304 # ------------------------------------------------------------ 305 # pop_state() - Restores the previous state 306 # ------------------------------------------------------------ 307 def pop_state(self): 308 self.begin(self.lexstatestack.pop()) 309 310 # ------------------------------------------------------------ 311 # current_state() - Returns the current lexing state 312 # ------------------------------------------------------------ 313 def current_state(self): 314 return self.lexstate 315 316 # ------------------------------------------------------------ 317 # skip() - Skip ahead n characters 318 # ------------------------------------------------------------ 319 def skip(self, n): 320 self.lexpos += n 321 322 # ------------------------------------------------------------ 323 # opttoken() - Return the next token from the Lexer 324 # 325 # Note: This function has been carefully implemented to be as fast 326 # as possible. Don't make changes unless you really know what 327 # you are doing 328 # ------------------------------------------------------------ 329 def token(self): 330 # Make local copies of frequently referenced attributes 331 lexpos = self.lexpos 332 lexlen = self.lexlen 333 lexignore = self.lexignore 334 lexdata = self.lexdata 335 336 while lexpos < lexlen: 337 # This code provides some short-circuit code for whitespace, tabs, and other ignored characters 338 if lexdata[lexpos] in lexignore: 339 lexpos += 1 340 continue 341 342 # Look for a regular expression match 343 for lexre, lexindexfunc in self.lexre: 344 m = lexre.match(lexdata, lexpos) 345 if not m: 346 continue 347 348 # Create a token for return 349 tok = LexToken() 350 tok.value = m.group() 351 tok.lineno = self.lineno 352 tok.lexpos = lexpos 353 354 i = m.lastindex 355 func, tok.type = lexindexfunc[i] 356 357 if not func: 358 # If no token type was set, it's an ignored token 359 if tok.type: 360 self.lexpos = m.end() 361 return tok 362 else: 363 lexpos = m.end() 364 break 365 366 lexpos = m.end() 367 368 # If token is processed by a function, call it 369 370 tok.lexer = self # Set additional attributes useful in token rules 371 self.lexmatch = m 372 self.lexpos = lexpos 373 374 newtok = func(tok) 375 376 # Every function must return a token, if nothing, we just move to next token 377 if not newtok: 378 lexpos = self.lexpos # This is here in case user has updated lexpos. 379 lexignore = self.lexignore # This is here in case there was a state change 380 break 381 382 # Verify type of the token. If not in the token map, raise an error 383 if not self.lexoptimize: 384 if not newtok.type in self.lextokens: 385 raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( 386 func_code(func).co_filename, func_code(func).co_firstlineno, 387 func.__name__, newtok.type), lexdata[lexpos:]) 388 389 return newtok 390 else: 391 # No match, see if in literals 392 if lexdata[lexpos] in self.lexliterals: 393 tok = LexToken() 394 tok.value = lexdata[lexpos] 395 tok.lineno = self.lineno 396 tok.type = tok.value 397 tok.lexpos = lexpos 398 self.lexpos = lexpos + 1 399 return tok 400 401 # No match. Call t_error() if defined. 402 if self.lexerrorf: 403 tok = LexToken() 404 tok.value = self.lexdata[lexpos:] 405 tok.lineno = self.lineno 406 tok.type = "error" 407 tok.lexer = self 408 tok.lexpos = lexpos 409 self.lexpos = lexpos 410 newtok = self.lexerrorf(tok) 411 if lexpos == self.lexpos: 412 # Error method didn't change text position at all. This is an error. 413 raise LexError("Scanning error. Illegal character '%s'" % 414 (lexdata[lexpos]), lexdata[lexpos:]) 415 lexpos = self.lexpos 416 if not newtok: 417 continue 418 return newtok 419 420 self.lexpos = lexpos 421 raise LexError("Illegal character '%s' at index %d" % 422 (lexdata[lexpos], lexpos), lexdata[lexpos:]) 423 424 self.lexpos = lexpos + 1 425 if self.lexdata is None: 426 raise RuntimeError("No input string given with input()") 427 return None 428 429 # Iterator interface 430 def __iter__(self): 431 return self 432 433 def next(self): 434 t = self.token() 435 if t is None: 436 raise StopIteration 437 return t 438 439 __next__ = next 440 441# ----------------------------------------------------------------------------- 442# ==== Lex Builder === 443# 444# The functions and classes below are used to collect lexing information 445# and build a Lexer object from it. 446# ----------------------------------------------------------------------------- 447 448# ----------------------------------------------------------------------------- 449# get_caller_module_dict() 450# 451# This function returns a dictionary containing all of the symbols defined within 452# a caller further down the call stack. This is used to get the environment 453# associated with the yacc() call if none was provided. 454# ----------------------------------------------------------------------------- 455 456 457def get_caller_module_dict(levels): 458 try: 459 raise RuntimeError 460 except RuntimeError: 461 e, b, t = sys.exc_info() 462 f = t.tb_frame 463 while levels > 0: 464 f = f.f_back 465 levels -= 1 466 ldict = f.f_globals.copy() 467 if f.f_globals != f.f_locals: 468 ldict.update(f.f_locals) 469 470 return ldict 471 472# ----------------------------------------------------------------------------- 473# _funcs_to_names() 474# 475# Given a list of regular expression functions, this converts it to a list 476# suitable for output to a table file 477# ----------------------------------------------------------------------------- 478 479 480def _funcs_to_names(funclist, namelist): 481 result = [] 482 for f, name in zip(funclist, namelist): 483 if f and f[0]: 484 result.append((name, f[1])) 485 else: 486 result.append(f) 487 return result 488 489# ----------------------------------------------------------------------------- 490# _names_to_funcs() 491# 492# Given a list of regular expression function names, this converts it back to 493# functions. 494# ----------------------------------------------------------------------------- 495 496 497def _names_to_funcs(namelist, fdict): 498 result = [] 499 for n in namelist: 500 if n and n[0]: 501 result.append((fdict[n[0]], n[1])) 502 else: 503 result.append(n) 504 return result 505 506# ----------------------------------------------------------------------------- 507# _form_master_re() 508# 509# This function takes a list of all of the regex components and attempts to 510# form the master regular expression. Given limitations in the Python re 511# module, it may be necessary to break the master regex into separate expressions. 512# ----------------------------------------------------------------------------- 513 514 515def _form_master_re(relist, reflags, ldict, toknames): 516 if not relist: 517 return [] 518 regex = "|".join(relist) 519 try: 520 lexre = re.compile(regex, re.VERBOSE | reflags) 521 522 # Build the index to function map for the matching engine 523 lexindexfunc = [None] * (max(lexre.groupindex.values())+1) 524 lexindexnames = lexindexfunc[:] 525 526 for f, i in lexre.groupindex.items(): 527 handle = ldict.get(f, None) 528 if type(handle) in (types.FunctionType, types.MethodType): 529 lexindexfunc[i] = (handle, toknames[f]) 530 lexindexnames[i] = f 531 elif handle is not None: 532 lexindexnames[i] = f 533 if f.find("ignore_") > 0: 534 lexindexfunc[i] = (None, None) 535 else: 536 lexindexfunc[i] = (None, toknames[f]) 537 538 return [(lexre, lexindexfunc)], [regex], [lexindexnames] 539 except Exception: 540 m = int(len(relist)/2) 541 if m == 0: 542 m = 1 543 llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames) 544 rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames) 545 return llist+rlist, lre+rre, lnames+rnames 546 547# ----------------------------------------------------------------------------- 548# def _statetoken(s,names) 549# 550# Given a declaration name s of the form "t_" and a dictionary whose keys are 551# state names, this function returns a tuple (states,tokenname) where states 552# is a tuple of state names and tokenname is the name of the token. For example, 553# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM') 554# ----------------------------------------------------------------------------- 555 556 557def _statetoken(s, names): 558 nonstate = 1 559 parts = s.split("_") 560 for i in range(1, len(parts)): 561 if not parts[i] in names and parts[i] != 'ANY': 562 break 563 if i > 1: 564 states = tuple(parts[1:i]) 565 else: 566 states = ('INITIAL',) 567 568 if 'ANY' in states: 569 states = tuple(names) 570 571 tokenname = "_".join(parts[i:]) 572 return (states, tokenname) 573 574 575# ----------------------------------------------------------------------------- 576# LexerReflect() 577# 578# This class represents information needed to build a lexer as extracted from a 579# user's input file. 580# ----------------------------------------------------------------------------- 581class LexerReflect(object): 582 def __init__(self, ldict, log=None, reflags=0): 583 self.ldict = ldict 584 self.error_func = None 585 self.tokens = [] 586 self.reflags = reflags 587 self.stateinfo = {'INITIAL': 'inclusive'} 588 self.files = {} 589 self.error = 0 590 591 if log is None: 592 self.log = PlyLogger(sys.stderr) 593 else: 594 self.log = log 595 596 # Get all of the basic information 597 def get_all(self): 598 self.get_tokens() 599 self.get_literals() 600 self.get_states() 601 self.get_rules() 602 603 # Validate all of the information 604 def validate_all(self): 605 self.validate_tokens() 606 self.validate_literals() 607 self.validate_rules() 608 return self.error 609 610 # Get the tokens map 611 def get_tokens(self): 612 tokens = self.ldict.get("tokens", None) 613 if not tokens: 614 self.log.error("No token list is defined") 615 self.error = 1 616 return 617 618 if not isinstance(tokens, (list, tuple)): 619 self.log.error("tokens must be a list or tuple") 620 self.error = 1 621 return 622 623 if not tokens: 624 self.log.error("tokens is empty") 625 self.error = 1 626 return 627 628 self.tokens = tokens 629 630 # Validate the tokens 631 def validate_tokens(self): 632 terminals = {} 633 for n in self.tokens: 634 if not _is_identifier.match(n): 635 self.log.error("Bad token name '%s'", n) 636 self.error = 1 637 if n in terminals: 638 self.log.warning("Token '%s' multiply defined", n) 639 terminals[n] = 1 640 641 # Get the literals specifier 642 def get_literals(self): 643 self.literals = self.ldict.get("literals", "") 644 645 # Validate literals 646 def validate_literals(self): 647 try: 648 for c in self.literals: 649 if not isinstance(c, StringTypes) or len(c) > 1: 650 self.log.error("Invalid literal %s. Must be a single character", repr(c)) 651 self.error = 1 652 continue 653 654 except TypeError: 655 self.log.error( 656 "Invalid literals specification. literals must be a sequence of characters") 657 self.error = 1 658 659 def get_states(self): 660 self.states = self.ldict.get("states", None) 661 # Build statemap 662 if self.states: 663 if not isinstance(self.states, (tuple, list)): 664 self.log.error("states must be defined as a tuple or list") 665 self.error = 1 666 else: 667 for s in self.states: 668 if not isinstance(s, tuple) or len(s) != 2: 669 self.log.error( 670 "Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')", repr(s)) 671 self.error = 1 672 continue 673 name, statetype = s 674 if not isinstance(name, StringTypes): 675 self.log.error("State name %s must be a string", repr(name)) 676 self.error = 1 677 continue 678 if not (statetype == 'inclusive' or statetype == 'exclusive'): 679 self.log.error( 680 "State type for state %s must be 'inclusive' or 'exclusive'", name) 681 self.error = 1 682 continue 683 if name in self.stateinfo: 684 self.log.error("State '%s' already defined", name) 685 self.error = 1 686 continue 687 self.stateinfo[name] = statetype 688 689 # Get all of the symbols with a t_ prefix and sort them into various 690 # categories (functions, strings, error functions, and ignore characters) 691 692 def get_rules(self): 693 tsymbols = [f for f in self.ldict if f[:2] == 't_'] 694 695 # Now build up a list of functions and a list of strings 696 697 self.toknames = {} # Mapping of symbols to token names 698 self.funcsym = {} # Symbols defined as functions 699 self.strsym = {} # Symbols defined as strings 700 self.ignore = {} # Ignore strings by state 701 self.errorf = {} # Error functions by state 702 703 for s in self.stateinfo: 704 self.funcsym[s] = [] 705 self.strsym[s] = [] 706 707 if len(tsymbols) == 0: 708 self.log.error("No rules of the form t_rulename are defined") 709 self.error = 1 710 return 711 712 for f in tsymbols: 713 t = self.ldict[f] 714 states, tokname = _statetoken(f, self.stateinfo) 715 self.toknames[f] = tokname 716 717 if hasattr(t, "__call__"): 718 if tokname == 'error': 719 for s in states: 720 self.errorf[s] = t 721 elif tokname == 'ignore': 722 line = func_code(t).co_firstlineno 723 file = func_code(t).co_filename 724 self.log.error("%s:%d: Rule '%s' must be defined as a string", 725 file, line, t.__name__) 726 self.error = 1 727 else: 728 for s in states: 729 self.funcsym[s].append((f, t)) 730 elif isinstance(t, StringTypes): 731 if tokname == 'ignore': 732 for s in states: 733 self.ignore[s] = t 734 if "\\" in t: 735 self.log.warning("%s contains a literal backslash '\\'", f) 736 737 elif tokname == 'error': 738 self.log.error("Rule '%s' must be defined as a function", f) 739 self.error = 1 740 else: 741 for s in states: 742 self.strsym[s].append((f, t)) 743 else: 744 self.log.error("%s not defined as a function or string", f) 745 self.error = 1 746 747 # Sort the functions by line number 748 for f in self.funcsym.values(): 749 if sys.version_info[0] < 3: 750 f.sort(lambda x, y: cmp( 751 func_code(x[1]).co_firstlineno, func_code(y[1]).co_firstlineno)) 752 else: 753 # Python 3.0 754 f.sort(key=lambda x: func_code(x[1]).co_firstlineno) 755 756 # Sort the strings by regular expression length 757 for s in self.strsym.values(): 758 if sys.version_info[0] < 3: 759 s.sort(lambda x, y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1]))) 760 else: 761 # Python 3.0 762 s.sort(key=lambda x: len(x[1]), reverse=True) 763 764 # Validate all of the t_rules collected 765 def validate_rules(self): 766 for state in self.stateinfo: 767 # Validate all rules defined by functions 768 769 for fname, f in self.funcsym[state]: 770 line = func_code(f).co_firstlineno 771 file = func_code(f).co_filename 772 self.files[file] = 1 773 774 tokname = self.toknames[fname] 775 if isinstance(f, types.MethodType): 776 reqargs = 2 777 else: 778 reqargs = 1 779 nargs = func_code(f).co_argcount 780 if nargs > reqargs: 781 self.log.error("%s:%d: Rule '%s' has too many arguments", 782 file, line, f.__name__) 783 self.error = 1 784 continue 785 786 if nargs < reqargs: 787 self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__) 788 self.error = 1 789 continue 790 791 if not f.__doc__: 792 self.log.error("%s:%d: No regular expression defined for rule '%s'", 793 file, line, f.__name__) 794 self.error = 1 795 continue 796 797 try: 798 c = re.compile("(?P<%s>%s)" % (fname, f.__doc__), re.VERBOSE | self.reflags) 799 if c.match(""): 800 self.log.error( 801 "%s:%d: Regular expression for rule '%s' matches empty string", file, line, f.__name__) 802 self.error = 1 803 except re.error: 804 _etype, e, _etrace = sys.exc_info() 805 self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", 806 file, line, f.__name__, e) 807 if '#' in f.__doc__: 808 self.log.error( 809 "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__) 810 self.error = 1 811 812 # Validate all rules defined by strings 813 for name, r in self.strsym[state]: 814 tokname = self.toknames[name] 815 if tokname == 'error': 816 self.log.error("Rule '%s' must be defined as a function", name) 817 self.error = 1 818 continue 819 820 if not tokname in self.tokens and tokname.find("ignore_") < 0: 821 self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname) 822 self.error = 1 823 continue 824 825 try: 826 c = re.compile("(?P<%s>%s)" % (name, r), re.VERBOSE | self.reflags) 827 if (c.match("")): 828 self.log.error( 829 "Regular expression for rule '%s' matches empty string", name) 830 self.error = 1 831 except re.error: 832 _etype, e, _etrace = sys.exc_info() 833 self.log.error("Invalid regular expression for rule '%s'. %s", name, e) 834 if '#' in r: 835 self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name) 836 self.error = 1 837 838 if not self.funcsym[state] and not self.strsym[state]: 839 self.log.error("No rules defined for state '%s'", state) 840 self.error = 1 841 842 # Validate the error function 843 efunc = self.errorf.get(state, None) 844 if efunc: 845 f = efunc 846 line = func_code(f).co_firstlineno 847 file = func_code(f).co_filename 848 self.files[file] = 1 849 850 if isinstance(f, types.MethodType): 851 reqargs = 2 852 else: 853 reqargs = 1 854 nargs = func_code(f).co_argcount 855 if nargs > reqargs: 856 self.log.error("%s:%d: Rule '%s' has too many arguments", 857 file, line, f.__name__) 858 self.error = 1 859 860 if nargs < reqargs: 861 self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__) 862 self.error = 1 863 864 for f in self.files: 865 self.validate_file(f) 866 867 # ----------------------------------------------------------------------------- 868 # validate_file() 869 # 870 # This checks to see if there are duplicated t_rulename() functions or strings 871 # in the parser input file. This is done using a simple regular expression 872 # match on each line in the given file. 873 # ----------------------------------------------------------------------------- 874 875 def validate_file(self, filename): 876 import os.path 877 base, ext = os.path.splitext(filename) 878 if ext != '.py': 879 return # No idea what the file is. Return OK 880 881 try: 882 f = open(filename) 883 lines = f.readlines() 884 f.close() 885 except IOError: 886 return # Couldn't find the file. Don't worry about it 887 888 fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(') 889 sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=') 890 891 counthash = {} 892 linen = 1 893 for l in lines: 894 m = fre.match(l) 895 if not m: 896 m = sre.match(l) 897 if m: 898 name = m.group(1) 899 prev = counthash.get(name) 900 if not prev: 901 counthash[name] = linen 902 else: 903 self.log.error( 904 "%s:%d: Rule %s redefined. Previously defined on line %d", filename, linen, name, prev) 905 self.error = 1 906 linen += 1 907 908# ----------------------------------------------------------------------------- 909# lex(module) 910# 911# Build all of the regular expression rules from definitions in the supplied module 912# ----------------------------------------------------------------------------- 913 914 915def lex(module=None, object=None, debug=0, optimize=0, lextab="lextab", reflags=0, nowarn=0, outputdir="", debuglog=None, errorlog=None): 916 global lexer 917 ldict = None 918 stateinfo = {'INITIAL': 'inclusive'} 919 lexobj = Lexer() 920 lexobj.lexoptimize = optimize 921 global token, input 922 923 if errorlog is None: 924 errorlog = PlyLogger(sys.stderr) 925 926 if debug: 927 if debuglog is None: 928 debuglog = PlyLogger(sys.stderr) 929 930 # Get the module dictionary used for the lexer 931 if object: 932 module = object 933 934 if module: 935 _items = [(k, getattr(module, k)) for k in dir(module)] 936 ldict = dict(_items) 937 else: 938 ldict = get_caller_module_dict(2) 939 940 # Collect parser information from the dictionary 941 linfo = LexerReflect(ldict, log=errorlog, reflags=reflags) 942 linfo.get_all() 943 if not optimize: 944 if linfo.validate_all(): 945 raise SyntaxError("Can't build lexer") 946 947 if optimize and lextab: 948 try: 949 lexobj.readtab(lextab, ldict) 950 token = lexobj.token 951 input = lexobj.input 952 lexer = lexobj 953 return lexobj 954 955 except ImportError: 956 pass 957 958 # Dump some basic debugging information 959 if debug: 960 debuglog.info("lex: tokens = %r", linfo.tokens) 961 debuglog.info("lex: literals = %r", linfo.literals) 962 debuglog.info("lex: states = %r", linfo.stateinfo) 963 964 # Build a dictionary of valid token names 965 lexobj.lextokens = {} 966 for n in linfo.tokens: 967 lexobj.lextokens[n] = 1 968 969 # Get literals specification 970 if isinstance(linfo.literals, (list, tuple)): 971 lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals) 972 else: 973 lexobj.lexliterals = linfo.literals 974 975 # Get the stateinfo dictionary 976 stateinfo = linfo.stateinfo 977 978 regexs = {} 979 # Build the master regular expressions 980 for state in stateinfo: 981 regex_list = [] 982 983 # Add rules defined by functions first 984 for fname, f in linfo.funcsym[state]: 985 line = func_code(f).co_firstlineno 986 file = func_code(f).co_filename 987 regex_list.append("(?P<%s>%s)" % (fname, f.__doc__)) 988 if debug: 989 debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, f.__doc__, state) 990 991 # Now add all of the simple rules 992 for name, r in linfo.strsym[state]: 993 regex_list.append("(?P<%s>%s)" % (name, r)) 994 if debug: 995 debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state) 996 997 regexs[state] = regex_list 998 999 # Build the master regular expressions 1000 1001 if debug: 1002 debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====") 1003 1004 for state in regexs: 1005 lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames) 1006 lexobj.lexstatere[state] = lexre 1007 lexobj.lexstateretext[state] = re_text 1008 lexobj.lexstaterenames[state] = re_names 1009 if debug: 1010 for i in range(len(re_text)): 1011 debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, re_text[i]) 1012 1013 # For inclusive states, we need to add the regular expressions from the INITIAL state 1014 for state, stype in stateinfo.items(): 1015 if state != "INITIAL" and stype == 'inclusive': 1016 lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL']) 1017 lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL']) 1018 lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL']) 1019 1020 lexobj.lexstateinfo = stateinfo 1021 lexobj.lexre = lexobj.lexstatere["INITIAL"] 1022 lexobj.lexretext = lexobj.lexstateretext["INITIAL"] 1023 1024 # Set up ignore variables 1025 lexobj.lexstateignore = linfo.ignore 1026 lexobj.lexignore = lexobj.lexstateignore.get("INITIAL", "") 1027 1028 # Set up error functions 1029 lexobj.lexstateerrorf = linfo.errorf 1030 lexobj.lexerrorf = linfo.errorf.get("INITIAL", None) 1031 if not lexobj.lexerrorf: 1032 errorlog.warning("No t_error rule is defined") 1033 1034 # Check state information for ignore and error rules 1035 for s, stype in stateinfo.items(): 1036 if stype == 'exclusive': 1037 if not s in linfo.errorf: 1038 errorlog.warning("No error rule is defined for exclusive state '%s'", s) 1039 if not s in linfo.ignore and lexobj.lexignore: 1040 errorlog.warning("No ignore rule is defined for exclusive state '%s'", s) 1041 elif stype == 'inclusive': 1042 if not s in linfo.errorf: 1043 linfo.errorf[s] = linfo.errorf.get("INITIAL", None) 1044 if not s in linfo.ignore: 1045 linfo.ignore[s] = linfo.ignore.get("INITIAL", "") 1046 1047 # Create global versions of the token() and input() functions 1048 token = lexobj.token 1049 input = lexobj.input 1050 lexer = lexobj 1051 1052 # If in optimize mode, we write the lextab 1053 if lextab and optimize: 1054 lexobj.writetab(lextab, outputdir) 1055 1056 return lexobj 1057 1058# ----------------------------------------------------------------------------- 1059# runmain() 1060# 1061# This runs the lexer as a main program 1062# ----------------------------------------------------------------------------- 1063 1064 1065def runmain(lexer=None, data=None): 1066 if not data: 1067 try: 1068 filename = sys.argv[1] 1069 f = open(filename) 1070 data = f.read() 1071 f.close() 1072 except IndexError: 1073 sys.stdout.write("Reading from standard input (type EOF to end):\n") 1074 data = sys.stdin.read() 1075 1076 if lexer: 1077 _input = lexer.input 1078 else: 1079 _input = input 1080 _input(data) 1081 if lexer: 1082 _token = lexer.token 1083 else: 1084 _token = token 1085 1086 while 1: 1087 tok = _token() 1088 if not tok: 1089 break 1090 sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno, tok.lexpos)) 1091 1092# ----------------------------------------------------------------------------- 1093# @TOKEN(regex) 1094# 1095# This decorator function can be used to set the regex expression on a function 1096# when its docstring might need to be set in an alternative way 1097# ----------------------------------------------------------------------------- 1098 1099 1100def TOKEN(r): 1101 def set_doc(f): 1102 if hasattr(r, "__call__"): 1103 f.__doc__ = r.__doc__ 1104 else: 1105 f.__doc__ = r 1106 return f 1107 return set_doc 1108 1109 1110# Alternative spelling of the TOKEN decorator 1111Token = TOKEN 1112