1# 2# @TAG(OTHER_BSD) 3# 4# ----------------------------------------------------------------------------- 5# ply: lex.py 6# 7# Copyright (C) 2001-2009, 8# David M. Beazley (Dabeaz LLC) 9# All rights reserved. 10# 11# Redistribution and use in source and binary forms, with or without 12# modification, are permitted provided that the following conditions are 13# met: 14# 15# * Redistributions of source code must retain the above copyright notice, 16# this list of conditions and the following disclaimer. 17# * Redistributions in binary form must reproduce the above copyright notice, 18# this list of conditions and the following disclaimer in the documentation 19# and/or other materials provided with the distribution. 20# * Neither the name of the David Beazley or Dabeaz LLC may be used to 21# endorse or promote products derived from this software without 22# specific prior written permission. 23# 24# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 25# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 26# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 27# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 28# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 29# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 30# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 31# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 32# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 33# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 34# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 35# ----------------------------------------------------------------------------- 36 37__version__ = "3.2" 38__tabversion__ = "3.2" # Version of table file used 39 40import re, sys, types, copy, os 41 42# Python3 doesn't have a build-in cmp function. 
43# We need to import it here, even though it isn't called in this file 44# when interpreted by python3, to prevent pylint from treating it as 45# an error. 46from past.builtins import cmp 47 48# This tuple contains known string types 49try: 50 # Python 2.6 51 StringTypes = (types.StringType, types.UnicodeType) 52except AttributeError: 53 # Python 3.0 54 StringTypes = (str, bytes) 55 56# Extract the code attribute of a function. Different implementations 57# are for Python 2/3 compatibility. 58 59if sys.version_info[0] < 3: 60 def func_code(f): 61 return f.func_code 62else: 63 def func_code(f): 64 return f.__code__ 65 66# This regular expression is used to match valid token names 67_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$') 68 69# Exception thrown when invalid token encountered and no default error 70# handler is defined. 71 72class LexError(Exception): 73 def __init__(self,message,s): 74 self.args = (message,) 75 self.text = s 76 77# Token class. This class is used to represent the tokens produced. 78class LexToken(object): 79 def __str__(self): 80 return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos) 81 def __repr__(self): 82 return str(self) 83 84# This object is a stand-in for a logging object created by the 85# logging module. 86 87class PlyLogger(object): 88 def __init__(self,f): 89 self.f = f 90 def critical(self,msg,*args,**kwargs): 91 self.f.write((msg % args) + "\n") 92 93 def warning(self,msg,*args,**kwargs): 94 self.f.write("WARNING: "+ (msg % args) + "\n") 95 96 def error(self,msg,*args,**kwargs): 97 self.f.write("ERROR: " + (msg % args) + "\n") 98 99 info = critical 100 debug = critical 101 102# Null logger is used when no output is generated. Does nothing. 
class NullLogger(object):
    """Logger that silently discards everything.

    Any attribute access returns the logger itself, and calling it also
    returns itself, so arbitrary ``log.whatever(...)`` chains are no-ops.
    """
    def __getattribute__(self,name):
        return self
    def __call__(self,*args,**kwargs):
        return self

# -----------------------------------------------------------------------------
#                        === Lexing Engine ===
#
# The following Lexer class implements the lexer runtime.   There are only
# a few public methods and attributes:
#
#    input()          -  Store a new string in the lexer
#    token()          -  Get the next token
#    clone()          -  Clone the lexer
#
#    lineno           -  Current line number
#    lexpos           -  Current position in the input string
# -----------------------------------------------------------------------------

class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re,findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}     # Dictionary mapping lexer states to symbol names
        self.lexstate = "INITIAL"     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ""           # Ignored characters
        self.lexliterals = ""         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexoptimize = 0          # Optimized mode

    def clone(self,object=None):
        """Return a shallow copy of this lexer.

        If *object* is given, all rule functions and error functions in the
        copy are rebound to same-named methods on that object, so the clone
        can drive a different instance.
        """
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object.  In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = { }
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        # Entries with no function (plain string rules or
                        # unused group slots) are copied through unchanged.
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        # Rebind the rule function by name on the new object.
                        newfindex.append((getattr(object,f[0].__name__),f[1]))
                    newre.append((cre,newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = { }
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object,ef.__name__)
            c.lexmodule = object
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self,tabfile,outputdir=""):
        """Serialize the lexer tables to <outputdir>/<tabfile>.py.

        The generated module is what readtab() later re-imports when the
        lexer is built with optimize=1.  A module object is a no-op.
        """
        if isinstance(tabfile,types.ModuleType):
            return
        # Only the last dotted component names the output file.
        basetabfilename = tabfile.split(".")[-1]
        filename = os.path.join(outputdir,basetabfilename)+".py"
        tf = open(filename,"w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))
        tf.write("_tabversion = %s\n" % repr(__version__))
        tf.write("_lextokens = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = { }
        # Collect all functions in the initial state
        initialfuncs = []
        initial = self.lexstatere["INITIAL"]
        for part in initial:
            for f in part[1]:
                if f and f[0]:
                    initialfuncs.append(f)

        for key, lre in self.lexstatere.items():
            titem = []
            for i in range(len(lre)):
                # Store (regex text, function-name table) pairs; functions
                # themselves cannot be serialized, so only names go out.
                titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1],self.lexstaterenames[key][i])))
            tabre[key] = titem

        tf.write("_lexstatere = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = { }
        for key, ef in self.lexstateerrorf.items():
            if ef:
                taberr[key] = ef.__name__
            else:
                taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self,tabfile,fdict):
        """Rebuild the lexer from a previously written table module.

        *tabfile* is a module object or an importable module name; *fdict*
        maps rule-function names back to live callables.  Raises
        ImportError if the table's version does not match this PLY.
        """
        if isinstance(tabfile,types.ModuleType):
            lextab = tabfile
        else:
            if sys.version_info[0] < 3:
                exec("import %s as lextab" % tabfile)
            else:
                # Python 3 exec() cannot bind names in the enclosing scope,
                # so import into a scratch namespace and pull it out.
                env = { }
                exec("import %s as lextab" % tabfile, env,env)
                lextab = env['lextab']

        if getattr(lextab,"_tabversion","0.0") != __version__:
            raise ImportError("Inconsistent PLY version")

        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = { }
        self.lexstateretext = { }
        for key,lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for i in range(len(lre)):
                # Recompile each stored regex and rehydrate its rule
                # functions from fdict.
                titem.append((re.compile(lre[i][0],lextab._lexreflags),_names_to_funcs(lre[i][1],fdict)))
                txtitem.append(lre[i][0])
            self.lexstatere[key] = titem
            self.lexstateretext[key] = txtitem
        self.lexstateerrorf = { }
        for key,ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self,s):
        """Store *s* as the input to tokenize and reset the position."""
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c,StringTypes):
            raise ValueError("Expected a string")
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self,state):
        """Switch to lexer state *state*; raises ValueError if undefined."""
        if not state in self.lexstatere:
            raise ValueError("Undefined state")
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state,"")
        self.lexerrorf = self.lexstateerrorf.get(state,None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self,state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self,n):
        self.lexpos += n

    # ------------------------------------------------------------
    # opttoken() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible.  Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        """Return the next LexToken, or None at end of input.

        Raises LexError on input no rule matches (when no t_error rule is
        defined, or when t_error fails to advance the position), and
        RuntimeError if token() is called before input().
        """
        # Make local copies of frequently referenced attributes
        lexpos = self.lexpos
        lexlen = self.lexlen
        lexignore = self.lexignore
        lexdata = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre,lexindexfunc in self.lexre:
                m = lexre.match(lexdata,lexpos)
                if not m: continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                # m.lastindex identifies the named group (= rule) that matched.
                i = m.lastindex
                func,tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it

                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token, if nothing, we just move to next token
                if not newtok:
                    lexpos = self.lexpos        # This is here in case user has updated lexpos.
                    lexignore = self.lexignore  # This is here in case there was a state change
                    break

                # Verify type of the token.  If not in the token map, raise an error
                if not self.lexoptimize:
                    if not newtok.type in self.lextokens:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func_code(func).co_filename, func_code(func).co_firstlineno,
                            func.__name__, newtok.type),lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok: continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError("No input string given with input()")
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next

# -----------------------------------------------------------------------------
#                           ==== Lex Builder ===
#
# The functions and classes below are used to collect lexing information
# and build a Lexer object from it.
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# get_caller_module_dict()
#
# This function returns a dictionary containing all of the symbols defined within
# a caller further down the call stack.  This is used to get the environment
# associated with the yacc() call if none was provided.
# -----------------------------------------------------------------------------

def get_caller_module_dict(levels):
    """Return the symbol table of a frame *levels* steps up the call stack.

    The result is a copy of that frame's globals; when the frame's locals
    differ from its globals (i.e. the caller is inside a function), the
    locals are overlaid on top.
    """
    try:
        raise RuntimeError
    except RuntimeError:
        _exc_type, _exc_value, tb = sys.exc_info()
        frame = tb.tb_frame
        for _ in range(levels):
            frame = frame.f_back
        symbols = frame.f_globals.copy()
        if frame.f_globals != frame.f_locals:
            symbols.update(frame.f_locals)
        return symbols

# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------

def _funcs_to_names(funclist,namelist):
    """Replace each (func, tokname) entry with (funcname, tokname).

    Entries that are falsy, or whose function slot is falsy, pass through
    unchanged so the table keeps its shape.
    """
    return [(name, entry[1]) if entry and entry[0] else entry
            for entry, name in zip(funclist, namelist)]

# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------

def _names_to_funcs(namelist,fdict):
    """Inverse of _funcs_to_names(): look each function name up in *fdict*."""
    return [(fdict[entry[0]], entry[1]) if entry and entry[0] else entry
            for entry in namelist]

# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression.  Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
489# ----------------------------------------------------------------------------- 490 491def _form_master_re(relist,reflags,ldict,toknames): 492 if not relist: return [] 493 regex = "|".join(relist) 494 try: 495 lexre = re.compile(regex,re.VERBOSE | reflags) 496 497 # Build the index to function map for the matching engine 498 lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1) 499 lexindexnames = lexindexfunc[:] 500 501 for f,i in lexre.groupindex.items(): 502 handle = ldict.get(f,None) 503 if type(handle) in (types.FunctionType, types.MethodType): 504 lexindexfunc[i] = (handle,toknames[f]) 505 lexindexnames[i] = f 506 elif handle is not None: 507 lexindexnames[i] = f 508 if f.find("ignore_") > 0: 509 lexindexfunc[i] = (None,None) 510 else: 511 lexindexfunc[i] = (None, toknames[f]) 512 513 return [(lexre,lexindexfunc)],[regex],[lexindexnames] 514 except Exception: 515 m = int(len(relist)/2) 516 if m == 0: m = 1 517 llist, lre, lnames = _form_master_re(relist[:m],reflags,ldict,toknames) 518 rlist, rre, rnames = _form_master_re(relist[m:],reflags,ldict,toknames) 519 return llist+rlist, lre+rre, lnames+rnames 520 521# ----------------------------------------------------------------------------- 522# def _statetoken(s,names) 523# 524# Given a declaration name s of the form "t_" and a dictionary whose keys are 525# state names, this function returns a tuple (states,tokenname) where states 526# is a tuple of state names and tokenname is the name of the token. 
# For example, calling this with s = "t_foo_bar_SPAM" might return
# (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------

def _statetoken(s,names):
    """Split a t_... declaration name into (states, tokenname)."""
    nonstate = 1
    parts = s.split("_")
    # Consume leading parts that name declared states (or 'ANY').
    for i in range(1,len(parts)):
        if not parts[i] in names and parts[i] != 'ANY': break
    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        # 'ANY' expands to every declared state.
        states = tuple(names)

    tokenname = "_".join(parts[i:])
    return (states,tokenname)


# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
    def __init__(self,ldict,log=None,reflags=0):
        self.ldict = ldict            # Symbol dictionary being scanned for t_ rules
        self.error_func = None
        self.tokens = []
        self.reflags = reflags
        self.stateinfo = { 'INITIAL' : 'inclusive'}
        self.files = {}               # Source files seen, for duplicate-rule checking
        self.error = 0                # Set to 1 whenever any validation error is logged

        if log is None:
            self.log = PlyLogger(sys.stderr)
        else:
            self.log = log

    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        """Run all validators; returns nonzero if any error was found."""
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens map
    def get_tokens(self):
        tokens = self.ldict.get("tokens",None)
        if not tokens:
            self.log.error("No token list is defined")
            self.error = 1
            return

        if not isinstance(tokens,(list, tuple)):
            self.log.error("tokens must be a list or tuple")
            self.error = 1
            return

        if not tokens:
            self.log.error("tokens is empty")
            self.error = 1
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'",n)
                self.error = 1
            if n in terminals:
                # Duplicates are a warning only; they do not set self.error.
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get("literals","")

    # Validate literals
    def validate_literals(self):
        try:
            for c in self.literals:
                if not isinstance(c,StringTypes) or len(c) > 1:
                    self.log.error("Invalid literal %s. Must be a single character", repr(c))
                    self.error = 1
                    continue

        except TypeError:
            self.log.error("Invalid literals specification. literals must be a sequence of characters")
            self.error = 1

    def get_states(self):
        self.states = self.ldict.get("states",None)
        # Build statemap
        if self.states:
            if not isinstance(self.states,(tuple,list)):
                self.log.error("states must be defined as a tuple or list")
                self.error = 1
            else:
                for s in self.states:
                    if not isinstance(s,tuple) or len(s) != 2:
                        self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')",repr(s))
                        self.error = 1
                        continue
                    name, statetype = s
                    if not isinstance(name,StringTypes):
                        self.log.error("State name %s must be a string", repr(name))
                        self.error = 1
                        continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'",name)
                        self.error = 1
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined",name)
                        self.error = 1
                        continue
                    self.stateinfo[name] = statetype

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        tsymbols = [f for f in self.ldict if f[:2] == 't_' ]

        # Now build up a list of functions and a list of strings

        self.toknames = { }        # Mapping of symbols to token names
        self.funcsym = { }         # Symbols defined as functions
        self.strsym = { }          # Symbols defined as strings
        self.ignore = { }          # Ignore strings by state
        self.errorf = { }          # Error functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error("No rules of the form t_rulename are defined")
            self.error = 1
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f,self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t,"__call__"):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'ignore':
                    # t_ignore must be a string, not a function.
                    line = func_code(t).co_firstlineno
                    file = func_code(t).co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string",file,line,t.__name__)
                    self.error = 1
                else:
                    for s in states:
                        self.funcsym[s].append((f,t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if "\\" in t:
                        self.log.warning("%s contains a literal backslash '\\'",f)

                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = 1
                else:
                    for s in states:
                        self.strsym[s].append((f,t))
            else:
                self.log.error("%s not defined as a function or string", f)
                self.error = 1

        # Sort the functions by line number (preserves declaration order so
        # earlier rules take precedence in the master regex)
        for f in self.funcsym.values():
            if sys.version_info[0] < 3:
                f.sort(lambda x,y: cmp(func_code(x[1]).co_firstlineno,func_code(y[1]).co_firstlineno))
            else:
                # Python 3.0
                f.sort(key=lambda x: func_code(x[1]).co_firstlineno)

        # Sort the strings by regular expression length (longest first, so
        # longer patterns win over their prefixes)
        for s in self.strsym.values():
            if sys.version_info[0] < 3:
                s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
            else:
                # Python 3.0
                s.sort(key=lambda x: len(x[1]),reverse=True)

    # Validate all of the t_rules collected
    def validate_rules(self):
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                tokname = self.toknames[fname]
                if isinstance(f, types.MethodType):
                    reqargs = 2    # self + token
                else:
                    reqargs = 1    # token only
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__)
                    self.error = 1
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__)
                    self.error = 1
                    continue

                if not f.__doc__:
                    self.log.error("%s:%d: No regular expression defined for rule '%s'",file,line,f.__name__)
                    self.error = 1
                    continue

                try:
                    # Trial-compile each rule exactly as it will appear in
                    # the master regex (as a named group).
                    c = re.compile("(?P<%s>%s)" % (fname,f.__doc__), re.VERBOSE | self.reflags)
                    if c.match(""):
                        self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file,line,f.__name__)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file,line,f.__name__,e)
                    if '#' in f.__doc__:
                        # '#' starts a comment under re.VERBOSE unless escaped.
                        self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'",file,line, f.__name__)
                    self.error = 1

            # Validate all rules defined by strings
            for name,r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = 1
                    continue

                if not tokname in self.tokens and tokname.find("ignore_") < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s",name,tokname)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | self.reflags)
                    if (c.match("")):
                        self.log.error("Regular expression for rule '%s' matches empty string",name)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("Invalid regular expression for rule '%s'. %s",name,e)
                    if '#' in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'",name)
                    self.error = 1

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'",state)
                self.error = 1

            # Validate the error function
            efunc = self.errorf.get(state,None)
            if efunc:
                f = efunc
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__)
                    self.error = 1

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__)
                    self.error = 1

        for f in self.files:
            self.validate_file(f)


    # -----------------------------------------------------------------------------
    # validate_file()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the parser input file.  This is done using a simple regular expression
    # match on each line in the given file.
    # -----------------------------------------------------------------------------

    def validate_file(self,filename):
        import os.path
        base,ext = os.path.splitext(filename)
        if ext != '.py': return         # No idea what the file is. Return OK

        try:
            f = open(filename)
            lines = f.readlines()
            f.close()
        except IOError:
            return                      # Couldn't find the file.  Don't worry about it

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = { }
        linen = 1
        for l in lines:
            m = fre.match(l)
            if not m:
                m = sre.match(l)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    self.log.error("%s:%d: Rule %s redefined. Previously defined on line %d",filename,linen,name,prev)
                    self.error = 1
            linen += 1

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0,outputdir="", debuglog=None, errorlog=None):
    """Build and return a Lexer from t_ rules found in *module*/*object*,
    or in the caller's namespace when neither is given.

    With optimize=1, tries to reuse the saved *lextab* table file (and
    writes one afterwards); validation is skipped in that mode.  Also
    rebinds the module-level token()/input()/lexer globals to the new
    lexer.  Raises SyntaxError if validation fails.
    """
    global lexer
    ldict = None
    stateinfo = { 'INITIAL' : 'inclusive'}
    lexobj = Lexer()
    lexobj.lexoptimize = optimize
    global token,input

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object: module = object

    if module:
        _items = [(k,getattr(module,k)) for k in dir(module)]
        ldict = dict(_items)
    else:
        ldict = get_caller_module_dict(2)

    # Collect parser information from the dictionary
    linfo = LexerReflect(ldict,log=errorlog,reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab,ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            # No usable table file; fall through and build from scratch.
            pass

    # Dump some basic debugging information
    if debug:
        debuglog.info("lex: tokens = %r", linfo.tokens)
        debuglog.info("lex: literals = %r", linfo.literals)
        debuglog.info("lex: states = %r", linfo.stateinfo)

    # Build a dictionary of valid token names
    lexobj.lextokens = { }
    for n in linfo.tokens:
        lexobj.lextokens[n] = 1

    # Get literals specification
    if isinstance(linfo.literals,(list,tuple)):
        lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
    else:
        lexobj.lexliterals = linfo.literals

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    regexs = { }
    # Build the master regular expressions
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            line = func_code(f).co_firstlineno
            file = func_code(f).co_filename
            regex_list.append("(?P<%s>%s)" % (fname,f.__doc__))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",fname,f.__doc__, state)

        # Now add all of the simple rules
        for name,r in linfo.strsym[state]:
            regex_list.append("(?P<%s>%s)" % (name,r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",name,r, state)

        regexs[state] = regex_list

    # Build the master regular expressions

    if debug:
        debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====")

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state],reflags,ldict,linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i in range(len(re_text)):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'",state, i, re_text[i])

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state,stype in stateinfo.items():
        if state != "INITIAL" and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get("INITIAL",None)
    if not lexobj.lexerrorf:
        errorlog.warning("No t_error rule is defined")

    # Check state information for ignore and error rules
    for s,stype in stateinfo.items():
        if stype == 'exclusive':
            if not s in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if not s in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            # Inclusive states inherit INITIAL's error/ignore handling.
            if not s in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get("INITIAL",None)
            if not s in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get("INITIAL","")

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab,outputdir)

    return lexobj

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None,data=None):
    """Tokenize *data* (or sys.argv[1] / stdin) and print each token."""
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            sys.stdout.write("Reading from standard input (type EOF to end):\n")
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno,tok.lexpos))

# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the regex expression on a function
# when its docstring might need to be set in an alternative way
# -----------------------------------------------------------------------------

def TOKEN(r):
    """Decorator: set a rule function's docstring (its regex) from *r*,
    which may be a string or a callable whose docstring is borrowed."""
    def set_doc(f):
        if hasattr(r,"__call__"):
            f.__doc__ = r.__doc__
        else:
            f.__doc__ = r
        return f
    return set_doc

# Alternative spelling of the TOKEN decorator
Token = TOKEN