# SPDX-License-Identifier: BSD-3-Clause

# -----------------------------------------------------------------------------
# ply: lex.py
#
# Copyright (C) 2001-2009,
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * Neither the name of the David Beazley or Dabeaz LLC may be used to
#   endorse or promote products derived from this software without
#   specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------

__version__ = "3.2"
__tabversion__ = "3.2"       # Version of table file used

# Python 3 doesn't have a built-in cmp function.  We import it here, even
# though it isn't called in this file when interpreted by Python 3, to keep
# pylint from treating the name as an error.
from past.builtins import cmp
import re
import sys
import types
import copy
import os

# This tuple contains known string types
try:
    # Python 2.6
    StringTypes = (types.StringType, types.UnicodeType)
except AttributeError:
    # Python 3.0
    StringTypes = (str, bytes)

# Extract the code attribute of a function. Different implementations
# are for Python 2/3 compatibility.

if sys.version_info[0] < 3:
    def func_code(f):
        return f.func_code
else:
    def func_code(f):
        return f.__code__

# This regular expression is used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')

# Exception thrown when an invalid token is encountered and no default
# error handler is defined.


class LexError(Exception):
    def __init__(self, message, s):
        self.args = (message,)
        self.text = s

# Token class.  This class is used to represent the tokens produced.


class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type, self.value, self.lineno, self.lexpos)

    def __repr__(self):
        return str(self)

# This object is a stand-in for a logging object created by the
# logging module.


class PlyLogger(object):
    def __init__(self, f):
        self.f = f

    def critical(self, msg, *args, **kwargs):
        self.f.write((msg % args) + "\n")

    def warning(self, msg, *args, **kwargs):
        self.f.write("WARNING: " + (msg % args) + "\n")

    def error(self, msg, *args, **kwargs):
        self.f.write("ERROR: " + (msg % args) + "\n")

    info = critical
    debug = critical

# Null logger is used when no output is generated. Does nothing.


class NullLogger(object):
    def __getattribute__(self, name):
        return self

    def __call__(self, *args, **kwargs):
        return self

# -----------------------------------------------------------------------------
#                        === Lexing Engine ===
#
# The following Lexer class implements the lexer runtime.  There are only
# a few public methods and attributes:
#
#    input()          -  Store a new string in the lexer
#    token()          -  Get the next token
#    clone()          -  Clone the lexer
#
#    lineno           -  Current line number
#    lexpos           -  Current position in the input string
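#
# A minimal usage sketch (the input string is illustrative; token rules are
# assumed to be defined in the calling module):
#
#     lexer = lex()              # Build a lexer from the t_ rules in scope
#     lexer.input("x = 3 + 4")   # Feed it a string
#     while True:
#         tok = lexer.token()    # LexToken with .type, .value, .lineno, .lexpos
#         if tok is None:
#             break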
# -----------------------------------------------------------------------------


class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re, findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexes
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}     # Dictionary mapping lexer states to symbol names
        self.lexstate = "INITIAL"     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ""           # Ignored characters
        self.lexliterals = ""         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexoptimize = 0          # Optimized mode

    def clone(self, object=None):
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object.  In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = {}
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object, f[0].__name__), f[1]))
                    # Append inside the loop so every (regex, findex) pair is
                    # rebound, not just the last one
                    newre.append((cre, newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = {}
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object, ef.__name__)
            c.lexmodule = object
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self, tabfile, outputdir=""):
        if isinstance(tabfile, types.ModuleType):
            return
        basetabfilename = tabfile.split(".")[-1]
        filename = os.path.join(outputdir, basetabfilename) + ".py"
        tf = open(filename, "w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" %
                 (tabfile, __version__))
        tf.write("_tabversion   = %s\n" % repr(__tabversion__))
        tf.write("_lextokens    = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags   = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals  = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = {}

        for key, lre in self.lexstatere.items():
            titem = []
            for i in range(len(lre)):
                titem.append((self.lexstateretext[key][i], _funcs_to_names(
                    lre[i][1], self.lexstaterenames[key][i])))
            tabre[key] = titem

        tf.write("_lexstatere   = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = {}
        for key, ef in self.lexstateerrorf.items():
            if ef:
                taberr[key] = ef.__name__
            else:
                taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()

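    # The table file written above is consulted again only in optimized mode.
    # A sketch of how it is typically produced and reused (the module name
    # "mylextab" is illustrative):
    #
    #     lexer = lex(optimize=1, lextab="mylextab")
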
    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self, tabfile, fdict):
        if isinstance(tabfile, types.ModuleType):
            lextab = tabfile
        else:
            if sys.version_info[0] < 3:
                exec("import %s as lextab" % tabfile)
            else:
                env = {}
                exec("import %s as lextab" % tabfile, env, env)
                lextab = env['lextab']

        if getattr(lextab, "_tabversion", "0.0") != __tabversion__:
            raise ImportError("Inconsistent PLY version")

        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = {}
        self.lexstateretext = {}
        for key, lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for i in range(len(lre)):
                titem.append((re.compile(lre[i][0], lextab._lexreflags),
                              _names_to_funcs(lre[i][1], fdict)))
                txtitem.append(lre[i][0])
            self.lexstatere[key] = titem
            self.lexstateretext[key] = txtitem
        self.lexstateerrorf = {}
        for key, ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self, s):
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c, StringTypes):
            raise ValueError("Expected a string")
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self, state):
        if state not in self.lexstatere:
            raise ValueError("Undefined state")
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state, "")
        self.lexerrorf = self.lexstateerrorf.get(state, None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self, state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())
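
    # Inside a token rule, states are typically switched through the token's
    # lexer attribute.  A sketch (the 'comment' state and rule names are
    # illustrative; the state must also be declared in a states tuple):
    #
    #     def t_COMMENT_START(t):
    #         r'/\*'
    #         t.lexer.push_state('comment')
    #
    #     def t_comment_END(t):
    #         r'\*/'
    #         t.lexer.pop_state()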

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self, n):
        self.lexpos += n
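
    # Commonly used in a t_error rule to step over an offending character,
    # as in the PLY documentation (the handler shown is illustrative):
    #
    #     def t_error(t):
    #         print("Illegal character '%s'" % t.value[0])
    #         t.lexer.skip(1)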

    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible.  Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos = self.lexpos
        lexlen = self.lexlen
        lexignore = self.lexignore
        lexdata = self.lexdata

        while lexpos < lexlen:
            # Short-circuit handling of whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre, lexindexfunc in self.lexre:
                m = lexre.match(lexdata, lexpos)
                if not m:
                    continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                i = m.lastindex
                func, tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If the token is processed by a function, call it

                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every rule function must return a token; if it returns nothing,
                # we simply move on to the next token
                if not newtok:
                    lexpos = self.lexpos        # This is here in case the user has updated lexpos
                    lexignore = self.lexignore  # This is here in case there was a state change
                    break

                # Verify type of the token.  If not in the token map, raise an error
                if not self.lexoptimize:
                    if newtok.type not in self.lextokens:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func_code(func).co_filename, func_code(func).co_firstlineno,
                            func.__name__, newtok.type), lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" %
                                       (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok:
                        continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" %
                               (lexdata[lexpos], lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError("No input string given with input()")
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next
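
    # With __iter__/__next__ defined, tokens can also be consumed with a
    # for loop (the input string is illustrative):
    #
    #     lexer.input("3 + 4")
    #     for tok in lexer:
    #         print(tok)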

# -----------------------------------------------------------------------------
#                           === Lex Builder ===
#
# The functions and classes below are used to collect lexing information
# and build a Lexer object from it.
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# get_caller_module_dict()
#
# This function returns a dictionary containing all of the symbols defined within
# a caller further down the call stack.  This is used to get the environment
# associated with the lex() call if none was provided.
# -----------------------------------------------------------------------------


def get_caller_module_dict(levels):
    try:
        raise RuntimeError
    except RuntimeError:
        e, b, t = sys.exc_info()
        f = t.tb_frame
        while levels > 0:
            f = f.f_back
            levels -= 1
        ldict = f.f_globals.copy()
        if f.f_globals != f.f_locals:
            ldict.update(f.f_locals)

        return ldict

# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------


def _funcs_to_names(funclist, namelist):
    result = []
    for f, name in zip(funclist, namelist):
        if f and f[0]:
            result.append((name, f[1]))
        else:
            result.append(f)
    return result

# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------


def _names_to_funcs(namelist, fdict):
    result = []
    for n in namelist:
        if n and n[0]:
            result.append((fdict[n[0]], n[1]))
        else:
            result.append(n)
    return result

# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression.  Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------


def _form_master_re(relist, reflags, ldict, toknames):
    if not relist:
        return [], [], []
    regex = "|".join(relist)
    try:
        lexre = re.compile(regex, re.VERBOSE | reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [None] * (max(lexre.groupindex.values()) + 1)
        lexindexnames = lexindexfunc[:]

        for f, i in lexre.groupindex.items():
            handle = ldict.get(f, None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle, toknames[f])
                lexindexnames[i] = f
            elif handle is not None:
                lexindexnames[i] = f
                if f.find("ignore_") > 0:
                    lexindexfunc[i] = (None, None)
                else:
                    lexindexfunc[i] = (None, toknames[f])

        return [(lexre, lexindexfunc)], [regex], [lexindexnames]
    except Exception:
        m = len(relist) // 2
        if m == 0:
            m = 1
        llist, lre, lnames = _form_master_re(relist[:m], reflags, ldict, toknames)
        rlist, rre, rnames = _form_master_re(relist[m:], reflags, ldict, toknames)
        return llist + rlist, lre + rre, lnames + rnames

# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" followed by optional state names
# and a token name, and a dictionary whose keys are state names, this function
# returns a tuple (states, tokenname) where states is a tuple of state names
# and tokenname is the name of the token.  For example, calling this with
# s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------


def _statetoken(s, names):
    parts = s.split("_")
    for i in range(1, len(parts)):
        if parts[i] not in names and parts[i] != 'ANY':
            break
    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names)

    tokenname = "_".join(parts[i:])
    return (states, tokenname)


# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
    def __init__(self, ldict, log=None, reflags=0):
        self.ldict = ldict
        self.error_func = None
        self.tokens = []
        self.reflags = reflags
        self.stateinfo = {'INITIAL': 'inclusive'}
        self.files = {}
        self.error = 0

        if log is None:
            self.log = PlyLogger(sys.stderr)
        else:
            self.log = log

    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens map
    def get_tokens(self):
        tokens = self.ldict.get("tokens", None)
        if not tokens:
            self.log.error("No token list is defined")
            self.error = 1
            return

        if not isinstance(tokens, (list, tuple)):
            self.log.error("tokens must be a list or tuple")
            self.error = 1
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'", n)
                self.error = 1
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get("literals", "")

    # Validate literals
    def validate_literals(self):
        try:
            for c in self.literals:
                if not isinstance(c, StringTypes) or len(c) > 1:
                    self.log.error("Invalid literal %s. Must be a single character", repr(c))
                    self.error = 1
        except TypeError:
            self.log.error(
                "Invalid literals specification. literals must be a sequence of characters")
            self.error = 1

    def get_states(self):
        self.states = self.ldict.get("states", None)
        # Build statemap
        if self.states:
            if not isinstance(self.states, (tuple, list)):
                self.log.error("states must be defined as a tuple or list")
                self.error = 1
            else:
                for s in self.states:
                    if not isinstance(s, tuple) or len(s) != 2:
                        self.log.error(
                            "Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')", repr(s))
                        self.error = 1
                        continue
                    name, statetype = s
                    if not isinstance(name, StringTypes):
                        self.log.error("State name %s must be a string", repr(name))
                        self.error = 1
                        continue
                    if statetype not in ('inclusive', 'exclusive'):
                        self.log.error(
                            "State type for state %s must be 'inclusive' or 'exclusive'", name)
                        self.error = 1
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined", name)
                        self.error = 1
                        continue
                    self.stateinfo[name] = statetype
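
    # A states declaration this method accepts looks like the following
    # (the state names are illustrative):
    #
    #     states = (
    #         ('comment', 'exclusive'),
    #         ('indent',  'inclusive'),
    #     )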

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        tsymbols = [f for f in self.ldict if f[:2] == 't_']

        # Now build up a list of functions and a list of strings

        self.toknames = {}       # Mapping of symbols to token names
        self.funcsym = {}        # Symbols defined as functions
        self.strsym = {}         # Symbols defined as strings
        self.ignore = {}         # Ignore strings by state
        self.errorf = {}         # Error functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if not tsymbols:
            self.log.error("No rules of the form t_rulename are defined")
            self.error = 1
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f, self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t, "__call__"):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'ignore':
                    line = func_code(t).co_firstlineno
                    file = func_code(t).co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string",
                                   file, line, t.__name__)
                    self.error = 1
                else:
                    for s in states:
                        self.funcsym[s].append((f, t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if "\\" in t:
                        self.log.warning("%s contains a literal backslash '\\'", f)

                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = 1
                else:
                    for s in states:
                        self.strsym[s].append((f, t))
            else:
                self.log.error("%s not defined as a function or string", f)
                self.error = 1

        # Sort the functions by line number
        for f in self.funcsym.values():
            if sys.version_info[0] < 3:
                f.sort(lambda x, y: cmp(
                    func_code(x[1]).co_firstlineno, func_code(y[1]).co_firstlineno))
            else:
                # Python 3.0
                f.sort(key=lambda x: func_code(x[1]).co_firstlineno)

        # Sort the strings by regular expression length (longest first)
        for s in self.strsym.values():
            if sys.version_info[0] < 3:
                s.sort(lambda x, y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
            else:
                # Python 3.0
                s.sort(key=lambda x: len(x[1]), reverse=True)

    # Validate all of the t_rules collected
    def validate_rules(self):
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                tokname = self.toknames[fname]
                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments",
                                   file, line, f.__name__)
                    self.error = 1
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = 1
                    continue

                if not f.__doc__:
                    self.log.error("%s:%d: No regular expression defined for rule '%s'",
                                   file, line, f.__name__)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (fname, f.__doc__), re.VERBOSE | self.reflags)
                    if c.match(""):
                        self.log.error(
                            "%s:%d: Regular expression for rule '%s' matches empty string", file, line, f.__name__)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s",
                                   file, line, f.__name__, e)
                    if '#' in f.__doc__:
                        self.log.error(
                            "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'", file, line, f.__name__)
                    self.error = 1

            # Validate all rules defined by strings
            for name, r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = 1
                    continue

                if tokname not in self.tokens and tokname.find("ignore_") < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s", name, tokname)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (name, r), re.VERBOSE | self.reflags)
                    if c.match(""):
                        self.log.error(
                            "Regular expression for rule '%s' matches empty string", name)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("Invalid regular expression for rule '%s'. %s", name, e)
                    if '#' in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'", name)
                    self.error = 1

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'", state)
                self.error = 1

            # Validate the error function
            efunc = self.errorf.get(state, None)
            if efunc:
                f = efunc
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments",
                                   file, line, f.__name__)
                    self.error = 1

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file, line, f.__name__)
                    self.error = 1

        for f in self.files:
            self.validate_file(f)

    # -----------------------------------------------------------------------------
    # validate_file()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the lexer input file.  This is done using a simple regular expression
    # match on each line of the given file.
    # -----------------------------------------------------------------------------

    def validate_file(self, filename):
        base, ext = os.path.splitext(filename)
        if ext != '.py':
            return          # No idea what the file is. Return OK

        try:
            f = open(filename)
            lines = f.readlines()
            f.close()
        except IOError:
            return          # Couldn't find the file.  Don't worry about it

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = {}
        linen = 1
        for line in lines:
            m = fre.match(line)
            if not m:
                m = sre.match(line)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    self.log.error(
                        "%s:%d: Rule %s redefined. Previously defined on line %d", filename, linen, name, prev)
                    self.error = 1
            linen += 1

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
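#
# Typical usage, following the PLY documentation (the token names and rules
# are illustrative):
#
#     tokens = ('NUMBER', 'PLUS')
#
#     t_PLUS   = r'\+'
#     t_ignore = ' \t'
#
#     def t_NUMBER(t):
#         r'\d+'
#         t.value = int(t.value)
#         return t
#
#     def t_error(t):
#         t.lexer.skip(1)
#
#     lexer = lex()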
# -----------------------------------------------------------------------------


def lex(module=None, object=None, debug=0, optimize=0, lextab="lextab",
        reflags=0, nowarn=0, outputdir="", debuglog=None, errorlog=None):
    global lexer
    ldict = None
    stateinfo = {'INITIAL': 'inclusive'}
    lexobj = Lexer()
    lexobj.lexoptimize = optimize
    global token, input

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object:
        module = object

    if module:
        _items = [(k, getattr(module, k)) for k in dir(module)]
        ldict = dict(_items)
    else:
        ldict = get_caller_module_dict(2)

    # Collect lexer information from the dictionary
    linfo = LexerReflect(ldict, log=errorlog, reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab, ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Dump some basic debugging information
    if debug:
        debuglog.info("lex: tokens   = %r", linfo.tokens)
        debuglog.info("lex: literals = %r", linfo.literals)
        debuglog.info("lex: states   = %r", linfo.stateinfo)

    # Build a dictionary of valid token names
    lexobj.lextokens = {}
    for n in linfo.tokens:
        lexobj.lextokens[n] = 1

    # Get literals specification
    if isinstance(linfo.literals, (list, tuple)):
        lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
    else:
        lexobj.lexliterals = linfo.literals

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    regexs = {}
    # Build the regular expression fragments for each state
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            line = func_code(f).co_firstlineno
            file = func_code(f).co_filename
            regex_list.append("(?P<%s>%s)" % (fname, f.__doc__))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", fname, f.__doc__, state)

        # Now add all of the simple rules
        for name, r in linfo.strsym[state]:
            regex_list.append("(?P<%s>%s)" % (name, r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')", name, r, state)

        regexs[state] = regex_list

    # Build the master regular expressions

    if debug:
        debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====")

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state], reflags, ldict, linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i in range(len(re_text)):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'", state, i, re_text[i])

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state, stype in stateinfo.items():
        if state != "INITIAL" and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL", "")

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get("INITIAL", None)
    if not lexobj.lexerrorf:
        errorlog.warning("No t_error rule is defined")

    # Check state information for ignore and error rules
    for s, stype in stateinfo.items():
        if stype == 'exclusive':
            if s not in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if s not in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            if s not in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get("INITIAL", None)
            if s not in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get("INITIAL", "")

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab, outputdir)

    return lexobj

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------


def runmain(lexer=None, data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            sys.stdout.write("Reading from standard input (type EOF to end):\n")
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while True:
        tok = _token()
        if not tok:
            break
        sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno, tok.lexpos))

# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator can be used to set the regular expression of a token function
# when the pattern needs to be supplied some way other than through a literal
# docstring.
# -----------------------------------------------------------------------------


def TOKEN(r):
    def set_doc(f):
        if hasattr(r, "__call__"):
            f.__doc__ = r.__doc__
        else:
            f.__doc__ = r
        return f
    return set_doc
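
# Example from the PLY documentation (the identifier pattern is illustrative):
#
#     identifier = r'[a-zA-Z_][a-zA-Z0-9_]*'
#
#     @TOKEN(identifier)
#     def t_ID(t):
#         return t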


# Alternative spelling of the TOKEN decorator
Token = TOKEN
