1//===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This class represents the Lexer for tablegen files.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
14#define LLVM_LIB_TABLEGEN_TGLEXER_H
15
16#include "llvm/ADT/ArrayRef.h"
17#include "llvm/ADT/StringRef.h"
18#include "llvm/ADT/StringSet.h"
19#include "llvm/Support/DataTypes.h"
20#include "llvm/Support/SMLoc.h"
21#include <cassert>
22#include <memory>
23#include <set>
24#include <string>
25
26namespace llvm {
27class SourceMgr;
28class SMLoc;
29class Twine;
30
namespace tgtok {
  /// Kinds of tokens known to the TableGen lexer.  The enumerators are grouped
  /// by category; their relative order is significant because the numeric
  /// values are assigned implicitly.
  enum TokKind {
    // Markers.
    Eof,
    Error,

    // Punctuation.  These tokens carry no additional information.
    minus,     // -
    plus,      // +
    l_square,  // [
    r_square,  // ]
    l_brace,   // {
    r_brace,   // }
    l_paren,   // (
    r_paren,   // )
    less,      // <
    greater,   // >
    colon,     // :
    semi,      // ;
    comma,     // ,
    period,    // .
    equal,     // =
    question,  // ?
    paste,     // #

    // Keywords.  The 'else' keyword is spelled 'ElseKW' here so that it does
    // not clash with 'Else' below, which stands for the preprocessor #else.
    Bit,
    Bits,
    Class,
    Code,
    Dag,
    Def,
    Foreach,
    Defm,
    Field,
    In,
    Int,
    Let,
    List,
    MultiClass,
    String,
    Defset,
    Defvar,
    If,
    Then,
    ElseKW,

    // !keywords.
    XConcat,
    XADD,
    XMUL,
    XAND,
    XOR,
    XSRA,
    XSRL,
    XSHL,
    XListConcat,
    XListSplat,
    XStrConcat,
    XCast,
    XSubst,
    XForEach,
    XFoldl,
    XHead,
    XTail,
    XSize,
    XEmpty,
    XIf,
    XCond,
    XEq,
    XIsA,
    XDag,
    XNe,
    XLe,
    XLt,
    XGe,
    XGt,
    XSetOp,
    XGetOp,

    // Integer value.
    IntVal,

    // Binary constant.  Unlike IntVal, these are sized according to the
    // number of bits given.
    BinaryIntVal,

    // Tokens that carry a string value.
    Id,
    StrVal,
    VarName,
    CodeFragment,

    // Preprocessing tokens.  They are used internally by the lexer and are
    // never returned as a result of Lex().
    Ifdef,
    Ifndef,
    Else,
    Endif,
    Define
  };
} // end namespace tgtok
72
73/// TGLexer - TableGen Lexer class.
74class TGLexer {
75  SourceMgr &SrcMgr;
76
77  const char *CurPtr = nullptr;
78  StringRef CurBuf;
79
80  // Information about the current token.
81  const char *TokStart = nullptr;
82  tgtok::TokKind CurCode = tgtok::TokKind::Eof;
83  std::string CurStrVal;  // This is valid for ID, STRVAL, VARNAME, CODEFRAGMENT
84  int64_t CurIntVal = 0;  // This is valid for INTVAL.
85
86  /// CurBuffer - This is the current buffer index we're lexing from as managed
87  /// by the SourceMgr object.
88  unsigned CurBuffer = 0;
89
90public:
91  typedef std::set<std::string> DependenciesSetTy;
92
93private:
94  /// Dependencies - This is the list of all included files.
95  DependenciesSetTy Dependencies;
96
97public:
98  TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
99
100  tgtok::TokKind Lex() {
101    return CurCode = LexToken(CurPtr == CurBuf.begin());
102  }
103
104  const DependenciesSetTy &getDependencies() const {
105    return Dependencies;
106  }
107
108  tgtok::TokKind getCode() const { return CurCode; }
109
110  const std::string &getCurStrVal() const {
111    assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal ||
112            CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) &&
113           "This token doesn't have a string value");
114    return CurStrVal;
115  }
116  int64_t getCurIntVal() const {
117    assert(CurCode == tgtok::IntVal && "This token isn't an integer");
118    return CurIntVal;
119  }
120  std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
121    assert(CurCode == tgtok::BinaryIntVal &&
122           "This token isn't a binary integer");
123    return std::make_pair(CurIntVal, (CurPtr - TokStart)-2);
124  }
125
126  SMLoc getLoc() const;
127
128private:
129  /// LexToken - Read the next token and return its code.
130  tgtok::TokKind LexToken(bool FileOrLineStart = false);
131
132  tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
133  tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
134
135  int getNextChar();
136  int peekNextChar(int Index) const;
137  void SkipBCPLComment();
138  bool SkipCComment();
139  tgtok::TokKind LexIdentifier();
140  bool LexInclude();
141  tgtok::TokKind LexString();
142  tgtok::TokKind LexVarName();
143  tgtok::TokKind LexNumber();
144  tgtok::TokKind LexBracket();
145  tgtok::TokKind LexExclaim();
146
147  // Process EOF encountered in LexToken().
148  // If EOF is met in an include file, then the method will update
149  // CurPtr, CurBuf and preprocessing include stack, and return true.
150  // If EOF is met in the top-level file, then the method will
151  // update and check the preprocessing include stack, and return false.
152  bool processEOF();
153
154  // *** Structures and methods for preprocessing support ***
155
156  // A set of macro names that are defined either via command line or
157  // by using:
158  //     #define NAME
159  StringSet<> DefinedMacros;
160
161  // Each of #ifdef and #else directives has a descriptor associated
162  // with it.
163  //
164  // An ordered list of preprocessing controls defined by #ifdef/#else
165  // directives that are in effect currently is called preprocessing
166  // control stack.  It is represented as a vector of PreprocessorControlDesc's.
167  //
168  // The control stack is updated according to the following rules:
169  //
170  // For each #ifdef we add an element to the control stack.
171  // For each #else we replace the top element with a descriptor
172  // with an inverted IsDefined value.
173  // For each #endif we pop the top element from the control stack.
174  //
175  // When CurPtr reaches the current buffer's end, the control stack
176  // must be empty, i.e. #ifdef and the corresponding #endif
177  // must be located in the same file.
178  struct PreprocessorControlDesc {
179    // Either tgtok::Ifdef or tgtok::Else.
180    tgtok::TokKind Kind;
181
182    // True, if the condition for this directive is true, false - otherwise.
183    // Examples:
184    //     #ifdef NAME       : true, if NAME is defined, false - otherwise.
185    //     ...
186    //     #else             : false, if NAME is defined, true - otherwise.
187    bool IsDefined;
188
189    // Pointer into CurBuf to the beginning of the preprocessing directive
190    // word, e.g.:
191    //     #ifdef NAME
192    //      ^ - SrcPos
193    SMLoc SrcPos;
194  };
195
196  // We want to disallow code like this:
197  //     file1.td:
198  //         #define NAME
199  //         #ifdef NAME
200  //         include "file2.td"
201  //     EOF
202  //     file2.td:
203  //         #endif
204  //     EOF
205  //
206  // To do this, we clear the preprocessing control stack on entry
207  // to each of the included file.  PrepIncludeStack is used to store
208  // preprocessing control stacks for the current file and all its
209  // parent files.  The back() element is the preprocessing control
210  // stack for the current file.
211  std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
212      PrepIncludeStack;
213
214  // Validate that the current preprocessing control stack is empty,
215  // since we are about to exit a file, and pop the include stack.
216  //
217  // If IncludeStackMustBeEmpty is true, the include stack must be empty
218  // after the popping, otherwise, the include stack must not be empty
219  // after the popping.  Basically, the include stack must be empty
220  // only if we exit the "top-level" file (i.e. finish lexing).
221  //
222  // The method returns false, if the current preprocessing control stack
223  // is not empty (e.g. there is an unterminated #ifdef/#else),
224  // true - otherwise.
225  bool prepExitInclude(bool IncludeStackMustBeEmpty);
226
227  // Look ahead for a preprocessing directive starting from CurPtr.  The caller
228  // must only call this method, if *(CurPtr - 1) is '#'.  If the method matches
229  // a preprocessing directive word followed by a whitespace, then it returns
230  // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
231  //
232  // CurPtr is not adjusted by this method.
233  tgtok::TokKind prepIsDirective() const;
234
235  // Given a preprocessing token kind, adjusts CurPtr to the end
236  // of the preprocessing directive word.  Returns true, unless
237  // an unsupported token kind is passed in.
238  //
239  // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
240  // to avoid adjusting CurPtr before we are sure that '#' is followed
241  // by a preprocessing directive.  If it is not, then we fall back to
242  // tgtok::paste interpretation of '#'.
243  bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
244
245  // The main "exit" point from the token parsing to preprocessor.
246  //
247  // The method is called for CurPtr, when prepIsDirective() returns
248  // true.  The first parameter matches the result of prepIsDirective(),
249  // denoting the actual preprocessor directive to be processed.
250  //
251  // If the preprocessing directive disables the tokens processing, e.g.:
252  //     #ifdef NAME // NAME is undefined
253  // then lexPreprocessor() enters the lines-skipping mode.
254  // In this mode, it does not parse any tokens, because the code under
255  // the #ifdef may not even be a correct tablegen code.  The preprocessor
256  // looks for lines containing other preprocessing directives, which
257  // may be prepended with whitespaces and C-style comments.  If the line
258  // does not contain a preprocessing directive, it is skipped completely.
259  // Otherwise, the preprocessing directive is processed by recursively
260  // calling lexPreprocessor().  The processing of the encountered
261  // preprocessing directives includes updating preprocessing control stack
262  // and adding new macros into DefinedMacros set.
263  //
264  // The second parameter controls whether lexPreprocessor() is called from
265  // LexToken() (true) or recursively from lexPreprocessor() (false).
266  //
267  // If ReturnNextLiveToken is true, the method returns the next
268  // LEX token following the current directive or following the end
269  // of the disabled preprocessing region corresponding to this directive.
270  // If ReturnNextLiveToken is false, the method returns the first parameter,
271  // unless there were errors encountered in the disabled preprocessing
272  // region - in this case, it returns tgtok::Error.
273  tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
274                                 bool ReturnNextLiveToken = true);
275
276  // Worker method for lexPreprocessor() to skip lines after some
277  // preprocessing directive up to the buffer end or to the directive
278  // that re-enables token processing.  The method returns true
279  // upon processing the next directive that re-enables tokens
280  // processing.  False is returned if an error was encountered.
281  //
282  // Note that prepSkipRegion() calls lexPreprocessor() to process
283  // encountered preprocessing directives.  In this case, the second
284  // parameter to lexPreprocessor() is set to false.  Being passed
285  // false ReturnNextLiveToken, lexPreprocessor() must never call
286  // prepSkipRegion().  We assert this by passing ReturnNextLiveToken
287  // to prepSkipRegion() and checking that it is never set to false.
288  bool prepSkipRegion(bool MustNeverBeFalse);
289
290  // Lex name of the macro after either #ifdef or #define.  We could have used
291  // LexIdentifier(), but it has special handling of "include" word, which
292  // could result in awkward diagnostic errors.  Consider:
293  // ----
294  // #ifdef include
295  // class ...
296  // ----
297  // LexIdentifier() will engage LexInclude(), which will complain about
298  // missing file with name "class".  Instead, prepLexMacroName() will treat
299  // "include" as a normal macro name.
300  //
301  // On entry, CurPtr points to the end of a preprocessing directive word.
302  // The method allows for whitespaces between the preprocessing directive
303  // and the macro name.  The allowed whitespaces are ' ' and '\t'.
304  //
305  // If the first non-whitespace symbol after the preprocessing directive
306  // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
307  // the method updates TokStart to the position of the first non-whitespace
308  // symbol, sets CurPtr to the position of the macro name's last symbol,
309  // and returns a string reference to the macro name.  Otherwise,
310  // TokStart is set to the first non-whitespace symbol after the preprocessing
311  // directive, and the method returns an empty string reference.
312  //
313  // In all cases, TokStart may be used to point to the word following
314  // the preprocessing directive.
315  StringRef prepLexMacroName();
316
317  // Skip any whitespaces starting from CurPtr.  The method is used
318  // only in the lines-skipping mode to find the first non-whitespace
319  // symbol after or at CurPtr.  Allowed whitespaces are ' ', '\t', '\n'
320  // and '\r'.  The method skips C-style comments as well, because
321  // it is used to find the beginning of the preprocessing directive.
322  // If we do not handle C-style comments the following code would
323  // result in incorrect detection of a preprocessing directive:
324  //     /*
325  //     #ifdef NAME
326  //     */
327  // As long as we skip C-style comments, the following code is correctly
328  // recognized as a preprocessing directive:
329  //     /* first line comment
330  //        second line comment */ #ifdef NAME
331  //
332  // The method returns true upon reaching the first non-whitespace symbol
333  // or EOF, CurPtr is set to point to this symbol.  The method returns false,
334  // if an error occured during skipping of a C-style comment.
335  bool prepSkipLineBegin();
336
337  // Skip any whitespaces or comments after a preprocessing directive.
338  // The method returns true upon reaching either end of the line
339  // or end of the file.  If there is a multiline C-style comment
340  // after the preprocessing directive, the method skips
341  // the comment, so the final CurPtr may point to one of the next lines.
342  // The method returns false, if an error occured during skipping
343  // C- or C++-style comment, or a non-whitespace symbol appears
344  // after the preprocessing directive.
345  //
346  // The method maybe called both during lines-skipping and tokens
347  // processing.  It actually verifies that only whitespaces or/and
348  // comments follow a preprocessing directive.
349  //
350  // After the execution of this mehod, CurPtr points either to new line
351  // symbol, buffer end or non-whitespace symbol following the preprocesing
352  // directive.
353  bool prepSkipDirectiveEnd();
354
355  // Skip all symbols to the end of the line/file.
356  // The method adjusts CurPtr, so that it points to either new line
357  // symbol in the current line or the buffer end.
358  void prepSkipToLineEnd();
359
360  // Return true, if the current preprocessor control stack is such that
361  // we should allow lexer to process the next token, false - otherwise.
362  //
363  // In particular, the method returns true, if all the #ifdef/#else
364  // controls on the stack have their IsDefined member set to true.
365  bool prepIsProcessingEnabled();
366
367  // Report an error, if we reach EOF with non-empty preprocessing control
368  // stack.  This means there is no matching #endif for the previous
369  // #ifdef/#else.
370  void prepReportPreprocessorStackError();
371};
372
373} // end namespace llvm
374
375#endif
376