//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implement the Lexer for TableGen.
//
//===----------------------------------------------------------------------===//

#include "TGLexer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Config/config.h" // for strtoull()/strtoll() define
#include "llvm/Support/Compiler.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/TableGen/Error.h"
#include <algorithm>
#include <cctype>
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

using namespace llvm;

namespace {
// A list of supported preprocessing directives with their
// internal token kinds and names.
struct {
  tgtok::TokKind Kind;
  const char *Word;
} PreprocessorDirs[] = {
  { tgtok::Ifdef, "ifdef" },
  { tgtok::Ifndef, "ifndef" },
  { tgtok::Else, "else" },
  { tgtok::Endif, "endif" },
  { tgtok::Define, "define" }
};
} // end anonymous namespace

TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) {
  CurBuffer = SrcMgr.getMainFileID();
  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  CurPtr = CurBuf.begin();
  TokStart = nullptr;

  // Pretend that we enter the "top-level" include file.
  PrepIncludeStack.push_back(
      std::make_unique<std::vector<PreprocessorControlDesc>>());

  // Put all macros defined in the command line into the DefinedMacros set.
  for (const std::string &MacroName : Macros)
    DefinedMacros.insert(MacroName);
}

SMLoc TGLexer::getLoc() const {
  return SMLoc::getFromPointer(TokStart);
}

SMRange TGLexer::getLocRange() const {
  return {getLoc(), SMLoc::getFromPointer(CurPtr)};
}

/// ReturnError - Set the error to the specified string at the specified
/// location.  This is defined to always return tgtok::Error.
tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) {
  PrintError(Loc, Msg);
  return tgtok::Error;
}

tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
  return ReturnError(SMLoc::getFromPointer(Loc), Msg);
}

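// processEOF - We just reached the end of the current buffer.  If this is an
// included file, switch back to the parent buffer and report that lexing can
// continue; otherwise report that the top-level file is exhausted.  In both
// cases prepExitInclude() verifies that no #ifdef region is left open in the
// file being closed.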
bool TGLexer::processEOF() {
  SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
  if (ParentIncludeLoc != SMLoc()) {
    // If prepExitInclude() detects a problem with the preprocessing
    // control stack, it will return false.  Pretend that we reached
    // the final EOF and stop lexing more tokens by returning false
    // to LexToken().
    if (!prepExitInclude(false))
      return false;

    CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
    CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
    CurPtr = ParentIncludeLoc.getPointer();
    // Make sure TokStart points into the parent file's buffer.
    // LexToken() assigns to it before calling getNextChar(),
    // so it is pointing into the included file now.
    TokStart = CurPtr;
    return true;
  }

  // Pretend that we exit the "top-level" include file.
  // Note that in case of an error (e.g. control stack imbalance)
  // the routine will issue a fatal error.
  prepExitInclude(true);
  return false;
}

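// getNextChar - Consume and return the next character from the current
// buffer.  Both '\r\n' and '\n\r' sequences are folded into a single '\n',
// a stray NUL inside the buffer is diagnosed and returned as a space, and
// EOF is returned at the end of the buffer.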
int TGLexer::getNextChar() {
  char CurChar = *CurPtr++;
  switch (CurChar) {
  default:
    return (unsigned char)CurChar;

  case 0: {
    // A NUL character in the stream is either the end of the current buffer or
    // a spurious NUL in the file.  Disambiguate that here.
    if (CurPtr - 1 == CurBuf.end()) {
      --CurPtr; // Arrange for another call to return EOF again.
      return EOF;
    }
    PrintError(getLoc(),
               "NUL character is invalid in source; treated as space");
    return ' ';
  }

  case '\n':
  case '\r':
    // Handle the newline character by returning '\n'.  However, be careful
    // about 'dos style' files with \n\r in them: only treat a \n\r or \r\n
    // pair as a single newline.
    if ((*CurPtr == '\n' || *CurPtr == '\r') && *CurPtr != CurChar)
      ++CurPtr; // Eat the two char newline sequence.
    return '\n';
  }
}

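// peekNextChar - Return the character Index positions past CurPtr without
// consuming any input.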
int TGLexer::peekNextChar(int Index) const {
  return *(CurPtr + Index);
}

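// LexToken - Dispatch on the next character and produce one token.
// FileOrLineStart is true when lexing starts at the beginning of a file or
// line, which is the only position where '#' may introduce a preprocessing
// directive rather than the paste operator.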
tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
  TokStart = CurPtr;
  // This always consumes at least one character.
  int CurChar = getNextChar();

  switch (CurChar) {
  default:
    // Handle letters: [a-zA-Z_]
    if (isalpha(CurChar) || CurChar == '_')
      return LexIdentifier();

    // Unknown character, emit an error.
    return ReturnError(TokStart, "Unexpected character");
  case EOF:
    // Lex next token, if we just left an include file.
    // Note that leaving an include file means that the next
    // symbol is located at the end of the 'include "..."'
    // construct, so LexToken() is called with default
    // false parameter.
    if (processEOF())
      return LexToken();

    // Return EOF denoting the end of lexing.
    return tgtok::Eof;

  case ':': return tgtok::colon;
  case ';': return tgtok::semi;
  case ',': return tgtok::comma;
  case '<': return tgtok::less;
  case '>': return tgtok::greater;
  case ']': return tgtok::r_square;
  case '{': return tgtok::l_brace;
  case '}': return tgtok::r_brace;
  case '(': return tgtok::l_paren;
  case ')': return tgtok::r_paren;
  case '=': return tgtok::equal;
  case '?': return tgtok::question;
  case '#':
    if (FileOrLineStart) {
      tgtok::TokKind Kind = prepIsDirective();
      if (Kind != tgtok::Error)
        return lexPreprocessor(Kind);
    }

    return tgtok::paste;

  // The period is a separate case so we can recognize the "..."
  // range punctuator.
  case '.':
    if (peekNextChar(0) == '.') {
      ++CurPtr; // Eat second dot.
      if (peekNextChar(0) == '.') {
        ++CurPtr; // Eat third dot.
        return tgtok::dotdotdot;
      }
      return ReturnError(TokStart, "Invalid '..' punctuation");
    }
    return tgtok::dot;

  case '\r':
    PrintFatalError("getNextChar() must never return '\r'");
    return tgtok::Error;

  case ' ':
  case '\t':
    // Ignore whitespace.
    return LexToken(FileOrLineStart);
  case '\n':
    // Ignore whitespace, and identify the new line.
    return LexToken(true);
  case '/':
    // If this is the start of a // comment, skip until the end of the line or
    // the end of the buffer.
    if (*CurPtr == '/')
      SkipBCPLComment();
    else if (*CurPtr == '*') {
      if (SkipCComment())
        return tgtok::Error;
    } else // Otherwise, this is an error.
      return ReturnError(TokStart, "Unexpected character");
    return LexToken(FileOrLineStart);
  case '-': case '+':
  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
  case '7': case '8': case '9': {
    int NextChar = 0;
    if (isdigit(CurChar)) {
      // Allow identifiers to start with a number if it is followed by
      // an identifier.  This can happen with paste operations like
      // foo#8i.
      int i = 0;
      do {
        NextChar = peekNextChar(i++);
      } while (isdigit(NextChar));

      if (NextChar == 'x' || NextChar == 'b') {
        // If this is [0-9]b[01] or [0-9]x[0-9a-fA-F], this is most
        // likely a number.
        int NextNextChar = peekNextChar(i);
        switch (NextNextChar) {
        default:
          break;
        case '0': case '1':
          if (NextChar == 'b')
            return LexNumber();
          [[fallthrough]];
        case '2': case '3': case '4': case '5':
        case '6': case '7': case '8': case '9':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
          if (NextChar == 'x')
            return LexNumber();
          break;
        }
      }
    }

    if (isalpha(NextChar) || NextChar == '_')
      return LexIdentifier();

    return LexNumber();
  }
  case '"': return LexString();
  case '$': return LexVarName();
  case '[': return LexBracket();
  case '!': return LexExclaim();
  }
}

/// LexString - Lex "[^"]*"
tgtok::TokKind TGLexer::LexString() {
  const char *StrStart = CurPtr;

  CurStrVal = "";

  while (*CurPtr != '"') {
    // If we hit the end of the buffer, report an error.
    if (*CurPtr == 0 && CurPtr == CurBuf.end())
      return ReturnError(StrStart, "End of file in string literal");

    if (*CurPtr == '\n' || *CurPtr == '\r')
      return ReturnError(StrStart, "End of line in string literal");

    if (*CurPtr != '\\') {
      CurStrVal += *CurPtr++;
      continue;
    }

    ++CurPtr;

    switch (*CurPtr) {
    case '\\': case '\'': case '"':
      // These turn into their literal character.
      CurStrVal += *CurPtr++;
      break;
    case 't':
      CurStrVal += '\t';
      ++CurPtr;
      break;
    case 'n':
      CurStrVal += '\n';
      ++CurPtr;
      break;

    case '\n':
    case '\r':
      return ReturnError(CurPtr, "escaped newlines not supported in tblgen");

    // If we hit the end of the buffer, report an error.
    case '\0':
      if (CurPtr == CurBuf.end())
        return ReturnError(StrStart, "End of file in string literal");
      [[fallthrough]];
    default:
      return ReturnError(CurPtr, "invalid escape in string literal");
    }
  }

  ++CurPtr;
  return tgtok::StrVal;
}

tgtok::TokKind TGLexer::LexVarName() {
  if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
    return ReturnError(TokStart, "Invalid variable name");

  // Otherwise, we're ok, consume the rest of the characters.
  const char *VarNameStart = CurPtr++;

  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  CurStrVal.assign(VarNameStart, CurPtr);
  return tgtok::VarName;
}

tgtok::TokKind TGLexer::LexIdentifier() {
  // The first letter is [a-zA-Z_].
  const char *IdentStart = TokStart;

  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  // Check to see if this identifier is a reserved keyword.
  StringRef Str(IdentStart, CurPtr-IdentStart);

  tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
                            .Case("int", tgtok::Int)
                            .Case("bit", tgtok::Bit)
                            .Case("bits", tgtok::Bits)
                            .Case("string", tgtok::String)
                            .Case("list", tgtok::List)
                            .Case("code", tgtok::Code)
                            .Case("dag", tgtok::Dag)
                            .Case("class", tgtok::Class)
                            .Case("def", tgtok::Def)
                            .Case("true", tgtok::TrueVal)
                            .Case("false", tgtok::FalseVal)
                            .Case("foreach", tgtok::Foreach)
                            .Case("defm", tgtok::Defm)
                            .Case("defset", tgtok::Defset)
                            .Case("multiclass", tgtok::MultiClass)
                            .Case("field", tgtok::Field)
                            .Case("let", tgtok::Let)
                            .Case("in", tgtok::In)
                            .Case("defvar", tgtok::Defvar)
                            .Case("include", tgtok::Include)
                            .Case("if", tgtok::If)
                            .Case("then", tgtok::Then)
                            .Case("else", tgtok::ElseKW)
                            .Case("assert", tgtok::Assert)
                            .Case("dump", tgtok::Dump)
                            .Default(tgtok::Id);

  // A couple of tokens require special processing.
  switch (Kind) {
    case tgtok::Include:
      if (LexInclude()) return tgtok::Error;
      return Lex();
    case tgtok::Id:
      CurStrVal.assign(Str.begin(), Str.end());
      break;
    default:
      break;
  }

  return Kind;
}

/// LexInclude - We just read the "include" token.  Get the string token that
/// comes next and enter the include.
bool TGLexer::LexInclude() {
  // The token after the include must be a string.
  tgtok::TokKind Tok = LexToken();
  if (Tok == tgtok::Error) return true;
  if (Tok != tgtok::StrVal) {
    PrintError(getLoc(), "Expected filename after include");
    return true;
  }

  // Get the string.
  std::string Filename = CurStrVal;
  std::string IncludedFile;

  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
                                    IncludedFile);
  if (!CurBuffer) {
    PrintError(getLoc(), "Could not find include file '" + Filename + "'");
    return true;
  }

  Dependencies.insert(IncludedFile);
  // Switch to the included file's buffer and start lexing it from the top.
  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  CurPtr = CurBuf.begin();

  PrepIncludeStack.push_back(
      std::make_unique<std::vector<PreprocessorControlDesc>>());
  return false;
}

/// SkipBCPLComment - Skip over the comment by finding the next CR or LF,
/// or the end of the buffer.
void TGLexer::SkipBCPLComment() {
  ++CurPtr;  // skip the second slash.
  auto EOLPos = CurBuf.find_first_of("\r\n", CurPtr - CurBuf.data());
  CurPtr = (EOLPos == StringRef::npos) ? CurBuf.end() : CurBuf.data() + EOLPos;
}

/// SkipCComment - This skips C-style /**/ comments.  The only difference from C
/// is that we allow nesting.
bool TGLexer::SkipCComment() {
  ++CurPtr;  // skip the star.
  unsigned CommentDepth = 1;

  while (true) {
    int CurChar = getNextChar();
    switch (CurChar) {
    case EOF:
      PrintError(TokStart, "Unterminated comment!");
      return true;
    case '*':
      // End of the comment?
      if (CurPtr[0] != '/') break;

      ++CurPtr;   // End the */.
      if (--CommentDepth == 0)
        return false;
      break;
    case '/':
      // Start of a nested comment?
      if (CurPtr[0] != '*') break;
      ++CurPtr;
      ++CommentDepth;
      break;
    }
  }
}

/// LexNumber - Lex:
///    [-+]?[0-9]+
///    0x[0-9a-fA-F]+
///    0b[01]+
tgtok::TokKind TGLexer::LexNumber() {
  unsigned Base = 0;
  const char *NumStart;

  // Check if it's a hex or a binary value.
  if (CurPtr[-1] == '0') {
    NumStart = CurPtr + 1;
    if (CurPtr[0] == 'x') {
      Base = 16;
      do
        ++CurPtr;
      while (isxdigit(CurPtr[0]));
    } else if (CurPtr[0] == 'b') {
      Base = 2;
      do
        ++CurPtr;
      while (CurPtr[0] == '0' || CurPtr[0] == '1');
    }
  }

  // For a hex or binary value, we always convert it to an unsigned value.
  bool IsMinus = false;

  // Check if it's a decimal value.
  if (Base == 0) {
    // Check for a sign without a digit.
    if (!isdigit(CurPtr[0])) {
      if (CurPtr[-1] == '-')
        return tgtok::minus;
      else if (CurPtr[-1] == '+')
        return tgtok::plus;
    }

    Base = 10;
    NumStart = TokStart;
    IsMinus = CurPtr[-1] == '-';

    while (isdigit(CurPtr[0]))
      ++CurPtr;
  }

  // Requires at least one digit.
  if (CurPtr == NumStart)
    return ReturnError(TokStart, "Invalid number");

  errno = 0;
  if (IsMinus)
    CurIntVal = strtoll(NumStart, nullptr, Base);
  else
    CurIntVal = strtoull(NumStart, nullptr, Base);

  if (errno == EINVAL)
    return ReturnError(TokStart, "Invalid number");
  if (errno == ERANGE)
    return ReturnError(TokStart, "Number out of range");

  return Base == 2 ? tgtok::BinaryIntVal : tgtok::IntVal;
}

/// LexBracket - We just read '['.  If this is a code block, return it,
/// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
tgtok::TokKind TGLexer::LexBracket() {
  if (CurPtr[0] != '{')
    return tgtok::l_square;
  ++CurPtr;
  const char *CodeStart = CurPtr;
  while (true) {
    int Char = getNextChar();
    if (Char == EOF) break;

    if (Char != '}') continue;

    Char = getNextChar();
    if (Char == EOF) break;
    if (Char == ']') {
      CurStrVal.assign(CodeStart, CurPtr-2);
      return tgtok::CodeFragment;
    }
  }

  return ReturnError(CodeStart - 2, "Unterminated code block");
}

/// LexExclaim - Lex '!' and '![a-zA-Z]+'.
tgtok::TokKind TGLexer::LexExclaim() {
  if (!isalpha(*CurPtr))
    return ReturnError(CurPtr - 1, "Invalid \"!operator\"");

  const char *Start = CurPtr++;
  while (isalpha(*CurPtr))
    ++CurPtr;

  // Check to see which operator this is.
  tgtok::TokKind Kind =
      StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
          .Case("eq", tgtok::XEq)
          .Case("ne", tgtok::XNe)
          .Case("le", tgtok::XLe)
          .Case("lt", tgtok::XLt)
          .Case("ge", tgtok::XGe)
          .Case("gt", tgtok::XGt)
          .Case("if", tgtok::XIf)
          .Case("cond", tgtok::XCond)
          .Case("isa", tgtok::XIsA)
          .Case("head", tgtok::XHead)
          .Case("tail", tgtok::XTail)
          .Case("size", tgtok::XSize)
          .Case("con", tgtok::XConcat)
          .Case("dag", tgtok::XDag)
          .Case("add", tgtok::XADD)
          .Case("sub", tgtok::XSUB)
          .Case("mul", tgtok::XMUL)
          .Case("div", tgtok::XDIV)
          .Case("not", tgtok::XNOT)
          .Case("logtwo", tgtok::XLOG2)
          .Case("and", tgtok::XAND)
          .Case("or", tgtok::XOR)
          .Case("xor", tgtok::XXOR)
          .Case("shl", tgtok::XSHL)
          .Case("sra", tgtok::XSRA)
          .Case("srl", tgtok::XSRL)
          .Case("cast", tgtok::XCast)
          .Case("empty", tgtok::XEmpty)
          .Case("subst", tgtok::XSubst)
          .Case("foldl", tgtok::XFoldl)
          .Case("foreach", tgtok::XForEach)
          .Case("filter", tgtok::XFilter)
          .Case("listconcat", tgtok::XListConcat)
          .Case("listsplat", tgtok::XListSplat)
          .Case("listremove", tgtok::XListRemove)
          .Case("range", tgtok::XRange)
          .Case("strconcat", tgtok::XStrConcat)
          .Case("interleave", tgtok::XInterleave)
          .Case("substr", tgtok::XSubstr)
          .Case("find", tgtok::XFind)
          .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
          .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
          .Case("getdagarg", tgtok::XGetDagArg)
          .Case("getdagname", tgtok::XGetDagName)
          .Case("setdagarg", tgtok::XSetDagArg)
          .Case("setdagname", tgtok::XSetDagName)
          .Case("exists", tgtok::XExists)
          .Case("tolower", tgtok::XToLower)
          .Case("toupper", tgtok::XToUpper)
          .Case("repr", tgtok::XRepr)
          .Default(tgtok::Error);

  return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
}

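// prepExitInclude - Pop the preprocessing control stack of the file being
// left.  Report an error if the file ends inside an open #ifdef/#else
// region, and check that the include stack itself stays balanced.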
bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
  // Report an error if the preprocessor control stack for the current
  // file is not empty.
  if (!PrepIncludeStack.back()->empty()) {
    prepReportPreprocessorStackError();

    return false;
  }

  // Pop the preprocessing controls from the include stack.
  if (PrepIncludeStack.empty()) {
    PrintFatalError("Preprocessor include stack is empty");
  }

  PrepIncludeStack.pop_back();

  if (IncludeStackMustBeEmpty) {
    if (!PrepIncludeStack.empty())
      PrintFatalError("Preprocessor include stack is not empty");
  } else {
    if (PrepIncludeStack.empty())
      PrintFatalError("Preprocessor include stack is empty");
  }

  return true;
}

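// prepIsDirective - Without consuming input, check whether CurPtr points at
// one of the supported preprocessing directive words ("ifdef", "ifndef",
// "else", "endif", "define") followed by whitespace, a comment, a newline,
// or EOF.  Return the directive's token kind, or tgtok::Error if none match.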
tgtok::TokKind TGLexer::prepIsDirective() const {
  for (const auto &PD : PreprocessorDirs) {
    int NextChar = *CurPtr;
    bool Match = true;
    unsigned I = 0;
    for (; I < strlen(PD.Word); ++I) {
      if (NextChar != PD.Word[I]) {
        Match = false;
        break;
      }

      NextChar = peekNextChar(I + 1);
    }

    // Check for whitespace after the directive.  If there is no whitespace,
    // then we do not recognize it as a preprocessing directive.
    if (Match) {
      tgtok::TokKind Kind = PD.Kind;

      // New line and EOF may follow only #else/#endif.  It will be reported
      // as an error for #ifdef/#define after the call to prepLexMacroName().
      if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF ||
          NextChar == '\n' ||
          // It looks like TableGen does not support '\r' as the actual
          // carriage return, e.g. getNextChar() treats a single '\r'
          // as '\n'.  So we do the same here.
          NextChar == '\r')
        return Kind;

      // Allow comments after some directives, e.g.:
      //     #else// OR #else/**/
      //     #endif// OR #endif/**/
      //
      // Note that we do allow comments after #ifdef/#define here, e.g.
      //     #ifdef/**/ AND #ifdef//
      //     #define/**/ AND #define//
      //
      // These cases will be reported as incorrect after calling
      // prepLexMacroName().  We could have supported C-style comments
      // after #ifdef/#define, but this would complicate the code
      // for little benefit.
      if (NextChar == '/') {
        NextChar = peekNextChar(I + 1);

        if (NextChar == '*' || NextChar == '/')
          return Kind;

        // Pretend that we do not recognize the directive.
      }
    }
  }

  return tgtok::Error;
}

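// prepEatPreprocessorDirective - Advance CurPtr past the directive word whose
// token kind is Kind.  The caller must already have verified the directive
// with prepIsDirective().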
bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
  TokStart = CurPtr;

  for (const auto &PD : PreprocessorDirs)
    if (PD.Kind == Kind) {
      // Advance CurPtr to the end of the preprocessing word.
      CurPtr += strlen(PD.Word);
      return true;
    }

  PrintFatalError("Unsupported preprocessing token in "
                  "prepEatPreprocessorDirective()");
  return false;
}

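// lexPreprocessor - Handle one preprocessing directive, e.g.:
//
//     #ifdef NAME ... #else ... #endif
//     #define NAME
//
// ReturnNextLiveToken is true when the call comes from normal token
// processing (LexToken); it is false when the call comes from the
// lines-skipping mode in prepSkipRegion(), in which case the directive's own
// token kind is returned instead of the next live token.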
tgtok::TokKind TGLexer::lexPreprocessor(
    tgtok::TokKind Kind, bool ReturnNextLiveToken) {

  // We must be looking at a preprocessing directive.  Eat it!
  if (!prepEatPreprocessorDirective(Kind))
    PrintFatalError("lexPreprocessor() called for unknown "
                    "preprocessor directive");

  if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) {
    StringRef MacroName = prepLexMacroName();
    StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef";
    if (MacroName.empty())
      return ReturnError(TokStart, "Expected macro name after " + IfTokName);

    bool MacroIsDefined = DefinedMacros.count(MacroName) != 0;

    // Canonicalize ifndef's MacroIsDefined to its ifdef equivalent.
    if (Kind == tgtok::Ifndef)
      MacroIsDefined = !MacroIsDefined;

    // Regardless of whether we are processing tokens or not,
    // we put the #ifdef control on the stack.
    // Note that MacroIsDefined has been canonicalized against ifdef.
    PrepIncludeStack.back()->push_back(
        {tgtok::Ifdef, MacroIsDefined, SMLoc::getFromPointer(TokStart)});

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "Only comments are supported after " +
                                     IfTokName + " NAME");

    // If we were not processing tokens before this #ifdef,
    // then just return to the lines skipping code.
    if (!ReturnNextLiveToken)
      return Kind;

    // If we were processing tokens before this #ifdef,
    // and the macro is defined, then just return the next token.
    if (MacroIsDefined)
      return LexToken();

    // We were processing tokens before this #ifdef, and the macro
    // is not defined, so we have to start skipping the lines.
    // If the skipping is successful, it will return the token following
    // either #else or #endif corresponding to this #ifdef.
    if (prepSkipRegion(ReturnNextLiveToken))
      return LexToken();

    return tgtok::Error;
  } else if (Kind == tgtok::Else) {
    // Check if this #else is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #else.
    if (PrepIncludeStack.back()->empty())
      return ReturnError(TokStart, "#else without #ifdef or #ifndef");

    PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back()->back();

    if (IfdefEntry.Kind != tgtok::Ifdef) {
      PrintError(TokStart, "double #else");
      return ReturnError(IfdefEntry.SrcPos, "Previous #else is here");
    }

    // Replace the corresponding #ifdef's control with its negation
    // on the control stack.
    PrepIncludeStack.back()->pop_back();
    PrepIncludeStack.back()->push_back(
        {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)});

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "Only comments are supported after #else");

    // If we were processing tokens before this #else,
    // we have to start skipping lines until the matching #endif.
    if (ReturnNextLiveToken) {
      if (prepSkipRegion(ReturnNextLiveToken))
        return LexToken();

      return tgtok::Error;
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Endif) {
    // Check if this #endif is correct before calling prepSkipDirectiveEnd(),
    // which will move CurPtr away from the beginning of #endif.
    if (PrepIncludeStack.back()->empty())
      return ReturnError(TokStart, "#endif without #ifdef");

    auto &IfdefOrElseEntry = PrepIncludeStack.back()->back();

    if (IfdefOrElseEntry.Kind != tgtok::Ifdef &&
        IfdefOrElseEntry.Kind != tgtok::Else) {
      PrintFatalError("Invalid preprocessor control on the stack");
      return tgtok::Error;
    }

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr, "Only comments are supported after #endif");

    PrepIncludeStack.back()->pop_back();

    // If we were processing tokens before this #endif, then
    // we should continue it.
    if (ReturnNextLiveToken) {
      return LexToken();
    }

    // Return to the lines skipping code.
    return Kind;
  } else if (Kind == tgtok::Define) {
    StringRef MacroName = prepLexMacroName();
    if (MacroName.empty())
      return ReturnError(TokStart, "Expected macro name after #define");

    if (!DefinedMacros.insert(MacroName).second)
      PrintWarning(getLoc(),
                   "Duplicate definition of macro: " + Twine(MacroName));

    if (!prepSkipDirectiveEnd())
      return ReturnError(CurPtr,
                         "Only comments are supported after #define NAME");

    if (!ReturnNextLiveToken) {
      PrintFatalError("#define must be ignored during the lines skipping");
      return tgtok::Error;
    }

    return LexToken();
  }

  PrintFatalError("Preprocessing directive is not supported");
  return tgtok::Error;
}

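// prepSkipRegion - Skip the lines of a disabled #ifdef/#else region until a
// directive switches token processing back on (the matching #else or #endif).
// Return true if processing may resume, false on error.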
bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
  if (!MustNeverBeFalse)
    PrintFatalError("Invalid recursion.");

  do {
    // Skip all symbols to the line end.
    prepSkipToLineEnd();

    // Find the first non-whitespace symbol in the next line(s).
    if (!prepSkipLineBegin())
      return false;

    // If the first non-blank/comment symbol on the line is '#',
    // it may be the start of a preprocessing directive.
    //
    // If it is not '#' just go to the next line.
    if (*CurPtr == '#')
      ++CurPtr;
    else
      continue;

    tgtok::TokKind Kind = prepIsDirective();

    // If we did not find a preprocessing directive or it is #define,
    // then just skip to the next line.  We do not have to do anything
    // for #define in the line-skipping mode.
    if (Kind == tgtok::Error || Kind == tgtok::Define)
      continue;

    tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false);

    // If lexPreprocessor() encountered an error during lexing this
    // preprocessor idiom, then return false to the calling lexPreprocessor().
    // This will force tgtok::Error to be returned to the tokens processing.
    if (ProcessedKind == tgtok::Error)
      return false;

    if (Kind != ProcessedKind)
      PrintFatalError("prepIsDirective() and lexPreprocessor() "
                      "returned different token kinds");

    // If this preprocessing directive enables tokens processing,
    // then return to the lexPreprocessor() and get to the next token.
    // We can move from line-skipping mode to processing tokens only
    // due to #else or #endif.
    if (prepIsProcessingEnabled()) {
      if (Kind != tgtok::Else && Kind != tgtok::Endif) {
        PrintFatalError("Tokens processing was enabled by an unexpected "
                        "preprocessing directive");
        return false;
      }

      return true;
    }
  } while (CurPtr != CurBuf.end());

  // We have reached the end of the file, but never left the lines-skipping
  // mode.  This means there is no matching #endif.
  prepReportPreprocessorStackError();
  return false;
}

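// prepLexMacroName - Skip horizontal whitespace and lex a macro name
// ([a-zA-Z_][0-9a-zA-Z_]*).  Return an empty StringRef if no valid name
// starts at the current position.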
StringRef TGLexer::prepLexMacroName() {
  // Skip whitespace between the preprocessing directive and the macro name.
  while (*CurPtr == ' ' || *CurPtr == '\t')
    ++CurPtr;

  TokStart = CurPtr;
  // Macro names start with [a-zA-Z_].
  if (*CurPtr != '_' && !isalpha(*CurPtr))
    return "";

  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
    ++CurPtr;

  return StringRef(TokStart, CurPtr - TokStart);
}

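// prepSkipLineBegin - While in lines-skipping mode, advance CurPtr over
// whitespace, newlines and C-style comments to the first meaningful
// character of a line.  Return false only if a C-style comment is
// unterminated.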
bool TGLexer::prepSkipLineBegin() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      break;

    case '/': {
      int NextChar = peekNextChar(1);
      if (NextChar == '*') {
        // Skip C-style comment.
        // Note that we do not care about skipping the C++-style comments.
        // If the line contains "//", it may not contain any processable
        // preprocessing directive.  Just return CurPtr pointing to
        // the first '/' in this case.  We also do not care about
        // incorrect symbols after the first '/' - we are in lines-skipping
        // mode, so incorrect code is allowed to some extent.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printing in case of error in SkipCComment().
        TokStart = CurPtr;

        // CurPtr must point to '*' before call to SkipCComment().
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        // CurPtr points to the non-whitespace '/'.
        return true;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      return true;
    }

    ++CurPtr;
  }

  // We have reached the end of the file.  Return to the lines skipping
  // code, and allow it to handle the EOF as needed.
  return true;
}

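// prepSkipDirectiveEnd - Consume the rest of a preprocessing directive's
// line.  Only whitespace and comments may follow the directive; return
// false if any other character is found.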
bool TGLexer::prepSkipDirectiveEnd() {
  while (CurPtr != CurBuf.end()) {
    switch (*CurPtr) {
    case ' ':
    case '\t':
      break;

    case '\n':
    case '\r':
      return true;

    case '/': {
      int NextChar = peekNextChar(1);
      if (NextChar == '/') {
        // Skip C++-style comment.
        // We may just return true now, but let's skip to the line/buffer end
        // to simplify the method specification.
        ++CurPtr;
        SkipBCPLComment();
      } else if (NextChar == '*') {
        // When we are skipping a C-style comment at the end of a preprocessing
        // directive, we can skip several lines.  If any meaningful TD token
        // follows the end of the C-style comment on the same line, it will
        // be considered an invalid usage of a TD token.
        // For example, we want to forbid usages like this one:
        //     #define MACRO class Class {}
        // But with C-style comments we also disallow the following:
        //     #define MACRO /* This macro is used
        //                      to ... */ class Class {}
        // One can argue that this should be allowed, but it does not seem
        // to be worth the complication.  Moreover, this matches
        // the C preprocessor behavior.

        // Set TokStart to the beginning of the comment to enable proper
        // diagnostic printing in case of error in SkipCComment().
        TokStart = CurPtr;
        ++CurPtr;
        if (SkipCComment())
          return false;
      } else {
        TokStart = CurPtr;
        PrintError(CurPtr, "Unexpected character");
        return false;
      }

      // We must not increment CurPtr after the comment was lexed.
      continue;
    }

    default:
      // Do not allow any non-whitespace characters after the directive.
      TokStart = CurPtr;
      return false;
    }

    ++CurPtr;
  }

  return true;
}

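// prepSkipToLineEnd - Advance CurPtr to the next end-of-line character or to
// the end of the buffer without consuming it.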
void TGLexer::prepSkipToLineEnd() {
  while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end())
    ++CurPtr;
}

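// prepIsProcessingEnabled - Return true if tokens should currently be
// processed, i.e. every #ifdef/#else region enclosing the current position
// in this file is live.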
bool TGLexer::prepIsProcessingEnabled() {
  for (const PreprocessorControlDesc &I :
       llvm::reverse(*PrepIncludeStack.back()))
    if (!I.IsDefined)
      return false;

  return true;
}

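// prepReportPreprocessorStackError - Report that EOF was reached with an open
// #ifdef/#else region, pointing at the most recent preprocessing control.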
void TGLexer::prepReportPreprocessorStackError() {
  if (PrepIncludeStack.back()->empty())
    PrintFatalError("prepReportPreprocessorStackError() called with "
                    "empty control stack");

  auto &PrepControl = PrepIncludeStack.back()->back();
  PrintError(CurBuf.end(), "Reached EOF without matching #endif");
  PrintError(PrepControl.SrcPos, "The latest preprocessor control is here");

  TokStart = CurPtr;
}