lib/TableGen/TGLexer.cpp

226584Sdim//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
226584Sdim//
226584Sdim//                     The LLVM Compiler Infrastructure
226584Sdim//
226584Sdim// This file is distributed under the University of Illinois Open Source
226584Sdim// License. See LICENSE.TXT for details.
226584Sdim//
226584Sdim//===----------------------------------------------------------------------===//
226584Sdim//
226584Sdim// Implement the Lexer for TableGen.
226584Sdim//
226584Sdim//===----------------------------------------------------------------------===//
226584Sdim
226584Sdim#include "TGLexer.h"
226584Sdim#include "llvm/ADT/StringSwitch.h"
226584Sdim#include "llvm/ADT/Twine.h"
249423Sdim#include "llvm/Config/config.h" // for strtoull()/strtoll() define
249423Sdim#include "llvm/Support/MemoryBuffer.h"
249423Sdim#include "llvm/Support/SourceMgr.h"
249423Sdim#include "llvm/TableGen/Error.h"
226584Sdim#include <cctype>
249423Sdim#include <cerrno>
226584Sdim#include <cstdio>
226584Sdim#include <cstdlib>
226584Sdim#include <cstring>
234353Sdim
226584Sdimusing namespace llvm;
226584Sdim
226584SdimTGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
226584Sdim  CurBuffer = 0;
226584Sdim  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
226584Sdim  CurPtr = CurBuf->getBufferStart();
226584Sdim  TokStart = 0;
226584Sdim}
226584Sdim
226584SdimSMLoc TGLexer::getLoc() const {
226584Sdim  return SMLoc::getFromPointer(TokStart);
226584Sdim}
226584Sdim
226584Sdim/// ReturnError - Set the error to the specified string at the specified
226584Sdim/// location.  This is defined to always return tgtok::Error.
226584Sdimtgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
226584Sdim  PrintError(Loc, Msg);
226584Sdim  return tgtok::Error;
226584Sdim}
226584Sdim
226584Sdimint TGLexer::getNextChar() {
226584Sdim  char CurChar = *CurPtr++;
226584Sdim  switch (CurChar) {
226584Sdim  default:
226584Sdim    return (unsigned char)CurChar;
226584Sdim  case 0: {
226584Sdim    // A nul character in the stream is either the end of the current buffer or
226584Sdim    // a random nul in the file.  Disambiguate that here.
226584Sdim    if (CurPtr-1 != CurBuf->getBufferEnd())
226584Sdim      return 0;  // Just whitespace.
226584Sdim
226584Sdim    // If this is the end of an included file, pop the parent file off the
226584Sdim    // include stack.
226584Sdim    SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
226584Sdim    if (ParentIncludeLoc != SMLoc()) {
226584Sdim      CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
226584Sdim      CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
226584Sdim      CurPtr = ParentIncludeLoc.getPointer();
226584Sdim      return getNextChar();
226584Sdim    }
226584Sdim
226584Sdim    // Otherwise, return end of file.
226584Sdim    --CurPtr;  // Another call to lex will return EOF again.
226584Sdim    return EOF;
226584Sdim  }
226584Sdim  case '\n':
226584Sdim  case '\r':
226584Sdim    // Handle the newline character by ignoring it and incrementing the line
226584Sdim    // count.  However, be careful about 'dos style' files with \n\r in them.
226584Sdim    // Only treat a \n\r or \r\n as a single line.
226584Sdim    if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
226584Sdim        *CurPtr != CurChar)
226584Sdim      ++CurPtr;  // Eat the two char newline sequence.
226584Sdim    return '\n';
226584Sdim  }
226584Sdim}
226584Sdim
234353Sdimint TGLexer::peekNextChar(int Index) {
234353Sdim  return *(CurPtr + Index);
234353Sdim}
234353Sdim
226584Sdimtgtok::TokKind TGLexer::LexToken() {
226584Sdim  TokStart = CurPtr;
226584Sdim  // This always consumes at least one character.
226584Sdim  int CurChar = getNextChar();
226584Sdim
226584Sdim  switch (CurChar) {
226584Sdim  default:
234353Sdim    // Handle letters: [a-zA-Z_]
234353Sdim    if (isalpha(CurChar) || CurChar == '_')
226584Sdim      return LexIdentifier();
234353Sdim
226584Sdim    // Unknown character, emit an error.
226584Sdim    return ReturnError(TokStart, "Unexpected character");
226584Sdim  case EOF: return tgtok::Eof;
226584Sdim  case ':': return tgtok::colon;
226584Sdim  case ';': return tgtok::semi;
226584Sdim  case '.': return tgtok::period;
226584Sdim  case ',': return tgtok::comma;
226584Sdim  case '<': return tgtok::less;
226584Sdim  case '>': return tgtok::greater;
226584Sdim  case ']': return tgtok::r_square;
226584Sdim  case '{': return tgtok::l_brace;
226584Sdim  case '}': return tgtok::r_brace;
226584Sdim  case '(': return tgtok::l_paren;
226584Sdim  case ')': return tgtok::r_paren;
226584Sdim  case '=': return tgtok::equal;
226584Sdim  case '?': return tgtok::question;
234353Sdim  case '#': return tgtok::paste;
226584Sdim
226584Sdim  case 0:
226584Sdim  case ' ':
226584Sdim  case '\t':
226584Sdim  case '\n':
226584Sdim  case '\r':
226584Sdim    // Ignore whitespace.
226584Sdim    return LexToken();
226584Sdim  case '/':
226584Sdim    // If this is the start of a // comment, skip until the end of the line or
226584Sdim    // the end of the buffer.
226584Sdim    if (*CurPtr == '/')
226584Sdim      SkipBCPLComment();
226584Sdim    else if (*CurPtr == '*') {
226584Sdim      if (SkipCComment())
226584Sdim        return tgtok::Error;
226584Sdim    } else // Otherwise, this is an error.
226584Sdim      return ReturnError(TokStart, "Unexpected character");
226584Sdim    return LexToken();
226584Sdim  case '-': case '+':
226584Sdim  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
234353Sdim  case '7': case '8': case '9': {
234353Sdim    int NextChar = 0;
234353Sdim    if (isdigit(CurChar)) {
234353Sdim      // Allow identifiers to start with a number if it is followed by
234353Sdim      // an identifier.  This can happen with paste operations like
234353Sdim      // foo#8i.
234353Sdim      int i = 0;
234353Sdim      do {
234353Sdim        NextChar = peekNextChar(i++);
234353Sdim      } while (isdigit(NextChar));
234353Sdim
234353Sdim      if (NextChar == 'x' || NextChar == 'b') {
234353Sdim        // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
234353Sdim        // likely a number.
234353Sdim        int NextNextChar = peekNextChar(i);
234353Sdim        switch (NextNextChar) {
234353Sdim        default:
234353Sdim          break;
234353Sdim        case '0': case '1':
234353Sdim          if (NextChar == 'b')
234353Sdim            return LexNumber();
234353Sdim          // Fallthrough
234353Sdim        case '2': case '3': case '4': case '5':
234353Sdim        case '6': case '7': case '8': case '9':
234353Sdim        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
234353Sdim        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
234353Sdim          if (NextChar == 'x')
234353Sdim            return LexNumber();
234353Sdim          break;
234353Sdim        }
234353Sdim      }
234353Sdim    }
234353Sdim
234353Sdim    if (isalpha(NextChar) || NextChar == '_')
234353Sdim      return LexIdentifier();
234353Sdim
226584Sdim    return LexNumber();
234353Sdim  }
226584Sdim  case '"': return LexString();
226584Sdim  case '$': return LexVarName();
226584Sdim  case '[': return LexBracket();
226584Sdim  case '!': return LexExclaim();
226584Sdim  }
226584Sdim}
226584Sdim
226584Sdim/// LexString - Lex "[^"]*"
226584Sdimtgtok::TokKind TGLexer::LexString() {
226584Sdim  const char *StrStart = CurPtr;
226584Sdim
226584Sdim  CurStrVal = "";
226584Sdim
226584Sdim  while (*CurPtr != '"') {
226584Sdim    // If we hit the end of the buffer, report an error.
226584Sdim    if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
226584Sdim      return ReturnError(StrStart, "End of file in string literal");
226584Sdim
226584Sdim    if (*CurPtr == '\n' || *CurPtr == '\r')
226584Sdim      return ReturnError(StrStart, "End of line in string literal");
226584Sdim
226584Sdim    if (*CurPtr != '\\') {
226584Sdim      CurStrVal += *CurPtr++;
226584Sdim      continue;
226584Sdim    }
226584Sdim
226584Sdim    ++CurPtr;
226584Sdim
226584Sdim    switch (*CurPtr) {
226584Sdim    case '\\': case '\'': case '"':
226584Sdim      // These turn into their literal character.
226584Sdim      CurStrVal += *CurPtr++;
226584Sdim      break;
226584Sdim    case 't':
226584Sdim      CurStrVal += '\t';
226584Sdim      ++CurPtr;
226584Sdim      break;
226584Sdim    case 'n':
226584Sdim      CurStrVal += '\n';
226584Sdim      ++CurPtr;
226584Sdim      break;
226584Sdim
226584Sdim    case '\n':
226584Sdim    case '\r':
226584Sdim      return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
226584Sdim
226584Sdim    // If we hit the end of the buffer, report an error.
226584Sdim    case '\0':
226584Sdim      if (CurPtr == CurBuf->getBufferEnd())
226584Sdim        return ReturnError(StrStart, "End of file in string literal");
226584Sdim      // FALL THROUGH
226584Sdim    default:
226584Sdim      return ReturnError(CurPtr, "invalid escape in string literal");
226584Sdim    }
226584Sdim  }
226584Sdim
226584Sdim  ++CurPtr;
226584Sdim  return tgtok::StrVal;
226584Sdim}
226584Sdim
226584Sdimtgtok::TokKind TGLexer::LexVarName() {
226584Sdim  if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
226584Sdim    return ReturnError(TokStart, "Invalid variable name");
226584Sdim
226584Sdim  // Otherwise, we're ok, consume the rest of the characters.
226584Sdim  const char *VarNameStart = CurPtr++;
226584Sdim
226584Sdim  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
226584Sdim    ++CurPtr;
226584Sdim
226584Sdim  CurStrVal.assign(VarNameStart, CurPtr);
226584Sdim  return tgtok::VarName;
226584Sdim}
226584Sdim
226584Sdim
226584Sdimtgtok::TokKind TGLexer::LexIdentifier() {
226584Sdim  // The first letter is [a-zA-Z_#].
226584Sdim  const char *IdentStart = TokStart;
226584Sdim
226584Sdim  // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
234353Sdim  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
226584Sdim    ++CurPtr;
226584Sdim
226584Sdim  // Check to see if this identifier is a keyword.
226584Sdim  StringRef Str(IdentStart, CurPtr-IdentStart);
226584Sdim
226584Sdim  if (Str == "include") {
226584Sdim    if (LexInclude()) return tgtok::Error;
226584Sdim    return Lex();
226584Sdim  }
226584Sdim
226584Sdim  tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
226584Sdim    .Case("int", tgtok::Int)
226584Sdim    .Case("bit", tgtok::Bit)
226584Sdim    .Case("bits", tgtok::Bits)
226584Sdim    .Case("string", tgtok::String)
226584Sdim    .Case("list", tgtok::List)
226584Sdim    .Case("code", tgtok::Code)
226584Sdim    .Case("dag", tgtok::Dag)
226584Sdim    .Case("class", tgtok::Class)
226584Sdim    .Case("def", tgtok::Def)
234353Sdim    .Case("foreach", tgtok::Foreach)
226584Sdim    .Case("defm", tgtok::Defm)
226584Sdim    .Case("multiclass", tgtok::MultiClass)
226584Sdim    .Case("field", tgtok::Field)
226584Sdim    .Case("let", tgtok::Let)
226584Sdim    .Case("in", tgtok::In)
226584Sdim    .Default(tgtok::Id);
226584Sdim
226584Sdim  if (Kind == tgtok::Id)
226584Sdim    CurStrVal.assign(Str.begin(), Str.end());
226584Sdim  return Kind;
226584Sdim}
226584Sdim
226584Sdim/// LexInclude - We just read the "include" token.  Get the string token that
226584Sdim/// comes next and enter the include.
226584Sdimbool TGLexer::LexInclude() {
226584Sdim  // The token after the include must be a string.
226584Sdim  tgtok::TokKind Tok = LexToken();
226584Sdim  if (Tok == tgtok::Error) return true;
226584Sdim  if (Tok != tgtok::StrVal) {
226584Sdim    PrintError(getLoc(), "Expected filename after include");
226584Sdim    return true;
226584Sdim  }
226584Sdim
226584Sdim  // Get the string.
226584Sdim  std::string Filename = CurStrVal;
226584Sdim  std::string IncludedFile;
226584Sdim
226584Sdim
226584Sdim  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
226584Sdim                                    IncludedFile);
226584Sdim  if (CurBuffer == -1) {
226584Sdim    PrintError(getLoc(), "Could not find include file '" + Filename + "'");
226584Sdim    return true;
226584Sdim  }
226584Sdim
249423Sdim  DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile);
249423Sdim  if (Found != Dependencies.end()) {
249423Sdim    PrintError(getLoc(),
249423Sdim               "File '" + IncludedFile + "' has already been included.");
249423Sdim    SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note,
249423Sdim                        "previously included here");
249423Sdim    return true;
249423Sdim  }
249423Sdim  Dependencies.insert(std::make_pair(IncludedFile, getLoc()));
226584Sdim  // Save the line number and lex buffer of the includer.
226584Sdim  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
226584Sdim  CurPtr = CurBuf->getBufferStart();
226584Sdim  return false;
226584Sdim}
226584Sdim
226584Sdimvoid TGLexer::SkipBCPLComment() {
226584Sdim  ++CurPtr;  // skip the second slash.
226584Sdim  while (1) {
226584Sdim    switch (*CurPtr) {
226584Sdim    case '\n':
226584Sdim    case '\r':
226584Sdim      return;  // Newline is end of comment.
226584Sdim    case 0:
226584Sdim      // If this is the end of the buffer, end the comment.
226584Sdim      if (CurPtr == CurBuf->getBufferEnd())
226584Sdim        return;
226584Sdim      break;
226584Sdim    }
226584Sdim    // Otherwise, skip the character.
226584Sdim    ++CurPtr;
226584Sdim  }
226584Sdim}
226584Sdim
226584Sdim/// SkipCComment - This skips C-style /**/ comments.  The only difference from C
226584Sdim/// is that we allow nesting.
226584Sdimbool TGLexer::SkipCComment() {
226584Sdim  ++CurPtr;  // skip the star.
226584Sdim  unsigned CommentDepth = 1;
226584Sdim
226584Sdim  while (1) {
226584Sdim    int CurChar = getNextChar();
226584Sdim    switch (CurChar) {
226584Sdim    case EOF:
226584Sdim      PrintError(TokStart, "Unterminated comment!");
226584Sdim      return true;
226584Sdim    case '*':
226584Sdim      // End of the comment?
226584Sdim      if (CurPtr[0] != '/') break;
226584Sdim
226584Sdim      ++CurPtr;   // End the */.
226584Sdim      if (--CommentDepth == 0)
226584Sdim        return false;
226584Sdim      break;
226584Sdim    case '/':
226584Sdim      // Start of a nested comment?
226584Sdim      if (CurPtr[0] != '*') break;
226584Sdim      ++CurPtr;
226584Sdim      ++CommentDepth;
226584Sdim      break;
226584Sdim    }
226584Sdim  }
226584Sdim}
226584Sdim
226584Sdim/// LexNumber - Lex:
226584Sdim///    [-+]?[0-9]+
226584Sdim///    0x[0-9a-fA-F]+
226584Sdim///    0b[01]+
226584Sdimtgtok::TokKind TGLexer::LexNumber() {
226584Sdim  if (CurPtr[-1] == '0') {
226584Sdim    if (CurPtr[0] == 'x') {
226584Sdim      ++CurPtr;
226584Sdim      const char *NumStart = CurPtr;
226584Sdim      while (isxdigit(CurPtr[0]))
226584Sdim        ++CurPtr;
226584Sdim
226584Sdim      // Requires at least one hex digit.
226584Sdim      if (CurPtr == NumStart)
226584Sdim        return ReturnError(TokStart, "Invalid hexadecimal number");
226584Sdim
226584Sdim      errno = 0;
226584Sdim      CurIntVal = strtoll(NumStart, 0, 16);
226584Sdim      if (errno == EINVAL)
226584Sdim        return ReturnError(TokStart, "Invalid hexadecimal number");
226584Sdim      if (errno == ERANGE) {
226584Sdim        errno = 0;
226584Sdim        CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
226584Sdim        if (errno == EINVAL)
226584Sdim          return ReturnError(TokStart, "Invalid hexadecimal number");
226584Sdim        if (errno == ERANGE)
226584Sdim          return ReturnError(TokStart, "Hexadecimal number out of range");
226584Sdim      }
226584Sdim      return tgtok::IntVal;
226584Sdim    } else if (CurPtr[0] == 'b') {
226584Sdim      ++CurPtr;
226584Sdim      const char *NumStart = CurPtr;
226584Sdim      while (CurPtr[0] == '0' || CurPtr[0] == '1')
226584Sdim        ++CurPtr;
226584Sdim
226584Sdim      // Requires at least one binary digit.
226584Sdim      if (CurPtr == NumStart)
226584Sdim        return ReturnError(CurPtr-2, "Invalid binary number");
226584Sdim      CurIntVal = strtoll(NumStart, 0, 2);
226584Sdim      return tgtok::IntVal;
226584Sdim    }
226584Sdim  }
226584Sdim
226584Sdim  // Check for a sign without a digit.
226584Sdim  if (!isdigit(CurPtr[0])) {
226584Sdim    if (CurPtr[-1] == '-')
226584Sdim      return tgtok::minus;
226584Sdim    else if (CurPtr[-1] == '+')
226584Sdim      return tgtok::plus;
226584Sdim  }
226584Sdim
226584Sdim  while (isdigit(CurPtr[0]))
226584Sdim    ++CurPtr;
226584Sdim  CurIntVal = strtoll(TokStart, 0, 10);
226584Sdim  return tgtok::IntVal;
226584Sdim}
226584Sdim
226584Sdim/// LexBracket - We just read '['.  If this is a code block, return it,
226584Sdim/// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
226584Sdimtgtok::TokKind TGLexer::LexBracket() {
226584Sdim  if (CurPtr[0] != '{')
226584Sdim    return tgtok::l_square;
226584Sdim  ++CurPtr;
226584Sdim  const char *CodeStart = CurPtr;
226584Sdim  while (1) {
226584Sdim    int Char = getNextChar();
226584Sdim    if (Char == EOF) break;
226584Sdim
226584Sdim    if (Char != '}') continue;
226584Sdim
226584Sdim    Char = getNextChar();
226584Sdim    if (Char == EOF) break;
226584Sdim    if (Char == ']') {
226584Sdim      CurStrVal.assign(CodeStart, CurPtr-2);
226584Sdim      return tgtok::CodeFragment;
226584Sdim    }
226584Sdim  }
226584Sdim
226584Sdim  return ReturnError(CodeStart-2, "Unterminated Code Block");
226584Sdim}
226584Sdim
226584Sdim/// LexExclaim - Lex '!' and '![a-zA-Z]+'.
226584Sdimtgtok::TokKind TGLexer::LexExclaim() {
226584Sdim  if (!isalpha(*CurPtr))
226584Sdim    return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
226584Sdim
226584Sdim  const char *Start = CurPtr++;
226584Sdim  while (isalpha(*CurPtr))
226584Sdim    ++CurPtr;
226584Sdim
226584Sdim  // Check to see which operator this is.
226584Sdim  tgtok::TokKind Kind =
226584Sdim    StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
226584Sdim    .Case("eq", tgtok::XEq)
226584Sdim    .Case("if", tgtok::XIf)
226584Sdim    .Case("head", tgtok::XHead)
226584Sdim    .Case("tail", tgtok::XTail)
226584Sdim    .Case("con", tgtok::XConcat)
249423Sdim    .Case("add", tgtok::XADD)
226584Sdim    .Case("shl", tgtok::XSHL)
226584Sdim    .Case("sra", tgtok::XSRA)
226584Sdim    .Case("srl", tgtok::XSRL)
226584Sdim    .Case("cast", tgtok::XCast)
226584Sdim    .Case("empty", tgtok::XEmpty)
226584Sdim    .Case("subst", tgtok::XSubst)
226584Sdim    .Case("foreach", tgtok::XForEach)
226584Sdim    .Case("strconcat", tgtok::XStrConcat)
226584Sdim    .Default(tgtok::Error);
226584Sdim
226584Sdim  return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
226584Sdim}
226584Sdim