1226584Sdim//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// 2226584Sdim// 3226584Sdim// The LLVM Compiler Infrastructure 4226584Sdim// 5226584Sdim// This file is distributed under the University of Illinois Open Source 6226584Sdim// License. See LICENSE.TXT for details. 7226584Sdim// 8226584Sdim//===----------------------------------------------------------------------===// 9226584Sdim// 10226584Sdim// Implement the Lexer for TableGen. 11226584Sdim// 12226584Sdim//===----------------------------------------------------------------------===// 13226584Sdim 14226584Sdim#include "TGLexer.h" 15226584Sdim#include "llvm/ADT/StringSwitch.h" 16226584Sdim#include "llvm/ADT/Twine.h" 17249423Sdim#include "llvm/Config/config.h" // for strtoull()/strtoll() define 18249423Sdim#include "llvm/Support/MemoryBuffer.h" 19249423Sdim#include "llvm/Support/SourceMgr.h" 20249423Sdim#include "llvm/TableGen/Error.h" 21226584Sdim#include <cctype> 22249423Sdim#include <cerrno> 23226584Sdim#include <cstdio> 24226584Sdim#include <cstdlib> 25226584Sdim#include <cstring> 26234353Sdim 27226584Sdimusing namespace llvm; 28226584Sdim 29226584SdimTGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { 30226584Sdim CurBuffer = 0; 31226584Sdim CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); 32226584Sdim CurPtr = CurBuf->getBufferStart(); 33226584Sdim TokStart = 0; 34226584Sdim} 35226584Sdim 36226584SdimSMLoc TGLexer::getLoc() const { 37226584Sdim return SMLoc::getFromPointer(TokStart); 38226584Sdim} 39226584Sdim 40226584Sdim/// ReturnError - Set the error to the specified string at the specified 41226584Sdim/// location. This is defined to always return tgtok::Error. 42226584Sdimtgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { 43226584Sdim PrintError(Loc, Msg); 44226584Sdim return tgtok::Error; 45226584Sdim} 46226584Sdim 47226584Sdimint TGLexer::getNextChar() { 48226584Sdim char CurChar = *CurPtr++; 49226584Sdim switch (CurChar) { 50226584Sdim default: 51226584Sdim return (unsigned char)CurChar; 52226584Sdim case 0: { 53226584Sdim // A nul character in the stream is either the end of the current buffer or 54226584Sdim // a random nul in the file. Disambiguate that here. 55226584Sdim if (CurPtr-1 != CurBuf->getBufferEnd()) 56226584Sdim return 0; // Just whitespace. 57226584Sdim 58226584Sdim // If this is the end of an included file, pop the parent file off the 59226584Sdim // include stack. 60226584Sdim SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); 61226584Sdim if (ParentIncludeLoc != SMLoc()) { 62226584Sdim CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); 63226584Sdim CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); 64226584Sdim CurPtr = ParentIncludeLoc.getPointer(); 65226584Sdim return getNextChar(); 66226584Sdim } 67226584Sdim 68226584Sdim // Otherwise, return end of file. 69226584Sdim --CurPtr; // Another call to lex will return EOF again. 70226584Sdim return EOF; 71226584Sdim } 72226584Sdim case '\n': 73226584Sdim case '\r': 74226584Sdim // Handle the newline character by ignoring it and incrementing the line 75226584Sdim // count. However, be careful about 'dos style' files with \n\r in them. 76226584Sdim // Only treat a \n\r or \r\n as a single line. 77226584Sdim if ((*CurPtr == '\n' || (*CurPtr == '\r')) && 78226584Sdim *CurPtr != CurChar) 79226584Sdim ++CurPtr; // Eat the two char newline sequence. 80226584Sdim return '\n'; 81226584Sdim } 82226584Sdim} 83226584Sdim 84234353Sdimint TGLexer::peekNextChar(int Index) { 85234353Sdim return *(CurPtr + Index); 86234353Sdim} 87234353Sdim 88226584Sdimtgtok::TokKind TGLexer::LexToken() { 89226584Sdim TokStart = CurPtr; 90226584Sdim // This always consumes at least one character. 91226584Sdim int CurChar = getNextChar(); 92226584Sdim 93226584Sdim switch (CurChar) { 94226584Sdim default: 95234353Sdim // Handle letters: [a-zA-Z_] 96234353Sdim if (isalpha(CurChar) || CurChar == '_') 97226584Sdim return LexIdentifier(); 98234353Sdim 99226584Sdim // Unknown character, emit an error. 100226584Sdim return ReturnError(TokStart, "Unexpected character"); 101226584Sdim case EOF: return tgtok::Eof; 102226584Sdim case ':': return tgtok::colon; 103226584Sdim case ';': return tgtok::semi; 104226584Sdim case '.': return tgtok::period; 105226584Sdim case ',': return tgtok::comma; 106226584Sdim case '<': return tgtok::less; 107226584Sdim case '>': return tgtok::greater; 108226584Sdim case ']': return tgtok::r_square; 109226584Sdim case '{': return tgtok::l_brace; 110226584Sdim case '}': return tgtok::r_brace; 111226584Sdim case '(': return tgtok::l_paren; 112226584Sdim case ')': return tgtok::r_paren; 113226584Sdim case '=': return tgtok::equal; 114226584Sdim case '?': return tgtok::question; 115234353Sdim case '#': return tgtok::paste; 116226584Sdim 117226584Sdim case 0: 118226584Sdim case ' ': 119226584Sdim case '\t': 120226584Sdim case '\n': 121226584Sdim case '\r': 122226584Sdim // Ignore whitespace. 123226584Sdim return LexToken(); 124226584Sdim case '/': 125226584Sdim // If this is the start of a // comment, skip until the end of the line or 126226584Sdim // the end of the buffer. 127226584Sdim if (*CurPtr == '/') 128226584Sdim SkipBCPLComment(); 129226584Sdim else if (*CurPtr == '*') { 130226584Sdim if (SkipCComment()) 131226584Sdim return tgtok::Error; 132226584Sdim } else // Otherwise, this is an error. 133226584Sdim return ReturnError(TokStart, "Unexpected character"); 134226584Sdim return LexToken(); 135226584Sdim case '-': case '+': 136226584Sdim case '0': case '1': case '2': case '3': case '4': case '5': case '6': 137234353Sdim case '7': case '8': case '9': { 138234353Sdim int NextChar = 0; 139234353Sdim if (isdigit(CurChar)) { 140234353Sdim // Allow identifiers to start with a number if it is followed by 141234353Sdim // an identifier. This can happen with paste operations like 142234353Sdim // foo#8i. 143234353Sdim int i = 0; 144234353Sdim do { 145234353Sdim NextChar = peekNextChar(i++); 146234353Sdim } while (isdigit(NextChar)); 147234353Sdim 148234353Sdim if (NextChar == 'x' || NextChar == 'b') { 149234353Sdim // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most 150234353Sdim // likely a number. 151234353Sdim int NextNextChar = peekNextChar(i); 152234353Sdim switch (NextNextChar) { 153234353Sdim default: 154234353Sdim break; 155234353Sdim case '0': case '1': 156234353Sdim if (NextChar == 'b') 157234353Sdim return LexNumber(); 158234353Sdim // Fallthrough 159234353Sdim case '2': case '3': case '4': case '5': 160234353Sdim case '6': case '7': case '8': case '9': 161234353Sdim case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 162234353Sdim case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 163234353Sdim if (NextChar == 'x') 164234353Sdim return LexNumber(); 165234353Sdim break; 166234353Sdim } 167234353Sdim } 168234353Sdim } 169234353Sdim 170234353Sdim if (isalpha(NextChar) || NextChar == '_') 171234353Sdim return LexIdentifier(); 172234353Sdim 173226584Sdim return LexNumber(); 174234353Sdim } 175226584Sdim case '"': return LexString(); 176226584Sdim case '$': return LexVarName(); 177226584Sdim case '[': return LexBracket(); 178226584Sdim case '!': return LexExclaim(); 179226584Sdim } 180226584Sdim} 181226584Sdim 182226584Sdim/// LexString - Lex "[^"]*" 183226584Sdimtgtok::TokKind TGLexer::LexString() { 184226584Sdim const char *StrStart = CurPtr; 185226584Sdim 186226584Sdim CurStrVal = ""; 187226584Sdim 188226584Sdim while (*CurPtr != '"') { 189226584Sdim // If we hit the end of the buffer, report an error. 190226584Sdim if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd()) 191226584Sdim return ReturnError(StrStart, "End of file in string literal"); 192226584Sdim 193226584Sdim if (*CurPtr == '\n' || *CurPtr == '\r') 194226584Sdim return ReturnError(StrStart, "End of line in string literal"); 195226584Sdim 196226584Sdim if (*CurPtr != '\\') { 197226584Sdim CurStrVal += *CurPtr++; 198226584Sdim continue; 199226584Sdim } 200226584Sdim 201226584Sdim ++CurPtr; 202226584Sdim 203226584Sdim switch (*CurPtr) { 204226584Sdim case '\\': case '\'': case '"': 205226584Sdim // These turn into their literal character. 206226584Sdim CurStrVal += *CurPtr++; 207226584Sdim break; 208226584Sdim case 't': 209226584Sdim CurStrVal += '\t'; 210226584Sdim ++CurPtr; 211226584Sdim break; 212226584Sdim case 'n': 213226584Sdim CurStrVal += '\n'; 214226584Sdim ++CurPtr; 215226584Sdim break; 216226584Sdim 217226584Sdim case '\n': 218226584Sdim case '\r': 219226584Sdim return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); 220226584Sdim 221226584Sdim // If we hit the end of the buffer, report an error. 222226584Sdim case '\0': 223226584Sdim if (CurPtr == CurBuf->getBufferEnd()) 224226584Sdim return ReturnError(StrStart, "End of file in string literal"); 225226584Sdim // FALL THROUGH 226226584Sdim default: 227226584Sdim return ReturnError(CurPtr, "invalid escape in string literal"); 228226584Sdim } 229226584Sdim } 230226584Sdim 231226584Sdim ++CurPtr; 232226584Sdim return tgtok::StrVal; 233226584Sdim} 234226584Sdim 235226584Sdimtgtok::TokKind TGLexer::LexVarName() { 236226584Sdim if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') 237226584Sdim return ReturnError(TokStart, "Invalid variable name"); 238226584Sdim 239226584Sdim // Otherwise, we're ok, consume the rest of the characters. 240226584Sdim const char *VarNameStart = CurPtr++; 241226584Sdim 242226584Sdim while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 243226584Sdim ++CurPtr; 244226584Sdim 245226584Sdim CurStrVal.assign(VarNameStart, CurPtr); 246226584Sdim return tgtok::VarName; 247226584Sdim} 248226584Sdim 249226584Sdim 250226584Sdimtgtok::TokKind TGLexer::LexIdentifier() { 251226584Sdim // The first letter is [a-zA-Z_#]. 252226584Sdim const char *IdentStart = TokStart; 253226584Sdim 254226584Sdim // Match the rest of the identifier regex: [0-9a-zA-Z_#]* 255234353Sdim while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 256226584Sdim ++CurPtr; 257226584Sdim 258226584Sdim // Check to see if this identifier is a keyword. 259226584Sdim StringRef Str(IdentStart, CurPtr-IdentStart); 260226584Sdim 261226584Sdim if (Str == "include") { 262226584Sdim if (LexInclude()) return tgtok::Error; 263226584Sdim return Lex(); 264226584Sdim } 265226584Sdim 266226584Sdim tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str) 267226584Sdim .Case("int", tgtok::Int) 268226584Sdim .Case("bit", tgtok::Bit) 269226584Sdim .Case("bits", tgtok::Bits) 270226584Sdim .Case("string", tgtok::String) 271226584Sdim .Case("list", tgtok::List) 272226584Sdim .Case("code", tgtok::Code) 273226584Sdim .Case("dag", tgtok::Dag) 274226584Sdim .Case("class", tgtok::Class) 275226584Sdim .Case("def", tgtok::Def) 276234353Sdim .Case("foreach", tgtok::Foreach) 277226584Sdim .Case("defm", tgtok::Defm) 278226584Sdim .Case("multiclass", tgtok::MultiClass) 279226584Sdim .Case("field", tgtok::Field) 280226584Sdim .Case("let", tgtok::Let) 281226584Sdim .Case("in", tgtok::In) 282226584Sdim .Default(tgtok::Id); 283226584Sdim 284226584Sdim if (Kind == tgtok::Id) 285226584Sdim CurStrVal.assign(Str.begin(), Str.end()); 286226584Sdim return Kind; 287226584Sdim} 288226584Sdim 289226584Sdim/// LexInclude - We just read the "include" token. Get the string token that 290226584Sdim/// comes next and enter the include. 291226584Sdimbool TGLexer::LexInclude() { 292226584Sdim // The token after the include must be a string. 293226584Sdim tgtok::TokKind Tok = LexToken(); 294226584Sdim if (Tok == tgtok::Error) return true; 295226584Sdim if (Tok != tgtok::StrVal) { 296226584Sdim PrintError(getLoc(), "Expected filename after include"); 297226584Sdim return true; 298226584Sdim } 299226584Sdim 300226584Sdim // Get the string. 301226584Sdim std::string Filename = CurStrVal; 302226584Sdim std::string IncludedFile; 303226584Sdim 304226584Sdim 305226584Sdim CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr), 306226584Sdim IncludedFile); 307226584Sdim if (CurBuffer == -1) { 308226584Sdim PrintError(getLoc(), "Could not find include file '" + Filename + "'"); 309226584Sdim return true; 310226584Sdim } 311226584Sdim 312249423Sdim DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile); 313249423Sdim if (Found != Dependencies.end()) { 314249423Sdim PrintError(getLoc(), 315249423Sdim "File '" + IncludedFile + "' has already been included."); 316249423Sdim SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note, 317249423Sdim "previously included here"); 318249423Sdim return true; 319249423Sdim } 320249423Sdim Dependencies.insert(std::make_pair(IncludedFile, getLoc())); 321226584Sdim // Save the line number and lex buffer of the includer. 322226584Sdim CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); 323226584Sdim CurPtr = CurBuf->getBufferStart(); 324226584Sdim return false; 325226584Sdim} 326226584Sdim 327226584Sdimvoid TGLexer::SkipBCPLComment() { 328226584Sdim ++CurPtr; // skip the second slash. 329226584Sdim while (1) { 330226584Sdim switch (*CurPtr) { 331226584Sdim case '\n': 332226584Sdim case '\r': 333226584Sdim return; // Newline is end of comment. 334226584Sdim case 0: 335226584Sdim // If this is the end of the buffer, end the comment. 336226584Sdim if (CurPtr == CurBuf->getBufferEnd()) 337226584Sdim return; 338226584Sdim break; 339226584Sdim } 340226584Sdim // Otherwise, skip the character. 341226584Sdim ++CurPtr; 342226584Sdim } 343226584Sdim} 344226584Sdim 345226584Sdim/// SkipCComment - This skips C-style /**/ comments. The only difference from C 346226584Sdim/// is that we allow nesting. 347226584Sdimbool TGLexer::SkipCComment() { 348226584Sdim ++CurPtr; // skip the star. 349226584Sdim unsigned CommentDepth = 1; 350226584Sdim 351226584Sdim while (1) { 352226584Sdim int CurChar = getNextChar(); 353226584Sdim switch (CurChar) { 354226584Sdim case EOF: 355226584Sdim PrintError(TokStart, "Unterminated comment!"); 356226584Sdim return true; 357226584Sdim case '*': 358226584Sdim // End of the comment? 359226584Sdim if (CurPtr[0] != '/') break; 360226584Sdim 361226584Sdim ++CurPtr; // End the */. 362226584Sdim if (--CommentDepth == 0) 363226584Sdim return false; 364226584Sdim break; 365226584Sdim case '/': 366226584Sdim // Start of a nested comment? 367226584Sdim if (CurPtr[0] != '*') break; 368226584Sdim ++CurPtr; 369226584Sdim ++CommentDepth; 370226584Sdim break; 371226584Sdim } 372226584Sdim } 373226584Sdim} 374226584Sdim 375226584Sdim/// LexNumber - Lex: 376226584Sdim/// [-+]?[0-9]+ 377226584Sdim/// 0x[0-9a-fA-F]+ 378226584Sdim/// 0b[01]+ 379226584Sdimtgtok::TokKind TGLexer::LexNumber() { 380226584Sdim if (CurPtr[-1] == '0') { 381226584Sdim if (CurPtr[0] == 'x') { 382226584Sdim ++CurPtr; 383226584Sdim const char *NumStart = CurPtr; 384226584Sdim while (isxdigit(CurPtr[0])) 385226584Sdim ++CurPtr; 386226584Sdim 387226584Sdim // Requires at least one hex digit. 388226584Sdim if (CurPtr == NumStart) 389226584Sdim return ReturnError(TokStart, "Invalid hexadecimal number"); 390226584Sdim 391226584Sdim errno = 0; 392226584Sdim CurIntVal = strtoll(NumStart, 0, 16); 393226584Sdim if (errno == EINVAL) 394226584Sdim return ReturnError(TokStart, "Invalid hexadecimal number"); 395226584Sdim if (errno == ERANGE) { 396226584Sdim errno = 0; 397226584Sdim CurIntVal = (int64_t)strtoull(NumStart, 0, 16); 398226584Sdim if (errno == EINVAL) 399226584Sdim return ReturnError(TokStart, "Invalid hexadecimal number"); 400226584Sdim if (errno == ERANGE) 401226584Sdim return ReturnError(TokStart, "Hexadecimal number out of range"); 402226584Sdim } 403226584Sdim return tgtok::IntVal; 404226584Sdim } else if (CurPtr[0] == 'b') { 405226584Sdim ++CurPtr; 406226584Sdim const char *NumStart = CurPtr; 407226584Sdim while (CurPtr[0] == '0' || CurPtr[0] == '1') 408226584Sdim ++CurPtr; 409226584Sdim 410226584Sdim // Requires at least one binary digit. 411226584Sdim if (CurPtr == NumStart) 412226584Sdim return ReturnError(CurPtr-2, "Invalid binary number"); 413226584Sdim CurIntVal = strtoll(NumStart, 0, 2); 414226584Sdim return tgtok::IntVal; 415226584Sdim } 416226584Sdim } 417226584Sdim 418226584Sdim // Check for a sign without a digit. 419226584Sdim if (!isdigit(CurPtr[0])) { 420226584Sdim if (CurPtr[-1] == '-') 421226584Sdim return tgtok::minus; 422226584Sdim else if (CurPtr[-1] == '+') 423226584Sdim return tgtok::plus; 424226584Sdim } 425226584Sdim 426226584Sdim while (isdigit(CurPtr[0])) 427226584Sdim ++CurPtr; 428226584Sdim CurIntVal = strtoll(TokStart, 0, 10); 429226584Sdim return tgtok::IntVal; 430226584Sdim} 431226584Sdim 432226584Sdim/// LexBracket - We just read '['. If this is a code block, return it, 433226584Sdim/// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' 434226584Sdimtgtok::TokKind TGLexer::LexBracket() { 435226584Sdim if (CurPtr[0] != '{') 436226584Sdim return tgtok::l_square; 437226584Sdim ++CurPtr; 438226584Sdim const char *CodeStart = CurPtr; 439226584Sdim while (1) { 440226584Sdim int Char = getNextChar(); 441226584Sdim if (Char == EOF) break; 442226584Sdim 443226584Sdim if (Char != '}') continue; 444226584Sdim 445226584Sdim Char = getNextChar(); 446226584Sdim if (Char == EOF) break; 447226584Sdim if (Char == ']') { 448226584Sdim CurStrVal.assign(CodeStart, CurPtr-2); 449226584Sdim return tgtok::CodeFragment; 450226584Sdim } 451226584Sdim } 452226584Sdim 453226584Sdim return ReturnError(CodeStart-2, "Unterminated Code Block"); 454226584Sdim} 455226584Sdim 456226584Sdim/// LexExclaim - Lex '!' and '![a-zA-Z]+'. 457226584Sdimtgtok::TokKind TGLexer::LexExclaim() { 458226584Sdim if (!isalpha(*CurPtr)) 459226584Sdim return ReturnError(CurPtr - 1, "Invalid \"!operator\""); 460226584Sdim 461226584Sdim const char *Start = CurPtr++; 462226584Sdim while (isalpha(*CurPtr)) 463226584Sdim ++CurPtr; 464226584Sdim 465226584Sdim // Check to see which operator this is. 466226584Sdim tgtok::TokKind Kind = 467226584Sdim StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start)) 468226584Sdim .Case("eq", tgtok::XEq) 469226584Sdim .Case("if", tgtok::XIf) 470226584Sdim .Case("head", tgtok::XHead) 471226584Sdim .Case("tail", tgtok::XTail) 472226584Sdim .Case("con", tgtok::XConcat) 473249423Sdim .Case("add", tgtok::XADD) 474226584Sdim .Case("shl", tgtok::XSHL) 475226584Sdim .Case("sra", tgtok::XSRA) 476226584Sdim .Case("srl", tgtok::XSRL) 477226584Sdim .Case("cast", tgtok::XCast) 478226584Sdim .Case("empty", tgtok::XEmpty) 479226584Sdim .Case("subst", tgtok::XSubst) 480226584Sdim .Case("foreach", tgtok::XForEach) 481226584Sdim .Case("strconcat", tgtok::XStrConcat) 482226584Sdim .Default(tgtok::Error); 483226584Sdim 484226584Sdim return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); 485226584Sdim} 486226584Sdim 487