1226584Sdim//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
2226584Sdim//
3226584Sdim//                     The LLVM Compiler Infrastructure
4226584Sdim//
5226584Sdim// This file is distributed under the University of Illinois Open Source
6226584Sdim// License. See LICENSE.TXT for details.
7226584Sdim//
8226584Sdim//===----------------------------------------------------------------------===//
9226584Sdim//
10226584Sdim// Implement the Lexer for TableGen.
11226584Sdim//
12226584Sdim//===----------------------------------------------------------------------===//
13226584Sdim
14226584Sdim#include "TGLexer.h"
15226584Sdim#include "llvm/ADT/StringSwitch.h"
16226584Sdim#include "llvm/ADT/Twine.h"
17249423Sdim#include "llvm/Config/config.h" // for strtoull()/strtoll() define
18249423Sdim#include "llvm/Support/MemoryBuffer.h"
19249423Sdim#include "llvm/Support/SourceMgr.h"
20249423Sdim#include "llvm/TableGen/Error.h"
21226584Sdim#include <cctype>
22249423Sdim#include <cerrno>
23226584Sdim#include <cstdio>
24226584Sdim#include <cstdlib>
25226584Sdim#include <cstring>
26234353Sdim
27226584Sdimusing namespace llvm;
28226584Sdim
29226584SdimTGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
30226584Sdim  CurBuffer = 0;
31226584Sdim  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
32226584Sdim  CurPtr = CurBuf->getBufferStart();
33226584Sdim  TokStart = 0;
34226584Sdim}
35226584Sdim
36226584SdimSMLoc TGLexer::getLoc() const {
37226584Sdim  return SMLoc::getFromPointer(TokStart);
38226584Sdim}
39226584Sdim
40226584Sdim/// ReturnError - Set the error to the specified string at the specified
41226584Sdim/// location.  This is defined to always return tgtok::Error.
42226584Sdimtgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
43226584Sdim  PrintError(Loc, Msg);
44226584Sdim  return tgtok::Error;
45226584Sdim}
46226584Sdim
47226584Sdimint TGLexer::getNextChar() {
48226584Sdim  char CurChar = *CurPtr++;
49226584Sdim  switch (CurChar) {
50226584Sdim  default:
51226584Sdim    return (unsigned char)CurChar;
52226584Sdim  case 0: {
53226584Sdim    // A nul character in the stream is either the end of the current buffer or
54226584Sdim    // a random nul in the file.  Disambiguate that here.
55226584Sdim    if (CurPtr-1 != CurBuf->getBufferEnd())
56226584Sdim      return 0;  // Just whitespace.
57226584Sdim
58226584Sdim    // If this is the end of an included file, pop the parent file off the
59226584Sdim    // include stack.
60226584Sdim    SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
61226584Sdim    if (ParentIncludeLoc != SMLoc()) {
62226584Sdim      CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
63226584Sdim      CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
64226584Sdim      CurPtr = ParentIncludeLoc.getPointer();
65226584Sdim      return getNextChar();
66226584Sdim    }
67226584Sdim
68226584Sdim    // Otherwise, return end of file.
69226584Sdim    --CurPtr;  // Another call to lex will return EOF again.
70226584Sdim    return EOF;
71226584Sdim  }
72226584Sdim  case '\n':
73226584Sdim  case '\r':
74226584Sdim    // Handle the newline character by ignoring it and incrementing the line
75226584Sdim    // count.  However, be careful about 'dos style' files with \n\r in them.
76226584Sdim    // Only treat a \n\r or \r\n as a single line.
77226584Sdim    if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
78226584Sdim        *CurPtr != CurChar)
79226584Sdim      ++CurPtr;  // Eat the two char newline sequence.
80226584Sdim    return '\n';
81226584Sdim  }
82226584Sdim}
83226584Sdim
84234353Sdimint TGLexer::peekNextChar(int Index) {
85234353Sdim  return *(CurPtr + Index);
86234353Sdim}
87234353Sdim
88226584Sdimtgtok::TokKind TGLexer::LexToken() {
89226584Sdim  TokStart = CurPtr;
90226584Sdim  // This always consumes at least one character.
91226584Sdim  int CurChar = getNextChar();
92226584Sdim
93226584Sdim  switch (CurChar) {
94226584Sdim  default:
95234353Sdim    // Handle letters: [a-zA-Z_]
96234353Sdim    if (isalpha(CurChar) || CurChar == '_')
97226584Sdim      return LexIdentifier();
98234353Sdim
99226584Sdim    // Unknown character, emit an error.
100226584Sdim    return ReturnError(TokStart, "Unexpected character");
101226584Sdim  case EOF: return tgtok::Eof;
102226584Sdim  case ':': return tgtok::colon;
103226584Sdim  case ';': return tgtok::semi;
104226584Sdim  case '.': return tgtok::period;
105226584Sdim  case ',': return tgtok::comma;
106226584Sdim  case '<': return tgtok::less;
107226584Sdim  case '>': return tgtok::greater;
108226584Sdim  case ']': return tgtok::r_square;
109226584Sdim  case '{': return tgtok::l_brace;
110226584Sdim  case '}': return tgtok::r_brace;
111226584Sdim  case '(': return tgtok::l_paren;
112226584Sdim  case ')': return tgtok::r_paren;
113226584Sdim  case '=': return tgtok::equal;
114226584Sdim  case '?': return tgtok::question;
115234353Sdim  case '#': return tgtok::paste;
116226584Sdim
117226584Sdim  case 0:
118226584Sdim  case ' ':
119226584Sdim  case '\t':
120226584Sdim  case '\n':
121226584Sdim  case '\r':
122226584Sdim    // Ignore whitespace.
123226584Sdim    return LexToken();
124226584Sdim  case '/':
125226584Sdim    // If this is the start of a // comment, skip until the end of the line or
126226584Sdim    // the end of the buffer.
127226584Sdim    if (*CurPtr == '/')
128226584Sdim      SkipBCPLComment();
129226584Sdim    else if (*CurPtr == '*') {
130226584Sdim      if (SkipCComment())
131226584Sdim        return tgtok::Error;
132226584Sdim    } else // Otherwise, this is an error.
133226584Sdim      return ReturnError(TokStart, "Unexpected character");
134226584Sdim    return LexToken();
135226584Sdim  case '-': case '+':
136226584Sdim  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
137234353Sdim  case '7': case '8': case '9': {
138234353Sdim    int NextChar = 0;
139234353Sdim    if (isdigit(CurChar)) {
140234353Sdim      // Allow identifiers to start with a number if it is followed by
141234353Sdim      // an identifier.  This can happen with paste operations like
142234353Sdim      // foo#8i.
143234353Sdim      int i = 0;
144234353Sdim      do {
145234353Sdim        NextChar = peekNextChar(i++);
146234353Sdim      } while (isdigit(NextChar));
147234353Sdim
148234353Sdim      if (NextChar == 'x' || NextChar == 'b') {
149234353Sdim        // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
150234353Sdim        // likely a number.
151234353Sdim        int NextNextChar = peekNextChar(i);
152234353Sdim        switch (NextNextChar) {
153234353Sdim        default:
154234353Sdim          break;
155234353Sdim        case '0': case '1':
156234353Sdim          if (NextChar == 'b')
157234353Sdim            return LexNumber();
158234353Sdim          // Fallthrough
159234353Sdim        case '2': case '3': case '4': case '5':
160234353Sdim        case '6': case '7': case '8': case '9':
161234353Sdim        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
162234353Sdim        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
163234353Sdim          if (NextChar == 'x')
164234353Sdim            return LexNumber();
165234353Sdim          break;
166234353Sdim        }
167234353Sdim      }
168234353Sdim    }
169234353Sdim
170234353Sdim    if (isalpha(NextChar) || NextChar == '_')
171234353Sdim      return LexIdentifier();
172234353Sdim
173226584Sdim    return LexNumber();
174234353Sdim  }
175226584Sdim  case '"': return LexString();
176226584Sdim  case '$': return LexVarName();
177226584Sdim  case '[': return LexBracket();
178226584Sdim  case '!': return LexExclaim();
179226584Sdim  }
180226584Sdim}
181226584Sdim
182226584Sdim/// LexString - Lex "[^"]*"
183226584Sdimtgtok::TokKind TGLexer::LexString() {
184226584Sdim  const char *StrStart = CurPtr;
185226584Sdim
186226584Sdim  CurStrVal = "";
187226584Sdim
188226584Sdim  while (*CurPtr != '"') {
189226584Sdim    // If we hit the end of the buffer, report an error.
190226584Sdim    if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
191226584Sdim      return ReturnError(StrStart, "End of file in string literal");
192226584Sdim
193226584Sdim    if (*CurPtr == '\n' || *CurPtr == '\r')
194226584Sdim      return ReturnError(StrStart, "End of line in string literal");
195226584Sdim
196226584Sdim    if (*CurPtr != '\\') {
197226584Sdim      CurStrVal += *CurPtr++;
198226584Sdim      continue;
199226584Sdim    }
200226584Sdim
201226584Sdim    ++CurPtr;
202226584Sdim
203226584Sdim    switch (*CurPtr) {
204226584Sdim    case '\\': case '\'': case '"':
205226584Sdim      // These turn into their literal character.
206226584Sdim      CurStrVal += *CurPtr++;
207226584Sdim      break;
208226584Sdim    case 't':
209226584Sdim      CurStrVal += '\t';
210226584Sdim      ++CurPtr;
211226584Sdim      break;
212226584Sdim    case 'n':
213226584Sdim      CurStrVal += '\n';
214226584Sdim      ++CurPtr;
215226584Sdim      break;
216226584Sdim
217226584Sdim    case '\n':
218226584Sdim    case '\r':
219226584Sdim      return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
220226584Sdim
221226584Sdim    // If we hit the end of the buffer, report an error.
222226584Sdim    case '\0':
223226584Sdim      if (CurPtr == CurBuf->getBufferEnd())
224226584Sdim        return ReturnError(StrStart, "End of file in string literal");
225226584Sdim      // FALL THROUGH
226226584Sdim    default:
227226584Sdim      return ReturnError(CurPtr, "invalid escape in string literal");
228226584Sdim    }
229226584Sdim  }
230226584Sdim
231226584Sdim  ++CurPtr;
232226584Sdim  return tgtok::StrVal;
233226584Sdim}
234226584Sdim
235226584Sdimtgtok::TokKind TGLexer::LexVarName() {
236226584Sdim  if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
237226584Sdim    return ReturnError(TokStart, "Invalid variable name");
238226584Sdim
239226584Sdim  // Otherwise, we're ok, consume the rest of the characters.
240226584Sdim  const char *VarNameStart = CurPtr++;
241226584Sdim
242226584Sdim  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
243226584Sdim    ++CurPtr;
244226584Sdim
245226584Sdim  CurStrVal.assign(VarNameStart, CurPtr);
246226584Sdim  return tgtok::VarName;
247226584Sdim}
248226584Sdim
249226584Sdim
250226584Sdimtgtok::TokKind TGLexer::LexIdentifier() {
251226584Sdim  // The first letter is [a-zA-Z_#].
252226584Sdim  const char *IdentStart = TokStart;
253226584Sdim
254226584Sdim  // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
255234353Sdim  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
256226584Sdim    ++CurPtr;
257226584Sdim
258226584Sdim  // Check to see if this identifier is a keyword.
259226584Sdim  StringRef Str(IdentStart, CurPtr-IdentStart);
260226584Sdim
261226584Sdim  if (Str == "include") {
262226584Sdim    if (LexInclude()) return tgtok::Error;
263226584Sdim    return Lex();
264226584Sdim  }
265226584Sdim
266226584Sdim  tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
267226584Sdim    .Case("int", tgtok::Int)
268226584Sdim    .Case("bit", tgtok::Bit)
269226584Sdim    .Case("bits", tgtok::Bits)
270226584Sdim    .Case("string", tgtok::String)
271226584Sdim    .Case("list", tgtok::List)
272226584Sdim    .Case("code", tgtok::Code)
273226584Sdim    .Case("dag", tgtok::Dag)
274226584Sdim    .Case("class", tgtok::Class)
275226584Sdim    .Case("def", tgtok::Def)
276234353Sdim    .Case("foreach", tgtok::Foreach)
277226584Sdim    .Case("defm", tgtok::Defm)
278226584Sdim    .Case("multiclass", tgtok::MultiClass)
279226584Sdim    .Case("field", tgtok::Field)
280226584Sdim    .Case("let", tgtok::Let)
281226584Sdim    .Case("in", tgtok::In)
282226584Sdim    .Default(tgtok::Id);
283226584Sdim
284226584Sdim  if (Kind == tgtok::Id)
285226584Sdim    CurStrVal.assign(Str.begin(), Str.end());
286226584Sdim  return Kind;
287226584Sdim}
288226584Sdim
289226584Sdim/// LexInclude - We just read the "include" token.  Get the string token that
290226584Sdim/// comes next and enter the include.
291226584Sdimbool TGLexer::LexInclude() {
292226584Sdim  // The token after the include must be a string.
293226584Sdim  tgtok::TokKind Tok = LexToken();
294226584Sdim  if (Tok == tgtok::Error) return true;
295226584Sdim  if (Tok != tgtok::StrVal) {
296226584Sdim    PrintError(getLoc(), "Expected filename after include");
297226584Sdim    return true;
298226584Sdim  }
299226584Sdim
300226584Sdim  // Get the string.
301226584Sdim  std::string Filename = CurStrVal;
302226584Sdim  std::string IncludedFile;
303226584Sdim
304226584Sdim
305226584Sdim  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
306226584Sdim                                    IncludedFile);
307226584Sdim  if (CurBuffer == -1) {
308226584Sdim    PrintError(getLoc(), "Could not find include file '" + Filename + "'");
309226584Sdim    return true;
310226584Sdim  }
311226584Sdim
312249423Sdim  DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile);
313249423Sdim  if (Found != Dependencies.end()) {
314249423Sdim    PrintError(getLoc(),
315249423Sdim               "File '" + IncludedFile + "' has already been included.");
316249423Sdim    SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note,
317249423Sdim                        "previously included here");
318249423Sdim    return true;
319249423Sdim  }
320249423Sdim  Dependencies.insert(std::make_pair(IncludedFile, getLoc()));
321226584Sdim  // Save the line number and lex buffer of the includer.
322226584Sdim  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
323226584Sdim  CurPtr = CurBuf->getBufferStart();
324226584Sdim  return false;
325226584Sdim}
326226584Sdim
327226584Sdimvoid TGLexer::SkipBCPLComment() {
328226584Sdim  ++CurPtr;  // skip the second slash.
329226584Sdim  while (1) {
330226584Sdim    switch (*CurPtr) {
331226584Sdim    case '\n':
332226584Sdim    case '\r':
333226584Sdim      return;  // Newline is end of comment.
334226584Sdim    case 0:
335226584Sdim      // If this is the end of the buffer, end the comment.
336226584Sdim      if (CurPtr == CurBuf->getBufferEnd())
337226584Sdim        return;
338226584Sdim      break;
339226584Sdim    }
340226584Sdim    // Otherwise, skip the character.
341226584Sdim    ++CurPtr;
342226584Sdim  }
343226584Sdim}
344226584Sdim
345226584Sdim/// SkipCComment - This skips C-style /**/ comments.  The only difference from C
346226584Sdim/// is that we allow nesting.
347226584Sdimbool TGLexer::SkipCComment() {
348226584Sdim  ++CurPtr;  // skip the star.
349226584Sdim  unsigned CommentDepth = 1;
350226584Sdim
351226584Sdim  while (1) {
352226584Sdim    int CurChar = getNextChar();
353226584Sdim    switch (CurChar) {
354226584Sdim    case EOF:
355226584Sdim      PrintError(TokStart, "Unterminated comment!");
356226584Sdim      return true;
357226584Sdim    case '*':
358226584Sdim      // End of the comment?
359226584Sdim      if (CurPtr[0] != '/') break;
360226584Sdim
361226584Sdim      ++CurPtr;   // End the */.
362226584Sdim      if (--CommentDepth == 0)
363226584Sdim        return false;
364226584Sdim      break;
365226584Sdim    case '/':
366226584Sdim      // Start of a nested comment?
367226584Sdim      if (CurPtr[0] != '*') break;
368226584Sdim      ++CurPtr;
369226584Sdim      ++CommentDepth;
370226584Sdim      break;
371226584Sdim    }
372226584Sdim  }
373226584Sdim}
374226584Sdim
375226584Sdim/// LexNumber - Lex:
376226584Sdim///    [-+]?[0-9]+
377226584Sdim///    0x[0-9a-fA-F]+
378226584Sdim///    0b[01]+
379226584Sdimtgtok::TokKind TGLexer::LexNumber() {
380226584Sdim  if (CurPtr[-1] == '0') {
381226584Sdim    if (CurPtr[0] == 'x') {
382226584Sdim      ++CurPtr;
383226584Sdim      const char *NumStart = CurPtr;
384226584Sdim      while (isxdigit(CurPtr[0]))
385226584Sdim        ++CurPtr;
386226584Sdim
387226584Sdim      // Requires at least one hex digit.
388226584Sdim      if (CurPtr == NumStart)
389226584Sdim        return ReturnError(TokStart, "Invalid hexadecimal number");
390226584Sdim
391226584Sdim      errno = 0;
392226584Sdim      CurIntVal = strtoll(NumStart, 0, 16);
393226584Sdim      if (errno == EINVAL)
394226584Sdim        return ReturnError(TokStart, "Invalid hexadecimal number");
395226584Sdim      if (errno == ERANGE) {
396226584Sdim        errno = 0;
397226584Sdim        CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
398226584Sdim        if (errno == EINVAL)
399226584Sdim          return ReturnError(TokStart, "Invalid hexadecimal number");
400226584Sdim        if (errno == ERANGE)
401226584Sdim          return ReturnError(TokStart, "Hexadecimal number out of range");
402226584Sdim      }
403226584Sdim      return tgtok::IntVal;
404226584Sdim    } else if (CurPtr[0] == 'b') {
405226584Sdim      ++CurPtr;
406226584Sdim      const char *NumStart = CurPtr;
407226584Sdim      while (CurPtr[0] == '0' || CurPtr[0] == '1')
408226584Sdim        ++CurPtr;
409226584Sdim
410226584Sdim      // Requires at least one binary digit.
411226584Sdim      if (CurPtr == NumStart)
412226584Sdim        return ReturnError(CurPtr-2, "Invalid binary number");
413226584Sdim      CurIntVal = strtoll(NumStart, 0, 2);
414226584Sdim      return tgtok::IntVal;
415226584Sdim    }
416226584Sdim  }
417226584Sdim
418226584Sdim  // Check for a sign without a digit.
419226584Sdim  if (!isdigit(CurPtr[0])) {
420226584Sdim    if (CurPtr[-1] == '-')
421226584Sdim      return tgtok::minus;
422226584Sdim    else if (CurPtr[-1] == '+')
423226584Sdim      return tgtok::plus;
424226584Sdim  }
425226584Sdim
426226584Sdim  while (isdigit(CurPtr[0]))
427226584Sdim    ++CurPtr;
428226584Sdim  CurIntVal = strtoll(TokStart, 0, 10);
429226584Sdim  return tgtok::IntVal;
430226584Sdim}
431226584Sdim
432226584Sdim/// LexBracket - We just read '['.  If this is a code block, return it,
433226584Sdim/// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
434226584Sdimtgtok::TokKind TGLexer::LexBracket() {
435226584Sdim  if (CurPtr[0] != '{')
436226584Sdim    return tgtok::l_square;
437226584Sdim  ++CurPtr;
438226584Sdim  const char *CodeStart = CurPtr;
439226584Sdim  while (1) {
440226584Sdim    int Char = getNextChar();
441226584Sdim    if (Char == EOF) break;
442226584Sdim
443226584Sdim    if (Char != '}') continue;
444226584Sdim
445226584Sdim    Char = getNextChar();
446226584Sdim    if (Char == EOF) break;
447226584Sdim    if (Char == ']') {
448226584Sdim      CurStrVal.assign(CodeStart, CurPtr-2);
449226584Sdim      return tgtok::CodeFragment;
450226584Sdim    }
451226584Sdim  }
452226584Sdim
453226584Sdim  return ReturnError(CodeStart-2, "Unterminated Code Block");
454226584Sdim}
455226584Sdim
456226584Sdim/// LexExclaim - Lex '!' and '![a-zA-Z]+'.
457226584Sdimtgtok::TokKind TGLexer::LexExclaim() {
458226584Sdim  if (!isalpha(*CurPtr))
459226584Sdim    return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
460226584Sdim
461226584Sdim  const char *Start = CurPtr++;
462226584Sdim  while (isalpha(*CurPtr))
463226584Sdim    ++CurPtr;
464226584Sdim
465226584Sdim  // Check to see which operator this is.
466226584Sdim  tgtok::TokKind Kind =
467226584Sdim    StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
468226584Sdim    .Case("eq", tgtok::XEq)
469226584Sdim    .Case("if", tgtok::XIf)
470226584Sdim    .Case("head", tgtok::XHead)
471226584Sdim    .Case("tail", tgtok::XTail)
472226584Sdim    .Case("con", tgtok::XConcat)
473249423Sdim    .Case("add", tgtok::XADD)
474226584Sdim    .Case("shl", tgtok::XSHL)
475226584Sdim    .Case("sra", tgtok::XSRA)
476226584Sdim    .Case("srl", tgtok::XSRL)
477226584Sdim    .Case("cast", tgtok::XCast)
478226584Sdim    .Case("empty", tgtok::XEmpty)
479226584Sdim    .Case("subst", tgtok::XSubst)
480226584Sdim    .Case("foreach", tgtok::XForEach)
481226584Sdim    .Case("strconcat", tgtok::XStrConcat)
482226584Sdim    .Default(tgtok::Error);
483226584Sdim
484226584Sdim  return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
485226584Sdim}
486226584Sdim
487