TGLexer.cpp revision 226584
1//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// Implement the Lexer for TableGen.
11//
12//===----------------------------------------------------------------------===//
13
14#include "TGLexer.h"
15#include "llvm/TableGen/Error.h"
16#include "llvm/Support/SourceMgr.h"
17#include "llvm/Support/MemoryBuffer.h"
18#include "llvm/Config/config.h"
19#include "llvm/ADT/StringSwitch.h"
20#include "llvm/ADT/Twine.h"
21#include <cctype>
22#include <cstdio>
23#include <cstdlib>
24#include <cstring>
25#include <cerrno>
26using namespace llvm;
27
28TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
29  CurBuffer = 0;
30  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
31  CurPtr = CurBuf->getBufferStart();
32  TokStart = 0;
33}
34
35SMLoc TGLexer::getLoc() const {
36  return SMLoc::getFromPointer(TokStart);
37}
38
39/// ReturnError - Set the error to the specified string at the specified
40/// location.  This is defined to always return tgtok::Error.
41tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
42  PrintError(Loc, Msg);
43  return tgtok::Error;
44}
45
46int TGLexer::getNextChar() {
47  char CurChar = *CurPtr++;
48  switch (CurChar) {
49  default:
50    return (unsigned char)CurChar;
51  case 0: {
52    // A nul character in the stream is either the end of the current buffer or
53    // a random nul in the file.  Disambiguate that here.
54    if (CurPtr-1 != CurBuf->getBufferEnd())
55      return 0;  // Just whitespace.
56
57    // If this is the end of an included file, pop the parent file off the
58    // include stack.
59    SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
60    if (ParentIncludeLoc != SMLoc()) {
61      CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
62      CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
63      CurPtr = ParentIncludeLoc.getPointer();
64      return getNextChar();
65    }
66
67    // Otherwise, return end of file.
68    --CurPtr;  // Another call to lex will return EOF again.
69    return EOF;
70  }
71  case '\n':
72  case '\r':
73    // Handle the newline character by ignoring it and incrementing the line
74    // count.  However, be careful about 'dos style' files with \n\r in them.
75    // Only treat a \n\r or \r\n as a single line.
76    if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
77        *CurPtr != CurChar)
78      ++CurPtr;  // Eat the two char newline sequence.
79    return '\n';
80  }
81}
82
83tgtok::TokKind TGLexer::LexToken() {
84  TokStart = CurPtr;
85  // This always consumes at least one character.
86  int CurChar = getNextChar();
87
88  switch (CurChar) {
89  default:
90    // Handle letters: [a-zA-Z_#]
91    if (isalpha(CurChar) || CurChar == '_' || CurChar == '#')
92      return LexIdentifier();
93
94    // Unknown character, emit an error.
95    return ReturnError(TokStart, "Unexpected character");
96  case EOF: return tgtok::Eof;
97  case ':': return tgtok::colon;
98  case ';': return tgtok::semi;
99  case '.': return tgtok::period;
100  case ',': return tgtok::comma;
101  case '<': return tgtok::less;
102  case '>': return tgtok::greater;
103  case ']': return tgtok::r_square;
104  case '{': return tgtok::l_brace;
105  case '}': return tgtok::r_brace;
106  case '(': return tgtok::l_paren;
107  case ')': return tgtok::r_paren;
108  case '=': return tgtok::equal;
109  case '?': return tgtok::question;
110
111  case 0:
112  case ' ':
113  case '\t':
114  case '\n':
115  case '\r':
116    // Ignore whitespace.
117    return LexToken();
118  case '/':
119    // If this is the start of a // comment, skip until the end of the line or
120    // the end of the buffer.
121    if (*CurPtr == '/')
122      SkipBCPLComment();
123    else if (*CurPtr == '*') {
124      if (SkipCComment())
125        return tgtok::Error;
126    } else // Otherwise, this is an error.
127      return ReturnError(TokStart, "Unexpected character");
128    return LexToken();
129  case '-': case '+':
130  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
131  case '7': case '8': case '9':
132    return LexNumber();
133  case '"': return LexString();
134  case '$': return LexVarName();
135  case '[': return LexBracket();
136  case '!': return LexExclaim();
137  }
138}
139
140/// LexString - Lex "[^"]*"
141tgtok::TokKind TGLexer::LexString() {
142  const char *StrStart = CurPtr;
143
144  CurStrVal = "";
145
146  while (*CurPtr != '"') {
147    // If we hit the end of the buffer, report an error.
148    if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
149      return ReturnError(StrStart, "End of file in string literal");
150
151    if (*CurPtr == '\n' || *CurPtr == '\r')
152      return ReturnError(StrStart, "End of line in string literal");
153
154    if (*CurPtr != '\\') {
155      CurStrVal += *CurPtr++;
156      continue;
157    }
158
159    ++CurPtr;
160
161    switch (*CurPtr) {
162    case '\\': case '\'': case '"':
163      // These turn into their literal character.
164      CurStrVal += *CurPtr++;
165      break;
166    case 't':
167      CurStrVal += '\t';
168      ++CurPtr;
169      break;
170    case 'n':
171      CurStrVal += '\n';
172      ++CurPtr;
173      break;
174
175    case '\n':
176    case '\r':
177      return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
178
179    // If we hit the end of the buffer, report an error.
180    case '\0':
181      if (CurPtr == CurBuf->getBufferEnd())
182        return ReturnError(StrStart, "End of file in string literal");
183      // FALL THROUGH
184    default:
185      return ReturnError(CurPtr, "invalid escape in string literal");
186    }
187  }
188
189  ++CurPtr;
190  return tgtok::StrVal;
191}
192
193tgtok::TokKind TGLexer::LexVarName() {
194  if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
195    return ReturnError(TokStart, "Invalid variable name");
196
197  // Otherwise, we're ok, consume the rest of the characters.
198  const char *VarNameStart = CurPtr++;
199
200  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
201    ++CurPtr;
202
203  CurStrVal.assign(VarNameStart, CurPtr);
204  return tgtok::VarName;
205}
206
207
208tgtok::TokKind TGLexer::LexIdentifier() {
209  // The first letter is [a-zA-Z_#].
210  const char *IdentStart = TokStart;
211
212  // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
213  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_' ||
214         *CurPtr == '#')
215    ++CurPtr;
216
217  // Check to see if this identifier is a keyword.
218  StringRef Str(IdentStart, CurPtr-IdentStart);
219
220  if (Str == "include") {
221    if (LexInclude()) return tgtok::Error;
222    return Lex();
223  }
224
225  tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
226    .Case("int", tgtok::Int)
227    .Case("bit", tgtok::Bit)
228    .Case("bits", tgtok::Bits)
229    .Case("string", tgtok::String)
230    .Case("list", tgtok::List)
231    .Case("code", tgtok::Code)
232    .Case("dag", tgtok::Dag)
233    .Case("class", tgtok::Class)
234    .Case("def", tgtok::Def)
235    .Case("defm", tgtok::Defm)
236    .Case("multiclass", tgtok::MultiClass)
237    .Case("field", tgtok::Field)
238    .Case("let", tgtok::Let)
239    .Case("in", tgtok::In)
240    .Default(tgtok::Id);
241
242  if (Kind == tgtok::Id)
243    CurStrVal.assign(Str.begin(), Str.end());
244  return Kind;
245}
246
247/// LexInclude - We just read the "include" token.  Get the string token that
248/// comes next and enter the include.
249bool TGLexer::LexInclude() {
250  // The token after the include must be a string.
251  tgtok::TokKind Tok = LexToken();
252  if (Tok == tgtok::Error) return true;
253  if (Tok != tgtok::StrVal) {
254    PrintError(getLoc(), "Expected filename after include");
255    return true;
256  }
257
258  // Get the string.
259  std::string Filename = CurStrVal;
260  std::string IncludedFile;
261
262
263  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
264                                    IncludedFile);
265  if (CurBuffer == -1) {
266    PrintError(getLoc(), "Could not find include file '" + Filename + "'");
267    return true;
268  }
269
270  Dependencies.push_back(IncludedFile);
271  // Save the line number and lex buffer of the includer.
272  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
273  CurPtr = CurBuf->getBufferStart();
274  return false;
275}
276
277void TGLexer::SkipBCPLComment() {
278  ++CurPtr;  // skip the second slash.
279  while (1) {
280    switch (*CurPtr) {
281    case '\n':
282    case '\r':
283      return;  // Newline is end of comment.
284    case 0:
285      // If this is the end of the buffer, end the comment.
286      if (CurPtr == CurBuf->getBufferEnd())
287        return;
288      break;
289    }
290    // Otherwise, skip the character.
291    ++CurPtr;
292  }
293}
294
295/// SkipCComment - This skips C-style /**/ comments.  The only difference from C
296/// is that we allow nesting.
297bool TGLexer::SkipCComment() {
298  ++CurPtr;  // skip the star.
299  unsigned CommentDepth = 1;
300
301  while (1) {
302    int CurChar = getNextChar();
303    switch (CurChar) {
304    case EOF:
305      PrintError(TokStart, "Unterminated comment!");
306      return true;
307    case '*':
308      // End of the comment?
309      if (CurPtr[0] != '/') break;
310
311      ++CurPtr;   // End the */.
312      if (--CommentDepth == 0)
313        return false;
314      break;
315    case '/':
316      // Start of a nested comment?
317      if (CurPtr[0] != '*') break;
318      ++CurPtr;
319      ++CommentDepth;
320      break;
321    }
322  }
323}
324
325/// LexNumber - Lex:
326///    [-+]?[0-9]+
327///    0x[0-9a-fA-F]+
328///    0b[01]+
329tgtok::TokKind TGLexer::LexNumber() {
330  if (CurPtr[-1] == '0') {
331    if (CurPtr[0] == 'x') {
332      ++CurPtr;
333      const char *NumStart = CurPtr;
334      while (isxdigit(CurPtr[0]))
335        ++CurPtr;
336
337      // Requires at least one hex digit.
338      if (CurPtr == NumStart)
339        return ReturnError(TokStart, "Invalid hexadecimal number");
340
341      errno = 0;
342      CurIntVal = strtoll(NumStart, 0, 16);
343      if (errno == EINVAL)
344        return ReturnError(TokStart, "Invalid hexadecimal number");
345      if (errno == ERANGE) {
346        errno = 0;
347        CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
348        if (errno == EINVAL)
349          return ReturnError(TokStart, "Invalid hexadecimal number");
350        if (errno == ERANGE)
351          return ReturnError(TokStart, "Hexadecimal number out of range");
352      }
353      return tgtok::IntVal;
354    } else if (CurPtr[0] == 'b') {
355      ++CurPtr;
356      const char *NumStart = CurPtr;
357      while (CurPtr[0] == '0' || CurPtr[0] == '1')
358        ++CurPtr;
359
360      // Requires at least one binary digit.
361      if (CurPtr == NumStart)
362        return ReturnError(CurPtr-2, "Invalid binary number");
363      CurIntVal = strtoll(NumStart, 0, 2);
364      return tgtok::IntVal;
365    }
366  }
367
368  // Check for a sign without a digit.
369  if (!isdigit(CurPtr[0])) {
370    if (CurPtr[-1] == '-')
371      return tgtok::minus;
372    else if (CurPtr[-1] == '+')
373      return tgtok::plus;
374  }
375
376  while (isdigit(CurPtr[0]))
377    ++CurPtr;
378  CurIntVal = strtoll(TokStart, 0, 10);
379  return tgtok::IntVal;
380}
381
382/// LexBracket - We just read '['.  If this is a code block, return it,
383/// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
384tgtok::TokKind TGLexer::LexBracket() {
385  if (CurPtr[0] != '{')
386    return tgtok::l_square;
387  ++CurPtr;
388  const char *CodeStart = CurPtr;
389  while (1) {
390    int Char = getNextChar();
391    if (Char == EOF) break;
392
393    if (Char != '}') continue;
394
395    Char = getNextChar();
396    if (Char == EOF) break;
397    if (Char == ']') {
398      CurStrVal.assign(CodeStart, CurPtr-2);
399      return tgtok::CodeFragment;
400    }
401  }
402
403  return ReturnError(CodeStart-2, "Unterminated Code Block");
404}
405
406/// LexExclaim - Lex '!' and '![a-zA-Z]+'.
407tgtok::TokKind TGLexer::LexExclaim() {
408  if (!isalpha(*CurPtr))
409    return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
410
411  const char *Start = CurPtr++;
412  while (isalpha(*CurPtr))
413    ++CurPtr;
414
415  // Check to see which operator this is.
416  tgtok::TokKind Kind =
417    StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
418    .Case("eq", tgtok::XEq)
419    .Case("if", tgtok::XIf)
420    .Case("head", tgtok::XHead)
421    .Case("tail", tgtok::XTail)
422    .Case("con", tgtok::XConcat)
423    .Case("shl", tgtok::XSHL)
424    .Case("sra", tgtok::XSRA)
425    .Case("srl", tgtok::XSRL)
426    .Case("cast", tgtok::XCast)
427    .Case("empty", tgtok::XEmpty)
428    .Case("subst", tgtok::XSubst)
429    .Case("foreach", tgtok::XForEach)
430    .Case("strconcat", tgtok::XStrConcat)
431    .Default(tgtok::Error);
432
433  return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
434}
435
436