AsmLexer.cpp revision 202878
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This class implements the lexer for assembly files.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/MC/MCParser/AsmLexer.h"
15#include "llvm/Support/SMLoc.h"
16#include "llvm/Support/MemoryBuffer.h"
17#include "llvm/MC/MCAsmInfo.h"
18#include <cerrno>
19#include <cstdio>
20#include <cstdlib>
21using namespace llvm;
22
23AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI)  {
24  CurBuf = NULL;
25  CurPtr = NULL;
26  TokStart = 0;
27}
28
29AsmLexer::~AsmLexer() {
30}
31
32void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) {
33  CurBuf = buf;
34
35  if (ptr)
36    CurPtr = ptr;
37  else
38    CurPtr = CurBuf->getBufferStart();
39
40  TokStart = 0;
41}
42
43SMLoc AsmLexer::getLoc() const {
44  return SMLoc::getFromPointer(TokStart);
45}
46
47/// ReturnError - Set the error to the specified string at the specified
48/// location.  This is defined to always return AsmToken::Error.
49AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
50  SetError(SMLoc::getFromPointer(Loc), Msg);
51
52  return AsmToken(AsmToken::Error, StringRef(Loc, 0));
53}
54
55int AsmLexer::getNextChar() {
56  char CurChar = *CurPtr++;
57  switch (CurChar) {
58  default:
59    return (unsigned char)CurChar;
60  case 0:
61    // A nul character in the stream is either the end of the current buffer or
62    // a random nul in the file.  Disambiguate that here.
63    if (CurPtr-1 != CurBuf->getBufferEnd())
64      return 0;  // Just whitespace.
65
66    // Otherwise, return end of file.
67    --CurPtr;  // Another call to lex will return EOF again.
68    return EOF;
69  }
70}
71
72/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
73AsmToken AsmLexer::LexIdentifier() {
74  while (isalnum(*CurPtr) || *CurPtr == '_' || *CurPtr == '$' ||
75         *CurPtr == '.' || *CurPtr == '@')
76    ++CurPtr;
77  return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
78}
79
80/// LexSlash: Slash: /
81///           C-Style Comment: /* ... */
82AsmToken AsmLexer::LexSlash() {
83  switch (*CurPtr) {
84  case '*': break; // C style comment.
85  case '/': return ++CurPtr, LexLineComment();
86  default:  return AsmToken(AsmToken::Slash, StringRef(CurPtr, 1));
87  }
88
89  // C Style comment.
90  ++CurPtr;  // skip the star.
91  while (1) {
92    int CurChar = getNextChar();
93    switch (CurChar) {
94    case EOF:
95      return ReturnError(TokStart, "unterminated comment");
96    case '*':
97      // End of the comment?
98      if (CurPtr[0] != '/') break;
99
100      ++CurPtr;   // End the */.
101      return LexToken();
102    }
103  }
104}
105
106/// LexLineComment: Comment: #[^\n]*
107///                        : //[^\n]*
108AsmToken AsmLexer::LexLineComment() {
109  // FIXME: This is broken if we happen to a comment at the end of a file, which
110  // was .included, and which doesn't end with a newline.
111  int CurChar = getNextChar();
112  while (CurChar != '\n' && CurChar != '\n' && CurChar != EOF)
113    CurChar = getNextChar();
114
115  if (CurChar == EOF)
116    return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0));
117  return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0));
118}
119
120
121/// LexDigit: First character is [0-9].
122///   Local Label: [0-9][:]
123///   Forward/Backward Label: [0-9][fb]
124///   Binary integer: 0b[01]+
125///   Octal integer: 0[0-7]+
126///   Hex integer: 0x[0-9a-fA-F]+
127///   Decimal integer: [1-9][0-9]*
128/// TODO: FP literal.
129AsmToken AsmLexer::LexDigit() {
130  if (*CurPtr == ':')
131    return ReturnError(TokStart, "FIXME: local label not implemented");
132  if (*CurPtr == 'f' || *CurPtr == 'b')
133    return ReturnError(TokStart, "FIXME: directional label not implemented");
134
135  // Decimal integer: [1-9][0-9]*
136  if (CurPtr[-1] != '0') {
137    while (isdigit(*CurPtr))
138      ++CurPtr;
139
140    StringRef Result(TokStart, CurPtr - TokStart);
141
142    long long Value;
143    if (Result.getAsInteger(10, Value))
144      return ReturnError(TokStart, "Invalid decimal number");
145    return AsmToken(AsmToken::Integer, Result, Value);
146  }
147
148  if (*CurPtr == 'b') {
149    ++CurPtr;
150    const char *NumStart = CurPtr;
151    while (CurPtr[0] == '0' || CurPtr[0] == '1')
152      ++CurPtr;
153
154    // Requires at least one binary digit.
155    if (CurPtr == NumStart)
156      return ReturnError(TokStart, "Invalid binary number");
157
158    StringRef Result(TokStart, CurPtr - TokStart);
159
160    long long Value;
161    if (Result.getAsInteger(2, Value))
162      return ReturnError(TokStart, "Invalid binary number");
163
164    return AsmToken(AsmToken::Integer, Result, Value);
165  }
166
167  if (*CurPtr == 'x') {
168    ++CurPtr;
169    const char *NumStart = CurPtr;
170    while (isxdigit(CurPtr[0]))
171      ++CurPtr;
172
173    // Requires at least one hex digit.
174    if (CurPtr == NumStart)
175      return ReturnError(CurPtr-2, "Invalid hexadecimal number");
176
177    unsigned long long Result;
178    if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
179      return ReturnError(TokStart, "Invalid hexadecimal number");
180
181    return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart),
182                    (int64_t)Result);
183  }
184
185  // Must be an octal number, it starts with 0.
186  while (*CurPtr >= '0' && *CurPtr <= '7')
187    ++CurPtr;
188
189  StringRef Result(TokStart, CurPtr - TokStart);
190  long long Value;
191  if (Result.getAsInteger(8, Value))
192    return ReturnError(TokStart, "Invalid octal number");
193
194  return AsmToken(AsmToken::Integer, Result, Value);
195}
196
197/// LexQuote: String: "..."
198AsmToken AsmLexer::LexQuote() {
199  int CurChar = getNextChar();
200  // TODO: does gas allow multiline string constants?
201  while (CurChar != '"') {
202    if (CurChar == '\\') {
203      // Allow \", etc.
204      CurChar = getNextChar();
205    }
206
207    if (CurChar == EOF)
208      return ReturnError(TokStart, "unterminated string constant");
209
210    CurChar = getNextChar();
211  }
212
213  return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
214}
215
216StringRef AsmLexer::LexUntilEndOfStatement() {
217  TokStart = CurPtr;
218
219  while (!isAtStartOfComment(*CurPtr) && // Start of line comment.
220	  *CurPtr != ';' &&  // End of statement marker.
221         *CurPtr != '\n' &&
222         *CurPtr != '\r' &&
223         (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
224    ++CurPtr;
225  }
226  return StringRef(TokStart, CurPtr-TokStart);
227}
228
229bool AsmLexer::isAtStartOfComment(char Char) {
230  // FIXME: This won't work for multi-character comment indicators like "//".
231  return Char == *MAI.getCommentString();
232}
233
234AsmToken AsmLexer::LexToken() {
235  TokStart = CurPtr;
236  // This always consumes at least one character.
237  int CurChar = getNextChar();
238
239  if (isAtStartOfComment(CurChar))
240    return LexLineComment();
241
242  switch (CurChar) {
243  default:
244    // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
245    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
246      return LexIdentifier();
247
248    // Unknown character, emit an error.
249    return ReturnError(TokStart, "invalid character in input");
250  case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
251  case 0:
252  case ' ':
253  case '\t':
254    // Ignore whitespace.
255    return LexToken();
256  case '\n': // FALL THROUGH.
257  case '\r': // FALL THROUGH.
258  case ';': return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
259  case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
260  case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
261  case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
262  case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
263  case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
264  case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
265  case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
266  case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
267  case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
268  case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
269  case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
270  case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
271  case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
272  case '=':
273    if (*CurPtr == '=')
274      return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
275    return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
276  case '|':
277    if (*CurPtr == '|')
278      return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
279    return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
280  case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
281  case '&':
282    if (*CurPtr == '&')
283      return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
284    return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
285  case '!':
286    if (*CurPtr == '=')
287      return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
288    return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
289  case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
290  case '/': return LexSlash();
291  case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
292  case '"': return LexQuote();
293  case '0': case '1': case '2': case '3': case '4':
294  case '5': case '6': case '7': case '8': case '9':
295    return LexDigit();
296  case '<':
297    switch (*CurPtr) {
298    case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
299                                        StringRef(TokStart, 2));
300    case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
301                                        StringRef(TokStart, 2));
302    case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
303                                        StringRef(TokStart, 2));
304    default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
305    }
306  case '>':
307    switch (*CurPtr) {
308    case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
309                                        StringRef(TokStart, 2));
310    case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
311                                        StringRef(TokStart, 2));
312    default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
313    }
314
315  // TODO: Quoted identifiers (objc methods etc)
316  // local labels: [0-9][:]
317  // Forward/backward labels: [0-9][fb]
318  // Integers, fp constants, character constants.
319  }
320}
321