AsmLexer.cpp revision 263508
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This class implements the lexer for assembly files.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/MC/MCParser/AsmLexer.h"
15#include "llvm/MC/MCAsmInfo.h"
16#include "llvm/Support/MemoryBuffer.h"
17#include "llvm/Support/SMLoc.h"
18#include <cctype>
19#include <cerrno>
20#include <cstdio>
21#include <cstdlib>
22using namespace llvm;
23
24AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI)  {
25  CurBuf = NULL;
26  CurPtr = NULL;
27  isAtStartOfLine = true;
28}
29
30AsmLexer::~AsmLexer() {
31}
32
33void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) {
34  CurBuf = buf;
35
36  if (ptr)
37    CurPtr = ptr;
38  else
39    CurPtr = CurBuf->getBufferStart();
40
41  TokStart = 0;
42}
43
44/// ReturnError - Set the error to the specified string at the specified
45/// location.  This is defined to always return AsmToken::Error.
46AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
47  SetError(SMLoc::getFromPointer(Loc), Msg);
48
49  return AsmToken(AsmToken::Error, StringRef(Loc, 0));
50}
51
52int AsmLexer::getNextChar() {
53  char CurChar = *CurPtr++;
54  switch (CurChar) {
55  default:
56    return (unsigned char)CurChar;
57  case 0:
58    // A nul character in the stream is either the end of the current buffer or
59    // a random nul in the file.  Disambiguate that here.
60    if (CurPtr-1 != CurBuf->getBufferEnd())
61      return 0;  // Just whitespace.
62
63    // Otherwise, return end of file.
64    --CurPtr;  // Another call to lex will return EOF again.
65    return EOF;
66  }
67}
68
69/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
70///
71/// The leading integral digit sequence and dot should have already been
72/// consumed, some or all of the fractional digit sequence *can* have been
73/// consumed.
74AsmToken AsmLexer::LexFloatLiteral() {
75  // Skip the fractional digit sequence.
76  while (isdigit(*CurPtr))
77    ++CurPtr;
78
79  // Check for exponent; we intentionally accept a slighlty wider set of
80  // literals here and rely on the upstream client to reject invalid ones (e.g.,
81  // "1e+").
82  if (*CurPtr == 'e' || *CurPtr == 'E') {
83    ++CurPtr;
84    if (*CurPtr == '-' || *CurPtr == '+')
85      ++CurPtr;
86    while (isdigit(*CurPtr))
87      ++CurPtr;
88  }
89
90  return AsmToken(AsmToken::Real,
91                  StringRef(TokStart, CurPtr - TokStart));
92}
93
94/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
95/// while making sure there are enough actual digits around for the constant to
96/// be valid.
97///
98/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
99/// before we get here.
100AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
101  assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
102         "unexpected parse state in floating hex");
103  bool NoFracDigits = true;
104
105  // Skip the fractional part if there is one
106  if (*CurPtr == '.') {
107    ++CurPtr;
108
109    const char *FracStart = CurPtr;
110    while (isxdigit(*CurPtr))
111      ++CurPtr;
112
113    NoFracDigits = CurPtr == FracStart;
114  }
115
116  if (NoIntDigits && NoFracDigits)
117    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
118                                 "expected at least one significand digit");
119
120  // Make sure we do have some kind of proper exponent part
121  if (*CurPtr != 'p' && *CurPtr != 'P')
122    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
123                                 "expected exponent part 'p'");
124  ++CurPtr;
125
126  if (*CurPtr == '+' || *CurPtr == '-')
127    ++CurPtr;
128
129  // N.b. exponent digits are *not* hex
130  const char *ExpStart = CurPtr;
131  while (isdigit(*CurPtr))
132    ++CurPtr;
133
134  if (CurPtr == ExpStart)
135    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
136                                 "expected at least one exponent digit");
137
138  return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
139}
140
/// Identifiers are lexed as [a-zA-Z_.][a-zA-Z0-9_$.@?]* (see LexIdentifier).
///
/// IsIdentifierChar - Return true if \p c may continue an identifier, i.e. it
/// is alphanumeric (per isalnum) or one of '_', '$', '.', '@', '?'.
static bool IsIdentifierChar(char c) {
  // isalnum has undefined behavior for negative values other than EOF, so the
  // character is converted to unsigned char before being classified.
  return isalnum(static_cast<unsigned char>(c)) || c == '_' || c == '$' ||
         c == '.' || c == '@' || c == '?';
}
145AsmToken AsmLexer::LexIdentifier() {
146  // Check for floating point literals.
147  if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
148    // Disambiguate a .1243foo identifier from a floating literal.
149    while (isdigit(*CurPtr))
150      ++CurPtr;
151    if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr))
152      return LexFloatLiteral();
153  }
154
155  while (IsIdentifierChar(*CurPtr))
156    ++CurPtr;
157
158  // Handle . as a special case.
159  if (CurPtr == TokStart+1 && TokStart[0] == '.')
160    return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
161
162  return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
163}
164
165/// LexSlash: Slash: /
166///           C-Style Comment: /* ... */
167AsmToken AsmLexer::LexSlash() {
168  switch (*CurPtr) {
169  case '*': break; // C style comment.
170  case '/': return ++CurPtr, LexLineComment();
171  default:  return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1));
172  }
173
174  // C Style comment.
175  ++CurPtr;  // skip the star.
176  while (1) {
177    int CurChar = getNextChar();
178    switch (CurChar) {
179    case EOF:
180      return ReturnError(TokStart, "unterminated comment");
181    case '*':
182      // End of the comment?
183      if (CurPtr[0] != '/') break;
184
185      ++CurPtr;   // End the */.
186      return LexToken();
187    }
188  }
189}
190
191/// LexLineComment: Comment: #[^\n]*
192///                        : //[^\n]*
193AsmToken AsmLexer::LexLineComment() {
194  // FIXME: This is broken if we happen to a comment at the end of a file, which
195  // was .included, and which doesn't end with a newline.
196  int CurChar = getNextChar();
197  while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
198    CurChar = getNextChar();
199
200  if (CurChar == EOF)
201    return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0));
202  return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0));
203}
204
/// SkipIgnoredIntegerSuffix - Advance \p CurPtr past an optional 'U' followed
/// by up to two 'L' characters, i.e. the ULL, UL, U, L and LL suffixes.  Only
/// uppercase letters are recognized.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  const char *Cursor = CurPtr;

  if (*Cursor == 'U')
    ++Cursor;

  for (int Remaining = 2; Remaining != 0 && *Cursor == 'L'; --Remaining)
    ++Cursor;

  CurPtr = Cursor;
}
214
// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
// integer as a hexadecimal, possibly with leading zeroes.
//
// On return CurPtr points at the [hH] suffix when one was found (the result is
// then 16).  Otherwise the result is DefaultRadix and CurPtr points at the
// first character that is not a decimal digit.
static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) {
  const char *FirstHex = 0;
  const char *LookAhead = CurPtr;
  for (;; ++LookAhead) {
    // The <cctype> classifiers have undefined behavior for negative values
    // other than EOF, so classify the character as an unsigned char.
    const unsigned char C = static_cast<unsigned char>(*LookAhead);
    if (isdigit(C))
      continue;
    if (!isxdigit(C))
      break;
    // Remember where the decimal digits stopped in case this is not hex.
    if (!FirstHex)
      FirstHex = LookAhead;
  }
  const bool isHex = *LookAhead == 'h' || *LookAhead == 'H';
  CurPtr = isHex || !FirstHex ? LookAhead : FirstHex;
  return isHex ? 16 : DefaultRadix;
}
237
238/// LexDigit: First character is [0-9].
239///   Local Label: [0-9][:]
240///   Forward/Backward Label: [0-9][fb]
241///   Binary integer: 0b[01]+
242///   Octal integer: 0[0-7]+
243///   Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
244///   Decimal integer: [1-9][0-9]*
245AsmToken AsmLexer::LexDigit() {
246  // Decimal integer: [1-9][0-9]*
247  if (CurPtr[-1] != '0' || CurPtr[0] == '.') {
248    unsigned Radix = doLookAhead(CurPtr, 10);
249    bool isHex = Radix == 16;
250    // Check for floating point literals.
251    if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) {
252      ++CurPtr;
253      return LexFloatLiteral();
254    }
255
256    StringRef Result(TokStart, CurPtr - TokStart);
257
258    long long Value;
259    if (Result.getAsInteger(Radix, Value)) {
260      // Allow positive values that are too large to fit into a signed 64-bit
261      // integer, but that do fit in an unsigned one, we just convert them over.
262      unsigned long long UValue;
263      if (Result.getAsInteger(Radix, UValue))
264        return ReturnError(TokStart, !isHex ? "invalid decimal number" :
265                           "invalid hexdecimal number");
266      Value = (long long)UValue;
267    }
268
269    // Consume the [bB][hH].
270    if (Radix == 2 || Radix == 16)
271      ++CurPtr;
272
273    // The darwin/x86 (and x86-64) assembler accepts and ignores type
274    // suffices on integer literals.
275    SkipIgnoredIntegerSuffix(CurPtr);
276
277    return AsmToken(AsmToken::Integer, Result, Value);
278  }
279
280  if (*CurPtr == 'b') {
281    ++CurPtr;
282    // See if we actually have "0b" as part of something like "jmp 0b\n"
283    if (!isdigit(CurPtr[0])) {
284      --CurPtr;
285      StringRef Result(TokStart, CurPtr - TokStart);
286      return AsmToken(AsmToken::Integer, Result, 0);
287    }
288    const char *NumStart = CurPtr;
289    while (CurPtr[0] == '0' || CurPtr[0] == '1')
290      ++CurPtr;
291
292    // Requires at least one binary digit.
293    if (CurPtr == NumStart)
294      return ReturnError(TokStart, "invalid binary number");
295
296    StringRef Result(TokStart, CurPtr - TokStart);
297
298    long long Value;
299    if (Result.substr(2).getAsInteger(2, Value))
300      return ReturnError(TokStart, "invalid binary number");
301
302    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
303    // suffixes on integer literals.
304    SkipIgnoredIntegerSuffix(CurPtr);
305
306    return AsmToken(AsmToken::Integer, Result, Value);
307  }
308
309  if (*CurPtr == 'x') {
310    ++CurPtr;
311    const char *NumStart = CurPtr;
312    while (isxdigit(CurPtr[0]))
313      ++CurPtr;
314
315    // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
316    // diagnosed by LexHexFloatLiteral).
317    if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
318      return LexHexFloatLiteral(NumStart == CurPtr);
319
320    // Otherwise requires at least one hex digit.
321    if (CurPtr == NumStart)
322      return ReturnError(CurPtr-2, "invalid hexadecimal number");
323
324    unsigned long long Result;
325    if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
326      return ReturnError(TokStart, "invalid hexadecimal number");
327
328    // Consume the optional [hH].
329    if (*CurPtr == 'h' || *CurPtr == 'H')
330      ++CurPtr;
331
332    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
333    // suffixes on integer literals.
334    SkipIgnoredIntegerSuffix(CurPtr);
335
336    return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart),
337                    (int64_t)Result);
338  }
339
340  // Either octal or hexadecimal.
341  long long Value;
342  unsigned Radix = doLookAhead(CurPtr, 8);
343  bool isHex = Radix == 16;
344  StringRef Result(TokStart, CurPtr - TokStart);
345  if (Result.getAsInteger(Radix, Value))
346    return ReturnError(TokStart, !isHex ? "invalid octal number" :
347                       "invalid hexdecimal number");
348
349  // Consume the [hH].
350  if (Radix == 16)
351    ++CurPtr;
352
353  // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
354  // suffixes on integer literals.
355  SkipIgnoredIntegerSuffix(CurPtr);
356
357  return AsmToken(AsmToken::Integer, Result, Value);
358}
359
360/// LexSingleQuote: Integer: 'b'
361AsmToken AsmLexer::LexSingleQuote() {
362  int CurChar = getNextChar();
363
364  if (CurChar == '\\')
365    CurChar = getNextChar();
366
367  if (CurChar == EOF)
368    return ReturnError(TokStart, "unterminated single quote");
369
370  CurChar = getNextChar();
371
372  if (CurChar != '\'')
373    return ReturnError(TokStart, "single quote way too long");
374
375  // The idea here being that 'c' is basically just an integral
376  // constant.
377  StringRef Res = StringRef(TokStart,CurPtr - TokStart);
378  long long Value;
379
380  if (Res.startswith("\'\\")) {
381    char theChar = Res[2];
382    switch (theChar) {
383      default: Value = theChar; break;
384      case '\'': Value = '\''; break;
385      case 't': Value = '\t'; break;
386      case 'n': Value = '\n'; break;
387      case 'b': Value = '\b'; break;
388    }
389  } else
390    Value = TokStart[1];
391
392  return AsmToken(AsmToken::Integer, Res, Value);
393}
394
395
396/// LexQuote: String: "..."
397AsmToken AsmLexer::LexQuote() {
398  int CurChar = getNextChar();
399  // TODO: does gas allow multiline string constants?
400  while (CurChar != '"') {
401    if (CurChar == '\\') {
402      // Allow \", etc.
403      CurChar = getNextChar();
404    }
405
406    if (CurChar == EOF)
407      return ReturnError(TokStart, "unterminated string constant");
408
409    CurChar = getNextChar();
410  }
411
412  return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
413}
414
415StringRef AsmLexer::LexUntilEndOfStatement() {
416  TokStart = CurPtr;
417
418  while (!isAtStartOfComment(*CurPtr) &&    // Start of line comment.
419         !isAtStatementSeparator(CurPtr) && // End of statement marker.
420         *CurPtr != '\n' &&
421         *CurPtr != '\r' &&
422         (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
423    ++CurPtr;
424  }
425  return StringRef(TokStart, CurPtr-TokStart);
426}
427
428StringRef AsmLexer::LexUntilEndOfLine() {
429  TokStart = CurPtr;
430
431  while (*CurPtr != '\n' &&
432         *CurPtr != '\r' &&
433         (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
434    ++CurPtr;
435  }
436  return StringRef(TokStart, CurPtr-TokStart);
437}
438
439bool AsmLexer::isAtStartOfComment(char Char) {
440  // FIXME: This won't work for multi-character comment indicators like "//".
441  return Char == *MAI.getCommentString();
442}
443
444bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
445  return strncmp(Ptr, MAI.getSeparatorString(),
446                 strlen(MAI.getSeparatorString())) == 0;
447}
448
449AsmToken AsmLexer::LexToken() {
450  TokStart = CurPtr;
451  // This always consumes at least one character.
452  int CurChar = getNextChar();
453
454  if (isAtStartOfComment(CurChar)) {
455    // If this comment starts with a '#', then return the Hash token and let
456    // the assembler parser see if it can be parsed as a cpp line filename
457    // comment. We do this only if we are at the start of a line.
458    if (CurChar == '#' && isAtStartOfLine)
459      return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
460    isAtStartOfLine = true;
461    return LexLineComment();
462  }
463  if (isAtStatementSeparator(TokStart)) {
464    CurPtr += strlen(MAI.getSeparatorString()) - 1;
465    return AsmToken(AsmToken::EndOfStatement,
466                    StringRef(TokStart, strlen(MAI.getSeparatorString())));
467  }
468
469  // If we're missing a newline at EOF, make sure we still get an
470  // EndOfStatement token before the Eof token.
471  if (CurChar == EOF && !isAtStartOfLine) {
472    isAtStartOfLine = true;
473    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
474  }
475
476  isAtStartOfLine = false;
477  switch (CurChar) {
478  default:
479    // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
480    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
481      return LexIdentifier();
482
483    // Unknown character, emit an error.
484    return ReturnError(TokStart, "invalid character in input");
485  case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
486  case 0:
487  case ' ':
488  case '\t':
489    if (SkipSpace) {
490      // Ignore whitespace.
491      return LexToken();
492    } else {
493      int len = 1;
494      while (*CurPtr==' ' || *CurPtr=='\t') {
495        CurPtr++;
496        len++;
497      }
498      return AsmToken(AsmToken::Space, StringRef(TokStart, len));
499    }
500  case '\n': // FALL THROUGH.
501  case '\r':
502    isAtStartOfLine = true;
503    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
504  case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
505  case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
506  case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
507  case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
508  case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
509  case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
510  case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
511  case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
512  case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
513  case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
514  case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
515  case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
516  case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
517  case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
518  case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
519  case '=':
520    if (*CurPtr == '=')
521      return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
522    return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
523  case '|':
524    if (*CurPtr == '|')
525      return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
526    return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
527  case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
528  case '&':
529    if (*CurPtr == '&')
530      return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
531    return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
532  case '!':
533    if (*CurPtr == '=')
534      return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
535    return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
536  case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
537  case '/': return LexSlash();
538  case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
539  case '\'': return LexSingleQuote();
540  case '"': return LexQuote();
541  case '0': case '1': case '2': case '3': case '4':
542  case '5': case '6': case '7': case '8': case '9':
543    return LexDigit();
544  case '<':
545    switch (*CurPtr) {
546    case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
547                                        StringRef(TokStart, 2));
548    case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
549                                        StringRef(TokStart, 2));
550    case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
551                                        StringRef(TokStart, 2));
552    default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
553    }
554  case '>':
555    switch (*CurPtr) {
556    case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
557                                        StringRef(TokStart, 2));
558    case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
559                                        StringRef(TokStart, 2));
560    default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
561    }
562
563  // TODO: Quoted identifiers (objc methods etc)
564  // local labels: [0-9][:]
565  // Forward/backward labels: [0-9][fb]
566  // Integers, fp constants, character constants.
567  }
568}
569