AsmLexer.cpp revision 263508
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This class implements the lexer for assembly files. 11// 12//===----------------------------------------------------------------------===// 13 14#include "llvm/MC/MCParser/AsmLexer.h" 15#include "llvm/MC/MCAsmInfo.h" 16#include "llvm/Support/MemoryBuffer.h" 17#include "llvm/Support/SMLoc.h" 18#include <cctype> 19#include <cerrno> 20#include <cstdio> 21#include <cstdlib> 22using namespace llvm; 23 24AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { 25 CurBuf = NULL; 26 CurPtr = NULL; 27 isAtStartOfLine = true; 28} 29 30AsmLexer::~AsmLexer() { 31} 32 33void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { 34 CurBuf = buf; 35 36 if (ptr) 37 CurPtr = ptr; 38 else 39 CurPtr = CurBuf->getBufferStart(); 40 41 TokStart = 0; 42} 43 44/// ReturnError - Set the error to the specified string at the specified 45/// location. This is defined to always return AsmToken::Error. 46AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 47 SetError(SMLoc::getFromPointer(Loc), Msg); 48 49 return AsmToken(AsmToken::Error, StringRef(Loc, 0)); 50} 51 52int AsmLexer::getNextChar() { 53 char CurChar = *CurPtr++; 54 switch (CurChar) { 55 default: 56 return (unsigned char)CurChar; 57 case 0: 58 // A nul character in the stream is either the end of the current buffer or 59 // a random nul in the file. Disambiguate that here. 60 if (CurPtr-1 != CurBuf->getBufferEnd()) 61 return 0; // Just whitespace. 62 63 // Otherwise, return end of file. 64 --CurPtr; // Another call to lex will return EOF again. 65 return EOF; 66 } 67} 68 69/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? 70/// 71/// The leading integral digit sequence and dot should have already been 72/// consumed, some or all of the fractional digit sequence *can* have been 73/// consumed. 74AsmToken AsmLexer::LexFloatLiteral() { 75 // Skip the fractional digit sequence. 76 while (isdigit(*CurPtr)) 77 ++CurPtr; 78 79 // Check for exponent; we intentionally accept a slighlty wider set of 80 // literals here and rely on the upstream client to reject invalid ones (e.g., 81 // "1e+"). 82 if (*CurPtr == 'e' || *CurPtr == 'E') { 83 ++CurPtr; 84 if (*CurPtr == '-' || *CurPtr == '+') 85 ++CurPtr; 86 while (isdigit(*CurPtr)) 87 ++CurPtr; 88 } 89 90 return AsmToken(AsmToken::Real, 91 StringRef(TokStart, CurPtr - TokStart)); 92} 93 94/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+ 95/// while making sure there are enough actual digits around for the constant to 96/// be valid. 97/// 98/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed 99/// before we get here. 100AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) { 101 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') && 102 "unexpected parse state in floating hex"); 103 bool NoFracDigits = true; 104 105 // Skip the fractional part if there is one 106 if (*CurPtr == '.') { 107 ++CurPtr; 108 109 const char *FracStart = CurPtr; 110 while (isxdigit(*CurPtr)) 111 ++CurPtr; 112 113 NoFracDigits = CurPtr == FracStart; 114 } 115 116 if (NoIntDigits && NoFracDigits) 117 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 118 "expected at least one significand digit"); 119 120 // Make sure we do have some kind of proper exponent part 121 if (*CurPtr != 'p' && *CurPtr != 'P') 122 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 123 "expected exponent part 'p'"); 124 ++CurPtr; 125 126 if (*CurPtr == '+' || *CurPtr == '-') 127 ++CurPtr; 128 129 // N.b. exponent digits are *not* hex 130 const char *ExpStart = CurPtr; 131 while (isdigit(*CurPtr)) 132 ++CurPtr; 133 134 if (CurPtr == ExpStart) 135 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 136 "expected at least one exponent digit"); 137 138 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); 139} 140 141/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]* 142static bool IsIdentifierChar(char c) { 143 return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@' || c == '?'; 144} 145AsmToken AsmLexer::LexIdentifier() { 146 // Check for floating point literals. 147 if (CurPtr[-1] == '.' && isdigit(*CurPtr)) { 148 // Disambiguate a .1243foo identifier from a floating literal. 149 while (isdigit(*CurPtr)) 150 ++CurPtr; 151 if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr)) 152 return LexFloatLiteral(); 153 } 154 155 while (IsIdentifierChar(*CurPtr)) 156 ++CurPtr; 157 158 // Handle . as a special case. 159 if (CurPtr == TokStart+1 && TokStart[0] == '.') 160 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 161 162 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 163} 164 165/// LexSlash: Slash: / 166/// C-Style Comment: /* ... */ 167AsmToken AsmLexer::LexSlash() { 168 switch (*CurPtr) { 169 case '*': break; // C style comment. 170 case '/': return ++CurPtr, LexLineComment(); 171 default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1)); 172 } 173 174 // C Style comment. 175 ++CurPtr; // skip the star. 176 while (1) { 177 int CurChar = getNextChar(); 178 switch (CurChar) { 179 case EOF: 180 return ReturnError(TokStart, "unterminated comment"); 181 case '*': 182 // End of the comment? 183 if (CurPtr[0] != '/') break; 184 185 ++CurPtr; // End the */. 186 return LexToken(); 187 } 188 } 189} 190 191/// LexLineComment: Comment: #[^\n]* 192/// : //[^\n]* 193AsmToken AsmLexer::LexLineComment() { 194 // FIXME: This is broken if we happen to a comment at the end of a file, which 195 // was .included, and which doesn't end with a newline. 196 int CurChar = getNextChar(); 197 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 198 CurChar = getNextChar(); 199 200 if (CurChar == EOF) 201 return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0)); 202 return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); 203} 204 205static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 206 // Skip ULL, UL, U, L and LL suffices. 207 if (CurPtr[0] == 'U') 208 ++CurPtr; 209 if (CurPtr[0] == 'L') 210 ++CurPtr; 211 if (CurPtr[0] == 'L') 212 ++CurPtr; 213} 214 215// Look ahead to search for first non-hex digit, if it's [hH], then we treat the 216// integer as a hexadecimal, possibly with leading zeroes. 217static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) { 218 const char *FirstHex = 0; 219 const char *LookAhead = CurPtr; 220 while (1) { 221 if (isdigit(*LookAhead)) { 222 ++LookAhead; 223 } else if (isxdigit(*LookAhead)) { 224 if (!FirstHex) 225 FirstHex = LookAhead; 226 ++LookAhead; 227 } else { 228 break; 229 } 230 } 231 bool isHex = *LookAhead == 'h' || *LookAhead == 'H'; 232 CurPtr = isHex || !FirstHex ? LookAhead : FirstHex; 233 if (isHex) 234 return 16; 235 return DefaultRadix; 236} 237 238/// LexDigit: First character is [0-9]. 239/// Local Label: [0-9][:] 240/// Forward/Backward Label: [0-9][fb] 241/// Binary integer: 0b[01]+ 242/// Octal integer: 0[0-7]+ 243/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] 244/// Decimal integer: [1-9][0-9]* 245AsmToken AsmLexer::LexDigit() { 246 // Decimal integer: [1-9][0-9]* 247 if (CurPtr[-1] != '0' || CurPtr[0] == '.') { 248 unsigned Radix = doLookAhead(CurPtr, 10); 249 bool isHex = Radix == 16; 250 // Check for floating point literals. 251 if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) { 252 ++CurPtr; 253 return LexFloatLiteral(); 254 } 255 256 StringRef Result(TokStart, CurPtr - TokStart); 257 258 long long Value; 259 if (Result.getAsInteger(Radix, Value)) { 260 // Allow positive values that are too large to fit into a signed 64-bit 261 // integer, but that do fit in an unsigned one, we just convert them over. 262 unsigned long long UValue; 263 if (Result.getAsInteger(Radix, UValue)) 264 return ReturnError(TokStart, !isHex ? "invalid decimal number" : 265 "invalid hexdecimal number"); 266 Value = (long long)UValue; 267 } 268 269 // Consume the [bB][hH]. 270 if (Radix == 2 || Radix == 16) 271 ++CurPtr; 272 273 // The darwin/x86 (and x86-64) assembler accepts and ignores type 274 // suffices on integer literals. 275 SkipIgnoredIntegerSuffix(CurPtr); 276 277 return AsmToken(AsmToken::Integer, Result, Value); 278 } 279 280 if (*CurPtr == 'b') { 281 ++CurPtr; 282 // See if we actually have "0b" as part of something like "jmp 0b\n" 283 if (!isdigit(CurPtr[0])) { 284 --CurPtr; 285 StringRef Result(TokStart, CurPtr - TokStart); 286 return AsmToken(AsmToken::Integer, Result, 0); 287 } 288 const char *NumStart = CurPtr; 289 while (CurPtr[0] == '0' || CurPtr[0] == '1') 290 ++CurPtr; 291 292 // Requires at least one binary digit. 293 if (CurPtr == NumStart) 294 return ReturnError(TokStart, "invalid binary number"); 295 296 StringRef Result(TokStart, CurPtr - TokStart); 297 298 long long Value; 299 if (Result.substr(2).getAsInteger(2, Value)) 300 return ReturnError(TokStart, "invalid binary number"); 301 302 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 303 // suffixes on integer literals. 304 SkipIgnoredIntegerSuffix(CurPtr); 305 306 return AsmToken(AsmToken::Integer, Result, Value); 307 } 308 309 if (*CurPtr == 'x') { 310 ++CurPtr; 311 const char *NumStart = CurPtr; 312 while (isxdigit(CurPtr[0])) 313 ++CurPtr; 314 315 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be 316 // diagnosed by LexHexFloatLiteral). 317 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P') 318 return LexHexFloatLiteral(NumStart == CurPtr); 319 320 // Otherwise requires at least one hex digit. 321 if (CurPtr == NumStart) 322 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 323 324 unsigned long long Result; 325 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 326 return ReturnError(TokStart, "invalid hexadecimal number"); 327 328 // Consume the optional [hH]. 329 if (*CurPtr == 'h' || *CurPtr == 'H') 330 ++CurPtr; 331 332 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 333 // suffixes on integer literals. 334 SkipIgnoredIntegerSuffix(CurPtr); 335 336 return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart), 337 (int64_t)Result); 338 } 339 340 // Either octal or hexadecimal. 341 long long Value; 342 unsigned Radix = doLookAhead(CurPtr, 8); 343 bool isHex = Radix == 16; 344 StringRef Result(TokStart, CurPtr - TokStart); 345 if (Result.getAsInteger(Radix, Value)) 346 return ReturnError(TokStart, !isHex ? "invalid octal number" : 347 "invalid hexdecimal number"); 348 349 // Consume the [hH]. 350 if (Radix == 16) 351 ++CurPtr; 352 353 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 354 // suffixes on integer literals. 355 SkipIgnoredIntegerSuffix(CurPtr); 356 357 return AsmToken(AsmToken::Integer, Result, Value); 358} 359 360/// LexSingleQuote: Integer: 'b' 361AsmToken AsmLexer::LexSingleQuote() { 362 int CurChar = getNextChar(); 363 364 if (CurChar == '\\') 365 CurChar = getNextChar(); 366 367 if (CurChar == EOF) 368 return ReturnError(TokStart, "unterminated single quote"); 369 370 CurChar = getNextChar(); 371 372 if (CurChar != '\'') 373 return ReturnError(TokStart, "single quote way too long"); 374 375 // The idea here being that 'c' is basically just an integral 376 // constant. 377 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 378 long long Value; 379 380 if (Res.startswith("\'\\")) { 381 char theChar = Res[2]; 382 switch (theChar) { 383 default: Value = theChar; break; 384 case '\'': Value = '\''; break; 385 case 't': Value = '\t'; break; 386 case 'n': Value = '\n'; break; 387 case 'b': Value = '\b'; break; 388 } 389 } else 390 Value = TokStart[1]; 391 392 return AsmToken(AsmToken::Integer, Res, Value); 393} 394 395 396/// LexQuote: String: "..." 397AsmToken AsmLexer::LexQuote() { 398 int CurChar = getNextChar(); 399 // TODO: does gas allow multiline string constants? 400 while (CurChar != '"') { 401 if (CurChar == '\\') { 402 // Allow \", etc. 403 CurChar = getNextChar(); 404 } 405 406 if (CurChar == EOF) 407 return ReturnError(TokStart, "unterminated string constant"); 408 409 CurChar = getNextChar(); 410 } 411 412 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 413} 414 415StringRef AsmLexer::LexUntilEndOfStatement() { 416 TokStart = CurPtr; 417 418 while (!isAtStartOfComment(*CurPtr) && // Start of line comment. 419 !isAtStatementSeparator(CurPtr) && // End of statement marker. 420 *CurPtr != '\n' && 421 *CurPtr != '\r' && 422 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 423 ++CurPtr; 424 } 425 return StringRef(TokStart, CurPtr-TokStart); 426} 427 428StringRef AsmLexer::LexUntilEndOfLine() { 429 TokStart = CurPtr; 430 431 while (*CurPtr != '\n' && 432 *CurPtr != '\r' && 433 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 434 ++CurPtr; 435 } 436 return StringRef(TokStart, CurPtr-TokStart); 437} 438 439bool AsmLexer::isAtStartOfComment(char Char) { 440 // FIXME: This won't work for multi-character comment indicators like "//". 441 return Char == *MAI.getCommentString(); 442} 443 444bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 445 return strncmp(Ptr, MAI.getSeparatorString(), 446 strlen(MAI.getSeparatorString())) == 0; 447} 448 449AsmToken AsmLexer::LexToken() { 450 TokStart = CurPtr; 451 // This always consumes at least one character. 452 int CurChar = getNextChar(); 453 454 if (isAtStartOfComment(CurChar)) { 455 // If this comment starts with a '#', then return the Hash token and let 456 // the assembler parser see if it can be parsed as a cpp line filename 457 // comment. We do this only if we are at the start of a line. 458 if (CurChar == '#' && isAtStartOfLine) 459 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 460 isAtStartOfLine = true; 461 return LexLineComment(); 462 } 463 if (isAtStatementSeparator(TokStart)) { 464 CurPtr += strlen(MAI.getSeparatorString()) - 1; 465 return AsmToken(AsmToken::EndOfStatement, 466 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 467 } 468 469 // If we're missing a newline at EOF, make sure we still get an 470 // EndOfStatement token before the Eof token. 471 if (CurChar == EOF && !isAtStartOfLine) { 472 isAtStartOfLine = true; 473 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 474 } 475 476 isAtStartOfLine = false; 477 switch (CurChar) { 478 default: 479 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 480 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 481 return LexIdentifier(); 482 483 // Unknown character, emit an error. 484 return ReturnError(TokStart, "invalid character in input"); 485 case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 486 case 0: 487 case ' ': 488 case '\t': 489 if (SkipSpace) { 490 // Ignore whitespace. 491 return LexToken(); 492 } else { 493 int len = 1; 494 while (*CurPtr==' ' || *CurPtr=='\t') { 495 CurPtr++; 496 len++; 497 } 498 return AsmToken(AsmToken::Space, StringRef(TokStart, len)); 499 } 500 case '\n': // FALL THROUGH. 501 case '\r': 502 isAtStartOfLine = true; 503 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 504 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 505 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 506 case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 507 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 508 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 509 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 510 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 511 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 512 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 513 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 514 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 515 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 516 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 517 case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 518 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 519 case '=': 520 if (*CurPtr == '=') 521 return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 522 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 523 case '|': 524 if (*CurPtr == '|') 525 return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 526 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 527 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 528 case '&': 529 if (*CurPtr == '&') 530 return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 531 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 532 case '!': 533 if (*CurPtr == '=') 534 return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 535 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 536 case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 537 case '/': return LexSlash(); 538 case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 539 case '\'': return LexSingleQuote(); 540 case '"': return LexQuote(); 541 case '0': case '1': case '2': case '3': case '4': 542 case '5': case '6': case '7': case '8': case '9': 543 return LexDigit(); 544 case '<': 545 switch (*CurPtr) { 546 case '<': return ++CurPtr, AsmToken(AsmToken::LessLess, 547 StringRef(TokStart, 2)); 548 case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual, 549 StringRef(TokStart, 2)); 550 case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater, 551 StringRef(TokStart, 2)); 552 default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 553 } 554 case '>': 555 switch (*CurPtr) { 556 case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater, 557 StringRef(TokStart, 2)); 558 case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual, 559 StringRef(TokStart, 2)); 560 default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 561 } 562 563 // TODO: Quoted identifiers (objc methods etc) 564 // local labels: [0-9][:] 565 // Forward/backward labels: [0-9][fb] 566 // Integers, fp constants, character constants. 567 } 568} 569