// LiteralSupport.cpp revision 223017
1//===--- LiteralSupport.cpp - Code to parse and process literals ----------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements the NumericLiteralParser, CharLiteralParser, and 11// StringLiteralParser interfaces. 12// 13//===----------------------------------------------------------------------===// 14 15#include "clang/Lex/LiteralSupport.h" 16#include "clang/Lex/Preprocessor.h" 17#include "clang/Lex/LexDiagnostic.h" 18#include "clang/Basic/TargetInfo.h" 19#include "llvm/ADT/StringRef.h" 20#include "llvm/ADT/StringExtras.h" 21using namespace clang; 22 23/// HexDigitValue - Return the value of the specified hex digit, or -1 if it's 24/// not valid. 25static int HexDigitValue(char C) { 26 if (C >= '0' && C <= '9') return C-'0'; 27 if (C >= 'a' && C <= 'f') return C-'a'+10; 28 if (C >= 'A' && C <= 'F') return C-'A'+10; 29 return -1; 30} 31 32/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in 33/// either a character or a string literal. 34static unsigned ProcessCharEscape(const char *&ThisTokBuf, 35 const char *ThisTokEnd, bool &HadError, 36 FullSourceLoc Loc, bool IsWide, 37 Diagnostic *Diags, const TargetInfo &Target) { 38 // Skip the '\' char. 39 ++ThisTokBuf; 40 41 // We know that this character can't be off the end of the buffer, because 42 // that would have been \", which would not have been the end of string. 43 unsigned ResultChar = *ThisTokBuf++; 44 switch (ResultChar) { 45 // These map to themselves. 46 case '\\': case '\'': case '"': case '?': break; 47 48 // These have fixed mappings. 
49 case 'a': 50 // TODO: K&R: the meaning of '\\a' is different in traditional C 51 ResultChar = 7; 52 break; 53 case 'b': 54 ResultChar = 8; 55 break; 56 case 'e': 57 if (Diags) 58 Diags->Report(Loc, diag::ext_nonstandard_escape) << "e"; 59 ResultChar = 27; 60 break; 61 case 'E': 62 if (Diags) 63 Diags->Report(Loc, diag::ext_nonstandard_escape) << "E"; 64 ResultChar = 27; 65 break; 66 case 'f': 67 ResultChar = 12; 68 break; 69 case 'n': 70 ResultChar = 10; 71 break; 72 case 'r': 73 ResultChar = 13; 74 break; 75 case 't': 76 ResultChar = 9; 77 break; 78 case 'v': 79 ResultChar = 11; 80 break; 81 case 'x': { // Hex escape. 82 ResultChar = 0; 83 if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) { 84 if (Diags) 85 Diags->Report(Loc, diag::err_hex_escape_no_digits); 86 HadError = 1; 87 break; 88 } 89 90 // Hex escapes are a maximal series of hex digits. 91 bool Overflow = false; 92 for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) { 93 int CharVal = HexDigitValue(ThisTokBuf[0]); 94 if (CharVal == -1) break; 95 // About to shift out a digit? 96 Overflow |= (ResultChar & 0xF0000000) ? true : false; 97 ResultChar <<= 4; 98 ResultChar |= CharVal; 99 } 100 101 // See if any bits will be truncated when evaluated as a character. 102 unsigned CharWidth = 103 IsWide ? Target.getWCharWidth() : Target.getCharWidth(); 104 105 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { 106 Overflow = true; 107 ResultChar &= ~0U >> (32-CharWidth); 108 } 109 110 // Check for overflow. 111 if (Overflow && Diags) // Too many digits to fit in 112 Diags->Report(Loc, diag::warn_hex_escape_too_large); 113 break; 114 } 115 case '0': case '1': case '2': case '3': 116 case '4': case '5': case '6': case '7': { 117 // Octal escapes. 118 --ThisTokBuf; 119 ResultChar = 0; 120 121 // Octal escapes are a series of octal digits with maximum length 3. 122 // "\0123" is a two digit sequence equal to "\012" "3". 
123 unsigned NumDigits = 0; 124 do { 125 ResultChar <<= 3; 126 ResultChar |= *ThisTokBuf++ - '0'; 127 ++NumDigits; 128 } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 && 129 ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7'); 130 131 // Check for overflow. Reject '\777', but not L'\777'. 132 unsigned CharWidth = 133 IsWide ? Target.getWCharWidth() : Target.getCharWidth(); 134 135 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { 136 if (Diags) 137 Diags->Report(Loc, diag::warn_octal_escape_too_large); 138 ResultChar &= ~0U >> (32-CharWidth); 139 } 140 break; 141 } 142 143 // Otherwise, these are not valid escapes. 144 case '(': case '{': case '[': case '%': 145 // GCC accepts these as extensions. We warn about them as such though. 146 if (Diags) 147 Diags->Report(Loc, diag::ext_nonstandard_escape) 148 << std::string()+(char)ResultChar; 149 break; 150 default: 151 if (Diags == 0) 152 break; 153 154 if (isgraph(ResultChar)) 155 Diags->Report(Loc, diag::ext_unknown_escape) 156 << std::string()+(char)ResultChar; 157 else 158 Diags->Report(Loc, diag::ext_unknown_escape) 159 << "x"+llvm::utohexstr(ResultChar); 160 break; 161 } 162 163 return ResultChar; 164} 165 166/// ProcessUCNEscape - Read the Universal Character Name, check constraints and 167/// return the UTF32. 168static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, 169 uint32_t &UcnVal, unsigned short &UcnLen, 170 FullSourceLoc Loc, Diagnostic *Diags, 171 const LangOptions &Features) { 172 if (!Features.CPlusPlus && !Features.C99 && Diags) 173 Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89); 174 175 // Save the beginning of the string (for error diagnostics). 176 const char *ThisTokBegin = ThisTokBuf; 177 178 // Skip the '\u' char's. 179 ThisTokBuf += 2; 180 181 if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) { 182 if (Diags) 183 Diags->Report(Loc, diag::err_ucn_escape_no_digits); 184 return false; 185 } 186 UcnLen = (ThisTokBuf[-1] == 'u' ? 
4 : 8); 187 unsigned short UcnLenSave = UcnLen; 188 for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) { 189 int CharVal = HexDigitValue(ThisTokBuf[0]); 190 if (CharVal == -1) break; 191 UcnVal <<= 4; 192 UcnVal |= CharVal; 193 } 194 // If we didn't consume the proper number of digits, there is a problem. 195 if (UcnLenSave) { 196 if (Diags) { 197 SourceLocation L = 198 Lexer::AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin, 199 Loc.getManager(), Features); 200 Diags->Report(FullSourceLoc(L, Loc.getManager()), 201 diag::err_ucn_escape_incomplete); 202 } 203 return false; 204 } 205 // Check UCN constraints (C99 6.4.3p2). 206 if ((UcnVal < 0xa0 && 207 (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, ` 208 || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF) 209 || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ { 210 if (Diags) 211 Diags->Report(Loc, diag::err_ucn_escape_invalid); 212 return false; 213 } 214 return true; 215} 216 217/// EncodeUCNEscape - Read the Universal Character Name, check constraints and 218/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of 219/// StringLiteralParser. When we decide to implement UCN's for identifiers, 220/// we will likely rework our support for UCN's. 221static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, 222 char *&ResultBuf, bool &HadError, 223 FullSourceLoc Loc, bool wide, Diagnostic *Diags, 224 const LangOptions &Features) { 225 typedef uint32_t UTF32; 226 UTF32 UcnVal = 0; 227 unsigned short UcnLen = 0; 228 if (!ProcessUCNEscape(ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, Loc, Diags, 229 Features)) { 230 HadError = 1; 231 return; 232 } 233 234 if (wide) { 235 (void)UcnLen; 236 assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported"); 237 238 if (!Features.ShortWChar) { 239 // Note: our internal rep of wide char tokens is always little-endian. 
240 *ResultBuf++ = (UcnVal & 0x000000FF); 241 *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8; 242 *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16; 243 *ResultBuf++ = (UcnVal & 0xFF000000) >> 24; 244 return; 245 } 246 247 // Convert to UTF16. 248 if (UcnVal < (UTF32)0xFFFF) { 249 *ResultBuf++ = (UcnVal & 0x000000FF); 250 *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8; 251 return; 252 } 253 if (Diags) Diags->Report(Loc, diag::warn_ucn_escape_too_large); 254 255 typedef uint16_t UTF16; 256 UcnVal -= 0x10000; 257 UTF16 surrogate1 = 0xD800 + (UcnVal >> 10); 258 UTF16 surrogate2 = 0xDC00 + (UcnVal & 0x3FF); 259 *ResultBuf++ = (surrogate1 & 0x000000FF); 260 *ResultBuf++ = (surrogate1 & 0x0000FF00) >> 8; 261 *ResultBuf++ = (surrogate2 & 0x000000FF); 262 *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8; 263 return; 264 } 265 // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8. 266 // The conversion below was inspired by: 267 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c 268 // First, we determine how many bytes the result will require. 269 typedef uint8_t UTF8; 270 271 unsigned short bytesToWrite = 0; 272 if (UcnVal < (UTF32)0x80) 273 bytesToWrite = 1; 274 else if (UcnVal < (UTF32)0x800) 275 bytesToWrite = 2; 276 else if (UcnVal < (UTF32)0x10000) 277 bytesToWrite = 3; 278 else 279 bytesToWrite = 4; 280 281 const unsigned byteMask = 0xBF; 282 const unsigned byteMark = 0x80; 283 284 // Once the bits are split out into bytes of UTF8, this is a mask OR-ed 285 // into the first byte, depending on how many bytes follow. 286 static const UTF8 firstByteMark[5] = { 287 0x00, 0x00, 0xC0, 0xE0, 0xF0 288 }; 289 // Finally, we write the bytes into ResultBuf. 290 ResultBuf += bytesToWrite; 291 switch (bytesToWrite) { // note: everything falls through. 
292 case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; 293 case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; 294 case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; 295 case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]); 296 } 297 // Update the buffer. 298 ResultBuf += bytesToWrite; 299} 300 301 302/// integer-constant: [C99 6.4.4.1] 303/// decimal-constant integer-suffix 304/// octal-constant integer-suffix 305/// hexadecimal-constant integer-suffix 306/// decimal-constant: 307/// nonzero-digit 308/// decimal-constant digit 309/// octal-constant: 310/// 0 311/// octal-constant octal-digit 312/// hexadecimal-constant: 313/// hexadecimal-prefix hexadecimal-digit 314/// hexadecimal-constant hexadecimal-digit 315/// hexadecimal-prefix: one of 316/// 0x 0X 317/// integer-suffix: 318/// unsigned-suffix [long-suffix] 319/// unsigned-suffix [long-long-suffix] 320/// long-suffix [unsigned-suffix] 321/// long-long-suffix [unsigned-sufix] 322/// nonzero-digit: 323/// 1 2 3 4 5 6 7 8 9 324/// octal-digit: 325/// 0 1 2 3 4 5 6 7 326/// hexadecimal-digit: 327/// 0 1 2 3 4 5 6 7 8 9 328/// a b c d e f 329/// A B C D E F 330/// unsigned-suffix: one of 331/// u U 332/// long-suffix: one of 333/// l L 334/// long-long-suffix: one of 335/// ll LL 336/// 337/// floating-constant: [C99 6.4.4.2] 338/// TODO: add rules... 339/// 340NumericLiteralParser:: 341NumericLiteralParser(const char *begin, const char *end, 342 SourceLocation TokLoc, Preprocessor &pp) 343 : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) { 344 345 // This routine assumes that the range begin/end matches the regex for integer 346 // and FP constants (specifically, the 'pp-number' regex), and assumes that 347 // the byte at "*end" is both valid and not part of the regex. Because of 348 // this, it doesn't have to check for 'overscan' in various places. 349 assert(!isalnum(*end) && *end != '.' 
&& *end != '_' && 350 "Lexer didn't maximally munch?"); 351 352 s = DigitsBegin = begin; 353 saw_exponent = false; 354 saw_period = false; 355 isLong = false; 356 isUnsigned = false; 357 isLongLong = false; 358 isFloat = false; 359 isImaginary = false; 360 isMicrosoftInteger = false; 361 hadError = false; 362 363 if (*s == '0') { // parse radix 364 ParseNumberStartingWithZero(TokLoc); 365 if (hadError) 366 return; 367 } else { // the first digit is non-zero 368 radix = 10; 369 s = SkipDigits(s); 370 if (s == ThisTokEnd) { 371 // Done. 372 } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) { 373 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), 374 diag::err_invalid_decimal_digit) << llvm::StringRef(s, 1); 375 hadError = true; 376 return; 377 } else if (*s == '.') { 378 s++; 379 saw_period = true; 380 s = SkipDigits(s); 381 } 382 if ((*s == 'e' || *s == 'E')) { // exponent 383 const char *Exponent = s; 384 s++; 385 saw_exponent = true; 386 if (*s == '+' || *s == '-') s++; // sign 387 const char *first_non_digit = SkipDigits(s); 388 if (first_non_digit != s) { 389 s = first_non_digit; 390 } else { 391 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin), 392 diag::err_exponent_has_no_digits); 393 hadError = true; 394 return; 395 } 396 } 397 } 398 399 SuffixBegin = s; 400 401 // Parse the suffix. At this point we can classify whether we have an FP or 402 // integer constant. 403 bool isFPConstant = isFloatingLiteral(); 404 405 // Loop over all of the characters of the suffix. If we see something bad, 406 // we break out of the loop. 407 for (; s != ThisTokEnd; ++s) { 408 switch (*s) { 409 case 'f': // FP Suffix for "float" 410 case 'F': 411 if (!isFPConstant) break; // Error for integer constant. 412 if (isFloat || isLong) break; // FF, LF invalid. 413 isFloat = true; 414 continue; // Success. 415 case 'u': 416 case 'U': 417 if (isFPConstant) break; // Error for floating constant. 418 if (isUnsigned) break; // Cannot be repeated. 
419 isUnsigned = true; 420 continue; // Success. 421 case 'l': 422 case 'L': 423 if (isLong || isLongLong) break; // Cannot be repeated. 424 if (isFloat) break; // LF invalid. 425 426 // Check for long long. The L's need to be adjacent and the same case. 427 if (s+1 != ThisTokEnd && s[1] == s[0]) { 428 if (isFPConstant) break; // long long invalid for floats. 429 isLongLong = true; 430 ++s; // Eat both of them. 431 } else { 432 isLong = true; 433 } 434 continue; // Success. 435 case 'i': 436 case 'I': 437 if (PP.getLangOptions().Microsoft) { 438 if (isFPConstant || isLong || isLongLong) break; 439 440 // Allow i8, i16, i32, i64, and i128. 441 if (s + 1 != ThisTokEnd) { 442 switch (s[1]) { 443 case '8': 444 s += 2; // i8 suffix 445 isMicrosoftInteger = true; 446 break; 447 case '1': 448 if (s + 2 == ThisTokEnd) break; 449 if (s[2] == '6') { 450 s += 3; // i16 suffix 451 isMicrosoftInteger = true; 452 } 453 else if (s[2] == '2') { 454 if (s + 3 == ThisTokEnd) break; 455 if (s[3] == '8') { 456 s += 4; // i128 suffix 457 isMicrosoftInteger = true; 458 } 459 } 460 break; 461 case '3': 462 if (s + 2 == ThisTokEnd) break; 463 if (s[2] == '2') { 464 s += 3; // i32 suffix 465 isLong = true; 466 isMicrosoftInteger = true; 467 } 468 break; 469 case '6': 470 if (s + 2 == ThisTokEnd) break; 471 if (s[2] == '4') { 472 s += 3; // i64 suffix 473 isLongLong = true; 474 isMicrosoftInteger = true; 475 } 476 break; 477 default: 478 break; 479 } 480 break; 481 } 482 } 483 // fall through. 484 case 'j': 485 case 'J': 486 if (isImaginary) break; // Cannot be repeated. 487 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), 488 diag::ext_imaginary_constant); 489 isImaginary = true; 490 continue; // Success. 491 } 492 // If we reached here, there was an error. 493 break; 494 } 495 496 // Report an error if there are any. 497 if (s != ThisTokEnd) { 498 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), 499 isFPConstant ? 
diag::err_invalid_suffix_float_constant : 500 diag::err_invalid_suffix_integer_constant) 501 << llvm::StringRef(SuffixBegin, ThisTokEnd-SuffixBegin); 502 hadError = true; 503 return; 504 } 505} 506 507/// ParseNumberStartingWithZero - This method is called when the first character 508/// of the number is found to be a zero. This means it is either an octal 509/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or 510/// a floating point number (01239.123e4). Eat the prefix, determining the 511/// radix etc. 512void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { 513 assert(s[0] == '0' && "Invalid method call"); 514 s++; 515 516 // Handle a hex number like 0x1234. 517 if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) { 518 s++; 519 radix = 16; 520 DigitsBegin = s; 521 s = SkipHexDigits(s); 522 if (s == ThisTokEnd) { 523 // Done. 524 } else if (*s == '.') { 525 s++; 526 saw_period = true; 527 s = SkipHexDigits(s); 528 } 529 // A binary exponent can appear with or with a '.'. If dotted, the 530 // binary exponent is required. 531 if ((*s == 'p' || *s == 'P') && !PP.getLangOptions().CPlusPlus0x) { 532 const char *Exponent = s; 533 s++; 534 saw_exponent = true; 535 if (*s == '+' || *s == '-') s++; // sign 536 const char *first_non_digit = SkipDigits(s); 537 if (first_non_digit == s) { 538 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin), 539 diag::err_exponent_has_no_digits); 540 hadError = true; 541 return; 542 } 543 s = first_non_digit; 544 545 // In C++0x, we cannot support hexadecmial floating literals because 546 // they conflict with user-defined literals, so we warn in previous 547 // versions of C++ by default. 
548 if (PP.getLangOptions().CPlusPlus) 549 PP.Diag(TokLoc, diag::ext_hexconstant_cplusplus); 550 else if (!PP.getLangOptions().HexFloats) 551 PP.Diag(TokLoc, diag::ext_hexconstant_invalid); 552 } else if (saw_period) { 553 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 554 diag::err_hexconstant_requires_exponent); 555 hadError = true; 556 } 557 return; 558 } 559 560 // Handle simple binary numbers 0b01010 561 if (*s == 'b' || *s == 'B') { 562 // 0b101010 is a GCC extension. 563 PP.Diag(TokLoc, diag::ext_binary_literal); 564 ++s; 565 radix = 2; 566 DigitsBegin = s; 567 s = SkipBinaryDigits(s); 568 if (s == ThisTokEnd) { 569 // Done. 570 } else if (isxdigit(*s)) { 571 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 572 diag::err_invalid_binary_digit) << llvm::StringRef(s, 1); 573 hadError = true; 574 } 575 // Other suffixes will be diagnosed by the caller. 576 return; 577 } 578 579 // For now, the radix is set to 8. If we discover that we have a 580 // floating point constant, the radix will change to 10. Octal floating 581 // point constants are not permitted (only decimal and hexadecimal). 582 radix = 8; 583 DigitsBegin = s; 584 s = SkipOctalDigits(s); 585 if (s == ThisTokEnd) 586 return; // Done, simple octal number like 01234 587 588 // If we have some other non-octal digit that *is* a decimal digit, see if 589 // this is part of a floating point number like 094.123 or 09e1. 590 if (isdigit(*s)) { 591 const char *EndDecimal = SkipDigits(s); 592 if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') { 593 s = EndDecimal; 594 radix = 10; 595 } 596 } 597 598 // If we have a hex digit other than 'e' (which denotes a FP exponent) then 599 // the code is using an incorrect base. 
600 if (isxdigit(*s) && *s != 'e' && *s != 'E') { 601 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 602 diag::err_invalid_octal_digit) << llvm::StringRef(s, 1); 603 hadError = true; 604 return; 605 } 606 607 if (*s == '.') { 608 s++; 609 radix = 10; 610 saw_period = true; 611 s = SkipDigits(s); // Skip suffix. 612 } 613 if (*s == 'e' || *s == 'E') { // exponent 614 const char *Exponent = s; 615 s++; 616 radix = 10; 617 saw_exponent = true; 618 if (*s == '+' || *s == '-') s++; // sign 619 const char *first_non_digit = SkipDigits(s); 620 if (first_non_digit != s) { 621 s = first_non_digit; 622 } else { 623 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin), 624 diag::err_exponent_has_no_digits); 625 hadError = true; 626 return; 627 } 628 } 629} 630 631 632/// GetIntegerValue - Convert this numeric literal value to an APInt that 633/// matches Val's input width. If there is an overflow, set Val to the low bits 634/// of the result and return true. Otherwise, return false. 635bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) { 636 // Fast path: Compute a conservative bound on the maximum number of 637 // bits per digit in this radix. If we can't possibly overflow a 638 // uint64 based on that bound then do the simple conversion to 639 // integer. This avoids the expensive overflow checking below, and 640 // handles the common cases that matter (small decimal integers and 641 // hex/octal values which don't overflow). 642 unsigned MaxBitsPerDigit = 1; 643 while ((1U << MaxBitsPerDigit) < radix) 644 MaxBitsPerDigit += 1; 645 if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) { 646 uint64_t N = 0; 647 for (s = DigitsBegin; s != SuffixBegin; ++s) 648 N = N*radix + HexDigitValue(*s); 649 650 // This will truncate the value to Val's input width. Simply check 651 // for overflow by comparing. 
652 Val = N; 653 return Val.getZExtValue() != N; 654 } 655 656 Val = 0; 657 s = DigitsBegin; 658 659 llvm::APInt RadixVal(Val.getBitWidth(), radix); 660 llvm::APInt CharVal(Val.getBitWidth(), 0); 661 llvm::APInt OldVal = Val; 662 663 bool OverflowOccurred = false; 664 while (s < SuffixBegin) { 665 unsigned C = HexDigitValue(*s++); 666 667 // If this letter is out of bound for this radix, reject it. 668 assert(C < radix && "NumericLiteralParser ctor should have rejected this"); 669 670 CharVal = C; 671 672 // Add the digit to the value in the appropriate radix. If adding in digits 673 // made the value smaller, then this overflowed. 674 OldVal = Val; 675 676 // Multiply by radix, did overflow occur on the multiply? 677 Val *= RadixVal; 678 OverflowOccurred |= Val.udiv(RadixVal) != OldVal; 679 680 // Add value, did overflow occur on the value? 681 // (a + b) ult b <=> overflow 682 Val += CharVal; 683 OverflowOccurred |= Val.ult(CharVal); 684 } 685 return OverflowOccurred; 686} 687 688llvm::APFloat::opStatus 689NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) { 690 using llvm::APFloat; 691 using llvm::StringRef; 692 693 unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin); 694 return Result.convertFromString(StringRef(ThisTokBegin, n), 695 APFloat::rmNearestTiesToEven); 696} 697 698 699CharLiteralParser::CharLiteralParser(const char *begin, const char *end, 700 SourceLocation Loc, Preprocessor &PP) { 701 // At this point we know that the character matches the regex "L?'.*'". 702 HadError = false; 703 704 // Determine if this is a wide character. 705 IsWide = begin[0] == 'L'; 706 if (IsWide) ++begin; 707 708 // Skip over the entry quote. 709 assert(begin[0] == '\'' && "Invalid token lexed"); 710 ++begin; 711 712 // FIXME: The "Value" is an uint64_t so we can handle char literals of 713 // up to 64-bits. 714 // FIXME: This extensively assumes that 'char' is 8-bits. 
715 assert(PP.getTargetInfo().getCharWidth() == 8 && 716 "Assumes char is 8 bits"); 717 assert(PP.getTargetInfo().getIntWidth() <= 64 && 718 (PP.getTargetInfo().getIntWidth() & 7) == 0 && 719 "Assumes sizeof(int) on target is <= 64 and a multiple of char"); 720 assert(PP.getTargetInfo().getWCharWidth() <= 64 && 721 "Assumes sizeof(wchar) on target is <= 64"); 722 723 // This is what we will use for overflow detection 724 llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0); 725 726 unsigned NumCharsSoFar = 0; 727 bool Warned = false; 728 while (begin[0] != '\'') { 729 uint64_t ResultChar; 730 731 // Is this a Universal Character Name escape? 732 if (begin[0] != '\\') // If this is a normal character, consume it. 733 ResultChar = *begin++; 734 else { // Otherwise, this is an escape character. 735 // Check for UCN. 736 if (begin[1] == 'u' || begin[1] == 'U') { 737 uint32_t utf32 = 0; 738 unsigned short UcnLen = 0; 739 if (!ProcessUCNEscape(begin, end, utf32, UcnLen, 740 FullSourceLoc(Loc, PP.getSourceManager()), 741 &PP.getDiagnostics(), PP.getLangOptions())) { 742 HadError = 1; 743 } 744 ResultChar = utf32; 745 } else { 746 // Otherwise, this is a non-UCN escape character. Process it. 747 ResultChar = ProcessCharEscape(begin, end, HadError, 748 FullSourceLoc(Loc,PP.getSourceManager()), 749 IsWide, 750 &PP.getDiagnostics(), PP.getTargetInfo()); 751 } 752 } 753 754 // If this is a multi-character constant (e.g. 'abc'), handle it. These are 755 // implementation defined (C99 6.4.4.4p10). 756 if (NumCharsSoFar) { 757 if (IsWide) { 758 // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'. 759 LitVal = 0; 760 } else { 761 // Narrow character literals act as though their value is concatenated 762 // in this implementation, but warn on overflow. 
763 if (LitVal.countLeadingZeros() < 8 && !Warned) { 764 PP.Diag(Loc, diag::warn_char_constant_too_large); 765 Warned = true; 766 } 767 LitVal <<= 8; 768 } 769 } 770 771 LitVal = LitVal + ResultChar; 772 ++NumCharsSoFar; 773 } 774 775 // If this is the second character being processed, do special handling. 776 if (NumCharsSoFar > 1) { 777 // Warn about discarding the top bits for multi-char wide-character 778 // constants (L'abcd'). 779 if (IsWide) 780 PP.Diag(Loc, diag::warn_extraneous_wide_char_constant); 781 else if (NumCharsSoFar != 4) 782 PP.Diag(Loc, diag::ext_multichar_character_literal); 783 else 784 PP.Diag(Loc, diag::ext_four_char_character_literal); 785 IsMultiChar = true; 786 } else 787 IsMultiChar = false; 788 789 // Transfer the value from APInt to uint64_t 790 Value = LitVal.getZExtValue(); 791 792 if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF) 793 PP.Diag(Loc, diag::warn_ucn_escape_too_large); 794 795 // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1") 796 // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple 797 // character constants are not sign extended in the this implementation: 798 // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC. 799 if (!IsWide && NumCharsSoFar == 1 && (Value & 128) && 800 PP.getLangOptions().CharIsSigned) 801 Value = (signed char)Value; 802} 803 804 805/// string-literal: [C99 6.4.5] 806/// " [s-char-sequence] " 807/// L" [s-char-sequence] " 808/// s-char-sequence: 809/// s-char 810/// s-char-sequence s-char 811/// s-char: 812/// any source character except the double quote ", 813/// backslash \, or newline character 814/// escape-character 815/// universal-character-name 816/// escape-character: [C99 6.4.4.4] 817/// \ escape-code 818/// universal-character-name 819/// escape-code: 820/// character-escape-code 821/// octal-escape-code 822/// hex-escape-code 823/// character-escape-code: one of 824/// n t b r f v a 825/// \ ' " ? 
826/// octal-escape-code: 827/// octal-digit 828/// octal-digit octal-digit 829/// octal-digit octal-digit octal-digit 830/// hex-escape-code: 831/// x hex-digit 832/// hex-escape-code hex-digit 833/// universal-character-name: 834/// \u hex-quad 835/// \U hex-quad hex-quad 836/// hex-quad: 837/// hex-digit hex-digit hex-digit hex-digit 838/// 839StringLiteralParser:: 840StringLiteralParser(const Token *StringToks, unsigned NumStringToks, 841 Preprocessor &PP, bool Complain) 842 : SM(PP.getSourceManager()), Features(PP.getLangOptions()), 843 Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0), 844 MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0), 845 ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) { 846 init(StringToks, NumStringToks); 847} 848 849void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ 850 // The literal token may have come from an invalid source location (e.g. due 851 // to a PCH error), in which case the token length will be 0. 852 if (NumStringToks == 0 || StringToks[0].getLength() < 2) { 853 hadError = true; 854 return; 855 } 856 857 // Scan all of the string portions, remember the max individual token length, 858 // computing a bound on the concatenated string length, and see whether any 859 // piece is a wide-string. If any of the string portions is a wide-string 860 // literal, the result is a wide-string literal [C99 6.4.5p4]. 861 assert(NumStringToks && "expected at least one token"); 862 MaxTokenLength = StringToks[0].getLength(); 863 assert(StringToks[0].getLength() >= 2 && "literal token is invalid!"); 864 SizeBound = StringToks[0].getLength()-2; // -2 for "". 865 AnyWide = StringToks[0].is(tok::wide_string_literal); 866 867 hadError = false; 868 869 // Implement Translation Phase #6: concatenation of string literals 870 /// (C99 5.1.1.2p1). The common case is only one string fragment. 
871 for (unsigned i = 1; i != NumStringToks; ++i) { 872 if (StringToks[i].getLength() < 2) { 873 hadError = true; 874 return; 875 } 876 877 // The string could be shorter than this if it needs cleaning, but this is a 878 // reasonable bound, which is all we need. 879 assert(StringToks[i].getLength() >= 2 && "literal token is invalid!"); 880 SizeBound += StringToks[i].getLength()-2; // -2 for "". 881 882 // Remember maximum string piece length. 883 if (StringToks[i].getLength() > MaxTokenLength) 884 MaxTokenLength = StringToks[i].getLength(); 885 886 // Remember if we see any wide strings. 887 AnyWide |= StringToks[i].is(tok::wide_string_literal); 888 } 889 890 // Include space for the null terminator. 891 ++SizeBound; 892 893 // TODO: K&R warning: "traditional C rejects string constant concatenation" 894 895 // Get the width in bytes of wchar_t. If no wchar_t strings are used, do not 896 // query the target. As such, wchar_tByteWidth is only valid if AnyWide=true. 897 wchar_tByteWidth = ~0U; 898 if (AnyWide) { 899 wchar_tByteWidth = Target.getWCharWidth(); 900 assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!"); 901 wchar_tByteWidth /= 8; 902 } 903 904 // The output buffer size needs to be large enough to hold wide characters. 905 // This is a worst-case assumption which basically corresponds to L"" "long". 906 if (AnyWide) 907 SizeBound *= wchar_tByteWidth; 908 909 // Size the temporary buffer to hold the result string data. 910 ResultBuf.resize(SizeBound); 911 912 // Likewise, but for each string piece. 913 llvm::SmallString<512> TokenBuf; 914 TokenBuf.resize(MaxTokenLength); 915 916 // Loop over all the strings, getting their spelling, and expanding them to 917 // wide strings as appropriate. 918 ResultPtr = &ResultBuf[0]; // Next byte to fill in. 
919 920 Pascal = false; 921 922 for (unsigned i = 0, e = NumStringToks; i != e; ++i) { 923 const char *ThisTokBuf = &TokenBuf[0]; 924 // Get the spelling of the token, which eliminates trigraphs, etc. We know 925 // that ThisTokBuf points to a buffer that is big enough for the whole token 926 // and 'spelled' tokens can only shrink. 927 bool StringInvalid = false; 928 unsigned ThisTokLen = 929 Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features, 930 &StringInvalid); 931 if (StringInvalid) { 932 hadError = 1; 933 continue; 934 } 935 936 const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote. 937 bool wide = false; 938 // TODO: Input character set mapping support. 939 940 // Skip L marker for wide strings. 941 if (ThisTokBuf[0] == 'L') { 942 wide = true; 943 ++ThisTokBuf; 944 } 945 946 assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?"); 947 ++ThisTokBuf; 948 949 // Check if this is a pascal string 950 if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd && 951 ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') { 952 953 // If the \p sequence is found in the first token, we have a pascal string 954 // Otherwise, if we already have a pascal string, ignore the first \p 955 if (i == 0) { 956 ++ThisTokBuf; 957 Pascal = true; 958 } else if (Pascal) 959 ThisTokBuf += 2; 960 } 961 962 while (ThisTokBuf != ThisTokEnd) { 963 // Is this a span of non-escape characters? 964 if (ThisTokBuf[0] != '\\') { 965 const char *InStart = ThisTokBuf; 966 do { 967 ++ThisTokBuf; 968 } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); 969 970 // Copy the character span over. 971 unsigned Len = ThisTokBuf-InStart; 972 if (!AnyWide) { 973 memcpy(ResultPtr, InStart, Len); 974 ResultPtr += Len; 975 } else { 976 // Note: our internal rep of wide char tokens is always little-endian. 977 for (; Len; --Len, ++InStart) { 978 *ResultPtr++ = InStart[0]; 979 // Add zeros at the end. 
980 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) 981 *ResultPtr++ = 0; 982 } 983 } 984 continue; 985 } 986 // Is this a Universal Character Name escape? 987 if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') { 988 EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, 989 hadError, FullSourceLoc(StringToks[i].getLocation(),SM), 990 wide, Diags, Features); 991 continue; 992 } 993 // Otherwise, this is a non-UCN escape character. Process it. 994 unsigned ResultChar = 995 ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError, 996 FullSourceLoc(StringToks[i].getLocation(), SM), 997 AnyWide, Diags, Target); 998 999 // Note: our internal rep of wide char tokens is always little-endian. 1000 *ResultPtr++ = ResultChar & 0xFF; 1001 1002 if (AnyWide) { 1003 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) 1004 *ResultPtr++ = ResultChar >> i*8; 1005 } 1006 } 1007 } 1008 1009 if (Pascal) { 1010 ResultBuf[0] = ResultPtr-&ResultBuf[0]-1; 1011 if (AnyWide) 1012 ResultBuf[0] /= wchar_tByteWidth; 1013 1014 // Verify that pascal strings aren't too large. 1015 if (GetStringLength() > 256) { 1016 if (Diags) 1017 Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM), 1018 diag::err_pascal_string_too_long) 1019 << SourceRange(StringToks[0].getLocation(), 1020 StringToks[NumStringToks-1].getLocation()); 1021 hadError = 1; 1022 return; 1023 } 1024 } else if (Diags) { 1025 // Complain if this string literal has too many characters. 1026 unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509; 1027 1028 if (GetNumStringChars() > MaxChars) 1029 Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM), 1030 diag::ext_string_too_long) 1031 << GetNumStringChars() << MaxChars 1032 << (Features.CPlusPlus ? 2 : Features.C99 ? 
1 : 0) 1033 << SourceRange(StringToks[0].getLocation(), 1034 StringToks[NumStringToks-1].getLocation()); 1035 } 1036} 1037 1038 1039/// getOffsetOfStringByte - This function returns the offset of the 1040/// specified byte of the string data represented by Token. This handles 1041/// advancing over escape sequences in the string. 1042unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, 1043 unsigned ByteNo) const { 1044 // Get the spelling of the token. 1045 llvm::SmallString<32> SpellingBuffer; 1046 SpellingBuffer.resize(Tok.getLength()); 1047 1048 bool StringInvalid = false; 1049 const char *SpellingPtr = &SpellingBuffer[0]; 1050 unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features, 1051 &StringInvalid); 1052 if (StringInvalid) 1053 return 0; 1054 1055 assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet"); 1056 1057 1058 const char *SpellingStart = SpellingPtr; 1059 const char *SpellingEnd = SpellingPtr+TokLen; 1060 1061 // Skip over the leading quote. 1062 assert(SpellingPtr[0] == '"' && "Should be a string literal!"); 1063 ++SpellingPtr; 1064 1065 // Skip over bytes until we find the offset we're looking for. 1066 while (ByteNo) { 1067 assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!"); 1068 1069 // Step over non-escapes simply. 1070 if (*SpellingPtr != '\\') { 1071 ++SpellingPtr; 1072 --ByteNo; 1073 continue; 1074 } 1075 1076 // Otherwise, this is an escape character. Advance over it. 1077 bool HadError = false; 1078 ProcessCharEscape(SpellingPtr, SpellingEnd, HadError, 1079 FullSourceLoc(Tok.getLocation(), SM), 1080 false, Diags, Target); 1081 assert(!HadError && "This method isn't valid on erroneous strings"); 1082 --ByteNo; 1083 } 1084 1085 return SpellingPtr-SpellingStart; 1086} 1087