// DependencyDirectivesSourceMinimizer.cpp revision 360660
1//===- DependencyDirectivesSourceMinimizer.cpp - -------------------------===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8/// 9/// \file 10/// This is the implementation for minimizing header and source files to the 11/// minimum necessary preprocessor directives for evaluating includes. It 12/// reduces the source down to #define, #include, #import, @import, and any 13/// conditional preprocessor logic that contains one of those. 14/// 15//===----------------------------------------------------------------------===// 16 17#include "clang/Lex/DependencyDirectivesSourceMinimizer.h" 18#include "clang/Basic/CharInfo.h" 19#include "clang/Basic/Diagnostic.h" 20#include "clang/Lex/LexDiagnostic.h" 21#include "llvm/ADT/StringSwitch.h" 22#include "llvm/Support/MemoryBuffer.h" 23 24using namespace llvm; 25using namespace clang; 26using namespace clang::minimize_source_to_dependency_directives; 27 28namespace { 29 30struct Minimizer { 31 /// Minimized output. 32 SmallVectorImpl<char> &Out; 33 /// The known tokens encountered during the minimization. 34 SmallVectorImpl<Token> &Tokens; 35 36 Minimizer(SmallVectorImpl<char> &Out, SmallVectorImpl<Token> &Tokens, 37 StringRef Input, DiagnosticsEngine *Diags, 38 SourceLocation InputSourceLoc) 39 : Out(Out), Tokens(Tokens), Input(Input), Diags(Diags), 40 InputSourceLoc(InputSourceLoc) {} 41 42 /// Lex the provided source and emit the minimized output. 43 /// 44 /// \returns True on error. 45 bool minimize(); 46 47private: 48 struct IdInfo { 49 const char *Last; 50 StringRef Name; 51 }; 52 53 /// Lex an identifier. 54 /// 55 /// \pre First points at a valid identifier head. 
56 LLVM_NODISCARD IdInfo lexIdentifier(const char *First, const char *const End); 57 LLVM_NODISCARD bool isNextIdentifier(StringRef Id, const char *&First, 58 const char *const End); 59 LLVM_NODISCARD bool minimizeImpl(const char *First, const char *const End); 60 LLVM_NODISCARD bool lexPPLine(const char *&First, const char *const End); 61 LLVM_NODISCARD bool lexAt(const char *&First, const char *const End); 62 LLVM_NODISCARD bool lexDefine(const char *&First, const char *const End); 63 LLVM_NODISCARD bool lexPragma(const char *&First, const char *const End); 64 LLVM_NODISCARD bool lexEndif(const char *&First, const char *const End); 65 LLVM_NODISCARD bool lexDefault(TokenKind Kind, StringRef Directive, 66 const char *&First, const char *const End); 67 Token &makeToken(TokenKind K) { 68 Tokens.emplace_back(K, Out.size()); 69 return Tokens.back(); 70 } 71 void popToken() { 72 Out.resize(Tokens.back().Offset); 73 Tokens.pop_back(); 74 } 75 TokenKind top() const { return Tokens.empty() ? pp_none : Tokens.back().K; } 76 77 Minimizer &put(char Byte) { 78 Out.push_back(Byte); 79 return *this; 80 } 81 Minimizer &append(StringRef S) { return append(S.begin(), S.end()); } 82 Minimizer &append(const char *First, const char *Last) { 83 Out.append(First, Last); 84 return *this; 85 } 86 87 void printToNewline(const char *&First, const char *const End); 88 void printAdjacentModuleNameParts(const char *&First, const char *const End); 89 LLVM_NODISCARD bool printAtImportBody(const char *&First, 90 const char *const End); 91 void printDirectiveBody(const char *&First, const char *const End); 92 void printAdjacentMacroArgs(const char *&First, const char *const End); 93 LLVM_NODISCARD bool printMacroArgs(const char *&First, const char *const End); 94 95 /// Reports a diagnostic if the diagnostic engine is provided. Always returns 96 /// true at the end. 
97 bool reportError(const char *CurPtr, unsigned Err); 98 99 StringMap<char> SplitIds; 100 StringRef Input; 101 DiagnosticsEngine *Diags; 102 SourceLocation InputSourceLoc; 103}; 104 105} // end anonymous namespace 106 107bool Minimizer::reportError(const char *CurPtr, unsigned Err) { 108 if (!Diags) 109 return true; 110 assert(CurPtr >= Input.data() && "invalid buffer ptr"); 111 Diags->Report(InputSourceLoc.getLocWithOffset(CurPtr - Input.data()), Err); 112 return true; 113} 114 115static void skipOverSpaces(const char *&First, const char *const End) { 116 while (First != End && isHorizontalWhitespace(*First)) 117 ++First; 118} 119 120LLVM_NODISCARD static bool isRawStringLiteral(const char *First, 121 const char *Current) { 122 assert(First <= Current); 123 124 // Check if we can even back up. 125 if (*Current != '"' || First == Current) 126 return false; 127 128 // Check for an "R". 129 --Current; 130 if (*Current != 'R') 131 return false; 132 if (First == Current || !isIdentifierBody(*--Current)) 133 return true; 134 135 // Check for a prefix of "u", "U", or "L". 136 if (*Current == 'u' || *Current == 'U' || *Current == 'L') 137 return First == Current || !isIdentifierBody(*--Current); 138 139 // Check for a prefix of "u8". 140 if (*Current != '8' || First == Current || *Current-- != 'u') 141 return false; 142 return First == Current || !isIdentifierBody(*--Current); 143} 144 145static void skipRawString(const char *&First, const char *const End) { 146 assert(First[0] == '"'); 147 assert(First[-1] == 'R'); 148 149 const char *Last = ++First; 150 while (Last != End && *Last != '(') 151 ++Last; 152 if (Last == End) { 153 First = Last; // Hit the end... just give up. 154 return; 155 } 156 157 StringRef Terminator(First, Last - First); 158 for (;;) { 159 // Move First to just past the next ")". 160 First = Last; 161 while (First != End && *First != ')') 162 ++First; 163 if (First == End) 164 return; 165 ++First; 166 167 // Look ahead for the terminator sequence. 
168 Last = First; 169 while (Last != End && size_t(Last - First) < Terminator.size() && 170 Terminator[Last - First] == *Last) 171 ++Last; 172 173 // Check if we hit it (or the end of the file). 174 if (Last == End) { 175 First = Last; 176 return; 177 } 178 if (size_t(Last - First) < Terminator.size()) 179 continue; 180 if (*Last != '"') 181 continue; 182 First = Last + 1; 183 return; 184 } 185} 186 187static void skipString(const char *&First, const char *const End) { 188 assert(*First == '\'' || *First == '"'); 189 const char Terminator = *First; 190 for (++First; First != End && *First != Terminator; ++First) 191 if (*First == '\\') 192 if (++First == End) 193 return; 194 if (First != End) 195 ++First; // Finish off the string. 196} 197 198static void skipNewline(const char *&First, const char *End) { 199 assert(isVerticalWhitespace(*First)); 200 ++First; 201 if (First == End) 202 return; 203 204 // Check for "\n\r" and "\r\n". 205 if (LLVM_UNLIKELY(isVerticalWhitespace(*First) && First[-1] != First[0])) 206 ++First; 207} 208 209static void skipToNewlineRaw(const char *&First, const char *const End) { 210 for (;;) { 211 if (First == End) 212 return; 213 214 if (isVerticalWhitespace(*First)) 215 return; 216 217 while (!isVerticalWhitespace(*First)) 218 if (++First == End) 219 return; 220 221 if (First[-1] != '\\') 222 return; 223 224 ++First; // Keep going... 
225 } 226} 227 228static const char *reverseOverSpaces(const char *First, const char *Last) { 229 assert(First <= Last); 230 while (First != Last && isHorizontalWhitespace(Last[-1])) 231 --Last; 232 return Last; 233} 234 235static void skipLineComment(const char *&First, const char *const End) { 236 assert(First[0] == '/' && First[1] == '/'); 237 First += 2; 238 skipToNewlineRaw(First, End); 239} 240 241static void skipBlockComment(const char *&First, const char *const End) { 242 assert(First[0] == '/' && First[1] == '*'); 243 if (End - First < 4) { 244 First = End; 245 return; 246 } 247 for (First += 3; First != End; ++First) 248 if (First[-1] == '*' && First[0] == '/') { 249 ++First; 250 return; 251 } 252} 253 254/// \returns True if the current single quotation mark character is a C++ 14 255/// digit separator. 256static bool isQuoteCppDigitSeparator(const char *const Start, 257 const char *const Cur, 258 const char *const End) { 259 assert(*Cur == '\'' && "expected quotation character"); 260 // skipLine called in places where we don't expect a valid number 261 // body before `start` on the same line, so always return false at the start. 262 if (Start == Cur) 263 return false; 264 // The previous character must be a valid PP number character. 265 // Make sure that the L, u, U, u8 prefixes don't get marked as a 266 // separator though. 267 char Prev = *(Cur - 1); 268 if (Prev == 'L' || Prev == 'U' || Prev == 'u') 269 return false; 270 if (Prev == '8' && (Cur - 1 != Start) && *(Cur - 2) == 'u') 271 return false; 272 if (!isPreprocessingNumberBody(Prev)) 273 return false; 274 // The next character should be a valid identifier body character. 
275 return (Cur + 1) < End && isIdentifierBody(*(Cur + 1)); 276} 277 278static void skipLine(const char *&First, const char *const End) { 279 do { 280 assert(First <= End); 281 if (First == End) 282 return; 283 284 if (isVerticalWhitespace(*First)) { 285 skipNewline(First, End); 286 return; 287 } 288 const char *Start = First; 289 while (First != End && !isVerticalWhitespace(*First)) { 290 // Iterate over strings correctly to avoid comments and newlines. 291 if (*First == '"' || 292 (*First == '\'' && !isQuoteCppDigitSeparator(Start, First, End))) { 293 if (isRawStringLiteral(Start, First)) 294 skipRawString(First, End); 295 else 296 skipString(First, End); 297 continue; 298 } 299 300 // Iterate over comments correctly. 301 if (*First != '/' || End - First < 2) { 302 ++First; 303 continue; 304 } 305 306 if (First[1] == '/') { 307 // "//...". 308 skipLineComment(First, End); 309 continue; 310 } 311 312 if (First[1] != '*') { 313 ++First; 314 continue; 315 } 316 317 // "/*...*/". 318 skipBlockComment(First, End); 319 } 320 if (First == End) 321 return; 322 323 // Skip over the newline. 324 assert(isVerticalWhitespace(*First)); 325 skipNewline(First, End); 326 } while (First[-2] == '\\'); // Continue past line-continuations. 327} 328 329static void skipDirective(StringRef Name, const char *&First, 330 const char *const End) { 331 if (llvm::StringSwitch<bool>(Name) 332 .Case("warning", true) 333 .Case("error", true) 334 .Default(false)) 335 // Do not process quotes or comments. 336 skipToNewlineRaw(First, End); 337 else 338 skipLine(First, End); 339} 340 341void Minimizer::printToNewline(const char *&First, const char *const End) { 342 while (First != End && !isVerticalWhitespace(*First)) { 343 const char *Last = First; 344 do { 345 // Iterate over strings correctly to avoid comments and newlines. 
346 if (*Last == '"' || *Last == '\'') { 347 if (LLVM_UNLIKELY(isRawStringLiteral(First, Last))) 348 skipRawString(Last, End); 349 else 350 skipString(Last, End); 351 continue; 352 } 353 if (*Last != '/' || End - Last < 2) { 354 ++Last; 355 continue; // Gather the rest up to print verbatim. 356 } 357 358 if (Last[1] != '/' && Last[1] != '*') { 359 ++Last; 360 continue; 361 } 362 363 // Deal with "//..." and "/*...*/". 364 append(First, reverseOverSpaces(First, Last)); 365 First = Last; 366 367 if (Last[1] == '/') { 368 skipLineComment(First, End); 369 return; 370 } 371 372 put(' '); 373 skipBlockComment(First, End); 374 skipOverSpaces(First, End); 375 Last = First; 376 } while (Last != End && !isVerticalWhitespace(*Last)); 377 378 // Print out the string. 379 if (Last == End || Last == First || Last[-1] != '\\') { 380 append(First, reverseOverSpaces(First, Last)); 381 return; 382 } 383 384 // Print up to the backslash, backing up over spaces. 385 append(First, reverseOverSpaces(First, Last - 1)); 386 387 First = Last; 388 skipNewline(First, End); 389 skipOverSpaces(First, End); 390 } 391} 392 393static void skipWhitespace(const char *&First, const char *const End) { 394 for (;;) { 395 assert(First <= End); 396 skipOverSpaces(First, End); 397 398 if (End - First < 2) 399 return; 400 401 if (First[0] == '\\' && isVerticalWhitespace(First[1])) { 402 skipNewline(++First, End); 403 continue; 404 } 405 406 // Check for a non-comment character. 407 if (First[0] != '/') 408 return; 409 410 // "// ...". 411 if (First[1] == '/') { 412 skipLineComment(First, End); 413 return; 414 } 415 416 // Cannot be a comment. 417 if (First[1] != '*') 418 return; 419 420 // "/*...*/". 421 skipBlockComment(First, End); 422 } 423} 424 425void Minimizer::printAdjacentModuleNameParts(const char *&First, 426 const char *const End) { 427 // Skip over parts of the body. 
428 const char *Last = First; 429 do 430 ++Last; 431 while (Last != End && (isIdentifierBody(*Last) || *Last == '.')); 432 append(First, Last); 433 First = Last; 434} 435 436bool Minimizer::printAtImportBody(const char *&First, const char *const End) { 437 for (;;) { 438 skipWhitespace(First, End); 439 if (First == End) 440 return true; 441 442 if (isVerticalWhitespace(*First)) { 443 skipNewline(First, End); 444 continue; 445 } 446 447 // Found a semicolon. 448 if (*First == ';') { 449 put(*First++).put('\n'); 450 return false; 451 } 452 453 // Don't handle macro expansions inside @import for now. 454 if (!isIdentifierBody(*First) && *First != '.') 455 return true; 456 457 printAdjacentModuleNameParts(First, End); 458 } 459} 460 461void Minimizer::printDirectiveBody(const char *&First, const char *const End) { 462 skipWhitespace(First, End); // Skip initial whitespace. 463 printToNewline(First, End); 464 while (Out.back() == ' ') 465 Out.pop_back(); 466 put('\n'); 467} 468 469LLVM_NODISCARD static const char *lexRawIdentifier(const char *First, 470 const char *const End) { 471 assert(isIdentifierBody(*First) && "invalid identifer"); 472 const char *Last = First + 1; 473 while (Last != End && isIdentifierBody(*Last)) 474 ++Last; 475 return Last; 476} 477 478LLVM_NODISCARD static const char * 479getIdentifierContinuation(const char *First, const char *const End) { 480 if (End - First < 3 || First[0] != '\\' || !isVerticalWhitespace(First[1])) 481 return nullptr; 482 483 ++First; 484 skipNewline(First, End); 485 if (First == End) 486 return nullptr; 487 return isIdentifierBody(First[0]) ? First : nullptr; 488} 489 490Minimizer::IdInfo Minimizer::lexIdentifier(const char *First, 491 const char *const End) { 492 const char *Last = lexRawIdentifier(First, End); 493 const char *Next = getIdentifierContinuation(Last, End); 494 if (LLVM_LIKELY(!Next)) 495 return IdInfo{Last, StringRef(First, Last - First)}; 496 497 // Slow path, where identifiers are split over lines. 
498 SmallVector<char, 64> Id(First, Last); 499 while (Next) { 500 Last = lexRawIdentifier(Next, End); 501 Id.append(Next, Last); 502 Next = getIdentifierContinuation(Last, End); 503 } 504 return IdInfo{ 505 Last, 506 SplitIds.try_emplace(StringRef(Id.begin(), Id.size()), 0).first->first()}; 507} 508 509void Minimizer::printAdjacentMacroArgs(const char *&First, 510 const char *const End) { 511 // Skip over parts of the body. 512 const char *Last = First; 513 do 514 ++Last; 515 while (Last != End && 516 (isIdentifierBody(*Last) || *Last == '.' || *Last == ',')); 517 append(First, Last); 518 First = Last; 519} 520 521bool Minimizer::printMacroArgs(const char *&First, const char *const End) { 522 assert(*First == '('); 523 put(*First++); 524 for (;;) { 525 skipWhitespace(First, End); 526 if (First == End) 527 return true; 528 529 if (*First == ')') { 530 put(*First++); 531 return false; 532 } 533 534 // This is intentionally fairly liberal. 535 if (!(isIdentifierBody(*First) || *First == '.' || *First == ',')) 536 return true; 537 538 printAdjacentMacroArgs(First, End); 539 } 540} 541 542/// Looks for an identifier starting from Last. 543/// 544/// Updates "First" to just past the next identifier, if any. Returns true iff 545/// the identifier matches "Id". 546bool Minimizer::isNextIdentifier(StringRef Id, const char *&First, 547 const char *const End) { 548 skipWhitespace(First, End); 549 if (First == End || !isIdentifierHead(*First)) 550 return false; 551 552 IdInfo FoundId = lexIdentifier(First, End); 553 First = FoundId.Last; 554 return FoundId.Name == Id; 555} 556 557bool Minimizer::lexAt(const char *&First, const char *const End) { 558 // Handle "@import". 
559 const char *ImportLoc = First++; 560 if (!isNextIdentifier("import", First, End)) { 561 skipLine(First, End); 562 return false; 563 } 564 makeToken(decl_at_import); 565 append("@import "); 566 if (printAtImportBody(First, End)) 567 return reportError( 568 ImportLoc, diag::err_dep_source_minimizer_missing_sema_after_at_import); 569 skipWhitespace(First, End); 570 if (First == End) 571 return false; 572 if (!isVerticalWhitespace(*First)) 573 return reportError( 574 ImportLoc, diag::err_dep_source_minimizer_unexpected_tokens_at_import); 575 skipNewline(First, End); 576 return false; 577} 578 579bool Minimizer::lexDefine(const char *&First, const char *const End) { 580 makeToken(pp_define); 581 append("#define "); 582 skipWhitespace(First, End); 583 584 if (!isIdentifierHead(*First)) 585 return reportError(First, diag::err_pp_macro_not_identifier); 586 587 IdInfo Id = lexIdentifier(First, End); 588 const char *Last = Id.Last; 589 append(Id.Name); 590 if (Last == End) 591 return false; 592 if (*Last == '(') { 593 size_t Size = Out.size(); 594 if (printMacroArgs(Last, End)) { 595 // Be robust to bad macro arguments, since they can show up in disabled 596 // code. 597 Out.resize(Size); 598 append("(/* invalid */\n"); 599 skipLine(Last, End); 600 return false; 601 } 602 } 603 skipWhitespace(Last, End); 604 if (Last == End) 605 return false; 606 if (!isVerticalWhitespace(*Last)) 607 put(' '); 608 printDirectiveBody(Last, End); 609 First = Last; 610 return false; 611} 612 613bool Minimizer::lexPragma(const char *&First, const char *const End) { 614 // #pragma. 615 if (!isNextIdentifier("clang", First, End)) { 616 skipLine(First, End); 617 return false; 618 } 619 620 // #pragma clang. 621 if (!isNextIdentifier("module", First, End)) { 622 skipLine(First, End); 623 return false; 624 } 625 626 // #pragma clang module. 627 if (!isNextIdentifier("import", First, End)) { 628 skipLine(First, End); 629 return false; 630 } 631 632 // #pragma clang module import. 
633 makeToken(pp_pragma_import); 634 append("#pragma clang module import "); 635 printDirectiveBody(First, End); 636 return false; 637} 638 639bool Minimizer::lexEndif(const char *&First, const char *const End) { 640 // Strip out "#else" if it's empty. 641 if (top() == pp_else) 642 popToken(); 643 644 // Strip out "#elif" if they're empty. 645 while (top() == pp_elif) 646 popToken(); 647 648 // If "#if" is empty, strip it and skip the "#endif". 649 if (top() == pp_if || top() == pp_ifdef || top() == pp_ifndef) { 650 popToken(); 651 skipLine(First, End); 652 return false; 653 } 654 655 return lexDefault(pp_endif, "endif", First, End); 656} 657 658bool Minimizer::lexDefault(TokenKind Kind, StringRef Directive, 659 const char *&First, const char *const End) { 660 makeToken(Kind); 661 put('#').append(Directive).put(' '); 662 printDirectiveBody(First, End); 663 return false; 664} 665 666bool Minimizer::lexPPLine(const char *&First, const char *const End) { 667 assert(First != End); 668 669 skipWhitespace(First, End); 670 assert(First <= End); 671 if (First == End) 672 return false; 673 674 if (*First != '#' && *First != '@') { 675 skipLine(First, End); 676 assert(First <= End); 677 return false; 678 } 679 680 // Handle "@import". 681 if (*First == '@') 682 return lexAt(First, End); 683 684 // Handle preprocessing directives. 685 ++First; // Skip over '#'. 686 skipWhitespace(First, End); 687 688 if (First == End) 689 return reportError(First, diag::err_pp_expected_eol); 690 691 if (!isIdentifierHead(*First)) { 692 skipLine(First, End); 693 return false; 694 } 695 696 // Figure out the token. 
697 IdInfo Id = lexIdentifier(First, End); 698 First = Id.Last; 699 auto Kind = llvm::StringSwitch<TokenKind>(Id.Name) 700 .Case("include", pp_include) 701 .Case("__include_macros", pp___include_macros) 702 .Case("define", pp_define) 703 .Case("undef", pp_undef) 704 .Case("import", pp_import) 705 .Case("include_next", pp_include_next) 706 .Case("if", pp_if) 707 .Case("ifdef", pp_ifdef) 708 .Case("ifndef", pp_ifndef) 709 .Case("elif", pp_elif) 710 .Case("else", pp_else) 711 .Case("endif", pp_endif) 712 .Case("pragma", pp_pragma_import) 713 .Default(pp_none); 714 if (Kind == pp_none) { 715 skipDirective(Id.Name, First, End); 716 return false; 717 } 718 719 if (Kind == pp_endif) 720 return lexEndif(First, End); 721 722 if (Kind == pp_define) 723 return lexDefine(First, End); 724 725 if (Kind == pp_pragma_import) 726 return lexPragma(First, End); 727 728 // Everything else. 729 return lexDefault(Kind, Id.Name, First, End); 730} 731 732bool Minimizer::minimizeImpl(const char *First, const char *const End) { 733 while (First != End) 734 if (lexPPLine(First, End)) 735 return true; 736 return false; 737} 738 739bool Minimizer::minimize() { 740 bool Error = minimizeImpl(Input.begin(), Input.end()); 741 742 if (!Error) { 743 // Add a trailing newline and an EOF on success. 744 if (!Out.empty() && Out.back() != '\n') 745 Out.push_back('\n'); 746 makeToken(pp_eof); 747 } 748 749 // Null-terminate the output. This way the memory buffer that's passed to 750 // Clang will not have to worry about the terminating '\0'. 751 Out.push_back(0); 752 Out.pop_back(); 753 return Error; 754} 755 756bool clang::minimizeSourceToDependencyDirectives( 757 StringRef Input, SmallVectorImpl<char> &Output, 758 SmallVectorImpl<Token> &Tokens, DiagnosticsEngine *Diags, 759 SourceLocation InputSourceLoc) { 760 Output.clear(); 761 Tokens.clear(); 762 return Minimizer(Output, Tokens, Input, Diags, InputSourceLoc).minimize(); 763} 764