Parser.cpp revision 321369
1//===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9/// 10/// \file 11/// \brief Recursive parser implementation for the matcher expression grammar. 12/// 13//===----------------------------------------------------------------------===// 14 15#include "clang/ASTMatchers/Dynamic/Parser.h" 16#include "clang/ASTMatchers/Dynamic/Registry.h" 17#include "clang/Basic/CharInfo.h" 18#include "llvm/ADT/Optional.h" 19#include "llvm/Support/ManagedStatic.h" 20#include <string> 21#include <vector> 22 23namespace clang { 24namespace ast_matchers { 25namespace dynamic { 26 27/// \brief Simple structure to hold information for one token from the parser. 28struct Parser::TokenInfo { 29 /// \brief Different possible tokens. 30 enum TokenKind { 31 TK_Eof, 32 TK_OpenParen, 33 TK_CloseParen, 34 TK_Comma, 35 TK_Period, 36 TK_Literal, 37 TK_Ident, 38 TK_InvalidChar, 39 TK_Error, 40 TK_CodeCompletion 41 }; 42 43 /// \brief Some known identifiers. 44 static const char* const ID_Bind; 45 46 TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {} 47 48 StringRef Text; 49 TokenKind Kind; 50 SourceRange Range; 51 VariantValue Value; 52}; 53 54const char* const Parser::TokenInfo::ID_Bind = "bind"; 55 56/// \brief Simple tokenizer for the parser. 57class Parser::CodeTokenizer { 58public: 59 explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error) 60 : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error), 61 CodeCompletionLocation(nullptr) { 62 NextToken = getNextToken(); 63 } 64 65 CodeTokenizer(StringRef MatcherCode, Diagnostics *Error, 66 unsigned CodeCompletionOffset) 67 : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error), 68 CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) { 69 NextToken = getNextToken(); 70 } 71 72 /// \brief Returns but doesn't consume the next token. 73 const TokenInfo &peekNextToken() const { return NextToken; } 74 75 /// \brief Consumes and returns the next token. 76 TokenInfo consumeNextToken() { 77 TokenInfo ThisToken = NextToken; 78 NextToken = getNextToken(); 79 return ThisToken; 80 } 81 82 TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; } 83 84private: 85 TokenInfo getNextToken() { 86 consumeWhitespace(); 87 TokenInfo Result; 88 Result.Range.Start = currentLocation(); 89 90 if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) { 91 Result.Kind = TokenInfo::TK_CodeCompletion; 92 Result.Text = StringRef(CodeCompletionLocation, 0); 93 CodeCompletionLocation = nullptr; 94 return Result; 95 } 96 97 if (Code.empty()) { 98 Result.Kind = TokenInfo::TK_Eof; 99 Result.Text = ""; 100 return Result; 101 } 102 103 switch (Code[0]) { 104 case ',': 105 Result.Kind = TokenInfo::TK_Comma; 106 Result.Text = Code.substr(0, 1); 107 Code = Code.drop_front(); 108 break; 109 case '.': 110 Result.Kind = TokenInfo::TK_Period; 111 Result.Text = Code.substr(0, 1); 112 Code = Code.drop_front(); 113 break; 114 case '(': 115 Result.Kind = TokenInfo::TK_OpenParen; 116 Result.Text = Code.substr(0, 1); 117 Code = Code.drop_front(); 118 break; 119 case ')': 120 Result.Kind = TokenInfo::TK_CloseParen; 121 Result.Text = Code.substr(0, 1); 122 Code = Code.drop_front(); 123 break; 124 125 case '"': 126 case '\'': 127 // Parse a string literal. 128 consumeStringLiteral(&Result); 129 break; 130 131 case '0': case '1': case '2': case '3': case '4': 132 case '5': case '6': case '7': case '8': case '9': 133 // Parse an unsigned and float literal. 134 consumeNumberLiteral(&Result); 135 break; 136 137 default: 138 if (isAlphanumeric(Code[0])) { 139 // Parse an identifier 140 size_t TokenLength = 1; 141 while (1) { 142 // A code completion location in/immediately after an identifier will 143 // cause the portion of the identifier before the code completion 144 // location to become a code completion token. 145 if (CodeCompletionLocation == Code.data() + TokenLength) { 146 CodeCompletionLocation = nullptr; 147 Result.Kind = TokenInfo::TK_CodeCompletion; 148 Result.Text = Code.substr(0, TokenLength); 149 Code = Code.drop_front(TokenLength); 150 return Result; 151 } 152 if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength])) 153 break; 154 ++TokenLength; 155 } 156 if (TokenLength == 4 && Code.startswith("true")) { 157 Result.Kind = TokenInfo::TK_Literal; 158 Result.Value = true; 159 } else if (TokenLength == 5 && Code.startswith("false")) { 160 Result.Kind = TokenInfo::TK_Literal; 161 Result.Value = false; 162 } else { 163 Result.Kind = TokenInfo::TK_Ident; 164 Result.Text = Code.substr(0, TokenLength); 165 } 166 Code = Code.drop_front(TokenLength); 167 } else { 168 Result.Kind = TokenInfo::TK_InvalidChar; 169 Result.Text = Code.substr(0, 1); 170 Code = Code.drop_front(1); 171 } 172 break; 173 } 174 175 Result.Range.End = currentLocation(); 176 return Result; 177 } 178 179 /// \brief Consume an unsigned and float literal. 180 void consumeNumberLiteral(TokenInfo *Result) { 181 bool isFloatingLiteral = false; 182 unsigned Length = 1; 183 if (Code.size() > 1) { 184 // Consume the 'x' or 'b' radix modifier, if present. 185 switch (toLowercase(Code[1])) { 186 case 'x': case 'b': Length = 2; 187 } 188 } 189 while (Length < Code.size() && isHexDigit(Code[Length])) 190 ++Length; 191 192 // Try to recognize a floating point literal. 193 while (Length < Code.size()) { 194 char c = Code[Length]; 195 if (c == '-' || c == '+' || c == '.' || isHexDigit(c)) { 196 isFloatingLiteral = true; 197 Length++; 198 } else { 199 break; 200 } 201 } 202 203 Result->Text = Code.substr(0, Length); 204 Code = Code.drop_front(Length); 205 206 if (isFloatingLiteral) { 207 char *end; 208 errno = 0; 209 std::string Text = Result->Text.str(); 210 double doubleValue = strtod(Text.c_str(), &end); 211 if (*end == 0 && errno == 0) { 212 Result->Kind = TokenInfo::TK_Literal; 213 Result->Value = doubleValue; 214 return; 215 } 216 } else { 217 unsigned Value; 218 if (!Result->Text.getAsInteger(0, Value)) { 219 Result->Kind = TokenInfo::TK_Literal; 220 Result->Value = Value; 221 return; 222 } 223 } 224 225 SourceRange Range; 226 Range.Start = Result->Range.Start; 227 Range.End = currentLocation(); 228 Error->addError(Range, Error->ET_ParserNumberError) << Result->Text; 229 Result->Kind = TokenInfo::TK_Error; 230 } 231 232 /// \brief Consume a string literal. 233 /// 234 /// \c Code must be positioned at the start of the literal (the opening 235 /// quote). Consumed until it finds the same closing quote character. 236 void consumeStringLiteral(TokenInfo *Result) { 237 bool InEscape = false; 238 const char Marker = Code[0]; 239 for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) { 240 if (InEscape) { 241 InEscape = false; 242 continue; 243 } 244 if (Code[Length] == '\\') { 245 InEscape = true; 246 continue; 247 } 248 if (Code[Length] == Marker) { 249 Result->Kind = TokenInfo::TK_Literal; 250 Result->Text = Code.substr(0, Length + 1); 251 Result->Value = Code.substr(1, Length - 1); 252 Code = Code.drop_front(Length + 1); 253 return; 254 } 255 } 256 257 StringRef ErrorText = Code; 258 Code = Code.drop_front(Code.size()); 259 SourceRange Range; 260 Range.Start = Result->Range.Start; 261 Range.End = currentLocation(); 262 Error->addError(Range, Error->ET_ParserStringError) << ErrorText; 263 Result->Kind = TokenInfo::TK_Error; 264 } 265 266 /// \brief Consume all leading whitespace from \c Code. 267 void consumeWhitespace() { 268 while (!Code.empty() && isWhitespace(Code[0])) { 269 if (Code[0] == '\n') { 270 ++Line; 271 StartOfLine = Code.drop_front(); 272 } 273 Code = Code.drop_front(); 274 } 275 } 276 277 SourceLocation currentLocation() { 278 SourceLocation Location; 279 Location.Line = Line; 280 Location.Column = Code.data() - StartOfLine.data() + 1; 281 return Location; 282 } 283 284 StringRef Code; 285 StringRef StartOfLine; 286 unsigned Line; 287 Diagnostics *Error; 288 TokenInfo NextToken; 289 const char *CodeCompletionLocation; 290}; 291 292Parser::Sema::~Sema() {} 293 294std::vector<ArgKind> Parser::Sema::getAcceptedCompletionTypes( 295 llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> Context) { 296 return std::vector<ArgKind>(); 297} 298 299std::vector<MatcherCompletion> 300Parser::Sema::getMatcherCompletions(llvm::ArrayRef<ArgKind> AcceptedTypes) { 301 return std::vector<MatcherCompletion>(); 302} 303 304struct Parser::ScopedContextEntry { 305 Parser *P; 306 307 ScopedContextEntry(Parser *P, MatcherCtor C) : P(P) { 308 P->ContextStack.push_back(std::make_pair(C, 0u)); 309 } 310 311 ~ScopedContextEntry() { 312 P->ContextStack.pop_back(); 313 } 314 315 void nextArg() { 316 ++P->ContextStack.back().second; 317 } 318}; 319 320/// \brief Parse expressions that start with an identifier. 321/// 322/// This function can parse named values and matchers. 323/// In case of failure it will try to determine the user's intent to give 324/// an appropriate error message. 325bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) { 326 const TokenInfo NameToken = Tokenizer->consumeNextToken(); 327 328 if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) { 329 // Parse as a named value. 330 if (const VariantValue NamedValue = 331 NamedValues ? NamedValues->lookup(NameToken.Text) 332 : VariantValue()) { 333 *Value = NamedValue; 334 return true; 335 } 336 // If the syntax is correct and the name is not a matcher either, report 337 // unknown named value. 338 if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma || 339 Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen || 340 Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) && 341 !S->lookupMatcherCtor(NameToken.Text)) { 342 Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound) 343 << NameToken.Text; 344 return false; 345 } 346 // Otherwise, fallback to the matcher parser. 347 } 348 349 // Parse as a matcher expression. 350 return parseMatcherExpressionImpl(NameToken, Value); 351} 352 353/// \brief Parse and validate a matcher expression. 354/// \return \c true on success, in which case \c Value has the matcher parsed. 355/// If the input is malformed, or some argument has an error, it 356/// returns \c false. 357bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken, 358 VariantValue *Value) { 359 assert(NameToken.Kind == TokenInfo::TK_Ident); 360 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 361 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 362 Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen) 363 << OpenToken.Text; 364 return false; 365 } 366 367 llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text); 368 369 if (!Ctor) { 370 Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound) 371 << NameToken.Text; 372 // Do not return here. We need to continue to give completion suggestions. 373 } 374 375 std::vector<ParserValue> Args; 376 TokenInfo EndToken; 377 378 { 379 ScopedContextEntry SCE(this, Ctor ? *Ctor : nullptr); 380 381 while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) { 382 if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) { 383 // End of args. 384 EndToken = Tokenizer->consumeNextToken(); 385 break; 386 } 387 if (Args.size() > 0) { 388 // We must find a , token to continue. 389 const TokenInfo CommaToken = Tokenizer->consumeNextToken(); 390 if (CommaToken.Kind != TokenInfo::TK_Comma) { 391 Error->addError(CommaToken.Range, Error->ET_ParserNoComma) 392 << CommaToken.Text; 393 return false; 394 } 395 } 396 397 Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error, 398 NameToken.Text, NameToken.Range, 399 Args.size() + 1); 400 ParserValue ArgValue; 401 ArgValue.Text = Tokenizer->peekNextToken().Text; 402 ArgValue.Range = Tokenizer->peekNextToken().Range; 403 if (!parseExpressionImpl(&ArgValue.Value)) { 404 return false; 405 } 406 407 Args.push_back(ArgValue); 408 SCE.nextArg(); 409 } 410 } 411 412 if (EndToken.Kind == TokenInfo::TK_Eof) { 413 Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen); 414 return false; 415 } 416 417 std::string BindID; 418 if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) { 419 // Parse .bind("foo") 420 Tokenizer->consumeNextToken(); // consume the period. 421 const TokenInfo BindToken = Tokenizer->consumeNextToken(); 422 if (BindToken.Kind == TokenInfo::TK_CodeCompletion) { 423 addCompletion(BindToken, MatcherCompletion("bind(\"", "bind", 1)); 424 return false; 425 } 426 427 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 428 const TokenInfo IDToken = Tokenizer->consumeNextToken(); 429 const TokenInfo CloseToken = Tokenizer->consumeNextToken(); 430 431 // TODO: We could use different error codes for each/some to be more 432 // explicit about the syntax error. 433 if (BindToken.Kind != TokenInfo::TK_Ident || 434 BindToken.Text != TokenInfo::ID_Bind) { 435 Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr); 436 return false; 437 } 438 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 439 Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr); 440 return false; 441 } 442 if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) { 443 Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr); 444 return false; 445 } 446 if (CloseToken.Kind != TokenInfo::TK_CloseParen) { 447 Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr); 448 return false; 449 } 450 BindID = IDToken.Value.getString(); 451 } 452 453 if (!Ctor) 454 return false; 455 456 // Merge the start and end infos. 457 Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error, 458 NameToken.Text, NameToken.Range); 459 SourceRange MatcherRange = NameToken.Range; 460 MatcherRange.End = EndToken.Range.End; 461 VariantMatcher Result = S->actOnMatcherExpression( 462 *Ctor, MatcherRange, BindID, Args, Error); 463 if (Result.isNull()) return false; 464 465 *Value = Result; 466 return true; 467} 468 469// If the prefix of this completion matches the completion token, add it to 470// Completions minus the prefix. 471void Parser::addCompletion(const TokenInfo &CompToken, 472 const MatcherCompletion& Completion) { 473 if (StringRef(Completion.TypedText).startswith(CompToken.Text) && 474 Completion.Specificity > 0) { 475 Completions.emplace_back(Completion.TypedText.substr(CompToken.Text.size()), 476 Completion.MatcherDecl, Completion.Specificity); 477 } 478} 479 480std::vector<MatcherCompletion> Parser::getNamedValueCompletions( 481 ArrayRef<ArgKind> AcceptedTypes) { 482 if (!NamedValues) return std::vector<MatcherCompletion>(); 483 std::vector<MatcherCompletion> Result; 484 for (const auto &Entry : *NamedValues) { 485 unsigned Specificity; 486 if (Entry.getValue().isConvertibleTo(AcceptedTypes, &Specificity)) { 487 std::string Decl = 488 (Entry.getValue().getTypeAsString() + " " + Entry.getKey()).str(); 489 Result.emplace_back(Entry.getKey(), Decl, Specificity); 490 } 491 } 492 return Result; 493} 494 495void Parser::addExpressionCompletions() { 496 const TokenInfo CompToken = Tokenizer->consumeNextToken(); 497 assert(CompToken.Kind == TokenInfo::TK_CodeCompletion); 498 499 // We cannot complete code if there is an invalid element on the context 500 // stack. 501 for (ContextStackTy::iterator I = ContextStack.begin(), 502 E = ContextStack.end(); 503 I != E; ++I) { 504 if (!I->first) 505 return; 506 } 507 508 auto AcceptedTypes = S->getAcceptedCompletionTypes(ContextStack); 509 for (const auto &Completion : S->getMatcherCompletions(AcceptedTypes)) { 510 addCompletion(CompToken, Completion); 511 } 512 513 for (const auto &Completion : getNamedValueCompletions(AcceptedTypes)) { 514 addCompletion(CompToken, Completion); 515 } 516} 517 518/// \brief Parse an <Expresssion> 519bool Parser::parseExpressionImpl(VariantValue *Value) { 520 switch (Tokenizer->nextTokenKind()) { 521 case TokenInfo::TK_Literal: 522 *Value = Tokenizer->consumeNextToken().Value; 523 return true; 524 525 case TokenInfo::TK_Ident: 526 return parseIdentifierPrefixImpl(Value); 527 528 case TokenInfo::TK_CodeCompletion: 529 addExpressionCompletions(); 530 return false; 531 532 case TokenInfo::TK_Eof: 533 Error->addError(Tokenizer->consumeNextToken().Range, 534 Error->ET_ParserNoCode); 535 return false; 536 537 case TokenInfo::TK_Error: 538 // This error was already reported by the tokenizer. 539 return false; 540 541 case TokenInfo::TK_OpenParen: 542 case TokenInfo::TK_CloseParen: 543 case TokenInfo::TK_Comma: 544 case TokenInfo::TK_Period: 545 case TokenInfo::TK_InvalidChar: 546 const TokenInfo Token = Tokenizer->consumeNextToken(); 547 Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text; 548 return false; 549 } 550 551 llvm_unreachable("Unknown token kind."); 552} 553 554static llvm::ManagedStatic<Parser::RegistrySema> DefaultRegistrySema; 555 556Parser::Parser(CodeTokenizer *Tokenizer, Sema *S, 557 const NamedValueMap *NamedValues, Diagnostics *Error) 558 : Tokenizer(Tokenizer), S(S ? S : &*DefaultRegistrySema), 559 NamedValues(NamedValues), Error(Error) {} 560 561Parser::RegistrySema::~RegistrySema() {} 562 563llvm::Optional<MatcherCtor> 564Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) { 565 return Registry::lookupMatcherCtor(MatcherName); 566} 567 568VariantMatcher Parser::RegistrySema::actOnMatcherExpression( 569 MatcherCtor Ctor, SourceRange NameRange, StringRef BindID, 570 ArrayRef<ParserValue> Args, Diagnostics *Error) { 571 if (BindID.empty()) { 572 return Registry::constructMatcher(Ctor, NameRange, Args, Error); 573 } else { 574 return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args, 575 Error); 576 } 577} 578 579std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes( 580 ArrayRef<std::pair<MatcherCtor, unsigned>> Context) { 581 return Registry::getAcceptedCompletionTypes(Context); 582} 583 584std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions( 585 ArrayRef<ArgKind> AcceptedTypes) { 586 return Registry::getMatcherCompletions(AcceptedTypes); 587} 588 589bool Parser::parseExpression(StringRef Code, Sema *S, 590 const NamedValueMap *NamedValues, 591 VariantValue *Value, Diagnostics *Error) { 592 CodeTokenizer Tokenizer(Code, Error); 593 if (!Parser(&Tokenizer, S, NamedValues, Error).parseExpressionImpl(Value)) 594 return false; 595 if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) { 596 Error->addError(Tokenizer.peekNextToken().Range, 597 Error->ET_ParserTrailingCode); 598 return false; 599 } 600 return true; 601} 602 603std::vector<MatcherCompletion> 604Parser::completeExpression(StringRef Code, unsigned CompletionOffset, Sema *S, 605 const NamedValueMap *NamedValues) { 606 Diagnostics Error; 607 CodeTokenizer Tokenizer(Code, &Error, CompletionOffset); 608 Parser P(&Tokenizer, S, NamedValues, &Error); 609 VariantValue Dummy; 610 P.parseExpressionImpl(&Dummy); 611 612 // Sort by specificity, then by name. 613 std::sort(P.Completions.begin(), P.Completions.end(), 614 [](const MatcherCompletion &A, const MatcherCompletion &B) { 615 if (A.Specificity != B.Specificity) 616 return A.Specificity > B.Specificity; 617 return A.TypedText < B.TypedText; 618 }); 619 620 return P.Completions; 621} 622 623llvm::Optional<DynTypedMatcher> 624Parser::parseMatcherExpression(StringRef Code, Sema *S, 625 const NamedValueMap *NamedValues, 626 Diagnostics *Error) { 627 VariantValue Value; 628 if (!parseExpression(Code, S, NamedValues, &Value, Error)) 629 return llvm::Optional<DynTypedMatcher>(); 630 if (!Value.isMatcher()) { 631 Error->addError(SourceRange(), Error->ET_ParserNotAMatcher); 632 return llvm::Optional<DynTypedMatcher>(); 633 } 634 llvm::Optional<DynTypedMatcher> Result = 635 Value.getMatcher().getSingleMatcher(); 636 if (!Result.hasValue()) { 637 Error->addError(SourceRange(), Error->ET_ParserOverloadedType) 638 << Value.getTypeAsString(); 639 } 640 return Result; 641} 642 643} // namespace dynamic 644} // namespace ast_matchers 645} // namespace clang 646