Parser.cpp revision 259701
1259701Sdim//===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===// 2259701Sdim// 3259701Sdim// The LLVM Compiler Infrastructure 4259701Sdim// 5259701Sdim// This file is distributed under the University of Illinois Open Source 6259701Sdim// License. See LICENSE.TXT for details. 7259701Sdim// 8259701Sdim//===----------------------------------------------------------------------===// 9259701Sdim/// 10259701Sdim/// \file 11259701Sdim/// \brief Recursive parser implementation for the matcher expression grammar. 12259701Sdim/// 13259701Sdim//===----------------------------------------------------------------------===// 14259701Sdim 15259701Sdim#include <string> 16259701Sdim#include <vector> 17259701Sdim 18259701Sdim#include "clang/ASTMatchers/Dynamic/Parser.h" 19259701Sdim#include "clang/ASTMatchers/Dynamic/Registry.h" 20259701Sdim#include "clang/Basic/CharInfo.h" 21259701Sdim#include "llvm/ADT/Twine.h" 22259701Sdim 23259701Sdimnamespace clang { 24259701Sdimnamespace ast_matchers { 25259701Sdimnamespace dynamic { 26259701Sdim 27259701Sdim/// \brief Simple structure to hold information for one token from the parser. 28259701Sdimstruct Parser::TokenInfo { 29259701Sdim /// \brief Different possible tokens. 30259701Sdim enum TokenKind { 31259701Sdim TK_Eof = 0, 32259701Sdim TK_OpenParen = 1, 33259701Sdim TK_CloseParen = 2, 34259701Sdim TK_Comma = 3, 35259701Sdim TK_Period = 4, 36259701Sdim TK_Literal = 5, 37259701Sdim TK_Ident = 6, 38259701Sdim TK_InvalidChar = 7, 39259701Sdim TK_Error = 8 40259701Sdim }; 41259701Sdim 42259701Sdim /// \brief Some known identifiers. 43259701Sdim static const char* const ID_Bind; 44259701Sdim 45259701Sdim TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {} 46259701Sdim 47259701Sdim StringRef Text; 48259701Sdim TokenKind Kind; 49259701Sdim SourceRange Range; 50259701Sdim VariantValue Value; 51259701Sdim}; 52259701Sdim 53259701Sdimconst char* const Parser::TokenInfo::ID_Bind = "bind"; 54259701Sdim 55259701Sdim/// \brief Simple tokenizer for the parser. 56259701Sdimclass Parser::CodeTokenizer { 57259701Sdimpublic: 58259701Sdim explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error) 59259701Sdim : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) { 60259701Sdim NextToken = getNextToken(); 61259701Sdim } 62259701Sdim 63259701Sdim /// \brief Returns but doesn't consume the next token. 64259701Sdim const TokenInfo &peekNextToken() const { return NextToken; } 65259701Sdim 66259701Sdim /// \brief Consumes and returns the next token. 67259701Sdim TokenInfo consumeNextToken() { 68259701Sdim TokenInfo ThisToken = NextToken; 69259701Sdim NextToken = getNextToken(); 70259701Sdim return ThisToken; 71259701Sdim } 72259701Sdim 73259701Sdim TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; } 74259701Sdim 75259701Sdimprivate: 76259701Sdim TokenInfo getNextToken() { 77259701Sdim consumeWhitespace(); 78259701Sdim TokenInfo Result; 79259701Sdim Result.Range.Start = currentLocation(); 80259701Sdim 81259701Sdim if (Code.empty()) { 82259701Sdim Result.Kind = TokenInfo::TK_Eof; 83259701Sdim Result.Text = ""; 84259701Sdim return Result; 85259701Sdim } 86259701Sdim 87259701Sdim switch (Code[0]) { 88259701Sdim case ',': 89259701Sdim Result.Kind = TokenInfo::TK_Comma; 90259701Sdim Result.Text = Code.substr(0, 1); 91259701Sdim Code = Code.drop_front(); 92259701Sdim break; 93259701Sdim case '.': 94259701Sdim Result.Kind = TokenInfo::TK_Period; 95259701Sdim Result.Text = Code.substr(0, 1); 96259701Sdim Code = Code.drop_front(); 97259701Sdim break; 98259701Sdim case '(': 99259701Sdim Result.Kind = TokenInfo::TK_OpenParen; 100259701Sdim Result.Text = Code.substr(0, 1); 101259701Sdim Code = Code.drop_front(); 102259701Sdim break; 103259701Sdim case ')': 104259701Sdim Result.Kind = TokenInfo::TK_CloseParen; 105259701Sdim Result.Text = Code.substr(0, 1); 106259701Sdim Code = Code.drop_front(); 107259701Sdim break; 108259701Sdim 109259701Sdim case '"': 110259701Sdim case '\'': 111259701Sdim // Parse a string literal. 112259701Sdim consumeStringLiteral(&Result); 113259701Sdim break; 114259701Sdim 115259701Sdim case '0': case '1': case '2': case '3': case '4': 116259701Sdim case '5': case '6': case '7': case '8': case '9': 117259701Sdim // Parse an unsigned literal. 118259701Sdim consumeUnsignedLiteral(&Result); 119259701Sdim break; 120259701Sdim 121259701Sdim default: 122259701Sdim if (isAlphanumeric(Code[0])) { 123259701Sdim // Parse an identifier 124259701Sdim size_t TokenLength = 1; 125259701Sdim while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength])) 126259701Sdim ++TokenLength; 127259701Sdim Result.Kind = TokenInfo::TK_Ident; 128259701Sdim Result.Text = Code.substr(0, TokenLength); 129259701Sdim Code = Code.drop_front(TokenLength); 130259701Sdim } else { 131259701Sdim Result.Kind = TokenInfo::TK_InvalidChar; 132259701Sdim Result.Text = Code.substr(0, 1); 133259701Sdim Code = Code.drop_front(1); 134259701Sdim } 135259701Sdim break; 136259701Sdim } 137259701Sdim 138259701Sdim Result.Range.End = currentLocation(); 139259701Sdim return Result; 140259701Sdim } 141259701Sdim 142259701Sdim /// \brief Consume an unsigned literal. 143259701Sdim void consumeUnsignedLiteral(TokenInfo *Result) { 144259701Sdim unsigned Length = 1; 145259701Sdim if (Code.size() > 1) { 146259701Sdim // Consume the 'x' or 'b' radix modifier, if present. 147259701Sdim switch (toLowercase(Code[1])) { 148259701Sdim case 'x': case 'b': Length = 2; 149259701Sdim } 150259701Sdim } 151259701Sdim while (Length < Code.size() && isHexDigit(Code[Length])) 152259701Sdim ++Length; 153259701Sdim 154259701Sdim Result->Text = Code.substr(0, Length); 155259701Sdim Code = Code.drop_front(Length); 156259701Sdim 157259701Sdim unsigned Value; 158259701Sdim if (!Result->Text.getAsInteger(0, Value)) { 159259701Sdim Result->Kind = TokenInfo::TK_Literal; 160259701Sdim Result->Value = Value; 161259701Sdim } else { 162259701Sdim SourceRange Range; 163259701Sdim Range.Start = Result->Range.Start; 164259701Sdim Range.End = currentLocation(); 165259701Sdim Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text; 166259701Sdim Result->Kind = TokenInfo::TK_Error; 167259701Sdim } 168259701Sdim } 169259701Sdim 170259701Sdim /// \brief Consume a string literal. 171259701Sdim /// 172259701Sdim /// \c Code must be positioned at the start of the literal (the opening 173259701Sdim /// quote). Consumed until it finds the same closing quote character. 174259701Sdim void consumeStringLiteral(TokenInfo *Result) { 175259701Sdim bool InEscape = false; 176259701Sdim const char Marker = Code[0]; 177259701Sdim for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) { 178259701Sdim if (InEscape) { 179259701Sdim InEscape = false; 180259701Sdim continue; 181259701Sdim } 182259701Sdim if (Code[Length] == '\\') { 183259701Sdim InEscape = true; 184259701Sdim continue; 185259701Sdim } 186259701Sdim if (Code[Length] == Marker) { 187259701Sdim Result->Kind = TokenInfo::TK_Literal; 188259701Sdim Result->Text = Code.substr(0, Length + 1); 189259701Sdim Result->Value = Code.substr(1, Length - 1).str(); 190259701Sdim Code = Code.drop_front(Length + 1); 191259701Sdim return; 192259701Sdim } 193259701Sdim } 194259701Sdim 195259701Sdim StringRef ErrorText = Code; 196259701Sdim Code = Code.drop_front(Code.size()); 197259701Sdim SourceRange Range; 198259701Sdim Range.Start = Result->Range.Start; 199259701Sdim Range.End = currentLocation(); 200259701Sdim Error->addError(Range, Error->ET_ParserStringError) << ErrorText; 201259701Sdim Result->Kind = TokenInfo::TK_Error; 202259701Sdim } 203259701Sdim 204259701Sdim /// \brief Consume all leading whitespace from \c Code. 205259701Sdim void consumeWhitespace() { 206259701Sdim while (!Code.empty() && isWhitespace(Code[0])) { 207259701Sdim if (Code[0] == '\n') { 208259701Sdim ++Line; 209259701Sdim StartOfLine = Code.drop_front(); 210259701Sdim } 211259701Sdim Code = Code.drop_front(); 212259701Sdim } 213259701Sdim } 214259701Sdim 215259701Sdim SourceLocation currentLocation() { 216259701Sdim SourceLocation Location; 217259701Sdim Location.Line = Line; 218259701Sdim Location.Column = Code.data() - StartOfLine.data() + 1; 219259701Sdim return Location; 220259701Sdim } 221259701Sdim 222259701Sdim StringRef Code; 223259701Sdim StringRef StartOfLine; 224259701Sdim unsigned Line; 225259701Sdim Diagnostics *Error; 226259701Sdim TokenInfo NextToken; 227259701Sdim}; 228259701Sdim 229259701SdimParser::Sema::~Sema() {} 230259701Sdim 231259701Sdim/// \brief Parse and validate a matcher expression. 232259701Sdim/// \return \c true on success, in which case \c Value has the matcher parsed. 233259701Sdim/// If the input is malformed, or some argument has an error, it 234259701Sdim/// returns \c false. 235259701Sdimbool Parser::parseMatcherExpressionImpl(VariantValue *Value) { 236259701Sdim const TokenInfo NameToken = Tokenizer->consumeNextToken(); 237259701Sdim assert(NameToken.Kind == TokenInfo::TK_Ident); 238259701Sdim const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 239259701Sdim if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 240259701Sdim Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen) 241259701Sdim << OpenToken.Text; 242259701Sdim return false; 243259701Sdim } 244259701Sdim 245259701Sdim std::vector<ParserValue> Args; 246259701Sdim TokenInfo EndToken; 247259701Sdim while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) { 248259701Sdim if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) { 249259701Sdim // End of args. 250259701Sdim EndToken = Tokenizer->consumeNextToken(); 251259701Sdim break; 252259701Sdim } 253259701Sdim if (Args.size() > 0) { 254259701Sdim // We must find a , token to continue. 255259701Sdim const TokenInfo CommaToken = Tokenizer->consumeNextToken(); 256259701Sdim if (CommaToken.Kind != TokenInfo::TK_Comma) { 257259701Sdim Error->addError(CommaToken.Range, Error->ET_ParserNoComma) 258259701Sdim << CommaToken.Text; 259259701Sdim return false; 260259701Sdim } 261259701Sdim } 262259701Sdim 263259701Sdim Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error, 264259701Sdim NameToken.Text, NameToken.Range, Args.size() + 1); 265259701Sdim ParserValue ArgValue; 266259701Sdim ArgValue.Text = Tokenizer->peekNextToken().Text; 267259701Sdim ArgValue.Range = Tokenizer->peekNextToken().Range; 268259701Sdim if (!parseExpressionImpl(&ArgValue.Value)) return false; 269259701Sdim 270259701Sdim Args.push_back(ArgValue); 271259701Sdim } 272259701Sdim 273259701Sdim if (EndToken.Kind == TokenInfo::TK_Eof) { 274259701Sdim Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen); 275259701Sdim return false; 276259701Sdim } 277259701Sdim 278259701Sdim std::string BindID; 279259701Sdim if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) { 280259701Sdim // Parse .bind("foo") 281259701Sdim Tokenizer->consumeNextToken(); // consume the period. 282259701Sdim const TokenInfo BindToken = Tokenizer->consumeNextToken(); 283259701Sdim const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 284259701Sdim const TokenInfo IDToken = Tokenizer->consumeNextToken(); 285259701Sdim const TokenInfo CloseToken = Tokenizer->consumeNextToken(); 286259701Sdim 287259701Sdim // TODO: We could use different error codes for each/some to be more 288259701Sdim // explicit about the syntax error. 289259701Sdim if (BindToken.Kind != TokenInfo::TK_Ident || 290259701Sdim BindToken.Text != TokenInfo::ID_Bind) { 291259701Sdim Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr); 292259701Sdim return false; 293259701Sdim } 294259701Sdim if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 295259701Sdim Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr); 296259701Sdim return false; 297259701Sdim } 298259701Sdim if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) { 299259701Sdim Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr); 300259701Sdim return false; 301259701Sdim } 302259701Sdim if (CloseToken.Kind != TokenInfo::TK_CloseParen) { 303259701Sdim Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr); 304259701Sdim return false; 305259701Sdim } 306259701Sdim BindID = IDToken.Value.getString(); 307259701Sdim } 308259701Sdim 309259701Sdim // Merge the start and end infos. 310259701Sdim Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error, 311259701Sdim NameToken.Text, NameToken.Range); 312259701Sdim SourceRange MatcherRange = NameToken.Range; 313259701Sdim MatcherRange.End = EndToken.Range.End; 314259701Sdim VariantMatcher Result = S->actOnMatcherExpression( 315259701Sdim NameToken.Text, MatcherRange, BindID, Args, Error); 316259701Sdim if (Result.isNull()) return false; 317259701Sdim 318259701Sdim *Value = Result; 319259701Sdim return true; 320259701Sdim} 321259701Sdim 322259701Sdim/// \brief Parse an <Expresssion> 323259701Sdimbool Parser::parseExpressionImpl(VariantValue *Value) { 324259701Sdim switch (Tokenizer->nextTokenKind()) { 325259701Sdim case TokenInfo::TK_Literal: 326259701Sdim *Value = Tokenizer->consumeNextToken().Value; 327259701Sdim return true; 328259701Sdim 329259701Sdim case TokenInfo::TK_Ident: 330259701Sdim return parseMatcherExpressionImpl(Value); 331259701Sdim 332259701Sdim case TokenInfo::TK_Eof: 333259701Sdim Error->addError(Tokenizer->consumeNextToken().Range, 334259701Sdim Error->ET_ParserNoCode); 335259701Sdim return false; 336259701Sdim 337259701Sdim case TokenInfo::TK_Error: 338259701Sdim // This error was already reported by the tokenizer. 339259701Sdim return false; 340259701Sdim 341259701Sdim case TokenInfo::TK_OpenParen: 342259701Sdim case TokenInfo::TK_CloseParen: 343259701Sdim case TokenInfo::TK_Comma: 344259701Sdim case TokenInfo::TK_Period: 345259701Sdim case TokenInfo::TK_InvalidChar: 346259701Sdim const TokenInfo Token = Tokenizer->consumeNextToken(); 347259701Sdim Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text; 348259701Sdim return false; 349259701Sdim } 350259701Sdim 351259701Sdim llvm_unreachable("Unknown token kind."); 352259701Sdim} 353259701Sdim 354259701SdimParser::Parser(CodeTokenizer *Tokenizer, Sema *S, 355259701Sdim Diagnostics *Error) 356259701Sdim : Tokenizer(Tokenizer), S(S), Error(Error) {} 357259701Sdim 358259701Sdimclass RegistrySema : public Parser::Sema { 359259701Sdimpublic: 360259701Sdim virtual ~RegistrySema() {} 361259701Sdim VariantMatcher actOnMatcherExpression(StringRef MatcherName, 362259701Sdim const SourceRange &NameRange, 363259701Sdim StringRef BindID, 364259701Sdim ArrayRef<ParserValue> Args, 365259701Sdim Diagnostics *Error) { 366259701Sdim if (BindID.empty()) { 367259701Sdim return Registry::constructMatcher(MatcherName, NameRange, Args, Error); 368259701Sdim } else { 369259701Sdim return Registry::constructBoundMatcher(MatcherName, NameRange, BindID, 370259701Sdim Args, Error); 371259701Sdim } 372259701Sdim } 373259701Sdim}; 374259701Sdim 375259701Sdimbool Parser::parseExpression(StringRef Code, VariantValue *Value, 376259701Sdim Diagnostics *Error) { 377259701Sdim RegistrySema S; 378259701Sdim return parseExpression(Code, &S, Value, Error); 379259701Sdim} 380259701Sdim 381259701Sdimbool Parser::parseExpression(StringRef Code, Sema *S, 382259701Sdim VariantValue *Value, Diagnostics *Error) { 383259701Sdim CodeTokenizer Tokenizer(Code, Error); 384259701Sdim if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false; 385259701Sdim if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) { 386259701Sdim Error->addError(Tokenizer.peekNextToken().Range, 387259701Sdim Error->ET_ParserTrailingCode); 388259701Sdim return false; 389259701Sdim } 390259701Sdim return true; 391259701Sdim} 392259701Sdim 393259701Sdimllvm::Optional<DynTypedMatcher> 394259701SdimParser::parseMatcherExpression(StringRef Code, Diagnostics *Error) { 395259701Sdim RegistrySema S; 396259701Sdim return parseMatcherExpression(Code, &S, Error); 397259701Sdim} 398259701Sdim 399259701Sdimllvm::Optional<DynTypedMatcher> 400259701SdimParser::parseMatcherExpression(StringRef Code, Parser::Sema *S, 401259701Sdim Diagnostics *Error) { 402259701Sdim VariantValue Value; 403259701Sdim if (!parseExpression(Code, S, &Value, Error)) 404259701Sdim return llvm::Optional<DynTypedMatcher>(); 405259701Sdim if (!Value.isMatcher()) { 406259701Sdim Error->addError(SourceRange(), Error->ET_ParserNotAMatcher); 407259701Sdim return llvm::Optional<DynTypedMatcher>(); 408259701Sdim } 409259701Sdim llvm::Optional<DynTypedMatcher> Result = 410259701Sdim Value.getMatcher().getSingleMatcher(); 411259701Sdim if (!Result.hasValue()) { 412259701Sdim Error->addError(SourceRange(), Error->ET_ParserOverloadedType) 413259701Sdim << Value.getTypeAsString(); 414259701Sdim } 415259701Sdim return Result; 416259701Sdim} 417259701Sdim 418259701Sdim} // namespace dynamic 419259701Sdim} // namespace ast_matchers 420259701Sdim} // namespace clang 421