Parser.cpp revision 259701
1259701Sdim//===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
2259701Sdim//
3259701Sdim//                     The LLVM Compiler Infrastructure
4259701Sdim//
5259701Sdim// This file is distributed under the University of Illinois Open Source
6259701Sdim// License. See LICENSE.TXT for details.
7259701Sdim//
8259701Sdim//===----------------------------------------------------------------------===//
9259701Sdim///
10259701Sdim/// \file
11259701Sdim/// \brief Recursive parser implementation for the matcher expression grammar.
12259701Sdim///
13259701Sdim//===----------------------------------------------------------------------===//
14259701Sdim
15259701Sdim#include <string>
16259701Sdim#include <vector>
17259701Sdim
18259701Sdim#include "clang/ASTMatchers/Dynamic/Parser.h"
19259701Sdim#include "clang/ASTMatchers/Dynamic/Registry.h"
20259701Sdim#include "clang/Basic/CharInfo.h"
21259701Sdim#include "llvm/ADT/Twine.h"
22259701Sdim
23259701Sdimnamespace clang {
24259701Sdimnamespace ast_matchers {
25259701Sdimnamespace dynamic {
26259701Sdim
27259701Sdim/// \brief Simple structure to hold information for one token from the parser.
28259701Sdimstruct Parser::TokenInfo {
29259701Sdim  /// \brief Different possible tokens.
30259701Sdim  enum TokenKind {
31259701Sdim    TK_Eof = 0,
32259701Sdim    TK_OpenParen = 1,
33259701Sdim    TK_CloseParen = 2,
34259701Sdim    TK_Comma = 3,
35259701Sdim    TK_Period = 4,
36259701Sdim    TK_Literal = 5,
37259701Sdim    TK_Ident = 6,
38259701Sdim    TK_InvalidChar = 7,
39259701Sdim    TK_Error = 8
40259701Sdim  };
41259701Sdim
42259701Sdim  /// \brief Some known identifiers.
43259701Sdim  static const char* const ID_Bind;
44259701Sdim
45259701Sdim  TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
46259701Sdim
47259701Sdim  StringRef Text;
48259701Sdim  TokenKind Kind;
49259701Sdim  SourceRange Range;
50259701Sdim  VariantValue Value;
51259701Sdim};
52259701Sdim
53259701Sdimconst char* const Parser::TokenInfo::ID_Bind = "bind";
54259701Sdim
55259701Sdim/// \brief Simple tokenizer for the parser.
56259701Sdimclass Parser::CodeTokenizer {
57259701Sdimpublic:
58259701Sdim  explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
59259701Sdim      : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) {
60259701Sdim    NextToken = getNextToken();
61259701Sdim  }
62259701Sdim
63259701Sdim  /// \brief Returns but doesn't consume the next token.
64259701Sdim  const TokenInfo &peekNextToken() const { return NextToken; }
65259701Sdim
66259701Sdim  /// \brief Consumes and returns the next token.
67259701Sdim  TokenInfo consumeNextToken() {
68259701Sdim    TokenInfo ThisToken = NextToken;
69259701Sdim    NextToken = getNextToken();
70259701Sdim    return ThisToken;
71259701Sdim  }
72259701Sdim
73259701Sdim  TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
74259701Sdim
75259701Sdimprivate:
76259701Sdim  TokenInfo getNextToken() {
77259701Sdim    consumeWhitespace();
78259701Sdim    TokenInfo Result;
79259701Sdim    Result.Range.Start = currentLocation();
80259701Sdim
81259701Sdim    if (Code.empty()) {
82259701Sdim      Result.Kind = TokenInfo::TK_Eof;
83259701Sdim      Result.Text = "";
84259701Sdim      return Result;
85259701Sdim    }
86259701Sdim
87259701Sdim    switch (Code[0]) {
88259701Sdim    case ',':
89259701Sdim      Result.Kind = TokenInfo::TK_Comma;
90259701Sdim      Result.Text = Code.substr(0, 1);
91259701Sdim      Code = Code.drop_front();
92259701Sdim      break;
93259701Sdim    case '.':
94259701Sdim      Result.Kind = TokenInfo::TK_Period;
95259701Sdim      Result.Text = Code.substr(0, 1);
96259701Sdim      Code = Code.drop_front();
97259701Sdim      break;
98259701Sdim    case '(':
99259701Sdim      Result.Kind = TokenInfo::TK_OpenParen;
100259701Sdim      Result.Text = Code.substr(0, 1);
101259701Sdim      Code = Code.drop_front();
102259701Sdim      break;
103259701Sdim    case ')':
104259701Sdim      Result.Kind = TokenInfo::TK_CloseParen;
105259701Sdim      Result.Text = Code.substr(0, 1);
106259701Sdim      Code = Code.drop_front();
107259701Sdim      break;
108259701Sdim
109259701Sdim    case '"':
110259701Sdim    case '\'':
111259701Sdim      // Parse a string literal.
112259701Sdim      consumeStringLiteral(&Result);
113259701Sdim      break;
114259701Sdim
115259701Sdim    case '0': case '1': case '2': case '3': case '4':
116259701Sdim    case '5': case '6': case '7': case '8': case '9':
117259701Sdim      // Parse an unsigned literal.
118259701Sdim      consumeUnsignedLiteral(&Result);
119259701Sdim      break;
120259701Sdim
121259701Sdim    default:
122259701Sdim      if (isAlphanumeric(Code[0])) {
123259701Sdim        // Parse an identifier
124259701Sdim        size_t TokenLength = 1;
125259701Sdim        while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength]))
126259701Sdim          ++TokenLength;
127259701Sdim        Result.Kind = TokenInfo::TK_Ident;
128259701Sdim        Result.Text = Code.substr(0, TokenLength);
129259701Sdim        Code = Code.drop_front(TokenLength);
130259701Sdim      } else {
131259701Sdim        Result.Kind = TokenInfo::TK_InvalidChar;
132259701Sdim        Result.Text = Code.substr(0, 1);
133259701Sdim        Code = Code.drop_front(1);
134259701Sdim      }
135259701Sdim      break;
136259701Sdim    }
137259701Sdim
138259701Sdim    Result.Range.End = currentLocation();
139259701Sdim    return Result;
140259701Sdim  }
141259701Sdim
142259701Sdim  /// \brief Consume an unsigned literal.
143259701Sdim  void consumeUnsignedLiteral(TokenInfo *Result) {
144259701Sdim    unsigned Length = 1;
145259701Sdim    if (Code.size() > 1) {
146259701Sdim      // Consume the 'x' or 'b' radix modifier, if present.
147259701Sdim      switch (toLowercase(Code[1])) {
148259701Sdim      case 'x': case 'b': Length = 2;
149259701Sdim      }
150259701Sdim    }
151259701Sdim    while (Length < Code.size() && isHexDigit(Code[Length]))
152259701Sdim      ++Length;
153259701Sdim
154259701Sdim    Result->Text = Code.substr(0, Length);
155259701Sdim    Code = Code.drop_front(Length);
156259701Sdim
157259701Sdim    unsigned Value;
158259701Sdim    if (!Result->Text.getAsInteger(0, Value)) {
159259701Sdim      Result->Kind = TokenInfo::TK_Literal;
160259701Sdim      Result->Value = Value;
161259701Sdim    } else {
162259701Sdim      SourceRange Range;
163259701Sdim      Range.Start = Result->Range.Start;
164259701Sdim      Range.End = currentLocation();
165259701Sdim      Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
166259701Sdim      Result->Kind = TokenInfo::TK_Error;
167259701Sdim    }
168259701Sdim  }
169259701Sdim
170259701Sdim  /// \brief Consume a string literal.
171259701Sdim  ///
172259701Sdim  /// \c Code must be positioned at the start of the literal (the opening
173259701Sdim  /// quote). Consumed until it finds the same closing quote character.
174259701Sdim  void consumeStringLiteral(TokenInfo *Result) {
175259701Sdim    bool InEscape = false;
176259701Sdim    const char Marker = Code[0];
177259701Sdim    for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
178259701Sdim      if (InEscape) {
179259701Sdim        InEscape = false;
180259701Sdim        continue;
181259701Sdim      }
182259701Sdim      if (Code[Length] == '\\') {
183259701Sdim        InEscape = true;
184259701Sdim        continue;
185259701Sdim      }
186259701Sdim      if (Code[Length] == Marker) {
187259701Sdim        Result->Kind = TokenInfo::TK_Literal;
188259701Sdim        Result->Text = Code.substr(0, Length + 1);
189259701Sdim        Result->Value = Code.substr(1, Length - 1).str();
190259701Sdim        Code = Code.drop_front(Length + 1);
191259701Sdim        return;
192259701Sdim      }
193259701Sdim    }
194259701Sdim
195259701Sdim    StringRef ErrorText = Code;
196259701Sdim    Code = Code.drop_front(Code.size());
197259701Sdim    SourceRange Range;
198259701Sdim    Range.Start = Result->Range.Start;
199259701Sdim    Range.End = currentLocation();
200259701Sdim    Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
201259701Sdim    Result->Kind = TokenInfo::TK_Error;
202259701Sdim  }
203259701Sdim
204259701Sdim  /// \brief Consume all leading whitespace from \c Code.
205259701Sdim  void consumeWhitespace() {
206259701Sdim    while (!Code.empty() && isWhitespace(Code[0])) {
207259701Sdim      if (Code[0] == '\n') {
208259701Sdim        ++Line;
209259701Sdim        StartOfLine = Code.drop_front();
210259701Sdim      }
211259701Sdim      Code = Code.drop_front();
212259701Sdim    }
213259701Sdim  }
214259701Sdim
215259701Sdim  SourceLocation currentLocation() {
216259701Sdim    SourceLocation Location;
217259701Sdim    Location.Line = Line;
218259701Sdim    Location.Column = Code.data() - StartOfLine.data() + 1;
219259701Sdim    return Location;
220259701Sdim  }
221259701Sdim
222259701Sdim  StringRef Code;
223259701Sdim  StringRef StartOfLine;
224259701Sdim  unsigned Line;
225259701Sdim  Diagnostics *Error;
226259701Sdim  TokenInfo NextToken;
227259701Sdim};
228259701Sdim
229259701SdimParser::Sema::~Sema() {}
230259701Sdim
231259701Sdim/// \brief Parse and validate a matcher expression.
232259701Sdim/// \return \c true on success, in which case \c Value has the matcher parsed.
233259701Sdim///   If the input is malformed, or some argument has an error, it
234259701Sdim///   returns \c false.
235259701Sdimbool Parser::parseMatcherExpressionImpl(VariantValue *Value) {
236259701Sdim  const TokenInfo NameToken = Tokenizer->consumeNextToken();
237259701Sdim  assert(NameToken.Kind == TokenInfo::TK_Ident);
238259701Sdim  const TokenInfo OpenToken = Tokenizer->consumeNextToken();
239259701Sdim  if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
240259701Sdim    Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
241259701Sdim        << OpenToken.Text;
242259701Sdim    return false;
243259701Sdim  }
244259701Sdim
245259701Sdim  std::vector<ParserValue> Args;
246259701Sdim  TokenInfo EndToken;
247259701Sdim  while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
248259701Sdim    if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
249259701Sdim      // End of args.
250259701Sdim      EndToken = Tokenizer->consumeNextToken();
251259701Sdim      break;
252259701Sdim    }
253259701Sdim    if (Args.size() > 0) {
254259701Sdim      // We must find a , token to continue.
255259701Sdim      const TokenInfo CommaToken = Tokenizer->consumeNextToken();
256259701Sdim      if (CommaToken.Kind != TokenInfo::TK_Comma) {
257259701Sdim        Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
258259701Sdim            << CommaToken.Text;
259259701Sdim        return false;
260259701Sdim      }
261259701Sdim    }
262259701Sdim
263259701Sdim    Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
264259701Sdim                             NameToken.Text, NameToken.Range, Args.size() + 1);
265259701Sdim    ParserValue ArgValue;
266259701Sdim    ArgValue.Text = Tokenizer->peekNextToken().Text;
267259701Sdim    ArgValue.Range = Tokenizer->peekNextToken().Range;
268259701Sdim    if (!parseExpressionImpl(&ArgValue.Value)) return false;
269259701Sdim
270259701Sdim    Args.push_back(ArgValue);
271259701Sdim  }
272259701Sdim
273259701Sdim  if (EndToken.Kind == TokenInfo::TK_Eof) {
274259701Sdim    Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
275259701Sdim    return false;
276259701Sdim  }
277259701Sdim
278259701Sdim  std::string BindID;
279259701Sdim  if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
280259701Sdim    // Parse .bind("foo")
281259701Sdim    Tokenizer->consumeNextToken();  // consume the period.
282259701Sdim    const TokenInfo BindToken = Tokenizer->consumeNextToken();
283259701Sdim    const TokenInfo OpenToken = Tokenizer->consumeNextToken();
284259701Sdim    const TokenInfo IDToken = Tokenizer->consumeNextToken();
285259701Sdim    const TokenInfo CloseToken = Tokenizer->consumeNextToken();
286259701Sdim
287259701Sdim    // TODO: We could use different error codes for each/some to be more
288259701Sdim    //       explicit about the syntax error.
289259701Sdim    if (BindToken.Kind != TokenInfo::TK_Ident ||
290259701Sdim        BindToken.Text != TokenInfo::ID_Bind) {
291259701Sdim      Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
292259701Sdim      return false;
293259701Sdim    }
294259701Sdim    if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
295259701Sdim      Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
296259701Sdim      return false;
297259701Sdim    }
298259701Sdim    if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
299259701Sdim      Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
300259701Sdim      return false;
301259701Sdim    }
302259701Sdim    if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
303259701Sdim      Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
304259701Sdim      return false;
305259701Sdim    }
306259701Sdim    BindID = IDToken.Value.getString();
307259701Sdim  }
308259701Sdim
309259701Sdim  // Merge the start and end infos.
310259701Sdim  Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
311259701Sdim                           NameToken.Text, NameToken.Range);
312259701Sdim  SourceRange MatcherRange = NameToken.Range;
313259701Sdim  MatcherRange.End = EndToken.Range.End;
314259701Sdim  VariantMatcher Result = S->actOnMatcherExpression(
315259701Sdim      NameToken.Text, MatcherRange, BindID, Args, Error);
316259701Sdim  if (Result.isNull()) return false;
317259701Sdim
318259701Sdim  *Value = Result;
319259701Sdim  return true;
320259701Sdim}
321259701Sdim
322259701Sdim/// \brief Parse an <Expresssion>
323259701Sdimbool Parser::parseExpressionImpl(VariantValue *Value) {
324259701Sdim  switch (Tokenizer->nextTokenKind()) {
325259701Sdim  case TokenInfo::TK_Literal:
326259701Sdim    *Value = Tokenizer->consumeNextToken().Value;
327259701Sdim    return true;
328259701Sdim
329259701Sdim  case TokenInfo::TK_Ident:
330259701Sdim    return parseMatcherExpressionImpl(Value);
331259701Sdim
332259701Sdim  case TokenInfo::TK_Eof:
333259701Sdim    Error->addError(Tokenizer->consumeNextToken().Range,
334259701Sdim                    Error->ET_ParserNoCode);
335259701Sdim    return false;
336259701Sdim
337259701Sdim  case TokenInfo::TK_Error:
338259701Sdim    // This error was already reported by the tokenizer.
339259701Sdim    return false;
340259701Sdim
341259701Sdim  case TokenInfo::TK_OpenParen:
342259701Sdim  case TokenInfo::TK_CloseParen:
343259701Sdim  case TokenInfo::TK_Comma:
344259701Sdim  case TokenInfo::TK_Period:
345259701Sdim  case TokenInfo::TK_InvalidChar:
346259701Sdim    const TokenInfo Token = Tokenizer->consumeNextToken();
347259701Sdim    Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
348259701Sdim    return false;
349259701Sdim  }
350259701Sdim
351259701Sdim  llvm_unreachable("Unknown token kind.");
352259701Sdim}
353259701Sdim
354259701SdimParser::Parser(CodeTokenizer *Tokenizer, Sema *S,
355259701Sdim               Diagnostics *Error)
356259701Sdim    : Tokenizer(Tokenizer), S(S), Error(Error) {}
357259701Sdim
358259701Sdimclass RegistrySema : public Parser::Sema {
359259701Sdimpublic:
360259701Sdim  virtual ~RegistrySema() {}
361259701Sdim  VariantMatcher actOnMatcherExpression(StringRef MatcherName,
362259701Sdim                                        const SourceRange &NameRange,
363259701Sdim                                        StringRef BindID,
364259701Sdim                                        ArrayRef<ParserValue> Args,
365259701Sdim                                        Diagnostics *Error) {
366259701Sdim    if (BindID.empty()) {
367259701Sdim      return Registry::constructMatcher(MatcherName, NameRange, Args, Error);
368259701Sdim    } else {
369259701Sdim      return Registry::constructBoundMatcher(MatcherName, NameRange, BindID,
370259701Sdim                                             Args, Error);
371259701Sdim    }
372259701Sdim  }
373259701Sdim};
374259701Sdim
375259701Sdimbool Parser::parseExpression(StringRef Code, VariantValue *Value,
376259701Sdim                             Diagnostics *Error) {
377259701Sdim  RegistrySema S;
378259701Sdim  return parseExpression(Code, &S, Value, Error);
379259701Sdim}
380259701Sdim
381259701Sdimbool Parser::parseExpression(StringRef Code, Sema *S,
382259701Sdim                             VariantValue *Value, Diagnostics *Error) {
383259701Sdim  CodeTokenizer Tokenizer(Code, Error);
384259701Sdim  if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false;
385259701Sdim  if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
386259701Sdim    Error->addError(Tokenizer.peekNextToken().Range,
387259701Sdim                    Error->ET_ParserTrailingCode);
388259701Sdim    return false;
389259701Sdim  }
390259701Sdim  return true;
391259701Sdim}
392259701Sdim
393259701Sdimllvm::Optional<DynTypedMatcher>
394259701SdimParser::parseMatcherExpression(StringRef Code, Diagnostics *Error) {
395259701Sdim  RegistrySema S;
396259701Sdim  return parseMatcherExpression(Code, &S, Error);
397259701Sdim}
398259701Sdim
399259701Sdimllvm::Optional<DynTypedMatcher>
400259701SdimParser::parseMatcherExpression(StringRef Code, Parser::Sema *S,
401259701Sdim                               Diagnostics *Error) {
402259701Sdim  VariantValue Value;
403259701Sdim  if (!parseExpression(Code, S, &Value, Error))
404259701Sdim    return llvm::Optional<DynTypedMatcher>();
405259701Sdim  if (!Value.isMatcher()) {
406259701Sdim    Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
407259701Sdim    return llvm::Optional<DynTypedMatcher>();
408259701Sdim  }
409259701Sdim  llvm::Optional<DynTypedMatcher> Result =
410259701Sdim      Value.getMatcher().getSingleMatcher();
411259701Sdim  if (!Result.hasValue()) {
412259701Sdim    Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
413259701Sdim        << Value.getTypeAsString();
414259701Sdim  }
415259701Sdim  return Result;
416259701Sdim}
417259701Sdim
418259701Sdim}  // namespace dynamic
419259701Sdim}  // namespace ast_matchers
420259701Sdim}  // namespace clang
421