1//===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9///
10/// \file
11/// \brief Recursive parser implementation for the matcher expression grammar.
12///
13//===----------------------------------------------------------------------===//
14
15#include <string>
16#include <vector>
17
18#include "clang/ASTMatchers/Dynamic/Parser.h"
19#include "clang/ASTMatchers/Dynamic/Registry.h"
20#include "clang/Basic/CharInfo.h"
21#include "llvm/ADT/Twine.h"
22
23namespace clang {
24namespace ast_matchers {
25namespace dynamic {
26
/// \brief Simple structure to hold information for one token from the parser.
///
/// Instances are produced by \c Parser::CodeTokenizer and inspected by the
/// parsing functions of \c Parser further down in this file.
struct Parser::TokenInfo {
  /// \brief Different possible tokens.
  enum TokenKind {
    // End of input. Also the kind of a default-constructed token.
    TK_Eof = 0,
    // The '(' character.
    TK_OpenParen = 1,
    // The ')' character.
    TK_CloseParen = 2,
    // The ',' character.
    TK_Comma = 3,
    // The '.' character.
    TK_Period = 4,
    // A quoted string literal or an unsigned integer literal; the parsed
    // value is stored in \c Value.
    TK_Literal = 5,
    // A run of alphanumeric characters that does not start with a digit.
    TK_Ident = 6,
    // Any single character the tokenizer does not otherwise recognize.
    TK_InvalidChar = 7,
    // A literal that failed to tokenize; the tokenizer has already reported
    // a diagnostic for it.
    TK_Error = 8
  };

  /// \brief Some known identifiers.
  // Spelling of the identifier expected after the period in a bind
  // expression.
  static const char* const ID_Bind;

  // A default-constructed token is an EOF token with no text.
  TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}

  // Spelling of the token as taken from the code being tokenized.
  StringRef Text;
  // Which kind of token this is.
  TokenKind Kind;
  // Start and end location of the token.
  SourceRange Range;
  // Parsed value; the tokenizer only assigns it for TK_Literal tokens.
  VariantValue Value;
};

const char* const Parser::TokenInfo::ID_Bind = "bind";
54
55/// \brief Simple tokenizer for the parser.
56class Parser::CodeTokenizer {
57public:
58  explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
59      : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) {
60    NextToken = getNextToken();
61  }
62
63  /// \brief Returns but doesn't consume the next token.
64  const TokenInfo &peekNextToken() const { return NextToken; }
65
66  /// \brief Consumes and returns the next token.
67  TokenInfo consumeNextToken() {
68    TokenInfo ThisToken = NextToken;
69    NextToken = getNextToken();
70    return ThisToken;
71  }
72
73  TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
74
75private:
76  TokenInfo getNextToken() {
77    consumeWhitespace();
78    TokenInfo Result;
79    Result.Range.Start = currentLocation();
80
81    if (Code.empty()) {
82      Result.Kind = TokenInfo::TK_Eof;
83      Result.Text = "";
84      return Result;
85    }
86
87    switch (Code[0]) {
88    case ',':
89      Result.Kind = TokenInfo::TK_Comma;
90      Result.Text = Code.substr(0, 1);
91      Code = Code.drop_front();
92      break;
93    case '.':
94      Result.Kind = TokenInfo::TK_Period;
95      Result.Text = Code.substr(0, 1);
96      Code = Code.drop_front();
97      break;
98    case '(':
99      Result.Kind = TokenInfo::TK_OpenParen;
100      Result.Text = Code.substr(0, 1);
101      Code = Code.drop_front();
102      break;
103    case ')':
104      Result.Kind = TokenInfo::TK_CloseParen;
105      Result.Text = Code.substr(0, 1);
106      Code = Code.drop_front();
107      break;
108
109    case '"':
110    case '\'':
111      // Parse a string literal.
112      consumeStringLiteral(&Result);
113      break;
114
115    case '0': case '1': case '2': case '3': case '4':
116    case '5': case '6': case '7': case '8': case '9':
117      // Parse an unsigned literal.
118      consumeUnsignedLiteral(&Result);
119      break;
120
121    default:
122      if (isAlphanumeric(Code[0])) {
123        // Parse an identifier
124        size_t TokenLength = 1;
125        while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength]))
126          ++TokenLength;
127        Result.Kind = TokenInfo::TK_Ident;
128        Result.Text = Code.substr(0, TokenLength);
129        Code = Code.drop_front(TokenLength);
130      } else {
131        Result.Kind = TokenInfo::TK_InvalidChar;
132        Result.Text = Code.substr(0, 1);
133        Code = Code.drop_front(1);
134      }
135      break;
136    }
137
138    Result.Range.End = currentLocation();
139    return Result;
140  }
141
142  /// \brief Consume an unsigned literal.
143  void consumeUnsignedLiteral(TokenInfo *Result) {
144    unsigned Length = 1;
145    if (Code.size() > 1) {
146      // Consume the 'x' or 'b' radix modifier, if present.
147      switch (toLowercase(Code[1])) {
148      case 'x': case 'b': Length = 2;
149      }
150    }
151    while (Length < Code.size() && isHexDigit(Code[Length]))
152      ++Length;
153
154    Result->Text = Code.substr(0, Length);
155    Code = Code.drop_front(Length);
156
157    unsigned Value;
158    if (!Result->Text.getAsInteger(0, Value)) {
159      Result->Kind = TokenInfo::TK_Literal;
160      Result->Value = Value;
161    } else {
162      SourceRange Range;
163      Range.Start = Result->Range.Start;
164      Range.End = currentLocation();
165      Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
166      Result->Kind = TokenInfo::TK_Error;
167    }
168  }
169
170  /// \brief Consume a string literal.
171  ///
172  /// \c Code must be positioned at the start of the literal (the opening
173  /// quote). Consumed until it finds the same closing quote character.
174  void consumeStringLiteral(TokenInfo *Result) {
175    bool InEscape = false;
176    const char Marker = Code[0];
177    for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
178      if (InEscape) {
179        InEscape = false;
180        continue;
181      }
182      if (Code[Length] == '\\') {
183        InEscape = true;
184        continue;
185      }
186      if (Code[Length] == Marker) {
187        Result->Kind = TokenInfo::TK_Literal;
188        Result->Text = Code.substr(0, Length + 1);
189        Result->Value = Code.substr(1, Length - 1).str();
190        Code = Code.drop_front(Length + 1);
191        return;
192      }
193    }
194
195    StringRef ErrorText = Code;
196    Code = Code.drop_front(Code.size());
197    SourceRange Range;
198    Range.Start = Result->Range.Start;
199    Range.End = currentLocation();
200    Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
201    Result->Kind = TokenInfo::TK_Error;
202  }
203
204  /// \brief Consume all leading whitespace from \c Code.
205  void consumeWhitespace() {
206    while (!Code.empty() && isWhitespace(Code[0])) {
207      if (Code[0] == '\n') {
208        ++Line;
209        StartOfLine = Code.drop_front();
210      }
211      Code = Code.drop_front();
212    }
213  }
214
215  SourceLocation currentLocation() {
216    SourceLocation Location;
217    Location.Line = Line;
218    Location.Column = Code.data() - StartOfLine.data() + 1;
219    return Location;
220  }
221
222  StringRef Code;
223  StringRef StartOfLine;
224  unsigned Line;
225  Diagnostics *Error;
226  TokenInfo NextToken;
227};
228
// Empty destructor of the Sema interface, defined out of line here.
Parser::Sema::~Sema() {}
230
231/// \brief Parse and validate a matcher expression.
232/// \return \c true on success, in which case \c Value has the matcher parsed.
233///   If the input is malformed, or some argument has an error, it
234///   returns \c false.
235bool Parser::parseMatcherExpressionImpl(VariantValue *Value) {
236  const TokenInfo NameToken = Tokenizer->consumeNextToken();
237  assert(NameToken.Kind == TokenInfo::TK_Ident);
238  const TokenInfo OpenToken = Tokenizer->consumeNextToken();
239  if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
240    Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
241        << OpenToken.Text;
242    return false;
243  }
244
245  std::vector<ParserValue> Args;
246  TokenInfo EndToken;
247  while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
248    if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
249      // End of args.
250      EndToken = Tokenizer->consumeNextToken();
251      break;
252    }
253    if (Args.size() > 0) {
254      // We must find a , token to continue.
255      const TokenInfo CommaToken = Tokenizer->consumeNextToken();
256      if (CommaToken.Kind != TokenInfo::TK_Comma) {
257        Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
258            << CommaToken.Text;
259        return false;
260      }
261    }
262
263    Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
264                             NameToken.Text, NameToken.Range, Args.size() + 1);
265    ParserValue ArgValue;
266    ArgValue.Text = Tokenizer->peekNextToken().Text;
267    ArgValue.Range = Tokenizer->peekNextToken().Range;
268    if (!parseExpressionImpl(&ArgValue.Value)) return false;
269
270    Args.push_back(ArgValue);
271  }
272
273  if (EndToken.Kind == TokenInfo::TK_Eof) {
274    Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
275    return false;
276  }
277
278  std::string BindID;
279  if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
280    // Parse .bind("foo")
281    Tokenizer->consumeNextToken();  // consume the period.
282    const TokenInfo BindToken = Tokenizer->consumeNextToken();
283    const TokenInfo OpenToken = Tokenizer->consumeNextToken();
284    const TokenInfo IDToken = Tokenizer->consumeNextToken();
285    const TokenInfo CloseToken = Tokenizer->consumeNextToken();
286
287    // TODO: We could use different error codes for each/some to be more
288    //       explicit about the syntax error.
289    if (BindToken.Kind != TokenInfo::TK_Ident ||
290        BindToken.Text != TokenInfo::ID_Bind) {
291      Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
292      return false;
293    }
294    if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
295      Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
296      return false;
297    }
298    if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
299      Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
300      return false;
301    }
302    if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
303      Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
304      return false;
305    }
306    BindID = IDToken.Value.getString();
307  }
308
309  // Merge the start and end infos.
310  Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
311                           NameToken.Text, NameToken.Range);
312  SourceRange MatcherRange = NameToken.Range;
313  MatcherRange.End = EndToken.Range.End;
314  VariantMatcher Result = S->actOnMatcherExpression(
315      NameToken.Text, MatcherRange, BindID, Args, Error);
316  if (Result.isNull()) return false;
317
318  *Value = Result;
319  return true;
320}
321
322/// \brief Parse an <Expresssion>
323bool Parser::parseExpressionImpl(VariantValue *Value) {
324  switch (Tokenizer->nextTokenKind()) {
325  case TokenInfo::TK_Literal:
326    *Value = Tokenizer->consumeNextToken().Value;
327    return true;
328
329  case TokenInfo::TK_Ident:
330    return parseMatcherExpressionImpl(Value);
331
332  case TokenInfo::TK_Eof:
333    Error->addError(Tokenizer->consumeNextToken().Range,
334                    Error->ET_ParserNoCode);
335    return false;
336
337  case TokenInfo::TK_Error:
338    // This error was already reported by the tokenizer.
339    return false;
340
341  case TokenInfo::TK_OpenParen:
342  case TokenInfo::TK_CloseParen:
343  case TokenInfo::TK_Comma:
344  case TokenInfo::TK_Period:
345  case TokenInfo::TK_InvalidChar:
346    const TokenInfo Token = Tokenizer->consumeNextToken();
347    Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
348    return false;
349  }
350
351  llvm_unreachable("Unknown token kind.");
352}
353
/// \brief Creates a parser that reads tokens from \p Tokenizer, builds
/// matchers through \p S and reports problems to \p Error.
///
/// The three pointers are stored as given.
Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
               Diagnostics *Error)
    : Tokenizer(Tokenizer), S(S), Error(Error) {}
357
358class RegistrySema : public Parser::Sema {
359public:
360  virtual ~RegistrySema() {}
361  VariantMatcher actOnMatcherExpression(StringRef MatcherName,
362                                        const SourceRange &NameRange,
363                                        StringRef BindID,
364                                        ArrayRef<ParserValue> Args,
365                                        Diagnostics *Error) {
366    if (BindID.empty()) {
367      return Registry::constructMatcher(MatcherName, NameRange, Args, Error);
368    } else {
369      return Registry::constructBoundMatcher(MatcherName, NameRange, BindID,
370                                             Args, Error);
371    }
372  }
373};
374
375bool Parser::parseExpression(StringRef Code, VariantValue *Value,
376                             Diagnostics *Error) {
377  RegistrySema S;
378  return parseExpression(Code, &S, Value, Error);
379}
380
381bool Parser::parseExpression(StringRef Code, Sema *S,
382                             VariantValue *Value, Diagnostics *Error) {
383  CodeTokenizer Tokenizer(Code, Error);
384  if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false;
385  if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
386    Error->addError(Tokenizer.peekNextToken().Range,
387                    Error->ET_ParserTrailingCode);
388    return false;
389  }
390  return true;
391}
392
393llvm::Optional<DynTypedMatcher>
394Parser::parseMatcherExpression(StringRef Code, Diagnostics *Error) {
395  RegistrySema S;
396  return parseMatcherExpression(Code, &S, Error);
397}
398
399llvm::Optional<DynTypedMatcher>
400Parser::parseMatcherExpression(StringRef Code, Parser::Sema *S,
401                               Diagnostics *Error) {
402  VariantValue Value;
403  if (!parseExpression(Code, S, &Value, Error))
404    return llvm::Optional<DynTypedMatcher>();
405  if (!Value.isMatcher()) {
406    Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
407    return llvm::Optional<DynTypedMatcher>();
408  }
409  llvm::Optional<DynTypedMatcher> Result =
410      Value.getMatcher().getSingleMatcher();
411  if (!Result.hasValue()) {
412    Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
413        << Value.getTypeAsString();
414  }
415  return Result;
416}
417
418}  // namespace dynamic
419}  // namespace ast_matchers
420}  // namespace clang
421