Parser.cpp revision 321369
1//===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9///
10/// \file
11/// \brief Recursive parser implementation for the matcher expression grammar.
12///
13//===----------------------------------------------------------------------===//
14
15#include "clang/ASTMatchers/Dynamic/Parser.h"
16#include "clang/ASTMatchers/Dynamic/Registry.h"
17#include "clang/Basic/CharInfo.h"
18#include "llvm/ADT/Optional.h"
19#include "llvm/Support/ManagedStatic.h"
20#include <string>
21#include <vector>
22
23namespace clang {
24namespace ast_matchers {
25namespace dynamic {
26
27/// \brief Simple structure to hold information for one token from the parser.
28struct Parser::TokenInfo {
29  /// \brief Different possible tokens.
30  enum TokenKind {
31    TK_Eof,
32    TK_OpenParen,
33    TK_CloseParen,
34    TK_Comma,
35    TK_Period,
36    TK_Literal,
37    TK_Ident,
38    TK_InvalidChar,
39    TK_Error,
40    TK_CodeCompletion
41  };
42
43  /// \brief Some known identifiers.
44  static const char* const ID_Bind;
45
46  TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
47
48  StringRef Text;
49  TokenKind Kind;
50  SourceRange Range;
51  VariantValue Value;
52};
53
54const char* const Parser::TokenInfo::ID_Bind = "bind";
55
56/// \brief Simple tokenizer for the parser.
57class Parser::CodeTokenizer {
58public:
59  explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
60      : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
61        CodeCompletionLocation(nullptr) {
62    NextToken = getNextToken();
63  }
64
65  CodeTokenizer(StringRef MatcherCode, Diagnostics *Error,
66                unsigned CodeCompletionOffset)
67      : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
68        CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) {
69    NextToken = getNextToken();
70  }
71
72  /// \brief Returns but doesn't consume the next token.
73  const TokenInfo &peekNextToken() const { return NextToken; }
74
75  /// \brief Consumes and returns the next token.
76  TokenInfo consumeNextToken() {
77    TokenInfo ThisToken = NextToken;
78    NextToken = getNextToken();
79    return ThisToken;
80  }
81
82  TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
83
84private:
85  TokenInfo getNextToken() {
86    consumeWhitespace();
87    TokenInfo Result;
88    Result.Range.Start = currentLocation();
89
90    if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) {
91      Result.Kind = TokenInfo::TK_CodeCompletion;
92      Result.Text = StringRef(CodeCompletionLocation, 0);
93      CodeCompletionLocation = nullptr;
94      return Result;
95    }
96
97    if (Code.empty()) {
98      Result.Kind = TokenInfo::TK_Eof;
99      Result.Text = "";
100      return Result;
101    }
102
103    switch (Code[0]) {
104    case ',':
105      Result.Kind = TokenInfo::TK_Comma;
106      Result.Text = Code.substr(0, 1);
107      Code = Code.drop_front();
108      break;
109    case '.':
110      Result.Kind = TokenInfo::TK_Period;
111      Result.Text = Code.substr(0, 1);
112      Code = Code.drop_front();
113      break;
114    case '(':
115      Result.Kind = TokenInfo::TK_OpenParen;
116      Result.Text = Code.substr(0, 1);
117      Code = Code.drop_front();
118      break;
119    case ')':
120      Result.Kind = TokenInfo::TK_CloseParen;
121      Result.Text = Code.substr(0, 1);
122      Code = Code.drop_front();
123      break;
124
125    case '"':
126    case '\'':
127      // Parse a string literal.
128      consumeStringLiteral(&Result);
129      break;
130
131    case '0': case '1': case '2': case '3': case '4':
132    case '5': case '6': case '7': case '8': case '9':
133      // Parse an unsigned and float literal.
134      consumeNumberLiteral(&Result);
135      break;
136
137    default:
138      if (isAlphanumeric(Code[0])) {
139        // Parse an identifier
140        size_t TokenLength = 1;
141        while (1) {
142          // A code completion location in/immediately after an identifier will
143          // cause the portion of the identifier before the code completion
144          // location to become a code completion token.
145          if (CodeCompletionLocation == Code.data() + TokenLength) {
146            CodeCompletionLocation = nullptr;
147            Result.Kind = TokenInfo::TK_CodeCompletion;
148            Result.Text = Code.substr(0, TokenLength);
149            Code = Code.drop_front(TokenLength);
150            return Result;
151          }
152          if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength]))
153            break;
154          ++TokenLength;
155        }
156        if (TokenLength == 4 && Code.startswith("true")) {
157          Result.Kind = TokenInfo::TK_Literal;
158          Result.Value = true;
159        } else if (TokenLength == 5 && Code.startswith("false")) {
160          Result.Kind = TokenInfo::TK_Literal;
161          Result.Value = false;
162        } else {
163          Result.Kind = TokenInfo::TK_Ident;
164          Result.Text = Code.substr(0, TokenLength);
165        }
166        Code = Code.drop_front(TokenLength);
167      } else {
168        Result.Kind = TokenInfo::TK_InvalidChar;
169        Result.Text = Code.substr(0, 1);
170        Code = Code.drop_front(1);
171      }
172      break;
173    }
174
175    Result.Range.End = currentLocation();
176    return Result;
177  }
178
179  /// \brief Consume an unsigned and float literal.
180  void consumeNumberLiteral(TokenInfo *Result) {
181    bool isFloatingLiteral = false;
182    unsigned Length = 1;
183    if (Code.size() > 1) {
184      // Consume the 'x' or 'b' radix modifier, if present.
185      switch (toLowercase(Code[1])) {
186      case 'x': case 'b': Length = 2;
187      }
188    }
189    while (Length < Code.size() && isHexDigit(Code[Length]))
190      ++Length;
191
192    // Try to recognize a floating point literal.
193    while (Length < Code.size()) {
194      char c = Code[Length];
195      if (c == '-' || c == '+' || c == '.' || isHexDigit(c)) {
196        isFloatingLiteral = true;
197        Length++;
198      } else {
199        break;
200      }
201    }
202
203    Result->Text = Code.substr(0, Length);
204    Code = Code.drop_front(Length);
205
206    if (isFloatingLiteral) {
207      char *end;
208      errno = 0;
209      std::string Text = Result->Text.str();
210      double doubleValue = strtod(Text.c_str(), &end);
211      if (*end == 0 && errno == 0) {
212        Result->Kind = TokenInfo::TK_Literal;
213        Result->Value = doubleValue;
214        return;
215      }
216    } else {
217      unsigned Value;
218      if (!Result->Text.getAsInteger(0, Value)) {
219        Result->Kind = TokenInfo::TK_Literal;
220        Result->Value = Value;
221        return;
222      }
223    }
224
225    SourceRange Range;
226    Range.Start = Result->Range.Start;
227    Range.End = currentLocation();
228    Error->addError(Range, Error->ET_ParserNumberError) << Result->Text;
229    Result->Kind = TokenInfo::TK_Error;
230  }
231
232  /// \brief Consume a string literal.
233  ///
234  /// \c Code must be positioned at the start of the literal (the opening
235  /// quote). Consumed until it finds the same closing quote character.
236  void consumeStringLiteral(TokenInfo *Result) {
237    bool InEscape = false;
238    const char Marker = Code[0];
239    for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
240      if (InEscape) {
241        InEscape = false;
242        continue;
243      }
244      if (Code[Length] == '\\') {
245        InEscape = true;
246        continue;
247      }
248      if (Code[Length] == Marker) {
249        Result->Kind = TokenInfo::TK_Literal;
250        Result->Text = Code.substr(0, Length + 1);
251        Result->Value = Code.substr(1, Length - 1);
252        Code = Code.drop_front(Length + 1);
253        return;
254      }
255    }
256
257    StringRef ErrorText = Code;
258    Code = Code.drop_front(Code.size());
259    SourceRange Range;
260    Range.Start = Result->Range.Start;
261    Range.End = currentLocation();
262    Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
263    Result->Kind = TokenInfo::TK_Error;
264  }
265
266  /// \brief Consume all leading whitespace from \c Code.
267  void consumeWhitespace() {
268    while (!Code.empty() && isWhitespace(Code[0])) {
269      if (Code[0] == '\n') {
270        ++Line;
271        StartOfLine = Code.drop_front();
272      }
273      Code = Code.drop_front();
274    }
275  }
276
277  SourceLocation currentLocation() {
278    SourceLocation Location;
279    Location.Line = Line;
280    Location.Column = Code.data() - StartOfLine.data() + 1;
281    return Location;
282  }
283
284  StringRef Code;
285  StringRef StartOfLine;
286  unsigned Line;
287  Diagnostics *Error;
288  TokenInfo NextToken;
289  const char *CodeCompletionLocation;
290};
291
292Parser::Sema::~Sema() {}
293
294std::vector<ArgKind> Parser::Sema::getAcceptedCompletionTypes(
295    llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
296  return std::vector<ArgKind>();
297}
298
299std::vector<MatcherCompletion>
300Parser::Sema::getMatcherCompletions(llvm::ArrayRef<ArgKind> AcceptedTypes) {
301  return std::vector<MatcherCompletion>();
302}
303
304struct Parser::ScopedContextEntry {
305  Parser *P;
306
307  ScopedContextEntry(Parser *P, MatcherCtor C) : P(P) {
308    P->ContextStack.push_back(std::make_pair(C, 0u));
309  }
310
311  ~ScopedContextEntry() {
312    P->ContextStack.pop_back();
313  }
314
315  void nextArg() {
316    ++P->ContextStack.back().second;
317  }
318};
319
320/// \brief Parse expressions that start with an identifier.
321///
322/// This function can parse named values and matchers.
323/// In case of failure it will try to determine the user's intent to give
324/// an appropriate error message.
325bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) {
326  const TokenInfo NameToken = Tokenizer->consumeNextToken();
327
328  if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) {
329    // Parse as a named value.
330    if (const VariantValue NamedValue =
331            NamedValues ? NamedValues->lookup(NameToken.Text)
332                        : VariantValue()) {
333      *Value = NamedValue;
334      return true;
335    }
336    // If the syntax is correct and the name is not a matcher either, report
337    // unknown named value.
338    if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma ||
339         Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen ||
340         Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) &&
341        !S->lookupMatcherCtor(NameToken.Text)) {
342      Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound)
343          << NameToken.Text;
344      return false;
345    }
346    // Otherwise, fallback to the matcher parser.
347  }
348
349  // Parse as a matcher expression.
350  return parseMatcherExpressionImpl(NameToken, Value);
351}
352
353/// \brief Parse and validate a matcher expression.
354/// \return \c true on success, in which case \c Value has the matcher parsed.
355///   If the input is malformed, or some argument has an error, it
356///   returns \c false.
357bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken,
358                                        VariantValue *Value) {
359  assert(NameToken.Kind == TokenInfo::TK_Ident);
360  const TokenInfo OpenToken = Tokenizer->consumeNextToken();
361  if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
362    Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
363        << OpenToken.Text;
364    return false;
365  }
366
367  llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text);
368
369  if (!Ctor) {
370    Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound)
371        << NameToken.Text;
372    // Do not return here. We need to continue to give completion suggestions.
373  }
374
375  std::vector<ParserValue> Args;
376  TokenInfo EndToken;
377
378  {
379    ScopedContextEntry SCE(this, Ctor ? *Ctor : nullptr);
380
381    while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
382      if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
383        // End of args.
384        EndToken = Tokenizer->consumeNextToken();
385        break;
386      }
387      if (Args.size() > 0) {
388        // We must find a , token to continue.
389        const TokenInfo CommaToken = Tokenizer->consumeNextToken();
390        if (CommaToken.Kind != TokenInfo::TK_Comma) {
391          Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
392              << CommaToken.Text;
393          return false;
394        }
395      }
396
397      Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
398                               NameToken.Text, NameToken.Range,
399                               Args.size() + 1);
400      ParserValue ArgValue;
401      ArgValue.Text = Tokenizer->peekNextToken().Text;
402      ArgValue.Range = Tokenizer->peekNextToken().Range;
403      if (!parseExpressionImpl(&ArgValue.Value)) {
404        return false;
405      }
406
407      Args.push_back(ArgValue);
408      SCE.nextArg();
409    }
410  }
411
412  if (EndToken.Kind == TokenInfo::TK_Eof) {
413    Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
414    return false;
415  }
416
417  std::string BindID;
418  if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
419    // Parse .bind("foo")
420    Tokenizer->consumeNextToken();  // consume the period.
421    const TokenInfo BindToken = Tokenizer->consumeNextToken();
422    if (BindToken.Kind == TokenInfo::TK_CodeCompletion) {
423      addCompletion(BindToken, MatcherCompletion("bind(\"", "bind", 1));
424      return false;
425    }
426
427    const TokenInfo OpenToken = Tokenizer->consumeNextToken();
428    const TokenInfo IDToken = Tokenizer->consumeNextToken();
429    const TokenInfo CloseToken = Tokenizer->consumeNextToken();
430
431    // TODO: We could use different error codes for each/some to be more
432    //       explicit about the syntax error.
433    if (BindToken.Kind != TokenInfo::TK_Ident ||
434        BindToken.Text != TokenInfo::ID_Bind) {
435      Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
436      return false;
437    }
438    if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
439      Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
440      return false;
441    }
442    if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
443      Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
444      return false;
445    }
446    if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
447      Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
448      return false;
449    }
450    BindID = IDToken.Value.getString();
451  }
452
453  if (!Ctor)
454    return false;
455
456  // Merge the start and end infos.
457  Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
458                           NameToken.Text, NameToken.Range);
459  SourceRange MatcherRange = NameToken.Range;
460  MatcherRange.End = EndToken.Range.End;
461  VariantMatcher Result = S->actOnMatcherExpression(
462      *Ctor, MatcherRange, BindID, Args, Error);
463  if (Result.isNull()) return false;
464
465  *Value = Result;
466  return true;
467}
468
469// If the prefix of this completion matches the completion token, add it to
470// Completions minus the prefix.
471void Parser::addCompletion(const TokenInfo &CompToken,
472                           const MatcherCompletion& Completion) {
473  if (StringRef(Completion.TypedText).startswith(CompToken.Text) &&
474      Completion.Specificity > 0) {
475    Completions.emplace_back(Completion.TypedText.substr(CompToken.Text.size()),
476                             Completion.MatcherDecl, Completion.Specificity);
477  }
478}
479
480std::vector<MatcherCompletion> Parser::getNamedValueCompletions(
481    ArrayRef<ArgKind> AcceptedTypes) {
482  if (!NamedValues) return std::vector<MatcherCompletion>();
483  std::vector<MatcherCompletion> Result;
484  for (const auto &Entry : *NamedValues) {
485    unsigned Specificity;
486    if (Entry.getValue().isConvertibleTo(AcceptedTypes, &Specificity)) {
487      std::string Decl =
488          (Entry.getValue().getTypeAsString() + " " + Entry.getKey()).str();
489      Result.emplace_back(Entry.getKey(), Decl, Specificity);
490    }
491  }
492  return Result;
493}
494
495void Parser::addExpressionCompletions() {
496  const TokenInfo CompToken = Tokenizer->consumeNextToken();
497  assert(CompToken.Kind == TokenInfo::TK_CodeCompletion);
498
499  // We cannot complete code if there is an invalid element on the context
500  // stack.
501  for (ContextStackTy::iterator I = ContextStack.begin(),
502                                E = ContextStack.end();
503       I != E; ++I) {
504    if (!I->first)
505      return;
506  }
507
508  auto AcceptedTypes = S->getAcceptedCompletionTypes(ContextStack);
509  for (const auto &Completion : S->getMatcherCompletions(AcceptedTypes)) {
510    addCompletion(CompToken, Completion);
511  }
512
513  for (const auto &Completion : getNamedValueCompletions(AcceptedTypes)) {
514    addCompletion(CompToken, Completion);
515  }
516}
517
518/// \brief Parse an <Expresssion>
519bool Parser::parseExpressionImpl(VariantValue *Value) {
520  switch (Tokenizer->nextTokenKind()) {
521  case TokenInfo::TK_Literal:
522    *Value = Tokenizer->consumeNextToken().Value;
523    return true;
524
525  case TokenInfo::TK_Ident:
526    return parseIdentifierPrefixImpl(Value);
527
528  case TokenInfo::TK_CodeCompletion:
529    addExpressionCompletions();
530    return false;
531
532  case TokenInfo::TK_Eof:
533    Error->addError(Tokenizer->consumeNextToken().Range,
534                    Error->ET_ParserNoCode);
535    return false;
536
537  case TokenInfo::TK_Error:
538    // This error was already reported by the tokenizer.
539    return false;
540
541  case TokenInfo::TK_OpenParen:
542  case TokenInfo::TK_CloseParen:
543  case TokenInfo::TK_Comma:
544  case TokenInfo::TK_Period:
545  case TokenInfo::TK_InvalidChar:
546    const TokenInfo Token = Tokenizer->consumeNextToken();
547    Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
548    return false;
549  }
550
551  llvm_unreachable("Unknown token kind.");
552}
553
554static llvm::ManagedStatic<Parser::RegistrySema> DefaultRegistrySema;
555
556Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
557               const NamedValueMap *NamedValues, Diagnostics *Error)
558    : Tokenizer(Tokenizer), S(S ? S : &*DefaultRegistrySema),
559      NamedValues(NamedValues), Error(Error) {}
560
561Parser::RegistrySema::~RegistrySema() {}
562
563llvm::Optional<MatcherCtor>
564Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) {
565  return Registry::lookupMatcherCtor(MatcherName);
566}
567
568VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
569    MatcherCtor Ctor, SourceRange NameRange, StringRef BindID,
570    ArrayRef<ParserValue> Args, Diagnostics *Error) {
571  if (BindID.empty()) {
572    return Registry::constructMatcher(Ctor, NameRange, Args, Error);
573  } else {
574    return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args,
575                                           Error);
576  }
577}
578
579std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes(
580    ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
581  return Registry::getAcceptedCompletionTypes(Context);
582}
583
584std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions(
585    ArrayRef<ArgKind> AcceptedTypes) {
586  return Registry::getMatcherCompletions(AcceptedTypes);
587}
588
589bool Parser::parseExpression(StringRef Code, Sema *S,
590                             const NamedValueMap *NamedValues,
591                             VariantValue *Value, Diagnostics *Error) {
592  CodeTokenizer Tokenizer(Code, Error);
593  if (!Parser(&Tokenizer, S, NamedValues, Error).parseExpressionImpl(Value))
594    return false;
595  if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
596    Error->addError(Tokenizer.peekNextToken().Range,
597                    Error->ET_ParserTrailingCode);
598    return false;
599  }
600  return true;
601}
602
603std::vector<MatcherCompletion>
604Parser::completeExpression(StringRef Code, unsigned CompletionOffset, Sema *S,
605                           const NamedValueMap *NamedValues) {
606  Diagnostics Error;
607  CodeTokenizer Tokenizer(Code, &Error, CompletionOffset);
608  Parser P(&Tokenizer, S, NamedValues, &Error);
609  VariantValue Dummy;
610  P.parseExpressionImpl(&Dummy);
611
612  // Sort by specificity, then by name.
613  std::sort(P.Completions.begin(), P.Completions.end(),
614            [](const MatcherCompletion &A, const MatcherCompletion &B) {
615    if (A.Specificity != B.Specificity)
616      return A.Specificity > B.Specificity;
617    return A.TypedText < B.TypedText;
618  });
619
620  return P.Completions;
621}
622
623llvm::Optional<DynTypedMatcher>
624Parser::parseMatcherExpression(StringRef Code, Sema *S,
625                               const NamedValueMap *NamedValues,
626                               Diagnostics *Error) {
627  VariantValue Value;
628  if (!parseExpression(Code, S, NamedValues, &Value, Error))
629    return llvm::Optional<DynTypedMatcher>();
630  if (!Value.isMatcher()) {
631    Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
632    return llvm::Optional<DynTypedMatcher>();
633  }
634  llvm::Optional<DynTypedMatcher> Result =
635      Value.getMatcher().getSingleMatcher();
636  if (!Result.hasValue()) {
637    Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
638        << Value.getTypeAsString();
639  }
640  return Result;
641}
642
643}  // namespace dynamic
644}  // namespace ast_matchers
645}  // namespace clang
646