1311118Sdim//===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
2311118Sdim//
3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4353358Sdim// See https://llvm.org/LICENSE.txt for license information.
5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6311118Sdim//
7311118Sdim//===----------------------------------------------------------------------===//
8311118Sdim///
9327952Sdim/// \file
10327952Sdim/// This file defines classes for searching and analyzing source code clones.
11311118Sdim///
12311118Sdim//===----------------------------------------------------------------------===//
13311118Sdim
14311118Sdim#ifndef LLVM_CLANG_AST_CLONEDETECTION_H
15311118Sdim#define LLVM_CLANG_AST_CLONEDETECTION_H
16311118Sdim
17321369Sdim#include "clang/AST/StmtVisitor.h"
18321369Sdim#include "llvm/Support/Regex.h"
19311118Sdim#include <vector>
20311118Sdim
21311118Sdimnamespace clang {
22311118Sdim
23311118Sdimclass Stmt;
24311118Sdimclass Decl;
25311118Sdimclass VarDecl;
26311118Sdimclass ASTContext;
27311118Sdimclass CompoundStmt;
28311118Sdim
29321369Sdim/// Identifies a list of statements.
30321369Sdim///
31311118Sdim/// Can either identify a single arbitrary Stmt object, a continuous sequence of
32311118Sdim/// child statements inside a CompoundStmt or no statements at all.
33311118Sdimclass StmtSequence {
34311118Sdim  /// If this object identifies a sequence of statements inside a CompoundStmt,
35311118Sdim  /// S points to this CompoundStmt. If this object only identifies a single
36311118Sdim  /// Stmt, then S is a pointer to this Stmt.
37311118Sdim  const Stmt *S;
38311118Sdim
39321369Sdim  /// The declaration that contains the statements.
40321369Sdim  const Decl *D;
41311118Sdim
42311118Sdim  /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
43311118Sdim  /// instance is representing the CompoundStmt children inside the array
44311118Sdim  /// [StartIndex, EndIndex).
45311118Sdim  unsigned StartIndex;
46311118Sdim  unsigned EndIndex;
47311118Sdim
48311118Sdimpublic:
49321369Sdim  /// Constructs a StmtSequence holding multiple statements.
50311118Sdim  ///
51311118Sdim  /// The resulting StmtSequence identifies a continuous sequence of statements
52311118Sdim  /// in the body of the given CompoundStmt. Which statements of the body should
53311118Sdim  /// be identified needs to be specified by providing a start and end index
54311118Sdim  /// that describe a non-empty sub-array in the body of the given CompoundStmt.
55311118Sdim  ///
56311118Sdim  /// \param Stmt A CompoundStmt that contains all statements in its body.
57321369Sdim  /// \param D The Decl containing this Stmt.
58311118Sdim  /// \param StartIndex The inclusive start index in the children array of
59311118Sdim  ///                   \p Stmt
60311118Sdim  /// \param EndIndex The exclusive end index in the children array of \p Stmt.
61321369Sdim  StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
62321369Sdim               unsigned EndIndex);
63311118Sdim
64321369Sdim  /// Constructs a StmtSequence holding a single statement.
65311118Sdim  ///
66311118Sdim  /// \param Stmt An arbitrary Stmt.
67321369Sdim  /// \param D The Decl containing this Stmt.
68321369Sdim  StmtSequence(const Stmt *Stmt, const Decl *D);
69311118Sdim
70321369Sdim  /// Constructs an empty StmtSequence.
71311118Sdim  StmtSequence();
72311118Sdim
73311118Sdim  typedef const Stmt *const *iterator;
74311118Sdim
75311118Sdim  /// Returns an iterator pointing to the first statement in this sequence.
76311118Sdim  iterator begin() const;
77311118Sdim
78311118Sdim  /// Returns an iterator pointing behind the last statement in this sequence.
79311118Sdim  iterator end() const;
80311118Sdim
81311118Sdim  /// Returns the first statement in this sequence.
82311118Sdim  ///
83311118Sdim  /// This method should only be called on a non-empty StmtSequence object.
84311118Sdim  const Stmt *front() const {
85311118Sdim    assert(!empty());
86311118Sdim    return begin()[0];
87311118Sdim  }
88311118Sdim
89311118Sdim  /// Returns the last statement in this sequence.
90311118Sdim  ///
91311118Sdim  /// This method should only be called on a non-empty StmtSequence object.
92311118Sdim  const Stmt *back() const {
93311118Sdim    assert(!empty());
94311118Sdim    return begin()[size() - 1];
95311118Sdim  }
96311118Sdim
97311118Sdim  /// Returns the number of statements this object holds.
98311118Sdim  unsigned size() const {
99311118Sdim    if (holdsSequence())
100311118Sdim      return EndIndex - StartIndex;
101311118Sdim    if (S == nullptr)
102311118Sdim      return 0;
103311118Sdim    return 1;
104311118Sdim  }
105311118Sdim
106311118Sdim  /// Returns true if and only if this StmtSequence contains no statements.
107311118Sdim  bool empty() const { return size() == 0; }
108311118Sdim
109311118Sdim  /// Returns the related ASTContext for the stored Stmts.
110321369Sdim  ASTContext &getASTContext() const;
111321369Sdim
112321369Sdim  /// Returns the declaration that contains the stored Stmts.
113321369Sdim  const Decl *getContainingDecl() const {
114321369Sdim    assert(D);
115321369Sdim    return D;
116311118Sdim  }
117311118Sdim
118311118Sdim  /// Returns true if this objects holds a list of statements.
119311118Sdim  bool holdsSequence() const { return EndIndex != 0; }
120311118Sdim
121311118Sdim  /// Returns the start sourcelocation of the first statement in this sequence.
122311118Sdim  ///
123311118Sdim  /// This method should only be called on a non-empty StmtSequence object.
124341825Sdim  SourceLocation getBeginLoc() const;
125311118Sdim
126311118Sdim  /// Returns the end sourcelocation of the last statement in this sequence.
127311118Sdim  ///
128311118Sdim  /// This method should only be called on a non-empty StmtSequence object.
129311118Sdim  SourceLocation getEndLoc() const;
130311118Sdim
131311118Sdim  /// Returns the source range of the whole sequence - from the beginning
132311118Sdim  /// of the first statement to the end of the last statement.
133311118Sdim  SourceRange getSourceRange() const;
134311118Sdim
135311118Sdim  bool operator==(const StmtSequence &Other) const {
136311118Sdim    return std::tie(S, StartIndex, EndIndex) ==
137311118Sdim           std::tie(Other.S, Other.StartIndex, Other.EndIndex);
138311118Sdim  }
139311118Sdim
140311118Sdim  bool operator!=(const StmtSequence &Other) const {
141311118Sdim    return std::tie(S, StartIndex, EndIndex) !=
142311118Sdim           std::tie(Other.S, Other.StartIndex, Other.EndIndex);
143311118Sdim  }
144311118Sdim
145311118Sdim  /// Returns true if and only if this sequence covers a source range that
146311118Sdim  /// contains the source range of the given sequence \p Other.
147311118Sdim  ///
148311118Sdim  /// This method should only be called on a non-empty StmtSequence object
149311118Sdim  /// and passed a non-empty StmtSequence object.
150311118Sdim  bool contains(const StmtSequence &Other) const;
151311118Sdim};
152311118Sdim
153321369Sdim/// Searches for similar subtrees in the AST.
154311118Sdim///
155321369Sdim/// First, this class needs several declarations with statement bodies which
156321369Sdim/// can be passed via analyzeCodeBody. Afterwards all statements can be
157321369Sdim/// searched for clones by calling findClones with a given list of constraints
158321369Sdim/// that should specify the wanted properties of the clones.
159311118Sdim///
160321369Sdim/// The result of findClones can be further constrained with the constrainClones
161321369Sdim/// method.
162321369Sdim///
163341825Sdim/// This class only searches for clones in executable source code
164311118Sdim/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
165311118Sdim/// are not supported.
166311118Sdimclass CloneDetector {
167321369Sdim
168311118Sdimpublic:
169321369Sdim  /// A collection of StmtSequences that share an arbitrary property.
170321369Sdim  typedef llvm::SmallVector<StmtSequence, 8> CloneGroup;
171311118Sdim
172321369Sdim  /// Generates and stores search data for all statements in the body of
173321369Sdim  /// the given Decl.
174321369Sdim  void analyzeCodeBody(const Decl *D);
175311118Sdim
176321369Sdim  /// Constrains the given list of clone groups with the given constraint.
177321369Sdim  ///
178321369Sdim  /// The constraint is expected to have a method with the signature
179321369Sdim  ///     `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
180321369Sdim  /// as this is the interface that the CloneDetector uses for applying the
181321369Sdim  /// constraint. The constraint is supposed to directly modify the passed list
182321369Sdim  /// so that all clones in the list fulfill the specific property this
183321369Sdim  /// constraint ensures.
184321369Sdim  template <typename T>
185321369Sdim  static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
186321369Sdim    C.constrain(CloneGroups);
187321369Sdim  }
188311118Sdim
189321369Sdim  /// Constrains the given list of clone groups with the given list of
190321369Sdim  /// constraints.
191321369Sdim  ///
192321369Sdim  /// The constraints are applied in sequence in the order in which they are
193321369Sdim  /// passed to this function.
194321369Sdim  template <typename T1, typename... Ts>
195321369Sdim  static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
196321369Sdim                              Ts... ConstraintList) {
197321369Sdim    constrainClones(CloneGroups, C);
198321369Sdim    constrainClones(CloneGroups, ConstraintList...);
199321369Sdim  }
200311118Sdim
201321369Sdim  /// Searches for clones in all previously passed statements.
202321369Sdim  /// \param Result Output parameter to which all created clone groups are
203321369Sdim  ///               added.
204321369Sdim  /// \param ConstraintList The constraints that should be applied to the
205321369Sdim  //         result.
206321369Sdim  template <typename... Ts>
207321369Sdim  void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
208321369Sdim    // The initial assumption is that there is only one clone group and every
209321369Sdim    // statement is a clone of the others. This clone group will then be
210321369Sdim    // split up with the help of the constraints.
211321369Sdim    CloneGroup AllClones;
212321369Sdim    AllClones.reserve(Sequences.size());
213321369Sdim    for (const auto &C : Sequences) {
214321369Sdim      AllClones.push_back(C);
215321369Sdim    }
216311118Sdim
217321369Sdim    Result.push_back(AllClones);
218311118Sdim
219321369Sdim    constrainClones(Result, ConstraintList...);
220321369Sdim  }
221311118Sdim
222321369Sdimprivate:
223321369Sdim  CloneGroup Sequences;
224321369Sdim};
225311118Sdim
226321369Sdim/// This class is a utility class that contains utility functions for building
227321369Sdim/// custom constraints.
228321369Sdimclass CloneConstraint {
229321369Sdimpublic:
230321369Sdim  /// Removes all groups by using a filter function.
231321369Sdim  /// \param CloneGroups The list of CloneGroups that is supposed to be
232321369Sdim  ///                    filtered.
233321369Sdim  /// \param Filter The filter function that should return true for all groups
234321369Sdim  ///               that should be removed from the list.
235327952Sdim  static void filterGroups(
236327952Sdim      std::vector<CloneDetector::CloneGroup> &CloneGroups,
237327952Sdim      llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
238321369Sdim    CloneGroups.erase(
239321369Sdim        std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter),
240321369Sdim        CloneGroups.end());
241321369Sdim  }
242311118Sdim
243321369Sdim  /// Splits the given CloneGroups until the given Compare function returns true
244321369Sdim  /// for all clones in a single group.
245321369Sdim  /// \param CloneGroups A list of CloneGroups that should be modified.
246321369Sdim  /// \param Compare The comparison function that all clones are supposed to
247321369Sdim  ///                pass. Should return true if and only if two clones belong
248321369Sdim  ///                to the same CloneGroup.
249321369Sdim  static void splitCloneGroups(
250321369Sdim      std::vector<CloneDetector::CloneGroup> &CloneGroups,
251327952Sdim      llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
252327952Sdim          Compare);
253321369Sdim};
254311118Sdim
255327952Sdim/// This constraint moves clones into clone groups of type II via hashing.
256327952Sdim///
257327952Sdim/// Clones with different hash values are moved into separate clone groups.
258327952Sdim/// Collisions are possible, and this constraint does nothing to address this
259327952Sdim/// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
260327952Sdim/// constraint chain, not necessarily immediately, to eliminate hash collisions
261327952Sdim/// through a more detailed analysis.
262327952Sdimclass RecursiveCloneTypeIIHashConstraint {
263327952Sdimpublic:
264327952Sdim  void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
265327952Sdim};
266311118Sdim
267327952Sdim/// This constraint moves clones into clone groups of type II by comparing them.
268327952Sdim///
269327952Sdim/// Clones that aren't type II clones are moved into separate clone groups.
270327952Sdim/// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
271327952Sdim/// group are guaranteed to be be type II clones of each other, but it is too
272327952Sdim/// slow to efficiently handle large amounts of clones.
273327952Sdimclass RecursiveCloneTypeIIVerifyConstraint {
274321369Sdimpublic:
275321369Sdim  void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
276321369Sdim};
277321369Sdim
278321369Sdim/// Ensures that every clone has at least the given complexity.
279321369Sdim///
280321369Sdim/// Complexity is here defined as the total amount of children of a statement.
281321369Sdim/// This constraint assumes the first statement in the group is representative
282321369Sdim/// for all other statements in the group in terms of complexity.
283321369Sdimclass MinComplexityConstraint {
284321369Sdim  unsigned MinComplexity;
285321369Sdim
286321369Sdimpublic:
287321369Sdim  MinComplexityConstraint(unsigned MinComplexity)
288321369Sdim      : MinComplexity(MinComplexity) {}
289321369Sdim
290327952Sdim  /// Calculates the complexity of the given StmtSequence.
291327952Sdim  /// \param Limit The limit of complexity we probe for. After reaching
292327952Sdim  ///              this limit during calculation, this method is exiting
293327952Sdim  ///              early to improve performance and returns this limit.
294327952Sdim  size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
295321369Sdim                                 const std::string &ParentMacroStack = "");
296321369Sdim
297321369Sdim  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
298321369Sdim    CloneConstraint::filterGroups(
299321369Sdim        CloneGroups, [this](const CloneDetector::CloneGroup &A) {
300321369Sdim          if (!A.empty())
301327952Sdim            return calculateStmtComplexity(A.front(), MinComplexity) <
302327952Sdim                   MinComplexity;
303321369Sdim          else
304321369Sdim            return false;
305321369Sdim        });
306321369Sdim  }
307321369Sdim};
308321369Sdim
309321369Sdim/// Ensures that all clone groups contain at least the given amount of clones.
310321369Sdimclass MinGroupSizeConstraint {
311321369Sdim  unsigned MinGroupSize;
312321369Sdim
313321369Sdimpublic:
314321369Sdim  MinGroupSizeConstraint(unsigned MinGroupSize = 2)
315321369Sdim      : MinGroupSize(MinGroupSize) {}
316321369Sdim
317321369Sdim  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
318321369Sdim    CloneConstraint::filterGroups(CloneGroups,
319321369Sdim                                  [this](const CloneDetector::CloneGroup &A) {
320321369Sdim                                    return A.size() < MinGroupSize;
321321369Sdim                                  });
322321369Sdim  }
323321369Sdim};
324321369Sdim
325321369Sdim/// Ensures that no clone group fully contains another clone group.
326321369Sdimstruct OnlyLargestCloneConstraint {
327321369Sdim  void constrain(std::vector<CloneDetector::CloneGroup> &Result);
328321369Sdim};
329321369Sdim
330321369Sdimstruct FilenamePatternConstraint {
331321369Sdim  StringRef IgnoredFilesPattern;
332321369Sdim  std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
333321369Sdim
334341825Sdim  FilenamePatternConstraint(StringRef IgnoredFilesPattern)
335321369Sdim      : IgnoredFilesPattern(IgnoredFilesPattern) {
336321369Sdim    IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
337321369Sdim        IgnoredFilesPattern.str() + "$)");
338321369Sdim  }
339321369Sdim
340321369Sdim  bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
341321369Sdim
342321369Sdim  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
343321369Sdim    CloneConstraint::filterGroups(
344321369Sdim        CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
345321369Sdim          return isAutoGenerated(Group);
346321369Sdim        });
347321369Sdim  }
348321369Sdim};
349321369Sdim
350321369Sdim/// Analyzes the pattern of the referenced variables in a statement.
351321369Sdimclass VariablePattern {
352321369Sdim
353341825Sdim  /// Describes an occurrence of a variable reference in a statement.
354321369Sdim  struct VariableOccurence {
355321369Sdim    /// The index of the associated VarDecl in the Variables vector.
356321369Sdim    size_t KindID;
357321369Sdim    /// The statement in the code where the variable was referenced.
358321369Sdim    const Stmt *Mention;
359321369Sdim
360321369Sdim    VariableOccurence(size_t KindID, const Stmt *Mention)
361321369Sdim        : KindID(KindID), Mention(Mention) {}
362321369Sdim  };
363321369Sdim
364341825Sdim  /// All occurrences of referenced variables in the order of appearance.
365321369Sdim  std::vector<VariableOccurence> Occurences;
366321369Sdim  /// List of referenced variables in the order of appearance.
367321369Sdim  /// Every item in this list is unique.
368321369Sdim  std::vector<const VarDecl *> Variables;
369321369Sdim
370321369Sdim  /// Adds a new variable referenced to this pattern.
371321369Sdim  /// \param VarDecl The declaration of the variable that is referenced.
372321369Sdim  /// \param Mention The SourceRange where this variable is referenced.
373321369Sdim  void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
374321369Sdim
375321369Sdim  /// Adds each referenced variable from the given statement.
376321369Sdim  void addVariables(const Stmt *S);
377321369Sdim
378321369Sdimpublic:
379321369Sdim  /// Creates an VariablePattern object with information about the given
380321369Sdim  /// StmtSequence.
381321369Sdim  VariablePattern(const StmtSequence &Sequence) {
382321369Sdim    for (const Stmt *S : Sequence)
383321369Sdim      addVariables(S);
384321369Sdim  }
385321369Sdim
386321369Sdim  /// Describes two clones that reference their variables in a different pattern
387321369Sdim  /// which could indicate a programming error.
388311118Sdim  struct SuspiciousClonePair {
389321369Sdim    /// Utility class holding the relevant information about a single
390321369Sdim    /// clone in this pair.
391311118Sdim    struct SuspiciousCloneInfo {
392311118Sdim      /// The variable which referencing in this clone was against the pattern.
393311118Sdim      const VarDecl *Variable;
394311118Sdim      /// Where the variable was referenced.
395311118Sdim      const Stmt *Mention;
396311118Sdim      /// The variable that should have been referenced to follow the pattern.
397311118Sdim      /// If Suggestion is a nullptr then it's not possible to fix the pattern
398311118Sdim      /// by referencing a different variable in this clone.
399311118Sdim      const VarDecl *Suggestion;
400311118Sdim      SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
401311118Sdim                          const VarDecl *Suggestion)
402311118Sdim          : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
403311118Sdim      SuspiciousCloneInfo() {}
404311118Sdim    };
405311118Sdim    /// The first clone in the pair which always has a suggested variable.
406311118Sdim    SuspiciousCloneInfo FirstCloneInfo;
407311118Sdim    /// This other clone in the pair which can have a suggested variable.
408311118Sdim    SuspiciousCloneInfo SecondCloneInfo;
409311118Sdim  };
410311118Sdim
411321369Sdim  /// Counts the differences between this pattern and the given one.
412321369Sdim  /// \param Other The given VariablePattern to compare with.
413321369Sdim  /// \param FirstMismatch Output parameter that will be filled with information
414321369Sdim  ///        about the first difference between the two patterns. This parameter
415321369Sdim  ///        can be a nullptr, in which case it will be ignored.
416321369Sdim  /// \return Returns the number of differences between the pattern this object
417321369Sdim  ///         is following and the given VariablePattern.
418321369Sdim  ///
419321369Sdim  /// For example, the following statements all have the same pattern and this
420321369Sdim  /// function would return zero:
421321369Sdim  ///
422321369Sdim  ///   if (a < b) return a; return b;
423321369Sdim  ///   if (x < y) return x; return y;
424321369Sdim  ///   if (u2 < u1) return u2; return u1;
425321369Sdim  ///
426321369Sdim  /// But the following statement has a different pattern (note the changed
427321369Sdim  /// variables in the return statements) and would have two differences when
428321369Sdim  /// compared with one of the statements above.
429321369Sdim  ///
430321369Sdim  ///   if (a < b) return b; return a;
431321369Sdim  ///
432321369Sdim  /// This function should only be called if the related statements of the given
433321369Sdim  /// pattern and the statements of this objects are clones of each other.
434321369Sdim  unsigned countPatternDifferences(
435321369Sdim      const VariablePattern &Other,
436321369Sdim      VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
437321369Sdim};
438311118Sdim
439321369Sdim/// Ensures that all clones reference variables in the same pattern.
440321369Sdimstruct MatchingVariablePatternConstraint {
441321369Sdim  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
442311118Sdim};
443311118Sdim
444311118Sdim} // end namespace clang
445311118Sdim
446311118Sdim#endif // LLVM_CLANG_AST_CLONEDETECTION_H
447