1311118Sdim//===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===// 2311118Sdim// 3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4353358Sdim// See https://llvm.org/LICENSE.txt for license information. 5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6311118Sdim// 7311118Sdim//===----------------------------------------------------------------------===// 8311118Sdim/// 9327952Sdim/// \file 10327952Sdim/// This file defines classes for searching and analyzing source code clones. 11311118Sdim/// 12311118Sdim//===----------------------------------------------------------------------===// 13311118Sdim 14311118Sdim#ifndef LLVM_CLANG_AST_CLONEDETECTION_H 15311118Sdim#define LLVM_CLANG_AST_CLONEDETECTION_H 16311118Sdim 17321369Sdim#include "clang/AST/StmtVisitor.h" 18321369Sdim#include "llvm/Support/Regex.h" 19311118Sdim#include <vector> 20311118Sdim 21311118Sdimnamespace clang { 22311118Sdim 23311118Sdimclass Stmt; 24311118Sdimclass Decl; 25311118Sdimclass VarDecl; 26311118Sdimclass ASTContext; 27311118Sdimclass CompoundStmt; 28311118Sdim 29321369Sdim/// Identifies a list of statements. 30321369Sdim/// 31311118Sdim/// Can either identify a single arbitrary Stmt object, a continuous sequence of 32311118Sdim/// child statements inside a CompoundStmt or no statements at all. 33311118Sdimclass StmtSequence { 34311118Sdim /// If this object identifies a sequence of statements inside a CompoundStmt, 35311118Sdim /// S points to this CompoundStmt. If this object only identifies a single 36311118Sdim /// Stmt, then S is a pointer to this Stmt. 37311118Sdim const Stmt *S; 38311118Sdim 39321369Sdim /// The declaration that contains the statements. 40321369Sdim const Decl *D; 41311118Sdim 42311118Sdim /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence 43311118Sdim /// instance is representing the CompoundStmt children inside the array 44311118Sdim /// [StartIndex, EndIndex). 45311118Sdim unsigned StartIndex; 46311118Sdim unsigned EndIndex; 47311118Sdim 48311118Sdimpublic: 49321369Sdim /// Constructs a StmtSequence holding multiple statements. 50311118Sdim /// 51311118Sdim /// The resulting StmtSequence identifies a continuous sequence of statements 52311118Sdim /// in the body of the given CompoundStmt. Which statements of the body should 53311118Sdim /// be identified needs to be specified by providing a start and end index 54311118Sdim /// that describe a non-empty sub-array in the body of the given CompoundStmt. 55311118Sdim /// 56311118Sdim /// \param Stmt A CompoundStmt that contains all statements in its body. 57321369Sdim /// \param D The Decl containing this Stmt. 58311118Sdim /// \param StartIndex The inclusive start index in the children array of 59311118Sdim /// \p Stmt 60311118Sdim /// \param EndIndex The exclusive end index in the children array of \p Stmt. 61321369Sdim StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex, 62321369Sdim unsigned EndIndex); 63311118Sdim 64321369Sdim /// Constructs a StmtSequence holding a single statement. 65311118Sdim /// 66311118Sdim /// \param Stmt An arbitrary Stmt. 67321369Sdim /// \param D The Decl containing this Stmt. 68321369Sdim StmtSequence(const Stmt *Stmt, const Decl *D); 69311118Sdim 70321369Sdim /// Constructs an empty StmtSequence. 71311118Sdim StmtSequence(); 72311118Sdim 73311118Sdim typedef const Stmt *const *iterator; 74311118Sdim 75311118Sdim /// Returns an iterator pointing to the first statement in this sequence. 76311118Sdim iterator begin() const; 77311118Sdim 78311118Sdim /// Returns an iterator pointing behind the last statement in this sequence. 79311118Sdim iterator end() const; 80311118Sdim 81311118Sdim /// Returns the first statement in this sequence. 82311118Sdim /// 83311118Sdim /// This method should only be called on a non-empty StmtSequence object. 84311118Sdim const Stmt *front() const { 85311118Sdim assert(!empty()); 86311118Sdim return begin()[0]; 87311118Sdim } 88311118Sdim 89311118Sdim /// Returns the last statement in this sequence. 90311118Sdim /// 91311118Sdim /// This method should only be called on a non-empty StmtSequence object. 92311118Sdim const Stmt *back() const { 93311118Sdim assert(!empty()); 94311118Sdim return begin()[size() - 1]; 95311118Sdim } 96311118Sdim 97311118Sdim /// Returns the number of statements this object holds. 98311118Sdim unsigned size() const { 99311118Sdim if (holdsSequence()) 100311118Sdim return EndIndex - StartIndex; 101311118Sdim if (S == nullptr) 102311118Sdim return 0; 103311118Sdim return 1; 104311118Sdim } 105311118Sdim 106311118Sdim /// Returns true if and only if this StmtSequence contains no statements. 107311118Sdim bool empty() const { return size() == 0; } 108311118Sdim 109311118Sdim /// Returns the related ASTContext for the stored Stmts. 110321369Sdim ASTContext &getASTContext() const; 111321369Sdim 112321369Sdim /// Returns the declaration that contains the stored Stmts. 113321369Sdim const Decl *getContainingDecl() const { 114321369Sdim assert(D); 115321369Sdim return D; 116311118Sdim } 117311118Sdim 118311118Sdim /// Returns true if this objects holds a list of statements. 119311118Sdim bool holdsSequence() const { return EndIndex != 0; } 120311118Sdim 121311118Sdim /// Returns the start sourcelocation of the first statement in this sequence. 122311118Sdim /// 123311118Sdim /// This method should only be called on a non-empty StmtSequence object. 124341825Sdim SourceLocation getBeginLoc() const; 125311118Sdim 126311118Sdim /// Returns the end sourcelocation of the last statement in this sequence. 127311118Sdim /// 128311118Sdim /// This method should only be called on a non-empty StmtSequence object. 129311118Sdim SourceLocation getEndLoc() const; 130311118Sdim 131311118Sdim /// Returns the source range of the whole sequence - from the beginning 132311118Sdim /// of the first statement to the end of the last statement. 133311118Sdim SourceRange getSourceRange() const; 134311118Sdim 135311118Sdim bool operator==(const StmtSequence &Other) const { 136311118Sdim return std::tie(S, StartIndex, EndIndex) == 137311118Sdim std::tie(Other.S, Other.StartIndex, Other.EndIndex); 138311118Sdim } 139311118Sdim 140311118Sdim bool operator!=(const StmtSequence &Other) const { 141311118Sdim return std::tie(S, StartIndex, EndIndex) != 142311118Sdim std::tie(Other.S, Other.StartIndex, Other.EndIndex); 143311118Sdim } 144311118Sdim 145311118Sdim /// Returns true if and only if this sequence covers a source range that 146311118Sdim /// contains the source range of the given sequence \p Other. 147311118Sdim /// 148311118Sdim /// This method should only be called on a non-empty StmtSequence object 149311118Sdim /// and passed a non-empty StmtSequence object. 150311118Sdim bool contains(const StmtSequence &Other) const; 151311118Sdim}; 152311118Sdim 153321369Sdim/// Searches for similar subtrees in the AST. 154311118Sdim/// 155321369Sdim/// First, this class needs several declarations with statement bodies which 156321369Sdim/// can be passed via analyzeCodeBody. Afterwards all statements can be 157321369Sdim/// searched for clones by calling findClones with a given list of constraints 158321369Sdim/// that should specify the wanted properties of the clones. 159311118Sdim/// 160321369Sdim/// The result of findClones can be further constrained with the constrainClones 161321369Sdim/// method. 162321369Sdim/// 163341825Sdim/// This class only searches for clones in executable source code 164311118Sdim/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations) 165311118Sdim/// are not supported. 166311118Sdimclass CloneDetector { 167321369Sdim 168311118Sdimpublic: 169321369Sdim /// A collection of StmtSequences that share an arbitrary property. 170321369Sdim typedef llvm::SmallVector<StmtSequence, 8> CloneGroup; 171311118Sdim 172321369Sdim /// Generates and stores search data for all statements in the body of 173321369Sdim /// the given Decl. 174321369Sdim void analyzeCodeBody(const Decl *D); 175311118Sdim 176321369Sdim /// Constrains the given list of clone groups with the given constraint. 177321369Sdim /// 178321369Sdim /// The constraint is expected to have a method with the signature 179321369Sdim /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)` 180321369Sdim /// as this is the interface that the CloneDetector uses for applying the 181321369Sdim /// constraint. The constraint is supposed to directly modify the passed list 182321369Sdim /// so that all clones in the list fulfill the specific property this 183321369Sdim /// constraint ensures. 184321369Sdim template <typename T> 185321369Sdim static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) { 186321369Sdim C.constrain(CloneGroups); 187321369Sdim } 188311118Sdim 189321369Sdim /// Constrains the given list of clone groups with the given list of 190321369Sdim /// constraints. 191321369Sdim /// 192321369Sdim /// The constraints are applied in sequence in the order in which they are 193321369Sdim /// passed to this function. 194321369Sdim template <typename T1, typename... Ts> 195321369Sdim static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C, 196321369Sdim Ts... ConstraintList) { 197321369Sdim constrainClones(CloneGroups, C); 198321369Sdim constrainClones(CloneGroups, ConstraintList...); 199321369Sdim } 200311118Sdim 201321369Sdim /// Searches for clones in all previously passed statements. 202321369Sdim /// \param Result Output parameter to which all created clone groups are 203321369Sdim /// added. 204321369Sdim /// \param ConstraintList The constraints that should be applied to the 205321369Sdim // result. 206321369Sdim template <typename... Ts> 207321369Sdim void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) { 208321369Sdim // The initial assumption is that there is only one clone group and every 209321369Sdim // statement is a clone of the others. This clone group will then be 210321369Sdim // split up with the help of the constraints. 211321369Sdim CloneGroup AllClones; 212321369Sdim AllClones.reserve(Sequences.size()); 213321369Sdim for (const auto &C : Sequences) { 214321369Sdim AllClones.push_back(C); 215321369Sdim } 216311118Sdim 217321369Sdim Result.push_back(AllClones); 218311118Sdim 219321369Sdim constrainClones(Result, ConstraintList...); 220321369Sdim } 221311118Sdim 222321369Sdimprivate: 223321369Sdim CloneGroup Sequences; 224321369Sdim}; 225311118Sdim 226321369Sdim/// This class is a utility class that contains utility functions for building 227321369Sdim/// custom constraints. 228321369Sdimclass CloneConstraint { 229321369Sdimpublic: 230321369Sdim /// Removes all groups by using a filter function. 231321369Sdim /// \param CloneGroups The list of CloneGroups that is supposed to be 232321369Sdim /// filtered. 233321369Sdim /// \param Filter The filter function that should return true for all groups 234321369Sdim /// that should be removed from the list. 235327952Sdim static void filterGroups( 236327952Sdim std::vector<CloneDetector::CloneGroup> &CloneGroups, 237327952Sdim llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) { 238321369Sdim CloneGroups.erase( 239321369Sdim std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter), 240321369Sdim CloneGroups.end()); 241321369Sdim } 242311118Sdim 243321369Sdim /// Splits the given CloneGroups until the given Compare function returns true 244321369Sdim /// for all clones in a single group. 245321369Sdim /// \param CloneGroups A list of CloneGroups that should be modified. 246321369Sdim /// \param Compare The comparison function that all clones are supposed to 247321369Sdim /// pass. Should return true if and only if two clones belong 248321369Sdim /// to the same CloneGroup. 249321369Sdim static void splitCloneGroups( 250321369Sdim std::vector<CloneDetector::CloneGroup> &CloneGroups, 251327952Sdim llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)> 252327952Sdim Compare); 253321369Sdim}; 254311118Sdim 255327952Sdim/// This constraint moves clones into clone groups of type II via hashing. 256327952Sdim/// 257327952Sdim/// Clones with different hash values are moved into separate clone groups. 258327952Sdim/// Collisions are possible, and this constraint does nothing to address this 259327952Sdim/// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the 260327952Sdim/// constraint chain, not necessarily immediately, to eliminate hash collisions 261327952Sdim/// through a more detailed analysis. 262327952Sdimclass RecursiveCloneTypeIIHashConstraint { 263327952Sdimpublic: 264327952Sdim void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); 265327952Sdim}; 266311118Sdim 267327952Sdim/// This constraint moves clones into clone groups of type II by comparing them. 268327952Sdim/// 269327952Sdim/// Clones that aren't type II clones are moved into separate clone groups. 270327952Sdim/// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone 271327952Sdim/// group are guaranteed to be be type II clones of each other, but it is too 272327952Sdim/// slow to efficiently handle large amounts of clones. 273327952Sdimclass RecursiveCloneTypeIIVerifyConstraint { 274321369Sdimpublic: 275321369Sdim void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); 276321369Sdim}; 277321369Sdim 278321369Sdim/// Ensures that every clone has at least the given complexity. 279321369Sdim/// 280321369Sdim/// Complexity is here defined as the total amount of children of a statement. 281321369Sdim/// This constraint assumes the first statement in the group is representative 282321369Sdim/// for all other statements in the group in terms of complexity. 283321369Sdimclass MinComplexityConstraint { 284321369Sdim unsigned MinComplexity; 285321369Sdim 286321369Sdimpublic: 287321369Sdim MinComplexityConstraint(unsigned MinComplexity) 288321369Sdim : MinComplexity(MinComplexity) {} 289321369Sdim 290327952Sdim /// Calculates the complexity of the given StmtSequence. 291327952Sdim /// \param Limit The limit of complexity we probe for. After reaching 292327952Sdim /// this limit during calculation, this method is exiting 293327952Sdim /// early to improve performance and returns this limit. 294327952Sdim size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit, 295321369Sdim const std::string &ParentMacroStack = ""); 296321369Sdim 297321369Sdim void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 298321369Sdim CloneConstraint::filterGroups( 299321369Sdim CloneGroups, [this](const CloneDetector::CloneGroup &A) { 300321369Sdim if (!A.empty()) 301327952Sdim return calculateStmtComplexity(A.front(), MinComplexity) < 302327952Sdim MinComplexity; 303321369Sdim else 304321369Sdim return false; 305321369Sdim }); 306321369Sdim } 307321369Sdim}; 308321369Sdim 309321369Sdim/// Ensures that all clone groups contain at least the given amount of clones. 310321369Sdimclass MinGroupSizeConstraint { 311321369Sdim unsigned MinGroupSize; 312321369Sdim 313321369Sdimpublic: 314321369Sdim MinGroupSizeConstraint(unsigned MinGroupSize = 2) 315321369Sdim : MinGroupSize(MinGroupSize) {} 316321369Sdim 317321369Sdim void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 318321369Sdim CloneConstraint::filterGroups(CloneGroups, 319321369Sdim [this](const CloneDetector::CloneGroup &A) { 320321369Sdim return A.size() < MinGroupSize; 321321369Sdim }); 322321369Sdim } 323321369Sdim}; 324321369Sdim 325321369Sdim/// Ensures that no clone group fully contains another clone group. 326321369Sdimstruct OnlyLargestCloneConstraint { 327321369Sdim void constrain(std::vector<CloneDetector::CloneGroup> &Result); 328321369Sdim}; 329321369Sdim 330321369Sdimstruct FilenamePatternConstraint { 331321369Sdim StringRef IgnoredFilesPattern; 332321369Sdim std::shared_ptr<llvm::Regex> IgnoredFilesRegex; 333321369Sdim 334341825Sdim FilenamePatternConstraint(StringRef IgnoredFilesPattern) 335321369Sdim : IgnoredFilesPattern(IgnoredFilesPattern) { 336321369Sdim IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" + 337321369Sdim IgnoredFilesPattern.str() + "$)"); 338321369Sdim } 339321369Sdim 340321369Sdim bool isAutoGenerated(const CloneDetector::CloneGroup &Group); 341321369Sdim 342321369Sdim void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 343321369Sdim CloneConstraint::filterGroups( 344321369Sdim CloneGroups, [this](const CloneDetector::CloneGroup &Group) { 345321369Sdim return isAutoGenerated(Group); 346321369Sdim }); 347321369Sdim } 348321369Sdim}; 349321369Sdim 350321369Sdim/// Analyzes the pattern of the referenced variables in a statement. 351321369Sdimclass VariablePattern { 352321369Sdim 353341825Sdim /// Describes an occurrence of a variable reference in a statement. 354321369Sdim struct VariableOccurence { 355321369Sdim /// The index of the associated VarDecl in the Variables vector. 356321369Sdim size_t KindID; 357321369Sdim /// The statement in the code where the variable was referenced. 358321369Sdim const Stmt *Mention; 359321369Sdim 360321369Sdim VariableOccurence(size_t KindID, const Stmt *Mention) 361321369Sdim : KindID(KindID), Mention(Mention) {} 362321369Sdim }; 363321369Sdim 364341825Sdim /// All occurrences of referenced variables in the order of appearance. 365321369Sdim std::vector<VariableOccurence> Occurences; 366321369Sdim /// List of referenced variables in the order of appearance. 367321369Sdim /// Every item in this list is unique. 368321369Sdim std::vector<const VarDecl *> Variables; 369321369Sdim 370321369Sdim /// Adds a new variable referenced to this pattern. 371321369Sdim /// \param VarDecl The declaration of the variable that is referenced. 372321369Sdim /// \param Mention The SourceRange where this variable is referenced. 373321369Sdim void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention); 374321369Sdim 375321369Sdim /// Adds each referenced variable from the given statement. 376321369Sdim void addVariables(const Stmt *S); 377321369Sdim 378321369Sdimpublic: 379321369Sdim /// Creates an VariablePattern object with information about the given 380321369Sdim /// StmtSequence. 381321369Sdim VariablePattern(const StmtSequence &Sequence) { 382321369Sdim for (const Stmt *S : Sequence) 383321369Sdim addVariables(S); 384321369Sdim } 385321369Sdim 386321369Sdim /// Describes two clones that reference their variables in a different pattern 387321369Sdim /// which could indicate a programming error. 388311118Sdim struct SuspiciousClonePair { 389321369Sdim /// Utility class holding the relevant information about a single 390321369Sdim /// clone in this pair. 391311118Sdim struct SuspiciousCloneInfo { 392311118Sdim /// The variable which referencing in this clone was against the pattern. 393311118Sdim const VarDecl *Variable; 394311118Sdim /// Where the variable was referenced. 395311118Sdim const Stmt *Mention; 396311118Sdim /// The variable that should have been referenced to follow the pattern. 397311118Sdim /// If Suggestion is a nullptr then it's not possible to fix the pattern 398311118Sdim /// by referencing a different variable in this clone. 399311118Sdim const VarDecl *Suggestion; 400311118Sdim SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention, 401311118Sdim const VarDecl *Suggestion) 402311118Sdim : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {} 403311118Sdim SuspiciousCloneInfo() {} 404311118Sdim }; 405311118Sdim /// The first clone in the pair which always has a suggested variable. 406311118Sdim SuspiciousCloneInfo FirstCloneInfo; 407311118Sdim /// This other clone in the pair which can have a suggested variable. 408311118Sdim SuspiciousCloneInfo SecondCloneInfo; 409311118Sdim }; 410311118Sdim 411321369Sdim /// Counts the differences between this pattern and the given one. 412321369Sdim /// \param Other The given VariablePattern to compare with. 413321369Sdim /// \param FirstMismatch Output parameter that will be filled with information 414321369Sdim /// about the first difference between the two patterns. This parameter 415321369Sdim /// can be a nullptr, in which case it will be ignored. 416321369Sdim /// \return Returns the number of differences between the pattern this object 417321369Sdim /// is following and the given VariablePattern. 418321369Sdim /// 419321369Sdim /// For example, the following statements all have the same pattern and this 420321369Sdim /// function would return zero: 421321369Sdim /// 422321369Sdim /// if (a < b) return a; return b; 423321369Sdim /// if (x < y) return x; return y; 424321369Sdim /// if (u2 < u1) return u2; return u1; 425321369Sdim /// 426321369Sdim /// But the following statement has a different pattern (note the changed 427321369Sdim /// variables in the return statements) and would have two differences when 428321369Sdim /// compared with one of the statements above. 429321369Sdim /// 430321369Sdim /// if (a < b) return b; return a; 431321369Sdim /// 432321369Sdim /// This function should only be called if the related statements of the given 433321369Sdim /// pattern and the statements of this objects are clones of each other. 434321369Sdim unsigned countPatternDifferences( 435321369Sdim const VariablePattern &Other, 436321369Sdim VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr); 437321369Sdim}; 438311118Sdim 439321369Sdim/// Ensures that all clones reference variables in the same pattern. 440321369Sdimstruct MatchingVariablePatternConstraint { 441321369Sdim void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups); 442311118Sdim}; 443311118Sdim 444311118Sdim} // end namespace clang 445311118Sdim 446311118Sdim#endif // LLVM_CLANG_AST_CLONEDETECTION_H 447