1//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Suffix Tree class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/Support/SuffixTree.h"
14#include "llvm/Support/Allocator.h"
15#include <vector>
16
17using namespace llvm;
18
19SuffixTree::SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
20  Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
21  Active.Node = Root;
22
23  // Keep track of the number of suffixes we have to add of the current
24  // prefix.
25  unsigned SuffixesToAdd = 0;
26
27  // Construct the suffix tree iteratively on each prefix of the string.
28  // PfxEndIdx is the end index of the current prefix.
29  // End is one past the last element in the string.
30  for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
31    SuffixesToAdd++;
32    LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
33    SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
34  }
35
36  // Set the suffix indices of each leaf.
37  assert(Root && "Root node can't be nullptr!");
38  setSuffixIndices();
39}
40
41SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent,
42                                       unsigned StartIdx, unsigned Edge) {
43
44  assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
45
46  SuffixTreeNode *N = new (NodeAllocator.Allocate())
47      SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
48  Parent.Children[Edge] = N;
49
50  return N;
51}
52
53SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent,
54                                               unsigned StartIdx,
55                                               unsigned EndIdx, unsigned Edge) {
56
57  assert(StartIdx <= EndIdx && "String can't start after it ends!");
58  assert(!(!Parent && StartIdx != EmptyIdx) &&
59         "Non-root internal nodes must have parents!");
60
61  unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
62  SuffixTreeNode *N =
63      new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
64  if (Parent)
65    Parent->Children[Edge] = N;
66
67  return N;
68}
69
70void SuffixTree::setSuffixIndices() {
71  // List of nodes we need to visit along with the current length of the
72  // string.
73  std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
74
75  // Current node being visited.
76  SuffixTreeNode *CurrNode = Root;
77
78  // Sum of the lengths of the nodes down the path to the current one.
79  unsigned CurrNodeLen = 0;
80  ToVisit.push_back({CurrNode, CurrNodeLen});
81  while (!ToVisit.empty()) {
82    std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
83    ToVisit.pop_back();
84    CurrNode->ConcatLen = CurrNodeLen;
85    for (auto &ChildPair : CurrNode->Children) {
86      assert(ChildPair.second && "Node had a null child!");
87      ToVisit.push_back(
88          {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
89    }
90
91    // No children, so we are at the end of the string.
92    if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
93      CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
94  }
95}
96
97unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
98  SuffixTreeNode *NeedsLink = nullptr;
99
100  while (SuffixesToAdd > 0) {
101
102    // Are we waiting to add anything other than just the last character?
103    if (Active.Len == 0) {
104      // If not, then say the active index is the end index.
105      Active.Idx = EndIdx;
106    }
107
108    assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
109
110    // The first character in the current substring we're looking at.
111    unsigned FirstChar = Str[Active.Idx];
112
113    // Have we inserted anything starting with FirstChar at the current node?
114    if (Active.Node->Children.count(FirstChar) == 0) {
115      // If not, then we can just insert a leaf and move to the next step.
116      insertLeaf(*Active.Node, EndIdx, FirstChar);
117
118      // The active node is an internal node, and we visited it, so it must
119      // need a link if it doesn't have one.
120      if (NeedsLink) {
121        NeedsLink->Link = Active.Node;
122        NeedsLink = nullptr;
123      }
124    } else {
125      // There's a match with FirstChar, so look for the point in the tree to
126      // insert a new node.
127      SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
128
129      unsigned SubstringLen = NextNode->size();
130
131      // Is the current suffix we're trying to insert longer than the size of
132      // the child we want to move to?
133      if (Active.Len >= SubstringLen) {
134        // If yes, then consume the characters we've seen and move to the next
135        // node.
136        Active.Idx += SubstringLen;
137        Active.Len -= SubstringLen;
138        Active.Node = NextNode;
139        continue;
140      }
141
142      // Otherwise, the suffix we're trying to insert must be contained in the
143      // next node we want to move to.
144      unsigned LastChar = Str[EndIdx];
145
146      // Is the string we're trying to insert a substring of the next node?
147      if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
148        // If yes, then we're done for this step. Remember our insertion point
149        // and move to the next end index. At this point, we have an implicit
150        // suffix tree.
151        if (NeedsLink && !Active.Node->isRoot()) {
152          NeedsLink->Link = Active.Node;
153          NeedsLink = nullptr;
154        }
155
156        Active.Len++;
157        break;
158      }
159
160      // The string we're trying to insert isn't a substring of the next node,
161      // but matches up to a point. Split the node.
162      //
163      // For example, say we ended our search at a node n and we're trying to
164      // insert ABD. Then we'll create a new node s for AB, reduce n to just
165      // representing C, and insert a new leaf node l to represent d. This
166      // allows us to ensure that if n was a leaf, it remains a leaf.
167      //
168      //   | ABC  ---split--->  | AB
169      //   n                    s
170      //                     C / \ D
171      //                      n   l
172
173      // The node s from the diagram
174      SuffixTreeNode *SplitNode =
175          insertInternalNode(Active.Node, NextNode->StartIdx,
176                             NextNode->StartIdx + Active.Len - 1, FirstChar);
177
178      // Insert the new node representing the new substring into the tree as
179      // a child of the split node. This is the node l from the diagram.
180      insertLeaf(*SplitNode, EndIdx, LastChar);
181
182      // Make the old node a child of the split node and update its start
183      // index. This is the node n from the diagram.
184      NextNode->StartIdx += Active.Len;
185      SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
186
187      // SplitNode is an internal node, update the suffix link.
188      if (NeedsLink)
189        NeedsLink->Link = SplitNode;
190
191      NeedsLink = SplitNode;
192    }
193
194    // We've added something new to the tree, so there's one less suffix to
195    // add.
196    SuffixesToAdd--;
197
198    if (Active.Node->isRoot()) {
199      if (Active.Len > 0) {
200        Active.Len--;
201        Active.Idx = EndIdx - SuffixesToAdd + 1;
202      }
203    } else {
204      // Start the next phase at the next smallest suffix.
205      Active.Node = Active.Node->Link;
206    }
207  }
208
209  return SuffixesToAdd;
210}
211