1//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Suffix Tree class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/Support/SuffixTree.h"
14#include "llvm/Support/Allocator.h"
15#include "llvm/Support/Casting.h"
16#include "llvm/Support/SuffixTreeNode.h"
17
18using namespace llvm;
19
20/// \returns the number of elements in the substring associated with \p N.
21static size_t numElementsInSubstring(const SuffixTreeNode *N) {
22  assert(N && "Got a null node?");
23  if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(N))
24    if (Internal->isRoot())
25      return 0;
26  return N->getEndIdx() - N->getStartIdx() + 1;
27}
28
29SuffixTree::SuffixTree(const ArrayRef<unsigned> &Str) : Str(Str) {
30  Root = insertRoot();
31  Active.Node = Root;
32
33  // Keep track of the number of suffixes we have to add of the current
34  // prefix.
35  unsigned SuffixesToAdd = 0;
36
37  // Construct the suffix tree iteratively on each prefix of the string.
38  // PfxEndIdx is the end index of the current prefix.
39  // End is one past the last element in the string.
40  for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
41    SuffixesToAdd++;
42    LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
43    SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
44  }
45
46  // Set the suffix indices of each leaf.
47  assert(Root && "Root node can't be nullptr!");
48  setSuffixIndices();
49}
50
51SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent,
52                                       unsigned StartIdx, unsigned Edge) {
53  assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
54  auto *N = new (LeafNodeAllocator.Allocate())
55      SuffixTreeLeafNode(StartIdx, &LeafEndIdx);
56  Parent.Children[Edge] = N;
57  return N;
58}
59
60SuffixTreeInternalNode *
61SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent,
62                               unsigned StartIdx, unsigned EndIdx,
63                               unsigned Edge) {
64  assert(StartIdx <= EndIdx && "String can't start after it ends!");
65  assert(!(!Parent && StartIdx != SuffixTreeNode::EmptyIdx) &&
66         "Non-root internal nodes must have parents!");
67  auto *N = new (InternalNodeAllocator.Allocate())
68      SuffixTreeInternalNode(StartIdx, EndIdx, Root);
69  if (Parent)
70    Parent->Children[Edge] = N;
71  return N;
72}
73
74SuffixTreeInternalNode *SuffixTree::insertRoot() {
75  return insertInternalNode(/*Parent = */ nullptr, SuffixTreeNode::EmptyIdx,
76                            SuffixTreeNode::EmptyIdx, /*Edge = */ 0);
77}
78
79void SuffixTree::setSuffixIndices() {
80  // List of nodes we need to visit along with the current length of the
81  // string.
82  SmallVector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
83
84  // Current node being visited.
85  SuffixTreeNode *CurrNode = Root;
86
87  // Sum of the lengths of the nodes down the path to the current one.
88  unsigned CurrNodeLen = 0;
89  ToVisit.push_back({CurrNode, CurrNodeLen});
90  while (!ToVisit.empty()) {
91    std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
92    ToVisit.pop_back();
93    // Length of the current node from the root down to here.
94    CurrNode->setConcatLen(CurrNodeLen);
95    if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode))
96      for (auto &ChildPair : InternalNode->Children) {
97        assert(ChildPair.second && "Node had a null child!");
98        ToVisit.push_back(
99            {ChildPair.second,
100             CurrNodeLen + numElementsInSubstring(ChildPair.second)});
101      }
102    // No children, so we are at the end of the string.
103    if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode))
104      LeafNode->setSuffixIdx(Str.size() - CurrNodeLen);
105  }
106}
107
108unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
109  SuffixTreeInternalNode *NeedsLink = nullptr;
110
111  while (SuffixesToAdd > 0) {
112
113    // Are we waiting to add anything other than just the last character?
114    if (Active.Len == 0) {
115      // If not, then say the active index is the end index.
116      Active.Idx = EndIdx;
117    }
118
119    assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
120
121    // The first character in the current substring we're looking at.
122    unsigned FirstChar = Str[Active.Idx];
123
124    // Have we inserted anything starting with FirstChar at the current node?
125    if (Active.Node->Children.count(FirstChar) == 0) {
126      // If not, then we can just insert a leaf and move to the next step.
127      insertLeaf(*Active.Node, EndIdx, FirstChar);
128
129      // The active node is an internal node, and we visited it, so it must
130      // need a link if it doesn't have one.
131      if (NeedsLink) {
132        NeedsLink->setLink(Active.Node);
133        NeedsLink = nullptr;
134      }
135    } else {
136      // There's a match with FirstChar, so look for the point in the tree to
137      // insert a new node.
138      SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
139
140      unsigned SubstringLen = numElementsInSubstring(NextNode);
141
142      // Is the current suffix we're trying to insert longer than the size of
143      // the child we want to move to?
144      if (Active.Len >= SubstringLen) {
145        // If yes, then consume the characters we've seen and move to the next
146        // node.
147        assert(isa<SuffixTreeInternalNode>(NextNode) &&
148               "Expected an internal node?");
149        Active.Idx += SubstringLen;
150        Active.Len -= SubstringLen;
151        Active.Node = cast<SuffixTreeInternalNode>(NextNode);
152        continue;
153      }
154
155      // Otherwise, the suffix we're trying to insert must be contained in the
156      // next node we want to move to.
157      unsigned LastChar = Str[EndIdx];
158
159      // Is the string we're trying to insert a substring of the next node?
160      if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) {
161        // If yes, then we're done for this step. Remember our insertion point
162        // and move to the next end index. At this point, we have an implicit
163        // suffix tree.
164        if (NeedsLink && !Active.Node->isRoot()) {
165          NeedsLink->setLink(Active.Node);
166          NeedsLink = nullptr;
167        }
168
169        Active.Len++;
170        break;
171      }
172
173      // The string we're trying to insert isn't a substring of the next node,
174      // but matches up to a point. Split the node.
175      //
176      // For example, say we ended our search at a node n and we're trying to
177      // insert ABD. Then we'll create a new node s for AB, reduce n to just
178      // representing C, and insert a new leaf node l to represent d. This
179      // allows us to ensure that if n was a leaf, it remains a leaf.
180      //
181      //   | ABC  ---split--->  | AB
182      //   n                    s
183      //                     C / \ D
184      //                      n   l
185
186      // The node s from the diagram
187      SuffixTreeInternalNode *SplitNode = insertInternalNode(
188          Active.Node, NextNode->getStartIdx(),
189          NextNode->getStartIdx() + Active.Len - 1, FirstChar);
190
191      // Insert the new node representing the new substring into the tree as
192      // a child of the split node. This is the node l from the diagram.
193      insertLeaf(*SplitNode, EndIdx, LastChar);
194
195      // Make the old node a child of the split node and update its start
196      // index. This is the node n from the diagram.
197      NextNode->incrementStartIdx(Active.Len);
198      SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode;
199
200      // SplitNode is an internal node, update the suffix link.
201      if (NeedsLink)
202        NeedsLink->setLink(SplitNode);
203
204      NeedsLink = SplitNode;
205    }
206
207    // We've added something new to the tree, so there's one less suffix to
208    // add.
209    SuffixesToAdd--;
210
211    if (Active.Node->isRoot()) {
212      if (Active.Len > 0) {
213        Active.Len--;
214        Active.Idx = EndIdx - SuffixesToAdd + 1;
215      }
216    } else {
217      // Start the next phase at the next smallest suffix.
218      Active.Node = Active.Node->getLink();
219    }
220  }
221
222  return SuffixesToAdd;
223}
224
225void SuffixTree::RepeatedSubstringIterator::advance() {
226  // Clear the current state. If we're at the end of the range, then this
227  // is the state we want to be in.
228  RS = RepeatedSubstring();
229  N = nullptr;
230
231  // Each leaf node represents a repeat of a string.
232  SmallVector<unsigned> RepeatedSubstringStarts;
233
234  // Continue visiting nodes until we find one which repeats more than once.
235  while (!InternalNodesToVisit.empty()) {
236    RepeatedSubstringStarts.clear();
237    auto *Curr = InternalNodesToVisit.back();
238    InternalNodesToVisit.pop_back();
239
240    // Keep track of the length of the string associated with the node. If
241    // it's too short, we'll quit.
242    unsigned Length = Curr->getConcatLen();
243
244    // Iterate over each child, saving internal nodes for visiting, and
245    // leaf nodes in LeafChildren. Internal nodes represent individual
246    // strings, which may repeat.
247    for (auto &ChildPair : Curr->Children) {
248      // Save all of this node's children for processing.
249      if (auto *InternalChild =
250              dyn_cast<SuffixTreeInternalNode>(ChildPair.second)) {
251        InternalNodesToVisit.push_back(InternalChild);
252        continue;
253      }
254
255      if (Length < MinLength)
256        continue;
257
258      // Have an occurrence of a potentially repeated string. Save it.
259      auto *Leaf = cast<SuffixTreeLeafNode>(ChildPair.second);
260      RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx());
261    }
262
263    // The root never represents a repeated substring. If we're looking at
264    // that, then skip it.
265    if (Curr->isRoot())
266      continue;
267
268    // Do we have any repeated substrings?
269    if (RepeatedSubstringStarts.size() < 2)
270      continue;
271
272    // Yes. Update the state to reflect this, and then bail out.
273    N = Curr;
274    RS.Length = Length;
275    for (unsigned StartIdx : RepeatedSubstringStarts)
276      RS.StartIndices.push_back(StartIdx);
277    break;
278  }
279  // At this point, either NewRS is an empty RepeatedSubstring, or it was
280  // set in the above loop. Similarly, N is either nullptr, or the node
281  // associated with NewRS.
282}
283