xref: /llvm-project/llvm/lib/Support/SuffixTree.cpp (revision c2f0c204d1847fac9f8d47c06a40cecd717a546d)
1 //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the Suffix Tree class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Support/SuffixTree.h"
14 #include "llvm/Support/Allocator.h"
15 
16 using namespace llvm;
17 
18 /// \returns the number of elements in the substring associated with \p N.
19 static size_t numElementsInSubstring(const SuffixTreeNode *N) {
20   assert(N && "Got a null node?");
21   if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(N))
22     if (Internal->isRoot())
23       return 0;
24   return N->getEndIdx() - N->getStartIdx() + 1;
25 }
26 
27 SuffixTree::SuffixTree(const ArrayRef<unsigned> &Str) : Str(Str) {
28   Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
29   Active.Node = Root;
30 
31   // Keep track of the number of suffixes we have to add of the current
32   // prefix.
33   unsigned SuffixesToAdd = 0;
34 
35   // Construct the suffix tree iteratively on each prefix of the string.
36   // PfxEndIdx is the end index of the current prefix.
37   // End is one past the last element in the string.
38   for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
39     SuffixesToAdd++;
40     LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
41     SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
42   }
43 
44   // Set the suffix indices of each leaf.
45   assert(Root && "Root node can't be nullptr!");
46   setSuffixIndices();
47 }
48 
49 SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent,
50                                        unsigned StartIdx, unsigned Edge) {
51   assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
52   auto *N = new (LeafNodeAllocator.Allocate())
53       SuffixTreeLeafNode(StartIdx, &LeafEndIdx);
54   Parent.Children[Edge] = N;
55   return N;
56 }
57 
58 SuffixTreeInternalNode *
59 SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent,
60                                unsigned StartIdx, unsigned EndIdx,
61                                unsigned Edge) {
62   assert(StartIdx <= EndIdx && "String can't start after it ends!");
63   assert(!(!Parent && StartIdx != EmptyIdx) &&
64          "Non-root internal nodes must have parents!");
65   auto *N = new (InternalNodeAllocator.Allocate())
66       SuffixTreeInternalNode(StartIdx, EndIdx, Root);
67   if (Parent)
68     Parent->Children[Edge] = N;
69   return N;
70 }
71 
72 void SuffixTree::setSuffixIndices() {
73   // List of nodes we need to visit along with the current length of the
74   // string.
75   SmallVector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
76 
77   // Current node being visited.
78   SuffixTreeNode *CurrNode = Root;
79 
80   // Sum of the lengths of the nodes down the path to the current one.
81   unsigned CurrNodeLen = 0;
82   ToVisit.push_back({CurrNode, CurrNodeLen});
83   while (!ToVisit.empty()) {
84     std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
85     ToVisit.pop_back();
86     // Length of the current node from the root down to here.
87     CurrNode->setConcatLen(CurrNodeLen);
88     if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode))
89       for (auto &ChildPair : InternalNode->Children) {
90         assert(ChildPair.second && "Node had a null child!");
91         ToVisit.push_back(
92             {ChildPair.second,
93              CurrNodeLen + numElementsInSubstring(ChildPair.second)});
94       }
95     // No children, so we are at the end of the string.
96     if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode))
97       LeafNode->setSuffixIdx(Str.size() - CurrNodeLen);
98   }
99 }
100 
101 unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
102   SuffixTreeInternalNode *NeedsLink = nullptr;
103 
104   while (SuffixesToAdd > 0) {
105 
106     // Are we waiting to add anything other than just the last character?
107     if (Active.Len == 0) {
108       // If not, then say the active index is the end index.
109       Active.Idx = EndIdx;
110     }
111 
112     assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
113 
114     // The first character in the current substring we're looking at.
115     unsigned FirstChar = Str[Active.Idx];
116 
117     // Have we inserted anything starting with FirstChar at the current node?
118     if (Active.Node->Children.count(FirstChar) == 0) {
119       // If not, then we can just insert a leaf and move to the next step.
120       insertLeaf(*Active.Node, EndIdx, FirstChar);
121 
122       // The active node is an internal node, and we visited it, so it must
123       // need a link if it doesn't have one.
124       if (NeedsLink) {
125         NeedsLink->setLink(Active.Node);
126         NeedsLink = nullptr;
127       }
128     } else {
129       // There's a match with FirstChar, so look for the point in the tree to
130       // insert a new node.
131       SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
132 
133       unsigned SubstringLen = numElementsInSubstring(NextNode);
134 
135       // Is the current suffix we're trying to insert longer than the size of
136       // the child we want to move to?
137       if (Active.Len >= SubstringLen) {
138         // If yes, then consume the characters we've seen and move to the next
139         // node.
140         assert(isa<SuffixTreeInternalNode>(NextNode) &&
141                "Expected an internal node?");
142         Active.Idx += SubstringLen;
143         Active.Len -= SubstringLen;
144         Active.Node = cast<SuffixTreeInternalNode>(NextNode);
145         continue;
146       }
147 
148       // Otherwise, the suffix we're trying to insert must be contained in the
149       // next node we want to move to.
150       unsigned LastChar = Str[EndIdx];
151 
152       // Is the string we're trying to insert a substring of the next node?
153       if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) {
154         // If yes, then we're done for this step. Remember our insertion point
155         // and move to the next end index. At this point, we have an implicit
156         // suffix tree.
157         if (NeedsLink && !Active.Node->isRoot()) {
158           NeedsLink->setLink(Active.Node);
159           NeedsLink = nullptr;
160         }
161 
162         Active.Len++;
163         break;
164       }
165 
166       // The string we're trying to insert isn't a substring of the next node,
167       // but matches up to a point. Split the node.
168       //
169       // For example, say we ended our search at a node n and we're trying to
170       // insert ABD. Then we'll create a new node s for AB, reduce n to just
171       // representing C, and insert a new leaf node l to represent d. This
172       // allows us to ensure that if n was a leaf, it remains a leaf.
173       //
174       //   | ABC  ---split--->  | AB
175       //   n                    s
176       //                     C / \ D
177       //                      n   l
178 
179       // The node s from the diagram
180       SuffixTreeInternalNode *SplitNode =
181           insertInternalNode(Active.Node, NextNode->getStartIdx(),
182                              NextNode->getStartIdx() + Active.Len - 1, FirstChar);
183 
184       // Insert the new node representing the new substring into the tree as
185       // a child of the split node. This is the node l from the diagram.
186       insertLeaf(*SplitNode, EndIdx, LastChar);
187 
188       // Make the old node a child of the split node and update its start
189       // index. This is the node n from the diagram.
190       NextNode->incrementStartIdx(Active.Len);
191       SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode;
192 
193       // SplitNode is an internal node, update the suffix link.
194       if (NeedsLink)
195         NeedsLink->setLink(SplitNode);
196 
197       NeedsLink = SplitNode;
198     }
199 
200     // We've added something new to the tree, so there's one less suffix to
201     // add.
202     SuffixesToAdd--;
203 
204     if (Active.Node->isRoot()) {
205       if (Active.Len > 0) {
206         Active.Len--;
207         Active.Idx = EndIdx - SuffixesToAdd + 1;
208       }
209     } else {
210       // Start the next phase at the next smallest suffix.
211       Active.Node = Active.Node->getLink();
212     }
213   }
214 
215   return SuffixesToAdd;
216 }
217