1 //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the Suffix Tree class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/SuffixTree.h" 14 #include "llvm/Support/Allocator.h" 15 #include "llvm/Support/Casting.h" 16 #include "llvm/Support/SuffixTreeNode.h" 17 18 using namespace llvm; 19 20 /// \returns the number of elements in the substring associated with \p N. 21 static size_t numElementsInSubstring(const SuffixTreeNode *N) { 22 assert(N && "Got a null node?"); 23 if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(N)) 24 if (Internal->isRoot()) 25 return 0; 26 return N->getEndIdx() - N->getStartIdx() + 1; 27 } 28 29 SuffixTree::SuffixTree(const ArrayRef<unsigned> &Str) : Str(Str) { 30 Root = insertRoot(); 31 Active.Node = Root; 32 33 // Keep track of the number of suffixes we have to add of the current 34 // prefix. 35 unsigned SuffixesToAdd = 0; 36 37 // Construct the suffix tree iteratively on each prefix of the string. 38 // PfxEndIdx is the end index of the current prefix. 39 // End is one past the last element in the string. 40 for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) { 41 SuffixesToAdd++; 42 LeafEndIdx = PfxEndIdx; // Extend each of the leaves. 43 SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd); 44 } 45 46 // Set the suffix indices of each leaf. 47 assert(Root && "Root node can't be nullptr!"); 48 setSuffixIndices(); 49 } 50 51 SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent, 52 unsigned StartIdx, unsigned Edge) { 53 assert(StartIdx <= LeafEndIdx && "String can't start after it ends!"); 54 auto *N = new (LeafNodeAllocator.Allocate()) 55 SuffixTreeLeafNode(StartIdx, &LeafEndIdx); 56 Parent.Children[Edge] = N; 57 ++NumLeafNodesAllocated; 58 return N; 59 } 60 61 SuffixTreeInternalNode * 62 SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent, 63 unsigned StartIdx, unsigned EndIdx, 64 unsigned Edge) { 65 assert(StartIdx <= EndIdx && "String can't start after it ends!"); 66 assert(!(!Parent && StartIdx != SuffixTreeNode::EmptyIdx) && 67 "Non-root internal nodes must have parents!"); 68 auto *N = new (InternalNodeAllocator.Allocate()) 69 SuffixTreeInternalNode(StartIdx, EndIdx, Root); 70 if (Parent) 71 Parent->Children[Edge] = N; 72 ++NumInternalNodesAllocated; 73 return N; 74 } 75 76 SuffixTreeInternalNode *SuffixTree::insertRoot() { 77 return insertInternalNode(/*Parent = */ nullptr, SuffixTreeNode::EmptyIdx, 78 SuffixTreeNode::EmptyIdx, /*Edge = */ 0); 79 } 80 81 void SuffixTree::setSuffixIndices() { 82 // List of nodes we need to visit along with the current length of the 83 // string. 84 SmallVector<std::pair<SuffixTreeNode *, unsigned>> ToVisit; 85 86 // Current node being visited. 87 SuffixTreeNode *CurrNode = Root; 88 89 // Sum of the lengths of the nodes down the path to the current one. 90 unsigned CurrNodeLen = 0; 91 ToVisit.push_back({CurrNode, CurrNodeLen}); 92 while (!ToVisit.empty()) { 93 std::tie(CurrNode, CurrNodeLen) = ToVisit.back(); 94 ToVisit.pop_back(); 95 // Length of the current node from the root down to here. 96 CurrNode->setConcatLen(CurrNodeLen); 97 if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode)) 98 for (auto &ChildPair : InternalNode->Children) { 99 assert(ChildPair.second && "Node had a null child!"); 100 ToVisit.push_back( 101 {ChildPair.second, 102 CurrNodeLen + numElementsInSubstring(ChildPair.second)}); 103 } 104 // No children, so we are at the end of the string. 105 if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode)) 106 LeafNode->setSuffixIdx(Str.size() - CurrNodeLen); 107 } 108 } 109 110 unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) { 111 SuffixTreeInternalNode *NeedsLink = nullptr; 112 113 while (SuffixesToAdd > 0) { 114 115 // Are we waiting to add anything other than just the last character? 116 if (Active.Len == 0) { 117 // If not, then say the active index is the end index. 118 Active.Idx = EndIdx; 119 } 120 121 assert(Active.Idx <= EndIdx && "Start index can't be after end index!"); 122 123 // The first character in the current substring we're looking at. 124 unsigned FirstChar = Str[Active.Idx]; 125 126 // Have we inserted anything starting with FirstChar at the current node? 127 if (Active.Node->Children.count(FirstChar) == 0) { 128 // If not, then we can just insert a leaf and move to the next step. 129 insertLeaf(*Active.Node, EndIdx, FirstChar); 130 131 // The active node is an internal node, and we visited it, so it must 132 // need a link if it doesn't have one. 133 if (NeedsLink) { 134 NeedsLink->setLink(Active.Node); 135 NeedsLink = nullptr; 136 } 137 } else { 138 // There's a match with FirstChar, so look for the point in the tree to 139 // insert a new node. 140 SuffixTreeNode *NextNode = Active.Node->Children[FirstChar]; 141 142 unsigned SubstringLen = numElementsInSubstring(NextNode); 143 144 // Is the current suffix we're trying to insert longer than the size of 145 // the child we want to move to? 146 if (Active.Len >= SubstringLen) { 147 // If yes, then consume the characters we've seen and move to the next 148 // node. 149 assert(isa<SuffixTreeInternalNode>(NextNode) && 150 "Expected an internal node?"); 151 Active.Idx += SubstringLen; 152 Active.Len -= SubstringLen; 153 Active.Node = cast<SuffixTreeInternalNode>(NextNode); 154 continue; 155 } 156 157 // Otherwise, the suffix we're trying to insert must be contained in the 158 // next node we want to move to. 159 unsigned LastChar = Str[EndIdx]; 160 161 // Is the string we're trying to insert a substring of the next node? 162 if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) { 163 // If yes, then we're done for this step. Remember our insertion point 164 // and move to the next end index. At this point, we have an implicit 165 // suffix tree. 166 if (NeedsLink && !Active.Node->isRoot()) { 167 NeedsLink->setLink(Active.Node); 168 NeedsLink = nullptr; 169 } 170 171 Active.Len++; 172 break; 173 } 174 175 // The string we're trying to insert isn't a substring of the next node, 176 // but matches up to a point. Split the node. 177 // 178 // For example, say we ended our search at a node n and we're trying to 179 // insert ABD. Then we'll create a new node s for AB, reduce n to just 180 // representing C, and insert a new leaf node l to represent d. This 181 // allows us to ensure that if n was a leaf, it remains a leaf. 182 // 183 // | ABC ---split---> | AB 184 // n s 185 // C / \ D 186 // n l 187 188 // The node s from the diagram 189 SuffixTreeInternalNode *SplitNode = insertInternalNode( 190 Active.Node, NextNode->getStartIdx(), 191 NextNode->getStartIdx() + Active.Len - 1, FirstChar); 192 193 // Insert the new node representing the new substring into the tree as 194 // a child of the split node. This is the node l from the diagram. 195 insertLeaf(*SplitNode, EndIdx, LastChar); 196 197 // Make the old node a child of the split node and update its start 198 // index. This is the node n from the diagram. 199 NextNode->incrementStartIdx(Active.Len); 200 SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode; 201 202 // SplitNode is an internal node, update the suffix link. 203 if (NeedsLink) 204 NeedsLink->setLink(SplitNode); 205 206 NeedsLink = SplitNode; 207 } 208 209 // We've added something new to the tree, so there's one less suffix to 210 // add. 211 SuffixesToAdd--; 212 213 if (Active.Node->isRoot()) { 214 if (Active.Len > 0) { 215 Active.Len--; 216 Active.Idx = EndIdx - SuffixesToAdd + 1; 217 } 218 } else { 219 // Start the next phase at the next smallest suffix. 220 Active.Node = Active.Node->getLink(); 221 } 222 } 223 224 return SuffixesToAdd; 225 } 226 227 void SuffixTree::RepeatedSubstringIterator::advance() { 228 // Clear the current state. If we're at the end of the range, then this 229 // is the state we want to be in. 230 RS = RepeatedSubstring(); 231 N = nullptr; 232 233 // Each leaf node represents a repeat of a string. 234 SmallVector<unsigned> RepeatedSubstringStarts; 235 236 // Continue visiting nodes until we find one which repeats more than once. 237 while (!InternalNodesToVisit.empty()) { 238 RepeatedSubstringStarts.clear(); 239 auto *Curr = InternalNodesToVisit.back(); 240 InternalNodesToVisit.pop_back(); 241 242 // Keep track of the length of the string associated with the node. If 243 // it's too short, we'll quit. 244 unsigned Length = Curr->getConcatLen(); 245 246 // Iterate over each child, saving internal nodes for visiting, and 247 // leaf nodes in LeafChildren. Internal nodes represent individual 248 // strings, which may repeat. 249 for (auto &ChildPair : Curr->Children) { 250 // Save all of this node's children for processing. 251 if (auto *InternalChild = 252 dyn_cast<SuffixTreeInternalNode>(ChildPair.second)) { 253 InternalNodesToVisit.push_back(InternalChild); 254 continue; 255 } 256 257 if (Length < MinLength) 258 continue; 259 260 // Have an occurrence of a potentially repeated string. Save it. 261 auto *Leaf = cast<SuffixTreeLeafNode>(ChildPair.second); 262 RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx()); 263 } 264 265 // The root never represents a repeated substring. If we're looking at 266 // that, then skip it. 267 if (Curr->isRoot()) 268 continue; 269 270 // Do we have any repeated substrings? 271 if (RepeatedSubstringStarts.size() < 2) 272 continue; 273 274 // Yes. Update the state to reflect this, and then bail out. 275 N = Curr; 276 RS.Length = Length; 277 for (unsigned StartIdx : RepeatedSubstringStarts) 278 RS.StartIndices.push_back(StartIdx); 279 break; 280 } 281 // At this point, either NewRS is an empty RepeatedSubstring, or it was 282 // set in the above loop. Similarly, N is either nullptr, or the node 283 // associated with NewRS. 284 } 285