1 //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the Suffix Tree class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/SuffixTree.h" 14 #include "llvm/Support/Allocator.h" 15 16 using namespace llvm; 17 18 /// \returns the number of elements in the substring associated with \p N. 19 static size_t numElementsInSubstring(const SuffixTreeNode *N) { 20 assert(N && "Got a null node?"); 21 if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(N)) 22 if (Internal->isRoot()) 23 return 0; 24 return N->getEndIdx() - N->getStartIdx() + 1; 25 } 26 27 SuffixTree::SuffixTree(const ArrayRef<unsigned> &Str) : Str(Str) { 28 Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0); 29 Active.Node = Root; 30 31 // Keep track of the number of suffixes we have to add of the current 32 // prefix. 33 unsigned SuffixesToAdd = 0; 34 35 // Construct the suffix tree iteratively on each prefix of the string. 36 // PfxEndIdx is the end index of the current prefix. 37 // End is one past the last element in the string. 38 for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) { 39 SuffixesToAdd++; 40 LeafEndIdx = PfxEndIdx; // Extend each of the leaves. 41 SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd); 42 } 43 44 // Set the suffix indices of each leaf. 45 assert(Root && "Root node can't be nullptr!"); 46 setSuffixIndices(); 47 } 48 49 SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent, 50 unsigned StartIdx, unsigned Edge) { 51 assert(StartIdx <= LeafEndIdx && "String can't start after it ends!"); 52 auto *N = new (LeafNodeAllocator.Allocate()) 53 SuffixTreeLeafNode(StartIdx, &LeafEndIdx); 54 Parent.Children[Edge] = N; 55 return N; 56 } 57 58 SuffixTreeInternalNode * 59 SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent, 60 unsigned StartIdx, unsigned EndIdx, 61 unsigned Edge) { 62 assert(StartIdx <= EndIdx && "String can't start after it ends!"); 63 assert(!(!Parent && StartIdx != EmptyIdx) && 64 "Non-root internal nodes must have parents!"); 65 auto *N = new (InternalNodeAllocator.Allocate()) 66 SuffixTreeInternalNode(StartIdx, EndIdx, Root); 67 if (Parent) 68 Parent->Children[Edge] = N; 69 return N; 70 } 71 72 void SuffixTree::setSuffixIndices() { 73 // List of nodes we need to visit along with the current length of the 74 // string. 75 SmallVector<std::pair<SuffixTreeNode *, unsigned>> ToVisit; 76 77 // Current node being visited. 78 SuffixTreeNode *CurrNode = Root; 79 80 // Sum of the lengths of the nodes down the path to the current one. 81 unsigned CurrNodeLen = 0; 82 ToVisit.push_back({CurrNode, CurrNodeLen}); 83 while (!ToVisit.empty()) { 84 std::tie(CurrNode, CurrNodeLen) = ToVisit.back(); 85 ToVisit.pop_back(); 86 // Length of the current node from the root down to here. 87 CurrNode->setConcatLen(CurrNodeLen); 88 if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode)) 89 for (auto &ChildPair : InternalNode->Children) { 90 assert(ChildPair.second && "Node had a null child!"); 91 ToVisit.push_back( 92 {ChildPair.second, 93 CurrNodeLen + numElementsInSubstring(ChildPair.second)}); 94 } 95 // No children, so we are at the end of the string. 96 if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode)) 97 LeafNode->setSuffixIdx(Str.size() - CurrNodeLen); 98 } 99 } 100 101 unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) { 102 SuffixTreeInternalNode *NeedsLink = nullptr; 103 104 while (SuffixesToAdd > 0) { 105 106 // Are we waiting to add anything other than just the last character? 107 if (Active.Len == 0) { 108 // If not, then say the active index is the end index. 109 Active.Idx = EndIdx; 110 } 111 112 assert(Active.Idx <= EndIdx && "Start index can't be after end index!"); 113 114 // The first character in the current substring we're looking at. 115 unsigned FirstChar = Str[Active.Idx]; 116 117 // Have we inserted anything starting with FirstChar at the current node? 118 if (Active.Node->Children.count(FirstChar) == 0) { 119 // If not, then we can just insert a leaf and move to the next step. 120 insertLeaf(*Active.Node, EndIdx, FirstChar); 121 122 // The active node is an internal node, and we visited it, so it must 123 // need a link if it doesn't have one. 124 if (NeedsLink) { 125 NeedsLink->setLink(Active.Node); 126 NeedsLink = nullptr; 127 } 128 } else { 129 // There's a match with FirstChar, so look for the point in the tree to 130 // insert a new node. 131 SuffixTreeNode *NextNode = Active.Node->Children[FirstChar]; 132 133 unsigned SubstringLen = numElementsInSubstring(NextNode); 134 135 // Is the current suffix we're trying to insert longer than the size of 136 // the child we want to move to? 137 if (Active.Len >= SubstringLen) { 138 // If yes, then consume the characters we've seen and move to the next 139 // node. 140 assert(isa<SuffixTreeInternalNode>(NextNode) && 141 "Expected an internal node?"); 142 Active.Idx += SubstringLen; 143 Active.Len -= SubstringLen; 144 Active.Node = cast<SuffixTreeInternalNode>(NextNode); 145 continue; 146 } 147 148 // Otherwise, the suffix we're trying to insert must be contained in the 149 // next node we want to move to. 150 unsigned LastChar = Str[EndIdx]; 151 152 // Is the string we're trying to insert a substring of the next node? 153 if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) { 154 // If yes, then we're done for this step. Remember our insertion point 155 // and move to the next end index. At this point, we have an implicit 156 // suffix tree. 157 if (NeedsLink && !Active.Node->isRoot()) { 158 NeedsLink->setLink(Active.Node); 159 NeedsLink = nullptr; 160 } 161 162 Active.Len++; 163 break; 164 } 165 166 // The string we're trying to insert isn't a substring of the next node, 167 // but matches up to a point. Split the node. 168 // 169 // For example, say we ended our search at a node n and we're trying to 170 // insert ABD. Then we'll create a new node s for AB, reduce n to just 171 // representing C, and insert a new leaf node l to represent d. This 172 // allows us to ensure that if n was a leaf, it remains a leaf. 173 // 174 // | ABC ---split---> | AB 175 // n s 176 // C / \ D 177 // n l 178 179 // The node s from the diagram 180 SuffixTreeInternalNode *SplitNode = 181 insertInternalNode(Active.Node, NextNode->getStartIdx(), 182 NextNode->getStartIdx() + Active.Len - 1, FirstChar); 183 184 // Insert the new node representing the new substring into the tree as 185 // a child of the split node. This is the node l from the diagram. 186 insertLeaf(*SplitNode, EndIdx, LastChar); 187 188 // Make the old node a child of the split node and update its start 189 // index. This is the node n from the diagram. 190 NextNode->incrementStartIdx(Active.Len); 191 SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode; 192 193 // SplitNode is an internal node, update the suffix link. 194 if (NeedsLink) 195 NeedsLink->setLink(SplitNode); 196 197 NeedsLink = SplitNode; 198 } 199 200 // We've added something new to the tree, so there's one less suffix to 201 // add. 202 SuffixesToAdd--; 203 204 if (Active.Node->isRoot()) { 205 if (Active.Len > 0) { 206 Active.Len--; 207 Active.Idx = EndIdx - SuffixesToAdd + 1; 208 } 209 } else { 210 // Start the next phase at the next smallest suffix. 211 Active.Node = Active.Node->getLink(); 212 } 213 } 214 215 return SuffixesToAdd; 216 } 217