xref: /llvm-project/llvm/unittests/Support/SuffixTreeTest.cpp (revision d9a00ed3668803d11675b103fe9b6ed077ddc4c1)
1 //===- unittests/Support/SuffixTreeTest.cpp - suffix tree tests -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "llvm/Support/SuffixTree.h"
10 #include "gtest/gtest.h"
11 #include <vector>
12 
13 using namespace llvm;
14 
15 namespace {
16 
17 // Each example vector has a unique element at the end to represent the end of
18 // the string
19 
20 // Tests that The SuffixTree finds a simple repetition of the substring {1, 2}
21 // {1, 2} twice in the provided string.
TEST(SuffixTreeTest,TestSingleRepetition)22 TEST(SuffixTreeTest, TestSingleRepetition) {
23   std::vector<unsigned> SimpleRepetitionData = {1, 2, 1, 2, 3};
24   SuffixTree ST(SimpleRepetitionData);
25   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
26   for (auto It = ST.begin(); It != ST.end(); It++)
27     SubStrings.push_back(*It);
28   ASSERT_EQ(SubStrings.size(), 1u);
29   EXPECT_EQ(SubStrings[0].Length, 2u);
30   EXPECT_EQ(SubStrings[0].StartIndices.size(), 2u);
31   for (unsigned StartIdx : SubStrings[0].StartIndices) {
32     EXPECT_EQ(SimpleRepetitionData[StartIdx], 1u);
33     EXPECT_EQ(SimpleRepetitionData[StartIdx + 1], 2u);
34   }
35 }
36 
37 // Tests that the SuffixTree is able to find the substrings {1, 2, 3} at
38 // at indices 0 and 3 as well as the substrings {2, 3} at indices 1 and 4.
39 // This test also serves as a flag for improvements to the suffix tree.
40 //
41 // FIXME: Right now, the longest repeated substring from a specific index is
42 // returned, this could be improved to return the longest repeated substring, as
43 // well as those that are smaller.
TEST(SuffixTreeTest,TestLongerRepetition)44 TEST(SuffixTreeTest, TestLongerRepetition) {
45   std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3, 4};
46   SuffixTree ST(RepeatedRepetitionData);
47   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
48   for (auto It = ST.begin(); It != ST.end(); It++)
49     SubStrings.push_back(*It);
50   EXPECT_EQ(SubStrings.size(), 2u);
51   unsigned Len;
52   for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
53     Len = RS.Length;
54     bool IsExpectedLen = (Len == 3u || Len == 2u);
55     bool IsExpectedIndex;
56     ASSERT_TRUE(IsExpectedLen);
57 
58     if (Len == 3u) {
59       for (unsigned StartIdx : RS.StartIndices) {
60         IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
61         EXPECT_TRUE(IsExpectedIndex);
62         EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u);
63         EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
64         EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u);
65       }
66     } else {
67       for (unsigned StartIdx : RS.StartIndices) {
68         IsExpectedIndex = (StartIdx == 1u || StartIdx == 4u);
69         EXPECT_TRUE(IsExpectedIndex);
70         EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u);
71         EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u);
72       }
73     }
74   }
75 }
76 
77 // Tests that the SuffixTree is able to find substring {1, 1, 1, 1, 1} at
78 // indices 0 and 1.
79 //
80 // FIXME: Add support for detecting {1, 1} and {1, 1, 1}
81 // See Test TestSingleCharacterRepeatWithLeafDescendants for the fix
TEST(SuffixTreeTest,TestSingleCharacterRepeat)82 TEST(SuffixTreeTest, TestSingleCharacterRepeat) {
83   std::vector<unsigned> RepeatedRepetitionData = {1, 1, 1, 1, 1, 1, 2};
84   std::vector<unsigned>::iterator RRDIt, RRDIt2;
85   SuffixTree ST(RepeatedRepetitionData);
86   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
87   for (auto It = ST.begin(); It != ST.end(); It++)
88     SubStrings.push_back(*It);
89   EXPECT_EQ(SubStrings.size(), 1u);
90   for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
91     EXPECT_EQ(RS.StartIndices.size(),
92               RepeatedRepetitionData.size() - RS.Length);
93     for (unsigned StartIdx : SubStrings[0].StartIndices) {
94       RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
95       std::advance(RRDIt, StartIdx);
96       std::advance(RRDIt2, StartIdx + SubStrings[0].Length);
97       ASSERT_TRUE(
98           all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
99                  [](unsigned Elt) { return Elt == 1; }));
100     }
101   }
102 }
103 
104 // Tests that the SuffixTree is able to find the following substrings:
105 // {1, 1} at indices 0, 1, 2, 3, and 4;
106 // {1, 1, 1} at indices 0, 1, 2, and 3;
107 // {1, 1, 1, 1}  at indices 0, 1, and 2; and
108 // {1, 1, 1, 1, 1} at indices 0 and 1.
109 //
110 // This is the fix for TestSingleCharacterRepeat.
TEST(SuffixTreeTest,TestSingleCharacterRepeatWithLeafDescendants)111 TEST(SuffixTreeTest, TestSingleCharacterRepeatWithLeafDescendants) {
112   std::vector<unsigned> RepeatedRepetitionData = {1, 1, 1, 1, 1, 1, 2};
113   std::vector<unsigned>::iterator RRDIt, RRDIt2;
114   SuffixTree ST(RepeatedRepetitionData, /*OutlinerLeafDescendants=*/true);
115   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
116   for (auto It = ST.begin(); It != ST.end(); It++)
117     SubStrings.push_back(*It);
118   EXPECT_EQ(SubStrings.size(), 4u);
119   for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
120     EXPECT_EQ(RS.StartIndices.size(),
121               RepeatedRepetitionData.size() - RS.Length);
122     for (unsigned StartIdx : SubStrings[0].StartIndices) {
123       RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
124       std::advance(RRDIt, StartIdx);
125       std::advance(RRDIt2, StartIdx + SubStrings[0].Length);
126       ASSERT_TRUE(
127           all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
128                  [](unsigned Elt) { return Elt == 1; }));
129     }
130   }
131 }
132 
133 // The suffix tree cannot currently find repeated substrings in strings of the
134 // form {1, 2, 3, 1, 2, 3}, because the two {1, 2, 3}s are adjacent ("tandem
135 // repeats")
136 //
137 // FIXME: Teach the SuffixTree to recognize these cases.
TEST(SuffixTreeTest,TestTandemRepeat)138 TEST(SuffixTreeTest, TestTandemRepeat) {
139   std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3};
140   SuffixTree ST(RepeatedRepetitionData);
141   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
142   for (auto It = ST.begin(); It != ST.end(); It++)
143     SubStrings.push_back(*It);
144   EXPECT_EQ(SubStrings.size(), 0u);
145 }
146 
147 // Tests that the SuffixTree does not erroneously include values that are not
148 // in repeated substrings.  That is, only finds {1, 1} at indices 0 and 3 and
149 // does not include 2 and 3.
TEST(SuffixTreeTest,TestExclusion)150 TEST(SuffixTreeTest, TestExclusion) {
151   std::vector<unsigned> RepeatedRepetitionData = {1, 1, 2, 1, 1, 3};
152   std::vector<unsigned>::iterator RRDIt, RRDIt2;
153   SuffixTree ST(RepeatedRepetitionData);
154   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
155   for (auto It = ST.begin(); It != ST.end(); It++)
156     SubStrings.push_back(*It);
157   EXPECT_EQ(SubStrings.size(), 1u);
158   bool IsExpectedIndex;
159   for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
160     for (unsigned StartIdx : RS.StartIndices) {
161       IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
162       EXPECT_TRUE(IsExpectedIndex);
163       RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
164       std::advance(RRDIt, StartIdx);
165       std::advance(RRDIt2, StartIdx + RS.Length);
166       EXPECT_TRUE(
167           all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
168                  [](unsigned Elt) { return Elt == 1; }));
169     }
170   }
171 }
172 
173 // Tests that the SuffixTree is able to find three substrings
174 // {1, 2, 3} at indices 6 and 10;
175 // {2, 3} at indices 7 and 11; and
176 // {1, 2} at indicies 0 and 3.
177 //
178 // FIXME: {1, 2} has indices 6 and 10 missing as it is a substring of {1, 2, 3}
179 // See Test TestSubstringRepeatsWithLeafDescendants for the fix
TEST(SuffixTreeTest,TestSubstringRepeats)180 TEST(SuffixTreeTest, TestSubstringRepeats) {
181   std::vector<unsigned> RepeatedRepetitionData = {1, 2, 100, 1, 2, 101, 1,
182                                                   2, 3, 103, 1, 2, 3,   104};
183   SuffixTree ST(RepeatedRepetitionData);
184   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
185   for (auto It = ST.begin(); It != ST.end(); It++)
186     SubStrings.push_back(*It);
187   EXPECT_EQ(SubStrings.size(), 3u);
188   unsigned Len;
189   for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
190     Len = RS.Length;
191     bool IsExpectedLen = (Len == 3u || Len == 2u);
192     ASSERT_TRUE(IsExpectedLen);
193     bool IsExpectedIndex;
194 
195     if (Len == 3u) { // {1, 2, 3}
196       EXPECT_EQ(RS.StartIndices.size(), 2u);
197       for (unsigned StartIdx : RS.StartIndices) {
198         IsExpectedIndex = (StartIdx == 6u || StartIdx == 10u);
199         EXPECT_TRUE(IsExpectedIndex);
200         EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u);
201         EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
202         EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u);
203       }
204     } else {
205       if (RepeatedRepetitionData[RS.StartIndices[0]] == 1u) { // {1, 2}
206         EXPECT_EQ(RS.StartIndices.size(), 2u);
207         for (unsigned StartIdx : RS.StartIndices) {
208           IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
209           EXPECT_TRUE(IsExpectedIndex);
210           EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
211         }
212       } else { // {2, 3}
213         EXPECT_EQ(RS.StartIndices.size(), 2u);
214         for (unsigned StartIdx : RS.StartIndices) {
215           IsExpectedIndex = (StartIdx == 7u || StartIdx == 11u);
216           EXPECT_TRUE(IsExpectedIndex);
217           EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u);
218           EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u);
219         }
220       }
221     }
222   }
223 }
224 
225 // Tests that the SuffixTree is able to find three substrings
226 // {1, 2, 3} at indices 6 and 10;
227 // {2, 3} at indices 7 and 11; and
228 // {1, 2} at indicies 0, 3, 6, and 10.
229 //
230 // This is the fix for TestSubstringRepeats
TEST(SuffixTreeTest,TestSubstringRepeatsWithLeafDescendants)231 TEST(SuffixTreeTest, TestSubstringRepeatsWithLeafDescendants) {
232   std::vector<unsigned> RepeatedRepetitionData = {1, 2, 100, 1, 2, 101, 1,
233                                                   2, 3, 103, 1, 2, 3,   104};
234   SuffixTree ST(RepeatedRepetitionData, /*OutlinerLeafDescendants=*/true);
235   std::vector<SuffixTree::RepeatedSubstring> SubStrings;
236   for (auto It = ST.begin(); It != ST.end(); It++)
237     SubStrings.push_back(*It);
238   EXPECT_EQ(SubStrings.size(), 3u);
239   unsigned Len;
240   for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
241     Len = RS.Length;
242     bool IsExpectedLen = (Len == 3u || Len == 2u);
243     ASSERT_TRUE(IsExpectedLen);
244     bool IsExpectedIndex;
245 
246     if (Len == 3u) { // {1, 2, 3}
247       EXPECT_EQ(RS.StartIndices.size(), 2u);
248       for (unsigned StartIdx : RS.StartIndices) {
249         IsExpectedIndex = (StartIdx == 6u || StartIdx == 10u);
250         EXPECT_TRUE(IsExpectedIndex);
251         EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u);
252         EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
253         EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u);
254       }
255     } else {
256       if (RepeatedRepetitionData[RS.StartIndices[0]] == 1u) { // {1, 2}
257         EXPECT_EQ(RS.StartIndices.size(), 4u);
258         for (unsigned StartIdx : RS.StartIndices) {
259           IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u ||
260                              StartIdx == 6u || StartIdx == 10u);
261           EXPECT_TRUE(IsExpectedIndex);
262           EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
263         }
264       } else { // {2, 3}
265         EXPECT_EQ(RS.StartIndices.size(), 2u);
266         for (unsigned StartIdx : RS.StartIndices) {
267           IsExpectedIndex = (StartIdx == 7u || StartIdx == 11u);
268           EXPECT_TRUE(IsExpectedIndex);
269           EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u);
270           EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u);
271         }
272       }
273     }
274   }
275 }
276 
277 } // namespace
278