1 //===-- TrigramIndex.cpp - a heuristic for SpecialCaseList ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // TrigramIndex implements a heuristic for SpecialCaseList that allows to 10 // filter out ~99% incoming queries when all regular expressions in the 11 // SpecialCaseList are simple wildcards with '*' and '.'. If rules are more 12 // complicated, the check is defeated and it will always pass the queries to a 13 // full regex. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/Support/TrigramIndex.h" 18 #include "llvm/ADT/StringRef.h" 19 #include <set> 20 21 using namespace llvm; 22 23 static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}"; 24 isAdvancedMetachar(unsigned Char)25static bool isAdvancedMetachar(unsigned Char) { 26 return strchr(RegexAdvancedMetachars, Char) != nullptr; 27 } 28 insert(const std::string & Regex)29void TrigramIndex::insert(const std::string &Regex) { 30 if (Defeated) return; 31 std::set<unsigned> Was; 32 unsigned Cnt = 0; 33 unsigned Tri = 0; 34 unsigned Len = 0; 35 bool Escaped = false; 36 for (unsigned Char : Regex) { 37 if (!Escaped) { 38 // Regular expressions allow escaping symbols by preceding it with '\'. 39 if (Char == '\\') { 40 Escaped = true; 41 continue; 42 } 43 if (isAdvancedMetachar(Char)) { 44 // This is a more complicated regex than we can handle here. 45 Defeated = true; 46 return; 47 } 48 if (Char == '.' || Char == '*') { 49 Tri = 0; 50 Len = 0; 51 continue; 52 } 53 } 54 if (Escaped && Char >= '1' && Char <= '9') { 55 Defeated = true; 56 return; 57 } 58 // We have already handled escaping and can reset the flag. 59 Escaped = false; 60 Tri = ((Tri << 8) + Char) & 0xFFFFFF; 61 Len++; 62 if (Len < 3) 63 continue; 64 // We don't want the index to grow too much for the popular trigrams, 65 // as they are weak signals. It's ok to still require them for the 66 // rules we have already processed. It's just a small additional 67 // computational cost. 68 if (Index[Tri].size() >= 4) 69 continue; 70 Cnt++; 71 if (!Was.count(Tri)) { 72 // Adding the current rule to the index. 73 Index[Tri].push_back(Counts.size()); 74 Was.insert(Tri); 75 } 76 } 77 if (!Cnt) { 78 // This rule does not have remarkable trigrams to rely on. 79 // We have to always call the full regex chain. 80 Defeated = true; 81 return; 82 } 83 Counts.push_back(Cnt); 84 } 85 isDefinitelyOut(StringRef Query) const86bool TrigramIndex::isDefinitelyOut(StringRef Query) const { 87 if (Defeated) 88 return false; 89 std::vector<unsigned> CurCounts(Counts.size()); 90 unsigned Tri = 0; 91 for (size_t I = 0; I < Query.size(); I++) { 92 Tri = ((Tri << 8) + Query[I]) & 0xFFFFFF; 93 if (I < 2) 94 continue; 95 const auto &II = Index.find(Tri); 96 if (II == Index.end()) 97 continue; 98 for (size_t J : II->second) { 99 CurCounts[J]++; 100 // If we have reached a desired limit, we have to look at the query 101 // more closely by running a full regex. 102 if (CurCounts[J] >= Counts[J]) 103 return false; 104 } 105 } 106 return true; 107 } 108