109467b48Spatrick //===-- TrigramIndex.cpp - a heuristic for SpecialCaseList ----------------===// 209467b48Spatrick // 309467b48Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 409467b48Spatrick // See https://llvm.org/LICENSE.txt for license information. 509467b48Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 609467b48Spatrick // 709467b48Spatrick //===----------------------------------------------------------------------===// 809467b48Spatrick // 909467b48Spatrick // TrigramIndex implements a heuristic for SpecialCaseList that allows to 1009467b48Spatrick // filter out ~99% incoming queries when all regular expressions in the 1109467b48Spatrick // SpecialCaseList are simple wildcards with '*' and '.'. If rules are more 1209467b48Spatrick // complicated, the check is defeated and it will always pass the queries to a 1309467b48Spatrick // full regex. 1409467b48Spatrick // 1509467b48Spatrick //===----------------------------------------------------------------------===// 1609467b48Spatrick 1709467b48Spatrick #include "llvm/Support/TrigramIndex.h" 18*d415bd75Srobert #include "llvm/ADT/StringRef.h" 1909467b48Spatrick #include <set> 2009467b48Spatrick 2109467b48Spatrick using namespace llvm; 2209467b48Spatrick 2309467b48Spatrick static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}"; 2409467b48Spatrick isAdvancedMetachar(unsigned Char)2509467b48Spatrickstatic bool isAdvancedMetachar(unsigned Char) { 2609467b48Spatrick return strchr(RegexAdvancedMetachars, Char) != nullptr; 2709467b48Spatrick } 2809467b48Spatrick insert(const std::string & Regex)2973471bf0Spatrickvoid TrigramIndex::insert(const std::string &Regex) { 3009467b48Spatrick if (Defeated) return; 3109467b48Spatrick std::set<unsigned> Was; 3209467b48Spatrick unsigned Cnt = 0; 3309467b48Spatrick unsigned Tri = 0; 3409467b48Spatrick unsigned Len = 0; 3509467b48Spatrick bool Escaped = false; 3609467b48Spatrick for (unsigned Char : Regex) { 3709467b48Spatrick if (!Escaped) { 3809467b48Spatrick // Regular expressions allow escaping symbols by preceding it with '\'. 3909467b48Spatrick if (Char == '\\') { 4009467b48Spatrick Escaped = true; 4109467b48Spatrick continue; 4209467b48Spatrick } 4309467b48Spatrick if (isAdvancedMetachar(Char)) { 4409467b48Spatrick // This is a more complicated regex than we can handle here. 4509467b48Spatrick Defeated = true; 4609467b48Spatrick return; 4709467b48Spatrick } 4809467b48Spatrick if (Char == '.' || Char == '*') { 4909467b48Spatrick Tri = 0; 5009467b48Spatrick Len = 0; 5109467b48Spatrick continue; 5209467b48Spatrick } 5309467b48Spatrick } 5409467b48Spatrick if (Escaped && Char >= '1' && Char <= '9') { 5509467b48Spatrick Defeated = true; 5609467b48Spatrick return; 5709467b48Spatrick } 5809467b48Spatrick // We have already handled escaping and can reset the flag. 5909467b48Spatrick Escaped = false; 6009467b48Spatrick Tri = ((Tri << 8) + Char) & 0xFFFFFF; 6109467b48Spatrick Len++; 6209467b48Spatrick if (Len < 3) 6309467b48Spatrick continue; 6409467b48Spatrick // We don't want the index to grow too much for the popular trigrams, 6509467b48Spatrick // as they are weak signals. It's ok to still require them for the 6609467b48Spatrick // rules we have already processed. It's just a small additional 6709467b48Spatrick // computational cost. 6809467b48Spatrick if (Index[Tri].size() >= 4) 6909467b48Spatrick continue; 7009467b48Spatrick Cnt++; 7109467b48Spatrick if (!Was.count(Tri)) { 7209467b48Spatrick // Adding the current rule to the index. 7309467b48Spatrick Index[Tri].push_back(Counts.size()); 7409467b48Spatrick Was.insert(Tri); 7509467b48Spatrick } 7609467b48Spatrick } 7709467b48Spatrick if (!Cnt) { 7809467b48Spatrick // This rule does not have remarkable trigrams to rely on. 7909467b48Spatrick // We have to always call the full regex chain. 8009467b48Spatrick Defeated = true; 8109467b48Spatrick return; 8209467b48Spatrick } 8309467b48Spatrick Counts.push_back(Cnt); 8409467b48Spatrick } 8509467b48Spatrick isDefinitelyOut(StringRef Query) const8609467b48Spatrickbool TrigramIndex::isDefinitelyOut(StringRef Query) const { 8709467b48Spatrick if (Defeated) 8809467b48Spatrick return false; 8909467b48Spatrick std::vector<unsigned> CurCounts(Counts.size()); 9009467b48Spatrick unsigned Tri = 0; 9109467b48Spatrick for (size_t I = 0; I < Query.size(); I++) { 9209467b48Spatrick Tri = ((Tri << 8) + Query[I]) & 0xFFFFFF; 9309467b48Spatrick if (I < 2) 9409467b48Spatrick continue; 9509467b48Spatrick const auto &II = Index.find(Tri); 9609467b48Spatrick if (II == Index.end()) 9709467b48Spatrick continue; 9809467b48Spatrick for (size_t J : II->second) { 9909467b48Spatrick CurCounts[J]++; 10009467b48Spatrick // If we have reached a desired limit, we have to look at the query 10109467b48Spatrick // more closely by running a full regex. 10209467b48Spatrick if (CurCounts[J] >= Counts[J]) 10309467b48Spatrick return false; 10409467b48Spatrick } 10509467b48Spatrick } 10609467b48Spatrick return true; 10709467b48Spatrick } 108