1311116Sdim//===-- TrigramIndex.cpp - a heuristic for SpecialCaseList ----------------===// 2311116Sdim// 3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4353358Sdim// See https://llvm.org/LICENSE.txt for license information. 5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6311116Sdim// 7311116Sdim//===----------------------------------------------------------------------===// 8311116Sdim// 9311116Sdim// TrigramIndex implements a heuristic for SpecialCaseList that allows to 10311116Sdim// filter out ~99% incoming queries when all regular expressions in the 11311116Sdim// SpecialCaseList are simple wildcards with '*' and '.'. If rules are more 12311116Sdim// complicated, the check is defeated and it will always pass the queries to a 13311116Sdim// full regex. 14311116Sdim// 15311116Sdim//===----------------------------------------------------------------------===// 16311116Sdim 17311116Sdim#include "llvm/Support/TrigramIndex.h" 18311116Sdim#include "llvm/ADT/SmallVector.h" 19311116Sdim 20311116Sdim#include <set> 21311116Sdim#include <string> 22321369Sdim#include <unordered_map> 23311116Sdim 24311116Sdimusing namespace llvm; 25311116Sdim 26311116Sdimstatic const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}"; 27311116Sdim 28311116Sdimstatic bool isAdvancedMetachar(unsigned Char) { 29311116Sdim return strchr(RegexAdvancedMetachars, Char) != nullptr; 30311116Sdim} 31311116Sdim 32311116Sdimvoid TrigramIndex::insert(std::string Regex) { 33311116Sdim if (Defeated) return; 34311116Sdim std::set<unsigned> Was; 35311116Sdim unsigned Cnt = 0; 36311116Sdim unsigned Tri = 0; 37311116Sdim unsigned Len = 0; 38311116Sdim bool Escaped = false; 39311116Sdim for (unsigned Char : Regex) { 40311116Sdim if (!Escaped) { 41311116Sdim // Regular expressions allow escaping symbols by preceding it with '\'. 42311116Sdim if (Char == '\\') { 43311116Sdim Escaped = true; 44311116Sdim continue; 45311116Sdim } 46311116Sdim if (isAdvancedMetachar(Char)) { 47311116Sdim // This is a more complicated regex than we can handle here. 48311116Sdim Defeated = true; 49311116Sdim return; 50311116Sdim } 51311116Sdim if (Char == '.' || Char == '*') { 52311116Sdim Tri = 0; 53311116Sdim Len = 0; 54311116Sdim continue; 55311116Sdim } 56311116Sdim } 57311116Sdim if (Escaped && Char >= '1' && Char <= '9') { 58311116Sdim Defeated = true; 59311116Sdim return; 60311116Sdim } 61311116Sdim // We have already handled escaping and can reset the flag. 62311116Sdim Escaped = false; 63311116Sdim Tri = ((Tri << 8) + Char) & 0xFFFFFF; 64311116Sdim Len++; 65311116Sdim if (Len < 3) 66311116Sdim continue; 67311116Sdim // We don't want the index to grow too much for the popular trigrams, 68311116Sdim // as they are weak signals. It's ok to still require them for the 69311116Sdim // rules we have already processed. It's just a small additional 70311116Sdim // computational cost. 71311116Sdim if (Index[Tri].size() >= 4) 72311116Sdim continue; 73311116Sdim Cnt++; 74311116Sdim if (!Was.count(Tri)) { 75311116Sdim // Adding the current rule to the index. 76311116Sdim Index[Tri].push_back(Counts.size()); 77311116Sdim Was.insert(Tri); 78311116Sdim } 79311116Sdim } 80311116Sdim if (!Cnt) { 81311116Sdim // This rule does not have remarkable trigrams to rely on. 82311116Sdim // We have to always call the full regex chain. 83311116Sdim Defeated = true; 84311116Sdim return; 85311116Sdim } 86311116Sdim Counts.push_back(Cnt); 87311116Sdim} 88311116Sdim 89311116Sdimbool TrigramIndex::isDefinitelyOut(StringRef Query) const { 90311116Sdim if (Defeated) 91311116Sdim return false; 92311116Sdim std::vector<unsigned> CurCounts(Counts.size()); 93311116Sdim unsigned Tri = 0; 94311116Sdim for (size_t I = 0; I < Query.size(); I++) { 95311116Sdim Tri = ((Tri << 8) + Query[I]) & 0xFFFFFF; 96311116Sdim if (I < 2) 97311116Sdim continue; 98311116Sdim const auto &II = Index.find(Tri); 99311116Sdim if (II == Index.end()) 100311116Sdim continue; 101311116Sdim for (size_t J : II->second) { 102311116Sdim CurCounts[J]++; 103311116Sdim // If we have reached a desired limit, we have to look at the query 104311116Sdim // more closely by running a full regex. 105311116Sdim if (CurCounts[J] >= Counts[J]) 106311116Sdim return false; 107311116Sdim } 108311116Sdim } 109311116Sdim return true; 110311116Sdim} 111