1311116Sdim//===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===// 2311116Sdim// 3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4353358Sdim// See https://llvm.org/LICENSE.txt for license information. 5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6311116Sdim// 7311116Sdim//===----------------------------------------------------------------------===// 8311116Sdim// 9311116Sdim// This file implements a glob pattern matcher. 10311116Sdim// 11311116Sdim//===----------------------------------------------------------------------===// 12311116Sdim 13311116Sdim#include "llvm/Support/GlobPattern.h" 14311116Sdim#include "llvm/ADT/ArrayRef.h" 15311116Sdim#include "llvm/ADT/Optional.h" 16311116Sdim#include "llvm/ADT/StringRef.h" 17311116Sdim#include "llvm/Support/Errc.h" 18311116Sdim 19311116Sdimusing namespace llvm; 20311116Sdim 21311116Sdimstatic bool hasWildcard(StringRef S) { 22360784Sdim return S.find_first_of("?*[\\") != StringRef::npos; 23311116Sdim} 24311116Sdim 25311116Sdim// Expands character ranges and returns a bitmap. 26311116Sdim// For example, "a-cf-hz" is expanded to "abcfghz". 27311116Sdimstatic Expected<BitVector> expand(StringRef S, StringRef Original) { 28311116Sdim BitVector BV(256, false); 29311116Sdim 30311116Sdim // Expand X-Y. 31311116Sdim for (;;) { 32311116Sdim if (S.size() < 3) 33311116Sdim break; 34311116Sdim 35327952Sdim uint8_t Start = S[0]; 36327952Sdim uint8_t End = S[2]; 37327952Sdim 38311116Sdim // If it doesn't start with something like X-Y, 39311116Sdim // consume the first character and proceed. 40311116Sdim if (S[1] != '-') { 41327952Sdim BV[Start] = true; 42311116Sdim S = S.substr(1); 43311116Sdim continue; 44311116Sdim } 45311116Sdim 46311116Sdim // It must be in the form of X-Y. 47311116Sdim // Validate it and then interpret the range. 48327952Sdim if (Start > End) 49311116Sdim return make_error<StringError>("invalid glob pattern: " + Original, 50311116Sdim errc::invalid_argument); 51311116Sdim 52327952Sdim for (int C = Start; C <= End; ++C) 53327952Sdim BV[(uint8_t)C] = true; 54311116Sdim S = S.substr(3); 55311116Sdim } 56311116Sdim 57311116Sdim for (char C : S) 58327952Sdim BV[(uint8_t)C] = true; 59311116Sdim return BV; 60311116Sdim} 61311116Sdim 62311116Sdim// This is a scanner for the glob pattern. 63360784Sdim// A glob pattern token is one of "*", "?", "\", "[<chars>]", "[^<chars>]" 64360784Sdim// (which is a negative form of "[<chars>]"), "[!<chars>]" (which is 65360784Sdim// equivalent to "[^<chars>]"), or a non-meta character. 66311116Sdim// This function returns the first token in S. 67311116Sdimstatic Expected<BitVector> scan(StringRef &S, StringRef Original) { 68311116Sdim switch (S[0]) { 69311116Sdim case '*': 70311116Sdim S = S.substr(1); 71311116Sdim // '*' is represented by an empty bitvector. 72311116Sdim // All other bitvectors are 256-bit long. 73311116Sdim return BitVector(); 74311116Sdim case '?': 75311116Sdim S = S.substr(1); 76311116Sdim return BitVector(256, true); 77311116Sdim case '[': { 78360784Sdim // ']' is allowed as the first character of a character class. '[]' is 79360784Sdim // invalid. So, just skip the first character. 80360784Sdim size_t End = S.find(']', 2); 81311116Sdim if (End == StringRef::npos) 82311116Sdim return make_error<StringError>("invalid glob pattern: " + Original, 83311116Sdim errc::invalid_argument); 84311116Sdim 85311116Sdim StringRef Chars = S.substr(1, End - 1); 86311116Sdim S = S.substr(End + 1); 87360784Sdim if (Chars.startswith("^") || Chars.startswith("!")) { 88311116Sdim Expected<BitVector> BV = expand(Chars.substr(1), Original); 89311116Sdim if (!BV) 90311116Sdim return BV.takeError(); 91311116Sdim return BV->flip(); 92311116Sdim } 93311116Sdim return expand(Chars, Original); 94311116Sdim } 95360784Sdim case '\\': 96360784Sdim // Eat this character and fall through below to treat it like a non-meta 97360784Sdim // character. 98360784Sdim S = S.substr(1); 99360784Sdim LLVM_FALLTHROUGH; 100311116Sdim default: 101311116Sdim BitVector BV(256, false); 102327952Sdim BV[(uint8_t)S[0]] = true; 103311116Sdim S = S.substr(1); 104311116Sdim return BV; 105311116Sdim } 106311116Sdim} 107311116Sdim 108311116SdimExpected<GlobPattern> GlobPattern::create(StringRef S) { 109311116Sdim GlobPattern Pat; 110311116Sdim 111311116Sdim // S doesn't contain any metacharacter, 112311116Sdim // so the regular string comparison should work. 113311116Sdim if (!hasWildcard(S)) { 114311116Sdim Pat.Exact = S; 115311116Sdim return Pat; 116311116Sdim } 117311116Sdim 118360784Sdim // S is something like "foo*", and the "* is not escaped. We can use 119360784Sdim // startswith(). 120360784Sdim if (S.endswith("*") && !S.endswith("\\*") && !hasWildcard(S.drop_back())) { 121311116Sdim Pat.Prefix = S.drop_back(); 122311116Sdim return Pat; 123311116Sdim } 124311116Sdim 125311116Sdim // S is something like "*foo". We can use endswith(). 126311116Sdim if (S.startswith("*") && !hasWildcard(S.drop_front())) { 127311116Sdim Pat.Suffix = S.drop_front(); 128311116Sdim return Pat; 129311116Sdim } 130311116Sdim 131311116Sdim // Otherwise, we need to do real glob pattern matching. 132311116Sdim // Parse the pattern now. 133311116Sdim StringRef Original = S; 134311116Sdim while (!S.empty()) { 135311116Sdim Expected<BitVector> BV = scan(S, Original); 136311116Sdim if (!BV) 137311116Sdim return BV.takeError(); 138311116Sdim Pat.Tokens.push_back(*BV); 139311116Sdim } 140311116Sdim return Pat; 141311116Sdim} 142311116Sdim 143311116Sdimbool GlobPattern::match(StringRef S) const { 144311116Sdim if (Exact) 145311116Sdim return S == *Exact; 146311116Sdim if (Prefix) 147311116Sdim return S.startswith(*Prefix); 148311116Sdim if (Suffix) 149311116Sdim return S.endswith(*Suffix); 150311116Sdim return matchOne(Tokens, S); 151311116Sdim} 152311116Sdim 153311116Sdim// Runs glob pattern Pats against string S. 154311116Sdimbool GlobPattern::matchOne(ArrayRef<BitVector> Pats, StringRef S) const { 155311116Sdim for (;;) { 156311116Sdim if (Pats.empty()) 157311116Sdim return S.empty(); 158311116Sdim 159311116Sdim // If Pats[0] is '*', try to match Pats[1..] against all possible 160311116Sdim // tail strings of S to see at least one pattern succeeds. 161311116Sdim if (Pats[0].size() == 0) { 162311116Sdim Pats = Pats.slice(1); 163311116Sdim if (Pats.empty()) 164311116Sdim // Fast path. If a pattern is '*', it matches anything. 165311116Sdim return true; 166311116Sdim for (size_t I = 0, E = S.size(); I < E; ++I) 167311116Sdim if (matchOne(Pats, S.substr(I))) 168311116Sdim return true; 169311116Sdim return false; 170311116Sdim } 171311116Sdim 172311116Sdim // If Pats[0] is not '*', it must consume one character. 173327952Sdim if (S.empty() || !Pats[0][(uint8_t)S[0]]) 174311116Sdim return false; 175311116Sdim Pats = Pats.slice(1); 176311116Sdim S = S.substr(1); 177311116Sdim } 178311116Sdim} 179