1311116Sdim//===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
2311116Sdim//
3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4353358Sdim// See https://llvm.org/LICENSE.txt for license information.
5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6311116Sdim//
7311116Sdim//===----------------------------------------------------------------------===//
8311116Sdim//
9311116Sdim// This file implements a glob pattern matcher.
10311116Sdim//
11311116Sdim//===----------------------------------------------------------------------===//
12311116Sdim
13311116Sdim#include "llvm/Support/GlobPattern.h"
14311116Sdim#include "llvm/ADT/ArrayRef.h"
15311116Sdim#include "llvm/ADT/Optional.h"
16311116Sdim#include "llvm/ADT/StringRef.h"
17311116Sdim#include "llvm/Support/Errc.h"
18311116Sdim
19311116Sdimusing namespace llvm;
20311116Sdim
21311116Sdimstatic bool hasWildcard(StringRef S) {
22360784Sdim  return S.find_first_of("?*[\\") != StringRef::npos;
23311116Sdim}
24311116Sdim
25311116Sdim// Expands character ranges and returns a bitmap.
26311116Sdim// For example, "a-cf-hz" is expanded to "abcfghz".
27311116Sdimstatic Expected<BitVector> expand(StringRef S, StringRef Original) {
28311116Sdim  BitVector BV(256, false);
29311116Sdim
30311116Sdim  // Expand X-Y.
31311116Sdim  for (;;) {
32311116Sdim    if (S.size() < 3)
33311116Sdim      break;
34311116Sdim
35327952Sdim    uint8_t Start = S[0];
36327952Sdim    uint8_t End = S[2];
37327952Sdim
38311116Sdim    // If it doesn't start with something like X-Y,
39311116Sdim    // consume the first character and proceed.
40311116Sdim    if (S[1] != '-') {
41327952Sdim      BV[Start] = true;
42311116Sdim      S = S.substr(1);
43311116Sdim      continue;
44311116Sdim    }
45311116Sdim
46311116Sdim    // It must be in the form of X-Y.
47311116Sdim    // Validate it and then interpret the range.
48327952Sdim    if (Start > End)
49311116Sdim      return make_error<StringError>("invalid glob pattern: " + Original,
50311116Sdim                                     errc::invalid_argument);
51311116Sdim
52327952Sdim    for (int C = Start; C <= End; ++C)
53327952Sdim      BV[(uint8_t)C] = true;
54311116Sdim    S = S.substr(3);
55311116Sdim  }
56311116Sdim
57311116Sdim  for (char C : S)
58327952Sdim    BV[(uint8_t)C] = true;
59311116Sdim  return BV;
60311116Sdim}
61311116Sdim
62311116Sdim// This is a scanner for the glob pattern.
63360784Sdim// A glob pattern token is one of "*", "?", "\", "[<chars>]", "[^<chars>]"
64360784Sdim// (which is a negative form of "[<chars>]"), "[!<chars>]" (which is
65360784Sdim// equivalent to "[^<chars>]"), or a non-meta character.
66311116Sdim// This function returns the first token in S.
67311116Sdimstatic Expected<BitVector> scan(StringRef &S, StringRef Original) {
68311116Sdim  switch (S[0]) {
69311116Sdim  case '*':
70311116Sdim    S = S.substr(1);
71311116Sdim    // '*' is represented by an empty bitvector.
72311116Sdim    // All other bitvectors are 256-bit long.
73311116Sdim    return BitVector();
74311116Sdim  case '?':
75311116Sdim    S = S.substr(1);
76311116Sdim    return BitVector(256, true);
77311116Sdim  case '[': {
78360784Sdim    // ']' is allowed as the first character of a character class. '[]' is
79360784Sdim    // invalid. So, just skip the first character.
80360784Sdim    size_t End = S.find(']', 2);
81311116Sdim    if (End == StringRef::npos)
82311116Sdim      return make_error<StringError>("invalid glob pattern: " + Original,
83311116Sdim                                     errc::invalid_argument);
84311116Sdim
85311116Sdim    StringRef Chars = S.substr(1, End - 1);
86311116Sdim    S = S.substr(End + 1);
87360784Sdim    if (Chars.startswith("^") || Chars.startswith("!")) {
88311116Sdim      Expected<BitVector> BV = expand(Chars.substr(1), Original);
89311116Sdim      if (!BV)
90311116Sdim        return BV.takeError();
91311116Sdim      return BV->flip();
92311116Sdim    }
93311116Sdim    return expand(Chars, Original);
94311116Sdim  }
95360784Sdim  case '\\':
96360784Sdim    // Eat this character and fall through below to treat it like a non-meta
97360784Sdim    // character.
98360784Sdim    S = S.substr(1);
99360784Sdim    LLVM_FALLTHROUGH;
100311116Sdim  default:
101311116Sdim    BitVector BV(256, false);
102327952Sdim    BV[(uint8_t)S[0]] = true;
103311116Sdim    S = S.substr(1);
104311116Sdim    return BV;
105311116Sdim  }
106311116Sdim}
107311116Sdim
108311116SdimExpected<GlobPattern> GlobPattern::create(StringRef S) {
109311116Sdim  GlobPattern Pat;
110311116Sdim
111311116Sdim  // S doesn't contain any metacharacter,
112311116Sdim  // so the regular string comparison should work.
113311116Sdim  if (!hasWildcard(S)) {
114311116Sdim    Pat.Exact = S;
115311116Sdim    return Pat;
116311116Sdim  }
117311116Sdim
118360784Sdim  // S is something like "foo*", and the "* is not escaped. We can use
119360784Sdim  // startswith().
120360784Sdim  if (S.endswith("*") && !S.endswith("\\*") && !hasWildcard(S.drop_back())) {
121311116Sdim    Pat.Prefix = S.drop_back();
122311116Sdim    return Pat;
123311116Sdim  }
124311116Sdim
125311116Sdim  // S is something like "*foo". We can use endswith().
126311116Sdim  if (S.startswith("*") && !hasWildcard(S.drop_front())) {
127311116Sdim    Pat.Suffix = S.drop_front();
128311116Sdim    return Pat;
129311116Sdim  }
130311116Sdim
131311116Sdim  // Otherwise, we need to do real glob pattern matching.
132311116Sdim  // Parse the pattern now.
133311116Sdim  StringRef Original = S;
134311116Sdim  while (!S.empty()) {
135311116Sdim    Expected<BitVector> BV = scan(S, Original);
136311116Sdim    if (!BV)
137311116Sdim      return BV.takeError();
138311116Sdim    Pat.Tokens.push_back(*BV);
139311116Sdim  }
140311116Sdim  return Pat;
141311116Sdim}
142311116Sdim
143311116Sdimbool GlobPattern::match(StringRef S) const {
144311116Sdim  if (Exact)
145311116Sdim    return S == *Exact;
146311116Sdim  if (Prefix)
147311116Sdim    return S.startswith(*Prefix);
148311116Sdim  if (Suffix)
149311116Sdim    return S.endswith(*Suffix);
150311116Sdim  return matchOne(Tokens, S);
151311116Sdim}
152311116Sdim
153311116Sdim// Runs glob pattern Pats against string S.
154311116Sdimbool GlobPattern::matchOne(ArrayRef<BitVector> Pats, StringRef S) const {
155311116Sdim  for (;;) {
156311116Sdim    if (Pats.empty())
157311116Sdim      return S.empty();
158311116Sdim
159311116Sdim    // If Pats[0] is '*', try to match Pats[1..] against all possible
160311116Sdim    // tail strings of S to see at least one pattern succeeds.
161311116Sdim    if (Pats[0].size() == 0) {
162311116Sdim      Pats = Pats.slice(1);
163311116Sdim      if (Pats.empty())
164311116Sdim        // Fast path. If a pattern is '*', it matches anything.
165311116Sdim        return true;
166311116Sdim      for (size_t I = 0, E = S.size(); I < E; ++I)
167311116Sdim        if (matchOne(Pats, S.substr(I)))
168311116Sdim          return true;
169311116Sdim      return false;
170311116Sdim    }
171311116Sdim
172311116Sdim    // If Pats[0] is not '*', it must consume one character.
173327952Sdim    if (S.empty() || !Pats[0][(uint8_t)S[0]])
174311116Sdim      return false;
175311116Sdim    Pats = Pats.slice(1);
176311116Sdim    S = S.substr(1);
177311116Sdim  }
178311116Sdim}
179