1//===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a glob pattern matcher.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/Support/GlobPattern.h"
14#include "llvm/ADT/StringRef.h"
15#include "llvm/Support/Errc.h"
16
17using namespace llvm;
18
19// Expands character ranges and returns a bitmap.
20// For example, "a-cf-hz" is expanded to "abcfghz".
21static Expected<BitVector> expand(StringRef S, StringRef Original) {
22  BitVector BV(256, false);
23
24  // Expand X-Y.
25  for (;;) {
26    if (S.size() < 3)
27      break;
28
29    uint8_t Start = S[0];
30    uint8_t End = S[2];
31
32    // If it doesn't start with something like X-Y,
33    // consume the first character and proceed.
34    if (S[1] != '-') {
35      BV[Start] = true;
36      S = S.substr(1);
37      continue;
38    }
39
40    // It must be in the form of X-Y.
41    // Validate it and then interpret the range.
42    if (Start > End)
43      return make_error<StringError>("invalid glob pattern: " + Original,
44                                     errc::invalid_argument);
45
46    for (int C = Start; C <= End; ++C)
47      BV[(uint8_t)C] = true;
48    S = S.substr(3);
49  }
50
51  for (char C : S)
52    BV[(uint8_t)C] = true;
53  return BV;
54}
55
56// Identify brace expansions in S and return the list of patterns they expand
57// into.
58static Expected<SmallVector<std::string, 1>>
59parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
60  SmallVector<std::string> SubPatterns = {S.str()};
61  if (!MaxSubPatterns || !S.contains('{'))
62    return std::move(SubPatterns);
63
64  struct BraceExpansion {
65    size_t Start;
66    size_t Length;
67    SmallVector<StringRef, 2> Terms;
68  };
69  SmallVector<BraceExpansion, 0> BraceExpansions;
70
71  BraceExpansion *CurrentBE = nullptr;
72  size_t TermBegin;
73  for (size_t I = 0, E = S.size(); I != E; ++I) {
74    if (S[I] == '[') {
75      I = S.find(']', I + 2);
76      if (I == std::string::npos)
77        return make_error<StringError>("invalid glob pattern, unmatched '['",
78                                       errc::invalid_argument);
79    } else if (S[I] == '{') {
80      if (CurrentBE)
81        return make_error<StringError>(
82            "nested brace expansions are not supported",
83            errc::invalid_argument);
84      CurrentBE = &BraceExpansions.emplace_back();
85      CurrentBE->Start = I;
86      TermBegin = I + 1;
87    } else if (S[I] == ',') {
88      if (!CurrentBE)
89        continue;
90      CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
91      TermBegin = I + 1;
92    } else if (S[I] == '}') {
93      if (!CurrentBE)
94        continue;
95      if (CurrentBE->Terms.empty())
96        return make_error<StringError>(
97            "empty or singleton brace expansions are not supported",
98            errc::invalid_argument);
99      CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
100      CurrentBE->Length = I - CurrentBE->Start + 1;
101      CurrentBE = nullptr;
102    } else if (S[I] == '\\') {
103      if (++I == E)
104        return make_error<StringError>("invalid glob pattern, stray '\\'",
105                                       errc::invalid_argument);
106    }
107  }
108  if (CurrentBE)
109    return make_error<StringError>("incomplete brace expansion",
110                                   errc::invalid_argument);
111
112  size_t NumSubPatterns = 1;
113  for (auto &BE : BraceExpansions) {
114    if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) {
115      NumSubPatterns = std::numeric_limits<size_t>::max();
116      break;
117    }
118    NumSubPatterns *= BE.Terms.size();
119  }
120  if (NumSubPatterns > *MaxSubPatterns)
121    return make_error<StringError>("too many brace expansions",
122                                   errc::invalid_argument);
123  // Replace brace expansions in reverse order so that we don't invalidate
124  // earlier start indices
125  for (auto &BE : reverse(BraceExpansions)) {
126    SmallVector<std::string> OrigSubPatterns;
127    std::swap(SubPatterns, OrigSubPatterns);
128    for (StringRef Term : BE.Terms)
129      for (StringRef Orig : OrigSubPatterns)
130        SubPatterns.emplace_back(Orig).replace(BE.Start, BE.Length, Term);
131  }
132  return std::move(SubPatterns);
133}
134
135Expected<GlobPattern>
136GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
137  GlobPattern Pat;
138
139  // Store the prefix that does not contain any metacharacter.
140  size_t PrefixSize = S.find_first_of("?*[{\\");
141  Pat.Prefix = S.substr(0, PrefixSize);
142  if (PrefixSize == std::string::npos)
143    return Pat;
144  S = S.substr(PrefixSize);
145
146  SmallVector<std::string, 1> SubPats;
147  if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(SubPats))
148    return std::move(Err);
149  for (StringRef SubPat : SubPats) {
150    auto SubGlobOrErr = SubGlobPattern::create(SubPat);
151    if (!SubGlobOrErr)
152      return SubGlobOrErr.takeError();
153    Pat.SubGlobs.push_back(*SubGlobOrErr);
154  }
155
156  return Pat;
157}
158
159Expected<GlobPattern::SubGlobPattern>
160GlobPattern::SubGlobPattern::create(StringRef S) {
161  SubGlobPattern Pat;
162
163  // Parse brackets.
164  Pat.Pat.assign(S.begin(), S.end());
165  for (size_t I = 0, E = S.size(); I != E; ++I) {
166    if (S[I] == '[') {
167      // ']' is allowed as the first character of a character class. '[]' is
168      // invalid. So, just skip the first character.
169      ++I;
170      size_t J = S.find(']', I + 1);
171      if (J == StringRef::npos)
172        return make_error<StringError>("invalid glob pattern, unmatched '['",
173                                       errc::invalid_argument);
174      StringRef Chars = S.substr(I, J - I);
175      bool Invert = S[I] == '^' || S[I] == '!';
176      Expected<BitVector> BV =
177          Invert ? expand(Chars.substr(1), S) : expand(Chars, S);
178      if (!BV)
179        return BV.takeError();
180      if (Invert)
181        BV->flip();
182      Pat.Brackets.push_back(Bracket{J + 1, std::move(*BV)});
183      I = J;
184    } else if (S[I] == '\\') {
185      if (++I == E)
186        return make_error<StringError>("invalid glob pattern, stray '\\'",
187                                       errc::invalid_argument);
188    }
189  }
190  return Pat;
191}
192
193bool GlobPattern::match(StringRef S) const {
194  if (!S.consume_front(Prefix))
195    return false;
196  if (SubGlobs.empty() && S.empty())
197    return true;
198  for (auto &Glob : SubGlobs)
199    if (Glob.match(S))
200      return true;
201  return false;
202}
203
204// Factor the pattern into segments split by '*'. The segment is matched
205// sequentianlly by finding the first occurrence past the end of the previous
206// match.
207bool GlobPattern::SubGlobPattern::match(StringRef Str) const {
208  const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(),
209             *SavedS = S;
210  const char *const PEnd = P + Pat.size(), *const End = S + Str.size();
211  size_t B = 0, SavedB = 0;
212  while (S != End) {
213    if (P == PEnd)
214      ;
215    else if (*P == '*') {
216      // The non-* substring on the left of '*' matches the tail of S. Save the
217      // positions to be used by backtracking if we see a mismatch later.
218      SegmentBegin = ++P;
219      SavedS = S;
220      SavedB = B;
221      continue;
222    } else if (*P == '[') {
223      if (Brackets[B].Bytes[uint8_t(*S)]) {
224        P = Pat.data() + Brackets[B++].NextOffset;
225        ++S;
226        continue;
227      }
228    } else if (*P == '\\') {
229      if (*++P == *S) {
230        ++P;
231        ++S;
232        continue;
233      }
234    } else if (*P == *S || *P == '?') {
235      ++P;
236      ++S;
237      continue;
238    }
239    if (!SegmentBegin)
240      return false;
241    // We have seen a '*'. Backtrack to the saved positions. Shift the S
242    // position to probe the next starting position in the segment.
243    P = SegmentBegin;
244    S = ++SavedS;
245    B = SavedB;
246  }
247  // All bytes in Str have been matched. Return true if the rest part of Pat is
248  // empty or contains only '*'.
249  return getPat().find_first_not_of('*', P - Pat.data()) == std::string::npos;
250}
251