1/*
2 * An implementation of what I call the "Sea of Stars" algorithm for
3 * POSIX fnmatch(). The basic idea is that we factor the pattern into
4 * a head component (which we match first and can reject without ever
5 * measuring the length of the string), an optional tail component
6 * (which only exists if the pattern contains at least one star), and
7 * an optional "sea of stars", a set of star-separated components
8 * between the head and tail. After the head and tail matches have
9 * been removed from the input string, the components in the "sea of
10 * stars" are matched sequentially by searching for their first
11 * occurrence past the end of the previous match.
12 *
13 * - Rich Felker, April 2012
14 */
15
16#include "locale_impl.h"
17#include <fnmatch.h>
18#include <stdlib.h>
19#include <string.h>
20#include <wchar.h>
21#include <wctype.h>
22
23#define END 0
24#define UNMATCHABLE -2
25#define BRACKET -3
26#define QUESTION -4
27#define STAR -5
28
29static int str_next(const char* str, size_t n, size_t* step) {
30    if (!n) {
31        *step = 0;
32        return 0;
33    }
34    if (str[0] >= 128U) {
35        wchar_t wc;
36        int k = mbtowc(&wc, str, n);
37        if (k < 0) {
38            *step = 1;
39            return -1;
40        }
41        *step = k;
42        return wc;
43    }
44    *step = 1;
45    return str[0];
46}
47
48static int pat_next(const char* pat, size_t m, size_t* step, int flags) {
49    int esc = 0;
50    if (!m || !*pat) {
51        *step = 0;
52        return END;
53    }
54    *step = 1;
55    if (pat[0] == '\\' && pat[1] && !(flags & FNM_NOESCAPE)) {
56        *step = 2;
57        pat++;
58        esc = 1;
59        goto escaped;
60    }
61    if (pat[0] == '[') {
62        size_t k = 1;
63        if (k < m)
64            if (pat[k] == '^' || pat[k] == '!')
65                k++;
66        if (k < m)
67            if (pat[k] == ']')
68                k++;
69        for (; k < m && pat[k] && pat[k] != ']'; k++) {
70            if (k + 1 < m && pat[k + 1] && pat[k] == '[' &&
71                (pat[k + 1] == ':' || pat[k + 1] == '.' || pat[k + 1] == '=')) {
72                int z = pat[k + 1];
73                k += 2;
74                if (k < m && pat[k])
75                    k++;
76                while (k < m && pat[k] && (pat[k - 1] != z || pat[k] != ']'))
77                    k++;
78                if (k == m || !pat[k])
79                    break;
80            }
81        }
82        if (k == m || !pat[k]) {
83            *step = 1;
84            return '[';
85        }
86        *step = k + 1;
87        return BRACKET;
88    }
89    if (pat[0] == '*')
90        return STAR;
91    if (pat[0] == '?')
92        return QUESTION;
93escaped:
94    if (pat[0] >= 128U) {
95        wchar_t wc;
96        int k = mbtowc(&wc, pat, m);
97        if (k < 0) {
98            *step = 0;
99            return UNMATCHABLE;
100        }
101        *step = k + esc;
102        return wc;
103    }
104    return pat[0];
105}
106
107static int casefold(int k) {
108    int c = towupper(k);
109    return c == k ? towlower(k) : c;
110}
111
112static int match_bracket(const char* p, int k, int kfold) {
113    wchar_t wc;
114    int inv = 0;
115    p++;
116    if (*p == '^' || *p == '!') {
117        inv = 1;
118        p++;
119    }
120    if (*p == ']') {
121        if (k == ']')
122            return !inv;
123        p++;
124    } else if (*p == '-') {
125        if (k == '-')
126            return !inv;
127        p++;
128    }
129    wc = p[-1];
130    for (; *p != ']'; p++) {
131        if (p[0] == '-' && p[1] != ']') {
132            wchar_t wc2;
133            int l = mbtowc(&wc2, p + 1, 4);
134            if (l < 0)
135                return 0;
136            if (wc <= wc2)
137                if ((unsigned)k - wc <= wc2 - wc || (unsigned)kfold - wc <= wc2 - wc)
138                    return !inv;
139            p += l - 1;
140            continue;
141        }
142        if (p[0] == '[' && (p[1] == ':' || p[1] == '.' || p[1] == '=')) {
143            const char* p0 = p + 2;
144            int z = p[1];
145            p += 3;
146            while (p[-1] != z || p[0] != ']')
147                p++;
148            if (z == ':' && p - 1 - p0 < 16) {
149                char buf[16];
150                memcpy(buf, p0, p - 1 - p0);
151                buf[p - 1 - p0] = 0;
152                if (iswctype(k, wctype(buf)) || iswctype(kfold, wctype(buf)))
153                    return !inv;
154            }
155            continue;
156        }
157        if (*p < 128U) {
158            wc = (unsigned char)*p;
159        } else {
160            int l = mbtowc(&wc, p, 4);
161            if (l < 0)
162                return 0;
163            p += l - 1;
164        }
165        if (wc == k || wc == kfold)
166            return !inv;
167    }
168    return inv;
169}
170
171static int fnmatch_internal(const char* pat, size_t m, const char* str, size_t n, int flags) {
172    const char *p, *ptail, *endpat;
173    const char *s, *stail, *endstr;
174    size_t pinc, sinc, tailcnt = 0;
175    int c, k, kfold;
176
177    if (flags & FNM_PERIOD) {
178        if (*str == '.' && *pat != '.')
179            return FNM_NOMATCH;
180    }
181    for (;;) {
182        switch ((c = pat_next(pat, m, &pinc, flags))) {
183        case UNMATCHABLE:
184            return FNM_NOMATCH;
185        case STAR:
186            pat++;
187            m--;
188            break;
189        default:
190            k = str_next(str, n, &sinc);
191            if (k <= 0)
192                return (c == END) ? 0 : FNM_NOMATCH;
193            str += sinc;
194            n -= sinc;
195            kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
196            if (c == BRACKET) {
197                if (!match_bracket(pat, k, kfold))
198                    return FNM_NOMATCH;
199            } else if (c != QUESTION && k != c && kfold != c) {
200                return FNM_NOMATCH;
201            }
202            pat += pinc;
203            m -= pinc;
204            continue;
205        }
206        break;
207    }
208
209    /* Compute real pat length if it was initially unknown/-1 */
210    m = strnlen(pat, m);
211    endpat = pat + m;
212
213    /* Find the last * in pat and count chars needed after it */
214    for (p = ptail = pat; p < endpat; p += pinc) {
215        switch (pat_next(p, endpat - p, &pinc, flags)) {
216        case UNMATCHABLE:
217            return FNM_NOMATCH;
218        case STAR:
219            tailcnt = 0;
220            ptail = p + 1;
221            break;
222        default:
223            tailcnt++;
224            break;
225        }
226    }
227
228    /* Past this point we need not check for UNMATCHABLE in pat,
229     * because all of pat has already been parsed once. */
230
231    /* Compute real str length if it was initially unknown/-1 */
232    n = strnlen(str, n);
233    endstr = str + n;
234    if (n < tailcnt)
235        return FNM_NOMATCH;
236
237    /* Find the final tailcnt chars of str, accounting for UTF-8.
238     * On illegal sequences we may get it wrong, but in that case
239     * we necessarily have a matching failure anyway. */
240    for (s = endstr; s > str && tailcnt; tailcnt--) {
241        if (s[-1] < 128U || MB_CUR_MAX == 1)
242            s--;
243        else
244            while ((unsigned char)*--s - 0x80U < 0x40 && s > str)
245                ;
246    }
247    if (tailcnt)
248        return FNM_NOMATCH;
249    stail = s;
250
251    /* Check that the pat and str tails match */
252    p = ptail;
253    for (;;) {
254        c = pat_next(p, endpat - p, &pinc, flags);
255        p += pinc;
256        if ((k = str_next(s, endstr - s, &sinc)) <= 0) {
257            if (c != END)
258                return FNM_NOMATCH;
259            break;
260        }
261        s += sinc;
262        kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
263        if (c == BRACKET) {
264            if (!match_bracket(p - pinc, k, kfold))
265                return FNM_NOMATCH;
266        } else if (c != QUESTION && k != c && kfold != c) {
267            return FNM_NOMATCH;
268        }
269    }
270
271    /* We're all done with the tails now, so throw them out */
272    endstr = stail;
273    endpat = ptail;
274
275    /* Match pattern components until there are none left */
276    while (pat < endpat) {
277        p = pat;
278        s = str;
279        for (;;) {
280            c = pat_next(p, endpat - p, &pinc, flags);
281            p += pinc;
282            /* Encountering * completes/commits a component */
283            if (c == STAR) {
284                pat = p;
285                str = s;
286                break;
287            }
288            k = str_next(s, endstr - s, &sinc);
289            if (!k)
290                return FNM_NOMATCH;
291            kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
292            if (c == BRACKET) {
293                if (!match_bracket(p - pinc, k, kfold))
294                    break;
295            } else if (c != QUESTION && k != c && kfold != c) {
296                break;
297            }
298            s += sinc;
299        }
300        if (c == STAR)
301            continue;
302        /* If we failed, advance str, by 1 char if it's a valid
303         * char, or past all invalid bytes otherwise. */
304        k = str_next(str, endstr - str, &sinc);
305        if (k > 0)
306            str += sinc;
307        else
308            for (str++; str_next(str, endstr - str, &sinc) < 0; str++)
309                ;
310    }
311
312    return 0;
313}
314
315int fnmatch(const char* pat, const char* str, int flags) {
316    const char *s, *p;
317    size_t inc;
318    int c;
319    if (flags & FNM_PATHNAME)
320        for (;;) {
321            for (s = str; *s && *s != '/'; s++)
322                ;
323            for (p = pat; (c = pat_next(p, -1, &inc, flags)) != END && c != '/'; p += inc)
324                ;
325            if (c != *s && (!*s || !(flags & FNM_LEADING_DIR)))
326                return FNM_NOMATCH;
327            if (fnmatch_internal(pat, p - pat, str, s - str, flags))
328                return FNM_NOMATCH;
329            if (!c)
330                return 0;
331            str = s + 1;
332            pat = p + inc;
333        }
334    else if (flags & FNM_LEADING_DIR) {
335        for (s = str; *s; s++) {
336            if (*s != '/')
337                continue;
338            if (!fnmatch_internal(pat, -1, str, s - str, flags))
339                return 0;
340        }
341    }
342    return fnmatch_internal(pat, -1, str, -1, flags);
343}
344