1/*
2 * An implementation of what I call the "Sea of Stars" algorithm for
3 * POSIX fnmatch(). The basic idea is that we factor the pattern into
4 * a head component (which we match first and can reject without ever
5 * measuring the length of the string), an optional tail component
6 * (which only exists if the pattern contains at least one star), and
7 * an optional "sea of stars", a set of star-separated components
8 * between the head and tail. After the head and tail matches have
9 * been removed from the input string, the components in the "sea of
10 * stars" are matched sequentially by searching for their first
11 * occurrence past the end of the previous match.
12 *
13 * - Rich Felker, April 2012
14 */
15
16#include <string.h>
17#include <fnmatch.h>
18#include <stdlib.h>
19#include <wchar.h>
20#include <wctype.h>
21
22#define END 0
23#define UNMATCHABLE -2
24#define BRACKET -3
25#define QUESTION -4
26#define STAR -5
27
28static int str_next(const char *str, size_t n, size_t *step)
29{
30	if (!n) {
31		*step = 0;
32		return 0;
33	}
34	if (str[0] >= 128U) {
35		wchar_t wc;
36		int k = mbtowc(&wc, str, n);
37		if (k<0) {
38			*step = 1;
39			return -1;
40		}
41		*step = k;
42		return wc;
43	}
44	*step = 1;
45	return str[0];
46}
47
48static int pat_next(const char *pat, size_t m, size_t *step, int flags)
49{
50	int esc = 0;
51	if (!m || !*pat) {
52		*step = 0;
53		return END;
54	}
55	*step = 1;
56	if (pat[0]=='\\' && pat[1] && !(flags & FNM_NOESCAPE)) {
57		*step = 2;
58		pat++;
59		esc = 1;
60		goto escaped;
61	}
62	if (pat[0]=='[') {
63		size_t k = 1;
64		if (k<m) if (pat[k] == '^' || pat[k] == '!') k++;
65		if (k<m) if (pat[k] == ']') k++;
66		for (; k<m && pat[k] && pat[k]!=']'; k++) {
67			if (k+1<m && pat[k+1] && pat[k]=='[' && (pat[k+1]==':' || pat[k+1]=='.' || pat[k+1]=='=')) {
68				int z = pat[k+1];
69				k+=2;
70				if (k<m && pat[k]) k++;
71				while (k<m && pat[k] && (pat[k-1]!=z || pat[k]!=']')) k++;
72				if (k==m || !pat[k]) break;
73			}
74		}
75		if (k==m || !pat[k]) {
76			*step = 1;
77			return '[';
78		}
79		*step = k+1;
80		return BRACKET;
81	}
82	if (pat[0] == '*')
83		return STAR;
84	if (pat[0] == '?')
85		return QUESTION;
86escaped:
87	if (pat[0] >= 128U) {
88		wchar_t wc;
89		int k = mbtowc(&wc, pat, m);
90		if (k<0) {
91			*step = 0;
92			return UNMATCHABLE;
93		}
94		*step = k + esc;
95		return wc;
96	}
97	return pat[0];
98}
99
100static int casefold(int k)
101{
102	int c = towupper(k);
103	return c == k ? towlower(k) : c;
104}
105
106static int match_bracket(const char *p, int k, int kfold)
107{
108	wchar_t wc;
109	int inv = 0;
110	p++;
111	if (*p=='^' || *p=='!') {
112		inv = 1;
113		p++;
114	}
115	if (*p==']') {
116		if (k==']') return !inv;
117		p++;
118	} else if (*p=='-') {
119		if (k=='-') return !inv;
120		p++;
121	}
122	wc = p[-1];
123	for (; *p != ']'; p++) {
124		if (p[0]=='-' && p[1]!=']') {
125			wchar_t wc2;
126			int l = mbtowc(&wc2, p+1, 4);
127			if (l < 0) return 0;
128			if (wc <= wc2)
129				if ((unsigned)k-wc <= wc2-wc ||
130				    (unsigned)kfold-wc <= wc2-wc)
131					return !inv;
132			p += l-1;
133			continue;
134		}
135		if (p[0]=='[' && (p[1]==':' || p[1]=='.' || p[1]=='=')) {
136			const char *p0 = p+2;
137			int z = p[1];
138			p+=3;
139			while (p[-1]!=z || p[0]!=']') p++;
140			if (z == ':' && p-1-p0 < 16) {
141				char buf[16];
142				memcpy(buf, p0, p-1-p0);
143				buf[p-1-p0] = 0;
144				if (iswctype(k, wctype(buf)) ||
145				    iswctype(kfold, wctype(buf)))
146					return !inv;
147			}
148			continue;
149		}
150		if (*p < 128U) {
151			wc = (unsigned char)*p;
152		} else {
153			int l = mbtowc(&wc, p, 4);
154			if (l < 0) return 0;
155			p += l-1;
156		}
157		if (wc==k || wc==kfold) return !inv;
158	}
159	return inv;
160}
161
162static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n, int flags)
163{
164	const char *p, *ptail, *endpat;
165	const char *s, *stail, *endstr;
166	size_t pinc, sinc, tailcnt=0;
167	int c, k, kfold;
168
169	if (flags & FNM_PERIOD) {
170		if (*str == '.' && *pat != '.')
171			return FNM_NOMATCH;
172	}
173	for (;;) {
174		switch ((c = pat_next(pat, m, &pinc, flags))) {
175		case UNMATCHABLE:
176			return FNM_NOMATCH;
177		case STAR:
178			pat++;
179			m--;
180			break;
181		default:
182			k = str_next(str, n, &sinc);
183			if (k <= 0)
184				return (c==END) ? 0 : FNM_NOMATCH;
185			str += sinc;
186			n -= sinc;
187			kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
188			if (c == BRACKET) {
189				if (!match_bracket(pat, k, kfold))
190					return FNM_NOMATCH;
191			} else if (c != QUESTION && k != c && kfold != c) {
192				return FNM_NOMATCH;
193			}
194			pat+=pinc;
195			m-=pinc;
196			continue;
197		}
198		break;
199	}
200
201	/* Compute real pat length if it was initially unknown/-1 */
202	m = strnlen(pat, m);
203	endpat = pat + m;
204
205	/* Find the last * in pat and count chars needed after it */
206	for (p=ptail=pat; p<endpat; p+=pinc) {
207		switch (pat_next(p, endpat-p, &pinc, flags)) {
208		case UNMATCHABLE:
209			return FNM_NOMATCH;
210		case STAR:
211			tailcnt=0;
212			ptail = p+1;
213			break;
214		default:
215			tailcnt++;
216			break;
217		}
218	}
219
220	/* Past this point we need not check for UNMATCHABLE in pat,
221	 * because all of pat has already been parsed once. */
222
223	/* Compute real str length if it was initially unknown/-1 */
224	n = strnlen(str, n);
225	endstr = str + n;
226	if (n < tailcnt) return FNM_NOMATCH;
227
228	/* Find the final tailcnt chars of str, accounting for UTF-8.
229	 * On illegal sequences we may get it wrong, but in that case
230	 * we necessarily have a matching failure anyway. */
231	for (s=endstr; s>str && tailcnt; tailcnt--) {
232		if (s[-1] < 128U) s--;
233		else while ((unsigned char)*--s-0x80U<0x40 && s>str);
234	}
235	if (tailcnt) return FNM_NOMATCH;
236	stail = s;
237
238	/* Check that the pat and str tails match */
239	p = ptail;
240	for (;;) {
241		c = pat_next(p, endpat-p, &pinc, flags);
242		p += pinc;
243		if ((k = str_next(s, endstr-s, &sinc)) <= 0) {
244			if (c != END) return FNM_NOMATCH;
245			break;
246		}
247		s += sinc;
248		kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
249		if (c == BRACKET) {
250			if (!match_bracket(p-pinc, k, kfold))
251				return FNM_NOMATCH;
252		} else if (c != QUESTION && k != c && kfold != c) {
253			return FNM_NOMATCH;
254		}
255	}
256
257	/* We're all done with the tails now, so throw them out */
258	endstr = stail;
259	endpat = ptail;
260
261	/* Match pattern components until there are none left */
262	while (pat<endpat) {
263		p = pat;
264		s = str;
265		for (;;) {
266			c = pat_next(p, endpat-p, &pinc, flags);
267			p += pinc;
268			/* Encountering * completes/commits a component */
269			if (c == STAR) {
270				pat = p;
271				str = s;
272				break;
273			}
274			k = str_next(s, endstr-s, &sinc);
275			if (!k)
276				return FNM_NOMATCH;
277			kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
278			if (c == BRACKET) {
279				if (!match_bracket(p-pinc, k, kfold))
280					break;
281			} else if (c != QUESTION && k != c && kfold != c) {
282				break;
283			}
284			s += sinc;
285		}
286		if (c == STAR) continue;
287		/* If we failed, advance str, by 1 char if it's a valid
288		 * char, or past all invalid bytes otherwise. */
289		k = str_next(str, endstr-str, &sinc);
290		if (k > 0) str += sinc;
291		else for (str++; str_next(str, endstr-str, &sinc)<0; str++);
292	}
293
294	return 0;
295}
296
297int fnmatch(const char *pat, const char *str, int flags)
298{
299	const char *s, *p;
300	size_t inc;
301	int c;
302	if (flags & FNM_PATHNAME) for (;;) {
303		for (s=str; *s && *s!='/'; s++);
304		for (p=pat; (c=pat_next(p, -1, &inc, flags))!=END && c!='/'; p+=inc);
305		if (c!=*s && (!*s || !(flags & FNM_LEADING_DIR)))
306			return FNM_NOMATCH;
307		if (fnmatch_internal(pat, p-pat, str, s-str, flags))
308			return FNM_NOMATCH;
309		if (!c) return 0;
310		str = s+1;
311		pat = p+inc;
312	} else if (flags & FNM_LEADING_DIR) {
313		for (s=str; *s; s++) {
314			if (*s != '/') continue;
315			if (!fnmatch_internal(pat, -1, str, s-str, flags))
316				return 0;
317		}
318	}
319	return fnmatch_internal(pat, -1, str, -1, flags);
320}
321