1/*
2 * An implementation of what I call the "Sea of Stars" algorithm for
3 * POSIX fnmatch(). The basic idea is that we factor the pattern into
4 * a head component (which we match first and can reject without ever
5 * measuring the length of the string), an optional tail component
6 * (which only exists if the pattern contains at least one star), and
7 * an optional "sea of stars", a set of star-separated components
8 * between the head and tail. After the head and tail matches have
9 * been removed from the input string, the components in the "sea of
10 * stars" are matched sequentially by searching for their first
11 * occurrence past the end of the previous match.
12 *
13 * - Rich Felker, April 2012
14 */
15
16#include <string.h>
17#include <fnmatch.h>
18#include <stdlib.h>
19#include <wchar.h>
20#include <wctype.h>
21#include "locale_impl.h"
22
23#define END 0
24#define UNMATCHABLE -2
25#define BRACKET -3
26#define QUESTION -4
27#define STAR -5
28
29static int str_next(const char *str, size_t n, size_t *step)
30{
31	if (!n) {
32		*step = 0;
33		return 0;
34	}
35	if (str[0] >= 128U) {
36		wchar_t wc;
37		int k = mbtowc(&wc, str, n);
38		if (k<0) {
39			*step = 1;
40			return -1;
41		}
42		*step = k;
43		return wc;
44	}
45	*step = 1;
46	return str[0];
47}
48
49static int pat_next(const char *pat, size_t m, size_t *step, int flags)
50{
51	int esc = 0;
52	if (!m || !*pat) {
53		*step = 0;
54		return END;
55	}
56	*step = 1;
57	if (pat[0]=='\\' && pat[1] && !(flags & FNM_NOESCAPE)) {
58		*step = 2;
59		pat++;
60		esc = 1;
61		goto escaped;
62	}
63	if (pat[0]=='[') {
64		size_t k = 1;
65		if (k<m) if (pat[k] == '^' || pat[k] == '!') k++;
66		if (k<m) if (pat[k] == ']') k++;
67		for (; k<m && pat[k] && pat[k]!=']'; k++) {
68			if (k+1<m && pat[k+1] && pat[k]=='[' && (pat[k+1]==':' || pat[k+1]=='.' || pat[k+1]=='=')) {
69				int z = pat[k+1];
70				k+=2;
71				if (k<m && pat[k]) k++;
72				while (k<m && pat[k] && (pat[k-1]!=z || pat[k]!=']')) k++;
73				if (k==m || !pat[k]) break;
74			}
75		}
76		if (k==m || !pat[k]) {
77			*step = 1;
78			return '[';
79		}
80		*step = k+1;
81		return BRACKET;
82	}
83	if (pat[0] == '*')
84		return STAR;
85	if (pat[0] == '?')
86		return QUESTION;
87escaped:
88	if (pat[0] >= 128U) {
89		wchar_t wc;
90		int k = mbtowc(&wc, pat, m);
91		if (k<0) {
92			*step = 0;
93			return UNMATCHABLE;
94		}
95		*step = k + esc;
96		return wc;
97	}
98	return pat[0];
99}
100
101static int casefold(int k)
102{
103	int c = towupper(k);
104	return c == k ? towlower(k) : c;
105}
106
107static int match_bracket(const char *p, int k, int kfold)
108{
109	wchar_t wc;
110	int inv = 0;
111	p++;
112	if (*p=='^' || *p=='!') {
113		inv = 1;
114		p++;
115	}
116	if (*p==']') {
117		if (k==']') return !inv;
118		p++;
119	} else if (*p=='-') {
120		if (k=='-') return !inv;
121		p++;
122	}
123	wc = p[-1];
124	for (; *p != ']'; p++) {
125		if (p[0]=='-' && p[1]!=']') {
126			wchar_t wc2;
127			int l = mbtowc(&wc2, p+1, 4);
128			if (l < 0) return 0;
129			if (wc <= wc2)
130				if ((unsigned)k-wc <= wc2-wc ||
131				    (unsigned)kfold-wc <= wc2-wc)
132					return !inv;
133			p += l-1;
134			continue;
135		}
136		if (p[0]=='[' && (p[1]==':' || p[1]=='.' || p[1]=='=')) {
137			const char *p0 = p+2;
138			int z = p[1];
139			p+=3;
140			while (p[-1]!=z || p[0]!=']') p++;
141			if (z == ':' && p-1-p0 < 16) {
142				char buf[16];
143				memcpy(buf, p0, p-1-p0);
144				buf[p-1-p0] = 0;
145				if (iswctype(k, wctype(buf)) ||
146				    iswctype(kfold, wctype(buf)))
147					return !inv;
148			}
149			continue;
150		}
151		if (*p < 128U) {
152			wc = (unsigned char)*p;
153		} else {
154			int l = mbtowc(&wc, p, 4);
155			if (l < 0) return 0;
156			p += l-1;
157		}
158		if (wc==k || wc==kfold) return !inv;
159	}
160	return inv;
161}
162
163static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n, int flags)
164{
165	const char *p, *ptail, *endpat;
166	const char *s, *stail, *endstr;
167	size_t pinc, sinc, tailcnt=0;
168	int c, k, kfold;
169
170	if (flags & FNM_PERIOD) {
171		if (*str == '.' && *pat != '.')
172			return FNM_NOMATCH;
173	}
174	for (;;) {
175		switch ((c = pat_next(pat, m, &pinc, flags))) {
176		case UNMATCHABLE:
177			return FNM_NOMATCH;
178		case STAR:
179			pat++;
180			m--;
181			break;
182		default:
183			k = str_next(str, n, &sinc);
184			if (k <= 0)
185				return (c==END) ? 0 : FNM_NOMATCH;
186			str += sinc;
187			n -= sinc;
188			kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
189			if (c == BRACKET) {
190				if (!match_bracket(pat, k, kfold))
191					return FNM_NOMATCH;
192			} else if (c != QUESTION && k != c && kfold != c) {
193				return FNM_NOMATCH;
194			}
195			pat+=pinc;
196			m-=pinc;
197			continue;
198		}
199		break;
200	}
201
202	/* Compute real pat length if it was initially unknown/-1 */
203	m = strnlen(pat, m);
204	endpat = pat + m;
205
206	/* Find the last * in pat and count chars needed after it */
207	for (p=ptail=pat; p<endpat; p+=pinc) {
208		switch (pat_next(p, endpat-p, &pinc, flags)) {
209		case UNMATCHABLE:
210			return FNM_NOMATCH;
211		case STAR:
212			tailcnt=0;
213			ptail = p+1;
214			break;
215		default:
216			tailcnt++;
217			break;
218		}
219	}
220
221	/* Past this point we need not check for UNMATCHABLE in pat,
222	 * because all of pat has already been parsed once. */
223
224	/* Compute real str length if it was initially unknown/-1 */
225	n = strnlen(str, n);
226	endstr = str + n;
227	if (n < tailcnt) return FNM_NOMATCH;
228
229	/* Find the final tailcnt chars of str, accounting for UTF-8.
230	 * On illegal sequences we may get it wrong, but in that case
231	 * we necessarily have a matching failure anyway. */
232	for (s=endstr; s>str && tailcnt; tailcnt--) {
233		if (s[-1] < 128U || MB_CUR_MAX==1) s--;
234		else while ((unsigned char)*--s-0x80U<0x40 && s>str);
235	}
236	if (tailcnt) return FNM_NOMATCH;
237	stail = s;
238
239	/* Check that the pat and str tails match */
240	p = ptail;
241	for (;;) {
242		c = pat_next(p, endpat-p, &pinc, flags);
243		p += pinc;
244		if ((k = str_next(s, endstr-s, &sinc)) <= 0) {
245			if (c != END) return FNM_NOMATCH;
246			break;
247		}
248		s += sinc;
249		kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
250		if (c == BRACKET) {
251			if (!match_bracket(p-pinc, k, kfold))
252				return FNM_NOMATCH;
253		} else if (c != QUESTION && k != c && kfold != c) {
254			return FNM_NOMATCH;
255		}
256	}
257
258	/* We're all done with the tails now, so throw them out */
259	endstr = stail;
260	endpat = ptail;
261
262	/* Match pattern components until there are none left */
263	while (pat<endpat) {
264		p = pat;
265		s = str;
266		for (;;) {
267			c = pat_next(p, endpat-p, &pinc, flags);
268			p += pinc;
269			/* Encountering * completes/commits a component */
270			if (c == STAR) {
271				pat = p;
272				str = s;
273				break;
274			}
275			k = str_next(s, endstr-s, &sinc);
276			if (!k)
277				return FNM_NOMATCH;
278			kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
279			if (c == BRACKET) {
280				if (!match_bracket(p-pinc, k, kfold))
281					break;
282			} else if (c != QUESTION && k != c && kfold != c) {
283				break;
284			}
285			s += sinc;
286		}
287		if (c == STAR) continue;
288		/* If we failed, advance str, by 1 char if it's a valid
289		 * char, or past all invalid bytes otherwise. */
290		k = str_next(str, endstr-str, &sinc);
291		if (k > 0) str += sinc;
292		else for (str++; str_next(str, endstr-str, &sinc)<0; str++);
293	}
294
295	return 0;
296}
297
298int fnmatch(const char *pat, const char *str, int flags)
299{
300	const char *s, *p;
301	size_t inc;
302	int c;
303	if (flags & FNM_PATHNAME) for (;;) {
304		for (s=str; *s && *s!='/'; s++);
305		for (p=pat; (c=pat_next(p, -1, &inc, flags))!=END && c!='/'; p+=inc);
306		if (c!=*s && (!*s || !(flags & FNM_LEADING_DIR)))
307			return FNM_NOMATCH;
308		if (fnmatch_internal(pat, p-pat, str, s-str, flags))
309			return FNM_NOMATCH;
310		if (!c) return 0;
311		str = s+1;
312		pat = p+inc;
313	} else if (flags & FNM_LEADING_DIR) {
314		for (s=str; *s; s++) {
315			if (*s != '/') continue;
316			if (!fnmatch_internal(pat, -1, str, s-str, flags))
317				return 0;
318		}
319	}
320	return fnmatch_internal(pat, -1, str, -1, flags);
321}
322