1/*-
2 * Copyright (c) 2003-2007 Tim Kientzle
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "archive_platform.h"
28
29#ifdef HAVE_STRING_H
30#include <string.h>
31#endif
32#ifdef HAVE_WCHAR_H
33#include <wchar.h>
34#endif
35
36#include "archive_pathmatch.h"
37
38/*
39 * Check whether a character 'c' is matched by a list specification [...]:
40 *    * Leading '!' or '^' negates the class.
41 *    * <char>-<char> is a range of characters
42 *    * \<char> removes any special meaning for <char>
43 *
44 * Some interesting boundary cases:
45 *   a-d-e is one range (a-d) followed by two single characters - and e.
46 *   \a-\d is same as a-d
47 *   a\-d is three single characters: a, d, -
48 *   Trailing - is not special (so [a-] is two characters a and -).
49 *   Initial - is not special ([a-] is same as [-a] is same as [\\-a])
50 *   This function never sees a trailing \.
51 *   [] always fails
52 *   [!] always succeeds
53 */
54static int
55pm_list(const char *start, const char *end, const char c, int flags)
56{
57	const char *p = start;
58	char rangeStart = '\0', nextRangeStart;
59	int match = 1, nomatch = 0;
60
61	/* This will be used soon... */
62	(void)flags; /* UNUSED */
63
64	/* If this is a negated class, return success for nomatch. */
65	if ((*p == '!' || *p == '^') && p < end) {
66		match = 0;
67		nomatch = 1;
68		++p;
69	}
70
71	while (p < end) {
72		nextRangeStart = '\0';
73		switch (*p) {
74		case '-':
75			/* Trailing or initial '-' is not special. */
76			if ((rangeStart == '\0') || (p == end - 1)) {
77				if (*p == c)
78					return (match);
79			} else {
80				char rangeEnd = *++p;
81				if (rangeEnd == '\\')
82					rangeEnd = *++p;
83				if ((rangeStart <= c) && (c <= rangeEnd))
84					return (match);
85			}
86			break;
87		case '\\':
88			++p;
89			/* Fall through */
90		default:
91			if (*p == c)
92				return (match);
93			nextRangeStart = *p; /* Possible start of range. */
94		}
95		rangeStart = nextRangeStart;
96		++p;
97	}
98	return (nomatch);
99}
100
101static int
102pm_list_w(const wchar_t *start, const wchar_t *end, const wchar_t c, int flags)
103{
104	const wchar_t *p = start;
105	wchar_t rangeStart = L'\0', nextRangeStart;
106	int match = 1, nomatch = 0;
107
108	/* This will be used soon... */
109	(void)flags; /* UNUSED */
110
111	/* If this is a negated class, return success for nomatch. */
112	if ((*p == L'!' || *p == L'^') && p < end) {
113		match = 0;
114		nomatch = 1;
115		++p;
116	}
117
118	while (p < end) {
119		nextRangeStart = L'\0';
120		switch (*p) {
121		case L'-':
122			/* Trailing or initial '-' is not special. */
123			if ((rangeStart == L'\0') || (p == end - 1)) {
124				if (*p == c)
125					return (match);
126			} else {
127				wchar_t rangeEnd = *++p;
128				if (rangeEnd == L'\\')
129					rangeEnd = *++p;
130				if ((rangeStart <= c) && (c <= rangeEnd))
131					return (match);
132			}
133			break;
134		case L'\\':
135			++p;
136			/* Fall through */
137		default:
138			if (*p == c)
139				return (match);
140			nextRangeStart = *p; /* Possible start of range. */
141		}
142		rangeStart = nextRangeStart;
143		++p;
144	}
145	return (nomatch);
146}
147
148/*
149 * If s is pointing to "./", ".//", "./././" or the like, skip it.
150 */
151static const char *
152pm_slashskip(const char *s) {
153	while ((*s == '/')
154	    || (s[0] == '.' && s[1] == '/')
155	    || (s[0] == '.' && s[1] == '\0'))
156		++s;
157	return (s);
158}
159
160static const wchar_t *
161pm_slashskip_w(const wchar_t *s) {
162	while ((*s == L'/')
163	    || (s[0] == L'.' && s[1] == L'/')
164	    || (s[0] == L'.' && s[1] == L'\0'))
165		++s;
166	return (s);
167}
168
169static int
170pm(const char *p, const char *s, int flags)
171{
172	const char *end;
173
174	/*
175	 * Ignore leading './', './/', '././', etc.
176	 */
177	if (s[0] == '.' && s[1] == '/')
178		s = pm_slashskip(s + 1);
179	if (p[0] == '.' && p[1] == '/')
180		p = pm_slashskip(p + 1);
181
182	for (;;) {
183		switch (*p) {
184		case '\0':
185			if (s[0] == '/') {
186				if (flags & PATHMATCH_NO_ANCHOR_END)
187					return (1);
188				/* "dir" == "dir/" == "dir/." */
189				s = pm_slashskip(s);
190			}
191			return (*s == '\0');
192		case '?':
193			/* ? always succeeds, unless we hit end of 's' */
194			if (*s == '\0')
195				return (0);
196			break;
197		case '*':
198			/* "*" == "**" == "***" ... */
199			while (*p == '*')
200				++p;
201			/* Trailing '*' always succeeds. */
202			if (*p == '\0')
203				return (1);
204			while (*s) {
205				if (archive_pathmatch(p, s, flags))
206					return (1);
207				++s;
208			}
209			return (0);
210		case '[':
211			/*
212			 * Find the end of the [...] character class,
213			 * ignoring \] that might occur within the class.
214			 */
215			end = p + 1;
216			while (*end != '\0' && *end != ']') {
217				if (*end == '\\' && end[1] != '\0')
218					++end;
219				++end;
220			}
221			if (*end == ']') {
222				/* We found [...], try to match it. */
223				if (!pm_list(p + 1, end, *s, flags))
224					return (0);
225				p = end; /* Jump to trailing ']' char. */
226				break;
227			} else
228				/* No final ']', so just match '['. */
229				if (*p != *s)
230					return (0);
231			break;
232		case '\\':
233			/* Trailing '\\' matches itself. */
234			if (p[1] == '\0') {
235				if (*s != '\\')
236					return (0);
237			} else {
238				++p;
239				if (*p != *s)
240					return (0);
241			}
242			break;
243		case '/':
244			if (*s != '/' && *s != '\0')
245				return (0);
246			/* Note: pattern "/\./" won't match "/";
247			 * pm_slashskip() correctly stops at backslash. */
248			p = pm_slashskip(p);
249			s = pm_slashskip(s);
250			if (*p == '\0' && (flags & PATHMATCH_NO_ANCHOR_END))
251				return (1);
252			--p; /* Counteract the increment below. */
253			--s;
254			break;
255		case '$':
256			/* '$' is special only at end of pattern and only
257			 * if PATHMATCH_NO_ANCHOR_END is specified. */
258			if (p[1] == '\0' && (flags & PATHMATCH_NO_ANCHOR_END)){
259				/* "dir" == "dir/" == "dir/." */
260				return (*pm_slashskip(s) == '\0');
261			}
262			/* Otherwise, '$' is not special. */
263			/* FALL THROUGH */
264		default:
265			if (*p != *s)
266				return (0);
267			break;
268		}
269		++p;
270		++s;
271	}
272}
273
274static int
275pm_w(const wchar_t *p, const wchar_t *s, int flags)
276{
277	const wchar_t *end;
278
279	/*
280	 * Ignore leading './', './/', '././', etc.
281	 */
282	if (s[0] == L'.' && s[1] == L'/')
283		s = pm_slashskip_w(s + 1);
284	if (p[0] == L'.' && p[1] == L'/')
285		p = pm_slashskip_w(p + 1);
286
287	for (;;) {
288		switch (*p) {
289		case L'\0':
290			if (s[0] == L'/') {
291				if (flags & PATHMATCH_NO_ANCHOR_END)
292					return (1);
293				/* "dir" == "dir/" == "dir/." */
294				s = pm_slashskip_w(s);
295			}
296			return (*s == L'\0');
297		case L'?':
298			/* ? always succeeds, unless we hit end of 's' */
299			if (*s == L'\0')
300				return (0);
301			break;
302		case L'*':
303			/* "*" == "**" == "***" ... */
304			while (*p == L'*')
305				++p;
306			/* Trailing '*' always succeeds. */
307			if (*p == L'\0')
308				return (1);
309			while (*s) {
310				if (archive_pathmatch_w(p, s, flags))
311					return (1);
312				++s;
313			}
314			return (0);
315		case L'[':
316			/*
317			 * Find the end of the [...] character class,
318			 * ignoring \] that might occur within the class.
319			 */
320			end = p + 1;
321			while (*end != L'\0' && *end != L']') {
322				if (*end == L'\\' && end[1] != L'\0')
323					++end;
324				++end;
325			}
326			if (*end == L']') {
327				/* We found [...], try to match it. */
328				if (!pm_list_w(p + 1, end, *s, flags))
329					return (0);
330				p = end; /* Jump to trailing ']' char. */
331				break;
332			} else
333				/* No final ']', so just match '['. */
334				if (*p != *s)
335					return (0);
336			break;
337		case L'\\':
338			/* Trailing '\\' matches itself. */
339			if (p[1] == L'\0') {
340				if (*s != L'\\')
341					return (0);
342			} else {
343				++p;
344				if (*p != *s)
345					return (0);
346			}
347			break;
348		case L'/':
349			if (*s != L'/' && *s != L'\0')
350				return (0);
351			/* Note: pattern "/\./" won't match "/";
352			 * pm_slashskip() correctly stops at backslash. */
353			p = pm_slashskip_w(p);
354			s = pm_slashskip_w(s);
355			if (*p == L'\0' && (flags & PATHMATCH_NO_ANCHOR_END))
356				return (1);
357			--p; /* Counteract the increment below. */
358			--s;
359			break;
360		case L'$':
361			/* '$' is special only at end of pattern and only
362			 * if PATHMATCH_NO_ANCHOR_END is specified. */
363			if (p[1] == L'\0' && (flags & PATHMATCH_NO_ANCHOR_END)){
364				/* "dir" == "dir/" == "dir/." */
365				return (*pm_slashskip_w(s) == L'\0');
366			}
367			/* Otherwise, '$' is not special. */
368			/* FALL THROUGH */
369		default:
370			if (*p != *s)
371				return (0);
372			break;
373		}
374		++p;
375		++s;
376	}
377}
378
379/* Main entry point. */
380int
381__archive_pathmatch(const char *p, const char *s, int flags)
382{
383	/* Empty pattern only matches the empty string. */
384	if (p == NULL || *p == '\0')
385		return (s == NULL || *s == '\0');
386	else if (s == NULL)
387		return (0);
388
389	/* Leading '^' anchors the start of the pattern. */
390	if (*p == '^') {
391		++p;
392		flags &= ~PATHMATCH_NO_ANCHOR_START;
393	}
394
395	if (*p == '/' && *s != '/')
396		return (0);
397
398	/* Certain patterns anchor implicitly. */
399	if (*p == '*' || *p == '/') {
400		while (*p == '/')
401			++p;
402		while (*s == '/')
403			++s;
404		return (pm(p, s, flags));
405	}
406
407	/* If start is unanchored, try to match start of each path element. */
408	if (flags & PATHMATCH_NO_ANCHOR_START) {
409		for ( ; s != NULL; s = strchr(s, '/')) {
410			if (*s == '/')
411				s++;
412			if (pm(p, s, flags))
413				return (1);
414		}
415		return (0);
416	}
417
418	/* Default: Match from beginning. */
419	return (pm(p, s, flags));
420}
421
422int
423__archive_pathmatch_w(const wchar_t *p, const wchar_t *s, int flags)
424{
425	/* Empty pattern only matches the empty string. */
426	if (p == NULL || *p == L'\0')
427		return (s == NULL || *s == L'\0');
428	else if (s == NULL)
429		return (0);
430
431	/* Leading '^' anchors the start of the pattern. */
432	if (*p == L'^') {
433		++p;
434		flags &= ~PATHMATCH_NO_ANCHOR_START;
435	}
436
437	if (*p == L'/' && *s != L'/')
438		return (0);
439
440	/* Certain patterns anchor implicitly. */
441	if (*p == L'*' || *p == L'/') {
442		while (*p == L'/')
443			++p;
444		while (*s == L'/')
445			++s;
446		return (pm_w(p, s, flags));
447	}
448
449	/* If start is unanchored, try to match start of each path element. */
450	if (flags & PATHMATCH_NO_ANCHOR_START) {
451		for ( ; s != NULL; s = wcschr(s, L'/')) {
452			if (*s == L'/')
453				s++;
454			if (pm_w(p, s, flags))
455				return (1);
456		}
457		return (0);
458	}
459
460	/* Default: Match from beginning. */
461	return (pm_w(p, s, flags));
462}
463