archive_pathmatch.c revision 370535
1/*-
2 * Copyright (c) 2003-2007 Tim Kientzle
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer
10 *    in this position and unchanged.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "archive_platform.h"
28__FBSDID("$FreeBSD$");
29
30#ifdef HAVE_STRING_H
31#include <string.h>
32#endif
33#ifdef HAVE_WCHAR_H
34#include <wchar.h>
35#endif
36
37#include "archive_pathmatch.h"
38
39/*
40 * Check whether a character 'c' is matched by a list specification [...]:
41 *    * Leading '!' or '^' negates the class.
42 *    * <char>-<char> is a range of characters
43 *    * \<char> removes any special meaning for <char>
44 *
45 * Some interesting boundary cases:
46 *   a-d-e is one range (a-d) followed by two single characters - and e.
47 *   \a-\d is same as a-d
48 *   a\-d is three single characters: a, d, -
49 *   Trailing - is not special (so [a-] is two characters a and -).
50 *   Initial - is not special ([a-] is same as [-a] is same as [\\-a])
51 *   This function never sees a trailing \.
52 *   [] always fails
53 *   [!] always succeeds
54 */
55static int
56pm_list(const char *start, const char *end, const char c, int flags)
57{
58	const char *p = start;
59	char rangeStart = '\0', nextRangeStart;
60	int match = 1, nomatch = 0;
61
62	/* This will be used soon... */
63	(void)flags; /* UNUSED */
64
65	/* If this is a negated class, return success for nomatch. */
66	if ((*p == '!' || *p == '^') && p < end) {
67		match = 0;
68		nomatch = 1;
69		++p;
70	}
71
72	while (p < end) {
73		nextRangeStart = '\0';
74		switch (*p) {
75		case '-':
76			/* Trailing or initial '-' is not special. */
77			if ((rangeStart == '\0') || (p == end - 1)) {
78				if (*p == c)
79					return (match);
80			} else {
81				char rangeEnd = *++p;
82				if (rangeEnd == '\\')
83					rangeEnd = *++p;
84				if ((rangeStart <= c) && (c <= rangeEnd))
85					return (match);
86			}
87			break;
88		case '\\':
89			++p;
90			/* Fall through */
91		default:
92			if (*p == c)
93				return (match);
94			nextRangeStart = *p; /* Possible start of range. */
95		}
96		rangeStart = nextRangeStart;
97		++p;
98	}
99	return (nomatch);
100}
101
102static int
103pm_list_w(const wchar_t *start, const wchar_t *end, const wchar_t c, int flags)
104{
105	const wchar_t *p = start;
106	wchar_t rangeStart = L'\0', nextRangeStart;
107	int match = 1, nomatch = 0;
108
109	/* This will be used soon... */
110	(void)flags; /* UNUSED */
111
112	/* If this is a negated class, return success for nomatch. */
113	if ((*p == L'!' || *p == L'^') && p < end) {
114		match = 0;
115		nomatch = 1;
116		++p;
117	}
118
119	while (p < end) {
120		nextRangeStart = L'\0';
121		switch (*p) {
122		case L'-':
123			/* Trailing or initial '-' is not special. */
124			if ((rangeStart == L'\0') || (p == end - 1)) {
125				if (*p == c)
126					return (match);
127			} else {
128				wchar_t rangeEnd = *++p;
129				if (rangeEnd == L'\\')
130					rangeEnd = *++p;
131				if ((rangeStart <= c) && (c <= rangeEnd))
132					return (match);
133			}
134			break;
135		case L'\\':
136			++p;
137			/* Fall through */
138		default:
139			if (*p == c)
140				return (match);
141			nextRangeStart = *p; /* Possible start of range. */
142		}
143		rangeStart = nextRangeStart;
144		++p;
145	}
146	return (nomatch);
147}
148
149/*
150 * If s is pointing to "./", ".//", "./././" or the like, skip it.
151 */
152static const char *
153pm_slashskip(const char *s) {
154	while ((*s == '/')
155	    || (s[0] == '.' && s[1] == '/')
156	    || (s[0] == '.' && s[1] == '\0'))
157		++s;
158	return (s);
159}
160
161static const wchar_t *
162pm_slashskip_w(const wchar_t *s) {
163	while ((*s == L'/')
164	    || (s[0] == L'.' && s[1] == L'/')
165	    || (s[0] == L'.' && s[1] == L'\0'))
166		++s;
167	return (s);
168}
169
170static int
171pm(const char *p, const char *s, int flags)
172{
173	const char *end;
174
175	/*
176	 * Ignore leading './', './/', '././', etc.
177	 */
178	if (s[0] == '.' && s[1] == '/')
179		s = pm_slashskip(s + 1);
180	if (p[0] == '.' && p[1] == '/')
181		p = pm_slashskip(p + 1);
182
183	for (;;) {
184		switch (*p) {
185		case '\0':
186			if (s[0] == '/') {
187				if (flags & PATHMATCH_NO_ANCHOR_END)
188					return (1);
189				/* "dir" == "dir/" == "dir/." */
190				s = pm_slashskip(s);
191			}
192			return (*s == '\0');
193		case '?':
194			/* ? always succeeds, unless we hit end of 's' */
195			if (*s == '\0')
196				return (0);
197			break;
198		case '*':
199			/* "*" == "**" == "***" ... */
200			while (*p == '*')
201				++p;
202			/* Trailing '*' always succeeds. */
203			if (*p == '\0')
204				return (1);
205			while (*s) {
206				if (archive_pathmatch(p, s, flags))
207					return (1);
208				++s;
209			}
210			return (0);
211		case '[':
212			/*
213			 * Find the end of the [...] character class,
214			 * ignoring \] that might occur within the class.
215			 */
216			end = p + 1;
217			while (*end != '\0' && *end != ']') {
218				if (*end == '\\' && end[1] != '\0')
219					++end;
220				++end;
221			}
222			if (*end == ']') {
223				/* We found [...], try to match it. */
224				if (!pm_list(p + 1, end, *s, flags))
225					return (0);
226				p = end; /* Jump to trailing ']' char. */
227				break;
228			} else
229				/* No final ']', so just match '['. */
230				if (*p != *s)
231					return (0);
232			break;
233		case '\\':
234			/* Trailing '\\' matches itself. */
235			if (p[1] == '\0') {
236				if (*s != '\\')
237					return (0);
238			} else {
239				++p;
240				if (*p != *s)
241					return (0);
242			}
243			break;
244		case '/':
245			if (*s != '/' && *s != '\0')
246				return (0);
247			/* Note: pattern "/\./" won't match "/";
248			 * pm_slashskip() correctly stops at backslash. */
249			p = pm_slashskip(p);
250			s = pm_slashskip(s);
251			if (*p == '\0' && (flags & PATHMATCH_NO_ANCHOR_END))
252				return (1);
253			--p; /* Counteract the increment below. */
254			--s;
255			break;
256		case '$':
257			/* '$' is special only at end of pattern and only
258			 * if PATHMATCH_NO_ANCHOR_END is specified. */
259			if (p[1] == '\0' && (flags & PATHMATCH_NO_ANCHOR_END)){
260				/* "dir" == "dir/" == "dir/." */
261				return (*pm_slashskip(s) == '\0');
262			}
263			/* Otherwise, '$' is not special. */
264			/* FALL THROUGH */
265		default:
266			if (*p != *s)
267				return (0);
268			break;
269		}
270		++p;
271		++s;
272	}
273}
274
275static int
276pm_w(const wchar_t *p, const wchar_t *s, int flags)
277{
278	const wchar_t *end;
279
280	/*
281	 * Ignore leading './', './/', '././', etc.
282	 */
283	if (s[0] == L'.' && s[1] == L'/')
284		s = pm_slashskip_w(s + 1);
285	if (p[0] == L'.' && p[1] == L'/')
286		p = pm_slashskip_w(p + 1);
287
288	for (;;) {
289		switch (*p) {
290		case L'\0':
291			if (s[0] == L'/') {
292				if (flags & PATHMATCH_NO_ANCHOR_END)
293					return (1);
294				/* "dir" == "dir/" == "dir/." */
295				s = pm_slashskip_w(s);
296			}
297			return (*s == L'\0');
298		case L'?':
299			/* ? always succeeds, unless we hit end of 's' */
300			if (*s == L'\0')
301				return (0);
302			break;
303		case L'*':
304			/* "*" == "**" == "***" ... */
305			while (*p == L'*')
306				++p;
307			/* Trailing '*' always succeeds. */
308			if (*p == L'\0')
309				return (1);
310			while (*s) {
311				if (archive_pathmatch_w(p, s, flags))
312					return (1);
313				++s;
314			}
315			return (0);
316		case L'[':
317			/*
318			 * Find the end of the [...] character class,
319			 * ignoring \] that might occur within the class.
320			 */
321			end = p + 1;
322			while (*end != L'\0' && *end != L']') {
323				if (*end == L'\\' && end[1] != L'\0')
324					++end;
325				++end;
326			}
327			if (*end == L']') {
328				/* We found [...], try to match it. */
329				if (!pm_list_w(p + 1, end, *s, flags))
330					return (0);
331				p = end; /* Jump to trailing ']' char. */
332				break;
333			} else
334				/* No final ']', so just match '['. */
335				if (*p != *s)
336					return (0);
337			break;
338		case L'\\':
339			/* Trailing '\\' matches itself. */
340			if (p[1] == L'\0') {
341				if (*s != L'\\')
342					return (0);
343			} else {
344				++p;
345				if (*p != *s)
346					return (0);
347			}
348			break;
349		case L'/':
350			if (*s != L'/' && *s != L'\0')
351				return (0);
352			/* Note: pattern "/\./" won't match "/";
353			 * pm_slashskip() correctly stops at backslash. */
354			p = pm_slashskip_w(p);
355			s = pm_slashskip_w(s);
356			if (*p == L'\0' && (flags & PATHMATCH_NO_ANCHOR_END))
357				return (1);
358			--p; /* Counteract the increment below. */
359			--s;
360			break;
361		case L'$':
362			/* '$' is special only at end of pattern and only
363			 * if PATHMATCH_NO_ANCHOR_END is specified. */
364			if (p[1] == L'\0' && (flags & PATHMATCH_NO_ANCHOR_END)){
365				/* "dir" == "dir/" == "dir/." */
366				return (*pm_slashskip_w(s) == L'\0');
367			}
368			/* Otherwise, '$' is not special. */
369			/* FALL THROUGH */
370		default:
371			if (*p != *s)
372				return (0);
373			break;
374		}
375		++p;
376		++s;
377	}
378}
379
380/* Main entry point. */
381int
382__archive_pathmatch(const char *p, const char *s, int flags)
383{
384	/* Empty pattern only matches the empty string. */
385	if (p == NULL || *p == '\0')
386		return (s == NULL || *s == '\0');
387	else if (s == NULL)
388		return (0);
389
390	/* Leading '^' anchors the start of the pattern. */
391	if (*p == '^') {
392		++p;
393		flags &= ~PATHMATCH_NO_ANCHOR_START;
394	}
395
396	if (*p == '/' && *s != '/')
397		return (0);
398
399	/* Certain patterns anchor implicitly. */
400	if (*p == '*' || *p == '/') {
401		while (*p == '/')
402			++p;
403		while (*s == '/')
404			++s;
405		return (pm(p, s, flags));
406	}
407
408	/* If start is unanchored, try to match start of each path element. */
409	if (flags & PATHMATCH_NO_ANCHOR_START) {
410		for ( ; s != NULL; s = strchr(s, '/')) {
411			if (*s == '/')
412				s++;
413			if (pm(p, s, flags))
414				return (1);
415		}
416		return (0);
417	}
418
419	/* Default: Match from beginning. */
420	return (pm(p, s, flags));
421}
422
423int
424__archive_pathmatch_w(const wchar_t *p, const wchar_t *s, int flags)
425{
426	/* Empty pattern only matches the empty string. */
427	if (p == NULL || *p == L'\0')
428		return (s == NULL || *s == L'\0');
429	else if (s == NULL)
430		return (0);
431
432	/* Leading '^' anchors the start of the pattern. */
433	if (*p == L'^') {
434		++p;
435		flags &= ~PATHMATCH_NO_ANCHOR_START;
436	}
437
438	if (*p == L'/' && *s != L'/')
439		return (0);
440
441	/* Certain patterns anchor implicitly. */
442	if (*p == L'*' || *p == L'/') {
443		while (*p == L'/')
444			++p;
445		while (*s == L'/')
446			++s;
447		return (pm_w(p, s, flags));
448	}
449
450	/* If start is unanchored, try to match start of each path element. */
451	if (flags & PATHMATCH_NO_ANCHOR_START) {
452		for ( ; s != NULL; s = wcschr(s, L'/')) {
453			if (*s == L'/')
454				s++;
455			if (pm_w(p, s, flags))
456				return (1);
457		}
458		return (0);
459	}
460
461	/* Default: Match from beginning. */
462	return (pm_w(p, s, flags));
463}
464