1/***********************************************************************
2*                                                                      *
3*               This software is part of the ast package               *
4*          Copyright (c) 1985-2011 AT&T Intellectual Property          *
5*                      and is licensed under the                       *
6*                  Common Public License, Version 1.0                  *
7*                    by AT&T Intellectual Property                     *
8*                                                                      *
9*                A copy of the License is available at                 *
10*            http://www.opensource.org/licenses/cpl1.0.txt             *
11*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12*                                                                      *
13*              Information and Software Systems Research               *
14*                            AT&T Research                             *
15*                           Florham Park NJ                            *
16*                                                                      *
17*                 Glenn Fowler <gsf@research.att.com>                  *
18*                  David Korn <dgk@research.att.com>                   *
19*                   Phong Vo <kpv@research.att.com>                    *
20*                                                                      *
21***********************************************************************/
22#pragma prototyped
23
24/*
25 * D. G. Korn
26 * G. S. Fowler
27 * AT&T Research
28 *
29 * match shell file patterns -- derived from Bourne and Korn shell gmatch()
30 *
31 *	sh pattern	egrep RE	description
32 *	----------	--------	-----------
33 *	*		.*		0 or more chars
34 *	?		.		any single char
35 *	[.]		[.]		char class
36 *	[!.]		[^.]		negated char class
37 *	[[:.:]]		[[:.:]]		ctype class
38 *	[[=.=]]		[[=.=]]		equivalence class
39 *	[[...]]		[[...]]		collation element
40 *	*(.)		(.)*		0 or more of
41 *	+(.)		(.)+		1 or more of
42 *	?(.)		(.)?		0 or 1 of
43 *	(.)		(.)		1 of
44 *	@(.)		(.)		1 of
45 *	a|b		a|b		a or b
46 *	\#				() subgroup back reference [1-9]
47 *	a&b				a and b
48 *	!(.)				none of
49 *
50 * \ used to escape metacharacters
51 *
52 *	*, ?, (, |, &, ), [, \ must be \'d outside of [...]
53 *	only ] must be \'d inside [...]
54 *
55 * BUG: unbalanced ) terminates top level pattern
56 */
57
58#include <ast.h>
59#include <ctype.h>
60#include <hashkey.h>
61
62#ifndef	isblank
63#define	isblank(x)	((x)==' '||(x)=='\t')
64#endif
65
66#ifndef isgraph
67#define	isgraph(x)	(isprint(x)&&!isblank(x))
68#endif
69
70#define MAXGROUP	10
71
72typedef struct
73{
74	char*		beg[MAXGROUP];
75	char*		end[MAXGROUP];
76	char*		next_s;
77	short		groups;
78} Group_t;
79
80typedef struct
81{
82	Group_t		current;
83	Group_t		best;
84	char*		last_s;
85	char*		next_p;
86} Match_t;
87
88#define mbgetchar(p)	(*p++)
89
90#ifndef isxdigit
91#define isxdigit(c)	((c)>='0'&&(c)<='9'||(c)>='a'&&(c)<='f'||(c)>='A'&&(c)<='F')
92#endif
93
94#define getsource(s,e)	(((s)>=(e))?0:mbgetchar(s))
95
96#define COLL_MAX	3
97
98/*
99 * gobble chars up to <sub> or ) keeping track of (...) and [...]
100 * sub must be one of { '|', '&', 0 }
101 * 0 returned if s runs out
102 */
103
104static char*
105gobble(Match_t* mp, register char* s, register int sub, int* g, int clear)
106{
107	register int	p = 0;
108	register char*	b = 0;
109	int		c = 0;
110	int		n;
111
112	for (;;)
113		switch (mbgetchar(s))
114		{
115		case '\\':
116			if (mbgetchar(s))
117				break;
118			/*FALLTHROUGH*/
119		case 0:
120			return 0;
121		case '[':
122			if (!b)
123			{
124				if (*s == '!')
125					mbgetchar(s);
126				b = s;
127			}
128			else if (*s == '.' || *s == '=' || *s == ':')
129				c = *s;
130			break;
131		case ']':
132			if (b)
133			{
134				if (*(s - 2) == c)
135					c = 0;
136				else if (b != (s - 1))
137					b = 0;
138			}
139			break;
140		case '(':
141			if (!b)
142			{
143				p++;
144				n = (*g)++;
145				if (clear)
146				{
147					if (!sub)
148						n++;
149					if (n < MAXGROUP)
150						mp->current.beg[n] = mp->current.end[n] = 0;
151				}
152			}
153			break;
154		case ')':
155			if (!b && p-- <= 0)
156				return sub ? 0 : s;
157			break;
158		case '|':
159			if (!b && !p && sub == '|')
160				return s;
161			break;
162		}
163}
164
165static int	grpmatch(Match_t*, int, char*, register char*, char*, int);
166
167/*
168 * match a single pattern
169 * e is the end (0) of the substring in s
170 * r marks the start of a repeated subgroup pattern
171 */
172
173static int
174onematch(Match_t* mp, int g, char* s, char* p, char* e, char* r, int flags)
175{
176	register int 	pc;
177	register int 	sc;
178	register int	n;
179	register int	icase;
180	char*		olds;
181	char*		oldp;
182
183	icase = flags & STR_ICASE;
184	do
185	{
186		olds = s;
187		sc = getsource(s, e);
188		if (icase && isupper(sc))
189			sc = tolower(sc);
190		oldp = p;
191		switch (pc = mbgetchar(p))
192		{
193		case '(':
194		case '*':
195		case '?':
196		case '+':
197		case '@':
198		case '!':
199			if (pc == '(' || *p == '(')
200			{
201				char*	subp;
202				int	oldg;
203
204				s = olds;
205				subp = p + (pc != '(');
206				oldg = g;
207				n = ++g;
208				if (g < MAXGROUP && (!r || g > mp->current.groups))
209					mp->current.beg[g] = mp->current.end[g] = 0;
210				if (!(p = gobble(mp, subp, 0, &g, !r)))
211					return 0;
212				if (pc == '*' || pc == '?' || pc == '+' && oldp == r)
213				{
214					if (onematch(mp, g, s, p, e, NiL, flags))
215						return 1;
216					if (!sc || !getsource(s, e))
217					{
218						mp->current.groups = oldg;
219						return 0;
220					}
221				}
222				if (pc == '*' || pc == '+')
223				{
224					p = oldp;
225					sc = n - 1;
226				}
227				else
228					sc = g;
229				pc = (pc != '!');
230				do
231				{
232					if (grpmatch(mp, n, olds, subp, s, flags) == pc)
233					{
234						if (n < MAXGROUP)
235						{
236							if (!mp->current.beg[n] || mp->current.beg[n] > olds)
237								mp->current.beg[n] = olds;
238							if (s > mp->current.end[n])
239								mp->current.end[n] = s;
240						}
241						if (onematch(mp, sc, s, p, e, oldp, flags))
242						{
243							if (p == oldp && n < MAXGROUP)
244							{
245								if (!mp->current.beg[n] || mp->current.beg[n] > olds)
246									mp->current.beg[n] = olds;
247								if (s > mp->current.end[n])
248									mp->current.end[n] = s;
249							}
250							return 1;
251						}
252					}
253				} while (s < e && mbgetchar(s));
254				mp->current.groups = oldg;
255				return 0;
256			}
257			else if (pc == '*')
258			{
259				/*
260				 * several stars are the same as one
261				 */
262
263				while (*p == '*' && *(p + 1) != '(')
264					p++;
265				oldp = p;
266				switch (pc = mbgetchar(p))
267				{
268				case '@':
269				case '!':
270				case '+':
271					n = *p == '(';
272					break;
273				case '(':
274				case '[':
275				case '?':
276				case '*':
277					n = 1;
278					break;
279				case 0:
280				case '|':
281				case '&':
282				case ')':
283					mp->current.next_s = (flags & STR_MAXIMAL) ? e : olds;
284					mp->next_p = oldp;
285					mp->current.groups = g;
286					if (!pc && (!mp->best.next_s || (flags & STR_MAXIMAL) && mp->current.next_s > mp->best.next_s || !(flags & STR_MAXIMAL) && mp->current.next_s < mp->best.next_s))
287						mp->best = mp->current;
288					return 1;
289				case '\\':
290					if (!(pc = mbgetchar(p)))
291						return 0;
292					if (pc >= '0' && pc <= '9')
293					{
294						n = pc - '0';
295						if (n <= g && mp->current.beg[n])
296							pc = *mp->current.beg[n];
297					}
298					/*FALLTHROUGH*/
299				default:
300					if (icase && isupper(pc))
301						pc = tolower(pc);
302					n = 0;
303					break;
304				}
305				p = oldp;
306				for (;;)
307				{
308					if ((n || pc == sc) && onematch(mp, g, olds, p, e, NiL, flags))
309						return 1;
310					if (!sc)
311						return 0;
312					olds = s;
313					sc = getsource(s, e);
314					if ((flags & STR_ICASE) && isupper(sc))
315						sc = tolower(sc);
316				}
317			}
318			else if (pc != '?' && pc != sc)
319				return 0;
320			break;
321		case 0:
322			if (!(flags & STR_MAXIMAL))
323				sc = 0;
324			/*FALLTHROUGH*/
325		case '|':
326		case '&':
327		case ')':
328			if (!sc)
329			{
330				mp->current.next_s = olds;
331				mp->next_p = oldp;
332				mp->current.groups = g;
333			}
334			if (!pc && (!mp->best.next_s || (flags & STR_MAXIMAL) && olds > mp->best.next_s || !(flags & STR_MAXIMAL) && olds < mp->best.next_s))
335			{
336				mp->best = mp->current;
337				mp->best.next_s = olds;
338				mp->best.groups = g;
339			}
340			return !sc;
341		case '[':
342			{
343				/*UNDENT...*/
344
345	int	invert;
346	int	x;
347	int	ok = 0;
348	char*	range;
349
350	if (!sc)
351		return 0;
352	range = 0;
353	n = 0;
354	if (invert = *p == '!')
355		p++;
356	for (;;)
357	{
358		oldp = p;
359		if (!(pc = mbgetchar(p)))
360			return 0;
361		else if (pc == '[' && (*p == ':' || *p == '=' || *p == '.'))
362		{
363			x = 0;
364			n = mbgetchar(p);
365			oldp = p;
366			for (;;)
367			{
368				if (!(pc = mbgetchar(p)))
369					return 0;
370				if (pc == n && *p == ']')
371					break;
372				x++;
373			}
374			mbgetchar(p);
375			if (ok)
376				/*NOP*/;
377			else if (n == ':')
378			{
379				switch (HASHNKEY5(x, oldp[0], oldp[1], oldp[2], oldp[3], oldp[4]))
380				{
381				case HASHNKEY5(5,'a','l','n','u','m'):
382					if (isalnum(sc))
383						ok = 1;
384					break;
385				case HASHNKEY5(5,'a','l','p','h','a'):
386					if (isalpha(sc))
387						ok = 1;
388					break;
389				case HASHNKEY5(5,'b','l','a','n','k'):
390					if (isblank(sc))
391						ok = 1;
392					break;
393				case HASHNKEY5(5,'c','n','t','r','l'):
394					if (iscntrl(sc))
395						ok = 1;
396					break;
397				case HASHNKEY5(5,'d','i','g','i','t'):
398					if (isdigit(sc))
399						ok = 1;
400					break;
401				case HASHNKEY5(5,'g','r','a','p','h'):
402					if (isgraph(sc))
403						ok = 1;
404					break;
405				case HASHNKEY5(5,'l','o','w','e','r'):
406					if (islower(sc))
407						ok = 1;
408					break;
409				case HASHNKEY5(5,'p','r','i','n','t'):
410					if (isprint(sc))
411						ok = 1;
412					break;
413				case HASHNKEY5(5,'p','u','n','c','t'):
414					if (ispunct(sc))
415						ok = 1;
416					break;
417				case HASHNKEY5(5,'s','p','a','c','e'):
418					if (isspace(sc))
419						ok = 1;
420					break;
421				case HASHNKEY5(5,'u','p','p','e','r'):
422					if (icase ? islower(sc) : isupper(sc))
423						ok = 1;
424					break;
425				case HASHNKEY5(6,'x','d','i','g','i'):
426					if (oldp[5] == 't' && isxdigit(sc))
427						ok = 1;
428					break;
429				}
430			}
431			else if (range)
432				goto getrange;
433			else if (*p == '-' && *(p + 1) != ']')
434			{
435				mbgetchar(p);
436				range = oldp;
437			}
438			else if (isalpha(*oldp) && isalpha(*olds) && tolower(*oldp) == tolower(*olds) || sc == mbgetchar(oldp))
439				ok = 1;
440			n = 1;
441		}
442		else if (pc == ']' && n)
443		{
444			if (ok != invert)
445				break;
446			return 0;
447		}
448		else if (pc == '\\' && (oldp = p, !(pc = mbgetchar(p))))
449			return 0;
450		else if (ok)
451			/*NOP*/;
452		else if (range)
453		{
454		getrange:
455			if (icase && isupper(pc))
456				pc = tolower(pc);
457			x = mbgetchar(range);
458			if (icase && isupper(x))
459				x = tolower(x);
460			if (sc == x || sc == pc || sc > x && sc < pc)
461				ok = 1;
462			if (*p == '-' && *(p + 1) != ']')
463			{
464				mbgetchar(p);
465				range = oldp;
466			}
467			else
468				range = 0;
469			n = 1;
470		}
471		else if (*p == '-' && *(p + 1) != ']')
472		{
473			mbgetchar(p);
474			range = oldp;
475			n = 1;
476		}
477		else
478		{
479			if (icase && isupper(pc))
480				pc = tolower(pc);
481			if (sc == pc)
482				ok = 1;
483			n = pc;
484		}
485	}
486
487				/*...INDENT*/
488			}
489			break;
490		case '\\':
491			if (!(pc = mbgetchar(p)))
492				return 0;
493			if (pc >= '0' && pc <= '9')
494			{
495				n = pc - '0';
496				if (n <= g && (oldp = mp->current.beg[n]))
497				{
498					while (oldp < mp->current.end[n])
499						if (!*olds || *olds++ != *oldp++)
500							return 0;
501					s = olds;
502					break;
503				}
504			}
505			/*FALLTHROUGH*/
506		default:
507			if (icase && isupper(pc))
508				pc = tolower(pc);
509			if (pc != sc)
510				return 0;
511			break;
512		}
513	} while (sc);
514	return 0;
515}
516
517/*
518 * match any pattern in a group
519 * | and & subgroups are parsed here
520 */
521
522static int
523grpmatch(Match_t* mp, int g, char* s, register char* p, char* e, int flags)
524{
525	register char*	a;
526
527	do
528	{
529		for (a = p; onematch(mp, g, s, a, e, NiL, flags); a++)
530			if (*(a = mp->next_p) != '&')
531				return 1;
532	} while (p = gobble(mp, p, '|', &g, 1));
533	return 0;
534}
535
536/*
537 * subgroup match
538 * 0 returned if no match
539 * otherwise number of subgroups matched returned
540 * match group begin offsets are even elements of sub
541 * match group end offsets are odd elements of sub
542 * the matched string is from s+sub[0] up to but not
543 * including s+sub[1]
544 */
545
546int
547strgrpmatch(const char* b, const char* p, int* sub, int n, int flags)
548{
549	register int	i;
550	register char*	s;
551	char*		e;
552	Match_t		match;
553
554	s = (char*)b;
555	match.last_s = e = s + strlen(s);
556	for (;;)
557	{
558		match.best.next_s = 0;
559		match.current.groups = 0;
560		if ((i = grpmatch(&match, 0, s, (char*)p, e, flags)) || match.best.next_s)
561		{
562			if (!i)
563				match.current = match.best;
564			match.current.groups++;
565			match.current.end[0] = match.current.next_s;
566			break;
567		}
568		if ((flags & STR_LEFT) || s >= e)
569			return 0;
570		s++;
571	}
572	if ((flags & STR_RIGHT) && match.current.next_s != e)
573		return 0;
574	if (!sub)
575		return 1;
576	match.current.beg[0] = s;
577	s = (char*)b;
578	if (n > match.current.groups)
579		n = match.current.groups;
580	for (i = 0; i < n; i++)
581	{
582		sub[i * 2] = match.current.end[i] ? match.current.beg[i] - s : 0;
583		sub[i * 2 + 1] = match.current.end[i] ? match.current.end[i] - s : 0;
584	}
585	return n;
586}
587
588/*
589 * compare the string s with the shell pattern p
590 * returns 1 for match 0 otherwise
591 */
592
593int
594strmatch(const char* s, const char* p)
595{
596	return strgrpmatch(s, p, NiL, 0, STR_MAXIMAL|STR_LEFT|STR_RIGHT);
597}
598