regexp.c revision 282449
1/*
2 * Copyright (c) 1980, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 4. Neither the name of the University nor the names of its contributors
15 *    may be used to endorse or promote products derived from this software
16 *    without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32
33__FBSDID("$FreeBSD: head/usr.bin/vgrind/regexp.c 282449 2015-05-05 07:33:38Z bapt $");
34
35#ifndef lint
36static const char copyright[] =
37"@(#) Copyright (c) 1980, 1993\n\
38	The Regents of the University of California.  All rights reserved.\n";
39#endif
40
41#ifndef lint
42static const char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
43#endif
44
45#include <ctype.h>
46#include <stdlib.h>
47#include <string.h>
48
49#include "extern.h"
50
51#define FALSE	0
52#define TRUE	!(FALSE)
53#define NIL	0
54
55static void	expconv(void);
56
57boolean	 _escaped;	/* true if we are currently _escaped */
58char	*s_start;	/* start of string */
59boolean	 l_onecase;	/* true if upper and lower equivalent */
60
61#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
62
63/*  STRNCMP -	like strncmp except that we convert the
64 *	 	first string to lower case before comparing
65 *		if l_onecase is set.
66 */
67
68int
69STRNCMP(register char *s1, register char *s2, register int len)
70{
71	if (l_onecase) {
72	    do
73		if (*s2 - makelower(*s1))
74			return (*s2 - makelower(*s1));
75		else {
76			s2++;
77			s1++;
78		}
79	    while (--len);
80	} else {
81	    do
82		if (*s2 - *s1)
83			return (*s2 - *s1);
84		else {
85			s2++;
86			s1++;
87		}
88	    while (--len);
89	}
90	return(0);
91}
92
93/*	The following routine converts an irregular expression to
94 *	internal format.
95 *
96 *	Either meta symbols (\a \d or \p) or character strings or
97 *	operations ( alternation or perenthesizing ) can be
98 *	specified.  Each starts with a descriptor byte.  The descriptor
99 *	byte has STR set for strings, META set for meta symbols
100 *	and OPER set for operations.
101 *	The descriptor byte can also have the OPT bit set if the object
102 *	defined is optional.  Also ALT can be set to indicate an alternation.
103 *
104 *	For metasymbols the byte following the descriptor byte identities
105 *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
106 *	strings the byte after the descriptor is a character count for
107 *	the string:
108 *
109 *		meta symbols := descriptor
110 *				symbol
111 *
112 *		strings :=	descriptor
113 *				character count
114 *				the string
115 *
116 *		operatins :=	descriptor
117 *				symbol
118 *				character count
119 */
120
121/*
122 *  handy macros for accessing parts of match blocks
123 */
124#define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
125#define MNEXT(A) (A+2)		/* character following a metasymbol block */
126
127#define OSYM(A) (*(A+1))	/* symbol in an operation block */
128#define OCNT(A) (*(A+2))	/* character count */
129#define ONEXT(A) (A+3)		/* next character after the operation */
130#define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
131
132#define SCNT(A) (*(A+1))	/* byte count of a string */
133#define SSTR(A) (A+2)		/* address of the string */
134#define SNEXT(A) (A+2+*(A+1))	/* character following the string */
135
136/*
137 *  bit flags in the descriptor
138 */
139#define OPT 1
140#define STR 2
141#define META 4
142#define ALT 8
143#define OPER 16
144
145static char *ccre;	/* pointer to current position in converted exp*/
146static char *ure;	/* pointer current position in unconverted exp */
147
148/* re: unconverted irregular expression */
149char *
150convexp(char *re)
151{
152    register char *cre;		/* pointer to converted regular expression */
153
154    /* allocate room for the converted expression */
155    if (re == NIL)
156	return (NIL);
157    if (*re == '\0')
158	return (NIL);
159    cre = malloc (4 * strlen(re) + 3);
160    ccre = cre;
161    ure = re;
162
163    /* start the conversion with a \a */
164    *cre = META | OPT;
165    MSYM(cre) = 'a';
166    ccre = MNEXT(cre);
167
168    /* start the conversion (its recursive) */
169    expconv ();
170    *ccre = 0;
171    return (cre);
172}
173
174static void
175expconv()
176{
177    register char *cs;		/* pointer to current symbol in converted exp */
178    register char c;		/* character being processed */
179    register char *acs;		/* pinter to last alternate */
180    register int temp;
181
182    /* let the conversion begin */
183    acs = NIL;
184    cs = NIL;
185    while (*ure != NIL) {
186	switch (c = *ure++) {
187
188	case '\\':
189	    switch (c = *ure++) {
190
191	    /* escaped characters are just characters */
192	    default:
193		if (cs == NIL || (*cs & STR) == 0) {
194		    cs = ccre;
195		    *cs = STR;
196		    SCNT(cs) = 1;
197		    ccre += 2;
198		} else
199		    SCNT(cs)++;
200		*ccre++ = c;
201		break;
202
203	    /* normal(?) metacharacters */
204	    case 'a':
205	    case 'd':
206	    case 'e':
207	    case 'p':
208		if (acs != NIL && acs != cs) {
209		    do {
210			temp = OCNT(acs);
211			OCNT(acs) = ccre - acs;
212			acs -= temp;
213		    } while (temp != 0);
214		    acs = NIL;
215		}
216		cs = ccre;
217		*cs = META;
218		MSYM(cs) = c;
219		ccre = MNEXT(cs);
220		break;
221	    }
222	    break;
223
224	/* just put the symbol in */
225	case '^':
226	case '$':
227	    if (acs != NIL && acs != cs) {
228		do {
229		    temp = OCNT(acs);
230		    OCNT(acs) = ccre - acs;
231		    acs -= temp;
232		} while (temp != 0);
233		acs = NIL;
234	    }
235	    cs = ccre;
236	    *cs = META;
237	    MSYM(cs) = c;
238	    ccre = MNEXT(cs);
239	    break;
240
241	/* mark the last match sequence as optional */
242	case '?':
243	    if (cs)
244	    	*cs = *cs | OPT;
245	    break;
246
247	/* recurse and define a subexpression */
248	case '(':
249	    if (acs != NIL && acs != cs) {
250		do {
251		    temp = OCNT(acs);
252		    OCNT(acs) = ccre - acs;
253		    acs -= temp;
254		} while (temp != 0);
255		acs = NIL;
256	    }
257	    cs = ccre;
258	    *cs = OPER;
259	    OSYM(cs) = '(';
260	    ccre = ONEXT(cs);
261	    expconv ();
262	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
263	    break;
264
265	/* return from a recursion */
266	case ')':
267	    if (acs != NIL) {
268		do {
269		    temp = OCNT(acs);
270		    OCNT(acs) = ccre - acs;
271		    acs -= temp;
272		} while (temp != 0);
273		acs = NIL;
274	    }
275	    cs = ccre;
276	    *cs = META;
277	    MSYM(cs) = c;
278	    ccre = MNEXT(cs);
279	    return;
280
281	/* mark the last match sequence as having an alternate */
282	/* the third byte will contain an offset to jump over the */
283	/* alternate match in case the first did not fail */
284	case '|':
285	    if (acs != NIL && acs != cs)
286		OCNT(ccre) = ccre - acs;	/* make a back pointer */
287	    else
288		OCNT(ccre) = 0;
289	    *cs |= ALT;
290	    cs = ccre;
291	    *cs = OPER;
292	    OSYM(cs) = '|';
293	    ccre = ONEXT(cs);
294	    acs = cs;	/* remember that the pointer is to be filles */
295	    break;
296
297	/* if its not a metasymbol just build a scharacter string */
298	default:
299	    if (cs == NIL || (*cs & STR) == 0) {
300		cs = ccre;
301		*cs = STR;
302		SCNT(cs) = 1;
303		ccre = SSTR(cs);
304	    } else
305		SCNT(cs)++;
306	    *ccre++ = c;
307	    break;
308	}
309    }
310    if (acs != NIL) {
311	do {
312	    temp = OCNT(acs);
313	    OCNT(acs) = ccre - acs;
314	    acs -= temp;
315	} while (temp != 0);
316	acs = NIL;
317    }
318    return;
319}
320/* end of convertre */
321
322
323/*
324 *	The following routine recognises an irregular expresion
325 *	with the following special characters:
326 *
327 *		\?	-	means last match was optional
328 *		\a	-	matches any number of characters
329 *		\d	-	matches any number of spaces and tabs
330 *		\p	-	matches any number of alphanumeric
331 *				characters. The
332 *				characters matched will be copied into
333 *				the area pointed to by 'name'.
334 *		\|	-	alternation
335 *		\( \)	-	grouping used mostly for alternation and
336 *				optionality
337 *
338 *	The irregular expression must be translated to internal form
339 *	prior to calling this routine
340 *
341 *	The value returned is the pointer to the first non \a
342 *	character matched.
343 */
344
345/*
346 *  s: string to check for a match in
347 *  re: a converted irregular expression
348 *  mstring: where to put whatever matches a \p
349 */
350char *
351expmatch (register char *s, register char *re, register char *mstring)
352{
353    register char *cs;		/* the current symbol */
354    register char *ptr,*s1;	/* temporary pointer */
355    boolean matched;		/* a temporary boolean */
356
357    /* initial conditions */
358    if (re == NIL)
359	return (NIL);
360    cs = re;
361    matched = FALSE;
362
363    /* loop till expression string is exhausted (or at least pretty tired) */
364    while (*cs) {
365	switch (*cs & (OPER | STR | META)) {
366
367	/* try to match a string */
368	case STR:
369	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
370	    if (matched) {
371
372		/* hoorah it matches */
373		s += SCNT(cs);
374		cs = SNEXT(cs);
375	    } else if (*cs & ALT) {
376
377		/* alternation, skip to next expression */
378		cs = SNEXT(cs);
379	    } else if (*cs & OPT) {
380
381		/* the match is optional */
382		cs = SNEXT(cs);
383		matched = 1;		/* indicate a successful match */
384	    } else {
385
386		/* no match, error return */
387		return (NIL);
388	    }
389	    break;
390
391	/* an operator, do something fancy */
392	case OPER:
393	    switch (OSYM(cs)) {
394
395	    /* this is an alternation */
396	    case '|':
397		if (matched)
398
399		    /* last thing in the alternation was a match, skip ahead */
400		    cs = OPTR(cs);
401		else
402
403		    /* no match, keep trying */
404		    cs = ONEXT(cs);
405		break;
406
407	    /* this is a grouping, recurse */
408	    case '(':
409		ptr = expmatch (s, ONEXT(cs), mstring);
410		if (ptr != NIL) {
411
412		    /* the subexpression matched */
413		    matched = 1;
414		    s = ptr;
415		} else if (*cs & ALT) {
416
417		    /* alternation, skip to next expression */
418		    matched = 0;
419		} else if (*cs & OPT) {
420
421		    /* the match is optional */
422		    matched = 1;	/* indicate a successful match */
423		} else {
424
425		    /* no match, error return */
426		    return (NIL);
427		}
428		cs = OPTR(cs);
429		break;
430	    }
431	    break;
432
433	/* try to match a metasymbol */
434	case META:
435	    switch (MSYM(cs)) {
436
437	    /* try to match anything and remember what was matched */
438	    case 'p':
439		/*
440		 *  This is really the same as trying the match the
441		 *  remaining parts of the expression to any subset
442		 *  of the string.
443		 */
444		s1 = s;
445		do {
446		    ptr = expmatch (s1, MNEXT(cs), mstring);
447		    if (ptr != NIL && s1 != s) {
448
449			/* we have a match, remember the match */
450			strncpy (mstring, s, s1 - s);
451			mstring[s1 - s] = '\0';
452			return (ptr);
453		    } else if (ptr != NIL && (*cs & OPT)) {
454
455			/* it was aoptional so no match is ok */
456			return (ptr);
457		    } else if (ptr != NIL) {
458
459			/* not optional and we still matched */
460			return (NIL);
461		    }
462		    if (!(isalnum(*s1) || *s1 == '_' ||
463			  /* C++ destructor */
464			  *s1 == '~' ||
465			  /* C++ scope operator */
466			  (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' &&
467			   (s1++, TRUE))))
468			return (NIL);
469		    if (*s1 == '\\')
470			_escaped = _escaped ? FALSE : TRUE;
471		    else
472			_escaped = FALSE;
473		} while (*s1++);
474		return (NIL);
475
476	    /* try to match anything */
477	    case 'a':
478		/*
479		 *  This is really the same as trying the match the
480		 *  remaining parts of the expression to any subset
481		 *  of the string.
482		 */
483		s1 = s;
484		do {
485		    ptr = expmatch (s1, MNEXT(cs), mstring);
486		    if (ptr != NIL && s1 != s) {
487
488			/* we have a match */
489			return (ptr);
490		    } else if (ptr != NIL && (*cs & OPT)) {
491
492			/* it was aoptional so no match is ok */
493			return (ptr);
494		    } else if (ptr != NIL) {
495
496			/* not optional and we still matched */
497			return (NIL);
498		    }
499		    if (*s1 == '\\')
500			_escaped = _escaped ? FALSE : TRUE;
501		    else
502			_escaped = FALSE;
503		} while (*s1++);
504		return (NIL);
505
506	    /* fail if we are currently _escaped */
507	    case 'e':
508		if (_escaped)
509		    return(NIL);
510		cs = MNEXT(cs);
511		break;
512
513	    /* match any number of tabs and spaces */
514	    case 'd':
515		ptr = s;
516		while (*s == ' ' || *s == '\t')
517		    s++;
518		if (s != ptr || s == s_start) {
519
520		    /* match, be happy */
521		    matched = 1;
522		    cs = MNEXT(cs);
523		} else if (*s == '\n' || *s == '\0') {
524
525		    /* match, be happy */
526		    matched = 1;
527		    cs = MNEXT(cs);
528		} else if (*cs & ALT) {
529
530		    /* try the next part */
531		    matched = 0;
532		    cs = MNEXT(cs);
533		} else if (*cs & OPT) {
534
535		    /* doesn't matter */
536		    matched = 1;
537		    cs = MNEXT(cs);
538		} else
539
540		    /* no match, error return */
541		    return (NIL);
542		break;
543
544	    /* check for end of line */
545	    case '$':
546		if (*s == '\0' || *s == '\n') {
547
548		    /* match, be happy */
549		    s++;
550		    matched = 1;
551		    cs = MNEXT(cs);
552		} else if (*cs & ALT) {
553
554		    /* try the next part */
555		    matched = 0;
556		    cs = MNEXT(cs);
557		} else if (*cs & OPT) {
558
559		    /* doesn't matter */
560		    matched = 1;
561		    cs = MNEXT(cs);
562		} else
563
564		    /* no match, error return */
565		    return (NIL);
566		break;
567
568	    /* check for start of line */
569	    case '^':
570		if (s == s_start) {
571
572		    /* match, be happy */
573		    matched = 1;
574		    cs = MNEXT(cs);
575		} else if (*cs & ALT) {
576
577		    /* try the next part */
578		    matched = 0;
579		    cs = MNEXT(cs);
580		} else if (*cs & OPT) {
581
582		    /* doesn't matter */
583		    matched = 1;
584		    cs = MNEXT(cs);
585		} else
586
587		    /* no match, error return */
588		    return (NIL);
589		break;
590
591	    /* end of a subexpression, return success */
592	    case ')':
593		return (s);
594	    }
595	    break;
596	}
597    }
598    return (s);
599}
600