regexp.c revision 289677
110154Sache/*
210154Sache * Copyright (c) 1980, 1993
37767Sache *	The Regents of the University of California.  All rights reserved.
4941Snate *
510154Sache *
610154Sache * Redistribution and use in source and binary forms, with or without
710154Sache * modification, are permitted provided that the following conditions
810154Sache * are met:
910154Sache * 1. Redistributions of source code must retain the above copyright
1010154Sache *    notice, this list of conditions and the following disclaimer.
1110154Sache * 2. Redistributions in binary form must reproduce the above copyright
1210154Sache *    notice, this list of conditions and the following disclaimer in the
13941Snate *    documentation and/or other materials provided with the distribution.
1410154Sache * 4. Neither the name of the University nor the names of its contributors
1510154Sache *    may be used to endorse or promote products derived from this software
1610154Sache *    without specific prior written permission.
1710154Sache *
1810154Sache * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
1910154Sache * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2010154Sache * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2110154Sache * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
2210154Sache * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2310154Sache * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2454158Scharnier * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2554158Scharnier * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26941Snate * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27941Snate * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28941Snate * SUCH DAMAGE.
29941Snate */
30941Snate
31941Snate#include <sys/cdefs.h>
32941Snate
337767Sache__FBSDID("$FreeBSD: head/usr.bin/vgrind/regexp.c 289677 2015-10-21 05:37:09Z eadler $");
3482973Sru
3582973Sru#ifndef lint
3654158Scharnierstatic const char copyright[] =
37941Snate"@(#) Copyright (c) 1980, 1993\n\
387767Sache	The Regents of the University of California.  All rights reserved.\n";
39941Snate#endif
40941Snate
41941Snate#ifndef lint
427767Sachestatic const char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
43941Snate#endif
44941Snate
45941Snate#include <ctype.h>
46941Snate#include <stdlib.h>
47941Snate#include <stdbool.h>
487767Sache#include <string.h>
49941Snate
50941Snate#include "extern.h"
51941Snate
52941Snatestatic void	expconv(void);
53941Snate
54941Snatebool	 _escaped;	/* true if we are currently x_escaped */
55941Snatechar	*s_start;	/* start of string */
56941Snatebool	 l_onecase;	/* true if upper and lower equivalent */
57941Snate
5854158Scharnier#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
59941Snate
60941Snate/*  STRNCMP -	like strncmp except that we convert the
61941Snate *	 	first string to lower case before comparing
62241737Sed *		if l_onecase is set.
63241737Sed */
64241737Sed
65241737Sedint
66227269SedSTRNCMP(register char *s1, register char *s2, register int len)
67241737Sed{
68227269Sed	if (l_onecase) {
69227269Sed	    do
70941Snate		if (*s2 - makelower(*s1))
7182973Sru			return (*s2 - makelower(*s1));
7282973Sru		else {
7382973Sru			s2++;
7482973Sru			s1++;
75241852Seadler		}
76241852Seadler	    while (--len);
7782973Sru	} else {
78941Snate	    do
7982973Sru		if (*s2 - *s1)
8082973Sru			return (*s2 - *s1);
8182973Sru		else {
8282973Sru			s2++;
8382973Sru			s1++;
84241852Seadler		}
85241852Seadler	    while (--len);
8682973Sru	}
87941Snate	return(0);
8882973Sru}
89241852Seadler
90241852Seadler/*	The following routine converts an irregular expression to
9182973Sru *	internal format.
92941Snate *
9382973Sru *	Either meta symbols (\a \d or \p) or character strings or
94241852Seadler *	operations ( alternation or parenthesizing ) can be
95241852Seadler *	specified.  Each starts with a descriptor byte.  The descriptor
9682973Sru *	byte has STR set for strings, META set for meta symbols
97941Snate *	and OPER set for operations.
9882973Sru *	The descriptor byte can also have the OPT bit set if the object
9982973Sru *	defined is optional.  Also ALT can be set to indicate an alternation.
10082973Sru *
10182973Sru *	For metasymbols the byte following the descriptor byte identities
102241852Seadler *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
103241852Seadler *	strings the byte after the descriptor is a character count for
10482973Sru *	the string:
10582973Sru *
106941Snate *		meta symbols := descriptor
107 *				symbol
108 *
109 *		strings :=	descriptor
110 *				character count
111 *				the string
112 *
113 *		operations :=	descriptor
114 *				symbol
115 *				character count
116 */
117
118/*
119 *  handy macros for accessing parts of match blocks
120 */
121#define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
122#define MNEXT(A) (A+2)		/* character following a metasymbol block */
123
124#define OSYM(A) (*(A+1))	/* symbol in an operation block */
125#define OCNT(A) (*(A+2))	/* character count */
126#define ONEXT(A) (A+3)		/* next character after the operation */
127#define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
128
129#define SCNT(A) (*(A+1))	/* byte count of a string */
130#define SSTR(A) (A+2)		/* address of the string */
131#define SNEXT(A) (A+2+*(A+1))	/* character following the string */
132
133/*
134 *  bit flags in the descriptor
135 */
136#define OPT 1
137#define STR 2
138#define META 4
139#define ALT 8
140#define OPER 16
141
142static char *ccre;	/* pointer to current position in converted exp*/
143static char *ure;	/* pointer current position in unconverted exp */
144
145/* re: unconverted irregular expression */
146char *
147convexp(char *re)
148{
149    register char *cre;		/* pointer to converted regular expression */
150
151    /* allocate room for the converted expression */
152    if (re == NULL)
153	return (NULL);
154    if (*re == '\0')
155	return (NULL);
156    cre = malloc(4 * strlen(re) + 3);
157    ccre = cre;
158    ure = re;
159
160    /* start the conversion with a \a */
161    *cre = META | OPT;
162    MSYM(cre) = 'a';
163    ccre = MNEXT(cre);
164
165    /* start the conversion (its recursive) */
166    expconv ();
167    *ccre = 0;
168    return (cre);
169}
170
171static void
172expconv()
173{
174    register char *cs;		/* pointer to current symbol in converted exp */
175    register char c;		/* character being processed */
176    register char *acs;		/* pinter to last alternate */
177    register int temp;
178
179    /* let the conversion begin */
180    acs = NULL;
181    cs = NULL;
182    while (*ure) {
183	switch (c = *ure++) {
184
185	case '\\':
186	    switch (c = *ure++) {
187
188	    /* escaped characters are just characters */
189	    default:
190		if (cs == NULL || (*cs & STR) == 0) {
191		    cs = ccre;
192		    *cs = STR;
193		    SCNT(cs) = 1;
194		    ccre += 2;
195		} else
196		    SCNT(cs)++;
197		*ccre++ = c;
198		break;
199
200	    /* normal(?) metacharacters */
201	    case 'a':
202	    case 'd':
203	    case 'e':
204	    case 'p':
205		if (acs != NULL && acs != cs) {
206		    do {
207			temp = OCNT(acs);
208			OCNT(acs) = ccre - acs;
209			acs -= temp;
210		    } while (temp != 0);
211		    acs = NULL;
212		}
213		cs = ccre;
214		*cs = META;
215		MSYM(cs) = c;
216		ccre = MNEXT(cs);
217		break;
218	    }
219	    break;
220
221	/* just put the symbol in */
222	case '^':
223	case '$':
224	    if (acs != NULL && acs != cs) {
225		do {
226		    temp = OCNT(acs);
227		    OCNT(acs) = ccre - acs;
228		    acs -= temp;
229		} while (temp != 0);
230		acs = NULL;
231	    }
232	    cs = ccre;
233	    *cs = META;
234	    MSYM(cs) = c;
235	    ccre = MNEXT(cs);
236	    break;
237
238	/* mark the last match sequence as optional */
239	case '?':
240	    if (cs)
241	    	*cs = *cs | OPT;
242	    break;
243
244	/* recurse and define a subexpression */
245	case '(':
246	    if (acs != NULL && acs != cs) {
247		do {
248		    temp = OCNT(acs);
249		    OCNT(acs) = ccre - acs;
250		    acs -= temp;
251		} while (temp != 0);
252		acs = NULL;
253	    }
254	    cs = ccre;
255	    *cs = OPER;
256	    OSYM(cs) = '(';
257	    ccre = ONEXT(cs);
258	    expconv();
259	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
260	    break;
261
262	/* return from a recursion */
263	case ')':
264	    if (acs != NULL) {
265		do {
266		    temp = OCNT(acs);
267		    OCNT(acs) = ccre - acs;
268		    acs -= temp;
269		} while (temp != 0);
270		acs = NULL;
271	    }
272	    cs = ccre;
273	    *cs = META;
274	    MSYM(cs) = c;
275	    ccre = MNEXT(cs);
276	    return;
277
278	/* mark the last match sequence as having an alternate */
279	/* the third byte will contain an offset to jump over the */
280	/* alternate match in case the first did not fail */
281	case '|':
282	    if (acs != NULL && acs != cs)
283		OCNT(ccre) = ccre - acs;	/* make a back pointer */
284	    else
285		OCNT(ccre) = 0;
286	    *cs |= ALT;
287	    cs = ccre;
288	    *cs = OPER;
289	    OSYM(cs) = '|';
290	    ccre = ONEXT(cs);
291	    acs = cs;	/* remember that the pointer is to be filles */
292	    break;
293
294	/* if its not a metasymbol just build a scharacter string */
295	default:
296	    if (cs == NULL || (*cs & STR) == 0) {
297		cs = ccre;
298		*cs = STR;
299		SCNT(cs) = 1;
300		ccre = SSTR(cs);
301	    } else
302		SCNT(cs)++;
303	    *ccre++ = c;
304	    break;
305	}
306    }
307    if (acs != NULL) {
308	do {
309	    temp = OCNT(acs);
310	    OCNT(acs) = ccre - acs;
311	    acs -= temp;
312	} while (temp != 0);
313	acs = NULL;
314    }
315    return;
316}
317/* end of convertre */
318
319
320/*
321 *	The following routine recognises an irregular expression
322 *	with the following special characters:
323 *
324 *		\?	-	means last match was optional
325 *		\a	-	matches any number of characters
326 *		\d	-	matches any number of spaces and tabs
327 *		\p	-	matches any number of alphanumeric
328 *				characters. The
329 *				characters matched will be copied into
330 *				the area pointed to by 'name'.
331 *		\|	-	alternation
332 *		\( \)	-	grouping used mostly for alternation and
333 *				optionality
334 *
335 *	The irregular expression must be translated to internal form
336 *	prior to calling this routine
337 *
338 *	The value returned is the pointer to the first non \a
339 *	character matched.
340 */
341
342/*
343 *  s: string to check for a match in
344 *  re: a converted irregular expression
345 *  mstring: where to put whatever matches a \p
346 */
347char *
348expmatch (register char *s, register char *re, register char *mstring)
349{
350    register char *cs;		/* the current symbol */
351    register char *ptr,*s1;	/* temporary pointer */
352    bool matched;	/* a temporary bool */
353
354    /* initial conditions */
355    if (re == NULL)
356	return (NULL);
357    cs = re;
358    matched = false;
359
360    /* loop till expression string is exhausted (or at least pretty tired) */
361    while (*cs) {
362	switch (*cs & (OPER | STR | META)) {
363
364	/* try to match a string */
365	case STR:
366	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
367	    if (matched) {
368
369		/* hoorah it matches */
370		s += SCNT(cs);
371		cs = SNEXT(cs);
372	    } else if (*cs & ALT) {
373
374		/* alternation, skip to next expression */
375		cs = SNEXT(cs);
376	    } else if (*cs & OPT) {
377
378		/* the match is optional */
379		cs = SNEXT(cs);
380		matched = 1;		/* indicate a successful match */
381	    } else {
382
383		/* no match, error return */
384		return (NULL);
385	    }
386	    break;
387
388	/* an operator, do something fancy */
389	case OPER:
390	    switch (OSYM(cs)) {
391
392	    /* this is an alternation */
393	    case '|':
394		if (matched)
395
396		    /* last thing in the alternation was a match, skip ahead */
397		    cs = OPTR(cs);
398		else
399
400		    /* no match, keep trying */
401		    cs = ONEXT(cs);
402		break;
403
404	    /* this is a grouping, recurse */
405	    case '(':
406		ptr = expmatch(s, ONEXT(cs), mstring);
407		if (ptr != NULL) {
408
409		    /* the subexpression matched */
410		    matched = 1;
411		    s = ptr;
412		} else if (*cs & ALT) {
413
414		    /* alternation, skip to next expression */
415		    matched = 0;
416		} else if (*cs & OPT) {
417
418		    /* the match is optional */
419		    matched = 1;	/* indicate a successful match */
420		} else {
421
422		    /* no match, error return */
423		    return (NULL);
424		}
425		cs = OPTR(cs);
426		break;
427	    }
428	    break;
429
430	/* try to match a metasymbol */
431	case META:
432	    switch (MSYM(cs)) {
433
434	    /* try to match anything and remember what was matched */
435	    case 'p':
436		/*
437		 *  This is really the same as trying the match the
438		 *  remaining parts of the expression to any subset
439		 *  of the string.
440		 */
441		s1 = s;
442		do {
443		    ptr = expmatch(s1, MNEXT(cs), mstring);
444		    if (ptr != NULL && s1 != s) {
445
446			/* we have a match, remember the match */
447			strncpy (mstring, s, s1 - s);
448			mstring[s1 - s] = '\0';
449			return (ptr);
450		    } else if (ptr != NULL && (*cs & OPT)) {
451
452			/* it was aoptional so no match is ok */
453			return (ptr);
454		    } else if (ptr != NULL) {
455
456			/* not optional and we still matched */
457			return (NULL);
458		    }
459		    if (!(isalnum(*s1) || *s1 == '_' ||
460			  /* C++ destructor */
461			  *s1 == '~' ||
462			  /* C++ scope operator */
463			  (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' &&
464			   (s1++, true))))
465			return (NULL);
466		    if (*s1 == '\\')
467			_escaped = _escaped ? false : true;
468		    else
469			_escaped = false;
470		} while (*s1++);
471		return (NULL);
472
473	    /* try to match anything */
474	    case 'a':
475		/*
476		 *  This is really the same as trying the match the
477		 *  remaining parts of the expression to any subset
478		 *  of the string.
479		 */
480		s1 = s;
481		do {
482		    ptr = expmatch(s1, MNEXT(cs), mstring);
483		    if (ptr != NULL && s1 != s) {
484
485			/* we have a match */
486			return (ptr);
487		    } else if (ptr != NULL && (*cs & OPT)) {
488
489			/* it was aoptional so no match is ok */
490			return (ptr);
491		    } else if (ptr != NULL) {
492
493			/* not optional and we still matched */
494			return (NULL);
495		    }
496		    if (*s1 == '\\')
497			_escaped = _escaped ? false : true;
498		    else
499			_escaped = false;
500		} while (*s1++);
501		return (NULL);
502
503	    /* fail if we are currently _escaped */
504	    case 'e':
505		if (_escaped)
506		    return(NULL);
507		cs = MNEXT(cs);
508		break;
509
510	    /* match any number of tabs and spaces */
511	    case 'd':
512		ptr = s;
513		while (*s == ' ' || *s == '\t')
514		    s++;
515		if (s != ptr || s == s_start) {
516
517		    /* match, be happy */
518		    matched = 1;
519		    cs = MNEXT(cs);
520		} else if (*s == '\n' || *s == '\0') {
521
522		    /* match, be happy */
523		    matched = 1;
524		    cs = MNEXT(cs);
525		} else if (*cs & ALT) {
526
527		    /* try the next part */
528		    matched = 0;
529		    cs = MNEXT(cs);
530		} else if (*cs & OPT) {
531
532		    /* doesn't matter */
533		    matched = 1;
534		    cs = MNEXT(cs);
535		} else
536
537		    /* no match, error return */
538		    return (NULL);
539		break;
540
541	    /* check for end of line */
542	    case '$':
543		if (*s == '\0' || *s == '\n') {
544
545		    /* match, be happy */
546		    s++;
547		    matched = 1;
548		    cs = MNEXT(cs);
549		} else if (*cs & ALT) {
550
551		    /* try the next part */
552		    matched = 0;
553		    cs = MNEXT(cs);
554		} else if (*cs & OPT) {
555
556		    /* doesn't matter */
557		    matched = 1;
558		    cs = MNEXT(cs);
559		} else
560
561		    /* no match, error return */
562		    return (NULL);
563		break;
564
565	    /* check for start of line */
566	    case '^':
567		if (s == s_start) {
568
569		    /* match, be happy */
570		    matched = 1;
571		    cs = MNEXT(cs);
572		} else if (*cs & ALT) {
573
574		    /* try the next part */
575		    matched = 0;
576		    cs = MNEXT(cs);
577		} else if (*cs & OPT) {
578
579		    /* doesn't matter */
580		    matched = 1;
581		    cs = MNEXT(cs);
582		} else
583
584		    /* no match, error return */
585		    return (NULL);
586		break;
587
588	    /* end of a subexpression, return success */
589	    case ')':
590		return (s);
591	    }
592	    break;
593	}
594    }
595    return (s);
596}
597