regexp.c revision 1590
1/*
2 * Copyright (c) 1980, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. All advertising materials mentioning features or use of this software
15 *    must display the following acknowledgement:
16 *	This product includes software developed by the University of
17 *	California, Berkeley and its contributors.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35#ifndef lint
36static char copyright[] =
37"@(#) Copyright (c) 1980, 1993\n\
38	The Regents of the University of California.  All rights reserved.\n";
39#endif /* not lint */
40
41#ifndef lint
42static char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
43#endif /* not lint */
44
45#include <ctype.h>
46#include <stdlib.h>
47#include <string.h>
48#include "extern.h"
49
50#define FALSE	0
51#define TRUE	!(FALSE)
52#define NIL	0
53
54static void	expconv __P((void));
55
56boolean	 _escaped;	/* true if we are currently _escaped */
57char	*_start;	/* start of string */
58boolean	 l_onecase;	/* true if upper and lower equivalent */
59
60#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
61
62/*  STRNCMP -	like strncmp except that we convert the
63 *	 	first string to lower case before comparing
64 *		if l_onecase is set.
65 */
66
67int
68STRNCMP(s1, s2, len)
69	register char *s1,*s2;
70	register int len;
71{
72	if (l_onecase) {
73	    do
74		if (*s2 - makelower(*s1))
75			return (*s2 - makelower(*s1));
76		else {
77			s2++;
78			s1++;
79		}
80	    while (--len);
81	} else {
82	    do
83		if (*s2 - *s1)
84			return (*s2 - *s1);
85		else {
86			s2++;
87			s1++;
88		}
89	    while (--len);
90	}
91	return(0);
92}
93
94/*	The following routine converts an irregular expression to
95 *	internal format.
96 *
97 *	Either meta symbols (\a \d or \p) or character strings or
98 *	operations ( alternation or perenthesizing ) can be
99 *	specified.  Each starts with a descriptor byte.  The descriptor
100 *	byte has STR set for strings, META set for meta symbols
101 *	and OPER set for operations.
102 *	The descriptor byte can also have the OPT bit set if the object
103 *	defined is optional.  Also ALT can be set to indicate an alternation.
104 *
105 *	For metasymbols the byte following the descriptor byte identities
106 *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
107 *	strings the byte after the descriptor is a character count for
108 *	the string:
109 *
110 *		meta symbols := descriptor
111 *				symbol
112 *
113 *		strings :=	descriptor
114 *				character count
115 *				the string
116 *
117 *		operatins :=	descriptor
118 *				symbol
119 *				character count
120 */
121
122/*
123 *  handy macros for accessing parts of match blocks
124 */
125#define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
126#define MNEXT(A) (A+2)		/* character following a metasymbol block */
127
128#define OSYM(A) (*(A+1))	/* symbol in an operation block */
129#define OCNT(A) (*(A+2))	/* character count */
130#define ONEXT(A) (A+3)		/* next character after the operation */
131#define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
132
133#define SCNT(A) (*(A+1))	/* byte count of a string */
134#define SSTR(A) (A+2)		/* address of the string */
135#define SNEXT(A) (A+2+*(A+1))	/* character following the string */
136
137/*
138 *  bit flags in the descriptor
139 */
140#define OPT 1
141#define STR 2
142#define META 4
143#define ALT 8
144#define OPER 16
145
146static char *ccre;	/* pointer to current position in converted exp*/
147static char *ure;	/* pointer current position in unconverted exp */
148
149char *
150convexp(re)
151    char *re;		/* unconverted irregular expression */
152{
153    register char *cre;		/* pointer to converted regular expression */
154
155    /* allocate room for the converted expression */
156    if (re == NIL)
157	return (NIL);
158    if (*re == '\0')
159	return (NIL);
160    cre = malloc (4 * strlen(re) + 3);
161    ccre = cre;
162    ure = re;
163
164    /* start the conversion with a \a */
165    *cre = META | OPT;
166    MSYM(cre) = 'a';
167    ccre = MNEXT(cre);
168
169    /* start the conversion (its recursive) */
170    expconv ();
171    *ccre = 0;
172    return (cre);
173}
174
175static void
176expconv()
177{
178    register char *cs;		/* pointer to current symbol in converted exp */
179    register char c;		/* character being processed */
180    register char *acs;		/* pinter to last alternate */
181    register int temp;
182
183    /* let the conversion begin */
184    acs = NIL;
185    cs = NIL;
186    while (*ure != NIL) {
187	switch (c = *ure++) {
188
189	case '\\':
190	    switch (c = *ure++) {
191
192	    /* escaped characters are just characters */
193	    default:
194		if (cs == NIL || (*cs & STR) == 0) {
195		    cs = ccre;
196		    *cs = STR;
197		    SCNT(cs) = 1;
198		    ccre += 2;
199		} else
200		    SCNT(cs)++;
201		*ccre++ = c;
202		break;
203
204	    /* normal(?) metacharacters */
205	    case 'a':
206	    case 'd':
207	    case 'e':
208	    case 'p':
209		if (acs != NIL && acs != cs) {
210		    do {
211			temp = OCNT(acs);
212			OCNT(acs) = ccre - acs;
213			acs -= temp;
214		    } while (temp != 0);
215		    acs = NIL;
216		}
217		cs = ccre;
218		*cs = META;
219		MSYM(cs) = c;
220		ccre = MNEXT(cs);
221		break;
222	    }
223	    break;
224
225	/* just put the symbol in */
226	case '^':
227	case '$':
228	    if (acs != NIL && acs != cs) {
229		do {
230		    temp = OCNT(acs);
231		    OCNT(acs) = ccre - acs;
232		    acs -= temp;
233		} while (temp != 0);
234		acs = NIL;
235	    }
236	    cs = ccre;
237	    *cs = META;
238	    MSYM(cs) = c;
239	    ccre = MNEXT(cs);
240	    break;
241
242	/* mark the last match sequence as optional */
243	case '?':
244	    if (cs)
245	    	*cs = *cs | OPT;
246	    break;
247
248	/* recurse and define a subexpression */
249	case '(':
250	    if (acs != NIL && acs != cs) {
251		do {
252		    temp = OCNT(acs);
253		    OCNT(acs) = ccre - acs;
254		    acs -= temp;
255		} while (temp != 0);
256		acs = NIL;
257	    }
258	    cs = ccre;
259	    *cs = OPER;
260	    OSYM(cs) = '(';
261	    ccre = ONEXT(cs);
262	    expconv ();
263	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
264	    break;
265
266	/* reurn from a recursion */
267	case ')':
268	    if (acs != NIL) {
269		do {
270		    temp = OCNT(acs);
271		    OCNT(acs) = ccre - acs;
272		    acs -= temp;
273		} while (temp != 0);
274		acs = NIL;
275	    }
276	    cs = ccre;
277	    *cs = META;
278	    MSYM(cs) = c;
279	    ccre = MNEXT(cs);
280	    return;
281
282	/* mark the last match sequence as having an alternate */
283	/* the third byte will contain an offset to jump over the */
284	/* alternate match in case the first did not fail */
285	case '|':
286	    if (acs != NIL && acs != cs)
287		OCNT(ccre) = ccre - acs;	/* make a back pointer */
288	    else
289		OCNT(ccre) = 0;
290	    *cs |= ALT;
291	    cs = ccre;
292	    *cs = OPER;
293	    OSYM(cs) = '|';
294	    ccre = ONEXT(cs);
295	    acs = cs;	/* remember that the pointer is to be filles */
296	    break;
297
298	/* if its not a metasymbol just build a scharacter string */
299	default:
300	    if (cs == NIL || (*cs & STR) == 0) {
301		cs = ccre;
302		*cs = STR;
303		SCNT(cs) = 1;
304		ccre = SSTR(cs);
305	    } else
306		SCNT(cs)++;
307	    *ccre++ = c;
308	    break;
309	}
310    }
311    if (acs != NIL) {
312	do {
313	    temp = OCNT(acs);
314	    OCNT(acs) = ccre - acs;
315	    acs -= temp;
316	} while (temp != 0);
317	acs = NIL;
318    }
319    return;
320}
321/* end of convertre */
322
323
324/*
325 *	The following routine recognises an irregular expresion
326 *	with the following special characters:
327 *
328 *		\?	-	means last match was optional
329 *		\a	-	matches any number of characters
330 *		\d	-	matches any number of spaces and tabs
331 *		\p	-	matches any number of alphanumeric
332 *				characters. The
333 *				characters matched will be copied into
334 *				the area pointed to by 'name'.
335 *		\|	-	alternation
336 *		\( \)	-	grouping used mostly for alternation and
337 *				optionality
338 *
339 *	The irregular expression must be translated to internal form
340 *	prior to calling this routine
341 *
342 *	The value returned is the pointer to the first non \a
343 *	character matched.
344 */
345
346char *
347expmatch (s, re, mstring)
348    register char *s;		/* string to check for a match in */
349    register char *re;		/* a converted irregular expression */
350    register char *mstring;	/* where to put whatever matches a \p */
351{
352    register char *cs;		/* the current symbol */
353    register char *ptr,*s1;	/* temporary pointer */
354    boolean matched;		/* a temporary boolean */
355
356    /* initial conditions */
357    if (re == NIL)
358	return (NIL);
359    cs = re;
360    matched = FALSE;
361
362    /* loop till expression string is exhausted (or at least pretty tired) */
363    while (*cs) {
364	switch (*cs & (OPER | STR | META)) {
365
366	/* try to match a string */
367	case STR:
368	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
369	    if (matched) {
370
371		/* hoorah it matches */
372		s += SCNT(cs);
373		cs = SNEXT(cs);
374	    } else if (*cs & ALT) {
375
376		/* alternation, skip to next expression */
377		cs = SNEXT(cs);
378	    } else if (*cs & OPT) {
379
380		/* the match is optional */
381		cs = SNEXT(cs);
382		matched = 1;		/* indicate a successful match */
383	    } else {
384
385		/* no match, error return */
386		return (NIL);
387	    }
388	    break;
389
390	/* an operator, do something fancy */
391	case OPER:
392	    switch (OSYM(cs)) {
393
394	    /* this is an alternation */
395	    case '|':
396		if (matched)
397
398		    /* last thing in the alternation was a match, skip ahead */
399		    cs = OPTR(cs);
400		else
401
402		    /* no match, keep trying */
403		    cs = ONEXT(cs);
404		break;
405
406	    /* this is a grouping, recurse */
407	    case '(':
408		ptr = expmatch (s, ONEXT(cs), mstring);
409		if (ptr != NIL) {
410
411		    /* the subexpression matched */
412		    matched = 1;
413		    s = ptr;
414		} else if (*cs & ALT) {
415
416		    /* alternation, skip to next expression */
417		    matched = 0;
418		} else if (*cs & OPT) {
419
420		    /* the match is optional */
421		    matched = 1;	/* indicate a successful match */
422		} else {
423
424		    /* no match, error return */
425		    return (NIL);
426		}
427		cs = OPTR(cs);
428		break;
429	    }
430	    break;
431
432	/* try to match a metasymbol */
433	case META:
434	    switch (MSYM(cs)) {
435
436	    /* try to match anything and remember what was matched */
437	    case 'p':
438		/*
439		 *  This is really the same as trying the match the
440		 *  remaining parts of the expression to any subset
441		 *  of the string.
442		 */
443		s1 = s;
444		do {
445		    ptr = expmatch (s1, MNEXT(cs), mstring);
446		    if (ptr != NIL && s1 != s) {
447
448			/* we have a match, remember the match */
449			strncpy (mstring, s, s1 - s);
450			mstring[s1 - s] = '\0';
451			return (ptr);
452		    } else if (ptr != NIL && (*cs & OPT)) {
453
454			/* it was aoptional so no match is ok */
455			return (ptr);
456		    } else if (ptr != NIL) {
457
458			/* not optional and we still matched */
459			return (NIL);
460		    }
461		    if (!isalnum(*s1) && *s1 != '_')
462			return (NIL);
463		    if (*s1 == '\\')
464			_escaped = _escaped ? FALSE : TRUE;
465		    else
466			_escaped = FALSE;
467		} while (*s1++);
468		return (NIL);
469
470	    /* try to match anything */
471	    case 'a':
472		/*
473		 *  This is really the same as trying the match the
474		 *  remaining parts of the expression to any subset
475		 *  of the string.
476		 */
477		s1 = s;
478		do {
479		    ptr = expmatch (s1, MNEXT(cs), mstring);
480		    if (ptr != NIL && s1 != s) {
481
482			/* we have a match */
483			return (ptr);
484		    } else if (ptr != NIL && (*cs & OPT)) {
485
486			/* it was aoptional so no match is ok */
487			return (ptr);
488		    } else if (ptr != NIL) {
489
490			/* not optional and we still matched */
491			return (NIL);
492		    }
493		    if (*s1 == '\\')
494			_escaped = _escaped ? FALSE : TRUE;
495		    else
496			_escaped = FALSE;
497		} while (*s1++);
498		return (NIL);
499
500	    /* fail if we are currently _escaped */
501	    case 'e':
502		if (_escaped)
503		    return(NIL);
504		cs = MNEXT(cs);
505		break;
506
507	    /* match any number of tabs and spaces */
508	    case 'd':
509		ptr = s;
510		while (*s == ' ' || *s == '\t')
511		    s++;
512		if (s != ptr || s == _start) {
513
514		    /* match, be happy */
515		    matched = 1;
516		    cs = MNEXT(cs);
517		} else if (*s == '\n' || *s == '\0') {
518
519		    /* match, be happy */
520		    matched = 1;
521		    cs = MNEXT(cs);
522		} else if (*cs & ALT) {
523
524		    /* try the next part */
525		    matched = 0;
526		    cs = MNEXT(cs);
527		} else if (*cs & OPT) {
528
529		    /* doesn't matter */
530		    matched = 1;
531		    cs = MNEXT(cs);
532		} else
533
534		    /* no match, error return */
535		    return (NIL);
536		break;
537
538	    /* check for end of line */
539	    case '$':
540		if (*s == '\0' || *s == '\n') {
541
542		    /* match, be happy */
543		    s++;
544		    matched = 1;
545		    cs = MNEXT(cs);
546		} else if (*cs & ALT) {
547
548		    /* try the next part */
549		    matched = 0;
550		    cs = MNEXT(cs);
551		} else if (*cs & OPT) {
552
553		    /* doesn't matter */
554		    matched = 1;
555		    cs = MNEXT(cs);
556		} else
557
558		    /* no match, error return */
559		    return (NIL);
560		break;
561
562	    /* check for start of line */
563	    case '^':
564		if (s == _start) {
565
566		    /* match, be happy */
567		    matched = 1;
568		    cs = MNEXT(cs);
569		} else if (*cs & ALT) {
570
571		    /* try the next part */
572		    matched = 0;
573		    cs = MNEXT(cs);
574		} else if (*cs & OPT) {
575
576		    /* doesn't matter */
577		    matched = 1;
578		    cs = MNEXT(cs);
579		} else
580
581		    /* no match, error return */
582		    return (NIL);
583		break;
584
585	    /* end of a subexpression, return success */
586	    case ')':
587		return (s);
588	    }
589	    break;
590	}
591    }
592    return (s);
593}
594