1/*	$NetBSD: regexp.c,v 1.15 2021/12/12 08:49:58 andvar Exp $	*/
2
3/*
4 * Copyright (c) 1980, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33#if HAVE_NBTOOL_CONFIG_H
34#include "nbtool_config.h"
35#endif
36
37#include <sys/cdefs.h>
38#ifndef lint
39__COPYRIGHT("@(#) Copyright (c) 1980, 1993\
40 The Regents of the University of California.  All rights reserved.");
41#endif /* not lint */
42
43#ifndef lint
44#if 0
45static char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
46#endif
47__RCSID("$NetBSD: regexp.c,v 1.15 2021/12/12 08:49:58 andvar Exp $");
48#endif /* not lint */
49
50#include <assert.h>
51#include <ctype.h>
52#include <stdlib.h>
53#include <stdbool.h>
54#include <string.h>
55#include "extern.h"
56
57static void	expconv (void);
58
59bool	 x_escaped;	/* true if we are currently x_escaped */
60char	*x_start;	/* start of string */
61bool	 l_onecase;	/* true if upper and lower equivalent */
62
63#define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
64
65static char
66tocc(ptrdiff_t x) {
67	if (x & ~0xff)
68		abort();
69	return (char)x;
70}
71
72/*  STRNCMP -	like strncmp except that we convert the
73 *	 	first string to lower case before comparing
74 *		if l_onecase is set.
75 */
76
77int
78STRNCMP(char *s1, char *s2, int len)
79{
80	if (l_onecase) {
81	    do
82		if (*s2 - makelower(*s1))
83			return *s2 - makelower(*s1);
84		else {
85			s2++;
86			s1++;
87		}
88	    while (--len);
89	} else {
90	    do
91		if (*s2 - *s1)
92			return *s2 - *s1;
93		else {
94			s2++;
95			s1++;
96		}
97	    while (--len);
98	}
99	return(0);
100}
101
102/*	The following routine converts an irregular expression to
103 *	internal format.
104 *
105 *	Either meta symbols (\a \d or \p) or character strings or
106 *	operations ( alternation or perenthesizing ) can be
107 *	specified.  Each starts with a descriptor byte.  The descriptor
108 *	byte has STR set for strings, META set for meta symbols
109 *	and OPER set for operations.
110 *	The descriptor byte can also have the OPT bit set if the object
111 *	defined is optional.  Also ALT can be set to indicate an alternation.
112 *
113 *	For metasymbols the byte following the descriptor byte identities
114 *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
115 *	strings the byte after the descriptor is a character count for
116 *	the string:
117 *
118 *		meta symbols := descriptor
119 *				symbol
120 *
121 *		strings :=	descriptor
122 *				character count
123 *				the string
124 *
125 *		operations :=	descriptor
126 *				symbol
127 *				character count
128 */
129
130/*
131 *  handy macros for accessing parts of match blocks
132 */
133#define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
134#define MNEXT(A) (A+2)		/* character following a metasymbol block */
135
136#define OSYM(A) (*(A+1))	/* symbol in an operation block */
137#define OCNT(A) (*(A+2))	/* character count */
138#define ONEXT(A) (A+3)		/* next character after the operation */
139#define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
140
141#define SCNT(A) (*(A+1))	/* byte count of a string */
142#define SSTR(A) (A+2)		/* address of the string */
143#define SNEXT(A) (A+2+*(A+1))	/* character following the string */
144
145/*
146 *  bit flags in the descriptor
147 */
148#define OPT 1
149#define STR 2
150#define META 4
151#define ALT 8
152#define OPER 16
153
154static char *ccre;	/* pointer to current position in converted exp*/
155static char *ure;	/* pointer current position in unconverted exp */
156
157char *
158convexp(char *re)	/* unconverted irregular expression */
159{
160    char *cre;		/* pointer to converted regular expression */
161
162    /* allocate room for the converted expression */
163    if (re == NULL)
164	return NULL;
165    if (*re == '\0')
166	return NULL;
167    cre = malloc(4 * strlen(re) + 3);
168    ccre = cre;
169    ure = re;
170
171    /* start the conversion with a \a */
172    *cre = META | OPT;
173    MSYM(cre) = 'a';
174    ccre = MNEXT(cre);
175
176    /* start the conversion (its recursive) */
177    expconv();
178    *ccre = 0;
179    return cre;
180}
181
182static void
183expconv(void)
184{
185    char *cs;		/* pointer to current symbol in converted exp */
186    char c;		/* character being processed */
187    char *acs;		/* pinter to last alternate */
188    int temp;
189
190    /* let the conversion begin */
191    acs = NULL;
192    cs = NULL;
193    while (*ure) {
194	switch (c = *ure++) {
195
196	case '\\':
197	    switch (c = *ure++) {
198
199	    /* escaped characters are just characters */
200	    default:
201		if (cs == NULL || (*cs & STR) == 0) {
202		    cs = ccre;
203		    *cs = STR;
204		    SCNT(cs) = 1;
205		    ccre += 2;
206		} else
207		    SCNT(cs)++;
208		*ccre++ = c;
209		break;
210
211	    /* normal(?) metacharacters */
212	    case 'a':
213	    case 'd':
214	    case 'e':
215	    case 'p':
216		if (acs != NULL && acs != cs) {
217		    do {
218			temp = OCNT(acs);
219			OCNT(acs) = tocc(ccre - acs);
220			acs -= temp;
221		    } while (temp != 0);
222		    acs = NULL;
223		}
224		cs = ccre;
225		*cs = META;
226		MSYM(cs) = c;
227		ccre = MNEXT(cs);
228		break;
229	    }
230	    break;
231
232	/* just put the symbol in */
233	case '^':
234	case '$':
235	    if (acs != NULL && acs != cs) {
236		do {
237		    temp = OCNT(acs);
238		    OCNT(acs) = tocc(ccre - acs);
239		    acs -= temp;
240		} while (temp != 0);
241		acs = NULL;
242	    }
243	    cs = ccre;
244	    *cs = META;
245	    MSYM(cs) = c;
246	    ccre = MNEXT(cs);
247	    break;
248
249	/* mark the last match sequence as optional */
250	case '?':
251	    if (cs)
252	    	*cs = *cs | OPT;
253	    break;
254
255	/* recurse and define a subexpression */
256	case '(':
257	    if (acs != NULL && acs != cs) {
258		do {
259		    temp = OCNT(acs);
260		    OCNT(acs) = tocc(ccre - acs);
261		    acs -= temp;
262		} while (temp != 0);
263		acs = NULL;
264	    }
265	    cs = ccre;
266	    *cs = OPER;
267	    OSYM(cs) = '(';
268	    ccre = ONEXT(cs);
269	    expconv();
270	    OCNT(cs) = tocc(ccre - cs);		/* offset to next symbol */
271	    break;
272
273	/* reurn from a recursion */
274	case ')':
275	    if (acs != NULL) {
276		do {
277		    temp = OCNT(acs);
278		    OCNT(acs) = tocc(ccre - acs);
279		    acs -= temp;
280		} while (temp != 0);
281		acs = NULL;
282	    }
283	    cs = ccre;
284	    *cs = META;
285	    MSYM(cs) = c;
286	    ccre = MNEXT(cs);
287	    return;
288
289	/* mark the last match sequence as having an alternate */
290	/* the third byte will contain an offset to jump over the */
291	/* alternate match in case the first did not fail */
292	case '|':
293	    if (acs != NULL && acs != cs)
294		OCNT(ccre) = tocc(ccre - acs);	/* make a back pointer */
295	    else
296		OCNT(ccre) = 0;
297	    assert(cs != NULL);
298	    *cs |= ALT;
299	    cs = ccre;
300	    *cs = OPER;
301	    OSYM(cs) = '|';
302	    ccre = ONEXT(cs);
303	    acs = cs;	/* remember that the pointer is to be filles */
304	    break;
305
306	/* if its not a metasymbol just build a scharacter string */
307	default:
308	    if (cs == NULL || (*cs & STR) == 0) {
309		cs = ccre;
310		*cs = STR;
311		SCNT(cs) = 1;
312		ccre = SSTR(cs);
313	    } else
314		SCNT(cs)++;
315	    *ccre++ = c;
316	    break;
317	}
318    }
319    if (acs != NULL) {
320	do {
321	    temp = OCNT(acs);
322	    OCNT(acs) = tocc(ccre - acs);
323	    acs -= temp;
324	} while (temp != 0);
325	acs = NULL;
326    }
327    return;
328}
329/* end of convertre */
330
331
332/*
333 *	The following routine recognises an irregular expression
334 *	with the following special characters:
335 *
336 *		\?	-	means last match was optional
337 *		\a	-	matches any number of characters
338 *		\d	-	matches any number of spaces and tabs
339 *		\p	-	matches any number of alphanumeric
340 *				characters. The
341 *				characters matched will be copied into
342 *				the area pointed to by 'name'.
343 *		\|	-	alternation
344 *		\( \)	-	grouping used mostly for alternation and
345 *				optionality
346 *
347 *	The irregular expression must be translated to internal form
348 *	prior to calling this routine
349 *
350 *	The value returned is the pointer to the first non \a
351 *	character matched.
352 */
353
354char *
355expmatch(
356    char *s,		/* string to check for a match in */
357    char *re,		/* a converted irregular expression */
358    char *mstring)	/* where to put whatever matches a \p */
359{
360    char *cs;		/* the current symbol */
361    char *ptr,*s1;	/* temporary pointer */
362    bool matched;	/* a temporary bool */
363
364    /* initial conditions */
365    if (re == NULL)
366	return NULL;
367    cs = re;
368    matched = false;
369
370    /* loop till expression string is exhausted (or at least pretty tired) */
371    while (*cs) {
372	switch (*cs & (OPER | STR | META)) {
373
374	/* try to match a string */
375	case STR:
376	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
377	    if (matched) {
378
379		/* hoorah it matches */
380		s += SCNT(cs);
381		cs = SNEXT(cs);
382	    } else if (*cs & ALT) {
383
384		/* alternation, skip to next expression */
385		cs = SNEXT(cs);
386	    } else if (*cs & OPT) {
387
388		/* the match is optional */
389		cs = SNEXT(cs);
390		matched = 1;		/* indicate a successful match */
391	    } else {
392
393		/* no match, error return */
394		return NULL;
395	    }
396	    break;
397
398	/* an operator, do something fancy */
399	case OPER:
400	    switch (OSYM(cs)) {
401
402	    /* this is an alternation */
403	    case '|':
404		if (matched)
405
406		    /* last thing in the alternation was a match, skip ahead */
407		    cs = OPTR(cs);
408		else
409
410		    /* no match, keep trying */
411		    cs = ONEXT(cs);
412		break;
413
414	    /* this is a grouping, recurse */
415	    case '(':
416		ptr = expmatch(s, ONEXT(cs), mstring);
417		if (ptr != NULL) {
418
419		    /* the subexpression matched */
420		    matched = 1;
421		    s = ptr;
422		} else if (*cs & ALT) {
423
424		    /* alternation, skip to next expression */
425		    matched = 0;
426		} else if (*cs & OPT) {
427
428		    /* the match is optional */
429		    matched = 1;	/* indicate a successful match */
430		} else {
431
432		    /* no match, error return */
433		    return NULL;
434		}
435		cs = OPTR(cs);
436		break;
437	    }
438	    break;
439
440	/* try to match a metasymbol */
441	case META:
442	    switch (MSYM(cs)) {
443
444	    /* try to match anything and remember what was matched */
445	    case 'p':
446		/*
447		 *  This is really the same as trying the match the
448		 *  remaining parts of the expression to any subset
449		 *  of the string.
450		 */
451		s1 = s;
452		do {
453		    ptr = expmatch(s1, MNEXT(cs), mstring);
454		    if (ptr != NULL && s1 != s) {
455
456			/* we have a match, remember the match */
457			strncpy(mstring, s, (size_t)(s1 - s));
458			mstring[s1 - s] = '\0';
459			return ptr;
460		    } else if (ptr != NULL && (*cs & OPT)) {
461
462			/* it was aoptional so no match is ok */
463			return ptr;
464		    } else if (ptr != NULL) {
465
466			/* not optional and we still matched */
467			return NULL;
468		    }
469		    if (!isalnum((unsigned char)*s1) && *s1 != '_')
470			return NULL;
471		    if (*s1 == '\\')
472			x_escaped = x_escaped ? false : true;
473		    else
474			x_escaped = false;
475		} while (*s1++);
476		return NULL;
477
478	    /* try to match anything */
479	    case 'a':
480		/*
481		 *  This is really the same as trying the match the
482		 *  remaining parts of the expression to any subset
483		 *  of the string.
484		 */
485		s1 = s;
486		do {
487		    ptr = expmatch(s1, MNEXT(cs), mstring);
488		    if (ptr != NULL && s1 != s) {
489
490			/* we have a match */
491			return ptr;
492		    } else if (ptr != NULL && (*cs & OPT)) {
493
494			/* it was aoptional so no match is ok */
495			return ptr;
496		    } else if (ptr != NULL) {
497
498			/* not optional and we still matched */
499			return NULL;
500		    }
501		    if (*s1 == '\\')
502			x_escaped = x_escaped ? false : true;
503		    else
504			x_escaped = false;
505		} while (*s1++);
506		return NULL;
507
508	    /* fail if we are currently x_escaped */
509	    case 'e':
510		if (x_escaped)
511		    return NULL;
512		cs = MNEXT(cs);
513		break;
514
515	    /* match any number of tabs and spaces */
516	    case 'd':
517		ptr = s;
518		while (*s == ' ' || *s == '\t')
519		    s++;
520		if (s != ptr || s == x_start) {
521
522		    /* match, be happy */
523		    matched = 1;
524		    cs = MNEXT(cs);
525		} else if (*s == '\n' || *s == '\0') {
526
527		    /* match, be happy */
528		    matched = 1;
529		    cs = MNEXT(cs);
530		} else if (*cs & ALT) {
531
532		    /* try the next part */
533		    matched = 0;
534		    cs = MNEXT(cs);
535		} else if (*cs & OPT) {
536
537		    /* doesn't matter */
538		    matched = 1;
539		    cs = MNEXT(cs);
540		} else
541
542		    /* no match, error return */
543		    return NULL;
544		break;
545
546	    /* check for end of line */
547	    case '$':
548		if (*s == '\0' || *s == '\n') {
549
550		    /* match, be happy */
551		    s++;
552		    matched = 1;
553		    cs = MNEXT(cs);
554		} else if (*cs & ALT) {
555
556		    /* try the next part */
557		    matched = 0;
558		    cs = MNEXT(cs);
559		} else if (*cs & OPT) {
560
561		    /* doesn't matter */
562		    matched = 1;
563		    cs = MNEXT(cs);
564		} else
565
566		    /* no match, error return */
567		    return NULL;
568		break;
569
570	    /* check for start of line */
571	    case '^':
572		if (s == x_start) {
573
574		    /* match, be happy */
575		    matched = 1;
576		    cs = MNEXT(cs);
577		} else if (*cs & ALT) {
578
579		    /* try the next part */
580		    matched = 0;
581		    cs = MNEXT(cs);
582		} else if (*cs & OPT) {
583
584		    /* doesn't matter */
585		    matched = 1;
586		    cs = MNEXT(cs);
587		} else
588
589		    /* no match, error return */
590		    return NULL;
591		break;
592
593	    /* end of a subexpression, return success */
594	    case ')':
595		return s;
596	    }
597	    break;
598	}
599    }
600    return s;
601}
602