1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1980, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 */
32
33#include <sys/cdefs.h>
34
35__FBSDID("$FreeBSD$");
36
37#ifndef lint
38static const char copyright[] =
39"@(#) Copyright (c) 1980, 1993\n\
40	The Regents of the University of California.  All rights reserved.\n";
41#endif
42
43#ifndef lint
44static const char sccsid[] = "@(#)regexp.c	8.1 (Berkeley) 6/6/93";
45#endif
46
47#include <ctype.h>
48#include <stdlib.h>
49#include <stdbool.h>
50#include <string.h>
51
52#include "extern.h"
53
54static void	expconv(void);
55
56bool	 _escaped;	/* true if we are currently x_escaped */
57char	*s_start;	/* start of string */
58bool	 l_onecase;	/* true if upper and lower equivalent */
59
60#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
61
62/*  STRNCMP -	like strncmp except that we convert the
63 *	 	first string to lower case before comparing
64 *		if l_onecase is set.
65 */
66
67int
68STRNCMP(register char *s1, register char *s2, register int len)
69{
70	if (l_onecase) {
71	    do
72		if (*s2 - makelower(*s1))
73			return (*s2 - makelower(*s1));
74		else {
75			s2++;
76			s1++;
77		}
78	    while (--len);
79	} else {
80	    do
81		if (*s2 - *s1)
82			return (*s2 - *s1);
83		else {
84			s2++;
85			s1++;
86		}
87	    while (--len);
88	}
89	return(0);
90}
91
92/*	The following routine converts an irregular expression to
93 *	internal format.
94 *
95 *	Either meta symbols (\a \d or \p) or character strings or
96 *	operations ( alternation or parenthesizing ) can be
97 *	specified.  Each starts with a descriptor byte.  The descriptor
98 *	byte has STR set for strings, META set for meta symbols
99 *	and OPER set for operations.
100 *	The descriptor byte can also have the OPT bit set if the object
101 *	defined is optional.  Also ALT can be set to indicate an alternation.
102 *
103 *	For metasymbols the byte following the descriptor byte identities
104 *	the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
105 *	strings the byte after the descriptor is a character count for
106 *	the string:
107 *
108 *		meta symbols := descriptor
109 *				symbol
110 *
111 *		strings :=	descriptor
112 *				character count
113 *				the string
114 *
115 *		operations :=	descriptor
116 *				symbol
117 *				character count
118 */
119
120/*
121 *  handy macros for accessing parts of match blocks
122 */
123#define MSYM(A) (*(A+1))	/* symbol in a meta symbol block */
124#define MNEXT(A) (A+2)		/* character following a metasymbol block */
125
126#define OSYM(A) (*(A+1))	/* symbol in an operation block */
127#define OCNT(A) (*(A+2))	/* character count */
128#define ONEXT(A) (A+3)		/* next character after the operation */
129#define OPTR(A) (A+*(A+2))	/* place pointed to by the operator */
130
131#define SCNT(A) (*(A+1))	/* byte count of a string */
132#define SSTR(A) (A+2)		/* address of the string */
133#define SNEXT(A) (A+2+*(A+1))	/* character following the string */
134
135/*
136 *  bit flags in the descriptor
137 */
138#define OPT 1
139#define STR 2
140#define META 4
141#define ALT 8
142#define OPER 16
143
144static char *ccre;	/* pointer to current position in converted exp*/
145static char *ure;	/* pointer current position in unconverted exp */
146
147/* re: unconverted irregular expression */
148char *
149convexp(char *re)
150{
151    register char *cre;		/* pointer to converted regular expression */
152
153    /* allocate room for the converted expression */
154    if (re == NULL)
155	return (NULL);
156    if (*re == '\0')
157	return (NULL);
158    cre = malloc(4 * strlen(re) + 3);
159    ccre = cre;
160    ure = re;
161
162    /* start the conversion with a \a */
163    *cre = META | OPT;
164    MSYM(cre) = 'a';
165    ccre = MNEXT(cre);
166
167    /* start the conversion (its recursive) */
168    expconv ();
169    *ccre = 0;
170    return (cre);
171}
172
173static void
174expconv()
175{
176    register char *cs;		/* pointer to current symbol in converted exp */
177    register char c;		/* character being processed */
178    register char *acs;		/* pinter to last alternate */
179    register int temp;
180
181    /* let the conversion begin */
182    acs = NULL;
183    cs = NULL;
184    while (*ure) {
185	switch (c = *ure++) {
186
187	case '\\':
188	    switch (c = *ure++) {
189
190	    /* escaped characters are just characters */
191	    default:
192		if (cs == NULL || (*cs & STR) == 0) {
193		    cs = ccre;
194		    *cs = STR;
195		    SCNT(cs) = 1;
196		    ccre += 2;
197		} else
198		    SCNT(cs)++;
199		*ccre++ = c;
200		break;
201
202	    /* normal(?) metacharacters */
203	    case 'a':
204	    case 'd':
205	    case 'e':
206	    case 'p':
207		if (acs != NULL && acs != cs) {
208		    do {
209			temp = OCNT(acs);
210			OCNT(acs) = ccre - acs;
211			acs -= temp;
212		    } while (temp != 0);
213		    acs = NULL;
214		}
215		cs = ccre;
216		*cs = META;
217		MSYM(cs) = c;
218		ccre = MNEXT(cs);
219		break;
220	    }
221	    break;
222
223	/* just put the symbol in */
224	case '^':
225	case '$':
226	    if (acs != NULL && acs != cs) {
227		do {
228		    temp = OCNT(acs);
229		    OCNT(acs) = ccre - acs;
230		    acs -= temp;
231		} while (temp != 0);
232		acs = NULL;
233	    }
234	    cs = ccre;
235	    *cs = META;
236	    MSYM(cs) = c;
237	    ccre = MNEXT(cs);
238	    break;
239
240	/* mark the last match sequence as optional */
241	case '?':
242	    if (cs)
243	    	*cs = *cs | OPT;
244	    break;
245
246	/* recurse and define a subexpression */
247	case '(':
248	    if (acs != NULL && acs != cs) {
249		do {
250		    temp = OCNT(acs);
251		    OCNT(acs) = ccre - acs;
252		    acs -= temp;
253		} while (temp != 0);
254		acs = NULL;
255	    }
256	    cs = ccre;
257	    *cs = OPER;
258	    OSYM(cs) = '(';
259	    ccre = ONEXT(cs);
260	    expconv();
261	    OCNT(cs) = ccre - cs;		/* offset to next symbol */
262	    break;
263
264	/* return from a recursion */
265	case ')':
266	    if (acs != NULL) {
267		do {
268		    temp = OCNT(acs);
269		    OCNT(acs) = ccre - acs;
270		    acs -= temp;
271		} while (temp != 0);
272		acs = NULL;
273	    }
274	    cs = ccre;
275	    *cs = META;
276	    MSYM(cs) = c;
277	    ccre = MNEXT(cs);
278	    return;
279
280	/* mark the last match sequence as having an alternate */
281	/* the third byte will contain an offset to jump over the */
282	/* alternate match in case the first did not fail */
283	case '|':
284	    if (acs != NULL && acs != cs)
285		OCNT(ccre) = ccre - acs;	/* make a back pointer */
286	    else
287		OCNT(ccre) = 0;
288	    *cs |= ALT;
289	    cs = ccre;
290	    *cs = OPER;
291	    OSYM(cs) = '|';
292	    ccre = ONEXT(cs);
293	    acs = cs;	/* remember that the pointer is to be filles */
294	    break;
295
296	/* if its not a metasymbol just build a scharacter string */
297	default:
298	    if (cs == NULL || (*cs & STR) == 0) {
299		cs = ccre;
300		*cs = STR;
301		SCNT(cs) = 1;
302		ccre = SSTR(cs);
303	    } else
304		SCNT(cs)++;
305	    *ccre++ = c;
306	    break;
307	}
308    }
309    if (acs != NULL) {
310	do {
311	    temp = OCNT(acs);
312	    OCNT(acs) = ccre - acs;
313	    acs -= temp;
314	} while (temp != 0);
315	acs = NULL;
316    }
317    return;
318}
319/* end of convertre */
320
321
322/*
323 *	The following routine recognises an irregular expression
324 *	with the following special characters:
325 *
326 *		\?	-	means last match was optional
327 *		\a	-	matches any number of characters
328 *		\d	-	matches any number of spaces and tabs
329 *		\p	-	matches any number of alphanumeric
330 *				characters. The
331 *				characters matched will be copied into
332 *				the area pointed to by 'name'.
333 *		\|	-	alternation
334 *		\( \)	-	grouping used mostly for alternation and
335 *				optionality
336 *
337 *	The irregular expression must be translated to internal form
338 *	prior to calling this routine
339 *
340 *	The value returned is the pointer to the first non \a
341 *	character matched.
342 */
343
344/*
345 *  s: string to check for a match in
346 *  re: a converted irregular expression
347 *  mstring: where to put whatever matches a \p
348 */
349char *
350expmatch (register char *s, register char *re, register char *mstring)
351{
352    register char *cs;		/* the current symbol */
353    register char *ptr,*s1;	/* temporary pointer */
354    bool matched;	/* a temporary bool */
355
356    /* initial conditions */
357    if (re == NULL)
358	return (NULL);
359    cs = re;
360    matched = false;
361
362    /* loop till expression string is exhausted (or at least pretty tired) */
363    while (*cs) {
364	switch (*cs & (OPER | STR | META)) {
365
366	/* try to match a string */
367	case STR:
368	    matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
369	    if (matched) {
370
371		/* hoorah it matches */
372		s += SCNT(cs);
373		cs = SNEXT(cs);
374	    } else if (*cs & ALT) {
375
376		/* alternation, skip to next expression */
377		cs = SNEXT(cs);
378	    } else if (*cs & OPT) {
379
380		/* the match is optional */
381		cs = SNEXT(cs);
382		matched = 1;		/* indicate a successful match */
383	    } else {
384
385		/* no match, error return */
386		return (NULL);
387	    }
388	    break;
389
390	/* an operator, do something fancy */
391	case OPER:
392	    switch (OSYM(cs)) {
393
394	    /* this is an alternation */
395	    case '|':
396		if (matched)
397
398		    /* last thing in the alternation was a match, skip ahead */
399		    cs = OPTR(cs);
400		else
401
402		    /* no match, keep trying */
403		    cs = ONEXT(cs);
404		break;
405
406	    /* this is a grouping, recurse */
407	    case '(':
408		ptr = expmatch(s, ONEXT(cs), mstring);
409		if (ptr != NULL) {
410
411		    /* the subexpression matched */
412		    matched = 1;
413		    s = ptr;
414		} else if (*cs & ALT) {
415
416		    /* alternation, skip to next expression */
417		    matched = 0;
418		} else if (*cs & OPT) {
419
420		    /* the match is optional */
421		    matched = 1;	/* indicate a successful match */
422		} else {
423
424		    /* no match, error return */
425		    return (NULL);
426		}
427		cs = OPTR(cs);
428		break;
429	    }
430	    break;
431
432	/* try to match a metasymbol */
433	case META:
434	    switch (MSYM(cs)) {
435
436	    /* try to match anything and remember what was matched */
437	    case 'p':
438		/*
439		 *  This is really the same as trying the match the
440		 *  remaining parts of the expression to any subset
441		 *  of the string.
442		 */
443		s1 = s;
444		do {
445		    ptr = expmatch(s1, MNEXT(cs), mstring);
446		    if (ptr != NULL && s1 != s) {
447
448			/* we have a match, remember the match */
449			strncpy (mstring, s, s1 - s);
450			mstring[s1 - s] = '\0';
451			return (ptr);
452		    } else if (ptr != NULL && (*cs & OPT)) {
453
454			/* it was aoptional so no match is ok */
455			return (ptr);
456		    } else if (ptr != NULL) {
457
458			/* not optional and we still matched */
459			return (NULL);
460		    }
461		    if (!(isalnum(*s1) || *s1 == '_' ||
462			  /* C++ destructor */
463			  *s1 == '~' ||
464			  /* C++ scope operator */
465			  (strlen(s1) > 1 && *s1 == ':' && s1[1] == ':' &&
466			   (s1++, true))))
467			return (NULL);
468		    if (*s1 == '\\')
469			_escaped = _escaped ? false : true;
470		    else
471			_escaped = false;
472		} while (*s1++);
473		return (NULL);
474
475	    /* try to match anything */
476	    case 'a':
477		/*
478		 *  This is really the same as trying the match the
479		 *  remaining parts of the expression to any subset
480		 *  of the string.
481		 */
482		s1 = s;
483		do {
484		    ptr = expmatch(s1, MNEXT(cs), mstring);
485		    if (ptr != NULL && s1 != s) {
486
487			/* we have a match */
488			return (ptr);
489		    } else if (ptr != NULL && (*cs & OPT)) {
490
491			/* it was aoptional so no match is ok */
492			return (ptr);
493		    } else if (ptr != NULL) {
494
495			/* not optional and we still matched */
496			return (NULL);
497		    }
498		    if (*s1 == '\\')
499			_escaped = _escaped ? false : true;
500		    else
501			_escaped = false;
502		} while (*s1++);
503		return (NULL);
504
505	    /* fail if we are currently _escaped */
506	    case 'e':
507		if (_escaped)
508		    return(NULL);
509		cs = MNEXT(cs);
510		break;
511
512	    /* match any number of tabs and spaces */
513	    case 'd':
514		ptr = s;
515		while (*s == ' ' || *s == '\t')
516		    s++;
517		if (s != ptr || s == s_start) {
518
519		    /* match, be happy */
520		    matched = 1;
521		    cs = MNEXT(cs);
522		} else if (*s == '\n' || *s == '\0') {
523
524		    /* match, be happy */
525		    matched = 1;
526		    cs = MNEXT(cs);
527		} else if (*cs & ALT) {
528
529		    /* try the next part */
530		    matched = 0;
531		    cs = MNEXT(cs);
532		} else if (*cs & OPT) {
533
534		    /* doesn't matter */
535		    matched = 1;
536		    cs = MNEXT(cs);
537		} else
538
539		    /* no match, error return */
540		    return (NULL);
541		break;
542
543	    /* check for end of line */
544	    case '$':
545		if (*s == '\0' || *s == '\n') {
546
547		    /* match, be happy */
548		    s++;
549		    matched = 1;
550		    cs = MNEXT(cs);
551		} else if (*cs & ALT) {
552
553		    /* try the next part */
554		    matched = 0;
555		    cs = MNEXT(cs);
556		} else if (*cs & OPT) {
557
558		    /* doesn't matter */
559		    matched = 1;
560		    cs = MNEXT(cs);
561		} else
562
563		    /* no match, error return */
564		    return (NULL);
565		break;
566
567	    /* check for start of line */
568	    case '^':
569		if (s == s_start) {
570
571		    /* match, be happy */
572		    matched = 1;
573		    cs = MNEXT(cs);
574		} else if (*cs & ALT) {
575
576		    /* try the next part */
577		    matched = 0;
578		    cs = MNEXT(cs);
579		} else if (*cs & OPT) {
580
581		    /* doesn't matter */
582		    matched = 1;
583		    cs = MNEXT(cs);
584		} else
585
586		    /* no match, error return */
587		    return (NULL);
588		break;
589
590	    /* end of a subexpression, return success */
591	    case ')':
592		return (s);
593	    }
594	    break;
595	}
596    }
597    return (s);
598}
599