1/*
2 * Copyright (C) 2013  Internet Systems Consortium, Inc. ("ISC")
3 *
4 * Permission to use, copy, modify, and/or distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14 * PERFORMANCE OF THIS SOFTWARE.
15 */
16
17#include <config.h>
18
19#include <isc/file.h>
20#include <isc/regex.h>
21#include <isc/string.h>
22
/*
 * FAIL(x): abandon validation by jumping to the enclosing function's
 * 'error' label.  When VALREGEX_REPORT_REASON is non-zero the message 'x'
 * is first stored in the local variable 'reason' so that it can be
 * reported; otherwise 'x' is discarded.
 */
#if VALREGEX_REPORT_REASON
#define FAIL(x)			\
	do {			\
		reason = (x);	\
		goto error;	\
	} while (0)
#else
#define FAIL(x) do { goto error; } while (0)
#endif
28
29/*
30 * Validate the regular expression 'C' locale.
31 */
32int
33isc_regex_validate(const char *c) {
34	enum {
35		none, parse_bracket, parse_bound,
36		parse_ce, parse_ec, parse_cc
37	} state = none;
38	/* Well known character classes. */
39	const char *cc[] = {
40		":alnum:", ":digit:", ":punct:", ":alpha:", ":graph:",
41		":space:", ":blank:", ":lower:", ":upper:", ":cntrl:",
42		":print:", ":xdigit:"
43	};
44	isc_boolean_t seen_comma = ISC_FALSE;
45	isc_boolean_t seen_high = ISC_FALSE;
46	isc_boolean_t seen_char = ISC_FALSE;
47	isc_boolean_t seen_ec = ISC_FALSE;
48	isc_boolean_t seen_ce = ISC_FALSE;
49	isc_boolean_t have_atom = ISC_FALSE;
50	int group = 0;
51	int range = 0;
52	int sub = 0;
53	isc_boolean_t empty_ok = ISC_FALSE;
54	isc_boolean_t neg = ISC_FALSE;
55	isc_boolean_t was_multiple = ISC_FALSE;
56	unsigned int low = 0;
57	unsigned int high = 0;
58	const char *ccname = NULL;
59	int range_start = 0;
60#if VALREGEX_REPORT_REASON
61	const char *reason = "";
62#endif
63
64	if (c == NULL || *c == 0)
65		FAIL("empty string");
66
67	while (c != NULL && *c != 0) {
68		switch (state) {
69		case none:
70			switch (*c) {
71			case '\\':	/* make literal */
72				++c;
73				switch (*c) {
74				case '1': case '2': case '3':
75				case '4': case '5': case '6':
76				case '7': case '8': case '9':
77					if ((*c - '0') > sub)
78						FAIL("bad back reference");
79					have_atom = ISC_TRUE;
80					was_multiple = ISC_FALSE;
81					break;
82				case 0:
83					FAIL("escaped end-of-string");
84				default:
85					goto literal;
86				}
87				++c;
88				break;
89			case '[':	/* bracket start */
90				++c;
91				neg = ISC_FALSE;
92				was_multiple = ISC_FALSE;
93				seen_char = ISC_FALSE;
94				state = parse_bracket;
95				break;
96			case '{': 	/* bound start */
97				switch (c[1]) {
98				case '0': case '1': case '2': case '3':
99				case '4': case '5': case '6': case '7':
100				case '8': case '9':
101					if (!have_atom)
102						FAIL("no atom");
103					if (was_multiple)
104						FAIL("was multiple");
105					seen_comma = ISC_FALSE;
106					seen_high = ISC_FALSE;
107					low = high = 0;
108					state = parse_bound;
109					break;
110				default:
111					goto literal;
112				}
113				++c;
114				have_atom = ISC_TRUE;
115				was_multiple = ISC_TRUE;
116				break;
117			case '}':
118				goto literal;
119			case '(':	/* group start */
120				have_atom = ISC_FALSE;
121				was_multiple = ISC_FALSE;
122				empty_ok = ISC_TRUE;
123				++group;
124				++sub;
125				++c;
126				break;
127			case ')':	/* group end */
128				if (group && !have_atom && !empty_ok)
129					FAIL("empty alternative");
130				have_atom = ISC_TRUE;
131				was_multiple = ISC_FALSE;
132				if (group != 0)
133					--group;
134				++c;
135				break;
136			case '|':	/* alternative seperator */
137				if (!have_atom)
138					FAIL("no atom");
139				have_atom = ISC_FALSE;
140				empty_ok = ISC_FALSE;
141				was_multiple = ISC_FALSE;
142				++c;
143				break;
144			case '^':
145			case '$':
146				have_atom = ISC_TRUE;
147				was_multiple = ISC_TRUE;
148				++c;
149				break;
150			case '+':
151			case '*':
152			case '?':
153				if (was_multiple)
154					FAIL("was multiple");
155				if (!have_atom)
156					FAIL("no atom");
157				have_atom = ISC_TRUE;
158				was_multiple = ISC_TRUE;
159				++c;
160				break;
161			case '.':
162			default:
163			literal:
164				have_atom = ISC_TRUE;
165				was_multiple = ISC_FALSE;
166				++c;
167				break;
168			}
169			break;
170		case parse_bound:
171			switch (*c) {
172			case '0': case '1': case '2': case '3': case '4':
173			case '5': case '6': case '7': case '8': case '9':
174				if (!seen_comma) {
175					low = low * 10 + *c - '0';
176					if (low > 255)
177						FAIL("lower bound too big");
178				} else {
179					seen_high = ISC_TRUE;
180					high = high * 10 + *c - '0';
181					if (high > 255)
182						FAIL("upper bound too big");
183				}
184				++c;
185				break;
186			case ',':
187				if (seen_comma)
188					FAIL("multiple commas");
189				seen_comma = ISC_TRUE;
190				++c;
191				break;
192			default:
193			case '{':
194				FAIL("non digit/comma");
195			case '}':
196				if (seen_high && low > high)
197					FAIL("bad parse bound");
198				seen_comma = ISC_FALSE;
199				state = none;
200				++c;
201				break;
202			}
203			break;
204		case parse_bracket:
205			switch (*c) {
206			case '^':
207				if (seen_char || neg) goto inside;
208				neg = ISC_TRUE;
209				++c;
210				break;
211			case '-':
212				if (range == 2) goto inside;
213				if (!seen_char) goto inside;
214				if (range == 1)
215					FAIL("bad range");
216				range = 2;
217				++c;
218				break;
219			case '[':
220				++c;
221				switch (*c) {
222				case '.':	/* collating element */
223					if (range) --range;
224					++c;
225					state = parse_ce;
226					seen_ce = ISC_FALSE;
227					break;
228				case '=':	/* equivalence class */
229					if (range == 2)
230					    FAIL("equivalence class in range");
231					++c;
232					state = parse_ec;
233					seen_ec = ISC_FALSE;
234					break;
235				case ':':	/* character class */
236					if (range == 2)
237					      FAIL("character class in range");
238					ccname = c;
239					++c;
240					state = parse_cc;
241					break;
242				}
243				seen_char = ISC_TRUE;
244				break;
245			case ']':
246				if (!c[1] && !seen_char)
247					FAIL("unfinished brace");
248				if (!seen_char)
249					goto inside;
250				++c;
251				range = 0;
252				have_atom = ISC_TRUE;
253				state = none;
254				break;
255			default:
256			inside:
257				seen_char = ISC_TRUE;
258				if (range == 2 && *c < range_start)
259					FAIL("out of order range");
260				if (range != 0)
261					--range;
262				range_start = *c;
263				++c;
264				break;
265			};
266			break;
267		case parse_ce:
268			switch (*c) {
269			case '.':
270				++c;
271				switch (*c) {
272				case ']':
273					if (!seen_ce)
274						 FAIL("empty ce");
275					++c;
276					state = parse_bracket;
277					break;
278				default:
279					if (seen_ce)
280						range_start = 256;
281					else
282						range_start = '.';
283					seen_ce = ISC_TRUE;
284					break;
285				}
286				break;
287			default:
288				if (seen_ce)
289					range_start = 256;
290				else
291					range_start = *c;
292				seen_ce = ISC_TRUE;
293				++c;
294				break;
295			}
296			break;
297		case parse_ec:
298			switch (*c) {
299			case '=':
300				++c;
301				switch (*c) {
302				case ']':
303					if (!seen_ec)
304						FAIL("no ec");
305					++c;
306					state = parse_bracket;
307					break;
308				default:
309					seen_ec = ISC_TRUE;
310					break;
311				}
312				break;
313			default:
314				seen_ec = ISC_TRUE;
315				++c;
316				break;
317			}
318			break;
319		case parse_cc:
320			switch (*c) {
321			case ':':
322				++c;
323				switch (*c) {
324				case ']': {
325					unsigned int i;
326					isc_boolean_t found = ISC_FALSE;
327					for (i = 0;
328					     i < sizeof(cc)/sizeof(*cc);
329					     i++)
330					{
331						unsigned int len;
332						len = strlen(cc[i]);
333						if (len !=
334						    (unsigned int)(c - ccname))
335							continue;
336						if (strncmp(cc[i], ccname, len))
337							continue;
338						found = ISC_TRUE;
339					}
340					if (!found)
341						FAIL("unknown cc");
342					++c;
343					state = parse_bracket;
344					break;
345					}
346				default:
347					break;
348				}
349				break;
350			default:
351				++c;
352				break;
353			}
354			break;
355		}
356	}
357	if (group != 0)
358		FAIL("group open");
359	if (state != none)
360		FAIL("incomplete");
361	if (!have_atom)
362		FAIL("no atom");
363	return (sub);
364
365 error:
366#if VALREGEX_REPORT_REASON
367	fprintf(stderr, "%s\n", reason);
368#endif
369	return (-1);
370}
371