1/*	$NetBSD: att.c,v 1.6 2011/11/06 16:43:25 christos Exp $	*/
2
3/*-
4 * Copyright (c) 2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Christos Zoulas.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *        This product includes software developed by the NetBSD
21 *        Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 *    contributors may be used to endorse or promote products derived
24 *    from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39#include <sys/cdefs.h>
40__RCSID("$NetBSD: att.c,v 1.6 2011/11/06 16:43:25 christos Exp $");
41
42#include <stdio.h>
43#include <regex.h>
44#include <string.h>
45#include <stdlib.h>
46#include <vis.h>
47#include <ctype.h>
48#include <atf-c.h>
49
50ATF_TC(regex_att);
51
52ATF_TC_HEAD(regex_att, tc)
53{
54
55	atf_tc_set_md_var(tc, "descr", "Driver for parsing AT&T format"
56	    " input files available from:"
57	    " http://www2.research.att.com/~gsf/testregex/");
58}
59
60static const char sep[] = "\r\n\t";
61static const char delim[3] = "\\\\\0";
62
63
64static void
65fail(const char *pattern, const char *input, size_t lineno) {
66	fprintf(stderr,
67	    "skipping failed test at line %zu (pattern=%s, input=%s)\n",
68	    lineno, pattern, input);
69}
70
71static int
72bug(const char *pattern, const char *input, size_t lineno) {
73	static const struct {
74		const char *p;
75		const char *i;
76	} b[] = {
77#if defined(REGEX_SPENCER)
78		/*
79		 * The default libc implementation by Henry Spencer
80		 */
81		{ "a[-]?c", "ac" },			// basic.dat
82		{ "(a*)*", "a" },			// categorization.dat
83		{ "(aba|a*b)*", "ababa" },		// categorization.dat
84		{ "\\(a\\(b\\)*\\)*\\2", "abab" },	// categorization.dat
85		{ "(a*)*", "aaaaaa" },			// nullsubexpression.dat
86		{ "(a*)*", "aaaaaax" },			// nullsubexpression.dat
87		{ "(a*)+", "a" },			// nullsubexpression.dat
88		{ "(a*)+", "aaaaaa" },			// nullsubexpression.dat
89		{ "(a*)+", "aaaaaax" },			// nullsubexpression.dat
90		{ "([a]*)*", "a" },			// nullsubexpression.dat
91		{ "([a]*)*", "aaaaaa" },		// nullsubexpression.dat
92		{ "([a]*)*", "aaaaaax" },		// nullsubexpression.dat
93		{ "([a]*)+", "a" },			// nullsubexpression.dat
94		{ "([a]*)+", "aaaaaa" },		// nullsubexpression.dat
95		{ "([a]*)+", "aaaaaax" },		// nullsubexpression.dat
96		{ "([^b]*)*", "a" },			// nullsubexpression.dat
97		{ "([^b]*)*", "aaaaaa" },		// nullsubexpression.dat
98		{ "([^b]*)*", "aaaaaab" },		// nullsubexpression.dat
99		{ "([ab]*)*", "a" },			// nullsubexpression.dat
100		{ "([ab]*)*", "aaaaaa" },		// nullsubexpression.dat
101		{ "([ab]*)*", "ababab" },		// nullsubexpression.dat
102		{ "([ab]*)*", "bababa" },		// nullsubexpression.dat
103		{ "([ab]*)*", "b" },			// nullsubexpression.dat
104		{ "([ab]*)*", "bbbbbb" },		// nullsubexpression.dat
105		{ "([ab]*)*", "aaaabcde" },		// nullsubexpression.dat
106		{ "([^a]*)*", "b" },			// nullsubexpression.dat
107		{ "([^a]*)*", "bbbbbb" },		// nullsubexpression.dat
108		{ "([^ab]*)*", "ccccxx" },		// nullsubexpression.dat
109		{ "\\(a*\\)*\\(x\\)", "ax" },		// nullsubexpression.dat
110		{ "\\(a*\\)*\\(x\\)", "axa" },		// nullsubexpression.dat
111		{ "\\(a*\\)*\\(x\\)\\(\\1\\)", "x" },	// nullsubexpression.dat
112/* crash! */	{ "\\(a*\\)*\\(x\\)\\(\\1\\)", "ax" },	// nullsubexpression.dat
113/* crash! */	{ "\\(a*\\)*\\(x\\)\\(\\1\\)\\(x\\)", "axxa" },	// ""
114		{ "(a*)*(x)",  "ax" },			// nullsubexpression.dat
115		{ "(a*)*(x)",  "axa" },			// nullsubexpression.dat
116		{ "(a*)+(x)",  "ax" },			// nullsubexpression.dat
117		{ "(a*)+(x)",  "axa" },			// nullsubexpression.dat
118		{ "((a|ab)(c|bcd))(d*)", "abcd" },	// forcedassoc.dat
119		{ "((a|ab)(bcd|c))(d*)", "abcd" },	// forcedassoc.dat
120		{ "((ab|a)(c|bcd))(d*)", "abcd" },	// forcedassoc.dat
121		{ "((ab|a)(bcd|c))(d*)", "abcd" },	// forcedassoc.dat
122		{ "((a*)(b|abc))(c*)", "abc" },		// forcedassoc.dat
123		{ "((a*)(abc|b))(c*)", "abc" },		// forcedassoc.dat
124		{ "((..)|(.)){2}", "aaa" },		// repetition.dat
125		{ "((..)|(.)){3}", "aaa" },		// repetition.dat
126		{ "((..)|(.)){3}", "aaaa" },		// repetition.dat
127		{ "((..)|(.)){3}", "aaaaa" },		// repetition.dat
128		{ "X(.?){0,}Y", "X1234567Y" },		// repetition.dat
129		{ "X(.?){1,}Y", "X1234567Y" },		// repetition.dat
130		{ "X(.?){2,}Y", "X1234567Y" },		// repetition.dat
131		{ "X(.?){3,}Y", "X1234567Y" },		// repetition.dat
132		{ "X(.?){4,}Y", "X1234567Y" },		// repetition.dat
133		{ "X(.?){5,}Y", "X1234567Y" },		// repetition.dat
134		{ "X(.?){6,}Y", "X1234567Y" },		// repetition.dat
135		{ "X(.?){7,}Y", "X1234567Y" },		// repetition.dat
136		{ "X(.?){0,8}Y", "X1234567Y" },		// repetition.dat
137		{ "X(.?){1,8}Y", "X1234567Y" },		// repetition.dat
138		{ "X(.?){2,8}Y", "X1234567Y" },		// repetition.dat
139		{ "X(.?){3,8}Y", "X1234567Y" },		// repetition.dat
140		{ "X(.?){4,8}Y", "X1234567Y" },		// repetition.dat
141		{ "X(.?){5,8}Y", "X1234567Y" },		// repetition.dat
142		{ "X(.?){6,8}Y", "X1234567Y" },		// repetition.dat
143		{ "X(.?){7,8}Y", "X1234567Y" },		// repetition.dat
144		{ "(a|ab|c|bcd){0,}(d*)", "ababcd" },	// repetition.dat
145		{ "(a|ab|c|bcd){1,}(d*)", "ababcd" },	// repetition.dat
146		{ "(a|ab|c|bcd){2,}(d*)", "ababcd" },	// repetition.dat
147		{ "(a|ab|c|bcd){3,}(d*)", "ababcd" },	// repetition.dat
148		{ "(a|ab|c|bcd){1,10}(d*)", "ababcd" },	// repetition.dat
149		{ "(a|ab|c|bcd){2,10}(d*)", "ababcd" },	// repetition.dat
150		{ "(a|ab|c|bcd){3,10}(d*)", "ababcd" },	// repetition.dat
151		{ "(a|ab|c|bcd)*(d*)", "ababcd" },	// repetition.dat
152		{ "(a|ab|c|bcd)+(d*)", "ababcd" },	// repetition.dat
153		{ "(ab|a|c|bcd){0,}(d*)", "ababcd" },	// repetition.dat
154		{ "(ab|a|c|bcd){1,}(d*)", "ababcd" },	// repetition.dat
155		{ "(ab|a|c|bcd){2,}(d*)", "ababcd" },	// repetition.dat
156		{ "(ab|a|c|bcd){3,}(d*)", "ababcd" },	// repetition.dat
157		{ "(ab|a|c|bcd){1,10}(d*)", "ababcd" },	// repetition.dat
158		{ "(ab|a|c|bcd){2,10}(d*)", "ababcd" },	// repetition.dat
159		{ "(ab|a|c|bcd){3,10}(d*)", "ababcd" },	// repetition.dat
160		{ "(ab|a|c|bcd)*(d*)", "ababcd" },	// repetition.dat
161		{ "(ab|a|c|bcd)+(d*)", "ababcd" },	// repetition.dat
162#elif defined(REGEX_TRE)
163		{ "a[-]?c", "ac" },			// basic.dat
164		{ "a\\(b\\)*\\1", "a" },		// categorization.dat
165		{ "a\\(b\\)*\\1", "abab" },		// categorization.dat
166		{ "\\(a\\(b\\)*\\)*\\2", "abab" },	// categorization.dat
167		{ "\\(a*\\)*\\(x\\)\\(\\1\\)", "ax" },	// categorization.dat
168		{ "\\(a*\\)*\\(x\\)\\(\\1\\)\\(x\\)", "axxa" },	// ""
169		{ "((..)|(.))*", "aa" },		// repetition.dat
170		{ "((..)|(.))*", "aaa" },		// repetition.dat
171		{ "((..)|(.))*", "aaaaa" },		// repetition.dat
172		{ "X(.?){7,}Y", "X1234567Y" },		// repetition.dat
173#else
174		{ "", "" }
175#endif
176	};
177
178	for (size_t i = 0; i < __arraycount(b); i++) {
179		if (strcmp(pattern, b[i].p) == 0 &&
180		    strcmp(input, b[i].i) == 0) {
181			fail(pattern, input, lineno);
182			return 1;
183		}
184	}
185	return 0;
186}
187
188#ifdef REGEX_SPENCER
189#define HAVE_BRACES	1
190#define HAVE_MINIMAL	0
191#endif
192#ifndef HAVE_BRACES
193#define HAVE_BRACES	1
194#endif
195#ifndef HAVE_MINIMAL
196#define HAVE_MINIMAL	1
197#endif
198
199static int
200optional(const char *s)
201{
202	static const struct{
203		const char *n;
204		int v;
205	} nv[]= {
206		{ "[[<element>]] not supported", HAVE_BRACES },
207		{ "no *? +? mimimal match ops", HAVE_MINIMAL },
208	};
209
210	for (size_t i = 0; i < __arraycount(nv); i++)
211		if (strcmp(nv[i].n, s) == 0) {
212			if (nv[i].v)
213				return 0;
214			fprintf(stderr, "skipping unsupported [%s] tests\n", s);
215			return 1;
216		}
217
218	ATF_REQUIRE_MSG(0, "Unknown feature: %s", s);
219	return 0;
220}
221
222static int
223unsupported(const char *s)
224{
225	static const char *we[] = {
226#if defined(REGEX_SPENCER)
227		"ASSOCIATIVITY=left",		// have right associativity
228		"SUBEXPRESSION=precedence",	// have grouping subexpression
229		"REPEAT_LONGEST=last",		// have first repeat longest
230		"BUG=alternation-order",	// don't have it
231		"BUG=first-match",		// don't have it
232		"BUG=nomatch-match",		// don't have it
233		"BUG=repeat-any",		// don't have it
234		"BUG=range-null",		// don't have it
235		"BUG=repeat-null-unknown",	// don't have it
236		"BUG=repeat-null",		// don't have it
237		"BUG=repeat-artifact",		// don't have it
238		"BUG=subexpression-first",	// don't have it
239#elif defined(REGEX_TRE)
240		"ASSOCIATIVITY=right",		// have left associativity
241		"SUBEXPRESSION=grouping",	// have precedence subexpression
242		"REPEAT_LONGEST=first",		// have last repeat longest
243		"LENGTH=first",			// have last length
244		"BUG=alternation-order",	// don't have it
245		"BUG=first-match",		// don't have it
246		"BUG=range-null",		// don't have it
247		"BUG=repeat-null",		// don't have it
248		"BUG=repeat-artifact",		// don't have it
249		"BUG=subexpression-first",	// don't have it
250		"BUG=repeat-short",		// don't have it
251#endif
252	};
253
254	if (s == NULL)
255		return 0;
256
257	while (*s == '#' || isspace((unsigned char)*s))
258		s++;
259
260	for (size_t i = 0; i < __arraycount(we); i++)
261		if (strcmp(we[i], s) == 0)
262			return 1;
263	return 0;
264}
265
266static void
267geterror(const char *s, int *comp, int *exec)
268{
269	static const struct {
270		const char *n;
271		int v;
272		int ce;
273	} nv[] = {
274#define COMP 1
275#define EXEC 2
276		{ "OK", 0, COMP|EXEC },
277#define _DO(a, b)	{ # a, REG_ ## a, b },
278		_DO(NOMATCH, EXEC)
279		_DO(BADPAT, COMP)
280		_DO(ECOLLATE, COMP)
281		_DO(ECTYPE, COMP)
282		_DO(EESCAPE, COMP)
283		_DO(ESUBREG, COMP)
284		_DO(EBRACK, COMP)
285		_DO(EPAREN, COMP)
286		_DO(EBRACE, COMP)
287		_DO(BADBR, COMP)
288		_DO(ERANGE, COMP)
289		_DO(ESPACE, EXEC)
290		_DO(BADRPT, COMP)
291		_DO(EMPTY, COMP)
292		_DO(ASSERT, COMP)
293		_DO(INVARG, COMP)
294		_DO(ENOSYS, COMP)
295#undef _DO
296	};
297	*comp = 0;
298	*exec = 0;
299	for (size_t i = 0; i < __arraycount(nv); i++)
300		if (strcmp(s, nv[i].n) == 0) {
301			if (nv[i].ce & COMP)
302				*comp = nv[i].v;
303			if (nv[i].ce & EXEC)
304				*exec = nv[i].v;
305			return;
306		}
307	ATF_REQUIRE_MSG(0, "Unknown error %s", s);
308	return;
309}
310
311static int
312getflags(char *s)
313{
314	int flags = 0;
315
316	for (;; s++)
317		switch (*s) {
318		case '0': case '1': case '2': case '3': case '4':
319		case '5': case '6': case '7': case '8': case '9':
320			*s = '\0';
321			break;
322		case '\0':
323			return flags;
324		case 'B':
325		case 'E':
326		case 'F':
327		case 'L':
328			break;
329		case 'i':
330			flags |= REG_ICASE;
331			*s = '\0';
332			break;
333		case '$':
334			*s = '\0';
335			break;
336		case 'n':
337			*s = '\0';
338			break;
339		default:
340			ATF_REQUIRE_MSG(0, "Unknown char %c", *s);
341			break;
342		}
343}
344
345static size_t
346getmatches(const char *s)
347{
348	size_t i;
349	char *q;
350	for (i = 0; (q = strchr(s, '(')) != NULL; i++, s = q + 1)
351		continue;
352	ATF_REQUIRE_MSG(i != 0, "No parentheses found");
353	return i;
354}
355
356static void
357checkcomment(const char *s, size_t lineno)
358{
359	if (s && strstr(s, "BUG") != NULL)
360		fprintf(stderr, "Expected %s at line %zu\n", s, lineno);
361}
362
363static void
364checkmatches(const char *matches, size_t nm, const regmatch_t *pm,
365    size_t lineno)
366{
367	if (nm == 0)
368		return;
369
370	char *res;
371	size_t len = strlen(matches) + 1, off = 0;
372
373	ATF_REQUIRE((res = strdup(matches)) != NULL);
374	for (size_t i = 0; i < nm; i++) {
375		int l;
376		if (pm[i].rm_so == -1 && pm[i].rm_eo == -1)
377			l = snprintf(res + off, len - off, "(?,?)");
378		else
379			l = snprintf(res + off, len - off, "(%lld,%lld)",
380			    (long long)pm[i].rm_so, (long long)pm[i].rm_eo);
381		ATF_REQUIRE_MSG((size_t) l < len - off, "String too long %s"
382		    " cur=%d, max=%zu", res, l, len - off);
383		off += l;
384	}
385	ATF_REQUIRE_STREQ_MSG(res, matches, " at line %zu", lineno);
386	free(res);
387}
388
389ATF_TC_BODY(regex_att, tc)
390{
391	regex_t re;
392	char *line, *lastpattern = NULL;
393	size_t len, lineno = 0;
394	int skipping = 0;
395
396	for (; (line = fparseln(stdin, &len, &lineno, delim, 0))
397	    != NULL; free(line)) {
398		char *name, *pattern, *input, *matches, *comment;
399		regmatch_t *pm;
400		size_t nm;
401#ifdef DEBUG
402		fprintf(stderr, "[%s]\n", line);
403#endif
404		if ((name = strtok(line, sep)) == NULL)
405			continue;
406
407		/*
408		 * We check these early so that we skip the lines quickly
409		 * in order to do more strict testing on the other arguments
410		 * The same characters are also tested in the switch below
411		 */
412		if (*name == '}') {
413			skipping = 0;
414			continue;
415		}
416		if (skipping)
417			continue;
418		if (*name == ';' || *name == '#' || strcmp(name, "NOTE") == 0)
419			continue;
420		if (*name == ':') {
421			/* Skip ":HA#???:" prefix */
422			while (*++name && *name != ':')
423				continue;
424			if (*name)
425				name++;
426		}
427
428		ATF_REQUIRE_MSG((pattern = strtok(NULL, sep)) != NULL,
429			"Missing pattern at line %zu", lineno);
430		ATF_REQUIRE_MSG((input = strtok(NULL, sep)) != NULL,
431			"Missing input at line %zu", lineno);
432
433		if (strchr(name, '$')) {
434			ATF_REQUIRE(strunvis(pattern, pattern) != -1);
435			ATF_REQUIRE(strunvis(input, input) != -1);
436		}
437
438
439		if (strcmp(input, "NULL") == 0)
440			*input = '\0';
441
442		if (strcmp(pattern, "SAME") == 0) {
443			ATF_REQUIRE(lastpattern != NULL);
444			pattern = lastpattern;
445		} else {
446			free(lastpattern);
447			ATF_REQUIRE((lastpattern = strdup(pattern)) != NULL);
448		}
449
450		ATF_REQUIRE_MSG((matches = strtok(NULL, sep)) != NULL,
451		    "Missing matches at line %zu", lineno);
452
453		comment = strtok(NULL, sep);
454		switch (*name) {
455		case '{':	/* Begin optional implementation */
456			if (optional(comment)) {
457				skipping++;
458				continue;
459			}
460			name++;	/* We have it, so ignore */
461			break;
462		case '}':	/* End optional implementation */
463			skipping = 0;
464			continue;
465		case '?':	/* Optional */
466		case '|':	/* Alternative */
467			if (unsupported(comment))
468				continue;
469			name++;	/* We have it, so ignore */
470			break;
471		case '#':	/* Comment */
472		case ';':	/* Skip */
473			continue;
474		default:
475			break;
476		}
477
478		/* XXX: Our bug */
479		if (bug(pattern, input, lineno))
480			continue;
481
482		int comp, exec;
483		if (*matches != '(') {
484			geterror(matches, &comp, &exec);
485			pm = NULL;
486			nm = 0;
487		} else {
488			comp = exec = 0;
489			nm = getmatches(matches);
490			ATF_REQUIRE((pm = calloc(nm, sizeof(*pm))) != NULL);
491		}
492
493
494
495		int iflags = getflags(name);
496		for (; *name; name++) {
497			int flags;
498			switch (*name) {
499			case 'B':
500				flags = REG_BASIC;
501				break;
502			case 'E':
503				flags = REG_EXTENDED;
504				break;
505			case 'L':
506				flags = REG_NOSPEC;
507				break;
508			default:
509				ATF_REQUIRE_MSG(0, "Bad name %c", *name);
510				continue;
511			}
512			int c = regcomp(&re, pattern, flags | iflags);
513			ATF_REQUIRE_MSG(c == comp,
514			    "regcomp returned %d for pattern %s at line %zu",
515			    c, pattern, lineno);
516			if (c)
517				continue;
518			int e = regexec(&re, input, nm, pm, 0);
519			ATF_REQUIRE_MSG(e == exec, "Expected error %d,"
520			    " got %d at line %zu", exec, e, lineno);
521			checkmatches(matches, nm, pm, lineno);
522			checkcomment(comment, lineno);
523			regfree(&re);
524		}
525		free(pm);
526	}
527}
528
529ATF_TP_ADD_TCS(tp)
530{
531
532	ATF_TP_ADD_TC(tp, regex_att);
533	return atf_no_error();
534}
535