1/*
2 * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
3 * All rights reserved
4 *
5 * "THE BEER-WARE LICENSE" (Revision 42):
6 * Sergey Lyubka wrote this file.  As long as you retain this notice you
7 * can do whatever you want with this stuff. If we meet some day, and you think
8 * this stuff is worth it, you can buy me a beer in return.
9 */
10
11/*
12 * Downloaded Sat Nov  5 17:43:06 CET 2011 at
13 * http://slre.sourceforge.net/1.0/slre.c
14 */
15
16#ifdef SLRE_TEST
17#include <stdio.h>
18#include <assert.h>
19#include <ctype.h>
20#include <stdlib.h>
21#include <string.h>
22#else
23#include <log.h>
24#include <linux/ctype.h>
25#include <linux/string.h>
26#endif /* SLRE_TEST */
27
28#include <errno.h>
29
30#include <slre.h>
31
32enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
33	STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
34
35#ifdef SLRE_TEST
36static struct {
37	const char	*name;
38	int		narg;
39	const char	*flags;
40} opcodes[] = {
41	{"END",		0, ""},		/* End of code block or program	*/
42	{"BRANCH",	2, "oo"},	/* Alternative operator, "|"	*/
43	{"ANY",		0, ""},		/* Match any character, "."	*/
44	{"EXACT",	2, "d"},	/* Match exact string		*/
45	{"ANYOF",	2, "D"},	/* Match any from set, "[]"	*/
46	{"ANYBUT",	2, "D"},	/* Match any but from set, "[^]"*/
47	{"OPEN ",	1, "i"},	/* Capture start, "("		*/
48	{"CLOSE",	1, "i"},	/* Capture end, ")"		*/
49	{"BOL",		0, ""},		/* Beginning of string, "^"	*/
50	{"EOL",		0, ""},		/* End of string, "$"		*/
51	{"STAR",	1, "o"},	/* Match zero or more times "*"	*/
52	{"PLUS",	1, "o"},	/* Match one or more times, "+"	*/
53	{"STARQ",	1, "o"},	/* Non-greedy STAR,  "*?"	*/
54	{"PLUSQ",	1, "o"},	/* Non-greedy PLUS, "+?"	*/
55	{"QUEST",	1, "o"},	/* Match zero or one time, "?"	*/
56	{"SPACE",	0, ""},		/* Match whitespace, "\s"	*/
57	{"NONSPACE",	0, ""},		/* Match non-space, "\S"	*/
58	{"DIGIT",	0, ""}		/* Match digit, "\d"		*/
59};
60#endif /* SLRE_TEST */
61
62/*
63 * Commands and operands are all unsigned char (1 byte long). All code offsets
64 * are relative to current address, and positive (always point forward). Data
65 * offsets are absolute. Commands with operands:
66 *
67 * BRANCH offset1 offset2
68 *	Try to match the code block that follows the BRANCH instruction
69 *	(code block ends with END). If no match, try to match code block that
70 *	starts at offset1. If either of these match, jump to offset2.
71 *
72 * EXACT data_offset data_length
73 *	Try to match exact string. String is recorded in data section from
74 *	data_offset, and has length data_length.
75 *
76 * OPEN capture_number
77 * CLOSE capture_number
78 *	If the user have passed 'struct cap' array for captures, OPEN
79 *	records the beginning of the matched substring (cap->ptr), CLOSE
80 *	sets the length (cap->len) for respective capture_number.
81 *
82 * STAR code_offset
83 * PLUS code_offset
84 * QUEST code_offset
85 *	*, +, ?, respectively. Try to gobble as much as possible from the
86 *	matched buffer, until code block that follows these instructions
87 *	matches. When the longest possible string is matched,
88 *	jump to code_offset
89 *
90 * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
91 */
92
93static const char *meta_chars = "|.^$*+?()[\\";
94
95#ifdef SLRE_TEST
96
97static void
98print_character_set(FILE *fp, const unsigned char *p, int len)
99{
100	int	i;
101
102	for (i = 0; i < len; i++) {
103		if (i > 0)
104			(void) fputc(',', fp);
105		if (p[i] == 0) {
106			i++;
107			if (p[i] == 0)
108				(void) fprintf(fp, "\\x%02x", p[i]);
109			else
110				(void) fprintf(fp, "%s", opcodes[p[i]].name);
111		} else if (isprint(p[i])) {
112			(void) fputc(p[i], fp);
113		} else {
114			(void) fprintf(fp, "\\x%02x", p[i]);
115		}
116	}
117}
118
119void
120slre_dump(const struct slre *r, FILE *fp)
121{
122	int	i, j, ch, op, pc;
123
124	for (pc = 0; pc < r->code_size; pc++) {
125
126		op = r->code[pc];
127		(void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
128
129		for (i = 0; opcodes[op].flags[i] != '\0'; i++)
130			switch (opcodes[op].flags[i]) {
131			case 'i':
132				(void) fprintf(fp, "%d ", r->code[pc + 1]);
133				pc++;
134				break;
135			case 'o':
136				(void) fprintf(fp, "%d ",
137				    pc + r->code[pc + 1] - i);
138				pc++;
139				break;
140			case 'D':
141				print_character_set(fp, r->data +
142				    r->code[pc + 1], r->code[pc + 2]);
143				pc += 2;
144				break;
145			case 'd':
146				(void) fputc('"', fp);
147				for (j = 0; j < r->code[pc + 2]; j++) {
148					ch = r->data[r->code[pc + 1] + j];
149					if (isprint(ch)) {
150						(void) fputc(ch, fp);
151					} else {
152						(void) fprintf(fp,
153							"\\x%02x", ch);
154					}
155				}
156				(void) fputc('"', fp);
157				pc += 2;
158				break;
159			}
160
161		(void) fputc('\n', fp);
162	}
163}
164#endif /* SLRE_TEST */
165
166static void
167set_jump_offset(struct slre *r, int pc, int offset)
168{
169	assert(offset < r->code_size);
170
171	if (r->code_size - offset > 0xff)
172		r->err_str = "Jump offset is too big";
173	else
174		r->code[pc] = (unsigned char) (r->code_size - offset);
175}
176
177static void
178emit(struct slre *r, int code)
179{
180	if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
181		r->err_str = "RE is too long (code overflow)";
182	else
183		r->code[r->code_size++] = (unsigned char) code;
184}
185
186static void
187store_char_in_data(struct slre *r, int ch)
188{
189	if (r->data_size >= (int) sizeof(r->data))
190		r->err_str = "RE is too long (data overflow)";
191	else
192		r->data[r->data_size++] = ch;
193}
194
195static void
196exact(struct slre *r, const char **re)
197{
198	int	old_data_size = r->data_size;
199
200	while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
201		store_char_in_data(r, *(*re)++);
202
203	emit(r, EXACT);
204	emit(r, old_data_size);
205	emit(r, r->data_size - old_data_size);
206}
207
208static int
209get_escape_char(const char **re)
210{
211	int	res;
212
213	switch (*(*re)++) {
214	case 'n':
215		res = '\n';
216		break;
217	case 'r':
218		res = '\r';
219		break;
220	case 't':
221		res = '\t';
222		break;
223	case '0':
224		res = 0;
225		break;
226	case 'S':
227		res = NONSPACE << 8;
228		break;
229	case 's':
230		res = SPACE << 8;
231		break;
232	case 'd':
233		res = DIGIT << 8;
234		break;
235	default:
236		res = (*re)[-1];
237		break;
238	}
239
240	return res;
241}
242
243static void
244anyof(struct slre *r, const char **re)
245{
246	int	esc, old_data_size = r->data_size, op = ANYOF;
247
248	if (**re == '^') {
249		op = ANYBUT;
250		(*re)++;
251	}
252
253	while (**re != '\0')
254
255		switch (*(*re)++) {
256		case ']':
257			emit(r, op);
258			emit(r, old_data_size);
259			emit(r, r->data_size - old_data_size);
260			return;
261			/* NOTREACHED */
262			break;
263		case '\\':
264			esc = get_escape_char(re);
265			if ((esc & 0xff) == 0) {
266				store_char_in_data(r, 0);
267				store_char_in_data(r, esc >> 8);
268			} else {
269				store_char_in_data(r, esc);
270			}
271			break;
272		default:
273			store_char_in_data(r, (*re)[-1]);
274			break;
275		}
276
277	r->err_str = "No closing ']' bracket";
278}
279
280static void
281relocate(struct slre *r, int begin, int shift)
282{
283	emit(r, END);
284	memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
285	r->code_size += shift;
286}
287
288static void
289quantifier(struct slre *r, int prev, int op)
290{
291	if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
292		r->code[prev + 2]--;
293		emit(r, EXACT);
294		emit(r, r->code[prev + 1] + r->code[prev + 2]);
295		emit(r, 1);
296		prev = r->code_size - 3;
297	}
298	relocate(r, prev, 2);
299	r->code[prev] = op;
300	set_jump_offset(r, prev + 1, prev);
301}
302
303static void
304exact_one_char(struct slre *r, int ch)
305{
306	emit(r, EXACT);
307	emit(r, r->data_size);
308	emit(r, 1);
309	store_char_in_data(r, ch);
310}
311
312static void
313fixup_branch(struct slre *r, int fixup)
314{
315	if (fixup > 0) {
316		emit(r, END);
317		set_jump_offset(r, fixup, fixup - 2);
318	}
319}
320
321static void
322compile(struct slre *r, const char **re)
323{
324	int	op, esc, branch_start, last_op, fixup, cap_no, level;
325
326	fixup = 0;
327	level = r->num_caps;
328	branch_start = last_op = r->code_size;
329
330	for (;;)
331		switch (*(*re)++) {
332		case '\0':
333			(*re)--;
334			return;
335			/* NOTREACHED */
336			break;
337		case '^':
338			emit(r, BOL);
339			break;
340		case '$':
341			emit(r, EOL);
342			break;
343		case '.':
344			last_op = r->code_size;
345			emit(r, ANY);
346			break;
347		case '[':
348			last_op = r->code_size;
349			anyof(r, re);
350			break;
351		case '\\':
352			last_op = r->code_size;
353			esc = get_escape_char(re);
354			if (esc & 0xff00)
355				emit(r, esc >> 8);
356			else
357				exact_one_char(r, esc);
358			break;
359		case '(':
360			last_op = r->code_size;
361			cap_no = ++r->num_caps;
362			emit(r, OPEN);
363			emit(r, cap_no);
364
365			compile(r, re);
366			if (*(*re)++ != ')') {
367				r->err_str = "No closing bracket";
368				return;
369			}
370
371			emit(r, CLOSE);
372			emit(r, cap_no);
373			break;
374		case ')':
375			(*re)--;
376			fixup_branch(r, fixup);
377			if (level == 0) {
378				r->err_str = "Unbalanced brackets";
379				return;
380			}
381			return;
382			/* NOTREACHED */
383			break;
384		case '+':
385		case '*':
386			op = (*re)[-1] == '*' ? STAR : PLUS;
387			if (**re == '?') {
388				(*re)++;
389				op = op == STAR ? STARQ : PLUSQ;
390			}
391			quantifier(r, last_op, op);
392			break;
393		case '?':
394			quantifier(r, last_op, QUEST);
395			break;
396		case '|':
397			fixup_branch(r, fixup);
398			relocate(r, branch_start, 3);
399			r->code[branch_start] = BRANCH;
400			set_jump_offset(r, branch_start + 1, branch_start);
401			fixup = branch_start + 2;
402			r->code[fixup] = 0xff;
403			break;
404		default:
405			(*re)--;
406			last_op = r->code_size;
407			exact(r, re);
408			break;
409		}
410}
411
412int
413slre_compile(struct slre *r, const char *re)
414{
415	r->err_str = NULL;
416	r->code_size = r->data_size = r->num_caps = r->anchored = 0;
417
418	if (*re == '^')
419		r->anchored++;
420
421	emit(r, OPEN);	/* This will capture what matches full RE */
422	emit(r, 0);
423
424	while (*re != '\0')
425		compile(r, &re);
426
427	if (r->code[2] == BRANCH)
428		fixup_branch(r, 4);
429
430	emit(r, CLOSE);
431	emit(r, 0);
432	emit(r, END);
433
434	return (r->err_str == NULL ? 1 : 0);
435}
436
437static int match(const struct slre *, int,
438		const char *, int, int *, struct cap *);
439
440static void
441loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
442{
443	int	saved_offset, matched_offset;
444
445	matched_offset = *ofs;
446
447	while (match(r, pc + 2, s, len, ofs, NULL)) {
448		saved_offset = *ofs;
449		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
450			matched_offset = saved_offset;
451		*ofs = saved_offset;
452	}
453
454	*ofs = matched_offset;
455}
456
457static void
458loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
459{
460	int	saved_offset = *ofs;
461
462	while (match(r, pc + 2, s, len, ofs, NULL)) {
463		saved_offset = *ofs;
464		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
465			break;
466	}
467
468	*ofs = saved_offset;
469}
470
471static int
472is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
473{
474	int	i, ch;
475
476	ch = s[*ofs];
477
478	for (i = 0; i < len; i++)
479		if (p[i] == ch) {
480			(*ofs)++;
481			return 1;
482		}
483
484	return 0;
485}
486
487static int
488is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
489{
490	int	i, ch;
491
492	ch = s[*ofs];
493
494	for (i = 0; i < len; i++) {
495		if (p[i] == ch)
496			return 0;
497	}
498
499	(*ofs)++;
500	return 1;
501}
502
503static int
504match(const struct slre *r, int pc, const char *s, int len,
505		int *ofs, struct cap *caps)
506{
507	int	n, saved_offset, res = 1;
508
509	while (res && r->code[pc] != END) {
510
511		assert(pc < r->code_size);
512		assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
513
514		switch (r->code[pc]) {
515		case BRANCH:
516			saved_offset = *ofs;
517			res = match(r, pc + 3, s, len, ofs, caps);
518			if (res == 0) {
519				*ofs = saved_offset;
520				res = match(r, pc + r->code[pc + 1],
521				    s, len, ofs, caps);
522			}
523			pc += r->code[pc + 2];
524			break;
525		case EXACT:
526			res = 0;
527			n = r->code[pc + 2];	/* String length */
528			if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
529			    r->code[pc + 1], n)) {
530				(*ofs) += n;
531				res = 1;
532			}
533			pc += 3;
534			break;
535		case QUEST:
536			res = 1;
537			saved_offset = *ofs;
538			if (!match(r, pc + 2, s, len, ofs, caps))
539				*ofs = saved_offset;
540			pc += r->code[pc + 1];
541			break;
542		case STAR:
543			res = 1;
544			loop_greedy(r, pc, s, len, ofs);
545			pc += r->code[pc + 1];
546			break;
547		case STARQ:
548			res = 1;
549			loop_non_greedy(r, pc, s, len, ofs);
550			pc += r->code[pc + 1];
551			break;
552		case PLUS:
553			res = match(r, pc + 2, s, len, ofs, caps);
554			if (res == 0)
555				break;
556
557			loop_greedy(r, pc, s, len, ofs);
558			pc += r->code[pc + 1];
559			break;
560		case PLUSQ:
561			res = match(r, pc + 2, s, len, ofs, caps);
562			if (res == 0)
563				break;
564
565			loop_non_greedy(r, pc, s, len, ofs);
566			pc += r->code[pc + 1];
567			break;
568		case SPACE:
569			res = 0;
570			if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
571				(*ofs)++;
572				res = 1;
573			}
574			pc++;
575			break;
576		case NONSPACE:
577			res = 0;
578			if (*ofs < len &&
579					!isspace(((unsigned char *)s)[*ofs])) {
580				(*ofs)++;
581				res = 1;
582			}
583			pc++;
584			break;
585		case DIGIT:
586			res = 0;
587			if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
588				(*ofs)++;
589				res = 1;
590			}
591			pc++;
592			break;
593		case ANY:
594			res = 0;
595			if (*ofs < len) {
596				(*ofs)++;
597				res = 1;
598			}
599			pc++;
600			break;
601		case ANYOF:
602			res = 0;
603			if (*ofs < len)
604				res = is_any_of(r->data + r->code[pc + 1],
605					r->code[pc + 2], s, ofs);
606			pc += 3;
607			break;
608		case ANYBUT:
609			res = 0;
610			if (*ofs < len)
611				res = is_any_but(r->data + r->code[pc + 1],
612					r->code[pc + 2], s, ofs);
613			pc += 3;
614			break;
615		case BOL:
616			res = *ofs == 0 ? 1 : 0;
617			pc++;
618			break;
619		case EOL:
620			res = *ofs == len ? 1 : 0;
621			pc++;
622			break;
623		case OPEN:
624			if (caps != NULL)
625				caps[r->code[pc + 1]].ptr = s + *ofs;
626			pc += 2;
627			break;
628		case CLOSE:
629			if (caps != NULL)
630				caps[r->code[pc + 1]].len = (s + *ofs) -
631				    caps[r->code[pc + 1]].ptr;
632			pc += 2;
633			break;
634		case END:
635			pc++;
636			break;
637		default:
638			printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
639			assert(0);
640			break;
641		}
642	}
643
644	return res;
645}
646
647int
648slre_match(const struct slre *r, const char *buf, int len,
649		struct cap *caps)
650{
651	int	i, ofs = 0, res = 0;
652
653	if (r->anchored) {
654		res = match(r, 0, buf, len, &ofs, caps);
655	} else {
656		for (i = 0; i < len && res == 0; i++) {
657			ofs = i;
658			res = match(r, 0, buf, len, &ofs, caps);
659		}
660	}
661
662	return res;
663}
664
665#ifdef SLRE_TEST
666#define N_CAPS	5
667
668int main(int argc, char *argv[])
669{
670	struct slre	slre;
671	struct cap	caps[N_CAPS];
672	unsigned char	data[1 * 1024 * 1024];
673	FILE		*fp;
674	int		i, res, len;
675
676	if (argc < 2) {
677		fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
678		return 1;
679	}
680
681	fp = fopen(argv[2], "rb");
682	if (fp == NULL) {
683		fprintf(stderr, "Error: cannot open %s:%s\n",
684			argv[2], strerror(errno));
685		return 1;
686	}
687
688	if (!slre_compile(&slre, argv[1])) {
689		(void) fclose(fp);
690		fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
691		return 1;
692	}
693
694	slre_dump(&slre, stderr);
695
696	while (fgets(data, sizeof(data), fp) != NULL) {
697		len = strlen(data);
698
699		if ((len > 0) && (data[len-1] == '\n')) {
700			data[len-1] = '\0';
701			--len;
702		}
703
704		printf("Data = \"%s\"\n", data);
705
706		(void) memset(caps, 0, sizeof(caps));
707
708		res = slre_match(&slre, data, len, caps);
709		printf("Result [%d]: %d\n", i, res);
710
711		for (i = 0; i < N_CAPS; i++) {
712			if (caps[i].len > 0) {
713				printf("Substring %d: len=%d  [%.*s]\n", i,
714					caps[i].len,
715					caps[i].len, caps[i].ptr);
716			}
717		}
718		printf("----------------------------------------------------\n");
719	}
720	(void) fclose(fp);
721
722	return 0;
723}
724#endif /* SLRE_TEST */
725