1/*
2Open Tracker License
3
4Terms and Conditions
5
6Copyright (c) 1991-2000, Be Incorporated. All rights reserved.
7
8Permission is hereby granted, free of charge, to any person obtaining a copy of
9this software and associated documentation files (the "Software"), to deal in
10the Software without restriction, including without limitation the rights to
11use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12of the Software, and to permit persons to whom the Software is furnished to do
13so, subject to the following conditions:
14
15The above copyright notice and this permission notice applies to all licensees
16and shall be included in all copies or substantial portions of the Software.
17
18THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
20FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
23WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
25Except as contained in this notice, the name of Be Incorporated shall not be
26used in advertising or otherwise to promote the sale, use or other dealings in
27this Software without prior written authorization from Be Incorporated.
28
29Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
30of Be Incorporated in the United States and other countries. Other brand product
31names are registered trademarks or trademarks of their respective holders.
32All rights reserved.
33*/
34
35// This code is based on regexp.c, v.1.3 by Henry Spencer:
36
37// @(#)regexp.c	1.3 of 18 April 87
38//
39//	Copyright (c) 1986 by University of Toronto.
40//	Written by Henry Spencer.  Not derived from licensed software.
41//
42//	Permission is granted to anyone to use this software for any
43//	purpose on any computer system, and to redistribute it freely,
44//	subject to the following restrictions:
45//
46//	1. The author is not responsible for the consequences of use of
47//		this software, no matter how awful, even if they arise
48//		from defects in it.
49//
50//	2. The origin of this software must not be misrepresented, either
51//		by explicit claim or by omission.
52//
53//	3. Altered versions must be plainly marked as such, and must not
54//		be misrepresented as being the original software.
55//
56// Beware that some of this code is subtly aware of the way operator
57// precedence is structured in regular expressions.  Serious changes in
58// regular-expression syntax might require a total rethink.
59//
60
61// ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker
62// project (www.opentracker.org), Jul 11, 2000.
63
64
65#include <stdlib.h>
66#include <stdio.h>
67#include <string.h>
68
69#include <Catalog.h>
70#include <Errors.h>
71
72#include "RegExp.h"
73
74
75#undef B_TRANSLATION_CONTEXT
76#define B_TRANSLATION_CONTEXT "libtracker"
77
78
79// The first byte of the regexp internal "program" is actually this magic
80// number; the start node begins in the second byte.
81
82const uint8 kRegExpMagic = 0234;
83
84// The "internal use only" fields in RegExp.h are present to pass info from
85// compile to execute that permits the execute phase to run lots faster on
86// simple cases.  They are:
87//
88// regstart	char that must begin a match; '\0' if none obvious
89// reganch	is the match anchored (at beginning-of-line only)?
90// regmust	string (pointer into program) that match must include, or NULL
91// regmlen	length of regmust string
92//
93// Regstart and reganch permit very fast decisions on suitable starting points
94// for a match, cutting down the work a lot.  Regmust permits fast rejection
95// of lines that cannot possibly match.  The regmust tests are costly enough
96// that Compile() supplies a regmust only if the r.e. contains something
97// potentially expensive (at present, the only such thing detected is * or +
98// at the start of the r.e., which can involve a lot of backup). Regmlen is
99// supplied because the test in RunMatcher() needs it and Compile() is
100// computing it anyway.
101//
102//
103//
104// Structure for regexp "program".  This is essentially a linear encoding
105// of a nondeterministic finite-state machine (aka syntax charts or
106// "railroad normal form" in parsing technology).  Each node is an opcode
107// plus a "next" pointer, possibly plus an operand.  "Next" pointers of
108// all nodes except kRegExpBranch implement concatenation; a "next" pointer
109// with a kRegExpBranch on both ends of it is connecting two alternatives.
110// (Here we have one of the subtle syntax dependencies:  an individual
111// kRegExpBranch (as opposed to a collection of them) is never concatenated
112// with anything because of operator precedence.)  The operand of some types
113// of node is a literal string; for others, it is a node leading into a
114// sub-FSM. In particular, the operand of a kRegExpBranch node is the first
115// node of the branch. (NB this is* not* a tree structure:  the tail of the
116// branch connects to the thing following the set of kRegExpBranches).
117// The opcodes are:
118//
119
120// definition	number	opnd?	meaning
121enum {
122	kRegExpEnd = 0,		// no	End of program.
123	kRegExpBol = 1,		// no	Match "" at beginning of line.
124	kRegExpEol = 2,		// no	Match "" at end of line.
125	kRegExpAny = 3,		// no	Match any one character.
126	kRegExpAnyOf = 4,	// str	Match any character in this string.
127	kRegExpAnyBut =	5,	// str	Match any character not in this string.
128	kRegExpBranch =	6,	// node	Match this alternative, or the next...
129	kRegExpBack = 7,	// no	Match "", "next" ptr points backward.
130	kRegExpExactly = 8,	// str	Match this string.
131	kRegExpNothing = 9,	// no	Match empty string.
132	kRegExpStar = 10,	// node	Match this (simple) thing 0 or more times.
133	kRegExpPlus = 11,	// node	Match this (simple) thing 1 or more times.
134	kRegExpOpen	= 20,	// no	Mark this point in input as start of #n.
135							//	kRegExpOpen + 1 is number 1, etc.
136	kRegExpClose = 30	// no	Analogous to kRegExpOpen.
137};
138
139//
140// Opcode notes:
141//
142// kRegExpBranch	The set of branches constituting a single choice are
143//		hooked together with their "next" pointers, since precedence prevents
144//		anything being concatenated to any individual branch.  The
145//		"next" pointer of the last kRegExpBranch in a choice points to the
146//		thing following the whole choice.  This is also where the
147//		final "next" pointer of each individual branch points; each
148//		branch starts with the operand node of a kRegExpBranch node.
149//
150// kRegExpBack		Normal "next" pointers all implicitly point forward;
151//		kRegExpBack exists to make loop structures possible.
152//
153// kRegExpStar,kRegExpPlus	'?', and complex '*' and '+', are implemented as
154//		circular kRegExpBranch structures using kRegExpBack.  Simple cases
155//		(one character per match) are implemented with kRegExpStar and
156//		kRegExpPlus for speed and to minimize recursive plunges.
157//
158// kRegExpOpen,kRegExpClose	...are numbered at compile time.
159//
160//
161//
162// A node is one char of opcode followed by two chars of "next" pointer.
163// "Next" pointers are stored as two 8-bit pieces, high order first.  The
164// value is a positive offset from the opcode of the node containing it.
165// An operand, if any, simply follows the node.  (Note that much of the
166// code generation knows about this implicit relationship.)
167//
168// Using two bytes for the "next" pointer is vast overkill for most things,
169// but allows patterns to get big without disasters.
170//
171
172const char* kMeta = "^$.[()|?+*\\";
173const int32 kMaxSize = 32767L;
174	// Probably could be 65535L.
175
176
177// Flags to be passed up and down:
178enum {
179	kHasWidth =	01,	// Known never to match null string.
180	kSimple = 02,	// Simple enough to be kRegExpStar/kRegExpPlus operand.
181	kSPStart = 04,	// Starts with * or +.
182	kWorst = 0	// Worst case.
183};
184
185
186const char* kRegExpErrorStringArray[] = {
187	B_TRANSLATE_MARK("Unmatched parenthesis."),
188	B_TRANSLATE_MARK("Expression too long."),
189	B_TRANSLATE_MARK("Too many parenthesis."),
190	B_TRANSLATE_MARK("Junk on end."),
191	B_TRANSLATE_MARK("*+? operand may be empty."),
192	B_TRANSLATE_MARK("Nested *?+."),
193	B_TRANSLATE_MARK("Invalid bracket range."),
194	B_TRANSLATE_MARK("Unmatched brackets."),
195	B_TRANSLATE_MARK("Internal error."),
196	B_TRANSLATE_MARK("?+* follows nothing."),
197	B_TRANSLATE_MARK("Trailing \\."),
198	B_TRANSLATE_MARK("Corrupted expression."),
199	B_TRANSLATE_MARK("Memory corruption."),
200	B_TRANSLATE_MARK("Corrupted pointers."),
201	B_TRANSLATE_MARK("Corrupted opcode.")
202};
203
204
205#ifdef DEBUG
206int32 regnarrate = 0;
207#endif
208
209
210//	#pragma mark - RegExp
211
212
213RegExp::RegExp()
214	:
215	fError(B_OK),
216	fRegExp(NULL),
217	fInputScanPointer(NULL),
218	fParenthesisCount(0),
219	fDummy('\0'),
220	fCodeEmitPointer(NULL),
221	fCodeSize(0),
222	fStringInputPointer(NULL),
223	fRegBol(NULL),
224	fStartPArrayPointer(NULL),
225	fEndPArrayPointer(NULL)
226{
227}
228
229
230RegExp::RegExp(const char* pattern)
231	:
232	fError(B_OK),
233	fRegExp(NULL)
234{
235	fRegExp = Compile(pattern);
236}
237
238
239RegExp::RegExp(const BString& pattern)
240	:
241	fError(B_OK),
242	fRegExp(NULL)
243{
244	fRegExp = Compile(pattern.String());
245}
246
247
248RegExp::~RegExp()
249{
250	free(fRegExp);
251}
252
253
254status_t
255RegExp::InitCheck() const
256{
257	return fError;
258}
259
260
261status_t
262RegExp::SetTo(const char* pattern)
263{
264	fError = B_OK;
265	free(fRegExp);
266	fRegExp = Compile(pattern);
267	return fError;
268}
269
270
271status_t
272RegExp::SetTo(const BString& pattern)
273{
274	fError = B_OK;
275	free(fRegExp);
276	fRegExp = Compile(pattern.String());
277	return fError;
278}
279
280
281bool
282RegExp::Matches(const char* string) const
283{
284	if (fRegExp == NULL || string == NULL)
285		return false;
286
287	return RunMatcher(fRegExp, string) == 1;
288}
289
290
291bool
292RegExp::Matches(const BString& string) const
293{
294	if (fRegExp == NULL)
295		return false;
296
297	return RunMatcher(fRegExp, string.String()) == 1;
298}
299
300
301//
302// - Compile - compile a regular expression into internal code
303//
304// We can't allocate space until we know how big the compiled form will be,
305// but we can't compile it (and thus know how big it is) until we've got a
306// place to put the code.  So we cheat:  we compile it twice, once with code
307// generation turned off and size counting turned on, and once "for real".
308// This also means that we don't allocate space until we are sure that the
309// thing really will compile successfully, and we never have to move the
310// code and thus invalidate pointers into it.  (Note that it has to be in
311// one piece because free() must be able to free it all.)
312//
313// Beware that the optimization-preparation code in here knows about some
314// of the structure of the compiled regexp.
315regexp*
316RegExp::Compile(const char* exp)
317{
318	regexp* r;
319	const char* scan;
320	const char* longest;
321	int32 len;
322	int32 flags;
323
324	if (exp == NULL) {
325		SetError(B_BAD_VALUE);
326		return NULL;
327	}
328
329	// First pass: determine size, legality.
330	fInputScanPointer = exp;
331	fParenthesisCount = 1;
332	fCodeSize = 0L;
333	fCodeEmitPointer = &fDummy;
334	Char(kRegExpMagic);
335	if (Reg(0, &flags) == NULL)
336		return NULL;
337
338	// Small enough for pointer-storage convention?
339	if (fCodeSize >= kMaxSize) {
340		SetError(REGEXP_TOO_BIG);
341		return NULL;
342	}
343
344	r = (regexp*)malloc(sizeof(regexp) + fCodeSize);
345		// Allocate space
346
347	if (r == NULL) {
348		SetError(B_NO_MEMORY);
349		return NULL;
350	}
351
352	// Second pass: emit code.
353	fInputScanPointer = exp;
354	fParenthesisCount = 1;
355	fCodeEmitPointer = r->program;
356	Char(kRegExpMagic);
357	if (Reg(0, &flags) == NULL) {
358		free(r);
359		return NULL;
360	}
361
362	// Dig out information for optimizations.
363	r->regstart = '\0';
364		// Worst-case defaults.
365	r->reganch = 0;
366	r->regmust = NULL;
367	r->regmlen = 0;
368	scan = r->program + 1;
369		// First kRegExpBranch.
370	if (*Next((char*)scan) == kRegExpEnd) {
371		// Only one top-level choice.
372		scan = Operand(scan);
373
374		// Starting-point info.
375		if (*scan == kRegExpExactly)
376			r->regstart = *Operand(scan);
377		else if (*scan == kRegExpBol)
378			r->reganch++;
379
380		// If there's something expensive in the r.e., find the
381		// longest literal string that must appear and make it the
382		// regmust.  Resolve ties in favor of later strings, since
383		// the regstart check works with the beginning of the r.e.
384		// and avoiding duplication strengthens checking.  Not a
385		// strong reason, but sufficient in the absence of others.
386		if ((flags & kSPStart) != 0) {
387			longest = NULL;
388			len = 0;
389			for (; scan != NULL; scan = Next((char*)scan)) {
390				if (*scan == kRegExpExactly
391					&& (int32)strlen(Operand(scan)) >= len) {
392					longest = Operand(scan);
393					len = (int32)strlen(Operand(scan));
394				}
395			}
396			r->regmust = longest;
397			r->regmlen = len;
398		}
399	}
400
401	return r;
402}
403
404
405regexp*
406RegExp::Expression() const
407{
408	return fRegExp;
409}
410
411
412const char*
413RegExp::ErrorString() const
414{
415	if (fError >= REGEXP_UNMATCHED_PARENTHESIS
416		&& fError <= REGEXP_CORRUPTED_OPCODE) {
417		return B_TRANSLATE_NOCOLLECT(
418			kRegExpErrorStringArray[fError - B_ERRORS_END]);
419	}
420
421	return strerror(fError);
422}
423
424
425void
426RegExp::SetError(status_t error) const
427{
428	fError = error;
429}
430
431
432//
433// - Reg - regular expression, i.e. main body or parenthesized thing
434//
435// Caller must absorb opening parenthesis.
436//
437// Combining parenthesis handling with the base level of regular expression
438// is a trifle forced, but the need to tie the tails of the branches to what
439// follows makes it hard to avoid.
440//
441char*
442RegExp::Reg(int32 paren, int32* flagp)
443{
444	char* ret;
445	char* br;
446	char* ender;
447	int32 parno = 0;
448	int32 flags;
449
450	*flagp = kHasWidth;
451		// Tentatively.
452
453	// Make an kRegExpOpen node, if parenthesized.
454	if (paren) {
455		if (fParenthesisCount >= kSubExpressionMax) {
456			SetError(REGEXP_TOO_MANY_PARENTHESIS);
457			return NULL;
458		}
459		parno = fParenthesisCount;
460		fParenthesisCount++;
461		ret = Node((char)(kRegExpOpen + parno));
462	} else
463		ret = NULL;
464
465	// Pick up the branches, linking them together.
466	br = Branch(&flags);
467	if (br == NULL)
468		return NULL;
469
470	if (ret != NULL) {
471		Tail(ret, br);
472			// kRegExpOpen -> first
473	} else
474		ret = br;
475
476	if (!(flags & kHasWidth))
477		*flagp &= ~kHasWidth;
478
479	*flagp |= flags&kSPStart;
480	while (*fInputScanPointer == '|') {
481		fInputScanPointer++;
482		br = Branch(&flags);
483		if (br == NULL)
484			return NULL;
485
486		Tail(ret, br);
487			// kRegExpBranch -> kRegExpBranch.
488		if (!(flags & kHasWidth))
489			*flagp &= ~kHasWidth;
490
491		*flagp |= flags&kSPStart;
492	}
493
494	// Make a closing node, and hook it on the end.
495	ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd);
496	Tail(ret, ender);
497
498	// Hook the tails of the branches to the closing node.
499	for (br = ret; br != NULL; br = Next(br))
500		OpTail(br, ender);
501
502	// Check for proper termination.
503	if (paren && *fInputScanPointer++ != ')') {
504		SetError(REGEXP_UNMATCHED_PARENTHESIS);
505		return NULL;
506	} else if (!paren && *fInputScanPointer != '\0') {
507		if (*fInputScanPointer == ')') {
508			SetError(REGEXP_UNMATCHED_PARENTHESIS);
509			return NULL;
510		} else {
511			SetError(REGEXP_JUNK_ON_END);
512			return NULL;	//  "Can't happen".
513		}
514		// NOTREACHED
515	}
516
517	return ret;
518}
519
520
521//
522// - Branch - one alternative of an | operator
523//
524// Implements the concatenation operator.
525//
526char*
527RegExp::Branch(int32* flagp)
528{
529	char* ret;
530	char* chain;
531	char* latest;
532	int32 flags;
533
534	*flagp = kWorst;
535		// Tentatively.
536
537	ret = Node(kRegExpBranch);
538	chain = NULL;
539	while (*fInputScanPointer != '\0'
540		&& *fInputScanPointer != '|'
541		&& *fInputScanPointer != ')') {
542		latest = Piece(&flags);
543		if (latest == NULL)
544			return NULL;
545
546		*flagp |= flags & kHasWidth;
547		if (chain == NULL) {
548			// First piece.
549			*flagp |= flags & kSPStart;
550		} else
551			Tail(chain, latest);
552
553		chain = latest;
554	}
555
556	if (chain == NULL) {
557		// Loop ran zero times.
558		Node(kRegExpNothing);
559	}
560
561	return ret;
562}
563
564
565//
566// - Piece - something followed by possible [*+?]
567//
568// Note that the branching code sequences used for ? and the general cases
569// of * and + are somewhat optimized:  they use the same kRegExpNothing node
570// as both the endmarker for their branch list and the body of the last
571// branch. It might seem that this node could be dispensed with entirely,
572// but the endmarker role is not redundant.
573//
574char*
575RegExp::Piece(int32* flagp)
576{
577	char* ret;
578	char op;
579	char* next;
580	int32 flags;
581
582	ret = Atom(&flags);
583	if (ret == NULL)
584		return NULL;
585
586	op = *fInputScanPointer;
587	if (!IsMult(op)) {
588		*flagp = flags;
589		return ret;
590	}
591
592	if (!(flags & kHasWidth) && op != '?') {
593		SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY);
594		return NULL;
595	}
596	*flagp = op != '+' ? kWorst | kSPStart :  kWorst | kHasWidth;
597
598	if (op == '*' && (flags & kSimple))
599		Insert(kRegExpStar, ret);
600	else if (op == '*') {
601		// Emit x* as (x&|), where & means "self".
602		Insert(kRegExpBranch, ret);
603			// Either x
604		OpTail(ret, Node(kRegExpBack));
605			// and loop
606		OpTail(ret, ret);
607			// back
608		Tail(ret, Node(kRegExpBranch));
609			// or
610		Tail(ret, Node(kRegExpNothing));
611			// null.
612	} else if (op == '+' && (flags & kSimple))
613		Insert(kRegExpPlus, ret);
614	else if (op == '+') {
615		// Emit x+ as x(&|), where & means "self".
616		next = Node(kRegExpBranch);
617			// Either
618		Tail(ret, next);
619		Tail(Node(kRegExpBack), ret);
620			// loop back
621		Tail(next, Node(kRegExpBranch));
622			// or
623		Tail(ret, Node(kRegExpNothing));
624			// null.
625	} else if (op == '?') {
626		// Emit x? as (x|)
627		Insert(kRegExpBranch, ret);
628			// Either x
629		Tail(ret, Node(kRegExpBranch));
630			// or
631		next = Node(kRegExpNothing);
632			// null.
633		Tail(ret, next);
634		OpTail(ret, next);
635	}
636
637	fInputScanPointer++;
638	if (IsMult(*fInputScanPointer)) {
639		SetError(REGEXP_NESTED_STAR_QUESTION_PLUS);
640		return NULL;
641	}
642
643	return ret;
644}
645
646
647//
648// - Atom - the lowest level
649//
650// Optimization:  gobbles an entire sequence of ordinary characters so that
651// it can turn them into a single node, which is smaller to store and
652// faster to run.  Backslashed characters are exceptions, each becoming a
653// separate node; the code is simpler that way and it's not worth fixing.
654//
655char*
656RegExp::Atom(int32* flagp)
657{
658	char* ret;
659	int32 flags;
660
661	*flagp = kWorst;
662		// tentatively
663
664	switch (*fInputScanPointer++) {
665		case '^':
666			ret = Node(kRegExpBol);
667			break;
668
669		case '$':
670			ret = Node(kRegExpEol);
671			break;
672
673		case '.':
674			ret = Node(kRegExpAny);
675			*flagp |= kHasWidth|kSimple;
676			break;
677
678		case '[':
679		{
680			int32 cclass;
681			int32 classend;
682
683			if (*fInputScanPointer == '^') {
684				// complement of range
685				ret = Node(kRegExpAnyBut);
686				fInputScanPointer++;
687			} else
688				ret = Node(kRegExpAnyOf);
689			if (*fInputScanPointer == ']' || *fInputScanPointer == '-')
690				Char(*fInputScanPointer++);
691			while (*fInputScanPointer != '\0'
692				&& *fInputScanPointer != ']') {
693				if (*fInputScanPointer == '-') {
694					fInputScanPointer++;
695					if (*fInputScanPointer == ']'
696						|| *fInputScanPointer == '\0') {
697						Char('-');
698					} else {
699						cclass = UCharAt(fInputScanPointer - 2) + 1;
700						classend = UCharAt(fInputScanPointer);
701						if (cclass > classend + 1) {
702							SetError(REGEXP_INVALID_BRACKET_RANGE);
703							return NULL;
704						}
705						for (; cclass <= classend; cclass++)
706							Char((char)cclass);
707						fInputScanPointer++;
708					}
709				} else
710					Char(*fInputScanPointer++);
711			}
712			Char('\0');
713			if (*fInputScanPointer != ']') {
714				SetError(REGEXP_UNMATCHED_BRACKET);
715				return NULL;
716			}
717			fInputScanPointer++;
718			*flagp |= kHasWidth | kSimple;
719			break;
720		}
721
722		case '(':
723			ret = Reg(1, &flags);
724			if (ret == NULL)
725				return NULL;
726			*flagp |= flags & (kHasWidth | kSPStart);
727			break;
728
729		case '\0':
730		case '|':
731		case ')':
732			SetError(REGEXP_INTERNAL_ERROR);
733			return NULL;
734				//  supposed to be caught earlier
735
736		case '?':
737		case '+':
738		case '*':
739			SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING);
740			return NULL;
741
742		case '\\':
743			if (*fInputScanPointer == '\0') {
744				SetError(REGEXP_TRAILING_BACKSLASH);
745				return NULL;
746			}
747			ret = Node(kRegExpExactly);
748			Char(*fInputScanPointer++);
749			Char('\0');
750			*flagp |= kHasWidth|kSimple;
751			break;
752
753		default:
754		{
755			int32 len;
756			char ender;
757
758			fInputScanPointer--;
759			len = (int32)strcspn(fInputScanPointer, kMeta);
760			if (len <= 0) {
761				SetError(REGEXP_INTERNAL_ERROR);
762				return NULL;
763			}
764
765			ender = *(fInputScanPointer + len);
766			if (len > 1 && IsMult(ender)) {
767				// Back off clear of ?+* operand.
768				len--;
769			}
770
771			*flagp |= kHasWidth;
772			if (len == 1)
773				*flagp |= kSimple;
774
775			ret = Node(kRegExpExactly);
776			while (len > 0) {
777				Char(*fInputScanPointer++);
778				len--;
779			}
780
781			Char('\0');
782			break;
783		}
784	}
785
786	return ret;
787}
788
789
790//
791// - Node - emit a node
792//
793// Returns location.
794//
795char*
796RegExp::Node(char op)
797{
798	char* ret;
799	char* ptr;
800
801	ret = fCodeEmitPointer;
802	if (ret == &fDummy) {
803		fCodeSize += 3;
804		return ret;
805	}
806
807	ptr = ret;
808	*ptr++ = op;
809	*ptr++ = '\0';
810		// Null "next" pointer.
811	*ptr++ = '\0';
812	fCodeEmitPointer = ptr;
813
814	return ret;
815}
816
817
818//
819// - Char - emit (if appropriate) a byte of code
820//
821void
822RegExp::Char(char b)
823{
824	if (fCodeEmitPointer != &fDummy)
825		*fCodeEmitPointer++ = b;
826	else
827		fCodeSize++;
828}
829
830
831//
832// - Insert - insert an operator in front of already-emitted operand
833//
834// Means relocating the operand.
835//
836void
837RegExp::Insert(char op, char* opnd)
838{
839	char* src;
840	char* dst;
841	char* place;
842
843	if (fCodeEmitPointer == &fDummy) {
844		fCodeSize += 3;
845		return;
846	}
847
848	src = fCodeEmitPointer;
849	fCodeEmitPointer += 3;
850	dst = fCodeEmitPointer;
851	while (src > opnd)
852		*--dst = *--src;
853
854	place = opnd;
855		// Op node, where operand used to be.
856	*place++ = op;
857	*place++ = '\0';
858	*place++ = '\0';
859}
860
861
862//
863// - Tail - set the next-pointer at the end of a node chain
864//
865void
866RegExp::Tail(char* p, char* val)
867{
868	char* scan;
869	char* temp;
870	int32 offset;
871
872	if (p == &fDummy)
873		return;
874
875	// Find last node.
876	scan = p;
877	for (;;) {
878		temp = Next(scan);
879		if (temp == NULL)
880			break;
881		scan = temp;
882	}
883
884	if (scan[0] == kRegExpBack)
885		offset = scan - val;
886	else
887		offset = val - scan;
888
889	scan[1] = (char)((offset >> 8) & 0377);
890	scan[2] = (char)(offset & 0377);
891}
892
893
894//
895// - OpTail - Tail on operand of first argument; nop if operandless
896//
897void
898RegExp::OpTail(char* p, char* val)
899{
900	// "Operandless" and "op != kRegExpBranch" are synonymous in practice.
901	if (p == NULL || p == &fDummy || *p != kRegExpBranch)
902		return;
903
904	Tail(Operand(p), val);
905}
906
907//
908// RunMatcher and friends
909//
910
911
912//
913// - RunMatcher - match a regexp against a string
914//
915int32
916RegExp::RunMatcher(regexp* prog, const char* string) const
917{
918	const char* s;
919
920	// be paranoid...
921	if (prog == NULL || string == NULL) {
922		SetError(B_BAD_VALUE);
923		return 0;
924	}
925
926	// check validity of program
927	if (UCharAt(prog->program) != kRegExpMagic) {
928		SetError(REGEXP_CORRUPTED_PROGRAM);
929		return 0;
930	}
931
932	// if there is a "must appear" string, look for it
933	if (prog->regmust != NULL) {
934		s = string;
935		while ((s = strchr(s, prog->regmust[0])) != NULL) {
936			if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0) {
937				// found it
938				break;
939			}
940			s++;
941		}
942		if (s == NULL) {
943			// not present
944			return 0;
945		}
946	}
947
948	// mark beginning of line for ^
949	fRegBol = string;
950
951	// simplest case:  anchored match need be tried only once
952	if (prog->reganch)
953		return Try(prog, (char*)string);
954
955	// messy cases: unanchored match
956	s = string;
957	if (prog->regstart != '\0') {
958		// we know what char it must start with
959		while ((s = strchr(s, prog->regstart)) != NULL) {
960			if (Try(prog, (char*)s))
961				return 1;
962			s++;
963		}
964	} else {
965		// we don't -- general case.
966		do {
967			if (Try(prog, (char*)s))
968				return 1;
969		} while (*s++ != '\0');
970	}
971
972	// failure
973	return 0;
974}
975
976
977//
978// - Try - try match at specific point
979//
980// Returns 0 on failure, 1 on success.
981//
982int32
983RegExp::Try(regexp* prog, const char* string) const
984{
985	int32 i;
986	const char **sp;
987	const char **ep;
988
989	fStringInputPointer = string;
990	fStartPArrayPointer = prog->startp;
991	fEndPArrayPointer = prog->endp;
992
993	sp = prog->startp;
994	ep = prog->endp;
995	for (i = kSubExpressionMax; i > 0; i--) {
996		*sp++ = NULL;
997		*ep++ = NULL;
998	}
999	if (Match(prog->program + 1)) {
1000		prog->startp[0] = string;
1001		prog->endp[0] = fStringInputPointer;
1002		return 1;
1003	} else
1004		return 0;
1005}
1006
1007
1008//
1009// - Match - main matching routine
1010//
1011// Conceptually the strategy is simple:  check to see whether the current
1012// node matches, call self recursively to see whether the rest matches,
1013// and then act accordingly.  In practice we make some effort to avoid
1014// recursion, in particular by going through "ordinary" nodes (that don't
1015// need to know whether the rest of the match failed) by a loop instead of
1016// by recursion.
1017//
1018// Returns 0 on failure, 1 on success.
1019//
1020int32
1021RegExp::Match(const char* prog) const
1022{
1023	const char* scan;	// Current node.
1024	const char* next;		// Next node.
1025
1026	scan = prog;
1027#ifdef DEBUG
1028	if (scan != NULL && regnarrate)
1029		fprintf(stderr, "%s(\n", Prop(scan));
1030#endif
1031	while (scan != NULL) {
1032#ifdef DEBUG
1033		if (regnarrate)
1034			fprintf(stderr, "%s...\n", Prop(scan));
1035#endif
1036		next = Next(scan);
1037
1038		switch (*scan) {
1039			case kRegExpBol:
1040				if (fStringInputPointer != fRegBol)
1041					return 0;
1042				break;
1043
1044			case kRegExpEol:
1045				if (*fStringInputPointer != '\0')
1046					return 0;
1047				break;
1048
1049			case kRegExpAny:
1050				if (*fStringInputPointer == '\0')
1051					return 0;
1052				fStringInputPointer++;
1053				break;
1054
1055			case kRegExpExactly:
1056			{
1057				const char* opnd = Operand(scan);
1058				// Inline the first character, for speed.
1059				if (*opnd != *fStringInputPointer)
1060					return 0;
1061
1062				uint32 len = strlen(opnd);
1063				if (len > 1
1064					&& strncmp(opnd, fStringInputPointer, len) != 0) {
1065					return 0;
1066				}
1067
1068				fStringInputPointer += len;
1069			}
1070			break;
1071
1072			case kRegExpAnyOf:
1073				if (*fStringInputPointer == '\0'
1074					|| strchr(Operand(scan), *fStringInputPointer) == NULL) {
1075					return 0;
1076				}
1077				fStringInputPointer++;
1078				break;
1079
1080			case kRegExpAnyBut:
1081				if (*fStringInputPointer == '\0'
1082					|| strchr(Operand(scan), *fStringInputPointer) != NULL) {
1083					return 0;
1084				}
1085				fStringInputPointer++;
1086				break;
1087
1088			case kRegExpNothing:
1089				break;
1090
1091			case kRegExpBack:
1092				break;
1093
1094			case kRegExpOpen + 1:
1095			case kRegExpOpen + 2:
1096			case kRegExpOpen + 3:
1097			case kRegExpOpen + 4:
1098			case kRegExpOpen + 5:
1099			case kRegExpOpen + 6:
1100			case kRegExpOpen + 7:
1101			case kRegExpOpen + 8:
1102			case kRegExpOpen + 9:
1103			{
1104				int32 no;
1105				const char* save;
1106
1107				no = *scan - kRegExpOpen;
1108				save = fStringInputPointer;
1109
1110				if (Match(next)) {
1111					//
1112					// Don't set startp if some later
1113					// invocation of the same parentheses
1114					// already has.
1115					//
1116					if (fStartPArrayPointer[no] == NULL)
1117						fStartPArrayPointer[no] = save;
1118					return 1;
1119				} else
1120					return 0;
1121			}
1122			break;
1123
1124			case kRegExpClose + 1:
1125			case kRegExpClose + 2:
1126			case kRegExpClose + 3:
1127			case kRegExpClose + 4:
1128			case kRegExpClose + 5:
1129			case kRegExpClose + 6:
1130			case kRegExpClose + 7:
1131			case kRegExpClose + 8:
1132			case kRegExpClose + 9:
1133			{
1134				int32 no;
1135				const char* save;
1136
1137				no = *scan - kRegExpClose;
1138				save = fStringInputPointer;
1139
1140				if (Match(next)) {
1141					//
1142					// Don't set endp if some later
1143					// invocation of the same parentheses
1144					// already has.
1145					//
1146					if (fEndPArrayPointer[no] == NULL)
1147						fEndPArrayPointer[no] = save;
1148					return 1;
1149				} else
1150					return 0;
1151			}
1152			break;
1153
1154			case kRegExpBranch:
1155			{
1156				const char* save;
1157
1158				if (*next != kRegExpBranch) {
1159					// no choice
1160					next = Operand(scan);
1161						// avoid recursion
1162				} else {
1163					do {
1164						save = fStringInputPointer;
1165						if (Match(Operand(scan)))
1166							return 1;
1167						fStringInputPointer = save;
1168						scan = Next(scan);
1169					} while (scan != NULL && *scan == kRegExpBranch);
1170					return 0;
1171					// NOTREACHED
1172				}
1173			}
1174			break;
1175
1176			case kRegExpStar:
1177			case kRegExpPlus:
1178				{
1179					char nextch;
1180					int32 no;
1181					const char* save;
1182					int32 min;
1183
1184					//
1185					// Lookahead to avoid useless match attempts
1186					// when we know what character comes next.
1187					//
1188					nextch = '\0';
1189					if (*next == kRegExpExactly)
1190						nextch = *Operand(next);
1191					min = (*scan == kRegExpStar) ? 0 : 1;
1192					save = fStringInputPointer;
1193					no = Repeat(Operand(scan));
1194					while (no >= min) {
1195						// if it could work, try it
1196						if (nextch == '\0' || *fStringInputPointer == nextch) {
1197							if (Match(next))
1198								return 1;
1199						}
1200						// couldn't or didn't, back up
1201						no--;
1202						fStringInputPointer = save + no;
1203					}
1204					return 0;
1205				}
1206				break;
1207
1208			case kRegExpEnd:
1209				return 1;
1210					// Success!
1211
1212			default:
1213				SetError(REGEXP_MEMORY_CORRUPTION);
1214				return 0;
1215			}
1216
1217		scan = next;
1218	}
1219
1220	//
1221	// We get here only if there's trouble -- normally "case kRegExpEnd" is
1222	// the terminating point.
1223	//
1224	SetError(REGEXP_CORRUPTED_POINTERS);
1225
1226	return 0;
1227}
1228
1229
1230//
1231// - Repeat - repeatedly match something simple, report how many
1232//
1233int32
1234RegExp::Repeat(const char* p) const
1235{
1236	int32 count = 0;
1237	const char* scan;
1238	const char* opnd;
1239
1240	scan = fStringInputPointer;
1241	opnd = Operand(p);
1242	switch (*p) {
1243		case kRegExpAny:
1244			count = (int32)strlen(scan);
1245			scan += count;
1246			break;
1247
1248		case kRegExpExactly:
1249			while (*opnd == *scan) {
1250				count++;
1251				scan++;
1252			}
1253			break;
1254
1255		case kRegExpAnyOf:
1256			while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1257				count++;
1258				scan++;
1259			}
1260			break;
1261
1262		case kRegExpAnyBut:
1263			while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1264				count++;
1265				scan++;
1266			}
1267			break;
1268
1269		default:
1270			// oh dear, called inappropriately
1271			SetError(REGEXP_INTERNAL_ERROR);
1272			count = 0;
1273				// best compromise
1274			break;
1275	}
1276	fStringInputPointer = scan;
1277
1278	return count;
1279}
1280
1281
1282//
1283// - Next - dig the "next" pointer out of a node
1284//
1285char*
1286RegExp::Next(char* p)
1287{
1288	int32 offset;
1289
1290	if (p == &fDummy)
1291		return NULL;
1292
1293	offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1294	if (offset == 0)
1295		return NULL;
1296
1297	if (*p == kRegExpBack)
1298		return p - offset;
1299	else
1300		return p + offset;
1301}
1302
1303
1304const char*
1305RegExp::Next(const char* p) const
1306{
1307	int32 offset;
1308
1309	if (p == &fDummy)
1310		return NULL;
1311
1312	offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1313	if (offset == 0)
1314		return NULL;
1315
1316	if (*p == kRegExpBack)
1317		return p - offset;
1318	else
1319		return p + offset;
1320}
1321
1322
1323inline int32
1324RegExp::UCharAt(const char* p) const
1325{
1326	return (int32)*(unsigned char *)p;
1327}
1328
1329
1330inline char*
1331RegExp::Operand(char* p) const
1332{
1333	return p + 3;
1334}
1335
1336
1337inline const char*
1338RegExp::Operand(const char* p) const
1339{
1340	return p + 3;
1341}
1342
1343
1344inline bool
1345RegExp::IsMult(char c) const
1346{
1347	return c == '*' || c == '+' || c == '?';
1348}
1349
1350
1351#ifdef DEBUG
1352
1353
1354//
1355// - Dump - dump a regexp onto stdout in vaguely comprehensible form
1356//
1357void
1358RegExp::Dump()
1359{
1360	const char* s;
1361	char op = kRegExpExactly;
1362		// Arbitrary non-kRegExpEnd op.
1363	const char* next;
1364
1365	s = fRegExp->program + 1;
1366	while (op != kRegExpEnd) {
1367		// While that wasn't kRegExpEnd last time...
1368		op = *s;
1369		printf("%2ld%s", s - fRegExp->program, Prop(s));
1370			// Where, what.
1371		next = Next(s);
1372		if (next == NULL) {
1373			// next ptr
1374			printf("(0)");
1375		} else
1376			printf("(%ld)", (s - fRegExp->program) + (next - s));
1377
1378		s += 3;
1379		if (op == kRegExpAnyOf || op == kRegExpAnyBut
1380			|| op == kRegExpExactly) {
1381			// literal string, where present
1382			while (*s != '\0') {
1383				putchar(*s);
1384				s++;
1385			}
1386			s++;
1387		}
1388		putchar('\n');
1389	}
1390
1391	// header fields of interest
1392	if (fRegExp->regstart != '\0')
1393		printf("start `%c' ", fRegExp->regstart);
1394
1395	if (fRegExp->reganch)
1396		printf("anchored ");
1397
1398	if (fRegExp->regmust != NULL)
1399		printf("must have \"%s\"", fRegExp->regmust);
1400
1401	printf("\n");
1402}
1403
1404
1405//
1406// - Prop - printable representation of opcode
1407//
1408char*
1409RegExp::Prop(const char* op) const
1410{
1411	const char* p = NULL;
1412	static char buf[50];
1413
1414	strcpy(buf, ":");
1415
1416	switch (*op) {
1417		case kRegExpBol:
1418			p = "kRegExpBol";
1419			break;
1420
1421		case kRegExpEol:
1422			p = "kRegExpEol";
1423			break;
1424
1425		case kRegExpAny:
1426			p = "kRegExpAny";
1427			break;
1428
1429		case kRegExpAnyOf:
1430			p = "kRegExpAnyOf";
1431			break;
1432
1433		case kRegExpAnyBut:
1434			p = "kRegExpAnyBut";
1435			break;
1436
1437		case kRegExpBranch:
1438			p = "kRegExpBranch";
1439			break;
1440
1441		case kRegExpExactly:
1442			p = "kRegExpExactly";
1443			break;
1444
1445		case kRegExpNothing:
1446			p = "kRegExpNothing";
1447			break;
1448
1449		case kRegExpBack:
1450			p = "kRegExpBack";
1451			break;
1452
1453		case kRegExpEnd:
1454			p = "kRegExpEnd";
1455			break;
1456
1457		case kRegExpOpen + 1:
1458		case kRegExpOpen + 2:
1459		case kRegExpOpen + 3:
1460		case kRegExpOpen + 4:
1461		case kRegExpOpen + 5:
1462		case kRegExpOpen + 6:
1463		case kRegExpOpen + 7:
1464		case kRegExpOpen + 8:
1465		case kRegExpOpen + 9:
1466			sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen);
1467			p = NULL;
1468			break;
1469
1470		case kRegExpClose + 1:
1471		case kRegExpClose + 2:
1472		case kRegExpClose + 3:
1473		case kRegExpClose + 4:
1474		case kRegExpClose + 5:
1475		case kRegExpClose + 6:
1476		case kRegExpClose + 7:
1477		case kRegExpClose + 8:
1478		case kRegExpClose + 9:
1479			sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose);
1480			p = NULL;
1481			break;
1482
1483		case kRegExpStar:
1484			p = "kRegExpStar";
1485			break;
1486
1487		case kRegExpPlus:
1488			p = "kRegExpPlus";
1489			break;
1490
1491		default:
1492			RegExpError("corrupted opcode");
1493			break;
1494	}
1495
1496	if (p != NULL)
1497		strcat(buf, p);
1498
1499	return buf;
1500}
1501
1502
1503void
1504RegExp::RegExpError(const char*) const
1505{
1506	// does nothing now, perhaps it should printf?
1507}
1508
1509#endif	// DEBUG
1510