1//----------------------------------------------------------------------
2//  This software is part of the Haiku distribution and is covered
3//  by the MIT License.
4//----------------------------------------------------------------------
5/*!
6	\file sniffer/Parser.cpp
7	MIME sniffer rule parser implementation
8*/
9
10#include <sniffer/Parser.h>
11#include <sniffer/Pattern.h>
12#include <sniffer/PatternList.h>
13#include <sniffer/Range.h>
14#include <sniffer/RPattern.h>
15#include <sniffer/RPatternList.h>
16#include <sniffer/Rule.h>
17
18#include <new>
19#include <stdio.h>
20#include <stdlib.h>	// For atol(), atof()
21#include <string.h>
22#include <String.h>	// BString
23
24using namespace BPrivate::Storage::Sniffer;
25
// Miscellaneous helper functions (all defined further down in this file)

// Maps the character following a backslash to the character it denotes.
char escapeChar(char ch);
// Combines two hex digits (high, low) into a single char.
char hexToChar(char hi, char low);
// Converts a single hex digit into its numeric value.
char hexToChar(char hex);
// Converts one, two or three octal digits into a single char.
char octalToChar(char octal);
char octalToChar(char hi, char low);
char octalToChar(char hi, char mid, char low);
// Character classification predicates used by the scanner.
bool isHexChar(char ch);
bool isWhiteSpace(char ch);
bool isOctalChar(char ch);
bool isDecimalChar(char ch);
bool isPunctuation(char ch);
38
39//! Parses the given rule.
40/*! The resulting parsed Rule structure is stored in \c rule, which
41	must be pre-allocated. If parsing fails, a descriptive error message (meant
42	to be viewed in a monospaced font) is placed in the pre-allocated \c BString
43	pointed to by \c parseError (which may be \c NULL if you don't care about
44	the error message).
45
46	\param rule Pointer to a NULL-terminated string containing the sniffer
47	            rule to be parsed
48	\param result Pointer to a pre-allocated \c Rule object into which the result
49	              of parsing is placed upon success.
50	\param parseError Point to pre-allocated \c BString object into which
51	                  a descriptive error message is stored upon failure.
52
53	\return
54	- B_OK: Success
55	- B_BAD_MIME_SNIFFER_RULE: Failure
56*/
57status_t
58BPrivate::Storage::Sniffer::parse(const char *rule, Rule *result, BString *parseError) {
59	Parser parser;
60	return parser.Parse(rule, result, parseError);
61}
62
63//------------------------------------------------------------------------------
64// Token
65//------------------------------------------------------------------------------
66
//! Creates a token of the given type; \c pos is the position later reported by Pos().
Token::Token(TokenType type, const ssize_t pos)
	: fType(type)
	, fPos(pos)
{
//	if (type != EmptyToken)
//		cout << "New Token, fType == " << tokenTypeToString(fType) << endl;
}
74
//! Destructor; the base token has nothing to release.
Token::~Token() {
}
77
//! Returns the type this token was constructed with.
TokenType
Token::Type() const {
	return fType;
}
82
//! Base implementation: always throws a pointer to an Err, since a plain
//! Token carries no string data (StringToken provides the real value).
const std::string&
Token::String() const {
	throw new Err("Sniffer scanner error: Token::String() called on non-string token", fPos);
}
87
//! Base implementation: always throws a pointer to an Err, since a plain
//! Token carries no integer data (IntToken provides the real value).
int32
Token::Int() const {
	throw new Err("Sniffer scanner error: Token::Int() called on non-integer token", fPos);
}
92
//! Base implementation: always throws a pointer to an Err, since a plain
//! Token carries no floating point data (IntToken/FloatToken provide values).
double
Token::Float() const {
	throw new Err("Sniffer scanner error: Token::Float() called on non-float token", fPos);
}
97
//! Returns the position that was passed to the constructor.
ssize_t
Token::Pos() const {
	return fPos;
}
102
103bool
104Token::operator==(Token &ref) const {
105	// Compare types, then data if necessary
106	if (Type() == ref.Type()) {
107		switch (Type()) {
108			case CharacterString:
109//				printf(" str1 == '%s'\n", String());
110//				printf(" str2 == '%s'\n", ref.String());
111//				printf(" strcmp() == %d\n", strcmp(String(), ref.String()));
112			{
113				return String() == ref.String();
114
115/*
116				// strcmp() seems to choke on certain, non-normal ASCII chars
117				// (i.e. chars outside the usual alphabets, but still valid
118				// as far as ASCII is concerned), so we'll just compare the
119				// strings by hand to be safe.
120				const char *str1 = String();
121				const char *str2 = ref.String();
122				int len1 = strlen(str1);
123				int len2 = strlen(str2);
124//				printf("len1 == %d\n", len1);
125//				printf("len2 == %d\n", len2);
126				if (len1 == len2) {
127					for (int i = 0; i < len1; i++) {
128//						printf("i == %d, str1[%d] == %x, str2[%d] == %x\n", i, i, str1[i], i, str2[i]);
129						if (str1[i] != str2[i])
130							return false;
131					}
132				}
133				return true;
134*/
135			}
136//				return strcmp(String(), ref.String()) == 0;
137
138			case Integer:
139				return Int() == ref.Int();
140
141			case FloatingPoint:
142				return Float() == ref.Float();
143
144			default:
145				return true;
146		}
147	} else
148		return false;
149}
150
151//------------------------------------------------------------------------------
152// StringToken
153//------------------------------------------------------------------------------
154
//! Creates a CharacterString token holding a copy of \c str.
StringToken::StringToken(const std::string &str, const ssize_t pos)
	: Token(CharacterString, pos)
	, fString(str)
{
}
160
//! Destructor; no explicit cleanup is performed here.
StringToken::~StringToken() {
}
163
//! Returns a reference to the string stored in this token.
const std::string&
StringToken::String() const {
	return fString;
}
168
169//------------------------------------------------------------------------------
170// IntToken
171//------------------------------------------------------------------------------
172
//! Creates an Integer token holding \c value.
IntToken::IntToken(const int32 value, const ssize_t pos)
	: Token(Integer, pos)
	, fValue(value)
{
}
178
//! Destructor; no explicit cleanup is performed here.
IntToken::~IntToken() {
}
181
//! Returns the integer value stored in this token.
int32
IntToken::Int() const {
	return fValue;
}
186
//! Returns the stored integer converted to \c double, so an integer token
//! can be read wherever a floating point value is requested.
double
IntToken::Float() const {
	return (double)fValue;
}
191
192//------------------------------------------------------------------------------
193// FloatToken
194//------------------------------------------------------------------------------
195
//! Creates a FloatingPoint token holding \c value.
FloatToken::FloatToken(const double value, const ssize_t pos)
	: Token(FloatingPoint, pos)
	, fValue(value)
{
}
201
//! Destructor; no explicit cleanup is performed here.
FloatToken::~FloatToken() {
}
204
205
//! Returns the floating point value stored in this token.
double
FloatToken::Float() const {
	return fValue;
}
210
211//------------------------------------------------------------------------------
212// TokenStream
213//------------------------------------------------------------------------------
214
215TokenStream::TokenStream(const std::string &string)
216	: fCStatus(B_NO_INIT)
217	, fPos(-1)
218	, fStrLen(-1)
219{
220	SetTo(string);
221}
222
//! Creates an uninitialized token stream (InitCheck() returns B_NO_INIT
//! until SetTo() succeeds).
TokenStream::TokenStream()
	: fCStatus(B_NO_INIT)
	, fPos(-1)
	, fStrLen(-1)
{
}
229
//! Destroys the stream, deleting all tokens it owns via Unset().
TokenStream::~TokenStream() {
	Unset();
}
233
//! Tokenizes the given rule string, replacing any tokens currently held.
/*! The stream is first cleared with Unset(). The string is then read one
	character at a time from a CharStream and fed through the state machine
	below, which appends tokens to the token list as they are recognized.

	On success the stream status becomes \c B_OK and the read position is
	reset to the first token. Every scanning error is reported by throwing
	a pointer to a newly allocated Err; in that case the status remains
	\c B_NO_INIT (as set by Unset()).

	\param string The rule text to tokenize
	\return \c B_OK on success (failures are thrown, not returned)
*/
status_t
TokenStream::SetTo(const std::string &string) {
	Unset();
	fStrLen = string.length();
	CharStream stream(string);
	if (stream.InitCheck() != B_OK)
		throw new Err("Sniffer scanner error: Unable to intialize character stream", -1);

	// States of the scanner. The tsssEscape* states are shared by quoted and
	// unquoted strings; escapedState records where to return afterwards.
	typedef enum TokenStreamScannerState {
		tsssStart,
		tsssOneSingle,
		tsssOneDouble,
		tsssOneZero,
		tsssZeroX,
		tsssOneHex,
		tsssTwoHex,
		tsssIntOrFloat,
		tsssFloat,
		tsssLonelyDecimalPoint,
		tsssLonelyMinusOrPlus,
		tsssLonelyFloatExtension,
		tsssLonelyFloatExtensionWithSign,
		tsssExtendedFloat,
		tsssUnquoted,
		tsssEscape,
		tsssEscapeX,
		tsssEscapeOneOctal,
		tsssEscapeTwoOctal,
		tsssEscapeOneHex,
	} TokenStreamScannerState;

	TokenStreamScannerState state = tsssStart;
	TokenStreamScannerState escapedState = tsssStart;
		// Used to remember which state to return to from an escape sequence

	std::string charStr = "";	// Used to build up character strings
	char lastChar = 0;			// For two char lookahead
	char lastLastChar = 0;		// For three char lookahead (have I mentioned I hate octal?)
	bool keepLooping = true;
	ssize_t startPos = 0;
		// Position at which the token currently being scanned began
	while (keepLooping) {
		ssize_t pos = stream.Pos();
		char ch = stream.Get();
		switch (state) {
			case tsssStart:
				startPos = pos;
				switch (ch) {
					case 0x3:	// End-Of-Text
						// Only a 0x3 read from an exhausted stream ends the
						// scan; otherwise it is treated as an invalid character.
						if (stream.IsEmpty())
							keepLooping = false;
						else
							throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos);
						break;

					case '\t':
					case '\n':
					case ' ':
						// Whitespace, so ignore it.
						break;

					case '"':
						charStr = "";
						state = tsssOneDouble;
						break;

					case '\'':
						charStr = "";
						state = tsssOneSingle;
						break;

					case '+':
					case '-':
						charStr = ch;
						lastChar = ch;
						state = tsssLonelyMinusOrPlus;
						break;

					case '.':
						charStr = ch;
						state = tsssLonelyDecimalPoint;
						break;

					case '0':
						charStr = ch;
						state = tsssOneZero;
						break;

					case '1':
					case '2':
					case '3':
					case '4':
					case '5':
					case '6':
					case '7':
					case '8':
					case '9':
						charStr = ch;
						state = tsssIntOrFloat;
						break;

					case '&':	AddToken(Ampersand, pos);		break;
					case '(':	AddToken(LeftParen, pos);		break;
					case ')':	AddToken(RightParen, pos);		break;
					case ':':	AddToken(Colon, pos);			break;
					case '[':	AddToken(LeftBracket, pos);		break;

					case '\\':
						charStr = "";					// Clear our string
						state = tsssEscape;
						escapedState = tsssUnquoted;	// Unquoted strings begin with an escaped character
						break;

					case ']':	AddToken(RightBracket, pos);		break;
					case '|':	AddToken(Divider, pos);			break;

					default:
						throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos);
				}
				break;

			// Inside a single-quoted string
			case tsssOneSingle:
				switch (ch) {
					case '\\':
						escapedState = state;		// Save our state
						state = tsssEscape;			// Handle the escape sequence
						break;
					case '\'':
						AddString(charStr, startPos);
						state = tsssStart;
						break;
					case 0x3:
						if (stream.IsEmpty())
							throw new Err(std::string("Sniffer pattern error: unterminated single-quoted string"), pos);
						else
							charStr += ch;
						break;
					default:
						charStr += ch;
						break;
				}
				break;

			// Inside a double-quoted string
			case tsssOneDouble:
				switch (ch) {
					case '\\':
						escapedState = state;		// Save our state
						state = tsssEscape;			// Handle the escape sequence
						break;
					case '"':
						AddString(charStr, startPos);
						state = tsssStart;
						break;
					case 0x3:
						if (stream.IsEmpty())
							throw new Err(std::string("Sniffer pattern error: unterminated double-quoted string"), pos);
						else
							charStr += ch;
						break;
					default:
						charStr += ch;
						break;
				}
				break;

			// A leading '0' was read: hex string, number, or a lone zero
			case tsssOneZero:
				if (ch == 'x') {
					charStr = "";	// Reinit, since we actually have a hex string
					state = tsssZeroX;
				} else if ('0' <= ch && ch <= '9') {
					charStr += ch;
					state = tsssIntOrFloat;
				} else if (ch == '.') {
					charStr += ch;
					state = tsssFloat;
				} else if (ch == 'e' || ch == 'E') {
					charStr += ch;
					state = tsssLonelyFloatExtension;
				} else {
					// Terminate the number
					AddInt(charStr.c_str(), startPos);

					// Push the last char back on and try again
					stream.Unget();
					state = tsssStart;
				}
				break;

			// "0x" was read; at least one hex digit must follow
			case tsssZeroX:
				if (isHexChar(ch)) {
					lastChar = ch;
					state = tsssOneHex;
				} else
					throw new Err(std::string("Sniffer pattern error: incomplete hex code"), pos);
				break;

			// One digit of a hex pair was read; the second completes a byte
			case tsssOneHex:
				if (isHexChar(ch)) {
					try {
						charStr += hexToChar(lastChar, ch);
					} catch (Err *err) {
						if (err)
							err->SetPos(pos);
						throw err;
					}
					state = tsssTwoHex;
				} else
					throw new Err(std::string("Sniffer pattern error: bad hex literal"), pos);	// Same as R5
				break;

			// A complete hex pair was read; either another pair starts or
			// the hex string ends here
			case tsssTwoHex:
				if (isHexChar(ch)) {
					lastChar = ch;
					state = tsssOneHex;
				} else {
					AddString(charStr, startPos);
					stream.Unget();		// So punctuation gets handled properly
					state = tsssStart;
				}
				break;

			case tsssIntOrFloat:
				if (isDecimalChar(ch))
					charStr += ch;
				else if (ch == '.') {
					charStr += ch;
					state = tsssFloat;
				} else if (ch == 'e' || ch == 'E') {
					charStr += ch;
					state = tsssLonelyFloatExtension;
				} else {
					// Terminate the number
					AddInt(charStr.c_str(), startPos);

					// Push the last char back on and try again
					stream.Unget();
					state = tsssStart;
				}
				break;

			case tsssFloat:
				if (isDecimalChar(ch))
					charStr += ch;
				else if (ch == 'e' || ch == 'E') {
					charStr += ch;
					state = tsssLonelyFloatExtension;
				} else {
					// Terminate the number
					AddFloat(charStr.c_str(), startPos);

					// Push the last char back on and try again
					stream.Unget();
					state = tsssStart;
				}
				break;

			// A '.' with no digits before it; a digit must follow
			case tsssLonelyDecimalPoint:
				if (isDecimalChar(ch)) {
					charStr += ch;
					state = tsssFloat;
				} else
					throw new Err(std::string("Sniffer pattern error: incomplete floating point number"), pos);
				break;

			// A sign was read: either a signed number or the "-i" flag
			case tsssLonelyMinusOrPlus:
				if (isDecimalChar(ch)) {
					charStr += ch;
					state = tsssIntOrFloat;
				} else if (ch == '.') {
					charStr += ch;
					state = tsssLonelyDecimalPoint;
				} else if (ch == 'i' && lastChar == '-') {
					AddToken(CaseInsensitiveFlag, startPos);
					state = tsssStart;
				} else
					throw new Err(std::string("Sniffer pattern error: incomplete signed number or invalid flag"), pos);
				break;

			// An 'e'/'E' exponent marker was read; sign or digit must follow
			case tsssLonelyFloatExtension:
				if (ch == '+' || ch == '-') {
					charStr += ch;
					state = tsssLonelyFloatExtensionWithSign;
				} else if (isDecimalChar(ch)) {
					charStr += ch;
					state = tsssExtendedFloat;
				} else
					throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos);
				break;

			case tsssLonelyFloatExtensionWithSign:
				if (isDecimalChar(ch)) {
					charStr += ch;
					state = tsssExtendedFloat;
				} else
					throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos);
				break;

			case tsssExtendedFloat:
				if (isDecimalChar(ch)) {
					charStr += ch;
					state = tsssExtendedFloat;
				} else {
					// Terminate the number
					AddFloat(charStr.c_str(), startPos);

					// Push the last char back on and try again
					stream.Unget();
					state = tsssStart;
				}
				break;

			case tsssUnquoted:
				if (ch == '\\') {
					escapedState = state;		// Save our state
					state = tsssEscape;			// Handle the escape sequence
				} else if (isWhiteSpace(ch) || isPunctuation(ch)) {
					AddString(charStr, startPos);
					stream.Unget();				// In case it's punctuation, let tsssStart handle it
					state = tsssStart;
				} else if (ch == 0x3 && stream.IsEmpty()) {
					AddString(charStr, startPos);
					keepLooping = false;
				} else {
					charStr += ch;
				}
				break;

			// A backslash was read; decide between octal, hex and simple escapes
			case tsssEscape:
				if (isOctalChar(ch)) {
					lastChar = ch;
					state = tsssEscapeOneOctal;
				} else if (ch == 'x') {
					state = tsssEscapeX;
				} else {
					// Check for a true end-of-text marker
					if (ch == 0x3 && stream.IsEmpty())
						throw new Err(std::string("Sniffer pattern error: incomplete escape sequence"), pos);
					else {
						charStr += escapeChar(ch);
						state = escapedState;	// Return to the state we were in before the escape
					}
				}
				break;

			case tsssEscapeX:
				if (isHexChar(ch)) {
					lastChar = ch;
					state = tsssEscapeOneHex;
				} else
					throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos);
				break;

			case tsssEscapeOneOctal:
				if (isOctalChar(ch)) {
					lastLastChar = lastChar;
					lastChar = ch;
					state = tsssEscapeTwoOctal;
				} else {
					// First handle the octal
					try {
						charStr += octalToChar(lastChar);
					} catch (Err *err) {
						if (err)
							err->SetPos(startPos);
						throw err;
					}

					// Push the new char back on and let the state we
					// were in when the escape sequence was hit handle it.
					stream.Unget();
					state = escapedState;
				}
				break;

			case tsssEscapeTwoOctal:
				if (isOctalChar(ch)) {
					try {
						charStr += octalToChar(lastLastChar, lastChar, ch);
					} catch (Err *err) {
						if (err)
							err->SetPos(startPos);
						throw err;
					}
					state = escapedState;
				} else {
					// First handle the octal
					try {
						charStr += octalToChar(lastLastChar, lastChar);
					} catch (Err *err) {
						if (err)
							err->SetPos(startPos);
						throw err;
					}

					// Push the new char back on and let the state we
					// were in when the escape sequence was hit handle it.
					stream.Unget();
					state = escapedState;
				}
				break;

			case tsssEscapeOneHex:
				if (isHexChar(ch)) {
					try {
						charStr += hexToChar(lastChar, ch);
					} catch (Err *err) {
						if (err)
							err->SetPos(pos);
						throw err;
					}
					state = escapedState;
				} else
					throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos);
				break;

		}
	}
	// The scan only counts as successful if it stopped between tokens.
	if (state == tsssStart)	{
		fCStatus = B_OK;
		fPos = 0;
	} else {
		throw new Err("Sniffer pattern error: unterminated rule", stream.Pos());
	}

	return fCStatus;
}
659
660void
661TokenStream::Unset() {
662	std::vector<Token*>::iterator i;
663	for (i = fTokenList.begin(); i != fTokenList.end(); i++)
664		delete *i;
665	fTokenList.clear();
666	fCStatus = B_NO_INIT;
667	fStrLen = -1;
668}
669
//! Returns the current status: \c B_OK after a successful SetTo(),
//! \c B_NO_INIT otherwise.
status_t
TokenStream::InitCheck() const {
	return fCStatus;
}
674
675//! Returns a pointer to the next token in the stream.
676/*! The TokenStream object retains owner ship of the Token object returned by Get().
677    If Get() is called at the end of the stream, a pointer to a Err object is thrown.
678*/
679const Token*
680TokenStream::Get() {
681	if (fCStatus != B_OK)
682		throw new Err("Sniffer parser error: TokenStream::Get() called on uninitialized TokenStream object", -1);
683	if (fPos < (ssize_t)fTokenList.size())
684		return fTokenList[fPos++];
685	else {
686		throw new Err("Sniffer pattern error: unterminated rule", EndPos());
687//		fPos++;			// Increment fPos to keep Unget()s consistent
688//		return NULL;	// Return NULL to signal end of list
689	}
690}
691
692//! Places token returned by the most recent call to Get() back on the head of the stream.
693/*! If Unget() is called at the beginning of the stream, a pointer to a Err object is thrown.
694*/
695void
696TokenStream::Unget() {
697	if (fCStatus != B_OK)
698		throw new Err("Sniffer parser error: TokenStream::Unget() called on uninitialized TokenStream object", -1);
699	if (fPos > 0)
700		fPos--;
701	else
702		throw new Err("Sniffer parser error: TokenStream::Unget() called at beginning of token stream", -1);
703}
704
705
706/*! \brief Reads the next token in the stream and verifies it is of the given type,
707	throwing a pointer to a Err object if it is not.
708*/
709void
710TokenStream::Read(TokenType type) {
711	const Token *t = Get();
712	if (t->Type() != type) {
713		throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(type)
714	                + ", found " + tokenTypeToString(t->Type())).c_str(), t->Pos());
715	}
716}
717
718//! Conditionally reads the next token in the stream.
719/*! CondRead() peeks at the next token in the stream. If it is of the given type, the
720	token is removed from the stream and \c true is returned. If it is not of the
721	given type, false is returned and the token remains at the head of the stream.
722*/
723bool
724TokenStream::CondRead(TokenType type) {
725	const Token *t = Get();
726	if (t->Type() == type) {
727		return true;
728	} else {
729		Unget();
730		return false;
731	}
732}
733
734ssize_t
735TokenStream::Pos() const {
736	return fPos < (ssize_t)fTokenList.size() ? fTokenList[fPos]->Pos() : fStrLen;
737}
738
//! Returns the length of the string last passed to SetTo(), or -1 if the
//! stream is unset.
ssize_t
TokenStream::EndPos() const {
	return fStrLen;
}
743
744bool
745TokenStream::IsEmpty() const {
746	return fCStatus != B_OK || fPos >= (ssize_t)fTokenList.size();
747}
748
749void
750TokenStream::AddToken(TokenType type, ssize_t pos) {
751	Token *token = new Token(type, pos);
752	fTokenList.push_back(token);
753}
754
755void
756TokenStream::AddString(const std::string &str, ssize_t pos) {
757	Token *token = new StringToken(str, pos);
758	fTokenList.push_back(token);
759}
760
761void
762TokenStream::AddInt(const char *str, ssize_t pos) {
763	// Convert the string to an int
764	int32 value = atol(str);
765	Token *token = new IntToken(value, pos);
766	fTokenList.push_back(token);
767}
768
769void
770TokenStream::AddFloat(const char *str, ssize_t pos) {
771	// Convert the string to a float
772	double value = atof(str);
773	Token *token = new FloatToken(value, pos);
774	fTokenList.push_back(token);
775}
776
777//------------------------------------------------------------------------------
778// Helper functions
779//------------------------------------------------------------------------------
780
// Translates the character that follows a backslash into the character
// the escape sequence stands for. Only the letter escapes \a \b \f \n \r
// \t \v are translated; every other character is returned unchanged.
// Hex and octal escapes are decoded by the scanner itself, so 'x' and
// the digits simply come back as they were passed in.
char
escapeChar(char ch) {
	// Parallel tables: kEscapeLetters[i] maps to kEscapeValues[i].
	static const char kEscapeLetters[] = "abfnrtv";
	static const char kEscapeValues[] = "\a\b\f\n\r\t\v";

	for (int i = 0; kEscapeLetters[i] != '\0'; i++) {
		if (ch == kEscapeLetters[i])
			return kEscapeValues[i];
	}
	return ch;
}
807
808// Converts 0x|hi|low| to a single char
809char
810hexToChar(char hi, char low) {
811	return (hexToChar(hi) << 4)	| hexToChar(low);
812}
813
814// Converts 0x|ch| to a single char
815char
816hexToChar(char hex) {
817	if ('0' <= hex && hex <= '9')
818		return hex-'0';
819	else if ('a' <= hex && hex <= 'f')
820		return hex-'a'+10;
821	else if ('A' <= hex && hex <= 'F')
822		return hex-'A'+10;
823	else
824		throw new Err(std::string("Sniffer parser error: invalid hex digit '") + hex + "' passed to hexToChar()", -1);
825}
826
// Converts a single octal digit to a char by padding it with two leading
// zero digits and delegating to the three-digit overload.
char
octalToChar(char octal) {
	return octalToChar('0', '0', octal);
}
831
// Converts two octal digits to a char by padding them with one leading
// zero digit and delegating to the three-digit overload.
char
octalToChar(char hi, char low) {
	return octalToChar('0', hi, low);
}
836
837char
838octalToChar(char hi, char mid, char low) {
839	if (isOctalChar(hi) && isOctalChar(mid) && isOctalChar(low)) {
840		// Check for octals >= decimal 256
841		if ((hi-'0') <= 3)
842			return ((hi-'0') << 6) | ((mid-'0') << 3) | (low-'0');
843		else
844			throw new Err("Sniffer pattern error: invalid octal literal (octals must be between octal 0 and octal 377 inclusive)", -1);
845	} else
846		throw new Err(std::string("Sniffer parser error: invalid octal digit passed to hexToChar()"), -1);
847}
848
// Returns true if ch is a hexadecimal digit (0-9, a-f or A-F).
bool
isHexChar(char ch) {
	if (ch >= '0' && ch <= '9')
		return true;
	if (ch >= 'a' && ch <= 'f')
		return true;
	return ch >= 'A' && ch <= 'F';
}
855
// Returns true if ch is one of the whitespace characters the scanner
// recognizes: space, newline or tab.
bool
isWhiteSpace(char ch) {
	switch (ch) {
		case ' ':
		case '\n':
		case '\t':
			return true;
		default:
			return false;
	}
}
860
// Returns true if ch is an octal digit (0-7).
bool
isOctalChar(char ch) {
	if (ch < '0')
		return false;
	return ch <= '7';
}
865
// Returns true if ch is a decimal digit (0-9).
bool
isDecimalChar(char ch) {
	if (ch < '0')
		return false;
	return ch <= '9';
}
870
// Returns true if ch is one of the single-character punctuation tokens
// of the rule syntax: & ( ) : [ ] |
bool
isPunctuation(char ch) {
	// strchr() also matches the terminating NUL of the set, so rule out
	// '\0' explicitly; it is not a punctuation character.
	if (ch == '\0')
		return false;
	return strchr("&():[]|", ch) != NULL;
}
886
887const char*
888BPrivate::Storage::Sniffer::tokenTypeToString(TokenType type) {
889	switch (type) {
890		case LeftParen:
891			return "LeftParen";
892			break;
893		case RightParen:
894			return "RightParen";
895			break;
896		case LeftBracket:
897			return "LeftBracket";
898			break;
899		case RightBracket:
900			return "RightBracket";
901			break;
902		case Colon:
903			return "Colon";
904			break;
905		case Divider:
906			return "Divider";
907			break;
908		case Ampersand:
909			return "Ampersand";
910			break;
911		case CaseInsensitiveFlag:
912			return "CaseInsensitiveFlag";
913			break;
914		case CharacterString:
915			return "CharacterString";
916			break;
917		case Integer:
918			return "Integer";
919			break;
920		case FloatingPoint:
921			return "FloatingPoint";
922			break;
923		default:
924			return "UNKNOWN TOKEN TYPE";
925			break;
926	}
927}
928
929//------------------------------------------------------------------------------
930// Parser
931//------------------------------------------------------------------------------
932
//! Creates a parser and pre-allocates the Err object that
//! ThrowOutOfMemError() throws, so reporting an out-of-memory condition
//! does not itself need to allocate. The allocation uses nothrow new, so
//! fOutOfMemErr may be NULL.
Parser::Parser()
	: fOutOfMemErr(new(std::nothrow) Err("Sniffer parser error: out of memory", -1))
{
}
937
//! Releases the pre-allocated out-of-memory error, if it is still held
//! (ThrowOutOfMemError() sets the member to NULL once it has been thrown).
Parser::~Parser() {
	delete fOutOfMemErr;
}
941
942status_t
943Parser::Parse(const char *rule, Rule *result, BString *parseError) {
944	try {
945		if (!rule)
946			throw new Err("Sniffer pattern error: NULL pattern", -1);
947		if (!result)
948			return B_BAD_VALUE;
949		if (stream.SetTo(rule) != B_OK)
950			throw new Err("Sniffer parser error: Unable to intialize token stream", -1);
951
952		ParseRule(result);
953
954		return B_OK;
955
956	} catch (Err *err) {
957//		cout << "Caught error" << endl;
958		if (parseError)
959			parseError->SetTo(ErrorMessage(err, rule).c_str());
960		delete err;
961		return rule ? (status_t)B_BAD_MIME_SNIFFER_RULE : (status_t)B_BAD_VALUE;
962	}
963}
964
965std::string
966Parser::ErrorMessage(Err *err, const char *rule) {
967	const char* msg = (err && err->Msg())
968    	                ? err->Msg()
969    	                  : "Sniffer parser error: Unexpected error with no supplied error message";
970    ssize_t pos = err && (err->Pos() >= 0) ? err->Pos() : 0;
971    std::string str = std::string(rule ? rule : "") + "\n";
972    for (int i = 0; i < pos; i++)
973    	str += " ";
974    str += "^    ";
975    str += msg;
976    return str;
977}
978
979void
980Parser::ParseRule(Rule *result) {
981	if (!result)
982		throw new Err("Sniffer parser error: NULL Rule object passed to Parser::ParseRule()", -1);
983
984	// Priority
985	double priority = ParsePriority();
986	// Conjunction List
987	std::vector<DisjList*>* list = ParseConjList();
988
989	result->SetTo(priority, list);
990}
991
992double
993Parser::ParsePriority() {
994	const Token *t = stream.Get();
995	if (t->Type() == FloatingPoint || t->Type() == Integer) {
996		double result = t->Float();
997		if (0.0 <= result && result <= 1.0)
998			return result;
999		else {
1000//			cout << "(priority == " << result << ")" << endl;
1001			throw new Err("Sniffer pattern error: invalid priority", t->Pos());
1002		}
1003	} else
1004		throw new Err("Sniffer pattern error: match level expected", t->Pos());	// Same as R5
1005}
1006
1007std::vector<DisjList*>*
1008Parser::ParseConjList() {
1009	std::vector<DisjList*> *list = new(std::nothrow) std::vector<DisjList*>;
1010	if (!list)
1011		ThrowOutOfMemError(stream.Pos());
1012	try {
1013		// DisjList+
1014		int count = 0;
1015		while (true) {
1016			DisjList* expr = ParseDisjList();
1017			if (!expr)
1018				break;
1019			else {
1020				list->push_back(expr);
1021				count++;
1022			}
1023		}
1024		if (count == 0)
1025			throw new Err("Sniffer pattern error: missing expression", -1);
1026	} catch (...) {
1027		delete list;
1028		throw;
1029	}
1030	return list;
1031}
1032
1033DisjList*
1034Parser::ParseDisjList() {
1035	// If we've run out of tokens right now, it's okay, but
1036	// we need to let ParseConjList() know what's up
1037	if (stream.IsEmpty())
1038		return NULL;
1039
1040	// Peek ahead, then let the appropriate Parse*List()
1041	// functions handle things
1042	const Token *t1 = stream.Get();
1043
1044	// PatternList | RangeList
1045	if (t1->Type() == LeftParen) {
1046		const Token *t2 = stream.Get();
1047		// Skip the case-insensitive flag, if there is one
1048		const Token *tokenOfInterest = (t2->Type() == CaseInsensitiveFlag) ? stream.Get() : t2;
1049		if (t2 != tokenOfInterest)
1050			stream.Unget();	// We called Get() three times
1051		stream.Unget();
1052		stream.Unget();
1053		// RangeList
1054		if (tokenOfInterest->Type() == LeftBracket) {
1055			return ParseRPatternList();
1056		// PatternList
1057		} else {
1058			return ParsePatternList(Range(0,0));
1059		}
1060	// Range, PatternList
1061	} else if (t1->Type() == LeftBracket) {
1062		stream.Unget();
1063		return ParsePatternList(ParseRange());
1064	} else {
1065		throw new Err("Sniffer pattern error: missing pattern", t1->Pos());	// Same as R5
1066	}
1067
1068	// PatternList
1069	// RangeList
1070	// Range + PatternList
1071}
1072
1073Range
1074Parser::ParseRange() {
1075	int32 start, end;
1076	// LeftBracket
1077	stream.Read(LeftBracket);
1078	// Integer
1079	{
1080		const Token *t = stream.Get();
1081		if (t->Type() == Integer) {
1082			start = t->Int();
1083			end = start;	// In case we aren't given an explicit end
1084		} else
1085			throw new Err("Sniffer pattern error: pattern offset expected", t->Pos());
1086	}
1087	// [Colon, Integer] RightBracket
1088	{
1089		const Token *t = stream.Get();
1090		// Colon, Integer, RightBracket
1091		if (t->Type() == Colon) {
1092			// Integer
1093			{
1094				const Token *t = stream.Get();
1095				if (t->Type() == Integer) {
1096					end = t->Int();
1097				} else
1098					ThrowUnexpectedTokenError(Integer, t);
1099			}
1100			// RightBracket
1101			stream.Read(RightBracket);
1102		// !(Colon, Integer) RightBracket
1103		} else if (t->Type() == RightBracket) {
1104			// Nothing to do here...
1105
1106		// Something else...
1107		} else
1108			ThrowUnexpectedTokenError(Colon, Integer, t);
1109	}
1110	Range range(start, end);
1111	if (range.InitCheck() == B_OK)
1112		return range;
1113	else
1114		throw range.GetErr();
1115}
1116
1117DisjList*
1118Parser::ParsePatternList(Range range) {
1119	PatternList *list = new(std::nothrow) PatternList(range);
1120	if (!list)
1121		ThrowOutOfMemError(stream.Pos());
1122	try {
1123		// LeftParen
1124		stream.Read(LeftParen);
1125		// [Flag] Pattern, (Divider, [Flag] Pattern)*
1126		while (true) {
1127			// [Flag]
1128			if (stream.CondRead(CaseInsensitiveFlag))
1129				list->SetCaseInsensitive(true);
1130			// Pattern
1131			list->Add(ParsePattern());
1132			// [Divider]
1133			if (!stream.CondRead(Divider))
1134				break;
1135		}
1136		// RightParen
1137		const Token *t = stream.Get();
1138		if (t->Type() != RightParen)
1139			throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos());
1140	} catch (...) {
1141		delete list;
1142		throw;
1143	}
1144	return list;
1145}
1146
1147DisjList*
1148Parser::ParseRPatternList() {
1149	RPatternList *list = new(std::nothrow) RPatternList();
1150	if (!list)
1151		ThrowOutOfMemError(stream.Pos());
1152	try {
1153		// LeftParen
1154		stream.Read(LeftParen);
1155		// [Flag] RPattern, (Divider, [Flag] RPattern)*
1156		while (true) {
1157			// [Flag]
1158			if (stream.CondRead(CaseInsensitiveFlag))
1159				list->SetCaseInsensitive(true);
1160			// RPattern
1161			list->Add(ParseRPattern());
1162			// [Divider]
1163			if (!stream.CondRead(Divider))
1164				break;
1165		}
1166		// RightParen
1167		const Token *t = stream.Get();
1168		if (t->Type() != RightParen)
1169			throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos());
1170	} catch (...) {
1171		delete list;
1172		throw;
1173	}
1174	return list;
1175}
1176
1177RPattern*
1178Parser::ParseRPattern() {
1179	// Range
1180	Range range = ParseRange();
1181	// Pattern
1182	Pattern *pattern = ParsePattern();
1183
1184	RPattern *result = new(std::nothrow) RPattern(range, pattern);
1185	if (result) {
1186		if (result->InitCheck() == B_OK)
1187			return result;
1188		else {
1189			Err *err = result->GetErr();
1190			delete result;
1191			throw err;
1192		}
1193	} else
1194		ThrowOutOfMemError(stream.Pos());
1195	return NULL;
1196}
1197
1198Pattern*
1199Parser::ParsePattern() {
1200	std::string str;
1201	// String
1202	{
1203		const Token *t = stream.Get();
1204		if (t->Type() == CharacterString)
1205			str = t->String();
1206		else
1207			throw new Err("Sniffer pattern error: missing pattern", t->Pos());
1208	}
1209	// [Ampersand, String]
1210	if (stream.CondRead(Ampersand)) {
1211		// String (i.e. Mask)
1212		const Token *t = stream.Get();
1213		if (t->Type() == CharacterString) {
1214			Pattern *result = new(std::nothrow) Pattern(str, t->String());
1215			if (!result)
1216				ThrowOutOfMemError(t->Pos());
1217			if (result->InitCheck() == B_OK) {
1218				return result;
1219			} else {
1220				Err *err = result->GetErr();
1221				delete result;
1222				if (err) {
1223					err->SetPos(t->Pos());
1224				}
1225				throw err;
1226			}
1227		} else
1228			ThrowUnexpectedTokenError(CharacterString, t);
1229	} else {
1230		// No mask specified.
1231		Pattern *result = new(std::nothrow) Pattern(str);
1232		if (result) {
1233			if (result->InitCheck() == B_OK)
1234				return result;
1235			else {
1236				Err *err = result->GetErr();
1237				delete result;
1238				throw err;
1239			}
1240		} else
1241			ThrowOutOfMemError(stream.Pos());
1242	}
1243	return NULL;
1244}
1245
//! Throws a pointer to a new Err reporting an unterminated rule, positioned
//! at the end of the rule string.
void
Parser::ThrowEndOfStreamError() {
	throw new Err("Sniffer pattern error: unterminated rule", stream.EndPos());
}
1250
1251inline
1252void
1253Parser::ThrowOutOfMemError(ssize_t pos) {
1254	if (fOutOfMemErr)
1255		fOutOfMemErr->SetPos(pos);
1256	Err *err = fOutOfMemErr;
1257	fOutOfMemErr = NULL;
1258	throw err;
1259}
1260
1261void
1262Parser::ThrowUnexpectedTokenError(TokenType expected, const Token *found) {
1263	throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected)
1264	                + ", found " + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str()
1265	                , (found ? found->Pos() : stream.EndPos()));
1266}
1267
1268void
1269Parser::ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found) {
1270	throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected1)
1271	                + " or " + tokenTypeToString(expected2) + ", found "
1272	                + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str()
1273	                , (found ? found->Pos() : stream.EndPos()));
1274}
1275
1276
1277
1278