1//----------------------------------------------------------------------
2//  This software is part of the Haiku distribution and is covered
3//  by the MIT License.
4//---------------------------------------------------------------------
5/*!
6	\file sniffer/Parser.h
7	MIME sniffer rule parser declarations
8*/
9#ifndef _SNIFFER_PARSER_H
10#define _SNIFFER_PARSER_H
11
12#include <SupportDefs.h>
13#include <sniffer/CharStream.h>
14#include <sniffer/Err.h>
15#include <sniffer/Range.h>
16#include <sniffer/Rule.h>
17#include <List.h>
18#include <string>
19#include <vector>
20
21class BString;
22
23//! MIME Sniffer related classes
24namespace BPrivate {
25namespace Storage {
26namespace Sniffer {
27
// Forward declarations of the rule building blocks that appear (by pointer)
// in the parser interface declared further down.
class DisjList;
class Pattern;
class RPattern;
class Rule;
32
33//------------------------------------------------------------------------------
34// The mighty parsing function ;-)
35//------------------------------------------------------------------------------
36
37status_t parse(const char *rule, Rule *result, BString *parseError = NULL);
38
39//------------------------------------------------------------------------------
40// Classes used internally by the parser
41//------------------------------------------------------------------------------
42
//! Types of tokens
/*! Identifies the kind of lexical element a Token stands for. The
    enumerators keep their implicit values: EmptyToken is 0 and each
    following enumerator is one larger than its predecessor.

    The characters noted next to the punctuation enumerators are inferred
    from the enumerator names and the sniffer rule grammar -- confirm
    against the scanner implementation.
*/
enum TokenType {
	EmptyToken,				// default type of a default-constructed Token
	LeftParen,				// "("
	RightParen,				// ")"
	LeftBracket,			// "["
	RightBracket,			// "]"
	Colon,					// ":"
	Divider,				// presumably "|"
	Ampersand,				// "&"
	CaseInsensitiveFlag,	// "-i"
	CharacterString,		// string literal of any notation
	Integer,				// integer literal
	FloatingPoint			// floating point literal
};
58
59/*! \brief Returns a NULL-terminated string contating the
60		   name of the given token type
61*/
62const char* tokenTypeToString(TokenType type);
63
64//! Base token class returned by TokenStream
65/*! Each token represents a single chunk of relevant information
66    in a given rule. For example, the floating point number "1.2e-35",
67    originally represented as a 7-character string, is added to the
68    token stream as a single FloatToken object.
69*/
70class Token {
71public:
72	Token(TokenType type = EmptyToken, const ssize_t pos = -1);
73	virtual ~Token();
74	TokenType Type() const;
75	virtual const std::string& String() const;
76	virtual int32 Int() const;
77	virtual double Float() const;
78	ssize_t Pos() const;
79	bool operator==(Token &ref) const;
80protected:
81	TokenType fType;
82	ssize_t fPos;
83};
84
85//! String token class
86/*! Single-quoted strings, double-quoted strings, unquoted strings, and
87	hex literals are all converted to StringToken objects by the scanner
88	and from then on treated uniformly.
89*/
90class StringToken : public Token {
91public:
92	StringToken(const std::string &str, const ssize_t pos);
93	virtual ~StringToken();
94	virtual const std::string& String() const;
95protected:
96	std::string fString;
97};
98
99//! Integer token class
100/*! Signed or unsigned integer literals are coverted to IntToken objects,
101    which may then be treated as either ints or floats (since a priority
102    of "1" would be valid, but scanned as an int instead of a float).
103*/
104class IntToken : public Token {
105public:
106	IntToken(const int32 value, const ssize_t pos);
107	virtual ~IntToken();
108	virtual int32 Int() const;
109	virtual double Float() const;
110protected:
111	int32 fValue;
112};
113
114//! Floating point token class
115/*! Signed or unsigned, extended or non-extended notation floating point
116    numbers are converted to FloatToken objects by the scanner.
117*/
118class FloatToken : public Token {
119public:
120	FloatToken(const double value, const ssize_t pos);
121	virtual ~FloatToken();
122	virtual double Float() const;
123protected:
124	double fValue;
125};
126
127//! Manages a stream of Token objects
128/*! Provides Get() and Unget() operations, some handy shortcut operations (Read()
129    and CondRead()), and handles memory management with respect to all the
130    Token objects in the stream (i.e. never delete a Token object returned by Get()).
131
132    Also, the scanner portion of the parser is implemented in the TokenStream's
133    SetTo() function.
134*/
135class TokenStream {
136public:
137	TokenStream(const std::string &string);
138	TokenStream();
139	~TokenStream();
140
141	status_t SetTo(const std::string &string);
142	void Unset();
143	status_t InitCheck() const;
144
145	const Token* Get();
146	void Unget();
147
148	void Read(TokenType type);
149	bool CondRead(TokenType type);
150
151	ssize_t Pos() const;
152	ssize_t EndPos() const;
153
154	bool IsEmpty() const;
155
156private:
157	void AddToken(TokenType type, ssize_t pos);
158	void AddString(const std::string &str, ssize_t pos);
159	void AddInt(const char *str, ssize_t pos);
160	void AddFloat(const char *str, ssize_t pos);
161
162	std::vector<Token*> fTokenList;
163	status_t fCStatus;
164	int fPos;
165	int fStrLen;
166
167
168	TokenStream(const TokenStream &ref);
169	TokenStream& operator=(const TokenStream &ref);
170};
171
172//! Handles parsing a sniffer rule, yielding either a parsed rule or a descriptive error message.
173/*! A MIME sniffer rule is valid if it is well-formed with respect to the
174	following grammar and fulfills some further conditions listed thereafter:
175
176	<code>
177	Rule			::= LWS Priority LWS ConjList LWS
178	ConjList		::= DisjList (LWS DisjList)*
179	DisjList		::= "(" LWS PatternList LWS ")"
180						| "(" LWS RPatternList LWS ")"
181						| Range LWS "(" LWS PatternList LWS ")"
182	RPatternList	::= [Flag LWS] RPattern (LWS "|" LWS [Flag LWS] RPattern)*
183	PatternList		::= [Flag LWS] Pattern (LWS "|" LWS [Flag LWS] Pattern)*
184
185	RPattern		::= LWS Range LWS Pattern
186	Pattern			::= PString [ LWS "&" LWS Mask ]
187	Range			::=	"[" LWS SDecimal [LWS ":" LWS SDecimal] LWS "]"
188
189	Priority		::= Float
190	Mask			::= PString
191	PString			::= HexLiteral | QuotedString | UnquotedString
192
193	HexLiteral		::= "0x" HexPair HexPair*
194	HexPair			::= HexChar HexChar
195
	QuotedString	::= SQuotedString | DQuotedString
	SQuotedString	::= "'" SQChar+ "'"
	DQuotedString	::= '"' DQChar+ '"'
199
200	UnquotedString	::= EscapedChar UChar*
201	EscapedChar		::= OctalEscape | HexEscape | "\" Char
202	OctalEscape		::= "\" [[OctHiChar] OctChar] OctChar
203	HexEscape		::= "\x" HexPair
204
205	Flag			::= "-i"
206
207	SDecimal		::= [Sign] Decimal
208	Decimal			::= DecChar DecChar*
209	Float			::= Fixed [("E" | "e") SDecimal]
210	Fixed			::= SDecimal ["." [Decimal]] | [Sign] "." Decimal
211	Sign			::= "+" | "-"
212
213	PunctuationChar	::= "(" | ")" | "[" | "]" | "|" | "&" | ":"
214	OctHiChar		::= "0" | "1" | "2" | "3"
215	OctChar			::= OctHiChar | "4" | "5" | "6" | "7"
216	DecChar			::= OctChar | "8" | "9"
217	HexChar			::= DecChar | "a" | "b" | "c" | "d" | "e" | "f" | "A" | "B" | "C"
218						| "D" | "E" | "F"
219
	Char			::= <any character>
221	SQChar			::= <Char except "\", "'"> | EscapedChar
222	DQChar			::= <Char except "\", '"'> | EscapedChar
223	UChar			::= <Char except "\", LWSChar,  and PunctuationChar> | EscapedChar
224
225	LWS				::= LWSChar*
226	LWSChar			::= " " | TAB | LF
227	</code>
228
229	Conditions:
230	- If a mask is specified for a pattern, this mask must have the same
231	  length as the pattern string.
232	- 0.0 <= Priority <= 1.0
233	- 0 <= Range begin <= Range end
234
235	Notes:
236	- If a case-insensitive flag ("-i") appears in front of any Pattern or RPattern
237	  in a DisjList, case-insensitivity is applied to the entire DisjList.
238
239	Examples:
240	- 1.0 ('ABCD')
241	  The file must start with the string "ABCD". The priority of the rule
242	  is 1.0 (maximal).
243	- 0.8 [0:3] ('ABCD' | 'abcd')
244	  The file must contain the string "ABCD" or "abcd" starting somewhere in
245	  the first four bytes. The rule priority is 0.8.
246	- 0.5 ([0:3] 'ABCD' | [0:3] 'abcd' | [13] 'EFGH')
247	  The file must contain the string "ABCD" or "abcd" starting somewhere in
248	  the first four bytes or the string "EFGH" at position 13. The rule
249	  priority is 0.5.
250	- 0.8 [0:3] ('ABCD' & 0xff00ffff | 'abcd' & 0xffff00ff)
	  The file must contain the string "A.CD" or "ab.d" (where "." is an
252	  arbitrary character) starting somewhere in the first four bytes. The
253	  rule priority is 0.8.
254	- 0.3 [10] ('mnop') ('abc') [20] ('xyz')
255	  The file must contain the string 'abc' at the beginning of the file,
256	  the string 'mnop' starting at position 10, and the string 'xyz'
257	  starting at position 20. The rule priority is 0.3.
258	- 200e-3 (-i 'ab')
259	  The file must contain the string 'ab', 'aB', 'Ab', or 'AB' at the
260	  beginning of the file. The rule priority is 0.2.
261
262	Real examples:
263	- 0.20 ([0]"//" | [0]"/\*" | [0:32]"#include" | [0:32]"#ifndef"
264	        | [0:32]"#ifdef")
265	  text/x-source-code
266	- 0.70 ("8BPS  \000\000\000\000" & 0xffffffff0000ffffffff )
267	  image/x-photoshop
268	- 0.40 [0:64]( -i "&lt;HTML" | "&lt;HEAD" | "&lt;TITLE" | "&lt;BODY"
269			| "&lt;TABLE" | "&lt;!--" | "&lt;META" | "&lt;CENTER")
270	  text/html
271
272*/
273class Parser {
274public:
275	Parser();
276	~Parser();
277	status_t Parse(const char *rule, Rule *result, BString *parseError = NULL);
278private:
279	std::string ErrorMessage(Err *err, const char *rule);
280
281	// Things that get done a lot :-)
282	void ThrowEndOfStreamError();
283	inline void ThrowOutOfMemError(ssize_t pos);
284	void ThrowUnexpectedTokenError(TokenType expected, const Token *found);
285	void ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found);
286
287	// Parsing functions
288	void ParseRule(Rule *result);
289	double ParsePriority();
290	std::vector<DisjList*>* ParseConjList();
291	DisjList* ParseDisjList();
292	Range ParseRange();
293	DisjList* ParsePatternList(Range range);
294	DisjList* ParseRPatternList();
295	RPattern* ParseRPattern();
296	Pattern* ParsePattern();
297
298	TokenStream stream;
299
300	Err *fOutOfMemErr;
301};
302
303};	// namespace Sniffer
304};	// namespace Storage
305};	// namespace BPrivate
306
307#endif	// _SNIFFER_PARSER_H
308
309
310
311
312