1/*
2 * Copyright 2006-2014 Haiku, Inc. All Rights Reserved.
3 * Distributed under the terms of the MIT License.
4 *
5 * Authors:
 *		Stephan Aßmus <superstippi@gmx.de>
7 *		Rene Gollent <rene@gollent.com>
8 *		John Scipione <jscipione@gmail.com>
9 *		Ingo Weinhold <bonefish@cs.tu-berlin.de>
10 */
11
12
13#include "CLanguageTokenizer.h"
14
15#include <ctype.h>
16#include <stdio.h>
17#include <stdlib.h>
18
19
20using CLanguage::ParseException;
21using CLanguage::Token;
22using CLanguage::Tokenizer;
23
24
25// #pragma mark - Token
26
27
28Token::Token()
29	:
30	string(""),
31	type(TOKEN_NONE),
32	value(),
33	position(0)
34{
35}
36
37
38Token::Token(const Token& other)
39	:
40	string(other.string),
41	type(other.type),
42	value(other.value),
43	position(other.position)
44{
45}
46
47
48Token::Token(const char* string, int32 length, int32 position, int32 type)
49	:
50	string(string, length),
51	type(type),
52	value(),
53	position(position)
54{
55}
56
57
58Token&
59Token::operator=(const Token& other)
60{
61	string = other.string;
62	type = other.type;
63	value = other.value;
64	position = other.position;
65	return *this;
66}
67
68
69// #pragma mark - Tokenizer
70
71
72Tokenizer::Tokenizer()
73	:
74	fString(""),
75	fCurrentChar(NULL),
76	fCurrentToken(),
77	fReuseToken(false)
78{
79}
80
81
82void
83Tokenizer::SetTo(const char* string)
84{
85	fString = string;
86	fCurrentChar = fString.String();
87	fCurrentToken = Token();
88	fReuseToken = false;
89}
90
91
92const Token&
93Tokenizer::NextToken()
94{
95	if (fCurrentToken.type == TOKEN_END_OF_LINE)
96		return fCurrentToken;
97
98	if (fReuseToken) {
99		fReuseToken = false;
100		return fCurrentToken;
101	}
102
103	while (*fCurrentChar != 0 && isspace(*fCurrentChar))
104		fCurrentChar++;
105
106	if (*fCurrentChar == 0) {
107		return fCurrentToken = Token("", 0, _CurrentPos(),
108			TOKEN_END_OF_LINE);
109	}
110
111	bool decimal = *fCurrentChar == '.';
112
113	if (decimal || isdigit(*fCurrentChar)) {
114		if (*fCurrentChar == '0' && fCurrentChar[1] == 'x')
115			return _ParseHexOperand();
116
117		BString temp;
118
119		const char* begin = fCurrentChar;
120
121		// optional digits before the comma
122		while (isdigit(*fCurrentChar)) {
123			temp << *fCurrentChar;
124			fCurrentChar++;
125		}
126
127		// optional post decimal part
128		// (required if there are no digits before the decimal)
129		if (*fCurrentChar == '.') {
130			decimal = true;
131			temp << '.';
132			fCurrentChar++;
133
134			// optional post decimal digits
135			while (isdigit(*fCurrentChar)) {
136				temp << *fCurrentChar;
137				fCurrentChar++;
138			}
139		}
140
141		int32 length = fCurrentChar - begin;
142		if (length == 1 && decimal) {
143			// check for . operator
144			fCurrentChar = begin;
145			if (!_ParseOperator())
146				throw ParseException("unexpected character", _CurrentPos());
147
148			return fCurrentToken;
149		}
150
151		BString test = temp;
152		test << "&_";
153		double value;
154		char t[2];
155		int32 matches = sscanf(test.String(), "%lf&%s", &value, t);
156		if (matches != 2)
157			throw ParseException("error in constant", _CurrentPos() - length);
158
159		fCurrentToken = Token(begin, length, _CurrentPos() - length,
160			TOKEN_CONSTANT);
161		if (decimal)
162			fCurrentToken.value.SetTo(value);
163		else
164			fCurrentToken.value.SetTo((int64)strtoll(temp.String(), NULL, 10));
165	} else if (isalpha(*fCurrentChar) || *fCurrentChar == '_') {
166		const char* begin = fCurrentChar;
167		while (*fCurrentChar != 0 && (isalpha(*fCurrentChar)
168			|| isdigit(*fCurrentChar) || *fCurrentChar == '_')) {
169			fCurrentChar++;
170		}
171		int32 length = fCurrentChar - begin;
172		fCurrentToken = Token(begin, length, _CurrentPos() - length,
173			TOKEN_IDENTIFIER);
174	} else if (*fCurrentChar == '"' || *fCurrentChar == '\'') {
175		bool terminatorFound = false;
176		const char* begin = fCurrentChar++;
177		while (*fCurrentChar != 0) {
178			if (*fCurrentChar == '\\') {
179				if (*(fCurrentChar++) != 0)
180					fCurrentChar++;
181			} else if (*(fCurrentChar++) == *begin) {
182				terminatorFound = true;
183				break;
184			}
185		}
186		int32 tokenType = TOKEN_STRING_LITERAL;
187		if (!terminatorFound) {
188			tokenType = *begin == '"' ? TOKEN_DOUBLE_QUOTE
189					: TOKEN_SINGLE_QUOTE;
190			fCurrentChar = begin + 1;
191		}
192
193		int32 length = fCurrentChar - begin;
194		fCurrentToken = Token(begin, length, _CurrentPos() - length,
195			tokenType);
196	} else {
197		if (!_ParseOperator()) {
198			int32 type = TOKEN_NONE;
199			switch (*fCurrentChar) {
200				case '\n':
201					type = TOKEN_END_OF_LINE;
202					break;
203
204				case '(':
205					type = TOKEN_OPENING_PAREN;
206					break;
207				case ')':
208					type = TOKEN_CLOSING_PAREN;
209					break;
210
211				case '[':
212					type = TOKEN_OPENING_SQUARE_BRACKET;
213					break;
214				case ']':
215					type = TOKEN_CLOSING_SQUARE_BRACKET;
216					break;
217
218				case '{':
219					type = TOKEN_OPENING_CURLY_BRACE;
220					break;
221				case '}':
222					type = TOKEN_CLOSING_CURLY_BRACE;
223					break;
224
225				case '\\':
226					type = TOKEN_BACKSLASH;
227					break;
228
229				case ':':
230					type = TOKEN_COLON;
231					break;
232
233				case ';':
234					type = TOKEN_SEMICOLON;
235					break;
236
237				case ',':
238					type = TOKEN_COMMA;
239					break;
240
241				case '.':
242					type = TOKEN_PERIOD;
243					break;
244
245				case '#':
246					type = TOKEN_POUND;
247					break;
248
249				default:
250					throw ParseException("unexpected character",
251						_CurrentPos());
252			}
253			fCurrentToken = Token(fCurrentChar, 1, _CurrentPos(),
254				type);
255			fCurrentChar++;
256		}
257	}
258
259	return fCurrentToken;
260}
261
262
263bool
264Tokenizer::_ParseOperator()
265{
266	int32 type = TOKEN_NONE;
267	int32 length = 0;
268	switch (*fCurrentChar) {
269		case '+':
270			type = TOKEN_PLUS;
271			length = 1;
272			break;
273
274		case '-':
275			 if (_Peek() == '>') {
276			 	type = TOKEN_MEMBER_PTR;
277			 	length = 2;
278			 } else {
279				type = TOKEN_MINUS;
280				length = 1;
281			 }
282			break;
283
284		case '*':
285			switch (_Peek()) {
286				case '/':
287					type = TOKEN_END_COMMENT_BLOCK;
288					length = 2;
289					break;
290				default:
291					type = TOKEN_STAR;
292					length = 1;
293					break;
294			}
295			break;
296
297		case '/':
298			switch (_Peek()) {
299				case '*':
300					type = TOKEN_BEGIN_COMMENT_BLOCK;
301					length = 2;
302					break;
303				case '/':
304					type = TOKEN_INLINE_COMMENT;
305					length = 2;
306					break;
307				default:
308					type = TOKEN_SLASH;
309					length = 1;
310					break;
311			}
312			break;
313
314		case '%':
315			type = TOKEN_MODULO;
316			length = 1;
317			break;
318
319		case '^':
320			type = TOKEN_BITWISE_XOR;
321			length = 1;
322			break;
323
324		case '&':
325			if (_Peek() == '&') {
326			 	type = TOKEN_LOGICAL_AND;
327			 	length = 2;
328			} else {
329				type = TOKEN_BITWISE_AND;
330				length = 1;
331			}
332			break;
333
334		case '|':
335			if (_Peek() == '|') {
336				type = TOKEN_LOGICAL_OR;
337				length = 2;
338			} else {
339				type = TOKEN_BITWISE_OR;
340				length = 1;
341			}
342			break;
343
344		case '!':
345			if (_Peek() == '=') {
346				type = TOKEN_NE;
347				length = 2;
348			} else {
349				type = TOKEN_LOGICAL_NOT;
350				length = 1;
351			}
352			break;
353
354		case '=':
355			if (_Peek() == '=') {
356				type = TOKEN_EQ;
357				length = 2;
358			} else {
359				type = TOKEN_ASSIGN;
360				length = 1;
361			}
362			break;
363
364		case '>':
365			if (_Peek() == '=') {
366				type = TOKEN_GE;
367				length = 2;
368			} else {
369				type = TOKEN_GT;
370				length = 1;
371			}
372			break;
373
374		case '<':
375			if (_Peek() == '=') {
376				type = TOKEN_LE;
377				length = 2;
378			} else {
379				type = TOKEN_LT;
380				length = 1;
381			}
382			break;
383
384		case '~':
385			type = TOKEN_BITWISE_NOT;
386			length = 1;
387			break;
388
389
390		case '?':
391			type = TOKEN_CONDITION;
392			length = 1;
393			break;
394
395		case '.':
396			type = TOKEN_MEMBER_PTR;
397			length = 1;
398			break;
399
400		default:
401			break;
402	}
403
404	if (length == 0)
405		return false;
406
407	fCurrentToken = Token(fCurrentChar, length, _CurrentPos(), type);
408	fCurrentChar += length;
409
410	return true;
411}
412
413
414void
415Tokenizer::RewindToken()
416{
417	fReuseToken = true;
418}
419
420
421char
422Tokenizer::_Peek() const
423{
424	if (_CurrentPos() < fString.Length())
425		return *(fCurrentChar + 1);
426
427	return '\0';
428}
429
430
431/*static*/ bool
432Tokenizer::_IsHexDigit(char c)
433{
434	return isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
435}
436
437
438Token&
439Tokenizer::_ParseHexOperand()
440{
441	const char* begin = fCurrentChar;
442	fCurrentChar += 2;
443		// skip "0x"
444
445	if (!_IsHexDigit(*fCurrentChar))
446		throw ParseException("expected hex digit", _CurrentPos());
447
448	fCurrentChar++;
449	while (_IsHexDigit(*fCurrentChar))
450		fCurrentChar++;
451
452	int32 length = fCurrentChar - begin;
453	fCurrentToken = Token(begin, length, _CurrentPos() - length,
454		TOKEN_CONSTANT);
455
456	if (length <= 10) {
457		// including the leading 0x, a 32-bit constant will be at most
458		// 10 characters. Anything larger, and 64 is necessary.
459		fCurrentToken.value.SetTo((uint32)strtoul(
460			fCurrentToken.string.String(), NULL, 16));
461	} else {
462		fCurrentToken.value.SetTo((uint64)strtoull(
463			fCurrentToken.string.String(), NULL, 16));
464	}
465	return fCurrentToken;
466}
467
468
469int32
470Tokenizer::_CurrentPos() const
471{
472	return fCurrentChar - fString.String();
473}
474