// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "fidl/lexer.h"

#include <assert.h>
#include <ctype.h>

namespace fidl {

namespace {

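// Character-class predicates shared by the Lex* routines below.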
bool IsIdentifierBody(char c) {
    // Cast to unsigned char so bytes outside the ASCII range do not invoke
    // undefined behavior in isalnum().
    return isalnum(static_cast<unsigned char>(c)) || c == '_';
}

// IsIdentifierValid disallows identifiers (escaped and unescaped) from being
// empty or from starting or ending with an underscore.
bool IsIdentifierValid(StringView source_data) {
    if (source_data.size() == 0)
        return false;
    return source_data[0] != '_' && source_data[source_data.size() - 1] != '_';
}

bool IsNumericLiteralBody(char c) {
    switch (c) {
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
    case 'a':
    case 'A':
    case 'b':
    case 'B':
    case 'c':
    case 'C':
    case 'd':
    case 'D':
    case 'e':
    case 'E':
    case 'f':
    case 'F':
    case 'x':
    case 'X':
    case '-':
    case '_':
    case '.':
        return true;
    default:
        return false;
    }
}

} // namespace

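// The lexer keeps three cursors into the source buffer: |token_start_| marks
// the first character of the token being accumulated, |current_| is the read
// position, and |token_size_| counts the characters consumed so far.
// |previous_end_| remembers where the previous non-comment token ended.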
constexpr char Lexer::Peek() const {
    return *current_;
}

void Lexer::Skip() {
    ++current_;
    ++token_start_;
}

char Lexer::Consume() {
    auto current = *current_;
    ++current_;
    ++token_size_;
    return current;
}

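// Returns the accumulated token text and restarts accumulation at |current_|.
// |previous_end_| is not advanced for comments, so the gap reported for the
// next token still spans any interleaved comment text.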
StringView Lexer::Reset(Token::Kind kind) {
    auto data = StringView(token_start_, token_size_);
    if (kind != Token::Kind::kComment) {
        previous_end_ = token_start_ + token_size_;
    }
    token_start_ = current_;
    token_size_ = 0u;
    return data;
}

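// Builds a Token from two SourceLocations: one covering the gap between the
// end of the previous token and the start of this one, and one covering the
// token text itself.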
Token Lexer::Finish(Token::Kind kind) {
    StringView previous(previous_end_, token_start_ - previous_end_);
    SourceLocation previous_location(previous, source_file_);
    return Token(previous_location,
                 SourceLocation(Reset(kind), source_file_), kind);
}

Token Lexer::LexEndOfStream() {
    return Finish(Token::Kind::kEndOfFile);
}

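// Consumes the longest run of numeric-literal characters. The character set in
// IsNumericLiteralBody() is deliberately permissive (hex digits, 'x'/'X',
// exponent letters, '-', '_', '.'), so malformed literals such as 1.2.3 are
// still lexed as a single kNumericLiteral token, presumably to be rejected by
// a later stage.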
Token Lexer::LexNumericLiteral() {
    while (IsNumericLiteralBody(Peek()))
        Consume();
    return Finish(Token::Kind::kNumericLiteral);
}

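// Lexes an unescaped identifier. Identifiers that start or end with an
// underscore are rejected as kNotAToken; valid ones are handed to the
// identifier table, presumably where keywords are distinguished from plain
// identifiers.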
Token Lexer::LexIdentifier() {
    while (IsIdentifierBody(Peek()))
        Consume();
    StringView previous(previous_end_, token_start_ - previous_end_);
    SourceLocation previous_end(previous, source_file_);
    StringView identifier_data = Reset(Token::Kind::kNotAToken);
    if (!IsIdentifierValid(identifier_data))
        return Finish(Token::Kind::kNotAToken);
    return identifier_table_->MakeIdentifier(
        previous_end, identifier_data, source_file_, /* escaped */ false);
}

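// Lexes an identifier escaped with a leading '@' (already consumed by Lex()).
// The first Reset() below is passed kComment so that |previous_end_| is not
// advanced, dropping the '@' from the identifier text without disturbing the
// preceding-gap bookkeeping.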
Token Lexer::LexEscapedIdentifier() {
    // Reset() to drop the initial @ from the identifier.
    Reset(Token::Kind::kComment);

    while (IsIdentifierBody(Peek()))
        Consume();
    StringView previous(previous_end_, token_start_ - previous_end_);
    SourceLocation previous_end(previous, source_file_);
    StringView identifier_data = Reset(Token::Kind::kNotAToken);
    if (!IsIdentifierValid(identifier_data))
        return Finish(Token::Kind::kNotAToken);
    return identifier_table_->MakeIdentifier(
        previous_end, identifier_data, source_file_, /* escaped */ true);
}

Token Lexer::LexStringLiteral() {
    // Lex the string literal up to the next unescaped closing quote. A
    // backslash escapes the character that follows it, so an input such as
    // "\\" terminates correctly.
    for (;;) {
        switch (Consume()) {
        case 0:
            return Finish(Token::Kind::kNotAToken);
        case '"':
            return Finish(Token::Kind::kStringLiteral);
        case '\\':
            // Consume the escaped character so an escaped quote does not end
            // the literal, unless the input ends here.
            if (Peek() == 0)
                return Finish(Token::Kind::kNotAToken);
            Consume();
            break;
        default:
            break;
        }
    }
}

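// Lexes the remainder of a //-style comment; the first '/' was consumed by
// Lex(). Two slashes produce an ordinary kComment, exactly three produce a
// kDocComment, and four or more (section-break banners) fall back to kComment.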
Token Lexer::LexCommentOrDocComment() {
    // Consume the second /.
    assert(Peek() == '/');
    Consume();

    // Check whether this is a doc comment.
    auto comment_type = Token::Kind::kComment;
    if (Peek() == '/') {
        comment_type = Token::Kind::kDocComment;
        Consume();
        // Anything with more than three slashes is likely a section
        // break comment, not a doc comment.
        if (Peek() == '/') {
            comment_type = Token::Kind::kComment;
        }
    }

    // Lexing a C++-style // comment. Go to the end of the line or
    // file.
    for (;;) {
        switch (Peek()) {
        case 0:
        case '\n':
            return Finish(comment_type);
        default:
            Consume();
            continue;
        }
    }
}

void Lexer::SkipWhitespace() {
    for (;;) {
        switch (Peek()) {
        case ' ':
        case '\n':
        case '\r':
        case '\t':
            Skip();
            continue;
        default:
            return;
        }
    }
}

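// Like Lex(), but discards ordinary kComment tokens. Doc comments are still
// returned to the caller.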
Token Lexer::LexNoComments() {
    for (;;) {
        auto token = Lex();
        if (token.kind() == Token::Kind::kComment)
            continue;
        return token;
    }
}

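// Lexes and returns the next token, skipping leading whitespace. For example,
// given the input "->;", the first call returns a kArrow token and the second
// a kSemicolon token.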
Token Lexer::Lex() {
    SkipWhitespace();

    switch (Consume()) {
    case 0:
        return LexEndOfStream();

    case ' ':
    case '\n':
    case '\r':
    case '\t':
        assert(false && "Should have been handled by SkipWhitespace!");

    case '-':
        // Maybe the start of an arrow.
        if (Peek() == '>') {
            Consume();
            return Finish(Token::Kind::kArrow);
        }
    // Fallthrough
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
        return LexNumericLiteral();

    case 'a':
    case 'A':
    case 'b':
    case 'B':
    case 'c':
    case 'C':
    case 'd':
    case 'D':
    case 'e':
    case 'E':
    case 'f':
    case 'F':
    case 'g':
    case 'G':
    case 'h':
    case 'H':
    case 'i':
    case 'I':
    case 'j':
    case 'J':
    case 'k':
    case 'K':
    case 'l':
    case 'L':
    case 'm':
    case 'M':
    case 'n':
    case 'N':
    case 'o':
    case 'O':
    case 'p':
    case 'P':
    case 'q':
    case 'Q':
    case 'r':
    case 'R':
    case 's':
    case 'S':
    case 't':
    case 'T':
    case 'u':
    case 'U':
    case 'v':
    case 'V':
    case 'w':
    case 'W':
    case 'x':
    case 'X':
    case 'y':
    case 'Y':
    case 'z':
    case 'Z':
        return LexIdentifier();

    case '@':
        return LexEscapedIdentifier();

    case '"':
        return LexStringLiteral();

    case '/':
        // Maybe the start of a comment.
        switch (Peek()) {
        case '/':
            return LexCommentOrDocComment();
        default:
            return Finish(Token::Kind::kNotAToken);
        }

    case '(':
        return Finish(Token::Kind::kLeftParen);
    case ')':
        return Finish(Token::Kind::kRightParen);
    case '[':
        return Finish(Token::Kind::kLeftSquare);
    case ']':
        return Finish(Token::Kind::kRightSquare);
    case '{':
        return Finish(Token::Kind::kLeftCurly);
    case '}':
        return Finish(Token::Kind::kRightCurly);
    case '<':
        return Finish(Token::Kind::kLeftAngle);
    case '>':
        return Finish(Token::Kind::kRightAngle);

    case '.':
        return Finish(Token::Kind::kDot);
    case ',':
        return Finish(Token::Kind::kComma);
    case ';':
        return Finish(Token::Kind::kSemicolon);
    case ':':
        return Finish(Token::Kind::kColon);
    case '?':
        return Finish(Token::Kind::kQuestion);
    case '=':
        return Finish(Token::Kind::kEqual);
    case '&':
        return Finish(Token::Kind::kAmpersand);

    default:
        return Finish(Token::Kind::kNotAToken);
    }
}

} // namespace fidl