// lex.h -- Go frontend lexer.     -*- C++ -*-

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

7#ifndef GO_LEX_H
8#define GO_LEX_H
9
10#include <mpfr.h>
11
12#include "operator.h"
13#include "go-linemap.h"
14
15struct Unicode_range;
16
// The keywords.  KEYWORD_INVALID comes first (value 0) and means
// "not a keyword"; every other enumerator must stay in sorted
// (alphabetical) order.  The enumerators must match the
// Keywords::mapping_ array in lex.cc, so do not insert, remove or
// reorder entries here without updating that array.

enum Keyword
{
  KEYWORD_INVALID,      // Not a keyword.
  KEYWORD_ASM,
  KEYWORD_BREAK,
  KEYWORD_CASE,
  KEYWORD_CHAN,
  KEYWORD_CONST,
  KEYWORD_CONTINUE,
  KEYWORD_DEFAULT,
  KEYWORD_DEFER,
  KEYWORD_ELSE,
  KEYWORD_FALLTHROUGH,
  KEYWORD_FOR,
  KEYWORD_FUNC,
  KEYWORD_GO,
  KEYWORD_GOTO,
  KEYWORD_IF,
  KEYWORD_IMPORT,
  KEYWORD_INTERFACE,
  KEYWORD_MAP,
  KEYWORD_PACKAGE,
  KEYWORD_RANGE,
  KEYWORD_RETURN,
  KEYWORD_SELECT,
  KEYWORD_STRUCT,
  KEYWORD_SWITCH,
  KEYWORD_TYPE,
  KEYWORD_VAR
};

52// A token returned from the lexer.
53
54class Token
55{
56 public:
57  // Token classification.
58  enum Classification
59  {
60    // Token is invalid.
61    TOKEN_INVALID,
62    // Token indicates end of input.
63    TOKEN_EOF,
64    // Token is a keyword.
65    TOKEN_KEYWORD,
66    // Token is an identifier.
67    TOKEN_IDENTIFIER,
68    // Token is a string of characters.
69    TOKEN_STRING,
70    // Token is an operator.
71    TOKEN_OPERATOR,
72    // Token is a character constant.
73    TOKEN_CHARACTER,
74    // Token is an integer.
75    TOKEN_INTEGER,
76    // Token is a floating point number.
77    TOKEN_FLOAT,
78    // Token is an imaginary number.
79    TOKEN_IMAGINARY
80  };
81
82  ~Token();
83  Token(const Token&);
84  Token& operator=(const Token&);
85
86  // Get token classification.
87  Classification
88  classification() const
89  { return this->classification_; }
90
91  // Make a token for an invalid value.
92  static Token
93  make_invalid_token(Location location)
94  { return Token(TOKEN_INVALID, location); }
95
96  // Make a token representing end of file.
97  static Token
98  make_eof_token(Location location)
99  { return Token(TOKEN_EOF, location); }
100
101  // Make a keyword token.
102  static Token
103  make_keyword_token(Keyword keyword, Location location)
104  {
105    Token tok(TOKEN_KEYWORD, location);
106    tok.u_.keyword = keyword;
107    return tok;
108  }
109
110  // Make an identifier token.
111  static Token
112  make_identifier_token(const std::string& value, bool is_exported,
113			Location location)
114  {
115    Token tok(TOKEN_IDENTIFIER, location);
116    tok.u_.identifier_value.name = new std::string(value);
117    tok.u_.identifier_value.is_exported = is_exported;
118    return tok;
119  }
120
121  // Make a quoted string token.
122  static Token
123  make_string_token(const std::string& value, Location location)
124  {
125    Token tok(TOKEN_STRING, location);
126    tok.u_.string_value = new std::string(value);
127    return tok;
128  }
129
130  // Make an operator token.
131  static Token
132  make_operator_token(Operator op, Location location)
133  {
134    Token tok(TOKEN_OPERATOR, location);
135    tok.u_.op = op;
136    return tok;
137  }
138
139  // Make a character constant token.
140  static Token
141  make_character_token(mpz_t val, Location location)
142  {
143    Token tok(TOKEN_CHARACTER, location);
144    mpz_init(tok.u_.integer_value);
145    mpz_swap(tok.u_.integer_value, val);
146    return tok;
147  }
148
149  // Make an integer token.
150  static Token
151  make_integer_token(mpz_t val, Location location)
152  {
153    Token tok(TOKEN_INTEGER, location);
154    mpz_init(tok.u_.integer_value);
155    mpz_swap(tok.u_.integer_value, val);
156    return tok;
157  }
158
159  // Make a float token.
160  static Token
161  make_float_token(mpfr_t val, Location location)
162  {
163    Token tok(TOKEN_FLOAT, location);
164    mpfr_init(tok.u_.float_value);
165    mpfr_swap(tok.u_.float_value, val);
166    return tok;
167  }
168
169  // Make a token for an imaginary number.
170  static Token
171  make_imaginary_token(mpfr_t val, Location location)
172  {
173    Token tok(TOKEN_IMAGINARY, location);
174    mpfr_init(tok.u_.float_value);
175    mpfr_swap(tok.u_.float_value, val);
176    return tok;
177  }
178
179  // Get the location of the token.
180  Location
181  location() const
182  { return this->location_; }
183
184  // Return whether this is an invalid token.
185  bool
186  is_invalid() const
187  { return this->classification_ == TOKEN_INVALID; }
188
189  // Return whether this is the EOF token.
190  bool
191  is_eof() const
192  { return this->classification_ == TOKEN_EOF; }
193
194  // Return the keyword value for a keyword token.
195  Keyword
196  keyword() const
197  {
198    go_assert(this->classification_ == TOKEN_KEYWORD);
199    return this->u_.keyword;
200  }
201
202  // Return whether this is an identifier.
203  bool
204  is_identifier() const
205  { return this->classification_ == TOKEN_IDENTIFIER; }
206
207  // Return the identifier.
208  const std::string&
209  identifier() const
210  {
211    go_assert(this->classification_ == TOKEN_IDENTIFIER);
212    return *this->u_.identifier_value.name;
213  }
214
215  // Return whether the identifier is exported.
216  bool
217  is_identifier_exported() const
218  {
219    go_assert(this->classification_ == TOKEN_IDENTIFIER);
220    return this->u_.identifier_value.is_exported;
221  }
222
223  // Return whether this is a string.
224  bool
225  is_string() const
226  {
227    return this->classification_ == TOKEN_STRING;
228  }
229
230  // Return the value of a string.  The returned value is a string of
231  // UTF-8 characters.
232  std::string
233  string_value() const
234  {
235    go_assert(this->classification_ == TOKEN_STRING);
236    return *this->u_.string_value;
237  }
238
239  // Return the value of a character constant.
240  const mpz_t*
241  character_value() const
242  {
243    go_assert(this->classification_ == TOKEN_CHARACTER);
244    return &this->u_.integer_value;
245  }
246
247  // Return the value of an integer.
248  const mpz_t*
249  integer_value() const
250  {
251    go_assert(this->classification_ == TOKEN_INTEGER);
252    return &this->u_.integer_value;
253  }
254
255  // Return the value of a float.
256  const mpfr_t*
257  float_value() const
258  {
259    go_assert(this->classification_ == TOKEN_FLOAT);
260    return &this->u_.float_value;
261  }
262
263  // Return the value of an imaginary number.
264  const mpfr_t*
265  imaginary_value() const
266  {
267    go_assert(this->classification_ == TOKEN_IMAGINARY);
268    return &this->u_.float_value;
269  }
270
271  // Return the operator value for an operator token.
272  Operator
273  op() const
274  {
275    go_assert(this->classification_ == TOKEN_OPERATOR);
276    return this->u_.op;
277  }
278
279  // Return whether this token is KEYWORD.
280  bool
281  is_keyword(Keyword keyword) const
282  {
283    return (this->classification_ == TOKEN_KEYWORD
284	    && this->u_.keyword == keyword);
285  }
286
287  // Return whether this token is OP.
288  bool
289  is_op(Operator op) const
290  { return this->classification_ == TOKEN_OPERATOR && this->u_.op == op; }
291
292  // Print the token for debugging.
293  void
294  print(FILE*) const;
295
296 private:
297  // Private constructor used by make_..._token functions above.
298  Token(Classification, Location);
299
300  // Clear the token.
301  void
302  clear();
303
304  // The token classification.
305  Classification classification_;
306  union
307  {
308    // The keyword value for TOKEN_KEYWORD.
309    Keyword keyword;
310    // The token value for TOKEN_IDENTIFIER.
311    struct
312    {
313      // The name of the identifier.  This has been mangled to only
314      // include ASCII characters.
315      std::string* name;
316      // Whether this name should be exported.  This is true if the
317      // first letter in the name is upper case.
318      bool is_exported;
319    } identifier_value;
320    // The string value for TOKEN_STRING.
321    std::string* string_value;
322    // The token value for TOKEN_CHARACTER or TOKEN_INTEGER.
323    mpz_t integer_value;
324    // The token value for TOKEN_FLOAT or TOKEN_IMAGINARY.
325    mpfr_t float_value;
326    // The token value for TOKEN_OPERATOR or the keyword value
327    Operator op;
328  } u_;
329  // The source location.
330  Location location_;
331};
332
333// The lexer itself.
334
335class Lex
336{
337 public:
338  Lex(const char* input_file_name, FILE* input_file, Linemap *linemap);
339
340  ~Lex();
341
342  // Return the next token.
343  Token
344  next_token();
345
346  // Return the contents of any current //extern comment.
347  const std::string&
348  extern_name() const
349  { return this->extern_; }
350
351  // Return whether we have seen a //go:nointerface comment, clearing
352  // the flag.
353  bool
354  get_and_clear_nointerface()
355  {
356    bool ret = this->saw_nointerface_;
357    this->saw_nointerface_ = false;
358    return ret;
359  }
360
361  // Return whether the identifier NAME should be exported.  NAME is a
362  // mangled name which includes only ASCII characters.
363  static bool
364  is_exported_name(const std::string& name);
365
366  // Return whether the identifier NAME is invalid.  When we see an
367  // invalid character we still build an identifier, but we use a
368  // magic string to indicate that the identifier is invalid.  We then
369  // use this to avoid knockon errors.
370  static bool
371  is_invalid_identifier(const std::string& name);
372
373  // A helper function.  Append V to STR.  IS_CHARACTER is true if V
374  // is a Unicode character which should be converted into UTF-8,
375  // false if it is a byte value to be appended directly.  The
376  // location is used to warn about an out of range character.
377  static void
378  append_char(unsigned int v, bool is_charater, std::string* str,
379	      Location);
380
381  // A helper function.  Fetch a UTF-8 character from STR and store it
382  // in *VALUE.  Return the number of bytes read from STR.  Return 0
383  // if STR does not point to a valid UTF-8 character.
384  static int
385  fetch_char(const char* str, unsigned int *value);
386
387  // Return whether C is a Unicode or "C" locale space character.
388  static bool
389  is_unicode_space(unsigned int c);
390
391 private:
392  ssize_t
393  get_line();
394
395  bool
396  require_line();
397
398  // The current location.
399  Location
400  location() const;
401
402  // A position CHARS column positions before the current location.
403  Location
404  earlier_location(int chars) const;
405
406  static bool
407  is_hex_digit(char);
408
409  static unsigned char
410  octal_value(char c)
411  { return c - '0'; }
412
413  Token
414  make_invalid_token()
415  { return Token::make_invalid_token(this->location()); }
416
417  Token
418  make_eof_token()
419  { return Token::make_eof_token(this->location()); }
420
421  Token
422  make_operator(Operator op, int chars)
423  { return Token::make_operator_token(op, this->earlier_location(chars)); }
424
425  Token
426  gather_identifier();
427
428  static bool
429  could_be_exponent(const char*, const char*);
430
431  Token
432  gather_number();
433
434  Token
435  gather_character();
436
437  Token
438  gather_string();
439
440  Token
441  gather_raw_string();
442
443  const char*
444  advance_one_utf8_char(const char*, unsigned int*, bool*);
445
446  const char*
447  advance_one_char(const char*, bool, unsigned int*, bool*);
448
449  static bool
450  is_unicode_digit(unsigned int c);
451
452  static bool
453  is_unicode_letter(unsigned int c);
454
455  static bool
456  is_unicode_uppercase(unsigned int c);
457
458  static bool
459  is_in_unicode_range(unsigned int C, const Unicode_range* ranges,
460		      size_t range_size);
461
462  Operator
463  three_character_operator(char, char, char);
464
465  Operator
466  two_character_operator(char, char);
467
468  Operator
469  one_character_operator(char);
470
471  bool
472  skip_c_comment();
473
474  void
475  skip_cpp_comment();
476
477  // The input file name.
478  const char* input_file_name_;
479  // The input file.
480  FILE* input_file_;
481  // The object used to keep track of file names and line numbers.
482  Linemap* linemap_;
483  // The line buffer.  This holds the current line.
484  char* linebuf_;
485  // The size of the line buffer.
486  size_t linebufsize_;
487  // The nmber of characters in the current line.
488  size_t linesize_;
489  // The current offset in linebuf_.
490  size_t lineoff_;
491  // The current line number.
492  size_t lineno_;
493  // Whether to add a semicolon if we see a newline now.
494  bool add_semi_at_eol_;
495  // Whether we just saw a magic go:nointerface comment.
496  bool saw_nointerface_;
497  // The external name to use for a function declaration, from a magic
498  // //extern comment.
499  std::string extern_;
500};
501
502#endif // !defined(GO_LEX_H)
503