1/*
2** $Id: llex.c,v 2.96.1.1 2017/04/19 17:20:42 roberto Exp $
3** Lexical Analyzer
4** See Copyright Notice in lua.h
5*/
6
7#define llex_c
8#define LUA_CORE
9
10#include "lprefix.h"
11
12
13#include <locale.h>
14#include <string.h>
15
16#include "lua.h"
17
18#include "lctype.h"
19#include "ldebug.h"
20#include "ldo.h"
21#include "lgc.h"
22#include "llex.h"
23#include "lobject.h"
24#include "lparser.h"
25#include "lstate.h"
26#include "lstring.h"
27#include "ltable.h"
28#include "lzio.h"
29
30
31
/*
** Advance the lexer by one character: fetch the next character of the
** input stream 'ls->z' with 'zgetc' and store it in 'ls->current'.
** The expression yields the character just stored.
** The argument is parenthesized so any pointer expression may be
** passed; note that 'ls' is evaluated twice.
*/
#define next(ls)	((ls)->current = zgetc((ls)->z))
33
34
35
/*
** True when the current character is a line-break character
** ('\n' or '\r'). The argument is parenthesized so any pointer
** expression may be passed; note that 'ls' may be evaluated twice.
*/
#define currIsNewline(ls) \
	((ls)->current == '\n' || (ls)->current == '\r')
37
38
/*
** Printable text of every multi-character token, indexed by
** 'token - FIRST_RESERVED' (see 'luaX_token2str'). The first
** NUM_RESERVED entries are the reserved words interned by 'luaX_init'.
** ORDER RESERVED
*/
static const char *const luaX_tokens [] = {
    /* reserved words */
    "and", "break", "do", "else", "elseif", "end",
    "false", "for", "function", "goto", "if", "in",
    "local", "nil", "not", "or", "repeat", "return",
    "then", "true", "until", "while",
    /* multi-character operators and end-of-stream marker */
    "//", "..", "...",
    "==", ">=", "<=", "~=",
    "<<", ">>", "::",
    "<eof>",
    /* tokens that carry semantic information */
    "<number>", "<integer>", "<name>", "<string>"
};
49
50
/*
** Append the current character to the token buffer (via 'save') and
** then advance to the next input character (via 'next'). The value of
** the expression is the value of 'next'. 'ls' is evaluated more than
** once, so it must be free of side effects.
*/
#define save_and_next(ls)	(save((ls), (ls)->current), next(ls))
52
53
/*
** Forward declaration: 'save' (defined next) reports buffer overflow
** through 'lexerror', which is defined further down in this file.
*/
static l_noret lexerror (LexState *ls, const char *msg, int token);
55
56
57static void save (LexState *ls, int c) {
58  Mbuffer *b = ls->buff;
59  if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
60    size_t newsize;
61    if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
62      lexerror(ls, "lexical element too long", 0);
63    newsize = luaZ_sizebuffer(b) * 2;
64    luaZ_resizebuffer(ls->L, b, newsize);
65  }
66  b->buffer[luaZ_bufflen(b)++] = cast(char, c);
67}
68
69
70void luaX_init (lua_State *L) {
71  int i;
72  TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
73  luaC_fix(L, obj2gco(e));  /* never collect this name */
74  for (i=0; i<NUM_RESERVED; i++) {
75    TString *ts = luaS_new(L, luaX_tokens[i]);
76    luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
77    ts->extra = cast_byte(i+1);  /* reserved word */
78  }
79}
80
81
82const char *luaX_token2str (LexState *ls, int token) {
83  if (token < FIRST_RESERVED) {  /* single-byte symbols? */
84    lua_assert(token == cast_uchar(token));
85    return luaO_pushfstring(ls->L, "'%c'", token);
86  }
87  else {
88    const char *s = luaX_tokens[token - FIRST_RESERVED];
89    if (token < TK_EOS)  /* fixed format (symbols and reserved words)? */
90      return luaO_pushfstring(ls->L, "'%s'", s);
91    else  /* names, strings, and numerals */
92      return s;
93  }
94}
95
96
97static const char *txtToken (LexState *ls, int token) {
98  switch (token) {
99    case TK_NAME: case TK_STRING:
100    case TK_FLT: case TK_INT:
101      save(ls, '\0');
102      return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
103    default:
104      return luaX_token2str(ls, token);
105  }
106}
107
108
109static l_noret lexerror (LexState *ls, const char *msg, int token) {
110  msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
111  if (token)
112    luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
113  luaD_throw(ls->L, LUA_ERRSYNTAX);
114}
115
116
/*
** Raise a syntax error with message 'msg', reported near the current
** token ('ls->t.token'). Does not return (see 'lexerror').
*/
l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
  lexerror(ls, msg, ls->t.token);
}
120
121
122/*
123** creates a new string and anchors it in scanner's table so that
124** it will not be collected until the end of the compilation
125** (by that time it should be anchored somewhere)
126*/
127TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
128  lua_State *L = ls->L;
129  TValue *o;  /* entry for 'str' */
130  TString *ts = luaS_newlstr(L, str, l);  /* create new string */
131  setsvalue2s(L, L->top++, ts);  /* temporarily anchor it in stack */
132  o = luaH_set(L, ls->h, L->top - 1);
133  if (ttisnil(o)) {  /* not in use yet? */
134    /* boolean value does not need GC barrier;
135       table has no metatable, so it does not need to invalidate cache */
136    setbvalue(o, 1);  /* t[string] = true */
137    luaC_checkGC(L);
138  }
139  else {  /* string already present */
140    ts = tsvalue(keyfromval(o));  /* re-use value previously stored */
141  }
142  L->top--;  /* remove string from stack */
143  return ts;
144}
145
146
147/*
148** increment line number and skips newline sequence (any of
149** \n, \r, \n\r, or \r\n)
150*/
151static void inclinenumber (LexState *ls) {
152  int old = ls->current;
153  lua_assert(currIsNewline(ls));
154  next(ls);  /* skip '\n' or '\r' */
155  if (currIsNewline(ls) && ls->current != old)
156    next(ls);  /* skip '\n\r' or '\r\n' */
157  if (++ls->linenumber >= MAX_INT)
158    lexerror(ls, "chunk has too many lines", 0);
159}
160
161
162void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
163                    int firstchar) {
164  ls->t.token = 0;
165  ls->L = L;
166  ls->current = firstchar;
167  ls->lookahead.token = TK_EOS;  /* no look-ahead token */
168  ls->z = z;
169  ls->fs = NULL;
170  ls->linenumber = 1;
171  ls->lastline = 1;
172  ls->source = source;
173  ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
174  luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);  /* initialize buffer */
175}
176
177
178
179/*
180** =======================================================
181** LEXICAL ANALYZER
182** =======================================================
183*/
184
185
186static int check_next1 (LexState *ls, int c) {
187  if (ls->current == c) {
188    next(ls);
189    return 1;
190  }
191  else return 0;
192}
193
194
195/*
196** Check whether current char is in set 'set' (with two chars) and
197** saves it
198*/
199static int check_next2 (LexState *ls, const char *set) {
200  lua_assert(set[2] == '\0');
201  if (ls->current == set[0] || ls->current == set[1]) {
202    save_and_next(ls);
203    return 1;
204  }
205  else return 0;
206}
207
208
209/* LUA_NUMBER */
210/*
211** this function is quite liberal in what it accepts, as 'luaO_str2num'
212** will reject ill-formed numerals.
213*/
214static int read_numeral (LexState *ls, SemInfo *seminfo) {
215  TValue obj;
216  const char *expo = "Ee";
217  int first = ls->current;
218  lua_assert(lisdigit(ls->current));
219  save_and_next(ls);
220  if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
221    expo = "Pp";
222  for (;;) {
223    if (check_next2(ls, expo))  /* exponent part? */
224      check_next2(ls, "-+");  /* optional exponent sign */
225    if (lisxdigit(ls->current))
226      save_and_next(ls);
227    else if (ls->current == '.')
228      save_and_next(ls);
229    else break;
230  }
231  save(ls, '\0');
232  if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)  /* format error? */
233    lexerror(ls, "malformed number", TK_FLT);
234  if (ttisinteger(&obj)) {
235    seminfo->i = ivalue(&obj);
236    return TK_INT;
237  }
238  else {
239    lua_assert(ttisfloat(&obj));
240    seminfo->r = fltvalue(&obj);
241    return TK_FLT;
242  }
243}
244
245
246/*
247** reads a sequence '[=*[' or ']=*]', leaving the last bracket.
248** If sequence is well formed, return its number of '='s + 2; otherwise,
249** return 1 if there is no '='s or 0 otherwise (an unfinished '[==...').
250*/
251static size_t skip_sep (LexState *ls) {
252  size_t count = 0;
253  int s = ls->current;
254  lua_assert(s == '[' || s == ']');
255  save_and_next(ls);
256  while (ls->current == '=') {
257    save_and_next(ls);
258    count++;
259  }
260  return (ls->current == s) ? count + 2
261         : (count == 0) ? 1
262         : 0;
263
264}
265
266
267static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) {
268  int line = ls->linenumber;  /* initial line (for error message) */
269  save_and_next(ls);  /* skip 2nd '[' */
270  if (currIsNewline(ls))  /* string starts with a newline? */
271    inclinenumber(ls);  /* skip it */
272  for (;;) {
273    switch (ls->current) {
274      case EOZ: {  /* error */
275        const char *what = (seminfo ? "string" : "comment");
276        const char *msg = luaO_pushfstring(ls->L,
277                     "unfinished long %s (starting at line %d)", what, line);
278        lexerror(ls, msg, TK_EOS);
279        break;  /* to avoid warnings */
280      }
281      case ']': {
282        if (skip_sep(ls) == sep) {
283          save_and_next(ls);  /* skip 2nd ']' */
284          goto endloop;
285        }
286        break;
287      }
288      case '\n': case '\r': {
289        save(ls, '\n');
290        inclinenumber(ls);
291        if (!seminfo) luaZ_resetbuffer(ls->buff);  /* avoid wasting space */
292        break;
293      }
294      default: {
295        if (seminfo) save_and_next(ls);
296        else next(ls);
297      }
298    }
299  } endloop:
300  if (seminfo)
301    seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
302                                     luaZ_bufflen(ls->buff) - 2 * sep);
303}
304
305
306static void esccheck (LexState *ls, int c, const char *msg) {
307  if (!c) {
308    if (ls->current != EOZ)
309      save_and_next(ls);  /* add current to buffer for error message */
310    lexerror(ls, msg, TK_STRING);
311  }
312}
313
314
315static int gethexa (LexState *ls) {
316  save_and_next(ls);
317  esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
318  return luaO_hexavalue(ls->current);
319}
320
321
322static int readhexaesc (LexState *ls) {
323  int r = gethexa(ls);
324  r = (r << 4) + gethexa(ls);
325  luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
326  return r;
327}
328
329
330static unsigned long readutf8esc (LexState *ls) {
331  unsigned long r;
332  int i = 4;  /* chars to be removed: '\', 'u', '{', and first digit */
333  save_and_next(ls);  /* skip 'u' */
334  esccheck(ls, ls->current == '{', "missing '{'");
335  r = gethexa(ls);  /* must have at least one digit */
336  while ((save_and_next(ls), lisxdigit(ls->current))) {
337    i++;
338    r = (r << 4) + luaO_hexavalue(ls->current);
339    esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large");
340  }
341  esccheck(ls, ls->current == '}', "missing '}'");
342  next(ls);  /* skip '}' */
343  luaZ_buffremove(ls->buff, i);  /* remove saved chars from buffer */
344  return r;
345}
346
347
348static void utf8esc (LexState *ls) {
349  char buff[UTF8BUFFSZ];
350  int n = luaO_utf8esc(buff, readutf8esc(ls));
351  for (; n > 0; n--)  /* add 'buff' to string */
352    save(ls, buff[UTF8BUFFSZ - n]);
353}
354
355
356static int readdecesc (LexState *ls) {
357  int i;
358  int r = 0;  /* result accumulator */
359  for (i = 0; i < 3 && lisdigit(ls->current); i++) {  /* read up to 3 digits */
360    r = 10*r + ls->current - '0';
361    save_and_next(ls);
362  }
363  esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
364  luaZ_buffremove(ls->buff, i);  /* remove read digits from buffer */
365  return r;
366}
367
368
/*
** Read a short literal string delimited by 'del' (the current char on
** entry) and store its content, without the delimiters, in
** 'seminfo->ts'. Escape sequences are translated while reading.
** End of input or a raw line break inside the string raises an
** "unfinished string" error.
**
** While an escape is being read, its source characters are kept in the
** buffer so that error messages can show them; they are removed once
** the escape has been decoded. The three labels after the inner switch
** form a fall-through chain:
**   read_save: skip the current char, then
**   only_save: drop the saved '\\' and save the decoded char 'c', then
**   no_save:   leave the escape handling.
*/
static void read_string (LexState *ls, int del, SemInfo *seminfo) {
  save_and_next(ls);  /* keep delimiter (for error messages) */
  while (ls->current != del) {
    switch (ls->current) {
      case EOZ:
        lexerror(ls, "unfinished string", TK_EOS);
        break;  /* to avoid warnings */
      case '\n':
      case '\r':
        lexerror(ls, "unfinished string", TK_STRING);
        break;  /* to avoid warnings */
      case '\\': {  /* escape sequences */
        int c;  /* final character to be saved */
        save_and_next(ls);  /* keep '\\' for error messages */
        switch (ls->current) {
          /* single-letter escapes: current char still has to be skipped */
          case 'a': c = '\a'; goto read_save;
          case 'b': c = '\b'; goto read_save;
          case 'f': c = '\f'; goto read_save;
          case 'n': c = '\n'; goto read_save;
          case 'r': c = '\r'; goto read_save;
          case 't': c = '\t'; goto read_save;
          case 'v': c = '\v'; goto read_save;
          /* '\xXX': second hex digit is still current, so skip it */
          case 'x': c = readhexaesc(ls); goto read_save;
          /* '\u{XXX}': 'utf8esc' saves the encoded bytes itself */
          case 'u': utf8esc(ls);  goto no_save;
          /* backslash followed by a line break: stored as '\n' */
          case '\n': case '\r':
            inclinenumber(ls); c = '\n'; goto only_save;
          case '\\': case '\"': case '\'':
            c = ls->current; goto read_save;
          case EOZ: goto no_save;  /* will raise an error next loop */
          case 'z': {  /* zap following span of spaces */
            luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
            next(ls);  /* skip the 'z' */
            while (lisspace(ls->current)) {
              if (currIsNewline(ls)) inclinenumber(ls);
              else next(ls);
            }
            goto no_save;
          }
          default: {
            esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
            c = readdecesc(ls);  /* digital escape '\ddd' */
            goto only_save;  /* digits were already consumed */
          }
        }
       read_save:
         next(ls);
         /* go through */
       only_save:
         luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
         save(ls, c);
         /* go through */
       no_save: break;
      }
      default:
        save_and_next(ls);
    }
  }
  save_and_next(ls);  /* skip delimiter */
  /* buffer holds both delimiters: skip the first, drop two from length */
  seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
                                   luaZ_bufflen(ls->buff) - 2);
}
430
431
432static int llex (LexState *ls, SemInfo *seminfo) {
433  luaZ_resetbuffer(ls->buff);
434  for (;;) {
435    switch (ls->current) {
436      case '\n': case '\r': {  /* line breaks */
437        inclinenumber(ls);
438        break;
439      }
440      case ' ': case '\f': case '\t': case '\v': {  /* spaces */
441        next(ls);
442        break;
443      }
444      case '-': {  /* '-' or '--' (comment) */
445        next(ls);
446        if (ls->current != '-') return '-';
447        /* else is a comment */
448        next(ls);
449        if (ls->current == '[') {  /* long comment? */
450          size_t sep = skip_sep(ls);
451          luaZ_resetbuffer(ls->buff);  /* 'skip_sep' may dirty the buffer */
452          if (sep >= 2) {
453            read_long_string(ls, NULL, sep);  /* skip long comment */
454            luaZ_resetbuffer(ls->buff);  /* previous call may dirty the buff. */
455            break;
456          }
457        }
458        /* else short comment */
459        while (!currIsNewline(ls) && ls->current != EOZ)
460          next(ls);  /* skip until end of line (or end of file) */
461        break;
462      }
463      case '[': {  /* long string or simply '[' */
464        size_t sep = skip_sep(ls);
465        if (sep >= 2) {
466          read_long_string(ls, seminfo, sep);
467          return TK_STRING;
468        }
469        else if (sep == 0)  /* '[=...' missing second bracket */
470          lexerror(ls, "invalid long string delimiter", TK_STRING);
471        return '[';
472      }
473      case '=': {
474        next(ls);
475        if (check_next1(ls, '=')) return TK_EQ;
476        else return '=';
477      }
478      case '<': {
479        next(ls);
480        if (check_next1(ls, '=')) return TK_LE;
481        else if (check_next1(ls, '<')) return TK_SHL;
482        else return '<';
483      }
484      case '>': {
485        next(ls);
486        if (check_next1(ls, '=')) return TK_GE;
487        else if (check_next1(ls, '>')) return TK_SHR;
488        else return '>';
489      }
490      case '/': {
491        next(ls);
492        if (check_next1(ls, '/')) return TK_IDIV;
493        else return '/';
494      }
495      case '~': {
496        next(ls);
497        if (check_next1(ls, '=')) return TK_NE;
498        else return '~';
499      }
500      case ':': {
501        next(ls);
502        if (check_next1(ls, ':')) return TK_DBCOLON;
503        else return ':';
504      }
505      case '"': case '\'': {  /* short literal strings */
506        read_string(ls, ls->current, seminfo);
507        return TK_STRING;
508      }
509      case '.': {  /* '.', '..', '...', or number */
510        save_and_next(ls);
511        if (check_next1(ls, '.')) {
512          if (check_next1(ls, '.'))
513            return TK_DOTS;   /* '...' */
514          else return TK_CONCAT;   /* '..' */
515        }
516        else if (!lisdigit(ls->current)) return '.';
517        else return read_numeral(ls, seminfo);
518      }
519      case '0': case '1': case '2': case '3': case '4':
520      case '5': case '6': case '7': case '8': case '9': {
521        return read_numeral(ls, seminfo);
522      }
523      case EOZ: {
524        return TK_EOS;
525      }
526      default: {
527        if (lislalpha(ls->current)) {  /* identifier or reserved word? */
528          TString *ts;
529          do {
530            save_and_next(ls);
531          } while (lislalnum(ls->current));
532          ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
533                                  luaZ_bufflen(ls->buff));
534          seminfo->ts = ts;
535          if (isreserved(ts))  /* reserved word? */
536            return ts->extra - 1 + FIRST_RESERVED;
537          else {
538            return TK_NAME;
539          }
540        }
541        else {  /* single-char tokens (+ - / ...) */
542          int c = ls->current;
543          next(ls);
544          return c;
545        }
546      }
547    }
548  }
549}
550
551
552void luaX_next (LexState *ls) {
553  ls->lastline = ls->linenumber;
554  if (ls->lookahead.token != TK_EOS) {  /* is there a look-ahead token? */
555    ls->t = ls->lookahead;  /* use this one */
556    ls->lookahead.token = TK_EOS;  /* and discharge it */
557  }
558  else
559    ls->t.token = llex(ls, &ls->t.seminfo);  /* read next token */
560}
561
562
563int luaX_lookahead (LexState *ls) {
564  lua_assert(ls->lookahead.token == TK_EOS);
565  ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
566  return ls->lookahead.token;
567}
568
569