/* lex.c revision 107806 */
1/**************************************************************** 2Copyright (C) Lucent Technologies 1997 3All Rights Reserved 4 5Permission to use, copy, modify, and distribute this software and 6its documentation for any purpose and without fee is hereby 7granted, provided that the above copyright notice appear in all 8copies and that both that the copyright notice and this 9permission notice and warranty disclaimer appear in supporting 10documentation, and that the name Lucent Technologies or any of 11its entities not be used in advertising or publicity pertaining 12to distribution of the software without specific, written prior 13permission. 14 15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22THIS SOFTWARE. 
23****************************************************************/ 24 25#include <stdio.h> 26#include <stdlib.h> 27#include <string.h> 28#include <ctype.h> 29#include "awk.h" 30#include "ytab.h" 31 32extern YYSTYPE yylval; 33extern int infunc; 34 35int lineno = 1; 36int bracecnt = 0; 37int brackcnt = 0; 38int parencnt = 0; 39 40typedef struct Keyword { 41 const char *word; 42 int sub; 43 int type; 44} Keyword; 45 46Keyword keywords[] ={ /* keep sorted: binary searched */ 47 { "BEGIN", XBEGIN, XBEGIN }, 48 { "END", XEND, XEND }, 49 { "NF", VARNF, VARNF }, 50 { "atan2", FATAN, BLTIN }, 51 { "break", BREAK, BREAK }, 52 { "close", CLOSE, CLOSE }, 53 { "continue", CONTINUE, CONTINUE }, 54 { "cos", FCOS, BLTIN }, 55 { "delete", DELETE, DELETE }, 56 { "do", DO, DO }, 57 { "else", ELSE, ELSE }, 58 { "exit", EXIT, EXIT }, 59 { "exp", FEXP, BLTIN }, 60 { "fflush", FFLUSH, BLTIN }, 61 { "for", FOR, FOR }, 62 { "func", FUNC, FUNC }, 63 { "function", FUNC, FUNC }, 64 { "getline", GETLINE, GETLINE }, 65 { "gsub", GSUB, GSUB }, 66 { "if", IF, IF }, 67 { "in", IN, IN }, 68 { "index", INDEX, INDEX }, 69 { "int", FINT, BLTIN }, 70 { "length", FLENGTH, BLTIN }, 71 { "log", FLOG, BLTIN }, 72 { "match", MATCHFCN, MATCHFCN }, 73 { "next", NEXT, NEXT }, 74 { "nextfile", NEXTFILE, NEXTFILE }, 75 { "print", PRINT, PRINT }, 76 { "printf", PRINTF, PRINTF }, 77 { "rand", FRAND, BLTIN }, 78 { "return", RETURN, RETURN }, 79 { "sin", FSIN, BLTIN }, 80 { "split", SPLIT, SPLIT }, 81 { "sprintf", SPRINTF, SPRINTF }, 82 { "sqrt", FSQRT, BLTIN }, 83 { "srand", FSRAND, BLTIN }, 84 { "sub", SUB, SUB }, 85 { "substr", SUBSTR, SUBSTR }, 86 { "system", FSYSTEM, BLTIN }, 87 { "tolower", FTOLOWER, BLTIN }, 88 { "toupper", FTOUPPER, BLTIN }, 89 { "while", WHILE, WHILE }, 90}; 91 92#define DEBUG 93#ifdef DEBUG 94#define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 95#else 96#define RET(x) return(x) 97#endif 98 99int peek(void) 100{ 101 int c = input(); 102 unput(c); 103 return c; 104} 105 
106int gettok(char **pbuf, int *psz) /* get next input token */ 107{ 108 int c, retc; 109 char *buf = *pbuf; 110 int sz = *psz; 111 char *bp = buf; 112 113 c = input(); 114 if (c == 0) 115 return 0; 116 buf[0] = c; 117 buf[1] = 0; 118 if (!isalnum(c) && c != '.' && c != '_') 119 return c; 120 121 *bp++ = c; 122 if (isalpha(c) || c == '_') { /* it's a varname */ 123 for ( ; (c = input()) != 0; ) { 124 if (bp-buf >= sz) 125 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 126 FATAL( "out of space for name %.10s...", buf ); 127 if (isalnum(c) || c == '_') 128 *bp++ = c; 129 else { 130 *bp = 0; 131 unput(c); 132 break; 133 } 134 } 135 *bp = 0; 136 retc = 'a'; /* alphanumeric */ 137 } else { /* it's a number */ 138 char *rem; 139 /* read input until can't be a number */ 140 for ( ; (c = input()) != 0; ) { 141 if (bp-buf >= sz) 142 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 143 FATAL( "out of space for number %.10s...", buf ); 144 if (isdigit(c) || c == 'e' || c == 'E' 145 || c == '.' || c == '+' || c == '-') 146 *bp++ = c; 147 else { 148 unput(c); 149 break; 150 } 151 } 152 *bp = 0; 153 strtod(buf, &rem); /* parse the number */ 154 unputstr(rem); /* put rest back for later */ 155 if (rem == buf) { /* it wasn't a valid number at all */ 156 buf[1] = 0; /* so return one character as token */ 157 retc = buf[0]; /* character is its own type */ 158 } else { /* some prefix was a number */ 159 rem[0] = 0; /* so truncate where failure started */ 160 retc = '0'; /* number */ 161 } 162 } 163 *pbuf = buf; 164 *psz = sz; 165 return retc; 166} 167 168int word(char *); 169int string(void); 170int regexpr(void); 171int sc = 0; /* 1 => return a } right now */ 172int reg = 0; /* 1 => return a REGEXPR now */ 173 174int yylex(void) 175{ 176 int c; 177 static char *buf = 0; 178 static int bufsize = 500; 179 180 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL) 181 FATAL( "out of space in yylex" ); 182 if (sc) { 183 sc = 0; 184 RET('}'); 185 } 186 if (reg) { 187 reg = 0; 188 return 
regexpr(); 189 } 190 for (;;) { 191 c = gettok(&buf, &bufsize); 192 if (c == 0) 193 return 0; 194 if (isalpha(c) || c == '_') 195 return word(buf); 196 if (isdigit(c)) { 197 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab); 198 /* should this also have STR set? */ 199 RET(NUMBER); 200 } 201 202 yylval.i = c; 203 switch (c) { 204 case '\n': /* {EOL} */ 205 RET(NL); 206 case '\r': /* assume \n is coming */ 207 case ' ': /* {WS}+ */ 208 case '\t': 209 break; 210 case '#': /* #.* strip comments */ 211 while ((c = input()) != '\n' && c != 0) 212 ; 213 unput(c); 214 break; 215 case ';': 216 RET(';'); 217 case '\\': 218 if (peek() == '\n') { 219 input(); 220 } else if (peek() == '\r') { 221 input(); input(); /* \n */ 222 lineno++; 223 } else { 224 RET(c); 225 } 226 break; 227 case '&': 228 if (peek() == '&') { 229 input(); RET(AND); 230 } else 231 RET('&'); 232 case '|': 233 if (peek() == '|') { 234 input(); RET(BOR); 235 } else 236 RET('|'); 237 case '!': 238 if (peek() == '=') { 239 input(); yylval.i = NE; RET(NE); 240 } else if (peek() == '~') { 241 input(); yylval.i = NOTMATCH; RET(MATCHOP); 242 } else 243 RET(NOT); 244 case '~': 245 yylval.i = MATCH; 246 RET(MATCHOP); 247 case '<': 248 if (peek() == '=') { 249 input(); yylval.i = LE; RET(LE); 250 } else { 251 yylval.i = LT; RET(LT); 252 } 253 case '=': 254 if (peek() == '=') { 255 input(); yylval.i = EQ; RET(EQ); 256 } else { 257 yylval.i = ASSIGN; RET(ASGNOP); 258 } 259 case '>': 260 if (peek() == '=') { 261 input(); yylval.i = GE; RET(GE); 262 } else if (peek() == '>') { 263 input(); yylval.i = APPEND; RET(APPEND); 264 } else { 265 yylval.i = GT; RET(GT); 266 } 267 case '+': 268 if (peek() == '+') { 269 input(); yylval.i = INCR; RET(INCR); 270 } else if (peek() == '=') { 271 input(); yylval.i = ADDEQ; RET(ASGNOP); 272 } else 273 RET('+'); 274 case '-': 275 if (peek() == '-') { 276 input(); yylval.i = DECR; RET(DECR); 277 } else if (peek() == '=') { 278 input(); yylval.i = SUBEQ; RET(ASGNOP); 
279 } else 280 RET('-'); 281 case '*': 282 if (peek() == '=') { /* *= */ 283 input(); yylval.i = MULTEQ; RET(ASGNOP); 284 } else if (peek() == '*') { /* ** or **= */ 285 input(); /* eat 2nd * */ 286 if (peek() == '=') { 287 input(); yylval.i = POWEQ; RET(ASGNOP); 288 } else { 289 RET(POWER); 290 } 291 } else 292 RET('*'); 293 case '/': 294 RET('/'); 295 case '%': 296 if (peek() == '=') { 297 input(); yylval.i = MODEQ; RET(ASGNOP); 298 } else 299 RET('%'); 300 case '^': 301 if (peek() == '=') { 302 input(); yylval.i = POWEQ; RET(ASGNOP); 303 } else 304 RET(POWER); 305 306 case '$': 307 /* BUG: awkward, if not wrong */ 308 c = gettok(&buf, &bufsize); 309 if (isalpha(c)) { 310 if (strcmp(buf, "NF") == 0) { /* very special */ 311 unputstr("(NF)"); 312 RET(INDIRECT); 313 } 314 c = peek(); 315 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 316 unputstr(buf); 317 RET(INDIRECT); 318 } 319 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 320 RET(IVAR); 321 } else if (c == 0) { /* */ 322 SYNTAX( "unexpected end of input after $" ); 323 RET(';'); 324 } else { 325 unputstr(buf); 326 RET(INDIRECT); 327 } 328 329 case '}': 330 if (--bracecnt < 0) 331 SYNTAX( "extra }" ); 332 sc = 1; 333 RET(';'); 334 case ']': 335 if (--brackcnt < 0) 336 SYNTAX( "extra ]" ); 337 RET(']'); 338 case ')': 339 if (--parencnt < 0) 340 SYNTAX( "extra )" ); 341 RET(')'); 342 case '{': 343 bracecnt++; 344 RET('{'); 345 case '[': 346 brackcnt++; 347 RET('['); 348 case '(': 349 parencnt++; 350 RET('('); 351 352 case '"': 353 return string(); /* BUG: should be like tran.c ? 
*/ 354 355 default: 356 RET(c); 357 } 358 } 359} 360 361int string(void) 362{ 363 int c, n; 364 char *s, *bp; 365 static char *buf = 0; 366 static int bufsz = 500; 367 368 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 369 FATAL("out of space for strings"); 370 for (bp = buf; (c = input()) != '"'; ) { 371 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0)) 372 FATAL("out of space for string %.10s...", buf); 373 switch (c) { 374 case '\n': 375 case '\r': 376 case 0: 377 SYNTAX( "non-terminated string %.10s...", buf ); 378 lineno++; 379 if (c == 0) /* hopeless */ 380 FATAL( "giving up" ); 381 break; 382 case '\\': 383 c = input(); 384 switch (c) { 385 case '"': *bp++ = '"'; break; 386 case 'n': *bp++ = '\n'; break; 387 case 't': *bp++ = '\t'; break; 388 case 'f': *bp++ = '\f'; break; 389 case 'r': *bp++ = '\r'; break; 390 case 'b': *bp++ = '\b'; break; 391 case 'v': *bp++ = '\v'; break; 392 case 'a': *bp++ = '\007'; break; 393 case '\\': *bp++ = '\\'; break; 394 395 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 396 case '3': case '4': case '5': case '6': case '7': 397 n = c - '0'; 398 if ((c = peek()) >= '0' && c < '8') { 399 n = 8 * n + input() - '0'; 400 if ((c = peek()) >= '0' && c < '8') 401 n = 8 * n + input() - '0'; 402 } 403 *bp++ = n; 404 break; 405 406 case 'x': /* hex \x0-9a-fA-F + */ 407 { char xbuf[100], *px; 408 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 409 if (isdigit(c) 410 || (c >= 'a' && c <= 'f') 411 || (c >= 'A' && c <= 'F')) 412 *px++ = c; 413 else 414 break; 415 } 416 *px = 0; 417 unput(c); 418 sscanf(xbuf, "%x", &n); 419 *bp++ = n; 420 break; 421 } 422 423 default: 424 *bp++ = c; 425 break; 426 } 427 break; 428 default: 429 *bp++ = c; 430 break; 431 } 432 } 433 *bp = 0; 434 s = tostring(buf); 435 *bp++ = ' '; *bp++ = 0; 436 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 437 RET(STRING); 438} 439 440 441int binsearch(char *w, Keyword *kp, int n) 442{ 443 int cond, low, mid, high; 444 445 low = 0; 
446 high = n - 1; 447 while (low <= high) { 448 mid = (low + high) / 2; 449 if ((cond = strcmp(w, kp[mid].word)) < 0) 450 high = mid - 1; 451 else if (cond > 0) 452 low = mid + 1; 453 else 454 return mid; 455 } 456 return -1; 457} 458 459int word(char *w) 460{ 461 Keyword *kp; 462 int c, n; 463 464 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 465 kp = keywords + n; 466 if (n != -1) { /* found in table */ 467 yylval.i = kp->sub; 468 switch (kp->type) { /* special handling */ 469 case FSYSTEM: 470 if (safe) 471 SYNTAX( "system is unsafe" ); 472 RET(kp->type); 473 case FUNC: 474 if (infunc) 475 SYNTAX( "illegal nested function" ); 476 RET(kp->type); 477 case RETURN: 478 if (!infunc) 479 SYNTAX( "return not in function" ); 480 RET(kp->type); 481 case VARNF: 482 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 483 RET(VARNF); 484 default: 485 RET(kp->type); 486 } 487 } 488 c = peek(); /* look for '(' */ 489 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 490 yylval.i = n; 491 RET(ARG); 492 } else { 493 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 494 if (c == '(') { 495 RET(CALL); 496 } else { 497 RET(VAR); 498 } 499 } 500} 501 502void startreg(void) /* next call to yylex will return a regular expression */ 503{ 504 reg = 1; 505} 506 507int regexpr(void) 508{ 509 int c; 510 static char *buf = 0; 511 static int bufsz = 500; 512 char *bp; 513 514 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 515 FATAL("out of space for rex expr"); 516 bp = buf; 517 for ( ; (c = input()) != '/' && c != 0; ) { 518 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0)) 519 FATAL("out of space for reg expr %.10s...", buf); 520 if (c == '\n') { 521 SYNTAX( "newline in regular expression %.10s...", buf ); 522 unput('\n'); 523 break; 524 } else if (c == '\\') { 525 *bp++ = '\\'; 526 *bp++ = input(); 527 } else { 528 *bp++ = c; 529 } 530 } 531 *bp = 0; 532 yylval.s = tostring(buf); 533 unput('/'); 534 RET(REGEXPR); 535} 536 537/* low-level lexical stuff, 
sort of inherited from lex */ 538 539char ebuf[300]; 540char *ep = ebuf; 541char yysbuf[100]; /* pushback buffer */ 542char *yysptr = yysbuf; 543FILE *yyin = 0; 544 545int input(void) /* get next lexical input character */ 546{ 547 int c; 548 extern char *lexprog; 549 550 if (yysptr > yysbuf) 551 c = *--yysptr; 552 else if (lexprog != NULL) { /* awk '...' */ 553 if ((c = *lexprog) != 0) 554 lexprog++; 555 } else /* awk -f ... */ 556 c = pgetc(); 557 if (c == '\n') 558 lineno++; 559 else if (c == EOF) 560 c = 0; 561 if (ep >= ebuf + sizeof ebuf) 562 ep = ebuf; 563 return *ep++ = c; 564} 565 566void unput(int c) /* put lexical character back on input */ 567{ 568 if (c == '\n') 569 lineno--; 570 if (yysptr >= yysbuf + sizeof(yysbuf)) 571 FATAL("pushed back too much: %.20s...", yysbuf); 572 *yysptr++ = c; 573 if (--ep < ebuf) 574 ep = ebuf + sizeof(ebuf) - 1; 575} 576 577void unputstr(const char *s) /* put a string back on input */ 578{ 579 int i; 580 581 for (i = strlen(s)-1; i >= 0; i--) 582 unput(s[i]); 583} 584