lex.c revision 90902
1248590Smm/**************************************************************** 2248590SmmCopyright (C) Lucent Technologies 1997 3248590SmmAll Rights Reserved 4248590Smm 5248590SmmPermission to use, copy, modify, and distribute this software and 6248590Smmits documentation for any purpose and without fee is hereby 7248590Smmgranted, provided that the above copyright notice appear in all 8248590Smmcopies and that both that the copyright notice and this 9248590Smmpermission notice and warranty disclaimer appear in supporting 10248590Smmdocumentation, and that the name Lucent Technologies or any of 11248590Smmits entities not be used in advertising or publicity pertaining 12248590Smmto distribution of the software without specific, written prior 13248590Smmpermission. 14248590Smm 15248590SmmLUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 16248590SmmINCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 17248590SmmIN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 18248590SmmSPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19248590SmmWHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 20248590SmmIN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 21248590SmmARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 22248590SmmTHIS SOFTWARE. 23248590Smm****************************************************************/ 24248590Smm 25248590Smm#include <stdio.h> 26248590Smm#include <stdlib.h> 27248590Smm#include <string.h> 28248590Smm#include <ctype.h> 29248590Smm#include "awk.h" 30248590Smm#include "ytab.h" 31248590Smm 32248590Smmextern YYSTYPE yylval; 33248590Smmextern int infunc; 34248590Smm 35248590Smmint lineno = 1; 36248590Smmint bracecnt = 0; 37248590Smmint brackcnt = 0; 38248590Smmint parencnt = 0; 39248590Smm 40248590Smmtypedef struct Keyword { 41248590Smm char *word; 42248590Smm int sub; 43248590Smm int type; 44248590Smm} Keyword; 45248590Smm 46248590SmmKeyword keywords[] ={ /* keep sorted: binary searched */ 47248590Smm { "BEGIN", XBEGIN, XBEGIN }, 48248590Smm { "END", XEND, XEND }, 49248590Smm { "NF", VARNF, VARNF }, 50248590Smm { "atan2", FATAN, BLTIN }, 51248590Smm { "break", BREAK, BREAK }, 52248590Smm { "close", CLOSE, CLOSE }, 53248590Smm { "continue", CONTINUE, CONTINUE }, 54248590Smm { "cos", FCOS, BLTIN }, 55248590Smm { "delete", DELETE, DELETE }, 56248590Smm { "do", DO, DO }, 57248590Smm { "else", ELSE, ELSE }, 58248590Smm { "exit", EXIT, EXIT }, 59248590Smm { "exp", FEXP, BLTIN }, 60248590Smm { "fflush", FFLUSH, BLTIN }, 61248590Smm { "for", FOR, FOR }, 62248590Smm { "func", FUNC, FUNC }, 63248590Smm { "function", FUNC, FUNC }, 64248590Smm { "getline", GETLINE, GETLINE }, 65248590Smm { "gsub", GSUB, GSUB }, 66248590Smm { "if", IF, IF }, 67248590Smm { "in", IN, IN }, 68248590Smm { "index", INDEX, INDEX }, 69248590Smm { "int", FINT, BLTIN }, 70248590Smm { "length", FLENGTH, BLTIN }, 71248590Smm { "log", FLOG, BLTIN }, 72248590Smm { "match", MATCHFCN, MATCHFCN }, 73248590Smm { "next", NEXT, NEXT }, 74248590Smm { "nextfile", NEXTFILE, NEXTFILE }, 75362133Smm { "print", PRINT, PRINT }, 76362133Smm { "printf", PRINTF, PRINTF }, 77362133Smm { "rand", FRAND, BLTIN }, 78248590Smm { "return", RETURN, RETURN }, 79248590Smm { "sin", FSIN, BLTIN }, 80248590Smm { "split", SPLIT, SPLIT }, 81248590Smm { "sprintf", SPRINTF, SPRINTF }, 82248590Smm { "sqrt", FSQRT, BLTIN }, 83248590Smm { "srand", FSRAND, BLTIN }, 84248590Smm { "sub", SUB, SUB }, 85248590Smm { "substr", SUBSTR, SUBSTR }, 86248590Smm { "system", FSYSTEM, BLTIN }, 87248590Smm { "tolower", FTOLOWER, BLTIN }, 88248590Smm { "toupper", FTOUPPER, BLTIN }, 89248590Smm { "while", WHILE, WHILE }, 90248590Smm}; 91248590Smm 92248590Smm#define DEBUG 93248590Smm#ifdef DEBUG 94248590Smm#define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 95248590Smm#else 96248590Smm#define RET(x) return(x) 97248590Smm#endif 98248590Smm 99248590Smmint peek(void) 100248590Smm{ 101248590Smm int c = input(); 102248590Smm unput(c); 103248590Smm return c; 104248590Smm} 105248590Smm 106248590Smmint gettok(char **pbuf, int *psz) /* get next input token */ 107248590Smm{ 108248590Smm int c, retc; 109248590Smm char *buf = *pbuf; 110248590Smm int sz = *psz; 111248590Smm char *bp = buf; 112248590Smm 113248590Smm c = input(); 114248590Smm if (c == 0) 115248590Smm return 0; 116248590Smm buf[0] = c; 117248590Smm buf[1] = 0; 118248590Smm if (!isalnum(c) && c != '.' && c != '_') 119248590Smm return c; 120248590Smm 121248590Smm *bp++ = c; 122248590Smm if (isalpha(c) || c == '_') { /* it's a varname */ 123248590Smm for ( ; (c = input()) != 0; ) { 124248590Smm if (bp-buf >= sz) 125248590Smm if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 126248590Smm FATAL( "out of space for name %.10s...", buf ); 127248590Smm if (isalnum(c) || c == '_') 128248590Smm *bp++ = c; 129248590Smm else { 130248590Smm *bp = 0; 131248590Smm unput(c); 132248590Smm break; 133248590Smm } 134248590Smm } 135248590Smm *bp = 0; 136248590Smm retc = 'a'; /* alphanumeric */ 137248590Smm } else { /* it's a number */ 138248590Smm char *rem; 139248590Smm /* read input until can't be a number */ 140248590Smm for ( ; (c = input()) != 0; ) { 141248590Smm if (bp-buf >= sz) 142248590Smm if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0)) 143248590Smm FATAL( "out of space for number %.10s...", buf ); 144248590Smm if (isdigit(c) || c == 'e' || c == 'E' 145248590Smm || c == '.' || c == '+' || c == '-') 146248590Smm *bp++ = c; 147248590Smm else { 148248590Smm unput(c); 149248590Smm break; 150248590Smm } 151248590Smm } 152248590Smm *bp = 0; 153248590Smm strtod(buf, &rem); /* parse the number */ 154248590Smm unputstr(rem); /* put rest back for later */ 155248590Smm if (rem == buf) { /* it wasn't a valid number at all */ 156248590Smm buf[1] = 0; /* so return one character as token */ 157248590Smm retc = buf[0]; /* character is its own type */ 158248590Smm } else { /* some prefix was a number */ 159248590Smm rem[0] = 0; /* so truncate where failure started */ 160248590Smm retc = '0'; /* number */ 161248590Smm } 162248590Smm } 163248590Smm *pbuf = buf; 164248590Smm *psz = sz; 165248590Smm return retc; 166248590Smm} 167248590Smm 168248590Smmint word(char *); 169248590Smmint string(void); 170248590Smmint regexpr(void); 171248590Smmint sc = 0; /* 1 => return a } right now */ 172248590Smmint reg = 0; /* 1 => return a REGEXPR now */ 173248590Smm 174248590Smmint yylex(void) 175248590Smm{ 176248590Smm int c; 177248590Smm static char *buf = 0; 178248590Smm static int bufsize = 500; 179248590Smm 180248590Smm if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL) 181362133Smm FATAL( "out of space in yylex" ); 182362133Smm if (sc) { 183248590Smm sc = 0; 184248590Smm RET('}'); 185248590Smm } 186248590Smm if (reg) { 187248590Smm reg = 0; 188248590Smm return regexpr(); 189248590Smm } 190248590Smm for (;;) { 191248590Smm c = gettok(&buf, &bufsize); 192248590Smm if (c == 0) 193248590Smm return 0; 194248590Smm if (isalpha(c) || c == '_') 195248590Smm return word(buf); 196248590Smm if (isdigit(c)) { 197362133Smm yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab); 198248590Smm /* should this also have STR set? */ 199248590Smm RET(NUMBER); 200248590Smm } 201248590Smm 202248590Smm yylval.i = c; 203248590Smm switch (c) { 204248590Smm case '\n': /* {EOL} */ 205248590Smm RET(NL); 206248590Smm case '\r': /* assume \n is coming */ 207248590Smm case ' ': /* {WS}+ */ 208248590Smm case '\t': 209248590Smm break; 210248590Smm case '#': /* #.* strip comments */ 211248590Smm while ((c = input()) != '\n' && c != 0) 212248590Smm ; 213248590Smm unput(c); 214248590Smm break; 215248590Smm case ';': 216248590Smm RET(';'); 217248590Smm case '\\': 218248590Smm if (peek() == '\n') { 219248590Smm input(); 220248590Smm } else if (peek() == '\r') { 221248590Smm input(); input(); /* \n */ 222248590Smm lineno++; 223248590Smm } else { 224248590Smm RET(c); 225248590Smm } 226248590Smm break; 227248590Smm case '&': 228248590Smm if (peek() == '&') { 229248590Smm input(); RET(AND); 230248590Smm } else 231248590Smm RET('&'); 232248590Smm case '|': 233248590Smm if (peek() == '|') { 234248590Smm input(); RET(BOR); 235248590Smm } else 236248590Smm RET('|'); 237248590Smm case '!': 238248590Smm if (peek() == '=') { 239248590Smm input(); yylval.i = NE; RET(NE); 240248590Smm } else if (peek() == '~') { 241 input(); yylval.i = NOTMATCH; RET(MATCHOP); 242 } else 243 RET(NOT); 244 case '~': 245 yylval.i = MATCH; 246 RET(MATCHOP); 247 case '<': 248 if (peek() == '=') { 249 input(); yylval.i = LE; RET(LE); 250 } else { 251 yylval.i = LT; RET(LT); 252 } 253 case '=': 254 if (peek() == '=') { 255 input(); yylval.i = EQ; RET(EQ); 256 } else { 257 yylval.i = ASSIGN; RET(ASGNOP); 258 } 259 case '>': 260 if (peek() == '=') { 261 input(); yylval.i = GE; RET(GE); 262 } else if (peek() == '>') { 263 input(); yylval.i = APPEND; RET(APPEND); 264 } else { 265 yylval.i = GT; RET(GT); 266 } 267 case '+': 268 if (peek() == '+') { 269 input(); yylval.i = INCR; RET(INCR); 270 } else if (peek() == '=') { 271 input(); yylval.i = ADDEQ; RET(ASGNOP); 272 } else 273 RET('+'); 274 case '-': 275 if (peek() == '-') { 276 input(); yylval.i = DECR; RET(DECR); 277 } else if (peek() == '=') { 278 input(); yylval.i = SUBEQ; RET(ASGNOP); 279 } else 280 RET('-'); 281 case '*': 282 if (peek() == '=') { /* *= */ 283 input(); yylval.i = MULTEQ; RET(ASGNOP); 284 } else if (peek() == '*') { /* ** or **= */ 285 input(); /* eat 2nd * */ 286 if (peek() == '=') { 287 input(); yylval.i = POWEQ; RET(ASGNOP); 288 } else { 289 RET(POWER); 290 } 291 } else 292 RET('*'); 293 case '/': 294 RET('/'); 295 case '%': 296 if (peek() == '=') { 297 input(); yylval.i = MODEQ; RET(ASGNOP); 298 } else 299 RET('%'); 300 case '^': 301 if (peek() == '=') { 302 input(); yylval.i = POWEQ; RET(ASGNOP); 303 } else 304 RET(POWER); 305 306 case '$': 307 /* BUG: awkward, if not wrong */ 308 c = gettok(&buf, &bufsize); 309 if (isalpha(c)) { 310 if (strcmp(buf, "NF") == 0) { /* very special */ 311 unputstr("(NF)"); 312 RET(INDIRECT); 313 } 314 c = peek(); 315 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 316 unputstr(buf); 317 RET(INDIRECT); 318 } 319 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 320 RET(IVAR); 321 } else if (c == 0) { /* */ 322 SYNTAX( "unexpected end of input after $" ); 323 RET(';'); 324 } else { 325 unputstr(buf); 326 RET(INDIRECT); 327 } 328 329 case '}': 330 if (--bracecnt < 0) 331 SYNTAX( "extra }" ); 332 sc = 1; 333 RET(';'); 334 case ']': 335 if (--brackcnt < 0) 336 SYNTAX( "extra ]" ); 337 RET(']'); 338 case ')': 339 if (--parencnt < 0) 340 SYNTAX( "extra )" ); 341 RET(')'); 342 case '{': 343 bracecnt++; 344 RET('{'); 345 case '[': 346 brackcnt++; 347 RET('['); 348 case '(': 349 parencnt++; 350 RET('('); 351 352 case '"': 353 return string(); /* BUG: should be like tran.c ? */ 354 355 default: 356 RET(c); 357 } 358 } 359} 360 361int string(void) 362{ 363 int c, n; 364 char *s, *bp; 365 static char *buf = 0; 366 static int bufsz = 500; 367 368 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 369 FATAL("out of space for strings"); 370 for (bp = buf; (c = input()) != '"'; ) { 371 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0)) 372 FATAL("out of space for string %.10s...", buf); 373 switch (c) { 374 case '\n': 375 case '\r': 376 case 0: 377 SYNTAX( "non-terminated string %.10s...", buf ); 378 lineno++; 379 if (c == 0) /* hopeless */ 380 FATAL( "giving up" ); 381 break; 382 case '\\': 383 c = input(); 384 switch (c) { 385 case '"': *bp++ = '"'; break; 386 case 'n': *bp++ = '\n'; break; 387 case 't': *bp++ = '\t'; break; 388 case 'f': *bp++ = '\f'; break; 389 case 'r': *bp++ = '\r'; break; 390 case 'b': *bp++ = '\b'; break; 391 case 'v': *bp++ = '\v'; break; 392 case 'a': *bp++ = '\007'; break; 393 case '\\': *bp++ = '\\'; break; 394 395 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 396 case '3': case '4': case '5': case '6': case '7': 397 n = c - '0'; 398 if ((c = peek()) >= '0' && c < '8') { 399 n = 8 * n + input() - '0'; 400 if ((c = peek()) >= '0' && c < '8') 401 n = 8 * n + input() - '0'; 402 } 403 *bp++ = n; 404 break; 405 406 case 'x': /* hex \x0-9a-fA-F + */ 407 { char xbuf[100], *px; 408 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 409 if (isdigit(c) 410 || (c >= 'a' && c <= 'f') 411 || (c >= 'A' && c <= 'F')) 412 *px++ = c; 413 else 414 break; 415 } 416 *px = 0; 417 unput(c); 418 sscanf(xbuf, "%x", &n); 419 *bp++ = n; 420 break; 421 } 422 423 default: 424 *bp++ = c; 425 break; 426 } 427 break; 428 default: 429 *bp++ = c; 430 break; 431 } 432 } 433 *bp = 0; 434 s = tostring(buf); 435 *bp++ = ' '; *bp++ = 0; 436 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 437 RET(STRING); 438} 439 440 441int binsearch(char *w, Keyword *kp, int n) 442{ 443 int cond, low, mid, high; 444 445 low = 0; 446 high = n - 1; 447 while (low <= high) { 448 mid = (low + high) / 2; 449 if ((cond = strcmp(w, kp[mid].word)) < 0) 450 high = mid - 1; 451 else if (cond > 0) 452 low = mid + 1; 453 else 454 return mid; 455 } 456 return -1; 457} 458 459int word(char *w) 460{ 461 Keyword *kp; 462 int c, n; 463 464 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 465 kp = keywords + n; 466 if (n != -1) { /* found in table */ 467 yylval.i = kp->sub; 468 switch (kp->type) { /* special handling */ 469 case FSYSTEM: 470 if (safe) 471 SYNTAX( "system is unsafe" ); 472 RET(kp->type); 473 case FUNC: 474 if (infunc) 475 SYNTAX( "illegal nested function" ); 476 RET(kp->type); 477 case RETURN: 478 if (!infunc) 479 SYNTAX( "return not in function" ); 480 RET(kp->type); 481 case VARNF: 482 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 483 RET(VARNF); 484 default: 485 RET(kp->type); 486 } 487 } 488 c = peek(); /* look for '(' */ 489 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 490 yylval.i = n; 491 RET(ARG); 492 } else { 493 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 494 if (c == '(') { 495 RET(CALL); 496 } else { 497 RET(VAR); 498 } 499 } 500} 501 502void startreg(void) /* next call to yyles will return a regular expression */ 503{ 504 reg = 1; 505} 506 507int regexpr(void) 508{ 509 int c; 510 static char *buf = 0; 511 static int bufsz = 500; 512 char *bp; 513 514 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 515 FATAL("out of space for rex expr"); 516 bp = buf; 517 for ( ; (c = input()) != '/' && c != 0; ) { 518 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0)) 519 FATAL("out of space for reg expr %.10s...", buf); 520 if (c == '\n') { 521 SYNTAX( "newline in regular expression %.10s...", buf ); 522 unput('\n'); 523 break; 524 } else if (c == '\\') { 525 *bp++ = '\\'; 526 *bp++ = input(); 527 } else { 528 *bp++ = c; 529 } 530 } 531 *bp = 0; 532 yylval.s = tostring(buf); 533 unput('/'); 534 RET(REGEXPR); 535} 536 537/* low-level lexical stuff, sort of inherited from lex */ 538 539char ebuf[300]; 540char *ep = ebuf; 541char yysbuf[100]; /* pushback buffer */ 542char *yysptr = yysbuf; 543FILE *yyin = 0; 544 545int input(void) /* get next lexical input character */ 546{ 547 int c; 548 extern char *lexprog; 549 550 if (yysptr > yysbuf) 551 c = *--yysptr; 552 else if (lexprog != NULL) { /* awk '...' */ 553 if ((c = *lexprog) != 0) 554 lexprog++; 555 } else /* awk -f ... */ 556 c = pgetc(); 557 if (c == '\n') 558 lineno++; 559 else if (c == EOF) 560 c = 0; 561 if (ep >= ebuf + sizeof ebuf) 562 ep = ebuf; 563 return *ep++ = c; 564} 565 566void unput(int c) /* put lexical character back on input */ 567{ 568 if (c == '\n') 569 lineno--; 570 if (yysptr >= yysbuf + sizeof(yysbuf)) 571 FATAL("pushed back too much: %.20s...", yysbuf); 572 *yysptr++ = c; 573 if (--ep < ebuf) 574 ep = ebuf + sizeof(ebuf) - 1; 575} 576 577void unputstr(char *s) /* put a string back on input */ 578{ 579 int i; 580 581 for (i = strlen(s)-1; i >= 0; i--) 582 unput(s[i]); 583} 584