lex.c revision 170332
1/****************************************************************
2Copyright (C) Lucent Technologies 1997
3All Rights Reserved
4
5Permission to use, copy, modify, and distribute this software and
6its documentation for any purpose and without fee is hereby
7granted, provided that the above copyright notice appear in all
8copies and that both that the copyright notice and this
9permission notice and warranty disclaimer appear in supporting
10documentation, and that the name Lucent Technologies or any of
11its entities not be used in advertising or publicity pertaining
12to distribution of the software without specific, written prior
13permission.
14
15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22THIS SOFTWARE.
23****************************************************************/
24
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <ctype.h>
29#include "awk.h"
30#include "ytab.h"
31
32extern YYSTYPE	yylval;
33extern int	infunc;
34
35int	lineno	= 1;
36int	bracecnt = 0;
37int	brackcnt  = 0;
38int	parencnt = 0;
39
/* One row of the reserved-word table searched by binsearch(). */
typedef struct Keyword {
	const char *word;	/* spelling, compared with strcmp() */
	int sub;		/* value word() stores in yylval.i */
	int type;		/* token code word() switches on and returns */
} Keyword;
45
46Keyword keywords[] ={	/* keep sorted: binary searched */
47	{ "BEGIN",	XBEGIN,		XBEGIN },
48	{ "END",	XEND,		XEND },
49	{ "NF",		VARNF,		VARNF },
50	{ "atan2",	FATAN,		BLTIN },
51	{ "break",	BREAK,		BREAK },
52	{ "close",	CLOSE,		CLOSE },
53	{ "continue",	CONTINUE,	CONTINUE },
54	{ "cos",	FCOS,		BLTIN },
55	{ "delete",	DELETE,		DELETE },
56	{ "do",		DO,		DO },
57	{ "else",	ELSE,		ELSE },
58	{ "exit",	EXIT,		EXIT },
59	{ "exp",	FEXP,		BLTIN },
60	{ "fflush",	FFLUSH,		BLTIN },
61	{ "for",	FOR,		FOR },
62	{ "func",	FUNC,		FUNC },
63	{ "function",	FUNC,		FUNC },
64	{ "getline",	GETLINE,	GETLINE },
65	{ "gsub",	GSUB,		GSUB },
66	{ "if",		IF,		IF },
67	{ "in",		IN,		IN },
68	{ "index",	INDEX,		INDEX },
69	{ "int",	FINT,		BLTIN },
70	{ "length",	FLENGTH,	BLTIN },
71	{ "log",	FLOG,		BLTIN },
72	{ "match",	MATCHFCN,	MATCHFCN },
73	{ "next",	NEXT,		NEXT },
74	{ "nextfile",	NEXTFILE,	NEXTFILE },
75	{ "print",	PRINT,		PRINT },
76	{ "printf",	PRINTF,		PRINTF },
77	{ "rand",	FRAND,		BLTIN },
78	{ "return",	RETURN,		RETURN },
79	{ "sin",	FSIN,		BLTIN },
80	{ "split",	SPLIT,		SPLIT },
81	{ "sprintf",	SPRINTF,	SPRINTF },
82	{ "sqrt",	FSQRT,		BLTIN },
83	{ "srand",	FSRAND,		BLTIN },
84	{ "sub",	SUB,		SUB },
85	{ "substr",	SUBSTR,		SUBSTR },
86	{ "system",	FSYSTEM,	BLTIN },
87	{ "tolower",	FTOLOWER,	BLTIN },
88	{ "toupper",	FTOUPPER,	BLTIN },
89	{ "while",	WHILE,		WHILE },
90};
91
/*
 * RET(x): return token code x from the enclosing function; when dbg is
 * nonzero, print the token's name first.  Wrapped in do/while(0) so that
 * "RET(x);" is a single statement and is safe in an unbraced if/else.
 * Note that x is evaluated twice when dbg is set.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
93
/* Look at the next input character without consuming it. */
int peek(void)
{
	int next;

	next = input();
	unput(next);	/* hand it straight back to the pushback buffer */
	return next;
}
100
/*
 * gettok: read the next raw token into the caller's growable buffer.
 *
 * pbuf/psz point at the buffer and its size; both are written back when the
 * function scans a name or a number (the buffer may have been reallocated by
 * adjbuf()).  The buffer must hold at least two bytes on entry.
 *
 * Returns:
 *   0     at end of input;
 *   'a'   for a name ([A-Za-z_][A-Za-z0-9_]*), text left in the buffer;
 *   '0'   for a number, text left in the buffer;
 *   else  the character itself (also left as a one-character string).
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the terminator that
			 * follows the loop; otherwise a name that exactly fills
			 * the buffer and is followed by end of input makes the
			 * final "*bp = 0" write one byte past the end.
			 */
			if (bp-buf >= sz-1)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same reservation as above: character + terminator */
			if (bp-buf >= sz-1)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the tail back BEFORE truncating: once buf[1]
			 * is zeroed, buf+1 is an empty string and everything
			 * read after the first character would be lost.
			 */
			unputstr(buf+1);	/* put rest back for later */
			retc = buf[0];	/* character is its own type */
			buf[1] = 0;	/* return one character as token */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
163
/* Forward declarations for the scanners yylex() dispatches to. */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags examined at the top of yylex(). */
int	sc = 0;		/* 1 => return a } right now */
int	reg = 0;	/* 1 => return a REGEXPR now */
169
170int yylex(void)
171{
172	int c;
173	static char *buf = 0;
174	static int bufsize = 5; /* BUG: setting this small causes core dump! */
175
176	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
177		FATAL( "out of space in yylex" );
178	if (sc) {
179		sc = 0;
180		RET('}');
181	}
182	if (reg) {
183		reg = 0;
184		return regexpr();
185	}
186	for (;;) {
187		c = gettok(&buf, &bufsize);
188		if (c == 0)
189			return 0;
190		if (isalpha(c) || c == '_')
191			return word(buf);
192		if (isdigit(c)) {
193			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
194			/* should this also have STR set? */
195			RET(NUMBER);
196		}
197
198		yylval.i = c;
199		switch (c) {
200		case '\n':	/* {EOL} */
201			RET(NL);
202		case '\r':	/* assume \n is coming */
203		case ' ':	/* {WS}+ */
204		case '\t':
205			break;
206		case '#':	/* #.* strip comments */
207			while ((c = input()) != '\n' && c != 0)
208				;
209			unput(c);
210			break;
211		case ';':
212			RET(';');
213		case '\\':
214			if (peek() == '\n') {
215				input();
216			} else if (peek() == '\r') {
217				input(); input();	/* \n */
218				lineno++;
219			} else {
220				RET(c);
221			}
222			break;
223		case '&':
224			if (peek() == '&') {
225				input(); RET(AND);
226			} else
227				RET('&');
228		case '|':
229			if (peek() == '|') {
230				input(); RET(BOR);
231			} else
232				RET('|');
233		case '!':
234			if (peek() == '=') {
235				input(); yylval.i = NE; RET(NE);
236			} else if (peek() == '~') {
237				input(); yylval.i = NOTMATCH; RET(MATCHOP);
238			} else
239				RET(NOT);
240		case '~':
241			yylval.i = MATCH;
242			RET(MATCHOP);
243		case '<':
244			if (peek() == '=') {
245				input(); yylval.i = LE; RET(LE);
246			} else {
247				yylval.i = LT; RET(LT);
248			}
249		case '=':
250			if (peek() == '=') {
251				input(); yylval.i = EQ; RET(EQ);
252			} else {
253				yylval.i = ASSIGN; RET(ASGNOP);
254			}
255		case '>':
256			if (peek() == '=') {
257				input(); yylval.i = GE; RET(GE);
258			} else if (peek() == '>') {
259				input(); yylval.i = APPEND; RET(APPEND);
260			} else {
261				yylval.i = GT; RET(GT);
262			}
263		case '+':
264			if (peek() == '+') {
265				input(); yylval.i = INCR; RET(INCR);
266			} else if (peek() == '=') {
267				input(); yylval.i = ADDEQ; RET(ASGNOP);
268			} else
269				RET('+');
270		case '-':
271			if (peek() == '-') {
272				input(); yylval.i = DECR; RET(DECR);
273			} else if (peek() == '=') {
274				input(); yylval.i = SUBEQ; RET(ASGNOP);
275			} else
276				RET('-');
277		case '*':
278			if (peek() == '=') {	/* *= */
279				input(); yylval.i = MULTEQ; RET(ASGNOP);
280			} else if (peek() == '*') {	/* ** or **= */
281				input();	/* eat 2nd * */
282				if (peek() == '=') {
283					input(); yylval.i = POWEQ; RET(ASGNOP);
284				} else {
285					RET(POWER);
286				}
287			} else
288				RET('*');
289		case '/':
290			RET('/');
291		case '%':
292			if (peek() == '=') {
293				input(); yylval.i = MODEQ; RET(ASGNOP);
294			} else
295				RET('%');
296		case '^':
297			if (peek() == '=') {
298				input(); yylval.i = POWEQ; RET(ASGNOP);
299			} else
300				RET(POWER);
301
302		case '$':
303			/* BUG: awkward, if not wrong */
304			c = gettok(&buf, &bufsize);
305			if (isalpha(c)) {
306				if (strcmp(buf, "NF") == 0) {	/* very special */
307					unputstr("(NF)");
308					RET(INDIRECT);
309				}
310				c = peek();
311				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
312					unputstr(buf);
313					RET(INDIRECT);
314				}
315				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
316				RET(IVAR);
317			} else if (c == 0) {	/*  */
318				SYNTAX( "unexpected end of input after $" );
319				RET(';');
320			} else {
321				unputstr(buf);
322				RET(INDIRECT);
323			}
324
325		case '}':
326			if (--bracecnt < 0)
327				SYNTAX( "extra }" );
328			sc = 1;
329			RET(';');
330		case ']':
331			if (--brackcnt < 0)
332				SYNTAX( "extra ]" );
333			RET(']');
334		case ')':
335			if (--parencnt < 0)
336				SYNTAX( "extra )" );
337			RET(')');
338		case '{':
339			bracecnt++;
340			RET('{');
341		case '[':
342			brackcnt++;
343			RET('[');
344		case '(':
345			parencnt++;
346			RET('(');
347
348		case '"':
349			return string();	/* BUG: should be like tran.c ? */
350
351		default:
352			RET(c);
353		}
354	}
355}
356
357int string(void)
358{
359	int c, n;
360	char *s, *bp;
361	static char *buf = 0;
362	static int bufsz = 500;
363
364	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
365		FATAL("out of space for strings");
366	for (bp = buf; (c = input()) != '"'; ) {
367		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
368			FATAL("out of space for string %.10s...", buf);
369		switch (c) {
370		case '\n':
371		case '\r':
372		case 0:
373			SYNTAX( "non-terminated string %.10s...", buf );
374			lineno++;
375			if (c == 0)	/* hopeless */
376				FATAL( "giving up" );
377			break;
378		case '\\':
379			c = input();
380			switch (c) {
381			case '"': *bp++ = '"'; break;
382			case 'n': *bp++ = '\n'; break;
383			case 't': *bp++ = '\t'; break;
384			case 'f': *bp++ = '\f'; break;
385			case 'r': *bp++ = '\r'; break;
386			case 'b': *bp++ = '\b'; break;
387			case 'v': *bp++ = '\v'; break;
388			case 'a': *bp++ = '\007'; break;
389			case '\\': *bp++ = '\\'; break;
390
391			case '0': case '1': case '2': /* octal: \d \dd \ddd */
392			case '3': case '4': case '5': case '6': case '7':
393				n = c - '0';
394				if ((c = peek()) >= '0' && c < '8') {
395					n = 8 * n + input() - '0';
396					if ((c = peek()) >= '0' && c < '8')
397						n = 8 * n + input() - '0';
398				}
399				*bp++ = n;
400				break;
401
402			case 'x':	/* hex  \x0-9a-fA-F + */
403			    {	char xbuf[100], *px;
404				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
405					if (isdigit(c)
406					 || (c >= 'a' && c <= 'f')
407					 || (c >= 'A' && c <= 'F'))
408						*px++ = c;
409					else
410						break;
411				}
412				*px = 0;
413				unput(c);
414	  			sscanf(xbuf, "%x", &n);
415				*bp++ = n;
416				break;
417			    }
418
419			default:
420				*bp++ = c;
421				break;
422			}
423			break;
424		default:
425			*bp++ = c;
426			break;
427		}
428	}
429	*bp = 0;
430	s = tostring(buf);
431	*bp++ = ' '; *bp++ = 0;
432	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
433	RET(STRING);
434}
435
436
437int binsearch(char *w, Keyword *kp, int n)
438{
439	int cond, low, mid, high;
440
441	low = 0;
442	high = n - 1;
443	while (low <= high) {
444		mid = (low + high) / 2;
445		if ((cond = strcmp(w, kp[mid].word)) < 0)
446			high = mid - 1;
447		else if (cond > 0)
448			low = mid + 1;
449		else
450			return mid;
451	}
452	return -1;
453}
454
455int word(char *w)
456{
457	Keyword *kp;
458	int c, n;
459
460	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
461/* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
462	kp = keywords + n;
463	if (n != -1) {	/* found in table */
464		yylval.i = kp->sub;
465		switch (kp->type) {	/* special handling */
466		case BLTIN:
467			if (kp->sub == FSYSTEM && safe)
468				SYNTAX( "system is unsafe" );
469			RET(kp->type);
470		case FUNC:
471			if (infunc)
472				SYNTAX( "illegal nested function" );
473			RET(kp->type);
474		case RETURN:
475			if (!infunc)
476				SYNTAX( "return not in function" );
477			RET(kp->type);
478		case VARNF:
479			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
480			RET(VARNF);
481		default:
482			RET(kp->type);
483		}
484	}
485	c = peek();	/* look for '(' */
486	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
487		yylval.i = n;
488		RET(ARG);
489	} else {
490		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
491		if (c == '(') {
492			RET(CALL);
493		} else {
494			RET(VAR);
495		}
496	}
497}
498
499void startreg(void)	/* next call to yylex will return a regular expression */
500{
501	reg = 1;
502}
503
504int regexpr(void)
505{
506	int c;
507	static char *buf = 0;
508	static int bufsz = 500;
509	char *bp;
510
511	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
512		FATAL("out of space for rex expr");
513	bp = buf;
514	for ( ; (c = input()) != '/' && c != 0; ) {
515		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
516			FATAL("out of space for reg expr %.10s...", buf);
517		if (c == '\n') {
518			SYNTAX( "newline in regular expression %.10s...", buf );
519			unput('\n');
520			break;
521		} else if (c == '\\') {
522			*bp++ = '\\';
523			*bp++ = input();
524		} else {
525			*bp++ = c;
526		}
527	}
528	*bp = 0;
529	if (c == 0)
530		SYNTAX("non-terminated regular expression %.10s...", buf);
531	yylval.s = tostring(buf);
532	unput('/');
533	RET(REGEXPR);
534}
535
536/* low-level lexical stuff, sort of inherited from lex */
537
/* Circular record of characters read: input() appends, unput() steps back. */
char	ebuf[300];
char	*ep = ebuf;		/* next slot to write in ebuf */

/* Pushback stack: unput() pushes, input() pops before reading new input. */
char	yysbuf[100];
char	*yysptr = yysbuf;	/* one past the most recently pushed character */

/* Not referenced by the functions in this file. */
FILE	*yyin = 0;
543
544int input(void)	/* get next lexical input character */
545{
546	int c;
547	extern char *lexprog;
548
549	if (yysptr > yysbuf)
550		c = (uschar)*--yysptr;
551	else if (lexprog != NULL) {	/* awk '...' */
552		if ((c = (uschar)*lexprog) != 0)
553			lexprog++;
554	} else				/* awk -f ... */
555		c = pgetc();
556	if (c == '\n')
557		lineno++;
558	else if (c == EOF)
559		c = 0;
560	if (ep >= ebuf + sizeof ebuf)
561		ep = ebuf;
562	return *ep++ = c;
563}
564
565void unput(int c)	/* put lexical character back on input */
566{
567	if (c == '\n')
568		lineno--;
569	if (yysptr >= yysbuf + sizeof(yysbuf))
570		FATAL("pushed back too much: %.20s...", yysbuf);
571	*yysptr++ = c;
572	if (--ep < ebuf)
573		ep = ebuf + sizeof(ebuf) - 1;
574}
575
/*
 * unputstr: push the whole string s back on the input so that subsequent
 * input() calls return its characters in their original order.  An empty
 * string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	size_t n;

	/* last character first; size_t count avoids the strlen()-1 wrap into int */
	for (n = strlen(s); n > 0; n--)
		unput(s[n-1]);
}
583