/* lex.c revision 146299 */
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved

Permission to use, copy, modify, and distribute this software and
its documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appear in all
copies and that both that the copyright notice and this
permission notice and warranty disclaimer appear in supporting
documentation, and that the name Lucent Technologies or any of
its entities not be used in advertising or publicity pertaining
to distribution of the software without specific, written prior
permission.

LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
24
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <ctype.h>
29#include "awk.h"
30#include "ytab.h"
31
32extern YYSTYPE	yylval;
33extern int	infunc;
34
35int	lineno	= 1;
36int	bracecnt = 0;
37int	brackcnt  = 0;
38int	parencnt = 0;
39
/* One entry of the reserved-word table that word() looks up via binsearch(). */
typedef struct Keyword {
	const char *word;	/* spelling compared against the scanned name */
	int	sub;		/* value word() stores in yylval.i on a match */
	int	type;		/* token code word() returns on a match */
} Keyword;
45
46Keyword keywords[] ={	/* keep sorted: binary searched */
47	{ "BEGIN",	XBEGIN,		XBEGIN },
48	{ "END",	XEND,		XEND },
49	{ "NF",		VARNF,		VARNF },
50	{ "atan2",	FATAN,		BLTIN },
51	{ "break",	BREAK,		BREAK },
52	{ "close",	CLOSE,		CLOSE },
53	{ "continue",	CONTINUE,	CONTINUE },
54	{ "cos",	FCOS,		BLTIN },
55	{ "delete",	DELETE,		DELETE },
56	{ "do",		DO,		DO },
57	{ "else",	ELSE,		ELSE },
58	{ "exit",	EXIT,		EXIT },
59	{ "exp",	FEXP,		BLTIN },
60	{ "fflush",	FFLUSH,		BLTIN },
61	{ "for",	FOR,		FOR },
62	{ "func",	FUNC,		FUNC },
63	{ "function",	FUNC,		FUNC },
64	{ "getline",	GETLINE,	GETLINE },
65	{ "gsub",	GSUB,		GSUB },
66	{ "if",		IF,		IF },
67	{ "in",		IN,		IN },
68	{ "index",	INDEX,		INDEX },
69	{ "int",	FINT,		BLTIN },
70	{ "length",	FLENGTH,	BLTIN },
71	{ "log",	FLOG,		BLTIN },
72	{ "match",	MATCHFCN,	MATCHFCN },
73	{ "next",	NEXT,		NEXT },
74	{ "nextfile",	NEXTFILE,	NEXTFILE },
75	{ "print",	PRINT,		PRINT },
76	{ "printf",	PRINTF,		PRINTF },
77	{ "rand",	FRAND,		BLTIN },
78	{ "return",	RETURN,		RETURN },
79	{ "sin",	FSIN,		BLTIN },
80	{ "split",	SPLIT,		SPLIT },
81	{ "sprintf",	SPRINTF,	SPRINTF },
82	{ "sqrt",	FSQRT,		BLTIN },
83	{ "srand",	FSRAND,		BLTIN },
84	{ "sub",	SUB,		SUB },
85	{ "substr",	SUBSTR,		SUBSTR },
86	{ "system",	FSYSTEM,	BLTIN },
87	{ "tolower",	FTOLOWER,	BLTIN },
88	{ "toupper",	FTOUPPER,	BLTIN },
89	{ "while",	WHILE,		WHILE },
90};
91
/*
 * RET(x): return token x from the lexer.  With DEBUG defined it first
 * prints the token name when dbg is set (dbg and tokname() are defined
 * elsewhere).  Note the debug form evaluates x twice.
 * The do { } while (0) wrapper makes "RET(x);" a single statement, so it
 * stays correct in an unbraced if/else.
 */
#define DEBUG
#ifdef	DEBUG
#define	RET(x)	do { if(dbg)printf("lex %s\n", tokname(x)); return(x); } while (0)
#else
#define	RET(x)	return(x)
#endif
98
/* peek: report the next input character while leaving it unread. */
int peek(void)
{
	int next;

	next = input();
	unput(next);	/* hand it straight back so the next input() returns it again */
	return next;
}
105
/*
 * gettok: read the next raw token from the input into the buffer *pbuf.
 *
 * pbuf/psz  address of the caller's buffer pointer and its size; both are
 *           written back on the name/number paths because adjbuf() may
 *           have replaced the buffer while it was being grown.
 *
 * Returns
 *   0    at end of input (input() returned 0);
 *   'a'  for a name: a letter or '_' followed by letters, digits or '_';
 *   '0'  for a number: the longest prefix of the scanned text that
 *        strtod() accepts, anything after it is pushed back;
 *   c    for any other single character, which is its own token type
 *        (this includes a '.' that does not start a number).
 * In every non-zero case the buffer holds the NUL-terminated token text.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the terminating NUL:
			 * the loop can also end on end-of-input, in which case
			 * the NUL is stored right after the last character.
			 */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same reservation as above: character plus NUL */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the tail back BEFORE truncating: rem+1 is
			 * &buf[1], so truncating first would leave nothing
			 * to push back and silently drop those characters.
			 */
			unputstr(rem+1); /* put rest back for later */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
168
/* Forward declarations for the scanners yylex() dispatches to. */
int	word(char *);
int	string(void);
int	regexpr(void);

/* One-shot flags examined (and cleared) at the top of yylex(). */
int	sc	= 0;	/* 1 => return a } right now */
int	reg	= 0;	/* 1 => return a REGEXPR now */
174
175int yylex(void)
176{
177	int c;
178	static char *buf = 0;
179	static int bufsize = 500;
180
181	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
182		FATAL( "out of space in yylex" );
183	if (sc) {
184		sc = 0;
185		RET('}');
186	}
187	if (reg) {
188		reg = 0;
189		return regexpr();
190	}
191/* printf("top\n"); */
192	for (;;) {
193		c = gettok(&buf, &bufsize);
194/* printf("gettok [%s]\n", buf); */
195		if (c == 0)
196			return 0;
197		if (isalpha(c) || c == '_')
198			return word(buf);
199		if (isdigit(c)) {
200			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
201			/* should this also have STR set? */
202			RET(NUMBER);
203		}
204
205		yylval.i = c;
206		switch (c) {
207		case '\n':	/* {EOL} */
208			RET(NL);
209		case '\r':	/* assume \n is coming */
210		case ' ':	/* {WS}+ */
211		case '\t':
212			break;
213		case '#':	/* #.* strip comments */
214			while ((c = input()) != '\n' && c != 0)
215				;
216			unput(c);
217			break;
218		case ';':
219			RET(';');
220		case '\\':
221			if (peek() == '\n') {
222				input();
223			} else if (peek() == '\r') {
224				input(); input();	/* \n */
225				lineno++;
226			} else {
227				RET(c);
228			}
229			break;
230		case '&':
231			if (peek() == '&') {
232				input(); RET(AND);
233			} else
234				RET('&');
235		case '|':
236			if (peek() == '|') {
237				input(); RET(BOR);
238			} else
239				RET('|');
240		case '!':
241			if (peek() == '=') {
242				input(); yylval.i = NE; RET(NE);
243			} else if (peek() == '~') {
244				input(); yylval.i = NOTMATCH; RET(MATCHOP);
245			} else
246				RET(NOT);
247		case '~':
248			yylval.i = MATCH;
249			RET(MATCHOP);
250		case '<':
251			if (peek() == '=') {
252				input(); yylval.i = LE; RET(LE);
253			} else {
254				yylval.i = LT; RET(LT);
255			}
256		case '=':
257			if (peek() == '=') {
258				input(); yylval.i = EQ; RET(EQ);
259			} else {
260				yylval.i = ASSIGN; RET(ASGNOP);
261			}
262		case '>':
263			if (peek() == '=') {
264				input(); yylval.i = GE; RET(GE);
265			} else if (peek() == '>') {
266				input(); yylval.i = APPEND; RET(APPEND);
267			} else {
268				yylval.i = GT; RET(GT);
269			}
270		case '+':
271			if (peek() == '+') {
272				input(); yylval.i = INCR; RET(INCR);
273			} else if (peek() == '=') {
274				input(); yylval.i = ADDEQ; RET(ASGNOP);
275			} else
276				RET('+');
277		case '-':
278			if (peek() == '-') {
279				input(); yylval.i = DECR; RET(DECR);
280			} else if (peek() == '=') {
281				input(); yylval.i = SUBEQ; RET(ASGNOP);
282			} else
283				RET('-');
284		case '*':
285			if (peek() == '=') {	/* *= */
286				input(); yylval.i = MULTEQ; RET(ASGNOP);
287			} else if (peek() == '*') {	/* ** or **= */
288				input();	/* eat 2nd * */
289				if (peek() == '=') {
290					input(); yylval.i = POWEQ; RET(ASGNOP);
291				} else {
292					RET(POWER);
293				}
294			} else
295				RET('*');
296		case '/':
297			RET('/');
298		case '%':
299			if (peek() == '=') {
300				input(); yylval.i = MODEQ; RET(ASGNOP);
301			} else
302				RET('%');
303		case '^':
304			if (peek() == '=') {
305				input(); yylval.i = POWEQ; RET(ASGNOP);
306			} else
307				RET(POWER);
308
309		case '$':
310			/* BUG: awkward, if not wrong */
311			c = gettok(&buf, &bufsize);
312			if (isalpha(c)) {
313				if (strcmp(buf, "NF") == 0) {	/* very special */
314					unputstr("(NF)");
315					RET(INDIRECT);
316				}
317				c = peek();
318				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
319					unputstr(buf);
320					RET(INDIRECT);
321				}
322				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
323				RET(IVAR);
324			} else if (c == 0) {	/*  */
325				SYNTAX( "unexpected end of input after $" );
326				RET(';');
327			} else {
328				unputstr(buf);
329				RET(INDIRECT);
330			}
331
332		case '}':
333			if (--bracecnt < 0)
334				SYNTAX( "extra }" );
335			sc = 1;
336			RET(';');
337		case ']':
338			if (--brackcnt < 0)
339				SYNTAX( "extra ]" );
340			RET(']');
341		case ')':
342			if (--parencnt < 0)
343				SYNTAX( "extra )" );
344			RET(')');
345		case '{':
346			bracecnt++;
347			RET('{');
348		case '[':
349			brackcnt++;
350			RET('[');
351		case '(':
352			parencnt++;
353			RET('(');
354
355		case '"':
356			return string();	/* BUG: should be like tran.c ? */
357
358		default:
359			RET(c);
360		}
361	}
362}
363
364int string(void)
365{
366	int c, n;
367	char *s, *bp;
368	static char *buf = 0;
369	static int bufsz = 500;
370
371	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
372		FATAL("out of space for strings");
373	for (bp = buf; (c = input()) != '"'; ) {
374		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
375			FATAL("out of space for string %.10s...", buf);
376		switch (c) {
377		case '\n':
378		case '\r':
379		case 0:
380			SYNTAX( "non-terminated string %.10s...", buf );
381			lineno++;
382			if (c == 0)	/* hopeless */
383				FATAL( "giving up" );
384			break;
385		case '\\':
386			c = input();
387			switch (c) {
388			case '"': *bp++ = '"'; break;
389			case 'n': *bp++ = '\n'; break;
390			case 't': *bp++ = '\t'; break;
391			case 'f': *bp++ = '\f'; break;
392			case 'r': *bp++ = '\r'; break;
393			case 'b': *bp++ = '\b'; break;
394			case 'v': *bp++ = '\v'; break;
395			case 'a': *bp++ = '\007'; break;
396			case '\\': *bp++ = '\\'; break;
397
398			case '0': case '1': case '2': /* octal: \d \dd \ddd */
399			case '3': case '4': case '5': case '6': case '7':
400				n = c - '0';
401				if ((c = peek()) >= '0' && c < '8') {
402					n = 8 * n + input() - '0';
403					if ((c = peek()) >= '0' && c < '8')
404						n = 8 * n + input() - '0';
405				}
406				*bp++ = n;
407				break;
408
409			case 'x':	/* hex  \x0-9a-fA-F + */
410			    {	char xbuf[100], *px;
411				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
412					if (isdigit(c)
413					 || (c >= 'a' && c <= 'f')
414					 || (c >= 'A' && c <= 'F'))
415						*px++ = c;
416					else
417						break;
418				}
419				*px = 0;
420				unput(c);
421	  			sscanf(xbuf, "%x", &n);
422				*bp++ = n;
423				break;
424			    }
425
426			default:
427				*bp++ = c;
428				break;
429			}
430			break;
431		default:
432			*bp++ = c;
433			break;
434		}
435	}
436	*bp = 0;
437	s = tostring(buf);
438	*bp++ = ' '; *bp++ = 0;
439	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
440	RET(STRING);
441}
442
443
444int binsearch(char *w, Keyword *kp, int n)
445{
446	int cond, low, mid, high;
447
448	low = 0;
449	high = n - 1;
450	while (low <= high) {
451		mid = (low + high) / 2;
452		if ((cond = strcmp(w, kp[mid].word)) < 0)
453			high = mid - 1;
454		else if (cond > 0)
455			low = mid + 1;
456		else
457			return mid;
458	}
459	return -1;
460}
461
462int word(char *w)
463{
464	Keyword *kp;
465	int c, n;
466
467	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
468	kp = keywords + n;
469	if (n != -1) {	/* found in table */
470		yylval.i = kp->sub;
471		switch (kp->type) {	/* special handling */
472		case FSYSTEM:
473			if (safe)
474				SYNTAX( "system is unsafe" );
475			RET(kp->type);
476		case FUNC:
477			if (infunc)
478				SYNTAX( "illegal nested function" );
479			RET(kp->type);
480		case RETURN:
481			if (!infunc)
482				SYNTAX( "return not in function" );
483			RET(kp->type);
484		case VARNF:
485			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
486			RET(VARNF);
487		default:
488			RET(kp->type);
489		}
490	}
491	c = peek();	/* look for '(' */
492	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
493		yylval.i = n;
494		RET(ARG);
495	} else {
496		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
497		if (c == '(') {
498			RET(CALL);
499		} else {
500			RET(VAR);
501		}
502	}
503}
504
505void startreg(void)	/* next call to yylex will return a regular expression */
506{
507	reg = 1;
508}
509
510int regexpr(void)
511{
512	int c;
513	static char *buf = 0;
514	static int bufsz = 500;
515	char *bp;
516
517	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
518		FATAL("out of space for rex expr");
519	bp = buf;
520	for ( ; (c = input()) != '/' && c != 0; ) {
521		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
522			FATAL("out of space for reg expr %.10s...", buf);
523		if (c == '\n') {
524			SYNTAX( "newline in regular expression %.10s...", buf );
525			unput('\n');
526			break;
527		} else if (c == '\\') {
528			*bp++ = '\\';
529			*bp++ = input();
530		} else {
531			*bp++ = c;
532		}
533	}
534	*bp = 0;
535	if (c == 0)
536		SYNTAX("non-terminated regular expression %.10s...", buf);
537	yylval.s = tostring(buf);
538	unput('/');
539	RET(REGEXPR);
540}
541
/* low-level lexical stuff, sort of inherited from lex */

char	ebuf[300];		/* ring of the most recent characters delivered by input() */
char	*ep = ebuf;		/* next slot in ebuf; input() advances it, unput() steps it back */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the last pushed-back character (stack top) */
FILE	*yyin = 0;
549
550int input(void)	/* get next lexical input character */
551{
552	int c;
553	extern char *lexprog;
554
555	if (yysptr > yysbuf)
556		c = (uschar)*--yysptr;
557	else if (lexprog != NULL) {	/* awk '...' */
558		if ((c = (uschar)*lexprog) != 0)
559			lexprog++;
560	} else				/* awk -f ... */
561		c = pgetc();
562	if (c == '\n')
563		lineno++;
564	else if (c == EOF)
565		c = 0;
566	if (ep >= ebuf + sizeof ebuf)
567		ep = ebuf;
568	return *ep++ = c;
569}
570
571void unput(int c)	/* put lexical character back on input */
572{
573	if (c == '\n')
574		lineno--;
575	if (yysptr >= yysbuf + sizeof(yysbuf))
576		FATAL("pushed back too much: %.20s...", yysbuf);
577	*yysptr++ = c;
578	if (--ep < ebuf)
579		ep = ebuf + sizeof(ebuf) - 1;
580}
581
/*
 * unputstr: push the whole string s back onto the input.  Characters are
 * pushed last-to-first so that input() later returns them in their
 * original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
589