lex.c revision 125505
1/****************************************************************
2Copyright (C) Lucent Technologies 1997
3All Rights Reserved
4
5Permission to use, copy, modify, and distribute this software and
6its documentation for any purpose and without fee is hereby
7granted, provided that the above copyright notice appear in all
8copies and that both that the copyright notice and this
9permission notice and warranty disclaimer appear in supporting
10documentation, and that the name Lucent Technologies or any of
11its entities not be used in advertising or publicity pertaining
12to distribution of the software without specific, written prior
13permission.
14
15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22THIS SOFTWARE.
23****************************************************************/
24
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <ctype.h>
29#include "awk.h"
30#include "ytab.h"
31
32extern YYSTYPE	yylval;
33extern int	infunc;
34
/* Lexer bookkeeping shared with the rest of the program. */
int	lineno = 1;	/* current line: input() adds one per '\n', unput() takes it back */
int	bracecnt = 0;	/* '{' seen so far minus '}' seen so far */
int	brackcnt = 0;	/* '[' seen so far minus ']' seen so far */
int	parencnt = 0;	/* '(' seen so far minus ')' seen so far */
39
/* One reserved word or built-in function name known to the lexer. */
struct Keyword {
	const char *word;	/* spelling as it appears in a program */
	int	sub;		/* copied into yylval.i when the word is recognised */
	int	type;		/* token code handed back to the parser */
};
typedef struct Keyword Keyword;
45
46Keyword keywords[] ={	/* keep sorted: binary searched */
47	{ "BEGIN",	XBEGIN,		XBEGIN },
48	{ "END",	XEND,		XEND },
49	{ "NF",		VARNF,		VARNF },
50	{ "atan2",	FATAN,		BLTIN },
51	{ "break",	BREAK,		BREAK },
52	{ "close",	CLOSE,		CLOSE },
53	{ "continue",	CONTINUE,	CONTINUE },
54	{ "cos",	FCOS,		BLTIN },
55	{ "delete",	DELETE,		DELETE },
56	{ "do",		DO,		DO },
57	{ "else",	ELSE,		ELSE },
58	{ "exit",	EXIT,		EXIT },
59	{ "exp",	FEXP,		BLTIN },
60	{ "fflush",	FFLUSH,		BLTIN },
61	{ "for",	FOR,		FOR },
62	{ "func",	FUNC,		FUNC },
63	{ "function",	FUNC,		FUNC },
64	{ "getline",	GETLINE,	GETLINE },
65	{ "gsub",	GSUB,		GSUB },
66	{ "if",		IF,		IF },
67	{ "in",		IN,		IN },
68	{ "index",	INDEX,		INDEX },
69	{ "int",	FINT,		BLTIN },
70	{ "length",	FLENGTH,	BLTIN },
71	{ "log",	FLOG,		BLTIN },
72	{ "match",	MATCHFCN,	MATCHFCN },
73	{ "next",	NEXT,		NEXT },
74	{ "nextfile",	NEXTFILE,	NEXTFILE },
75	{ "print",	PRINT,		PRINT },
76	{ "printf",	PRINTF,		PRINTF },
77	{ "rand",	FRAND,		BLTIN },
78	{ "return",	RETURN,		RETURN },
79	{ "sin",	FSIN,		BLTIN },
80	{ "split",	SPLIT,		SPLIT },
81	{ "sprintf",	SPRINTF,	SPRINTF },
82	{ "sqrt",	FSQRT,		BLTIN },
83	{ "srand",	FSRAND,		BLTIN },
84	{ "sub",	SUB,		SUB },
85	{ "substr",	SUBSTR,		SUBSTR },
86	{ "system",	FSYSTEM,	BLTIN },
87	{ "tolower",	FTOLOWER,	BLTIN },
88	{ "toupper",	FTOUPPER,	BLTIN },
89	{ "while",	WHILE,		WHILE },
90};
91
/*
 * RET(x): return token x from the lexer.  With DEBUG defined (it always
 * is, just below) the token's name is printed first whenever dbg is set.
 *
 * The debug form is wrapped in do { } while (0) so that "RET(x);" is a
 * single statement everywhere, including as the body of an if that has
 * an else.  Note that the debug form evaluates x twice, so x must not
 * have side effects.
 */
#define DEBUG
#ifdef	DEBUG
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
#else
#define	RET(x)	return (x)
#endif
98
/*
 * peek: report the next input character without consuming it.
 * The character is read with input() and immediately handed back
 * with unput(), so the following input() returns it again.
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
105
/*
 * gettok: read the next raw token into the caller's buffer.
 *
 * pbuf/psz describe a growable buffer (at least 2 bytes); both are
 * updated if the buffer has to be enlarged.  On return the buffer holds
 * the NUL-terminated token text.
 *
 * Returns:
 *	0	end of input
 *	'a'	a name: a letter or '_' followed by letters, digits, '_'
 *	'0'	a number: the longest prefix that strtod() accepts
 *	other	the single character read, which is its own token
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for (;;) {
			/*
			 * Keep room for one more character AND the final
			 * NUL before reading, so the terminator below is in
			 * bounds even when the loop ends on end of input.
			 */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if ((c = input()) == 0)
				break;
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* it's a number */
		char *rem;
		/* read input until can't be a number */
		for (;;) {
			/* same reservation as above: character plus NUL */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if ((c = input()) == 0)
				break;
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * The first character is returned as the token, so
			 * only what follows it goes back on the input.
			 * Pushing back the whole buffer would hand the same
			 * character to the caller again on every call.
			 */
			unputstr(buf+1);
			buf[1] = 0;	/* so return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* so truncate where failure started */
			retc = '0';	/* number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
167
/* Helpers used by yylex() and defined further down in this file. */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot requests examined at the start of every yylex() call. */
int	sc = 0;		/* nonzero: return a '}' right now */
int	reg = 0;	/* nonzero: return a REGEXPR now */
173
174int yylex(void)
175{
176	int c;
177	static char *buf = 0;
178	static int bufsize = 500;
179
180	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
181		FATAL( "out of space in yylex" );
182	if (sc) {
183		sc = 0;
184		RET('}');
185	}
186	if (reg) {
187		reg = 0;
188		return regexpr();
189	}
190	for (;;) {
191		c = gettok(&buf, &bufsize);
192		if (c == 0)
193			return 0;
194		if (isalpha(c) || c == '_')
195			return word(buf);
196		if (isdigit(c)) {
197			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
198			/* should this also have STR set? */
199			RET(NUMBER);
200		}
201
202		yylval.i = c;
203		switch (c) {
204		case '\n':	/* {EOL} */
205			RET(NL);
206		case '\r':	/* assume \n is coming */
207		case ' ':	/* {WS}+ */
208		case '\t':
209			break;
210		case '#':	/* #.* strip comments */
211			while ((c = input()) != '\n' && c != 0)
212				;
213			unput(c);
214			break;
215		case ';':
216			RET(';');
217		case '\\':
218			if (peek() == '\n') {
219				input();
220			} else if (peek() == '\r') {
221				input(); input();	/* \n */
222				lineno++;
223			} else {
224				RET(c);
225			}
226			break;
227		case '&':
228			if (peek() == '&') {
229				input(); RET(AND);
230			} else
231				RET('&');
232		case '|':
233			if (peek() == '|') {
234				input(); RET(BOR);
235			} else
236				RET('|');
237		case '!':
238			if (peek() == '=') {
239				input(); yylval.i = NE; RET(NE);
240			} else if (peek() == '~') {
241				input(); yylval.i = NOTMATCH; RET(MATCHOP);
242			} else
243				RET(NOT);
244		case '~':
245			yylval.i = MATCH;
246			RET(MATCHOP);
247		case '<':
248			if (peek() == '=') {
249				input(); yylval.i = LE; RET(LE);
250			} else {
251				yylval.i = LT; RET(LT);
252			}
253		case '=':
254			if (peek() == '=') {
255				input(); yylval.i = EQ; RET(EQ);
256			} else {
257				yylval.i = ASSIGN; RET(ASGNOP);
258			}
259		case '>':
260			if (peek() == '=') {
261				input(); yylval.i = GE; RET(GE);
262			} else if (peek() == '>') {
263				input(); yylval.i = APPEND; RET(APPEND);
264			} else {
265				yylval.i = GT; RET(GT);
266			}
267		case '+':
268			if (peek() == '+') {
269				input(); yylval.i = INCR; RET(INCR);
270			} else if (peek() == '=') {
271				input(); yylval.i = ADDEQ; RET(ASGNOP);
272			} else
273				RET('+');
274		case '-':
275			if (peek() == '-') {
276				input(); yylval.i = DECR; RET(DECR);
277			} else if (peek() == '=') {
278				input(); yylval.i = SUBEQ; RET(ASGNOP);
279			} else
280				RET('-');
281		case '*':
282			if (peek() == '=') {	/* *= */
283				input(); yylval.i = MULTEQ; RET(ASGNOP);
284			} else if (peek() == '*') {	/* ** or **= */
285				input();	/* eat 2nd * */
286				if (peek() == '=') {
287					input(); yylval.i = POWEQ; RET(ASGNOP);
288				} else {
289					RET(POWER);
290				}
291			} else
292				RET('*');
293		case '/':
294			RET('/');
295		case '%':
296			if (peek() == '=') {
297				input(); yylval.i = MODEQ; RET(ASGNOP);
298			} else
299				RET('%');
300		case '^':
301			if (peek() == '=') {
302				input(); yylval.i = POWEQ; RET(ASGNOP);
303			} else
304				RET(POWER);
305
306		case '$':
307			/* BUG: awkward, if not wrong */
308			c = gettok(&buf, &bufsize);
309			if (isalpha(c)) {
310				if (strcmp(buf, "NF") == 0) {	/* very special */
311					unputstr("(NF)");
312					RET(INDIRECT);
313				}
314				c = peek();
315				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
316					unputstr(buf);
317					RET(INDIRECT);
318				}
319				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
320				RET(IVAR);
321			} else if (c == 0) {	/*  */
322				SYNTAX( "unexpected end of input after $" );
323				RET(';');
324			} else {
325				unputstr(buf);
326				RET(INDIRECT);
327			}
328
329		case '}':
330			if (--bracecnt < 0)
331				SYNTAX( "extra }" );
332			sc = 1;
333			RET(';');
334		case ']':
335			if (--brackcnt < 0)
336				SYNTAX( "extra ]" );
337			RET(']');
338		case ')':
339			if (--parencnt < 0)
340				SYNTAX( "extra )" );
341			RET(')');
342		case '{':
343			bracecnt++;
344			RET('{');
345		case '[':
346			brackcnt++;
347			RET('[');
348		case '(':
349			parencnt++;
350			RET('(');
351
352		case '"':
353			return string();	/* BUG: should be like tran.c ? */
354
355		default:
356			RET(c);
357		}
358	}
359}
360
361int string(void)
362{
363	int c, n;
364	char *s, *bp;
365	static char *buf = 0;
366	static int bufsz = 500;
367
368	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
369		FATAL("out of space for strings");
370	for (bp = buf; (c = input()) != '"'; ) {
371		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
372			FATAL("out of space for string %.10s...", buf);
373		switch (c) {
374		case '\n':
375		case '\r':
376		case 0:
377			SYNTAX( "non-terminated string %.10s...", buf );
378			lineno++;
379			if (c == 0)	/* hopeless */
380				FATAL( "giving up" );
381			break;
382		case '\\':
383			c = input();
384			switch (c) {
385			case '"': *bp++ = '"'; break;
386			case 'n': *bp++ = '\n'; break;
387			case 't': *bp++ = '\t'; break;
388			case 'f': *bp++ = '\f'; break;
389			case 'r': *bp++ = '\r'; break;
390			case 'b': *bp++ = '\b'; break;
391			case 'v': *bp++ = '\v'; break;
392			case 'a': *bp++ = '\007'; break;
393			case '\\': *bp++ = '\\'; break;
394
395			case '0': case '1': case '2': /* octal: \d \dd \ddd */
396			case '3': case '4': case '5': case '6': case '7':
397				n = c - '0';
398				if ((c = peek()) >= '0' && c < '8') {
399					n = 8 * n + input() - '0';
400					if ((c = peek()) >= '0' && c < '8')
401						n = 8 * n + input() - '0';
402				}
403				*bp++ = n;
404				break;
405
406			case 'x':	/* hex  \x0-9a-fA-F + */
407			    {	char xbuf[100], *px;
408				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
409					if (isdigit(c)
410					 || (c >= 'a' && c <= 'f')
411					 || (c >= 'A' && c <= 'F'))
412						*px++ = c;
413					else
414						break;
415				}
416				*px = 0;
417				unput(c);
418	  			sscanf(xbuf, "%x", &n);
419				*bp++ = n;
420				break;
421			    }
422
423			default:
424				*bp++ = c;
425				break;
426			}
427			break;
428		default:
429			*bp++ = c;
430			break;
431		}
432	}
433	*bp = 0;
434	s = tostring(buf);
435	*bp++ = ' '; *bp++ = 0;
436	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
437	RET(STRING);
438}
439
440
441int binsearch(char *w, Keyword *kp, int n)
442{
443	int cond, low, mid, high;
444
445	low = 0;
446	high = n - 1;
447	while (low <= high) {
448		mid = (low + high) / 2;
449		if ((cond = strcmp(w, kp[mid].word)) < 0)
450			high = mid - 1;
451		else if (cond > 0)
452			low = mid + 1;
453		else
454			return mid;
455	}
456	return -1;
457}
458
459int word(char *w)
460{
461	Keyword *kp;
462	int c, n;
463
464	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
465	kp = keywords + n;
466	if (n != -1) {	/* found in table */
467		yylval.i = kp->sub;
468		switch (kp->type) {	/* special handling */
469		case FSYSTEM:
470			if (safe)
471				SYNTAX( "system is unsafe" );
472			RET(kp->type);
473		case FUNC:
474			if (infunc)
475				SYNTAX( "illegal nested function" );
476			RET(kp->type);
477		case RETURN:
478			if (!infunc)
479				SYNTAX( "return not in function" );
480			RET(kp->type);
481		case VARNF:
482			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
483			RET(VARNF);
484		default:
485			RET(kp->type);
486		}
487	}
488	c = peek();	/* look for '(' */
489	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
490		yylval.i = n;
491		RET(ARG);
492	} else {
493		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
494		if (c == '(') {
495			RET(CALL);
496		} else {
497			RET(VAR);
498		}
499	}
500}
501
502void startreg(void)	/* next call to yylex will return a regular expression */
503{
504	reg = 1;
505}
506
507int regexpr(void)
508{
509	int c;
510	static char *buf = 0;
511	static int bufsz = 500;
512	char *bp;
513
514	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
515		FATAL("out of space for rex expr");
516	bp = buf;
517	for ( ; (c = input()) != '/' && c != 0; ) {
518		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
519			FATAL("out of space for reg expr %.10s...", buf);
520		if (c == '\n') {
521			SYNTAX( "newline in regular expression %.10s...", buf );
522			unput('\n');
523			break;
524		} else if (c == '\\') {
525			*bp++ = '\\';
526			*bp++ = input();
527		} else {
528			*bp++ = c;
529		}
530	}
531	*bp = 0;
532	if (c == 0)
533		SYNTAX("non-terminated regular expression %.10s...", buf);
534	yylval.s = tostring(buf);
535	unput('/');
536	RET(REGEXPR);
537}
538
539/* low-level lexical stuff, sort of inherited from lex */
540
char	ebuf[300];		/* circular record of characters returned by input() */
char	*ep = ebuf;		/* next slot in ebuf to be written */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = 0;
546
547int input(void)	/* get next lexical input character */
548{
549	int c;
550	extern char *lexprog;
551
552	if (yysptr > yysbuf)
553		c = (uschar)*--yysptr;
554	else if (lexprog != NULL) {	/* awk '...' */
555		if ((c = (uschar)*lexprog) != 0)
556			lexprog++;
557	} else				/* awk -f ... */
558		c = pgetc();
559	if (c == '\n')
560		lineno++;
561	else if (c == EOF)
562		c = 0;
563	if (ep >= ebuf + sizeof ebuf)
564		ep = ebuf;
565	return *ep++ = c;
566}
567
568void unput(int c)	/* put lexical character back on input */
569{
570	if (c == '\n')
571		lineno--;
572	if (yysptr >= yysbuf + sizeof(yysbuf))
573		FATAL("pushed back too much: %.20s...", yysbuf);
574	*yysptr++ = c;
575	if (--ep < ebuf)
576		ep = ebuf + sizeof(ebuf) - 1;
577}
578
/*
 * unputstr: push the whole string s back on the input.
 * Characters are pushed last-to-first so that input() returns them in
 * their original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p;

	p = s + strlen(s);
	while (p > s)
		unput(*--p);
}
586