lex.c revision 90902
1248590Smm/****************************************************************
2248590SmmCopyright (C) Lucent Technologies 1997
3248590SmmAll Rights Reserved
4248590Smm
5248590SmmPermission to use, copy, modify, and distribute this software and
6248590Smmits documentation for any purpose and without fee is hereby
7248590Smmgranted, provided that the above copyright notice appear in all
8248590Smmcopies and that both that the copyright notice and this
9248590Smmpermission notice and warranty disclaimer appear in supporting
10248590Smmdocumentation, and that the name Lucent Technologies or any of
11248590Smmits entities not be used in advertising or publicity pertaining
12248590Smmto distribution of the software without specific, written prior
13248590Smmpermission.
14248590Smm
15248590SmmLUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16248590SmmINCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17248590SmmIN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18248590SmmSPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19248590SmmWHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20248590SmmIN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21248590SmmARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22248590SmmTHIS SOFTWARE.
23248590Smm****************************************************************/
24248590Smm
25248590Smm#include <stdio.h>
26248590Smm#include <stdlib.h>
27248590Smm#include <string.h>
28248590Smm#include <ctype.h>
29248590Smm#include "awk.h"
30248590Smm#include "ytab.h"
31248590Smm
32248590Smmextern YYSTYPE	yylval;
33248590Smmextern int	infunc;
34248590Smm
int	lineno	= 1;	/* current source line: input() adds 1 on '\n', unput() takes it back */
int	bracecnt = 0;	/* '{' seen minus '}' seen by yylex(); going negative reports "extra }" */
int	brackcnt  = 0;	/* '[' seen minus ']' seen by yylex(); going negative reports "extra ]" */
int	parencnt = 0;	/* '(' seen minus ')' seen by yylex(); going negative reports "extra )" */
39248590Smm
/*
 * One entry of the reserved-word table that binsearch() searches.
 * The spelling is only ever read (it is initialized from string
 * literals), so it is declared const.
 */
typedef struct Keyword {
	const char	*word;	/* keyword spelling, compared with strcmp() */
	int	sub;		/* value word() stores in yylval.i */
	int	type;		/* token type word() returns */
} Keyword;
45248590Smm
/*
 * Reserved words and built-in function names recognized by word().
 * Columns are { spelling, sub, type }: word() stores sub in yylval.i and
 * returns type as the token.  For built-in functions the type is BLTIN
 * and sub identifies which function.  The entries must stay in strcmp()
 * order because the table is searched with binsearch().
 */
Keyword keywords[] ={	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
};
91248590Smm
/*
 * RET(x): return token x from the lexer.  DEBUG is always defined here,
 * so when dbg is non-zero the token name is printed first.
 * The debug form is wrapped in do { } while (0) so that "RET(x);" is one
 * statement and remains valid in an unbraced if ... else.
 */
#define DEBUG
#ifdef	DEBUG
#define	RET(x)	do { if(dbg)printf("lex %s\n", tokname(x)); return(x); } while (0)
#else
#define	RET(x)	return(x)
#endif
98248590Smm
/* peek: return the next input character without consuming it */
int peek(void)
{
	int ahead;

	ahead = input();
	unput(ahead);	/* push it straight back so the next input() sees it again */
	return ahead;
}
105248590Smm
/*
 * gettok: read the next raw token from input() into the buffer *pbuf.
 *
 * pbuf/psz describe a growable buffer owned by the caller.  adjbuf() may
 * enlarge or move it; on the name and number paths *pbuf and *psz are
 * updated before returning.
 *
 * Returns:
 *	0	end of input
 *	'a'	a name (letter or '_' followed by letters, digits, '_')
 *	'0'	a number: the longest prefix of the collected text that
 *		strtod() accepts; the rest is pushed back on the input
 *	other	the single character read, which is also left in the buffer
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	/* <ctype.h> arguments must be representable as unsigned char */
	if (!isalnum((unsigned char) c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha((unsigned char) c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Grow while one byte is still spare, so the final
			 * "*bp = 0" is in bounds even when the loop ends on
			 * end of input without reaching this check again.
			 * NOTE(review): assumes adjbuf() makes the buffer at
			 * least bp-buf+2 bytes -- confirm against adjbuf().
			 */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum((unsigned char) c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* it's a number */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same one-byte reserve as in the name loop above */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit((unsigned char) c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		unputstr(rem);		/* put rest back for later */
		if (rem == buf) {	/* it wasn't a valid number at all */
			buf[1] = 0;	/* so return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			rem[0] = 0;	/* so truncate where failure started */
			retc = '0';	/* number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
167248590Smm
int	word(char *);	/* classify a name: keyword token, ARG, CALL or VAR */
int	string(void);	/* lex a "..." constant; yylex() has already read the opening quote */
int	regexpr(void);	/* lex the text of a regular expression up to the closing '/' */
int	sc	= 0;	/* 1 => return a } right now */
int	reg	= 0;	/* 1 => return a REGEXPR now */
173248590Smm
174248590Smmint yylex(void)
175248590Smm{
176248590Smm	int c;
177248590Smm	static char *buf = 0;
178248590Smm	static int bufsize = 500;
179248590Smm
180248590Smm	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
181362133Smm		FATAL( "out of space in yylex" );
182362133Smm	if (sc) {
183248590Smm		sc = 0;
184248590Smm		RET('}');
185248590Smm	}
186248590Smm	if (reg) {
187248590Smm		reg = 0;
188248590Smm		return regexpr();
189248590Smm	}
190248590Smm	for (;;) {
191248590Smm		c = gettok(&buf, &bufsize);
192248590Smm		if (c == 0)
193248590Smm			return 0;
194248590Smm		if (isalpha(c) || c == '_')
195248590Smm			return word(buf);
196248590Smm		if (isdigit(c)) {
197362133Smm			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
198248590Smm			/* should this also have STR set? */
199248590Smm			RET(NUMBER);
200248590Smm		}
201248590Smm
202248590Smm		yylval.i = c;
203248590Smm		switch (c) {
204248590Smm		case '\n':	/* {EOL} */
205248590Smm			RET(NL);
206248590Smm		case '\r':	/* assume \n is coming */
207248590Smm		case ' ':	/* {WS}+ */
208248590Smm		case '\t':
209248590Smm			break;
210248590Smm		case '#':	/* #.* strip comments */
211248590Smm			while ((c = input()) != '\n' && c != 0)
212248590Smm				;
213248590Smm			unput(c);
214248590Smm			break;
215248590Smm		case ';':
216248590Smm			RET(';');
217248590Smm		case '\\':
218248590Smm			if (peek() == '\n') {
219248590Smm				input();
220248590Smm			} else if (peek() == '\r') {
221248590Smm				input(); input();	/* \n */
222248590Smm				lineno++;
223248590Smm			} else {
224248590Smm				RET(c);
225248590Smm			}
226248590Smm			break;
227248590Smm		case '&':
228248590Smm			if (peek() == '&') {
229248590Smm				input(); RET(AND);
230248590Smm			} else
231248590Smm				RET('&');
232248590Smm		case '|':
233248590Smm			if (peek() == '|') {
234248590Smm				input(); RET(BOR);
235248590Smm			} else
236248590Smm				RET('|');
237248590Smm		case '!':
238248590Smm			if (peek() == '=') {
239248590Smm				input(); yylval.i = NE; RET(NE);
240248590Smm			} else if (peek() == '~') {
241				input(); yylval.i = NOTMATCH; RET(MATCHOP);
242			} else
243				RET(NOT);
244		case '~':
245			yylval.i = MATCH;
246			RET(MATCHOP);
247		case '<':
248			if (peek() == '=') {
249				input(); yylval.i = LE; RET(LE);
250			} else {
251				yylval.i = LT; RET(LT);
252			}
253		case '=':
254			if (peek() == '=') {
255				input(); yylval.i = EQ; RET(EQ);
256			} else {
257				yylval.i = ASSIGN; RET(ASGNOP);
258			}
259		case '>':
260			if (peek() == '=') {
261				input(); yylval.i = GE; RET(GE);
262			} else if (peek() == '>') {
263				input(); yylval.i = APPEND; RET(APPEND);
264			} else {
265				yylval.i = GT; RET(GT);
266			}
267		case '+':
268			if (peek() == '+') {
269				input(); yylval.i = INCR; RET(INCR);
270			} else if (peek() == '=') {
271				input(); yylval.i = ADDEQ; RET(ASGNOP);
272			} else
273				RET('+');
274		case '-':
275			if (peek() == '-') {
276				input(); yylval.i = DECR; RET(DECR);
277			} else if (peek() == '=') {
278				input(); yylval.i = SUBEQ; RET(ASGNOP);
279			} else
280				RET('-');
281		case '*':
282			if (peek() == '=') {	/* *= */
283				input(); yylval.i = MULTEQ; RET(ASGNOP);
284			} else if (peek() == '*') {	/* ** or **= */
285				input();	/* eat 2nd * */
286				if (peek() == '=') {
287					input(); yylval.i = POWEQ; RET(ASGNOP);
288				} else {
289					RET(POWER);
290				}
291			} else
292				RET('*');
293		case '/':
294			RET('/');
295		case '%':
296			if (peek() == '=') {
297				input(); yylval.i = MODEQ; RET(ASGNOP);
298			} else
299				RET('%');
300		case '^':
301			if (peek() == '=') {
302				input(); yylval.i = POWEQ; RET(ASGNOP);
303			} else
304				RET(POWER);
305
306		case '$':
307			/* BUG: awkward, if not wrong */
308			c = gettok(&buf, &bufsize);
309			if (isalpha(c)) {
310				if (strcmp(buf, "NF") == 0) {	/* very special */
311					unputstr("(NF)");
312					RET(INDIRECT);
313				}
314				c = peek();
315				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
316					unputstr(buf);
317					RET(INDIRECT);
318				}
319				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
320				RET(IVAR);
321			} else if (c == 0) {	/*  */
322				SYNTAX( "unexpected end of input after $" );
323				RET(';');
324			} else {
325				unputstr(buf);
326				RET(INDIRECT);
327			}
328
329		case '}':
330			if (--bracecnt < 0)
331				SYNTAX( "extra }" );
332			sc = 1;
333			RET(';');
334		case ']':
335			if (--brackcnt < 0)
336				SYNTAX( "extra ]" );
337			RET(']');
338		case ')':
339			if (--parencnt < 0)
340				SYNTAX( "extra )" );
341			RET(')');
342		case '{':
343			bracecnt++;
344			RET('{');
345		case '[':
346			brackcnt++;
347			RET('[');
348		case '(':
349			parencnt++;
350			RET('(');
351
352		case '"':
353			return string();	/* BUG: should be like tran.c ? */
354
355		default:
356			RET(c);
357		}
358	}
359}
360
361int string(void)
362{
363	int c, n;
364	char *s, *bp;
365	static char *buf = 0;
366	static int bufsz = 500;
367
368	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
369		FATAL("out of space for strings");
370	for (bp = buf; (c = input()) != '"'; ) {
371		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
372			FATAL("out of space for string %.10s...", buf);
373		switch (c) {
374		case '\n':
375		case '\r':
376		case 0:
377			SYNTAX( "non-terminated string %.10s...", buf );
378			lineno++;
379			if (c == 0)	/* hopeless */
380				FATAL( "giving up" );
381			break;
382		case '\\':
383			c = input();
384			switch (c) {
385			case '"': *bp++ = '"'; break;
386			case 'n': *bp++ = '\n'; break;
387			case 't': *bp++ = '\t'; break;
388			case 'f': *bp++ = '\f'; break;
389			case 'r': *bp++ = '\r'; break;
390			case 'b': *bp++ = '\b'; break;
391			case 'v': *bp++ = '\v'; break;
392			case 'a': *bp++ = '\007'; break;
393			case '\\': *bp++ = '\\'; break;
394
395			case '0': case '1': case '2': /* octal: \d \dd \ddd */
396			case '3': case '4': case '5': case '6': case '7':
397				n = c - '0';
398				if ((c = peek()) >= '0' && c < '8') {
399					n = 8 * n + input() - '0';
400					if ((c = peek()) >= '0' && c < '8')
401						n = 8 * n + input() - '0';
402				}
403				*bp++ = n;
404				break;
405
406			case 'x':	/* hex  \x0-9a-fA-F + */
407			    {	char xbuf[100], *px;
408				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
409					if (isdigit(c)
410					 || (c >= 'a' && c <= 'f')
411					 || (c >= 'A' && c <= 'F'))
412						*px++ = c;
413					else
414						break;
415				}
416				*px = 0;
417				unput(c);
418	  			sscanf(xbuf, "%x", &n);
419				*bp++ = n;
420				break;
421			    }
422
423			default:
424				*bp++ = c;
425				break;
426			}
427			break;
428		default:
429			*bp++ = c;
430			break;
431		}
432	}
433	*bp = 0;
434	s = tostring(buf);
435	*bp++ = ' '; *bp++ = 0;
436	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
437	RET(STRING);
438}
439
440
441int binsearch(char *w, Keyword *kp, int n)
442{
443	int cond, low, mid, high;
444
445	low = 0;
446	high = n - 1;
447	while (low <= high) {
448		mid = (low + high) / 2;
449		if ((cond = strcmp(w, kp[mid].word)) < 0)
450			high = mid - 1;
451		else if (cond > 0)
452			low = mid + 1;
453		else
454			return mid;
455	}
456	return -1;
457}
458
459int word(char *w)
460{
461	Keyword *kp;
462	int c, n;
463
464	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
465	kp = keywords + n;
466	if (n != -1) {	/* found in table */
467		yylval.i = kp->sub;
468		switch (kp->type) {	/* special handling */
469		case FSYSTEM:
470			if (safe)
471				SYNTAX( "system is unsafe" );
472			RET(kp->type);
473		case FUNC:
474			if (infunc)
475				SYNTAX( "illegal nested function" );
476			RET(kp->type);
477		case RETURN:
478			if (!infunc)
479				SYNTAX( "return not in function" );
480			RET(kp->type);
481		case VARNF:
482			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
483			RET(VARNF);
484		default:
485			RET(kp->type);
486		}
487	}
488	c = peek();	/* look for '(' */
489	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
490		yylval.i = n;
491		RET(ARG);
492	} else {
493		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
494		if (c == '(') {
495			RET(CALL);
496		} else {
497			RET(VAR);
498		}
499	}
500}
501
502void startreg(void)	/* next call to yyles will return a regular expression */
503{
504	reg = 1;
505}
506
507int regexpr(void)
508{
509	int c;
510	static char *buf = 0;
511	static int bufsz = 500;
512	char *bp;
513
514	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
515		FATAL("out of space for rex expr");
516	bp = buf;
517	for ( ; (c = input()) != '/' && c != 0; ) {
518		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
519			FATAL("out of space for reg expr %.10s...", buf);
520		if (c == '\n') {
521			SYNTAX( "newline in regular expression %.10s...", buf );
522			unput('\n');
523			break;
524		} else if (c == '\\') {
525			*bp++ = '\\';
526			*bp++ = input();
527		} else {
528			*bp++ = c;
529		}
530	}
531	*bp = 0;
532	yylval.s = tostring(buf);
533	unput('/');
534	RET(REGEXPR);
535}
536
537/* low-level lexical stuff, sort of inherited from lex */
538
char	ebuf[300];	/* circular record of the characters most recently returned by input() */
char	*ep = ebuf;	/* next slot in ebuf: input() advances it (with wrap), unput() steps it back */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the last pushed-back character; == yysbuf when empty */
FILE	*yyin = 0;	/* not referenced by the code visible in this part of the file */
544
545int input(void)	/* get next lexical input character */
546{
547	int c;
548	extern char *lexprog;
549
550	if (yysptr > yysbuf)
551		c = *--yysptr;
552	else if (lexprog != NULL) {	/* awk '...' */
553		if ((c = *lexprog) != 0)
554			lexprog++;
555	} else				/* awk -f ... */
556		c = pgetc();
557	if (c == '\n')
558		lineno++;
559	else if (c == EOF)
560		c = 0;
561	if (ep >= ebuf + sizeof ebuf)
562		ep = ebuf;
563	return *ep++ = c;
564}
565
566void unput(int c)	/* put lexical character back on input */
567{
568	if (c == '\n')
569		lineno--;
570	if (yysptr >= yysbuf + sizeof(yysbuf))
571		FATAL("pushed back too much: %.20s...", yysbuf);
572	*yysptr++ = c;
573	if (--ep < ebuf)
574		ep = ebuf + sizeof(ebuf) - 1;
575}
576
/*
 * unputstr: push the whole string s back on the input.  The characters
 * are pushed last-to-first so that input() returns them in their
 * original order.
 */
void unputstr(char *s)	/* put a string back on input */
{
	char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
584