lex.c revision 85587
1/****************************************************************
2Copyright (C) Lucent Technologies 1997
3All Rights Reserved
4
5Permission to use, copy, modify, and distribute this software and
6its documentation for any purpose and without fee is hereby
7granted, provided that the above copyright notice appear in all
8copies and that both that the copyright notice and this
9permission notice and warranty disclaimer appear in supporting
10documentation, and that the name Lucent Technologies or any of
11its entities not be used in advertising or publicity pertaining
12to distribution of the software without specific, written prior
13permission.
14
15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22THIS SOFTWARE.
23****************************************************************/
24
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <ctype.h>
29#include "awk.h"
30#include "ytab.h"
31
32extern YYSTYPE	yylval;
33extern int	infunc;
34
35int	lineno	= 1;
36int	bracecnt = 0;
37int	brackcnt  = 0;
38int	parencnt = 0;
39
/* Keyword: one row of the reserved-word table searched by word() */
struct Keyword {
	char *word;	/* spelling of the keyword or built-in */
	int sub;	/* copied into yylval.i when the word is matched */
	int type;	/* token code returned to the parser */
};
typedef struct Keyword Keyword;
45
46Keyword keywords[] ={	/* keep sorted: binary searched */
47	{ "BEGIN",	XBEGIN,		XBEGIN },
48	{ "END",	XEND,		XEND },
49	{ "NF",		VARNF,		VARNF },
50	{ "atan2",	FATAN,		BLTIN },
51	{ "break",	BREAK,		BREAK },
52	{ "close",	CLOSE,		CLOSE },
53	{ "continue",	CONTINUE,	CONTINUE },
54	{ "cos",	FCOS,		BLTIN },
55	{ "delete",	DELETE,		DELETE },
56	{ "do",		DO,		DO },
57	{ "else",	ELSE,		ELSE },
58	{ "exit",	EXIT,		EXIT },
59	{ "exp",	FEXP,		BLTIN },
60	{ "fflush",	FFLUSH,		BLTIN },
61	{ "for",	FOR,		FOR },
62	{ "func",	FUNC,		FUNC },
63	{ "function",	FUNC,		FUNC },
64	{ "getline",	GETLINE,	GETLINE },
65	{ "gsub",	GSUB,		GSUB },
66	{ "if",		IF,		IF },
67	{ "in",		IN,		IN },
68	{ "index",	INDEX,		INDEX },
69	{ "int",	FINT,		BLTIN },
70	{ "length",	FLENGTH,	BLTIN },
71	{ "log",	FLOG,		BLTIN },
72	{ "match",	MATCHFCN,	MATCHFCN },
73	{ "next",	NEXT,		NEXT },
74	{ "nextfile",	NEXTFILE,	NEXTFILE },
75	{ "print",	PRINT,		PRINT },
76	{ "printf",	PRINTF,		PRINTF },
77	{ "rand",	FRAND,		BLTIN },
78	{ "return",	RETURN,		RETURN },
79	{ "sin",	FSIN,		BLTIN },
80	{ "split",	SPLIT,		SPLIT },
81	{ "sprintf",	SPRINTF,	SPRINTF },
82	{ "sqrt",	FSQRT,		BLTIN },
83	{ "srand",	FSRAND,		BLTIN },
84	{ "sub",	SUB,		SUB },
85	{ "substr",	SUBSTR,		SUBSTR },
86	{ "system",	FSYSTEM,	BLTIN },
87	{ "tolower",	FTOLOWER,	BLTIN },
88	{ "toupper",	FTOUPPER,	BLTIN },
89	{ "while",	WHILE,		WHILE },
90};
91
/*
 * RET(x): return token x from a lexer routine.
 * With DEBUG defined, the token's name is printed first whenever dbg is set.
 * The DEBUG form is wrapped in do { } while (0) so that "RET(x);" is a single
 * statement and stays safe inside an unbraced if/else.
 * Note that x is evaluated twice in the DEBUG form.
 */
#define DEBUG
#ifdef	DEBUG
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
#else
#define	RET(x)	return (x)
#endif
98
/* peek: report the next input character without consuming it */
int peek(void)
{
	int next;

	next = input();
	unput(next);	/* hand it straight back to the input */
	return next;
}
105
/*
 * gettok: read the next raw token into the buffer *pbuf, whose size is *psz.
 *
 * The buffer is enlarged through adjbuf() when needed; on return *pbuf and
 * *psz describe the (possibly moved) buffer.  Three kinds of token exist:
 *   - a single character that is not alphanumeric, '.' or '_';
 *   - a name: letter or '_' followed by letters, digits and '_';
 *   - a number: the longest prefix strtod() accepts.
 * Returns 0 at end of input, otherwise the first character of the token,
 * which is left NUL-terminated in the buffer.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	/* casts: <ctype.h> functions are only defined for unsigned char values */
	if (!isalnum((unsigned char) c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha((unsigned char) c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/* keep room for this character AND the final NUL */
			if (bp-buf >= sz-1)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum((unsigned char) c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
	} else {	/* it's a number */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* keep room for this character AND the final NUL */
			if (bp-buf >= sz-1)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, 0))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit((unsigned char) c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* only rem matters: where the number ends */
		if (rem == buf) {
			/*
			 * Nothing was converted (e.g. a '.' with no digit after
			 * it).  Keep the first character as a one-character
			 * token; emptying buf here would make the return value
			 * 0, which callers take for end of input.
			 */
			unputstr(buf+1);	/* put rest back for later */
			buf[1] = 0;
		} else {
			unputstr(rem);		/* put rest back for later */
			rem[0] = 0;		/* cut buf after the number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return buf[0];
}
160
/* helpers defined further down in this file */
int word(char *);
int string(void);
int regexpr(void);

/* one-shot flags examined (and cleared) at the top of yylex() */
int sc = 0;	/* 1 => return a } right now */
int reg = 0;	/* 1 => return a REGEXPR now */
166
167int yylex(void)
168{
169	int c;
170	static char *buf = 0;
171	static int bufsize = 500;
172
173	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
174		FATAL( "out of space in yylex" );
175	if (sc) {
176		sc = 0;
177		RET('}');
178	}
179	if (reg) {
180		reg = 0;
181		return regexpr();
182	}
183	for (;;) {
184		c = gettok(&buf, &bufsize);
185		if (c == 0)
186			return 0;
187		if (isalpha(c) || c == '_')
188			return word(buf);
189		if (isdigit(c) || c == '.') {
190			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
191			/* should this also have STR set? */
192			RET(NUMBER);
193		}
194
195		yylval.i = c;
196		switch (c) {
197		case '\n':	/* {EOL} */
198			RET(NL);
199		case '\r':	/* assume \n is coming */
200		case ' ':	/* {WS}+ */
201		case '\t':
202			break;
203		case '#':	/* #.* strip comments */
204			while ((c = input()) != '\n' && c != 0)
205				;
206			unput(c);
207			break;
208		case ';':
209			RET(';');
210		case '\\':
211			if (peek() == '\n') {
212				input();
213			} else if (peek() == '\r') {
214				input(); input();	/* \n */
215				lineno++;
216			} else {
217				RET(c);
218			}
219			break;
220		case '&':
221			if (peek() == '&') {
222				input(); RET(AND);
223			} else
224				RET('&');
225		case '|':
226			if (peek() == '|') {
227				input(); RET(BOR);
228			} else
229				RET('|');
230		case '!':
231			if (peek() == '=') {
232				input(); yylval.i = NE; RET(NE);
233			} else if (peek() == '~') {
234				input(); yylval.i = NOTMATCH; RET(MATCHOP);
235			} else
236				RET(NOT);
237		case '~':
238			yylval.i = MATCH;
239			RET(MATCHOP);
240		case '<':
241			if (peek() == '=') {
242				input(); yylval.i = LE; RET(LE);
243			} else {
244				yylval.i = LT; RET(LT);
245			}
246		case '=':
247			if (peek() == '=') {
248				input(); yylval.i = EQ; RET(EQ);
249			} else {
250				yylval.i = ASSIGN; RET(ASGNOP);
251			}
252		case '>':
253			if (peek() == '=') {
254				input(); yylval.i = GE; RET(GE);
255			} else if (peek() == '>') {
256				input(); yylval.i = APPEND; RET(APPEND);
257			} else {
258				yylval.i = GT; RET(GT);
259			}
260		case '+':
261			if (peek() == '+') {
262				input(); yylval.i = INCR; RET(INCR);
263			} else if (peek() == '=') {
264				input(); yylval.i = ADDEQ; RET(ASGNOP);
265			} else
266				RET('+');
267		case '-':
268			if (peek() == '-') {
269				input(); yylval.i = DECR; RET(DECR);
270			} else if (peek() == '=') {
271				input(); yylval.i = SUBEQ; RET(ASGNOP);
272			} else
273				RET('-');
274		case '*':
275			if (peek() == '=') {	/* *= */
276				input(); yylval.i = MULTEQ; RET(ASGNOP);
277			} else if (peek() == '*') {	/* ** or **= */
278				input();	/* eat 2nd * */
279				if (peek() == '=') {
280					input(); yylval.i = POWEQ; RET(ASGNOP);
281				} else {
282					RET(POWER);
283				}
284			} else
285				RET('*');
286		case '/':
287			RET('/');
288		case '%':
289			if (peek() == '=') {
290				input(); yylval.i = MODEQ; RET(ASGNOP);
291			} else
292				RET('%');
293		case '^':
294			if (peek() == '=') {
295				input(); yylval.i = POWEQ; RET(ASGNOP);
296			} else
297				RET(POWER);
298
299		case '$':
300			/* BUG: awkward, if not wrong */
301			c = gettok(&buf, &bufsize);
302			if (isalpha(c)) {
303				if (strcmp(buf, "NF") == 0) {	/* very special */
304					unputstr("(NF)");
305					RET(INDIRECT);
306				}
307				c = peek();
308				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
309					unputstr(buf);
310					RET(INDIRECT);
311				}
312				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
313				RET(IVAR);
314			} else {
315				unputstr(buf);
316				RET(INDIRECT);
317			}
318
319		case '}':
320			if (--bracecnt < 0)
321				SYNTAX( "extra }" );
322			sc = 1;
323			RET(';');
324		case ']':
325			if (--brackcnt < 0)
326				SYNTAX( "extra ]" );
327			RET(']');
328		case ')':
329			if (--parencnt < 0)
330				SYNTAX( "extra )" );
331			RET(')');
332		case '{':
333			bracecnt++;
334			RET('{');
335		case '[':
336			brackcnt++;
337			RET('[');
338		case '(':
339			parencnt++;
340			RET('(');
341
342		case '"':
343			return string();	/* BUG: should be like tran.c ? */
344
345		default:
346			RET(c);
347		}
348	}
349}
350
351int string(void)
352{
353	int c, n;
354	char *s, *bp;
355	static char *buf = 0;
356	static int bufsz = 500;
357
358	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
359		FATAL("out of space for strings");
360	for (bp = buf; (c = input()) != '"'; ) {
361		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, 0))
362			FATAL("out of space for string %.10s...", buf);
363		switch (c) {
364		case '\n':
365		case '\r':
366		case 0:
367			SYNTAX( "non-terminated string %.10s...", buf );
368			lineno++;
369			break;
370		case '\\':
371			c = input();
372			switch (c) {
373			case '"': *bp++ = '"'; break;
374			case 'n': *bp++ = '\n'; break;
375			case 't': *bp++ = '\t'; break;
376			case 'f': *bp++ = '\f'; break;
377			case 'r': *bp++ = '\r'; break;
378			case 'b': *bp++ = '\b'; break;
379			case 'v': *bp++ = '\v'; break;
380			case 'a': *bp++ = '\007'; break;
381			case '\\': *bp++ = '\\'; break;
382
383			case '0': case '1': case '2': /* octal: \d \dd \ddd */
384			case '3': case '4': case '5': case '6': case '7':
385				n = c - '0';
386				if ((c = peek()) >= '0' && c < '8') {
387					n = 8 * n + input() - '0';
388					if ((c = peek()) >= '0' && c < '8')
389						n = 8 * n + input() - '0';
390				}
391				*bp++ = n;
392				break;
393
394			case 'x':	/* hex  \x0-9a-fA-F + */
395			    {	char xbuf[100], *px;
396				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
397					if (isdigit(c)
398					 || (c >= 'a' && c <= 'f')
399					 || (c >= 'A' && c <= 'F'))
400						*px++ = c;
401					else
402						break;
403				}
404				*px = 0;
405				unput(c);
406	  			sscanf(xbuf, "%x", &n);
407				*bp++ = n;
408				break;
409			    }
410
411			default:
412				*bp++ = c;
413				break;
414			}
415			break;
416		default:
417			*bp++ = c;
418			break;
419		}
420	}
421	*bp = 0;
422	s = tostring(buf);
423	*bp++ = ' '; *bp++ = 0;
424	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
425	RET(STRING);
426}
427
428
429int binsearch(char *w, Keyword *kp, int n)
430{
431	int cond, low, mid, high;
432
433	low = 0;
434	high = n - 1;
435	while (low <= high) {
436		mid = (low + high) / 2;
437		if ((cond = strcmp(w, kp[mid].word)) < 0)
438			high = mid - 1;
439		else if (cond > 0)
440			low = mid + 1;
441		else
442			return mid;
443	}
444	return -1;
445}
446
447int word(char *w)
448{
449	Keyword *kp;
450	int c, n;
451
452	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
453	kp = keywords + n;
454	if (n != -1) {	/* found in table */
455		yylval.i = kp->sub;
456		switch (kp->type) {	/* special handling */
457		case FSYSTEM:
458			if (safe)
459				SYNTAX( "system is unsafe" );
460			RET(kp->type);
461		case FUNC:
462			if (infunc)
463				SYNTAX( "illegal nested function" );
464			RET(kp->type);
465		case RETURN:
466			if (!infunc)
467				SYNTAX( "return not in function" );
468			RET(kp->type);
469		case VARNF:
470			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
471			RET(VARNF);
472		default:
473			RET(kp->type);
474		}
475	}
476	c = peek();	/* look for '(' */
477	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
478		yylval.i = n;
479		RET(ARG);
480	} else {
481		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
482		if (c == '(') {
483			RET(CALL);
484		} else {
485			RET(VAR);
486		}
487	}
488}
489
490void startreg(void)	/* next call to yyles will return a regular expression */
491{
492	reg = 1;
493}
494
495int regexpr(void)
496{
497	int c;
498	static char *buf = 0;
499	static int bufsz = 500;
500	char *bp;
501
502	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
503		FATAL("out of space for rex expr");
504	bp = buf;
505	for ( ; (c = input()) != '/' && c != 0; ) {
506		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, 0))
507			FATAL("out of space for reg expr %.10s...", buf);
508		if (c == '\n') {
509			SYNTAX( "newline in regular expression %.10s...", buf );
510			unput('\n');
511			break;
512		} else if (c == '\\') {
513			*bp++ = '\\';
514			*bp++ = input();
515		} else {
516			*bp++ = c;
517		}
518	}
519	*bp = 0;
520	yylval.s = tostring(buf);
521	unput('/');
522	RET(REGEXPR);
523}
524
525/* low-level lexical stuff, sort of inherited from lex */
526
/* circular record of characters read: input() stores at ep, unput() backs ep up */
char ebuf[300];
char *ep = ebuf;

/* pushback buffer: unput() stores at yysptr, input() reads back from it */
char yysbuf[100];
char *yysptr = yysbuf;

FILE *yyin = 0;
532
533int input(void)	/* get next lexical input character */
534{
535	int c;
536	extern char *lexprog;
537
538	if (yysptr > yysbuf)
539		c = *--yysptr;
540	else if (lexprog != NULL) {	/* awk '...' */
541		if ((c = *lexprog) != 0)
542			lexprog++;
543	} else				/* awk -f ... */
544		c = pgetc();
545	if (c == '\n')
546		lineno++;
547	else if (c == EOF)
548		c = 0;
549	if (ep >= ebuf + sizeof ebuf)
550		ep = ebuf;
551	return *ep++ = c;
552}
553
554void unput(int c)	/* put lexical character back on input */
555{
556	if (c == '\n')
557		lineno--;
558	if (yysptr >= yysbuf + sizeof(yysbuf))
559		FATAL("pushed back too much: %.20s...", yysbuf);
560	*yysptr++ = c;
561	if (--ep < ebuf)
562		ep = ebuf + sizeof(ebuf) - 1;
563}
564
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are pushed last-to-first so they are read back in order.
 */
void unputstr(char *s)
{
	char *p = s + strlen(s);	/* one past the last character */

	while (p > s)
		unput(*--p);
}
572