lexi.c revision 116390
1/*
2 * Copyright (c) 1985 Sun Microsystems, Inc.
3 * Copyright (c) 1980, 1993
4 *	The Regents of the University of California.  All rights reserved.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by the University of
18 *	California, Berkeley and its contributors.
19 * 4. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36#if 0
37#ifndef lint
38static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39#endif /* not lint */
40#endif
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/usr.bin/indent/lexi.c 116390 2003-06-15 09:28:17Z charnier $");
43
44/*
45 * Here we have the token scanner for indent.  It scans off one token and puts
46 * it in the global variable "token".  It returns a code, indicating the type
47 * of token scanned.
48 */
49
50#include <err.h>
51#include <stdio.h>
52#include <ctype.h>
53#include <stdlib.h>
54#include <string.h>
55#include "indent_globs.h"
56#include "indent_codes.h"
57#include "indent.h"
58
/*
 * Character classes stored in the chartype[] table below.
 */
#define alphanum 1
#define opchar 3

/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() switches on once the word has been recognised.
 */
struct templ {
    const char *rwd;		/* keyword text; 0 terminates the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved words known to the scanner.  The class codes are interpreted by
 * the switch in lexi():
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords (types and storage classes)
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 *	other	treated like an ordinary identifier
 * The table is terminated by an entry whose rwd is 0; addkey() appends new
 * entries in front of that terminator, which is why the array is much larger
 * than the initial list.
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef" and never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {"const", 9},
    {"volatile", 9},
    {0, 0}
};
101
/*
 * Character classification table, indexed by 7-bit ASCII code.  Each entry
 * is 1 for characters that may appear in an identifier or number (letters,
 * digits, '_' and '$'), 3 for characters that may form operators, and 0 for
 * everything else.  Each row below covers sixteen consecutive codes.
 */
char chartype[128] = {
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: @ A B C D E F G H I J K L M N O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P Q R S T U V W X Y Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: ` a b c d e f g h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p q r s t u v w x y z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
123
124int
125lexi(void)
126{
127    int         unary_delim;	/* this is set to 1 if the current token
128				 * forces a following operator to be unary */
129    static int  last_code;	/* the last token type returned */
130    static int  l_struct;	/* set to 1 if the last token was 'struct' */
131    int         code;		/* internal code to be returned */
132    char        qchar;		/* the delimiter character for a string */
133
134    e_token = s_token;		/* point to start of place to save token */
135    unary_delim = false;
136    ps.col_1 = ps.last_nl;	/* tell world that this token started in
137				 * column 1 iff the last thing scanned was nl */
138    ps.last_nl = false;
139
140    while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
141	ps.col_1 = false;	/* leading blanks imply token is not in column
142				 * 1 */
143	if (++buf_ptr >= buf_end)
144	    fill_buffer();
145    }
146
147    /* Scan an alphanumeric token */
148    if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
149	/*
150	 * we have a character or number
151	 */
152	const char *j;		/* used for searching thru list of
153				 *
154				 * reserved words */
155	struct templ *p;
156
157	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
158	    int         seendot = 0,
159	                seenexp = 0,
160			seensfx = 0;
161	    if (*buf_ptr == '0' &&
162		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
163		*e_token++ = *buf_ptr++;
164		*e_token++ = *buf_ptr++;
165		while (isxdigit(*buf_ptr)) {
166		    CHECK_SIZE_TOKEN;
167		    *e_token++ = *buf_ptr++;
168		}
169	    }
170	    else
171		while (1) {
172		    if (*buf_ptr == '.') {
173			if (seendot)
174			    break;
175			else
176			    seendot++;
177		    }
178		    CHECK_SIZE_TOKEN;
179		    *e_token++ = *buf_ptr++;
180		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
181			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
182			    break;
183			else {
184			    seenexp++;
185			    seendot++;
186			    CHECK_SIZE_TOKEN;
187			    *e_token++ = *buf_ptr++;
188			    if (*buf_ptr == '+' || *buf_ptr == '-')
189				*e_token++ = *buf_ptr++;
190			}
191		    }
192		}
193	    while (1) {
194		if (!(seensfx & 1) &&
195			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
196		    CHECK_SIZE_TOKEN;
197		    *e_token++ = *buf_ptr++;
198		    seensfx |= 1;
199		    continue;
200		}
201        	if (!(seensfx & 2) &&
202			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
203		    CHECK_SIZE_TOKEN;
204		    if (buf_ptr[1] == buf_ptr[0])
205		        *e_token++ = *buf_ptr++;
206		    *e_token++ = *buf_ptr++;
207		    seensfx |= 2;
208		    continue;
209		}
210		break;
211	    }
212	}
213	else
214	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
215		/* fill_buffer() terminates buffer with newline */
216		if (*buf_ptr == BACKSLASH) {
217		    if (*(buf_ptr + 1) == '\n') {
218			buf_ptr += 2;
219			if (buf_ptr >= buf_end)
220			    fill_buffer();
221			} else
222			    break;
223		}
224		CHECK_SIZE_TOKEN;
225		/* copy it over */
226		*e_token++ = *buf_ptr++;
227		if (buf_ptr >= buf_end)
228		    fill_buffer();
229	    }
230	*e_token++ = '\0';
231	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
232	    if (++buf_ptr >= buf_end)
233		fill_buffer();
234	}
235	ps.its_a_keyword = false;
236	ps.sizeof_keyword = false;
237	if (l_struct) {		/* if last token was 'struct', then this token
238				 * should be treated as a declaration */
239	    l_struct = false;
240	    last_code = ident;
241	    ps.last_u_d = true;
242	    return (decl);
243	}
244	ps.last_u_d = false;	/* Operator after identifier is binary */
245	last_code = ident;	/* Remember that this is the code we will
246				 * return */
247
248	/*
249	 * This loop will check if the token is a keyword.
250	 */
251	for (p = specials; (j = p->rwd) != 0; p++) {
252	    const char *q = s_token;	/* point at scanned token */
253	    if (*j++ != *q++ || *j++ != *q++)
254		continue;	/* This test depends on the fact that
255				 * identifiers are always at least 1 character
256				 * long (ie. the first two bytes of the
257				 * identifier are always meaningful) */
258	    if (q[-1] == 0)
259		break;		/* If its a one-character identifier */
260	    while (*q++ == *j)
261		if (*j++ == 0)
262		    goto found_keyword;	/* I wish that C had a multi-level
263					 * break... */
264	}
265	if (p->rwd) {		/* we have a keyword */
266    found_keyword:
267	    ps.its_a_keyword = true;
268	    ps.last_u_d = true;
269	    switch (p->rwcode) {
270	    case 1:		/* it is a switch */
271		return (swstmt);
272	    case 2:		/* a case or default */
273		return (casestmt);
274
275	    case 3:		/* a "struct" */
276		if (ps.p_l_follow)
277			break;	/* inside parens: cast */
278		/*
279		 * Next time around, we may want to know that we have had a
280		 * 'struct'
281		 */
282		l_struct = true;
283
284		/*
285		 * Fall through to test for a cast, function prototype or
286		 * sizeof().
287		 */
288	    case 4:		/* one of the declaration keywords */
289		if (ps.p_l_follow) {
290		    ps.cast_mask |= 1 << ps.p_l_follow;
291
292		    /*
293		     * Forget that we saw `struct' if we're in a sizeof().
294		     */
295		    if (ps.sizeof_mask)
296			l_struct = false;
297
298		    break;	/* inside parens: cast, prototype or sizeof() */
299		}
300		last_code = decl;
301		return (decl);
302
303	    case 5:		/* if, while, for */
304		return (sp_paren);
305
306	    case 6:		/* do, else */
307		return (sp_nparen);
308
309	    case 7:
310		ps.sizeof_keyword = true;
311	    default:		/* all others are treated like any other
312				 * identifier */
313		return (ident);
314	    }			/* end of switch */
315	}			/* end of if (found_it) */
316	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
317	    char *tp = buf_ptr;
318	    while (tp < buf_end)
319		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
320		    goto not_proc;
321	    strncpy(ps.procname, token, sizeof ps.procname - 1);
322	    ps.in_parameter_declaration = 1;
323	    rparen_count = 1;
324    not_proc:;
325	}
326	/*
327	 * The following hack attempts to guess whether or not the current
328	 * token is in fact a declaration keyword -- one that has been
329	 * typedefd
330	 */
331	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
332		&& !ps.p_l_follow
333	        && !ps.block_init
334		&& (ps.last_token == rparen || ps.last_token == semicolon ||
335		    ps.last_token == decl ||
336		    ps.last_token == lbrace || ps.last_token == rbrace)) {
337	    ps.its_a_keyword = true;
338	    ps.last_u_d = true;
339	    last_code = decl;
340	    return decl;
341	}
342	if (last_code == decl)	/* if this is a declared variable, then
343				 * following sign is unary */
344	    ps.last_u_d = true;	/* will make "int a -1" work */
345	last_code = ident;
346	return (ident);		/* the ident is not in the list */
347    }				/* end of procesing for alpanum character */
348
349    /* Scan a non-alphanumeric token */
350
351    *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
352				 * moved here */
353    *e_token = '\0';
354    if (++buf_ptr >= buf_end)
355	fill_buffer();
356
357    switch (*token) {
358    case '\n':
359	unary_delim = ps.last_u_d;
360	ps.last_nl = true;	/* remember that we just had a newline */
361	code = (had_eof ? 0 : newline);
362
363	/*
364	 * if data has been exhausted, the newline is a dummy, and we should
365	 * return code to stop
366	 */
367	break;
368
369    case '\'':			/* start of quoted character */
370    case '"':			/* start of string */
371	qchar = *token;
372	if (troff) {
373	    e_token[-1] = '`';
374	    if (qchar == '"')
375		*e_token++ = '`';
376	    e_token = chfont(&bodyf, &stringf, e_token);
377	}
378	do {			/* copy the string */
379	    while (1) {		/* move one character or [/<char>]<char> */
380		if (*buf_ptr == '\n') {
381		    printf("%d: Unterminated literal\n", line_no);
382		    goto stop_lit;
383		}
384		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
385					 * since CHECK_SIZE guarantees that there
386					 * are at least 5 entries left */
387		*e_token = *buf_ptr++;
388		if (buf_ptr >= buf_end)
389		    fill_buffer();
390		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
391		    if (*buf_ptr == '\n')	/* check for escaped newline */
392			++line_no;
393		    if (troff) {
394			*++e_token = BACKSLASH;
395			if (*buf_ptr == BACKSLASH)
396			    *++e_token = BACKSLASH;
397		    }
398		    *++e_token = *buf_ptr++;
399		    ++e_token;	/* we must increment this again because we
400				 * copied two chars */
401		    if (buf_ptr >= buf_end)
402			fill_buffer();
403		}
404		else
405		    break;	/* we copied one character */
406	    }			/* end of while (1) */
407	} while (*e_token++ != qchar);
408	if (troff) {
409	    e_token = chfont(&stringf, &bodyf, e_token - 1);
410	    if (qchar == '"')
411		*e_token++ = '\'';
412	}
413stop_lit:
414	code = ident;
415	break;
416
417    case ('('):
418    case ('['):
419	unary_delim = true;
420	code = lparen;
421	break;
422
423    case (')'):
424    case (']'):
425	code = rparen;
426	break;
427
428    case '#':
429	unary_delim = ps.last_u_d;
430	code = preesc;
431	break;
432
433    case '?':
434	unary_delim = true;
435	code = question;
436	break;
437
438    case (':'):
439	code = colon;
440	unary_delim = true;
441	break;
442
443    case (';'):
444	unary_delim = true;
445	code = semicolon;
446	break;
447
448    case ('{'):
449	unary_delim = true;
450
451	/*
452	 * if (ps.in_or_st) ps.block_init = 1;
453	 */
454	/* ?	code = ps.block_init ? lparen : lbrace; */
455	code = lbrace;
456	break;
457
458    case ('}'):
459	unary_delim = true;
460	/* ?	code = ps.block_init ? rparen : rbrace; */
461	code = rbrace;
462	break;
463
464    case 014:			/* a form feed */
465	unary_delim = ps.last_u_d;
466	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
467				 * right */
468	code = form_feed;
469	break;
470
471    case (','):
472	unary_delim = true;
473	code = comma;
474	break;
475
476    case '.':
477	unary_delim = false;
478	code = period;
479	break;
480
481    case '-':
482    case '+':			/* check for -, +, --, ++ */
483	code = (ps.last_u_d ? unary_op : binary_op);
484	unary_delim = true;
485
486	if (*buf_ptr == token[0]) {
487	    /* check for doubled character */
488	    *e_token++ = *buf_ptr++;
489	    /* buffer overflow will be checked at end of loop */
490	    if (last_code == ident || last_code == rparen) {
491		code = (ps.last_u_d ? unary_op : postop);
492		/* check for following ++ or -- */
493		unary_delim = false;
494	    }
495	}
496	else if (*buf_ptr == '=')
497	    /* check for operator += */
498	    *e_token++ = *buf_ptr++;
499	else if (*buf_ptr == '>') {
500	    /* check for operator -> */
501	    *e_token++ = *buf_ptr++;
502	    if (!pointer_as_binop) {
503		unary_delim = false;
504		code = unary_op;
505		ps.want_blank = false;
506	    }
507	}
508	break;			/* buffer overflow will be checked at end of
509				 * switch */
510
511    case '=':
512	if (ps.in_or_st)
513	    ps.block_init = 1;
514#ifdef undef
515	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
516	    e_token[-1] = *buf_ptr++;
517	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
518		*e_token++ = *buf_ptr++;
519	    *e_token++ = '=';	/* Flip =+ to += */
520	    *e_token = 0;
521	}
522#else
523	if (*buf_ptr == '=') {/* == */
524	    *e_token++ = '=';	/* Flip =+ to += */
525	    buf_ptr++;
526	    *e_token = 0;
527	}
528#endif
529	code = binary_op;
530	unary_delim = true;
531	break;
532	/* can drop thru!!! */
533
534    case '>':
535    case '<':
536    case '!':			/* ops like <, <<, <=, !=, etc */
537	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
538	    *e_token++ = *buf_ptr;
539	    if (++buf_ptr >= buf_end)
540		fill_buffer();
541	}
542	if (*buf_ptr == '=')
543	    *e_token++ = *buf_ptr++;
544	code = (ps.last_u_d ? unary_op : binary_op);
545	unary_delim = true;
546	break;
547
548    default:
549	if (token[0] == '/' && *buf_ptr == '*') {
550	    /* it is start of comment */
551	    *e_token++ = '*';
552
553	    if (++buf_ptr >= buf_end)
554		fill_buffer();
555
556	    code = comment;
557	    unary_delim = ps.last_u_d;
558	    break;
559	}
560	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
561	    /*
562	     * handle ||, &&, etc, and also things as in int *****i
563	     */
564	    *e_token++ = *buf_ptr;
565	    if (++buf_ptr >= buf_end)
566		fill_buffer();
567	}
568	code = (ps.last_u_d ? unary_op : binary_op);
569	unary_delim = true;
570
571
572    }				/* end of switch */
573    if (code != newline) {
574	l_struct = false;
575	last_code = code;
576    }
577    if (buf_ptr >= buf_end)	/* check for input buffer empty */
578	fill_buffer();
579    ps.last_u_d = unary_delim;
580    *e_token = '\0';		/* null terminate the token */
581    return (code);
582}
583
584/*
585 * Add the given keyword to the keyword table, using val as the keyword type
586 */
587void
588addkey(char *key, int val)
589{
590    struct templ *p = specials;
591    while (p->rwd)
592	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
593	    return;
594	else
595	    p++;
596    if (p >= specials + sizeof specials / sizeof specials[0])
597	return;			/* For now, table overflows are silently
598				 * ignored */
599    p->rwd = key;
600    p->rwcode = val;
601    p[1].rwd = 0;
602    p[1].rwcode = 0;
603}
604