lexi.c revision 98771
1/*
2 * Copyright (c) 1985 Sun Microsystems, Inc.
3 * Copyright (c) 1980, 1993
4 *	The Regents of the University of California.  All rights reserved.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by the University of
18 *	California, Berkeley and its contributors.
19 * 4. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36#if 0
37#ifndef lint
38static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39static const char rcsid[] =
40  "$FreeBSD: head/usr.bin/indent/lexi.c 98771 2002-06-24 17:40:27Z jmallett $";
41#endif /* not lint */
42#endif
43
44/*
45 * Here we have the token scanner for indent.  It scans off one token and puts
46 * it in the global variable "token".  It returns a code, indicating the type
47 * of token scanned.
48 */
49
50#include <stdio.h>
51#include <ctype.h>
52#include <stdlib.h>
53#include <string.h>
54#include "indent_globs.h"
55#include "indent_codes.h"
56#include "indent.h"
57
58#define alphanum 1
59#define opchar 3
60
61void fill_buffer(void);
62
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * internal class code that lexi() switches on when the word is scanned.
 */
struct templ {
    const char *rwd;		/* keyword text; a null pointer ends the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved-word table searched by lexi() and extended by addkey().
 *
 * Class codes as interpreted by the switch in lexi():
 *   1  switch
 *   2  case, default
 *   3  struct, union, enum
 *   4  declaration keywords
 *   5  if, while, for
 *   6  else, do
 *   7  sizeof
 *   anything else (0, 9)  treated like an ordinary identifier
 *
 * The list is terminated by an entry whose rwd is 0.  The array is sized
 * well beyond the built-in entries so that addkey() has room to append.
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef" and never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {"const", 9},
    {"volatile", 9},
    {0, 0}
};
102
/*
 * Character classification table indexed by 7-bit ASCII code.  Each entry is
 * 1 for characters that may appear in an identifier or number, 3 for
 * operator characters, and 0 for everything else.  Sixteen entries per row;
 * the comment on each row gives the code range it covers.
 */
char        chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00 - 0x0f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10 - 0x1f */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,	/* ' ' - '/' */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,	/* '0' - '?' */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* '@' - 'O' */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,	/* 'P' - '_' */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* '`' - 'o' */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0	/* 'p' - DEL */
};
124
125int
126lexi(void)
127{
128    int         unary_delim;	/* this is set to 1 if the current token
129				 * forces a following operator to be unary */
130    static int  last_code;	/* the last token type returned */
131    static int  l_struct;	/* set to 1 if the last token was 'struct' */
132    int         code;		/* internal code to be returned */
133    char        qchar;		/* the delimiter character for a string */
134
135    e_token = s_token;		/* point to start of place to save token */
136    unary_delim = false;
137    ps.col_1 = ps.last_nl;	/* tell world that this token started in
138				 * column 1 iff the last thing scanned was nl */
139    ps.last_nl = false;
140
141    while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
142	ps.col_1 = false;	/* leading blanks imply token is not in column
143				 * 1 */
144	if (++buf_ptr >= buf_end)
145	    fill_buffer();
146    }
147
148    /* Scan an alphanumeric token */
149    if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
150	/*
151	 * we have a character or number
152	 */
153	const char *j;		/* used for searching thru list of
154				 *
155				 * reserved words */
156	struct templ *p;
157
158	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
159	    int         seendot = 0,
160	                seenexp = 0,
161			seensfx = 0;
162	    if (*buf_ptr == '0' &&
163		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
164		*e_token++ = *buf_ptr++;
165		*e_token++ = *buf_ptr++;
166		while (isxdigit(*buf_ptr)) {
167		    CHECK_SIZE_TOKEN;
168		    *e_token++ = *buf_ptr++;
169		}
170	    }
171	    else
172		while (1) {
173		    if (*buf_ptr == '.') {
174			if (seendot)
175			    break;
176			else
177			    seendot++;
178		    }
179		    CHECK_SIZE_TOKEN;
180		    *e_token++ = *buf_ptr++;
181		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
182			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
183			    break;
184			else {
185			    seenexp++;
186			    seendot++;
187			    CHECK_SIZE_TOKEN;
188			    *e_token++ = *buf_ptr++;
189			    if (*buf_ptr == '+' || *buf_ptr == '-')
190				*e_token++ = *buf_ptr++;
191			}
192		    }
193		}
194	    while (1) {
195		if (!(seensfx & 1) &&
196			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
197		    CHECK_SIZE_TOKEN;
198		    *e_token++ = *buf_ptr++;
199		    seensfx |= 1;
200		    continue;
201		}
202        	if (!(seensfx & 2) &&
203			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
204		    CHECK_SIZE_TOKEN;
205		    if (buf_ptr[1] == buf_ptr[0])
206		        *e_token++ = *buf_ptr++;
207		    *e_token++ = *buf_ptr++;
208		    seensfx |= 2;
209		    continue;
210		}
211		break;
212	    }
213	}
214	else
215	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
216		/* fill_buffer() terminates buffer with newline */
217		if (*buf_ptr == BACKSLASH) {
218		    if (*(buf_ptr + 1) == '\n') {
219			buf_ptr += 2;
220			if (buf_ptr >= buf_end)
221			    fill_buffer();
222			} else
223			    break;
224		}
225		CHECK_SIZE_TOKEN;
226		/* copy it over */
227		*e_token++ = *buf_ptr++;
228		if (buf_ptr >= buf_end)
229		    fill_buffer();
230	    }
231	*e_token++ = '\0';
232	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
233	    if (++buf_ptr >= buf_end)
234		fill_buffer();
235	}
236	ps.its_a_keyword = false;
237	ps.sizeof_keyword = false;
238	if (l_struct) {		/* if last token was 'struct', then this token
239				 * should be treated as a declaration */
240	    l_struct = false;
241	    last_code = ident;
242	    ps.last_u_d = true;
243	    return (decl);
244	}
245	ps.last_u_d = false;	/* Operator after indentifier is binary */
246	last_code = ident;	/* Remember that this is the code we will
247				 * return */
248
249	/*
250	 * This loop will check if the token is a keyword.
251	 */
252	for (p = specials; (j = p->rwd) != 0; p++) {
253	    const char *q = s_token;	/* point at scanned token */
254	    if (*j++ != *q++ || *j++ != *q++)
255		continue;	/* This test depends on the fact that
256				 * identifiers are always at least 1 character
257				 * long (ie. the first two bytes of the
258				 * identifier are always meaningful) */
259	    if (q[-1] == 0)
260		break;		/* If its a one-character identifier */
261	    while (*q++ == *j)
262		if (*j++ == 0)
263		    goto found_keyword;	/* I wish that C had a multi-level
264					 * break... */
265	}
266	if (p->rwd) {		/* we have a keyword */
267    found_keyword:
268	    ps.its_a_keyword = true;
269	    ps.last_u_d = true;
270	    switch (p->rwcode) {
271	    case 1:		/* it is a switch */
272		return (swstmt);
273	    case 2:		/* a case or default */
274		return (casestmt);
275
276	    case 3:		/* a "struct" */
277		if (ps.p_l_follow)
278			break;	/* inside parens: cast */
279		/*
280		 * Next time around, we may want to know that we have had a
281		 * 'struct'
282		 */
283		l_struct = true;
284
285		/*
286		 * Fall through to test for a cast, function prototype or
287		 * sizeof().
288		 */
289	    case 4:		/* one of the declaration keywords */
290		if (ps.p_l_follow) {
291		    ps.cast_mask |= 1 << ps.p_l_follow;
292
293		    /*
294		     * Forget that we saw `struct' if we're in a sizeof().
295		     */
296		    if (ps.sizeof_mask)
297			l_struct = false;
298
299		    break;	/* inside parens: cast, prototype or sizeof() */
300		}
301		last_code = decl;
302		return (decl);
303
304	    case 5:		/* if, while, for */
305		return (sp_paren);
306
307	    case 6:		/* do, else */
308		return (sp_nparen);
309
310	    case 7:
311		ps.sizeof_keyword = true;
312	    default:		/* all others are treated like any other
313				 * identifier */
314		return (ident);
315	    }			/* end of switch */
316	}			/* end of if (found_it) */
317	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
318	    char *tp = buf_ptr;
319	    while (tp < buf_end)
320		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
321		    goto not_proc;
322	    strncpy(ps.procname, token, sizeof ps.procname - 1);
323	    ps.in_parameter_declaration = 1;
324	    rparen_count = 1;
325    not_proc:;
326	}
327	/*
328	 * The following hack attempts to guess whether or not the current
329	 * token is in fact a declaration keyword -- one that has been
330	 * typedefd
331	 */
332	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
333		&& !ps.p_l_follow
334	        && !ps.block_init
335		&& (ps.last_token == rparen || ps.last_token == semicolon ||
336		    ps.last_token == decl ||
337		    ps.last_token == lbrace || ps.last_token == rbrace)) {
338	    ps.its_a_keyword = true;
339	    ps.last_u_d = true;
340	    last_code = decl;
341	    return decl;
342	}
343	if (last_code == decl)	/* if this is a declared variable, then
344				 * following sign is unary */
345	    ps.last_u_d = true;	/* will make "int a -1" work */
346	last_code = ident;
347	return (ident);		/* the ident is not in the list */
348    }				/* end of procesing for alpanum character */
349
350    /* Scan a non-alphanumeric token */
351
352    *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
353				 * moved here */
354    *e_token = '\0';
355    if (++buf_ptr >= buf_end)
356	fill_buffer();
357
358    switch (*token) {
359    case '\n':
360	unary_delim = ps.last_u_d;
361	ps.last_nl = true;	/* remember that we just had a newline */
362	code = (had_eof ? 0 : newline);
363
364	/*
365	 * if data has been exausted, the newline is a dummy, and we should
366	 * return code to stop
367	 */
368	break;
369
370    case '\'':			/* start of quoted character */
371    case '"':			/* start of string */
372	qchar = *token;
373	if (troff) {
374	    e_token[-1] = '`';
375	    if (qchar == '"')
376		*e_token++ = '`';
377	    e_token = chfont(&bodyf, &stringf, e_token);
378	}
379	do {			/* copy the string */
380	    while (1) {		/* move one character or [/<char>]<char> */
381		if (*buf_ptr == '\n') {
382		    printf("%d: Unterminated literal\n", line_no);
383		    goto stop_lit;
384		}
385		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
386					 * since CHECK_SIZE guarantees that there
387					 * are at least 5 entries left */
388		*e_token = *buf_ptr++;
389		if (buf_ptr >= buf_end)
390		    fill_buffer();
391		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
392		    if (*buf_ptr == '\n')	/* check for escaped newline */
393			++line_no;
394		    if (troff) {
395			*++e_token = BACKSLASH;
396			if (*buf_ptr == BACKSLASH)
397			    *++e_token = BACKSLASH;
398		    }
399		    *++e_token = *buf_ptr++;
400		    ++e_token;	/* we must increment this again because we
401				 * copied two chars */
402		    if (buf_ptr >= buf_end)
403			fill_buffer();
404		}
405		else
406		    break;	/* we copied one character */
407	    }			/* end of while (1) */
408	} while (*e_token++ != qchar);
409	if (troff) {
410	    e_token = chfont(&stringf, &bodyf, e_token - 1);
411	    if (qchar == '"')
412		*e_token++ = '\'';
413	}
414stop_lit:
415	code = ident;
416	break;
417
418    case ('('):
419    case ('['):
420	unary_delim = true;
421	code = lparen;
422	break;
423
424    case (')'):
425    case (']'):
426	code = rparen;
427	break;
428
429    case '#':
430	unary_delim = ps.last_u_d;
431	code = preesc;
432	break;
433
434    case '?':
435	unary_delim = true;
436	code = question;
437	break;
438
439    case (':'):
440	code = colon;
441	unary_delim = true;
442	break;
443
444    case (';'):
445	unary_delim = true;
446	code = semicolon;
447	break;
448
449    case ('{'):
450	unary_delim = true;
451
452	/*
453	 * if (ps.in_or_st) ps.block_init = 1;
454	 */
455	/* ?	code = ps.block_init ? lparen : lbrace; */
456	code = lbrace;
457	break;
458
459    case ('}'):
460	unary_delim = true;
461	/* ?	code = ps.block_init ? rparen : rbrace; */
462	code = rbrace;
463	break;
464
465    case 014:			/* a form feed */
466	unary_delim = ps.last_u_d;
467	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
468				 * right */
469	code = form_feed;
470	break;
471
472    case (','):
473	unary_delim = true;
474	code = comma;
475	break;
476
477    case '.':
478	unary_delim = false;
479	code = period;
480	break;
481
482    case '-':
483    case '+':			/* check for -, +, --, ++ */
484	code = (ps.last_u_d ? unary_op : binary_op);
485	unary_delim = true;
486
487	if (*buf_ptr == token[0]) {
488	    /* check for doubled character */
489	    *e_token++ = *buf_ptr++;
490	    /* buffer overflow will be checked at end of loop */
491	    if (last_code == ident || last_code == rparen) {
492		code = (ps.last_u_d ? unary_op : postop);
493		/* check for following ++ or -- */
494		unary_delim = false;
495	    }
496	}
497	else if (*buf_ptr == '=')
498	    /* check for operator += */
499	    *e_token++ = *buf_ptr++;
500	else if (*buf_ptr == '>') {
501	    /* check for operator -> */
502	    *e_token++ = *buf_ptr++;
503	    if (!pointer_as_binop) {
504		unary_delim = false;
505		code = unary_op;
506		ps.want_blank = false;
507	    }
508	}
509	break;			/* buffer overflow will be checked at end of
510				 * switch */
511
512    case '=':
513	if (ps.in_or_st)
514	    ps.block_init = 1;
515#ifdef undef
516	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
517	    e_token[-1] = *buf_ptr++;
518	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
519		*e_token++ = *buf_ptr++;
520	    *e_token++ = '=';	/* Flip =+ to += */
521	    *e_token = 0;
522	}
523#else
524	if (*buf_ptr == '=') {/* == */
525	    *e_token++ = '=';	/* Flip =+ to += */
526	    buf_ptr++;
527	    *e_token = 0;
528	}
529#endif
530	code = binary_op;
531	unary_delim = true;
532	break;
533	/* can drop thru!!! */
534
535    case '>':
536    case '<':
537    case '!':			/* ops like <, <<, <=, !=, etc */
538	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
539	    *e_token++ = *buf_ptr;
540	    if (++buf_ptr >= buf_end)
541		fill_buffer();
542	}
543	if (*buf_ptr == '=')
544	    *e_token++ = *buf_ptr++;
545	code = (ps.last_u_d ? unary_op : binary_op);
546	unary_delim = true;
547	break;
548
549    default:
550	if (token[0] == '/' && *buf_ptr == '*') {
551	    /* it is start of comment */
552	    *e_token++ = '*';
553
554	    if (++buf_ptr >= buf_end)
555		fill_buffer();
556
557	    code = comment;
558	    unary_delim = ps.last_u_d;
559	    break;
560	}
561	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
562	    /*
563	     * handle ||, &&, etc, and also things as in int *****i
564	     */
565	    *e_token++ = *buf_ptr;
566	    if (++buf_ptr >= buf_end)
567		fill_buffer();
568	}
569	code = (ps.last_u_d ? unary_op : binary_op);
570	unary_delim = true;
571
572
573    }				/* end of switch */
574    if (code != newline) {
575	l_struct = false;
576	last_code = code;
577    }
578    if (buf_ptr >= buf_end)	/* check for input buffer empty */
579	fill_buffer();
580    ps.last_u_d = unary_delim;
581    *e_token = '\0';		/* null terminate the token */
582    return (code);
583}
584
585/*
586 * Add the given keyword to the keyword table, using val as the keyword type
587 */
588void
589addkey(char *key, int val)
590{
591    struct templ *p = specials;
592    while (p->rwd)
593	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
594	    return;
595	else
596	    p++;
597    if (p >= specials + sizeof specials / sizeof specials[0])
598	return;			/* For now, table overflows are silently
599				 * ignored */
600    p->rwd = key;
601    p->rwcode = val;
602    p[1].rwd = 0;
603    p[1].rwcode = 0;
604}
605