lexi.c revision 69796
/*
 * Copyright (c) 1985 Sun Microsystems, Inc.
 * Copyright (c) 1980, 1993
 *	The Regents of the University of California.  All rights reserved.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
35
#ifndef lint
/* Version identification strings embedded in the object file. */
static const char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
static const char rcsid[] =
  "@(#)$FreeBSD: head/usr.bin/indent/lexi.c 69796 2000-12-09 09:52:52Z obrien $";
#endif /* not lint */
41
/*
 * Here we have the token scanner for indent.  It scans off one token and puts
 * it in the global variable "token".  It returns a code indicating the type
 * of token scanned.
 */
47
48#include <stdio.h>
49#include <ctype.h>
50#include <stdlib.h>
51#include <string.h>
52#include "indent_globs.h"
53#include "indent_codes.h"
54
/* Character class codes stored in the chartype[] table (0 means neither). */
#define alphanum 1		/* may appear in an identifier or number */
#define opchar 3		/* operator character */
57
/*
 * One entry of the reserved-word table: the keyword text and the class code
 * that lexi() switches on when the scanned token matches it.
 */
struct templ {
    char       *rwd;		/* keyword text; 0 terminates the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved-word table, terminated by an entry whose rwd is 0.  The array is
 * oversized so that addkey() can append further keywords at run time.
 *
 * rwcode values as interpreted by lexi():
 *   1  switch
 *   2  case, default
 *   3  struct, union, enum
 *   4  declaration keywords
 *   5  if, while, for
 *   6  else, do
 *   7  sizeof
 *   anything else (0, 9, ...) is returned as an ordinary identifier
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {"const", 9},
    {"volatile", 9},
    {0, 0}
};
97
/*
 * Character classification table, indexed by 7-bit ASCII code.  It is used
 * to decide what type each character is: 1 for characters that may appear in
 * an identifier or number (letters, digits, '_' and '$'), 3 for operator
 * characters, and 0 for everything else.
 */
char        chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00 - 0x07: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x08 - 0x0f */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x18 - 0x1f */
    0, 3, 0, 0, 1, 3, 3, 0,	/* space ! " # $ % & ' */
    0, 0, 3, 3, 0, 3, 0, 3,	/* ( ) * + , - . / */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0 - 7 */
    1, 1, 0, 0, 3, 3, 3, 3,	/* 8 9 : ; < = > ? */
    0, 1, 1, 1, 1, 1, 1, 1,	/* @ A - G */
    1, 1, 1, 1, 1, 1, 1, 1,	/* H - O */
    1, 1, 1, 1, 1, 1, 1, 1,	/* P - W */
    1, 1, 1, 0, 0, 0, 3, 1,	/* X Y Z [ \ ] ^ _ */
    0, 1, 1, 1, 1, 1, 1, 1,	/* ` a - g */
    1, 1, 1, 1, 1, 1, 1, 1,	/* h - o */
    1, 1, 1, 1, 1, 1, 1, 1,	/* p - w */
    1, 1, 1, 0, 3, 0, 3, 0	/* x y z { | } ~ DEL */
};
119
120
121
122
123int
124lexi()
125{
126    int         unary_delim;	/* this is set to 1 if the current token
127				 *
128				 * forces a following operator to be unary */
129    static int  last_code;	/* the last token type returned */
130    static int  l_struct;	/* set to 1 if the last token was 'struct' */
131    int         code;		/* internal code to be returned */
132    char        qchar;		/* the delimiter character for a string */
133
134    e_token = s_token;		/* point to start of place to save token */
135    unary_delim = false;
136    ps.col_1 = ps.last_nl;	/* tell world that this token started in
137				 * column 1 iff the last thing scanned was nl */
138    ps.last_nl = false;
139
140    while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
141	ps.col_1 = false;	/* leading blanks imply token is not in column
142				 * 1 */
143	if (++buf_ptr >= buf_end)
144	    fill_buffer();
145    }
146
147    /* Scan an alphanumeric token */
148    if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
149	/*
150	 * we have a character or number
151	 */
152	register char *j;	/* used for searching thru list of
153				 *
154				 * reserved words */
155	register struct templ *p;
156
157	if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
158	    int         seendot = 0,
159	                seenexp = 0,
160			seensfx = 0;
161	    if (*buf_ptr == '0' &&
162		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
163		*e_token++ = *buf_ptr++;
164		*e_token++ = *buf_ptr++;
165		while (isxdigit(*buf_ptr)) {
166		    CHECK_SIZE_TOKEN;
167		    *e_token++ = *buf_ptr++;
168		}
169	    }
170	    else
171		while (1) {
172		    if (*buf_ptr == '.')
173			if (seendot)
174			    break;
175			else
176			    seendot++;
177		    CHECK_SIZE_TOKEN;
178		    *e_token++ = *buf_ptr++;
179		    if (!isdigit(*buf_ptr) && *buf_ptr != '.')
180			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
181			    break;
182			else {
183			    seenexp++;
184			    seendot++;
185			    CHECK_SIZE_TOKEN;
186			    *e_token++ = *buf_ptr++;
187			    if (*buf_ptr == '+' || *buf_ptr == '-')
188				*e_token++ = *buf_ptr++;
189			}
190		}
191	    while (1) {
192		if (!(seensfx & 1) &&
193			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
194		    CHECK_SIZE_TOKEN;
195		    *e_token++ = *buf_ptr++;
196		    seensfx |= 1;
197		    continue;
198		}
199        	if (!(seensfx & 2) &&
200			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
201		    CHECK_SIZE_TOKEN;
202		    if (buf_ptr[1] == buf_ptr[0])
203		        *e_token++ = *buf_ptr++;
204		    *e_token++ = *buf_ptr++;
205		    seensfx |= 2;
206		    continue;
207		}
208		break;
209	    }
210	}
211	else
212	    while (chartype[*buf_ptr] == alphanum) {	/* copy it over */
213		CHECK_SIZE_TOKEN;
214		*e_token++ = *buf_ptr++;
215		if (buf_ptr >= buf_end)
216		    fill_buffer();
217	    }
218	*e_token++ = '\0';
219	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
220	    if (++buf_ptr >= buf_end)
221		fill_buffer();
222	}
223	ps.its_a_keyword = false;
224	ps.sizeof_keyword = false;
225	if (l_struct) {		/* if last token was 'struct', then this token
226				 * should be treated as a declaration */
227	    l_struct = false;
228	    last_code = ident;
229	    ps.last_u_d = true;
230	    return (decl);
231	}
232	ps.last_u_d = false;	/* Operator after indentifier is binary */
233	last_code = ident;	/* Remember that this is the code we will
234				 * return */
235
236	/*
237	 * This loop will check if the token is a keyword.
238	 */
239	for (p = specials; (j = p->rwd) != 0; p++) {
240	    register char *p = s_token;	/* point at scanned token */
241	    if (*j++ != *p++ || *j++ != *p++)
242		continue;	/* This test depends on the fact that
243				 * identifiers are always at least 1 character
244				 * long (ie. the first two bytes of the
245				 * identifier are always meaningful) */
246	    if (p[-1] == 0)
247		break;		/* If its a one-character identifier */
248	    while (*p++ == *j)
249		if (*j++ == 0)
250		    goto found_keyword;	/* I wish that C had a multi-level
251					 * break... */
252	}
253	if (p->rwd) {		/* we have a keyword */
254    found_keyword:
255	    ps.its_a_keyword = true;
256	    ps.last_u_d = true;
257	    switch (p->rwcode) {
258	    case 1:		/* it is a switch */
259		return (swstmt);
260	    case 2:		/* a case or default */
261		return (casestmt);
262
263	    case 3:		/* a "struct" */
264		/*
265		 * Next time around, we may want to know that we have had a
266		 * 'struct'
267		 */
268		l_struct = true;
269
270		/*
271		 * Fall through to test for a cast, function prototype or
272		 * sizeof().
273		 */
274	    case 4:		/* one of the declaration keywords */
275		if (ps.p_l_follow) {
276		    ps.cast_mask |= 1 << ps.p_l_follow;
277
278		    /*
279		     * Forget that we saw `struct' if we're in a sizeof().
280		     */
281		    if (ps.sizeof_mask)
282			l_struct = false;
283
284		    break;	/* inside parens: cast, prototype or sizeof() */
285		}
286		last_code = decl;
287		return (decl);
288
289	    case 5:		/* if, while, for */
290		return (sp_paren);
291
292	    case 6:		/* do, else */
293		return (sp_nparen);
294
295	    case 7:
296		ps.sizeof_keyword = true;
297	    default:		/* all others are treated like any other
298				 * identifier */
299		return (ident);
300	    }			/* end of switch */
301	}			/* end of if (found_it) */
302	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
303	    register char *tp = buf_ptr;
304	    while (tp < buf_end)
305		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
306		    goto not_proc;
307	    strncpy(ps.procname, token, sizeof ps.procname - 1);
308	    ps.in_parameter_declaration = 1;
309	    rparen_count = 1;
310    not_proc:;
311	}
312	/*
313	 * The following hack attempts to guess whether or not the current
314	 * token is in fact a declaration keyword -- one that has been
315	 * typedefd
316	 */
317	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
318		&& !ps.p_l_follow
319	        && !ps.block_init
320		&& (ps.last_token == rparen || ps.last_token == semicolon ||
321		    ps.last_token == decl ||
322		    ps.last_token == lbrace || ps.last_token == rbrace)) {
323	    ps.its_a_keyword = true;
324	    ps.last_u_d = true;
325	    last_code = decl;
326	    return decl;
327	}
328	if (last_code == decl)	/* if this is a declared variable, then
329				 * following sign is unary */
330	    ps.last_u_d = true;	/* will make "int a -1" work */
331	last_code = ident;
332	return (ident);		/* the ident is not in the list */
333    }				/* end of procesing for alpanum character */
334
335    /* Scan a non-alphanumeric token */
336
337    *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
338				 * moved here */
339    *e_token = '\0';
340    if (++buf_ptr >= buf_end)
341	fill_buffer();
342
343    switch (*token) {
344    case '\n':
345	unary_delim = ps.last_u_d;
346	ps.last_nl = true;	/* remember that we just had a newline */
347	code = (had_eof ? 0 : newline);
348
349	/*
350	 * if data has been exausted, the newline is a dummy, and we should
351	 * return code to stop
352	 */
353	break;
354
355    case '\'':			/* start of quoted character */
356    case '"':			/* start of string */
357	qchar = *token;
358	if (troff) {
359	    e_token[-1] = '`';
360	    if (qchar == '"')
361		*e_token++ = '`';
362	    e_token = chfont(&bodyf, &stringf, e_token);
363	}
364	do {			/* copy the string */
365	    while (1) {		/* move one character or [/<char>]<char> */
366		if (*buf_ptr == '\n') {
367		    printf("%d: Unterminated literal\n", line_no);
368		    goto stop_lit;
369		}
370		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
371					 * since CHECK_SIZE guarantees that there
372					 * are at least 5 entries left */
373		*e_token = *buf_ptr++;
374		if (buf_ptr >= buf_end)
375		    fill_buffer();
376		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
377		    if (*buf_ptr == '\n')	/* check for escaped newline */
378			++line_no;
379		    if (troff) {
380			*++e_token = BACKSLASH;
381			if (*buf_ptr == BACKSLASH)
382			    *++e_token = BACKSLASH;
383		    }
384		    *++e_token = *buf_ptr++;
385		    ++e_token;	/* we must increment this again because we
386				 * copied two chars */
387		    if (buf_ptr >= buf_end)
388			fill_buffer();
389		}
390		else
391		    break;	/* we copied one character */
392	    }			/* end of while (1) */
393	} while (*e_token++ != qchar);
394	if (troff) {
395	    e_token = chfont(&stringf, &bodyf, e_token - 1);
396	    if (qchar == '"')
397		*e_token++ = '\'';
398	}
399stop_lit:
400	code = ident;
401	break;
402
403    case ('('):
404    case ('['):
405	unary_delim = true;
406	code = lparen;
407	break;
408
409    case (')'):
410    case (']'):
411	code = rparen;
412	break;
413
414    case '#':
415	unary_delim = ps.last_u_d;
416	code = preesc;
417	break;
418
419    case '?':
420	unary_delim = true;
421	code = question;
422	break;
423
424    case (':'):
425	code = colon;
426	unary_delim = true;
427	break;
428
429    case (';'):
430	unary_delim = true;
431	code = semicolon;
432	break;
433
434    case ('{'):
435	unary_delim = true;
436
437	/*
438	 * if (ps.in_or_st) ps.block_init = 1;
439	 */
440	/* ?	code = ps.block_init ? lparen : lbrace; */
441	code = lbrace;
442	break;
443
444    case ('}'):
445	unary_delim = true;
446	/* ?	code = ps.block_init ? rparen : rbrace; */
447	code = rbrace;
448	break;
449
450    case 014:			/* a form feed */
451	unary_delim = ps.last_u_d;
452	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
453				 * right */
454	code = form_feed;
455	break;
456
457    case (','):
458	unary_delim = true;
459	code = comma;
460	break;
461
462    case '.':
463	unary_delim = false;
464	code = period;
465	break;
466
467    case '-':
468    case '+':			/* check for -, +, --, ++ */
469	code = (ps.last_u_d ? unary_op : binary_op);
470	unary_delim = true;
471
472	if (*buf_ptr == token[0]) {
473	    /* check for doubled character */
474	    *e_token++ = *buf_ptr++;
475	    /* buffer overflow will be checked at end of loop */
476	    if (last_code == ident || last_code == rparen) {
477		code = (ps.last_u_d ? unary_op : postop);
478		/* check for following ++ or -- */
479		unary_delim = false;
480	    }
481	}
482	else if (*buf_ptr == '=')
483	    /* check for operator += */
484	    *e_token++ = *buf_ptr++;
485	else if (*buf_ptr == '>') {
486	    /* check for operator -> */
487	    *e_token++ = *buf_ptr++;
488	    if (!pointer_as_binop) {
489		unary_delim = false;
490		code = unary_op;
491		ps.want_blank = false;
492	    }
493	}
494	break;			/* buffer overflow will be checked at end of
495				 * switch */
496
497    case '=':
498	if (ps.in_or_st)
499	    ps.block_init = 1;
500#ifdef undef
501	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
502	    e_token[-1] = *buf_ptr++;
503	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
504		*e_token++ = *buf_ptr++;
505	    *e_token++ = '=';	/* Flip =+ to += */
506	    *e_token = 0;
507	}
508#else
509	if (*buf_ptr == '=') {/* == */
510	    *e_token++ = '=';	/* Flip =+ to += */
511	    buf_ptr++;
512	    *e_token = 0;
513	}
514#endif
515	code = binary_op;
516	unary_delim = true;
517	break;
518	/* can drop thru!!! */
519
520    case '>':
521    case '<':
522    case '!':			/* ops like <, <<, <=, !=, etc */
523	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
524	    *e_token++ = *buf_ptr;
525	    if (++buf_ptr >= buf_end)
526		fill_buffer();
527	}
528	if (*buf_ptr == '=')
529	    *e_token++ = *buf_ptr++;
530	code = (ps.last_u_d ? unary_op : binary_op);
531	unary_delim = true;
532	break;
533
534    default:
535	if (token[0] == '/' && *buf_ptr == '*') {
536	    /* it is start of comment */
537	    *e_token++ = '*';
538
539	    if (++buf_ptr >= buf_end)
540		fill_buffer();
541
542	    code = comment;
543	    unary_delim = ps.last_u_d;
544	    break;
545	}
546	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
547	    /*
548	     * handle ||, &&, etc, and also things as in int *****i
549	     */
550	    *e_token++ = *buf_ptr;
551	    if (++buf_ptr >= buf_end)
552		fill_buffer();
553	}
554	code = (ps.last_u_d ? unary_op : binary_op);
555	unary_delim = true;
556
557
558    }				/* end of switch */
559    if (code != newline) {
560	l_struct = false;
561	last_code = code;
562    }
563    if (buf_ptr >= buf_end)	/* check for input buffer empty */
564	fill_buffer();
565    ps.last_u_d = unary_delim;
566    *e_token = '\0';		/* null terminate the token */
567    return (code);
568}
569
570/*
571 * Add the given keyword to the keyword table, using val as the keyword type
572 */
573addkey(key, val)
574    char       *key;
575{
576    register struct templ *p = specials;
577    while (p->rwd)
578	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
579	    return;
580	else
581	    p++;
582    if (p >= specials + sizeof specials / sizeof specials[0])
583	return;			/* For now, table overflows are silently
584				 * ignored */
585    p->rwd = key;
586    p->rwcode = val;
587    p[1].rwd = 0;
588    p[1].rwcode = 0;
589    return;
590}
591