lexi.c revision 125623
1/*
2 * Copyright (c) 1985 Sun Microsystems, Inc.
3 * Copyright (c) 1980, 1993
4 *	The Regents of the University of California.  All rights reserved.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by the University of
18 *	California, Berkeley and its contributors.
19 * 4. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36#if 0
37#ifndef lint
38static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39#endif /* not lint */
40#endif
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/usr.bin/indent/lexi.c 125623 2004-02-09 15:27:02Z bde $");
43
44/*
45 * Here we have the token scanner for indent.  It scans off one token and puts
46 * it in the global variable "token".  It returns a code, indicating the type
47 * of token scanned.
48 */
49
50#include <err.h>
51#include <stdio.h>
52#include <ctype.h>
53#include <stdlib.h>
54#include <string.h>
55#include "indent_globs.h"
56#include "indent_codes.h"
57#include "indent.h"
58
#define alphanum 1		/* chartype[] class: letters, digits, '_' and '$' */
#define opchar 3		/* chartype[] class: operator characters */
61
/*
 * One entry of the reserved-word table: the spelling of the keyword and the
 * class code that lexi() switches on once the keyword has been recognized.
 */
struct templ {
    const char *rwd;		/* keyword text; 0 terminates the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved words known to lexi().  The rwcode values are interpreted by
 * lexi() as follows:
 *
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 *	0	anything else; handled like an ordinary identifier
 *
 * The array is larger than the built-in list so that addkey() can append
 * further keywords behind it; the entry with rwd == 0 marks the end.
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef" and never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"const", 4},
    {"volatile", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
101
/*
 * Character classification table indexed by 7-bit ASCII code.  An entry of
 * 1 marks a character that may appear in an identifier or number, 3 marks a
 * character that may appear in an operator, and 0 marks everything else.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: @ A B C D E F G H I J K L M N O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P Q R S T U V W X Y Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: ` a b c d e f g h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p q r s t u v w x y z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
123
124int
125lexi(void)
126{
127    int         unary_delim;	/* this is set to 1 if the current token
128				 * forces a following operator to be unary */
129    static int  last_code;	/* the last token type returned */
130    static int  l_struct;	/* set to 1 if the last token was 'struct' */
131    int         code;		/* internal code to be returned */
132    char        qchar;		/* the delimiter character for a string */
133
134    e_token = s_token;		/* point to start of place to save token */
135    unary_delim = false;
136    ps.col_1 = ps.last_nl;	/* tell world that this token started in
137				 * column 1 iff the last thing scanned was nl */
138    ps.last_nl = false;
139
140    while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
141	ps.col_1 = false;	/* leading blanks imply token is not in column
142				 * 1 */
143	if (++buf_ptr >= buf_end)
144	    fill_buffer();
145    }
146
147    /* Scan an alphanumeric token */
148    if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
149	/*
150	 * we have a character or number
151	 */
152	const char *j;		/* used for searching thru list of
153				 *
154				 * reserved words */
155	struct templ *p;
156
157	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
158	    int         seendot = 0,
159	                seenexp = 0,
160			seensfx = 0;
161	    if (*buf_ptr == '0' &&
162		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
163		*e_token++ = *buf_ptr++;
164		*e_token++ = *buf_ptr++;
165		while (isxdigit(*buf_ptr)) {
166		    CHECK_SIZE_TOKEN;
167		    *e_token++ = *buf_ptr++;
168		}
169	    }
170	    else
171		while (1) {
172		    if (*buf_ptr == '.') {
173			if (seendot)
174			    break;
175			else
176			    seendot++;
177		    }
178		    CHECK_SIZE_TOKEN;
179		    *e_token++ = *buf_ptr++;
180		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
181			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
182			    break;
183			else {
184			    seenexp++;
185			    seendot++;
186			    CHECK_SIZE_TOKEN;
187			    *e_token++ = *buf_ptr++;
188			    if (*buf_ptr == '+' || *buf_ptr == '-')
189				*e_token++ = *buf_ptr++;
190			}
191		    }
192		}
193	    while (1) {
194		if (!(seensfx & 1) &&
195			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
196		    CHECK_SIZE_TOKEN;
197		    *e_token++ = *buf_ptr++;
198		    seensfx |= 1;
199		    continue;
200		}
201        	if (!(seensfx & 2) &&
202			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
203		    CHECK_SIZE_TOKEN;
204		    if (buf_ptr[1] == buf_ptr[0])
205		        *e_token++ = *buf_ptr++;
206		    *e_token++ = *buf_ptr++;
207		    seensfx |= 2;
208		    continue;
209		}
210		break;
211	    }
212	}
213	else
214	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
215		/* fill_buffer() terminates buffer with newline */
216		if (*buf_ptr == BACKSLASH) {
217		    if (*(buf_ptr + 1) == '\n') {
218			buf_ptr += 2;
219			if (buf_ptr >= buf_end)
220			    fill_buffer();
221			} else
222			    break;
223		}
224		CHECK_SIZE_TOKEN;
225		/* copy it over */
226		*e_token++ = *buf_ptr++;
227		if (buf_ptr >= buf_end)
228		    fill_buffer();
229	    }
230	*e_token++ = '\0';
231	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
232	    if (++buf_ptr >= buf_end)
233		fill_buffer();
234	}
235	ps.its_a_keyword = false;
236	ps.sizeof_keyword = false;
237	if (l_struct && !ps.p_l_follow) {
238				/* if last token was 'struct' and we're not
239				 * in parentheses, then this token
240				 * should be treated as a declaration */
241	    l_struct = false;
242	    last_code = ident;
243	    ps.last_u_d = true;
244	    return (decl);
245	}
246	ps.last_u_d = l_struct;	/* Operator after identifier is binary
247				 * unless last token was 'struct' */
248	l_struct = false;
249	last_code = ident;	/* Remember that this is the code we will
250				 * return */
251
252	/*
253	 * This loop will check if the token is a keyword.
254	 */
255	for (p = specials; (j = p->rwd) != 0; p++) {
256	    const char *q = s_token;	/* point at scanned token */
257	    if (*j++ != *q++ || *j++ != *q++)
258		continue;	/* This test depends on the fact that
259				 * identifiers are always at least 1 character
260				 * long (ie. the first two bytes of the
261				 * identifier are always meaningful) */
262	    if (q[-1] == 0)
263		break;		/* If its a one-character identifier */
264	    while (*q++ == *j)
265		if (*j++ == 0)
266		    goto found_keyword;	/* I wish that C had a multi-level
267					 * break... */
268	}
269	if (p->rwd) {		/* we have a keyword */
270    found_keyword:
271	    ps.its_a_keyword = true;
272	    ps.last_u_d = true;
273	    switch (p->rwcode) {
274	    case 1:		/* it is a switch */
275		return (swstmt);
276	    case 2:		/* a case or default */
277		return (casestmt);
278
279	    case 3:		/* a "struct" */
280		/*
281		 * Next time around, we will want to know that we have had a
282		 * 'struct'
283		 */
284		l_struct = true;
285		/* FALLTHROUGH */
286
287	    case 4:		/* one of the declaration keywords */
288		if (ps.p_l_follow) {
289		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
290		    break;	/* inside parens: cast, param list or sizeof */
291		}
292		last_code = decl;
293		return (decl);
294
295	    case 5:		/* if, while, for */
296		return (sp_paren);
297
298	    case 6:		/* do, else */
299		return (sp_nparen);
300
301	    case 7:
302		ps.sizeof_keyword = true;
303	    default:		/* all others are treated like any other
304				 * identifier */
305		return (ident);
306	    }			/* end of switch */
307	}			/* end of if (found_it) */
308	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
309	    char *tp = buf_ptr;
310	    while (tp < buf_end)
311		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
312		    goto not_proc;
313	    strncpy(ps.procname, token, sizeof ps.procname - 1);
314	    ps.in_parameter_declaration = 1;
315	    rparen_count = 1;
316    not_proc:;
317	}
318	/*
319	 * The following hack attempts to guess whether or not the current
320	 * token is in fact a declaration keyword -- one that has been
321	 * typedefd
322	 */
323	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
324		&& !ps.p_l_follow
325	        && !ps.block_init
326		&& (ps.last_token == rparen || ps.last_token == semicolon ||
327		    ps.last_token == decl ||
328		    ps.last_token == lbrace || ps.last_token == rbrace)) {
329	    ps.its_a_keyword = true;
330	    ps.last_u_d = true;
331	    last_code = decl;
332	    return decl;
333	}
334	if (last_code == decl)	/* if this is a declared variable, then
335				 * following sign is unary */
336	    ps.last_u_d = true;	/* will make "int a -1" work */
337	last_code = ident;
338	return (ident);		/* the ident is not in the list */
339    }				/* end of procesing for alpanum character */
340
341    /* Scan a non-alphanumeric token */
342
343    *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
344				 * moved here */
345    *e_token = '\0';
346    if (++buf_ptr >= buf_end)
347	fill_buffer();
348
349    switch (*token) {
350    case '\n':
351	unary_delim = ps.last_u_d;
352	ps.last_nl = true;	/* remember that we just had a newline */
353	code = (had_eof ? 0 : newline);
354
355	/*
356	 * if data has been exhausted, the newline is a dummy, and we should
357	 * return code to stop
358	 */
359	break;
360
361    case '\'':			/* start of quoted character */
362    case '"':			/* start of string */
363	qchar = *token;
364	if (troff) {
365	    e_token[-1] = '`';
366	    if (qchar == '"')
367		*e_token++ = '`';
368	    e_token = chfont(&bodyf, &stringf, e_token);
369	}
370	do {			/* copy the string */
371	    while (1) {		/* move one character or [/<char>]<char> */
372		if (*buf_ptr == '\n') {
373		    printf("%d: Unterminated literal\n", line_no);
374		    goto stop_lit;
375		}
376		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
377					 * since CHECK_SIZE guarantees that there
378					 * are at least 5 entries left */
379		*e_token = *buf_ptr++;
380		if (buf_ptr >= buf_end)
381		    fill_buffer();
382		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
383		    if (*buf_ptr == '\n')	/* check for escaped newline */
384			++line_no;
385		    if (troff) {
386			*++e_token = BACKSLASH;
387			if (*buf_ptr == BACKSLASH)
388			    *++e_token = BACKSLASH;
389		    }
390		    *++e_token = *buf_ptr++;
391		    ++e_token;	/* we must increment this again because we
392				 * copied two chars */
393		    if (buf_ptr >= buf_end)
394			fill_buffer();
395		}
396		else
397		    break;	/* we copied one character */
398	    }			/* end of while (1) */
399	} while (*e_token++ != qchar);
400	if (troff) {
401	    e_token = chfont(&stringf, &bodyf, e_token - 1);
402	    if (qchar == '"')
403		*e_token++ = '\'';
404	}
405stop_lit:
406	code = ident;
407	break;
408
409    case ('('):
410    case ('['):
411	unary_delim = true;
412	code = lparen;
413	break;
414
415    case (')'):
416    case (']'):
417	code = rparen;
418	break;
419
420    case '#':
421	unary_delim = ps.last_u_d;
422	code = preesc;
423	break;
424
425    case '?':
426	unary_delim = true;
427	code = question;
428	break;
429
430    case (':'):
431	code = colon;
432	unary_delim = true;
433	break;
434
435    case (';'):
436	unary_delim = true;
437	code = semicolon;
438	break;
439
440    case ('{'):
441	unary_delim = true;
442
443	/*
444	 * if (ps.in_or_st) ps.block_init = 1;
445	 */
446	/* ?	code = ps.block_init ? lparen : lbrace; */
447	code = lbrace;
448	break;
449
450    case ('}'):
451	unary_delim = true;
452	/* ?	code = ps.block_init ? rparen : rbrace; */
453	code = rbrace;
454	break;
455
456    case 014:			/* a form feed */
457	unary_delim = ps.last_u_d;
458	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
459				 * right */
460	code = form_feed;
461	break;
462
463    case (','):
464	unary_delim = true;
465	code = comma;
466	break;
467
468    case '.':
469	unary_delim = false;
470	code = period;
471	break;
472
473    case '-':
474    case '+':			/* check for -, +, --, ++ */
475	code = (ps.last_u_d ? unary_op : binary_op);
476	unary_delim = true;
477
478	if (*buf_ptr == token[0]) {
479	    /* check for doubled character */
480	    *e_token++ = *buf_ptr++;
481	    /* buffer overflow will be checked at end of loop */
482	    if (last_code == ident || last_code == rparen) {
483		code = (ps.last_u_d ? unary_op : postop);
484		/* check for following ++ or -- */
485		unary_delim = false;
486	    }
487	}
488	else if (*buf_ptr == '=')
489	    /* check for operator += */
490	    *e_token++ = *buf_ptr++;
491	else if (*buf_ptr == '>') {
492	    /* check for operator -> */
493	    *e_token++ = *buf_ptr++;
494	    if (!pointer_as_binop) {
495		unary_delim = false;
496		code = unary_op;
497		ps.want_blank = false;
498	    }
499	}
500	break;			/* buffer overflow will be checked at end of
501				 * switch */
502
503    case '=':
504	if (ps.in_or_st)
505	    ps.block_init = 1;
506#ifdef undef
507	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
508	    e_token[-1] = *buf_ptr++;
509	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
510		*e_token++ = *buf_ptr++;
511	    *e_token++ = '=';	/* Flip =+ to += */
512	    *e_token = 0;
513	}
514#else
515	if (*buf_ptr == '=') {/* == */
516	    *e_token++ = '=';	/* Flip =+ to += */
517	    buf_ptr++;
518	    *e_token = 0;
519	}
520#endif
521	code = binary_op;
522	unary_delim = true;
523	break;
524	/* can drop thru!!! */
525
526    case '>':
527    case '<':
528    case '!':			/* ops like <, <<, <=, !=, etc */
529	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
530	    *e_token++ = *buf_ptr;
531	    if (++buf_ptr >= buf_end)
532		fill_buffer();
533	}
534	if (*buf_ptr == '=')
535	    *e_token++ = *buf_ptr++;
536	code = (ps.last_u_d ? unary_op : binary_op);
537	unary_delim = true;
538	break;
539
540    default:
541	if (token[0] == '/' && *buf_ptr == '*') {
542	    /* it is start of comment */
543	    *e_token++ = '*';
544
545	    if (++buf_ptr >= buf_end)
546		fill_buffer();
547
548	    code = comment;
549	    unary_delim = ps.last_u_d;
550	    break;
551	}
552	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
553	    /*
554	     * handle ||, &&, etc, and also things as in int *****i
555	     */
556	    *e_token++ = *buf_ptr;
557	    if (++buf_ptr >= buf_end)
558		fill_buffer();
559	}
560	code = (ps.last_u_d ? unary_op : binary_op);
561	unary_delim = true;
562
563
564    }				/* end of switch */
565    if (code != newline) {
566	l_struct = false;
567	last_code = code;
568    }
569    if (buf_ptr >= buf_end)	/* check for input buffer empty */
570	fill_buffer();
571    ps.last_u_d = unary_delim;
572    *e_token = '\0';		/* null terminate the token */
573    return (code);
574}
575
576/*
577 * Add the given keyword to the keyword table, using val as the keyword type
578 */
579void
580addkey(char *key, int val)
581{
582    struct templ *p = specials;
583    while (p->rwd)
584	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
585	    return;
586	else
587	    p++;
588    if (p >= specials + sizeof specials / sizeof specials[0])
589	return;			/* For now, table overflows are silently
590				 * ignored */
591    p->rwd = key;
592    p->rwcode = val;
593    p[1].rwd = 0;
594    p[1].rwcode = 0;
595}
596