1/*
2 * Copyright (c) 1985 Sun Microsystems, Inc.
3 * Copyright (c) 1980, 1993
4 *	The Regents of the University of California.  All rights reserved.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by the University of
18 *	California, Berkeley and its contributors.
19 * 4. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36#if 0
37#ifndef lint
38static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39#endif /* not lint */
40#endif
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD$");
43
44/*
45 * Here we have the token scanner for indent.  It scans off one token and puts
46 * it in the global variable "token".  It returns a code, indicating the type
47 * of token scanned.
48 */
49
50#include <err.h>
51#include <stdio.h>
52#include <ctype.h>
53#include <stdlib.h>
54#include <string.h>
55#include "indent_globs.h"
56#include "indent_codes.h"
57#include "indent.h"
58
/*
 * Character classes stored in chartype[]: a value of 0 there means the
 * character belongs to neither class.
 */
enum {
    alphanum = 1,		/* letters, digits, '_' and '$' */
    opchar = 3			/* characters that can be part of an operator */
};
61
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() switches on when the word is recognised.
 */
struct templ {
    const char *rwd;		/* keyword text; a null pointer ends the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved-word table searched by lexi() and extended by addkey().
 *
 * rwcode values, as interpreted by lexi():
 *	0	no special class, returned as a plain identifier
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum (the following tag is a declaration)
 *	4	declaration keyword
 *	5	if, while, for (followed by a parenthesised expression)
 *	6	do, else (not followed by a parenthesised expression)
 *	7	sizeof
 *
 * The table is terminated by an entry whose rwd is a null pointer; the
 * remaining slots are spare room for addkey().
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"const", 4},
    {"volatile", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
101
/*
 * Character classification table used by the scanner to decide what kind
 * of token a character can start or continue:
 *	1	alphanumeric (letters, digits, '_' and '$')
 *	3	operator character
 *	0	anything else (every element not listed below)
 * Entries are keyed by character constant; with an ASCII execution
 * character set this gives the classic 128-entry layout.
 */
char        chartype[128] =
{
    /* identifier and number characters */
    ['$'] = 1, ['_'] = 1,
    ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
    ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
    ['A'] = 1, ['B'] = 1, ['C'] = 1, ['D'] = 1, ['E'] = 1, ['F'] = 1,
    ['G'] = 1, ['H'] = 1, ['I'] = 1, ['J'] = 1, ['K'] = 1, ['L'] = 1,
    ['M'] = 1, ['N'] = 1, ['O'] = 1, ['P'] = 1, ['Q'] = 1, ['R'] = 1,
    ['S'] = 1, ['T'] = 1, ['U'] = 1, ['V'] = 1, ['W'] = 1, ['X'] = 1,
    ['Y'] = 1, ['Z'] = 1,
    ['a'] = 1, ['b'] = 1, ['c'] = 1, ['d'] = 1, ['e'] = 1, ['f'] = 1,
    ['g'] = 1, ['h'] = 1, ['i'] = 1, ['j'] = 1, ['k'] = 1, ['l'] = 1,
    ['m'] = 1, ['n'] = 1, ['o'] = 1, ['p'] = 1, ['q'] = 1, ['r'] = 1,
    ['s'] = 1, ['t'] = 1, ['u'] = 1, ['v'] = 1, ['w'] = 1, ['x'] = 1,
    ['y'] = 1, ['z'] = 1,

    /* operator characters */
    ['!'] = 3, ['%'] = 3, ['&'] = 3, ['*'] = 3, ['+'] = 3, ['-'] = 3,
    ['/'] = 3, ['<'] = 3, ['='] = 3, ['>'] = 3, ['?'] = 3, ['^'] = 3,
    ['|'] = 3, ['~'] = 3
};
123
124int
125lexi(void)
126{
127    int         unary_delim;	/* this is set to 1 if the current token
128				 * forces a following operator to be unary */
129    static int  last_code;	/* the last token type returned */
130    static int  l_struct;	/* set to 1 if the last token was 'struct' */
131    int         code;		/* internal code to be returned */
132    char        qchar;		/* the delimiter character for a string */
133
134    e_token = s_token;		/* point to start of place to save token */
135    unary_delim = false;
136    ps.col_1 = ps.last_nl;	/* tell world that this token started in
137				 * column 1 iff the last thing scanned was nl */
138    ps.last_nl = false;
139
140    while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
141	ps.col_1 = false;	/* leading blanks imply token is not in column
142				 * 1 */
143	if (++buf_ptr >= buf_end)
144	    fill_buffer();
145    }
146
147    /* Scan an alphanumeric token */
148    if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
149	/*
150	 * we have a character or number
151	 */
152	const char *j;		/* used for searching thru list of
153				 *
154				 * reserved words */
155	struct templ *p;
156
157	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
158	    int         seendot = 0,
159	                seenexp = 0,
160			seensfx = 0;
161	    if (*buf_ptr == '0' &&
162		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
163		*e_token++ = *buf_ptr++;
164		*e_token++ = *buf_ptr++;
165		while (isxdigit(*buf_ptr)) {
166		    CHECK_SIZE_TOKEN;
167		    *e_token++ = *buf_ptr++;
168		}
169	    }
170	    else
171		while (1) {
172		    if (*buf_ptr == '.') {
173			if (seendot)
174			    break;
175			else
176			    seendot++;
177		    }
178		    CHECK_SIZE_TOKEN;
179		    *e_token++ = *buf_ptr++;
180		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
181			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
182			    break;
183			else {
184			    seenexp++;
185			    seendot++;
186			    CHECK_SIZE_TOKEN;
187			    *e_token++ = *buf_ptr++;
188			    if (*buf_ptr == '+' || *buf_ptr == '-')
189				*e_token++ = *buf_ptr++;
190			}
191		    }
192		}
193	    while (1) {
194		if (!(seensfx & 1) &&
195			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
196		    CHECK_SIZE_TOKEN;
197		    *e_token++ = *buf_ptr++;
198		    seensfx |= 1;
199		    continue;
200		}
201        	if (!(seensfx & 2) &&
202			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
203		    CHECK_SIZE_TOKEN;
204		    if (buf_ptr[1] == buf_ptr[0])
205		        *e_token++ = *buf_ptr++;
206		    *e_token++ = *buf_ptr++;
207		    seensfx |= 2;
208		    continue;
209		}
210		break;
211	    }
212	}
213	else
214	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
215		/* fill_buffer() terminates buffer with newline */
216		if (*buf_ptr == BACKSLASH) {
217		    if (*(buf_ptr + 1) == '\n') {
218			buf_ptr += 2;
219			if (buf_ptr >= buf_end)
220			    fill_buffer();
221			} else
222			    break;
223		}
224		CHECK_SIZE_TOKEN;
225		/* copy it over */
226		*e_token++ = *buf_ptr++;
227		if (buf_ptr >= buf_end)
228		    fill_buffer();
229	    }
230	*e_token++ = '\0';
231	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
232	    if (++buf_ptr >= buf_end)
233		fill_buffer();
234	}
235	ps.its_a_keyword = false;
236	ps.sizeof_keyword = false;
237	if (l_struct && !ps.p_l_follow) {
238				/* if last token was 'struct' and we're not
239				 * in parentheses, then this token
240				 * should be treated as a declaration */
241	    l_struct = false;
242	    last_code = ident;
243	    ps.last_u_d = true;
244	    return (decl);
245	}
246	ps.last_u_d = l_struct;	/* Operator after identifier is binary
247				 * unless last token was 'struct' */
248	l_struct = false;
249	last_code = ident;	/* Remember that this is the code we will
250				 * return */
251
252	if (auto_typedefs) {
253	    const char *q = s_token;
254	    size_t q_len = strlen(q);
255	    /* Check if we have an "_t" in the end */
256	    if (q_len > 2 &&
257	        (strcmp(q + q_len - 2, "_t") == 0)) {
258	        ps.its_a_keyword = true;
259		ps.last_u_d = true;
260	        goto found_auto_typedef;
261	    }
262	}
263
264	/*
265	 * This loop will check if the token is a keyword.
266	 */
267	for (p = specials; (j = p->rwd) != 0; p++) {
268	    const char *q = s_token;	/* point at scanned token */
269	    if (*j++ != *q++ || *j++ != *q++)
270		continue;	/* This test depends on the fact that
271				 * identifiers are always at least 1 character
272				 * long (ie. the first two bytes of the
273				 * identifier are always meaningful) */
274	    if (q[-1] == 0)
275		break;		/* If its a one-character identifier */
276	    while (*q++ == *j)
277		if (*j++ == 0)
278		    goto found_keyword;	/* I wish that C had a multi-level
279					 * break... */
280	}
281	if (p->rwd) {		/* we have a keyword */
282    found_keyword:
283	    ps.its_a_keyword = true;
284	    ps.last_u_d = true;
285	    switch (p->rwcode) {
286	    case 1:		/* it is a switch */
287		return (swstmt);
288	    case 2:		/* a case or default */
289		return (casestmt);
290
291	    case 3:		/* a "struct" */
292		/*
293		 * Next time around, we will want to know that we have had a
294		 * 'struct'
295		 */
296		l_struct = true;
297		/* FALLTHROUGH */
298
299	    case 4:		/* one of the declaration keywords */
300	    found_auto_typedef:
301		if (ps.p_l_follow) {
302		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
303		    break;	/* inside parens: cast, param list or sizeof */
304		}
305		last_code = decl;
306		return (decl);
307
308	    case 5:		/* if, while, for */
309		return (sp_paren);
310
311	    case 6:		/* do, else */
312		return (sp_nparen);
313
314	    case 7:
315		ps.sizeof_keyword = true;
316	    default:		/* all others are treated like any other
317				 * identifier */
318		return (ident);
319	    }			/* end of switch */
320	}			/* end of if (found_it) */
321	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
322	    char *tp = buf_ptr;
323	    while (tp < buf_end)
324		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
325		    goto not_proc;
326	    strncpy(ps.procname, token, sizeof ps.procname - 1);
327	    ps.in_parameter_declaration = 1;
328	    rparen_count = 1;
329    not_proc:;
330	}
331	/*
332	 * The following hack attempts to guess whether or not the current
333	 * token is in fact a declaration keyword -- one that has been
334	 * typedefd
335	 */
336	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
337		&& !ps.p_l_follow
338	        && !ps.block_init
339		&& (ps.last_token == rparen || ps.last_token == semicolon ||
340		    ps.last_token == decl ||
341		    ps.last_token == lbrace || ps.last_token == rbrace)) {
342	    ps.its_a_keyword = true;
343	    ps.last_u_d = true;
344	    last_code = decl;
345	    return decl;
346	}
347	if (last_code == decl)	/* if this is a declared variable, then
348				 * following sign is unary */
349	    ps.last_u_d = true;	/* will make "int a -1" work */
350	last_code = ident;
351	return (ident);		/* the ident is not in the list */
352    }				/* end of procesing for alpanum character */
353
354    /* Scan a non-alphanumeric token */
355
356    *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
357				 * moved here */
358    *e_token = '\0';
359    if (++buf_ptr >= buf_end)
360	fill_buffer();
361
362    switch (*token) {
363    case '\n':
364	unary_delim = ps.last_u_d;
365	ps.last_nl = true;	/* remember that we just had a newline */
366	code = (had_eof ? 0 : newline);
367
368	/*
369	 * if data has been exhausted, the newline is a dummy, and we should
370	 * return code to stop
371	 */
372	break;
373
374    case '\'':			/* start of quoted character */
375    case '"':			/* start of string */
376	qchar = *token;
377	if (troff) {
378	    e_token[-1] = '`';
379	    if (qchar == '"')
380		*e_token++ = '`';
381	    e_token = chfont(&bodyf, &stringf, e_token);
382	}
383	do {			/* copy the string */
384	    while (1) {		/* move one character or [/<char>]<char> */
385		if (*buf_ptr == '\n') {
386		    diag2(1, "Unterminated literal");
387		    goto stop_lit;
388		}
389		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
390					 * since CHECK_SIZE guarantees that there
391					 * are at least 5 entries left */
392		*e_token = *buf_ptr++;
393		if (buf_ptr >= buf_end)
394		    fill_buffer();
395		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
396		    if (*buf_ptr == '\n')	/* check for escaped newline */
397			++line_no;
398		    if (troff) {
399			*++e_token = BACKSLASH;
400			if (*buf_ptr == BACKSLASH)
401			    *++e_token = BACKSLASH;
402		    }
403		    *++e_token = *buf_ptr++;
404		    ++e_token;	/* we must increment this again because we
405				 * copied two chars */
406		    if (buf_ptr >= buf_end)
407			fill_buffer();
408		}
409		else
410		    break;	/* we copied one character */
411	    }			/* end of while (1) */
412	} while (*e_token++ != qchar);
413	if (troff) {
414	    e_token = chfont(&stringf, &bodyf, e_token - 1);
415	    if (qchar == '"')
416		*e_token++ = '\'';
417	}
418stop_lit:
419	code = ident;
420	break;
421
422    case ('('):
423    case ('['):
424	unary_delim = true;
425	code = lparen;
426	break;
427
428    case (')'):
429    case (']'):
430	code = rparen;
431	break;
432
433    case '#':
434	unary_delim = ps.last_u_d;
435	code = preesc;
436	break;
437
438    case '?':
439	unary_delim = true;
440	code = question;
441	break;
442
443    case (':'):
444	code = colon;
445	unary_delim = true;
446	break;
447
448    case (';'):
449	unary_delim = true;
450	code = semicolon;
451	break;
452
453    case ('{'):
454	unary_delim = true;
455
456	/*
457	 * if (ps.in_or_st) ps.block_init = 1;
458	 */
459	/* ?	code = ps.block_init ? lparen : lbrace; */
460	code = lbrace;
461	break;
462
463    case ('}'):
464	unary_delim = true;
465	/* ?	code = ps.block_init ? rparen : rbrace; */
466	code = rbrace;
467	break;
468
469    case 014:			/* a form feed */
470	unary_delim = ps.last_u_d;
471	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
472				 * right */
473	code = form_feed;
474	break;
475
476    case (','):
477	unary_delim = true;
478	code = comma;
479	break;
480
481    case '.':
482	unary_delim = false;
483	code = period;
484	break;
485
486    case '-':
487    case '+':			/* check for -, +, --, ++ */
488	code = (ps.last_u_d ? unary_op : binary_op);
489	unary_delim = true;
490
491	if (*buf_ptr == token[0]) {
492	    /* check for doubled character */
493	    *e_token++ = *buf_ptr++;
494	    /* buffer overflow will be checked at end of loop */
495	    if (last_code == ident || last_code == rparen) {
496		code = (ps.last_u_d ? unary_op : postop);
497		/* check for following ++ or -- */
498		unary_delim = false;
499	    }
500	}
501	else if (*buf_ptr == '=')
502	    /* check for operator += */
503	    *e_token++ = *buf_ptr++;
504	else if (*buf_ptr == '>') {
505	    /* check for operator -> */
506	    *e_token++ = *buf_ptr++;
507	    if (!pointer_as_binop) {
508		unary_delim = false;
509		code = unary_op;
510		ps.want_blank = false;
511	    }
512	}
513	break;			/* buffer overflow will be checked at end of
514				 * switch */
515
516    case '=':
517	if (ps.in_or_st)
518	    ps.block_init = 1;
519#ifdef undef
520	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
521	    e_token[-1] = *buf_ptr++;
522	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
523		*e_token++ = *buf_ptr++;
524	    *e_token++ = '=';	/* Flip =+ to += */
525	    *e_token = 0;
526	}
527#else
528	if (*buf_ptr == '=') {/* == */
529	    *e_token++ = '=';	/* Flip =+ to += */
530	    buf_ptr++;
531	    *e_token = 0;
532	}
533#endif
534	code = binary_op;
535	unary_delim = true;
536	break;
537	/* can drop thru!!! */
538
539    case '>':
540    case '<':
541    case '!':			/* ops like <, <<, <=, !=, etc */
542	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
543	    *e_token++ = *buf_ptr;
544	    if (++buf_ptr >= buf_end)
545		fill_buffer();
546	}
547	if (*buf_ptr == '=')
548	    *e_token++ = *buf_ptr++;
549	code = (ps.last_u_d ? unary_op : binary_op);
550	unary_delim = true;
551	break;
552
553    default:
554	if (token[0] == '/' && *buf_ptr == '*') {
555	    /* it is start of comment */
556	    *e_token++ = '*';
557
558	    if (++buf_ptr >= buf_end)
559		fill_buffer();
560
561	    code = comment;
562	    unary_delim = ps.last_u_d;
563	    break;
564	}
565	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
566	    /*
567	     * handle ||, &&, etc, and also things as in int *****i
568	     */
569	    *e_token++ = *buf_ptr;
570	    if (++buf_ptr >= buf_end)
571		fill_buffer();
572	}
573	code = (ps.last_u_d ? unary_op : binary_op);
574	unary_delim = true;
575
576
577    }				/* end of switch */
578    if (code != newline) {
579	l_struct = false;
580	last_code = code;
581    }
582    if (buf_ptr >= buf_end)	/* check for input buffer empty */
583	fill_buffer();
584    ps.last_u_d = unary_delim;
585    *e_token = '\0';		/* null terminate the token */
586    return (code);
587}
588
589/*
590 * Add the given keyword to the keyword table, using val as the keyword type
591 */
592void
593addkey(char *key, int val)
594{
595    struct templ *p = specials;
596    while (p->rwd)
597	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
598	    return;
599	else
600	    p++;
601    if (p >= specials + sizeof specials / sizeof specials[0])
602	return;			/* For now, table overflows are silently
603				 * ignored */
604    p->rwd = key;
605    p->rwcode = val;
606    p[1].rwd = 0;
607    p[1].rwcode = 0;
608}
609