1/*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 1985 Sun Microsystems, Inc.
5 * Copyright (c) 1980, 1993
6 *	The Regents of the University of California.  All rights reserved.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the University of
20 *	California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37
38#if 0
39#ifndef lint
40static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
41#endif /* not lint */
42#endif
43#include <sys/cdefs.h>
44__FBSDID("$FreeBSD$");
45
46/*
47 * Here we have the token scanner for indent.  It scans off one token and puts
48 * it in the global variable "token".  It returns a code, indicating the type
49 * of token scanned.
50 */
51
52#include <err.h>
53#include <stdio.h>
54#include <ctype.h>
55#include <stdlib.h>
56#include <string.h>
57#include <sys/param.h>
58
59#include "indent_globs.h"
60#include "indent_codes.h"
61#include "indent.h"
62
63struct templ {
64    const char *rwd;
65    int         rwcode;
66};
67
68/*
69 * This table has to be sorted alphabetically, because it'll be used in binary
70 * search. For the same reason, string must be the first thing in struct templ.
71 */
72struct templ specials[] =
73{
74    {"_Bool", 4},
75    {"_Complex", 4},
76    {"_Imaginary", 4},
77    {"auto", 10},
78    {"bool", 4},
79    {"break", 9},
80    {"case", 8},
81    {"char", 4},
82    {"complex", 4},
83    {"const", 4},
84    {"continue", 12},
85    {"default", 8},
86    {"do", 6},
87    {"double", 4},
88    {"else", 6},
89    {"enum", 3},
90    {"extern", 10},
91    {"float", 4},
92    {"for", 5},
93    {"global", 4},
94    {"goto", 9},
95    {"if", 5},
96    {"imaginary", 4},
97    {"inline", 12},
98    {"int", 4},
99    {"long", 4},
100    {"offsetof", 1},
101    {"register", 10},
102    {"restrict", 12},
103    {"return", 9},
104    {"short", 4},
105    {"signed", 4},
106    {"sizeof", 2},
107    {"static", 10},
108    {"struct", 3},
109    {"switch", 7},
110    {"typedef", 11},
111    {"union", 3},
112    {"unsigned", 4},
113    {"void", 4},
114    {"volatile", 4},
115    {"while", 5}
116};
117
118const char **typenames;
119int         typename_count;
120int         typename_top = -1;
121
122/*
123 * The transition table below was rewritten by hand from lx's output, given
124 * the following definitions. lx is Katherine Flavel's lexer generator.
125 *
126 * O  = /[0-7]/;        D  = /[0-9]/;          NZ = /[1-9]/;
127 * H  = /[a-f0-9]/i;    B  = /[0-1]/;          HP = /0x/i;
128 * BP = /0b/i;          E  = /e[+\-]?/i D+;    P  = /p[+\-]?/i D+;
129 * FS = /[fl]/i;        IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?;
130 *
131 * D+           E  FS? -> $float;
132 * D*    "." D+ E? FS? -> $float;
133 * D+    "."    E? FS? -> $float;    HP H+           IS? -> $int;
134 * HP H+        P  FS? -> $float;    NZ D*           IS? -> $int;
135 * HP H* "." H+ P  FS? -> $float;    "0" O*          IS? -> $int;
136 * HP H+ "."    P  FS  -> $float;    BP B+           IS? -> $int;
137 */
138static char const *table[] = {
139    /*                examples:
140                                     00
141             s                      0xx
142             t                    00xaa
143             a     11       101100xxa..
144             r   11ee0001101lbuuxx.a.pp
145             t.01.e+008bLuxll0Ll.aa.p+0
146    states:  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
147    ['0'] = "CEIDEHHHIJQ  U  Q  VUVVZZZ",
148    ['1'] = "DEIDEHHHIJQ  U  Q  VUVVZZZ",
149    ['7'] = "DEIDEHHHIJ   U     VUVVZZZ",
150    ['9'] = "DEJDEHHHJJ   U     VUVVZZZ",
151    ['a'] = "             U     VUVV   ",
152    ['b'] = "  K          U     VUVV   ",
153    ['e'] = "  FFF   FF   U     VUVV   ",
154    ['f'] = "    f  f     U     VUVV  f",
155    ['u'] = "  MM    M  i  iiM   M     ",
156    ['x'] = "  N                       ",
157    ['p'] = "                    FFX   ",
158    ['L'] = "  LLf  fL  PR   Li  L    f",
159    ['l'] = "  OOf  fO   S P O i O    f",
160    ['+'] = "     G                 Y  ",
161    ['.'] = "B EE    EE   T      W     ",
162    /*       ABCDEFGHIJKLMNOPQRSTUVWXYZ */
163    [0]   = "uuiifuufiuuiiuiiiiiuiuuuuu",
164};
165
166static int
167strcmp_type(const void *e1, const void *e2)
168{
169    return (strcmp(e1, *(const char * const *)e2));
170}
171
172int
173lexi(struct parser_state *state)
174{
175    int         unary_delim;	/* this is set to 1 if the current token
176				 * forces a following operator to be unary */
177    int         code;		/* internal code to be returned */
178    char        qchar;		/* the delimiter character for a string */
179
180    e_token = s_token;		/* point to start of place to save token */
181    unary_delim = false;
182    state->col_1 = state->last_nl;	/* tell world that this token started
183					 * in column 1 iff the last thing
184					 * scanned was a newline */
185    state->last_nl = false;
186
187    while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
188	state->col_1 = false;	/* leading blanks imply token is not in column
189				 * 1 */
190	if (++buf_ptr >= buf_end)
191	    fill_buffer();
192    }
193
194    /* Scan an alphanumeric token */
195    if (isalnum((unsigned char)*buf_ptr) ||
196	*buf_ptr == '_' || *buf_ptr == '$' ||
197	(buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
198	/*
199	 * we have a character or number
200	 */
201	struct templ *p;
202
203	if (isdigit((unsigned char)*buf_ptr) ||
204	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
205	    char s;
206	    unsigned char i;
207
208	    for (s = 'A'; s != 'f' && s != 'i' && s != 'u'; ) {
209		i = (unsigned char)*buf_ptr;
210		if (i >= nitems(table) || table[i] == NULL ||
211		    table[i][s - 'A'] == ' ') {
212		    s = table[0][s - 'A'];
213		    break;
214		}
215		s = table[i][s - 'A'];
216		CHECK_SIZE_TOKEN(1);
217		*e_token++ = *buf_ptr++;
218		if (buf_ptr >= buf_end)
219		    fill_buffer();
220	    }
221	    /* s now indicates the type: f(loating), i(integer), u(nknown) */
222	}
223	else
224	    while (isalnum((unsigned char)*buf_ptr) ||
225	        *buf_ptr == BACKSLASH ||
226		*buf_ptr == '_' || *buf_ptr == '$') {
227		/* fill_buffer() terminates buffer with newline */
228		if (*buf_ptr == BACKSLASH) {
229		    if (*(buf_ptr + 1) == '\n') {
230			buf_ptr += 2;
231			if (buf_ptr >= buf_end)
232			    fill_buffer();
233			} else
234			    break;
235		}
236		CHECK_SIZE_TOKEN(1);
237		/* copy it over */
238		*e_token++ = *buf_ptr++;
239		if (buf_ptr >= buf_end)
240		    fill_buffer();
241	    }
242	*e_token = '\0';
243
244	if (s_token[0] == 'L' && s_token[1] == '\0' &&
245	      (*buf_ptr == '"' || *buf_ptr == '\''))
246	    return (strpfx);
247
248	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
249	    if (++buf_ptr >= buf_end)
250		fill_buffer();
251	}
252	state->keyword = 0;
253	if (state->last_token == structure && !state->p_l_follow) {
254				/* if last token was 'struct' and we're not
255				 * in parentheses, then this token
256				 * should be treated as a declaration */
257	    state->last_u_d = true;
258	    return (decl);
259	}
260	/*
261	 * Operator after identifier is binary unless last token was 'struct'
262	 */
263	state->last_u_d = (state->last_token == structure);
264
265	p = bsearch(s_token,
266	    specials,
267	    sizeof(specials) / sizeof(specials[0]),
268	    sizeof(specials[0]),
269	    strcmp_type);
270	if (p == NULL) {	/* not a special keyword... */
271	    char *u;
272
273	    /* ... so maybe a type_t or a typedef */
274	    if ((opt.auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
275	        strcmp(u, "_t") == 0) || (typename_top >= 0 &&
276		  bsearch(s_token, typenames, typename_top + 1,
277		    sizeof(typenames[0]), strcmp_type))) {
278		state->keyword = 4;	/* a type name */
279		state->last_u_d = true;
280	        goto found_typename;
281	    }
282	} else {			/* we have a keyword */
283	    state->keyword = p->rwcode;
284	    state->last_u_d = true;
285	    switch (p->rwcode) {
286	    case 7:		/* it is a switch */
287		return (swstmt);
288	    case 8:		/* a case or default */
289		return (casestmt);
290
291	    case 3:		/* a "struct" */
292		/* FALLTHROUGH */
293	    case 4:		/* one of the declaration keywords */
294	    found_typename:
295		if (state->p_l_follow) {
296		    /* inside parens: cast, param list, offsetof or sizeof */
297		    state->cast_mask |= (1 << state->p_l_follow) & ~state->not_cast_mask;
298		}
299		if (state->last_token == period || state->last_token == unary_op) {
300		    state->keyword = 0;
301		    break;
302		}
303		if (p != NULL && p->rwcode == 3)
304		    return (structure);
305		if (state->p_l_follow)
306		    break;
307		return (decl);
308
309	    case 5:		/* if, while, for */
310		return (sp_paren);
311
312	    case 6:		/* do, else */
313		return (sp_nparen);
314
315	    case 10:		/* storage class specifier */
316		return (storage);
317
318	    case 11:		/* typedef */
319		return (type_def);
320
321	    default:		/* all others are treated like any other
322				 * identifier */
323		return (ident);
324	    }			/* end of switch */
325	}			/* end of if (found_it) */
326	if (*buf_ptr == '(' && state->tos <= 1 && state->ind_level == 0 &&
327	    state->in_parameter_declaration == 0 && state->block_init == 0) {
328	    char *tp = buf_ptr;
329	    while (tp < buf_end)
330		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
331		    goto not_proc;
332	    strncpy(state->procname, token, sizeof state->procname - 1);
333	    if (state->in_decl)
334		state->in_parameter_declaration = 1;
335	    return (funcname);
336    not_proc:;
337	}
338	/*
339	 * The following hack attempts to guess whether or not the current
340	 * token is in fact a declaration keyword -- one that has been
341	 * typedefd
342	 */
343	else if (!state->p_l_follow && !state->block_init &&
344	    !state->in_stmt &&
345	    ((*buf_ptr == '*' && buf_ptr[1] != '=') ||
346		isalpha((unsigned char)*buf_ptr)) &&
347	    (state->last_token == semicolon || state->last_token == lbrace ||
348		state->last_token == rbrace)) {
349	    state->keyword = 4;	/* a type name */
350	    state->last_u_d = true;
351	    return decl;
352	}
353	if (state->last_token == decl)	/* if this is a declared variable,
354					 * then following sign is unary */
355	    state->last_u_d = true;	/* will make "int a -1" work */
356	return (ident);		/* the ident is not in the list */
357    }				/* end of procesing for alpanum character */
358
359    /* Scan a non-alphanumeric token */
360
361    CHECK_SIZE_TOKEN(3);		/* things like "<<=" */
362    *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
363				 * moved here */
364    *e_token = '\0';
365    if (++buf_ptr >= buf_end)
366	fill_buffer();
367
368    switch (*token) {
369    case '\n':
370	unary_delim = state->last_u_d;
371	state->last_nl = true;	/* remember that we just had a newline */
372	code = (had_eof ? 0 : newline);
373
374	/*
375	 * if data has been exhausted, the newline is a dummy, and we should
376	 * return code to stop
377	 */
378	break;
379
380    case '\'':			/* start of quoted character */
381    case '"':			/* start of string */
382	qchar = *token;
383	do {			/* copy the string */
384	    while (1) {		/* move one character or [/<char>]<char> */
385		if (*buf_ptr == '\n') {
386		    diag2(1, "Unterminated literal");
387		    goto stop_lit;
388		}
389		CHECK_SIZE_TOKEN(2);
390		*e_token = *buf_ptr++;
391		if (buf_ptr >= buf_end)
392		    fill_buffer();
393		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
394		    if (*buf_ptr == '\n')	/* check for escaped newline */
395			++line_no;
396		    *++e_token = *buf_ptr++;
397		    ++e_token;	/* we must increment this again because we
398				 * copied two chars */
399		    if (buf_ptr >= buf_end)
400			fill_buffer();
401		}
402		else
403		    break;	/* we copied one character */
404	    }			/* end of while (1) */
405	} while (*e_token++ != qchar);
406stop_lit:
407	code = ident;
408	break;
409
410    case ('('):
411    case ('['):
412	unary_delim = true;
413	code = lparen;
414	break;
415
416    case (')'):
417    case (']'):
418	code = rparen;
419	break;
420
421    case '#':
422	unary_delim = state->last_u_d;
423	code = preesc;
424	break;
425
426    case '?':
427	unary_delim = true;
428	code = question;
429	break;
430
431    case (':'):
432	code = colon;
433	unary_delim = true;
434	break;
435
436    case (';'):
437	unary_delim = true;
438	code = semicolon;
439	break;
440
441    case ('{'):
442	unary_delim = true;
443
444	/*
445	 * if (state->in_or_st) state->block_init = 1;
446	 */
447	/* ?	code = state->block_init ? lparen : lbrace; */
448	code = lbrace;
449	break;
450
451    case ('}'):
452	unary_delim = true;
453	/* ?	code = state->block_init ? rparen : rbrace; */
454	code = rbrace;
455	break;
456
457    case 014:			/* a form feed */
458	unary_delim = state->last_u_d;
459	state->last_nl = true;	/* remember this so we can set 'state->col_1'
460				 * right */
461	code = form_feed;
462	break;
463
464    case (','):
465	unary_delim = true;
466	code = comma;
467	break;
468
469    case '.':
470	unary_delim = false;
471	code = period;
472	break;
473
474    case '-':
475    case '+':			/* check for -, +, --, ++ */
476	code = (state->last_u_d ? unary_op : binary_op);
477	unary_delim = true;
478
479	if (*buf_ptr == token[0]) {
480	    /* check for doubled character */
481	    *e_token++ = *buf_ptr++;
482	    /* buffer overflow will be checked at end of loop */
483	    if (state->last_token == ident || state->last_token == rparen) {
484		code = (state->last_u_d ? unary_op : postop);
485		/* check for following ++ or -- */
486		unary_delim = false;
487	    }
488	}
489	else if (*buf_ptr == '=')
490	    /* check for operator += */
491	    *e_token++ = *buf_ptr++;
492	else if (*buf_ptr == '>') {
493	    /* check for operator -> */
494	    *e_token++ = *buf_ptr++;
495	    unary_delim = false;
496	    code = unary_op;
497	    state->want_blank = false;
498	}
499	break;			/* buffer overflow will be checked at end of
500				 * switch */
501
502    case '=':
503	if (state->in_or_st)
504	    state->block_init = 1;
505	if (*buf_ptr == '=') {/* == */
506	    *e_token++ = '=';	/* Flip =+ to += */
507	    buf_ptr++;
508	    *e_token = 0;
509	}
510	code = binary_op;
511	unary_delim = true;
512	break;
513	/* can drop thru!!! */
514
515    case '>':
516    case '<':
517    case '!':			/* ops like <, <<, <=, !=, etc */
518	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
519	    *e_token++ = *buf_ptr;
520	    if (++buf_ptr >= buf_end)
521		fill_buffer();
522	}
523	if (*buf_ptr == '=')
524	    *e_token++ = *buf_ptr++;
525	code = (state->last_u_d ? unary_op : binary_op);
526	unary_delim = true;
527	break;
528
529    case '*':
530	unary_delim = true;
531	if (!state->last_u_d) {
532	    if (*buf_ptr == '=')
533		*e_token++ = *buf_ptr++;
534	    code = binary_op;
535	    break;
536	}
537	while (*buf_ptr == '*' || isspace((unsigned char)*buf_ptr)) {
538	    if (*buf_ptr == '*') {
539		CHECK_SIZE_TOKEN(1);
540		*e_token++ = *buf_ptr;
541	    }
542	    if (++buf_ptr >= buf_end)
543		fill_buffer();
544	}
545	if (ps.in_decl) {
546	    char *tp = buf_ptr;
547
548	    while (isalpha((unsigned char)*tp) ||
549		   isspace((unsigned char)*tp)) {
550		if (++tp >= buf_end)
551		    fill_buffer();
552	    }
553	    if (*tp == '(')
554		ps.procname[0] = ' ';
555	}
556	code = unary_op;
557	break;
558
559    default:
560	if (token[0] == '/' && *buf_ptr == '*') {
561	    /* it is start of comment */
562	    *e_token++ = '*';
563
564	    if (++buf_ptr >= buf_end)
565		fill_buffer();
566
567	    code = comment;
568	    unary_delim = state->last_u_d;
569	    break;
570	}
571	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
572	    /*
573	     * handle ||, &&, etc, and also things as in int *****i
574	     */
575	    CHECK_SIZE_TOKEN(1);
576	    *e_token++ = *buf_ptr;
577	    if (++buf_ptr >= buf_end)
578		fill_buffer();
579	}
580	code = (state->last_u_d ? unary_op : binary_op);
581	unary_delim = true;
582
583
584    }				/* end of switch */
585    if (buf_ptr >= buf_end)	/* check for input buffer empty */
586	fill_buffer();
587    state->last_u_d = unary_delim;
588    CHECK_SIZE_TOKEN(1);
589    *e_token = '\0';		/* null terminate the token */
590    return (code);
591}
592
593/* Initialize constant transition table */
594void
595init_constant_tt(void)
596{
597    table['-'] = table['+'];
598    table['8'] = table['9'];
599    table['2'] = table['3'] = table['4'] = table['5'] = table['6'] = table['7'];
600    table['A'] = table['C'] = table['D'] = table['c'] = table['d'] = table['a'];
601    table['B'] = table['b'];
602    table['E'] = table['e'];
603    table['U'] = table['u'];
604    table['X'] = table['x'];
605    table['P'] = table['p'];
606    table['F'] = table['f'];
607}
608
609void
610alloc_typenames(void)
611{
612
613    typenames = (const char **)malloc(sizeof(typenames[0]) *
614        (typename_count = 16));
615    if (typenames == NULL)
616	err(1, NULL);
617}
618
619void
620add_typename(const char *key)
621{
622    int comparison;
623    const char *copy;
624
625    if (typename_top + 1 >= typename_count) {
626	typenames = realloc((void *)typenames,
627	    sizeof(typenames[0]) * (typename_count *= 2));
628	if (typenames == NULL)
629	    err(1, NULL);
630    }
631    if (typename_top == -1)
632	typenames[++typename_top] = copy = strdup(key);
633    else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
634	/* take advantage of sorted input */
635	if (comparison == 0)	/* remove duplicates */
636	    return;
637	typenames[++typename_top] = copy = strdup(key);
638    }
639    else {
640	int p;
641
642	for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
643	    /* find place for the new key */;
644	if (comparison == 0)	/* remove duplicates */
645	    return;
646	memmove(&typenames[p + 1], &typenames[p],
647	    sizeof(typenames[0]) * (++typename_top - p));
648	typenames[p] = copy = strdup(key);
649    }
650
651    if (copy == NULL)
652	err(1, NULL);
653}
654