lexi.c revision 36211
1/*
2 * Copyright (c) 1985 Sun Microsystems, Inc.
3 * Copyright (c) 1980, 1993
4 *	The Regents of the University of California.  All rights reserved.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *	This product includes software developed by the University of
18 *	California, Berkeley and its contributors.
19 * 4. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36#ifndef lint
37static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
38#endif /* not lint */
39
40/*
41 * Here we have the token scanner for indent.  It scans off one token and puts
42 * it in the global variable "token".  It returns a code, indicating the type
43 * of token scanned.
44 */
45
46#include <stdio.h>
47#include <ctype.h>
48#include <stdlib.h>
49#include <string.h>
50#include "indent_globs.h"
51#include "indent_codes.h"
52
/*
 * Character classes stored in chartype[].  A value of 0 in that table means
 * the character belongs to neither class.
 */
#define alphanum 1		/* character can be part of an identifier or
				 * number token (lexi() tests for this) */
#define opchar 3		/* character is an operator character */
55
/*
 * One entry of the reserved-word table.
 */
struct templ {
    char       *rwd;		/* spelling of the reserved word; a null
				 * pointer marks the end of the table */
    int         rwcode;		/* keyword class, interpreted by lexi() */
};

/*
 * Table of reserved words known to the scanner.  The rwcode values are
 * dispatched on by the switch in lexi():
 *
 *	0	no special treatment (returned as an ordinary identifier)
 *	1	"switch"
 *	2	"case" / "default"
 *	3	"struct", "union", "enum"
 *	4	declaration keywords
 *	5	"if", "while", "for"	(keyword followed by a parenthesis)
 *	6	"else", "do"		(keyword not followed by a parenthesis)
 *	7	"sizeof"
 *
 * The array is larger than the initial list so that addkey() can append
 * entries; the list in use always ends with a {0, 0} terminator.
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so a real
				 * "typedef" never matched this entry */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
93
/*
 * Per-character classification table, indexed by 7-bit character code.
 * It lets the scanner decide quickly what kind of character it is looking
 * at: 1 marks an alphanumeric character, 3 marks an operator character and
 * 0 marks everything else.  Each row below covers eight consecutive codes.
 */
char        chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0,	/* 000 - 007: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 010 - 017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 020 - 027: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 030 - 037: control characters */
    0, 3, 0, 0, 1, 3, 3, 0,	/* ' ' '!' '"' '#' '$' '%' '&' '\'' */
    0, 0, 3, 3, 0, 3, 0, 3,	/* '(' ')' '*' '+' ',' '-' '.' '/' */
    1, 1, 1, 1, 1, 1, 1, 1,	/* '0' through '7' */
    1, 1, 0, 0, 3, 3, 3, 3,	/* '8' '9' ':' ';' '<' '=' '>' '?' */
    0, 1, 1, 1, 1, 1, 1, 1,	/* '@' then 'A' through 'G' */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 'H' through 'O' */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 'P' through 'W' */
    1, 1, 1, 0, 0, 0, 3, 1,	/* 'X' 'Y' 'Z' '[' '\\' ']' '^' '_' */
    0, 1, 1, 1, 1, 1, 1, 1,	/* '`' then 'a' through 'g' */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 'h' through 'o' */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 'p' through 'w' */
    1, 1, 1, 0, 3, 0, 3, 0	/* 'x' 'y' 'z' '{' '|' '}' '~' DEL */
};
115
116
117
118
119int
120lexi()
121{
122    int         unary_delim;	/* this is set to 1 if the current token
123				 *
124				 * forces a following operator to be unary */
125    static int  last_code;	/* the last token type returned */
126    static int  l_struct;	/* set to 1 if the last token was 'struct' */
127    int         code;		/* internal code to be returned */
128    char        qchar;		/* the delimiter character for a string */
129
130    e_token = s_token;		/* point to start of place to save token */
131    unary_delim = false;
132    ps.col_1 = ps.last_nl;	/* tell world that this token started in
133				 * column 1 iff the last thing scanned was nl */
134    ps.last_nl = false;
135
136    while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
137	ps.col_1 = false;	/* leading blanks imply token is not in column
138				 * 1 */
139	if (++buf_ptr >= buf_end)
140	    fill_buffer();
141    }
142
143    /* Scan an alphanumeric token */
144    if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
145	/*
146	 * we have a character or number
147	 */
148	register char *j;	/* used for searching thru list of
149				 *
150				 * reserved words */
151	register struct templ *p;
152
153	if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
154	    int         seendot = 0,
155	                seenexp = 0,
156			seensfx = 0;
157	    if (*buf_ptr == '0' &&
158		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
159		*e_token++ = *buf_ptr++;
160		*e_token++ = *buf_ptr++;
161		while (isxdigit(*buf_ptr)) {
162		    CHECK_SIZE_TOKEN;
163		    *e_token++ = *buf_ptr++;
164		}
165	    }
166	    else
167		while (1) {
168		    if (*buf_ptr == '.')
169			if (seendot)
170			    break;
171			else
172			    seendot++;
173		    CHECK_SIZE_TOKEN;
174		    *e_token++ = *buf_ptr++;
175		    if (!isdigit(*buf_ptr) && *buf_ptr != '.')
176			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
177			    break;
178			else {
179			    seenexp++;
180			    seendot++;
181			    CHECK_SIZE_TOKEN;
182			    *e_token++ = *buf_ptr++;
183			    if (*buf_ptr == '+' || *buf_ptr == '-')
184				*e_token++ = *buf_ptr++;
185			}
186		}
187	    while (1) {
188		if (!(seensfx & 1) &&
189			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
190		    CHECK_SIZE_TOKEN;
191		    *e_token++ = *buf_ptr++;
192		    seensfx |= 1;
193		    continue;
194		}
195        	if (!(seensfx & 2) &&
196			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
197		    CHECK_SIZE_TOKEN;
198		    if (buf_ptr[1] == buf_ptr[0])
199		        *e_token++ = *buf_ptr++;
200		    *e_token++ = *buf_ptr++;
201		    seensfx |= 2;
202		    continue;
203		}
204		break;
205	    }
206	}
207	else
208	    while (chartype[*buf_ptr] == alphanum) {	/* copy it over */
209		CHECK_SIZE_TOKEN;
210		*e_token++ = *buf_ptr++;
211		if (buf_ptr >= buf_end)
212		    fill_buffer();
213	    }
214	*e_token++ = '\0';
215	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
216	    if (++buf_ptr >= buf_end)
217		fill_buffer();
218	}
219	ps.its_a_keyword = false;
220	ps.sizeof_keyword = false;
221	if (l_struct) {		/* if last token was 'struct', then this token
222				 * should be treated as a declaration */
223	    l_struct = false;
224	    last_code = ident;
225	    ps.last_u_d = true;
226	    return (decl);
227	}
228	ps.last_u_d = false;	/* Operator after indentifier is binary */
229	last_code = ident;	/* Remember that this is the code we will
230				 * return */
231
232	/*
233	 * This loop will check if the token is a keyword.
234	 */
235	for (p = specials; (j = p->rwd) != 0; p++) {
236	    register char *p = s_token;	/* point at scanned token */
237	    if (*j++ != *p++ || *j++ != *p++)
238		continue;	/* This test depends on the fact that
239				 * identifiers are always at least 1 character
240				 * long (ie. the first two bytes of the
241				 * identifier are always meaningful) */
242	    if (p[-1] == 0)
243		break;		/* If its a one-character identifier */
244	    while (*p++ == *j)
245		if (*j++ == 0)
246		    goto found_keyword;	/* I wish that C had a multi-level
247					 * break... */
248	}
249	if (p->rwd) {		/* we have a keyword */
250    found_keyword:
251	    ps.its_a_keyword = true;
252	    ps.last_u_d = true;
253	    switch (p->rwcode) {
254	    case 1:		/* it is a switch */
255		return (swstmt);
256	    case 2:		/* a case or default */
257		return (casestmt);
258
259	    case 3:		/* a "struct" */
260		if (ps.p_l_follow)
261		    break;	/* inside parens: cast */
262		l_struct = true;
263
264		/*
265		 * Next time around, we will want to know that we have had a
266		 * 'struct'
267		 */
268	    case 4:		/* one of the declaration keywords */
269		if (ps.p_l_follow) {
270		    ps.cast_mask |= 1 << ps.p_l_follow;
271		    break;	/* inside parens: cast */
272		}
273		last_code = decl;
274		return (decl);
275
276	    case 5:		/* if, while, for */
277		return (sp_paren);
278
279	    case 6:		/* do, else */
280		return (sp_nparen);
281
282	    case 7:
283		ps.sizeof_keyword = true;
284	    default:		/* all others are treated like any other
285				 * identifier */
286		return (ident);
287	    }			/* end of switch */
288	}			/* end of if (found_it) */
289	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
290	    register char *tp = buf_ptr;
291	    while (tp < buf_end)
292		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
293		    goto not_proc;
294	    strncpy(ps.procname, token, sizeof ps.procname - 1);
295	    ps.in_parameter_declaration = 1;
296	    rparen_count = 1;
297    not_proc:;
298	}
299	/*
300	 * The following hack attempts to guess whether or not the current
301	 * token is in fact a declaration keyword -- one that has been
302	 * typedefd
303	 */
304	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
305		&& !ps.p_l_follow
306	        && !ps.block_init
307		&& (ps.last_token == rparen || ps.last_token == semicolon ||
308		    ps.last_token == decl ||
309		    ps.last_token == lbrace || ps.last_token == rbrace)) {
310	    ps.its_a_keyword = true;
311	    ps.last_u_d = true;
312	    last_code = decl;
313	    return decl;
314	}
315	if (last_code == decl)	/* if this is a declared variable, then
316				 * following sign is unary */
317	    ps.last_u_d = true;	/* will make "int a -1" work */
318	last_code = ident;
319	return (ident);		/* the ident is not in the list */
320    }				/* end of procesing for alpanum character */
321
322    /* Scan a non-alphanumeric token */
323
324    *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
325				 * moved here */
326    *e_token = '\0';
327    if (++buf_ptr >= buf_end)
328	fill_buffer();
329
330    switch (*token) {
331    case '\n':
332	unary_delim = ps.last_u_d;
333	ps.last_nl = true;	/* remember that we just had a newline */
334	code = (had_eof ? 0 : newline);
335
336	/*
337	 * if data has been exausted, the newline is a dummy, and we should
338	 * return code to stop
339	 */
340	break;
341
342    case '\'':			/* start of quoted character */
343    case '"':			/* start of string */
344	qchar = *token;
345	if (troff) {
346	    e_token[-1] = '`';
347	    if (qchar == '"')
348		*e_token++ = '`';
349	    e_token = chfont(&bodyf, &stringf, e_token);
350	}
351	do {			/* copy the string */
352	    while (1) {		/* move one character or [/<char>]<char> */
353		if (*buf_ptr == '\n') {
354		    printf("%d: Unterminated literal\n", line_no);
355		    goto stop_lit;
356		}
357		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
358					 * since CHECK_SIZE guarantees that there
359					 * are at least 5 entries left */
360		*e_token = *buf_ptr++;
361		if (buf_ptr >= buf_end)
362		    fill_buffer();
363		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
364		    if (*buf_ptr == '\n')	/* check for escaped newline */
365			++line_no;
366		    if (troff) {
367			*++e_token = BACKSLASH;
368			if (*buf_ptr == BACKSLASH)
369			    *++e_token = BACKSLASH;
370		    }
371		    *++e_token = *buf_ptr++;
372		    ++e_token;	/* we must increment this again because we
373				 * copied two chars */
374		    if (buf_ptr >= buf_end)
375			fill_buffer();
376		}
377		else
378		    break;	/* we copied one character */
379	    }			/* end of while (1) */
380	} while (*e_token++ != qchar);
381	if (troff) {
382	    e_token = chfont(&stringf, &bodyf, e_token - 1);
383	    if (qchar == '"')
384		*e_token++ = '\'';
385	}
386stop_lit:
387	code = ident;
388	break;
389
390    case ('('):
391    case ('['):
392	unary_delim = true;
393	code = lparen;
394	break;
395
396    case (')'):
397    case (']'):
398	code = rparen;
399	break;
400
401    case '#':
402	unary_delim = ps.last_u_d;
403	code = preesc;
404	break;
405
406    case '?':
407	unary_delim = true;
408	code = question;
409	break;
410
411    case (':'):
412	code = colon;
413	unary_delim = true;
414	break;
415
416    case (';'):
417	unary_delim = true;
418	code = semicolon;
419	break;
420
421    case ('{'):
422	unary_delim = true;
423
424	/*
425	 * if (ps.in_or_st) ps.block_init = 1;
426	 */
427	/* ?	code = ps.block_init ? lparen : lbrace; */
428	code = lbrace;
429	break;
430
431    case ('}'):
432	unary_delim = true;
433	/* ?	code = ps.block_init ? rparen : rbrace; */
434	code = rbrace;
435	break;
436
437    case 014:			/* a form feed */
438	unary_delim = ps.last_u_d;
439	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
440				 * right */
441	code = form_feed;
442	break;
443
444    case (','):
445	unary_delim = true;
446	code = comma;
447	break;
448
449    case '.':
450	unary_delim = false;
451	code = period;
452	break;
453
454    case '-':
455    case '+':			/* check for -, +, --, ++ */
456	code = (ps.last_u_d ? unary_op : binary_op);
457	unary_delim = true;
458
459	if (*buf_ptr == token[0]) {
460	    /* check for doubled character */
461	    *e_token++ = *buf_ptr++;
462	    /* buffer overflow will be checked at end of loop */
463	    if (last_code == ident || last_code == rparen) {
464		code = (ps.last_u_d ? unary_op : postop);
465		/* check for following ++ or -- */
466		unary_delim = false;
467	    }
468	}
469	else if (*buf_ptr == '=')
470	    /* check for operator += */
471	    *e_token++ = *buf_ptr++;
472	else if (*buf_ptr == '>') {
473	    /* check for operator -> */
474	    *e_token++ = *buf_ptr++;
475	    if (!pointer_as_binop) {
476		unary_delim = false;
477		code = unary_op;
478		ps.want_blank = false;
479	    }
480	}
481	break;			/* buffer overflow will be checked at end of
482				 * switch */
483
484    case '=':
485	if (ps.in_or_st)
486	    ps.block_init = 1;
487#ifdef undef
488	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
489	    e_token[-1] = *buf_ptr++;
490	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
491		*e_token++ = *buf_ptr++;
492	    *e_token++ = '=';	/* Flip =+ to += */
493	    *e_token = 0;
494	}
495#else
496	if (*buf_ptr == '=') {/* == */
497	    *e_token++ = '=';	/* Flip =+ to += */
498	    buf_ptr++;
499	    *e_token = 0;
500	}
501#endif
502	code = binary_op;
503	unary_delim = true;
504	break;
505	/* can drop thru!!! */
506
507    case '>':
508    case '<':
509    case '!':			/* ops like <, <<, <=, !=, etc */
510	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
511	    *e_token++ = *buf_ptr;
512	    if (++buf_ptr >= buf_end)
513		fill_buffer();
514	}
515	if (*buf_ptr == '=')
516	    *e_token++ = *buf_ptr++;
517	code = (ps.last_u_d ? unary_op : binary_op);
518	unary_delim = true;
519	break;
520
521    default:
522	if (token[0] == '/' && *buf_ptr == '*') {
523	    /* it is start of comment */
524	    *e_token++ = '*';
525
526	    if (++buf_ptr >= buf_end)
527		fill_buffer();
528
529	    code = comment;
530	    unary_delim = ps.last_u_d;
531	    break;
532	}
533	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
534	    /*
535	     * handle ||, &&, etc, and also things as in int *****i
536	     */
537	    *e_token++ = *buf_ptr;
538	    if (++buf_ptr >= buf_end)
539		fill_buffer();
540	}
541	code = (ps.last_u_d ? unary_op : binary_op);
542	unary_delim = true;
543
544
545    }				/* end of switch */
546    if (code != newline) {
547	l_struct = false;
548	last_code = code;
549    }
550    if (buf_ptr >= buf_end)	/* check for input buffer empty */
551	fill_buffer();
552    ps.last_u_d = unary_delim;
553    *e_token = '\0';		/* null terminate the token */
554    return (code);
555}
556
557/*
558 * Add the given keyword to the keyword table, using val as the keyword type
559 */
560addkey(key, val)
561    char       *key;
562{
563    register struct templ *p = specials;
564    while (p->rwd)
565	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
566	    return;
567	else
568	    p++;
569    if (p >= specials + sizeof specials / sizeof specials[0])
570	return;			/* For now, table overflows are silently
571				 * ignored */
572    p->rwd = key;
573    p->rwcode = val;
574    p[1].rwd = 0;
575    p[1].rwcode = 0;
576    return;
577}
578