/* lexi.c revision 69796 */
1/* 2 * Copyright (c) 1985 Sun Microsystems, Inc. 3 * Copyright (c) 1980, 1993 4 * The Regents of the University of California. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. All advertising materials mentioning features or use of this software 16 * must display the following acknowledgement: 17 * This product includes software developed by the University of 18 * California, Berkeley and its contributors. 19 * 4. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 */ 35 36#ifndef lint 37static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93"; 38static const char rcsid[] = 39 "@(#)$FreeBSD: head/usr.bin/indent/lexi.c 69796 2000-12-09 09:52:52Z obrien $"; 40#endif /* not lint */ 41 42/* 43 * Here we have the token scanner for indent. It scans off one token and puts 44 * it in the global variable "token". It returns a code, indicating the type 45 * of token scanned. 46 */ 47 48#include <stdio.h> 49#include <ctype.h> 50#include <stdlib.h> 51#include <string.h> 52#include "indent_globs.h" 53#include "indent_codes.h" 54 55#define alphanum 1 56#define opchar 3 57 58struct templ { 59 char *rwd; 60 int rwcode; 61}; 62 63struct templ specials[1000] = 64{ 65 "switch", 1, 66 "case", 2, 67 "break", 0, 68 "struct", 3, 69 "union", 3, 70 "enum", 3, 71 "default", 2, 72 "int", 4, 73 "char", 4, 74 "float", 4, 75 "double", 4, 76 "long", 4, 77 "short", 4, 78 "typdef", 4, 79 "unsigned", 4, 80 "register", 4, 81 "static", 4, 82 "global", 4, 83 "extern", 4, 84 "void", 4, 85 "goto", 0, 86 "return", 0, 87 "if", 5, 88 "while", 5, 89 "for", 5, 90 "else", 6, 91 "do", 6, 92 "sizeof", 7, 93 "const", 9, 94 "volatile", 9, 95 0, 0 96}; 97 98char chartype[128] = 99{ /* this is used to facilitate the decision of 100 * what type (alphanumeric, operator) each 101 * character is */ 102 0, 0, 0, 0, 0, 0, 0, 0, 103 0, 0, 0, 0, 0, 0, 0, 0, 104 0, 0, 0, 0, 0, 0, 0, 0, 105 0, 0, 0, 0, 0, 0, 0, 0, 106 0, 3, 0, 0, 1, 3, 3, 0, 107 0, 0, 3, 3, 0, 3, 0, 3, 108 1, 1, 1, 1, 1, 1, 1, 1, 109 1, 1, 0, 0, 3, 3, 3, 3, 110 0, 1, 1, 1, 1, 1, 1, 1, 111 1, 1, 1, 1, 1, 1, 1, 1, 112 1, 1, 1, 1, 1, 1, 1, 1, 113 1, 1, 1, 0, 0, 0, 3, 1, 114 0, 1, 1, 1, 1, 1, 1, 1, 115 1, 1, 1, 1, 1, 1, 1, 1, 116 1, 1, 1, 1, 1, 1, 1, 1, 117 1, 1, 1, 0, 3, 0, 3, 0 118}; 119 120 121 122 123int 124lexi() 125{ 126 int unary_delim; /* this is set to 1 if the current token 127 * 128 * forces a following operator to be unary */ 129 static int last_code; /* the last token type returned */ 130 static int 
l_struct; /* set to 1 if the last token was 'struct' */ 131 int code; /* internal code to be returned */ 132 char qchar; /* the delimiter character for a string */ 133 134 e_token = s_token; /* point to start of place to save token */ 135 unary_delim = false; 136 ps.col_1 = ps.last_nl; /* tell world that this token started in 137 * column 1 iff the last thing scanned was nl */ 138 ps.last_nl = false; 139 140 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 141 ps.col_1 = false; /* leading blanks imply token is not in column 142 * 1 */ 143 if (++buf_ptr >= buf_end) 144 fill_buffer(); 145 } 146 147 /* Scan an alphanumeric token */ 148 if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) { 149 /* 150 * we have a character or number 151 */ 152 register char *j; /* used for searching thru list of 153 * 154 * reserved words */ 155 register struct templ *p; 156 157 if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) { 158 int seendot = 0, 159 seenexp = 0, 160 seensfx = 0; 161 if (*buf_ptr == '0' && 162 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 163 *e_token++ = *buf_ptr++; 164 *e_token++ = *buf_ptr++; 165 while (isxdigit(*buf_ptr)) { 166 CHECK_SIZE_TOKEN; 167 *e_token++ = *buf_ptr++; 168 } 169 } 170 else 171 while (1) { 172 if (*buf_ptr == '.') 173 if (seendot) 174 break; 175 else 176 seendot++; 177 CHECK_SIZE_TOKEN; 178 *e_token++ = *buf_ptr++; 179 if (!isdigit(*buf_ptr) && *buf_ptr != '.') 180 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 181 break; 182 else { 183 seenexp++; 184 seendot++; 185 CHECK_SIZE_TOKEN; 186 *e_token++ = *buf_ptr++; 187 if (*buf_ptr == '+' || *buf_ptr == '-') 188 *e_token++ = *buf_ptr++; 189 } 190 } 191 while (1) { 192 if (!(seensfx & 1) && 193 (*buf_ptr == 'U' || *buf_ptr == 'u')) { 194 CHECK_SIZE_TOKEN; 195 *e_token++ = *buf_ptr++; 196 seensfx |= 1; 197 continue; 198 } 199 if (!(seensfx & 2) && 200 (*buf_ptr == 'L' || *buf_ptr == 'l')) { 201 CHECK_SIZE_TOKEN; 202 if 
(buf_ptr[1] == buf_ptr[0]) 203 *e_token++ = *buf_ptr++; 204 *e_token++ = *buf_ptr++; 205 seensfx |= 2; 206 continue; 207 } 208 break; 209 } 210 } 211 else 212 while (chartype[*buf_ptr] == alphanum) { /* copy it over */ 213 CHECK_SIZE_TOKEN; 214 *e_token++ = *buf_ptr++; 215 if (buf_ptr >= buf_end) 216 fill_buffer(); 217 } 218 *e_token++ = '\0'; 219 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 220 if (++buf_ptr >= buf_end) 221 fill_buffer(); 222 } 223 ps.its_a_keyword = false; 224 ps.sizeof_keyword = false; 225 if (l_struct) { /* if last token was 'struct', then this token 226 * should be treated as a declaration */ 227 l_struct = false; 228 last_code = ident; 229 ps.last_u_d = true; 230 return (decl); 231 } 232 ps.last_u_d = false; /* Operator after indentifier is binary */ 233 last_code = ident; /* Remember that this is the code we will 234 * return */ 235 236 /* 237 * This loop will check if the token is a keyword. 238 */ 239 for (p = specials; (j = p->rwd) != 0; p++) { 240 register char *p = s_token; /* point at scanned token */ 241 if (*j++ != *p++ || *j++ != *p++) 242 continue; /* This test depends on the fact that 243 * identifiers are always at least 1 character 244 * long (ie. the first two bytes of the 245 * identifier are always meaningful) */ 246 if (p[-1] == 0) 247 break; /* If its a one-character identifier */ 248 while (*p++ == *j) 249 if (*j++ == 0) 250 goto found_keyword; /* I wish that C had a multi-level 251 * break... */ 252 } 253 if (p->rwd) { /* we have a keyword */ 254 found_keyword: 255 ps.its_a_keyword = true; 256 ps.last_u_d = true; 257 switch (p->rwcode) { 258 case 1: /* it is a switch */ 259 return (swstmt); 260 case 2: /* a case or default */ 261 return (casestmt); 262 263 case 3: /* a "struct" */ 264 /* 265 * Next time around, we may want to know that we have had a 266 * 'struct' 267 */ 268 l_struct = true; 269 270 /* 271 * Fall through to test for a cast, function prototype or 272 * sizeof(). 
273 */ 274 case 4: /* one of the declaration keywords */ 275 if (ps.p_l_follow) { 276 ps.cast_mask |= 1 << ps.p_l_follow; 277 278 /* 279 * Forget that we saw `struct' if we're in a sizeof(). 280 */ 281 if (ps.sizeof_mask) 282 l_struct = false; 283 284 break; /* inside parens: cast, prototype or sizeof() */ 285 } 286 last_code = decl; 287 return (decl); 288 289 case 5: /* if, while, for */ 290 return (sp_paren); 291 292 case 6: /* do, else */ 293 return (sp_nparen); 294 295 case 7: 296 ps.sizeof_keyword = true; 297 default: /* all others are treated like any other 298 * identifier */ 299 return (ident); 300 } /* end of switch */ 301 } /* end of if (found_it) */ 302 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 303 register char *tp = buf_ptr; 304 while (tp < buf_end) 305 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 306 goto not_proc; 307 strncpy(ps.procname, token, sizeof ps.procname - 1); 308 ps.in_parameter_declaration = 1; 309 rparen_count = 1; 310 not_proc:; 311 } 312 /* 313 * The following hack attempts to guess whether or not the current 314 * token is in fact a declaration keyword -- one that has been 315 * typedefd 316 */ 317 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_') 318 && !ps.p_l_follow 319 && !ps.block_init 320 && (ps.last_token == rparen || ps.last_token == semicolon || 321 ps.last_token == decl || 322 ps.last_token == lbrace || ps.last_token == rbrace)) { 323 ps.its_a_keyword = true; 324 ps.last_u_d = true; 325 last_code = decl; 326 return decl; 327 } 328 if (last_code == decl) /* if this is a declared variable, then 329 * following sign is unary */ 330 ps.last_u_d = true; /* will make "int a -1" work */ 331 last_code = ident; 332 return (ident); /* the ident is not in the list */ 333 } /* end of procesing for alpanum character */ 334 335 /* Scan a non-alphanumeric token */ 336 337 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 338 * moved here */ 339 *e_token = '\0'; 340 
if (++buf_ptr >= buf_end) 341 fill_buffer(); 342 343 switch (*token) { 344 case '\n': 345 unary_delim = ps.last_u_d; 346 ps.last_nl = true; /* remember that we just had a newline */ 347 code = (had_eof ? 0 : newline); 348 349 /* 350 * if data has been exausted, the newline is a dummy, and we should 351 * return code to stop 352 */ 353 break; 354 355 case '\'': /* start of quoted character */ 356 case '"': /* start of string */ 357 qchar = *token; 358 if (troff) { 359 e_token[-1] = '`'; 360 if (qchar == '"') 361 *e_token++ = '`'; 362 e_token = chfont(&bodyf, &stringf, e_token); 363 } 364 do { /* copy the string */ 365 while (1) { /* move one character or [/<char>]<char> */ 366 if (*buf_ptr == '\n') { 367 printf("%d: Unterminated literal\n", line_no); 368 goto stop_lit; 369 } 370 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop, 371 * since CHECK_SIZE guarantees that there 372 * are at least 5 entries left */ 373 *e_token = *buf_ptr++; 374 if (buf_ptr >= buf_end) 375 fill_buffer(); 376 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 377 if (*buf_ptr == '\n') /* check for escaped newline */ 378 ++line_no; 379 if (troff) { 380 *++e_token = BACKSLASH; 381 if (*buf_ptr == BACKSLASH) 382 *++e_token = BACKSLASH; 383 } 384 *++e_token = *buf_ptr++; 385 ++e_token; /* we must increment this again because we 386 * copied two chars */ 387 if (buf_ptr >= buf_end) 388 fill_buffer(); 389 } 390 else 391 break; /* we copied one character */ 392 } /* end of while (1) */ 393 } while (*e_token++ != qchar); 394 if (troff) { 395 e_token = chfont(&stringf, &bodyf, e_token - 1); 396 if (qchar == '"') 397 *e_token++ = '\''; 398 } 399stop_lit: 400 code = ident; 401 break; 402 403 case ('('): 404 case ('['): 405 unary_delim = true; 406 code = lparen; 407 break; 408 409 case (')'): 410 case (']'): 411 code = rparen; 412 break; 413 414 case '#': 415 unary_delim = ps.last_u_d; 416 code = preesc; 417 break; 418 419 case '?': 420 unary_delim = true; 421 code = question; 422 
break; 423 424 case (':'): 425 code = colon; 426 unary_delim = true; 427 break; 428 429 case (';'): 430 unary_delim = true; 431 code = semicolon; 432 break; 433 434 case ('{'): 435 unary_delim = true; 436 437 /* 438 * if (ps.in_or_st) ps.block_init = 1; 439 */ 440 /* ? code = ps.block_init ? lparen : lbrace; */ 441 code = lbrace; 442 break; 443 444 case ('}'): 445 unary_delim = true; 446 /* ? code = ps.block_init ? rparen : rbrace; */ 447 code = rbrace; 448 break; 449 450 case 014: /* a form feed */ 451 unary_delim = ps.last_u_d; 452 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 453 * right */ 454 code = form_feed; 455 break; 456 457 case (','): 458 unary_delim = true; 459 code = comma; 460 break; 461 462 case '.': 463 unary_delim = false; 464 code = period; 465 break; 466 467 case '-': 468 case '+': /* check for -, +, --, ++ */ 469 code = (ps.last_u_d ? unary_op : binary_op); 470 unary_delim = true; 471 472 if (*buf_ptr == token[0]) { 473 /* check for doubled character */ 474 *e_token++ = *buf_ptr++; 475 /* buffer overflow will be checked at end of loop */ 476 if (last_code == ident || last_code == rparen) { 477 code = (ps.last_u_d ? 
unary_op : postop); 478 /* check for following ++ or -- */ 479 unary_delim = false; 480 } 481 } 482 else if (*buf_ptr == '=') 483 /* check for operator += */ 484 *e_token++ = *buf_ptr++; 485 else if (*buf_ptr == '>') { 486 /* check for operator -> */ 487 *e_token++ = *buf_ptr++; 488 if (!pointer_as_binop) { 489 unary_delim = false; 490 code = unary_op; 491 ps.want_blank = false; 492 } 493 } 494 break; /* buffer overflow will be checked at end of 495 * switch */ 496 497 case '=': 498 if (ps.in_or_st) 499 ps.block_init = 1; 500#ifdef undef 501 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */ 502 e_token[-1] = *buf_ptr++; 503 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 504 *e_token++ = *buf_ptr++; 505 *e_token++ = '='; /* Flip =+ to += */ 506 *e_token = 0; 507 } 508#else 509 if (*buf_ptr == '=') {/* == */ 510 *e_token++ = '='; /* Flip =+ to += */ 511 buf_ptr++; 512 *e_token = 0; 513 } 514#endif 515 code = binary_op; 516 unary_delim = true; 517 break; 518 /* can drop thru!!! */ 519 520 case '>': 521 case '<': 522 case '!': /* ops like <, <<, <=, !=, etc */ 523 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 524 *e_token++ = *buf_ptr; 525 if (++buf_ptr >= buf_end) 526 fill_buffer(); 527 } 528 if (*buf_ptr == '=') 529 *e_token++ = *buf_ptr++; 530 code = (ps.last_u_d ? unary_op : binary_op); 531 unary_delim = true; 532 break; 533 534 default: 535 if (token[0] == '/' && *buf_ptr == '*') { 536 /* it is start of comment */ 537 *e_token++ = '*'; 538 539 if (++buf_ptr >= buf_end) 540 fill_buffer(); 541 542 code = comment; 543 unary_delim = ps.last_u_d; 544 break; 545 } 546 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 547 /* 548 * handle ||, &&, etc, and also things as in int *****i 549 */ 550 *e_token++ = *buf_ptr; 551 if (++buf_ptr >= buf_end) 552 fill_buffer(); 553 } 554 code = (ps.last_u_d ? 
unary_op : binary_op); 555 unary_delim = true; 556 557 558 } /* end of switch */ 559 if (code != newline) { 560 l_struct = false; 561 last_code = code; 562 } 563 if (buf_ptr >= buf_end) /* check for input buffer empty */ 564 fill_buffer(); 565 ps.last_u_d = unary_delim; 566 *e_token = '\0'; /* null terminate the token */ 567 return (code); 568} 569 570/* 571 * Add the given keyword to the keyword table, using val as the keyword type 572 */ 573addkey(key, val) 574 char *key; 575{ 576 register struct templ *p = specials; 577 while (p->rwd) 578 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 579 return; 580 else 581 p++; 582 if (p >= specials + sizeof specials / sizeof specials[0]) 583 return; /* For now, table overflows are silently 584 * ignored */ 585 p->rwd = key; 586 p->rwcode = val; 587 p[1].rwd = 0; 588 p[1].rwcode = 0; 589 return; 590} 591