1133808Spjd/* Language lexer for the GNU compiler for the Java(TM) language. 2156878Spjd Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002 Free Software Foundation, Inc. 3133808Spjd Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com) 4133808Spjd 5133808SpjdThis file is part of GNU CC. 6133808Spjd 7133808SpjdGNU CC is free software; you can redistribute it and/or modify 8133808Spjdit under the terms of the GNU General Public License as published by 9133808Spjdthe Free Software Foundation; either version 2, or (at your option) 10133808Spjdany later version. 11133808Spjd 12133808SpjdGNU CC is distributed in the hope that it will be useful, 13155174Spjdbut WITHOUT ANY WARRANTY; without even the implied warranty of 14133808SpjdMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15133808SpjdGNU General Public License for more details. 16133808Spjd 17133808SpjdYou should have received a copy of the GNU General Public License 18133808Spjdalong with GNU CC; see the file COPYING. If not, write to 19133808Spjdthe Free Software Foundation, 59 Temple Place - Suite 330, 20133808SpjdBoston, MA 02111-1307, USA. 21133808Spjd 22133808SpjdJava and all Java-based marks are trademarks or registered trademarks 23133808Spjdof Sun Microsystems, Inc. in the United States and other countries. 24133808SpjdThe Free Software Foundation is independent of Sun Microsystems, Inc. */ 25133808Spjd 26133808Spjd/* It defines java_lex (yylex) that reads a Java ASCII source file 27133808Spjd possibly containing Unicode escape sequence or utf8 encoded 28133808Spjd characters and returns a token for everything found but comments, 29133808Spjd white spaces and line terminators. When necessary, it also fills 30133808Spjd the java_lval (yylval) union. It's implemented to be called by a 31133808Spjd re-entrant parser generated by Bison. 32133808Spjd 33133808Spjd The lexical analysis conforms to the Java grammar described in "The 34133808Spjd Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele. 35133808Spjd Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */ 36133808Spjd 37133808Spjd#include "keyword.h" 38133808Spjd#include "flags.h" 39133808Spjd#include "chartables.h" 40133808Spjd 41133808Spjd/* Function declarations. */ 42133808Spjdstatic char *java_sprint_unicode PARAMS ((struct java_line *, int)); 43133808Spjdstatic void java_unicode_2_utf8 PARAMS ((unicode_t)); 44133808Spjdstatic void java_lex_error PARAMS ((const char *, int)); 45133808Spjd#ifndef JC1_LITE 46133808Spjdstatic int java_is_eol PARAMS ((FILE *, int)); 47133808Spjdstatic tree build_wfl_node PARAMS ((tree)); 48133808Spjd#endif 49133808Spjdstatic void java_store_unicode PARAMS ((struct java_line *, unicode_t, int)); 50133808Spjdstatic int java_parse_escape_sequence PARAMS ((void)); 51133808Spjdstatic int java_start_char_p PARAMS ((unicode_t)); 52133808Spjdstatic int java_part_char_p PARAMS ((unicode_t)); 53133808Spjdstatic int java_parse_doc_section PARAMS ((int)); 54156612Spjdstatic void java_parse_end_comment PARAMS ((int)); 55133808Spjdstatic int java_get_unicode PARAMS ((void)); 56133808Spjdstatic int java_read_unicode PARAMS ((java_lexer *, int *)); 57133808Spjdstatic int java_read_unicode_collapsing_terminators PARAMS ((java_lexer *, 58133808Spjd int *)); 59133808Spjdstatic void java_store_unicode PARAMS ((struct java_line *, unicode_t, int)); 60133808Spjdstatic int java_read_char PARAMS ((java_lexer *)); 61133808Spjdstatic void java_allocate_new_line PARAMS ((void)); 62133808Spjdstatic void java_unget_unicode PARAMS ((void)); 63156612Spjdstatic unicode_t java_sneak_unicode PARAMS ((void)); 64156612Spjd#ifndef JC1_LITE 65133808Spjdstatic int utf8_cmp PARAMS ((const unsigned char *, int, const char *)); 66133808Spjd#endif 67133808Spjd 68156612Spjdjava_lexer *java_new_lexer PARAMS ((FILE *, const char *)); 69133808Spjd#ifndef JC1_LITE 70133808Spjdstatic void error_if_numeric_overflow PARAMS ((tree)); 71133808Spjd#endif 72133808Spjd 73133808Spjd#ifdef HAVE_ICONV 74133808Spjd/* This is nonzero if we have initialized `need_byteswap'. */ 75133808Spjdstatic int byteswap_init = 0; 76133808Spjd 77133808Spjd/* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in 78156612Spjd big-endian order -- not native endian order. We handle this by 79160330Spjd doing a conversion once at startup and seeing what happens. This 80160330Spjd flag holds the results of this determination. */ 81133808Spjdstatic int need_byteswap = 0; 82133808Spjd#endif 83133808Spjd 84133808Spjdvoid 85133808Spjdjava_init_lex (finput, encoding) 86133808Spjd FILE *finput; 87133808Spjd const char *encoding; 88133808Spjd{ 89133808Spjd#ifndef JC1_LITE 90133808Spjd int java_lang_imported = 0; 91133808Spjd 92133808Spjd if (!java_lang_id) 93133808Spjd java_lang_id = get_identifier ("java.lang"); 94133808Spjd if (!inst_id) 95133808Spjd inst_id = get_identifier ("inst$"); 96133808Spjd if (!wpv_id) 97133808Spjd wpv_id = get_identifier ("write_parm_value$"); 98133808Spjd 99133808Spjd if (!java_lang_imported) 100133808Spjd { 101163888Spjd tree node = build_tree_list 102134168Spjd (build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE); 103163888Spjd read_import_dir (TREE_PURPOSE (node)); 104134168Spjd TREE_CHAIN (node) = ctxp->import_demand_list; 105134168Spjd ctxp->import_demand_list = node; 106133808Spjd java_lang_imported = 1; 107133808Spjd } 108133808Spjd 109144142Spjd if (!wfl_operator) 110144142Spjd wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0); 111144142Spjd if (!label_id) 112144142Spjd label_id = get_identifier ("$L"); 113133808Spjd if (!wfl_append) 114133808Spjd wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0); 115133808Spjd if (!wfl_string_buffer) 116133808Spjd wfl_string_buffer = 117133808Spjd build_expr_wfl (get_identifier (flag_emit_class_files 118133808Spjd ? "java.lang.StringBuffer" 119133808Spjd : "gnu.gcj.runtime.StringBuffer"), 120133808Spjd NULL, 0, 0); 121133808Spjd if (!wfl_to_string) 122133808Spjd wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0); 123133808Spjd 124133808Spjd CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) = 125133808Spjd CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE; 126133808Spjd 127133808Spjd memset (ctxp->modifier_ctx, 0, sizeof (ctxp->modifier_ctx)); 128133808Spjd memset (current_jcf, 0, sizeof (JCF)); 129133808Spjd ctxp->current_parsed_class = NULL; 130133808Spjd ctxp->package = NULL_TREE; 131133808Spjd#endif 132163888Spjd 133163888Spjd ctxp->filename = input_filename; 134163888Spjd ctxp->lineno = lineno = 0; 135163888Spjd ctxp->p_line = NULL; 136163888Spjd ctxp->c_line = NULL; 137163888Spjd ctxp->java_error_flag = 0; 138163888Spjd ctxp->lexer = java_new_lexer (finput, encoding); 139163888Spjd} 140163888Spjd 141163888Spjdstatic char * 142163888Spjdjava_sprint_unicode (line, i) 143163888Spjd struct java_line *line; 144163888Spjd int i; 145163888Spjd{ 146163888Spjd static char buffer [10]; 147134124Spjd if (line->unicode_escape_p [i] || line->line [i] > 128) 148134124Spjd sprintf (buffer, "\\u%04x", line->line [i]); 149134124Spjd else 150134124Spjd { 151134124Spjd buffer [0] = line->line [i]; 152134124Spjd buffer [1] = '\0'; 153134124Spjd } 154134124Spjd return buffer; 155134124Spjd} 156134124Spjd 157134124Spjdstatic unicode_t 158134124Spjdjava_sneak_unicode () 159134124Spjd{ 160134124Spjd return (ctxp->c_line->line [ctxp->c_line->current]); 161134124Spjd} 162134124Spjd 163134168Spjdstatic void 164134168Spjdjava_unget_unicode () 165134168Spjd{ 166134168Spjd if (!ctxp->c_line->current) 167134168Spjd /* Can't unget unicode. */ 168134168Spjd abort (); 169134168Spjd 170134168Spjd ctxp->c_line->current--; 171134168Spjd ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0); 172134168Spjd} 173134168Spjd 174134168Spjdstatic void 175134168Spjdjava_allocate_new_line () 176134168Spjd{ 177134168Spjd unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0'); 178163888Spjd char ahead_escape_p = (ctxp->c_line ? 179163888Spjd ctxp->c_line->unicode_escape_ahead_p : 0); 180134124Spjd 181134124Spjd if (ctxp->c_line && !ctxp->c_line->white_space_only) 182134124Spjd { 183156612Spjd if (ctxp->p_line) 184156612Spjd { 185156612Spjd free (ctxp->p_line->unicode_escape_p); 186156612Spjd free (ctxp->p_line->line); 187156612Spjd free (ctxp->p_line); 188156612Spjd } 189156612Spjd ctxp->p_line = ctxp->c_line; 190156612Spjd ctxp->c_line = NULL; /* Reallocated. */ 191156612Spjd } 192156612Spjd 193156612Spjd if (!ctxp->c_line) 194156612Spjd { 195156612Spjd ctxp->c_line = xmalloc (sizeof (struct java_line)); 196156612Spjd ctxp->c_line->max = JAVA_LINE_MAX; 197156612Spjd ctxp->c_line->line = xmalloc (sizeof (unicode_t)*ctxp->c_line->max); 198133808Spjd ctxp->c_line->unicode_escape_p = 199133808Spjd xmalloc (sizeof (char)*ctxp->c_line->max); 200133808Spjd ctxp->c_line->white_space_only = 0; 201133808Spjd } 202133808Spjd 203133808Spjd ctxp->c_line->line [0] = ctxp->c_line->size = 0; 204133808Spjd ctxp->c_line->char_col = ctxp->c_line->current = 0; 205133808Spjd if (ahead) 206133808Spjd { 207163888Spjd ctxp->c_line->line [ctxp->c_line->size] = ahead; 208163888Spjd ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p; 209163888Spjd ctxp->c_line->size++; 210163888Spjd } 211163888Spjd ctxp->c_line->ahead [0] = 0; 212163888Spjd ctxp->c_line->unicode_escape_ahead_p = 0; 213163888Spjd ctxp->c_line->lineno = ++lineno; 214163888Spjd ctxp->c_line->white_space_only = 1; 215163888Spjd} 216134168Spjd 217134168Spjd/* Create a new lexer object. */ 218134168Spjd 219134168Spjdjava_lexer * 220134168Spjdjava_new_lexer (finput, encoding) 221134168Spjd FILE *finput; 222134168Spjd const char *encoding; 223134124Spjd{ 224134124Spjd java_lexer *lex = xmalloc (sizeof (java_lexer)); 225134124Spjd int enc_error = 0; 226134124Spjd 227134124Spjd lex->finput = finput; 228134124Spjd lex->bs_count = 0; 229134124Spjd lex->unget_value = 0; 230134168Spjd lex->hit_eof = 0; 231134168Spjd 232134168Spjd#ifdef HAVE_ICONV 233134168Spjd lex->handle = iconv_open ("UCS-2", encoding); 234134168Spjd if (lex->handle != (iconv_t) -1) 235134168Spjd { 236134168Spjd lex->first = -1; 237133808Spjd lex->last = -1; 238133808Spjd lex->out_first = -1; 239133808Spjd lex->out_last = -1; 240133808Spjd lex->read_anything = 0; 241133808Spjd lex->use_fallback = 0; 242133808Spjd 243163888Spjd /* Work around broken iconv() implementations by doing checking at 244163888Spjd runtime. We assume that if the UTF-8 => UCS-2 encoder is broken, 245133808Spjd then all UCS-2 encoders will be broken. Perhaps not a valid 246133808Spjd assumption. */ 247133808Spjd if (! byteswap_init) 248133808Spjd { 249133808Spjd iconv_t handle; 250133808Spjd 251133808Spjd byteswap_init = 1; 252133808Spjd 253133808Spjd handle = iconv_open ("UCS-2", "UTF-8"); 254133808Spjd if (handle != (iconv_t) -1) 255133808Spjd { 256133808Spjd unicode_t result; 257133808Spjd unsigned char in[3]; 258156612Spjd char *inp, *outp; 259133808Spjd size_t inc, outc, r; 260133808Spjd 261133808Spjd /* This is the UTF-8 encoding of \ufeff. */ 262133808Spjd in[0] = 0xef; 263133808Spjd in[1] = 0xbb; 264139671Spjd in[2] = 0xbf; 265133808Spjd 266133808Spjd inp = in; 267139671Spjd inc = 3; 268133808Spjd outp = (char *) &result; 269139671Spjd outc = 2; 270133808Spjd 271133808Spjd r = iconv (handle, (ICONV_CONST char **) &inp, &inc, 272133808Spjd &outp, &outc); 273133808Spjd iconv_close (handle); 274133808Spjd /* Conversion must be complete for us to use the result. */ 275133808Spjd if (r != (size_t) -1 && inc == 0 && outc == 0) 276133808Spjd need_byteswap = (result != 0xfeff); 277133808Spjd } 278133808Spjd } 279133808Spjd 280133808Spjd lex->byte_swap = need_byteswap; 281133808Spjd } 282133808Spjd else 283133808Spjd#endif /* HAVE_ICONV */ 284133808Spjd { 285133808Spjd /* If iconv failed, use the internal decoder if the default 286133808Spjd encoding was requested. This code is used on platforms where 287133808Spjd iconv exists but is insufficient for our needs. For 288133808Spjd instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2. 289133808Spjd 290133808Spjd On Solaris the default encoding, as returned by nl_langinfo(), 291133808Spjd is `646' (aka ASCII), but the Solaris iconv_open() doesn't 292133808Spjd understand that. We work around that by pretending 293156612Spjd `646' to be the same as UTF-8. */ 294133808Spjd if (strcmp (encoding, DEFAULT_ENCODING) && strcmp (encoding, "646")) 295133808Spjd enc_error = 1; 296133808Spjd#ifdef HAVE_ICONV 297133808Spjd else 298133808Spjd lex->use_fallback = 1; 299156612Spjd#endif /* HAVE_ICONV */ 300133808Spjd } 301133808Spjd 302133808Spjd if (enc_error) 303133808Spjd fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation. If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding); 304162350Spjd 305156612Spjd return lex; 306133808Spjd} 307133808Spjd 308133808Spjdvoid 309133808Spjdjava_destroy_lexer (lex) 310133808Spjd java_lexer *lex; 311133808Spjd{ 312133808Spjd#ifdef HAVE_ICONV 313133808Spjd if (! lex->use_fallback) 314133808Spjd iconv_close (lex->handle); 315133808Spjd#endif 316139671Spjd free (lex); 317156612Spjd} 318139671Spjd 319156612Spjdstatic int 320133808Spjdjava_read_char (lex) 321133808Spjd java_lexer *lex; 322139671Spjd{ 323139671Spjd if (lex->unget_value) 324156612Spjd { 325139671Spjd unicode_t r = lex->unget_value; 326139671Spjd lex->unget_value = 0; 327139671Spjd return r; 328156612Spjd } 329139671Spjd 330156612Spjd#ifdef HAVE_ICONV 331133808Spjd if (! lex->use_fallback) 332133808Spjd { 333133808Spjd size_t ir, inbytesleft, in_save, out_count, out_save; 334133808Spjd char *inp, *outp; 335133808Spjd unicode_t result; 336133808Spjd 337133808Spjd /* If there is data which has already been converted, use it. */ 338133808Spjd if (lex->out_first == -1 || lex->out_first >= lex->out_last) 339133808Spjd { 340133808Spjd lex->out_first = 0; 341157630Spjd lex->out_last = 0; 342133808Spjd 343133808Spjd while (1) 344133808Spjd { 345133808Spjd /* See if we need to read more data. If FIRST == 0 then 346133808Spjd the previous conversion attempt ended in the middle of 347133808Spjd a character at the end of the buffer. Otherwise we 348133808Spjd only have to read if the buffer is empty. */ 349133808Spjd if (lex->first == 0 || lex->first >= lex->last) 350133808Spjd { 351133808Spjd int r; 352133808Spjd 353133808Spjd if (lex->first >= lex->last) 354133808Spjd { 355133808Spjd lex->first = 0; 356133808Spjd lex->last = 0; 357157630Spjd } 358157630Spjd if (feof (lex->finput)) 359157630Spjd return UEOF; 360157630Spjd r = fread (&lex->buffer[lex->last], 1, 361133808Spjd sizeof (lex->buffer) - lex->last, 362133808Spjd lex->finput); 363133808Spjd lex->last += r; 364133808Spjd } 365133808Spjd 366133808Spjd inbytesleft = lex->last - lex->first; 367133808Spjd out_count = sizeof (lex->out_buffer) - lex->out_last; 368133808Spjd 369133808Spjd if (inbytesleft == 0) 370133808Spjd { 371133808Spjd /* We've tried to read and there is nothing left. */ 372133808Spjd return UEOF; 373133808Spjd } 374157630Spjd 375157630Spjd in_save = inbytesleft; 376133808Spjd out_save = out_count; 377133808Spjd inp = &lex->buffer[lex->first]; 378133808Spjd outp = &lex->out_buffer[lex->out_last]; 379156612Spjd ir = iconv (lex->handle, (ICONV_CONST char **) &inp, 380133808Spjd &inbytesleft, &outp, &out_count); 381133808Spjd 382156612Spjd /* If we haven't read any bytes, then look to see if we 383133808Spjd have read a BOM. */ 384133808Spjd if (! lex->read_anything && out_save - out_count >= 2) 385133808Spjd { 386133808Spjd unicode_t uc = * (unicode_t *) &lex->out_buffer[0]; 387133808Spjd if (uc == 0xfeff) 388133808Spjd { 389133808Spjd lex->byte_swap = 0; 390133808Spjd lex->out_first += 2; 391133808Spjd } 392133808Spjd else if (uc == 0xfffe) 393133808Spjd { 394133808Spjd lex->byte_swap = 1; 395133808Spjd lex->out_first += 2; 396133808Spjd } 397133808Spjd lex->read_anything = 1; 398133808Spjd } 399133808Spjd 400133808Spjd if (lex->byte_swap) 401133808Spjd { 402133808Spjd unsigned int i; 403133808Spjd for (i = 0; i < out_save - out_count; i += 2) 404133808Spjd { 405134420Spjd char t = lex->out_buffer[lex->out_last + i]; 406133808Spjd lex->out_buffer[lex->out_last + i] 407245456Smav = lex->out_buffer[lex->out_last + i + 1]; 408133808Spjd lex->out_buffer[lex->out_last + i + 1] = t; 409133808Spjd } 410133808Spjd } 411133808Spjd 412133808Spjd lex->first += in_save - inbytesleft; 413133808Spjd lex->out_last += out_save - out_count; 414133808Spjd 415133808Spjd /* If we converted anything at all, move along. */ 416133808Spjd if (out_count != out_save) 417133808Spjd break; 418156612Spjd 419156612Spjd if (ir == (size_t) -1) 420156612Spjd { 421133808Spjd if (errno == EINVAL) 422133808Spjd { 423156612Spjd /* This is ok. This means that the end of our buffer 424156612Spjd is in the middle of a character sequence. We just 425156612Spjd move the valid part of the buffer to the beginning 426133808Spjd to force a read. */ 427133808Spjd memmove (&lex->buffer[0], &lex->buffer[lex->first], 428245456Smav lex->last - lex->first); 429245456Smav lex->last -= lex->first; 430245456Smav lex->first = 0; 431245456Smav } 432160330Spjd else 433160330Spjd { 434156612Spjd /* A more serious error. */ 435156612Spjd java_lex_error ("unrecognized character in input stream", 436156612Spjd 0); 437156612Spjd return UEOF; 438156612Spjd } 439156612Spjd } 440156612Spjd } 441156612Spjd } 442156612Spjd 443156612Spjd if (lex->out_first == -1 || lex->out_first >= lex->out_last) 444156612Spjd { 445156612Spjd /* Don't have any data. */ 446156612Spjd return UEOF; 447156612Spjd } 448156612Spjd 449156612Spjd /* Success. */ 450156612Spjd result = * ((unicode_t *) &lex->out_buffer[lex->out_first]); 451156612Spjd lex->out_first += 2; 452156612Spjd return result; 453156612Spjd } 454156612Spjd else 455156612Spjd#endif /* HAVE_ICONV */ 456156612Spjd { 457156612Spjd int c, c1, c2; 458156612Spjd c = getc (lex->finput); 459156612Spjd 460156612Spjd if (c == EOF) 461156612Spjd return UEOF; 462156612Spjd if (c < 128) 463156612Spjd return (unicode_t) c; 464156612Spjd else 465156612Spjd { 466156612Spjd if ((c & 0xe0) == 0xc0) 467245456Smav { 468245456Smav c1 = getc (lex->finput); 469245456Smav if ((c1 & 0xc0) == 0x80) 470245456Smav { 471245456Smav unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f)); 472245456Smav /* Check for valid 2-byte characters. We explicitly 473245456Smav allow \0 because this encoding is common in the 474245456Smav Java world. */ 475245456Smav if (r == 0 || (r >= 0x80 && r <= 0x7ff)) 476245456Smav return r; 477245456Smav } 478245456Smav } 479245456Smav else if ((c & 0xf0) == 0xe0) 480245456Smav { 481245456Smav c1 = getc (lex->finput); 482245456Smav if ((c1 & 0xc0) == 0x80) 483245456Smav { 484245456Smav c2 = getc (lex->finput); 485245456Smav if ((c2 & 0xc0) == 0x80) 486245456Smav { 487245456Smav unicode_t r = (unicode_t)(((c & 0xf) << 12) + 488245456Smav (( c1 & 0x3f) << 6) 489245456Smav + (c2 & 0x3f)); 490245456Smav /* Check for valid 3-byte characters. 491133808Spjd Don't allow surrogate, \ufffe or \uffff. */ 492133808Spjd if (IN_RANGE (r, 0x800, 0xffff) 493156612Spjd && ! IN_RANGE (r, 0xd800, 0xdfff) 494133808Spjd && r != 0xfffe && r != 0xffff) 495133808Spjd return r; 496133808Spjd } 497156612Spjd } 498133808Spjd } 499134420Spjd 500134420Spjd /* We simply don't support invalid characters. We also 501156612Spjd don't support 4-, 5-, or 6-byte UTF-8 sequences, as these 502134420Spjd cannot be valid Java characters. */ 503156612Spjd java_lex_error ("malformed UTF-8 character", 0); 504134420Spjd } 505134420Spjd } 506134420Spjd 507134420Spjd /* We only get here on error. */ 508134420Spjd return UEOF; 509134420Spjd} 510134420Spjd 511133808Spjdstatic void 512156612Spjdjava_store_unicode (l, c, unicode_escape_p) 513133808Spjd struct java_line *l; 514163886Spjd unicode_t c; 515133808Spjd int unicode_escape_p; 516163886Spjd{ 517163886Spjd if (l->size == l->max) 518163886Spjd { 519156527Spjd l->max += JAVA_LINE_MAX; 520133808Spjd l->line = xrealloc (l->line, sizeof (unicode_t)*l->max); 521133808Spjd l->unicode_escape_p = xrealloc (l->unicode_escape_p, 522133808Spjd sizeof (char)*l->max); 523133808Spjd } 524133808Spjd l->line [l->size] = c; 525133808Spjd l->unicode_escape_p [l->size++] = unicode_escape_p; 526133808Spjd} 527133808Spjd 528156612Spjdstatic int 529146118Spjdjava_read_unicode (lex, unicode_escape_p) 530146118Spjd java_lexer *lex; 531146118Spjd int *unicode_escape_p; 532146118Spjd{ 533146118Spjd int c; 534146117Spjd 535156612Spjd c = java_read_char (lex); 536133808Spjd *unicode_escape_p = 0; 537133808Spjd 538133808Spjd if (c != '\\') 539133808Spjd { 540133808Spjd lex->bs_count = 0; 541133808Spjd return c; 542133808Spjd } 543133808Spjd 544133808Spjd ++lex->bs_count; 545133808Spjd if ((lex->bs_count) % 2 == 1) 546133808Spjd { 547133808Spjd /* Odd number of \ seen. */ 548133808Spjd c = java_read_char (lex); 549133808Spjd if (c == 'u') 550133808Spjd { 551133808Spjd unicode_t unicode = 0; 552133808Spjd int shift = 12; 553133808Spjd 554133808Spjd /* Recognize any number of `u's in \u. */ 555133808Spjd while ((c = java_read_char (lex)) == 'u') 556156612Spjd ; 557156612Spjd 558156612Spjd shift = 12; 559156612Spjd do 560156612Spjd { 561133808Spjd if (c == UEOF) 562133808Spjd { 563133808Spjd java_lex_error ("prematurely terminated \\u sequence", 0); 564133808Spjd return UEOF; 565133808Spjd } 566133808Spjd 567133808Spjd if (hex_p (c)) 568133808Spjd unicode |= (unicode_t)(hex_value (c) << shift); 569133808Spjd else 570133808Spjd { 571133808Spjd java_lex_error ("non-hex digit in \\u sequence", 0); 572156612Spjd break; 573133808Spjd } 574133808Spjd 575133808Spjd c = java_read_char (lex); 576133808Spjd shift -= 4; 577133808Spjd } 578133808Spjd while (shift >= 0); 579133808Spjd 580133808Spjd if (c != UEOF) 581133808Spjd lex->unget_value = c; 582133808Spjd 583133808Spjd lex->bs_count = 0; 584133808Spjd *unicode_escape_p = 1; 585162350Spjd return unicode; 586133808Spjd } 587156612Spjd lex->unget_value = c; 588133808Spjd } 589133808Spjd return (unicode_t) '\\'; 590133808Spjd} 591133808Spjd 592133808Spjdstatic int 593133808Spjdjava_read_unicode_collapsing_terminators (lex, unicode_escape_p) 594133808Spjd java_lexer *lex; 595139295Spjd int *unicode_escape_p; 596139295Spjd{ 597139295Spjd int c = java_read_unicode (lex, unicode_escape_p); 598156612Spjd 599133808Spjd if (c == '\r') 600133808Spjd { 601133808Spjd /* We have to read ahead to see if we got \r\n. In that case we 602133808Spjd return a single line terminator. */ 603133808Spjd int dummy; 604162350Spjd c = java_read_unicode (lex, &dummy); 605156612Spjd if (c != '\n' && c != UEOF) 606133808Spjd lex->unget_value = c; 607156612Spjd /* In either case we must return a newline. */ 608133808Spjd c = '\n'; 609133808Spjd } 610133808Spjd 611133808Spjd return c; 612133808Spjd} 613133808Spjd 614133808Spjdstatic int 615133808Spjdjava_get_unicode () 616133808Spjd{ 617133808Spjd /* It's time to read a line when... */ 618133808Spjd if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size) 619133808Spjd { 620133808Spjd int c; 621133808Spjd int found_chars = 0; 622133808Spjd 623133808Spjd if (ctxp->lexer->hit_eof) 624133808Spjd return UEOF; 625133808Spjd 626133808Spjd java_allocate_new_line (); 627156612Spjd if (ctxp->c_line->line[0] != '\n') 628133808Spjd { 629133808Spjd for (;;) 630133808Spjd { 631133808Spjd int unicode_escape_p; 632133808Spjd c = java_read_unicode_collapsing_terminators (ctxp->lexer, 633133808Spjd &unicode_escape_p); 634133808Spjd if (c != UEOF) 635133808Spjd { 636133808Spjd found_chars = 1; 637133808Spjd java_store_unicode (ctxp->c_line, c, unicode_escape_p); 638133808Spjd if (ctxp->c_line->white_space_only 639133808Spjd && !JAVA_WHITE_SPACE_P (c) 640156612Spjd && c != '\n') 641133808Spjd ctxp->c_line->white_space_only = 0; 642 } 643 if ((c == '\n') || (c == UEOF)) 644 break; 645 } 646 647 if (c == UEOF && ! found_chars) 648 { 649 ctxp->lexer->hit_eof = 1; 650 return UEOF; 651 } 652 } 653 } 654 ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0); 655 JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]); 656 return ctxp->c_line->line [ctxp->c_line->current++]; 657} 658 659/* Parse the end of a C style comment. 660 * C is the first character following the '/' and '*'. */ 661static void 662java_parse_end_comment (c) 663 int c; 664{ 665 for ( ;; c = java_get_unicode ()) 666 { 667 switch (c) 668 { 669 case UEOF: 670 java_lex_error ("Comment not terminated at end of input", 0); 671 return; 672 case '*': 673 switch (c = java_get_unicode ()) 674 { 675 case UEOF: 676 java_lex_error ("Comment not terminated at end of input", 0); 677 return; 678 case '/': 679 return; 680 case '*': /* Reparse only '*'. */ 681 java_unget_unicode (); 682 } 683 } 684 } 685} 686 687/* Parse the documentation section. Keywords must be at the beginning 688 of a documentation comment line (ignoring white space and any `*' 689 character). Parsed keyword(s): @DEPRECATED. */ 690 691static int 692java_parse_doc_section (c) 693 int c; 694{ 695 int valid_tag = 0, seen_star = 0; 696 697 while (JAVA_WHITE_SPACE_P (c) || (c == '*') || c == '\n') 698 { 699 switch (c) 700 { 701 case '*': 702 seen_star = 1; 703 break; 704 case '\n': /* ULT */ 705 valid_tag = 1; 706 default: 707 seen_star = 0; 708 } 709 c = java_get_unicode(); 710 } 711 712 if (c == UEOF) 713 java_lex_error ("Comment not terminated at end of input", 0); 714 715 if (seen_star && (c == '/')) 716 return 1; /* Goto step1 in caller. */ 717 718 /* We're parsing `@deprecated'. */ 719 if (valid_tag && (c == '@')) 720 { 721 char tag [11]; 722 int tag_index = 0; 723 724 while (tag_index < 10 && c != UEOF && c != ' ' && c != '\n') 725 { 726 c = java_get_unicode (); 727 tag [tag_index++] = c; 728 } 729 730 if (c == UEOF) 731 java_lex_error ("Comment not terminated at end of input", 0); 732 tag [tag_index] = '\0'; 733 734 if (!strcmp (tag, "deprecated")) 735 ctxp->deprecated = 1; 736 } 737 java_unget_unicode (); 738 return 0; 739} 740 741/* Return true if C is a valid start character for a Java identifier. 742 This is only called if C >= 128 -- smaller values are handled 743 inline. However, this function handles all values anyway. */ 744static int 745java_start_char_p (c) 746 unicode_t c; 747{ 748 unsigned int hi = c / 256; 749 const char *const page = type_table[hi]; 750 unsigned long val = (unsigned long) page; 751 int flags; 752 753 if ((val & ~ (LETTER_PART | LETTER_START)) != 0) 754 flags = page[c & 255]; 755 else 756 flags = val; 757 758 return flags & LETTER_START; 759} 760 761/* Return true if C is a valid part character for a Java identifier. 762 This is only called if C >= 128 -- smaller values are handled 763 inline. However, this function handles all values anyway. */ 764static int 765java_part_char_p (c) 766 unicode_t c; 767{ 768 unsigned int hi = c / 256; 769 const char *const page = type_table[hi]; 770 unsigned long val = (unsigned long) page; 771 int flags; 772 773 if ((val & ~ (LETTER_PART | LETTER_START)) != 0) 774 flags = page[c & 255]; 775 else 776 flags = val; 777 778 return flags & LETTER_PART; 779} 780 781static int 782java_parse_escape_sequence () 783{ 784 unicode_t char_lit; 785 int c; 786 787 switch (c = java_get_unicode ()) 788 { 789 case 'b': 790 return (unicode_t)0x8; 791 case 't': 792 return (unicode_t)0x9; 793 case 'n': 794 return (unicode_t)0xa; 795 case 'f': 796 return (unicode_t)0xc; 797 case 'r': 798 return (unicode_t)0xd; 799 case '"': 800 return (unicode_t)0x22; 801 case '\'': 802 return (unicode_t)0x27; 803 case '\\': 804 return (unicode_t)0x5c; 805 case '0': case '1': case '2': case '3': case '4': 806 case '5': case '6': case '7': 807 { 808 int octal_escape[3]; 809 int octal_escape_index = 0; 810 int max = 3; 811 int i, shift; 812 813 for (; octal_escape_index < max && RANGE (c, '0', '7'); 814 c = java_get_unicode ()) 815 { 816 if (octal_escape_index == 0 && c > '3') 817 { 818 /* According to the grammar, `\477' has a well-defined 819 meaning -- it is `\47' followed by `7'. */ 820 --max; 821 } 822 octal_escape [octal_escape_index++] = c; 823 } 824 825 java_unget_unicode (); 826 827 for (char_lit=0, i = 0, shift = 3*(octal_escape_index-1); 828 i < octal_escape_index; i++, shift -= 3) 829 char_lit |= (octal_escape [i] - '0') << shift; 830 831 return char_lit; 832 } 833 default: 834 java_lex_error ("Invalid character in escape sequence", 0); 835 return JAVA_CHAR_ERROR; 836 } 837} 838 839#ifndef JC1_LITE 840#define IS_ZERO(X) REAL_VALUES_EQUAL (X, dconst0) 841 842/* Subroutine of java_lex: converts floating-point literals to tree 843 nodes. LITERAL_TOKEN is the input literal, JAVA_LVAL is where to 844 store the result. FFLAG indicates whether the literal was tagged 845 with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING 846 is the line number on which to report any error. */ 847 848static void java_perform_atof PARAMS ((YYSTYPE *, char *, int, int)); 849 850static void 851java_perform_atof (java_lval, literal_token, fflag, number_beginning) 852 YYSTYPE *java_lval; 853 char *literal_token; 854 int fflag; 855 int number_beginning; 856{ 857 REAL_VALUE_TYPE value; 858 tree type = (fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE); 859 860 SET_REAL_VALUE_ATOF (value, 861 REAL_VALUE_ATOF (literal_token, TYPE_MODE (type))); 862 863 if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value)) 864 { 865 JAVA_FLOAT_RANGE_ERROR (fflag ? "float" : "double"); 866 value = DCONST0; 867 } 868 else if (IS_ZERO (value)) 869 { 870 /* We check to see if the value is really 0 or if we've found an 871 underflow. We do this in the most primitive imaginable way. */ 872 int really_zero = 1; 873 char *p = literal_token; 874 if (*p == '-') 875 ++p; 876 while (*p && *p != 'e' && *p != 'E') 877 { 878 if (*p != '0' && *p != '.') 879 { 880 really_zero = 0; 881 break; 882 } 883 ++p; 884 } 885 if (! really_zero) 886 { 887 int i = ctxp->c_line->current; 888 ctxp->c_line->current = number_beginning; 889 java_lex_error ("Floating point literal underflow", 0); 890 ctxp->c_line->current = i; 891 } 892 } 893 894 SET_LVAL_NODE_TYPE (build_real (type, value), type); 895} 896#endif 897 898static int yylex PARAMS ((YYSTYPE *)); 899 900static int 901#ifdef JC1_LITE 902yylex (java_lval) 903#else 904java_lex (java_lval) 905#endif 906 YYSTYPE *java_lval; 907{ 908 int c; 909 unicode_t first_unicode; 910 int ascii_index, all_ascii; 911 char *string; 912 913 /* Translation of the Unicode escape in the raw stream of Unicode 914 characters. Takes care of line terminator. */ 915 step1: 916 /* Skip white spaces: SP, TAB and FF or ULT. */ 917 for (c = java_get_unicode (); 918 c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ()) 919 if (c == '\n') 920 { 921 ctxp->elc.line = ctxp->c_line->lineno; 922 ctxp->elc.col = ctxp->c_line->char_col-2; 923 } 924 925 ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col); 926 927 if (c == 0x1a) /* CTRL-Z. */ 928 { 929 if ((c = java_get_unicode ()) == UEOF) 930 return 0; /* Ok here. */ 931 else 932 java_unget_unicode (); /* Caught later, at the end of the 933 function. */ 934 } 935 /* Handle EOF here. */ 936 if (c == UEOF) /* Should probably do something here... */ 937 return 0; 938 939 /* Take care of eventual comments. */ 940 if (c == '/') 941 { 942 switch (c = java_get_unicode ()) 943 { 944 case '/': 945 for (;;) 946 { 947 c = java_get_unicode (); 948 if (c == UEOF) 949 { 950 /* It is ok to end a `//' comment with EOF, unless 951 we're being pedantic. */ 952 if (pedantic) 953 java_lex_error ("Comment not terminated at end of input", 954 0); 955 return 0; 956 } 957 if (c == '\n') /* ULT */ 958 goto step1; 959 } 960 break; 961 962 case '*': 963 if ((c = java_get_unicode ()) == '*') 964 { 965 if ((c = java_get_unicode ()) == '/') 966 goto step1; /* Empty documentation comment. */ 967 else if (java_parse_doc_section (c)) 968 goto step1; 969 } 970 971 java_parse_end_comment ((c = java_get_unicode ())); 972 goto step1; 973 break; 974 default: 975 java_unget_unicode (); 976 c = '/'; 977 break; 978 } 979 } 980 981 ctxp->elc.line = ctxp->c_line->lineno; 982 ctxp->elc.prev_col = ctxp->elc.col; 983 ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1); 984 if (ctxp->elc.col < 0) 985 abort (); 986 987 /* Numeric literals. */ 988 if (JAVA_ASCII_DIGIT (c) || (c == '.')) 989 { 990 /* This section of code is borrowed from gcc/c-lex.c. */ 991#define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2) 992 int parts[TOTAL_PARTS]; 993 HOST_WIDE_INT high, low; 994 /* End borrowed section. */ 995 char literal_token [256]; 996 int literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes; 997 int found_hex_digits = 0, found_non_octal_digits = 0; 998 int i; 999#ifndef JC1_LITE 1000 int number_beginning = ctxp->c_line->current; 1001 tree value; 1002#endif 1003 1004 /* We might have a . separator instead of a FP like .[0-9]*. */ 1005 if (c == '.') 1006 { 1007 unicode_t peep = java_sneak_unicode (); 1008 1009 if (!JAVA_ASCII_DIGIT (peep)) 1010 { 1011 JAVA_LEX_SEP('.'); 1012 BUILD_OPERATOR (DOT_TK); 1013 } 1014 } 1015 1016 for (i = 0; i < TOTAL_PARTS; i++) 1017 parts [i] = 0; 1018 1019 if (c == '0') 1020 { 1021 c = java_get_unicode (); 1022 if (c == 'x' || c == 'X') 1023 { 1024 radix = 16; 1025 c = java_get_unicode (); 1026 } 1027 else if (JAVA_ASCII_DIGIT (c)) 1028 radix = 8; 1029 else if (c == '.' || c == 'e' || c =='E') 1030 { 1031 /* Push the '.', 'e', or 'E' back and prepare for a FP 1032 parsing... */ 1033 java_unget_unicode (); 1034 c = '0'; 1035 } 1036 else 1037 { 1038 /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}. */ 1039 JAVA_LEX_LIT ("0", 10); 1040 switch (c) 1041 { 1042 case 'L': case 'l': 1043 SET_LVAL_NODE (long_zero_node); 1044 return (INT_LIT_TK); 1045 case 'f': case 'F': 1046 SET_LVAL_NODE (float_zero_node); 1047 return (FP_LIT_TK); 1048 case 'd': case 'D': 1049 SET_LVAL_NODE (double_zero_node); 1050 return (FP_LIT_TK); 1051 default: 1052 java_unget_unicode (); 1053 SET_LVAL_NODE (integer_zero_node); 1054 return (INT_LIT_TK); 1055 } 1056 } 1057 } 1058 /* Parse the first part of the literal, until we find something 1059 which is not a number. */ 1060 while ((radix == 16 && JAVA_ASCII_HEXDIGIT (c)) || 1061 JAVA_ASCII_DIGIT (c)) 1062 { 1063 /* We store in a string (in case it turns out to be a FP) and in 1064 PARTS if we have to process a integer literal. */ 1065 int numeric = hex_value (c); 1066 int count; 1067 1068 /* Remember when we find a valid hexadecimal digit. */ 1069 if (radix == 16) 1070 found_hex_digits = 1; 1071 /* Remember when we find an invalid octal digit. */ 1072 else if (radix == 8 && !JAVA_ASCII_OCTDIGIT (c)) 1073 found_non_octal_digits = 1; 1074 1075 literal_token [literal_index++] = c; 1076 /* This section of code if borrowed from gcc/c-lex.c. */ 1077 for (count = 0; count < TOTAL_PARTS; count++) 1078 { 1079 parts[count] *= radix; 1080 if (count) 1081 { 1082 parts[count] += (parts[count-1] >> HOST_BITS_PER_CHAR); 1083 parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1; 1084 } 1085 else 1086 parts[0] += numeric; 1087 } 1088 if (parts [TOTAL_PARTS-1] != 0) 1089 overflow = 1; 1090 /* End borrowed section. */ 1091 c = java_get_unicode (); 1092 } 1093 1094 /* If we have something from the FP char set but not a digit, parse 1095 a FP literal. */ 1096 if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c)) 1097 { 1098 int stage = 0; 1099 int seen_digit = (literal_index ? 1 : 0); 1100 int seen_exponent = 0; 1101 int fflag = 0; /* 1 for {f,F}, 0 for {d,D}. FP literal are 1102 double unless specified. */ 1103 1104 /* It is ok if the radix is 8 because this just means we've 1105 seen a leading `0'. However, radix==16 is invalid. */ 1106 if (radix == 16) 1107 java_lex_error ("Can't express non-decimal FP literal", 0); 1108 radix = 10; 1109 1110 for (;;) 1111 { 1112 if (c == '.') 1113 { 1114 if (stage < 1) 1115 { 1116 stage = 1; 1117 literal_token [literal_index++ ] = c; 1118 c = java_get_unicode (); 1119 } 1120 else 1121 java_lex_error ("Invalid character in FP literal", 0); 1122 } 1123 1124 if (c == 'e' || c == 'E') 1125 { 1126 if (stage < 2) 1127 { 1128 /* {E,e} must have seen at least a digit. */ 1129 if (!seen_digit) 1130 java_lex_error 1131 ("Invalid FP literal, mantissa must have digit", 0); 1132 seen_digit = 0; 1133 seen_exponent = 1; 1134 stage = 2; 1135 literal_token [literal_index++] = c; 1136 c = java_get_unicode (); 1137 } 1138 else 1139 java_lex_error ("Invalid character in FP literal", 0); 1140 } 1141 if ( c == 'f' || c == 'F' || c == 'd' || c == 'D') 1142 { 1143 fflag = ((c == 'd') || (c == 'D')) ? 0 : 1; 1144 stage = 4; /* So we fall through. */ 1145 } 1146 1147 if ((c=='-' || c =='+') && stage == 2) 1148 { 1149 stage = 3; 1150 literal_token [literal_index++] = c; 1151 c = java_get_unicode (); 1152 } 1153 1154 if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) || 1155 (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) || 1156 (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) || 1157 (stage == 3 && JAVA_ASCII_DIGIT (c))) 1158 { 1159 if (JAVA_ASCII_DIGIT (c)) 1160 seen_digit = 1; 1161 if (stage == 2) 1162 stage = 3; 1163 literal_token [literal_index++ ] = c; 1164 c = java_get_unicode (); 1165 } 1166 else 1167 { 1168 if (stage != 4) /* Don't push back fF/dD. */ 1169 java_unget_unicode (); 1170 1171 /* An exponent (if any) must have seen a digit. */ 1172 if (seen_exponent && !seen_digit) 1173 java_lex_error 1174 ("Invalid FP literal, exponent must have digit", 0); 1175 1176 literal_token [literal_index] = '\0'; 1177 JAVA_LEX_LIT (literal_token, radix); 1178 1179#ifndef JC1_LITE 1180 java_perform_atof (java_lval, literal_token, 1181 fflag, number_beginning); 1182#endif 1183 return FP_LIT_TK; 1184 } 1185 } 1186 } /* JAVA_ASCII_FPCHAR (c) */ 1187 1188 /* Here we get back to converting the integral literal. */ 1189 if (radix == 16 && ! found_hex_digits) 1190 java_lex_error 1191 ("0x must be followed by at least one hexadecimal digit", 0); 1192 else if (radix == 8 && found_non_octal_digits) 1193 java_lex_error ("Octal literal contains digit out of range", 0); 1194 else if (c == 'L' || c == 'l') 1195 long_suffix = 1; 1196 else 1197 java_unget_unicode (); 1198 1199#ifdef JAVA_LEX_DEBUG 1200 literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe. */ 1201 JAVA_LEX_LIT (literal_token, radix); 1202#endif 1203 /* This section of code is borrowed from gcc/c-lex.c. */ 1204 if (!overflow) 1205 { 1206 bytes = GET_TYPE_PRECISION (long_type_node); 1207 for (i = bytes; i < TOTAL_PARTS; i++) 1208 if (parts [i]) 1209 { 1210 overflow = 1; 1211 break; 1212 } 1213 } 1214 high = low = 0; 1215 for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++) 1216 { 1217 high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT 1218 / HOST_BITS_PER_CHAR)] 1219 << (i * HOST_BITS_PER_CHAR)); 1220 low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR); 1221 } 1222 /* End borrowed section. */ 1223 1224#ifndef JC1_LITE 1225 /* Range checking. */ 1226 value = build_int_2 (low, high); 1227 /* Temporarily set type to unsigned. */ 1228 SET_LVAL_NODE_TYPE (value, (long_suffix 1229 ? unsigned_long_type_node 1230 : unsigned_int_type_node)); 1231 1232 /* For base 10 numbers, only values up to the highest value 1233 (plus one) can be written. For instance, only ints up to 1234 2147483648 can be written. The special case of the largest 1235 negative value is handled elsewhere. For other bases, any 1236 number can be represented. */ 1237 if (overflow || (radix == 10 1238 && tree_int_cst_lt (long_suffix 1239 ? decimal_long_max 1240 : decimal_int_max, 1241 value))) 1242 { 1243 if (long_suffix) 1244 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal"); 1245 else 1246 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal"); 1247 } 1248 1249 /* Sign extend the value. */ 1250 SET_LVAL_NODE_TYPE (value, (long_suffix ? long_type_node : int_type_node)); 1251 force_fit_type (value, 0); 1252 JAVA_RADIX10_FLAG (value) = radix == 10; 1253#else 1254 SET_LVAL_NODE_TYPE (build_int_2 (low, high), 1255 long_suffix ? long_type_node : int_type_node); 1256#endif 1257 return INT_LIT_TK; 1258 } 1259 1260 /* Character literals. */ 1261 if (c == '\'') 1262 { 1263 int char_lit; 1264 if ((c = java_get_unicode ()) == '\\') 1265 char_lit = java_parse_escape_sequence (); 1266 else 1267 { 1268 if (c == '\n' || c == '\'') 1269 java_lex_error ("Invalid character literal", 0); 1270 char_lit = c; 1271 } 1272 1273 c = java_get_unicode (); 1274 1275 if ((c == '\n') || (c == UEOF)) 1276 java_lex_error ("Character literal not terminated at end of line", 0); 1277 if (c != '\'') 1278 java_lex_error ("Syntax error in character literal", 0); 1279 1280 if (char_lit == JAVA_CHAR_ERROR) 1281 char_lit = 0; /* We silently convert it to zero. */ 1282 1283 JAVA_LEX_CHAR_LIT (char_lit); 1284 SET_LVAL_NODE_TYPE (build_int_2 (char_lit, 0), char_type_node); 1285 return CHAR_LIT_TK; 1286 } 1287 1288 /* String literals. */ 1289 if (c == '"') 1290 { 1291 int no_error; 1292 char *string; 1293 1294 for (no_error = 1, c = java_get_unicode (); 1295 c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ()) 1296 { 1297 if (c == '\\') 1298 c = java_parse_escape_sequence (); 1299 if (c == JAVA_CHAR_ERROR) 1300 { 1301 no_error = 0; 1302 c = 0; /* We silently convert it to zero. */ 1303 } 1304 java_unicode_2_utf8 (c); 1305 } 1306 if (c == '\n' || c == UEOF) /* ULT. */ 1307 { 1308 lineno--; /* Refer to the line where the terminator was seen. */ 1309 java_lex_error ("String not terminated at end of line", 0); 1310 lineno++; 1311 } 1312 1313 obstack_1grow (&temporary_obstack, '\0'); 1314 string = obstack_finish (&temporary_obstack); 1315#ifndef JC1_LITE 1316 if (!no_error || (c != '"')) 1317 java_lval->node = error_mark_node; /* FIXME: Requires futher 1318 testing. */ 1319 else 1320 java_lval->node = build_string (strlen (string), string); 1321#endif 1322 obstack_free (&temporary_obstack, string); 1323 return STRING_LIT_TK; 1324 } 1325 1326 /* Separator. */ 1327 switch (c) 1328 { 1329 case '(': 1330 JAVA_LEX_SEP (c); 1331 BUILD_OPERATOR (OP_TK); 1332 case ')': 1333 JAVA_LEX_SEP (c); 1334 return CP_TK; 1335 case '{': 1336 JAVA_LEX_SEP (c); 1337 if (ctxp->ccb_indent == 1) 1338 ctxp->first_ccb_indent1 = lineno; 1339 ctxp->ccb_indent++; 1340 BUILD_OPERATOR (OCB_TK); 1341 case '}': 1342 JAVA_LEX_SEP (c); 1343 ctxp->ccb_indent--; 1344 if (ctxp->ccb_indent == 1) 1345 ctxp->last_ccb_indent1 = lineno; 1346 BUILD_OPERATOR (CCB_TK); 1347 case '[': 1348 JAVA_LEX_SEP (c); 1349 BUILD_OPERATOR (OSB_TK); 1350 case ']': 1351 JAVA_LEX_SEP (c); 1352 return CSB_TK; 1353 case ';': 1354 JAVA_LEX_SEP (c); 1355 return SC_TK; 1356 case ',': 1357 JAVA_LEX_SEP (c); 1358 return C_TK; 1359 case '.': 1360 JAVA_LEX_SEP (c); 1361 BUILD_OPERATOR (DOT_TK); 1362 /* return DOT_TK; */ 1363 } 1364 1365 /* Operators. */ 1366 switch (c) 1367 { 1368 case '=': 1369 if ((c = java_get_unicode ()) == '=') 1370 { 1371 BUILD_OPERATOR (EQ_TK); 1372 } 1373 else 1374 { 1375 /* Equals is used in two different locations. In the 1376 variable_declarator: rule, it has to be seen as '=' as opposed 1377 to being seen as an ordinary assignment operator in 1378 assignment_operators: rule. */ 1379 java_unget_unicode (); 1380 BUILD_OPERATOR (ASSIGN_TK); 1381 } 1382 1383 case '>': 1384 switch ((c = java_get_unicode ())) 1385 { 1386 case '=': 1387 BUILD_OPERATOR (GTE_TK); 1388 case '>': 1389 switch ((c = java_get_unicode ())) 1390 { 1391 case '>': 1392 if ((c = java_get_unicode ()) == '=') 1393 { 1394 BUILD_OPERATOR2 (ZRS_ASSIGN_TK); 1395 } 1396 else 1397 { 1398 java_unget_unicode (); 1399 BUILD_OPERATOR (ZRS_TK); 1400 } 1401 case '=': 1402 BUILD_OPERATOR2 (SRS_ASSIGN_TK); 1403 default: 1404 java_unget_unicode (); 1405 BUILD_OPERATOR (SRS_TK); 1406 } 1407 default: 1408 java_unget_unicode (); 1409 BUILD_OPERATOR (GT_TK); 1410 } 1411 1412 case '<': 1413 switch ((c = java_get_unicode ())) 1414 { 1415 case '=': 1416 BUILD_OPERATOR (LTE_TK); 1417 case '<': 1418 if ((c = java_get_unicode ()) == '=') 1419 { 1420 BUILD_OPERATOR2 (LS_ASSIGN_TK); 1421 } 1422 else 1423 { 1424 java_unget_unicode (); 1425 BUILD_OPERATOR (LS_TK); 1426 } 1427 default: 1428 java_unget_unicode (); 1429 BUILD_OPERATOR (LT_TK); 1430 } 1431 1432 case '&': 1433 switch ((c = java_get_unicode ())) 1434 { 1435 case '&': 1436 BUILD_OPERATOR (BOOL_AND_TK); 1437 case '=': 1438 BUILD_OPERATOR2 (AND_ASSIGN_TK); 1439 default: 1440 java_unget_unicode (); 1441 BUILD_OPERATOR (AND_TK); 1442 } 1443 1444 case '|': 1445 switch ((c = java_get_unicode ())) 1446 { 1447 case '|': 1448 BUILD_OPERATOR (BOOL_OR_TK); 1449 case '=': 1450 BUILD_OPERATOR2 (OR_ASSIGN_TK); 1451 default: 1452 java_unget_unicode (); 1453 BUILD_OPERATOR (OR_TK); 1454 } 1455 1456 case '+': 1457 switch ((c = java_get_unicode ())) 1458 { 1459 case '+': 1460 BUILD_OPERATOR (INCR_TK); 1461 case '=': 1462 BUILD_OPERATOR2 (PLUS_ASSIGN_TK); 1463 default: 1464 java_unget_unicode (); 1465 BUILD_OPERATOR (PLUS_TK); 1466 } 1467 1468 case '-': 1469 switch ((c = java_get_unicode ())) 1470 { 1471 case '-': 1472 BUILD_OPERATOR (DECR_TK); 1473 case '=': 1474 BUILD_OPERATOR2 (MINUS_ASSIGN_TK); 1475 default: 1476 java_unget_unicode (); 1477 BUILD_OPERATOR (MINUS_TK); 1478 } 1479 1480 case '*': 1481 if ((c = java_get_unicode ()) == '=') 1482 { 1483 BUILD_OPERATOR2 (MULT_ASSIGN_TK); 1484 } 1485 else 1486 { 1487 java_unget_unicode (); 1488 BUILD_OPERATOR (MULT_TK); 1489 } 1490 1491 case '/': 1492 if ((c = java_get_unicode ()) == '=') 1493 { 1494 BUILD_OPERATOR2 (DIV_ASSIGN_TK); 1495 } 1496 else 1497 { 1498 java_unget_unicode (); 1499 BUILD_OPERATOR (DIV_TK); 1500 } 1501 1502 case '^': 1503 if ((c = java_get_unicode ()) == '=') 1504 { 1505 BUILD_OPERATOR2 (XOR_ASSIGN_TK); 1506 } 1507 else 1508 { 1509 java_unget_unicode (); 1510 BUILD_OPERATOR (XOR_TK); 1511 } 1512 1513 case '%': 1514 if ((c = java_get_unicode ()) == '=') 1515 { 1516 BUILD_OPERATOR2 (REM_ASSIGN_TK); 1517 } 1518 else 1519 { 1520 java_unget_unicode (); 1521 BUILD_OPERATOR (REM_TK); 1522 } 1523 1524 case '!': 1525 if ((c = java_get_unicode()) == '=') 1526 { 1527 BUILD_OPERATOR (NEQ_TK); 1528 } 1529 else 1530 { 1531 java_unget_unicode (); 1532 BUILD_OPERATOR (NEG_TK); 1533 } 1534 1535 case '?': 1536 JAVA_LEX_OP ("?"); 1537 BUILD_OPERATOR (REL_QM_TK); 1538 case ':': 1539 JAVA_LEX_OP (":"); 1540 BUILD_OPERATOR (REL_CL_TK); 1541 case '~': 1542 BUILD_OPERATOR (NOT_TK); 1543 } 1544 1545 /* Keyword, boolean literal or null literal. */ 1546 for (first_unicode = c, all_ascii = 1, ascii_index = 0; 1547 c != UEOF && JAVA_PART_CHAR_P (c); c = java_get_unicode ()) 1548 { 1549 java_unicode_2_utf8 (c); 1550 if (all_ascii && c >= 128) 1551 all_ascii = 0; 1552 ascii_index++; 1553 } 1554 1555 obstack_1grow (&temporary_obstack, '\0'); 1556 string = obstack_finish (&temporary_obstack); 1557 if (c != UEOF) 1558 java_unget_unicode (); 1559 1560 /* If we have something all ascii, we consider a keyword, a boolean 1561 literal, a null literal or an all ASCII identifier. Otherwise, 1562 this is an identifier (possibly not respecting formation rule). */ 1563 if (all_ascii) 1564 { 1565 const struct java_keyword *kw; 1566 if ((kw=java_keyword (string, ascii_index))) 1567 { 1568 JAVA_LEX_KW (string); 1569 switch (kw->token) 1570 { 1571 case PUBLIC_TK: case PROTECTED_TK: case STATIC_TK: 1572 case ABSTRACT_TK: case FINAL_TK: case NATIVE_TK: 1573 case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK: 1574 case PRIVATE_TK: case STRICT_TK: 1575 SET_MODIFIER_CTX (kw->token); 1576 return MODIFIER_TK; 1577 case FLOAT_TK: 1578 SET_LVAL_NODE (float_type_node); 1579 return FP_TK; 1580 case DOUBLE_TK: 1581 SET_LVAL_NODE (double_type_node); 1582 return FP_TK; 1583 case BOOLEAN_TK: 1584 SET_LVAL_NODE (boolean_type_node); 1585 return BOOLEAN_TK; 1586 case BYTE_TK: 1587 SET_LVAL_NODE (byte_type_node); 1588 return INTEGRAL_TK; 1589 case SHORT_TK: 1590 SET_LVAL_NODE (short_type_node); 1591 return INTEGRAL_TK; 1592 case INT_TK: 1593 SET_LVAL_NODE (int_type_node); 1594 return INTEGRAL_TK; 1595 case LONG_TK: 1596 SET_LVAL_NODE (long_type_node); 1597 return INTEGRAL_TK; 1598 case CHAR_TK: 1599 SET_LVAL_NODE (char_type_node); 1600 return INTEGRAL_TK; 1601 1602 /* Keyword based literals. */ 1603 case TRUE_TK: 1604 case FALSE_TK: 1605 SET_LVAL_NODE ((kw->token == TRUE_TK ? 1606 boolean_true_node : boolean_false_node)); 1607 return BOOL_LIT_TK; 1608 case NULL_TK: 1609 SET_LVAL_NODE (null_pointer_node); 1610 return NULL_TK; 1611 1612 case ASSERT_TK: 1613 if (flag_assert) 1614 { 1615 BUILD_OPERATOR (kw->token); 1616 return kw->token; 1617 } 1618 else 1619 break; 1620 1621 /* Some keyword we want to retain information on the location 1622 they where found. */ 1623 case CASE_TK: 1624 case DEFAULT_TK: 1625 case SUPER_TK: 1626 case THIS_TK: 1627 case RETURN_TK: 1628 case BREAK_TK: 1629 case CONTINUE_TK: 1630 case TRY_TK: 1631 case CATCH_TK: 1632 case THROW_TK: 1633 case INSTANCEOF_TK: 1634 BUILD_OPERATOR (kw->token); 1635 1636 default: 1637 return kw->token; 1638 } 1639 } 1640 } 1641 1642 /* We may have an ID here. */ 1643 if (JAVA_START_CHAR_P (first_unicode)) 1644 { 1645 JAVA_LEX_ID (string); 1646 java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string)); 1647 return ID_TK; 1648 } 1649 1650 /* Everything else is an invalid character in the input. */ 1651 { 1652 char lex_error_buffer [128]; 1653 sprintf (lex_error_buffer, "Invalid character `%s' in input", 1654 java_sprint_unicode (ctxp->c_line, ctxp->c_line->current)); 1655 java_lex_error (lex_error_buffer, 1); 1656 } 1657 return 0; 1658} 1659 1660#ifndef JC1_LITE 1661/* This is called by the parser to see if an error should be generated 1662 due to numeric overflow. This function only handles the particular 1663 case of the largest negative value, and is only called in the case 1664 where this value is not preceded by `-'. */ 1665static void 1666error_if_numeric_overflow (value) 1667 tree value; 1668{ 1669 if (TREE_CODE (value) == INTEGER_CST 1670 && JAVA_RADIX10_FLAG (value) 1671 && tree_int_cst_sgn (value) < 0) 1672 { 1673 if (TREE_TYPE (value) == long_type_node) 1674 java_lex_error ("Numeric overflow for `long' literal", 0); 1675 else 1676 java_lex_error ("Numeric overflow for `int' literal", 0); 1677 } 1678} 1679#endif /* JC1_LITE */ 1680 1681static void 1682java_unicode_2_utf8 (unicode) 1683 unicode_t unicode; 1684{ 1685 if (RANGE (unicode, 0x01, 0x7f)) 1686 obstack_1grow (&temporary_obstack, (char)unicode); 1687 else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0) 1688 { 1689 obstack_1grow (&temporary_obstack, 1690 (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6))); 1691 obstack_1grow (&temporary_obstack, 1692 (unsigned char)(0x80 | (unicode & 0x3f))); 1693 } 1694 else /* Range 0x800-0xffff. */ 1695 { 1696 obstack_1grow (&temporary_obstack, 1697 (unsigned char)(0xe0 | (unicode & 0xf000) >> 12)); 1698 obstack_1grow (&temporary_obstack, 1699 (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6)); 1700 obstack_1grow (&temporary_obstack, 1701 (unsigned char)(0x80 | (unicode & 0x003f))); 1702 } 1703} 1704 1705#ifndef JC1_LITE 1706static tree 1707build_wfl_node (node) 1708 tree node; 1709{ 1710 node = build_expr_wfl (node, ctxp->filename, ctxp->elc.line, ctxp->elc.col); 1711 /* Prevent java_complete_lhs from short-circuiting node (if constant). */ 1712 TREE_TYPE (node) = NULL_TREE; 1713 return node; 1714} 1715#endif 1716 1717static void 1718java_lex_error (msg, forward) 1719 const char *msg ATTRIBUTE_UNUSED; 1720 int forward ATTRIBUTE_UNUSED; 1721{ 1722#ifndef JC1_LITE 1723 ctxp->elc.line = ctxp->c_line->lineno; 1724 ctxp->elc.col = ctxp->c_line->char_col-1+forward; 1725 1726 /* Might be caught in the middle of some error report. */ 1727 ctxp->java_error_flag = 0; 1728 java_error (NULL); 1729 java_error (msg); 1730#endif 1731} 1732 1733#ifndef JC1_LITE 1734static int 1735java_is_eol (fp, c) 1736 FILE *fp; 1737 int c; 1738{ 1739 int next; 1740 switch (c) 1741 { 1742 case '\r': 1743 next = getc (fp); 1744 if (next != '\n' && next != EOF) 1745 ungetc (next, fp); 1746 return 1; 1747 case '\n': 1748 return 1; 1749 default: 1750 return 0; 1751 } 1752} 1753#endif 1754 1755char * 1756java_get_line_col (filename, line, col) 1757 const char *filename ATTRIBUTE_UNUSED; 1758 int line ATTRIBUTE_UNUSED, col ATTRIBUTE_UNUSED; 1759{ 1760#ifdef JC1_LITE 1761 return 0; 1762#else 1763 /* Dumb implementation. Doesn't try to cache or optimize things. */ 1764 /* First line of the file is line 1, first column is 1. */ 1765 1766 /* COL == -1 means, at the CR/LF in LINE. */ 1767 /* COL == -2 means, at the first non space char in LINE. */ 1768 1769 FILE *fp; 1770 int c, ccol, cline = 1; 1771 int current_line_col = 0; 1772 int first_non_space = 0; 1773 char *base; 1774 1775 if (!(fp = fopen (filename, "r"))) 1776 fatal_io_error ("can't open %s", filename); 1777 1778 while (cline != line) 1779 { 1780 c = getc (fp); 1781 if (c == EOF) 1782 { 1783 static const char msg[] = "<<file too short - unexpected EOF>>"; 1784 obstack_grow (&temporary_obstack, msg, sizeof(msg)-1); 1785 goto have_line; 1786 } 1787 if (java_is_eol (fp, c)) 1788 cline++; 1789 } 1790 1791 /* Gather the chars of the current line in a buffer. */ 1792 for (;;) 1793 { 1794 c = getc (fp); 1795 if (c < 0 || java_is_eol (fp, c)) 1796 break; 1797 if (!first_non_space && !JAVA_WHITE_SPACE_P (c)) 1798 first_non_space = current_line_col; 1799 obstack_1grow (&temporary_obstack, c); 1800 current_line_col++; 1801 } 1802 have_line: 1803 1804 obstack_1grow (&temporary_obstack, '\n'); 1805 1806 if (col == -1) 1807 { 1808 col = current_line_col; 1809 first_non_space = 0; 1810 } 1811 else if (col == -2) 1812 col = first_non_space; 1813 else 1814 first_non_space = 0; 1815 1816 /* Place the '^' a the right position. */ 1817 base = obstack_base (&temporary_obstack); 1818 for (ccol = 1; ccol <= col+3; ccol++) 1819 { 1820 /* Compute \t when reaching first_non_space. */ 1821 char c = (first_non_space ? 1822 (base [ccol-1] == '\t' ? '\t' : ' ') : ' '); 1823 obstack_1grow (&temporary_obstack, c); 1824 } 1825 obstack_grow0 (&temporary_obstack, "^", 1); 1826 1827 fclose (fp); 1828 return obstack_finish (&temporary_obstack); 1829#endif 1830} 1831 1832#ifndef JC1_LITE 1833static int 1834utf8_cmp (str, length, name) 1835 const unsigned char *str; 1836 int length; 1837 const char *name; 1838{ 1839 const unsigned char *limit = str + length; 1840 int i; 1841 1842 for (i = 0; name[i]; ++i) 1843 { 1844 int ch = UTF8_GET (str, limit); 1845 if (ch != name[i]) 1846 return ch - name[i]; 1847 } 1848 1849 return str == limit ? 0 : 1; 1850} 1851 1852/* A sorted list of all C++ keywords. */ 1853 1854static const char *const cxx_keywords[] = 1855{ 1856 "_Complex", 1857 "__alignof", 1858 "__alignof__", 1859 "__asm", 1860 "__asm__", 1861 "__attribute", 1862 "__attribute__", 1863 "__builtin_va_arg", 1864 "__complex", 1865 "__complex__", 1866 "__const", 1867 "__const__", 1868 "__extension__", 1869 "__imag", 1870 "__imag__", 1871 "__inline", 1872 "__inline__", 1873 "__label__", 1874 "__null", 1875 "__real", 1876 "__real__", 1877 "__restrict", 1878 "__restrict__", 1879 "__signed", 1880 "__signed__", 1881 "__typeof", 1882 "__typeof__", 1883 "__volatile", 1884 "__volatile__", 1885 "and", 1886 "and_eq", 1887 "asm", 1888 "auto", 1889 "bitand", 1890 "bitor", 1891 "bool", 1892 "break", 1893 "case", 1894 "catch", 1895 "char", 1896 "class", 1897 "compl", 1898 "const", 1899 "const_cast", 1900 "continue", 1901 "default", 1902 "delete", 1903 "do", 1904 "double", 1905 "dynamic_cast", 1906 "else", 1907 "enum", 1908 "explicit", 1909 "export", 1910 "extern", 1911 "false", 1912 "float", 1913 "for", 1914 "friend", 1915 "goto", 1916 "if", 1917 "inline", 1918 "int", 1919 "long", 1920 "mutable", 1921 "namespace", 1922 "new", 1923 "not", 1924 "not_eq", 1925 "operator", 1926 "or", 1927 "or_eq", 1928 "private", 1929 "protected", 1930 "public", 1931 "register", 1932 "reinterpret_cast", 1933 "return", 1934 "short", 1935 "signed", 1936 "sizeof", 1937 "static", 1938 "static_cast", 1939 "struct", 1940 "switch", 1941 "template", 1942 "this", 1943 "throw", 1944 "true", 1945 "try", 1946 "typedef", 1947 "typeid", 1948 "typename", 1949 "typeof", 1950 "union", 1951 "unsigned", 1952 "using", 1953 "virtual", 1954 "void", 1955 "volatile", 1956 "wchar_t", 1957 "while", 1958 "xor", 1959 "xor_eq" 1960}; 1961 1962/* Return true if NAME is a C++ keyword. */ 1963 1964int 1965cxx_keyword_p (name, length) 1966 const char *name; 1967 int length; 1968{ 1969 int last = ARRAY_SIZE (cxx_keywords); 1970 int first = 0; 1971 int mid = (last + first) / 2; 1972 int old = -1; 1973 1974 for (mid = (last + first) / 2; 1975 mid != old; 1976 old = mid, mid = (last + first) / 2) 1977 { 1978 int kwl = strlen (cxx_keywords[mid]); 1979 int min_length = kwl > length ? length : kwl; 1980 int r = utf8_cmp (name, min_length, cxx_keywords[mid]); 1981 1982 if (r == 0) 1983 { 1984 int i; 1985 /* We've found a match if all the remaining characters are `$'. */ 1986 for (i = min_length; i < length && name[i] == '$'; ++i) 1987 ; 1988 if (i == length) 1989 return 1; 1990 r = 1; 1991 } 1992 1993 if (r < 0) 1994 last = mid; 1995 else 1996 first = mid; 1997 } 1998 return 0; 1999} 2000#endif /* JC1_LITE */ 2001