gcc/java/lex.c

/* Language lexer for the GNU compiler for the Java(TM) language.
   Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
   Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)

This file is part of GNU CC.

GNU CC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU CC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU CC; see the file COPYING.  If not, write to
the Free Software Foundation, 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.

Java and all Java-based marks are trademarks or registered trademarks
of Sun Microsystems, Inc. in the United States and other countries.
The Free Software Foundation is independent of Sun Microsystems, Inc.  */
133808Spjd
/* This file defines java_lex (yylex), which reads a Java ASCII source
   file possibly containing Unicode escape sequences or UTF-8 encoded
   characters and returns a token for everything found except comments,
   white space and line terminators.  When necessary, it also fills
   the java_lval (yylval) union.  It is implemented to be called by a
   re-entrant parser generated by Bison.

   The lexical analysis conforms to the Java grammar described in "The
   Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
   Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
133808Spjd
133808Spjd#include "keyword.h"
133808Spjd#include "flags.h"
133808Spjd#include "chartables.h"
133808Spjd
133808Spjd/* Function declarations.  */
133808Spjdstatic char *java_sprint_unicode PARAMS ((struct java_line *, int));
133808Spjdstatic void java_unicode_2_utf8 PARAMS ((unicode_t));
133808Spjdstatic void java_lex_error PARAMS ((const char *, int));
133808Spjd#ifndef JC1_LITE
133808Spjdstatic int java_is_eol PARAMS ((FILE *, int));
133808Spjdstatic tree build_wfl_node PARAMS ((tree));
133808Spjd#endif
133808Spjdstatic void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
133808Spjdstatic int java_parse_escape_sequence PARAMS ((void));
133808Spjdstatic int java_start_char_p PARAMS ((unicode_t));
133808Spjdstatic int java_part_char_p PARAMS ((unicode_t));
133808Spjdstatic int java_parse_doc_section PARAMS ((int));
156612Spjdstatic void java_parse_end_comment PARAMS ((int));
133808Spjdstatic int java_get_unicode PARAMS ((void));
133808Spjdstatic int java_read_unicode PARAMS ((java_lexer *, int *));
133808Spjdstatic int java_read_unicode_collapsing_terminators PARAMS ((java_lexer *,
133808Spjd							     int *));
133808Spjdstatic void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
133808Spjdstatic int java_read_char PARAMS ((java_lexer *));
133808Spjdstatic void java_allocate_new_line PARAMS ((void));
133808Spjdstatic void java_unget_unicode PARAMS ((void));
156612Spjdstatic unicode_t java_sneak_unicode PARAMS ((void));
156612Spjd#ifndef JC1_LITE
133808Spjdstatic int utf8_cmp PARAMS ((const unsigned char *, int, const char *));
133808Spjd#endif
133808Spjd
156612Spjdjava_lexer *java_new_lexer PARAMS ((FILE *, const char *));
133808Spjd#ifndef JC1_LITE
133808Spjdstatic void error_if_numeric_overflow PARAMS ((tree));
133808Spjd#endif
133808Spjd
#ifdef HAVE_ICONV
/* This is nonzero if we have initialized `need_byteswap'.  */
static int byteswap_init = 0;

/* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
   big-endian order -- not native endian order.  We handle this by
   doing a conversion once at startup and seeing what happens.  This
   flag holds the results of this determination.  */
static int need_byteswap = 0;
#endif
133808Spjd
133808Spjdvoid
133808Spjdjava_init_lex (finput, encoding)
133808Spjd     FILE *finput;
133808Spjd     const char *encoding;
133808Spjd{
133808Spjd#ifndef JC1_LITE
133808Spjd  int java_lang_imported = 0;
133808Spjd
133808Spjd  if (!java_lang_id)
133808Spjd    java_lang_id = get_identifier ("java.lang");
133808Spjd  if (!inst_id)
133808Spjd    inst_id = get_identifier ("inst$");
133808Spjd  if (!wpv_id)
133808Spjd    wpv_id = get_identifier ("write_parm_value$");
133808Spjd
133808Spjd  if (!java_lang_imported)
133808Spjd    {
163888Spjd      tree node = build_tree_list
134168Spjd	(build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE);
163888Spjd      read_import_dir (TREE_PURPOSE (node));
134168Spjd      TREE_CHAIN (node) = ctxp->import_demand_list;
134168Spjd      ctxp->import_demand_list = node;
133808Spjd      java_lang_imported = 1;
133808Spjd    }
133808Spjd
144142Spjd  if (!wfl_operator)
144142Spjd    wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
144142Spjd  if (!label_id)
144142Spjd    label_id = get_identifier ("$L");
133808Spjd  if (!wfl_append)
133808Spjd    wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0);
133808Spjd  if (!wfl_string_buffer)
133808Spjd    wfl_string_buffer =
133808Spjd      build_expr_wfl (get_identifier (flag_emit_class_files
133808Spjd				      ? "java.lang.StringBuffer"
133808Spjd				      : "gnu.gcj.runtime.StringBuffer"),
133808Spjd		      NULL, 0, 0);
133808Spjd  if (!wfl_to_string)
133808Spjd    wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0);
133808Spjd
133808Spjd  CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
133808Spjd    CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
133808Spjd
133808Spjd  memset (ctxp->modifier_ctx, 0, sizeof (ctxp->modifier_ctx));
133808Spjd  memset (current_jcf, 0, sizeof (JCF));
133808Spjd  ctxp->current_parsed_class = NULL;
133808Spjd  ctxp->package = NULL_TREE;
133808Spjd#endif
163888Spjd
163888Spjd  ctxp->filename = input_filename;
163888Spjd  ctxp->lineno = lineno = 0;
163888Spjd  ctxp->p_line = NULL;
163888Spjd  ctxp->c_line = NULL;
163888Spjd  ctxp->java_error_flag = 0;
163888Spjd  ctxp->lexer = java_new_lexer (finput, encoding);
163888Spjd}
163888Spjd
163888Spjdstatic char *
163888Spjdjava_sprint_unicode (line, i)
163888Spjd    struct java_line *line;
163888Spjd    int i;
163888Spjd{
163888Spjd  static char buffer [10];
134124Spjd  if (line->unicode_escape_p [i] || line->line [i] > 128)
134124Spjd    sprintf (buffer, "\\u%04x", line->line [i]);
134124Spjd  else
134124Spjd    {
134124Spjd      buffer [0] = line->line [i];
134124Spjd      buffer [1] = '\0';
134124Spjd    }
134124Spjd  return buffer;
134124Spjd}
134124Spjd
134124Spjdstatic unicode_t
134124Spjdjava_sneak_unicode ()
134124Spjd{
134124Spjd  return (ctxp->c_line->line [ctxp->c_line->current]);
134124Spjd}
134124Spjd
134168Spjdstatic void
134168Spjdjava_unget_unicode ()
134168Spjd{
134168Spjd  if (!ctxp->c_line->current)
134168Spjd    /* Can't unget unicode.  */
134168Spjd    abort ();
134168Spjd
134168Spjd  ctxp->c_line->current--;
134168Spjd  ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0);
134168Spjd}
134168Spjd
134168Spjdstatic void
134168Spjdjava_allocate_new_line ()
134168Spjd{
134168Spjd  unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0');
163888Spjd  char ahead_escape_p = (ctxp->c_line ?
163888Spjd			 ctxp->c_line->unicode_escape_ahead_p : 0);
134124Spjd
134124Spjd  if (ctxp->c_line && !ctxp->c_line->white_space_only)
134124Spjd    {
156612Spjd      if (ctxp->p_line)
156612Spjd	{
156612Spjd	  free (ctxp->p_line->unicode_escape_p);
156612Spjd	  free (ctxp->p_line->line);
156612Spjd	  free (ctxp->p_line);
156612Spjd	}
156612Spjd      ctxp->p_line = ctxp->c_line;
156612Spjd      ctxp->c_line = NULL;		/* Reallocated.  */
156612Spjd    }
156612Spjd
156612Spjd  if (!ctxp->c_line)
156612Spjd    {
156612Spjd      ctxp->c_line = xmalloc (sizeof (struct java_line));
156612Spjd      ctxp->c_line->max = JAVA_LINE_MAX;
156612Spjd      ctxp->c_line->line = xmalloc (sizeof (unicode_t)*ctxp->c_line->max);
133808Spjd      ctxp->c_line->unicode_escape_p =
133808Spjd	xmalloc (sizeof (char)*ctxp->c_line->max);
133808Spjd      ctxp->c_line->white_space_only = 0;
133808Spjd    }
133808Spjd
133808Spjd  ctxp->c_line->line [0] = ctxp->c_line->size = 0;
133808Spjd  ctxp->c_line->char_col = ctxp->c_line->current = 0;
133808Spjd  if (ahead)
133808Spjd    {
163888Spjd      ctxp->c_line->line [ctxp->c_line->size] = ahead;
163888Spjd      ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p;
163888Spjd      ctxp->c_line->size++;
163888Spjd    }
163888Spjd  ctxp->c_line->ahead [0] = 0;
163888Spjd  ctxp->c_line->unicode_escape_ahead_p = 0;
163888Spjd  ctxp->c_line->lineno = ++lineno;
163888Spjd  ctxp->c_line->white_space_only = 1;
163888Spjd}
134168Spjd
134168Spjd/* Create a new lexer object.  */
134168Spjd
134168Spjdjava_lexer *
134168Spjdjava_new_lexer (finput, encoding)
134168Spjd     FILE *finput;
134168Spjd     const char *encoding;
134124Spjd{
134124Spjd  java_lexer *lex = xmalloc (sizeof (java_lexer));
134124Spjd  int enc_error = 0;
134124Spjd
134124Spjd  lex->finput = finput;
134124Spjd  lex->bs_count = 0;
134124Spjd  lex->unget_value = 0;
134168Spjd  lex->hit_eof = 0;
134168Spjd
134168Spjd#ifdef HAVE_ICONV
134168Spjd  lex->handle = iconv_open ("UCS-2", encoding);
134168Spjd  if (lex->handle != (iconv_t) -1)
134168Spjd    {
134168Spjd      lex->first = -1;
133808Spjd      lex->last = -1;
133808Spjd      lex->out_first = -1;
133808Spjd      lex->out_last = -1;
133808Spjd      lex->read_anything = 0;
133808Spjd      lex->use_fallback = 0;
133808Spjd
163888Spjd      /* Work around broken iconv() implementations by doing checking at
163888Spjd	 runtime.  We assume that if the UTF-8 => UCS-2 encoder is broken,
133808Spjd	 then all UCS-2 encoders will be broken.  Perhaps not a valid
133808Spjd	 assumption.  */
133808Spjd      if (! byteswap_init)
133808Spjd	{
133808Spjd	  iconv_t handle;
133808Spjd
133808Spjd	  byteswap_init = 1;
133808Spjd
133808Spjd	  handle = iconv_open ("UCS-2", "UTF-8");
133808Spjd	  if (handle != (iconv_t) -1)
133808Spjd	    {
133808Spjd	      unicode_t result;
133808Spjd	      unsigned char in[3];
156612Spjd	      char *inp, *outp;
133808Spjd	      size_t inc, outc, r;
133808Spjd
133808Spjd	      /* This is the UTF-8 encoding of \ufeff.  */
133808Spjd	      in[0] = 0xef;
133808Spjd	      in[1] = 0xbb;
139671Spjd	      in[2] = 0xbf;
133808Spjd
133808Spjd	      inp = in;
139671Spjd	      inc = 3;
133808Spjd	      outp = (char *) &result;
139671Spjd	      outc = 2;
133808Spjd
133808Spjd	      r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
133808Spjd			 &outp, &outc);
133808Spjd	      iconv_close (handle);
133808Spjd	      /* Conversion must be complete for us to use the result.  */
133808Spjd	      if (r != (size_t) -1 && inc == 0 && outc == 0)
133808Spjd		need_byteswap = (result != 0xfeff);
133808Spjd	    }
133808Spjd	}
133808Spjd
133808Spjd      lex->byte_swap = need_byteswap;
133808Spjd    }
133808Spjd  else
133808Spjd#endif /* HAVE_ICONV */
133808Spjd    {
133808Spjd      /* If iconv failed, use the internal decoder if the default
133808Spjd	 encoding was requested.  This code is used on platforms where
133808Spjd	 iconv exists but is insufficient for our needs.  For
133808Spjd	 instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.
133808Spjd
133808Spjd	 On Solaris the default encoding, as returned by nl_langinfo(),
133808Spjd	 is `646' (aka ASCII), but the Solaris iconv_open() doesn't
133808Spjd	 understand that.  We work around that by pretending
156612Spjd	 `646' to be the same as UTF-8.   */
133808Spjd      if (strcmp (encoding, DEFAULT_ENCODING) && strcmp (encoding, "646"))
133808Spjd	enc_error = 1;
133808Spjd#ifdef HAVE_ICONV
133808Spjd      else
133808Spjd	lex->use_fallback = 1;
156612Spjd#endif /* HAVE_ICONV */
133808Spjd    }
133808Spjd
133808Spjd  if (enc_error)
133808Spjd    fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation.  If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding);
162350Spjd
156612Spjd  return lex;
133808Spjd}
133808Spjd
133808Spjdvoid
133808Spjdjava_destroy_lexer (lex)
133808Spjd     java_lexer *lex;
133808Spjd{
133808Spjd#ifdef HAVE_ICONV
133808Spjd  if (! lex->use_fallback)
133808Spjd    iconv_close (lex->handle);
133808Spjd#endif
139671Spjd  free (lex);
156612Spjd}
139671Spjd
156612Spjdstatic int
133808Spjdjava_read_char (lex)
133808Spjd     java_lexer *lex;
139671Spjd{
139671Spjd  if (lex->unget_value)
156612Spjd    {
139671Spjd      unicode_t r = lex->unget_value;
139671Spjd      lex->unget_value = 0;
139671Spjd      return r;
156612Spjd    }
139671Spjd
156612Spjd#ifdef HAVE_ICONV
133808Spjd  if (! lex->use_fallback)
133808Spjd    {
133808Spjd      size_t ir, inbytesleft, in_save, out_count, out_save;
133808Spjd      char *inp, *outp;
133808Spjd      unicode_t result;
133808Spjd
133808Spjd      /* If there is data which has already been converted, use it.  */
133808Spjd      if (lex->out_first == -1 || lex->out_first >= lex->out_last)
133808Spjd	{
133808Spjd	  lex->out_first = 0;
157630Spjd	  lex->out_last = 0;
133808Spjd
133808Spjd	  while (1)
133808Spjd	    {
133808Spjd	      /* See if we need to read more data.  If FIRST == 0 then
133808Spjd		 the previous conversion attempt ended in the middle of
133808Spjd		 a character at the end of the buffer.  Otherwise we
133808Spjd		 only have to read if the buffer is empty.  */
133808Spjd	      if (lex->first == 0 || lex->first >= lex->last)
133808Spjd		{
133808Spjd		  int r;
133808Spjd
133808Spjd		  if (lex->first >= lex->last)
133808Spjd		    {
133808Spjd		      lex->first = 0;
133808Spjd		      lex->last = 0;
157630Spjd		    }
157630Spjd		  if (feof (lex->finput))
157630Spjd		    return UEOF;
157630Spjd		  r = fread (&lex->buffer[lex->last], 1,
133808Spjd			     sizeof (lex->buffer) - lex->last,
133808Spjd			     lex->finput);
133808Spjd		  lex->last += r;
133808Spjd		}
133808Spjd
133808Spjd	      inbytesleft = lex->last - lex->first;
133808Spjd	      out_count = sizeof (lex->out_buffer) - lex->out_last;
133808Spjd
133808Spjd	      if (inbytesleft == 0)
133808Spjd		{
133808Spjd		  /* We've tried to read and there is nothing left.  */
133808Spjd		  return UEOF;
133808Spjd		}
157630Spjd
157630Spjd	      in_save = inbytesleft;
133808Spjd	      out_save = out_count;
133808Spjd	      inp = &lex->buffer[lex->first];
133808Spjd	      outp = &lex->out_buffer[lex->out_last];
156612Spjd	      ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
133808Spjd			  &inbytesleft, &outp, &out_count);
133808Spjd
156612Spjd	      /* If we haven't read any bytes, then look to see if we
133808Spjd		 have read a BOM.  */
133808Spjd	      if (! lex->read_anything && out_save - out_count >= 2)
133808Spjd		{
133808Spjd		  unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
133808Spjd		  if (uc == 0xfeff)
133808Spjd		    {
133808Spjd		      lex->byte_swap = 0;
133808Spjd		      lex->out_first += 2;
133808Spjd		    }
133808Spjd		  else if (uc == 0xfffe)
133808Spjd		    {
133808Spjd		      lex->byte_swap = 1;
133808Spjd		      lex->out_first += 2;
133808Spjd		    }
133808Spjd		  lex->read_anything = 1;
133808Spjd		}
133808Spjd
133808Spjd	      if (lex->byte_swap)
133808Spjd		{
133808Spjd		  unsigned int i;
133808Spjd		  for (i = 0; i < out_save - out_count; i += 2)
133808Spjd		    {
134420Spjd		      char t = lex->out_buffer[lex->out_last + i];
133808Spjd		      lex->out_buffer[lex->out_last + i]
245456Smav			= lex->out_buffer[lex->out_last + i + 1];
133808Spjd		      lex->out_buffer[lex->out_last + i + 1] = t;
133808Spjd		    }
133808Spjd		}
133808Spjd
133808Spjd	      lex->first += in_save - inbytesleft;
133808Spjd	      lex->out_last += out_save - out_count;
133808Spjd
133808Spjd	      /* If we converted anything at all, move along.  */
133808Spjd	      if (out_count != out_save)
133808Spjd		break;
156612Spjd
156612Spjd	      if (ir == (size_t) -1)
156612Spjd		{
133808Spjd		  if (errno == EINVAL)
133808Spjd		    {
156612Spjd		      /* This is ok.  This means that the end of our buffer
156612Spjd			 is in the middle of a character sequence.  We just
156612Spjd			 move the valid part of the buffer to the beginning
133808Spjd			 to force a read.  */
133808Spjd		      memmove (&lex->buffer[0], &lex->buffer[lex->first],
245456Smav			       lex->last - lex->first);
245456Smav		      lex->last -= lex->first;
245456Smav		      lex->first = 0;
245456Smav		    }
160330Spjd		  else
160330Spjd		    {
156612Spjd		      /* A more serious error.  */
156612Spjd		      java_lex_error ("unrecognized character in input stream",
156612Spjd				      0);
156612Spjd		      return UEOF;
156612Spjd		    }
156612Spjd		}
156612Spjd	    }
156612Spjd	}
156612Spjd
156612Spjd      if (lex->out_first == -1 || lex->out_first >= lex->out_last)
156612Spjd	{
156612Spjd	  /* Don't have any data.  */
156612Spjd	  return UEOF;
156612Spjd	}
156612Spjd
156612Spjd      /* Success.  */
156612Spjd      result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
156612Spjd      lex->out_first += 2;
156612Spjd      return result;
156612Spjd    }
156612Spjd  else
156612Spjd#endif /* HAVE_ICONV */
156612Spjd    {
156612Spjd      int c, c1, c2;
156612Spjd      c = getc (lex->finput);
156612Spjd
156612Spjd      if (c == EOF)
156612Spjd	return UEOF;
156612Spjd      if (c < 128)
156612Spjd	return (unicode_t) c;
156612Spjd      else
156612Spjd	{
156612Spjd	  if ((c & 0xe0) == 0xc0)
245456Smav	    {
245456Smav	      c1 = getc (lex->finput);
245456Smav	      if ((c1 & 0xc0) == 0x80)
245456Smav		{
245456Smav		  unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
245456Smav		  /* Check for valid 2-byte characters.  We explicitly
245456Smav		     allow \0 because this encoding is common in the
245456Smav		     Java world.  */
245456Smav		  if (r == 0 || (r >= 0x80 && r <= 0x7ff))
245456Smav		    return r;
245456Smav		}
245456Smav	    }
245456Smav	  else if ((c & 0xf0) == 0xe0)
245456Smav	    {
245456Smav	      c1 = getc (lex->finput);
245456Smav	      if ((c1 & 0xc0) == 0x80)
245456Smav		{
245456Smav		  c2 = getc (lex->finput);
245456Smav		  if ((c2 & 0xc0) == 0x80)
245456Smav		    {
245456Smav		      unicode_t r =  (unicode_t)(((c & 0xf) << 12) +
245456Smav						 (( c1 & 0x3f) << 6)
245456Smav						 + (c2 & 0x3f));
245456Smav		      /* Check for valid 3-byte characters.
133808Spjd			 Don't allow surrogate, \ufffe or \uffff.  */
133808Spjd		      if (IN_RANGE (r, 0x800, 0xffff)
156612Spjd			  && ! IN_RANGE (r, 0xd800, 0xdfff)
133808Spjd			  && r != 0xfffe && r != 0xffff)
133808Spjd			return r;
133808Spjd		    }
156612Spjd		}
133808Spjd	    }
134420Spjd
134420Spjd	  /* We simply don't support invalid characters.  We also
156612Spjd	     don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
134420Spjd	     cannot be valid Java characters.  */
156612Spjd	  java_lex_error ("malformed UTF-8 character", 0);
134420Spjd	}
134420Spjd    }
134420Spjd
134420Spjd  /* We only get here on error.  */
134420Spjd  return UEOF;
134420Spjd}
134420Spjd
133808Spjdstatic void
156612Spjdjava_store_unicode (l, c, unicode_escape_p)
133808Spjd    struct java_line *l;
163886Spjd    unicode_t c;
133808Spjd    int unicode_escape_p;
163886Spjd{
163886Spjd  if (l->size == l->max)
163886Spjd    {
156527Spjd      l->max += JAVA_LINE_MAX;
133808Spjd      l->line = xrealloc (l->line, sizeof (unicode_t)*l->max);
133808Spjd      l->unicode_escape_p = xrealloc (l->unicode_escape_p,
133808Spjd				      sizeof (char)*l->max);
133808Spjd    }
133808Spjd  l->line [l->size] = c;
133808Spjd  l->unicode_escape_p [l->size++] = unicode_escape_p;
133808Spjd}
133808Spjd
156612Spjdstatic int
146118Spjdjava_read_unicode (lex, unicode_escape_p)
146118Spjd     java_lexer *lex;
146118Spjd     int *unicode_escape_p;
146118Spjd{
146118Spjd  int c;
146117Spjd
156612Spjd  c = java_read_char (lex);
133808Spjd  *unicode_escape_p = 0;
133808Spjd
133808Spjd  if (c != '\\')
133808Spjd    {
133808Spjd      lex->bs_count = 0;
133808Spjd      return c;
133808Spjd    }
133808Spjd
133808Spjd  ++lex->bs_count;
133808Spjd  if ((lex->bs_count) % 2 == 1)
133808Spjd    {
133808Spjd      /* Odd number of \ seen.  */
133808Spjd      c = java_read_char (lex);
133808Spjd      if (c == 'u')
133808Spjd        {
133808Spjd	  unicode_t unicode = 0;
133808Spjd	  int shift = 12;
133808Spjd
133808Spjd	  /* Recognize any number of `u's in \u.  */
133808Spjd	  while ((c = java_read_char (lex)) == 'u')
156612Spjd	    ;
156612Spjd
156612Spjd	  shift = 12;
156612Spjd	  do
156612Spjd	    {
133808Spjd	      if (c == UEOF)
133808Spjd		{
133808Spjd		  java_lex_error ("prematurely terminated \\u sequence", 0);
133808Spjd		  return UEOF;
133808Spjd		}
133808Spjd
133808Spjd	      if (hex_p (c))
133808Spjd		unicode |= (unicode_t)(hex_value (c) << shift);
133808Spjd	      else
133808Spjd		{
133808Spjd		  java_lex_error ("non-hex digit in \\u sequence", 0);
156612Spjd		  break;
133808Spjd		}
133808Spjd
133808Spjd	      c = java_read_char (lex);
133808Spjd	      shift -= 4;
133808Spjd	    }
133808Spjd	  while (shift >= 0);
133808Spjd
133808Spjd	  if (c != UEOF)
133808Spjd	    lex->unget_value = c;
133808Spjd
133808Spjd	  lex->bs_count = 0;
133808Spjd	  *unicode_escape_p = 1;
162350Spjd	  return unicode;
133808Spjd	}
156612Spjd      lex->unget_value = c;
133808Spjd    }
133808Spjd  return (unicode_t) '\\';
133808Spjd}
133808Spjd
133808Spjdstatic int
133808Spjdjava_read_unicode_collapsing_terminators (lex, unicode_escape_p)
133808Spjd     java_lexer *lex;
139295Spjd     int *unicode_escape_p;
139295Spjd{
139295Spjd  int c = java_read_unicode (lex, unicode_escape_p);
156612Spjd
133808Spjd  if (c == '\r')
133808Spjd    {
133808Spjd      /* We have to read ahead to see if we got \r\n.  In that case we
133808Spjd	 return a single line terminator.  */
133808Spjd      int dummy;
162350Spjd      c = java_read_unicode (lex, &dummy);
156612Spjd      if (c != '\n' && c != UEOF)
133808Spjd	lex->unget_value = c;
156612Spjd      /* In either case we must return a newline.  */
133808Spjd      c = '\n';
133808Spjd    }
133808Spjd
133808Spjd  return c;
133808Spjd}
133808Spjd
133808Spjdstatic int
133808Spjdjava_get_unicode ()
133808Spjd{
133808Spjd  /* It's time to read a line when...  */
133808Spjd  if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
133808Spjd    {
133808Spjd      int c;
133808Spjd      int found_chars = 0;
133808Spjd
133808Spjd      if (ctxp->lexer->hit_eof)
133808Spjd	return UEOF;
133808Spjd
133808Spjd      java_allocate_new_line ();
156612Spjd      if (ctxp->c_line->line[0] != '\n')
133808Spjd	{
133808Spjd	  for (;;)
133808Spjd	    {
133808Spjd	      int unicode_escape_p;
133808Spjd	      c = java_read_unicode_collapsing_terminators (ctxp->lexer,
133808Spjd							    &unicode_escape_p);
133808Spjd	      if (c != UEOF)
133808Spjd		{
133808Spjd		  found_chars = 1;
133808Spjd		  java_store_unicode (ctxp->c_line, c, unicode_escape_p);
133808Spjd		  if (ctxp->c_line->white_space_only
133808Spjd		      && !JAVA_WHITE_SPACE_P (c)
156612Spjd		      && c != '\n')
133808Spjd		    ctxp->c_line->white_space_only = 0;
		}
	      if ((c == '\n') || (c == UEOF))
		break;
	    }

	  if (c == UEOF && ! found_chars)
	    {
	      ctxp->lexer->hit_eof = 1;
	      return UEOF;
	    }
	}
    }
  ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
  JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]);
  return ctxp->c_line->line [ctxp->c_line->current++];
}

/* Skip the remainder of a C style comment.  C is the first character
   following the opening '/' and '*'.  Returns once the closing star
   and slash have been consumed, or after reporting an error if the
   input ends first.  */
static void
java_parse_end_comment (c)
     int c;
{
  while (1)
    {
      if (c == UEOF)
	{
	  java_lex_error ("Comment not terminated at end of input", 0);
	  return;
	}

      if (c == '*')
	{
	  c = java_get_unicode ();
	  if (c == UEOF)
	    {
	      java_lex_error ("Comment not terminated at end of input", 0);
	      return;
	    }
	  if (c == '/')
	    return;
	  /* A second star may itself begin the terminator, so push it
	     back and let the next iteration look at it again.  */
	  if (c == '*')
	    java_unget_unicode ();
	}

      c = java_get_unicode ();
    }
}

/* Parse the start of a documentation comment, C being the first
   character after the opening slash and two stars.  Leading white
   space, stars and line terminators are skipped.  A tag is only
   looked for once a line terminator has been skipped; the only tag
   recognized is `@deprecated', which sets ctxp->deprecated.

   Returns 1 when the comment was closed while skipping (the caller
   then restarts scanning), and 0 otherwise, in which case the last
   character read has been pushed back.  */

static int
java_parse_doc_section (c)
     int c;
{
  int after_newline = 0;
  int star_pending = 0;

  for (; JAVA_WHITE_SPACE_P (c) || (c == '*') || c == '\n';
       c = java_get_unicode())
    {
      if (c == '*')
	star_pending = 1;
      else
	{
	  if (c == '\n') /* ULT */
	    after_newline = 1;
	  star_pending = 0;
	}
    }

  if (c == UEOF)
    java_lex_error ("Comment not terminated at end of input", 0);

  /* A star immediately followed by a slash closes the comment.  */
  if (star_pending && (c == '/'))
    return 1;			/* Goto step1 in caller.  */

  /* We're parsing `@deprecated'.  */
  if (after_newline && (c == '@'))
    {
      char tag [11];
      int  len;

      /* Collect at most ten characters following the `@'.  */
      for (len = 0; len < 10 && c != UEOF && c != ' ' && c != '\n'; len++)
	{
	  c = java_get_unicode ();
	  tag [len] = c;
	}

      if (c == UEOF)
	java_lex_error ("Comment not terminated at end of input", 0);
      tag [len] = '\0';

      if (strcmp (tag, "deprecated") == 0)
	ctxp->deprecated = 1;
    }
  java_unget_unicode ();
  return 0;
}

/* Return nonzero if C may begin a Java identifier.  Callers only use
   this for C >= 128 (smaller values are tested inline), but every
   value is handled.  */
static int
java_start_char_p (c)
     unicode_t c;
{
  const char *const page = type_table[c / 256];
  const unsigned long page_bits = (unsigned long) page;
  int flags;

  /* An entry with no bits set outside the two letter flags is itself
     the flag value for the whole page; any other entry is a table
     indexed by the low byte of C.  */
  if ((page_bits & ~ (LETTER_PART | LETTER_START)) == 0)
    flags = page_bits;
  else
    flags = page[c & 255];

  return flags & LETTER_START;
}

/* Return nonzero if C may appear inside a Java identifier.  Callers
   only use this for C >= 128 (smaller values are tested inline), but
   every value is handled.  */
static int
java_part_char_p (c)
     unicode_t c;
{
  const char *const page = type_table[c / 256];
  const unsigned long page_bits = (unsigned long) page;
  int flags;

  /* An entry with no bits set outside the two letter flags is itself
     the flag value for the whole page; any other entry is a table
     indexed by the low byte of C.  */
  if ((page_bits & ~ (LETTER_PART | LETTER_START)) == 0)
    flags = page_bits;
  else
    flags = page[c & 255];

  return flags & LETTER_PART;
}

/* Parse what follows the backslash of an escape sequence in a
   character or string literal and return the character it denotes.
   An unknown escape is reported and JAVA_CHAR_ERROR is returned.  */
static int
java_parse_escape_sequence ()
{
  int c = java_get_unicode ();

  switch (c)
    {
    case '0': case '1': case '2': case '3':
    case '4': case '5': case '6': case '7':
      {
	unicode_t char_lit = 0;
	int digits_seen = 0;
	int digits_allowed = 3;

	while (digits_seen < digits_allowed && RANGE (c, '0', '7'))
	  {
	    /* According to the grammar, `\477' has a well-defined
	       meaning -- it is `\47' followed by `7'.  So a leading
	       digit above 3 allows only one more digit.  */
	    if (digits_seen == 0 && c > '3')
	      digits_allowed--;

	    char_lit = (unicode_t) ((char_lit << 3) | (c - '0'));
	    digits_seen++;
	    c = java_get_unicode ();
	  }

	/* The loop always reads one character past the escape.  */
	java_unget_unicode ();
	return char_lit;
      }

    case 'b':
      return 0x08;
    case 't':
      return 0x09;
    case 'n':
      return 0x0a;
    case 'f':
      return 0x0c;
    case 'r':
      return 0x0d;
    case '"':
      return 0x22;
    case '\'':
      return 0x27;
    case '\\':
      return 0x5c;

    default:
      java_lex_error ("Invalid character in escape sequence", 0);
      return JAVA_CHAR_ERROR;
    }
}

#ifndef JC1_LITE
#define IS_ZERO(X) REAL_VALUES_EQUAL (X, dconst0)

/* Helper for java_lex: turn the text of a floating-point literal into
   a tree node.  LITERAL_TOKEN is the literal's text and JAVA_LVAL is
   where the resulting node is stored.  FFLAG is nonzero when the
   literal carried an 'f' suffix and is therefore of type 'float'
   rather than 'double'.  NUMBER_BEGINNING is the position in the
   current line at which the literal starts; it is used to place any
   diagnostic.  */

static void java_perform_atof	PARAMS ((YYSTYPE *, char *, int, int));

static void
java_perform_atof (java_lval, literal_token, fflag, number_beginning)
     YYSTYPE *java_lval;
     char *literal_token;
     int fflag;
     int number_beginning;
{
  REAL_VALUE_TYPE value;
  tree type = (fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);

  SET_REAL_VALUE_ATOF (value,
		       REAL_VALUE_ATOF (literal_token, TYPE_MODE (type)));

  if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
    {
      /* Out of range: diagnose and carry on with zero.  */
      JAVA_FLOAT_RANGE_ERROR (fflag ? "float" : "double");
      value = DCONST0;
    }
  else if (IS_ZERO (value))
    {
      /* A zero result is either a genuine zero or an underflow.  Tell
	 them apart the simple way: a genuine zero has nothing but '0'
	 and '.' in its mantissa.  */
      const char *scan = literal_token;

      if (*scan == '-')
	++scan;
      while (*scan && *scan != 'e' && *scan != 'E'
	     && (*scan == '0' || *scan == '.'))
	++scan;

      /* Stopping before the exponent or the end of the text means a
	 nonzero mantissa character was found.  */
      if (*scan && *scan != 'e' && *scan != 'E')
	{
	  /* Temporarily rewind the line position so that the error
	     is reported at the start of the literal.  */
	  int saved_position = ctxp->c_line->current;
	  ctxp->c_line->current = number_beginning;
	  java_lex_error ("Floating point literal underflow", 0);
	  ctxp->c_line->current = saved_position;
	}
    }

  SET_LVAL_NODE_TYPE (build_real (type, value), type);
}
#endif

static int yylex		PARAMS ((YYSTYPE *));

static int
#ifdef JC1_LITE
yylex (java_lval)
#else
java_lex (java_lval)
#endif
     YYSTYPE *java_lval;
{
  int c;
  unicode_t first_unicode;
  int ascii_index, all_ascii;
  char *string;

  /* Translation of the Unicode escape in the raw stream of Unicode
     characters. Takes care of line terminator.  */
 step1:
  /* Skip white spaces: SP, TAB and FF or ULT.  */
  for (c = java_get_unicode ();
       c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ())
    if (c == '\n')
      {
	ctxp->elc.line = ctxp->c_line->lineno;
	ctxp->elc.col  = ctxp->c_line->char_col-2;
      }

  ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col);

  if (c == 0x1a)		/* CTRL-Z.  */
    {
      if ((c = java_get_unicode ()) == UEOF)
	return 0;		/* Ok here.  */
      else
	java_unget_unicode ();	/* Caught later, at the end of the
                                   function.  */
    }
  /* Handle EOF here.  */
  if (c == UEOF)	/* Should probably do something here...  */
    return 0;

  /* Take care of eventual comments.  */
  if (c == '/')
    {
      switch (c = java_get_unicode ())
	{
	case '/':
	  for (;;)
	    {
	      c = java_get_unicode ();
	      if (c == UEOF)
		{
		  /* It is ok to end a `//' comment with EOF, unless
		     we're being pedantic.  */
		  if (pedantic)
		    java_lex_error ("Comment not terminated at end of input",
				    0);
		  return 0;
		}
	      if (c == '\n')	/* ULT */
		goto step1;
	    }
	  break;

	case '*':
	  if ((c = java_get_unicode ()) == '*')
	    {
	      if ((c = java_get_unicode ()) == '/')
		goto step1;	/* Empty documentation comment.  */
	      else if (java_parse_doc_section (c))
		goto step1;
	    }

	  java_parse_end_comment ((c = java_get_unicode ()));
	  goto step1;
	  break;
	default:
	  java_unget_unicode ();
	  c = '/';
	  break;
	}
    }

  ctxp->elc.line = ctxp->c_line->lineno;
  ctxp->elc.prev_col = ctxp->elc.col;
  ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1);
  if (ctxp->elc.col < 0)
    abort ();

  /* Numeric literals.  */
  if (JAVA_ASCII_DIGIT (c) || (c == '.'))
    {
      /* This section of code is borrowed from gcc/c-lex.c.  */
#define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
      int parts[TOTAL_PARTS];
      HOST_WIDE_INT high, low;
      /* End borrowed section.  */
      char literal_token [256];
      int  literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
      int  found_hex_digits = 0, found_non_octal_digits = 0;
      int  i;
#ifndef JC1_LITE
      int  number_beginning = ctxp->c_line->current;
      tree value;
#endif

      /* We might have a . separator instead of a FP like .[0-9]*.  */
      if (c == '.')
	{
	  unicode_t peep = java_sneak_unicode ();

	  if (!JAVA_ASCII_DIGIT (peep))
	    {
	      JAVA_LEX_SEP('.');
	      BUILD_OPERATOR (DOT_TK);
	    }
	}

      for (i = 0; i < TOTAL_PARTS; i++)
	parts [i] = 0;

      if (c == '0')
	{
	  c = java_get_unicode ();
	  if (c == 'x' || c == 'X')
	    {
	      radix = 16;
	      c = java_get_unicode ();
	    }
	  else if (JAVA_ASCII_DIGIT (c))
	    radix = 8;
	  else if (c == '.' || c == 'e' || c =='E')
	    {
	      /* Push the '.', 'e', or 'E' back and prepare for a FP
		 parsing...  */
	      java_unget_unicode ();
	      c = '0';
	    }
	  else
	    {
	      /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}.  */
	      JAVA_LEX_LIT ("0", 10);
              switch (c)
		{
		case 'L': case 'l':
		  SET_LVAL_NODE (long_zero_node);
		  return (INT_LIT_TK);
		case 'f': case 'F':
		  SET_LVAL_NODE (float_zero_node);
		  return (FP_LIT_TK);
		case 'd': case 'D':
		  SET_LVAL_NODE (double_zero_node);
		  return (FP_LIT_TK);
		default:
		  java_unget_unicode ();
		  SET_LVAL_NODE (integer_zero_node);
		  return (INT_LIT_TK);
		}
	    }
	}
      /* Parse the first part of the literal, until we find something
	 which is not a number.  */
      while ((radix == 16 && JAVA_ASCII_HEXDIGIT (c)) ||
	     JAVA_ASCII_DIGIT (c))
	{
	  /* We store in a string (in case it turns out to be a FP) and in
	     PARTS if we have to process a integer literal.  */
	  int numeric = hex_value (c);
	  int count;

	  /* Remember when we find a valid hexadecimal digit.  */
	  if (radix == 16)
	    found_hex_digits = 1;
          /* Remember when we find an invalid octal digit.  */
          else if (radix == 8 && !JAVA_ASCII_OCTDIGIT (c))
            found_non_octal_digits = 1;

	  literal_token [literal_index++] = c;
	  /* This section of code if borrowed from gcc/c-lex.c.  */
	  for (count = 0; count < TOTAL_PARTS; count++)
	    {
	      parts[count] *= radix;
	      if (count)
		{
		  parts[count]   += (parts[count-1] >> HOST_BITS_PER_CHAR);
		  parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
		}
	      else
		parts[0] += numeric;
	    }
	  if (parts [TOTAL_PARTS-1] != 0)
	    overflow = 1;
	  /* End borrowed section.  */
	  c = java_get_unicode ();
	}

      /* If we have something from the FP char set but not a digit, parse
	 a FP literal.  */
      if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
	{
	  int stage = 0;
	  int seen_digit = (literal_index ? 1 : 0);
	  int seen_exponent = 0;
	  int fflag = 0;	/* 1 for {f,F}, 0 for {d,D}. FP literal are
				   double unless specified.  */

	  /* It is ok if the radix is 8 because this just means we've
	     seen a leading `0'.  However, radix==16 is invalid.  */
	  if (radix == 16)
	    java_lex_error ("Can't express non-decimal FP literal", 0);
	  radix = 10;

	  for (;;)
	    {
	      if (c == '.')
		{
		  if (stage < 1)
		    {
		      stage = 1;
		      literal_token [literal_index++ ] = c;
		      c = java_get_unicode ();
		    }
		  else
		    java_lex_error ("Invalid character in FP literal", 0);
		}

	      if (c == 'e' || c == 'E')
		{
		  if (stage < 2)
		    {
		      /* {E,e} must have seen at least a digit.  */
		      if (!seen_digit)
			java_lex_error
                          ("Invalid FP literal, mantissa must have digit", 0);
		      seen_digit = 0;
		      seen_exponent = 1;
		      stage = 2;
		      literal_token [literal_index++] = c;
		      c = java_get_unicode ();
		    }
		  else
		    java_lex_error ("Invalid character in FP literal", 0);
		}
	      if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
		{
		  fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
		  stage = 4;	/* So we fall through.  */
		}

	      if ((c=='-' || c =='+') && stage == 2)
		{
		  stage = 3;
		  literal_token [literal_index++] = c;
		  c = java_get_unicode ();
		}

	      if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
		  (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
		  (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
		  (stage == 3 && JAVA_ASCII_DIGIT (c)))
		{
		  if (JAVA_ASCII_DIGIT (c))
		    seen_digit = 1;
                  if (stage == 2)
                    stage = 3;
		  literal_token [literal_index++ ] = c;
		  c = java_get_unicode ();
		}
	      else
		{
		  if (stage != 4) /* Don't push back fF/dD.  */
		    java_unget_unicode ();

		  /* An exponent (if any) must have seen a digit.  */
		  if (seen_exponent && !seen_digit)
		    java_lex_error
                      ("Invalid FP literal, exponent must have digit", 0);

		  literal_token [literal_index] = '\0';
		  JAVA_LEX_LIT (literal_token, radix);

#ifndef JC1_LITE
		  java_perform_atof (java_lval, literal_token,
				     fflag, number_beginning);
#endif
		  return FP_LIT_TK;
		}
	    }
	} /* JAVA_ASCII_FPCHAR (c) */

      /* Here we get back to converting the integral literal.  */
      if (radix == 16 && ! found_hex_digits)
	java_lex_error
	  ("0x must be followed by at least one hexadecimal digit", 0);
      else if (radix == 8 && found_non_octal_digits)
	java_lex_error ("Octal literal contains digit out of range", 0);
      else if (c == 'L' || c == 'l')
	long_suffix = 1;
      else
	java_unget_unicode ();

#ifdef JAVA_LEX_DEBUG
      literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe.  */
      JAVA_LEX_LIT (literal_token, radix);
#endif
      /* This section of code is borrowed from gcc/c-lex.c.  */
      if (!overflow)
	{
	  bytes = GET_TYPE_PRECISION (long_type_node);
	  for (i = bytes; i < TOTAL_PARTS; i++)
	    if (parts [i])
	      {
	        overflow = 1;
		break;
	      }
	}
      high = low = 0;
      for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
	{
	  high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
					      / HOST_BITS_PER_CHAR)]
		   << (i * HOST_BITS_PER_CHAR));
	  low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
	}
      /* End borrowed section.  */

#ifndef JC1_LITE
      /* Range checking.  */
      value = build_int_2 (low, high);
      /* Temporarily set type to unsigned.  */
      SET_LVAL_NODE_TYPE (value, (long_suffix
				  ? unsigned_long_type_node
				  : unsigned_int_type_node));

      /* For base 10 numbers, only values up to the highest value
	 (plus one) can be written.  For instance, only ints up to
	 2147483648 can be written.  The special case of the largest
	 negative value is handled elsewhere.  For other bases, any
	 number can be represented.  */
      if (overflow || (radix == 10
		       && tree_int_cst_lt (long_suffix
					   ? decimal_long_max
					   : decimal_int_max,
					   value)))
	{
	  if (long_suffix)
	    JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
	  else
	    JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
	}

      /* Sign extend the value.  */
      SET_LVAL_NODE_TYPE (value, (long_suffix ? long_type_node : int_type_node));
      force_fit_type (value, 0);
      JAVA_RADIX10_FLAG (value) = radix == 10;
#else
      SET_LVAL_NODE_TYPE (build_int_2 (low, high),
			  long_suffix ? long_type_node : int_type_node);
#endif
      return INT_LIT_TK;
    }

  /* Character literals.  */
  if (c == '\'')
    {
      int char_lit;
      if ((c = java_get_unicode ()) == '\\')
	char_lit = java_parse_escape_sequence ();
      else
	{
	  if (c == '\n' || c == '\'')
	    java_lex_error ("Invalid character literal", 0);
	  char_lit = c;
	}

      c = java_get_unicode ();

      if ((c == '\n') || (c == UEOF))
	java_lex_error ("Character literal not terminated at end of line", 0);
      if (c != '\'')
	java_lex_error ("Syntax error in character literal", 0);

      if (char_lit == JAVA_CHAR_ERROR)
        char_lit = 0;		/* We silently convert it to zero.  */

      JAVA_LEX_CHAR_LIT (char_lit);
      SET_LVAL_NODE_TYPE (build_int_2 (char_lit, 0), char_type_node);
      return CHAR_LIT_TK;
    }

  /* String literals.  */
  if (c == '"')
    {
      int no_error;
      char *string;

      for (no_error = 1, c = java_get_unicode ();
	   c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
	{
	  if (c == '\\')
	    c = java_parse_escape_sequence ();
	  if (c == JAVA_CHAR_ERROR)
	    {
	      no_error = 0;
	      c = 0;		/* We silently convert it to zero.  */
	    }
	  java_unicode_2_utf8 (c);
	}
      if (c == '\n' || c == UEOF) /* ULT.  */
	{
	  lineno--;	/* Refer to the line where the terminator was seen.  */
	  java_lex_error ("String not terminated at end of line", 0);
	  lineno++;
	}

      obstack_1grow (&temporary_obstack, '\0');
      string = obstack_finish (&temporary_obstack);
#ifndef JC1_LITE
      if (!no_error || (c != '"'))
	java_lval->node = error_mark_node; /* FIXME: Requires futher
                                              testing.  */
      else
	java_lval->node = build_string (strlen (string), string);
#endif
      obstack_free (&temporary_obstack, string);
      return STRING_LIT_TK;
    }

  /* Separator.  */
  switch (c)
    {
    case '(':
      JAVA_LEX_SEP (c);
      BUILD_OPERATOR (OP_TK);
    case ')':
      JAVA_LEX_SEP (c);
      return CP_TK;
    case '{':
      JAVA_LEX_SEP (c);
      if (ctxp->ccb_indent == 1)
	ctxp->first_ccb_indent1 = lineno;
      ctxp->ccb_indent++;
      BUILD_OPERATOR (OCB_TK);
    case '}':
      JAVA_LEX_SEP (c);
      ctxp->ccb_indent--;
      if (ctxp->ccb_indent == 1)
        ctxp->last_ccb_indent1 = lineno;
      BUILD_OPERATOR (CCB_TK);
    case '[':
      JAVA_LEX_SEP (c);
      BUILD_OPERATOR (OSB_TK);
    case ']':
      JAVA_LEX_SEP (c);
      return CSB_TK;
    case ';':
      JAVA_LEX_SEP (c);
      return SC_TK;
    case ',':
      JAVA_LEX_SEP (c);
      return C_TK;
    case '.':
      JAVA_LEX_SEP (c);
      BUILD_OPERATOR (DOT_TK);
      /*      return DOT_TK; */
    }

  /* Operators.  */
  switch (c)
    {
    case '=':
      if ((c = java_get_unicode ()) == '=')
	{
	  BUILD_OPERATOR (EQ_TK);
	}
      else
	{
	  /* Equals is used in two different locations. In the
	     variable_declarator: rule, it has to be seen as '=' as opposed
	     to being seen as an ordinary assignment operator in
	     assignment_operators: rule.  */
	  java_unget_unicode ();
	  BUILD_OPERATOR (ASSIGN_TK);
	}

    case '>':
      switch ((c = java_get_unicode ()))
	{
	case '=':
	  BUILD_OPERATOR (GTE_TK);
	case '>':
	  switch ((c = java_get_unicode ()))
	    {
	    case '>':
	      if ((c = java_get_unicode ()) == '=')
		{
		  BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
		}
	      else
		{
		  java_unget_unicode ();
		  BUILD_OPERATOR (ZRS_TK);
		}
	    case '=':
	      BUILD_OPERATOR2 (SRS_ASSIGN_TK);
	    default:
	      java_unget_unicode ();
	      BUILD_OPERATOR (SRS_TK);
	    }
	default:
	  java_unget_unicode ();
	  BUILD_OPERATOR (GT_TK);
	}

    case '<':
      switch ((c = java_get_unicode ()))
	{
	case '=':
	  BUILD_OPERATOR (LTE_TK);
	case '<':
	  if ((c = java_get_unicode ()) == '=')
	    {
	      BUILD_OPERATOR2 (LS_ASSIGN_TK);
	    }
	  else
	    {
	      java_unget_unicode ();
	      BUILD_OPERATOR (LS_TK);
	    }
	default:
	  java_unget_unicode ();
	  BUILD_OPERATOR (LT_TK);
	}

    case '&':
      switch ((c = java_get_unicode ()))
	{
	case '&':
	  BUILD_OPERATOR (BOOL_AND_TK);
	case '=':
	  BUILD_OPERATOR2 (AND_ASSIGN_TK);
	default:
	  java_unget_unicode ();
	  BUILD_OPERATOR (AND_TK);
	}

    case '|':
      switch ((c = java_get_unicode ()))
	{
	case '|':
	  BUILD_OPERATOR (BOOL_OR_TK);
	case '=':
	  BUILD_OPERATOR2 (OR_ASSIGN_TK);
	default:
	  java_unget_unicode ();
	  BUILD_OPERATOR (OR_TK);
	}

    case '+':
      switch ((c = java_get_unicode ()))
	{
	case '+':
	  BUILD_OPERATOR (INCR_TK);
	case '=':
	  BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
	default:
	  java_unget_unicode ();
	  BUILD_OPERATOR (PLUS_TK);
	}

    case '-':
      switch ((c = java_get_unicode ()))
	{
	case '-':
	  BUILD_OPERATOR (DECR_TK);
	case '=':
	  BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
	default:
	  java_unget_unicode ();
	  BUILD_OPERATOR (MINUS_TK);
	}

    case '*':
      if ((c = java_get_unicode ()) == '=')
	{
	  BUILD_OPERATOR2 (MULT_ASSIGN_TK);
	}
      else
	{
	  java_unget_unicode ();
	  BUILD_OPERATOR (MULT_TK);
	}

    case '/':
      if ((c = java_get_unicode ()) == '=')
	{
	  BUILD_OPERATOR2 (DIV_ASSIGN_TK);
	}
      else
	{
	  java_unget_unicode ();
	  BUILD_OPERATOR (DIV_TK);
	}

    case '^':
      if ((c = java_get_unicode ()) == '=')
	{
	  BUILD_OPERATOR2 (XOR_ASSIGN_TK);
	}
      else
	{
	  java_unget_unicode ();
	  BUILD_OPERATOR (XOR_TK);
	}

    case '%':
      if ((c = java_get_unicode ()) == '=')
	{
	  BUILD_OPERATOR2 (REM_ASSIGN_TK);
	}
      else
	{
	  java_unget_unicode ();
	  BUILD_OPERATOR (REM_TK);
	}

    case '!':
      if ((c = java_get_unicode()) == '=')
	{
	  BUILD_OPERATOR (NEQ_TK);
	}
      else
	{
	  java_unget_unicode ();
	  BUILD_OPERATOR (NEG_TK);
	}

    case '?':
      JAVA_LEX_OP ("?");
      BUILD_OPERATOR (REL_QM_TK);
    case ':':
      JAVA_LEX_OP (":");
      BUILD_OPERATOR (REL_CL_TK);
    case '~':
      BUILD_OPERATOR (NOT_TK);
    }

  /* Keyword, boolean literal or null literal.  */
  for (first_unicode = c, all_ascii = 1, ascii_index = 0;
       c != UEOF && JAVA_PART_CHAR_P (c); c = java_get_unicode ())
    {
      java_unicode_2_utf8 (c);
      if (all_ascii && c >= 128)
        all_ascii = 0;
      ascii_index++;
    }

  obstack_1grow (&temporary_obstack, '\0');
  string = obstack_finish (&temporary_obstack);
  if (c != UEOF)
    java_unget_unicode ();

  /* If we have something all ascii, we consider a keyword, a boolean
     literal, a null literal or an all ASCII identifier.  Otherwise,
     this is an identifier (possibly not respecting formation rule).  */
  if (all_ascii)
    {
      const struct java_keyword *kw;
      if ((kw=java_keyword (string, ascii_index)))
	{
	  JAVA_LEX_KW (string);
	  switch (kw->token)
	    {
	    case PUBLIC_TK:       case PROTECTED_TK: case STATIC_TK:
	    case ABSTRACT_TK:     case FINAL_TK:     case NATIVE_TK:
	    case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
	    case PRIVATE_TK:      case STRICT_TK:
	      SET_MODIFIER_CTX (kw->token);
	      return MODIFIER_TK;
	    case FLOAT_TK:
	      SET_LVAL_NODE (float_type_node);
	      return FP_TK;
	    case DOUBLE_TK:
	      SET_LVAL_NODE (double_type_node);
	      return FP_TK;
	    case BOOLEAN_TK:
	      SET_LVAL_NODE (boolean_type_node);
	      return BOOLEAN_TK;
	    case BYTE_TK:
	      SET_LVAL_NODE (byte_type_node);
	      return INTEGRAL_TK;
	    case SHORT_TK:
	      SET_LVAL_NODE (short_type_node);
	      return INTEGRAL_TK;
	    case INT_TK:
	      SET_LVAL_NODE (int_type_node);
	      return INTEGRAL_TK;
	    case LONG_TK:
	      SET_LVAL_NODE (long_type_node);
	      return INTEGRAL_TK;
	    case CHAR_TK:
	      SET_LVAL_NODE (char_type_node);
	      return INTEGRAL_TK;

	      /* Keyword based literals.  */
	    case TRUE_TK:
	    case FALSE_TK:
	      SET_LVAL_NODE ((kw->token == TRUE_TK ?
			      boolean_true_node : boolean_false_node));
	      return BOOL_LIT_TK;
	    case NULL_TK:
	      SET_LVAL_NODE (null_pointer_node);
	      return NULL_TK;

	    case ASSERT_TK:
	      if (flag_assert)
		{
		  BUILD_OPERATOR (kw->token);
		  return kw->token;
		}
	      else
		break;

	      /* Some keyword we want to retain information on the location
		 they where found.  */
	    case CASE_TK:
	    case DEFAULT_TK:
	    case SUPER_TK:
	    case THIS_TK:
	    case RETURN_TK:
	    case BREAK_TK:
	    case CONTINUE_TK:
	    case TRY_TK:
	    case CATCH_TK:
	    case THROW_TK:
	    case INSTANCEOF_TK:
	      BUILD_OPERATOR (kw->token);

	    default:
	      return kw->token;
	    }
	}
    }

  /* We may have an ID here.  */
  if (JAVA_START_CHAR_P (first_unicode))
    {
      JAVA_LEX_ID (string);
      java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
      return ID_TK;
    }

  /* Everything else is an invalid character in the input.  */
  {
    char lex_error_buffer [128];
    sprintf (lex_error_buffer, "Invalid character `%s' in input",
	     java_sprint_unicode (ctxp->c_line, ctxp->c_line->current));
    java_lex_error (lex_error_buffer, 1);
  }
  return 0;
}

#ifndef JC1_LITE
/* Parser hook for the one overflow case the lexer cannot decide by
   itself: the largest negative value written without a leading `-'.
   An error is raised only when VALUE is an INTEGER_CST that carries the
   radix-10 flag and whose sign is negative; the message names `long'
   when VALUE has type long_type_node and `int' otherwise.  Any other
   VALUE is silently accepted.  */
static void
error_if_numeric_overflow (value)
     tree value;
{
  const char *message;

  /* Bail out as soon as one of the three conditions fails, testing
     them in the same order as a short-circuit `&&' chain would.  */
  if (TREE_CODE (value) != INTEGER_CST)
    return;
  if (!JAVA_RADIX10_FLAG (value))
    return;
  if (tree_int_cst_sgn (value) >= 0)
    return;

  if (TREE_TYPE (value) == long_type_node)
    message = "Numeric overflow for `long' literal";
  else
    message = "Numeric overflow for `int' literal";

  java_lex_error (message, 0);
}
#endif /* JC1_LITE */

/* Append the UTF-8 encoding of UNICODE to the object currently growing
   on temporary_obstack.  Values 0x01-0x7f take one byte, values
   0x80-0x7ff take two, and everything else takes three.  The value 0 is
   deliberately emitted in the two-byte form (0xc0 0x80) rather than as
   a single NUL byte.  */
static void
java_unicode_2_utf8 (unicode)
    unicode_t unicode;
{
  unsigned char lead, middle, tail;

  /* One byte: the value itself.  */
  if (RANGE (unicode, 0x01, 0x7f))
    {
      obstack_1grow (&temporary_obstack, (char)unicode);
      return;
    }

  /* Both multi-byte forms end with the low six bits.  */
  tail = (unsigned char)(0x80 | (unicode & 0x3f));

  /* Two bytes: 110xxxxx 10xxxxxx.  */
  if (unicode == 0 || RANGE (unicode, 0x80, 0x7ff))
    {
      lead = (unsigned char)(0xc0 | ((unicode & 0x7c0) >> 6));
      obstack_1grow (&temporary_obstack, lead);
      obstack_1grow (&temporary_obstack, tail);
      return;
    }

  /* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx (range 0x800-0xffff).  */
  lead = (unsigned char)(0xe0 | ((unicode & 0xf000) >> 12));
  middle = (unsigned char)(0x80 | ((unicode & 0x0fc0) >> 6));
  obstack_1grow (&temporary_obstack, lead);
  obstack_1grow (&temporary_obstack, middle);
  obstack_1grow (&temporary_obstack, tail);
}

#ifndef JC1_LITE
/* Wrap NODE in an expression-with-file-location node that records the
   current file name together with the line and column stored in
   ctxp->elc, clear the wrapper's type, and return the wrapper.  */
static tree
build_wfl_node (node)
     tree node;
{
  tree wfl = build_expr_wfl (node, ctxp->filename,
                             ctxp->elc.line, ctxp->elc.col);

  /* A null type keeps java_complete_lhs from short-circuiting the node
     when it happens to be a constant.  */
  TREE_TYPE (wfl) = NULL_TREE;
  return wfl;
}
#endif

static void
java_lex_error (msg, forward)
     const char *msg ATTRIBUTE_UNUSED;
     int forward ATTRIBUTE_UNUSED;
{
#ifndef JC1_LITE
  ctxp->elc.line = ctxp->c_line->lineno;
  ctxp->elc.col = ctxp->c_line->char_col-1+forward;

  /* Might be caught in the middle of some error report.  */
  ctxp->java_error_flag = 0;
  java_error (NULL);
  java_error (msg);
#endif
}

#ifndef JC1_LITE
/* Return 1 when C, a character just read from FP, terminates a line and
   0 otherwise.  A line feed is a terminator by itself.  A carriage
   return is one too, and a line feed immediately following it is
   consumed so that CR LF counts as a single terminator; any other
   following character is pushed back onto FP.  */
static int
java_is_eol (fp, c)
  FILE *fp;
  int c;
{
  if (c == '\n')
    return 1;
  if (c != '\r')
    return 0;

  /* C is a carriage return: peek at what follows it.  */
  {
    int lookahead = getc (fp);

    if (lookahead != EOF && lookahead != '\n')
      ungetc (lookahead, fp);
  }
  return 1;
}
#endif

/* Build, on temporary_obstack, the text of line LINE of FILENAME
   followed by a newline, a run of padding and a `^' marker, and return
   the finished NUL-terminated object.

   COL selects where the marker goes:
     COL == -1  the end of the line (the number of characters gathered);
     COL == -2  the first non-space column recorded while reading;
     otherwise  COL itself.
   The padding is COL + 3 characters wide.  When COL was -2 and the
   recorded column is nonzero, a padding position whose counterpart in
   the buffer is a TAB is emitted as a TAB; all other padding is made of
   spaces.

   If the file ends before LINE is reached, a fixed placeholder message
   stands in for the line text.  A file that cannot be opened is
   reported through fatal_io_error.  When JC1_LITE is defined the
   function does nothing and returns 0.  */
char *
java_get_line_col (filename, line, col)
     const char *filename ATTRIBUTE_UNUSED;
     int line ATTRIBUTE_UNUSED, col ATTRIBUTE_UNUSED;
{
#ifdef JC1_LITE
  return 0;
#else
  /* Dumb implementation. Doesn't try to cache or optimize things.  */
  /* First line of the file is line 1, first column is 1.  */

  /* COL == -1 means, at the CR/LF in LINE.  */
  /* COL == -2 means, at the first non space char in LINE.  */

  FILE *fp;
  int c, ccol, cline = 1;
  int current_line_col = 0;
  int first_non_space = 0;

  /* NOTE(review): fp is used unchecked below, so this relies on
     fatal_io_error not returning -- confirm.  */
  if (!(fp = fopen (filename, "r")))
    fatal_io_error ("can't open %s", filename);

  /* Skip the lines that precede the one we were asked for.  */
  while (cline != line)
    {
      c = getc (fp);
      if (c == EOF)
        {
          static const char msg[] = "<<file too short - unexpected EOF>>";
          obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
          goto have_line;
        }
      if (java_is_eol (fp, c))
        cline++;
    }

  /* Gather the chars of the current line in a buffer.  */
  for (;;)
    {
      c = getc (fp);
      if (c < 0 || java_is_eol (fp, c))
        break;
      /* NOTE(review): 0 doubles as "nothing recorded yet", so when the
         very first character of the line is not white space this test
         fires again on the next non-space character and records that
         column instead.  Behavior kept as is.  */
      if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
        first_non_space = current_line_col;
      obstack_1grow (&temporary_obstack, c);
      current_line_col++;
    }
 have_line:

  obstack_1grow (&temporary_obstack, '\n');

  if (col == -1)
    {
      col = current_line_col;
      first_non_space = 0;
    }
  else if (col == -2)
    col = first_non_space;
  else
    first_non_space = 0;

  /* Place the '^' at the right position.  */
  for (ccol = 1; ccol <= col+3; ccol++)
    {
      char pad = ' ';

      /* Compute \t when reaching first_non_space.  */
      if (first_non_space)
        {
          /* The object is still growing, and growing it may move it to
             a new chunk.  Fetch its base address afresh on every pass
             instead of reusing a pointer taken before the loop, which
             could be left dangling by the obstack_1grow below.  */
          const char *base = obstack_base (&temporary_obstack);

          if (base [ccol-1] == '\t')
            pad = '\t';
        }
      obstack_1grow (&temporary_obstack, pad);
    }
  obstack_grow0 (&temporary_obstack, "^", 1);

  fclose (fp);
  return obstack_finish (&temporary_obstack);
#endif
}

#ifndef JC1_LITE
/* Compare the LENGTH bytes of UTF-8 text starting at STR against the
   NUL-terminated string NAME, one character decoded by UTF8_GET per
   character of NAME.  At the first mismatch, return the decoded
   character minus the NAME character.  If all of NAME matched, return 0
   when STR was consumed exactly up to its limit and 1 otherwise.  */
static int
utf8_cmp (str, length, name)
     const unsigned char *str;
     int length;
     const char *name;
{
  const unsigned char *const end = str + length;
  const char *expected;

  for (expected = name; *expected != '\0'; expected++)
    {
      /* UTF8_GET moves STR past the character it decodes.  */
      int decoded = UTF8_GET (str, end);
      int diff = decoded - *expected;

      if (diff != 0)
        return diff;
    }

  /* NAME is exhausted; anything left over in STR is not a match.  */
  if (str == end)
    return 0;
  return 1;
}

/* A sorted list of all C++ keywords, including the `__'-prefixed
   spellings.  The entries are in ascending byte order (`_Complex'
   first, then the `__' names, then the lowercase names) because
   cxx_keyword_p binary-searches this table; keep that order when adding
   an entry.  */

static const char *const cxx_keywords[] =
{
  "_Complex",
  "__alignof",
  "__alignof__",
  "__asm",
  "__asm__",
  "__attribute",
  "__attribute__",
  "__builtin_va_arg",
  "__complex",
  "__complex__",
  "__const",
  "__const__",
  "__extension__",
  "__imag",
  "__imag__",
  "__inline",
  "__inline__",
  "__label__",
  "__null",
  "__real",
  "__real__",
  "__restrict",
  "__restrict__",
  "__signed",
  "__signed__",
  "__typeof",
  "__typeof__",
  "__volatile",
  "__volatile__",
  "and",
  "and_eq",
  "asm",
  "auto",
  "bitand",
  "bitor",
  "bool",
  "break",
  "case",
  "catch",
  "char",
  "class",
  "compl",
  "const",
  "const_cast",
  "continue",
  "default",
  "delete",
  "do",
  "double",
  "dynamic_cast",
  "else",
  "enum",
  "explicit",
  "export",
  "extern",
  "false",
  "float",
  "for",
  "friend",
  "goto",
  "if",
  "inline",
  "int",
  "long",
  "mutable",
  "namespace",
  "new",
  "not",
  "not_eq",
  "operator",
  "or",
  "or_eq",
  "private",
  "protected",
  "public",
  "register",
  "reinterpret_cast",
  "return",
  "short",
  "signed",
  "sizeof",
  "static",
  "static_cast",
  "struct",
  "switch",
  "template",
  "this",
  "throw",
  "true",
  "try",
  "typedef",
  "typeid",
  "typename",
  "typeof",
  "union",
  "unsigned",
  "using",
  "virtual",
  "void",
  "volatile",
  "wchar_t",
  "while",
  "xor",
  "xor_eq"
};

/* Return 1 if the LENGTH bytes at NAME spell a C++ keyword, optionally
   followed by any number of `$' characters, and 0 otherwise.  The
   sorted cxx_keywords table is searched by bisection.  */

int
cxx_keyword_p (name, length)
     const char *name;
     int length;
{
  int lo = 0;
  int hi = ARRAY_SIZE (cxx_keywords);
  int prev_probe = -1;
  int probe;

  /* Bisect until the probe index stops moving.  */
  while ((probe = (lo + hi) / 2) != prev_probe)
    {
      const char *keyword = cxx_keywords[probe];
      int keyword_len = strlen (keyword);
      /* Never hand utf8_cmp more bytes of NAME than the keyword has.  */
      int cmp_len = length < keyword_len ? length : keyword_len;
      int order = utf8_cmp (name, cmp_len, keyword);

      if (order == 0)
        {
          /* NAME starts with the keyword.  It is a match only when
             whatever follows consists solely of `$' characters.  */
          int pos = cmp_len;

          while (pos < length && name[pos] == '$')
            pos++;
          if (pos == length)
            return 1;

          /* Otherwise NAME is longer than, hence sorts after, the
             keyword.  */
          order = 1;
        }

      if (order < 0)
        hi = probe;
      else
        lo = probe;

      prev_probe = probe;
    }

  return 0;
}
#endif /* JC1_LITE */