app.c revision 77298
1/* This is the Assembler Pre-Processor
2   Copyright (C) 1987, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 2000
3   Free Software Foundation, Inc.
4
5   This file is part of GAS, the GNU Assembler.
6
7   GAS is free software; you can redistribute it and/or modify
8   it under the terms of the GNU General Public License as published by
9   the Free Software Foundation; either version 2, or (at your option)
10   any later version.
11
12   GAS is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   GNU General Public License for more details.
16
17   You should have received a copy of the GNU General Public License
18   along with GAS; see the file COPYING.  If not, write to the Free
19   Software Foundation, 59 Temple Place - Suite 330, Boston, MA
20   02111-1307, USA.  */
21
22/* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90 */
23/* App, the assembler pre-processor.  This pre-processor strips out excess
24   spaces, turns single-quoted characters into a decimal constant, and turns
25   # <number> <filename> <garbage> into a .line <number>\n.file <filename>
26   pair.  This needs better error-handling.  */
27
28#include <stdio.h>
29#include "as.h"			/* For BAD_CASE() only */
30
31#if (__STDC__ != 1)
32#ifndef const
33#define const  /* empty */
34#endif
35#endif
36
37#ifdef TC_M68K
38/* Whether we are scrubbing in m68k MRI mode.  This is different from
39   flag_m68k_mri, because the two flags will be affected by the .mri
40   pseudo-op at different times.  */
41static int scrub_m68k_mri;
42#else
/* On other targets the name is the constant 0, so the tests on it in
   this file need no conditional compilation of their own.  */
43#define scrub_m68k_mri 0
44#endif
45
46/* The pseudo-op which switches in and out of MRI mode.  See the
47   comment in do_scrub_chars.  */
/* The recognizer in do_scrub_chars accepts any whitespace where this
   template has a space, and a '1' where it has the '0', so the one
   string matches both ``.mri 0'' and ``.mri 1''.  */
48static const char mri_pseudo[] = ".mri 0";
49
50#if defined TC_ARM && defined OBJ_ELF
51/* The pseudo-op for which we need to special-case `@' characters.
52   See the comment in do_scrub_chars.  */
53static const char   symver_pseudo[] = ".symver";
/* Position within symver_pseudo of the next character to match, or
   NULL when no match is in progress.  When it points at the
   terminating NUL the whole pseudo-op has been seen on the current
   line.  */
54static const char * symver_state;
55#endif
56
/* Character classification table, indexed by unsigned character value.
   Each entry holds one of the LEX_IS_* codes below, or 0 for a
   character with no special class.  It is filled in by
   do_scrub_begin.  */
57static char lex[256];
/* Characters that do_scrub_begin marks as symbol components by
   default.  The comment and line separator characters are applied
   afterwards and override these entries.  */
58static const char symbol_chars[] =
59"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
60
/* The character classes stored in lex[].  The value 7 is not used.  */
61#define LEX_IS_SYMBOL_COMPONENT		1
62#define LEX_IS_WHITESPACE		2
63#define LEX_IS_LINE_SEPARATOR		3
64#define LEX_IS_COMMENT_START		4
65#define LEX_IS_LINE_COMMENT_START	5
66#define	LEX_IS_TWOCHAR_COMMENT_1ST	6
67#define	LEX_IS_STRINGQUOTE		8
68#define	LEX_IS_COLON			9
69#define	LEX_IS_NEWLINE			10
70#define	LEX_IS_ONECHAR_QUOTE		11
/* First character of a possible "--" comment; v850 only.  */
71#ifdef TC_V850
72#define LEX_IS_DOUBLEDASH_1ST		12
73#endif
/* The m32r always enables the "||" handling.  */
74#ifdef TC_M32R
75#define DOUBLEBAR_PARALLEL
76#endif
/* First character of a possible "||" separator.  */
77#ifdef DOUBLEBAR_PARALLEL
78#define LEX_IS_DOUBLEBAR_1ST		13
79#endif
/* Predicates on the class of character C.  C is used directly as an
   index into lex[], so it must be a value in the range 0..255; in
   particular callers must rule out EOF before using these.  */
80#define IS_SYMBOL_COMPONENT(c)		(lex[c] == LEX_IS_SYMBOL_COMPONENT)
81#define IS_WHITESPACE(c)		(lex[c] == LEX_IS_WHITESPACE)
82#define IS_LINE_SEPARATOR(c)		(lex[c] == LEX_IS_LINE_SEPARATOR)
83#define IS_COMMENT(c)			(lex[c] == LEX_IS_COMMENT_START)
84#define IS_LINE_COMMENT(c)		(lex[c] == LEX_IS_LINE_COMMENT_START)
85#define	IS_NEWLINE(c)			(lex[c] == LEX_IS_NEWLINE)
86
/* Maps the character after a backslash in a one-character quote to the
   character it stands for; defined further down in this file.  */
87static int process_escape PARAMS ((int));
88
89/* FIXME-soon: The entire lexer/parser thingy should be
90   built statically at compile time rather than dynamically
91   each and every time the assembler is run.  xoxorich.  */
92
93void
94do_scrub_begin (m68k_mri)
95     int m68k_mri ATTRIBUTE_UNUSED;
96{
97  const char *p;
98  int c;
99
100  lex[' '] = LEX_IS_WHITESPACE;
101  lex['\t'] = LEX_IS_WHITESPACE;
102  lex['\r'] = LEX_IS_WHITESPACE;
103  lex['\n'] = LEX_IS_NEWLINE;
104  lex[':'] = LEX_IS_COLON;
105
106#ifdef TC_M68K
107  scrub_m68k_mri = m68k_mri;
108
109  if (! m68k_mri)
110#endif
111    {
112      lex['"'] = LEX_IS_STRINGQUOTE;
113
114#if ! defined (TC_HPPA) && ! defined (TC_I370)
115      /* I370 uses single-quotes to delimit integer, float constants */
116      lex['\''] = LEX_IS_ONECHAR_QUOTE;
117#endif
118
119#ifdef SINGLE_QUOTE_STRINGS
120      lex['\''] = LEX_IS_STRINGQUOTE;
121#endif
122    }
123
124  /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
125     in state 5 of do_scrub_chars must be changed.  */
126
127  /* Note that these override the previous defaults, e.g. if ';' is a
128     comment char, then it isn't a line separator.  */
129  for (p = symbol_chars; *p; ++p)
130    {
131      lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
132    }				/* declare symbol characters */
133
134  for (c = 128; c < 256; ++c)
135    lex[c] = LEX_IS_SYMBOL_COMPONENT;
136
137#ifdef tc_symbol_chars
138  /* This macro permits the processor to specify all characters which
139     may appears in an operand.  This will prevent the scrubber from
140     discarding meaningful whitespace in certain cases.  The i386
141     backend uses this to support prefixes, which can confuse the
142     scrubber as to whether it is parsing operands or opcodes.  */
143  for (p = tc_symbol_chars; *p; ++p)
144    lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
145#endif
146
147  /* The m68k backend wants to be able to change comment_chars.  */
148#ifndef tc_comment_chars
149#define tc_comment_chars comment_chars
150#endif
151  for (p = tc_comment_chars; *p; p++)
152    {
153      lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
154    }				/* declare comment chars */
155
156  for (p = line_comment_chars; *p; p++)
157    {
158      lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
159    }				/* declare line comment chars */
160
161  for (p = line_separator_chars; *p; p++)
162    {
163      lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
164    }				/* declare line separators */
165
166  /* Only allow slash-star comments if slash is not in use.
167     FIXME: This isn't right.  We should always permit them.  */
168  if (lex['/'] == 0)
169    {
170      lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
171    }
172
173#ifdef TC_M68K
174  if (m68k_mri)
175    {
176      lex['\''] = LEX_IS_STRINGQUOTE;
177      lex[';'] = LEX_IS_COMMENT_START;
178      lex['*'] = LEX_IS_LINE_COMMENT_START;
179      /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
180         then it can't be used in an expression.  */
181      lex['!'] = LEX_IS_LINE_COMMENT_START;
182    }
183#endif
184
185#ifdef TC_V850
186  lex['-'] = LEX_IS_DOUBLEDASH_1ST;
187#endif
188#ifdef DOUBLEBAR_PARALLEL
189  lex['|'] = LEX_IS_DOUBLEBAR_1ST;
190#endif
191#ifdef TC_D30V
192  /* must do this is we want VLIW instruction with "->" or "<-" */
193  lex['-'] = LEX_IS_SYMBOL_COMPONENT;
194#endif
195}				/* do_scrub_begin() */
196
197/* Saved state of the scrubber */
/* These persist between calls to do_scrub_chars so that it can return
   at any point and pick up where it left off.  app_push and app_pop
   save and restore them around an included file.  */

/* Current state of the state machine in do_scrub_chars; the meaning of
   each value is listed at the top of that function.  */
198static int state;
/* State to go back to once one of the temporary states (-1, -2, 5) has
   finished.  */
199static int old_state;
/* While in state -1, the next character of the text being emitted.  */
200static char *out_string;
/* Holds the decimal text of a one-character quote constant while it is
   emitted through out_string.  */
201static char out_buf[20];
/* Number of newlines swallowed inside comments or continued strings
   which are still owed to the output; each newline seen later is
   emitted one extra time until the count reaches zero.  */
202static int add_newlines;
/* When non-NULL, input which do_scrub_chars has not consumed yet and
   resumes from on its next call, and the length of that input.  */
203static char *saved_input;
204static int saved_input_len;
/* Buffer handed to the GET callback to be filled with raw input.  */
205static char input_buffer[32 * 1024];
/* Position within mri_pseudo of the next character to match, or NULL
   when no match is in progress.  */
206static const char *mri_state;
/* The last character matched against mri_pseudo; once the whole
   pseudo-op has matched this is its '0' or '1' argument.  */
207static char mri_last_ch;
208
209/* Data structure for saving the state of app across #include's.  Note that
210   app is called asynchronously to the parsing of the .include's, so our
211   state at the time .include is interpreted is completely unrelated.
212   That's why we have to save it all.  */
213
/* Each member holds the value of the file-scope variable of the same
   name.  app_push fills the structure in and app_pop copies it back.  */
214struct app_save {
215  int          state;
216  int          old_state;
217  char *       out_string;
218  char         out_buf[sizeof (out_buf)];
219  int          add_newlines;
  /* A heap copy of the pending input made by app_push, or NULL if there
     was none; app_pop frees it.  saved_input_len is only read by
     app_pop when saved_input is non-NULL.  */
220  char *       saved_input;
221  int          saved_input_len;
222#ifdef TC_M68K
223  int          scrub_m68k_mri;
224#endif
225  const char * mri_state;
226  char         mri_last_ch;
227#if defined TC_ARM && defined OBJ_ELF
228  const char * symver_state;
229#endif
230};
231
232char *
233app_push ()
234{
235  register struct app_save *saved;
236
237  saved = (struct app_save *) xmalloc (sizeof (*saved));
238  saved->state = state;
239  saved->old_state = old_state;
240  saved->out_string = out_string;
241  memcpy (saved->out_buf, out_buf, sizeof (out_buf));
242  saved->add_newlines = add_newlines;
243  if (saved_input == NULL)
244    saved->saved_input = NULL;
245  else
246    {
247      saved->saved_input = xmalloc (saved_input_len);
248      memcpy (saved->saved_input, saved_input, saved_input_len);
249      saved->saved_input_len = saved_input_len;
250    }
251#ifdef TC_M68K
252  saved->scrub_m68k_mri = scrub_m68k_mri;
253#endif
254  saved->mri_state = mri_state;
255  saved->mri_last_ch = mri_last_ch;
256#if defined TC_ARM && defined OBJ_ELF
257  saved->symver_state = symver_state;
258#endif
259
260  /* do_scrub_begin() is not useful, just wastes time.  */
261
262  state = 0;
263  saved_input = NULL;
264
265  return (char *) saved;
266}
267
268void
269app_pop (arg)
270     char *arg;
271{
272  register struct app_save *saved = (struct app_save *) arg;
273
274  /* There is no do_scrub_end ().  */
275  state = saved->state;
276  old_state = saved->old_state;
277  out_string = saved->out_string;
278  memcpy (out_buf, saved->out_buf, sizeof (out_buf));
279  add_newlines = saved->add_newlines;
280  if (saved->saved_input == NULL)
281    saved_input = NULL;
282  else
283    {
284      assert (saved->saved_input_len <= (int) (sizeof input_buffer));
285      memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
286      saved_input = input_buffer;
287      saved_input_len = saved->saved_input_len;
288      free (saved->saved_input);
289    }
290#ifdef TC_M68K
291  scrub_m68k_mri = saved->scrub_m68k_mri;
292#endif
293  mri_state = saved->mri_state;
294  mri_last_ch = saved->mri_last_ch;
295#if defined TC_ARM && defined OBJ_ELF
296  symver_state = saved->symver_state;
297#endif
298
299  free (arg);
300}				/* app_pop() */
301
/* Return the character denoted by a backslash followed by CH.  The
   letters b, f, n, r and t give the usual control characters; any other
   character, the quote characters included, denotes itself.

   @@ The control character values are those of the host compiler, so
   this assumes that \n &c are the same on host and target.  This is not
   necessarily true.  */
static int
process_escape (ch)
     int ch;
{
  static const char letters[] = { 'b', 'f', 'n', 'r', 't' };
  static const char values[] = { '\b', '\f', '\n', '\r', '\t' };
  unsigned int i;

  for (i = 0; i < sizeof letters; i++)
    if (ch == letters[i])
      return values[i];

  return ch;
}
328
329/* This function is called to process input characters.  The GET
330   parameter is used to retrieve more input characters.  GET should
331   set its parameter to point to a buffer, and return the length of
332   the buffer; it should return 0 at end of file.  The scrubbed output
333   characters are put into the buffer starting at TOSTART; the TOSTART
334   buffer is TOLEN bytes in length.  The function returns the number
335   of scrubbed characters put into TOSTART.  This will be TOLEN unless
336   end of file was seen.  This function is arranged as a state
337   machine, and saves its state so that it may return at any point.
338   This is the way the old code used to work.  */
339
340int
341do_scrub_chars (get, tostart, tolen)
342     int (*get) PARAMS ((char *, int));
343     char *tostart;
344     int tolen;
345{
346  char *to = tostart;
347  char *toend = tostart + tolen;
348  char *from;
349  char *fromend;
350  int fromlen;
351  register int ch, ch2 = 0;
352
353  /*State 0: beginning of normal line
354	  1: After first whitespace on line (flush more white)
355	  2: After first non-white (opcode) on line (keep 1white)
356	  3: after second white on line (into operands) (flush white)
357	  4: after putting out a .line, put out digits
358	  5: parsing a string, then go to old-state
359	  6: putting out \ escape in a "d string.
360	  7: After putting out a .appfile, put out string.
361	  8: After putting out a .appfile string, flush until newline.
362	  9: After seeing symbol char in state 3 (keep 1white after symchar)
363	 10: After seeing whitespace in state 9 (keep white before symchar)
364	 11: After seeing a symbol character in state 0 (eg a label definition)
365	 -1: output string in out_string and go to the state in old_state
366	 -2: flush text until a '*' '/' is seen, then go to state old_state
367#ifdef TC_V850
368         12: After seeing a dash, looking for a second dash as a start of comment.
369#endif
370#ifdef DOUBLEBAR_PARALLEL
371	 13: After seeing a vertical bar, looking for a second vertical bar as a parallel expression separator.
372#endif
373	  */
374
375  /* I added states 9 and 10 because the MIPS ECOFF assembler uses
376     constructs like ``.loc 1 20''.  This was turning into ``.loc
377     120''.  States 9 and 10 ensure that a space is never dropped in
378     between characters which could appear in a identifier.  Ian
379     Taylor, ian@cygnus.com.
380
381     I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
382     correctly on the PA (and any other target where colons are optional).
383     Jeff Law, law@cs.utah.edu.
384
385     I added state 13 so that something like "cmp r1, r2 || trap #1" does not
386     get squashed into "cmp r1,r2||trap#1", with the all important space
387     between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
388
389  /* This macro gets the next input character.  */
390
391#define GET()							\
392  (from < fromend						\
393   ? * (unsigned char *) (from++)				\
394   : (saved_input = NULL,					\
395      fromlen = (*get) (input_buffer, sizeof input_buffer),	\
396      from = input_buffer,					\
397      fromend = from + fromlen,					\
398      (fromlen == 0						\
399       ? EOF							\
400       : * (unsigned char *) (from++))))
401
402  /* This macro pushes a character back on the input stream.  */
403
404#define UNGET(uch) (*--from = (uch))
405
406  /* This macro puts a character into the output buffer.  If this
407     character fills the output buffer, this macro jumps to the label
408     TOFULL.  We use this rather ugly approach because we need to
409     handle two different termination conditions: EOF on the input
410     stream, and a full output buffer.  It would be simpler if we
411     always read in the entire input stream before processing it, but
412     I don't want to make such a significant change to the assembler's
413     memory usage.  */
414
415#define PUT(pch)			\
416  do					\
417    {					\
418      *to++ = (pch);			\
419      if (to >= toend)			\
420        goto tofull;			\
421    }					\
422  while (0)
423
424  if (saved_input != NULL)
425    {
426      from = saved_input;
427      fromend = from + saved_input_len;
428    }
429  else
430    {
431      fromlen = (*get) (input_buffer, sizeof input_buffer);
432      if (fromlen == 0)
433	return 0;
434      from = input_buffer;
435      fromend = from + fromlen;
436    }
437
438  while (1)
439    {
440      /* The cases in this switch end with continue, in order to
441         branch back to the top of this while loop and generate the
442         next output character in the appropriate state.  */
443      switch (state)
444	{
445	case -1:
446	  ch = *out_string++;
447	  if (*out_string == '\0')
448	    {
449	      state = old_state;
450	      old_state = 3;
451	    }
452	  PUT (ch);
453	  continue;
454
455	case -2:
456	  for (;;)
457	    {
458	      do
459		{
460		  ch = GET ();
461
462		  if (ch == EOF)
463		    {
464		      as_warn (_("end of file in comment"));
465		      goto fromeof;
466		    }
467
468		  if (ch == '\n')
469		    PUT ('\n');
470		}
471	      while (ch != '*');
472
473	      while ((ch = GET ()) == '*')
474		;
475
476	      if (ch == EOF)
477		{
478		  as_warn (_("end of file in comment"));
479		  goto fromeof;
480		}
481
482	      if (ch == '/')
483		break;
484
485	      UNGET (ch);
486	    }
487
488	  state = old_state;
489	  UNGET (' ');
490	  continue;
491
492	case 4:
493	  ch = GET ();
494	  if (ch == EOF)
495	    goto fromeof;
496	  else if (ch >= '0' && ch <= '9')
497	    PUT (ch);
498	  else
499	    {
500	      while (ch != EOF && IS_WHITESPACE (ch))
501		ch = GET ();
502	      if (ch == '"')
503		{
504		  UNGET (ch);
505		  if (scrub_m68k_mri)
506		    out_string = "\n\tappfile ";
507		  else
508		    out_string = "\n\t.appfile ";
509		  old_state = 7;
510		  state = -1;
511		  PUT (*out_string++);
512		}
513	      else
514		{
515		  while (ch != EOF && ch != '\n')
516		    ch = GET ();
517		  state = 0;
518		  PUT (ch);
519		}
520	    }
521	  continue;
522
523	case 5:
524	  /* We are going to copy everything up to a quote character,
525             with special handling for a backslash.  We try to
526             optimize the copying in the simple case without using the
527             GET and PUT macros.  */
528	  {
529	    char *s;
530	    int len;
531
532	    for (s = from; s < fromend; s++)
533	      {
534		ch = *s;
535		/* This condition must be changed if the type of any
536                   other character can be LEX_IS_STRINGQUOTE.  */
537		if (ch == '\\'
538		    || ch == '"'
539		    || ch == '\''
540		    || ch == '\n')
541		  break;
542	      }
543	    len = s - from;
544	    if (len > toend - to)
545	      len = toend - to;
546	    if (len > 0)
547	      {
548		memcpy (to, from, len);
549		to += len;
550		from += len;
551	      }
552	  }
553
554	  ch = GET ();
555	  if (ch == EOF)
556	    {
557	      as_warn (_("end of file in string: inserted '\"'"));
558	      state = old_state;
559	      UNGET ('\n');
560	      PUT ('"');
561	    }
562	  else if (lex[ch] == LEX_IS_STRINGQUOTE)
563	    {
564	      state = old_state;
565	      PUT (ch);
566	    }
567#ifndef NO_STRING_ESCAPES
568	  else if (ch == '\\')
569	    {
570	      state = 6;
571	      PUT (ch);
572	    }
573#endif
574	  else if (scrub_m68k_mri && ch == '\n')
575	    {
576	      /* Just quietly terminate the string.  This permits lines like
577		   bne	label	loop if we haven't reach end yet
578		 */
579	      state = old_state;
580	      UNGET (ch);
581	      PUT ('\'');
582	    }
583	  else
584	    {
585	      PUT (ch);
586	    }
587	  continue;
588
589	case 6:
590	  state = 5;
591	  ch = GET ();
592	  switch (ch)
593	    {
594	      /* Handle strings broken across lines, by turning '\n' into
595		 '\\' and 'n'.  */
596	    case '\n':
597	      UNGET ('n');
598	      add_newlines++;
599	      PUT ('\\');
600	      continue;
601
602	    case '"':
603	    case '\\':
604	    case 'b':
605	    case 'f':
606	    case 'n':
607	    case 'r':
608	    case 't':
609	    case 'v':
610	    case 'x':
611	    case 'X':
612	    case '0':
613	    case '1':
614	    case '2':
615	    case '3':
616	    case '4':
617	    case '5':
618	    case '6':
619	    case '7':
620	      break;
621#if defined(IGNORE_NONSTANDARD_ESCAPES) | defined(ONLY_STANDARD_ESCAPES)
622	    default:
623	      as_warn (_("Unknown escape '\\%c' in string: Ignored"), ch);
624	      break;
625#else  /* ONLY_STANDARD_ESCAPES */
626	    default:
627	      /* Accept \x as x for any x */
628	      break;
629#endif /* ONLY_STANDARD_ESCAPES */
630
631	    case EOF:
632	      as_warn (_("End of file in string: '\"' inserted"));
633	      PUT ('"');
634	      continue;
635	    }
636	  PUT (ch);
637	  continue;
638
639	case 7:
640	  ch = GET ();
641	  state = 5;
642	  old_state = 8;
643	  if (ch == EOF)
644	    goto fromeof;
645	  PUT (ch);
646	  continue;
647
648	case 8:
649	  do
650	    ch = GET ();
651	  while (ch != '\n' && ch != EOF);
652	  if (ch == EOF)
653	    goto fromeof;
654	  state = 0;
655	  PUT (ch);
656	  continue;
657	}
658
659      /* OK, we are somewhere in states 0 through 4 or 9 through 11 */
660
661      /* flushchar: */
662      ch = GET ();
663
664    recycle:
665
666#if defined TC_ARM && defined OBJ_ELF
667      /* We need to watch out for .symver directives.  See the comment later
668	 in this function.  */
669      if (symver_state == NULL)
670	{
671	  if ((state == 0 || state == 1) && ch == symver_pseudo[0])
672	    symver_state = symver_pseudo + 1;
673	}
674      else
675	{
676	  /* We advance to the next state if we find the right
677	     character.  */
678	  if (ch != '\0' && (*symver_state == ch))
679	    ++symver_state;
680	  else if (*symver_state != '\0')
681	    /* We did not get the expected character, or we didn't
682	       get a valid terminating character after seeing the
683	       entire pseudo-op, so we must go back to the beginning.  */
684	    symver_state = NULL;
685	  else
686	    {
687	      /* We've read the entire pseudo-op.  If this is the end
688		 of the line, go back to the beginning.  */
689	      if (IS_NEWLINE (ch))
690		symver_state = NULL;
691	    }
692	}
693#endif /* TC_ARM && OBJ_ELF */
694
695#ifdef TC_M68K
696      /* We want to have pseudo-ops which control whether we are in
697         MRI mode or not.  Unfortunately, since m68k MRI mode affects
698         the scrubber, that means that we need a special purpose
699         recognizer here.  */
700      if (mri_state == NULL)
701	{
702	  if ((state == 0 || state == 1)
703	      && ch == mri_pseudo[0])
704	    mri_state = mri_pseudo + 1;
705	}
706      else
707	{
708	  /* We advance to the next state if we find the right
709	     character, or if we need a space character and we get any
710	     whitespace character, or if we need a '0' and we get a
711	     '1' (this is so that we only need one state to handle
712	     ``.mri 0'' and ``.mri 1'').  */
713	  if (ch != '\0'
714	      && (*mri_state == ch
715		  || (*mri_state == ' '
716		      && lex[ch] == LEX_IS_WHITESPACE)
717		  || (*mri_state == '0'
718		      && ch == '1')))
719	    {
720	      mri_last_ch = ch;
721	      ++mri_state;
722	    }
723	  else if (*mri_state != '\0'
724		   || (lex[ch] != LEX_IS_WHITESPACE
725		       && lex[ch] != LEX_IS_NEWLINE))
726	    {
727	      /* We did not get the expected character, or we didn't
728		 get a valid terminating character after seeing the
729		 entire pseudo-op, so we must go back to the
730		 beginning.  */
731	      mri_state = NULL;
732	    }
733	  else
734	    {
735	      /* We've read the entire pseudo-op.  mri_last_ch is
736                 either '0' or '1' indicating whether to enter or
737                 leave MRI mode.  */
738	      do_scrub_begin (mri_last_ch == '1');
739	      mri_state = NULL;
740
741	      /* We continue handling the character as usual.  The
742                 main gas reader must also handle the .mri pseudo-op
743                 to control expression parsing and the like.  */
744	    }
745	}
746#endif
747
748      if (ch == EOF)
749	{
750	  if (state != 0)
751	    {
752	      as_warn (_("end of file not at end of a line; newline inserted"));
753	      state = 0;
754	      PUT ('\n');
755	    }
756	  goto fromeof;
757	}
758
759      switch (lex[ch])
760	{
761	case LEX_IS_WHITESPACE:
762	  do
763	    {
764	      ch = GET ();
765	    }
766	  while (ch != EOF && IS_WHITESPACE (ch));
767	  if (ch == EOF)
768	    goto fromeof;
769
770	  if (state == 0)
771	    {
772	      /* Preserve a single whitespace character at the
773		 beginning of a line.  */
774	      state = 1;
775	      UNGET (ch);
776	      PUT (' ');
777	      break;
778	    }
779
780#ifdef KEEP_WHITE_AROUND_COLON
781	  if (lex[ch] == LEX_IS_COLON)
782	    {
783	      /* Only keep this white if there's no white *after* the
784                 colon.  */
785	      ch2 = GET ();
786	      UNGET (ch2);
787	      if (!IS_WHITESPACE (ch2))
788		{
789		  state = 9;
790		  UNGET (ch);
791		  PUT (' ');
792		  break;
793		}
794	    }
795#endif
796	  if (IS_COMMENT (ch)
797	      || ch == '/'
798	      || IS_LINE_SEPARATOR (ch))
799	    {
800	      if (scrub_m68k_mri)
801		{
802		  /* In MRI mode, we keep these spaces.  */
803		  UNGET (ch);
804		  PUT (' ');
805		  break;
806		}
807	      goto recycle;
808	    }
809
810	  /* If we're in state 2 or 11, we've seen a non-white
811	     character followed by whitespace.  If the next character
812	     is ':', this is whitespace after a label name which we
813	     normally must ignore.  In MRI mode, though, spaces are
814	     not permitted between the label and the colon.  */
815	  if ((state == 2 || state == 11)
816	      && lex[ch] == LEX_IS_COLON
817	      && ! scrub_m68k_mri)
818	    {
819	      state = 1;
820	      PUT (ch);
821	      break;
822	    }
823
824	  switch (state)
825	    {
826	    case 0:
827	      state++;
828	      goto recycle;	/* Punted leading sp */
829	    case 1:
830	      /* We can arrive here if we leave a leading whitespace
831		 character at the beginning of a line.  */
832	      goto recycle;
833	    case 2:
834	      state = 3;
835	      if (to + 1 < toend)
836		{
837		  /* Optimize common case by skipping UNGET/GET.  */
838		  PUT (' ');	/* Sp after opco */
839		  goto recycle;
840		}
841	      UNGET (ch);
842	      PUT (' ');
843	      break;
844	    case 3:
845	      if (scrub_m68k_mri)
846		{
847		  /* In MRI mode, we keep these spaces.  */
848		  UNGET (ch);
849		  PUT (' ');
850		  break;
851		}
852	      goto recycle;	/* Sp in operands */
853	    case 9:
854	    case 10:
855	      if (scrub_m68k_mri)
856		{
857		  /* In MRI mode, we keep these spaces.  */
858		  state = 3;
859		  UNGET (ch);
860		  PUT (' ');
861		  break;
862		}
863	      state = 10;	/* Sp after symbol char */
864	      goto recycle;
865	    case 11:
866	      if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
867		state = 1;
868	      else
869		{
870		  /* We know that ch is not ':', since we tested that
871                     case above.  Therefore this is not a label, so it
872                     must be the opcode, and we've just seen the
873                     whitespace after it.  */
874		  state = 3;
875		}
876	      UNGET (ch);
877	      PUT (' ');	/* Sp after label definition.  */
878	      break;
879	    default:
880	      BAD_CASE (state);
881	    }
882	  break;
883
884	case LEX_IS_TWOCHAR_COMMENT_1ST:
885	  ch2 = GET ();
886	  if (ch2 == '*')
887	    {
888	      for (;;)
889		{
890		  do
891		    {
892		      ch2 = GET ();
893		      if (ch2 != EOF && IS_NEWLINE (ch2))
894			add_newlines++;
895		    }
896		  while (ch2 != EOF && ch2 != '*');
897
898		  while (ch2 == '*')
899		    ch2 = GET ();
900
901		  if (ch2 == EOF || ch2 == '/')
902		    break;
903
904		  /* This UNGET will ensure that we count newlines
905                     correctly.  */
906		  UNGET (ch2);
907		}
908
909	      if (ch2 == EOF)
910		as_warn (_("end of file in multiline comment"));
911
912	      ch = ' ';
913	      goto recycle;
914	    }
915#ifdef DOUBLESLASH_LINE_COMMENTS
916	  else if (ch2 == '/')
917	    {
918	      do
919		{
920		  ch = GET ();
921		}
922	      while (ch != EOF && !IS_NEWLINE (ch));
923	      if (ch == EOF)
924		as_warn ("end of file in comment; newline inserted");
925	      state = 0;
926	      PUT ('\n');
927	      break;
928	    }
929#endif
930	  else
931	    {
932	      if (ch2 != EOF)
933		UNGET (ch2);
934	      if (state == 9 || state == 10)
935		state = 3;
936	      PUT (ch);
937	    }
938	  break;
939
940	case LEX_IS_STRINGQUOTE:
941	  if (state == 10)
942	    {
943	      /* Preserve the whitespace in foo "bar" */
944	      UNGET (ch);
945	      state = 3;
946	      PUT (' ');
947
948	      /* PUT didn't jump out.  We could just break, but we
949                 know what will happen, so optimize a bit.  */
950	      ch = GET ();
951	      old_state = 3;
952	    }
953	  else if (state == 9)
954	    old_state = 3;
955	  else
956	    old_state = state;
957	  state = 5;
958	  PUT (ch);
959	  break;
960
961#ifndef IEEE_STYLE
962	case LEX_IS_ONECHAR_QUOTE:
963	  if (state == 10)
964	    {
965	      /* Preserve the whitespace in foo 'b' */
966	      UNGET (ch);
967	      state = 3;
968	      PUT (' ');
969	      break;
970	    }
971	  ch = GET ();
972	  if (ch == EOF)
973	    {
974	      as_warn (_("end of file after a one-character quote; \\0 inserted"));
975	      ch = 0;
976	    }
977	  if (ch == '\\')
978	    {
979	      ch = GET ();
980	      if (ch == EOF)
981		{
982		  as_warn (_("end of file in escape character"));
983		  ch = '\\';
984		}
985	      else
986		ch = process_escape (ch);
987	    }
988	  sprintf (out_buf, "%d", (int) (unsigned char) ch);
989
990	  /* None of these 'x constants for us.  We want 'x'.  */
991	  if ((ch = GET ()) != '\'')
992	    {
993#ifdef REQUIRE_CHAR_CLOSE_QUOTE
994	      as_warn (_("Missing close quote: (assumed)"));
995#else
996	      if (ch != EOF)
997		UNGET (ch);
998#endif
999	    }
1000	  if (strlen (out_buf) == 1)
1001	    {
1002	      PUT (out_buf[0]);
1003	      break;
1004	    }
1005	  if (state == 9)
1006	    old_state = 3;
1007	  else
1008	    old_state = state;
1009	  state = -1;
1010	  out_string = out_buf;
1011	  PUT (*out_string++);
1012	  break;
1013#endif
1014
1015	case LEX_IS_COLON:
1016#ifdef KEEP_WHITE_AROUND_COLON
1017	  state = 9;
1018#else
1019	  if (state == 9 || state == 10)
1020	    state = 3;
1021	  else if (state != 3)
1022	    state = 1;
1023#endif
1024	  PUT (ch);
1025	  break;
1026
1027	case LEX_IS_NEWLINE:
1028	  /* Roll out a bunch of newlines from inside comments, etc.  */
1029	  if (add_newlines)
1030	    {
1031	      --add_newlines;
1032	      UNGET (ch);
1033	    }
1034	  /* Fall through.  */
1035
1036	case LEX_IS_LINE_SEPARATOR:
1037	  state = 0;
1038	  PUT (ch);
1039	  break;
1040
1041#ifdef TC_V850
1042	case LEX_IS_DOUBLEDASH_1ST:
1043	  ch2 = GET ();
1044	  if (ch2 != '-')
1045	    {
1046	      UNGET (ch2);
1047	      goto de_fault;
1048	    }
1049	  /* Read and skip to end of line.  */
1050	  do
1051	    {
1052	      ch = GET ();
1053	    }
1054	  while (ch != EOF && ch != '\n');
1055	  if (ch == EOF)
1056	    {
1057	      as_warn (_("end of file in comment; newline inserted"));
1058	    }
1059	  state = 0;
1060	  PUT ('\n');
1061	  break;
1062#endif
1063#ifdef DOUBLEBAR_PARALLEL
1064	case LEX_IS_DOUBLEBAR_1ST:
1065	  ch2 = GET ();
1066	  if (ch2 != '|')
1067	    {
1068	      UNGET (ch2);
1069	      goto de_fault;
1070	    }
1071	  /* Reset back to state 1 and pretend that we are parsing a line from
1072	     just after the first white space.  */
1073	  state = 1;
1074	  PUT ('|');
1075	  PUT ('|');
1076	  break;
1077#endif
1078	case LEX_IS_LINE_COMMENT_START:
1079	  /* FIXME-someday: The two character comment stuff was badly
1080	     thought out.  On i386, we want '/' as line comment start
1081	     AND we want C style comments.  hence this hack.  The
1082	     whole lexical process should be reworked.  xoxorich.  */
1083	  if (ch == '/')
1084	    {
1085	      ch2 = GET ();
1086	      if (ch2 == '*')
1087		{
1088		  old_state = 3;
1089		  state = -2;
1090		  break;
1091		}
1092	      else
1093		{
1094		  UNGET (ch2);
1095		}
1096	    } /* bad hack */
1097
1098	  if (state == 0 || state == 1)	/* Only comment at start of line.  */
1099	    {
1100	      int startch;
1101
1102	      startch = ch;
1103
1104	      do
1105		{
1106		  ch = GET ();
1107		}
1108	      while (ch != EOF && IS_WHITESPACE (ch));
1109	      if (ch == EOF)
1110		{
1111		  as_warn (_("end of file in comment; newline inserted"));
1112		  PUT ('\n');
1113		  break;
1114		}
1115	      if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1116		{
1117		  /* Not a cpp line.  */
1118		  while (ch != EOF && !IS_NEWLINE (ch))
1119		    ch = GET ();
1120		  if (ch == EOF)
1121		    as_warn (_("EOF in Comment: Newline inserted"));
1122		  state = 0;
1123		  PUT ('\n');
1124		  break;
1125		}
1126	      /* Looks like `# 123 "filename"' from cpp.  */
1127	      UNGET (ch);
1128	      old_state = 4;
1129	      state = -1;
1130	      if (scrub_m68k_mri)
1131		out_string = "\tappline ";
1132	      else
1133		out_string = "\t.appline ";
1134	      PUT (*out_string++);
1135	      break;
1136	    }
1137
1138#ifdef TC_D10V
1139	  /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1140	     Trap is the only short insn that has a first operand that is
1141	     neither register nor label.
1142	     We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 .
1143	     We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1144	     already LEX_IS_LINE_COMMENT_START.  However, it is the
1145	     only character in line_comment_chars for d10v, hence we
1146	     can recognize it as such.  */
1147	  /* An alternative approach would be to reset the state to 1 when
1148	     we see '||', '<-' or '->', but that seems to be overkill.  */
1149	  if (state == 10)
1150	    PUT (' ');
1151#endif
1152	  /* We have a line comment character which is not at the
1153	     start of a line.  If this is also a normal comment
1154	     character, fall through.  Otherwise treat it as a default
1155	     character.  */
1156	  if (strchr (tc_comment_chars, ch) == NULL
1157	      && (! scrub_m68k_mri
1158		  || (ch != '!' && ch != '*')))
1159	    goto de_fault;
1160	  if (scrub_m68k_mri
1161	      && (ch == '!' || ch == '*' || ch == '#')
1162	      && state != 1
1163	      && state != 10)
1164	    goto de_fault;
1165	  /* Fall through.  */
1166	case LEX_IS_COMMENT_START:
1167#if defined TC_ARM && defined OBJ_ELF
1168	  /* On the ARM, `@' is the comment character.
1169	     Unfortunately this is also a special character in ELF .symver
1170	     directives (and .type, though we deal with those another way).
1171	     So we check if this line is such a directive, and treat
1172	     the character as default if so.  This is a hack.  */
1173	  if ((symver_state != NULL) && (*symver_state == 0))
1174	    goto de_fault;
1175#endif
1176#ifdef WARN_COMMENTS
1177	  if (!found_comment)
1178	    as_where (&found_comment_file, &found_comment);
1179#endif
1180	  do
1181	    {
1182	      ch = GET ();
1183	    }
1184	  while (ch != EOF && !IS_NEWLINE (ch));
1185	  if (ch == EOF)
1186	    as_warn (_("end of file in comment; newline inserted"));
1187	  state = 0;
1188	  PUT ('\n');
1189	  break;
1190
1191	case LEX_IS_SYMBOL_COMPONENT:
1192	  if (state == 10)
1193	    {
1194	      /* This is a symbol character following another symbol
1195		 character, with whitespace in between.  We skipped
1196		 the whitespace earlier, so output it now.  */
1197	      UNGET (ch);
1198	      state = 3;
1199	      PUT (' ');
1200	      break;
1201	    }
1202
1203	  if (state == 3)
1204	    state = 9;
1205
1206	  /* This is a common case.  Quickly copy CH and all the
1207             following symbol component or normal characters.  */
1208	  if (to + 1 < toend
1209	      && mri_state == NULL
1210#if defined TC_ARM && defined OBJ_ELF
1211	      && symver_state == NULL
1212#endif
1213	      )
1214	    {
1215	      char *s;
1216	      int len;
1217
1218	      for (s = from; s < fromend; s++)
1219		{
1220		  int type;
1221
1222		  ch2 = *(unsigned char *) s;
1223		  type = lex[ch2];
1224		  if (type != 0
1225		      && type != LEX_IS_SYMBOL_COMPONENT)
1226		    break;
1227		}
1228	      if (s > from)
1229		{
1230		  /* Handle the last character normally, for
1231                     simplicity.  */
1232		  --s;
1233		}
1234	      len = s - from;
1235	      if (len > (toend - to) - 1)
1236		len = (toend - to) - 1;
1237	      if (len > 0)
1238		{
1239		  PUT (ch);
1240		  if (len > 8)
1241		    {
1242		      memcpy (to, from, len);
1243		      to += len;
1244		      from += len;
1245		    }
1246		  else
1247		    {
1248		      switch (len)
1249			{
1250			case 8: *to++ = *from++;
1251			case 7: *to++ = *from++;
1252			case 6: *to++ = *from++;
1253			case 5: *to++ = *from++;
1254			case 4: *to++ = *from++;
1255			case 3: *to++ = *from++;
1256			case 2: *to++ = *from++;
1257			case 1: *to++ = *from++;
1258			}
1259		    }
1260		  ch = GET ();
1261		}
1262	    }
1263
1264	  /* Fall through.  */
1265	default:
1266	de_fault:
1267	  /* Some relatively `normal' character.  */
1268	  if (state == 0)
1269	    {
1270	      state = 11;	/* Now seeing label definition */
1271	    }
1272	  else if (state == 1)
1273	    {
1274	      state = 2;	/* Ditto */
1275	    }
1276	  else if (state == 9)
1277	    {
1278	      if (lex[ch] != LEX_IS_SYMBOL_COMPONENT)
1279		state = 3;
1280	    }
1281	  else if (state == 10)
1282	    {
1283	      if (ch == '\\')
1284		{
1285		  /* Special handling for backslash: a backslash may
1286		     be the beginning of a formal parameter (of a
1287		     macro) following another symbol character, with
1288		     whitespace in between.  If that is the case, we
1289		     output a space before the parameter.  Strictly
1290		     speaking, correct handling depends upon what the
1291		     macro parameter expands into; if the parameter
1292		     expands into something which does not start with
1293		     an operand character, then we don't want to keep
1294		     the space.  We don't have enough information to
1295		     make the right choice, so here we are making the
1296		     choice which is more likely to be correct.  */
1297		  PUT (' ');
1298		}
1299
1300	      state = 3;
1301	    }
1302	  PUT (ch);
1303	  break;
1304	}
1305    }
1306
1307  /*NOTREACHED*/
1308
1309 fromeof:
1310  /* We have reached the end of the input.  */
1311  return to - tostart;
1312
1313 tofull:
1314  /* The output buffer is full.  Save any input we have not yet
1315     processed.  */
1316  if (fromend > from)
1317    {
1318      saved_input = from;
1319      saved_input_len = fromend - from;
1320    }
1321  else
1322    saved_input = NULL;
1323
1324  return to - tostart;
1325}
1326
1327/* end of app.c */
1328