1/* This is the Assembler Pre-Processor
2   Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
3   1999, 2000, 2001, 2002, 2003, 2006, 2007
4   Free Software Foundation, Inc.
5
6   This file is part of GAS, the GNU Assembler.
7
8   GAS is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2, or (at your option)
11   any later version.
12
13   GAS is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with GAS; see the file COPYING.  If not, write to the Free
20   Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
21   02110-1301, USA.  */
22
23/* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
24/* App, the assembler pre-processor.  This pre-processor strips out
25   excess spaces, turns single-quoted characters into a decimal
26   constant, and turns the # in # <number> <filename> <garbage> into a
27   .linefile.  This needs better error-handling.  */
28
29#include "as.h"
30
/* When the compiler does not announce ISO C conformance (__STDC__ is
   not 1), define `const' away, unless something has already defined
   it.  */
#if (__STDC__ != 1)
#ifndef const
#define const  /* empty */
#endif
#endif

#ifdef TC_M68K
/* Whether we are scrubbing in m68k MRI mode.  This is different from
   flag_m68k_mri, because the two flags will be affected by the .mri
   pseudo-op at different times.  */
static int scrub_m68k_mri;

/* The pseudo-op which switches in and out of MRI mode.  See the
   comment in do_scrub_chars.  */
static const char mri_pseudo[] = ".mri 0";
#else
/* Without TC_M68K the scrubber is never in MRI mode; every test of
   scrub_m68k_mri then folds to the constant 0.  */
#define scrub_m68k_mri 0
#endif

#if defined TC_ARM && defined OBJ_ELF
/* The pseudo-op for which we need to special-case `@' characters.
   See the comment in do_scrub_chars.  */
static const char   symver_pseudo[] = ".symver";
/* Progress of the .symver recognizer in do_scrub_chars: NULL while no
   match is in progress, otherwise a pointer into symver_pseudo at the
   next character expected (or at its terminating NUL once the whole
   pseudo-op has been seen).  */
static const char * symver_state;
#endif
56
/* Lexical class of every character, indexed by its unsigned char
   value.  Each entry is one of the LEX_IS_* values below, or 0 for a
   character with no special class.  The table is filled in by
   do_scrub_begin.  */
static char lex[256];

/* Characters that do_scrub_begin classifies as symbol components by
   default (later steps in do_scrub_begin may override individual
   entries).  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* The lexical classes stored in lex[].  The value 7 is not used.  */

/* May appear in a symbol name.  */
#define LEX_IS_SYMBOL_COMPONENT		1
/* Space, tab or carriage return.  */
#define LEX_IS_WHITESPACE		2
/* One of line_separator_chars.  */
#define LEX_IS_LINE_SEPARATOR		3
/* One of tc_comment_chars.  */
#define LEX_IS_COMMENT_START		4
/* One of line_comment_chars.  */
#define LEX_IS_LINE_COMMENT_START	5
/* A '/' that has no other class: followed by '*' it opens a comment.  */
#define	LEX_IS_TWOCHAR_COMMENT_1ST	6
/* Opens (and closes) a string.  */
#define	LEX_IS_STRINGQUOTE		8
#define	LEX_IS_COLON			9
#define	LEX_IS_NEWLINE			10
/* A single quote introducing a one-character constant, which
   do_scrub_chars rewrites as a decimal number.  */
#define	LEX_IS_ONECHAR_QUOTE		11
#ifdef TC_V850
/* A '-': two in a row start a comment that runs to end of line.  */
#define LEX_IS_DOUBLEDASH_1ST		12
#endif
#ifdef TC_M32R
#define DOUBLEBAR_PARALLEL
#endif
#ifdef DOUBLEBAR_PARALLEL
/* A '|': two in a row separate parallel instructions.  */
#define LEX_IS_DOUBLEBAR_1ST		13
#endif
/* One of tc_parallel_separator_chars.  */
#define LEX_IS_PARALLEL_SEPARATOR	14

/* Class tests.  The argument is used directly as an index into lex[],
   so it must lie in the range 0..255; in particular callers must rule
   out EOF before applying any of these.  */
#define IS_SYMBOL_COMPONENT(c)		(lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)		(lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)		(lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)	(lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)			(lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)		(lex[c] == LEX_IS_LINE_COMMENT_START)
#define	IS_NEWLINE(c)			(lex[c] == LEX_IS_NEWLINE)

/* Translate the character following a backslash in a one-character
   constant into the character it denotes.  */
static int process_escape (int);
90
91/* FIXME-soon: The entire lexer/parser thingy should be
92   built statically at compile time rather than dynamically
93   each and every time the assembler is run.  xoxorich.  */
94
95void
96do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
97{
98  const char *p;
99  int c;
100
101  lex[' '] = LEX_IS_WHITESPACE;
102  lex['\t'] = LEX_IS_WHITESPACE;
103  lex['\r'] = LEX_IS_WHITESPACE;
104  lex['\n'] = LEX_IS_NEWLINE;
105  lex[':'] = LEX_IS_COLON;
106
107#ifdef TC_M68K
108  scrub_m68k_mri = m68k_mri;
109
110  if (! m68k_mri)
111#endif
112    {
113      lex['"'] = LEX_IS_STRINGQUOTE;
114
115#if ! defined (TC_HPPA) && ! defined (TC_I370)
116      /* I370 uses single-quotes to delimit integer, float constants.  */
117      lex['\''] = LEX_IS_ONECHAR_QUOTE;
118#endif
119
120#ifdef SINGLE_QUOTE_STRINGS
121      lex['\''] = LEX_IS_STRINGQUOTE;
122#endif
123    }
124
125  /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
126     in state 5 of do_scrub_chars must be changed.  */
127
128  /* Note that these override the previous defaults, e.g. if ';' is a
129     comment char, then it isn't a line separator.  */
130  for (p = symbol_chars; *p; ++p)
131    lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
132
133  for (c = 128; c < 256; ++c)
134    lex[c] = LEX_IS_SYMBOL_COMPONENT;
135
136#ifdef tc_symbol_chars
137  /* This macro permits the processor to specify all characters which
138     may appears in an operand.  This will prevent the scrubber from
139     discarding meaningful whitespace in certain cases.  The i386
140     backend uses this to support prefixes, which can confuse the
141     scrubber as to whether it is parsing operands or opcodes.  */
142  for (p = tc_symbol_chars; *p; ++p)
143    lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
144#endif
145
146  /* The m68k backend wants to be able to change comment_chars.  */
147#ifndef tc_comment_chars
148#define tc_comment_chars comment_chars
149#endif
150  for (p = tc_comment_chars; *p; p++)
151    lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
152
153  for (p = line_comment_chars; *p; p++)
154    lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
155
156  for (p = line_separator_chars; *p; p++)
157    lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
158
159#ifdef tc_parallel_separator_chars
160  /* This macro permits the processor to specify all characters which
161     separate parallel insns on the same line.  */
162  for (p = tc_parallel_separator_chars; *p; p++)
163    lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
164#endif
165
166  /* Only allow slash-star comments if slash is not in use.
167     FIXME: This isn't right.  We should always permit them.  */
168  if (lex['/'] == 0)
169    lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
170
171#ifdef TC_M68K
172  if (m68k_mri)
173    {
174      lex['\''] = LEX_IS_STRINGQUOTE;
175      lex[';'] = LEX_IS_COMMENT_START;
176      lex['*'] = LEX_IS_LINE_COMMENT_START;
177      /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
178	 then it can't be used in an expression.  */
179      lex['!'] = LEX_IS_LINE_COMMENT_START;
180    }
181#endif
182
183#ifdef TC_V850
184  lex['-'] = LEX_IS_DOUBLEDASH_1ST;
185#endif
186#ifdef DOUBLEBAR_PARALLEL
187  lex['|'] = LEX_IS_DOUBLEBAR_1ST;
188#endif
189#ifdef TC_D30V
190  /* Must do this is we want VLIW instruction with "->" or "<-".  */
191  lex['-'] = LEX_IS_SYMBOL_COMPONENT;
192#endif
193}
194
/* Saved state of the scrubber.  These persist between calls to
   do_scrub_chars so that it can return at any point and pick up where
   it left off.  */

/* Current state of the do_scrub_chars state machine.  */
static int state;
/* State to go back to once the temporary state (-1, -2, or string
   states 5 and 6) is finished.  */
static int old_state;
/* Next character to emit while in state -1.  */
static char *out_string;
/* Holds the decimal text produced for a one-character constant;
   out_string points into it while that text is being emitted.  */
static char out_buf[20];
/* Number of newlines swallowed inside comments or continued strings
   that are still owed to the output; they are rolled out when the
   next newline is processed.  */
static int add_newlines;
/* Input that was read but not yet scrubbed, and its length.  When
   saved_input is not NULL, do_scrub_chars resumes from it instead of
   asking for more input.  */
static char *saved_input;
static int saved_input_len;
/* Buffer handed to the GET callback of do_scrub_chars to be filled
   with raw input.  */
static char input_buffer[32 * 1024];
/* Progress of the .mri recognizer in do_scrub_chars (used when TC_M68K
   is defined): NULL while no match is in progress, otherwise a pointer
   into mri_pseudo at the next character expected.  */
static const char *mri_state;
/* The last character matched by the .mri recognizer; after a complete
   match this is the '0' or '1' operand.  */
static char mri_last_ch;
206
/* Data structure for saving the state of app across #include's.  Note that
   app is called asynchronously to the parsing of the .include's, so our
   state at the time .include is interpreted is completely unrelated.
   That's why we have to save it all.

   Each member holds the value of the file-scope variable of the same
   name.  The exception is saved_input, which app_push sets to a heap
   copy of the pending input (or NULL if there is none) and app_pop
   frees.  */

struct app_save
{
  int          state;
  int          old_state;
  char *       out_string;
  char         out_buf[sizeof (out_buf)];
  int          add_newlines;
  /* Heap copy of the pending input, or NULL.  saved_input_len is only
     read when saved_input is not NULL.  */
  char *       saved_input;
  int          saved_input_len;
#ifdef TC_M68K
  int          scrub_m68k_mri;
#endif
  const char * mri_state;
  char         mri_last_ch;
#if defined TC_ARM && defined OBJ_ELF
  const char * symver_state;
#endif
};
230
231char *
232app_push (void)
233{
234  register struct app_save *saved;
235
236  saved = (struct app_save *) xmalloc (sizeof (*saved));
237  saved->state = state;
238  saved->old_state = old_state;
239  saved->out_string = out_string;
240  memcpy (saved->out_buf, out_buf, sizeof (out_buf));
241  saved->add_newlines = add_newlines;
242  if (saved_input == NULL)
243    saved->saved_input = NULL;
244  else
245    {
246      saved->saved_input = xmalloc (saved_input_len);
247      memcpy (saved->saved_input, saved_input, saved_input_len);
248      saved->saved_input_len = saved_input_len;
249    }
250#ifdef TC_M68K
251  saved->scrub_m68k_mri = scrub_m68k_mri;
252#endif
253  saved->mri_state = mri_state;
254  saved->mri_last_ch = mri_last_ch;
255#if defined TC_ARM && defined OBJ_ELF
256  saved->symver_state = symver_state;
257#endif
258
259  /* do_scrub_begin() is not useful, just wastes time.  */
260
261  state = 0;
262  saved_input = NULL;
263
264  return (char *) saved;
265}
266
267void
268app_pop (char *arg)
269{
270  register struct app_save *saved = (struct app_save *) arg;
271
272  /* There is no do_scrub_end ().  */
273  state = saved->state;
274  old_state = saved->old_state;
275  out_string = saved->out_string;
276  memcpy (out_buf, saved->out_buf, sizeof (out_buf));
277  add_newlines = saved->add_newlines;
278  if (saved->saved_input == NULL)
279    saved_input = NULL;
280  else
281    {
282      assert (saved->saved_input_len <= (int) (sizeof input_buffer));
283      memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
284      saved_input = input_buffer;
285      saved_input_len = saved->saved_input_len;
286      free (saved->saved_input);
287    }
288#ifdef TC_M68K
289  scrub_m68k_mri = saved->scrub_m68k_mri;
290#endif
291  mri_state = saved->mri_state;
292  mri_last_ch = saved->mri_last_ch;
293#if defined TC_ARM && defined OBJ_ELF
294  symver_state = saved->symver_state;
295#endif
296
297  free (arg);
298}
299
300/* @@ This assumes that \n &c are the same on host and target.  This is not
301   necessarily true.  */
302
/* Return the character denoted by the escape sequence backslash-CH.

   Only b, f, n, r, t, the single quote and the double quote are
   translated; any other CH is returned unchanged.  */

static int
process_escape (int ch)
{
  /* Escape letter and the character it stands for.  */
  static const struct
  {
    char letter;
    char value;
  }
  escapes[] =
  {
    { 'b', '\b' },
    { 'f', '\f' },
    { 'n', '\n' },
    { 'r', '\r' },
    { 't', '\t' },
    { '\'', '\'' },
    { '"', '\"' }
  };
  unsigned int i;

  for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++)
    if (ch == escapes[i].letter)
      return escapes[i].value;

  return ch;
}
326
327/* This function is called to process input characters.  The GET
328   parameter is used to retrieve more input characters.  GET should
329   set its parameter to point to a buffer, and return the length of
330   the buffer; it should return 0 at end of file.  The scrubbed output
331   characters are put into the buffer starting at TOSTART; the TOSTART
332   buffer is TOLEN bytes in length.  The function returns the number
333   of scrubbed characters put into TOSTART.  This will be TOLEN unless
334   end of file was seen.  This function is arranged as a state
335   machine, and saves its state so that it may return at any point.
336   This is the way the old code used to work.  */
337
338int
339do_scrub_chars (int (*get) (char *, int), char *tostart, int tolen)
340{
341  char *to = tostart;
342  char *toend = tostart + tolen;
343  char *from;
344  char *fromend;
345  int fromlen;
346  register int ch, ch2 = 0;
347  /* Character that started the string we're working on.  */
348  static char quotechar;
349
350  /*State 0: beginning of normal line
351	  1: After first whitespace on line (flush more white)
352	  2: After first non-white (opcode) on line (keep 1white)
353	  3: after second white on line (into operands) (flush white)
354	  4: after putting out a .linefile, put out digits
355	  5: parsing a string, then go to old-state
356	  6: putting out \ escape in a "d string.
357	  7: no longer used
358	  8: no longer used
359	  9: After seeing symbol char in state 3 (keep 1white after symchar)
360	 10: After seeing whitespace in state 9 (keep white before symchar)
361	 11: After seeing a symbol character in state 0 (eg a label definition)
362	 -1: output string in out_string and go to the state in old_state
363	 -2: flush text until a '*' '/' is seen, then go to state old_state
364#ifdef TC_V850
365	 12: After seeing a dash, looking for a second dash as a start
366	     of comment.
367#endif
368#ifdef DOUBLEBAR_PARALLEL
369	 13: After seeing a vertical bar, looking for a second
370	     vertical bar as a parallel expression separator.
371#endif
372#ifdef TC_IA64
373	 14: After seeing a `(' at state 0, looking for a `)' as
374	     predicate.
375	 15: After seeing a `(' at state 1, looking for a `)' as
376	     predicate.
377#endif
378#ifdef TC_Z80
379	 16: After seeing an 'a' or an 'A' at the start of a symbol
380	 17: After seeing an 'f' or an 'F' in state 16
381#endif
382	  */
383
384  /* I added states 9 and 10 because the MIPS ECOFF assembler uses
385     constructs like ``.loc 1 20''.  This was turning into ``.loc
386     120''.  States 9 and 10 ensure that a space is never dropped in
387     between characters which could appear in an identifier.  Ian
388     Taylor, ian@cygnus.com.
389
390     I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
391     correctly on the PA (and any other target where colons are optional).
392     Jeff Law, law@cs.utah.edu.
393
394     I added state 13 so that something like "cmp r1, r2 || trap #1" does not
395     get squashed into "cmp r1,r2||trap#1", with the all important space
396     between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
397
398  /* This macro gets the next input character.  */
399
400#define GET()							\
401  (from < fromend						\
402   ? * (unsigned char *) (from++)				\
403   : (saved_input = NULL,					\
404      fromlen = (*get) (input_buffer, sizeof input_buffer),	\
405      from = input_buffer,					\
406      fromend = from + fromlen,					\
407      (fromlen == 0						\
408       ? EOF							\
409       : * (unsigned char *) (from++))))
410
411  /* This macro pushes a character back on the input stream.  */
412
413#define UNGET(uch) (*--from = (uch))
414
415  /* This macro puts a character into the output buffer.  If this
416     character fills the output buffer, this macro jumps to the label
417     TOFULL.  We use this rather ugly approach because we need to
418     handle two different termination conditions: EOF on the input
419     stream, and a full output buffer.  It would be simpler if we
420     always read in the entire input stream before processing it, but
421     I don't want to make such a significant change to the assembler's
422     memory usage.  */
423
424#define PUT(pch)				\
425  do						\
426    {						\
427      *to++ = (pch);				\
428      if (to >= toend)				\
429	goto tofull;				\
430    }						\
431  while (0)
432
433  if (saved_input != NULL)
434    {
435      from = saved_input;
436      fromend = from + saved_input_len;
437    }
438  else
439    {
440      fromlen = (*get) (input_buffer, sizeof input_buffer);
441      if (fromlen == 0)
442	return 0;
443      from = input_buffer;
444      fromend = from + fromlen;
445    }
446
447  while (1)
448    {
449      /* The cases in this switch end with continue, in order to
450	 branch back to the top of this while loop and generate the
451	 next output character in the appropriate state.  */
452      switch (state)
453	{
454	case -1:
455	  ch = *out_string++;
456	  if (*out_string == '\0')
457	    {
458	      state = old_state;
459	      old_state = 3;
460	    }
461	  PUT (ch);
462	  continue;
463
464	case -2:
465	  for (;;)
466	    {
467	      do
468		{
469		  ch = GET ();
470
471		  if (ch == EOF)
472		    {
473		      as_warn (_("end of file in comment"));
474		      goto fromeof;
475		    }
476
477		  if (ch == '\n')
478		    PUT ('\n');
479		}
480	      while (ch != '*');
481
482	      while ((ch = GET ()) == '*')
483		;
484
485	      if (ch == EOF)
486		{
487		  as_warn (_("end of file in comment"));
488		  goto fromeof;
489		}
490
491	      if (ch == '/')
492		break;
493
494	      UNGET (ch);
495	    }
496
497	  state = old_state;
498	  UNGET (' ');
499	  continue;
500
501	case 4:
502	  ch = GET ();
503	  if (ch == EOF)
504	    goto fromeof;
505	  else if (ch >= '0' && ch <= '9')
506	    PUT (ch);
507	  else
508	    {
509	      while (ch != EOF && IS_WHITESPACE (ch))
510		ch = GET ();
511	      if (ch == '"')
512		{
513		  quotechar = ch;
514		  state = 5;
515		  old_state = 3;
516		  PUT (ch);
517		}
518	      else
519		{
520		  while (ch != EOF && ch != '\n')
521		    ch = GET ();
522		  state = 0;
523		  PUT (ch);
524		}
525	    }
526	  continue;
527
528	case 5:
529	  /* We are going to copy everything up to a quote character,
530	     with special handling for a backslash.  We try to
531	     optimize the copying in the simple case without using the
532	     GET and PUT macros.  */
533	  {
534	    char *s;
535	    int len;
536
537	    for (s = from; s < fromend; s++)
538	      {
539		ch = *s;
540		if (ch == '\\'
541		    || ch == quotechar
542		    || ch == '\n')
543		  break;
544	      }
545	    len = s - from;
546	    if (len > toend - to)
547	      len = toend - to;
548	    if (len > 0)
549	      {
550		memcpy (to, from, len);
551		to += len;
552		from += len;
553		if (to >= toend)
554		  goto tofull;
555	      }
556	  }
557
558	  ch = GET ();
559	  if (ch == EOF)
560	    {
561	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
562	      state = old_state;
563	      UNGET ('\n');
564	      PUT (quotechar);
565	    }
566	  else if (ch == quotechar)
567	    {
568	      state = old_state;
569	      PUT (ch);
570	    }
571#ifndef NO_STRING_ESCAPES
572	  else if (ch == '\\')
573	    {
574	      state = 6;
575	      PUT (ch);
576	    }
577#endif
578	  else if (scrub_m68k_mri && ch == '\n')
579	    {
580	      /* Just quietly terminate the string.  This permits lines like
581		   bne	label	loop if we haven't reach end yet.  */
582	      state = old_state;
583	      UNGET (ch);
584	      PUT ('\'');
585	    }
586	  else
587	    {
588	      PUT (ch);
589	    }
590	  continue;
591
592	case 6:
593	  state = 5;
594	  ch = GET ();
595	  switch (ch)
596	    {
597	      /* Handle strings broken across lines, by turning '\n' into
598		 '\\' and 'n'.  */
599	    case '\n':
600	      UNGET ('n');
601	      add_newlines++;
602	      PUT ('\\');
603	      continue;
604
605	    case EOF:
606	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
607	      PUT (quotechar);
608	      continue;
609
610	    case '"':
611	    case '\\':
612	    case 'b':
613	    case 'f':
614	    case 'n':
615	    case 'r':
616	    case 't':
617	    case 'v':
618	    case 'x':
619	    case 'X':
620	    case '0':
621	    case '1':
622	    case '2':
623	    case '3':
624	    case '4':
625	    case '5':
626	    case '6':
627	    case '7':
628	      break;
629
630	    default:
631#ifdef ONLY_STANDARD_ESCAPES
632	      as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
633#endif
634	      break;
635	    }
636	  PUT (ch);
637	  continue;
638
639#ifdef DOUBLEBAR_PARALLEL
640	case 13:
641	  ch = GET ();
642	  if (ch != '|')
643	    abort ();
644
645	  /* Reset back to state 1 and pretend that we are parsing a
646	     line from just after the first white space.  */
647	  state = 1;
648	  PUT ('|');
649	  continue;
650#endif
651#ifdef TC_Z80
652	case 16:
653	  /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
654	  ch = GET ();
655	  if (ch == 'f' || ch == 'F')
656	    {
657	      state = 17;
658	      PUT (ch);
659	    }
660	  else
661	    {
662	      state = 9;
663	      break;
664	    }
665	case 17:
666	  /* We have seen "af" at the start of a symbol,
667	     a ' here is a part of that symbol.  */
668	  ch = GET ();
669	  state = 9;
670	  if (ch == '\'')
671	    /* Change to avoid warning about unclosed string.  */
672	    PUT ('`');
673	  else
674	    UNGET (ch);
675	  break;
676#endif
677	}
678
679      /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
680
681      /* flushchar: */
682      ch = GET ();
683
684#ifdef TC_IA64
685      if (ch == '(' && (state == 0 || state == 1))
686	{
687	  state += 14;
688	  PUT (ch);
689	  continue;
690	}
691      else if (state == 14 || state == 15)
692	{
693	  if (ch == ')')
694	    {
695	      state -= 14;
696	      PUT (ch);
697	      ch = GET ();
698	    }
699	  else
700	    {
701	      PUT (ch);
702	      continue;
703	    }
704	}
705#endif
706
707    recycle:
708
709#if defined TC_ARM && defined OBJ_ELF
710      /* We need to watch out for .symver directives.  See the comment later
711	 in this function.  */
712      if (symver_state == NULL)
713	{
714	  if ((state == 0 || state == 1) && ch == symver_pseudo[0])
715	    symver_state = symver_pseudo + 1;
716	}
717      else
718	{
719	  /* We advance to the next state if we find the right
720	     character.  */
721	  if (ch != '\0' && (*symver_state == ch))
722	    ++symver_state;
723	  else if (*symver_state != '\0')
724	    /* We did not get the expected character, or we didn't
725	       get a valid terminating character after seeing the
726	       entire pseudo-op, so we must go back to the beginning.  */
727	    symver_state = NULL;
728	  else
729	    {
730	      /* We've read the entire pseudo-op.  If this is the end
731		 of the line, go back to the beginning.  */
732	      if (IS_NEWLINE (ch))
733		symver_state = NULL;
734	    }
735	}
736#endif /* TC_ARM && OBJ_ELF */
737
738#ifdef TC_M68K
739      /* We want to have pseudo-ops which control whether we are in
740	 MRI mode or not.  Unfortunately, since m68k MRI mode affects
741	 the scrubber, that means that we need a special purpose
742	 recognizer here.  */
743      if (mri_state == NULL)
744	{
745	  if ((state == 0 || state == 1)
746	      && ch == mri_pseudo[0])
747	    mri_state = mri_pseudo + 1;
748	}
749      else
750	{
751	  /* We advance to the next state if we find the right
752	     character, or if we need a space character and we get any
753	     whitespace character, or if we need a '0' and we get a
754	     '1' (this is so that we only need one state to handle
755	     ``.mri 0'' and ``.mri 1'').  */
756	  if (ch != '\0'
757	      && (*mri_state == ch
758		  || (*mri_state == ' '
759		      && lex[ch] == LEX_IS_WHITESPACE)
760		  || (*mri_state == '0'
761		      && ch == '1')))
762	    {
763	      mri_last_ch = ch;
764	      ++mri_state;
765	    }
766	  else if (*mri_state != '\0'
767		   || (lex[ch] != LEX_IS_WHITESPACE
768		       && lex[ch] != LEX_IS_NEWLINE))
769	    {
770	      /* We did not get the expected character, or we didn't
771		 get a valid terminating character after seeing the
772		 entire pseudo-op, so we must go back to the
773		 beginning.  */
774	      mri_state = NULL;
775	    }
776	  else
777	    {
	      /* We've read the entire pseudo-op.  mri_last_ch is
		 either '0' or '1', indicating whether to leave or
		 enter MRI mode respectively.  */
781	      do_scrub_begin (mri_last_ch == '1');
782	      mri_state = NULL;
783
784	      /* We continue handling the character as usual.  The
785		 main gas reader must also handle the .mri pseudo-op
786		 to control expression parsing and the like.  */
787	    }
788	}
789#endif
790
791      if (ch == EOF)
792	{
793	  if (state != 0)
794	    {
795	      as_warn (_("end of file not at end of a line; newline inserted"));
796	      state = 0;
797	      PUT ('\n');
798	    }
799	  goto fromeof;
800	}
801
802      switch (lex[ch])
803	{
804	case LEX_IS_WHITESPACE:
805	  do
806	    {
807	      ch = GET ();
808	    }
809	  while (ch != EOF && IS_WHITESPACE (ch));
810	  if (ch == EOF)
811	    goto fromeof;
812
813	  if (state == 0)
814	    {
815	      /* Preserve a single whitespace character at the
816		 beginning of a line.  */
817	      state = 1;
818	      UNGET (ch);
819	      PUT (' ');
820	      break;
821	    }
822
823#ifdef KEEP_WHITE_AROUND_COLON
824	  if (lex[ch] == LEX_IS_COLON)
825	    {
826	      /* Only keep this white if there's no white *after* the
827		 colon.  */
828	      ch2 = GET ();
829	      UNGET (ch2);
830	      if (!IS_WHITESPACE (ch2))
831		{
832		  state = 9;
833		  UNGET (ch);
834		  PUT (' ');
835		  break;
836		}
837	    }
838#endif
839	  if (IS_COMMENT (ch)
840	      || ch == '/'
841	      || IS_LINE_SEPARATOR (ch)
842	      || IS_PARALLEL_SEPARATOR (ch))
843	    {
844	      if (scrub_m68k_mri)
845		{
846		  /* In MRI mode, we keep these spaces.  */
847		  UNGET (ch);
848		  PUT (' ');
849		  break;
850		}
851	      goto recycle;
852	    }
853
854	  /* If we're in state 2 or 11, we've seen a non-white
855	     character followed by whitespace.  If the next character
856	     is ':', this is whitespace after a label name which we
857	     normally must ignore.  In MRI mode, though, spaces are
858	     not permitted between the label and the colon.  */
859	  if ((state == 2 || state == 11)
860	      && lex[ch] == LEX_IS_COLON
861	      && ! scrub_m68k_mri)
862	    {
863	      state = 1;
864	      PUT (ch);
865	      break;
866	    }
867
868	  switch (state)
869	    {
870	    case 1:
871	      /* We can arrive here if we leave a leading whitespace
872		 character at the beginning of a line.  */
873	      goto recycle;
874	    case 2:
875	      state = 3;
876	      if (to + 1 < toend)
877		{
878		  /* Optimize common case by skipping UNGET/GET.  */
879		  PUT (' ');	/* Sp after opco */
880		  goto recycle;
881		}
882	      UNGET (ch);
883	      PUT (' ');
884	      break;
885	    case 3:
886	      if (scrub_m68k_mri)
887		{
888		  /* In MRI mode, we keep these spaces.  */
889		  UNGET (ch);
890		  PUT (' ');
891		  break;
892		}
893	      goto recycle;	/* Sp in operands */
894	    case 9:
895	    case 10:
896	      if (scrub_m68k_mri)
897		{
898		  /* In MRI mode, we keep these spaces.  */
899		  state = 3;
900		  UNGET (ch);
901		  PUT (' ');
902		  break;
903		}
904	      state = 10;	/* Sp after symbol char */
905	      goto recycle;
906	    case 11:
907	      if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
908		state = 1;
909	      else
910		{
911		  /* We know that ch is not ':', since we tested that
912		     case above.  Therefore this is not a label, so it
913		     must be the opcode, and we've just seen the
914		     whitespace after it.  */
915		  state = 3;
916		}
917	      UNGET (ch);
918	      PUT (' ');	/* Sp after label definition.  */
919	      break;
920	    default:
921	      BAD_CASE (state);
922	    }
923	  break;
924
925	case LEX_IS_TWOCHAR_COMMENT_1ST:
926	  ch2 = GET ();
927	  if (ch2 == '*')
928	    {
929	      for (;;)
930		{
931		  do
932		    {
933		      ch2 = GET ();
934		      if (ch2 != EOF && IS_NEWLINE (ch2))
935			add_newlines++;
936		    }
937		  while (ch2 != EOF && ch2 != '*');
938
939		  while (ch2 == '*')
940		    ch2 = GET ();
941
942		  if (ch2 == EOF || ch2 == '/')
943		    break;
944
945		  /* This UNGET will ensure that we count newlines
946		     correctly.  */
947		  UNGET (ch2);
948		}
949
950	      if (ch2 == EOF)
951		as_warn (_("end of file in multiline comment"));
952
953	      ch = ' ';
954	      goto recycle;
955	    }
956#ifdef DOUBLESLASH_LINE_COMMENTS
957	  else if (ch2 == '/')
958	    {
959	      do
960		{
961		  ch = GET ();
962		}
963	      while (ch != EOF && !IS_NEWLINE (ch));
964	      if (ch == EOF)
965		as_warn ("end of file in comment; newline inserted");
966	      state = 0;
967	      PUT ('\n');
968	      break;
969	    }
970#endif
971	  else
972	    {
973	      if (ch2 != EOF)
974		UNGET (ch2);
975	      if (state == 9 || state == 10)
976		state = 3;
977	      PUT (ch);
978	    }
979	  break;
980
981	case LEX_IS_STRINGQUOTE:
982	  quotechar = ch;
983	  if (state == 10)
984	    {
985	      /* Preserve the whitespace in foo "bar".  */
986	      UNGET (ch);
987	      state = 3;
988	      PUT (' ');
989
990	      /* PUT didn't jump out.  We could just break, but we
991		 know what will happen, so optimize a bit.  */
992	      ch = GET ();
993	      old_state = 3;
994	    }
995	  else if (state == 9)
996	    old_state = 3;
997	  else
998	    old_state = state;
999	  state = 5;
1000	  PUT (ch);
1001	  break;
1002
1003#ifndef IEEE_STYLE
1004	case LEX_IS_ONECHAR_QUOTE:
1005	  if (state == 10)
1006	    {
1007	      /* Preserve the whitespace in foo 'b'.  */
1008	      UNGET (ch);
1009	      state = 3;
1010	      PUT (' ');
1011	      break;
1012	    }
1013	  ch = GET ();
1014	  if (ch == EOF)
1015	    {
1016	      as_warn (_("end of file after a one-character quote; \\0 inserted"));
1017	      ch = 0;
1018	    }
1019	  if (ch == '\\')
1020	    {
1021	      ch = GET ();
1022	      if (ch == EOF)
1023		{
1024		  as_warn (_("end of file in escape character"));
1025		  ch = '\\';
1026		}
1027	      else
1028		ch = process_escape (ch);
1029	    }
1030	  sprintf (out_buf, "%d", (int) (unsigned char) ch);
1031
1032	  /* None of these 'x constants for us.  We want 'x'.  */
1033	  if ((ch = GET ()) != '\'')
1034	    {
1035#ifdef REQUIRE_CHAR_CLOSE_QUOTE
1036	      as_warn (_("missing close quote; (assumed)"));
1037#else
1038	      if (ch != EOF)
1039		UNGET (ch);
1040#endif
1041	    }
1042	  if (strlen (out_buf) == 1)
1043	    {
1044	      PUT (out_buf[0]);
1045	      break;
1046	    }
1047	  if (state == 9)
1048	    old_state = 3;
1049	  else
1050	    old_state = state;
1051	  state = -1;
1052	  out_string = out_buf;
1053	  PUT (*out_string++);
1054	  break;
1055#endif
1056
1057	case LEX_IS_COLON:
1058#ifdef KEEP_WHITE_AROUND_COLON
1059	  state = 9;
1060#else
1061	  if (state == 9 || state == 10)
1062	    state = 3;
1063	  else if (state != 3)
1064	    state = 1;
1065#endif
1066	  PUT (ch);
1067	  break;
1068
1069	case LEX_IS_NEWLINE:
1070	  /* Roll out a bunch of newlines from inside comments, etc.  */
1071	  if (add_newlines)
1072	    {
1073	      --add_newlines;
1074	      UNGET (ch);
1075	    }
1076	  /* Fall through.  */
1077
1078	case LEX_IS_LINE_SEPARATOR:
1079	  state = 0;
1080	  PUT (ch);
1081	  break;
1082
1083	case LEX_IS_PARALLEL_SEPARATOR:
1084	  state = 1;
1085	  PUT (ch);
1086	  break;
1087
1088#ifdef TC_V850
1089	case LEX_IS_DOUBLEDASH_1ST:
1090	  ch2 = GET ();
1091	  if (ch2 != '-')
1092	    {
1093	      UNGET (ch2);
1094	      goto de_fault;
1095	    }
1096	  /* Read and skip to end of line.  */
1097	  do
1098	    {
1099	      ch = GET ();
1100	    }
1101	  while (ch != EOF && ch != '\n');
1102
1103	  if (ch == EOF)
1104	    as_warn (_("end of file in comment; newline inserted"));
1105
1106	  state = 0;
1107	  PUT ('\n');
1108	  break;
1109#endif
1110#ifdef DOUBLEBAR_PARALLEL
1111	case LEX_IS_DOUBLEBAR_1ST:
1112	  ch2 = GET ();
1113	  UNGET (ch2);
1114	  if (ch2 != '|')
1115	    goto de_fault;
1116
1117	  /* Handle '||' in two states as invoking PUT twice might
1118	     result in the first one jumping out of this loop.  We'd
1119	     then lose track of the state and one '|' char.  */
1120	  state = 13;
1121	  PUT ('|');
1122	  break;
1123#endif
1124	case LEX_IS_LINE_COMMENT_START:
1125	  /* FIXME-someday: The two character comment stuff was badly
1126	     thought out.  On i386, we want '/' as line comment start
1127	     AND we want C style comments.  hence this hack.  The
1128	     whole lexical process should be reworked.  xoxorich.  */
1129	  if (ch == '/')
1130	    {
1131	      ch2 = GET ();
1132	      if (ch2 == '*')
1133		{
1134		  old_state = 3;
1135		  state = -2;
1136		  break;
1137		}
1138	      else
1139		{
1140		  UNGET (ch2);
1141		}
1142	    }
1143
1144	  if (state == 0 || state == 1)	/* Only comment at start of line.  */
1145	    {
1146	      int startch;
1147
1148	      startch = ch;
1149
1150	      do
1151		{
1152		  ch = GET ();
1153		}
1154	      while (ch != EOF && IS_WHITESPACE (ch));
1155
1156	      if (ch == EOF)
1157		{
1158		  as_warn (_("end of file in comment; newline inserted"));
1159		  PUT ('\n');
1160		  break;
1161		}
1162
1163	      if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1164		{
1165		  /* Not a cpp line.  */
1166		  while (ch != EOF && !IS_NEWLINE (ch))
1167		    ch = GET ();
1168		  if (ch == EOF)
1169		    as_warn (_("end of file in comment; newline inserted"));
1170		  state = 0;
1171		  PUT ('\n');
1172		  break;
1173		}
1174	      /* Looks like `# 123 "filename"' from cpp.  */
1175	      UNGET (ch);
1176	      old_state = 4;
1177	      state = -1;
1178	      if (scrub_m68k_mri)
1179		out_string = "\tlinefile ";
1180	      else
1181		out_string = "\t.linefile ";
1182	      PUT (*out_string++);
1183	      break;
1184	    }
1185
1186#ifdef TC_D10V
1187	  /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1188	     Trap is the only short insn that has a first operand that is
1189	     neither register nor label.
1190	     We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 .
1191	     We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1192	     already LEX_IS_LINE_COMMENT_START.  However, it is the
1193	     only character in line_comment_chars for d10v, hence we
1194	     can recognize it as such.  */
1195	  /* An alternative approach would be to reset the state to 1 when
1196	     we see '||', '<-' or '->', but that seems to be overkill.  */
1197	  if (state == 10)
1198	    PUT (' ');
1199#endif
1200	  /* We have a line comment character which is not at the
1201	     start of a line.  If this is also a normal comment
1202	     character, fall through.  Otherwise treat it as a default
1203	     character.  */
1204	  if (strchr (tc_comment_chars, ch) == NULL
1205	      && (! scrub_m68k_mri
1206		  || (ch != '!' && ch != '*')))
1207	    goto de_fault;
1208	  if (scrub_m68k_mri
1209	      && (ch == '!' || ch == '*' || ch == '#')
1210	      && state != 1
1211	      && state != 10)
1212	    goto de_fault;
1213	  /* Fall through.  */
1214	case LEX_IS_COMMENT_START:
1215#if defined TC_ARM && defined OBJ_ELF
1216	  /* On the ARM, `@' is the comment character.
1217	     Unfortunately this is also a special character in ELF .symver
1218	     directives (and .type, though we deal with those another way).
1219	     So we check if this line is such a directive, and treat
1220	     the character as default if so.  This is a hack.  */
1221	  if ((symver_state != NULL) && (*symver_state == 0))
1222	    goto de_fault;
1223#endif
1224
1225#ifdef TC_ARM
1226	  /* For the ARM, care is needed not to damage occurrences of \@
1227	     by stripping the @ onwards.  Yuck.  */
1228	  if (to > tostart && *(to - 1) == '\\')
1229	    /* Do not treat the @ as a start-of-comment.  */
1230	    goto de_fault;
1231#endif
1232
1233#ifdef WARN_COMMENTS
1234	  if (!found_comment)
1235	    as_where (&found_comment_file, &found_comment);
1236#endif
1237	  do
1238	    {
1239	      ch = GET ();
1240	    }
1241	  while (ch != EOF && !IS_NEWLINE (ch));
1242	  if (ch == EOF)
1243	    as_warn (_("end of file in comment; newline inserted"));
1244	  state = 0;
1245	  PUT ('\n');
1246	  break;
1247
1248	case LEX_IS_SYMBOL_COMPONENT:
1249	  if (state == 10)
1250	    {
1251	      /* This is a symbol character following another symbol
1252		 character, with whitespace in between.  We skipped
1253		 the whitespace earlier, so output it now.  */
1254	      UNGET (ch);
1255	      state = 3;
1256	      PUT (' ');
1257	      break;
1258	    }
1259
1260#ifdef TC_Z80
1261	  /* "af'" is a symbol containing '\''.  */
1262	  if (state == 3 && (ch == 'a' || ch == 'A'))
1263	    {
1264	      state = 16;
1265	      PUT (ch);
1266	      ch = GET ();
1267	      if (ch == 'f' || ch == 'F')
1268		{
1269		  state = 17;
1270		  PUT (ch);
1271		  break;
1272		}
1273	      else
1274		{
1275		  state = 9;
1276		  if (!IS_SYMBOL_COMPONENT (ch))
1277		    {
1278		      UNGET (ch);
1279		      break;
1280		    }
1281		}
1282	    }
1283#endif
1284	  if (state == 3)
1285	    state = 9;
1286
1287	  /* This is a common case.  Quickly copy CH and all the
1288	     following symbol component or normal characters.  */
1289	  if (to + 1 < toend
1290	      && mri_state == NULL
1291#if defined TC_ARM && defined OBJ_ELF
1292	      && symver_state == NULL
1293#endif
1294	      )
1295	    {
1296	      char *s;
1297	      int len;
1298
1299	      for (s = from; s < fromend; s++)
1300		{
1301		  int type;
1302
1303		  ch2 = *(unsigned char *) s;
1304		  type = lex[ch2];
1305		  if (type != 0
1306		      && type != LEX_IS_SYMBOL_COMPONENT)
1307		    break;
1308		}
1309
1310	      if (s > from)
1311		/* Handle the last character normally, for
1312		   simplicity.  */
1313		--s;
1314
1315	      len = s - from;
1316
1317	      if (len > (toend - to) - 1)
1318		len = (toend - to) - 1;
1319
1320	      if (len > 0)
1321		{
1322		  PUT (ch);
1323		  memcpy (to, from, len);
1324		  to += len;
1325		  from += len;
1326		  if (to >= toend)
1327		    goto tofull;
1328		  ch = GET ();
1329		}
1330	    }
1331
1332	  /* Fall through.  */
1333	default:
1334	de_fault:
1335	  /* Some relatively `normal' character.  */
1336	  if (state == 0)
1337	    {
1338	      state = 11;	/* Now seeing label definition.  */
1339	    }
1340	  else if (state == 1)
1341	    {
1342	      state = 2;	/* Ditto.  */
1343	    }
1344	  else if (state == 9)
1345	    {
1346	      if (!IS_SYMBOL_COMPONENT (ch))
1347		state = 3;
1348	    }
1349	  else if (state == 10)
1350	    {
1351	      if (ch == '\\')
1352		{
1353		  /* Special handling for backslash: a backslash may
1354		     be the beginning of a formal parameter (of a
1355		     macro) following another symbol character, with
1356		     whitespace in between.  If that is the case, we
1357		     output a space before the parameter.  Strictly
1358		     speaking, correct handling depends upon what the
1359		     macro parameter expands into; if the parameter
1360		     expands into something which does not start with
1361		     an operand character, then we don't want to keep
1362		     the space.  We don't have enough information to
1363		     make the right choice, so here we are making the
1364		     choice which is more likely to be correct.  */
1365		  if (to + 1 >= toend)
1366		    {
1367		      /* If we're near the end of the buffer, save the
1368		         character for the next time round.  Otherwise
1369		         we'll lose our state.  */
1370		      UNGET (ch);
1371		      goto tofull;
1372		    }
1373		  *to++ = ' ';
1374		}
1375
1376	      state = 3;
1377	    }
1378	  PUT (ch);
1379	  break;
1380	}
1381    }
1382
1383  /*NOTREACHED*/
1384
1385 fromeof:
1386  /* We have reached the end of the input.  */
1387  return to - tostart;
1388
1389 tofull:
1390  /* The output buffer is full.  Save any input we have not yet
1391     processed.  */
1392  if (fromend > from)
1393    {
1394      saved_input = from;
1395      saved_input_len = fromend - from;
1396    }
1397  else
1398    saved_input = NULL;
1399
1400  return to - tostart;
1401}
1402
1403