1/* This is the Assembler Pre-Processor
2   Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
3   1999, 2000, 2001, 2002, 2003
4   Free Software Foundation, Inc.
5
6   This file is part of GAS, the GNU Assembler.
7
8   GAS is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2, or (at your option)
11   any later version.
12
13   GAS is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with GAS; see the file COPYING.  If not, write to the Free
20   Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
21   02110-1301, USA.  */
22
23/* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
24/* App, the assembler pre-processor.  This pre-processor strips out excess
25   spaces, turns single-quoted characters into a decimal constant, and turns
26   # <number> <filename> <garbage> into a .line <number>\n.file <filename>
27   pair.  This needs better error-handling.  */
28
29#include <stdio.h>
30#include "as.h"			/* For BAD_CASE() only.  */
31
32#if (__STDC__ != 1)
33#ifndef const
34#define const  /* empty */
35#endif
36#endif
37
#ifdef TC_M68K
/* Whether we are scrubbing in m68k MRI mode.  This is different from
   flag_m68k_mri, because the two flags will be affected by the .mri
   pseudo-op at different times.  */
static int scrub_m68k_mri;

/* The pseudo-op which switches in and out of MRI mode.  See the
   comment in do_scrub_chars.  This is the template the recognizer in
   do_scrub_chars matches against: the ' ' matches any whitespace
   character and the '0' also matches '1'.  */
static const char mri_pseudo[] = ".mri 0";
#else
/* Without TC_M68K there is no MRI mode; every test of scrub_m68k_mri
   sees the constant 0.  */
#define scrub_m68k_mri 0
#endif

#if defined TC_ARM && defined OBJ_ELF
/* The pseudo-op for which we need to special-case `@' characters.
   See the comment in do_scrub_chars.  */
static const char   symver_pseudo[] = ".symver";
/* Next character of symver_pseudo that do_scrub_chars expects to see,
   or NULL when no match is in progress.  */
static const char * symver_state;
#endif

/* Character class of every possible input byte, indexed by the byte
   value (0..255).  Filled in by do_scrub_begin with LEX_IS_* codes; an
   entry of 0 means the character has no special class.  */
static char lex[256];
/* Characters that do_scrub_begin classifies as
   LEX_IS_SYMBOL_COMPONENT.  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
61
/* Character class codes stored in lex[].  The numeric values carry no
   meaning beyond being distinct and non-zero (0 is the "no special
   class" default).  Some codes exist only for particular targets.  */
#define LEX_IS_SYMBOL_COMPONENT		1
#define LEX_IS_WHITESPACE		2
#define LEX_IS_LINE_SEPARATOR		3
#define LEX_IS_COMMENT_START		4
#define LEX_IS_LINE_COMMENT_START	5
#define	LEX_IS_TWOCHAR_COMMENT_1ST	6
#define	LEX_IS_STRINGQUOTE		8
#define	LEX_IS_COLON			9
#define	LEX_IS_NEWLINE			10
#define	LEX_IS_ONECHAR_QUOTE		11
#ifdef TC_V850
/* First '-' of a possible "--" comment introducer.  */
#define LEX_IS_DOUBLEDASH_1ST		12
#endif
#ifdef TC_M32R
#define DOUBLEBAR_PARALLEL
#endif
#ifdef DOUBLEBAR_PARALLEL
/* First '|' of a possible "||" parallel-instruction separator.  */
#define LEX_IS_DOUBLEBAR_1ST		13
#endif
#define LEX_IS_PARALLEL_SEPARATOR	14

/* Class predicates.  C is used directly as an index into lex[], so it
   must be in the range 0..255; in particular it must not be EOF.  */
#define IS_SYMBOL_COMPONENT(c)		(lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)		(lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)		(lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)	(lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)			(lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)		(lex[c] == LEX_IS_LINE_COMMENT_START)
#define	IS_NEWLINE(c)			(lex[c] == LEX_IS_NEWLINE)

/* Translate the character following a backslash inside a
   one-character quote; defined below.  */
static int process_escape (int);
91
92/* FIXME-soon: The entire lexer/parser thingy should be
93   built statically at compile time rather than dynamically
94   each and every time the assembler is run.  xoxorich.  */
95
96void
97do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
98{
99  const char *p;
100  int c;
101
102  lex[' '] = LEX_IS_WHITESPACE;
103  lex['\t'] = LEX_IS_WHITESPACE;
104  lex['\r'] = LEX_IS_WHITESPACE;
105  lex['\n'] = LEX_IS_NEWLINE;
106  lex[':'] = LEX_IS_COLON;
107
108#ifdef TC_M68K
109  scrub_m68k_mri = m68k_mri;
110
111  if (! m68k_mri)
112#endif
113    {
114      lex['"'] = LEX_IS_STRINGQUOTE;
115
116#if ! defined (TC_HPPA) && ! defined (TC_I370)
117      /* I370 uses single-quotes to delimit integer, float constants.  */
118      lex['\''] = LEX_IS_ONECHAR_QUOTE;
119#endif
120
121#ifdef SINGLE_QUOTE_STRINGS
122      lex['\''] = LEX_IS_STRINGQUOTE;
123#endif
124    }
125
126  /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
127     in state 5 of do_scrub_chars must be changed.  */
128
129  /* Note that these override the previous defaults, e.g. if ';' is a
130     comment char, then it isn't a line separator.  */
131  for (p = symbol_chars; *p; ++p)
132    lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
133
134  for (c = 128; c < 256; ++c)
135    lex[c] = LEX_IS_SYMBOL_COMPONENT;
136
137#ifdef tc_symbol_chars
138  /* This macro permits the processor to specify all characters which
139     may appears in an operand.  This will prevent the scrubber from
140     discarding meaningful whitespace in certain cases.  The i386
141     backend uses this to support prefixes, which can confuse the
142     scrubber as to whether it is parsing operands or opcodes.  */
143  for (p = tc_symbol_chars; *p; ++p)
144    lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
145#endif
146
147  /* The m68k backend wants to be able to change comment_chars.  */
148#ifndef tc_comment_chars
149#define tc_comment_chars comment_chars
150#endif
151  for (p = tc_comment_chars; *p; p++)
152    lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
153
154  for (p = line_comment_chars; *p; p++)
155    lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
156
157  for (p = line_separator_chars; *p; p++)
158    lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
159
160#ifdef tc_parallel_separator_chars
161  /* This macro permits the processor to specify all characters which
162     separate parallel insns on the same line.  */
163  for (p = tc_parallel_separator_chars; *p; p++)
164    lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
165#endif
166
167  /* Only allow slash-star comments if slash is not in use.
168     FIXME: This isn't right.  We should always permit them.  */
169  if (lex['/'] == 0)
170    lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
171
172#ifdef TC_M68K
173  if (m68k_mri)
174    {
175      lex['\''] = LEX_IS_STRINGQUOTE;
176      lex[';'] = LEX_IS_COMMENT_START;
177      lex['*'] = LEX_IS_LINE_COMMENT_START;
178      /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
179	 then it can't be used in an expression.  */
180      lex['!'] = LEX_IS_LINE_COMMENT_START;
181    }
182#endif
183
184#ifdef TC_V850
185  lex['-'] = LEX_IS_DOUBLEDASH_1ST;
186#endif
187#ifdef DOUBLEBAR_PARALLEL
188  lex['|'] = LEX_IS_DOUBLEBAR_1ST;
189#endif
190#ifdef TC_D30V
191  /* Must do this is we want VLIW instruction with "->" or "<-".  */
192  lex['-'] = LEX_IS_SYMBOL_COMPONENT;
193#endif
194}
195
/* Saved state of the scrubber.  These persist between calls to
   do_scrub_chars and are what app_push/app_pop save and restore.  */

/* Current state of the do_scrub_chars state machine; see the list of
   states at the top of that function.  */
static int state;
/* State to resume once a temporary state (-1, -2 or 5) finishes.  */
static int old_state;
/* Next character to emit while in state -1.  */
static char *out_string;
/* Holds the decimal text generated for a one-character quote constant
   while it is being emitted through out_string.  */
static char out_buf[20];
/* Number of newlines consumed inside comments or continued strings
   that have not yet been written to the output.  */
static int add_newlines;
/* If non-NULL, input that do_scrub_chars resumes from on entry instead
   of fetching more; saved_input_len is its length in bytes.  */
static char *saved_input;
static int saved_input_len;
/* Buffer handed to the GET callback for it to fill with raw input.  */
static char input_buffer[32 * 1024];
/* Next character of mri_pseudo that do_scrub_chars expects to see, or
   NULL when no match is in progress.  */
static const char *mri_state;
/* Most recent character accepted by the .mri matcher; once the whole
   pseudo-op has matched this is its '0' or '1' argument.  */
static char mri_last_ch;
207
/* Data structure for saving the state of app across #include's.  Note that
   app is called asynchronously to the parsing of the .include's, so our
   state at the time .include is interpreted is completely unrelated.
   That's why we have to save it all.

   Each member mirrors the file-scope variable of the same name:
   app_push fills one of these in and app_pop copies it back.  */

struct app_save
{
  int          state;
  int          old_state;
  char *       out_string;
  char         out_buf[sizeof (out_buf)];
  int          add_newlines;
  /* Heap copy of the pending input made by app_push and freed by
     app_pop, or NULL if there was none.  */
  char *       saved_input;
  /* Length of saved_input; only consulted when saved_input is
     non-NULL.  */
  int          saved_input_len;
#ifdef TC_M68K
  int          scrub_m68k_mri;
#endif
  const char * mri_state;
  char         mri_last_ch;
#if defined TC_ARM && defined OBJ_ELF
  const char * symver_state;
#endif
};
231
232char *
233app_push (void)
234{
235  register struct app_save *saved;
236
237  saved = (struct app_save *) xmalloc (sizeof (*saved));
238  saved->state = state;
239  saved->old_state = old_state;
240  saved->out_string = out_string;
241  memcpy (saved->out_buf, out_buf, sizeof (out_buf));
242  saved->add_newlines = add_newlines;
243  if (saved_input == NULL)
244    saved->saved_input = NULL;
245  else
246    {
247      saved->saved_input = xmalloc (saved_input_len);
248      memcpy (saved->saved_input, saved_input, saved_input_len);
249      saved->saved_input_len = saved_input_len;
250    }
251#ifdef TC_M68K
252  saved->scrub_m68k_mri = scrub_m68k_mri;
253#endif
254  saved->mri_state = mri_state;
255  saved->mri_last_ch = mri_last_ch;
256#if defined TC_ARM && defined OBJ_ELF
257  saved->symver_state = symver_state;
258#endif
259
260  /* do_scrub_begin() is not useful, just wastes time.  */
261
262  state = 0;
263  saved_input = NULL;
264
265  return (char *) saved;
266}
267
268void
269app_pop (char *arg)
270{
271  register struct app_save *saved = (struct app_save *) arg;
272
273  /* There is no do_scrub_end ().  */
274  state = saved->state;
275  old_state = saved->old_state;
276  out_string = saved->out_string;
277  memcpy (out_buf, saved->out_buf, sizeof (out_buf));
278  add_newlines = saved->add_newlines;
279  if (saved->saved_input == NULL)
280    saved_input = NULL;
281  else
282    {
283      assert (saved->saved_input_len <= (int) (sizeof input_buffer));
284      memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
285      saved_input = input_buffer;
286      saved_input_len = saved->saved_input_len;
287      free (saved->saved_input);
288    }
289#ifdef TC_M68K
290  scrub_m68k_mri = saved->scrub_m68k_mri;
291#endif
292  mri_state = saved->mri_state;
293  mri_last_ch = saved->mri_last_ch;
294#if defined TC_ARM && defined OBJ_ELF
295  symver_state = saved->symver_state;
296#endif
297
298  free (arg);
299}
300
/* Return the character denoted by a backslash followed by CH.  Only
   \b, \f, \n, \r and \t are translated; any other CH (including the
   quote characters) is returned unchanged.

   @@ This assumes that \n &c are the same on host and target.  This is
   not necessarily true.  */

static int
process_escape (int ch)
{
  /* letters[i] after a backslash denotes the control character
     codes[i].  */
  static const char letters[] = "bfnrt";
  static const char codes[] = "\b\f\n\r\t";
  unsigned int i;

  for (i = 0; i < sizeof (letters) - 1; i++)
    if (ch == letters[i])
      return codes[i];

  /* Everything else, ' and " included, stands for itself.  */
  return ch;
}
327
328/* This function is called to process input characters.  The GET
329   parameter is used to retrieve more input characters.  GET should
330   set its parameter to point to a buffer, and return the length of
331   the buffer; it should return 0 at end of file.  The scrubbed output
332   characters are put into the buffer starting at TOSTART; the TOSTART
333   buffer is TOLEN bytes in length.  The function returns the number
334   of scrubbed characters put into TOSTART.  This will be TOLEN unless
335   end of file was seen.  This function is arranged as a state
336   machine, and saves its state so that it may return at any point.
337   This is the way the old code used to work.  */
338
339int
340do_scrub_chars (int (*get) (char *, int), char *tostart, int tolen)
341{
342  char *to = tostart;
343  char *toend = tostart + tolen;
344  char *from;
345  char *fromend;
346  int fromlen;
347  register int ch, ch2 = 0;
348  /* Character that started the string we're working on.  */
349  static char quotechar;
350
351  /*State 0: beginning of normal line
352	  1: After first whitespace on line (flush more white)
353	  2: After first non-white (opcode) on line (keep 1white)
354	  3: after second white on line (into operands) (flush white)
355	  4: after putting out a .line, put out digits
356	  5: parsing a string, then go to old-state
357	  6: putting out \ escape in a "d string.
358	  7: After putting out a .appfile, put out string.
359	  8: After putting out a .appfile string, flush until newline.
360	  9: After seeing symbol char in state 3 (keep 1white after symchar)
361	 10: After seeing whitespace in state 9 (keep white before symchar)
362	 11: After seeing a symbol character in state 0 (eg a label definition)
363	 -1: output string in out_string and go to the state in old_state
364	 -2: flush text until a '*' '/' is seen, then go to state old_state
365#ifdef TC_V850
366	 12: After seeing a dash, looking for a second dash as a start
367	     of comment.
368#endif
369#ifdef DOUBLEBAR_PARALLEL
370	 13: After seeing a vertical bar, looking for a second
371	     vertical bar as a parallel expression separator.
372#endif
373#ifdef TC_IA64
374	 14: After seeing a `(' at state 0, looking for a `)' as
375	     predicate.
376	 15: After seeing a `(' at state 1, looking for a `)' as
377	     predicate.
378#endif
379#ifdef TC_Z80
380	 16: After seeing an 'a' or an 'A' at the start of a symbol
381	 17: After seeing an 'f' or an 'F' in state 16
382#endif
383	  */
384
385  /* I added states 9 and 10 because the MIPS ECOFF assembler uses
386     constructs like ``.loc 1 20''.  This was turning into ``.loc
387     120''.  States 9 and 10 ensure that a space is never dropped in
388     between characters which could appear in an identifier.  Ian
389     Taylor, ian@cygnus.com.
390
391     I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
392     correctly on the PA (and any other target where colons are optional).
393     Jeff Law, law@cs.utah.edu.
394
395     I added state 13 so that something like "cmp r1, r2 || trap #1" does not
396     get squashed into "cmp r1,r2||trap#1", with the all important space
397     between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
398
399  /* This macro gets the next input character.  */
400
401#define GET()							\
402  (from < fromend						\
403   ? * (unsigned char *) (from++)				\
404   : (saved_input = NULL,					\
405      fromlen = (*get) (input_buffer, sizeof input_buffer),	\
406      from = input_buffer,					\
407      fromend = from + fromlen,					\
408      (fromlen == 0						\
409       ? EOF							\
410       : * (unsigned char *) (from++))))
411
412  /* This macro pushes a character back on the input stream.  */
413
414#define UNGET(uch) (*--from = (uch))
415
416  /* This macro puts a character into the output buffer.  If this
417     character fills the output buffer, this macro jumps to the label
418     TOFULL.  We use this rather ugly approach because we need to
419     handle two different termination conditions: EOF on the input
420     stream, and a full output buffer.  It would be simpler if we
421     always read in the entire input stream before processing it, but
422     I don't want to make such a significant change to the assembler's
423     memory usage.  */
424
425#define PUT(pch)				\
426  do						\
427    {						\
428      *to++ = (pch);				\
429      if (to >= toend)				\
430	goto tofull;				\
431    }						\
432  while (0)
433
434  if (saved_input != NULL)
435    {
436      from = saved_input;
437      fromend = from + saved_input_len;
438    }
439  else
440    {
441      fromlen = (*get) (input_buffer, sizeof input_buffer);
442      if (fromlen == 0)
443	return 0;
444      from = input_buffer;
445      fromend = from + fromlen;
446    }
447
448  while (1)
449    {
450      /* The cases in this switch end with continue, in order to
451	 branch back to the top of this while loop and generate the
452	 next output character in the appropriate state.  */
453      switch (state)
454	{
455	case -1:
456	  ch = *out_string++;
457	  if (*out_string == '\0')
458	    {
459	      state = old_state;
460	      old_state = 3;
461	    }
462	  PUT (ch);
463	  continue;
464
465	case -2:
466	  for (;;)
467	    {
468	      do
469		{
470		  ch = GET ();
471
472		  if (ch == EOF)
473		    {
474		      as_warn (_("end of file in comment"));
475		      goto fromeof;
476		    }
477
478		  if (ch == '\n')
479		    PUT ('\n');
480		}
481	      while (ch != '*');
482
483	      while ((ch = GET ()) == '*')
484		;
485
486	      if (ch == EOF)
487		{
488		  as_warn (_("end of file in comment"));
489		  goto fromeof;
490		}
491
492	      if (ch == '/')
493		break;
494
495	      UNGET (ch);
496	    }
497
498	  state = old_state;
499	  UNGET (' ');
500	  continue;
501
502	case 4:
503	  ch = GET ();
504	  if (ch == EOF)
505	    goto fromeof;
506	  else if (ch >= '0' && ch <= '9')
507	    PUT (ch);
508	  else
509	    {
510	      while (ch != EOF && IS_WHITESPACE (ch))
511		ch = GET ();
512	      if (ch == '"')
513		{
514		  UNGET (ch);
515		  if (scrub_m68k_mri)
516		    out_string = "\n\tappfile ";
517		  else
518		    out_string = "\n\t.appfile ";
519		  old_state = 7;
520		  state = -1;
521		  PUT (*out_string++);
522		}
523	      else
524		{
525		  while (ch != EOF && ch != '\n')
526		    ch = GET ();
527		  state = 0;
528		  PUT (ch);
529		}
530	    }
531	  continue;
532
533	case 5:
534	  /* We are going to copy everything up to a quote character,
535	     with special handling for a backslash.  We try to
536	     optimize the copying in the simple case without using the
537	     GET and PUT macros.  */
538	  {
539	    char *s;
540	    int len;
541
542	    for (s = from; s < fromend; s++)
543	      {
544		ch = *s;
545		if (ch == '\\'
546		    || ch == quotechar
547		    || ch == '\n')
548		  break;
549	      }
550	    len = s - from;
551	    if (len > toend - to)
552	      len = toend - to;
553	    if (len > 0)
554	      {
555		memcpy (to, from, len);
556		to += len;
557		from += len;
558	      }
559	  }
560
561	  ch = GET ();
562	  if (ch == EOF)
563	    {
564	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
565	      state = old_state;
566	      UNGET ('\n');
567	      PUT (quotechar);
568	    }
569	  else if (ch == quotechar)
570	    {
571	      state = old_state;
572	      PUT (ch);
573	    }
574#ifndef NO_STRING_ESCAPES
575	  else if (ch == '\\')
576	    {
577	      state = 6;
578	      PUT (ch);
579	    }
580#endif
581	  else if (scrub_m68k_mri && ch == '\n')
582	    {
583	      /* Just quietly terminate the string.  This permits lines like
584		   bne	label	loop if we haven't reach end yet.  */
585	      state = old_state;
586	      UNGET (ch);
587	      PUT ('\'');
588	    }
589	  else
590	    {
591	      PUT (ch);
592	    }
593	  continue;
594
595	case 6:
596	  state = 5;
597	  ch = GET ();
598	  switch (ch)
599	    {
600	      /* Handle strings broken across lines, by turning '\n' into
601		 '\\' and 'n'.  */
602	    case '\n':
603	      UNGET ('n');
604	      add_newlines++;
605	      PUT ('\\');
606	      continue;
607
608	    case EOF:
609	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
610	      PUT (quotechar);
611	      continue;
612
613	    case '"':
614	    case '\\':
615	    case 'b':
616	    case 'f':
617	    case 'n':
618	    case 'r':
619	    case 't':
620	    case 'v':
621	    case 'x':
622	    case 'X':
623	    case '0':
624	    case '1':
625	    case '2':
626	    case '3':
627	    case '4':
628	    case '5':
629	    case '6':
630	    case '7':
631	      break;
632
633	    default:
634#ifdef ONLY_STANDARD_ESCAPES
635	      as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
636#endif
637	      break;
638	    }
639	  PUT (ch);
640	  continue;
641
642	case 7:
643	  ch = GET ();
644	  quotechar = ch;
645	  state = 5;
646	  old_state = 8;
647	  PUT (ch);
648	  continue;
649
650	case 8:
651	  do
652	    ch = GET ();
653	  while (ch != '\n' && ch != EOF);
654	  if (ch == EOF)
655	    goto fromeof;
656	  state = 0;
657	  PUT (ch);
658	  continue;
659
660#ifdef DOUBLEBAR_PARALLEL
661	case 13:
662	  ch = GET ();
663	  if (ch != '|')
664	    abort ();
665
666	  /* Reset back to state 1 and pretend that we are parsing a
667	     line from just after the first white space.  */
668	  state = 1;
669	  PUT ('|');
670	  continue;
671#endif
672#ifdef TC_Z80
673	case 16:
674	  /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
675	  ch = GET ();
676	  if (ch == 'f' || ch == 'F')
677	    {
678	      state = 17;
679	      PUT (ch);
680	    }
681	  else
682	    {
683	      state = 9;
684	      break;
685	    }
686	case 17:
687	  /* We have seen "af" at the start of a symbol,
688	     a ' here is a part of that symbol.  */
689	  ch = GET ();
690	  state = 9;
691	  if (ch == '\'')
692	    /* Change to avoid warning about unclosed string.  */
693	    PUT ('`');
694	  else
695	    UNGET (ch);
696	  break;
697#endif
698	}
699
700      /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
701
702      /* flushchar: */
703      ch = GET ();
704
705#ifdef TC_IA64
706      if (ch == '(' && (state == 0 || state == 1))
707	{
708	  state += 14;
709	  PUT (ch);
710	  continue;
711	}
712      else if (state == 14 || state == 15)
713	{
714	  if (ch == ')')
715	    {
716	      state -= 14;
717	      PUT (ch);
718	      ch = GET ();
719	    }
720	  else
721	    {
722	      PUT (ch);
723	      continue;
724	    }
725	}
726#endif
727
728    recycle:
729
730#if defined TC_ARM && defined OBJ_ELF
731      /* We need to watch out for .symver directives.  See the comment later
732	 in this function.  */
733      if (symver_state == NULL)
734	{
735	  if ((state == 0 || state == 1) && ch == symver_pseudo[0])
736	    symver_state = symver_pseudo + 1;
737	}
738      else
739	{
740	  /* We advance to the next state if we find the right
741	     character.  */
742	  if (ch != '\0' && (*symver_state == ch))
743	    ++symver_state;
744	  else if (*symver_state != '\0')
745	    /* We did not get the expected character, or we didn't
746	       get a valid terminating character after seeing the
747	       entire pseudo-op, so we must go back to the beginning.  */
748	    symver_state = NULL;
749	  else
750	    {
751	      /* We've read the entire pseudo-op.  If this is the end
752		 of the line, go back to the beginning.  */
753	      if (IS_NEWLINE (ch))
754		symver_state = NULL;
755	    }
756	}
757#endif /* TC_ARM && OBJ_ELF */
758
759#ifdef TC_M68K
760      /* We want to have pseudo-ops which control whether we are in
761	 MRI mode or not.  Unfortunately, since m68k MRI mode affects
762	 the scrubber, that means that we need a special purpose
763	 recognizer here.  */
764      if (mri_state == NULL)
765	{
766	  if ((state == 0 || state == 1)
767	      && ch == mri_pseudo[0])
768	    mri_state = mri_pseudo + 1;
769	}
770      else
771	{
772	  /* We advance to the next state if we find the right
773	     character, or if we need a space character and we get any
774	     whitespace character, or if we need a '0' and we get a
775	     '1' (this is so that we only need one state to handle
776	     ``.mri 0'' and ``.mri 1'').  */
777	  if (ch != '\0'
778	      && (*mri_state == ch
779		  || (*mri_state == ' '
780		      && lex[ch] == LEX_IS_WHITESPACE)
781		  || (*mri_state == '0'
782		      && ch == '1')))
783	    {
784	      mri_last_ch = ch;
785	      ++mri_state;
786	    }
787	  else if (*mri_state != '\0'
788		   || (lex[ch] != LEX_IS_WHITESPACE
789		       && lex[ch] != LEX_IS_NEWLINE))
790	    {
791	      /* We did not get the expected character, or we didn't
792		 get a valid terminating character after seeing the
793		 entire pseudo-op, so we must go back to the
794		 beginning.  */
795	      mri_state = NULL;
796	    }
797	  else
798	    {
	      /* We've read the entire pseudo-op.  mri_last_ch is
		 either '0' or '1': '1' means enter MRI mode and
		 '0' means leave it.  */
802	      do_scrub_begin (mri_last_ch == '1');
803	      mri_state = NULL;
804
805	      /* We continue handling the character as usual.  The
806		 main gas reader must also handle the .mri pseudo-op
807		 to control expression parsing and the like.  */
808	    }
809	}
810#endif
811
812      if (ch == EOF)
813	{
814	  if (state != 0)
815	    {
816	      as_warn (_("end of file not at end of a line; newline inserted"));
817	      state = 0;
818	      PUT ('\n');
819	    }
820	  goto fromeof;
821	}
822
823      switch (lex[ch])
824	{
825	case LEX_IS_WHITESPACE:
826	  do
827	    {
828	      ch = GET ();
829	    }
830	  while (ch != EOF && IS_WHITESPACE (ch));
831	  if (ch == EOF)
832	    goto fromeof;
833
834	  if (state == 0)
835	    {
836	      /* Preserve a single whitespace character at the
837		 beginning of a line.  */
838	      state = 1;
839	      UNGET (ch);
840	      PUT (' ');
841	      break;
842	    }
843
844#ifdef KEEP_WHITE_AROUND_COLON
845	  if (lex[ch] == LEX_IS_COLON)
846	    {
847	      /* Only keep this white if there's no white *after* the
848		 colon.  */
849	      ch2 = GET ();
850	      UNGET (ch2);
851	      if (!IS_WHITESPACE (ch2))
852		{
853		  state = 9;
854		  UNGET (ch);
855		  PUT (' ');
856		  break;
857		}
858	    }
859#endif
860	  if (IS_COMMENT (ch)
861	      || ch == '/'
862	      || IS_LINE_SEPARATOR (ch)
863	      || IS_PARALLEL_SEPARATOR (ch))
864	    {
865	      if (scrub_m68k_mri)
866		{
867		  /* In MRI mode, we keep these spaces.  */
868		  UNGET (ch);
869		  PUT (' ');
870		  break;
871		}
872	      goto recycle;
873	    }
874
875	  /* If we're in state 2 or 11, we've seen a non-white
876	     character followed by whitespace.  If the next character
877	     is ':', this is whitespace after a label name which we
878	     normally must ignore.  In MRI mode, though, spaces are
879	     not permitted between the label and the colon.  */
880	  if ((state == 2 || state == 11)
881	      && lex[ch] == LEX_IS_COLON
882	      && ! scrub_m68k_mri)
883	    {
884	      state = 1;
885	      PUT (ch);
886	      break;
887	    }
888
889	  switch (state)
890	    {
891	    case 0:
892	      state++;
893	      goto recycle;	/* Punted leading sp */
894	    case 1:
895	      /* We can arrive here if we leave a leading whitespace
896		 character at the beginning of a line.  */
897	      goto recycle;
898	    case 2:
899	      state = 3;
900	      if (to + 1 < toend)
901		{
902		  /* Optimize common case by skipping UNGET/GET.  */
903		  PUT (' ');	/* Sp after opco */
904		  goto recycle;
905		}
906	      UNGET (ch);
907	      PUT (' ');
908	      break;
909	    case 3:
910	      if (scrub_m68k_mri)
911		{
912		  /* In MRI mode, we keep these spaces.  */
913		  UNGET (ch);
914		  PUT (' ');
915		  break;
916		}
917	      goto recycle;	/* Sp in operands */
918	    case 9:
919	    case 10:
920	      if (scrub_m68k_mri)
921		{
922		  /* In MRI mode, we keep these spaces.  */
923		  state = 3;
924		  UNGET (ch);
925		  PUT (' ');
926		  break;
927		}
928	      state = 10;	/* Sp after symbol char */
929	      goto recycle;
930	    case 11:
931	      if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
932		state = 1;
933	      else
934		{
935		  /* We know that ch is not ':', since we tested that
936		     case above.  Therefore this is not a label, so it
937		     must be the opcode, and we've just seen the
938		     whitespace after it.  */
939		  state = 3;
940		}
941	      UNGET (ch);
942	      PUT (' ');	/* Sp after label definition.  */
943	      break;
944	    default:
945	      BAD_CASE (state);
946	    }
947	  break;
948
949	case LEX_IS_TWOCHAR_COMMENT_1ST:
950	  ch2 = GET ();
951	  if (ch2 == '*')
952	    {
953	      for (;;)
954		{
955		  do
956		    {
957		      ch2 = GET ();
958		      if (ch2 != EOF && IS_NEWLINE (ch2))
959			add_newlines++;
960		    }
961		  while (ch2 != EOF && ch2 != '*');
962
963		  while (ch2 == '*')
964		    ch2 = GET ();
965
966		  if (ch2 == EOF || ch2 == '/')
967		    break;
968
969		  /* This UNGET will ensure that we count newlines
970		     correctly.  */
971		  UNGET (ch2);
972		}
973
974	      if (ch2 == EOF)
975		as_warn (_("end of file in multiline comment"));
976
977	      ch = ' ';
978	      goto recycle;
979	    }
980#ifdef DOUBLESLASH_LINE_COMMENTS
981	  else if (ch2 == '/')
982	    {
983	      do
984		{
985		  ch = GET ();
986		}
987	      while (ch != EOF && !IS_NEWLINE (ch));
988	      if (ch == EOF)
989		as_warn ("end of file in comment; newline inserted");
990	      state = 0;
991	      PUT ('\n');
992	      break;
993	    }
994#endif
995	  else
996	    {
997	      if (ch2 != EOF)
998		UNGET (ch2);
999	      if (state == 9 || state == 10)
1000		state = 3;
1001	      PUT (ch);
1002	    }
1003	  break;
1004
1005	case LEX_IS_STRINGQUOTE:
1006	  quotechar = ch;
1007	  if (state == 10)
1008	    {
1009	      /* Preserve the whitespace in foo "bar".  */
1010	      UNGET (ch);
1011	      state = 3;
1012	      PUT (' ');
1013
1014	      /* PUT didn't jump out.  We could just break, but we
1015		 know what will happen, so optimize a bit.  */
1016	      ch = GET ();
1017	      old_state = 3;
1018	    }
1019	  else if (state == 9)
1020	    old_state = 3;
1021	  else
1022	    old_state = state;
1023	  state = 5;
1024	  PUT (ch);
1025	  break;
1026
1027#ifndef IEEE_STYLE
1028	case LEX_IS_ONECHAR_QUOTE:
1029	  if (state == 10)
1030	    {
1031	      /* Preserve the whitespace in foo 'b'.  */
1032	      UNGET (ch);
1033	      state = 3;
1034	      PUT (' ');
1035	      break;
1036	    }
1037	  ch = GET ();
1038	  if (ch == EOF)
1039	    {
1040	      as_warn (_("end of file after a one-character quote; \\0 inserted"));
1041	      ch = 0;
1042	    }
1043	  if (ch == '\\')
1044	    {
1045	      ch = GET ();
1046	      if (ch == EOF)
1047		{
1048		  as_warn (_("end of file in escape character"));
1049		  ch = '\\';
1050		}
1051	      else
1052		ch = process_escape (ch);
1053	    }
1054	  sprintf (out_buf, "%d", (int) (unsigned char) ch);
1055
1056	  /* None of these 'x constants for us.  We want 'x'.  */
1057	  if ((ch = GET ()) != '\'')
1058	    {
1059#ifdef REQUIRE_CHAR_CLOSE_QUOTE
1060	      as_warn (_("missing close quote; (assumed)"));
1061#else
1062	      if (ch != EOF)
1063		UNGET (ch);
1064#endif
1065	    }
1066	  if (strlen (out_buf) == 1)
1067	    {
1068	      PUT (out_buf[0]);
1069	      break;
1070	    }
1071	  if (state == 9)
1072	    old_state = 3;
1073	  else
1074	    old_state = state;
1075	  state = -1;
1076	  out_string = out_buf;
1077	  PUT (*out_string++);
1078	  break;
1079#endif
1080
1081	case LEX_IS_COLON:
1082#ifdef KEEP_WHITE_AROUND_COLON
1083	  state = 9;
1084#else
1085	  if (state == 9 || state == 10)
1086	    state = 3;
1087	  else if (state != 3)
1088	    state = 1;
1089#endif
1090	  PUT (ch);
1091	  break;
1092
1093	case LEX_IS_NEWLINE:
1094	  /* Roll out a bunch of newlines from inside comments, etc.  */
1095	  if (add_newlines)
1096	    {
1097	      --add_newlines;
1098	      UNGET (ch);
1099	    }
1100	  /* Fall through.  */
1101
1102	case LEX_IS_LINE_SEPARATOR:
1103	  state = 0;
1104	  PUT (ch);
1105	  break;
1106
1107	case LEX_IS_PARALLEL_SEPARATOR:
1108	  state = 1;
1109	  PUT (ch);
1110	  break;
1111
1112#ifdef TC_V850
1113	case LEX_IS_DOUBLEDASH_1ST:
1114	  ch2 = GET ();
1115	  if (ch2 != '-')
1116	    {
1117	      UNGET (ch2);
1118	      goto de_fault;
1119	    }
1120	  /* Read and skip to end of line.  */
1121	  do
1122	    {
1123	      ch = GET ();
1124	    }
1125	  while (ch != EOF && ch != '\n');
1126
1127	  if (ch == EOF)
1128	    as_warn (_("end of file in comment; newline inserted"));
1129
1130	  state = 0;
1131	  PUT ('\n');
1132	  break;
1133#endif
1134#ifdef DOUBLEBAR_PARALLEL
1135	case LEX_IS_DOUBLEBAR_1ST:
1136	  ch2 = GET ();
1137	  UNGET (ch2);
1138	  if (ch2 != '|')
1139	    goto de_fault;
1140
1141	  /* Handle '||' in two states as invoking PUT twice might
1142	     result in the first one jumping out of this loop.  We'd
1143	     then lose track of the state and one '|' char.  */
1144	  state = 13;
1145	  PUT ('|');
1146	  break;
1147#endif
1148	case LEX_IS_LINE_COMMENT_START:
1149	  /* FIXME-someday: The two character comment stuff was badly
1150	     thought out.  On i386, we want '/' as line comment start
1151	     AND we want C style comments.  hence this hack.  The
1152	     whole lexical process should be reworked.  xoxorich.  */
1153	  if (ch == '/')
1154	    {
1155	      ch2 = GET ();
1156	      if (ch2 == '*')
1157		{
1158		  old_state = 3;
1159		  state = -2;
1160		  break;
1161		}
1162	      else
1163		{
1164		  UNGET (ch2);
1165		}
1166	    }
1167
1168	  if (state == 0 || state == 1)	/* Only comment at start of line.  */
1169	    {
1170	      int startch;
1171
1172	      startch = ch;
1173
1174	      do
1175		{
1176		  ch = GET ();
1177		}
1178	      while (ch != EOF && IS_WHITESPACE (ch));
1179
1180	      if (ch == EOF)
1181		{
1182		  as_warn (_("end of file in comment; newline inserted"));
1183		  PUT ('\n');
1184		  break;
1185		}
1186
1187	      if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1188		{
1189		  /* Not a cpp line.  */
1190		  while (ch != EOF && !IS_NEWLINE (ch))
1191		    ch = GET ();
1192		  if (ch == EOF)
1193		    as_warn (_("end of file in comment; newline inserted"));
1194		  state = 0;
1195		  PUT ('\n');
1196		  break;
1197		}
1198	      /* Looks like `# 123 "filename"' from cpp.  */
1199	      UNGET (ch);
1200	      old_state = 4;
1201	      state = -1;
1202	      if (scrub_m68k_mri)
1203		out_string = "\tappline ";
1204	      else
1205		out_string = "\t.appline ";
1206	      PUT (*out_string++);
1207	      break;
1208	    }
1209
1210#ifdef TC_D10V
1211	  /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1212	     Trap is the only short insn that has a first operand that is
1213	     neither register nor label.
1214	     We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1.
1215	     We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1216	     already LEX_IS_LINE_COMMENT_START.  However, it is the
1217	     only character in line_comment_chars for d10v, hence we
1218	     can recognize it as such.  */
1219	  /* An alternative approach would be to reset the state to 1 when
1220	     we see '||', '<-' or '->', but that seems to be overkill.  */
1221	  if (state == 10)
1222	    PUT (' ');
1223#endif
1224	  /* We have a line comment character which is not at the
1225	     start of a line.  If this is also a normal comment
1226	     character, fall through.  Otherwise treat it as a default
1227	     character.  */
1228	  if (strchr (tc_comment_chars, ch) == NULL
1229	      && (! scrub_m68k_mri
1230		  || (ch != '!' && ch != '*')))
1231	    goto de_fault;
1232	  if (scrub_m68k_mri
1233	      && (ch == '!' || ch == '*' || ch == '#')
1234	      && state != 1
1235	      && state != 10)
1236	    goto de_fault;
1237	  /* Fall through.  */
1238	case LEX_IS_COMMENT_START:
1239#if defined TC_ARM && defined OBJ_ELF
1240	  /* On the ARM, `@' is the comment character.
1241	     Unfortunately this is also a special character in ELF .symver
1242	     directives (and .type, though we deal with those another way).
1243	     So we check if this line is such a directive, and treat
1244	     the character as default if so.  This is a hack.  */
1245	  if ((symver_state != NULL) && (*symver_state == 0))
1246	    goto de_fault;
1247#endif
1248#ifdef WARN_COMMENTS
1249	  if (!found_comment)
1250	    as_where (&found_comment_file, &found_comment);
1251#endif
1252	  do
1253	    {
1254	      ch = GET ();
1255	    }
1256	  while (ch != EOF && !IS_NEWLINE (ch));
1257	  if (ch == EOF)
1258	    as_warn (_("end of file in comment; newline inserted"));
1259	  state = 0;
1260	  PUT ('\n');
1261	  break;
1262
1263	case LEX_IS_SYMBOL_COMPONENT:
1264	  if (state == 10)
1265	    {
1266	      /* This is a symbol character following another symbol
1267		 character, with whitespace in between.  We skipped
1268		 the whitespace earlier, so output it now.  */
1269	      UNGET (ch);
1270	      state = 3;
1271	      PUT (' ');
1272	      break;
1273	    }
1274
1275#ifdef TC_Z80
1276	  /* "af'" is a symbol containing '\''.  */
1277	  if (state == 3 && (ch == 'a' || ch == 'A'))
1278	    {
1279	      state = 16;
1280	      PUT (ch);
1281	      ch = GET ();
1282	      if (ch == 'f' || ch == 'F')
1283		{
1284		  state = 17;
1285		  PUT (ch);
1286		  break;
1287		}
1288	      else
1289		{
1290		  state = 9;
1291		  if (!IS_SYMBOL_COMPONENT (ch))
1292		    {
1293		      UNGET (ch);
1294		      break;
1295		    }
1296		}
1297	    }
1298#endif
1299	  if (state == 3)
1300	    state = 9;
1301
1302	  /* This is a common case.  Quickly copy CH and all the
1303	     following symbol component or normal characters.  */
1304	  if (to + 1 < toend
1305	      && mri_state == NULL
1306#if defined TC_ARM && defined OBJ_ELF
1307	      && symver_state == NULL
1308#endif
1309	      )
1310	    {
1311	      char *s;
1312	      int len;
1313
1314	      for (s = from; s < fromend; s++)
1315		{
1316		  int type;
1317
1318		  ch2 = *(unsigned char *) s;
1319		  type = lex[ch2];
1320		  if (type != 0
1321		      && type != LEX_IS_SYMBOL_COMPONENT)
1322		    break;
1323		}
1324
1325	      if (s > from)
1326		/* Handle the last character normally, for
1327		   simplicity.  */
1328		--s;
1329
1330	      len = s - from;
1331
1332	      if (len > (toend - to) - 1)
1333		len = (toend - to) - 1;
1334
1335	      if (len > 0)
1336		{
1337		  PUT (ch);
1338		  memcpy (to, from, len);
1339		  to += len;
1340		  from += len;
1341		  if (to >= toend)
1342		    goto tofull;
1343		  ch = GET ();
1344		}
1345	    }
1346
1347	  /* Fall through.  */
1348	default:
1349	de_fault:
1350	  /* Some relatively `normal' character.  */
1351	  if (state == 0)
1352	    {
1353	      state = 11;	/* Now seeing label definition.  */
1354	    }
1355	  else if (state == 1)
1356	    {
1357	      state = 2;	/* Ditto.  */
1358	    }
1359	  else if (state == 9)
1360	    {
1361	      if (!IS_SYMBOL_COMPONENT (ch))
1362		state = 3;
1363	    }
1364	  else if (state == 10)
1365	    {
1366	      if (ch == '\\')
1367		{
1368		  /* Special handling for backslash: a backslash may
1369		     be the beginning of a formal parameter (of a
1370		     macro) following another symbol character, with
1371		     whitespace in between.  If that is the case, we
1372		     output a space before the parameter.  Strictly
1373		     speaking, correct handling depends upon what the
1374		     macro parameter expands into; if the parameter
1375		     expands into something which does not start with
1376		     an operand character, then we don't want to keep
1377		     the space.  We don't have enough information to
1378		     make the right choice, so here we are making the
1379		     choice which is more likely to be correct.  */
1380		  PUT (' ');
1381		}
1382
1383	      state = 3;
1384	    }
1385	  PUT (ch);
1386	  break;
1387	}
1388    }
1389
1390  /*NOTREACHED*/
1391
1392 fromeof:
1393  /* We have reached the end of the input.  */
1394  return to - tostart;
1395
1396 tofull:
1397  /* The output buffer is full.  Save any input we have not yet
1398     processed.  */
1399  if (fromend > from)
1400    {
1401      saved_input = from;
1402      saved_input_len = fromend - from;
1403    }
1404  else
1405    saved_input = NULL;
1406
1407  return to - tostart;
1408}
1409
1410