app.c revision 130562
1/* This is the Assembler Pre-Processor
2   Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
3   1999, 2000, 2002, 2003
4   Free Software Foundation, Inc.
5
6   This file is part of GAS, the GNU Assembler.
7
8   GAS is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2, or (at your option)
11   any later version.
12
13   GAS is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with GAS; see the file COPYING.  If not, write to the Free
20   Software Foundation, 59 Temple Place - Suite 330, Boston, MA
21   02111-1307, USA.  */
22
23/* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
24/* App, the assembler pre-processor.  This pre-processor strips out excess
25   spaces, turns single-quoted characters into a decimal constant, and turns
26   # <number> <filename> <garbage> into a .line <number>\n.file <filename>
27   pair.  This needs better error-handling.  */
28
29#include <stdio.h>
30#include "as.h"			/* For BAD_CASE() only.  */
31
32#if (__STDC__ != 1)
33#ifndef const
34#define const  /* empty */
35#endif
36#endif
37
38#ifdef TC_M68K
39/* Whether we are scrubbing in m68k MRI mode.  This is different from
40   flag_m68k_mri, because the two flags will be affected by the .mri
41   pseudo-op at different times.  */
42static int scrub_m68k_mri;
43
44/* The pseudo-op which switches in and out of MRI mode.  See the
45   comment in do_scrub_chars.  */
46static const char mri_pseudo[] = ".mri 0";
47#else
48#define scrub_m68k_mri 0
49#endif
50
51#if defined TC_ARM && defined OBJ_ELF
52/* The pseudo-op for which we need to special-case `@' characters.
53   See the comment in do_scrub_chars.  */
54static const char   symver_pseudo[] = ".symver";
55static const char * symver_state;
56#endif
57
/* Character classification table used by the scrubber, indexed by an
   unsigned character value.  It is filled in by do_scrub_begin; an
   entry of 0 means the character has no special classification.  */
static char lex[256];

/* Characters that do_scrub_begin marks as LEX_IS_SYMBOL_COMPONENT by
   default (later loops in do_scrub_begin may override individual
   entries).  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* Classification values stored in lex[].  Note that no class is
   defined with the value 7.  */
#define LEX_IS_SYMBOL_COMPONENT		1
#define LEX_IS_WHITESPACE		2
#define LEX_IS_LINE_SEPARATOR		3
#define LEX_IS_COMMENT_START		4
#define LEX_IS_LINE_COMMENT_START	5
#define	LEX_IS_TWOCHAR_COMMENT_1ST	6
#define	LEX_IS_STRINGQUOTE		8
#define	LEX_IS_COLON			9
#define	LEX_IS_NEWLINE			10
#define	LEX_IS_ONECHAR_QUOTE		11
/* Only the V850 classifies '-' as the possible start of a "--"
   comment.  */
#ifdef TC_V850
#define LEX_IS_DOUBLEDASH_1ST		12
#endif
/* The M32R turns on recognition of "||" as a parallel separator.  */
#ifdef TC_M32R
#define DOUBLEBAR_PARALLEL
#endif
#ifdef DOUBLEBAR_PARALLEL
#define LEX_IS_DOUBLEBAR_1ST		13
#endif
#define LEX_IS_PARALLEL_SEPARATOR	14

/* Predicates testing the class of character C.  C is used directly as
   an index into lex[], so it must be a valid index: callers test for
   EOF before applying these to a character returned by GET.  */
#define IS_SYMBOL_COMPONENT(c)		(lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)		(lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)		(lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)	(lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)			(lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)		(lex[c] == LEX_IS_LINE_COMMENT_START)
#define	IS_NEWLINE(c)			(lex[c] == LEX_IS_NEWLINE)
89
90static int process_escape (int);
91
92/* FIXME-soon: The entire lexer/parser thingy should be
93   built statically at compile time rather than dynamically
94   each and every time the assembler is run.  xoxorich.  */
95
96void
97do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
98{
99  const char *p;
100  int c;
101
102  lex[' '] = LEX_IS_WHITESPACE;
103  lex['\t'] = LEX_IS_WHITESPACE;
104  lex['\r'] = LEX_IS_WHITESPACE;
105  lex['\n'] = LEX_IS_NEWLINE;
106  lex[':'] = LEX_IS_COLON;
107
108#ifdef TC_M68K
109  scrub_m68k_mri = m68k_mri;
110
111  if (! m68k_mri)
112#endif
113    {
114      lex['"'] = LEX_IS_STRINGQUOTE;
115
116#if ! defined (TC_HPPA) && ! defined (TC_I370)
117      /* I370 uses single-quotes to delimit integer, float constants.  */
118      lex['\''] = LEX_IS_ONECHAR_QUOTE;
119#endif
120
121#ifdef SINGLE_QUOTE_STRINGS
122      lex['\''] = LEX_IS_STRINGQUOTE;
123#endif
124    }
125
126  /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
127     in state 5 of do_scrub_chars must be changed.  */
128
129  /* Note that these override the previous defaults, e.g. if ';' is a
130     comment char, then it isn't a line separator.  */
131  for (p = symbol_chars; *p; ++p)
132    lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
133
134  for (c = 128; c < 256; ++c)
135    lex[c] = LEX_IS_SYMBOL_COMPONENT;
136
137#ifdef tc_symbol_chars
138  /* This macro permits the processor to specify all characters which
139     may appears in an operand.  This will prevent the scrubber from
140     discarding meaningful whitespace in certain cases.  The i386
141     backend uses this to support prefixes, which can confuse the
142     scrubber as to whether it is parsing operands or opcodes.  */
143  for (p = tc_symbol_chars; *p; ++p)
144    lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
145#endif
146
147  /* The m68k backend wants to be able to change comment_chars.  */
148#ifndef tc_comment_chars
149#define tc_comment_chars comment_chars
150#endif
151  for (p = tc_comment_chars; *p; p++)
152    lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
153
154  for (p = line_comment_chars; *p; p++)
155    lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
156
157  for (p = line_separator_chars; *p; p++)
158    lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
159
160#ifdef tc_parallel_separator_chars
161  /* This macro permits the processor to specify all characters which
162     separate parallel insns on the same line.  */
163  for (p = tc_parallel_separator_chars; *p; p++)
164    lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
165#endif
166
167  /* Only allow slash-star comments if slash is not in use.
168     FIXME: This isn't right.  We should always permit them.  */
169  if (lex['/'] == 0)
170    lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
171
172#ifdef TC_M68K
173  if (m68k_mri)
174    {
175      lex['\''] = LEX_IS_STRINGQUOTE;
176      lex[';'] = LEX_IS_COMMENT_START;
177      lex['*'] = LEX_IS_LINE_COMMENT_START;
178      /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
179	 then it can't be used in an expression.  */
180      lex['!'] = LEX_IS_LINE_COMMENT_START;
181    }
182#endif
183
184#ifdef TC_V850
185  lex['-'] = LEX_IS_DOUBLEDASH_1ST;
186#endif
187#ifdef DOUBLEBAR_PARALLEL
188  lex['|'] = LEX_IS_DOUBLEBAR_1ST;
189#endif
190#ifdef TC_D30V
191  /* Must do this is we want VLIW instruction with "->" or "<-".  */
192  lex['-'] = LEX_IS_SYMBOL_COMPONENT;
193#endif
194}
195
/* Saved state of the scrubber.  These are file-scope so that
   do_scrub_chars can return at any point and pick up where it left
   off on the next call.  */

/* Current state of the do_scrub_chars state machine.  */
static int state;
/* State to return to once one of the temporary states (-1, -2 or 5)
   has finished.  */
static int old_state;
/* Next character to emit while in state -1.  */
static char *out_string;
/* Holds the decimal text generated for a one-character quote
   constant; out_string may point into it.  */
static char out_buf[20];
/* Number of newlines swallowed inside comments and continued strings
   which still have to be rolled out at the next real newline.  */
static int add_newlines;
/* When non-NULL, input which do_scrub_chars resumes from on entry
   instead of calling GET; saved_input_len is its length.  */
static char *saved_input;
static int saved_input_len;
/* Buffer handed to the GET callback to be filled with raw input.  */
static char input_buffer[32 * 1024];
/* Position reached in mri_pseudo while recognising a ``.mri''
   pseudo-op, or NULL when no match is in progress.  */
static const char *mri_state;
/* Last character matched against mri_pseudo; once the whole pseudo-op
   has been seen it is '0' or '1'.  */
static char mri_last_ch;
207
208/* Data structure for saving the state of app across #include's.  Note that
209   app is called asynchronously to the parsing of the .include's, so our
210   state at the time .include is interpreted is completely unrelated.
211   That's why we have to save it all.  */
212
213struct app_save
214{
215  int          state;
216  int          old_state;
217  char *       out_string;
218  char         out_buf[sizeof (out_buf)];
219  int          add_newlines;
220  char *       saved_input;
221  int          saved_input_len;
222#ifdef TC_M68K
223  int          scrub_m68k_mri;
224#endif
225  const char * mri_state;
226  char         mri_last_ch;
227#if defined TC_ARM && defined OBJ_ELF
228  const char * symver_state;
229#endif
230};
231
232char *
233app_push (void)
234{
235  register struct app_save *saved;
236
237  saved = (struct app_save *) xmalloc (sizeof (*saved));
238  saved->state = state;
239  saved->old_state = old_state;
240  saved->out_string = out_string;
241  memcpy (saved->out_buf, out_buf, sizeof (out_buf));
242  saved->add_newlines = add_newlines;
243  if (saved_input == NULL)
244    saved->saved_input = NULL;
245  else
246    {
247      saved->saved_input = xmalloc (saved_input_len);
248      memcpy (saved->saved_input, saved_input, saved_input_len);
249      saved->saved_input_len = saved_input_len;
250    }
251#ifdef TC_M68K
252  saved->scrub_m68k_mri = scrub_m68k_mri;
253#endif
254  saved->mri_state = mri_state;
255  saved->mri_last_ch = mri_last_ch;
256#if defined TC_ARM && defined OBJ_ELF
257  saved->symver_state = symver_state;
258#endif
259
260  /* do_scrub_begin() is not useful, just wastes time.  */
261
262  state = 0;
263  saved_input = NULL;
264
265  return (char *) saved;
266}
267
268void
269app_pop (char *arg)
270{
271  register struct app_save *saved = (struct app_save *) arg;
272
273  /* There is no do_scrub_end ().  */
274  state = saved->state;
275  old_state = saved->old_state;
276  out_string = saved->out_string;
277  memcpy (out_buf, saved->out_buf, sizeof (out_buf));
278  add_newlines = saved->add_newlines;
279  if (saved->saved_input == NULL)
280    saved_input = NULL;
281  else
282    {
283      assert (saved->saved_input_len <= (int) (sizeof input_buffer));
284      memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
285      saved_input = input_buffer;
286      saved_input_len = saved->saved_input_len;
287      free (saved->saved_input);
288    }
289#ifdef TC_M68K
290  scrub_m68k_mri = saved->scrub_m68k_mri;
291#endif
292  mri_state = saved->mri_state;
293  mri_last_ch = saved->mri_last_ch;
294#if defined TC_ARM && defined OBJ_ELF
295  symver_state = saved->symver_state;
296#endif
297
298  free (arg);
299}
300
/* Translate CH, the character following a backslash in a
   one-character quote constant, into the character it stands for.
   Only b, f, n, r and t are translated; every other character,
   including the quote characters, is returned unchanged.

   @@ This assumes that \n &c are the same on host and target.  This is
   not necessarily true.  */

static int
process_escape (int ch)
{
  int translated = ch;

  if (ch == 'b')
    translated = '\b';
  else if (ch == 'f')
    translated = '\f';
  else if (ch == 'n')
    translated = '\n';
  else if (ch == 'r')
    translated = '\r';
  else if (ch == 't')
    translated = '\t';

  /* '\'' and '"' stand for themselves, as does anything else.  */
  return translated;
}
327
328/* This function is called to process input characters.  The GET
329   parameter is used to retrieve more input characters.  GET should
330   set its parameter to point to a buffer, and return the length of
331   the buffer; it should return 0 at end of file.  The scrubbed output
332   characters are put into the buffer starting at TOSTART; the TOSTART
333   buffer is TOLEN bytes in length.  The function returns the number
334   of scrubbed characters put into TOSTART.  This will be TOLEN unless
335   end of file was seen.  This function is arranged as a state
336   machine, and saves its state so that it may return at any point.
337   This is the way the old code used to work.  */
338
339int
340do_scrub_chars (int (*get) (char *, int), char *tostart, int tolen)
341{
342  char *to = tostart;
343  char *toend = tostart + tolen;
344  char *from;
345  char *fromend;
346  int fromlen;
347  register int ch, ch2 = 0;
348
349  /*State 0: beginning of normal line
350	  1: After first whitespace on line (flush more white)
351	  2: After first non-white (opcode) on line (keep 1white)
352	  3: after second white on line (into operands) (flush white)
353	  4: after putting out a .line, put out digits
354	  5: parsing a string, then go to old-state
355	  6: putting out \ escape in a "d string.
356	  7: After putting out a .appfile, put out string.
357	  8: After putting out a .appfile string, flush until newline.
358	  9: After seeing symbol char in state 3 (keep 1white after symchar)
359	 10: After seeing whitespace in state 9 (keep white before symchar)
360	 11: After seeing a symbol character in state 0 (eg a label definition)
361	 -1: output string in out_string and go to the state in old_state
362	 -2: flush text until a '*' '/' is seen, then go to state old_state
363#ifdef TC_V850
364	 12: After seeing a dash, looking for a second dash as a start
365	     of comment.
366#endif
367#ifdef DOUBLEBAR_PARALLEL
368	 13: After seeing a vertical bar, looking for a second
369	     vertical bar as a parallel expression separator.
370#endif
371#ifdef TC_IA64
372	 14: After seeing a `(' at state 0, looking for a `)' as
373	     predicate.
374	 15: After seeing a `(' at state 1, looking for a `)' as
375	     predicate.
376#endif
377	  */
378
379  /* I added states 9 and 10 because the MIPS ECOFF assembler uses
380     constructs like ``.loc 1 20''.  This was turning into ``.loc
381     120''.  States 9 and 10 ensure that a space is never dropped in
382     between characters which could appear in an identifier.  Ian
383     Taylor, ian@cygnus.com.
384
385     I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
386     correctly on the PA (and any other target where colons are optional).
387     Jeff Law, law@cs.utah.edu.
388
389     I added state 13 so that something like "cmp r1, r2 || trap #1" does not
390     get squashed into "cmp r1,r2||trap#1", with the all important space
391     between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
392
393  /* This macro gets the next input character.  */
394
395#define GET()							\
396  (from < fromend						\
397   ? * (unsigned char *) (from++)				\
398   : (saved_input = NULL,					\
399      fromlen = (*get) (input_buffer, sizeof input_buffer),	\
400      from = input_buffer,					\
401      fromend = from + fromlen,					\
402      (fromlen == 0						\
403       ? EOF							\
404       : * (unsigned char *) (from++))))
405
406  /* This macro pushes a character back on the input stream.  */
407
408#define UNGET(uch) (*--from = (uch))
409
410  /* This macro puts a character into the output buffer.  If this
411     character fills the output buffer, this macro jumps to the label
412     TOFULL.  We use this rather ugly approach because we need to
413     handle two different termination conditions: EOF on the input
414     stream, and a full output buffer.  It would be simpler if we
415     always read in the entire input stream before processing it, but
416     I don't want to make such a significant change to the assembler's
417     memory usage.  */
418
419#define PUT(pch)				\
420  do						\
421    {						\
422      *to++ = (pch);				\
423      if (to >= toend)				\
424	goto tofull;				\
425    }						\
426  while (0)
427
428  if (saved_input != NULL)
429    {
430      from = saved_input;
431      fromend = from + saved_input_len;
432    }
433  else
434    {
435      fromlen = (*get) (input_buffer, sizeof input_buffer);
436      if (fromlen == 0)
437	return 0;
438      from = input_buffer;
439      fromend = from + fromlen;
440    }
441
442  while (1)
443    {
444      /* The cases in this switch end with continue, in order to
445	 branch back to the top of this while loop and generate the
446	 next output character in the appropriate state.  */
447      switch (state)
448	{
449	case -1:
450	  ch = *out_string++;
451	  if (*out_string == '\0')
452	    {
453	      state = old_state;
454	      old_state = 3;
455	    }
456	  PUT (ch);
457	  continue;
458
459	case -2:
460	  for (;;)
461	    {
462	      do
463		{
464		  ch = GET ();
465
466		  if (ch == EOF)
467		    {
468		      as_warn (_("end of file in comment"));
469		      goto fromeof;
470		    }
471
472		  if (ch == '\n')
473		    PUT ('\n');
474		}
475	      while (ch != '*');
476
477	      while ((ch = GET ()) == '*')
478		;
479
480	      if (ch == EOF)
481		{
482		  as_warn (_("end of file in comment"));
483		  goto fromeof;
484		}
485
486	      if (ch == '/')
487		break;
488
489	      UNGET (ch);
490	    }
491
492	  state = old_state;
493	  UNGET (' ');
494	  continue;
495
496	case 4:
497	  ch = GET ();
498	  if (ch == EOF)
499	    goto fromeof;
500	  else if (ch >= '0' && ch <= '9')
501	    PUT (ch);
502	  else
503	    {
504	      while (ch != EOF && IS_WHITESPACE (ch))
505		ch = GET ();
506	      if (ch == '"')
507		{
508		  UNGET (ch);
509		  if (scrub_m68k_mri)
510		    out_string = "\n\tappfile ";
511		  else
512		    out_string = "\n\t.appfile ";
513		  old_state = 7;
514		  state = -1;
515		  PUT (*out_string++);
516		}
517	      else
518		{
519		  while (ch != EOF && ch != '\n')
520		    ch = GET ();
521		  state = 0;
522		  PUT (ch);
523		}
524	    }
525	  continue;
526
527	case 5:
528	  /* We are going to copy everything up to a quote character,
529	     with special handling for a backslash.  We try to
530	     optimize the copying in the simple case without using the
531	     GET and PUT macros.  */
532	  {
533	    char *s;
534	    int len;
535
536	    for (s = from; s < fromend; s++)
537	      {
538		ch = *s;
539		/* This condition must be changed if the type of any
540		   other character can be LEX_IS_STRINGQUOTE.  */
541		if (ch == '\\'
542		    || ch == '"'
543		    || ch == '\''
544		    || ch == '\n')
545		  break;
546	      }
547	    len = s - from;
548	    if (len > toend - to)
549	      len = toend - to;
550	    if (len > 0)
551	      {
552		memcpy (to, from, len);
553		to += len;
554		from += len;
555	      }
556	  }
557
558	  ch = GET ();
559	  if (ch == EOF)
560	    {
561	      as_warn (_("end of file in string; inserted '\"'"));
562	      state = old_state;
563	      UNGET ('\n');
564	      PUT ('"');
565	    }
566	  else if (lex[ch] == LEX_IS_STRINGQUOTE)
567	    {
568	      state = old_state;
569	      PUT (ch);
570	    }
571#ifndef NO_STRING_ESCAPES
572	  else if (ch == '\\')
573	    {
574	      state = 6;
575	      PUT (ch);
576	    }
577#endif
578	  else if (scrub_m68k_mri && ch == '\n')
579	    {
580	      /* Just quietly terminate the string.  This permits lines like
581		   bne	label	loop if we haven't reach end yet.  */
582	      state = old_state;
583	      UNGET (ch);
584	      PUT ('\'');
585	    }
586	  else
587	    {
588	      PUT (ch);
589	    }
590	  continue;
591
592	case 6:
593	  state = 5;
594	  ch = GET ();
595	  switch (ch)
596	    {
597	      /* Handle strings broken across lines, by turning '\n' into
598		 '\\' and 'n'.  */
599	    case '\n':
600	      UNGET ('n');
601	      add_newlines++;
602	      PUT ('\\');
603	      continue;
604
605	    case EOF:
606	      as_warn (_("end of file in string; '\"' inserted"));
607	      PUT ('"');
608	      continue;
609
610	    case '"':
611	    case '\\':
612	    case 'b':
613	    case 'f':
614	    case 'n':
615	    case 'r':
616	    case 't':
617	    case 'v':
618	    case 'x':
619	    case 'X':
620	    case '0':
621	    case '1':
622	    case '2':
623	    case '3':
624	    case '4':
625	    case '5':
626	    case '6':
627	    case '7':
628	      break;
629
630	    default:
631#ifdef ONLY_STANDARD_ESCAPES
632	      as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
633#endif
634	      break;
635	    }
636	  PUT (ch);
637	  continue;
638
639	case 7:
640	  ch = GET ();
641	  state = 5;
642	  old_state = 8;
643	  if (ch == EOF)
644	    goto fromeof;
645	  PUT (ch);
646	  continue;
647
648	case 8:
649	  do
650	    ch = GET ();
651	  while (ch != '\n' && ch != EOF);
652	  if (ch == EOF)
653	    goto fromeof;
654	  state = 0;
655	  PUT (ch);
656	  continue;
657
658#ifdef DOUBLEBAR_PARALLEL
659	case 13:
660	  ch = GET ();
661	  if (ch != '|')
662	    abort ();
663
664	  /* Reset back to state 1 and pretend that we are parsing a
665	     line from just after the first white space.  */
666	  state = 1;
667	  PUT ('|');
668	  continue;
669#endif
670	}
671
672      /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
673
674      /* flushchar: */
675      ch = GET ();
676
677#ifdef TC_IA64
678      if (ch == '(' && (state == 0 || state == 1))
679	{
680	  state += 14;
681	  PUT (ch);
682	  continue;
683	}
684      else if (state == 14 || state == 15)
685	{
686	  if (ch == ')')
687	    {
688	      state -= 14;
689	      PUT (ch);
690	      ch = GET ();
691	    }
692	  else
693	    {
694	      PUT (ch);
695	      continue;
696	    }
697	}
698#endif
699
700    recycle:
701
702#if defined TC_ARM && defined OBJ_ELF
703      /* We need to watch out for .symver directives.  See the comment later
704	 in this function.  */
705      if (symver_state == NULL)
706	{
707	  if ((state == 0 || state == 1) && ch == symver_pseudo[0])
708	    symver_state = symver_pseudo + 1;
709	}
710      else
711	{
712	  /* We advance to the next state if we find the right
713	     character.  */
714	  if (ch != '\0' && (*symver_state == ch))
715	    ++symver_state;
716	  else if (*symver_state != '\0')
717	    /* We did not get the expected character, or we didn't
718	       get a valid terminating character after seeing the
719	       entire pseudo-op, so we must go back to the beginning.  */
720	    symver_state = NULL;
721	  else
722	    {
723	      /* We've read the entire pseudo-op.  If this is the end
724		 of the line, go back to the beginning.  */
725	      if (IS_NEWLINE (ch))
726		symver_state = NULL;
727	    }
728	}
729#endif /* TC_ARM && OBJ_ELF */
730
731#ifdef TC_M68K
732      /* We want to have pseudo-ops which control whether we are in
733	 MRI mode or not.  Unfortunately, since m68k MRI mode affects
734	 the scrubber, that means that we need a special purpose
735	 recognizer here.  */
736      if (mri_state == NULL)
737	{
738	  if ((state == 0 || state == 1)
739	      && ch == mri_pseudo[0])
740	    mri_state = mri_pseudo + 1;
741	}
742      else
743	{
744	  /* We advance to the next state if we find the right
745	     character, or if we need a space character and we get any
746	     whitespace character, or if we need a '0' and we get a
747	     '1' (this is so that we only need one state to handle
748	     ``.mri 0'' and ``.mri 1'').  */
749	  if (ch != '\0'
750	      && (*mri_state == ch
751		  || (*mri_state == ' '
752		      && lex[ch] == LEX_IS_WHITESPACE)
753		  || (*mri_state == '0'
754		      && ch == '1')))
755	    {
756	      mri_last_ch = ch;
757	      ++mri_state;
758	    }
759	  else if (*mri_state != '\0'
760		   || (lex[ch] != LEX_IS_WHITESPACE
761		       && lex[ch] != LEX_IS_NEWLINE))
762	    {
763	      /* We did not get the expected character, or we didn't
764		 get a valid terminating character after seeing the
765		 entire pseudo-op, so we must go back to the
766		 beginning.  */
767	      mri_state = NULL;
768	    }
769	  else
770	    {
771	      /* We've read the entire pseudo-op.  mips_last_ch is
772		 either '0' or '1' indicating whether to enter or
773		 leave MRI mode.  */
774	      do_scrub_begin (mri_last_ch == '1');
775	      mri_state = NULL;
776
777	      /* We continue handling the character as usual.  The
778		 main gas reader must also handle the .mri pseudo-op
779		 to control expression parsing and the like.  */
780	    }
781	}
782#endif
783
784      if (ch == EOF)
785	{
786	  if (state != 0)
787	    {
788	      as_warn (_("end of file not at end of a line; newline inserted"));
789	      state = 0;
790	      PUT ('\n');
791	    }
792	  goto fromeof;
793	}
794
795      switch (lex[ch])
796	{
797	case LEX_IS_WHITESPACE:
798	  do
799	    {
800	      ch = GET ();
801	    }
802	  while (ch != EOF && IS_WHITESPACE (ch));
803	  if (ch == EOF)
804	    goto fromeof;
805
806	  if (state == 0)
807	    {
808	      /* Preserve a single whitespace character at the
809		 beginning of a line.  */
810	      state = 1;
811	      UNGET (ch);
812	      PUT (' ');
813	      break;
814	    }
815
816#ifdef KEEP_WHITE_AROUND_COLON
817	  if (lex[ch] == LEX_IS_COLON)
818	    {
819	      /* Only keep this white if there's no white *after* the
820		 colon.  */
821	      ch2 = GET ();
822	      UNGET (ch2);
823	      if (!IS_WHITESPACE (ch2))
824		{
825		  state = 9;
826		  UNGET (ch);
827		  PUT (' ');
828		  break;
829		}
830	    }
831#endif
832	  if (IS_COMMENT (ch)
833	      || ch == '/'
834	      || IS_LINE_SEPARATOR (ch)
835	      || IS_PARALLEL_SEPARATOR (ch))
836	    {
837	      if (scrub_m68k_mri)
838		{
839		  /* In MRI mode, we keep these spaces.  */
840		  UNGET (ch);
841		  PUT (' ');
842		  break;
843		}
844	      goto recycle;
845	    }
846
847	  /* If we're in state 2 or 11, we've seen a non-white
848	     character followed by whitespace.  If the next character
849	     is ':', this is whitespace after a label name which we
850	     normally must ignore.  In MRI mode, though, spaces are
851	     not permitted between the label and the colon.  */
852	  if ((state == 2 || state == 11)
853	      && lex[ch] == LEX_IS_COLON
854	      && ! scrub_m68k_mri)
855	    {
856	      state = 1;
857	      PUT (ch);
858	      break;
859	    }
860
861	  switch (state)
862	    {
863	    case 0:
864	      state++;
865	      goto recycle;	/* Punted leading sp */
866	    case 1:
867	      /* We can arrive here if we leave a leading whitespace
868		 character at the beginning of a line.  */
869	      goto recycle;
870	    case 2:
871	      state = 3;
872	      if (to + 1 < toend)
873		{
874		  /* Optimize common case by skipping UNGET/GET.  */
875		  PUT (' ');	/* Sp after opco */
876		  goto recycle;
877		}
878	      UNGET (ch);
879	      PUT (' ');
880	      break;
881	    case 3:
882	      if (scrub_m68k_mri)
883		{
884		  /* In MRI mode, we keep these spaces.  */
885		  UNGET (ch);
886		  PUT (' ');
887		  break;
888		}
889	      goto recycle;	/* Sp in operands */
890	    case 9:
891	    case 10:
892	      if (scrub_m68k_mri)
893		{
894		  /* In MRI mode, we keep these spaces.  */
895		  state = 3;
896		  UNGET (ch);
897		  PUT (' ');
898		  break;
899		}
900	      state = 10;	/* Sp after symbol char */
901	      goto recycle;
902	    case 11:
903	      if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
904		state = 1;
905	      else
906		{
907		  /* We know that ch is not ':', since we tested that
908		     case above.  Therefore this is not a label, so it
909		     must be the opcode, and we've just seen the
910		     whitespace after it.  */
911		  state = 3;
912		}
913	      UNGET (ch);
914	      PUT (' ');	/* Sp after label definition.  */
915	      break;
916	    default:
917	      BAD_CASE (state);
918	    }
919	  break;
920
921	case LEX_IS_TWOCHAR_COMMENT_1ST:
922	  ch2 = GET ();
923	  if (ch2 == '*')
924	    {
925	      for (;;)
926		{
927		  do
928		    {
929		      ch2 = GET ();
930		      if (ch2 != EOF && IS_NEWLINE (ch2))
931			add_newlines++;
932		    }
933		  while (ch2 != EOF && ch2 != '*');
934
935		  while (ch2 == '*')
936		    ch2 = GET ();
937
938		  if (ch2 == EOF || ch2 == '/')
939		    break;
940
941		  /* This UNGET will ensure that we count newlines
942		     correctly.  */
943		  UNGET (ch2);
944		}
945
946	      if (ch2 == EOF)
947		as_warn (_("end of file in multiline comment"));
948
949	      ch = ' ';
950	      goto recycle;
951	    }
952#ifdef DOUBLESLASH_LINE_COMMENTS
953	  else if (ch2 == '/')
954	    {
955	      do
956		{
957		  ch = GET ();
958		}
959	      while (ch != EOF && !IS_NEWLINE (ch));
960	      if (ch == EOF)
961		as_warn ("end of file in comment; newline inserted");
962	      state = 0;
963	      PUT ('\n');
964	      break;
965	    }
966#endif
967	  else
968	    {
969	      if (ch2 != EOF)
970		UNGET (ch2);
971	      if (state == 9 || state == 10)
972		state = 3;
973	      PUT (ch);
974	    }
975	  break;
976
977	case LEX_IS_STRINGQUOTE:
978	  if (state == 10)
979	    {
980	      /* Preserve the whitespace in foo "bar".  */
981	      UNGET (ch);
982	      state = 3;
983	      PUT (' ');
984
985	      /* PUT didn't jump out.  We could just break, but we
986		 know what will happen, so optimize a bit.  */
987	      ch = GET ();
988	      old_state = 3;
989	    }
990	  else if (state == 9)
991	    old_state = 3;
992	  else
993	    old_state = state;
994	  state = 5;
995	  PUT (ch);
996	  break;
997
998#ifndef IEEE_STYLE
999	case LEX_IS_ONECHAR_QUOTE:
1000	  if (state == 10)
1001	    {
1002	      /* Preserve the whitespace in foo 'b'.  */
1003	      UNGET (ch);
1004	      state = 3;
1005	      PUT (' ');
1006	      break;
1007	    }
1008	  ch = GET ();
1009	  if (ch == EOF)
1010	    {
1011	      as_warn (_("end of file after a one-character quote; \\0 inserted"));
1012	      ch = 0;
1013	    }
1014	  if (ch == '\\')
1015	    {
1016	      ch = GET ();
1017	      if (ch == EOF)
1018		{
1019		  as_warn (_("end of file in escape character"));
1020		  ch = '\\';
1021		}
1022	      else
1023		ch = process_escape (ch);
1024	    }
1025	  sprintf (out_buf, "%d", (int) (unsigned char) ch);
1026
1027	  /* None of these 'x constants for us.  We want 'x'.  */
1028	  if ((ch = GET ()) != '\'')
1029	    {
1030#ifdef REQUIRE_CHAR_CLOSE_QUOTE
1031	      as_warn (_("missing close quote; (assumed)"));
1032#else
1033	      if (ch != EOF)
1034		UNGET (ch);
1035#endif
1036	    }
1037	  if (strlen (out_buf) == 1)
1038	    {
1039	      PUT (out_buf[0]);
1040	      break;
1041	    }
1042	  if (state == 9)
1043	    old_state = 3;
1044	  else
1045	    old_state = state;
1046	  state = -1;
1047	  out_string = out_buf;
1048	  PUT (*out_string++);
1049	  break;
1050#endif
1051
1052	case LEX_IS_COLON:
1053#ifdef KEEP_WHITE_AROUND_COLON
1054	  state = 9;
1055#else
1056	  if (state == 9 || state == 10)
1057	    state = 3;
1058	  else if (state != 3)
1059	    state = 1;
1060#endif
1061	  PUT (ch);
1062	  break;
1063
1064	case LEX_IS_NEWLINE:
1065	  /* Roll out a bunch of newlines from inside comments, etc.  */
1066	  if (add_newlines)
1067	    {
1068	      --add_newlines;
1069	      UNGET (ch);
1070	    }
1071	  /* Fall through.  */
1072
1073	case LEX_IS_LINE_SEPARATOR:
1074	  state = 0;
1075	  PUT (ch);
1076	  break;
1077
1078	case LEX_IS_PARALLEL_SEPARATOR:
1079	  state = 1;
1080	  PUT (ch);
1081	  break;
1082
1083#ifdef TC_V850
1084	case LEX_IS_DOUBLEDASH_1ST:
1085	  ch2 = GET ();
1086	  if (ch2 != '-')
1087	    {
1088	      UNGET (ch2);
1089	      goto de_fault;
1090	    }
1091	  /* Read and skip to end of line.  */
1092	  do
1093	    {
1094	      ch = GET ();
1095	    }
1096	  while (ch != EOF && ch != '\n');
1097
1098	  if (ch == EOF)
1099	    as_warn (_("end of file in comment; newline inserted"));
1100
1101	  state = 0;
1102	  PUT ('\n');
1103	  break;
1104#endif
1105#ifdef DOUBLEBAR_PARALLEL
1106	case LEX_IS_DOUBLEBAR_1ST:
1107	  ch2 = GET ();
1108	  UNGET (ch2);
1109	  if (ch2 != '|')
1110	    goto de_fault;
1111
1112	  /* Handle '||' in two states as invoking PUT twice might
1113	     result in the first one jumping out of this loop.  We'd
1114	     then lose track of the state and one '|' char.  */
1115	  state = 13;
1116	  PUT ('|');
1117	  break;
1118#endif
1119	case LEX_IS_LINE_COMMENT_START:
1120	  /* FIXME-someday: The two character comment stuff was badly
1121	     thought out.  On i386, we want '/' as line comment start
1122	     AND we want C style comments.  hence this hack.  The
1123	     whole lexical process should be reworked.  xoxorich.  */
1124	  if (ch == '/')
1125	    {
1126	      ch2 = GET ();
1127	      if (ch2 == '*')
1128		{
1129		  old_state = 3;
1130		  state = -2;
1131		  break;
1132		}
1133	      else
1134		{
1135		  UNGET (ch2);
1136		}
1137	    }
1138
1139	  if (state == 0 || state == 1)	/* Only comment at start of line.  */
1140	    {
1141	      int startch;
1142
1143	      startch = ch;
1144
1145	      do
1146		{
1147		  ch = GET ();
1148		}
1149	      while (ch != EOF && IS_WHITESPACE (ch));
1150
1151	      if (ch == EOF)
1152		{
1153		  as_warn (_("end of file in comment; newline inserted"));
1154		  PUT ('\n');
1155		  break;
1156		}
1157
1158	      if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1159		{
1160		  /* Not a cpp line.  */
1161		  while (ch != EOF && !IS_NEWLINE (ch))
1162		    ch = GET ();
1163		  if (ch == EOF)
1164		    as_warn (_("end of file in comment; newline inserted"));
1165		  state = 0;
1166		  PUT ('\n');
1167		  break;
1168		}
1169	      /* Looks like `# 123 "filename"' from cpp.  */
1170	      UNGET (ch);
1171	      old_state = 4;
1172	      state = -1;
1173	      if (scrub_m68k_mri)
1174		out_string = "\tappline ";
1175	      else
1176		out_string = "\t.appline ";
1177	      PUT (*out_string++);
1178	      break;
1179	    }
1180
1181#ifdef TC_D10V
1182	  /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1183	     Trap is the only short insn that has a first operand that is
1184	     neither register nor label.
1185	     We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1 .
1186	     We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1187	     already LEX_IS_LINE_COMMENT_START.  However, it is the
1188	     only character in line_comment_chars for d10v, hence we
1189	     can recognize it as such.  */
1190	  /* An alternative approach would be to reset the state to 1 when
1191	     we see '||', '<-' or '->', but that seems to be overkill.  */
1192	  if (state == 10)
1193	    PUT (' ');
1194#endif
1195	  /* We have a line comment character which is not at the
1196	     start of a line.  If this is also a normal comment
1197	     character, fall through.  Otherwise treat it as a default
1198	     character.  */
1199	  if (strchr (tc_comment_chars, ch) == NULL
1200	      && (! scrub_m68k_mri
1201		  || (ch != '!' && ch != '*')))
1202	    goto de_fault;
1203	  if (scrub_m68k_mri
1204	      && (ch == '!' || ch == '*' || ch == '#')
1205	      && state != 1
1206	      && state != 10)
1207	    goto de_fault;
1208	  /* Fall through.  */
1209	case LEX_IS_COMMENT_START:
1210#if defined TC_ARM && defined OBJ_ELF
1211	  /* On the ARM, `@' is the comment character.
1212	     Unfortunately this is also a special character in ELF .symver
1213	     directives (and .type, though we deal with those another way).
1214	     So we check if this line is such a directive, and treat
1215	     the character as default if so.  This is a hack.  */
1216	  if ((symver_state != NULL) && (*symver_state == 0))
1217	    goto de_fault;
1218#endif
1219#ifdef WARN_COMMENTS
1220	  if (!found_comment)
1221	    as_where (&found_comment_file, &found_comment);
1222#endif
1223	  do
1224	    {
1225	      ch = GET ();
1226	    }
1227	  while (ch != EOF && !IS_NEWLINE (ch));
1228	  if (ch == EOF)
1229	    as_warn (_("end of file in comment; newline inserted"));
1230	  state = 0;
1231	  PUT ('\n');
1232	  break;
1233
1234	case LEX_IS_SYMBOL_COMPONENT:
1235	  if (state == 10)
1236	    {
1237	      /* This is a symbol character following another symbol
1238		 character, with whitespace in between.  We skipped
1239		 the whitespace earlier, so output it now.  */
1240	      UNGET (ch);
1241	      state = 3;
1242	      PUT (' ');
1243	      break;
1244	    }
1245
1246	  if (state == 3)
1247	    state = 9;
1248
1249	  /* This is a common case.  Quickly copy CH and all the
1250	     following symbol component or normal characters.  */
1251	  if (to + 1 < toend
1252	      && mri_state == NULL
1253#if defined TC_ARM && defined OBJ_ELF
1254	      && symver_state == NULL
1255#endif
1256	      )
1257	    {
1258	      char *s;
1259	      int len;
1260
1261	      for (s = from; s < fromend; s++)
1262		{
1263		  int type;
1264
1265		  ch2 = *(unsigned char *) s;
1266		  type = lex[ch2];
1267		  if (type != 0
1268		      && type != LEX_IS_SYMBOL_COMPONENT)
1269		    break;
1270		}
1271
1272	      if (s > from)
1273		/* Handle the last character normally, for
1274		   simplicity.  */
1275		--s;
1276
1277	      len = s - from;
1278
1279	      if (len > (toend - to) - 1)
1280		len = (toend - to) - 1;
1281
1282	      if (len > 0)
1283		{
1284		  PUT (ch);
1285		  if (len > 8)
1286		    {
1287		      memcpy (to, from, len);
1288		      to += len;
1289		      from += len;
1290		    }
1291		  else
1292		    {
1293		      switch (len)
1294			{
1295			case 8: *to++ = *from++;
1296			case 7: *to++ = *from++;
1297			case 6: *to++ = *from++;
1298			case 5: *to++ = *from++;
1299			case 4: *to++ = *from++;
1300			case 3: *to++ = *from++;
1301			case 2: *to++ = *from++;
1302			case 1: *to++ = *from++;
1303			}
1304		    }
1305		  ch = GET ();
1306		}
1307	    }
1308
1309	  /* Fall through.  */
1310	default:
1311	de_fault:
1312	  /* Some relatively `normal' character.  */
1313	  if (state == 0)
1314	    {
1315	      state = 11;	/* Now seeing label definition.  */
1316	    }
1317	  else if (state == 1)
1318	    {
1319	      state = 2;	/* Ditto.  */
1320	    }
1321	  else if (state == 9)
1322	    {
1323	      if (!IS_SYMBOL_COMPONENT (ch))
1324		state = 3;
1325	    }
1326	  else if (state == 10)
1327	    {
1328	      if (ch == '\\')
1329		{
1330		  /* Special handling for backslash: a backslash may
1331		     be the beginning of a formal parameter (of a
1332		     macro) following another symbol character, with
1333		     whitespace in between.  If that is the case, we
1334		     output a space before the parameter.  Strictly
1335		     speaking, correct handling depends upon what the
1336		     macro parameter expands into; if the parameter
1337		     expands into something which does not start with
1338		     an operand character, then we don't want to keep
1339		     the space.  We don't have enough information to
1340		     make the right choice, so here we are making the
1341		     choice which is more likely to be correct.  */
1342		  PUT (' ');
1343		}
1344
1345	      state = 3;
1346	    }
1347	  PUT (ch);
1348	  break;
1349	}
1350    }
1351
1352  /*NOTREACHED*/
1353
1354 fromeof:
1355  /* We have reached the end of the input.  */
1356  return to - tostart;
1357
1358 tofull:
1359  /* The output buffer is full.  Save any input we have not yet
1360     processed.  */
1361  if (fromend > from)
1362    {
1363      saved_input = from;
1364      saved_input_len = fromend - from;
1365    }
1366  else
1367    saved_input = NULL;
1368
1369  return to - tostart;
1370}
1371
1372