app.c revision 104834
1275970Scy/* This is the Assembler Pre-Processor
2275970Scy   Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
3275970Scy   1999, 2000
4275970Scy   Free Software Foundation, Inc.
5275970Scy
6275970Scy   This file is part of GAS, the GNU Assembler.
7275970Scy
8275970Scy   GAS is free software; you can redistribute it and/or modify
9275970Scy   it under the terms of the GNU General Public License as published by
10275970Scy   the Free Software Foundation; either version 2, or (at your option)
11275970Scy   any later version.
12275970Scy
13275970Scy   GAS is distributed in the hope that it will be useful,
14275970Scy   but WITHOUT ANY WARRANTY; without even the implied warranty of
15275970Scy   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16275970Scy   GNU General Public License for more details.
17275970Scy
18275970Scy   You should have received a copy of the GNU General Public License
19275970Scy   along with GAS; see the file COPYING.  If not, write to the Free
20275970Scy   Software Foundation, 59 Temple Place - Suite 330, Boston, MA
21275970Scy   02111-1307, USA.  */
22275970Scy
23275970Scy/* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90 */
24275970Scy/* App, the assembler pre-processor.  This pre-processor strips out excess
25275970Scy   spaces, turns single-quoted characters into a decimal constant, and turns
26275970Scy   # <number> <filename> <garbage> into a .line <number>\n.file <filename>
27275970Scy   pair.  This needs better error-handling.  */
28275970Scy
29275970Scy#include <stdio.h>
30275970Scy#include "as.h"			/* For BAD_CASE() only */
31275970Scy
32275970Scy#if (__STDC__ != 1)
33275970Scy#ifndef const
34275970Scy#define const  /* empty */
35275970Scy#endif
36275970Scy#endif
37275970Scy
38275970Scy#ifdef TC_M68K
39275970Scy/* Whether we are scrubbing in m68k MRI mode.  This is different from
40275970Scy   flag_m68k_mri, because the two flags will be affected by the .mri
41275970Scy   pseudo-op at different times.  */
42275970Scystatic int scrub_m68k_mri;
43275970Scy
44275970Scy/* The pseudo-op which switches in and out of MRI mode.  See the
45275970Scy   comment in do_scrub_chars.  */
46static const char mri_pseudo[] = ".mri 0";
47#else
48#define scrub_m68k_mri 0
49#endif
50
51#if defined TC_ARM && defined OBJ_ELF
52/* The pseudo-op for which we need to special-case `@' characters.
53   See the comment in do_scrub_chars.  */
54static const char   symver_pseudo[] = ".symver";
55static const char * symver_state;
56#endif
57
/* Character-class table, indexed by unsigned character value.  It is
   filled in by do_scrub_begin; an entry that is still zero has not been
   given any class.  */
static char lex[256];

/* Characters that do_scrub_begin always marks as symbol components
   (later steps of do_scrub_begin may still reclassify some of them).  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* Class codes stored in lex[].  The value 7 is not assigned to any
   class in this list.  */
#define LEX_IS_SYMBOL_COMPONENT		1
#define LEX_IS_WHITESPACE		2
#define LEX_IS_LINE_SEPARATOR		3
#define LEX_IS_COMMENT_START		4
#define LEX_IS_LINE_COMMENT_START	5
#define LEX_IS_TWOCHAR_COMMENT_1ST	6
#define LEX_IS_STRINGQUOTE		8
#define LEX_IS_COLON			9
#define LEX_IS_NEWLINE			10
#define LEX_IS_ONECHAR_QUOTE		11

/* The v850 port recognizes a doubled dash as a comment start.  */
#ifdef TC_V850
#define LEX_IS_DOUBLEDASH_1ST		12
#endif

/* The m32r port uses a doubled vertical bar to separate parallel
   instructions.  */
#ifdef TC_M32R
#define DOUBLEBAR_PARALLEL
#endif
#ifdef DOUBLEBAR_PARALLEL
#define LEX_IS_DOUBLEBAR_1ST		13
#endif

#define LEX_IS_PARALLEL_SEPARATOR	14

/* Class predicates.  C is used directly as an index into lex[].  */
#define IS_SYMBOL_COMPONENT(c)		(lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)		(lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)		(lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)	(lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)			(lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)		(lex[c] == LEX_IS_LINE_COMMENT_START)
#define IS_NEWLINE(c)			(lex[c] == LEX_IS_NEWLINE)
89
90static int process_escape PARAMS ((int));
91
92/* FIXME-soon: The entire lexer/parser thingy should be
93   built statically at compile time rather than dynamically
94   each and every time the assembler is run.  xoxorich.  */
95
96void
97do_scrub_begin (m68k_mri)
98     int m68k_mri ATTRIBUTE_UNUSED;
99{
100  const char *p;
101  int c;
102
103  lex[' '] = LEX_IS_WHITESPACE;
104  lex['\t'] = LEX_IS_WHITESPACE;
105  lex['\r'] = LEX_IS_WHITESPACE;
106  lex['\n'] = LEX_IS_NEWLINE;
107  lex[':'] = LEX_IS_COLON;
108
109#ifdef TC_M68K
110  scrub_m68k_mri = m68k_mri;
111
112  if (! m68k_mri)
113#endif
114    {
115      lex['"'] = LEX_IS_STRINGQUOTE;
116
117#if ! defined (TC_HPPA) && ! defined (TC_I370)
118      /* I370 uses single-quotes to delimit integer, float constants */
119      lex['\''] = LEX_IS_ONECHAR_QUOTE;
120#endif
121
122#ifdef SINGLE_QUOTE_STRINGS
123      lex['\''] = LEX_IS_STRINGQUOTE;
124#endif
125    }
126
127  /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
128     in state 5 of do_scrub_chars must be changed.  */
129
130  /* Note that these override the previous defaults, e.g. if ';' is a
131     comment char, then it isn't a line separator.  */
132  for (p = symbol_chars; *p; ++p)
133    {
134      lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
135    }				/* declare symbol characters */
136
137  for (c = 128; c < 256; ++c)
138    lex[c] = LEX_IS_SYMBOL_COMPONENT;
139
140#ifdef tc_symbol_chars
141  /* This macro permits the processor to specify all characters which
142     may appears in an operand.  This will prevent the scrubber from
143     discarding meaningful whitespace in certain cases.  The i386
144     backend uses this to support prefixes, which can confuse the
145     scrubber as to whether it is parsing operands or opcodes.  */
146  for (p = tc_symbol_chars; *p; ++p)
147    lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
148#endif
149
150  /* The m68k backend wants to be able to change comment_chars.  */
151#ifndef tc_comment_chars
152#define tc_comment_chars comment_chars
153#endif
154  for (p = tc_comment_chars; *p; p++)
155    {
156      lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
157    }				/* declare comment chars */
158
159  for (p = line_comment_chars; *p; p++)
160    {
161      lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
162    }				/* declare line comment chars */
163
164  for (p = line_separator_chars; *p; p++)
165    {
166      lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
167    }				/* declare line separators */
168
169#ifdef tc_parallel_separator_chars
170  /* This macro permits the processor to specify all characters which
171     separate parallel insns on the same line.  */
172  for (p = tc_parallel_separator_chars; *p; p++)
173    {
174      lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
175    }				/* declare parallel separators */
176#endif
177
178  /* Only allow slash-star comments if slash is not in use.
179     FIXME: This isn't right.  We should always permit them.  */
180  if (lex['/'] == 0)
181    {
182      lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
183    }
184
185#ifdef TC_M68K
186  if (m68k_mri)
187    {
188      lex['\''] = LEX_IS_STRINGQUOTE;
189      lex[';'] = LEX_IS_COMMENT_START;
190      lex['*'] = LEX_IS_LINE_COMMENT_START;
191      /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
192         then it can't be used in an expression.  */
193      lex['!'] = LEX_IS_LINE_COMMENT_START;
194    }
195#endif
196
197#ifdef TC_V850
198  lex['-'] = LEX_IS_DOUBLEDASH_1ST;
199#endif
200#ifdef DOUBLEBAR_PARALLEL
201  lex['|'] = LEX_IS_DOUBLEBAR_1ST;
202#endif
203#ifdef TC_D30V
204  /* must do this is we want VLIW instruction with "->" or "<-" */
205  lex['-'] = LEX_IS_SYMBOL_COMPONENT;
206#endif
207}				/* do_scrub_begin() */
208
/* Saved state of the scrubber.  do_scrub_chars is a state machine that
   may return at any point, so everything it needs in order to resume
   is kept in these file-scope variables.  */

/* Current state of the machine, and the state to go to once a
   temporary state (string, comment, queued output) is finished.  */
static int state;
static int old_state;

/* Text still to be emitted while in state -1, and a small buffer that
   can hold such text (the decimal value of a one-character quote is
   formatted into it).  */
static char *out_string;
static char out_buf[20];

/* Number of newlines that were consumed inside comments or continued
   strings and still have to be put back into the output.  */
static int add_newlines;

/* When saved_input is not NULL, do_scrub_chars starts reading from it
   (saved_input_len bytes) instead of asking for fresh input.  */
static char *saved_input;
static int saved_input_len;

/* Buffer that receives raw input from the GET callback.  */
static char input_buffer[32 * 1024];

/* Progress of the .mri recognizer (NULL when nothing has been matched)
   and the last character it accepted.  */
static const char *mri_state;
static char mri_last_ch;
220
221/* Data structure for saving the state of app across #include's.  Note that
222   app is called asynchronously to the parsing of the .include's, so our
223   state at the time .include is interpreted is completely unrelated.
224   That's why we have to save it all.  */
225
226struct app_save {
227  int          state;
228  int          old_state;
229  char *       out_string;
230  char         out_buf[sizeof (out_buf)];
231  int          add_newlines;
232  char *       saved_input;
233  int          saved_input_len;
234#ifdef TC_M68K
235  int          scrub_m68k_mri;
236#endif
237  const char * mri_state;
238  char         mri_last_ch;
239#if defined TC_ARM && defined OBJ_ELF
240  const char * symver_state;
241#endif
242};
243
244char *
245app_push ()
246{
247  register struct app_save *saved;
248
249  saved = (struct app_save *) xmalloc (sizeof (*saved));
250  saved->state = state;
251  saved->old_state = old_state;
252  saved->out_string = out_string;
253  memcpy (saved->out_buf, out_buf, sizeof (out_buf));
254  saved->add_newlines = add_newlines;
255  if (saved_input == NULL)
256    saved->saved_input = NULL;
257  else
258    {
259      saved->saved_input = xmalloc (saved_input_len);
260      memcpy (saved->saved_input, saved_input, saved_input_len);
261      saved->saved_input_len = saved_input_len;
262    }
263#ifdef TC_M68K
264  saved->scrub_m68k_mri = scrub_m68k_mri;
265#endif
266  saved->mri_state = mri_state;
267  saved->mri_last_ch = mri_last_ch;
268#if defined TC_ARM && defined OBJ_ELF
269  saved->symver_state = symver_state;
270#endif
271
272  /* do_scrub_begin() is not useful, just wastes time.  */
273
274  state = 0;
275  saved_input = NULL;
276
277  return (char *) saved;
278}
279
280void
281app_pop (arg)
282     char *arg;
283{
284  register struct app_save *saved = (struct app_save *) arg;
285
286  /* There is no do_scrub_end ().  */
287  state = saved->state;
288  old_state = saved->old_state;
289  out_string = saved->out_string;
290  memcpy (out_buf, saved->out_buf, sizeof (out_buf));
291  add_newlines = saved->add_newlines;
292  if (saved->saved_input == NULL)
293    saved_input = NULL;
294  else
295    {
296      assert (saved->saved_input_len <= (int) (sizeof input_buffer));
297      memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
298      saved_input = input_buffer;
299      saved_input_len = saved->saved_input_len;
300      free (saved->saved_input);
301    }
302#ifdef TC_M68K
303  scrub_m68k_mri = saved->scrub_m68k_mri;
304#endif
305  mri_state = saved->mri_state;
306  mri_last_ch = saved->mri_last_ch;
307#if defined TC_ARM && defined OBJ_ELF
308  symver_state = saved->symver_state;
309#endif
310
311  free (arg);
312}				/* app_pop() */
313
/* Translate CH, the character following a backslash, into the character
   the escape sequence stands for.  The letters b, f, n, r and t give
   the corresponding control characters; every other value, including
   the quote characters, is returned unchanged.

   @@ This assumes that \n &c are the same on host and target.  This is
   not necessarily true.  */
static int
process_escape (ch)
     int ch;
{
  /* letters[i] is the escape letter whose meaning is meanings[i].  */
  static const char letters[] = "bfnrt";
  static const char meanings[] = "\b\f\n\r\t";
  int i;

  for (i = 0; letters[i] != '\0'; i++)
    {
      if (ch == letters[i])
	return meanings[i];
    }

  return ch;
}
340
341/* This function is called to process input characters.  The GET
342   parameter is used to retrieve more input characters.  GET should
343   set its parameter to point to a buffer, and return the length of
344   the buffer; it should return 0 at end of file.  The scrubbed output
345   characters are put into the buffer starting at TOSTART; the TOSTART
346   buffer is TOLEN bytes in length.  The function returns the number
347   of scrubbed characters put into TOSTART.  This will be TOLEN unless
348   end of file was seen.  This function is arranged as a state
349   machine, and saves its state so that it may return at any point.
350   This is the way the old code used to work.  */
351
352int
353do_scrub_chars (get, tostart, tolen)
354     int (*get) PARAMS ((char *, int));
355     char *tostart;
356     int tolen;
357{
358  char *to = tostart;
359  char *toend = tostart + tolen;
360  char *from;
361  char *fromend;
362  int fromlen;
363  register int ch, ch2 = 0;
364
365  /*State 0: beginning of normal line
366	  1: After first whitespace on line (flush more white)
367	  2: After first non-white (opcode) on line (keep 1white)
368	  3: after second white on line (into operands) (flush white)
369	  4: after putting out a .line, put out digits
370	  5: parsing a string, then go to old-state
371	  6: putting out \ escape in a "d string.
372	  7: After putting out a .appfile, put out string.
373	  8: After putting out a .appfile string, flush until newline.
374	  9: After seeing symbol char in state 3 (keep 1white after symchar)
375	 10: After seeing whitespace in state 9 (keep white before symchar)
376	 11: After seeing a symbol character in state 0 (eg a label definition)
377	 -1: output string in out_string and go to the state in old_state
378	 -2: flush text until a '*' '/' is seen, then go to state old_state
379#ifdef TC_V850
380         12: After seeing a dash, looking for a second dash as a start of comment.
381#endif
382#ifdef DOUBLEBAR_PARALLEL
	 13: After seeing a vertical bar, looking for a second vertical bar as a parallel expression separator.
384#endif
385	  */
386
387  /* I added states 9 and 10 because the MIPS ECOFF assembler uses
388     constructs like ``.loc 1 20''.  This was turning into ``.loc
389     120''.  States 9 and 10 ensure that a space is never dropped in
390     between characters which could appear in an identifier.  Ian
391     Taylor, ian@cygnus.com.
392
393     I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
394     correctly on the PA (and any other target where colons are optional).
395     Jeff Law, law@cs.utah.edu.
396
397     I added state 13 so that something like "cmp r1, r2 || trap #1" does not
398     get squashed into "cmp r1,r2||trap#1", with the all important space
399     between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
400
401  /* This macro gets the next input character.  */
402
403#define GET()							\
404  (from < fromend						\
405   ? * (unsigned char *) (from++)				\
406   : (saved_input = NULL,					\
407      fromlen = (*get) (input_buffer, sizeof input_buffer),	\
408      from = input_buffer,					\
409      fromend = from + fromlen,					\
410      (fromlen == 0						\
411       ? EOF							\
412       : * (unsigned char *) (from++))))
413
414  /* This macro pushes a character back on the input stream.  */
415
416#define UNGET(uch) (*--from = (uch))
417
418  /* This macro puts a character into the output buffer.  If this
419     character fills the output buffer, this macro jumps to the label
420     TOFULL.  We use this rather ugly approach because we need to
421     handle two different termination conditions: EOF on the input
422     stream, and a full output buffer.  It would be simpler if we
423     always read in the entire input stream before processing it, but
424     I don't want to make such a significant change to the assembler's
425     memory usage.  */
426
427#define PUT(pch)				\
428  do						\
429    {						\
430      *to++ = (pch);				\
431      if (to >= toend)				\
432	goto tofull;				\
433    }						\
434  while (0)
435
436  if (saved_input != NULL)
437    {
438      from = saved_input;
439      fromend = from + saved_input_len;
440    }
441  else
442    {
443      fromlen = (*get) (input_buffer, sizeof input_buffer);
444      if (fromlen == 0)
445	return 0;
446      from = input_buffer;
447      fromend = from + fromlen;
448    }
449
450  while (1)
451    {
452      /* The cases in this switch end with continue, in order to
453         branch back to the top of this while loop and generate the
454         next output character in the appropriate state.  */
455      switch (state)
456	{
457	case -1:
458	  ch = *out_string++;
459	  if (*out_string == '\0')
460	    {
461	      state = old_state;
462	      old_state = 3;
463	    }
464	  PUT (ch);
465	  continue;
466
467	case -2:
468	  for (;;)
469	    {
470	      do
471		{
472		  ch = GET ();
473
474		  if (ch == EOF)
475		    {
476		      as_warn (_("end of file in comment"));
477		      goto fromeof;
478		    }
479
480		  if (ch == '\n')
481		    PUT ('\n');
482		}
483	      while (ch != '*');
484
485	      while ((ch = GET ()) == '*')
486		;
487
488	      if (ch == EOF)
489		{
490		  as_warn (_("end of file in comment"));
491		  goto fromeof;
492		}
493
494	      if (ch == '/')
495		break;
496
497	      UNGET (ch);
498	    }
499
500	  state = old_state;
501	  UNGET (' ');
502	  continue;
503
504	case 4:
505	  ch = GET ();
506	  if (ch == EOF)
507	    goto fromeof;
508	  else if (ch >= '0' && ch <= '9')
509	    PUT (ch);
510	  else
511	    {
512	      while (ch != EOF && IS_WHITESPACE (ch))
513		ch = GET ();
514	      if (ch == '"')
515		{
516		  UNGET (ch);
517		  if (scrub_m68k_mri)
518		    out_string = "\n\tappfile ";
519		  else
520		    out_string = "\n\t.appfile ";
521		  old_state = 7;
522		  state = -1;
523		  PUT (*out_string++);
524		}
525	      else
526		{
527		  while (ch != EOF && ch != '\n')
528		    ch = GET ();
529		  state = 0;
530		  PUT (ch);
531		}
532	    }
533	  continue;
534
535	case 5:
536	  /* We are going to copy everything up to a quote character,
537             with special handling for a backslash.  We try to
538             optimize the copying in the simple case without using the
539             GET and PUT macros.  */
540	  {
541	    char *s;
542	    int len;
543
544	    for (s = from; s < fromend; s++)
545	      {
546		ch = *s;
547		/* This condition must be changed if the type of any
548                   other character can be LEX_IS_STRINGQUOTE.  */
549		if (ch == '\\'
550		    || ch == '"'
551		    || ch == '\''
552		    || ch == '\n')
553		  break;
554	      }
555	    len = s - from;
556	    if (len > toend - to)
557	      len = toend - to;
558	    if (len > 0)
559	      {
560		memcpy (to, from, len);
561		to += len;
562		from += len;
563	      }
564	  }
565
566	  ch = GET ();
567	  if (ch == EOF)
568	    {
569	      as_warn (_("end of file in string; inserted '\"'"));
570	      state = old_state;
571	      UNGET ('\n');
572	      PUT ('"');
573	    }
574	  else if (lex[ch] == LEX_IS_STRINGQUOTE)
575	    {
576	      state = old_state;
577	      PUT (ch);
578	    }
579#ifndef NO_STRING_ESCAPES
580	  else if (ch == '\\')
581	    {
582	      state = 6;
583	      PUT (ch);
584	    }
585#endif
586	  else if (scrub_m68k_mri && ch == '\n')
587	    {
588	      /* Just quietly terminate the string.  This permits lines like
589		   bne	label	loop if we haven't reach end yet
590		 */
591	      state = old_state;
592	      UNGET (ch);
593	      PUT ('\'');
594	    }
595	  else
596	    {
597	      PUT (ch);
598	    }
599	  continue;
600
601	case 6:
602	  state = 5;
603	  ch = GET ();
604	  switch (ch)
605	    {
606	      /* Handle strings broken across lines, by turning '\n' into
607		 '\\' and 'n'.  */
608	    case '\n':
609	      UNGET ('n');
610	      add_newlines++;
611	      PUT ('\\');
612	      continue;
613
614	    case '"':
615	    case '\\':
616	    case 'b':
617	    case 'f':
618	    case 'n':
619	    case 'r':
620	    case 't':
621	    case 'v':
622	    case 'x':
623	    case 'X':
624	    case '0':
625	    case '1':
626	    case '2':
627	    case '3':
628	    case '4':
629	    case '5':
630	    case '6':
631	    case '7':
632	      break;
633#if defined(IGNORE_NONSTANDARD_ESCAPES) | defined(ONLY_STANDARD_ESCAPES)
634	    default:
635	      as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
636	      break;
637#else  /* ONLY_STANDARD_ESCAPES */
638	    default:
639	      /* Accept \x as x for any x */
640	      break;
641#endif /* ONLY_STANDARD_ESCAPES */
642
643	    case EOF:
644	      as_warn (_("end of file in string; '\"' inserted"));
645	      PUT ('"');
646	      continue;
647	    }
648	  PUT (ch);
649	  continue;
650
651	case 7:
652	  ch = GET ();
653	  state = 5;
654	  old_state = 8;
655	  if (ch == EOF)
656	    goto fromeof;
657	  PUT (ch);
658	  continue;
659
660	case 8:
661	  do
662	    ch = GET ();
663	  while (ch != '\n' && ch != EOF);
664	  if (ch == EOF)
665	    goto fromeof;
666	  state = 0;
667	  PUT (ch);
668	  continue;
669	}
670
671      /* OK, we are somewhere in states 0 through 4 or 9 through 11 */
672
673      /* flushchar: */
674      ch = GET ();
675
676    recycle:
677
678#if defined TC_ARM && defined OBJ_ELF
679      /* We need to watch out for .symver directives.  See the comment later
680	 in this function.  */
681      if (symver_state == NULL)
682	{
683	  if ((state == 0 || state == 1) && ch == symver_pseudo[0])
684	    symver_state = symver_pseudo + 1;
685	}
686      else
687	{
688	  /* We advance to the next state if we find the right
689	     character.  */
690	  if (ch != '\0' && (*symver_state == ch))
691	    ++symver_state;
692	  else if (*symver_state != '\0')
693	    /* We did not get the expected character, or we didn't
694	       get a valid terminating character after seeing the
695	       entire pseudo-op, so we must go back to the beginning.  */
696	    symver_state = NULL;
697	  else
698	    {
699	      /* We've read the entire pseudo-op.  If this is the end
700		 of the line, go back to the beginning.  */
701	      if (IS_NEWLINE (ch))
702		symver_state = NULL;
703	    }
704	}
705#endif /* TC_ARM && OBJ_ELF */
706
707#ifdef TC_M68K
708      /* We want to have pseudo-ops which control whether we are in
709         MRI mode or not.  Unfortunately, since m68k MRI mode affects
710         the scrubber, that means that we need a special purpose
711         recognizer here.  */
712      if (mri_state == NULL)
713	{
714	  if ((state == 0 || state == 1)
715	      && ch == mri_pseudo[0])
716	    mri_state = mri_pseudo + 1;
717	}
718      else
719	{
720	  /* We advance to the next state if we find the right
721	     character, or if we need a space character and we get any
722	     whitespace character, or if we need a '0' and we get a
723	     '1' (this is so that we only need one state to handle
724	     ``.mri 0'' and ``.mri 1'').  */
725	  if (ch != '\0'
726	      && (*mri_state == ch
727		  || (*mri_state == ' '
728		      && lex[ch] == LEX_IS_WHITESPACE)
729		  || (*mri_state == '0'
730		      && ch == '1')))
731	    {
732	      mri_last_ch = ch;
733	      ++mri_state;
734	    }
735	  else if (*mri_state != '\0'
736		   || (lex[ch] != LEX_IS_WHITESPACE
737		       && lex[ch] != LEX_IS_NEWLINE))
738	    {
739	      /* We did not get the expected character, or we didn't
740		 get a valid terminating character after seeing the
741		 entire pseudo-op, so we must go back to the
742		 beginning.  */
743	      mri_state = NULL;
744	    }
745	  else
746	    {
	      /* We've read the entire pseudo-op.  mri_last_ch is
748                 either '0' or '1' indicating whether to enter or
749                 leave MRI mode.  */
750	      do_scrub_begin (mri_last_ch == '1');
751	      mri_state = NULL;
752
753	      /* We continue handling the character as usual.  The
754                 main gas reader must also handle the .mri pseudo-op
755                 to control expression parsing and the like.  */
756	    }
757	}
758#endif
759
760      if (ch == EOF)
761	{
762	  if (state != 0)
763	    {
764	      as_warn (_("end of file not at end of a line; newline inserted"));
765	      state = 0;
766	      PUT ('\n');
767	    }
768	  goto fromeof;
769	}
770
771      switch (lex[ch])
772	{
773	case LEX_IS_WHITESPACE:
774	  do
775	    {
776	      ch = GET ();
777	    }
778	  while (ch != EOF && IS_WHITESPACE (ch));
779	  if (ch == EOF)
780	    goto fromeof;
781
782	  if (state == 0)
783	    {
784	      /* Preserve a single whitespace character at the
785		 beginning of a line.  */
786	      state = 1;
787	      UNGET (ch);
788	      PUT (' ');
789	      break;
790	    }
791
792#ifdef KEEP_WHITE_AROUND_COLON
793	  if (lex[ch] == LEX_IS_COLON)
794	    {
795	      /* Only keep this white if there's no white *after* the
796                 colon.  */
797	      ch2 = GET ();
798	      UNGET (ch2);
799	      if (!IS_WHITESPACE (ch2))
800		{
801		  state = 9;
802		  UNGET (ch);
803		  PUT (' ');
804		  break;
805		}
806	    }
807#endif
808	  if (IS_COMMENT (ch)
809	      || ch == '/'
810	      || IS_LINE_SEPARATOR (ch)
811	      || IS_PARALLEL_SEPARATOR (ch))
812	    {
813	      if (scrub_m68k_mri)
814		{
815		  /* In MRI mode, we keep these spaces.  */
816		  UNGET (ch);
817		  PUT (' ');
818		  break;
819		}
820	      goto recycle;
821	    }
822
823	  /* If we're in state 2 or 11, we've seen a non-white
824	     character followed by whitespace.  If the next character
825	     is ':', this is whitespace after a label name which we
826	     normally must ignore.  In MRI mode, though, spaces are
827	     not permitted between the label and the colon.  */
828	  if ((state == 2 || state == 11)
829	      && lex[ch] == LEX_IS_COLON
830	      && ! scrub_m68k_mri)
831	    {
832	      state = 1;
833	      PUT (ch);
834	      break;
835	    }
836
837	  switch (state)
838	    {
839	    case 0:
840	      state++;
841	      goto recycle;	/* Punted leading sp */
842	    case 1:
843	      /* We can arrive here if we leave a leading whitespace
844		 character at the beginning of a line.  */
845	      goto recycle;
846	    case 2:
847	      state = 3;
848	      if (to + 1 < toend)
849		{
850		  /* Optimize common case by skipping UNGET/GET.  */
851		  PUT (' ');	/* Sp after opco */
852		  goto recycle;
853		}
854	      UNGET (ch);
855	      PUT (' ');
856	      break;
857	    case 3:
858	      if (scrub_m68k_mri)
859		{
860		  /* In MRI mode, we keep these spaces.  */
861		  UNGET (ch);
862		  PUT (' ');
863		  break;
864		}
865	      goto recycle;	/* Sp in operands */
866	    case 9:
867	    case 10:
868	      if (scrub_m68k_mri)
869		{
870		  /* In MRI mode, we keep these spaces.  */
871		  state = 3;
872		  UNGET (ch);
873		  PUT (' ');
874		  break;
875		}
876	      state = 10;	/* Sp after symbol char */
877	      goto recycle;
878	    case 11:
879	      if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
880		state = 1;
881	      else
882		{
883		  /* We know that ch is not ':', since we tested that
884                     case above.  Therefore this is not a label, so it
885                     must be the opcode, and we've just seen the
886                     whitespace after it.  */
887		  state = 3;
888		}
889	      UNGET (ch);
890	      PUT (' ');	/* Sp after label definition.  */
891	      break;
892	    default:
893	      BAD_CASE (state);
894	    }
895	  break;
896
897	case LEX_IS_TWOCHAR_COMMENT_1ST:
898	  ch2 = GET ();
899	  if (ch2 == '*')
900	    {
901	      for (;;)
902		{
903		  do
904		    {
905		      ch2 = GET ();
906		      if (ch2 != EOF && IS_NEWLINE (ch2))
907			add_newlines++;
908		    }
909		  while (ch2 != EOF && ch2 != '*');
910
911		  while (ch2 == '*')
912		    ch2 = GET ();
913
914		  if (ch2 == EOF || ch2 == '/')
915		    break;
916
917		  /* This UNGET will ensure that we count newlines
918                     correctly.  */
919		  UNGET (ch2);
920		}
921
922	      if (ch2 == EOF)
923		as_warn (_("end of file in multiline comment"));
924
925	      ch = ' ';
926	      goto recycle;
927	    }
928#ifdef DOUBLESLASH_LINE_COMMENTS
929	  else if (ch2 == '/')
930	    {
931	      do
932		{
933		  ch = GET ();
934		}
935	      while (ch != EOF && !IS_NEWLINE (ch));
936	      if (ch == EOF)
937		as_warn ("end of file in comment; newline inserted");
938	      state = 0;
939	      PUT ('\n');
940	      break;
941	    }
942#endif
943	  else
944	    {
945	      if (ch2 != EOF)
946		UNGET (ch2);
947	      if (state == 9 || state == 10)
948		state = 3;
949	      PUT (ch);
950	    }
951	  break;
952
953	case LEX_IS_STRINGQUOTE:
954	  if (state == 10)
955	    {
956	      /* Preserve the whitespace in foo "bar" */
957	      UNGET (ch);
958	      state = 3;
959	      PUT (' ');
960
961	      /* PUT didn't jump out.  We could just break, but we
962                 know what will happen, so optimize a bit.  */
963	      ch = GET ();
964	      old_state = 3;
965	    }
966	  else if (state == 9)
967	    old_state = 3;
968	  else
969	    old_state = state;
970	  state = 5;
971	  PUT (ch);
972	  break;
973
974#ifndef IEEE_STYLE
975	case LEX_IS_ONECHAR_QUOTE:
976	  if (state == 10)
977	    {
978	      /* Preserve the whitespace in foo 'b' */
979	      UNGET (ch);
980	      state = 3;
981	      PUT (' ');
982	      break;
983	    }
984	  ch = GET ();
985	  if (ch == EOF)
986	    {
987	      as_warn (_("end of file after a one-character quote; \\0 inserted"));
988	      ch = 0;
989	    }
990	  if (ch == '\\')
991	    {
992	      ch = GET ();
993	      if (ch == EOF)
994		{
995		  as_warn (_("end of file in escape character"));
996		  ch = '\\';
997		}
998	      else
999		ch = process_escape (ch);
1000	    }
1001	  sprintf (out_buf, "%d", (int) (unsigned char) ch);
1002
1003	  /* None of these 'x constants for us.  We want 'x'.  */
1004	  if ((ch = GET ()) != '\'')
1005	    {
1006#ifdef REQUIRE_CHAR_CLOSE_QUOTE
1007	      as_warn (_("missing close quote; (assumed)"));
1008#else
1009	      if (ch != EOF)
1010		UNGET (ch);
1011#endif
1012	    }
1013	  if (strlen (out_buf) == 1)
1014	    {
1015	      PUT (out_buf[0]);
1016	      break;
1017	    }
1018	  if (state == 9)
1019	    old_state = 3;
1020	  else
1021	    old_state = state;
1022	  state = -1;
1023	  out_string = out_buf;
1024	  PUT (*out_string++);
1025	  break;
1026#endif
1027
1028	case LEX_IS_COLON:
1029#ifdef KEEP_WHITE_AROUND_COLON
1030	  state = 9;
1031#else
1032	  if (state == 9 || state == 10)
1033	    state = 3;
1034	  else if (state != 3)
1035	    state = 1;
1036#endif
1037	  PUT (ch);
1038	  break;
1039
1040	case LEX_IS_NEWLINE:
1041	  /* Roll out a bunch of newlines from inside comments, etc.  */
1042	  if (add_newlines)
1043	    {
1044	      --add_newlines;
1045	      UNGET (ch);
1046	    }
1047	  /* Fall through.  */
1048
1049	case LEX_IS_LINE_SEPARATOR:
1050	  state = 0;
1051	  PUT (ch);
1052	  break;
1053
1054	case LEX_IS_PARALLEL_SEPARATOR:
1055	  state = 1;
1056	  PUT (ch);
1057	  break;
1058
1059#ifdef TC_V850
1060	case LEX_IS_DOUBLEDASH_1ST:
1061	  ch2 = GET ();
1062	  if (ch2 != '-')
1063	    {
1064	      UNGET (ch2);
1065	      goto de_fault;
1066	    }
1067	  /* Read and skip to end of line.  */
1068	  do
1069	    {
1070	      ch = GET ();
1071	    }
1072	  while (ch != EOF && ch != '\n');
1073	  if (ch == EOF)
1074	    {
1075	      as_warn (_("end of file in comment; newline inserted"));
1076	    }
1077	  state = 0;
1078	  PUT ('\n');
1079	  break;
1080#endif
1081#ifdef DOUBLEBAR_PARALLEL
1082	case LEX_IS_DOUBLEBAR_1ST:
1083	  ch2 = GET ();
1084	  if (ch2 != '|')
1085	    {
1086	      UNGET (ch2);
1087	      goto de_fault;
1088	    }
1089	  /* Reset back to state 1 and pretend that we are parsing a line from
1090	     just after the first white space.  */
1091	  state = 1;
1092	  PUT ('|');
1093	  PUT ('|');
1094	  break;
1095#endif
1096	case LEX_IS_LINE_COMMENT_START:
1097	  /* FIXME-someday: The two character comment stuff was badly
1098	     thought out.  On i386, we want '/' as line comment start
1099	     AND we want C style comments.  hence this hack.  The
1100	     whole lexical process should be reworked.  xoxorich.  */
1101	  if (ch == '/')
1102	    {
1103	      ch2 = GET ();
1104	      if (ch2 == '*')
1105		{
1106		  old_state = 3;
1107		  state = -2;
1108		  break;
1109		}
1110	      else
1111		{
1112		  UNGET (ch2);
1113		}
1114	    } /* bad hack */
1115
1116	  if (state == 0 || state == 1)	/* Only comment at start of line.  */
1117	    {
1118	      int startch;
1119
1120	      startch = ch;
1121
1122	      do
1123		{
1124		  ch = GET ();
1125		}
1126	      while (ch != EOF && IS_WHITESPACE (ch));
1127	      if (ch == EOF)
1128		{
1129		  as_warn (_("end of file in comment; newline inserted"));
1130		  PUT ('\n');
1131		  break;
1132		}
1133	      if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1134		{
1135		  /* Not a cpp line.  */
1136		  while (ch != EOF && !IS_NEWLINE (ch))
1137		    ch = GET ();
1138		  if (ch == EOF)
1139		    as_warn (_("end of file in comment; newline inserted"));
1140		  state = 0;
1141		  PUT ('\n');
1142		  break;
1143		}
1144	      /* Looks like `# 123 "filename"' from cpp.  */
1145	      UNGET (ch);
1146	      old_state = 4;
1147	      state = -1;
1148	      if (scrub_m68k_mri)
1149		out_string = "\tappline ";
1150	      else
1151		out_string = "\t.appline ";
1152	      PUT (*out_string++);
1153	      break;
1154	    }
1155
1156#ifdef TC_D10V
1157	  /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1158	     Trap is the only short insn that has a first operand that is
1159	     neither register nor label.
1160	     We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1.
1161	     We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1162	     already LEX_IS_LINE_COMMENT_START.  However, it is the
1163	     only character in line_comment_chars for d10v, hence we
1164	     can recognize it as such.  */
1165	  /* An alternative approach would be to reset the state to 1 when
1166	     we see '||', '<-' or '->', but that seems to be overkill.  */
1167	  if (state == 10)
1168	    PUT (' ');
1169#endif
1170	  /* We have a line comment character which is not at the
1171	     start of a line.  If this is also a normal comment
1172	     character, fall through.  Otherwise treat it as a default
1173	     character.  */
1174	  if (strchr (tc_comment_chars, ch) == NULL
1175	      && (! scrub_m68k_mri
1176		  || (ch != '!' && ch != '*')))
1177	    goto de_fault;
1178	  if (scrub_m68k_mri
1179	      && (ch == '!' || ch == '*' || ch == '#')
1180	      && state != 1
1181	      && state != 10)
1182	    goto de_fault;
1183	  /* Fall through.  */
1184	case LEX_IS_COMMENT_START:
1185#if defined TC_ARM && defined OBJ_ELF
1186	  /* On the ARM, `@' is the comment character.
1187	     Unfortunately this is also a special character in ELF .symver
1188	     directives (and .type, though we deal with those another way).
1189	     So we check if this line is such a directive, and treat
1190	     the character as default if so.  This is a hack.  */
1191	  if ((symver_state != NULL) && (*symver_state == 0))
1192	    goto de_fault;
1193#endif
1194#ifdef WARN_COMMENTS
1195	  if (!found_comment)
1196	    as_where (&found_comment_file, &found_comment);
1197#endif
1198	  do
1199	    {
1200	      ch = GET ();
1201	    }
1202	  while (ch != EOF && !IS_NEWLINE (ch));
1203	  if (ch == EOF)
1204	    as_warn (_("end of file in comment; newline inserted"));
1205	  state = 0;
1206	  PUT ('\n');
1207	  break;
1208
1209	case LEX_IS_SYMBOL_COMPONENT:
1210	  if (state == 10)
1211	    {
1212	      /* This is a symbol character following another symbol
1213		 character, with whitespace in between.  We skipped
1214		 the whitespace earlier, so output it now.  */
1215	      UNGET (ch);
1216	      state = 3;
1217	      PUT (' ');
1218	      break;
1219	    }
1220
1221	  if (state == 3)
1222	    state = 9;
1223
1224	  /* This is a common case.  Quickly copy CH and all the
1225             following symbol component or normal characters.  */
1226	  if (to + 1 < toend
1227	      && mri_state == NULL
1228#if defined TC_ARM && defined OBJ_ELF
1229	      && symver_state == NULL
1230#endif
1231	      )
1232	    {
1233	      char *s;
1234	      int len;
1235
1236	      for (s = from; s < fromend; s++)
1237		{
1238		  int type;
1239
1240		  ch2 = *(unsigned char *) s;
1241		  type = lex[ch2];
1242		  if (type != 0
1243		      && type != LEX_IS_SYMBOL_COMPONENT)
1244		    break;
1245		}
1246	      if (s > from)
1247		{
1248		  /* Handle the last character normally, for
1249                     simplicity.  */
1250		  --s;
1251		}
1252	      len = s - from;
1253	      if (len > (toend - to) - 1)
1254		len = (toend - to) - 1;
1255	      if (len > 0)
1256		{
1257		  PUT (ch);
1258		  if (len > 8)
1259		    {
1260		      memcpy (to, from, len);
1261		      to += len;
1262		      from += len;
1263		    }
1264		  else
1265		    {
1266		      switch (len)
1267			{
1268			case 8: *to++ = *from++;
1269			case 7: *to++ = *from++;
1270			case 6: *to++ = *from++;
1271			case 5: *to++ = *from++;
1272			case 4: *to++ = *from++;
1273			case 3: *to++ = *from++;
1274			case 2: *to++ = *from++;
1275			case 1: *to++ = *from++;
1276			}
1277		    }
1278		  ch = GET ();
1279		}
1280	    }
1281
1282	  /* Fall through.  */
1283	default:
1284	de_fault:
1285	  /* Some relatively `normal' character.  */
1286	  if (state == 0)
1287	    {
1288	      state = 11;	/* Now seeing label definition */
1289	    }
1290	  else if (state == 1)
1291	    {
1292	      state = 2;	/* Ditto */
1293	    }
1294	  else if (state == 9)
1295	    {
1296	      if (lex[ch] != LEX_IS_SYMBOL_COMPONENT)
1297		state = 3;
1298	    }
1299	  else if (state == 10)
1300	    {
1301	      if (ch == '\\')
1302		{
1303		  /* Special handling for backslash: a backslash may
1304		     be the beginning of a formal parameter (of a
1305		     macro) following another symbol character, with
1306		     whitespace in between.  If that is the case, we
1307		     output a space before the parameter.  Strictly
1308		     speaking, correct handling depends upon what the
1309		     macro parameter expands into; if the parameter
1310		     expands into something which does not start with
1311		     an operand character, then we don't want to keep
1312		     the space.  We don't have enough information to
1313		     make the right choice, so here we are making the
1314		     choice which is more likely to be correct.  */
1315		  PUT (' ');
1316		}
1317
1318	      state = 3;
1319	    }
1320	  PUT (ch);
1321	  break;
1322	}
1323    }
1324
1325  /*NOTREACHED*/
1326
1327 fromeof:
1328  /* We have reached the end of the input.  */
1329  return to - tostart;
1330
1331 tofull:
1332  /* The output buffer is full.  Save any input we have not yet
1333     processed.  */
1334  if (fromend > from)
1335    {
1336      saved_input = from;
1337      saved_input_len = fromend - from;
1338    }
1339  else
1340    saved_input = NULL;
1341
1342  return to - tostart;
1343}
1344
1345/* end of app.c */
1346