xref: /netbsd-src/external/gpl3/binutils/dist/gas/app.c (revision 96fc3e30a7c3f7bba53384bf41dad5f78306fac4)
1 /* This is the Assembler Pre-Processor
2    Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
3    1999, 2000, 2001, 2002, 2003, 2005, 2006, 2007, 2008, 2009, 2010
4    Free Software Foundation, Inc.
5 
6    This file is part of GAS, the GNU Assembler.
7 
8    GAS is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3, or (at your option)
11    any later version.
12 
13    GAS is distributed in the hope that it will be useful, but WITHOUT
14    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
16    License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with GAS; see the file COPYING.  If not, write to the Free
20    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
21    02110-1301, USA.  */
22 
23 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
24 /* App, the assembler pre-processor.  This pre-processor strips out
25    excess spaces, turns single-quoted characters into a decimal
26    constant, and turns the # in # <number> <filename> <garbage> into a
27    .linefile.  This needs better error-handling.  */
28 
29 #include "as.h"
30 
31 #if (__STDC__ != 1)
32 #ifndef const
33 #define const  /* empty */
34 #endif
35 #endif
36 
37 #ifdef H_TICK_HEX
/* When nonzero, do_scrub_begin gives 'h' and 'H' the LEX_IS_H class
   and the one-character-quote code in do_scrub_chars warns about a
   quote that directly follows a symbol character.  */
38 int enable_h_tick_hex = 0;
39 #endif
40 
41 #ifdef TC_M68K
42 /* Whether we are scrubbing in m68k MRI mode.  This is different from
43    flag_m68k_mri, because the two flags will be affected by the .mri
44    pseudo-op at different times.  */
45 static int scrub_m68k_mri;
46 
47 /* The pseudo-op which switches in and out of MRI mode.  See the
48    comment in do_scrub_chars.  */
49 static const char mri_pseudo[] = ".mri 0";
50 #else
/* Without TC_M68K there is no variable; the name is the constant 0 so
   that tests on scrub_m68k_mri elsewhere in this file still compile.  */
51 #define scrub_m68k_mri 0
52 #endif
53 
54 #if defined TC_ARM && defined OBJ_ELF
55 /* The pseudo-op for which we need to special-case `@' characters.
56    See the comment in do_scrub_chars.  */
57 static const char   symver_pseudo[] = ".symver";
/* Next character of symver_pseudo that do_scrub_chars expects to see,
   or NULL when no match is in progress.  */
58 static const char * symver_state;
59 #endif
60 
/* Lexical class of each character, indexed by the character's value
   as an unsigned char.  An entry is 0 (no class) or one of the
   LEX_IS_* values below.  The table is filled in by do_scrub_begin.  */
61 static char lex[256];
/* Characters that do_scrub_begin marks as LEX_IS_SYMBOL_COMPONENT
   (it also marks every character from 128 to 255 that way).  */
62 static const char symbol_chars[] =
63 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
64 
/* Lexical classes stored in lex[].  Some classes exist only for
   particular targets.  The value 7 is not assigned to any class in
   this list.  */
65 #define LEX_IS_SYMBOL_COMPONENT		1
66 #define LEX_IS_WHITESPACE		2
67 #define LEX_IS_LINE_SEPARATOR		3
68 #define LEX_IS_COMMENT_START		4
69 #define LEX_IS_LINE_COMMENT_START	5
70 #define	LEX_IS_TWOCHAR_COMMENT_1ST	6
71 #define	LEX_IS_STRINGQUOTE		8
72 #define	LEX_IS_COLON			9
73 #define	LEX_IS_NEWLINE			10
74 #define	LEX_IS_ONECHAR_QUOTE		11
75 #ifdef TC_V850
76 #define LEX_IS_DOUBLEDASH_1ST		12
77 #endif
78 #ifdef TC_M32R
79 #define DOUBLEBAR_PARALLEL
80 #endif
81 #ifdef DOUBLEBAR_PARALLEL
82 #define LEX_IS_DOUBLEBAR_1ST		13
83 #endif
84 #define LEX_IS_PARALLEL_SEPARATOR	14
85 #ifdef H_TICK_HEX
86 #define LEX_IS_H			15
87 #endif
/* Class predicates.  The argument is used directly as an index into
   lex[], so it must be a character value in the range 0..255; callers
   in this file test for EOF before applying them.  */
88 #define IS_SYMBOL_COMPONENT(c)		(lex[c] == LEX_IS_SYMBOL_COMPONENT)
89 #define IS_WHITESPACE(c)		(lex[c] == LEX_IS_WHITESPACE)
90 #define IS_LINE_SEPARATOR(c)		(lex[c] == LEX_IS_LINE_SEPARATOR)
91 #define IS_PARALLEL_SEPARATOR(c)	(lex[c] == LEX_IS_PARALLEL_SEPARATOR)
92 #define IS_COMMENT(c)			(lex[c] == LEX_IS_COMMENT_START)
93 #define IS_LINE_COMMENT(c)		(lex[c] == LEX_IS_LINE_COMMENT_START)
94 #define	IS_NEWLINE(c)			(lex[c] == LEX_IS_NEWLINE)
95 
/* Translates the character that follows a backslash in a
   one-character quote; defined further down in this file.  */
96 static int process_escape (int);
97 
98 /* FIXME-soon: The entire lexer/parser thingy should be
99    built statically at compile time rather than dynamically
100    each and every time the assembler is run.  xoxorich.  */
101 
102 void
103 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
104 {
105   const char *p;
106   int c;
107 
108   lex[' '] = LEX_IS_WHITESPACE;
109   lex['\t'] = LEX_IS_WHITESPACE;
110   lex['\r'] = LEX_IS_WHITESPACE;
111   lex['\n'] = LEX_IS_NEWLINE;
112   lex[':'] = LEX_IS_COLON;
113 
114 #ifdef TC_M68K
115   scrub_m68k_mri = m68k_mri;
116 
117   if (! m68k_mri)
118 #endif
119     {
120       lex['"'] = LEX_IS_STRINGQUOTE;
121 
122 #if ! defined (TC_HPPA) && ! defined (TC_I370)
123       /* I370 uses single-quotes to delimit integer, float constants.  */
124       lex['\''] = LEX_IS_ONECHAR_QUOTE;
125 #endif
126 
127 #ifdef SINGLE_QUOTE_STRINGS
128       lex['\''] = LEX_IS_STRINGQUOTE;
129 #endif
130     }
131 
132   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
133      in state 5 of do_scrub_chars must be changed.  */
134 
135   /* Note that these override the previous defaults, e.g. if ';' is a
136      comment char, then it isn't a line separator.  */
137   for (p = symbol_chars; *p; ++p)
138     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
139 
140   for (c = 128; c < 256; ++c)
141     lex[c] = LEX_IS_SYMBOL_COMPONENT;
142 
143 #ifdef tc_symbol_chars
144   /* This macro permits the processor to specify all characters which
145      may appears in an operand.  This will prevent the scrubber from
146      discarding meaningful whitespace in certain cases.  The i386
147      backend uses this to support prefixes, which can confuse the
148      scrubber as to whether it is parsing operands or opcodes.  */
149   for (p = tc_symbol_chars; *p; ++p)
150     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
151 #endif
152 
153   /* The m68k backend wants to be able to change comment_chars.  */
154 #ifndef tc_comment_chars
155 #define tc_comment_chars comment_chars
156 #endif
157   for (p = tc_comment_chars; *p; p++)
158     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
159 
160   for (p = line_comment_chars; *p; p++)
161     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
162 
163   for (p = line_separator_chars; *p; p++)
164     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
165 
166 #ifdef tc_parallel_separator_chars
167   /* This macro permits the processor to specify all characters which
168      separate parallel insns on the same line.  */
169   for (p = tc_parallel_separator_chars; *p; p++)
170     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
171 #endif
172 
173   /* Only allow slash-star comments if slash is not in use.
174      FIXME: This isn't right.  We should always permit them.  */
175   if (lex['/'] == 0)
176     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
177 
178 #ifdef TC_M68K
179   if (m68k_mri)
180     {
181       lex['\''] = LEX_IS_STRINGQUOTE;
182       lex[';'] = LEX_IS_COMMENT_START;
183       lex['*'] = LEX_IS_LINE_COMMENT_START;
184       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
185 	 then it can't be used in an expression.  */
186       lex['!'] = LEX_IS_LINE_COMMENT_START;
187     }
188 #endif
189 
190 #ifdef TC_V850
191   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
192 #endif
193 #ifdef DOUBLEBAR_PARALLEL
194   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
195 #endif
196 #ifdef TC_D30V
197   /* Must do this is we want VLIW instruction with "->" or "<-".  */
198   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
199 #endif
200 
201 #ifdef H_TICK_HEX
202   if (enable_h_tick_hex)
203     {
204       lex['h'] = LEX_IS_H;
205       lex['H'] = LEX_IS_H;
206     }
207 #endif
208 }
209 
210 /* Saved state of the scrubber.  These variables persist between calls
   of do_scrub_chars so that it can return at any point and resume
   later; app_push and app_pop save and restore them.  */
/* Current state of the do_scrub_chars state machine.  */
211 static int state;
/* State to go back to once state 5, -1 or -2 has finished.  */
212 static int old_state;
/* Next character to emit while in state -1.  */
213 static char *out_string;
/* Holds the decimal text that a one-character quote is turned into.  */
214 static char out_buf[20];
/* Count of newlines that were swallowed (inside a comment or a string
   continued across lines) and are re-emitted at a later newline.  */
215 static int add_newlines;
/* When non-NULL, do_scrub_chars starts reading from here instead of
   asking its GET callback for input; saved_input_len is the number of
   bytes available there.  */
216 static char *saved_input;
217 static int saved_input_len;
/* Buffer that the GET callback fills with raw input.  */
218 static char input_buffer[32 * 1024];
/* Next character of mri_pseudo expected by the .mri recogniser in
   do_scrub_chars, or NULL when no match is in progress.  */
219 static const char *mri_state;
/* Last character accepted by the .mri recogniser; '1' means the
   pseudo-op switches MRI mode on.  */
220 static char mri_last_ch;
221 
222 /* Data structure for saving the state of app across #include's.  Note that
223    app is called asynchronously to the parsing of the .include's, so our
224    state at the time .include is interpreted is completely unrelated.
225    That's why we have to save it all.

   Each member holds the value of the file-scope variable of the same
   name.  saved_input, when non-NULL, is a heap copy made by app_push
   and released by app_pop; saved_input_len is meaningful only in that
   case.  out_buf is sized from the file-scope out_buf so the two
   always match.  */
226 
227 struct app_save
228 {
229   int          state;
230   int          old_state;
231   char *       out_string;
232   char         out_buf[sizeof (out_buf)];
233   int          add_newlines;
234   char *       saved_input;
235   int          saved_input_len;
236 #ifdef TC_M68K
237   int          scrub_m68k_mri;
238 #endif
239   const char * mri_state;
240   char         mri_last_ch;
241 #if defined TC_ARM && defined OBJ_ELF
242   const char * symver_state;
243 #endif
244 };
245 
246 char *
247 app_push (void)
248 {
249   register struct app_save *saved;
250 
251   saved = (struct app_save *) xmalloc (sizeof (*saved));
252   saved->state = state;
253   saved->old_state = old_state;
254   saved->out_string = out_string;
255   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
256   saved->add_newlines = add_newlines;
257   if (saved_input == NULL)
258     saved->saved_input = NULL;
259   else
260     {
261       saved->saved_input = (char *) xmalloc (saved_input_len);
262       memcpy (saved->saved_input, saved_input, saved_input_len);
263       saved->saved_input_len = saved_input_len;
264     }
265 #ifdef TC_M68K
266   saved->scrub_m68k_mri = scrub_m68k_mri;
267 #endif
268   saved->mri_state = mri_state;
269   saved->mri_last_ch = mri_last_ch;
270 #if defined TC_ARM && defined OBJ_ELF
271   saved->symver_state = symver_state;
272 #endif
273 
274   /* do_scrub_begin() is not useful, just wastes time.  */
275 
276   state = 0;
277   saved_input = NULL;
278 
279   return (char *) saved;
280 }
281 
282 void
283 app_pop (char *arg)
284 {
285   register struct app_save *saved = (struct app_save *) arg;
286 
287   /* There is no do_scrub_end ().  */
288   state = saved->state;
289   old_state = saved->old_state;
290   out_string = saved->out_string;
291   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
292   add_newlines = saved->add_newlines;
293   if (saved->saved_input == NULL)
294     saved_input = NULL;
295   else
296     {
297       gas_assert (saved->saved_input_len <= (int) (sizeof input_buffer));
298       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
299       saved_input = input_buffer;
300       saved_input_len = saved->saved_input_len;
301       free (saved->saved_input);
302     }
303 #ifdef TC_M68K
304   scrub_m68k_mri = saved->scrub_m68k_mri;
305 #endif
306   mri_state = saved->mri_state;
307   mri_last_ch = saved->mri_last_ch;
308 #if defined TC_ARM && defined OBJ_ELF
309   symver_state = saved->symver_state;
310 #endif
311 
312   free (arg);
313 }
314 
/* Return the character denoted by a backslash followed by CH.  The
   letters b, f, n, r and t give the matching control character; any
   other CH, including the two quote characters, stands for itself.

   @@ This assumes that \n &c are the same on host and target.  This is
   not necessarily true.  */

static int
process_escape (int ch)
{
  static const char letters[] = { 'b', 'f', 'n', 'r', 't' };
  static const char controls[] = { '\b', '\f', '\n', '\r', '\t' };
  unsigned int i;

  for (i = 0; i < sizeof letters; i++)
    if (ch == letters[i])
      return controls[i];

  return ch;
}
341 
342 /* This function is called to process input characters.  The GET
343    parameter is used to retrieve more input characters.  GET should
344    set its parameter to point to a buffer, and return the length of
345    the buffer; it should return 0 at end of file.  The scrubbed output
346    characters are put into the buffer starting at TOSTART; the TOSTART
347    buffer is TOLEN bytes in length.  The function returns the number
348    of scrubbed characters put into TOSTART.  This will be TOLEN unless
349    end of file was seen.  This function is arranged as a state
350    machine, and saves its state so that it may return at any point.
351    This is the way the old code used to work.  */
352 
353 int
354 do_scrub_chars (int (*get) (char *, int), char *tostart, int tolen)
355 {
356   char *to = tostart;
357   char *toend = tostart + tolen;
358   char *from;
359   char *fromend;
360   int fromlen;
361   register int ch, ch2 = 0;
362   /* Character that started the string we're working on.  */
363   static char quotechar;
364 
365   /*State 0: beginning of normal line
366 	  1: After first whitespace on line (flush more white)
367 	  2: After first non-white (opcode) on line (keep 1white)
368 	  3: after second white on line (into operands) (flush white)
369 	  4: after putting out a .linefile, put out digits
370 	  5: parsing a string, then go to old-state
371 	  6: putting out \ escape in a "d string.
372 	  7: no longer used
373 	  8: no longer used
374 	  9: After seeing symbol char in state 3 (keep 1white after symchar)
375 	 10: After seeing whitespace in state 9 (keep white before symchar)
376 	 11: After seeing a symbol character in state 0 (eg a label definition)
377 	 -1: output string in out_string and go to the state in old_state
378 	 -2: flush text until a '*' '/' is seen, then go to state old_state
379 #ifdef TC_V850
380 	 12: After seeing a dash, looking for a second dash as a start
381 	     of comment.
382 #endif
383 #ifdef DOUBLEBAR_PARALLEL
384 	 13: After seeing a vertical bar, looking for a second
385 	     vertical bar as a parallel expression separator.
386 #endif
387 #ifdef TC_PREDICATE_START_CHAR
388 	 14: After seeing a predicate start character at state 0, looking
389 	     for a predicate end character as predicate.
390 	 15: After seeing a predicate start character at state 1, looking
391 	     for a predicate end character as predicate.
392 #endif
393 #ifdef TC_Z80
394 	 16: After seeing an 'a' or an 'A' at the start of a symbol
395 	 17: After seeing an 'f' or an 'F' in state 16
396 #endif
397 	  */
398 
399   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
400      constructs like ``.loc 1 20''.  This was turning into ``.loc
401      120''.  States 9 and 10 ensure that a space is never dropped in
402      between characters which could appear in an identifier.  Ian
403      Taylor, ian@cygnus.com.
404 
405      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
406      correctly on the PA (and any other target where colons are optional).
407      Jeff Law, law@cs.utah.edu.
408 
409      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
410      get squashed into "cmp r1,r2||trap#1", with the all important space
411      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
412 
413   /* This macro gets the next input character.  */
414 
415 #define GET()							\
416   (from < fromend						\
417    ? * (unsigned char *) (from++)				\
418    : (saved_input = NULL,					\
419       fromlen = (*get) (input_buffer, sizeof input_buffer),	\
420       from = input_buffer,					\
421       fromend = from + fromlen,					\
422       (fromlen == 0						\
423        ? EOF							\
424        : * (unsigned char *) (from++))))
425 
426   /* This macro pushes a character back on the input stream.  */
427 
428 #define UNGET(uch) (*--from = (uch))
429 
430   /* This macro puts a character into the output buffer.  If this
431      character fills the output buffer, this macro jumps to the label
432      TOFULL.  We use this rather ugly approach because we need to
433      handle two different termination conditions: EOF on the input
434      stream, and a full output buffer.  It would be simpler if we
435      always read in the entire input stream before processing it, but
436      I don't want to make such a significant change to the assembler's
437      memory usage.  */
438 
439 #define PUT(pch)				\
440   do						\
441     {						\
442       *to++ = (pch);				\
443       if (to >= toend)				\
444 	goto tofull;				\
445     }						\
446   while (0)
447 
448   if (saved_input != NULL)
449     {
450       from = saved_input;
451       fromend = from + saved_input_len;
452     }
453   else
454     {
455       fromlen = (*get) (input_buffer, sizeof input_buffer);
456       if (fromlen == 0)
457 	return 0;
458       from = input_buffer;
459       fromend = from + fromlen;
460     }
461 
462   while (1)
463     {
464       /* The cases in this switch end with continue, in order to
465 	 branch back to the top of this while loop and generate the
466 	 next output character in the appropriate state.  */
467       switch (state)
468 	{
469 	case -1:
470 	  ch = *out_string++;
471 	  if (*out_string == '\0')
472 	    {
473 	      state = old_state;
474 	      old_state = 3;
475 	    }
476 	  PUT (ch);
477 	  continue;
478 
479 	case -2:
480 	  for (;;)
481 	    {
482 	      do
483 		{
484 		  ch = GET ();
485 
486 		  if (ch == EOF)
487 		    {
488 		      as_warn (_("end of file in comment"));
489 		      goto fromeof;
490 		    }
491 
492 		  if (ch == '\n')
493 		    PUT ('\n');
494 		}
495 	      while (ch != '*');
496 
497 	      while ((ch = GET ()) == '*')
498 		;
499 
500 	      if (ch == EOF)
501 		{
502 		  as_warn (_("end of file in comment"));
503 		  goto fromeof;
504 		}
505 
506 	      if (ch == '/')
507 		break;
508 
509 	      UNGET (ch);
510 	    }
511 
512 	  state = old_state;
513 	  UNGET (' ');
514 	  continue;
515 
516 	case 4:
517 	  ch = GET ();
518 	  if (ch == EOF)
519 	    goto fromeof;
520 	  else if (ch >= '0' && ch <= '9')
521 	    PUT (ch);
522 	  else
523 	    {
524 	      while (ch != EOF && IS_WHITESPACE (ch))
525 		ch = GET ();
526 	      if (ch == '"')
527 		{
528 		  quotechar = ch;
529 		  state = 5;
530 		  old_state = 3;
531 		  PUT (ch);
532 		}
533 	      else
534 		{
535 		  while (ch != EOF && ch != '\n')
536 		    ch = GET ();
537 		  state = 0;
538 		  PUT (ch);
539 		}
540 	    }
541 	  continue;
542 
543 	case 5:
544 	  /* We are going to copy everything up to a quote character,
545 	     with special handling for a backslash.  We try to
546 	     optimize the copying in the simple case without using the
547 	     GET and PUT macros.  */
548 	  {
549 	    char *s;
550 	    int len;
551 
552 	    for (s = from; s < fromend; s++)
553 	      {
554 		ch = *s;
555 		if (ch == '\\'
556 		    || ch == quotechar
557 		    || ch == '\n')
558 		  break;
559 	      }
560 	    len = s - from;
561 	    if (len > toend - to)
562 	      len = toend - to;
563 	    if (len > 0)
564 	      {
565 		memcpy (to, from, len);
566 		to += len;
567 		from += len;
568 		if (to >= toend)
569 		  goto tofull;
570 	      }
571 	  }
572 
573 	  ch = GET ();
574 	  if (ch == EOF)
575 	    {
576 	      /* This buffer is here specifically so
577 		 that the UNGET below will work.  */
578 	      static char one_char_buf[1];
579 
580 	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
581 	      state = old_state;
582 	      from = fromend = one_char_buf + 1;
583 	      fromlen = 1;
584 	      UNGET ('\n');
585 	      PUT (quotechar);
586 	    }
587 	  else if (ch == quotechar)
588 	    {
589 	      state = old_state;
590 	      PUT (ch);
591 	    }
592 #ifndef NO_STRING_ESCAPES
593 	  else if (ch == '\\')
594 	    {
595 	      state = 6;
596 	      PUT (ch);
597 	    }
598 #endif
599 	  else if (scrub_m68k_mri && ch == '\n')
600 	    {
601 	      /* Just quietly terminate the string.  This permits lines like
602 		   bne	label	loop if we haven't reach end yet.  */
603 	      state = old_state;
604 	      UNGET (ch);
605 	      PUT ('\'');
606 	    }
607 	  else
608 	    {
609 	      PUT (ch);
610 	    }
611 	  continue;
612 
613 	case 6:
614 	  state = 5;
615 	  ch = GET ();
616 	  switch (ch)
617 	    {
618 	      /* Handle strings broken across lines, by turning '\n' into
619 		 '\\' and 'n'.  */
620 	    case '\n':
621 	      UNGET ('n');
622 	      add_newlines++;
623 	      PUT ('\\');
624 	      continue;
625 
626 	    case EOF:
627 	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
628 	      PUT (quotechar);
629 	      continue;
630 
631 	    case '"':
632 	    case '\\':
633 	    case 'b':
634 	    case 'f':
635 	    case 'n':
636 	    case 'r':
637 	    case 't':
638 	    case 'v':
639 	    case 'x':
640 	    case 'X':
641 	    case '0':
642 	    case '1':
643 	    case '2':
644 	    case '3':
645 	    case '4':
646 	    case '5':
647 	    case '6':
648 	    case '7':
649 	      break;
650 
651 	    default:
652 #ifdef ONLY_STANDARD_ESCAPES
653 	      as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
654 #endif
655 	      break;
656 	    }
657 	  PUT (ch);
658 	  continue;
659 
660 #ifdef DOUBLEBAR_PARALLEL
661 	case 13:
662 	  ch = GET ();
663 	  if (ch != '|')
664 	    abort ();
665 
666 	  /* Reset back to state 1 and pretend that we are parsing a
667 	     line from just after the first white space.  */
668 	  state = 1;
669 	  PUT ('|');
670 #ifdef TC_TIC6X
671 	  /* "||^" is used for SPMASKed instructions.  */
672 	  ch = GET ();
673 	  if (ch == EOF)
674 	    goto fromeof;
675 	  else if (ch == '^')
676 	    PUT ('^');
677 	  else
678 	    UNGET (ch);
679 #endif
680 	  continue;
681 #endif
682 #ifdef TC_Z80
683 	case 16:
684 	  /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
685 	  ch = GET ();
686 	  if (ch == 'f' || ch == 'F')
687 	    {
688 	      state = 17;
689 	      PUT (ch);
690 	    }
691 	  else
692 	    {
693 	      state = 9;
694 	      break;
695 	    }
696 	case 17:
697 	  /* We have seen "af" at the start of a symbol,
698 	     a ' here is a part of that symbol.  */
699 	  ch = GET ();
700 	  state = 9;
701 	  if (ch == '\'')
702 	    /* Change to avoid warning about unclosed string.  */
703 	    PUT ('`');
704 	  else if (ch != EOF)
705 	    UNGET (ch);
706 	  break;
707 #endif
708 	}
709 
710       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
711 
712       /* flushchar: */
713       ch = GET ();
714 
715 #ifdef TC_PREDICATE_START_CHAR
716       if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
717 	{
718 	  state += 14;
719 	  PUT (ch);
720 	  continue;
721 	}
722       else if (state == 14 || state == 15)
723 	{
724 	  if (ch == TC_PREDICATE_END_CHAR)
725 	    {
726 	      state -= 14;
727 	      PUT (ch);
728 	      ch = GET ();
729 	    }
730 	  else
731 	    {
732 	      PUT (ch);
733 	      continue;
734 	    }
735 	}
736 #endif
737 
738     recycle:
739 
740 #if defined TC_ARM && defined OBJ_ELF
741       /* We need to watch out for .symver directives.  See the comment later
742 	 in this function.  */
743       if (symver_state == NULL)
744 	{
745 	  if ((state == 0 || state == 1) && ch == symver_pseudo[0])
746 	    symver_state = symver_pseudo + 1;
747 	}
748       else
749 	{
750 	  /* We advance to the next state if we find the right
751 	     character.  */
752 	  if (ch != '\0' && (*symver_state == ch))
753 	    ++symver_state;
754 	  else if (*symver_state != '\0')
755 	    /* We did not get the expected character, or we didn't
756 	       get a valid terminating character after seeing the
757 	       entire pseudo-op, so we must go back to the beginning.  */
758 	    symver_state = NULL;
759 	  else
760 	    {
761 	      /* We've read the entire pseudo-op.  If this is the end
762 		 of the line, go back to the beginning.  */
763 	      if (IS_NEWLINE (ch))
764 		symver_state = NULL;
765 	    }
766 	}
767 #endif /* TC_ARM && OBJ_ELF */
768 
769 #ifdef TC_M68K
770       /* We want to have pseudo-ops which control whether we are in
771 	 MRI mode or not.  Unfortunately, since m68k MRI mode affects
772 	 the scrubber, that means that we need a special purpose
773 	 recognizer here.  */
774       if (mri_state == NULL)
775 	{
776 	  if ((state == 0 || state == 1)
777 	      && ch == mri_pseudo[0])
778 	    mri_state = mri_pseudo + 1;
779 	}
780       else
781 	{
782 	  /* We advance to the next state if we find the right
783 	     character, or if we need a space character and we get any
784 	     whitespace character, or if we need a '0' and we get a
785 	     '1' (this is so that we only need one state to handle
786 	     ``.mri 0'' and ``.mri 1'').  */
787 	  if (ch != '\0'
788 	      && (*mri_state == ch
789 		  || (*mri_state == ' '
790 		      && lex[ch] == LEX_IS_WHITESPACE)
791 		  || (*mri_state == '0'
792 		      && ch == '1')))
793 	    {
794 	      mri_last_ch = ch;
795 	      ++mri_state;
796 	    }
797 	  else if (*mri_state != '\0'
798 		   || (lex[ch] != LEX_IS_WHITESPACE
799 		       && lex[ch] != LEX_IS_NEWLINE))
800 	    {
801 	      /* We did not get the expected character, or we didn't
802 		 get a valid terminating character after seeing the
803 		 entire pseudo-op, so we must go back to the
804 		 beginning.  */
805 	      mri_state = NULL;
806 	    }
807 	  else
808 	    {
809 	      /* We've read the entire pseudo-op.  mri_last_ch is
810 		 either '0' or '1' indicating whether to enter or
811 		 leave MRI mode.  */
812 	      do_scrub_begin (mri_last_ch == '1');
813 	      mri_state = NULL;
814 
815 	      /* We continue handling the character as usual.  The
816 		 main gas reader must also handle the .mri pseudo-op
817 		 to control expression parsing and the like.  */
818 	    }
819 	}
820 #endif
821 
822       if (ch == EOF)
823 	{
824 	  if (state != 0)
825 	    {
826 	      as_warn (_("end of file not at end of a line; newline inserted"));
827 	      state = 0;
828 	      PUT ('\n');
829 	    }
830 	  goto fromeof;
831 	}
832 
833       switch (lex[ch])
834 	{
835 	case LEX_IS_WHITESPACE:
836 	  do
837 	    {
838 	      ch = GET ();
839 	    }
840 	  while (ch != EOF && IS_WHITESPACE (ch));
841 	  if (ch == EOF)
842 	    goto fromeof;
843 
844 	  if (state == 0)
845 	    {
846 	      /* Preserve a single whitespace character at the
847 		 beginning of a line.  */
848 	      state = 1;
849 	      UNGET (ch);
850 	      PUT (' ');
851 	      break;
852 	    }
853 
854 #ifdef KEEP_WHITE_AROUND_COLON
855 	  if (lex[ch] == LEX_IS_COLON)
856 	    {
857 	      /* Only keep this white if there's no white *after* the
858 		 colon.  */
859 	      ch2 = GET ();
860 	      if (ch2 != EOF)
861 		UNGET (ch2);
862 	      if (!IS_WHITESPACE (ch2))
863 		{
864 		  state = 9;
865 		  UNGET (ch);
866 		  PUT (' ');
867 		  break;
868 		}
869 	    }
870 #endif
871 	  if (IS_COMMENT (ch)
872 	      || ch == '/'
873 	      || IS_LINE_SEPARATOR (ch)
874 	      || IS_PARALLEL_SEPARATOR (ch))
875 	    {
876 	      if (scrub_m68k_mri)
877 		{
878 		  /* In MRI mode, we keep these spaces.  */
879 		  UNGET (ch);
880 		  PUT (' ');
881 		  break;
882 		}
883 	      goto recycle;
884 	    }
885 
886 	  /* If we're in state 2 or 11, we've seen a non-white
887 	     character followed by whitespace.  If the next character
888 	     is ':', this is whitespace after a label name which we
889 	     normally must ignore.  In MRI mode, though, spaces are
890 	     not permitted between the label and the colon.  */
891 	  if ((state == 2 || state == 11)
892 	      && lex[ch] == LEX_IS_COLON
893 	      && ! scrub_m68k_mri)
894 	    {
895 	      state = 1;
896 	      PUT (ch);
897 	      break;
898 	    }
899 
900 	  switch (state)
901 	    {
902 	    case 1:
903 	      /* We can arrive here if we leave a leading whitespace
904 		 character at the beginning of a line.  */
905 	      goto recycle;
906 	    case 2:
907 	      state = 3;
908 	      if (to + 1 < toend)
909 		{
910 		  /* Optimize common case by skipping UNGET/GET.  */
911 		  PUT (' ');	/* Sp after opco */
912 		  goto recycle;
913 		}
914 	      UNGET (ch);
915 	      PUT (' ');
916 	      break;
917 	    case 3:
918 #ifndef TC_KEEP_OPERAND_SPACES
919 	      /* For TI C6X, we keep these spaces as they may separate
920 		 functional unit specifiers from operands.  */
921 	      if (scrub_m68k_mri)
922 #endif
923 		{
924 		  /* In MRI mode, we keep these spaces.  */
925 		  UNGET (ch);
926 		  PUT (' ');
927 		  break;
928 		}
929 	      goto recycle;	/* Sp in operands */
930 	    case 9:
931 	    case 10:
932 #ifndef TC_KEEP_OPERAND_SPACES
933 	      if (scrub_m68k_mri)
934 #endif
935 		{
936 		  /* In MRI mode, we keep these spaces.  */
937 		  state = 3;
938 		  UNGET (ch);
939 		  PUT (' ');
940 		  break;
941 		}
942 	      state = 10;	/* Sp after symbol char */
943 	      goto recycle;
944 	    case 11:
945 	      if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
946 		state = 1;
947 	      else
948 		{
949 		  /* We know that ch is not ':', since we tested that
950 		     case above.  Therefore this is not a label, so it
951 		     must be the opcode, and we've just seen the
952 		     whitespace after it.  */
953 		  state = 3;
954 		}
955 	      UNGET (ch);
956 	      PUT (' ');	/* Sp after label definition.  */
957 	      break;
958 	    default:
959 	      BAD_CASE (state);
960 	    }
961 	  break;
962 
963 	case LEX_IS_TWOCHAR_COMMENT_1ST:
964 	  ch2 = GET ();
965 	  if (ch2 == '*')
966 	    {
967 	      for (;;)
968 		{
969 		  do
970 		    {
971 		      ch2 = GET ();
972 		      if (ch2 != EOF && IS_NEWLINE (ch2))
973 			add_newlines++;
974 		    }
975 		  while (ch2 != EOF && ch2 != '*');
976 
977 		  while (ch2 == '*')
978 		    ch2 = GET ();
979 
980 		  if (ch2 == EOF || ch2 == '/')
981 		    break;
982 
983 		  /* This UNGET will ensure that we count newlines
984 		     correctly.  */
985 		  UNGET (ch2);
986 		}
987 
988 	      if (ch2 == EOF)
989 		as_warn (_("end of file in multiline comment"));
990 
991 	      ch = ' ';
992 	      goto recycle;
993 	    }
994 #ifdef DOUBLESLASH_LINE_COMMENTS
995 	  else if (ch2 == '/')
996 	    {
997 	      do
998 		{
999 		  ch = GET ();
1000 		}
1001 	      while (ch != EOF && !IS_NEWLINE (ch));
1002 	      if (ch == EOF)
1003 		as_warn ("end of file in comment; newline inserted");
1004 	      state = 0;
1005 	      PUT ('\n');
1006 	      break;
1007 	    }
1008 #endif
1009 	  else
1010 	    {
1011 	      if (ch2 != EOF)
1012 		UNGET (ch2);
1013 	      if (state == 9 || state == 10)
1014 		state = 3;
1015 	      PUT (ch);
1016 	    }
1017 	  break;
1018 
1019 	case LEX_IS_STRINGQUOTE:
1020 	  quotechar = ch;
1021 	  if (state == 10)
1022 	    {
1023 	      /* Preserve the whitespace in foo "bar".  */
1024 	      UNGET (ch);
1025 	      state = 3;
1026 	      PUT (' ');
1027 
1028 	      /* PUT didn't jump out.  We could just break, but we
1029 		 know what will happen, so optimize a bit.  */
1030 	      ch = GET ();
1031 	      old_state = 3;
1032 	    }
1033 	  else if (state == 9)
1034 	    old_state = 3;
1035 	  else
1036 	    old_state = state;
1037 	  state = 5;
1038 	  PUT (ch);
1039 	  break;
1040 
1041 #ifndef IEEE_STYLE
1042 	case LEX_IS_ONECHAR_QUOTE:
1043 #ifdef H_TICK_HEX
1044 	  if (state == 9 && enable_h_tick_hex)
1045 	    {
1046 	      char c;
1047 
1048 	      c = GET ();
1049 	      as_warn ("'%c found after symbol", c);
1050 	      UNGET (c);
1051 	    }
1052 #endif
1053 	  if (state == 10)
1054 	    {
1055 	      /* Preserve the whitespace in foo 'b'.  */
1056 	      UNGET (ch);
1057 	      state = 3;
1058 	      PUT (' ');
1059 	      break;
1060 	    }
1061 	  ch = GET ();
1062 	  if (ch == EOF)
1063 	    {
1064 	      as_warn (_("end of file after a one-character quote; \\0 inserted"));
1065 	      ch = 0;
1066 	    }
1067 	  if (ch == '\\')
1068 	    {
1069 	      ch = GET ();
1070 	      if (ch == EOF)
1071 		{
1072 		  as_warn (_("end of file in escape character"));
1073 		  ch = '\\';
1074 		}
1075 	      else
1076 		ch = process_escape (ch);
1077 	    }
1078 	  sprintf (out_buf, "%d", (int) (unsigned char) ch);
1079 
1080 	  /* None of these 'x constants for us.  We want 'x'.  */
1081 	  if ((ch = GET ()) != '\'')
1082 	    {
1083 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1084 	      as_warn (_("missing close quote; (assumed)"));
1085 #else
1086 	      if (ch != EOF)
1087 		UNGET (ch);
1088 #endif
1089 	    }
1090 	  if (strlen (out_buf) == 1)
1091 	    {
1092 	      PUT (out_buf[0]);
1093 	      break;
1094 	    }
1095 	  if (state == 9)
1096 	    old_state = 3;
1097 	  else
1098 	    old_state = state;
1099 	  state = -1;
1100 	  out_string = out_buf;
1101 	  PUT (*out_string++);
1102 	  break;
1103 #endif
1104 
1105 	case LEX_IS_COLON:
1106 #ifdef KEEP_WHITE_AROUND_COLON
1107 	  state = 9;
1108 #else
1109 	  if (state == 9 || state == 10)
1110 	    state = 3;
1111 	  else if (state != 3)
1112 	    state = 1;
1113 #endif
1114 	  PUT (ch);
1115 	  break;
1116 
1117 	case LEX_IS_NEWLINE:
1118 	  /* Roll out a bunch of newlines from inside comments, etc.  */
1119 	  if (add_newlines)
1120 	    {
1121 	      --add_newlines;
1122 	      UNGET (ch);
1123 	    }
1124 	  /* Fall through.  */
1125 
1126 	case LEX_IS_LINE_SEPARATOR:
1127 	  state = 0;
1128 	  PUT (ch);
1129 	  break;
1130 
1131 	case LEX_IS_PARALLEL_SEPARATOR:
1132 	  state = 1;
1133 	  PUT (ch);
1134 	  break;
1135 
1136 #ifdef TC_V850
1137 	case LEX_IS_DOUBLEDASH_1ST:
1138 	  ch2 = GET ();
1139 	  if (ch2 != '-')
1140 	    {
1141 	      if (ch2 != EOF)
1142 		UNGET (ch2);
1143 	      goto de_fault;
1144 	    }
1145 	  /* Read and skip to end of line.  */
1146 	  do
1147 	    {
1148 	      ch = GET ();
1149 	    }
1150 	  while (ch != EOF && ch != '\n');
1151 
1152 	  if (ch == EOF)
1153 	    as_warn (_("end of file in comment; newline inserted"));
1154 
1155 	  state = 0;
1156 	  PUT ('\n');
1157 	  break;
1158 #endif
1159 #ifdef DOUBLEBAR_PARALLEL
1160 	case LEX_IS_DOUBLEBAR_1ST:
1161 	  ch2 = GET ();
1162 	  if (ch2 != EOF)
1163 	    UNGET (ch2);
1164 	  if (ch2 != '|')
1165 	    goto de_fault;
1166 
1167 	  /* Handle '||' in two states as invoking PUT twice might
1168 	     result in the first one jumping out of this loop.  We'd
1169 	     then lose track of the state and one '|' char.  */
1170 	  state = 13;
1171 	  PUT ('|');
1172 	  break;
1173 #endif
1174 	case LEX_IS_LINE_COMMENT_START:
1175 	  /* FIXME-someday: The two character comment stuff was badly
1176 	     thought out.  On i386, we want '/' as line comment start
1177 	     AND we want C style comments.  hence this hack.  The
1178 	     whole lexical process should be reworked.  xoxorich.  */
1179 	  if (ch == '/')
1180 	    {
1181 	      ch2 = GET ();
1182 	      if (ch2 == '*')
1183 		{
1184 		  old_state = 3;
1185 		  state = -2;
1186 		  break;
1187 		}
1188 	      else
1189 		{
1190 		  UNGET (ch2);
1191 		}
1192 	    }
1193 
1194 	  if (state == 0 || state == 1)	/* Only comment at start of line.  */
1195 	    {
1196 	      int startch;
1197 
1198 	      startch = ch;
1199 
1200 	      do
1201 		{
1202 		  ch = GET ();
1203 		}
1204 	      while (ch != EOF && IS_WHITESPACE (ch));
1205 
1206 	      if (ch == EOF)
1207 		{
1208 		  as_warn (_("end of file in comment; newline inserted"));
1209 		  PUT ('\n');
1210 		  break;
1211 		}
1212 
1213 	      if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1214 		{
1215 		  /* Not a cpp line.  */
1216 		  while (ch != EOF && !IS_NEWLINE (ch))
1217 		    ch = GET ();
1218 		  if (ch == EOF)
1219 		    as_warn (_("end of file in comment; newline inserted"));
1220 		  state = 0;
1221 		  PUT ('\n');
1222 		  break;
1223 		}
1224 	      /* Looks like `# 123 "filename"' from cpp.  */
1225 	      UNGET (ch);
1226 	      old_state = 4;
1227 	      state = -1;
1228 	      if (scrub_m68k_mri)
1229 		out_string = "\tlinefile ";
1230 	      else
1231 		out_string = "\t.linefile ";
1232 	      PUT (*out_string++);
1233 	      break;
1234 	    }
1235 
1236 #ifdef TC_D10V
1237 	  /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1238 	     Trap is the only short insn that has a first operand that is
1239 	     neither register nor label.
1240 	     We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1.
1241 	     We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1242 	     already LEX_IS_LINE_COMMENT_START.  However, it is the
1243 	     only character in line_comment_chars for d10v, hence we
1244 	     can recognize it as such.  */
1245 	  /* An alternative approach would be to reset the state to 1 when
1246 	     we see '||', '<-' or '->', but that seems to be overkill.  */
1247 	  if (state == 10)
1248 	    PUT (' ');
1249 #endif
1250 	  /* We have a line comment character which is not at the
1251 	     start of a line.  If this is also a normal comment
1252 	     character, fall through.  Otherwise treat it as a default
1253 	     character.  */
1254 	  if (strchr (tc_comment_chars, ch) == NULL
1255 	      && (! scrub_m68k_mri
1256 		  || (ch != '!' && ch != '*')))
1257 	    goto de_fault;
1258 	  if (scrub_m68k_mri
1259 	      && (ch == '!' || ch == '*' || ch == '#')
1260 	      && state != 1
1261 	      && state != 10)
1262 	    goto de_fault;
1263 	  /* Fall through.  */
1264 	case LEX_IS_COMMENT_START:
1265 #if defined TC_ARM && defined OBJ_ELF
1266 	  /* On the ARM, `@' is the comment character.
1267 	     Unfortunately this is also a special character in ELF .symver
1268 	     directives (and .type, though we deal with those another way).
1269 	     So we check if this line is such a directive, and treat
1270 	     the character as default if so.  This is a hack.  */
1271 	  if ((symver_state != NULL) && (*symver_state == 0))
1272 	    goto de_fault;
1273 #endif
1274 
1275 #ifdef TC_ARM
1276 	  /* For the ARM, care is needed not to damage occurrences of \@
1277 	     by stripping the @ onwards.  Yuck.  */
1278 	  if (to > tostart && *(to - 1) == '\\')
1279 	    /* Do not treat the @ as a start-of-comment.  */
1280 	    goto de_fault;
1281 #endif
1282 
1283 #ifdef WARN_COMMENTS
1284 	  if (!found_comment)
1285 	    as_where (&found_comment_file, &found_comment);
1286 #endif
1287 	  do
1288 	    {
1289 	      ch = GET ();
1290 	    }
1291 	  while (ch != EOF && !IS_NEWLINE (ch));
1292 	  if (ch == EOF)
1293 	    as_warn (_("end of file in comment; newline inserted"));
1294 	  state = 0;
1295 	  PUT ('\n');
1296 	  break;
1297 
1298 #ifdef H_TICK_HEX
1299 	case LEX_IS_H:
1300 	  /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1301 	     the H' with 0x to make them gas-style hex characters.  */
1302 	  if (enable_h_tick_hex)
1303 	    {
1304 	      char quot;
1305 
1306 	      quot = GET ();
1307 	      if (quot == '\'')
1308 		{
1309 		  UNGET ('x');
1310 		  ch = '0';
1311 		}
1312 	      else
1313 		UNGET (quot);
1314 	    }
1315 	  /* FALL THROUGH */
1316 #endif
1317 
1318 	case LEX_IS_SYMBOL_COMPONENT:
1319 	  if (state == 10)
1320 	    {
1321 	      /* This is a symbol character following another symbol
1322 		 character, with whitespace in between.  We skipped
1323 		 the whitespace earlier, so output it now.  */
1324 	      UNGET (ch);
1325 	      state = 3;
1326 	      PUT (' ');
1327 	      break;
1328 	    }
1329 
1330 #ifdef TC_Z80
1331 	  /* "af'" is a symbol containing '\''.  */
1332 	  if (state == 3 && (ch == 'a' || ch == 'A'))
1333 	    {
1334 	      state = 16;
1335 	      PUT (ch);
1336 	      ch = GET ();
1337 	      if (ch == 'f' || ch == 'F')
1338 		{
1339 		  state = 17;
1340 		  PUT (ch);
1341 		  break;
1342 		}
1343 	      else
1344 		{
1345 		  state = 9;
1346 		  if (!IS_SYMBOL_COMPONENT (ch))
1347 		    {
1348 		      if (ch != EOF)
1349 			UNGET (ch);
1350 		      break;
1351 		    }
1352 		}
1353 	    }
1354 #endif
1355 	  if (state == 3)
1356 	    state = 9;
1357 
1358 	  /* This is a common case.  Quickly copy CH and all the
1359 	     following symbol component or normal characters.  */
1360 	  if (to + 1 < toend
1361 	      && mri_state == NULL
1362 #if defined TC_ARM && defined OBJ_ELF
1363 	      && symver_state == NULL
1364 #endif
1365 	      )
1366 	    {
1367 	      char *s;
1368 	      int len;
1369 
1370 	      for (s = from; s < fromend; s++)
1371 		{
1372 		  int type;
1373 
1374 		  ch2 = *(unsigned char *) s;
1375 		  type = lex[ch2];
1376 		  if (type != 0
1377 		      && type != LEX_IS_SYMBOL_COMPONENT)
1378 		    break;
1379 		}
1380 
1381 	      if (s > from)
1382 		/* Handle the last character normally, for
1383 		   simplicity.  */
1384 		--s;
1385 
1386 	      len = s - from;
1387 
1388 	      if (len > (toend - to) - 1)
1389 		len = (toend - to) - 1;
1390 
1391 	      if (len > 0)
1392 		{
1393 		  PUT (ch);
1394 		  memcpy (to, from, len);
1395 		  to += len;
1396 		  from += len;
1397 		  if (to >= toend)
1398 		    goto tofull;
1399 		  ch = GET ();
1400 		}
1401 	    }
1402 
1403 	  /* Fall through.  */
1404 	default:
1405 	de_fault:
1406 	  /* Some relatively `normal' character.  */
1407 	  if (state == 0)
1408 	    {
1409 	      state = 11;	/* Now seeing label definition.  */
1410 	    }
1411 	  else if (state == 1)
1412 	    {
1413 	      state = 2;	/* Ditto.  */
1414 	    }
1415 	  else if (state == 9)
1416 	    {
1417 	      if (!IS_SYMBOL_COMPONENT (ch))
1418 		state = 3;
1419 	    }
1420 	  else if (state == 10)
1421 	    {
1422 	      if (ch == '\\')
1423 		{
1424 		  /* Special handling for backslash: a backslash may
1425 		     be the beginning of a formal parameter (of a
1426 		     macro) following another symbol character, with
1427 		     whitespace in between.  If that is the case, we
1428 		     output a space before the parameter.  Strictly
1429 		     speaking, correct handling depends upon what the
1430 		     macro parameter expands into; if the parameter
1431 		     expands into something which does not start with
1432 		     an operand character, then we don't want to keep
1433 		     the space.  We don't have enough information to
1434 		     make the right choice, so here we are making the
1435 		     choice which is more likely to be correct.  */
1436 		  if (to + 1 >= toend)
1437 		    {
1438 		      /* If we're near the end of the buffer, save the
1439 		         character for the next time round.  Otherwise
1440 		         we'll lose our state.  */
1441 		      UNGET (ch);
1442 		      goto tofull;
1443 		    }
1444 		  *to++ = ' ';
1445 		}
1446 
1447 	      state = 3;
1448 	    }
1449 	  PUT (ch);
1450 	  break;
1451 	}
1452     }
1453 
1454   /*NOTREACHED*/
1455 
1456  fromeof:
1457   /* We have reached the end of the input.  */
1458   return to - tostart;
1459 
1460  tofull:
1461   /* The output buffer is full.  Save any input we have not yet
1462      processed.  */
1463   if (fromend > from)
1464     {
1465       saved_input = from;
1466       saved_input_len = fromend - from;
1467     }
1468   else
1469     saved_input = NULL;
1470 
1471   return to - tostart;
1472 }
1473