xref: /netbsd-src/external/gpl3/binutils.old/dist/gas/app.c (revision 212397c69a103ae7e5eafa8731ddfae671d2dee7)
1 /* This is the Assembler Pre-Processor
2    Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
3    1999, 2000, 2001, 2002, 2003, 2005, 2006, 2007, 2008, 2009, 2010, 2012
4    Free Software Foundation, Inc.
5 
6    This file is part of GAS, the GNU Assembler.
7 
8    GAS is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3, or (at your option)
11    any later version.
12 
13    GAS is distributed in the hope that it will be useful, but WITHOUT
14    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
16    License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with GAS; see the file COPYING.  If not, write to the Free
20    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
21    02110-1301, USA.  */
22 
23 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
24 /* App, the assembler pre-processor.  This pre-processor strips out
25    excess spaces, turns single-quoted characters into a decimal
26    constant, and turns the # in # <number> <filename> <garbage> into a
27    .linefile.  This needs better error-handling.  */
28 
29 #include "as.h"
30 
31 #if (__STDC__ != 1)
/* The compiler does not claim strict ISO C conformance: unless someone
   has already defined `const', make it expand to nothing so that the
   const-qualified declarations in this file still compile.  */
32 #ifndef const
33 #define const  /* empty */
34 #endif
35 #endif
36 
37 #ifdef H_TICK_HEX
/* When nonzero, do_scrub_begin classifies `h' and `H' as LEX_IS_H.
   This is not static, so it is presumably set from outside this file
   (TODO: confirm where).  */
38 int enable_h_tick_hex = 0;
39 #endif
40 
41 #ifdef TC_M68K
42 /* Whether we are scrubbing in m68k MRI mode.  This is different from
43    flag_m68k_mri, because the two flags will be affected by the .mri
44    pseudo-op at different times.  */
45 static int scrub_m68k_mri;
46 
47 /* The pseudo-op which switches in and out of MRI mode.  See the
48    comment in do_scrub_chars.  */
49 static const char mri_pseudo[] = ".mri 0";
50 #else
/* Targets other than m68k never scrub in MRI mode.  Making the name a
   constant lets the tests on scrub_m68k_mri elsewhere in this file
   compile unchanged.  */
51 #define scrub_m68k_mri 0
52 #endif
53 
54 #if defined TC_ARM && defined OBJ_ELF
55 /* The pseudo-op for which we need to special-case `@' characters.
56    See the comment in do_scrub_chars.  */
57 static const char   symver_pseudo[] = ".symver";
/* How far do_scrub_chars has got in recognising symver_pseudo: NULL
   when no match is in progress, otherwise a pointer to the next
   character of symver_pseudo that is expected.  */
58 static const char * symver_state;
59 #endif
60 
/* Lexical class of each possible character value, indexed by the
   character taken as an unsigned char.  The table is filled in by
   do_scrub_begin; an entry of 0 means that no class has been assigned
   (do_scrub_begin relies on this when it tests lex['/']).  */
61 static char lex[256];
/* Characters which do_scrub_begin classifies as
   LEX_IS_SYMBOL_COMPONENT before any target-specific or comment
   classes are applied on top.  */
62 static const char symbol_chars[] =
63 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
64 
/* The class codes stored in lex[].  Some exist only for particular
   targets or options, as the conditionals show; 7 is not used.  */
65 #define LEX_IS_SYMBOL_COMPONENT		1
66 #define LEX_IS_WHITESPACE		2
67 #define LEX_IS_LINE_SEPARATOR		3
68 #define LEX_IS_COMMENT_START		4
69 #define LEX_IS_LINE_COMMENT_START	5
70 #define	LEX_IS_TWOCHAR_COMMENT_1ST	6
71 #define	LEX_IS_STRINGQUOTE		8
72 #define	LEX_IS_COLON			9
73 #define	LEX_IS_NEWLINE			10
74 #define	LEX_IS_ONECHAR_QUOTE		11
75 #ifdef TC_V850
76 #define LEX_IS_DOUBLEDASH_1ST		12
77 #endif
/* The M32R target always enables the `||' parallel separator.  */
78 #ifdef TC_M32R
79 #define DOUBLEBAR_PARALLEL
80 #endif
81 #ifdef DOUBLEBAR_PARALLEL
82 #define LEX_IS_DOUBLEBAR_1ST		13
83 #endif
84 #define LEX_IS_PARALLEL_SEPARATOR	14
85 #ifdef H_TICK_HEX
86 #define LEX_IS_H			15
87 #endif
/* Predicates on the class of character C.  C is used directly as an
   index into lex[], so it must be in the range 0..255; in particular
   EOF must be tested for before any of these is applied.  */
88 #define IS_SYMBOL_COMPONENT(c)		(lex[c] == LEX_IS_SYMBOL_COMPONENT)
89 #define IS_WHITESPACE(c)		(lex[c] == LEX_IS_WHITESPACE)
90 #define IS_LINE_SEPARATOR(c)		(lex[c] == LEX_IS_LINE_SEPARATOR)
91 #define IS_PARALLEL_SEPARATOR(c)	(lex[c] == LEX_IS_PARALLEL_SEPARATOR)
92 #define IS_COMMENT(c)			(lex[c] == LEX_IS_COMMENT_START)
93 #define IS_LINE_COMMENT(c)		(lex[c] == LEX_IS_LINE_COMMENT_START)
94 #define	IS_NEWLINE(c)			(lex[c] == LEX_IS_NEWLINE)
95 
96 static int process_escape (int);
97 
98 /* FIXME-soon: The entire lexer/parser thingy should be
99    built statically at compile time rather than dynamically
100    each and every time the assembler is run.  xoxorich.  */
101 
102 void
103 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
104 {
105   const char *p;
106   int c;
107 
108   lex[' '] = LEX_IS_WHITESPACE;
109   lex['\t'] = LEX_IS_WHITESPACE;
110   lex['\r'] = LEX_IS_WHITESPACE;
111   lex['\n'] = LEX_IS_NEWLINE;
112   lex[':'] = LEX_IS_COLON;
113 
114 #ifdef TC_M68K
115   scrub_m68k_mri = m68k_mri;
116 
117   if (! m68k_mri)
118 #endif
119     {
120       lex['"'] = LEX_IS_STRINGQUOTE;
121 
122 #if ! defined (TC_HPPA) && ! defined (TC_I370)
123       /* I370 uses single-quotes to delimit integer, float constants.  */
124       lex['\''] = LEX_IS_ONECHAR_QUOTE;
125 #endif
126 
127 #ifdef SINGLE_QUOTE_STRINGS
128       lex['\''] = LEX_IS_STRINGQUOTE;
129 #endif
130     }
131 
132   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
133      in state 5 of do_scrub_chars must be changed.  */
134 
135   /* Note that these override the previous defaults, e.g. if ';' is a
136      comment char, then it isn't a line separator.  */
137   for (p = symbol_chars; *p; ++p)
138     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
139 
140   for (c = 128; c < 256; ++c)
141     lex[c] = LEX_IS_SYMBOL_COMPONENT;
142 
143 #ifdef tc_symbol_chars
144   /* This macro permits the processor to specify all characters which
145      may appears in an operand.  This will prevent the scrubber from
146      discarding meaningful whitespace in certain cases.  The i386
147      backend uses this to support prefixes, which can confuse the
148      scrubber as to whether it is parsing operands or opcodes.  */
149   for (p = tc_symbol_chars; *p; ++p)
150     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
151 #endif
152 
153   /* The m68k backend wants to be able to change comment_chars.  */
154 #ifndef tc_comment_chars
155 #define tc_comment_chars comment_chars
156 #endif
157   for (p = tc_comment_chars; *p; p++)
158     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
159 
160   for (p = line_comment_chars; *p; p++)
161     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
162 
163   for (p = line_separator_chars; *p; p++)
164     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
165 
166 #ifdef tc_parallel_separator_chars
167   /* This macro permits the processor to specify all characters which
168      separate parallel insns on the same line.  */
169   for (p = tc_parallel_separator_chars; *p; p++)
170     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
171 #endif
172 
173   /* Only allow slash-star comments if slash is not in use.
174      FIXME: This isn't right.  We should always permit them.  */
175   if (lex['/'] == 0)
176     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
177 
178 #ifdef TC_M68K
179   if (m68k_mri)
180     {
181       lex['\''] = LEX_IS_STRINGQUOTE;
182       lex[';'] = LEX_IS_COMMENT_START;
183       lex['*'] = LEX_IS_LINE_COMMENT_START;
184       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
185 	 then it can't be used in an expression.  */
186       lex['!'] = LEX_IS_LINE_COMMENT_START;
187     }
188 #endif
189 
190 #ifdef TC_V850
191   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
192 #endif
193 #ifdef DOUBLEBAR_PARALLEL
194   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
195 #endif
196 #ifdef TC_D30V
197   /* Must do this is we want VLIW instruction with "->" or "<-".  */
198   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
199 #endif
200 
201 #ifdef H_TICK_HEX
202   if (enable_h_tick_hex)
203     {
204       lex['h'] = LEX_IS_H;
205       lex['H'] = LEX_IS_H;
206     }
207 #endif
208 }
209 
210 /* Saved state of the scrubber.  These variables persist from one call
   of do_scrub_chars to the next; app_push and app_pop save and restore
   them around a nested input.  */
/* Current state of the do_scrub_chars state machine.  */
211 static int state;
/* State to go back to once a temporary state (-1, -2 or 5) is done.  */
212 static int old_state;
/* Next character to emit while in state -1.  */
213 static char *out_string;
/* Holds the decimal text which replaces a one-character constant;
   out_string points into it while that text is being emitted.  */
214 static char out_buf[20];
/* Number of newlines swallowed inside comments or continued strings
   which still have to be emitted.  */
215 static int add_newlines;
/* If not NULL, SAVED_INPUT_LEN bytes of input which do_scrub_chars
   consumes before asking for more.  Presumably set when the output
   buffer fills up; the code doing so is not in view here -- confirm.  */
216 static char *saved_input;
217 static size_t saved_input_len;
/* Buffer which do_scrub_chars hands to its GET callback to be filled
   with raw input.  */
218 static char input_buffer[32 * 1024];
/* How far do_scrub_chars has got in recognising mri_pseudo: NULL when
   no match is in progress, otherwise a pointer to the next character
   expected.  Only consulted when TC_M68K is defined.  */
219 static const char *mri_state;
/* The last character accepted while matching mri_pseudo.  Once the
   whole pseudo-op has been seen, '1' means enter MRI mode.  */
220 static char mri_last_ch;
221 
222 /* Data structure for saving the state of app across #include's.  Note that
223    app is called asynchronously to the parsing of the .include's, so our
224    state at the time .include is interpreted is completely unrelated.
225    That's why we have to save it all.

   app_push allocates and fills in one of these, and app_pop copies it
   back into the file-scope variables of the same names and frees it.  */
226 
227 struct app_save
228 {
229   int          state;
230   int          old_state;
231   char *       out_string;
232   char         out_buf[sizeof (out_buf)];
233   int          add_newlines;
  /* A heap copy of the pending input made by app_push, or NULL if
     there was none.  saved_input_len is only meaningful when
     saved_input is not NULL.  */
234   char *       saved_input;
235   size_t       saved_input_len;
236 #ifdef TC_M68K
237   int          scrub_m68k_mri;
238 #endif
239   const char * mri_state;
240   char         mri_last_ch;
241 #if defined TC_ARM && defined OBJ_ELF
242   const char * symver_state;
243 #endif
244 };
245 
246 char *
247 app_push (void)
248 {
249   register struct app_save *saved;
250 
251   saved = (struct app_save *) xmalloc (sizeof (*saved));
252   saved->state = state;
253   saved->old_state = old_state;
254   saved->out_string = out_string;
255   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
256   saved->add_newlines = add_newlines;
257   if (saved_input == NULL)
258     saved->saved_input = NULL;
259   else
260     {
261       saved->saved_input = (char *) xmalloc (saved_input_len);
262       memcpy (saved->saved_input, saved_input, saved_input_len);
263       saved->saved_input_len = saved_input_len;
264     }
265 #ifdef TC_M68K
266   saved->scrub_m68k_mri = scrub_m68k_mri;
267 #endif
268   saved->mri_state = mri_state;
269   saved->mri_last_ch = mri_last_ch;
270 #if defined TC_ARM && defined OBJ_ELF
271   saved->symver_state = symver_state;
272 #endif
273 
274   /* do_scrub_begin() is not useful, just wastes time.  */
275 
276   state = 0;
277   saved_input = NULL;
278   add_newlines = 0;
279 
280   return (char *) saved;
281 }
282 
283 void
284 app_pop (char *arg)
285 {
286   register struct app_save *saved = (struct app_save *) arg;
287 
288   /* There is no do_scrub_end ().  */
289   state = saved->state;
290   old_state = saved->old_state;
291   out_string = saved->out_string;
292   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
293   add_newlines = saved->add_newlines;
294   if (saved->saved_input == NULL)
295     saved_input = NULL;
296   else
297     {
298       gas_assert (saved->saved_input_len <= sizeof (input_buffer));
299       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
300       saved_input = input_buffer;
301       saved_input_len = saved->saved_input_len;
302       free (saved->saved_input);
303     }
304 #ifdef TC_M68K
305   scrub_m68k_mri = saved->scrub_m68k_mri;
306 #endif
307   mri_state = saved->mri_state;
308   mri_last_ch = saved->mri_last_ch;
309 #if defined TC_ARM && defined OBJ_ELF
310   symver_state = saved->symver_state;
311 #endif
312 
313   free (arg);
314 }
315 
/* Translate CH, the character which followed a backslash, into the
   character the escape sequence stands for.  Only b, f, n, r, t and
   the two quote characters are translated; anything else is returned
   unchanged.

   @@ This assumes that \n &c are the same on host and target.  This is
   not necessarily true.  */

static int
process_escape (int ch)
{
  /* The two strings run in parallel: meanings[i] is what a backslash
     followed by letters[i] denotes.  */
  static const char letters[] = "bfnrt'\"";
  static const char meanings[] = "\b\f\n\r\t'\"";
  const char *hit;

  for (hit = letters; *hit != '\0'; ++hit)
    if (ch == *hit)
      return meanings[hit - letters];

  return ch;
}
342 
343 /* This function is called to process input characters.  The GET
344    parameter is used to retrieve more input characters.  GET should
345    set its parameter to point to a buffer, and return the length of
346    the buffer; it should return 0 at end of file.  The scrubbed output
347    characters are put into the buffer starting at TOSTART; the TOSTART
348    buffer is TOLEN bytes in length.  The function returns the number
349    of scrubbed characters put into TOSTART.  This will be TOLEN unless
350    end of file was seen.  This function is arranged as a state
351    machine, and saves its state so that it may return at any point.
352    This is the way the old code used to work.  */
353 
354 size_t
355 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
356 {
357   char *to = tostart;
358   char *toend = tostart + tolen;
359   char *from;
360   char *fromend;
361   size_t fromlen;
362   register int ch, ch2 = 0;
363   /* Character that started the string we're working on.  */
364   static char quotechar;
365 
366   /*State 0: beginning of normal line
367 	  1: After first whitespace on line (flush more white)
368 	  2: After first non-white (opcode) on line (keep 1white)
369 	  3: after second white on line (into operands) (flush white)
370 	  4: after putting out a .linefile, put out digits
371 	  5: parsing a string, then go to old-state
372 	  6: putting out \ escape in a "d string.
373 	  7: no longer used
374 	  8: no longer used
375 	  9: After seeing symbol char in state 3 (keep 1white after symchar)
376 	 10: After seeing whitespace in state 9 (keep white before symchar)
377 	 11: After seeing a symbol character in state 0 (eg a label definition)
378 	 -1: output string in out_string and go to the state in old_state
379 	 -2: flush text until a '*' '/' is seen, then go to state old_state
380 #ifdef TC_V850
381 	 12: After seeing a dash, looking for a second dash as a start
382 	     of comment.
383 #endif
384 #ifdef DOUBLEBAR_PARALLEL
385 	 13: After seeing a vertical bar, looking for a second
386 	     vertical bar as a parallel expression separator.
387 #endif
388 #ifdef TC_PREDICATE_START_CHAR
389 	 14: After seeing a predicate start character at state 0, looking
390 	     for a predicate end character as predicate.
391 	 15: After seeing a predicate start character at state 1, looking
392 	     for a predicate end character as predicate.
393 #endif
394 #ifdef TC_Z80
395 	 16: After seeing an 'a' or an 'A' at the start of a symbol
396 	 17: After seeing an 'f' or an 'F' in state 16
397 #endif
398 	  */
399 
400   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
401      constructs like ``.loc 1 20''.  This was turning into ``.loc
402      120''.  States 9 and 10 ensure that a space is never dropped in
403      between characters which could appear in an identifier.  Ian
404      Taylor, ian@cygnus.com.
405 
406      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
407      correctly on the PA (and any other target where colons are optional).
408      Jeff Law, law@cs.utah.edu.
409 
410      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
411      get squashed into "cmp r1,r2||trap#1", with the all important space
412      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
413 
414   /* This macro gets the next input character.  */
415 
416 #define GET()							\
417   (from < fromend						\
418    ? * (unsigned char *) (from++)				\
419    : (saved_input = NULL,					\
420       fromlen = (*get) (input_buffer, sizeof input_buffer),	\
421       from = input_buffer,					\
422       fromend = from + fromlen,					\
423       (fromlen == 0						\
424        ? EOF							\
425        : * (unsigned char *) (from++))))
426 
427   /* This macro pushes a character back on the input stream.  */
428 
429 #define UNGET(uch) (*--from = (uch))
430 
431   /* This macro puts a character into the output buffer.  If this
432      character fills the output buffer, this macro jumps to the label
433      TOFULL.  We use this rather ugly approach because we need to
434      handle two different termination conditions: EOF on the input
435      stream, and a full output buffer.  It would be simpler if we
436      always read in the entire input stream before processing it, but
437      I don't want to make such a significant change to the assembler's
438      memory usage.  */
439 
440 #define PUT(pch)				\
441   do						\
442     {						\
443       *to++ = (pch);				\
444       if (to >= toend)				\
445 	goto tofull;				\
446     }						\
447   while (0)
448 
449   if (saved_input != NULL)
450     {
451       from = saved_input;
452       fromend = from + saved_input_len;
453     }
454   else
455     {
456       fromlen = (*get) (input_buffer, sizeof input_buffer);
457       if (fromlen == 0)
458 	return 0;
459       from = input_buffer;
460       fromend = from + fromlen;
461     }
462 
463   while (1)
464     {
465       /* The cases in this switch end with continue, in order to
466 	 branch back to the top of this while loop and generate the
467 	 next output character in the appropriate state.  */
468       switch (state)
469 	{
470 	case -1:
471 	  ch = *out_string++;
472 	  if (*out_string == '\0')
473 	    {
474 	      state = old_state;
475 	      old_state = 3;
476 	    }
477 	  PUT (ch);
478 	  continue;
479 
480 	case -2:
481 	  for (;;)
482 	    {
483 	      do
484 		{
485 		  ch = GET ();
486 
487 		  if (ch == EOF)
488 		    {
489 		      as_warn (_("end of file in comment"));
490 		      goto fromeof;
491 		    }
492 
493 		  if (ch == '\n')
494 		    PUT ('\n');
495 		}
496 	      while (ch != '*');
497 
498 	      while ((ch = GET ()) == '*')
499 		;
500 
501 	      if (ch == EOF)
502 		{
503 		  as_warn (_("end of file in comment"));
504 		  goto fromeof;
505 		}
506 
507 	      if (ch == '/')
508 		break;
509 
510 	      UNGET (ch);
511 	    }
512 
513 	  state = old_state;
514 	  UNGET (' ');
515 	  continue;
516 
517 	case 4:
518 	  ch = GET ();
519 	  if (ch == EOF)
520 	    goto fromeof;
521 	  else if (ch >= '0' && ch <= '9')
522 	    PUT (ch);
523 	  else
524 	    {
525 	      while (ch != EOF && IS_WHITESPACE (ch))
526 		ch = GET ();
527 	      if (ch == '"')
528 		{
529 		  quotechar = ch;
530 		  state = 5;
531 		  old_state = 3;
532 		  PUT (ch);
533 		}
534 	      else
535 		{
536 		  while (ch != EOF && ch != '\n')
537 		    ch = GET ();
538 		  state = 0;
539 		  PUT (ch);
540 		}
541 	    }
542 	  continue;
543 
544 	case 5:
545 	  /* We are going to copy everything up to a quote character,
546 	     with special handling for a backslash.  We try to
547 	     optimize the copying in the simple case without using the
548 	     GET and PUT macros.  */
549 	  {
550 	    char *s;
551 	    ptrdiff_t len;
552 
553 	    for (s = from; s < fromend; s++)
554 	      {
555 		ch = *s;
556 		if (ch == '\\'
557 		    || ch == quotechar
558 		    || ch == '\n')
559 		  break;
560 	      }
561 	    len = s - from;
562 	    if (len > toend - to)
563 	      len = toend - to;
564 	    if (len > 0)
565 	      {
566 		memcpy (to, from, len);
567 		to += len;
568 		from += len;
569 		if (to >= toend)
570 		  goto tofull;
571 	      }
572 	  }
573 
574 	  ch = GET ();
575 	  if (ch == EOF)
576 	    {
577 	      /* This buffer is here specifically so
578 		 that the UNGET below will work.  */
579 	      static char one_char_buf[1];
580 
581 	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
582 	      state = old_state;
583 	      from = fromend = one_char_buf + 1;
584 	      fromlen = 1;
585 	      UNGET ('\n');
586 	      PUT (quotechar);
587 	    }
588 	  else if (ch == quotechar)
589 	    {
590 	      state = old_state;
591 	      PUT (ch);
592 	    }
593 #ifndef NO_STRING_ESCAPES
594 	  else if (ch == '\\')
595 	    {
596 	      state = 6;
597 	      PUT (ch);
598 	    }
599 #endif
600 	  else if (scrub_m68k_mri && ch == '\n')
601 	    {
602 	      /* Just quietly terminate the string.  This permits lines like
603 		   bne	label	loop if we haven't reach end yet.  */
604 	      state = old_state;
605 	      UNGET (ch);
606 	      PUT ('\'');
607 	    }
608 	  else
609 	    {
610 	      PUT (ch);
611 	    }
612 	  continue;
613 
614 	case 6:
615 	  state = 5;
616 	  ch = GET ();
617 	  switch (ch)
618 	    {
619 	      /* Handle strings broken across lines, by turning '\n' into
620 		 '\\' and 'n'.  */
621 	    case '\n':
622 	      UNGET ('n');
623 	      add_newlines++;
624 	      PUT ('\\');
625 	      continue;
626 
627 	    case EOF:
628 	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
629 	      PUT (quotechar);
630 	      continue;
631 
632 	    case '"':
633 	    case '\\':
634 	    case 'b':
635 	    case 'f':
636 	    case 'n':
637 	    case 'r':
638 	    case 't':
639 	    case 'v':
640 	    case 'x':
641 	    case 'X':
642 	    case '0':
643 	    case '1':
644 	    case '2':
645 	    case '3':
646 	    case '4':
647 	    case '5':
648 	    case '6':
649 	    case '7':
650 	      break;
651 
652 	    default:
653 #ifdef ONLY_STANDARD_ESCAPES
654 	      as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
655 #endif
656 	      break;
657 	    }
658 	  PUT (ch);
659 	  continue;
660 
661 #ifdef DOUBLEBAR_PARALLEL
662 	case 13:
663 	  ch = GET ();
664 	  if (ch != '|')
665 	    abort ();
666 
667 	  /* Reset back to state 1 and pretend that we are parsing a
668 	     line from just after the first white space.  */
669 	  state = 1;
670 	  PUT ('|');
671 #ifdef TC_TIC6X
672 	  /* "||^" is used for SPMASKed instructions.  */
673 	  ch = GET ();
674 	  if (ch == EOF)
675 	    goto fromeof;
676 	  else if (ch == '^')
677 	    PUT ('^');
678 	  else
679 	    UNGET (ch);
680 #endif
681 	  continue;
682 #endif
683 #ifdef TC_Z80
684 	case 16:
685 	  /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
686 	  ch = GET ();
687 	  if (ch == 'f' || ch == 'F')
688 	    {
689 	      state = 17;
690 	      PUT (ch);
691 	    }
692 	  else
693 	    {
694 	      state = 9;
695 	      break;
696 	    }
697 	case 17:
698 	  /* We have seen "af" at the start of a symbol,
699 	     a ' here is a part of that symbol.  */
700 	  ch = GET ();
701 	  state = 9;
702 	  if (ch == '\'')
703 	    /* Change to avoid warning about unclosed string.  */
704 	    PUT ('`');
705 	  else if (ch != EOF)
706 	    UNGET (ch);
707 	  break;
708 #endif
709 	}
710 
711       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
712 
713       /* flushchar: */
714       ch = GET ();
715 
716 #ifdef TC_PREDICATE_START_CHAR
717       if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
718 	{
719 	  state += 14;
720 	  PUT (ch);
721 	  continue;
722 	}
723       else if (state == 14 || state == 15)
724 	{
725 	  if (ch == TC_PREDICATE_END_CHAR)
726 	    {
727 	      state -= 14;
728 	      PUT (ch);
729 	      ch = GET ();
730 	    }
731 	  else
732 	    {
733 	      PUT (ch);
734 	      continue;
735 	    }
736 	}
737 #endif
738 
739     recycle:
740 
741 #if defined TC_ARM && defined OBJ_ELF
742       /* We need to watch out for .symver directives.  See the comment later
743 	 in this function.  */
744       if (symver_state == NULL)
745 	{
746 	  if ((state == 0 || state == 1) && ch == symver_pseudo[0])
747 	    symver_state = symver_pseudo + 1;
748 	}
749       else
750 	{
751 	  /* We advance to the next state if we find the right
752 	     character.  */
753 	  if (ch != '\0' && (*symver_state == ch))
754 	    ++symver_state;
755 	  else if (*symver_state != '\0')
756 	    /* We did not get the expected character, or we didn't
757 	       get a valid terminating character after seeing the
758 	       entire pseudo-op, so we must go back to the beginning.  */
759 	    symver_state = NULL;
760 	  else
761 	    {
762 	      /* We've read the entire pseudo-op.  If this is the end
763 		 of the line, go back to the beginning.  */
764 	      if (IS_NEWLINE (ch))
765 		symver_state = NULL;
766 	    }
767 	}
768 #endif /* TC_ARM && OBJ_ELF */
769 
770 #ifdef TC_M68K
771       /* We want to have pseudo-ops which control whether we are in
772 	 MRI mode or not.  Unfortunately, since m68k MRI mode affects
773 	 the scrubber, that means that we need a special purpose
774 	 recognizer here.  */
775       if (mri_state == NULL)
776 	{
777 	  if ((state == 0 || state == 1)
778 	      && ch == mri_pseudo[0])
779 	    mri_state = mri_pseudo + 1;
780 	}
781       else
782 	{
783 	  /* We advance to the next state if we find the right
784 	     character, or if we need a space character and we get any
785 	     whitespace character, or if we need a '0' and we get a
786 	     '1' (this is so that we only need one state to handle
787 	     ``.mri 0'' and ``.mri 1'').  */
788 	  if (ch != '\0'
789 	      && (*mri_state == ch
790 		  || (*mri_state == ' '
791 		      && lex[ch] == LEX_IS_WHITESPACE)
792 		  || (*mri_state == '0'
793 		      && ch == '1')))
794 	    {
795 	      mri_last_ch = ch;
796 	      ++mri_state;
797 	    }
798 	  else if (*mri_state != '\0'
799 		   || (lex[ch] != LEX_IS_WHITESPACE
800 		       && lex[ch] != LEX_IS_NEWLINE))
801 	    {
802 	      /* We did not get the expected character, or we didn't
803 		 get a valid terminating character after seeing the
804 		 entire pseudo-op, so we must go back to the
805 		 beginning.  */
806 	      mri_state = NULL;
807 	    }
808 	  else
809 	    {
810 	      /* We've read the entire pseudo-op.  mips_last_ch is
811 		 either '0' or '1' indicating whether to enter or
812 		 leave MRI mode.  */
813 	      do_scrub_begin (mri_last_ch == '1');
814 	      mri_state = NULL;
815 
816 	      /* We continue handling the character as usual.  The
817 		 main gas reader must also handle the .mri pseudo-op
818 		 to control expression parsing and the like.  */
819 	    }
820 	}
821 #endif
822 
823       if (ch == EOF)
824 	{
825 	  if (state != 0)
826 	    {
827 	      as_warn (_("end of file not at end of a line; newline inserted"));
828 	      state = 0;
829 	      PUT ('\n');
830 	    }
831 	  goto fromeof;
832 	}
833 
834       switch (lex[ch])
835 	{
836 	case LEX_IS_WHITESPACE:
837 	  do
838 	    {
839 	      ch = GET ();
840 	    }
841 	  while (ch != EOF && IS_WHITESPACE (ch));
842 	  if (ch == EOF)
843 	    goto fromeof;
844 
845 	  if (state == 0)
846 	    {
847 	      /* Preserve a single whitespace character at the
848 		 beginning of a line.  */
849 	      state = 1;
850 	      UNGET (ch);
851 	      PUT (' ');
852 	      break;
853 	    }
854 
855 #ifdef KEEP_WHITE_AROUND_COLON
856 	  if (lex[ch] == LEX_IS_COLON)
857 	    {
858 	      /* Only keep this white if there's no white *after* the
859 		 colon.  */
860 	      ch2 = GET ();
861 	      if (ch2 != EOF)
862 		UNGET (ch2);
863 	      if (!IS_WHITESPACE (ch2))
864 		{
865 		  state = 9;
866 		  UNGET (ch);
867 		  PUT (' ');
868 		  break;
869 		}
870 	    }
871 #endif
872 	  if (IS_COMMENT (ch)
873 	      || ch == '/'
874 	      || IS_LINE_SEPARATOR (ch)
875 	      || IS_PARALLEL_SEPARATOR (ch))
876 	    {
877 	      if (scrub_m68k_mri)
878 		{
879 		  /* In MRI mode, we keep these spaces.  */
880 		  UNGET (ch);
881 		  PUT (' ');
882 		  break;
883 		}
884 	      goto recycle;
885 	    }
886 
887 	  /* If we're in state 2 or 11, we've seen a non-white
888 	     character followed by whitespace.  If the next character
889 	     is ':', this is whitespace after a label name which we
890 	     normally must ignore.  In MRI mode, though, spaces are
891 	     not permitted between the label and the colon.  */
892 	  if ((state == 2 || state == 11)
893 	      && lex[ch] == LEX_IS_COLON
894 	      && ! scrub_m68k_mri)
895 	    {
896 	      state = 1;
897 	      PUT (ch);
898 	      break;
899 	    }
900 
901 	  switch (state)
902 	    {
903 	    case 1:
904 	      /* We can arrive here if we leave a leading whitespace
905 		 character at the beginning of a line.  */
906 	      goto recycle;
907 	    case 2:
908 	      state = 3;
909 	      if (to + 1 < toend)
910 		{
911 		  /* Optimize common case by skipping UNGET/GET.  */
912 		  PUT (' ');	/* Sp after opco */
913 		  goto recycle;
914 		}
915 	      UNGET (ch);
916 	      PUT (' ');
917 	      break;
918 	    case 3:
919 #ifndef TC_KEEP_OPERAND_SPACES
920 	      /* For TI C6X, we keep these spaces as they may separate
921 		 functional unit specifiers from operands.  */
922 	      if (scrub_m68k_mri)
923 #endif
924 		{
925 		  /* In MRI mode, we keep these spaces.  */
926 		  UNGET (ch);
927 		  PUT (' ');
928 		  break;
929 		}
930 	      goto recycle;	/* Sp in operands */
931 	    case 9:
932 	    case 10:
933 #ifndef TC_KEEP_OPERAND_SPACES
934 	      if (scrub_m68k_mri)
935 #endif
936 		{
937 		  /* In MRI mode, we keep these spaces.  */
938 		  state = 3;
939 		  UNGET (ch);
940 		  PUT (' ');
941 		  break;
942 		}
943 	      state = 10;	/* Sp after symbol char */
944 	      goto recycle;
945 	    case 11:
946 	      if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
947 		state = 1;
948 	      else
949 		{
950 		  /* We know that ch is not ':', since we tested that
951 		     case above.  Therefore this is not a label, so it
952 		     must be the opcode, and we've just seen the
953 		     whitespace after it.  */
954 		  state = 3;
955 		}
956 	      UNGET (ch);
957 	      PUT (' ');	/* Sp after label definition.  */
958 	      break;
959 	    default:
960 	      BAD_CASE (state);
961 	    }
962 	  break;
963 
964 	case LEX_IS_TWOCHAR_COMMENT_1ST:
965 	  ch2 = GET ();
966 	  if (ch2 == '*')
967 	    {
968 	      for (;;)
969 		{
970 		  do
971 		    {
972 		      ch2 = GET ();
973 		      if (ch2 != EOF && IS_NEWLINE (ch2))
974 			add_newlines++;
975 		    }
976 		  while (ch2 != EOF && ch2 != '*');
977 
978 		  while (ch2 == '*')
979 		    ch2 = GET ();
980 
981 		  if (ch2 == EOF || ch2 == '/')
982 		    break;
983 
984 		  /* This UNGET will ensure that we count newlines
985 		     correctly.  */
986 		  UNGET (ch2);
987 		}
988 
989 	      if (ch2 == EOF)
990 		as_warn (_("end of file in multiline comment"));
991 
992 	      ch = ' ';
993 	      goto recycle;
994 	    }
995 #ifdef DOUBLESLASH_LINE_COMMENTS
996 	  else if (ch2 == '/')
997 	    {
998 	      do
999 		{
1000 		  ch = GET ();
1001 		}
1002 	      while (ch != EOF && !IS_NEWLINE (ch));
1003 	      if (ch == EOF)
1004 		as_warn ("end of file in comment; newline inserted");
1005 	      state = 0;
1006 	      PUT ('\n');
1007 	      break;
1008 	    }
1009 #endif
1010 	  else
1011 	    {
1012 	      if (ch2 != EOF)
1013 		UNGET (ch2);
1014 	      if (state == 9 || state == 10)
1015 		state = 3;
1016 	      PUT (ch);
1017 	    }
1018 	  break;
1019 
1020 	case LEX_IS_STRINGQUOTE:
1021 	  quotechar = ch;
1022 	  if (state == 10)
1023 	    {
1024 	      /* Preserve the whitespace in foo "bar".  */
1025 	      UNGET (ch);
1026 	      state = 3;
1027 	      PUT (' ');
1028 
1029 	      /* PUT didn't jump out.  We could just break, but we
1030 		 know what will happen, so optimize a bit.  */
1031 	      ch = GET ();
1032 	      old_state = 3;
1033 	    }
1034 	  else if (state == 9)
1035 	    old_state = 3;
1036 	  else
1037 	    old_state = state;
1038 	  state = 5;
1039 	  PUT (ch);
1040 	  break;
1041 
1042 #ifndef IEEE_STYLE
1043 	case LEX_IS_ONECHAR_QUOTE:
1044 #ifdef H_TICK_HEX
1045 	  if (state == 9 && enable_h_tick_hex)
1046 	    {
1047 	      char c;
1048 
1049 	      c = GET ();
1050 	      as_warn ("'%c found after symbol", c);
1051 	      UNGET (c);
1052 	    }
1053 #endif
1054 	  if (state == 10)
1055 	    {
1056 	      /* Preserve the whitespace in foo 'b'.  */
1057 	      UNGET (ch);
1058 	      state = 3;
1059 	      PUT (' ');
1060 	      break;
1061 	    }
1062 	  ch = GET ();
1063 	  if (ch == EOF)
1064 	    {
1065 	      as_warn (_("end of file after a one-character quote; \\0 inserted"));
1066 	      ch = 0;
1067 	    }
1068 	  if (ch == '\\')
1069 	    {
1070 	      ch = GET ();
1071 	      if (ch == EOF)
1072 		{
1073 		  as_warn (_("end of file in escape character"));
1074 		  ch = '\\';
1075 		}
1076 	      else
1077 		ch = process_escape (ch);
1078 	    }
1079 	  sprintf (out_buf, "%d", (int) (unsigned char) ch);
1080 
1081 	  /* None of these 'x constants for us.  We want 'x'.  */
1082 	  if ((ch = GET ()) != '\'')
1083 	    {
1084 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1085 	      as_warn (_("missing close quote; (assumed)"));
1086 #else
1087 	      if (ch != EOF)
1088 		UNGET (ch);
1089 #endif
1090 	    }
1091 	  if (strlen (out_buf) == 1)
1092 	    {
1093 	      PUT (out_buf[0]);
1094 	      break;
1095 	    }
1096 	  if (state == 9)
1097 	    old_state = 3;
1098 	  else
1099 	    old_state = state;
1100 	  state = -1;
1101 	  out_string = out_buf;
1102 	  PUT (*out_string++);
1103 	  break;
1104 #endif
1105 
1106 	case LEX_IS_COLON:
1107 #ifdef KEEP_WHITE_AROUND_COLON
1108 	  state = 9;
1109 #else
1110 	  if (state == 9 || state == 10)
1111 	    state = 3;
1112 	  else if (state != 3)
1113 	    state = 1;
1114 #endif
1115 	  PUT (ch);
1116 	  break;
1117 
1118 	case LEX_IS_NEWLINE:
1119 	  /* Roll out a bunch of newlines from inside comments, etc.  */
1120 	  if (add_newlines)
1121 	    {
1122 	      --add_newlines;
1123 	      UNGET (ch);
1124 	    }
1125 	  /* Fall through.  */
1126 
1127 	case LEX_IS_LINE_SEPARATOR:
1128 	  state = 0;
1129 	  PUT (ch);
1130 	  break;
1131 
1132 	case LEX_IS_PARALLEL_SEPARATOR:
1133 	  state = 1;
1134 	  PUT (ch);
1135 	  break;
1136 
1137 #ifdef TC_V850
1138 	case LEX_IS_DOUBLEDASH_1ST:
1139 	  ch2 = GET ();
1140 	  if (ch2 != '-')
1141 	    {
1142 	      if (ch2 != EOF)
1143 		UNGET (ch2);
1144 	      goto de_fault;
1145 	    }
1146 	  /* Read and skip to end of line.  */
1147 	  do
1148 	    {
1149 	      ch = GET ();
1150 	    }
1151 	  while (ch != EOF && ch != '\n');
1152 
1153 	  if (ch == EOF)
1154 	    as_warn (_("end of file in comment; newline inserted"));
1155 
1156 	  state = 0;
1157 	  PUT ('\n');
1158 	  break;
1159 #endif
1160 #ifdef DOUBLEBAR_PARALLEL
1161 	case LEX_IS_DOUBLEBAR_1ST:
1162 	  ch2 = GET ();
1163 	  if (ch2 != EOF)
1164 	    UNGET (ch2);
1165 	  if (ch2 != '|')
1166 	    goto de_fault;
1167 
1168 	  /* Handle '||' in two states as invoking PUT twice might
1169 	     result in the first one jumping out of this loop.  We'd
1170 	     then lose track of the state and one '|' char.  */
1171 	  state = 13;
1172 	  PUT ('|');
1173 	  break;
1174 #endif
1175 	case LEX_IS_LINE_COMMENT_START:
1176 	  /* FIXME-someday: The two character comment stuff was badly
1177 	     thought out.  On i386, we want '/' as line comment start
1178 	     AND we want C style comments.  Hence this hack.  The
1179 	     whole lexical process should be reworked.  xoxorich.  */
1180 	  if (ch == '/')
1181 	    {
1182 	      ch2 = GET ();
1183 	      if (ch2 == '*')
1184 		{
1185 		  old_state = 3;
1186 		  state = -2;
1187 		  break;
1188 		}
1189 	      else
1190 		{
1191 		  UNGET (ch2);
1192 		}
1193 	    }
1194 
1195 	  if (state == 0 || state == 1)	/* Only comment at start of line.  */
1196 	    {
1197 	      int startch;
1198 
1199 	      startch = ch;
1200 
1201 	      do
1202 		{
1203 		  ch = GET ();
1204 		}
1205 	      while (ch != EOF && IS_WHITESPACE (ch));
1206 
1207 	      if (ch == EOF)
1208 		{
1209 		  as_warn (_("end of file in comment; newline inserted"));
1210 		  PUT ('\n');
1211 		  break;
1212 		}
1213 
1214 	      if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1215 		{
1216 		  /* Not a cpp line.  */
1217 		  while (ch != EOF && !IS_NEWLINE (ch))
1218 		    ch = GET ();
1219 		  if (ch == EOF)
1220 		    as_warn (_("end of file in comment; newline inserted"));
1221 		  state = 0;
1222 		  PUT ('\n');
1223 		  break;
1224 		}
1225 	      /* Looks like `# 123 "filename"' from cpp.  */
1226 	      UNGET (ch);
1227 	      old_state = 4;
1228 	      state = -1;
1229 	      if (scrub_m68k_mri)
1230 		out_string = "\tlinefile ";
1231 	      else
1232 		out_string = "\t.linefile ";
1233 	      PUT (*out_string++);
1234 	      break;
1235 	    }
1236 
1237 #ifdef TC_D10V
1238 	  /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1239 	     Trap is the only short insn that has a first operand that is
1240 	     neither register nor label.
1241 	     We must keep exef0f ||trap #1 from degenerating to exef0f ||trap#1.
1242 	     We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1243 	     already LEX_IS_LINE_COMMENT_START.  However, it is the
1244 	     only character in line_comment_chars for d10v, hence we
1245 	     can recognize it as such.  */
1246 	  /* An alternative approach would be to reset the state to 1 when
1247 	     we see '||', '<-' or '->', but that seems to be overkill.  */
1248 	  if (state == 10)
1249 	    PUT (' ');
1250 #endif
1251 	  /* We have a line comment character which is not at the
1252 	     start of a line.  If this is also a normal comment
1253 	     character, fall through.  Otherwise treat it as a default
1254 	     character.  */
1255 	  if (strchr (tc_comment_chars, ch) == NULL
1256 	      && (! scrub_m68k_mri
1257 		  || (ch != '!' && ch != '*')))
1258 	    goto de_fault;
1259 	  if (scrub_m68k_mri
1260 	      && (ch == '!' || ch == '*' || ch == '#')
1261 	      && state != 1
1262 	      && state != 10)
1263 	    goto de_fault;
1264 	  /* Fall through.  */
1265 	case LEX_IS_COMMENT_START:
1266 #if defined TC_ARM && defined OBJ_ELF
1267 	  /* On the ARM, `@' is the comment character.
1268 	     Unfortunately this is also a special character in ELF .symver
1269 	     directives (and .type, though we deal with those another way).
1270 	     So we check if this line is such a directive, and treat
1271 	     the character as default if so.  This is a hack.  */
1272 	  if ((symver_state != NULL) && (*symver_state == 0))
1273 	    goto de_fault;
1274 #endif
1275 
1276 #ifdef TC_ARM
1277 	  /* For the ARM, care is needed not to damage occurrences of \@
1278 	     by stripping the @ onwards.  Yuck.  */
1279 	  if (to > tostart && *(to - 1) == '\\')
1280 	    /* Do not treat the @ as a start-of-comment.  */
1281 	    goto de_fault;
1282 #endif
1283 
1284 #ifdef WARN_COMMENTS
1285 	  if (!found_comment)
1286 	    as_where (&found_comment_file, &found_comment);
1287 #endif
1288 	  do
1289 	    {
1290 	      ch = GET ();
1291 	    }
1292 	  while (ch != EOF && !IS_NEWLINE (ch));
1293 	  if (ch == EOF)
1294 	    as_warn (_("end of file in comment; newline inserted"));
1295 	  state = 0;
1296 	  PUT ('\n');
1297 	  break;
1298 
1299 #ifdef H_TICK_HEX
1300 	case LEX_IS_H:
1301 	  /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1302 	     the H' with 0x to make them gas-style hex characters.  */
1303 	  if (enable_h_tick_hex)
1304 	    {
1305 	      char quot;
1306 
1307 	      quot = GET ();
1308 	      if (quot == '\'')
1309 		{
1310 		  UNGET ('x');
1311 		  ch = '0';
1312 		}
1313 	      else
1314 		UNGET (quot);
1315 	    }
1316 	  /* FALL THROUGH */
1317 #endif
1318 
1319 	case LEX_IS_SYMBOL_COMPONENT:
1320 	  if (state == 10)
1321 	    {
1322 	      /* This is a symbol character following another symbol
1323 		 character, with whitespace in between.  We skipped
1324 		 the whitespace earlier, so output it now.  */
1325 	      UNGET (ch);
1326 	      state = 3;
1327 	      PUT (' ');
1328 	      break;
1329 	    }
1330 
1331 #ifdef TC_Z80
1332 	  /* "af'" is a symbol containing '\''.  */
1333 	  if (state == 3 && (ch == 'a' || ch == 'A'))
1334 	    {
1335 	      state = 16;
1336 	      PUT (ch);
1337 	      ch = GET ();
1338 	      if (ch == 'f' || ch == 'F')
1339 		{
1340 		  state = 17;
1341 		  PUT (ch);
1342 		  break;
1343 		}
1344 	      else
1345 		{
1346 		  state = 9;
1347 		  if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1348 		    {
1349 		      if (ch != EOF)
1350 			UNGET (ch);
1351 		      break;
1352 		    }
1353 		}
1354 	    }
1355 #endif
1356 	  if (state == 3)
1357 	    state = 9;
1358 
1359 	  /* This is a common case.  Quickly copy CH and all the
1360 	     following symbol component or normal characters.  */
1361 	  if (to + 1 < toend
1362 	      && mri_state == NULL
1363 #if defined TC_ARM && defined OBJ_ELF
1364 	      && symver_state == NULL
1365 #endif
1366 	      )
1367 	    {
1368 	      char *s;
1369 	      ptrdiff_t len;
1370 
1371 	      for (s = from; s < fromend; s++)
1372 		{
1373 		  int type;
1374 
1375 		  ch2 = *(unsigned char *) s;
1376 		  type = lex[ch2];
1377 		  if (type != 0
1378 		      && type != LEX_IS_SYMBOL_COMPONENT)
1379 		    break;
1380 		}
1381 
1382 	      if (s > from)
1383 		/* Handle the last character normally, for
1384 		   simplicity.  */
1385 		--s;
1386 
1387 	      len = s - from;
1388 
1389 	      if (len > (toend - to) - 1)
1390 		len = (toend - to) - 1;
1391 
1392 	      if (len > 0)
1393 		{
1394 		  PUT (ch);
1395 		  memcpy (to, from, len);
1396 		  to += len;
1397 		  from += len;
1398 		  if (to >= toend)
1399 		    goto tofull;
1400 		  ch = GET ();
1401 		}
1402 	    }
1403 
1404 	  /* Fall through.  */
1405 	default:
1406 	de_fault:
1407 	  /* Some relatively `normal' character.  */
1408 	  if (state == 0)
1409 	    {
1410 	      state = 11;	/* Now seeing label definition.  */
1411 	    }
1412 	  else if (state == 1)
1413 	    {
1414 	      state = 2;	/* Ditto.  */
1415 	    }
1416 	  else if (state == 9)
1417 	    {
1418 	      if (!IS_SYMBOL_COMPONENT (ch))
1419 		state = 3;
1420 	    }
1421 	  else if (state == 10)
1422 	    {
1423 	      if (ch == '\\')
1424 		{
1425 		  /* Special handling for backslash: a backslash may
1426 		     be the beginning of a formal parameter (of a
1427 		     macro) following another symbol character, with
1428 		     whitespace in between.  If that is the case, we
1429 		     output a space before the parameter.  Strictly
1430 		     speaking, correct handling depends upon what the
1431 		     macro parameter expands into; if the parameter
1432 		     expands into something which does not start with
1433 		     an operand character, then we don't want to keep
1434 		     the space.  We don't have enough information to
1435 		     make the right choice, so here we are making the
1436 		     choice which is more likely to be correct.  */
1437 		  if (to + 1 >= toend)
1438 		    {
1439 		      /* If we're near the end of the buffer, save the
1440 		         character for the next time round.  Otherwise
1441 		         we'll lose our state.  */
1442 		      UNGET (ch);
1443 		      goto tofull;
1444 		    }
1445 		  *to++ = ' ';
1446 		}
1447 
1448 	      state = 3;
1449 	    }
1450 	  PUT (ch);
1451 	  break;
1452 	}
1453     }
1454 
1455   /*NOTREACHED*/
1456 
1457  fromeof:
1458   /* We have reached the end of the input.  */
1459   return to - tostart;
1460 
1461  tofull:
1462   /* The output buffer is full.  Save any input we have not yet
1463      processed.  */
1464   if (fromend > from)
1465     {
1466       saved_input = from;
1467       saved_input_len = fromend - from;
1468     }
1469   else
1470     saved_input = NULL;
1471 
1472   return to - tostart;
1473 }
1474