1 /* This is the Assembler Pre-Processor 2 Copyright (C) 1987-2018 Free Software Foundation, Inc. 3 4 This file is part of GAS, the GNU Assembler. 5 6 GAS is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3, or (at your option) 9 any later version. 10 11 GAS is distributed in the hope that it will be useful, but WITHOUT 12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 13 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 14 License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GAS; see the file COPYING. If not, write to the Free 18 Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA 19 02110-1301, USA. */ 20 21 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90. */ 22 /* App, the assembler pre-processor. This pre-processor strips out 23 excess spaces, turns single-quoted characters into a decimal 24 constant, and turns the # in # <number> <filename> <garbage> into a 25 .linefile. This needs better error-handling. */ 26 27 #include "as.h" 28 29 #if (__STDC__ != 1) 30 #ifndef const 31 #define const /* empty */ 32 #endif 33 #endif 34 35 #ifdef H_TICK_HEX 36 int enable_h_tick_hex = 0; 37 #endif 38 39 #ifdef TC_M68K 40 /* Whether we are scrubbing in m68k MRI mode. This is different from 41 flag_m68k_mri, because the two flags will be affected by the .mri 42 pseudo-op at different times. */ 43 static int scrub_m68k_mri; 44 45 /* The pseudo-op which switches in and out of MRI mode. See the 46 comment in do_scrub_chars. */ 47 static const char mri_pseudo[] = ".mri 0"; 48 #else 49 #define scrub_m68k_mri 0 50 #endif 51 52 #if defined TC_ARM && defined OBJ_ELF 53 /* The pseudo-op for which we need to special-case `@' characters. 54 See the comment in do_scrub_chars. */ 55 static const char symver_pseudo[] = ".symver"; 56 static const char * symver_state; 57 #endif 58 #ifdef TC_ARM 59 static char last_char; 60 #endif 61 62 static char lex[256]; 63 static const char symbol_chars[] = 64 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; 65 66 #define LEX_IS_SYMBOL_COMPONENT 1 67 #define LEX_IS_WHITESPACE 2 68 #define LEX_IS_LINE_SEPARATOR 3 69 #define LEX_IS_COMMENT_START 4 70 #define LEX_IS_LINE_COMMENT_START 5 71 #define LEX_IS_TWOCHAR_COMMENT_1ST 6 72 #define LEX_IS_STRINGQUOTE 8 73 #define LEX_IS_COLON 9 74 #define LEX_IS_NEWLINE 10 75 #define LEX_IS_ONECHAR_QUOTE 11 76 #ifdef TC_V850 77 #define LEX_IS_DOUBLEDASH_1ST 12 78 #endif 79 #ifdef TC_M32R 80 #define DOUBLEBAR_PARALLEL 81 #endif 82 #ifdef DOUBLEBAR_PARALLEL 83 #define LEX_IS_DOUBLEBAR_1ST 13 84 #endif 85 #define LEX_IS_PARALLEL_SEPARATOR 14 86 #ifdef H_TICK_HEX 87 #define LEX_IS_H 15 88 #endif 89 #define IS_SYMBOL_COMPONENT(c) (lex[c] == LEX_IS_SYMBOL_COMPONENT) 90 #define IS_WHITESPACE(c) (lex[c] == LEX_IS_WHITESPACE) 91 #define IS_LINE_SEPARATOR(c) (lex[c] == LEX_IS_LINE_SEPARATOR) 92 #define IS_PARALLEL_SEPARATOR(c) (lex[c] == LEX_IS_PARALLEL_SEPARATOR) 93 #define IS_COMMENT(c) (lex[c] == LEX_IS_COMMENT_START) 94 #define IS_LINE_COMMENT(c) (lex[c] == LEX_IS_LINE_COMMENT_START) 95 #define IS_NEWLINE(c) (lex[c] == LEX_IS_NEWLINE) 96 97 static int process_escape (int); 98 99 /* FIXME-soon: The entire lexer/parser thingy should be 100 built statically at compile time rather than dynamically 101 each and every time the assembler is run. xoxorich. */ 102 103 void 104 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED) 105 { 106 const char *p; 107 int c; 108 109 lex[' '] = LEX_IS_WHITESPACE; 110 lex['\t'] = LEX_IS_WHITESPACE; 111 lex['\r'] = LEX_IS_WHITESPACE; 112 lex['\n'] = LEX_IS_NEWLINE; 113 lex[':'] = LEX_IS_COLON; 114 115 #ifdef TC_M68K 116 scrub_m68k_mri = m68k_mri; 117 118 if (! m68k_mri) 119 #endif 120 { 121 lex['"'] = LEX_IS_STRINGQUOTE; 122 123 #if ! defined (TC_HPPA) 124 lex['\''] = LEX_IS_ONECHAR_QUOTE; 125 #endif 126 127 #ifdef SINGLE_QUOTE_STRINGS 128 lex['\''] = LEX_IS_STRINGQUOTE; 129 #endif 130 } 131 132 /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop 133 in state 5 of do_scrub_chars must be changed. */ 134 135 /* Note that these override the previous defaults, e.g. if ';' is a 136 comment char, then it isn't a line separator. */ 137 for (p = symbol_chars; *p; ++p) 138 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT; 139 140 for (c = 128; c < 256; ++c) 141 lex[c] = LEX_IS_SYMBOL_COMPONENT; 142 143 #ifdef tc_symbol_chars 144 /* This macro permits the processor to specify all characters which 145 may appears in an operand. This will prevent the scrubber from 146 discarding meaningful whitespace in certain cases. The i386 147 backend uses this to support prefixes, which can confuse the 148 scrubber as to whether it is parsing operands or opcodes. */ 149 for (p = tc_symbol_chars; *p; ++p) 150 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT; 151 #endif 152 153 /* The m68k backend wants to be able to change comment_chars. */ 154 #ifndef tc_comment_chars 155 #define tc_comment_chars comment_chars 156 #endif 157 for (p = tc_comment_chars; *p; p++) 158 lex[(unsigned char) *p] = LEX_IS_COMMENT_START; 159 160 for (p = line_comment_chars; *p; p++) 161 lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START; 162 163 #ifndef tc_line_separator_chars 164 #define tc_line_separator_chars line_separator_chars 165 #endif 166 for (p = tc_line_separator_chars; *p; p++) 167 lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR; 168 169 #ifdef tc_parallel_separator_chars 170 /* This macro permits the processor to specify all characters which 171 separate parallel insns on the same line. */ 172 for (p = tc_parallel_separator_chars; *p; p++) 173 lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR; 174 #endif 175 176 /* Only allow slash-star comments if slash is not in use. 177 FIXME: This isn't right. We should always permit them. */ 178 if (lex['/'] == 0) 179 lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST; 180 181 #ifdef TC_M68K 182 if (m68k_mri) 183 { 184 lex['\''] = LEX_IS_STRINGQUOTE; 185 lex[';'] = LEX_IS_COMMENT_START; 186 lex['*'] = LEX_IS_LINE_COMMENT_START; 187 /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but 188 then it can't be used in an expression. */ 189 lex['!'] = LEX_IS_LINE_COMMENT_START; 190 } 191 #endif 192 193 #ifdef TC_V850 194 lex['-'] = LEX_IS_DOUBLEDASH_1ST; 195 #endif 196 #ifdef DOUBLEBAR_PARALLEL 197 lex['|'] = LEX_IS_DOUBLEBAR_1ST; 198 #endif 199 #ifdef TC_D30V 200 /* Must do this is we want VLIW instruction with "->" or "<-". */ 201 lex['-'] = LEX_IS_SYMBOL_COMPONENT; 202 #endif 203 204 #ifdef H_TICK_HEX 205 if (enable_h_tick_hex) 206 { 207 lex['h'] = LEX_IS_H; 208 lex['H'] = LEX_IS_H; 209 } 210 #endif 211 } 212 213 /* Saved state of the scrubber. */ 214 static int state; 215 static int old_state; 216 static const char *out_string; 217 static char out_buf[20]; 218 static int add_newlines; 219 static char *saved_input; 220 static size_t saved_input_len; 221 static char input_buffer[32 * 1024]; 222 static const char *mri_state; 223 static char mri_last_ch; 224 225 /* Data structure for saving the state of app across #include's. Note that 226 app is called asynchronously to the parsing of the .include's, so our 227 state at the time .include is interpreted is completely unrelated. 228 That's why we have to save it all. */ 229 230 struct app_save 231 { 232 int state; 233 int old_state; 234 const char * out_string; 235 char out_buf[sizeof (out_buf)]; 236 int add_newlines; 237 char * saved_input; 238 size_t saved_input_len; 239 #ifdef TC_M68K 240 int scrub_m68k_mri; 241 #endif 242 const char * mri_state; 243 char mri_last_ch; 244 #if defined TC_ARM && defined OBJ_ELF 245 const char * symver_state; 246 #endif 247 #ifdef TC_ARM 248 char last_char; 249 #endif 250 }; 251 252 char * 253 app_push (void) 254 { 255 struct app_save *saved; 256 257 saved = XNEW (struct app_save); 258 saved->state = state; 259 saved->old_state = old_state; 260 saved->out_string = out_string; 261 memcpy (saved->out_buf, out_buf, sizeof (out_buf)); 262 saved->add_newlines = add_newlines; 263 if (saved_input == NULL) 264 saved->saved_input = NULL; 265 else 266 { 267 saved->saved_input = XNEWVEC (char, saved_input_len); 268 memcpy (saved->saved_input, saved_input, saved_input_len); 269 saved->saved_input_len = saved_input_len; 270 } 271 #ifdef TC_M68K 272 saved->scrub_m68k_mri = scrub_m68k_mri; 273 #endif 274 saved->mri_state = mri_state; 275 saved->mri_last_ch = mri_last_ch; 276 #if defined TC_ARM && defined OBJ_ELF 277 saved->symver_state = symver_state; 278 #endif 279 #ifdef TC_ARM 280 saved->last_char = last_char; 281 #endif 282 283 /* do_scrub_begin() is not useful, just wastes time. */ 284 285 state = 0; 286 saved_input = NULL; 287 add_newlines = 0; 288 289 return (char *) saved; 290 } 291 292 void 293 app_pop (char *arg) 294 { 295 struct app_save *saved = (struct app_save *) arg; 296 297 /* There is no do_scrub_end (). */ 298 state = saved->state; 299 old_state = saved->old_state; 300 out_string = saved->out_string; 301 memcpy (out_buf, saved->out_buf, sizeof (out_buf)); 302 add_newlines = saved->add_newlines; 303 if (saved->saved_input == NULL) 304 saved_input = NULL; 305 else 306 { 307 gas_assert (saved->saved_input_len <= sizeof (input_buffer)); 308 memcpy (input_buffer, saved->saved_input, saved->saved_input_len); 309 saved_input = input_buffer; 310 saved_input_len = saved->saved_input_len; 311 free (saved->saved_input); 312 } 313 #ifdef TC_M68K 314 scrub_m68k_mri = saved->scrub_m68k_mri; 315 #endif 316 mri_state = saved->mri_state; 317 mri_last_ch = saved->mri_last_ch; 318 #if defined TC_ARM && defined OBJ_ELF 319 symver_state = saved->symver_state; 320 #endif 321 #ifdef TC_ARM 322 last_char = saved->last_char; 323 #endif 324 325 free (arg); 326 } 327 328 /* @@ This assumes that \n &c are the same on host and target. This is not 329 necessarily true. */ 330 331 static int 332 process_escape (int ch) 333 { 334 switch (ch) 335 { 336 case 'b': 337 return '\b'; 338 case 'f': 339 return '\f'; 340 case 'n': 341 return '\n'; 342 case 'r': 343 return '\r'; 344 case 't': 345 return '\t'; 346 case '\'': 347 return '\''; 348 case '"': 349 return '\"'; 350 default: 351 return ch; 352 } 353 } 354 355 /* This function is called to process input characters. The GET 356 parameter is used to retrieve more input characters. GET should 357 set its parameter to point to a buffer, and return the length of 358 the buffer; it should return 0 at end of file. The scrubbed output 359 characters are put into the buffer starting at TOSTART; the TOSTART 360 buffer is TOLEN bytes in length. The function returns the number 361 of scrubbed characters put into TOSTART. This will be TOLEN unless 362 end of file was seen. This function is arranged as a state 363 machine, and saves its state so that it may return at any point. 364 This is the way the old code used to work. */ 365 366 size_t 367 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen) 368 { 369 char *to = tostart; 370 char *toend = tostart + tolen; 371 char *from; 372 char *fromend; 373 size_t fromlen; 374 int ch, ch2 = 0; 375 /* Character that started the string we're working on. */ 376 static char quotechar; 377 378 /*State 0: beginning of normal line 379 1: After first whitespace on line (flush more white) 380 2: After first non-white (opcode) on line (keep 1white) 381 3: after second white on line (into operands) (flush white) 382 4: after putting out a .linefile, put out digits 383 5: parsing a string, then go to old-state 384 6: putting out \ escape in a "d string. 385 7: no longer used 386 8: no longer used 387 9: After seeing symbol char in state 3 (keep 1white after symchar) 388 10: After seeing whitespace in state 9 (keep white before symchar) 389 11: After seeing a symbol character in state 0 (eg a label definition) 390 -1: output string in out_string and go to the state in old_state 391 -2: flush text until a '*' '/' is seen, then go to state old_state 392 #ifdef TC_V850 393 12: After seeing a dash, looking for a second dash as a start 394 of comment. 395 #endif 396 #ifdef DOUBLEBAR_PARALLEL 397 13: After seeing a vertical bar, looking for a second 398 vertical bar as a parallel expression separator. 399 #endif 400 #ifdef TC_PREDICATE_START_CHAR 401 14: After seeing a predicate start character at state 0, looking 402 for a predicate end character as predicate. 403 15: After seeing a predicate start character at state 1, looking 404 for a predicate end character as predicate. 405 #endif 406 #ifdef TC_Z80 407 16: After seeing an 'a' or an 'A' at the start of a symbol 408 17: After seeing an 'f' or an 'F' in state 16 409 #endif 410 */ 411 412 /* I added states 9 and 10 because the MIPS ECOFF assembler uses 413 constructs like ``.loc 1 20''. This was turning into ``.loc 414 120''. States 9 and 10 ensure that a space is never dropped in 415 between characters which could appear in an identifier. Ian 416 Taylor, ian@cygnus.com. 417 418 I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works 419 correctly on the PA (and any other target where colons are optional). 420 Jeff Law, law@cs.utah.edu. 421 422 I added state 13 so that something like "cmp r1, r2 || trap #1" does not 423 get squashed into "cmp r1,r2||trap#1", with the all important space 424 between the 'trap' and the '#1' being eliminated. nickc@cygnus.com */ 425 426 /* This macro gets the next input character. */ 427 428 #define GET() \ 429 (from < fromend \ 430 ? * (unsigned char *) (from++) \ 431 : (saved_input = NULL, \ 432 fromlen = (*get) (input_buffer, sizeof input_buffer), \ 433 from = input_buffer, \ 434 fromend = from + fromlen, \ 435 (fromlen == 0 \ 436 ? EOF \ 437 : * (unsigned char *) (from++)))) 438 439 /* This macro pushes a character back on the input stream. */ 440 441 #define UNGET(uch) (*--from = (uch)) 442 443 /* This macro puts a character into the output buffer. If this 444 character fills the output buffer, this macro jumps to the label 445 TOFULL. We use this rather ugly approach because we need to 446 handle two different termination conditions: EOF on the input 447 stream, and a full output buffer. It would be simpler if we 448 always read in the entire input stream before processing it, but 449 I don't want to make such a significant change to the assembler's 450 memory usage. */ 451 452 #define PUT(pch) \ 453 do \ 454 { \ 455 *to++ = (pch); \ 456 if (to >= toend) \ 457 goto tofull; \ 458 } \ 459 while (0) 460 461 if (saved_input != NULL) 462 { 463 from = saved_input; 464 fromend = from + saved_input_len; 465 } 466 else 467 { 468 fromlen = (*get) (input_buffer, sizeof input_buffer); 469 if (fromlen == 0) 470 return 0; 471 from = input_buffer; 472 fromend = from + fromlen; 473 } 474 475 while (1) 476 { 477 /* The cases in this switch end with continue, in order to 478 branch back to the top of this while loop and generate the 479 next output character in the appropriate state. */ 480 switch (state) 481 { 482 case -1: 483 ch = *out_string++; 484 if (*out_string == '\0') 485 { 486 state = old_state; 487 old_state = 3; 488 } 489 PUT (ch); 490 continue; 491 492 case -2: 493 for (;;) 494 { 495 do 496 { 497 ch = GET (); 498 499 if (ch == EOF) 500 { 501 as_warn (_("end of file in comment")); 502 goto fromeof; 503 } 504 505 if (ch == '\n') 506 PUT ('\n'); 507 } 508 while (ch != '*'); 509 510 while ((ch = GET ()) == '*') 511 ; 512 513 if (ch == EOF) 514 { 515 as_warn (_("end of file in comment")); 516 goto fromeof; 517 } 518 519 if (ch == '/') 520 break; 521 522 UNGET (ch); 523 } 524 525 state = old_state; 526 UNGET (' '); 527 continue; 528 529 case 4: 530 ch = GET (); 531 if (ch == EOF) 532 goto fromeof; 533 else if (ch >= '0' && ch <= '9') 534 PUT (ch); 535 else 536 { 537 while (ch != EOF && IS_WHITESPACE (ch)) 538 ch = GET (); 539 if (ch == '"') 540 { 541 quotechar = ch; 542 state = 5; 543 old_state = 3; 544 PUT (ch); 545 } 546 else 547 { 548 while (ch != EOF && ch != '\n') 549 ch = GET (); 550 state = 0; 551 PUT (ch); 552 } 553 } 554 continue; 555 556 case 5: 557 /* We are going to copy everything up to a quote character, 558 with special handling for a backslash. We try to 559 optimize the copying in the simple case without using the 560 GET and PUT macros. */ 561 { 562 char *s; 563 ptrdiff_t len; 564 565 for (s = from; s < fromend; s++) 566 { 567 ch = *s; 568 if (ch == '\\' 569 || ch == quotechar 570 || ch == '\n') 571 break; 572 } 573 len = s - from; 574 if (len > toend - to) 575 len = toend - to; 576 if (len > 0) 577 { 578 memcpy (to, from, len); 579 to += len; 580 from += len; 581 if (to >= toend) 582 goto tofull; 583 } 584 } 585 586 ch = GET (); 587 if (ch == EOF) 588 { 589 /* This buffer is here specifically so 590 that the UNGET below will work. */ 591 static char one_char_buf[1]; 592 593 as_warn (_("end of file in string; '%c' inserted"), quotechar); 594 state = old_state; 595 from = fromend = one_char_buf + 1; 596 fromlen = 1; 597 UNGET ('\n'); 598 PUT (quotechar); 599 } 600 else if (ch == quotechar) 601 { 602 state = old_state; 603 PUT (ch); 604 } 605 #ifndef NO_STRING_ESCAPES 606 else if (ch == '\\') 607 { 608 state = 6; 609 PUT (ch); 610 } 611 #endif 612 else if (scrub_m68k_mri && ch == '\n') 613 { 614 /* Just quietly terminate the string. This permits lines like 615 bne label loop if we haven't reach end yet. */ 616 state = old_state; 617 UNGET (ch); 618 PUT ('\''); 619 } 620 else 621 { 622 PUT (ch); 623 } 624 continue; 625 626 case 6: 627 state = 5; 628 ch = GET (); 629 switch (ch) 630 { 631 /* Handle strings broken across lines, by turning '\n' into 632 '\\' and 'n'. */ 633 case '\n': 634 UNGET ('n'); 635 add_newlines++; 636 PUT ('\\'); 637 continue; 638 639 case EOF: 640 as_warn (_("end of file in string; '%c' inserted"), quotechar); 641 PUT (quotechar); 642 continue; 643 644 case '"': 645 case '\\': 646 case 'b': 647 case 'f': 648 case 'n': 649 case 'r': 650 case 't': 651 case 'v': 652 case 'x': 653 case 'X': 654 case '0': 655 case '1': 656 case '2': 657 case '3': 658 case '4': 659 case '5': 660 case '6': 661 case '7': 662 break; 663 664 default: 665 #ifdef ONLY_STANDARD_ESCAPES 666 as_warn (_("unknown escape '\\%c' in string; ignored"), ch); 667 #endif 668 break; 669 } 670 PUT (ch); 671 continue; 672 673 #ifdef DOUBLEBAR_PARALLEL 674 case 13: 675 ch = GET (); 676 if (ch != '|') 677 abort (); 678 679 /* Reset back to state 1 and pretend that we are parsing a 680 line from just after the first white space. */ 681 state = 1; 682 PUT ('|'); 683 #ifdef TC_TIC6X 684 /* "||^" is used for SPMASKed instructions. */ 685 ch = GET (); 686 if (ch == EOF) 687 goto fromeof; 688 else if (ch == '^') 689 PUT ('^'); 690 else 691 UNGET (ch); 692 #endif 693 continue; 694 #endif 695 #ifdef TC_Z80 696 case 16: 697 /* We have seen an 'a' at the start of a symbol, look for an 'f'. */ 698 ch = GET (); 699 if (ch == 'f' || ch == 'F') 700 { 701 state = 17; 702 PUT (ch); 703 } 704 else 705 { 706 state = 9; 707 break; 708 } 709 /* Fall through. */ 710 case 17: 711 /* We have seen "af" at the start of a symbol, 712 a ' here is a part of that symbol. */ 713 ch = GET (); 714 state = 9; 715 if (ch == '\'') 716 /* Change to avoid warning about unclosed string. */ 717 PUT ('`'); 718 else if (ch != EOF) 719 UNGET (ch); 720 break; 721 #endif 722 } 723 724 /* OK, we are somewhere in states 0 through 4 or 9 through 11. */ 725 726 /* flushchar: */ 727 ch = GET (); 728 729 #ifdef TC_PREDICATE_START_CHAR 730 if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1)) 731 { 732 state += 14; 733 PUT (ch); 734 continue; 735 } 736 else if (state == 14 || state == 15) 737 { 738 if (ch == TC_PREDICATE_END_CHAR) 739 { 740 state -= 14; 741 PUT (ch); 742 ch = GET (); 743 } 744 else 745 { 746 PUT (ch); 747 continue; 748 } 749 } 750 #endif 751 752 recycle: 753 754 #if defined TC_ARM && defined OBJ_ELF 755 /* We need to watch out for .symver directives. See the comment later 756 in this function. */ 757 if (symver_state == NULL) 758 { 759 if ((state == 0 || state == 1) && ch == symver_pseudo[0]) 760 symver_state = symver_pseudo + 1; 761 } 762 else 763 { 764 /* We advance to the next state if we find the right 765 character. */ 766 if (ch != '\0' && (*symver_state == ch)) 767 ++symver_state; 768 else if (*symver_state != '\0') 769 /* We did not get the expected character, or we didn't 770 get a valid terminating character after seeing the 771 entire pseudo-op, so we must go back to the beginning. */ 772 symver_state = NULL; 773 else 774 { 775 /* We've read the entire pseudo-op. If this is the end 776 of the line, go back to the beginning. */ 777 if (IS_NEWLINE (ch)) 778 symver_state = NULL; 779 } 780 } 781 #endif /* TC_ARM && OBJ_ELF */ 782 783 #ifdef TC_M68K 784 /* We want to have pseudo-ops which control whether we are in 785 MRI mode or not. Unfortunately, since m68k MRI mode affects 786 the scrubber, that means that we need a special purpose 787 recognizer here. */ 788 if (mri_state == NULL) 789 { 790 if ((state == 0 || state == 1) 791 && ch == mri_pseudo[0]) 792 mri_state = mri_pseudo + 1; 793 } 794 else 795 { 796 /* We advance to the next state if we find the right 797 character, or if we need a space character and we get any 798 whitespace character, or if we need a '0' and we get a 799 '1' (this is so that we only need one state to handle 800 ``.mri 0'' and ``.mri 1''). */ 801 if (ch != '\0' 802 && (*mri_state == ch 803 || (*mri_state == ' ' 804 && lex[ch] == LEX_IS_WHITESPACE) 805 || (*mri_state == '0' 806 && ch == '1'))) 807 { 808 mri_last_ch = ch; 809 ++mri_state; 810 } 811 else if (*mri_state != '\0' 812 || (lex[ch] != LEX_IS_WHITESPACE 813 && lex[ch] != LEX_IS_NEWLINE)) 814 { 815 /* We did not get the expected character, or we didn't 816 get a valid terminating character after seeing the 817 entire pseudo-op, so we must go back to the 818 beginning. */ 819 mri_state = NULL; 820 } 821 else 822 { 823 /* We've read the entire pseudo-op. mips_last_ch is 824 either '0' or '1' indicating whether to enter or 825 leave MRI mode. */ 826 do_scrub_begin (mri_last_ch == '1'); 827 mri_state = NULL; 828 829 /* We continue handling the character as usual. The 830 main gas reader must also handle the .mri pseudo-op 831 to control expression parsing and the like. */ 832 } 833 } 834 #endif 835 836 if (ch == EOF) 837 { 838 if (state != 0) 839 { 840 as_warn (_("end of file not at end of a line; newline inserted")); 841 state = 0; 842 PUT ('\n'); 843 } 844 goto fromeof; 845 } 846 847 switch (lex[ch]) 848 { 849 case LEX_IS_WHITESPACE: 850 do 851 { 852 ch = GET (); 853 } 854 while (ch != EOF && IS_WHITESPACE (ch)); 855 if (ch == EOF) 856 goto fromeof; 857 858 if (state == 0) 859 { 860 /* Preserve a single whitespace character at the 861 beginning of a line. */ 862 state = 1; 863 UNGET (ch); 864 PUT (' '); 865 break; 866 } 867 868 #ifdef KEEP_WHITE_AROUND_COLON 869 if (lex[ch] == LEX_IS_COLON) 870 { 871 /* Only keep this white if there's no white *after* the 872 colon. */ 873 ch2 = GET (); 874 if (ch2 != EOF) 875 UNGET (ch2); 876 if (!IS_WHITESPACE (ch2)) 877 { 878 state = 9; 879 UNGET (ch); 880 PUT (' '); 881 break; 882 } 883 } 884 #endif 885 if (IS_COMMENT (ch) 886 || ch == '/' 887 || IS_LINE_SEPARATOR (ch) 888 || IS_PARALLEL_SEPARATOR (ch)) 889 { 890 if (scrub_m68k_mri) 891 { 892 /* In MRI mode, we keep these spaces. */ 893 UNGET (ch); 894 PUT (' '); 895 break; 896 } 897 goto recycle; 898 } 899 900 /* If we're in state 2 or 11, we've seen a non-white 901 character followed by whitespace. If the next character 902 is ':', this is whitespace after a label name which we 903 normally must ignore. In MRI mode, though, spaces are 904 not permitted between the label and the colon. */ 905 if ((state == 2 || state == 11) 906 && lex[ch] == LEX_IS_COLON 907 && ! scrub_m68k_mri) 908 { 909 state = 1; 910 PUT (ch); 911 break; 912 } 913 914 switch (state) 915 { 916 case 1: 917 /* We can arrive here if we leave a leading whitespace 918 character at the beginning of a line. */ 919 goto recycle; 920 case 2: 921 state = 3; 922 if (to + 1 < toend) 923 { 924 /* Optimize common case by skipping UNGET/GET. */ 925 PUT (' '); /* Sp after opco */ 926 goto recycle; 927 } 928 UNGET (ch); 929 PUT (' '); 930 break; 931 case 3: 932 #ifndef TC_KEEP_OPERAND_SPACES 933 /* For TI C6X, we keep these spaces as they may separate 934 functional unit specifiers from operands. */ 935 if (scrub_m68k_mri) 936 #endif 937 { 938 /* In MRI mode, we keep these spaces. */ 939 UNGET (ch); 940 PUT (' '); 941 break; 942 } 943 goto recycle; /* Sp in operands */ 944 case 9: 945 case 10: 946 #ifndef TC_KEEP_OPERAND_SPACES 947 if (scrub_m68k_mri) 948 #endif 949 { 950 /* In MRI mode, we keep these spaces. */ 951 state = 3; 952 UNGET (ch); 953 PUT (' '); 954 break; 955 } 956 state = 10; /* Sp after symbol char */ 957 goto recycle; 958 case 11: 959 if (LABELS_WITHOUT_COLONS || flag_m68k_mri) 960 state = 1; 961 else 962 { 963 /* We know that ch is not ':', since we tested that 964 case above. Therefore this is not a label, so it 965 must be the opcode, and we've just seen the 966 whitespace after it. */ 967 state = 3; 968 } 969 UNGET (ch); 970 PUT (' '); /* Sp after label definition. */ 971 break; 972 default: 973 BAD_CASE (state); 974 } 975 break; 976 977 case LEX_IS_TWOCHAR_COMMENT_1ST: 978 ch2 = GET (); 979 if (ch2 == '*') 980 { 981 for (;;) 982 { 983 do 984 { 985 ch2 = GET (); 986 if (ch2 != EOF && IS_NEWLINE (ch2)) 987 add_newlines++; 988 } 989 while (ch2 != EOF && ch2 != '*'); 990 991 while (ch2 == '*') 992 ch2 = GET (); 993 994 if (ch2 == EOF || ch2 == '/') 995 break; 996 997 /* This UNGET will ensure that we count newlines 998 correctly. */ 999 UNGET (ch2); 1000 } 1001 1002 if (ch2 == EOF) 1003 as_warn (_("end of file in multiline comment")); 1004 1005 ch = ' '; 1006 goto recycle; 1007 } 1008 #ifdef DOUBLESLASH_LINE_COMMENTS 1009 else if (ch2 == '/') 1010 { 1011 do 1012 { 1013 ch = GET (); 1014 } 1015 while (ch != EOF && !IS_NEWLINE (ch)); 1016 if (ch == EOF) 1017 as_warn ("end of file in comment; newline inserted"); 1018 state = 0; 1019 PUT ('\n'); 1020 break; 1021 } 1022 #endif 1023 else 1024 { 1025 if (ch2 != EOF) 1026 UNGET (ch2); 1027 if (state == 9 || state == 10) 1028 state = 3; 1029 PUT (ch); 1030 } 1031 break; 1032 1033 case LEX_IS_STRINGQUOTE: 1034 quotechar = ch; 1035 if (state == 10) 1036 { 1037 /* Preserve the whitespace in foo "bar". */ 1038 UNGET (ch); 1039 state = 3; 1040 PUT (' '); 1041 1042 /* PUT didn't jump out. We could just break, but we 1043 know what will happen, so optimize a bit. */ 1044 ch = GET (); 1045 old_state = 3; 1046 } 1047 else if (state == 9) 1048 old_state = 3; 1049 else 1050 old_state = state; 1051 state = 5; 1052 PUT (ch); 1053 break; 1054 1055 case LEX_IS_ONECHAR_QUOTE: 1056 #ifdef H_TICK_HEX 1057 if (state == 9 && enable_h_tick_hex) 1058 { 1059 char c; 1060 1061 c = GET (); 1062 as_warn ("'%c found after symbol", c); 1063 UNGET (c); 1064 } 1065 #endif 1066 if (state == 10) 1067 { 1068 /* Preserve the whitespace in foo 'b'. */ 1069 UNGET (ch); 1070 state = 3; 1071 PUT (' '); 1072 break; 1073 } 1074 ch = GET (); 1075 if (ch == EOF) 1076 { 1077 as_warn (_("end of file after a one-character quote; \\0 inserted")); 1078 ch = 0; 1079 } 1080 if (ch == '\\') 1081 { 1082 ch = GET (); 1083 if (ch == EOF) 1084 { 1085 as_warn (_("end of file in escape character")); 1086 ch = '\\'; 1087 } 1088 else 1089 ch = process_escape (ch); 1090 } 1091 sprintf (out_buf, "%d", (int) (unsigned char) ch); 1092 1093 /* None of these 'x constants for us. We want 'x'. */ 1094 if ((ch = GET ()) != '\'') 1095 { 1096 #ifdef REQUIRE_CHAR_CLOSE_QUOTE 1097 as_warn (_("missing close quote; (assumed)")); 1098 #else 1099 if (ch != EOF) 1100 UNGET (ch); 1101 #endif 1102 } 1103 if (strlen (out_buf) == 1) 1104 { 1105 PUT (out_buf[0]); 1106 break; 1107 } 1108 if (state == 9) 1109 old_state = 3; 1110 else 1111 old_state = state; 1112 state = -1; 1113 out_string = out_buf; 1114 PUT (*out_string++); 1115 break; 1116 1117 case LEX_IS_COLON: 1118 #ifdef KEEP_WHITE_AROUND_COLON 1119 state = 9; 1120 #else 1121 if (state == 9 || state == 10) 1122 state = 3; 1123 else if (state != 3) 1124 state = 1; 1125 #endif 1126 PUT (ch); 1127 break; 1128 1129 case LEX_IS_NEWLINE: 1130 /* Roll out a bunch of newlines from inside comments, etc. */ 1131 if (add_newlines) 1132 { 1133 --add_newlines; 1134 UNGET (ch); 1135 } 1136 /* Fall through. */ 1137 1138 case LEX_IS_LINE_SEPARATOR: 1139 state = 0; 1140 PUT (ch); 1141 break; 1142 1143 case LEX_IS_PARALLEL_SEPARATOR: 1144 state = 1; 1145 PUT (ch); 1146 break; 1147 1148 #ifdef TC_V850 1149 case LEX_IS_DOUBLEDASH_1ST: 1150 ch2 = GET (); 1151 if (ch2 != '-') 1152 { 1153 if (ch2 != EOF) 1154 UNGET (ch2); 1155 goto de_fault; 1156 } 1157 /* Read and skip to end of line. */ 1158 do 1159 { 1160 ch = GET (); 1161 } 1162 while (ch != EOF && ch != '\n'); 1163 1164 if (ch == EOF) 1165 as_warn (_("end of file in comment; newline inserted")); 1166 1167 state = 0; 1168 PUT ('\n'); 1169 break; 1170 #endif 1171 #ifdef DOUBLEBAR_PARALLEL 1172 case LEX_IS_DOUBLEBAR_1ST: 1173 ch2 = GET (); 1174 if (ch2 != EOF) 1175 UNGET (ch2); 1176 if (ch2 != '|') 1177 goto de_fault; 1178 1179 /* Handle '||' in two states as invoking PUT twice might 1180 result in the first one jumping out of this loop. We'd 1181 then lose track of the state and one '|' char. */ 1182 state = 13; 1183 PUT ('|'); 1184 break; 1185 #endif 1186 case LEX_IS_LINE_COMMENT_START: 1187 /* FIXME-someday: The two character comment stuff was badly 1188 thought out. On i386, we want '/' as line comment start 1189 AND we want C style comments. hence this hack. The 1190 whole lexical process should be reworked. xoxorich. */ 1191 if (ch == '/') 1192 { 1193 ch2 = GET (); 1194 if (ch2 == '*') 1195 { 1196 old_state = 3; 1197 state = -2; 1198 break; 1199 } 1200 else if (ch2 != EOF) 1201 { 1202 UNGET (ch2); 1203 } 1204 } 1205 1206 if (state == 0 || state == 1) /* Only comment at start of line. */ 1207 { 1208 int startch; 1209 1210 startch = ch; 1211 1212 do 1213 { 1214 ch = GET (); 1215 } 1216 while (ch != EOF && IS_WHITESPACE (ch)); 1217 1218 if (ch == EOF) 1219 { 1220 as_warn (_("end of file in comment; newline inserted")); 1221 PUT ('\n'); 1222 break; 1223 } 1224 1225 if (ch < '0' || ch > '9' || state != 0 || startch != '#') 1226 { 1227 /* Not a cpp line. */ 1228 while (ch != EOF && !IS_NEWLINE (ch)) 1229 ch = GET (); 1230 if (ch == EOF) 1231 { 1232 as_warn (_("end of file in comment; newline inserted")); 1233 PUT ('\n'); 1234 } 1235 else /* IS_NEWLINE (ch) */ 1236 { 1237 /* To process non-zero add_newlines. */ 1238 UNGET (ch); 1239 } 1240 state = 0; 1241 break; 1242 } 1243 /* Looks like `# 123 "filename"' from cpp. */ 1244 UNGET (ch); 1245 old_state = 4; 1246 state = -1; 1247 if (scrub_m68k_mri) 1248 out_string = "\tlinefile "; 1249 else 1250 out_string = "\t.linefile "; 1251 PUT (*out_string++); 1252 break; 1253 } 1254 1255 #ifdef TC_D10V 1256 /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true. 1257 Trap is the only short insn that has a first operand that is 1258 neither register nor label. 1259 We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 . 1260 We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is 1261 already LEX_IS_LINE_COMMENT_START. However, it is the 1262 only character in line_comment_chars for d10v, hence we 1263 can recognize it as such. */ 1264 /* An alternative approach would be to reset the state to 1 when 1265 we see '||', '<'- or '->', but that seems to be overkill. */ 1266 if (state == 10) 1267 PUT (' '); 1268 #endif 1269 /* We have a line comment character which is not at the 1270 start of a line. If this is also a normal comment 1271 character, fall through. Otherwise treat it as a default 1272 character. */ 1273 if (strchr (tc_comment_chars, ch) == NULL 1274 && (! scrub_m68k_mri 1275 || (ch != '!' && ch != '*'))) 1276 goto de_fault; 1277 if (scrub_m68k_mri 1278 && (ch == '!' || ch == '*' || ch == '#') 1279 && state != 1 1280 && state != 10) 1281 goto de_fault; 1282 /* Fall through. */ 1283 case LEX_IS_COMMENT_START: 1284 #if defined TC_ARM && defined OBJ_ELF 1285 /* On the ARM, `@' is the comment character. 1286 Unfortunately this is also a special character in ELF .symver 1287 directives (and .type, though we deal with those another way). 1288 So we check if this line is such a directive, and treat 1289 the character as default if so. This is a hack. */ 1290 if ((symver_state != NULL) && (*symver_state == 0)) 1291 goto de_fault; 1292 #endif 1293 1294 #ifdef TC_ARM 1295 /* For the ARM, care is needed not to damage occurrences of \@ 1296 by stripping the @ onwards. Yuck. */ 1297 if ((to > tostart ? to[-1] : last_char) == '\\') 1298 /* Do not treat the @ as a start-of-comment. */ 1299 goto de_fault; 1300 #endif 1301 1302 #ifdef WARN_COMMENTS 1303 if (!found_comment) 1304 found_comment_file = as_where (&found_comment); 1305 #endif 1306 do 1307 { 1308 ch = GET (); 1309 } 1310 while (ch != EOF && !IS_NEWLINE (ch)); 1311 if (ch == EOF) 1312 as_warn (_("end of file in comment; newline inserted")); 1313 state = 0; 1314 PUT ('\n'); 1315 break; 1316 1317 #ifdef H_TICK_HEX 1318 case LEX_IS_H: 1319 /* Look for strings like H'[0-9A-Fa-f] and if found, replace 1320 the H' with 0x to make them gas-style hex characters. */ 1321 if (enable_h_tick_hex) 1322 { 1323 char quot; 1324 1325 quot = GET (); 1326 if (quot == '\'') 1327 { 1328 UNGET ('x'); 1329 ch = '0'; 1330 } 1331 else 1332 UNGET (quot); 1333 } 1334 #endif 1335 /* Fall through. */ 1336 1337 case LEX_IS_SYMBOL_COMPONENT: 1338 if (state == 10) 1339 { 1340 /* This is a symbol character following another symbol 1341 character, with whitespace in between. We skipped 1342 the whitespace earlier, so output it now. */ 1343 UNGET (ch); 1344 state = 3; 1345 PUT (' '); 1346 break; 1347 } 1348 1349 #ifdef TC_Z80 1350 /* "af'" is a symbol containing '\''. */ 1351 if (state == 3 && (ch == 'a' || ch == 'A')) 1352 { 1353 state = 16; 1354 PUT (ch); 1355 ch = GET (); 1356 if (ch == 'f' || ch == 'F') 1357 { 1358 state = 17; 1359 PUT (ch); 1360 break; 1361 } 1362 else 1363 { 1364 state = 9; 1365 if (ch == EOF || !IS_SYMBOL_COMPONENT (ch)) 1366 { 1367 if (ch != EOF) 1368 UNGET (ch); 1369 break; 1370 } 1371 } 1372 } 1373 #endif 1374 if (state == 3) 1375 state = 9; 1376 1377 /* This is a common case. Quickly copy CH and all the 1378 following symbol component or normal characters. */ 1379 if (to + 1 < toend 1380 && mri_state == NULL 1381 #if defined TC_ARM && defined OBJ_ELF 1382 && symver_state == NULL 1383 #endif 1384 ) 1385 { 1386 char *s; 1387 ptrdiff_t len; 1388 1389 for (s = from; s < fromend; s++) 1390 { 1391 int type; 1392 1393 ch2 = *(unsigned char *) s; 1394 type = lex[ch2]; 1395 if (type != 0 1396 && type != LEX_IS_SYMBOL_COMPONENT) 1397 break; 1398 } 1399 1400 if (s > from) 1401 /* Handle the last character normally, for 1402 simplicity. */ 1403 --s; 1404 1405 len = s - from; 1406 1407 if (len > (toend - to) - 1) 1408 len = (toend - to) - 1; 1409 1410 if (len > 0) 1411 { 1412 PUT (ch); 1413 memcpy (to, from, len); 1414 to += len; 1415 from += len; 1416 if (to >= toend) 1417 goto tofull; 1418 ch = GET (); 1419 } 1420 } 1421 1422 /* Fall through. */ 1423 default: 1424 de_fault: 1425 /* Some relatively `normal' character. */ 1426 if (state == 0) 1427 { 1428 state = 11; /* Now seeing label definition. */ 1429 } 1430 else if (state == 1) 1431 { 1432 state = 2; /* Ditto. */ 1433 } 1434 else if (state == 9) 1435 { 1436 if (!IS_SYMBOL_COMPONENT (ch)) 1437 state = 3; 1438 } 1439 else if (state == 10) 1440 { 1441 if (ch == '\\') 1442 { 1443 /* Special handling for backslash: a backslash may 1444 be the beginning of a formal parameter (of a 1445 macro) following another symbol character, with 1446 whitespace in between. If that is the case, we 1447 output a space before the parameter. Strictly 1448 speaking, correct handling depends upon what the 1449 macro parameter expands into; if the parameter 1450 expands into something which does not start with 1451 an operand character, then we don't want to keep 1452 the space. We don't have enough information to 1453 make the right choice, so here we are making the 1454 choice which is more likely to be correct. */ 1455 if (to + 1 >= toend) 1456 { 1457 /* If we're near the end of the buffer, save the 1458 character for the next time round. Otherwise 1459 we'll lose our state. */ 1460 UNGET (ch); 1461 goto tofull; 1462 } 1463 *to++ = ' '; 1464 } 1465 1466 state = 3; 1467 } 1468 PUT (ch); 1469 break; 1470 } 1471 } 1472 1473 /*NOTREACHED*/ 1474 1475 fromeof: 1476 /* We have reached the end of the input. */ 1477 #ifdef TC_ARM 1478 if (to > tostart) 1479 last_char = to[-1]; 1480 #endif 1481 return to - tostart; 1482 1483 tofull: 1484 /* The output buffer is full. Save any input we have not yet 1485 processed. */ 1486 if (fromend > from) 1487 { 1488 saved_input = from; 1489 saved_input_len = fromend - from; 1490 } 1491 else 1492 saved_input = NULL; 1493 1494 #ifdef TC_ARM 1495 if (to > tostart) 1496 last_char = to[-1]; 1497 #endif 1498 return to - tostart; 1499 } 1500