1 /* This is the Assembler Pre-Processor 2 Copyright (C) 1987-2022 Free Software Foundation, Inc. 3 4 This file is part of GAS, the GNU Assembler. 5 6 GAS is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3, or (at your option) 9 any later version. 10 11 GAS is distributed in the hope that it will be useful, but WITHOUT 12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 13 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 14 License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GAS; see the file COPYING. If not, write to the Free 18 Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA 19 02110-1301, USA. */ 20 21 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90. */ 22 /* App, the assembler pre-processor. This pre-processor strips out 23 excess spaces, turns single-quoted characters into a decimal 24 constant, and turns the # in # <number> <filename> <garbage> into a 25 .linefile. This needs better error-handling. */ 26 27 #include "as.h" 28 29 #if (__STDC__ != 1) 30 #ifndef const 31 #define const /* empty */ 32 #endif 33 #endif 34 35 #ifdef H_TICK_HEX 36 int enable_h_tick_hex = 0; 37 #endif 38 39 #ifdef TC_M68K 40 /* Whether we are scrubbing in m68k MRI mode. This is different from 41 flag_m68k_mri, because the two flags will be affected by the .mri 42 pseudo-op at different times. */ 43 static int scrub_m68k_mri; 44 45 /* The pseudo-op which switches in and out of MRI mode. See the 46 comment in do_scrub_chars. */ 47 static const char mri_pseudo[] = ".mri 0"; 48 #else 49 #define scrub_m68k_mri 0 50 #endif 51 52 #if defined TC_ARM && defined OBJ_ELF 53 /* The pseudo-op for which we need to special-case `@' characters. 54 See the comment in do_scrub_chars. 
*/ 55 static const char symver_pseudo[] = ".symver"; 56 static const char * symver_state; 57 #endif 58 59 static char last_char; 60 61 static char lex[256]; 62 static const char symbol_chars[] = 63 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; 64 65 #define LEX_IS_SYMBOL_COMPONENT 1 66 #define LEX_IS_WHITESPACE 2 67 #define LEX_IS_LINE_SEPARATOR 3 68 #define LEX_IS_COMMENT_START 4 69 #define LEX_IS_LINE_COMMENT_START 5 70 #define LEX_IS_TWOCHAR_COMMENT_1ST 6 71 #define LEX_IS_STRINGQUOTE 8 72 #define LEX_IS_COLON 9 73 #define LEX_IS_NEWLINE 10 74 #define LEX_IS_ONECHAR_QUOTE 11 75 #ifdef TC_V850 76 #define LEX_IS_DOUBLEDASH_1ST 12 77 #endif 78 #ifdef TC_M32R 79 #define DOUBLEBAR_PARALLEL 80 #endif 81 #ifdef DOUBLEBAR_PARALLEL 82 #define LEX_IS_DOUBLEBAR_1ST 13 83 #endif 84 #define LEX_IS_PARALLEL_SEPARATOR 14 85 #ifdef H_TICK_HEX 86 #define LEX_IS_H 15 87 #endif 88 #define IS_SYMBOL_COMPONENT(c) (lex[c] == LEX_IS_SYMBOL_COMPONENT) 89 #define IS_WHITESPACE(c) (lex[c] == LEX_IS_WHITESPACE) 90 #define IS_LINE_SEPARATOR(c) (lex[c] == LEX_IS_LINE_SEPARATOR) 91 #define IS_PARALLEL_SEPARATOR(c) (lex[c] == LEX_IS_PARALLEL_SEPARATOR) 92 #define IS_COMMENT(c) (lex[c] == LEX_IS_COMMENT_START) 93 #define IS_LINE_COMMENT(c) (lex[c] == LEX_IS_LINE_COMMENT_START) 94 #define IS_NEWLINE(c) (lex[c] == LEX_IS_NEWLINE) 95 96 static int process_escape (int); 97 98 /* FIXME-soon: The entire lexer/parser thingy should be 99 built statically at compile time rather than dynamically 100 each and every time the assembler is run. xoxorich. */ 101 102 void 103 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED) 104 { 105 const char *p; 106 int c; 107 108 lex[' '] = LEX_IS_WHITESPACE; 109 lex['\t'] = LEX_IS_WHITESPACE; 110 lex['\r'] = LEX_IS_WHITESPACE; 111 lex['\n'] = LEX_IS_NEWLINE; 112 lex[':'] = LEX_IS_COLON; 113 114 #ifdef TC_M68K 115 scrub_m68k_mri = m68k_mri; 116 117 if (! m68k_mri) 118 #endif 119 { 120 lex['"'] = LEX_IS_STRINGQUOTE; 121 122 #if ! 
defined (TC_HPPA) 123 lex['\''] = LEX_IS_ONECHAR_QUOTE; 124 #endif 125 126 #ifdef SINGLE_QUOTE_STRINGS 127 lex['\''] = LEX_IS_STRINGQUOTE; 128 #endif 129 } 130 131 /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop 132 in state 5 of do_scrub_chars must be changed. */ 133 134 /* Note that these override the previous defaults, e.g. if ';' is a 135 comment char, then it isn't a line separator. */ 136 for (p = symbol_chars; *p; ++p) 137 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT; 138 139 for (c = 128; c < 256; ++c) 140 lex[c] = LEX_IS_SYMBOL_COMPONENT; 141 142 #ifdef tc_symbol_chars 143 /* This macro permits the processor to specify all characters which 144 may appears in an operand. This will prevent the scrubber from 145 discarding meaningful whitespace in certain cases. The i386 146 backend uses this to support prefixes, which can confuse the 147 scrubber as to whether it is parsing operands or opcodes. */ 148 for (p = tc_symbol_chars; *p; ++p) 149 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT; 150 #endif 151 152 /* The m68k backend wants to be able to change comment_chars. */ 153 #ifndef tc_comment_chars 154 #define tc_comment_chars comment_chars 155 #endif 156 for (p = tc_comment_chars; *p; p++) 157 lex[(unsigned char) *p] = LEX_IS_COMMENT_START; 158 159 for (p = line_comment_chars; *p; p++) 160 lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START; 161 162 #ifndef tc_line_separator_chars 163 #define tc_line_separator_chars line_separator_chars 164 #endif 165 for (p = tc_line_separator_chars; *p; p++) 166 lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR; 167 168 #ifdef tc_parallel_separator_chars 169 /* This macro permits the processor to specify all characters which 170 separate parallel insns on the same line. */ 171 for (p = tc_parallel_separator_chars; *p; p++) 172 lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR; 173 #endif 174 175 /* Only allow slash-star comments if slash is not in use. 176 FIXME: This isn't right. 
We should always permit them. */ 177 if (lex['/'] == 0) 178 lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST; 179 180 #ifdef TC_M68K 181 if (m68k_mri) 182 { 183 lex['\''] = LEX_IS_STRINGQUOTE; 184 lex[';'] = LEX_IS_COMMENT_START; 185 lex['*'] = LEX_IS_LINE_COMMENT_START; 186 /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but 187 then it can't be used in an expression. */ 188 lex['!'] = LEX_IS_LINE_COMMENT_START; 189 } 190 #endif 191 192 #ifdef TC_V850 193 lex['-'] = LEX_IS_DOUBLEDASH_1ST; 194 #endif 195 #ifdef DOUBLEBAR_PARALLEL 196 lex['|'] = LEX_IS_DOUBLEBAR_1ST; 197 #endif 198 #ifdef TC_D30V 199 /* Must do this is we want VLIW instruction with "->" or "<-". */ 200 lex['-'] = LEX_IS_SYMBOL_COMPONENT; 201 #endif 202 203 #ifdef H_TICK_HEX 204 if (enable_h_tick_hex) 205 { 206 lex['h'] = LEX_IS_H; 207 lex['H'] = LEX_IS_H; 208 } 209 #endif 210 } 211 212 /* Saved state of the scrubber. */ 213 static int state; 214 static int old_state; 215 static const char *out_string; 216 static char out_buf[20]; 217 static int add_newlines; 218 static char *saved_input; 219 static size_t saved_input_len; 220 static char input_buffer[32 * 1024]; 221 static const char *mri_state; 222 static char mri_last_ch; 223 224 /* Data structure for saving the state of app across #include's. Note that 225 app is called asynchronously to the parsing of the .include's, so our 226 state at the time .include is interpreted is completely unrelated. 227 That's why we have to save it all. 
*/ 228 229 struct app_save 230 { 231 int state; 232 int old_state; 233 const char * out_string; 234 char out_buf[sizeof (out_buf)]; 235 int add_newlines; 236 char * saved_input; 237 size_t saved_input_len; 238 #ifdef TC_M68K 239 int scrub_m68k_mri; 240 #endif 241 const char * mri_state; 242 char mri_last_ch; 243 #if defined TC_ARM && defined OBJ_ELF 244 const char * symver_state; 245 #endif 246 char last_char; 247 }; 248 249 char * 250 app_push (void) 251 { 252 struct app_save *saved; 253 254 saved = XNEW (struct app_save); 255 saved->state = state; 256 saved->old_state = old_state; 257 saved->out_string = out_string; 258 memcpy (saved->out_buf, out_buf, sizeof (out_buf)); 259 saved->add_newlines = add_newlines; 260 if (saved_input == NULL) 261 saved->saved_input = NULL; 262 else 263 { 264 saved->saved_input = XNEWVEC (char, saved_input_len); 265 memcpy (saved->saved_input, saved_input, saved_input_len); 266 saved->saved_input_len = saved_input_len; 267 } 268 #ifdef TC_M68K 269 saved->scrub_m68k_mri = scrub_m68k_mri; 270 #endif 271 saved->mri_state = mri_state; 272 saved->mri_last_ch = mri_last_ch; 273 #if defined TC_ARM && defined OBJ_ELF 274 saved->symver_state = symver_state; 275 #endif 276 saved->last_char = last_char; 277 278 /* do_scrub_begin() is not useful, just wastes time. */ 279 280 state = 0; 281 saved_input = NULL; 282 add_newlines = 0; 283 284 return (char *) saved; 285 } 286 287 void 288 app_pop (char *arg) 289 { 290 struct app_save *saved = (struct app_save *) arg; 291 292 /* There is no do_scrub_end (). 
*/ 293 state = saved->state; 294 old_state = saved->old_state; 295 out_string = saved->out_string; 296 memcpy (out_buf, saved->out_buf, sizeof (out_buf)); 297 add_newlines = saved->add_newlines; 298 if (saved->saved_input == NULL) 299 saved_input = NULL; 300 else 301 { 302 gas_assert (saved->saved_input_len <= sizeof (input_buffer)); 303 memcpy (input_buffer, saved->saved_input, saved->saved_input_len); 304 saved_input = input_buffer; 305 saved_input_len = saved->saved_input_len; 306 free (saved->saved_input); 307 } 308 #ifdef TC_M68K 309 scrub_m68k_mri = saved->scrub_m68k_mri; 310 #endif 311 mri_state = saved->mri_state; 312 mri_last_ch = saved->mri_last_ch; 313 #if defined TC_ARM && defined OBJ_ELF 314 symver_state = saved->symver_state; 315 #endif 316 last_char = saved->last_char; 317 318 free (arg); 319 } 320 321 /* @@ This assumes that \n &c are the same on host and target. This is not 322 necessarily true. */ 323 324 static int 325 process_escape (int ch) 326 { 327 switch (ch) 328 { 329 case 'b': 330 return '\b'; 331 case 'f': 332 return '\f'; 333 case 'n': 334 return '\n'; 335 case 'r': 336 return '\r'; 337 case 't': 338 return '\t'; 339 case '\'': 340 return '\''; 341 case '"': 342 return '\"'; 343 default: 344 return ch; 345 } 346 } 347 348 #define MULTIBYTE_WARN_COUNT_LIMIT 10 349 static unsigned int multibyte_warn_count = 0; 350 351 bool 352 scan_for_multibyte_characters (const unsigned char * start, 353 const unsigned char * end, 354 bool warn) 355 { 356 if (end <= start) 357 return false; 358 359 if (warn && multibyte_warn_count > MULTIBYTE_WARN_COUNT_LIMIT) 360 return false; 361 362 bool found = false; 363 364 while (start < end) 365 { 366 unsigned char c; 367 368 if ((c = * start++) <= 0x7f) 369 continue; 370 371 if (!warn) 372 return true; 373 374 found = true; 375 376 const char * filename; 377 unsigned int lineno; 378 379 filename = as_where (& lineno); 380 if (filename == NULL) 381 as_warn (_("multibyte character (%#x) encountered in input"), c); 
382 else if (lineno == 0) 383 as_warn (_("multibyte character (%#x) encountered in %s"), c, filename); 384 else 385 as_warn (_("multibyte character (%#x) encountered in %s at or near line %u"), c, filename, lineno); 386 387 if (++ multibyte_warn_count == MULTIBYTE_WARN_COUNT_LIMIT) 388 { 389 as_warn (_("further multibyte character warnings suppressed")); 390 break; 391 } 392 } 393 394 return found; 395 } 396 397 /* This function is called to process input characters. The GET 398 parameter is used to retrieve more input characters. GET should 399 set its parameter to point to a buffer, and return the length of 400 the buffer; it should return 0 at end of file. The scrubbed output 401 characters are put into the buffer starting at TOSTART; the TOSTART 402 buffer is TOLEN bytes in length. The function returns the number 403 of scrubbed characters put into TOSTART. This will be TOLEN unless 404 end of file was seen. This function is arranged as a state 405 machine, and saves its state so that it may return at any point. 406 This is the way the old code used to work. */ 407 408 size_t 409 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen) 410 { 411 char *to = tostart; 412 char *toend = tostart + tolen; 413 char *from; 414 char *fromend; 415 size_t fromlen; 416 int ch, ch2 = 0; 417 /* Character that started the string we're working on. */ 418 static char quotechar; 419 420 /*State 0: beginning of normal line 421 1: After first whitespace on line (flush more white) 422 2: After first non-white (opcode) on line (keep 1white) 423 3: after second white on line (into operands) (flush white) 424 4: after putting out a .linefile, put out digits 425 5: parsing a string, then go to old-state 426 6: putting out \ escape in a "d string. 
427 7: no longer used 428 8: no longer used 429 9: After seeing symbol char in state 3 (keep 1white after symchar) 430 10: After seeing whitespace in state 9 (keep white before symchar) 431 11: After seeing a symbol character in state 0 (eg a label definition) 432 -1: output string in out_string and go to the state in old_state 433 -2: flush text until a '*' '/' is seen, then go to state old_state 434 #ifdef TC_V850 435 12: After seeing a dash, looking for a second dash as a start 436 of comment. 437 #endif 438 #ifdef DOUBLEBAR_PARALLEL 439 13: After seeing a vertical bar, looking for a second 440 vertical bar as a parallel expression separator. 441 #endif 442 #ifdef TC_PREDICATE_START_CHAR 443 14: After seeing a predicate start character at state 0, looking 444 for a predicate end character as predicate. 445 15: After seeing a predicate start character at state 1, looking 446 for a predicate end character as predicate. 447 #endif 448 #ifdef TC_Z80 449 16: After seeing an 'a' or an 'A' at the start of a symbol 450 17: After seeing an 'f' or an 'F' in state 16 451 #endif 452 */ 453 454 /* I added states 9 and 10 because the MIPS ECOFF assembler uses 455 constructs like ``.loc 1 20''. This was turning into ``.loc 456 120''. States 9 and 10 ensure that a space is never dropped in 457 between characters which could appear in an identifier. Ian 458 Taylor, ian@cygnus.com. 459 460 I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works 461 correctly on the PA (and any other target where colons are optional). 462 Jeff Law, law@cs.utah.edu. 463 464 I added state 13 so that something like "cmp r1, r2 || trap #1" does not 465 get squashed into "cmp r1,r2||trap#1", with the all important space 466 between the 'trap' and the '#1' being eliminated. nickc@cygnus.com */ 467 468 /* This macro gets the next input character. */ 469 470 #define GET() \ 471 (from < fromend \ 472 ? 
* (unsigned char *) (from++) \ 473 : (saved_input = NULL, \ 474 fromlen = (*get) (input_buffer, sizeof input_buffer), \ 475 from = input_buffer, \ 476 fromend = from + fromlen, \ 477 (fromlen == 0 \ 478 ? EOF \ 479 : * (unsigned char *) (from++)))) 480 481 /* This macro pushes a character back on the input stream. */ 482 483 #define UNGET(uch) (*--from = (uch)) 484 485 /* This macro puts a character into the output buffer. If this 486 character fills the output buffer, this macro jumps to the label 487 TOFULL. We use this rather ugly approach because we need to 488 handle two different termination conditions: EOF on the input 489 stream, and a full output buffer. It would be simpler if we 490 always read in the entire input stream before processing it, but 491 I don't want to make such a significant change to the assembler's 492 memory usage. */ 493 494 #define PUT(pch) \ 495 do \ 496 { \ 497 *to++ = (pch); \ 498 if (to >= toend) \ 499 goto tofull; \ 500 } \ 501 while (0) 502 503 if (saved_input != NULL) 504 { 505 from = saved_input; 506 fromend = from + saved_input_len; 507 } 508 else 509 { 510 fromlen = (*get) (input_buffer, sizeof input_buffer); 511 if (fromlen == 0) 512 return 0; 513 from = input_buffer; 514 fromend = from + fromlen; 515 516 if (multibyte_handling == multibyte_warn) 517 (void) scan_for_multibyte_characters ((const unsigned char *) from, 518 (const unsigned char* ) fromend, 519 true /* Generate warnings. */); 520 } 521 522 while (1) 523 { 524 /* The cases in this switch end with continue, in order to 525 branch back to the top of this while loop and generate the 526 next output character in the appropriate state. 
*/ 527 switch (state) 528 { 529 case -1: 530 ch = *out_string++; 531 if (*out_string == '\0') 532 { 533 state = old_state; 534 old_state = 3; 535 } 536 PUT (ch); 537 continue; 538 539 case -2: 540 for (;;) 541 { 542 do 543 { 544 ch = GET (); 545 546 if (ch == EOF) 547 { 548 as_warn (_("end of file in comment")); 549 goto fromeof; 550 } 551 552 if (ch == '\n') 553 PUT ('\n'); 554 } 555 while (ch != '*'); 556 557 while ((ch = GET ()) == '*') 558 ; 559 560 if (ch == EOF) 561 { 562 as_warn (_("end of file in comment")); 563 goto fromeof; 564 } 565 566 if (ch == '/') 567 break; 568 569 UNGET (ch); 570 } 571 572 state = old_state; 573 UNGET (' '); 574 continue; 575 576 case 4: 577 ch = GET (); 578 if (ch == EOF) 579 goto fromeof; 580 else if (ch >= '0' && ch <= '9') 581 PUT (ch); 582 else 583 { 584 while (ch != EOF && IS_WHITESPACE (ch)) 585 ch = GET (); 586 if (ch == '"') 587 { 588 quotechar = ch; 589 state = 5; 590 old_state = 3; 591 PUT (ch); 592 } 593 else 594 { 595 while (ch != EOF && ch != '\n') 596 ch = GET (); 597 state = 0; 598 PUT (ch); 599 } 600 } 601 continue; 602 603 case 5: 604 /* We are going to copy everything up to a quote character, 605 with special handling for a backslash. We try to 606 optimize the copying in the simple case without using the 607 GET and PUT macros. */ 608 { 609 char *s; 610 ptrdiff_t len; 611 612 for (s = from; s < fromend; s++) 613 { 614 ch = *s; 615 if (ch == '\\' 616 || ch == quotechar 617 || ch == '\n') 618 break; 619 } 620 len = s - from; 621 if (len > toend - to) 622 len = toend - to; 623 if (len > 0) 624 { 625 memcpy (to, from, len); 626 to += len; 627 from += len; 628 if (to >= toend) 629 goto tofull; 630 } 631 } 632 633 ch = GET (); 634 if (ch == EOF) 635 { 636 /* This buffer is here specifically so 637 that the UNGET below will work. 
*/ 638 static char one_char_buf[1]; 639 640 as_warn (_("end of file in string; '%c' inserted"), quotechar); 641 state = old_state; 642 from = fromend = one_char_buf + 1; 643 fromlen = 1; 644 UNGET ('\n'); 645 PUT (quotechar); 646 } 647 else if (ch == quotechar) 648 { 649 state = old_state; 650 PUT (ch); 651 } 652 else if (TC_STRING_ESCAPES && ch == '\\') 653 { 654 state = 6; 655 PUT (ch); 656 } 657 else if (scrub_m68k_mri && ch == '\n') 658 { 659 /* Just quietly terminate the string. This permits lines like 660 bne label loop if we haven't reach end yet. */ 661 state = old_state; 662 UNGET (ch); 663 PUT ('\''); 664 } 665 else 666 { 667 PUT (ch); 668 } 669 continue; 670 671 case 6: 672 state = 5; 673 ch = GET (); 674 switch (ch) 675 { 676 /* Handle strings broken across lines, by turning '\n' into 677 '\\' and 'n'. */ 678 case '\n': 679 UNGET ('n'); 680 add_newlines++; 681 PUT ('\\'); 682 continue; 683 684 case EOF: 685 as_warn (_("end of file in string; '%c' inserted"), quotechar); 686 PUT (quotechar); 687 continue; 688 689 case '"': 690 case '\\': 691 case 'b': 692 case 'f': 693 case 'n': 694 case 'r': 695 case 't': 696 case 'v': 697 case 'x': 698 case 'X': 699 case '0': 700 case '1': 701 case '2': 702 case '3': 703 case '4': 704 case '5': 705 case '6': 706 case '7': 707 break; 708 709 default: 710 #ifdef ONLY_STANDARD_ESCAPES 711 as_warn (_("unknown escape '\\%c' in string; ignored"), ch); 712 #endif 713 break; 714 } 715 PUT (ch); 716 continue; 717 718 #ifdef DOUBLEBAR_PARALLEL 719 case 13: 720 ch = GET (); 721 if (ch != '|') 722 abort (); 723 724 /* Reset back to state 1 and pretend that we are parsing a 725 line from just after the first white space. */ 726 state = 1; 727 PUT ('|'); 728 #ifdef TC_TIC6X 729 /* "||^" is used for SPMASKed instructions. 
*/ 730 ch = GET (); 731 if (ch == EOF) 732 goto fromeof; 733 else if (ch == '^') 734 PUT ('^'); 735 else 736 UNGET (ch); 737 #endif 738 continue; 739 #endif 740 #ifdef TC_Z80 741 case 16: 742 /* We have seen an 'a' at the start of a symbol, look for an 'f'. */ 743 ch = GET (); 744 if (ch == 'f' || ch == 'F') 745 { 746 state = 17; 747 PUT (ch); 748 } 749 else 750 { 751 if (ch != EOF) 752 UNGET (ch); 753 state = 9; 754 break; 755 } 756 /* Fall through. */ 757 case 17: 758 /* We have seen "af" at the start of a symbol, 759 a ' here is a part of that symbol. */ 760 ch = GET (); 761 state = 9; 762 if (ch == '\'') 763 /* Change to avoid warning about unclosed string. */ 764 PUT ('`'); 765 else if (ch != EOF) 766 UNGET (ch); 767 break; 768 #endif 769 } 770 771 /* OK, we are somewhere in states 0 through 4 or 9 through 11. */ 772 773 /* flushchar: */ 774 ch = GET (); 775 776 #ifdef TC_PREDICATE_START_CHAR 777 if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1)) 778 { 779 state += 14; 780 PUT (ch); 781 continue; 782 } 783 else if (state == 14 || state == 15) 784 { 785 if (ch == TC_PREDICATE_END_CHAR) 786 { 787 state -= 14; 788 PUT (ch); 789 ch = GET (); 790 } 791 else 792 { 793 PUT (ch); 794 continue; 795 } 796 } 797 #endif 798 799 recycle: 800 801 #if defined TC_ARM && defined OBJ_ELF 802 /* We need to watch out for .symver directives. See the comment later 803 in this function. */ 804 if (symver_state == NULL) 805 { 806 if ((state == 0 || state == 1) && ch == symver_pseudo[0]) 807 symver_state = symver_pseudo + 1; 808 } 809 else 810 { 811 /* We advance to the next state if we find the right 812 character. */ 813 if (ch != '\0' && (*symver_state == ch)) 814 ++symver_state; 815 else if (*symver_state != '\0') 816 /* We did not get the expected character, or we didn't 817 get a valid terminating character after seeing the 818 entire pseudo-op, so we must go back to the beginning. */ 819 symver_state = NULL; 820 else 821 { 822 /* We've read the entire pseudo-op. 
If this is the end 823 of the line, go back to the beginning. */ 824 if (IS_NEWLINE (ch)) 825 symver_state = NULL; 826 } 827 } 828 #endif /* TC_ARM && OBJ_ELF */ 829 830 #ifdef TC_M68K 831 /* We want to have pseudo-ops which control whether we are in 832 MRI mode or not. Unfortunately, since m68k MRI mode affects 833 the scrubber, that means that we need a special purpose 834 recognizer here. */ 835 if (mri_state == NULL) 836 { 837 if ((state == 0 || state == 1) 838 && ch == mri_pseudo[0]) 839 mri_state = mri_pseudo + 1; 840 } 841 else 842 { 843 /* We advance to the next state if we find the right 844 character, or if we need a space character and we get any 845 whitespace character, or if we need a '0' and we get a 846 '1' (this is so that we only need one state to handle 847 ``.mri 0'' and ``.mri 1''). */ 848 if (ch != '\0' 849 && (*mri_state == ch 850 || (*mri_state == ' ' 851 && lex[ch] == LEX_IS_WHITESPACE) 852 || (*mri_state == '0' 853 && ch == '1'))) 854 { 855 mri_last_ch = ch; 856 ++mri_state; 857 } 858 else if (*mri_state != '\0' 859 || (lex[ch] != LEX_IS_WHITESPACE 860 && lex[ch] != LEX_IS_NEWLINE)) 861 { 862 /* We did not get the expected character, or we didn't 863 get a valid terminating character after seeing the 864 entire pseudo-op, so we must go back to the 865 beginning. */ 866 mri_state = NULL; 867 } 868 else 869 { 870 /* We've read the entire pseudo-op. mips_last_ch is 871 either '0' or '1' indicating whether to enter or 872 leave MRI mode. */ 873 do_scrub_begin (mri_last_ch == '1'); 874 mri_state = NULL; 875 876 /* We continue handling the character as usual. The 877 main gas reader must also handle the .mri pseudo-op 878 to control expression parsing and the like. 
*/ 879 } 880 } 881 #endif 882 883 if (ch == EOF) 884 { 885 if (state != 0) 886 { 887 as_warn (_("end of file not at end of a line; newline inserted")); 888 state = 0; 889 PUT ('\n'); 890 } 891 goto fromeof; 892 } 893 894 switch (lex[ch]) 895 { 896 case LEX_IS_WHITESPACE: 897 do 898 { 899 ch = GET (); 900 } 901 while (ch != EOF && IS_WHITESPACE (ch)); 902 if (ch == EOF) 903 goto fromeof; 904 905 if (state == 0) 906 { 907 /* Preserve a single whitespace character at the 908 beginning of a line. */ 909 state = 1; 910 UNGET (ch); 911 PUT (' '); 912 break; 913 } 914 915 #ifdef KEEP_WHITE_AROUND_COLON 916 if (lex[ch] == LEX_IS_COLON) 917 { 918 /* Only keep this white if there's no white *after* the 919 colon. */ 920 ch2 = GET (); 921 if (ch2 != EOF) 922 UNGET (ch2); 923 if (!IS_WHITESPACE (ch2)) 924 { 925 state = 9; 926 UNGET (ch); 927 PUT (' '); 928 break; 929 } 930 } 931 #endif 932 if (IS_COMMENT (ch) 933 || IS_LINE_SEPARATOR (ch) 934 || IS_PARALLEL_SEPARATOR (ch)) 935 { 936 if (scrub_m68k_mri) 937 { 938 /* In MRI mode, we keep these spaces. */ 939 UNGET (ch); 940 PUT (' '); 941 break; 942 } 943 goto recycle; 944 } 945 946 /* If we're in state 2 or 11, we've seen a non-white 947 character followed by whitespace. If the next character 948 is ':', this is whitespace after a label name which we 949 normally must ignore. In MRI mode, though, spaces are 950 not permitted between the label and the colon. */ 951 if ((state == 2 || state == 11) 952 && lex[ch] == LEX_IS_COLON 953 && ! scrub_m68k_mri) 954 { 955 state = 1; 956 PUT (ch); 957 break; 958 } 959 960 switch (state) 961 { 962 case 1: 963 /* We can arrive here if we leave a leading whitespace 964 character at the beginning of a line. */ 965 goto recycle; 966 case 2: 967 state = 3; 968 if (to + 1 < toend) 969 { 970 /* Optimize common case by skipping UNGET/GET. 
*/ 971 PUT (' '); /* Sp after opco */ 972 goto recycle; 973 } 974 UNGET (ch); 975 PUT (' '); 976 break; 977 case 3: 978 #ifndef TC_KEEP_OPERAND_SPACES 979 /* For TI C6X, we keep these spaces as they may separate 980 functional unit specifiers from operands. */ 981 if (scrub_m68k_mri) 982 #endif 983 { 984 /* In MRI mode, we keep these spaces. */ 985 UNGET (ch); 986 PUT (' '); 987 break; 988 } 989 goto recycle; /* Sp in operands */ 990 case 9: 991 case 10: 992 #ifndef TC_KEEP_OPERAND_SPACES 993 if (scrub_m68k_mri) 994 #endif 995 { 996 /* In MRI mode, we keep these spaces. */ 997 state = 3; 998 UNGET (ch); 999 PUT (' '); 1000 break; 1001 } 1002 state = 10; /* Sp after symbol char */ 1003 goto recycle; 1004 case 11: 1005 if (LABELS_WITHOUT_COLONS || flag_m68k_mri) 1006 state = 1; 1007 else 1008 { 1009 /* We know that ch is not ':', since we tested that 1010 case above. Therefore this is not a label, so it 1011 must be the opcode, and we've just seen the 1012 whitespace after it. */ 1013 state = 3; 1014 } 1015 UNGET (ch); 1016 PUT (' '); /* Sp after label definition. */ 1017 break; 1018 default: 1019 BAD_CASE (state); 1020 } 1021 break; 1022 1023 case LEX_IS_TWOCHAR_COMMENT_1ST: 1024 ch2 = GET (); 1025 if (ch2 == '*') 1026 { 1027 for (;;) 1028 { 1029 do 1030 { 1031 ch2 = GET (); 1032 if (ch2 != EOF && IS_NEWLINE (ch2)) 1033 add_newlines++; 1034 } 1035 while (ch2 != EOF && ch2 != '*'); 1036 1037 while (ch2 == '*') 1038 ch2 = GET (); 1039 1040 if (ch2 == EOF || ch2 == '/') 1041 break; 1042 1043 /* This UNGET will ensure that we count newlines 1044 correctly. 
*/ 1045 UNGET (ch2); 1046 } 1047 1048 if (ch2 == EOF) 1049 as_warn (_("end of file in multiline comment")); 1050 1051 ch = ' '; 1052 goto recycle; 1053 } 1054 #ifdef DOUBLESLASH_LINE_COMMENTS 1055 else if (ch2 == '/') 1056 { 1057 do 1058 { 1059 ch = GET (); 1060 } 1061 while (ch != EOF && !IS_NEWLINE (ch)); 1062 if (ch == EOF) 1063 as_warn ("end of file in comment; newline inserted"); 1064 state = 0; 1065 PUT ('\n'); 1066 break; 1067 } 1068 #endif 1069 else 1070 { 1071 if (ch2 != EOF) 1072 UNGET (ch2); 1073 if (state == 9 || state == 10) 1074 state = 3; 1075 PUT (ch); 1076 } 1077 break; 1078 1079 case LEX_IS_STRINGQUOTE: 1080 quotechar = ch; 1081 if (state == 10) 1082 { 1083 /* Preserve the whitespace in foo "bar". */ 1084 UNGET (ch); 1085 state = 3; 1086 PUT (' '); 1087 1088 /* PUT didn't jump out. We could just break, but we 1089 know what will happen, so optimize a bit. */ 1090 ch = GET (); 1091 old_state = 9; 1092 } 1093 else if (state == 3) 1094 old_state = 9; 1095 else 1096 old_state = state; 1097 state = 5; 1098 PUT (ch); 1099 break; 1100 1101 case LEX_IS_ONECHAR_QUOTE: 1102 #ifdef H_TICK_HEX 1103 if (state == 9 && enable_h_tick_hex) 1104 { 1105 char c; 1106 1107 c = GET (); 1108 as_warn ("'%c found after symbol", c); 1109 UNGET (c); 1110 } 1111 #endif 1112 if (state == 10) 1113 { 1114 /* Preserve the whitespace in foo 'b'. */ 1115 UNGET (ch); 1116 state = 3; 1117 PUT (' '); 1118 break; 1119 } 1120 ch = GET (); 1121 if (ch == EOF) 1122 { 1123 as_warn (_("end of file after a one-character quote; \\0 inserted")); 1124 ch = 0; 1125 } 1126 if (ch == '\\') 1127 { 1128 ch = GET (); 1129 if (ch == EOF) 1130 { 1131 as_warn (_("end of file in escape character")); 1132 ch = '\\'; 1133 } 1134 else 1135 ch = process_escape (ch); 1136 } 1137 sprintf (out_buf, "%d", (int) (unsigned char) ch); 1138 1139 /* None of these 'x constants for us. We want 'x'. 
*/ 1140 if ((ch = GET ()) != '\'') 1141 { 1142 #ifdef REQUIRE_CHAR_CLOSE_QUOTE 1143 as_warn (_("missing close quote; (assumed)")); 1144 #else 1145 if (ch != EOF) 1146 UNGET (ch); 1147 #endif 1148 } 1149 if (strlen (out_buf) == 1) 1150 { 1151 PUT (out_buf[0]); 1152 break; 1153 } 1154 if (state == 9) 1155 old_state = 3; 1156 else 1157 old_state = state; 1158 state = -1; 1159 out_string = out_buf; 1160 PUT (*out_string++); 1161 break; 1162 1163 case LEX_IS_COLON: 1164 #ifdef KEEP_WHITE_AROUND_COLON 1165 state = 9; 1166 #else 1167 if (state == 9 || state == 10) 1168 state = 3; 1169 else if (state != 3) 1170 state = 1; 1171 #endif 1172 PUT (ch); 1173 break; 1174 1175 case LEX_IS_NEWLINE: 1176 /* Roll out a bunch of newlines from inside comments, etc. */ 1177 if (add_newlines) 1178 { 1179 --add_newlines; 1180 UNGET (ch); 1181 } 1182 /* Fall through. */ 1183 1184 case LEX_IS_LINE_SEPARATOR: 1185 state = 0; 1186 PUT (ch); 1187 break; 1188 1189 case LEX_IS_PARALLEL_SEPARATOR: 1190 state = 1; 1191 PUT (ch); 1192 break; 1193 1194 #ifdef TC_V850 1195 case LEX_IS_DOUBLEDASH_1ST: 1196 ch2 = GET (); 1197 if (ch2 != '-') 1198 { 1199 if (ch2 != EOF) 1200 UNGET (ch2); 1201 goto de_fault; 1202 } 1203 /* Read and skip to end of line. */ 1204 do 1205 { 1206 ch = GET (); 1207 } 1208 while (ch != EOF && ch != '\n'); 1209 1210 if (ch == EOF) 1211 as_warn (_("end of file in comment; newline inserted")); 1212 1213 state = 0; 1214 PUT ('\n'); 1215 break; 1216 #endif 1217 #ifdef DOUBLEBAR_PARALLEL 1218 case LEX_IS_DOUBLEBAR_1ST: 1219 ch2 = GET (); 1220 if (ch2 != EOF) 1221 UNGET (ch2); 1222 if (ch2 != '|') 1223 goto de_fault; 1224 1225 /* Handle '||' in two states as invoking PUT twice might 1226 result in the first one jumping out of this loop. We'd 1227 then lose track of the state and one '|' char. */ 1228 state = 13; 1229 PUT ('|'); 1230 break; 1231 #endif 1232 case LEX_IS_LINE_COMMENT_START: 1233 /* FIXME-someday: The two character comment stuff was badly 1234 thought out. 
On i386, we want '/' as line comment start 1235 AND we want C style comments. hence this hack. The 1236 whole lexical process should be reworked. xoxorich. */ 1237 if (ch == '/') 1238 { 1239 ch2 = GET (); 1240 if (ch2 == '*') 1241 { 1242 old_state = 3; 1243 state = -2; 1244 break; 1245 } 1246 else if (ch2 != EOF) 1247 { 1248 UNGET (ch2); 1249 } 1250 } 1251 1252 if (state == 0 || state == 1) /* Only comment at start of line. */ 1253 { 1254 int startch; 1255 1256 startch = ch; 1257 1258 do 1259 { 1260 ch = GET (); 1261 } 1262 while (ch != EOF && IS_WHITESPACE (ch)); 1263 1264 if (ch == EOF) 1265 { 1266 as_warn (_("end of file in comment; newline inserted")); 1267 PUT ('\n'); 1268 break; 1269 } 1270 1271 if (ch < '0' || ch > '9' || state != 0 || startch != '#') 1272 { 1273 /* Not a cpp line. */ 1274 while (ch != EOF && !IS_NEWLINE (ch)) 1275 ch = GET (); 1276 if (ch == EOF) 1277 { 1278 as_warn (_("end of file in comment; newline inserted")); 1279 PUT ('\n'); 1280 } 1281 else /* IS_NEWLINE (ch) */ 1282 { 1283 /* To process non-zero add_newlines. */ 1284 UNGET (ch); 1285 } 1286 state = 0; 1287 break; 1288 } 1289 /* Looks like `# 123 "filename"' from cpp. */ 1290 UNGET (ch); 1291 old_state = 4; 1292 state = -1; 1293 if (scrub_m68k_mri) 1294 out_string = "\tlinefile "; 1295 else 1296 out_string = "\t.linefile "; 1297 PUT (*out_string++); 1298 break; 1299 } 1300 1301 #ifdef TC_D10V 1302 /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true. 1303 Trap is the only short insn that has a first operand that is 1304 neither register nor label. 1305 We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 . 1306 We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is 1307 already LEX_IS_LINE_COMMENT_START. However, it is the 1308 only character in line_comment_chars for d10v, hence we 1309 can recognize it as such. */ 1310 /* An alternative approach would be to reset the state to 1 when 1311 we see '||', '<'- or '->', but that seems to be overkill. 
*/ 1312 if (state == 10) 1313 PUT (' '); 1314 #endif 1315 /* We have a line comment character which is not at the 1316 start of a line. If this is also a normal comment 1317 character, fall through. Otherwise treat it as a default 1318 character. */ 1319 if (strchr (tc_comment_chars, ch) == NULL 1320 && (! scrub_m68k_mri 1321 || (ch != '!' && ch != '*'))) 1322 goto de_fault; 1323 if (scrub_m68k_mri 1324 && (ch == '!' || ch == '*' || ch == '#') 1325 && state != 1 1326 && state != 10) 1327 goto de_fault; 1328 /* Fall through. */ 1329 case LEX_IS_COMMENT_START: 1330 #if defined TC_ARM && defined OBJ_ELF 1331 /* On the ARM, `@' is the comment character. 1332 Unfortunately this is also a special character in ELF .symver 1333 directives (and .type, though we deal with those another way). 1334 So we check if this line is such a directive, and treat 1335 the character as default if so. This is a hack. */ 1336 if ((symver_state != NULL) && (*symver_state == 0)) 1337 goto de_fault; 1338 #endif 1339 1340 /* Care is needed not to damage occurrences of \<comment-char> 1341 by stripping the <comment-char> onwards. Yuck. */ 1342 if ((to > tostart ? to[-1] : last_char) == '\\') 1343 /* Do not treat the <comment-char> as a start-of-comment. */ 1344 goto de_fault; 1345 1346 #ifdef WARN_COMMENTS 1347 if (!found_comment) 1348 found_comment_file = as_where (&found_comment); 1349 #endif 1350 do 1351 { 1352 ch = GET (); 1353 } 1354 while (ch != EOF && !IS_NEWLINE (ch)); 1355 if (ch == EOF) 1356 as_warn (_("end of file in comment; newline inserted")); 1357 state = 0; 1358 PUT ('\n'); 1359 break; 1360 1361 #ifdef H_TICK_HEX 1362 case LEX_IS_H: 1363 /* Look for strings like H'[0-9A-Fa-f] and if found, replace 1364 the H' with 0x to make them gas-style hex characters. */ 1365 if (enable_h_tick_hex) 1366 { 1367 char quot; 1368 1369 quot = GET (); 1370 if (quot == '\'') 1371 { 1372 UNGET ('x'); 1373 ch = '0'; 1374 } 1375 else 1376 UNGET (quot); 1377 } 1378 #endif 1379 /* Fall through. 
*/ 1380 1381 case LEX_IS_SYMBOL_COMPONENT: 1382 if (state == 10) 1383 { 1384 /* This is a symbol character following another symbol 1385 character, with whitespace in between. We skipped 1386 the whitespace earlier, so output it now. */ 1387 UNGET (ch); 1388 state = 3; 1389 PUT (' '); 1390 break; 1391 } 1392 1393 #ifdef TC_Z80 1394 /* "af'" is a symbol containing '\''. */ 1395 if (state == 3 && (ch == 'a' || ch == 'A')) 1396 { 1397 state = 16; 1398 PUT (ch); 1399 ch = GET (); 1400 if (ch == 'f' || ch == 'F') 1401 { 1402 state = 17; 1403 PUT (ch); 1404 break; 1405 } 1406 else 1407 { 1408 state = 9; 1409 if (ch == EOF || !IS_SYMBOL_COMPONENT (ch)) 1410 { 1411 if (ch != EOF) 1412 UNGET (ch); 1413 break; 1414 } 1415 } 1416 } 1417 #endif 1418 if (state == 3) 1419 state = 9; 1420 1421 /* This is a common case. Quickly copy CH and all the 1422 following symbol component or normal characters. */ 1423 if (to + 1 < toend 1424 && mri_state == NULL 1425 #if defined TC_ARM && defined OBJ_ELF 1426 && symver_state == NULL 1427 #endif 1428 ) 1429 { 1430 char *s; 1431 ptrdiff_t len; 1432 1433 for (s = from; s < fromend; s++) 1434 { 1435 int type; 1436 1437 ch2 = *(unsigned char *) s; 1438 type = lex[ch2]; 1439 if (type != 0 1440 && type != LEX_IS_SYMBOL_COMPONENT) 1441 break; 1442 } 1443 1444 if (s > from) 1445 /* Handle the last character normally, for 1446 simplicity. */ 1447 --s; 1448 1449 len = s - from; 1450 1451 if (len > (toend - to) - 1) 1452 len = (toend - to) - 1; 1453 1454 if (len > 0) 1455 { 1456 PUT (ch); 1457 memcpy (to, from, len); 1458 to += len; 1459 from += len; 1460 if (to >= toend) 1461 goto tofull; 1462 ch = GET (); 1463 } 1464 } 1465 1466 /* Fall through. */ 1467 default: 1468 de_fault: 1469 /* Some relatively `normal' character. */ 1470 if (state == 0) 1471 { 1472 state = 11; /* Now seeing label definition. */ 1473 } 1474 else if (state == 1) 1475 { 1476 state = 2; /* Ditto. 
*/ 1477 } 1478 else if (state == 9) 1479 { 1480 if (!IS_SYMBOL_COMPONENT (ch)) 1481 state = 3; 1482 } 1483 else if (state == 10) 1484 { 1485 if (ch == '\\') 1486 { 1487 /* Special handling for backslash: a backslash may 1488 be the beginning of a formal parameter (of a 1489 macro) following another symbol character, with 1490 whitespace in between. If that is the case, we 1491 output a space before the parameter. Strictly 1492 speaking, correct handling depends upon what the 1493 macro parameter expands into; if the parameter 1494 expands into something which does not start with 1495 an operand character, then we don't want to keep 1496 the space. We don't have enough information to 1497 make the right choice, so here we are making the 1498 choice which is more likely to be correct. */ 1499 if (to + 1 >= toend) 1500 { 1501 /* If we're near the end of the buffer, save the 1502 character for the next time round. Otherwise 1503 we'll lose our state. */ 1504 UNGET (ch); 1505 goto tofull; 1506 } 1507 *to++ = ' '; 1508 } 1509 1510 state = 3; 1511 } 1512 PUT (ch); 1513 break; 1514 } 1515 } 1516 1517 /*NOTREACHED*/ 1518 1519 fromeof: 1520 /* We have reached the end of the input. */ 1521 if (to > tostart) 1522 last_char = to[-1]; 1523 return to - tostart; 1524 1525 tofull: 1526 /* The output buffer is full. Save any input we have not yet 1527 processed. */ 1528 if (fromend > from) 1529 { 1530 saved_input = from; 1531 saved_input_len = fromend - from; 1532 } 1533 else 1534 saved_input = NULL; 1535 1536 if (to > tostart) 1537 last_char = to[-1]; 1538 return to - tostart; 1539 } 1540 1541 /* Return the number of characters of input that are still pending. This is the sum of two independent contributions: saved_input_len when saved_input is non-NULL (input recorded as not yet processed because the output buffer filled up), and, when state is -1, the length of the part of out_string that has not been written out yet. Returns 0 when neither applies. */ 1542 1543 size_t 1544 do_scrub_pending (void) 1545 { 1546 size_t len = 0; 1547 if (saved_input) 1548 len += saved_input_len; 1549 if (state == -1) 1550 len += strlen (out_string); 1551 return len; 1552 } 1553