1 /* xgettext C/C++/ObjectiveC backend. 2 Copyright (C) 1995-1998, 2000-2006 Free Software Foundation, Inc. 3 4 This file was written by Peter Miller <millerp@canb.auug.org.au> 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program; if not, write to the Free Software Foundation, 18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 19 20 #ifdef HAVE_CONFIG_H 21 # include "config.h" 22 #endif 23 24 #include <errno.h> 25 #include <stdbool.h> 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <string.h> 29 30 #include "message.h" 31 #include "xgettext.h" 32 #include "x-c.h" 33 #include "error.h" 34 #include "error-progname.h" 35 #include "xalloc.h" 36 #include "xvasprintf.h" 37 #include "exit.h" 38 #include "hash.h" 39 #include "gettext.h" 40 41 #define _(s) gettext(s) 42 43 #define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 44 45 46 /* The ANSI C standard defines several phases of translation: 47 48 1. Terminate line by \n, regardless of the external representation 49 of a text line. Stdio does this for us. 50 51 2. Convert trigraphs to their single character equivalents. 52 53 3. Concatenate each line ending in backslash (\) with the following 54 line. 55 56 4. Replace each comment with a space character. 57 58 5. Parse each resulting logical line as preprocessing tokens and 59 white space. 60 61 6. Recognize and carry out directives (it also expands macros on 62 non-directive lines, which we do not do here). 63 64 7. 
Replaces escape sequences within character strings with their 65 single character equivalents (we do this in step 5, because we 66 don't have to worry about the #include argument). 67 68 8. Concatenates adjacent string literals to form single string 69 literals (because we don't expand macros, there are a few things 70 we will miss). 71 72 9. Converts the remaining preprocessing tokens to C tokens and 73 discards any white space from the translation unit. 74 75 This lexer implements the above, and presents the scanner (in 76 xgettext.c) with a stream of C tokens. The comments are 77 accumulated in a buffer, and given to xgettext when asked for. */ 78 79 80 /* ========================= Lexer customization. ========================= */ 81 82 static bool trigraphs = false; 83 84 void 85 x_c_trigraphs () 86 { 87 trigraphs = true; 88 } 89 90 91 /* ====================== Keyword set customization. ====================== */ 92 93 /* If true extract all strings. */ 94 static bool extract_all = false; 95 96 static hash_table c_keywords; 97 static hash_table objc_keywords; 98 static bool default_keywords = true; 99 100 101 void 102 x_c_extract_all () 103 { 104 extract_all = true; 105 } 106 107 108 static void 109 add_keyword (const char *name, hash_table *keywords) 110 { 111 if (name == NULL) 112 default_keywords = false; 113 else 114 { 115 const char *end; 116 struct callshape shape; 117 const char *colon; 118 119 if (keywords->table == NULL) 120 hash_init (keywords, 100); 121 122 split_keywordspec (name, &end, &shape); 123 124 /* The characters between name and end should form a valid C identifier. 125 A colon means an invalid parse in split_keywordspec(). 
*/ 126 colon = strchr (name, ':'); 127 if (colon == NULL || colon >= end) 128 insert_keyword_callshape (keywords, name, end - name, &shape); 129 } 130 } 131 132 void 133 x_c_keyword (const char *name) 134 { 135 add_keyword (name, &c_keywords); 136 } 137 138 void 139 x_objc_keyword (const char *name) 140 { 141 add_keyword (name, &objc_keywords); 142 } 143 144 /* Finish initializing the keywords hash tables. 145 Called after argument processing, before each file is processed. */ 146 static void 147 init_keywords () 148 { 149 if (default_keywords) 150 { 151 /* When adding new keywords here, also update the documentation in 152 xgettext.texi! */ 153 x_c_keyword ("gettext"); 154 x_c_keyword ("dgettext:2"); 155 x_c_keyword ("dcgettext:2"); 156 x_c_keyword ("ngettext:1,2"); 157 x_c_keyword ("dngettext:2,3"); 158 x_c_keyword ("dcngettext:2,3"); 159 x_c_keyword ("gettext_noop"); 160 x_c_keyword ("pgettext:1c,2"); 161 x_c_keyword ("dpgettext:2c,3"); 162 x_c_keyword ("dcpgettext:2c,3"); 163 x_c_keyword ("npgettext:1c,2,3"); 164 x_c_keyword ("dnpgettext:2c,3,4"); 165 x_c_keyword ("dcnpgettext:2c,3,4"); 166 167 x_objc_keyword ("gettext"); 168 x_objc_keyword ("dgettext:2"); 169 x_objc_keyword ("dcgettext:2"); 170 x_objc_keyword ("ngettext:1,2"); 171 x_objc_keyword ("dngettext:2,3"); 172 x_objc_keyword ("dcngettext:2,3"); 173 x_objc_keyword ("gettext_noop"); 174 x_objc_keyword ("pgettext:1c,2"); 175 x_objc_keyword ("dpgettext:2c,3"); 176 x_objc_keyword ("dcpgettext:2c,3"); 177 x_objc_keyword ("npgettext:1c,2,3"); 178 x_objc_keyword ("dnpgettext:2c,3,4"); 179 x_objc_keyword ("dcnpgettext:2c,3,4"); 180 x_objc_keyword ("NSLocalizedString"); /* similar to gettext */ 181 x_objc_keyword ("_"); /* similar to gettext */ 182 x_objc_keyword ("NSLocalizedStaticString"); /* similar to gettext_noop */ 183 x_objc_keyword ("__"); /* similar to gettext_noop */ 184 185 default_keywords = false; 186 } 187 } 188 189 void 190 init_flag_table_c () 191 { 192 xgettext_record_flag 
("gettext:1:pass-c-format"); 193 xgettext_record_flag ("dgettext:2:pass-c-format"); 194 xgettext_record_flag ("dcgettext:2:pass-c-format"); 195 xgettext_record_flag ("ngettext:1:pass-c-format"); 196 xgettext_record_flag ("ngettext:2:pass-c-format"); 197 xgettext_record_flag ("dngettext:2:pass-c-format"); 198 xgettext_record_flag ("dngettext:3:pass-c-format"); 199 xgettext_record_flag ("dcngettext:2:pass-c-format"); 200 xgettext_record_flag ("dcngettext:3:pass-c-format"); 201 xgettext_record_flag ("gettext_noop:1:pass-c-format"); 202 xgettext_record_flag ("pgettext:2:pass-c-format"); 203 xgettext_record_flag ("dpgettext:3:pass-c-format"); 204 xgettext_record_flag ("dcpgettext:3:pass-c-format"); 205 xgettext_record_flag ("npgettext:2:pass-c-format"); 206 xgettext_record_flag ("npgettext:3:pass-c-format"); 207 xgettext_record_flag ("dnpgettext:3:pass-c-format"); 208 xgettext_record_flag ("dnpgettext:4:pass-c-format"); 209 xgettext_record_flag ("dcnpgettext:3:pass-c-format"); 210 xgettext_record_flag ("dcnpgettext:4:pass-c-format"); 211 212 /* <stdio.h> */ 213 xgettext_record_flag ("fprintf:2:c-format"); 214 xgettext_record_flag ("vfprintf:2:c-format"); 215 xgettext_record_flag ("printf:1:c-format"); 216 xgettext_record_flag ("vprintf:1:c-format"); 217 xgettext_record_flag ("sprintf:2:c-format"); 218 xgettext_record_flag ("vsprintf:2:c-format"); 219 xgettext_record_flag ("snprintf:3:c-format"); 220 xgettext_record_flag ("vsnprintf:3:c-format"); 221 #if 0 /* These functions are not standard. 
*/ 222 /* <stdio.h> */ 223 xgettext_record_flag ("asprintf:2:c-format"); 224 xgettext_record_flag ("vasprintf:2:c-format"); 225 xgettext_record_flag ("dprintf:2:c-format"); 226 xgettext_record_flag ("vdprintf:2:c-format"); 227 xgettext_record_flag ("obstack_printf:2:c-format"); 228 xgettext_record_flag ("obstack_vprintf:2:c-format"); 229 /* <error.h> */ 230 xgettext_record_flag ("error:3:c-format"); 231 xgettext_record_flag ("error_at_line:5:c-format"); 232 /* <argp.h> */ 233 xgettext_record_flag ("argp_error:2:c-format"); 234 xgettext_record_flag ("argp_failure:2:c-format"); 235 #endif 236 237 xgettext_record_flag ("gettext:1:pass-boost-format"); 238 xgettext_record_flag ("dgettext:2:pass-boost-format"); 239 xgettext_record_flag ("dcgettext:2:pass-boost-format"); 240 xgettext_record_flag ("ngettext:1:pass-boost-format"); 241 xgettext_record_flag ("ngettext:2:pass-boost-format"); 242 xgettext_record_flag ("dngettext:2:pass-boost-format"); 243 xgettext_record_flag ("dngettext:3:pass-boost-format"); 244 xgettext_record_flag ("dcngettext:2:pass-boost-format"); 245 xgettext_record_flag ("dcngettext:3:pass-boost-format"); 246 xgettext_record_flag ("gettext_noop:1:pass-boost-format"); 247 xgettext_record_flag ("pgettext:2:pass-boost-format"); 248 xgettext_record_flag ("dpgettext:3:pass-boost-format"); 249 xgettext_record_flag ("dcpgettext:3:pass-boost-format"); 250 xgettext_record_flag ("npgettext:2:pass-boost-format"); 251 xgettext_record_flag ("npgettext:3:pass-boost-format"); 252 xgettext_record_flag ("dnpgettext:3:pass-boost-format"); 253 xgettext_record_flag ("dnpgettext:4:pass-boost-format"); 254 xgettext_record_flag ("dcnpgettext:3:pass-boost-format"); 255 xgettext_record_flag ("dcnpgettext:4:pass-boost-format"); 256 257 /* <boost/format.hpp> */ 258 xgettext_record_flag ("format:1:boost-format"); 259 } 260 261 void 262 init_flag_table_objc () 263 { 264 /* Since the settings done in init_flag_table_c() also have an effect for 265 the ObjectiveC parser, we don't 
have to repeat them here. */ 266 xgettext_record_flag ("gettext:1:pass-objc-format"); 267 xgettext_record_flag ("dgettext:2:pass-objc-format"); 268 xgettext_record_flag ("dcgettext:2:pass-objc-format"); 269 xgettext_record_flag ("ngettext:1:pass-objc-format"); 270 xgettext_record_flag ("ngettext:2:pass-objc-format"); 271 xgettext_record_flag ("dngettext:2:pass-objc-format"); 272 xgettext_record_flag ("dngettext:3:pass-objc-format"); 273 xgettext_record_flag ("dcngettext:2:pass-objc-format"); 274 xgettext_record_flag ("dcngettext:3:pass-objc-format"); 275 xgettext_record_flag ("gettext_noop:1:pass-objc-format"); 276 xgettext_record_flag ("pgettext:2:pass-objc-format"); 277 xgettext_record_flag ("dpgettext:3:pass-objc-format"); 278 xgettext_record_flag ("dcpgettext:3:pass-objc-format"); 279 xgettext_record_flag ("npgettext:2:pass-objc-format"); 280 xgettext_record_flag ("npgettext:3:pass-objc-format"); 281 xgettext_record_flag ("dnpgettext:3:pass-objc-format"); 282 xgettext_record_flag ("dnpgettext:4:pass-objc-format"); 283 xgettext_record_flag ("dcnpgettext:3:pass-objc-format"); 284 xgettext_record_flag ("dcnpgettext:4:pass-objc-format"); 285 xgettext_record_flag ("NSLocalizedString:1:pass-c-format"); 286 xgettext_record_flag ("NSLocalizedString:1:pass-objc-format"); 287 xgettext_record_flag ("_:1:pass-c-format"); 288 xgettext_record_flag ("_:1:pass-objc-format"); 289 xgettext_record_flag ("stringWithFormat::1:objc-format"); 290 xgettext_record_flag ("initWithFormat::1:objc-format"); 291 xgettext_record_flag ("stringByAppendingFormat::1:objc-format"); 292 xgettext_record_flag ("localizedStringWithFormat::1:objc-format"); 293 xgettext_record_flag ("appendFormat::1:objc-format"); 294 } 295 296 void 297 init_flag_table_gcc_internal () 298 { 299 xgettext_record_flag ("gettext:1:pass-gcc-internal-format"); 300 xgettext_record_flag ("dgettext:2:pass-gcc-internal-format"); 301 xgettext_record_flag ("dcgettext:2:pass-gcc-internal-format"); 302 xgettext_record_flag 
("ngettext:1:pass-gcc-internal-format"); 303 xgettext_record_flag ("ngettext:2:pass-gcc-internal-format"); 304 xgettext_record_flag ("dngettext:2:pass-gcc-internal-format"); 305 xgettext_record_flag ("dngettext:3:pass-gcc-internal-format"); 306 xgettext_record_flag ("dcngettext:2:pass-gcc-internal-format"); 307 xgettext_record_flag ("dcngettext:3:pass-gcc-internal-format"); 308 xgettext_record_flag ("gettext_noop:1:pass-gcc-internal-format"); 309 xgettext_record_flag ("pgettext:2:pass-gcc-internal-format"); 310 xgettext_record_flag ("dpgettext:3:pass-gcc-internal-format"); 311 xgettext_record_flag ("dcpgettext:3:pass-gcc-internal-format"); 312 xgettext_record_flag ("npgettext:2:pass-gcc-internal-format"); 313 xgettext_record_flag ("npgettext:3:pass-gcc-internal-format"); 314 xgettext_record_flag ("dnpgettext:3:pass-gcc-internal-format"); 315 xgettext_record_flag ("dnpgettext:4:pass-gcc-internal-format"); 316 xgettext_record_flag ("dcnpgettext:3:pass-gcc-internal-format"); 317 xgettext_record_flag ("dcnpgettext:4:pass-gcc-internal-format"); 318 #if 0 /* This should better be done inside GCC. 
*/ 319 /* grepping for ATTRIBUTE_PRINTF in gcc-3.3/gcc/?*.h */ 320 /* c-format.c */ 321 xgettext_record_flag ("status_warning:2:gcc-internal-format"); 322 /* c-tree.h */ 323 xgettext_record_flag ("pedwarn_c99:1:pass-gcc-internal-format"); 324 /* collect2.h */ 325 //xgettext_record_flag ("error:1:c-format"); // 3 different versions 326 xgettext_record_flag ("notice:1:c-format"); 327 //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions 328 xgettext_record_flag ("fatal_perror:1:c-format"); 329 /* cpplib.h */ 330 xgettext_record_flag ("cpp_error:3:c-format"); 331 xgettext_record_flag ("cpp_error_with_line:5:c-format"); 332 /* diagnostic.h */ 333 xgettext_record_flag ("diagnostic_set_info:2:pass-gcc-internal-format"); 334 xgettext_record_flag ("output_printf:2:gcc-internal-format"); 335 xgettext_record_flag ("output_verbatim:2:pass-gcc-internal-format"); 336 xgettext_record_flag ("verbatim:1:gcc-internal-format"); 337 xgettext_record_flag ("inform:1:pass-gcc-internal-format"); 338 /* gcc.h */ 339 //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions 340 //xgettext_record_flag ("error:1:c-format"); // 3 different versions 341 /* genattrtab.h */ 342 xgettext_record_flag ("attr_printf:2:pass-c-format"); 343 /* gengtype.h */ 344 xgettext_record_flag ("error_at_line:2:pass-c-format"); 345 xgettext_record_flag ("xvasprintf:2:pass-c-format"); 346 xgettext_record_flag ("xasprintf:1:pass-c-format"); 347 xgettext_record_flag ("oprintf:2:pass-c-format"); 348 /* gensupport.h */ 349 xgettext_record_flag ("message_with_line:2:pass-c-format"); 350 /* output.h */ 351 xgettext_record_flag ("output_operand_lossage:1:c-format"); 352 /* ra.h */ 353 xgettext_record_flag ("ra_debug_msg:2:pass-c-format"); 354 /* toplev.h */ 355 xgettext_record_flag ("fnotice:2:c-format"); 356 xgettext_record_flag ("fatal_io_error:2:gcc-internal-format"); 357 xgettext_record_flag ("error_for_asm:2:pass-gcc-internal-format"); 358 xgettext_record_flag 
("warning_for_asm:2:pass-gcc-internal-format"); 359 xgettext_record_flag ("error_with_file_and_line:3:pass-gcc-internal-format"); 360 xgettext_record_flag ("error_with_decl:2:pass-gcc-internal-format"); 361 xgettext_record_flag ("pedwarn:1:gcc-internal-format"); 362 xgettext_record_flag ("pedwarn_with_file_and_line:3:gcc-internal-format"); 363 xgettext_record_flag ("pedwarn_with_decl:2:gcc-internal-format"); 364 xgettext_record_flag ("sorry:1:gcc-internal-format"); 365 xgettext_record_flag ("error:1:pass-gcc-internal-format"); 366 xgettext_record_flag ("fatal_error:1:pass-gcc-internal-format"); 367 xgettext_record_flag ("internal_error:1:pass-gcc-internal-format"); 368 xgettext_record_flag ("warning:1:pass-gcc-internal-format"); 369 xgettext_record_flag ("warning_with_file_and_line:3:pass-gcc-internal-format"); 370 xgettext_record_flag ("warning_with_decl:2:pass-gcc-internal-format"); 371 /* f/com.h */ 372 xgettext_record_flag ("ffecom_get_invented_identifier:1:pass-c-format"); 373 /* f/sts.h */ 374 xgettext_record_flag ("ffests_printf:2:pass-c-format"); 375 /* java/java-tree.h */ 376 xgettext_record_flag ("parse_error_context:2:pass-c-format"); 377 #endif 378 } 379 380 381 /* ======================== Reading of characters. ======================== */ 382 383 /* Real filename, used in error messages about the input file. */ 384 static const char *real_file_name; 385 386 /* Logical filename and line number, used to label the extracted messages. */ 387 static char *logical_file_name; 388 static int line_number; 389 390 /* The input file stream. */ 391 static FILE *fp; 392 393 394 /* 0. Terminate line by \n, regardless whether the external representation of 395 a line terminator is LF (Unix), CR (Mac) or CR/LF (DOS/Windows). 396 It is debatable whether supporting CR/LF line terminators in C sources 397 on Unix is ISO C or POSIX compliant, but since GCC 3.3 now supports it 398 unconditionally, it must be OK. 
399 The so-called "text mode" in stdio on DOS/Windows translates CR/LF to \n 400 automatically, but here we also need this conversion on Unix. As a side 401 effect, on DOS/Windows we also parse CR/CR/LF into a single \n, but this 402 is not a problem. */ 403 404 405 static int 406 phase0_getc () 407 { 408 int c; 409 410 c = getc (fp); 411 if (c == EOF) 412 { 413 if (ferror (fp)) 414 error (EXIT_FAILURE, errno, _("error while reading \"%s\""), 415 real_file_name); 416 return EOF; 417 } 418 419 if (c == '\r') 420 { 421 int c1 = getc (fp); 422 423 if (c1 != EOF && c1 != '\n') 424 ungetc (c1, fp); 425 426 /* Seen line terminator CR or CR/LF. */ 427 return '\n'; 428 } 429 430 return c; 431 } 432 433 434 /* Supports only one pushback character, and not '\n'. */ 435 static inline void 436 phase0_ungetc (int c) 437 { 438 if (c != EOF) 439 ungetc (c, fp); 440 } 441 442 443 /* 1. line_number handling. Combine backslash-newline to nothing. */ 444 445 static unsigned char phase1_pushback[2]; 446 static int phase1_pushback_length; 447 448 449 static int 450 phase1_getc () 451 { 452 int c; 453 454 if (phase1_pushback_length) 455 { 456 c = phase1_pushback[--phase1_pushback_length]; 457 if (c == '\n') 458 ++line_number; 459 return c; 460 } 461 for (;;) 462 { 463 c = phase0_getc (); 464 switch (c) 465 { 466 case '\n': 467 ++line_number; 468 return '\n'; 469 470 case '\\': 471 c = phase0_getc (); 472 if (c != '\n') 473 { 474 phase0_ungetc (c); 475 return '\\'; 476 } 477 ++line_number; 478 break; 479 480 default: 481 return c; 482 } 483 } 484 } 485 486 487 /* Supports 2 characters of pushback. */ 488 static void 489 phase1_ungetc (int c) 490 { 491 switch (c) 492 { 493 case EOF: 494 break; 495 496 case '\n': 497 --line_number; 498 /* FALLTHROUGH */ 499 500 default: 501 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 502 abort (); 503 phase1_pushback[phase1_pushback_length++] = c; 504 break; 505 } 506 } 507 508 509 /* 2. Convert trigraphs to their single character equivalents. 
Most 510 sane human beings vomit copiously at the mention of trigraphs, which 511 is why they are an option. */ 512 513 static unsigned char phase2_pushback[1]; 514 static int phase2_pushback_length; 515 516 517 static int 518 phase2_getc () 519 { 520 int c; 521 522 if (phase2_pushback_length) 523 return phase2_pushback[--phase2_pushback_length]; 524 if (!trigraphs) 525 return phase1_getc (); 526 527 c = phase1_getc (); 528 if (c != '?') 529 return c; 530 c = phase1_getc (); 531 if (c != '?') 532 { 533 phase1_ungetc (c); 534 return '?'; 535 } 536 c = phase1_getc (); 537 switch (c) 538 { 539 case '(': 540 return '['; 541 case '/': 542 return '\\'; 543 case ')': 544 return ']'; 545 case '\'': 546 return '^'; 547 case '<': 548 return '{'; 549 case '!': 550 return '|'; 551 case '>': 552 return '}'; 553 case '-': 554 return '~'; 555 case '#': 556 return '='; 557 } 558 phase1_ungetc (c); 559 phase1_ungetc ('?'); 560 return '?'; 561 } 562 563 564 /* Supports only one pushback character. */ 565 static void 566 phase2_ungetc (int c) 567 { 568 if (c != EOF) 569 { 570 if (phase2_pushback_length == SIZEOF (phase2_pushback)) 571 abort (); 572 phase2_pushback[phase2_pushback_length++] = c; 573 } 574 } 575 576 577 /* 3. Concatenate each line ending in backslash (\) with the following 578 line. Basically, all you need to do is elide "\\\n" sequences from 579 the input. */ 580 581 static unsigned char phase3_pushback[2]; 582 static int phase3_pushback_length; 583 584 585 static int 586 phase3_getc () 587 { 588 if (phase3_pushback_length) 589 return phase3_pushback[--phase3_pushback_length]; 590 for (;;) 591 { 592 int c = phase2_getc (); 593 if (c != '\\') 594 return c; 595 c = phase2_getc (); 596 if (c != '\n') 597 { 598 phase2_ungetc (c); 599 return '\\'; 600 } 601 } 602 } 603 604 605 /* Supports 2 characters of pushback. 
*/ 606 static void 607 phase3_ungetc (int c) 608 { 609 if (c != EOF) 610 { 611 if (phase3_pushback_length == SIZEOF (phase3_pushback)) 612 abort (); 613 phase3_pushback[phase3_pushback_length++] = c; 614 } 615 } 616 617 618 /* Accumulating comments. */ 619 620 static char *buffer; 621 static size_t bufmax; 622 static size_t buflen; 623 624 static inline void 625 comment_start () 626 { 627 buflen = 0; 628 } 629 630 static inline void 631 comment_add (int c) 632 { 633 if (buflen >= bufmax) 634 { 635 bufmax = 2 * bufmax + 10; 636 buffer = xrealloc (buffer, bufmax); 637 } 638 buffer[buflen++] = c; 639 } 640 641 static inline void 642 comment_line_end (size_t chars_to_remove) 643 { 644 buflen -= chars_to_remove; 645 while (buflen >= 1 646 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 647 --buflen; 648 if (chars_to_remove == 0 && buflen >= bufmax) 649 { 650 bufmax = 2 * bufmax + 10; 651 buffer = xrealloc (buffer, bufmax); 652 } 653 buffer[buflen] = '\0'; 654 savable_comment_add (buffer); 655 } 656 657 658 /* These are for tracking whether comments count as immediately before 659 keyword. */ 660 static int last_comment_line; 661 static int last_non_comment_line; 662 static int newline_count; 663 664 665 /* 4. Replace each comment that is not inside a character constant or 666 string literal with a space character. We need to remember the 667 comment for later, because it may be attached to a keyword string. 668 We also optionally understand C++ comments. */ 669 670 static int 671 phase4_getc () 672 { 673 int c; 674 bool last_was_star; 675 676 c = phase3_getc (); 677 if (c != '/') 678 return c; 679 c = phase3_getc (); 680 switch (c) 681 { 682 default: 683 phase3_ungetc (c); 684 return '/'; 685 686 case '*': 687 /* C comment. */ 688 comment_start (); 689 last_was_star = false; 690 for (;;) 691 { 692 c = phase3_getc (); 693 if (c == EOF) 694 break; 695 /* We skip all leading white space, but not EOLs. 
*/ 696 if (!(buflen == 0 && (c == ' ' || c == '\t'))) 697 comment_add (c); 698 switch (c) 699 { 700 case '\n': 701 comment_line_end (1); 702 comment_start (); 703 last_was_star = false; 704 continue; 705 706 case '*': 707 last_was_star = true; 708 continue; 709 710 case '/': 711 if (last_was_star) 712 { 713 comment_line_end (2); 714 break; 715 } 716 /* FALLTHROUGH */ 717 718 default: 719 last_was_star = false; 720 continue; 721 } 722 break; 723 } 724 last_comment_line = newline_count; 725 return ' '; 726 727 case '/': 728 /* C++ or ISO C 99 comment. */ 729 comment_start (); 730 for (;;) 731 { 732 c = phase3_getc (); 733 if (c == '\n' || c == EOF) 734 break; 735 /* We skip all leading white space, but not EOLs. */ 736 if (!(buflen == 0 && (c == ' ' || c == '\t'))) 737 comment_add (c); 738 } 739 comment_line_end (0); 740 last_comment_line = newline_count; 741 return '\n'; 742 } 743 } 744 745 746 /* Supports only one pushback character. */ 747 static void 748 phase4_ungetc (int c) 749 { 750 phase3_ungetc (c); 751 } 752 753 754 /* ========================== Reading of tokens. ========================== */ 755 756 757 /* True if ObjectiveC extensions are recognized. */ 758 static bool objc_extensions; 759 760 enum token_type_ty 761 { 762 token_type_character_constant, /* 'x' */ 763 token_type_eof, 764 token_type_eoln, 765 token_type_hash, /* # */ 766 token_type_lparen, /* ( */ 767 token_type_rparen, /* ) */ 768 token_type_comma, /* , */ 769 token_type_colon, /* : */ 770 token_type_name, /* abc */ 771 token_type_number, /* 2.7 */ 772 token_type_string_literal, /* "abc" */ 773 token_type_symbol, /* < > = etc. 
*/ 774 token_type_objc_special, /* @ */ 775 token_type_white_space 776 }; 777 typedef enum token_type_ty token_type_ty; 778 779 typedef struct token_ty token_ty; 780 struct token_ty 781 { 782 token_type_ty type; 783 char *string; /* for token_type_name, token_type_string_literal */ 784 refcounted_string_list_ty *comment; /* for token_type_string_literal, 785 token_type_objc_special */ 786 long number; 787 int line_number; 788 }; 789 790 791 /* 7. Replace escape sequences within character strings with their 792 single character equivalents. This is called from phase 5, because 793 we don't have to worry about the #include argument. There are 794 pathological cases which could bite us (like the DOS directory 795 separator), but just pretend it can't happen. */ 796 797 #define P7_QUOTES (1000 + '"') 798 #define P7_QUOTE (1000 + '\'') 799 #define P7_NEWLINE (1000 + '\n') 800 801 static int 802 phase7_getc () 803 { 804 int c, n, j; 805 806 /* Use phase 3, because phase 4 elides comments. */ 807 c = phase3_getc (); 808 809 /* Return a magic newline indicator, so that we can distinguish 810 between the user requesting a newline in the string (e.g. using 811 "\n" or "\012") from the user failing to terminate the string or 812 character constant. The ANSI C standard says: 3.1.3.4 Character 813 Constants contain ``any character except single quote, backslash or 814 newline; or an escape sequence'' and 3.1.4 String Literals contain 815 ``any character except double quote, backslash or newline; or an 816 escape sequence''. 817 818 Most compilers give a fatal error in this case, however gcc is 819 stupidly silent, even though this is a very common typo. OK, so 820 gcc --pedantic will tell me, but that gripes about too much other 821 stuff. Could I have a ``gcc -Wnewline-in-string'' option, or 822 better yet a ``gcc -fno-newline-in-string'' option, please? 
Gcc is 823 also inconsistent between string literals and character constants: 824 you may not embed newlines in character constants; try it, you get 825 a useful diagnostic. --PMiller */ 826 if (c == '\n') 827 return P7_NEWLINE; 828 829 if (c == '"') 830 return P7_QUOTES; 831 if (c == '\'') 832 return P7_QUOTE; 833 if (c != '\\') 834 return c; 835 c = phase3_getc (); 836 switch (c) 837 { 838 default: 839 /* Unknown escape sequences really should be an error, but just 840 ignore them, and let the real compiler complain. */ 841 phase3_ungetc (c); 842 return '\\'; 843 844 case '"': 845 case '\'': 846 case '?': 847 case '\\': 848 return c; 849 850 case 'a': 851 return '\a'; 852 case 'b': 853 return '\b'; 854 855 /* The \e escape is preculiar to gcc, and assumes an ASCII 856 character set (or superset). We don't provide support for it 857 here. */ 858 859 case 'f': 860 return '\f'; 861 case 'n': 862 return '\n'; 863 case 'r': 864 return '\r'; 865 case 't': 866 return '\t'; 867 case 'v': 868 return '\v'; 869 870 case 'x': 871 c = phase3_getc (); 872 switch (c) 873 { 874 default: 875 phase3_ungetc (c); 876 phase3_ungetc ('x'); 877 return '\\'; 878 879 case '0': case '1': case '2': case '3': case '4': 880 case '5': case '6': case '7': case '8': case '9': 881 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 882 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 883 break; 884 } 885 n = 0; 886 for (;;) 887 { 888 switch (c) 889 { 890 default: 891 phase3_ungetc (c); 892 return n; 893 894 case '0': case '1': case '2': case '3': case '4': 895 case '5': case '6': case '7': case '8': case '9': 896 n = n * 16 + c - '0'; 897 break; 898 899 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 900 n = n * 16 + 10 + c - 'A'; 901 break; 902 903 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 904 n = n * 16 + 10 + c - 'a'; 905 break; 906 } 907 c = phase3_getc (); 908 } 909 return n; 910 911 case '0': case '1': case '2': case '3': 912 case '4': case 
'5': case '6': case '7': 913 n = 0; 914 for (j = 0; j < 3; ++j) 915 { 916 n = n * 8 + c - '0'; 917 c = phase3_getc (); 918 switch (c) 919 { 920 default: 921 break; 922 923 case '0': case '1': case '2': case '3': 924 case '4': case '5': case '6': case '7': 925 continue; 926 } 927 break; 928 } 929 phase3_ungetc (c); 930 return n; 931 } 932 } 933 934 935 static void 936 phase7_ungetc (int c) 937 { 938 phase3_ungetc (c); 939 } 940 941 942 /* Free the memory pointed to by a 'struct token_ty'. */ 943 static inline void 944 free_token (token_ty *tp) 945 { 946 if (tp->type == token_type_name || tp->type == token_type_string_literal) 947 free (tp->string); 948 if (tp->type == token_type_string_literal 949 || tp->type == token_type_objc_special) 950 drop_reference (tp->comment); 951 } 952 953 954 /* 5. Parse each resulting logical line as preprocessing tokens and 955 white space. Preprocessing tokens and C tokens don't always match. */ 956 957 static token_ty phase5_pushback[1]; 958 static int phase5_pushback_length; 959 960 961 static void 962 phase5_get (token_ty *tp) 963 { 964 static char *buffer; 965 static int bufmax; 966 int bufpos; 967 int c; 968 969 if (phase5_pushback_length) 970 { 971 *tp = phase5_pushback[--phase5_pushback_length]; 972 return; 973 } 974 tp->string = NULL; 975 tp->number = 0; 976 tp->line_number = line_number; 977 c = phase4_getc (); 978 switch (c) 979 { 980 case EOF: 981 tp->type = token_type_eof; 982 return; 983 984 case '\n': 985 tp->type = token_type_eoln; 986 return; 987 988 case ' ': 989 case '\f': 990 case '\t': 991 for (;;) 992 { 993 c = phase4_getc (); 994 switch (c) 995 { 996 case ' ': 997 case '\f': 998 case '\t': 999 continue; 1000 1001 default: 1002 phase4_ungetc (c); 1003 break; 1004 } 1005 break; 1006 } 1007 tp->type = token_type_white_space; 1008 return; 1009 1010 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 1011 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': 1012 case 'O': case 'P': 
case 'Q': case 'R': case 'S': case 'T': case 'U': 1013 case 'V': case 'W': case 'X': case 'Y': case 'Z': 1014 case '_': 1015 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 1016 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 1017 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': 1018 case 'v': case 'w': case 'x': case 'y': case 'z': 1019 bufpos = 0; 1020 for (;;) 1021 { 1022 if (bufpos >= bufmax) 1023 { 1024 bufmax = 2 * bufmax + 10; 1025 buffer = xrealloc (buffer, bufmax); 1026 } 1027 buffer[bufpos++] = c; 1028 c = phase4_getc (); 1029 switch (c) 1030 { 1031 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1032 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 1033 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 1034 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 1035 case 'Y': case 'Z': 1036 case '_': 1037 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1038 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 1039 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 1040 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 1041 case 'y': case 'z': 1042 case '0': case '1': case '2': case '3': case '4': 1043 case '5': case '6': case '7': case '8': case '9': 1044 continue; 1045 1046 default: 1047 phase4_ungetc (c); 1048 break; 1049 } 1050 break; 1051 } 1052 if (bufpos >= bufmax) 1053 { 1054 bufmax = 2 * bufmax + 10; 1055 buffer = xrealloc (buffer, bufmax); 1056 } 1057 buffer[bufpos] = 0; 1058 tp->string = xstrdup (buffer); 1059 tp->type = token_type_name; 1060 return; 1061 1062 case '.': 1063 c = phase4_getc (); 1064 phase4_ungetc (c); 1065 switch (c) 1066 { 1067 default: 1068 tp->type = token_type_symbol; 1069 return; 1070 1071 case '0': case '1': case '2': case '3': case '4': 1072 case '5': case '6': case '7': case '8': case '9': 1073 c = '.'; 1074 break; 1075 } 1076 /* FALLTHROUGH */ 1077 1078 case '0': case '1': case 
'2': case '3': case '4': 1079 case '5': case '6': case '7': case '8': case '9': 1080 /* The preprocessing number token is more "generous" than the C 1081 number tokens. This is mostly due to token pasting (another 1082 thing we can ignore here). */ 1083 bufpos = 0; 1084 for (;;) 1085 { 1086 if (bufpos >= bufmax) 1087 { 1088 bufmax = 2 * bufmax + 10; 1089 buffer = xrealloc (buffer, bufmax); 1090 } 1091 buffer[bufpos++] = c; 1092 c = phase4_getc (); 1093 switch (c) 1094 { 1095 case 'e': 1096 case 'E': 1097 if (bufpos >= bufmax) 1098 { 1099 bufmax = 2 * bufmax + 10; 1100 buffer = xrealloc (buffer, bufmax); 1101 } 1102 buffer[bufpos++] = c; 1103 c = phase4_getc (); 1104 if (c != '+' || c != '-') 1105 { 1106 phase4_ungetc (c); 1107 break; 1108 } 1109 continue; 1110 1111 case 'A': case 'B': case 'C': case 'D': case 'F': 1112 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 1113 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 1114 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 1115 case 'Y': case 'Z': 1116 case 'a': case 'b': case 'c': case 'd': case 'f': 1117 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 1118 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 1119 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 1120 case 'y': case 'z': 1121 case '0': case '1': case '2': case '3': case '4': 1122 case '5': case '6': case '7': case '8': case '9': 1123 case '.': 1124 continue; 1125 1126 default: 1127 phase4_ungetc (c); 1128 break; 1129 } 1130 break; 1131 } 1132 if (bufpos >= bufmax) 1133 { 1134 bufmax = 2 * bufmax + 10; 1135 buffer = xrealloc (buffer, bufmax); 1136 } 1137 buffer[bufpos] = 0; 1138 tp->type = token_type_number; 1139 tp->number = atol (buffer); 1140 return; 1141 1142 case '\'': 1143 /* We could worry about the 'L' before wide character constants, 1144 but ignoring it has no effect unless one of the keywords is 1145 "L". Just pretend it won't happen. 
Also, we don't need to 1146 remember the character constant. */ 1147 for (;;) 1148 { 1149 c = phase7_getc (); 1150 if (c == P7_NEWLINE) 1151 { 1152 error_with_progname = false; 1153 error (0, 0, _("%s:%d: warning: unterminated character constant"), 1154 logical_file_name, line_number - 1); 1155 error_with_progname = true; 1156 phase7_ungetc ('\n'); 1157 break; 1158 } 1159 if (c == EOF || c == P7_QUOTE) 1160 break; 1161 } 1162 tp->type = token_type_character_constant; 1163 return; 1164 1165 case '"': 1166 /* We could worry about the 'L' before wide string constants, 1167 but since gettext's argument is not a wide character string, 1168 let the compiler complain about the argument not matching the 1169 prototype. Just pretend it won't happen. */ 1170 bufpos = 0; 1171 for (;;) 1172 { 1173 c = phase7_getc (); 1174 if (c == P7_NEWLINE) 1175 { 1176 error_with_progname = false; 1177 error (0, 0, _("%s:%d: warning: unterminated string literal"), 1178 logical_file_name, line_number - 1); 1179 error_with_progname = true; 1180 phase7_ungetc ('\n'); 1181 break; 1182 } 1183 if (c == EOF || c == P7_QUOTES) 1184 break; 1185 if (c == P7_QUOTE) 1186 c = '\''; 1187 if (bufpos >= bufmax) 1188 { 1189 bufmax = 2 * bufmax + 10; 1190 buffer = xrealloc (buffer, bufmax); 1191 } 1192 buffer[bufpos++] = c; 1193 } 1194 if (bufpos >= bufmax) 1195 { 1196 bufmax = 2 * bufmax + 10; 1197 buffer = xrealloc (buffer, bufmax); 1198 } 1199 buffer[bufpos] = 0; 1200 tp->type = token_type_string_literal; 1201 tp->string = xstrdup (buffer); 1202 tp->comment = add_reference (savable_comment); 1203 return; 1204 1205 case '(': 1206 tp->type = token_type_lparen; 1207 return; 1208 1209 case ')': 1210 tp->type = token_type_rparen; 1211 return; 1212 1213 case ',': 1214 tp->type = token_type_comma; 1215 return; 1216 1217 case '#': 1218 tp->type = token_type_hash; 1219 return; 1220 1221 case ':': 1222 tp->type = token_type_colon; 1223 return; 1224 1225 case '@': 1226 if (objc_extensions) 1227 { 1228 tp->type = 
token_type_objc_special;
          tp->comment = add_reference (savable_comment);
          return;
        }
      /* FALLTHROUGH */

    default:
      /* We could carefully recognize each of the 2 and 3 character
         operators, but it is not necessary, as we only need to recognize
         gettext invocations.  Don't bother.  */
      tp->type = token_type_symbol;
      return;
    }
}


/* Push the token *TP back so that it can be read again later.
   An end-of-file token is never stored.  The token is copied by value
   into phase5_pushback; the function aborts when that array is already
   full.  Supports only one pushback token.  */
static void
phase5_unget (token_ty *tp)
{
  if (tp->type != token_type_eof)
    {
      /* No room left in the pushback array: this is a programming error.  */
      if (phase5_pushback_length == SIZEOF (phase5_pushback))
        abort ();
      phase5_pushback[phase5_pushback_length++] = *tp;
    }
}


/* X. Recognize a leading # symbol.  Leave leading hash as a hash, but
   turn hash in the middle of a line into a plain symbol token.  This
   makes the phase 6 easier.

   Stores the next token in *TP.  The position within the line is tracked
   in a function-local static flag, so the result depends on the tokens
   returned by earlier calls.  */

static void
phaseX_get (token_ty *tp)
{
  static bool middle;   /* false at the beginning of a line, true otherwise.  */

  phase5_get (tp);

  /* An end-of-line or end-of-file token means the next token starts a
     new line.  */
  if (tp->type == token_type_eoln || tp->type == token_type_eof)
    middle = false;
  else
    {
      if (middle)
        {
          /* Turn hash in the middle of a line into a plain symbol token.  */
          if (tp->type == token_type_hash)
            tp->type = token_type_symbol;
        }
      else
        {
          /* When we see leading whitespace followed by a hash sign,
             discard the leading white space token.  The hash is all
             phase 6 is interested in.  */
          if (tp->type == token_type_white_space)
            {
              token_ty next;

              /* Look one token ahead; if it is not a hash, push it back
                 and return the white space token unchanged.  */
              phase5_get (&next);
              if (next.type == token_type_hash)
                *tp = next;
              else
                phase5_unget (&next);
            }
          /* Whatever was returned, the line now has a first token.  */
          middle = true;
        }
    }
}


/* 6. Recognize and carry out directives (it also expands macros on
   non-directive lines, which we do not do here).  The only directives
   we care about are the #line and #define directives.  We throw all the
   others away.
*/ 1303 1304 static token_ty phase6_pushback[2]; 1305 static int phase6_pushback_length; 1306 1307 1308 static void 1309 phase6_get (token_ty *tp) 1310 { 1311 static token_ty *buf; 1312 static int bufmax; 1313 int bufpos; 1314 int j; 1315 1316 if (phase6_pushback_length) 1317 { 1318 *tp = phase6_pushback[--phase6_pushback_length]; 1319 return; 1320 } 1321 for (;;) 1322 { 1323 /* Get the next token. If it is not a '#' at the beginning of a 1324 line (ignoring whitespace), return immediately. */ 1325 phaseX_get (tp); 1326 if (tp->type != token_type_hash) 1327 return; 1328 1329 /* Accumulate the rest of the directive in a buffer, until the 1330 "define" keyword is seen or until end of line. */ 1331 bufpos = 0; 1332 for (;;) 1333 { 1334 phaseX_get (tp); 1335 if (tp->type == token_type_eoln || tp->type == token_type_eof) 1336 break; 1337 1338 /* Before the "define" keyword and inside other directives 1339 white space is irrelevant. So just throw it away. */ 1340 if (tp->type != token_type_white_space) 1341 { 1342 /* If it is a #define directive, return immediately, 1343 thus treating the body of the #define directive like 1344 normal input. */ 1345 if (bufpos == 0 1346 && tp->type == token_type_name 1347 && strcmp (tp->string, "define") == 0) 1348 return; 1349 1350 /* Accumulate. */ 1351 if (bufpos >= bufmax) 1352 { 1353 bufmax = 2 * bufmax + 10; 1354 buf = xrealloc (buf, bufmax * sizeof (buf[0])); 1355 } 1356 buf[bufpos++] = *tp; 1357 } 1358 } 1359 1360 /* If it is a #line directive, with no macros to expand, act on 1361 it. Ignore all other directives. 
*/ 1362 if (bufpos >= 3 && buf[0].type == token_type_name 1363 && strcmp (buf[0].string, "line") == 0 1364 && buf[1].type == token_type_number 1365 && buf[2].type == token_type_string_literal) 1366 { 1367 logical_file_name = xstrdup (buf[2].string); 1368 line_number = buf[1].number; 1369 } 1370 if (bufpos >= 2 && buf[0].type == token_type_number 1371 && buf[1].type == token_type_string_literal) 1372 { 1373 logical_file_name = xstrdup (buf[1].string); 1374 line_number = buf[0].number; 1375 } 1376 1377 /* Release the storage held by the directive. */ 1378 for (j = 0; j < bufpos; ++j) 1379 free_token (&buf[j]); 1380 1381 /* We must reset the selected comments. */ 1382 savable_comment_reset (); 1383 } 1384 } 1385 1386 1387 /* Supports 2 tokens of pushback. */ 1388 static void 1389 phase6_unget (token_ty *tp) 1390 { 1391 if (tp->type != token_type_eof) 1392 { 1393 if (phase6_pushback_length == SIZEOF (phase6_pushback)) 1394 abort (); 1395 phase6_pushback[phase6_pushback_length++] = *tp; 1396 } 1397 } 1398 1399 1400 /* 8a. Convert ISO C 99 section 7.8.1 format string directives to string 1401 literal placeholders. */ 1402 1403 /* Test for an ISO C 99 section 7.8.1 format string directive. 
*/ 1404 static bool 1405 is_inttypes_macro (const char *name) 1406 { 1407 /* Syntax: 1408 P R I { d | i | o | u | x | X } 1409 { { | LEAST | FAST } { 8 | 16 | 32 | 64 } | MAX | PTR } */ 1410 if (name[0] == 'P' && name[1] == 'R' && name[2] == 'I') 1411 { 1412 name += 3; 1413 if (name[0] == 'd' || name[0] == 'i' || name[0] == 'o' || name[0] == 'u' 1414 || name[0] == 'x' || name[0] == 'X') 1415 { 1416 name += 1; 1417 if (name[0] == 'M' && name[1] == 'A' && name[2] == 'X' 1418 && name[3] == '\0') 1419 return true; 1420 if (name[0] == 'P' && name[1] == 'T' && name[2] == 'R' 1421 && name[3] == '\0') 1422 return true; 1423 if (name[0] == 'L' && name[1] == 'E' && name[2] == 'A' 1424 && name[3] == 'S' && name[4] == 'T') 1425 name += 5; 1426 else if (name[0] == 'F' && name[1] == 'A' && name[2] == 'S' 1427 && name[3] == 'T') 1428 name += 4; 1429 if (name[0] == '8' && name[1] == '\0') 1430 return true; 1431 if (name[0] == '1' && name[1] == '6' && name[2] == '\0') 1432 return true; 1433 if (name[0] == '3' && name[1] == '2' && name[2] == '\0') 1434 return true; 1435 if (name[0] == '6' && name[1] == '4' && name[2] == '\0') 1436 return true; 1437 } 1438 } 1439 return false; 1440 } 1441 1442 static void 1443 phase8a_get (token_ty *tp) 1444 { 1445 phase6_get (tp); 1446 if (tp->type == token_type_name && is_inttypes_macro (tp->string)) 1447 { 1448 /* Turn PRIdXXX into "<PRIdXXX>". */ 1449 char *new_string = xasprintf ("<%s>", tp->string); 1450 free (tp->string); 1451 tp->string = new_string; 1452 tp->comment = add_reference (savable_comment); 1453 tp->type = token_type_string_literal; 1454 } 1455 } 1456 1457 /* Supports 2 tokens of pushback. */ 1458 static inline void 1459 phase8a_unget (token_ty *tp) 1460 { 1461 phase6_unget (tp); 1462 } 1463 1464 1465 /* 8b. Drop whitespace. 
*/ 1466 static void 1467 phase8b_get (token_ty *tp) 1468 { 1469 for (;;) 1470 { 1471 phase8a_get (tp); 1472 1473 if (tp->type == token_type_white_space) 1474 continue; 1475 if (tp->type == token_type_eoln) 1476 { 1477 /* We have to track the last occurrence of a string. One 1478 mode of xgettext allows to group an extracted message 1479 with a comment for documentation. The rule which states 1480 which comment is assumed to be grouped with the message 1481 says it should immediately precede it. Our 1482 interpretation: between the last line of the comment and 1483 the line in which the keyword is found must be no line 1484 with non-white space tokens. */ 1485 ++newline_count; 1486 if (last_non_comment_line > last_comment_line) 1487 savable_comment_reset (); 1488 continue; 1489 } 1490 break; 1491 } 1492 } 1493 1494 /* Supports 2 tokens of pushback. */ 1495 static inline void 1496 phase8b_unget (token_ty *tp) 1497 { 1498 phase8a_unget (tp); 1499 } 1500 1501 1502 /* 8c. In ObjectiveC mode, drop '@' before a literal string. We need to 1503 do this before performing concatenation of adjacent string literals. */ 1504 static void 1505 phase8c_get (token_ty *tp) 1506 { 1507 token_ty tmp; 1508 1509 phase8b_get (tp); 1510 if (tp->type != token_type_objc_special) 1511 return; 1512 phase8b_get (&tmp); 1513 if (tmp.type != token_type_string_literal) 1514 { 1515 phase8b_unget (&tmp); 1516 return; 1517 } 1518 /* Drop the '@' token and return immediately the following string. */ 1519 drop_reference (tmp.comment); 1520 tmp.comment = tp->comment; 1521 *tp = tmp; 1522 } 1523 1524 /* Supports only one pushback token. */ 1525 static inline void 1526 phase8c_unget (token_ty *tp) 1527 { 1528 phase8b_unget (tp); 1529 } 1530 1531 1532 /* 8. Concatenate adjacent string literals to form single string 1533 literals (because we don't expand macros, there are a few things we 1534 will miss). 
*/ 1535 1536 static void 1537 phase8_get (token_ty *tp) 1538 { 1539 phase8c_get (tp); 1540 if (tp->type != token_type_string_literal) 1541 return; 1542 for (;;) 1543 { 1544 token_ty tmp; 1545 size_t len; 1546 1547 phase8c_get (&tmp); 1548 if (tmp.type != token_type_string_literal) 1549 { 1550 phase8c_unget (&tmp); 1551 return; 1552 } 1553 len = strlen (tp->string); 1554 tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1); 1555 strcpy (tp->string + len, tmp.string); 1556 free (tmp.string); 1557 } 1558 } 1559 1560 1561 /* ===================== Reading of high-level tokens. ==================== */ 1562 1563 1564 enum xgettext_token_type_ty 1565 { 1566 xgettext_token_type_eof, 1567 xgettext_token_type_keyword, 1568 xgettext_token_type_symbol, 1569 xgettext_token_type_lparen, 1570 xgettext_token_type_rparen, 1571 xgettext_token_type_comma, 1572 xgettext_token_type_colon, 1573 xgettext_token_type_string_literal, 1574 xgettext_token_type_other 1575 }; 1576 typedef enum xgettext_token_type_ty xgettext_token_type_ty; 1577 1578 typedef struct xgettext_token_ty xgettext_token_ty; 1579 struct xgettext_token_ty 1580 { 1581 xgettext_token_type_ty type; 1582 1583 /* This field is used only for xgettext_token_type_keyword. */ 1584 const struct callshapes *shapes; 1585 1586 /* This field is used only for xgettext_token_type_string_literal, 1587 xgettext_token_type_keyword, xgettext_token_type_symbol. */ 1588 char *string; 1589 1590 /* This field is used only for xgettext_token_type_string_literal. */ 1591 refcounted_string_list_ty *comment; 1592 1593 /* These fields are only for 1594 xgettext_token_type_keyword, 1595 xgettext_token_type_string_literal. */ 1596 lex_pos_ty pos; 1597 }; 1598 1599 1600 /* 9. Convert the remaining preprocessing tokens to C tokens and 1601 discards any white space from the translation unit. 
*/

/* Read one phase 8 token and translate it into the high-level token *TP.
   Names become keyword tokens when found in the keyword table of the
   current language mode, and symbol tokens otherwise.  For every token
   except end of file, last_non_comment_line is set to the current
   newline count.  */
static void
x_c_lex (xgettext_token_ty *tp)
{
  token_ty token;

  phase8_get (&token);

  /* End of file is the only token that does not count as a non-comment
     line.  */
  if (token.type == token_type_eof)
    {
      tp->type = xgettext_token_type_eof;
      return;
    }

  last_non_comment_line = newline_count;

  switch (token.type)
    {
    case token_type_name:
      {
        hash_table *keywords = objc_extensions ? &objc_keywords : &c_keywords;
        void *keyword_value;

        if (hash_find_entry (keywords, token.string, strlen (token.string),
                             &keyword_value)
            == 0)
          {
            /* A keyword carries its call shapes and its position.  */
            tp->type = xgettext_token_type_keyword;
            tp->shapes = (const struct callshapes *) keyword_value;
            tp->pos.file_name = logical_file_name;
            tp->pos.line_number = token.line_number;
          }
        else
          tp->type = xgettext_token_type_symbol;
        /* In both cases the name string is handed over to the caller.  */
        tp->string = token.string;
      }
      return;

    case token_type_lparen:
      tp->type = xgettext_token_type_lparen;
      return;

    case token_type_rparen:
      tp->type = xgettext_token_type_rparen;
      return;

    case token_type_comma:
      tp->type = xgettext_token_type_comma;
      return;

    case token_type_colon:
      tp->type = xgettext_token_type_colon;
      return;

    case token_type_string_literal:
      /* String and comment reference are handed over to the caller.  */
      tp->type = xgettext_token_type_string_literal;
      tp->string = token.string;
      tp->comment = token.comment;
      tp->pos.file_name = logical_file_name;
      tp->pos.line_number = token.line_number;
      return;

    case token_type_objc_special:
      /* A lone '@' is of no further interest; release its comment
         reference and report it like any other token.  */
      drop_reference (token.comment);
      /* FALLTHROUGH */

    default:
      tp->type = xgettext_token_type_other;
      return;
    }
}


/* ========================= Extracting strings.  
========================== */


/* Context lookup table.  Set by extract_whole_file from its FLAG_TABLE
   argument and consulted for every keyword or symbol token.  */
static flag_context_list_table_ty *flag_context_list_table;


/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid C or C++ program, and leave the complaints about
   the grammar to the compiler.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */


/* Extract messages until the next balanced closing parenthesis.
   Extracted messages are added to MLP.
   OUTER_CONTEXT is the context of the enclosing argument; CONTEXT_ITER
   yields the per-argument contexts that are combined with it.
   ARGPARSER collects the string arguments of the call being scanned; it
   is passed to arglist_parser_done on every return path.
   Return true upon eof, false upon closing parenthesis.  */
static bool
extract_parenthesized (message_list_ty *mlp,
                       flag_context_ty outer_context,
                       flag_context_list_iterator_ty context_iter,
                       struct arglist_parser *argparser)
{
  /* Current argument number.  */
  int arg = 1;
  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
  int state;
  /* Parameters of the keyword just seen.  Defined only in state 1.  */
  const struct callshapes *next_shapes = NULL;
  /* Context iterator that will be used if the next token is a '('.  */
  flag_context_list_iterator_ty next_context_iter =
    passthrough_context_list_iterator;
  /* Context iterator that will be used if the next token is a ':'.
     (Objective C selector syntax.)  */
  flag_context_list_iterator_ty selectorcall_context_iter =
    passthrough_context_list_iterator;
  /* Current context.  */
  flag_context_ty inner_context =
    inherited_context (outer_context,
                       flag_context_list_iterator_advance (&context_iter));

  /* Start state is 0.  */
  state = 0;

  for (;;)
    {
      xgettext_token_ty token;

      x_c_lex (&token);
      switch (token.type)
        {
        case xgettext_token_type_keyword:
          /* Remember the call shapes in case a '(' follows, then share
             the context lookup with plain symbols.  */
          next_shapes = token.shapes;
          state = 1;
          goto keyword_or_symbol;

        case xgettext_token_type_symbol:
          state = 0;
        keyword_or_symbol:
          /* Look up the contexts to use if this name turns out to be
             called as a function.  */
          next_context_iter =
            flag_context_list_iterator (
              flag_context_list_table_lookup (
                flag_context_list_table,
                token.string, strlen (token.string)));
          if (objc_extensions)
            {
              /* In ObjectiveC mode, also look up the name with a ':'
                 appended, for use if a ':' follows.  The name string is
                 enlarged in place to hold the extra character.  */
              size_t token_string_len = strlen (token.string);
              token.string = xrealloc (token.string, token_string_len + 2);
              token.string[token_string_len] = ':';
              token.string[token_string_len + 1] = '\0';
              selectorcall_context_iter =
                flag_context_list_iterator (
                  flag_context_list_table_lookup (
                    flag_context_list_table,
                    token.string, token_string_len + 1));
            }
          /* The name itself is not needed beyond the lookups.  */
          free (token.string);
          continue;

        case xgettext_token_type_lparen:
          /* Scan the parenthesized group recursively with a fresh
             argument parser.  The call shapes are passed along only
             when the '(' directly follows a keyword.  A true result
             means end of file was reached inside the group.  */
          if (extract_parenthesized (mlp, inner_context, next_context_iter,
                                     arglist_parser_alloc (mlp,
                                                           state ? next_shapes : NULL)))
            {
              arglist_parser_done (argparser, arg);
              return true;
            }
          next_context_iter = null_context_list_iterator;
          selectorcall_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case xgettext_token_type_rparen:
          /* End of this group.  */
          arglist_parser_done (argparser, arg);
          return false;

        case xgettext_token_type_comma:
          /* Next argument: advance the argument number and take the
             next context from the iterator of this call.  */
          arg++;
          inner_context =
            inherited_context (outer_context,
                               flag_context_list_iterator_advance (
                                 &context_iter));
          next_context_iter = passthrough_context_list_iterator;
          selectorcall_context_iter = passthrough_context_list_iterator;
          state = 0;
          continue;

        case xgettext_token_type_colon:
          if (objc_extensions)
            {
              /* Switch to the iterator that was looked up for the
                 preceding name with ':' appended, and derive the new
                 current context from the current one.  */
              context_iter = selectorcall_context_iter;
              inner_context =
                inherited_context (inner_context,
                                   flag_context_list_iterator_advance (
                                     &context_iter));
              next_context_iter = passthrough_context_list_iterator;
              selectorcall_context_iter = passthrough_context_list_iterator;
            }
          else
            {
              next_context_iter = null_context_list_iterator;
              selectorcall_context_iter = null_context_list_iterator;
            }
          state = 0;
          continue;

        case xgettext_token_type_string_literal:
          /* In extract-all mode every string is recorded directly;
             otherwise it is offered to the argument parser as argument
             number ARG.  token.string is not freed here.
             NOTE(review): presumably the callee takes ownership of it;
             confirm against remember_a_message and
             arglist_parser_remember.  */
          if (extract_all)
            remember_a_message (mlp, NULL, token.string, inner_context,
                                &token.pos, token.comment);
          else
            arglist_parser_remember (argparser, arg, token.string,
                                     inner_context,
                                     token.pos.file_name, token.pos.line_number,
                                     token.comment);
          /* Release the comment reference that came with the token.  */
          drop_reference (token.comment);
          next_context_iter = null_context_list_iterator;
          selectorcall_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case xgettext_token_type_other:
          next_context_iter = null_context_list_iterator;
          selectorcall_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case xgettext_token_type_eof:
          arglist_parser_done (argparser, arg);
          return true;

        default:
          abort ();
        }
    }
}


/* Scan the whole stream F and add the extracted messages to the message
   list of the first domain of MDLP.  REAL_FILENAME and LOGICAL_FILENAME
   initialize the scanner's file name variables, and FLAG_TABLE becomes
   the context lookup table.  The language mode (objc_extensions) must
   have been set by the caller.  */
static void
extract_whole_file (FILE *f,
                    const char *real_filename, const char *logical_filename,
                    flag_context_list_table_ty *flag_table,
                    msgdomain_list_ty *mdlp)
{
  message_list_ty *mlp = mdlp->item[0]->messages;

  /* Set up the scanner state.  */
  fp = f;
  real_file_name = real_filename;
  logical_file_name = xstrdup (logical_filename);
  line_number = 1;

  /* Reset the line bookkeeping used to associate comments with
     messages.  */
  newline_count = 0;
  last_comment_line = -1;
  last_non_comment_line = -1;

  flag_context_list_table = flag_table;

  init_keywords ();

  /* Eat tokens until eof is seen.  When extract_parenthesized returns
     due to an unbalanced closing parenthesis, just restart it.  */
  while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
                                 arglist_parser_alloc (mlp, NULL)))
    ;

  /* Close scanner.  The duplicated logical file name is not freed here.
     NOTE(review): presumably because recorded positions still point to
     it (x_c_lex stores logical_file_name in token positions); confirm.  */
  fp = NULL;
  real_file_name = NULL;
  logical_file_name = NULL;
  line_number = 0;
}


/* Extract messages from F with ObjectiveC extensions turned off.  The
   arguments are passed unchanged to extract_whole_file.  */
void
extract_c (FILE *f,
           const char *real_filename, const char *logical_filename,
           flag_context_list_table_ty *flag_table,
           msgdomain_list_ty *mdlp)
{
  objc_extensions = false;
  extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
}

/* Extract messages from F with ObjectiveC extensions turned on.  The
   arguments are passed unchanged to extract_whole_file.  */
void
extract_objc (FILE *f,
              const char *real_filename, const char *logical_filename,
              flag_context_list_table_ty *flag_table,
              msgdomain_list_ty *mdlp)
{
  objc_extensions = true;
  extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
}