1 /* xgettext C/C++/ObjectiveC backend.
2 Copyright (C) 1995-1998, 2000-2006 Free Software Foundation, Inc.
3
4 This file was written by Peter Miller <millerp@canb.auug.org.au>
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-c.h"
33 #include "error.h"
34 #include "error-progname.h"
35 #include "xalloc.h"
36 #include "xvasprintf.h"
37 #include "exit.h"
38 #include "hash.h"
39 #include "gettext.h"
40
41 #define _(s) gettext(s)
42
43 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
44
45
46 /* The ANSI C standard defines several phases of translation:
47
48 1. Terminate line by \n, regardless of the external representation
49 of a text line. Stdio does this for us.
50
51 2. Convert trigraphs to their single character equivalents.
52
53 3. Concatenate each line ending in backslash (\) with the following
54 line.
55
56 4. Replace each comment with a space character.
57
   5. Parse each resulting logical line as preprocessing tokens and
      white space.
60
61 6. Recognize and carry out directives (it also expands macros on
62 non-directive lines, which we do not do here).
63
64 7. Replaces escape sequences within character strings with their
65 single character equivalents (we do this in step 5, because we
66 don't have to worry about the #include argument).
67
68 8. Concatenates adjacent string literals to form single string
69 literals (because we don't expand macros, there are a few things
70 we will miss).
71
72 9. Converts the remaining preprocessing tokens to C tokens and
73 discards any white space from the translation unit.
74
75 This lexer implements the above, and presents the scanner (in
76 xgettext.c) with a stream of C tokens. The comments are
77 accumulated in a buffer, and given to xgettext when asked for. */
78
79
80 /* ========================= Lexer customization. ========================= */
81
/* Whether phase 2 of the lexer replaces trigraph sequences.  Stays off
   until x_c_trigraphs is called.  */
static bool trigraphs = false;

/* Request trigraph replacement for all input lexed from now on.  */
void
x_c_trigraphs ()
{
  trigraphs = true;
}
89
90
91 /* ====================== Keyword set customization. ====================== */
92
93 /* If true extract all strings. */
94 static bool extract_all = false;
95
96 static hash_table c_keywords;
97 static hash_table objc_keywords;
98 static bool default_keywords = true;
99
100
101 void
x_c_extract_all()102 x_c_extract_all ()
103 {
104 extract_all = true;
105 }
106
107
108 static void
add_keyword(const char * name,hash_table * keywords)109 add_keyword (const char *name, hash_table *keywords)
110 {
111 if (name == NULL)
112 default_keywords = false;
113 else
114 {
115 const char *end;
116 struct callshape shape;
117 const char *colon;
118
119 if (keywords->table == NULL)
120 hash_init (keywords, 100);
121
122 split_keywordspec (name, &end, &shape);
123
124 /* The characters between name and end should form a valid C identifier.
125 A colon means an invalid parse in split_keywordspec(). */
126 colon = strchr (name, ':');
127 if (colon == NULL || colon >= end)
128 insert_keyword_callshape (keywords, name, end - name, &shape);
129 }
130 }
131
132 void
x_c_keyword(const char * name)133 x_c_keyword (const char *name)
134 {
135 add_keyword (name, &c_keywords);
136 }
137
138 void
x_objc_keyword(const char * name)139 x_objc_keyword (const char *name)
140 {
141 add_keyword (name, &objc_keywords);
142 }
143
144 /* Finish initializing the keywords hash tables.
145 Called after argument processing, before each file is processed. */
146 static void
init_keywords()147 init_keywords ()
148 {
149 if (default_keywords)
150 {
151 /* When adding new keywords here, also update the documentation in
152 xgettext.texi! */
153 x_c_keyword ("gettext");
154 x_c_keyword ("dgettext:2");
155 x_c_keyword ("dcgettext:2");
156 x_c_keyword ("ngettext:1,2");
157 x_c_keyword ("dngettext:2,3");
158 x_c_keyword ("dcngettext:2,3");
159 x_c_keyword ("gettext_noop");
160 x_c_keyword ("pgettext:1c,2");
161 x_c_keyword ("dpgettext:2c,3");
162 x_c_keyword ("dcpgettext:2c,3");
163 x_c_keyword ("npgettext:1c,2,3");
164 x_c_keyword ("dnpgettext:2c,3,4");
165 x_c_keyword ("dcnpgettext:2c,3,4");
166
167 x_objc_keyword ("gettext");
168 x_objc_keyword ("dgettext:2");
169 x_objc_keyword ("dcgettext:2");
170 x_objc_keyword ("ngettext:1,2");
171 x_objc_keyword ("dngettext:2,3");
172 x_objc_keyword ("dcngettext:2,3");
173 x_objc_keyword ("gettext_noop");
174 x_objc_keyword ("pgettext:1c,2");
175 x_objc_keyword ("dpgettext:2c,3");
176 x_objc_keyword ("dcpgettext:2c,3");
177 x_objc_keyword ("npgettext:1c,2,3");
178 x_objc_keyword ("dnpgettext:2c,3,4");
179 x_objc_keyword ("dcnpgettext:2c,3,4");
180 x_objc_keyword ("NSLocalizedString"); /* similar to gettext */
181 x_objc_keyword ("_"); /* similar to gettext */
182 x_objc_keyword ("NSLocalizedStaticString"); /* similar to gettext_noop */
183 x_objc_keyword ("__"); /* similar to gettext_noop */
184
185 default_keywords = false;
186 }
187 }
188
/* Record the built-in flag specifications used for C sources by passing
   each of them, in order, to xgettext_record_flag.  */
void
init_flag_table_c ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-c-format",
      "dgettext:2:pass-c-format",
      "dcgettext:2:pass-c-format",
      "ngettext:1:pass-c-format",
      "ngettext:2:pass-c-format",
      "dngettext:2:pass-c-format",
      "dngettext:3:pass-c-format",
      "dcngettext:2:pass-c-format",
      "dcngettext:3:pass-c-format",
      "gettext_noop:1:pass-c-format",
      "pgettext:2:pass-c-format",
      "dpgettext:3:pass-c-format",
      "dcpgettext:3:pass-c-format",
      "npgettext:2:pass-c-format",
      "npgettext:3:pass-c-format",
      "dnpgettext:3:pass-c-format",
      "dnpgettext:4:pass-c-format",
      "dcnpgettext:3:pass-c-format",
      "dcnpgettext:4:pass-c-format",

      /* <stdio.h> */
      "fprintf:2:c-format",
      "vfprintf:2:c-format",
      "printf:1:c-format",
      "vprintf:1:c-format",
      "sprintf:2:c-format",
      "vsprintf:2:c-format",
      "snprintf:3:c-format",
      "vsnprintf:3:c-format",
#if 0 /* These functions are not standard.  */
      /* <stdio.h> */
      "asprintf:2:c-format",
      "vasprintf:2:c-format",
      "dprintf:2:c-format",
      "vdprintf:2:c-format",
      "obstack_printf:2:c-format",
      "obstack_vprintf:2:c-format",
      /* <error.h> */
      "error:3:c-format",
      "error_at_line:5:c-format",
      /* <argp.h> */
      "argp_error:2:c-format",
      "argp_failure:2:c-format",
#endif

      "gettext:1:pass-boost-format",
      "dgettext:2:pass-boost-format",
      "dcgettext:2:pass-boost-format",
      "ngettext:1:pass-boost-format",
      "ngettext:2:pass-boost-format",
      "dngettext:2:pass-boost-format",
      "dngettext:3:pass-boost-format",
      "dcngettext:2:pass-boost-format",
      "dcngettext:3:pass-boost-format",
      "gettext_noop:1:pass-boost-format",
      "pgettext:2:pass-boost-format",
      "dpgettext:3:pass-boost-format",
      "dcpgettext:3:pass-boost-format",
      "npgettext:2:pass-boost-format",
      "npgettext:3:pass-boost-format",
      "dnpgettext:3:pass-boost-format",
      "dnpgettext:4:pass-boost-format",
      "dcnpgettext:3:pass-boost-format",
      "dcnpgettext:4:pass-boost-format",

      /* <boost/format.hpp> */
      "format:1:boost-format"
    };
  const size_t n_specs = sizeof flag_specs / sizeof flag_specs[0];
  size_t i;

  for (i = 0; i < n_specs; i++)
    xgettext_record_flag (flag_specs[i]);
}
260
/* Record the built-in flag specifications that are specific to ObjectiveC
   sources by passing each of them, in order, to xgettext_record_flag.  */
void
init_flag_table_objc ()
{
  /* Since the settings done in init_flag_table_c() also have an effect for
     the ObjectiveC parser, we don't have to repeat them here.  */
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-objc-format",
      "dgettext:2:pass-objc-format",
      "dcgettext:2:pass-objc-format",
      "ngettext:1:pass-objc-format",
      "ngettext:2:pass-objc-format",
      "dngettext:2:pass-objc-format",
      "dngettext:3:pass-objc-format",
      "dcngettext:2:pass-objc-format",
      "dcngettext:3:pass-objc-format",
      "gettext_noop:1:pass-objc-format",
      "pgettext:2:pass-objc-format",
      "dpgettext:3:pass-objc-format",
      "dcpgettext:3:pass-objc-format",
      "npgettext:2:pass-objc-format",
      "npgettext:3:pass-objc-format",
      "dnpgettext:3:pass-objc-format",
      "dnpgettext:4:pass-objc-format",
      "dcnpgettext:3:pass-objc-format",
      "dcnpgettext:4:pass-objc-format",
      "NSLocalizedString:1:pass-c-format",
      "NSLocalizedString:1:pass-objc-format",
      "_:1:pass-c-format",
      "_:1:pass-objc-format",
      "stringWithFormat::1:objc-format",
      "initWithFormat::1:objc-format",
      "stringByAppendingFormat::1:objc-format",
      "localizedStringWithFormat::1:objc-format",
      "appendFormat::1:objc-format"
    };
  const size_t n_specs = sizeof flag_specs / sizeof flag_specs[0];
  size_t i;

  for (i = 0; i < n_specs; i++)
    xgettext_record_flag (flag_specs[i]);
}
295
/* Record the built-in flag specifications for GCC's internal format
   strings by passing each of them, in order, to xgettext_record_flag.
   The entries under "#if 0" are kept for reference only and are not
   compiled in.  */
void
init_flag_table_gcc_internal ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-gcc-internal-format",
      "dgettext:2:pass-gcc-internal-format",
      "dcgettext:2:pass-gcc-internal-format",
      "ngettext:1:pass-gcc-internal-format",
      "ngettext:2:pass-gcc-internal-format",
      "dngettext:2:pass-gcc-internal-format",
      "dngettext:3:pass-gcc-internal-format",
      "dcngettext:2:pass-gcc-internal-format",
      "dcngettext:3:pass-gcc-internal-format",
      "gettext_noop:1:pass-gcc-internal-format",
      "pgettext:2:pass-gcc-internal-format",
      "dpgettext:3:pass-gcc-internal-format",
      "dcpgettext:3:pass-gcc-internal-format",
      "npgettext:2:pass-gcc-internal-format",
      "npgettext:3:pass-gcc-internal-format",
      "dnpgettext:3:pass-gcc-internal-format",
      "dnpgettext:4:pass-gcc-internal-format",
      "dcnpgettext:3:pass-gcc-internal-format",
      "dcnpgettext:4:pass-gcc-internal-format",
#if 0 /* This should better be done inside GCC.  */
      /* grepping for ATTRIBUTE_PRINTF in gcc-3.3/gcc/?*.h */
      /* c-format.c */
      "status_warning:2:gcc-internal-format",
      /* c-tree.h */
      "pedwarn_c99:1:pass-gcc-internal-format",
      /* collect2.h */
      /* "error:1:c-format",  -- 3 different versions */
      "notice:1:c-format",
      /* "fatal:1:c-format",  -- 2 different versions */
      "fatal_perror:1:c-format",
      /* cpplib.h */
      "cpp_error:3:c-format",
      "cpp_error_with_line:5:c-format",
      /* diagnostic.h */
      "diagnostic_set_info:2:pass-gcc-internal-format",
      "output_printf:2:gcc-internal-format",
      "output_verbatim:2:pass-gcc-internal-format",
      "verbatim:1:gcc-internal-format",
      "inform:1:pass-gcc-internal-format",
      /* gcc.h */
      /* "fatal:1:c-format",  -- 2 different versions */
      /* "error:1:c-format",  -- 3 different versions */
      /* genattrtab.h */
      "attr_printf:2:pass-c-format",
      /* gengtype.h */
      "error_at_line:2:pass-c-format",
      "xvasprintf:2:pass-c-format",
      "xasprintf:1:pass-c-format",
      "oprintf:2:pass-c-format",
      /* gensupport.h */
      "message_with_line:2:pass-c-format",
      /* output.h */
      "output_operand_lossage:1:c-format",
      /* ra.h */
      "ra_debug_msg:2:pass-c-format",
      /* toplev.h */
      "fnotice:2:c-format",
      "fatal_io_error:2:gcc-internal-format",
      "error_for_asm:2:pass-gcc-internal-format",
      "warning_for_asm:2:pass-gcc-internal-format",
      "error_with_file_and_line:3:pass-gcc-internal-format",
      "error_with_decl:2:pass-gcc-internal-format",
      "pedwarn:1:gcc-internal-format",
      "pedwarn_with_file_and_line:3:gcc-internal-format",
      "pedwarn_with_decl:2:gcc-internal-format",
      "sorry:1:gcc-internal-format",
      "error:1:pass-gcc-internal-format",
      "fatal_error:1:pass-gcc-internal-format",
      "internal_error:1:pass-gcc-internal-format",
      "warning:1:pass-gcc-internal-format",
      "warning_with_file_and_line:3:pass-gcc-internal-format",
      "warning_with_decl:2:pass-gcc-internal-format",
      /* f/com.h */
      "ffecom_get_invented_identifier:1:pass-c-format",
      /* f/sts.h */
      "ffests_printf:2:pass-c-format",
      /* java/java-tree.h */
      "parse_error_context:2:pass-c-format",
#endif
    };
  const size_t n_specs = sizeof flag_specs / sizeof flag_specs[0];
  size_t i;

  for (i = 0; i < n_specs; i++)
    xgettext_record_flag (flag_specs[i]);
}
379
380
381 /* ======================== Reading of characters. ======================== */
382
383 /* Real filename, used in error messages about the input file. */
384 static const char *real_file_name;
385
386 /* Logical filename and line number, used to label the extracted messages. */
387 static char *logical_file_name;
388 static int line_number;
389
390 /* The input file stream. */
391 static FILE *fp;
392
393
394 /* 0. Terminate line by \n, regardless whether the external representation of
395 a line terminator is LF (Unix), CR (Mac) or CR/LF (DOS/Windows).
396 It is debatable whether supporting CR/LF line terminators in C sources
397 on Unix is ISO C or POSIX compliant, but since GCC 3.3 now supports it
398 unconditionally, it must be OK.
399 The so-called "text mode" in stdio on DOS/Windows translates CR/LF to \n
400 automatically, but here we also need this conversion on Unix. As a side
401 effect, on DOS/Windows we also parse CR/CR/LF into a single \n, but this
402 is not a problem. */
403
404
405 static int
phase0_getc()406 phase0_getc ()
407 {
408 int c;
409
410 c = getc (fp);
411 if (c == EOF)
412 {
413 if (ferror (fp))
414 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
415 real_file_name);
416 return EOF;
417 }
418
419 if (c == '\r')
420 {
421 int c1 = getc (fp);
422
423 if (c1 != EOF && c1 != '\n')
424 ungetc (c1, fp);
425
426 /* Seen line terminator CR or CR/LF. */
427 return '\n';
428 }
429
430 return c;
431 }
432
433
434 /* Supports only one pushback character, and not '\n'. */
435 static inline void
phase0_ungetc(int c)436 phase0_ungetc (int c)
437 {
438 if (c != EOF)
439 ungetc (c, fp);
440 }
441
442
443 /* 1. line_number handling. Combine backslash-newline to nothing. */
444
445 static unsigned char phase1_pushback[2];
446 static int phase1_pushback_length;
447
448
449 static int
phase1_getc()450 phase1_getc ()
451 {
452 int c;
453
454 if (phase1_pushback_length)
455 {
456 c = phase1_pushback[--phase1_pushback_length];
457 if (c == '\n')
458 ++line_number;
459 return c;
460 }
461 for (;;)
462 {
463 c = phase0_getc ();
464 switch (c)
465 {
466 case '\n':
467 ++line_number;
468 return '\n';
469
470 case '\\':
471 c = phase0_getc ();
472 if (c != '\n')
473 {
474 phase0_ungetc (c);
475 return '\\';
476 }
477 ++line_number;
478 break;
479
480 default:
481 return c;
482 }
483 }
484 }
485
486
487 /* Supports 2 characters of pushback. */
488 static void
phase1_ungetc(int c)489 phase1_ungetc (int c)
490 {
491 switch (c)
492 {
493 case EOF:
494 break;
495
496 case '\n':
497 --line_number;
498 /* FALLTHROUGH */
499
500 default:
501 if (phase1_pushback_length == SIZEOF (phase1_pushback))
502 abort ();
503 phase1_pushback[phase1_pushback_length++] = c;
504 break;
505 }
506 }
507
508
509 /* 2. Convert trigraphs to their single character equivalents. Most
510 sane human beings vomit copiously at the mention of trigraphs, which
511 is why they are an option. */
512
513 static unsigned char phase2_pushback[1];
514 static int phase2_pushback_length;
515
516
517 static int
phase2_getc()518 phase2_getc ()
519 {
520 int c;
521
522 if (phase2_pushback_length)
523 return phase2_pushback[--phase2_pushback_length];
524 if (!trigraphs)
525 return phase1_getc ();
526
527 c = phase1_getc ();
528 if (c != '?')
529 return c;
530 c = phase1_getc ();
531 if (c != '?')
532 {
533 phase1_ungetc (c);
534 return '?';
535 }
536 c = phase1_getc ();
537 switch (c)
538 {
539 case '(':
540 return '[';
541 case '/':
542 return '\\';
543 case ')':
544 return ']';
545 case '\'':
546 return '^';
547 case '<':
548 return '{';
549 case '!':
550 return '|';
551 case '>':
552 return '}';
553 case '-':
554 return '~';
555 case '#':
556 return '=';
557 }
558 phase1_ungetc (c);
559 phase1_ungetc ('?');
560 return '?';
561 }
562
563
564 /* Supports only one pushback character. */
565 static void
phase2_ungetc(int c)566 phase2_ungetc (int c)
567 {
568 if (c != EOF)
569 {
570 if (phase2_pushback_length == SIZEOF (phase2_pushback))
571 abort ();
572 phase2_pushback[phase2_pushback_length++] = c;
573 }
574 }
575
576
577 /* 3. Concatenate each line ending in backslash (\) with the following
578 line. Basically, all you need to do is elide "\\\n" sequences from
579 the input. */
580
/* Characters given back through phase3_ungetc, used as a stack.  */
static unsigned char phase3_pushback[2];
static int phase3_pushback_length;


/* Return the next character from phase 2 with every backslash-newline
   pair removed.  A backslash followed by anything else is delivered as
   is, and the character after it is kept for the next call.  */
static int
phase3_getc ()
{
  int ch;

  if (phase3_pushback_length != 0)
    return phase3_pushback[--phase3_pushback_length];

  while ((ch = phase2_getc ()) == '\\')
    {
      int following = phase2_getc ();

      if (following != '\n')
        {
          phase2_ungetc (following);
          return '\\';
        }
      /* Backslash-newline: drop both characters and read on.  */
    }

  return ch;
}
603
604
605 /* Supports 2 characters of pushback. */
606 static void
phase3_ungetc(int c)607 phase3_ungetc (int c)
608 {
609 if (c != EOF)
610 {
611 if (phase3_pushback_length == SIZEOF (phase3_pushback))
612 abort ();
613 phase3_pushback[phase3_pushback_length++] = c;
614 }
615 }
616
617
618 /* Accumulating comments. */
619
/* Buffer holding the comment line being accumulated: BUFFER has room for
   BUFMAX bytes, of which the first BUFLEN are in use.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;

/* Begin a new comment line by discarding whatever was accumulated.  The
   buffer memory itself is kept for reuse.  */
static inline void
comment_start ()
{
  buflen = 0;
}
629
630 static inline void
comment_add(int c)631 comment_add (int c)
632 {
633 if (buflen >= bufmax)
634 {
635 bufmax = 2 * bufmax + 10;
636 buffer = xrealloc (buffer, bufmax);
637 }
638 buffer[buflen++] = c;
639 }
640
641 static inline void
comment_line_end(size_t chars_to_remove)642 comment_line_end (size_t chars_to_remove)
643 {
644 buflen -= chars_to_remove;
645 while (buflen >= 1
646 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
647 --buflen;
648 if (chars_to_remove == 0 && buflen >= bufmax)
649 {
650 bufmax = 2 * bufmax + 10;
651 buffer = xrealloc (buffer, bufmax);
652 }
653 buffer[buflen] = '\0';
654 savable_comment_add (buffer);
655 }
656
657
658 /* These are for tracking whether comments count as immediately before
659 keyword. */
660 static int last_comment_line;
661 static int last_non_comment_line;
662 static int newline_count;
663
664
665 /* 4. Replace each comment that is not inside a character constant or
666 string literal with a space character. We need to remember the
667 comment for later, because it may be attached to a keyword string.
668 We also optionally understand C++ comments. */
669
670 static int
phase4_getc()671 phase4_getc ()
672 {
673 int c;
674 bool last_was_star;
675
676 c = phase3_getc ();
677 if (c != '/')
678 return c;
679 c = phase3_getc ();
680 switch (c)
681 {
682 default:
683 phase3_ungetc (c);
684 return '/';
685
686 case '*':
687 /* C comment. */
688 comment_start ();
689 last_was_star = false;
690 for (;;)
691 {
692 c = phase3_getc ();
693 if (c == EOF)
694 break;
695 /* We skip all leading white space, but not EOLs. */
696 if (!(buflen == 0 && (c == ' ' || c == '\t')))
697 comment_add (c);
698 switch (c)
699 {
700 case '\n':
701 comment_line_end (1);
702 comment_start ();
703 last_was_star = false;
704 continue;
705
706 case '*':
707 last_was_star = true;
708 continue;
709
710 case '/':
711 if (last_was_star)
712 {
713 comment_line_end (2);
714 break;
715 }
716 /* FALLTHROUGH */
717
718 default:
719 last_was_star = false;
720 continue;
721 }
722 break;
723 }
724 last_comment_line = newline_count;
725 return ' ';
726
727 case '/':
728 /* C++ or ISO C 99 comment. */
729 comment_start ();
730 for (;;)
731 {
732 c = phase3_getc ();
733 if (c == '\n' || c == EOF)
734 break;
735 /* We skip all leading white space, but not EOLs. */
736 if (!(buflen == 0 && (c == ' ' || c == '\t')))
737 comment_add (c);
738 }
739 comment_line_end (0);
740 last_comment_line = newline_count;
741 return '\n';
742 }
743 }
744
745
/* Give C back to phase 4.  Phase 4 has no pushback storage of its own, so
   the character goes onto the phase 3 pushback stack.
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  int pushed_back = c;

  phase3_ungetc (pushed_back);
}
752
753
754 /* ========================== Reading of tokens. ========================== */
755
756
/* True if ObjectiveC extensions are recognized.  */
static bool objc_extensions;

/* The kinds of token delivered by the lexer.  The numeric values follow
   from the order of the enumerators, which is therefore kept as is.  */
typedef enum token_type_ty
{
  token_type_character_constant,        /* 'x' */
  token_type_eof,
  token_type_eoln,
  token_type_hash,                      /* # */
  token_type_lparen,                    /* ( */
  token_type_rparen,                    /* ) */
  token_type_comma,                     /* , */
  token_type_colon,                     /* : */
  token_type_name,                      /* abc */
  token_type_number,                    /* 2.7 */
  token_type_string_literal,            /* "abc" */
  token_type_symbol,                    /* < > = etc. */
  token_type_objc_special,              /* @ */
  token_type_white_space
} token_type_ty;
778
779 typedef struct token_ty token_ty;
780 struct token_ty
781 {
782 token_type_ty type;
783 char *string; /* for token_type_name, token_type_string_literal */
784 refcounted_string_list_ty *comment; /* for token_type_string_literal,
785 token_type_objc_special */
786 long number;
787 int line_number;
788 };
789
790
791 /* 7. Replace escape sequences within character strings with their
792 single character equivalents. This is called from phase 5, because
793 we don't have to worry about the #include argument. There are
794 pathological cases which could bite us (like the DOS directory
795 separator), but just pretend it can't happen. */
796
/* Magic return values of phase7_getc for an unescaped double quote, single
   quote and newline.  They lie above every byte value, so that they can be
   told apart from the same characters written as escape sequences.  */
#define P7_QUOTES (1000 + '"')
#define P7_QUOTE (1000 + '\'')
#define P7_NEWLINE (1000 + '\n')

/* Return the value of the hexadecimal digit C, or -1 if C is not a
   hexadecimal digit (in particular if it is EOF).  */
static int
phase7_hex_digit_value (int c)
{
  switch (c)
    {
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      return c - '0';

    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
      return 10 + c - 'A';

    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
      return 10 + c - 'a';

    default:
      return -1;
    }
}

/* Return the next character of a string literal or character constant,
   with escape sequences replaced by the character they denote.

   Return values:
     - P7_QUOTES, P7_QUOTE, P7_NEWLINE for an unescaped '"', '\'', '\n';
     - EOF at end of input;
     - the value of the escape sequence for a recognized escape.  For a
       hexadecimal escape this is the value reduced to its low 8 bits,
       hence always in the range 0..255;
     - '\\' for an unrecognized escape sequence, whose remaining
       characters are then delivered by the following calls;
     - otherwise the character itself.  */
static int
phase7_getc ()
{
  int c, n, j;

  /* Use phase 3, because phase 4 elides comments.  */
  c = phase3_getc ();

  /* Return a magic newline indicator, so that we can distinguish
     between the user requesting a newline in the string (e.g. using
     "\n" or "\012") from the user failing to terminate the string or
     character constant.  The ANSI C standard says: 3.1.3.4 Character
     Constants contain ``any character except single quote, backslash or
     newline; or an escape sequence'' and 3.1.4 String Literals contain
     ``any character except double quote, backslash or newline; or an
     escape sequence''.

     Most compilers give a fatal error in this case, however gcc is
     stupidly silent, even though this is a very common typo.  OK, so
     gcc --pedantic will tell me, but that gripes about too much other
     stuff.  Could I have a ``gcc -Wnewline-in-string'' option, or
     better yet a ``gcc -fno-newline-in-string'' option, please?  Gcc is
     also inconsistent between string literals and character constants:
     you may not embed newlines in character constants; try it, you get
     a useful diagnostic.  --PMiller  */
  if (c == '\n')
    return P7_NEWLINE;

  if (c == '"')
    return P7_QUOTES;
  if (c == '\'')
    return P7_QUOTE;
  if (c != '\\')
    return c;
  c = phase3_getc ();
  switch (c)
    {
    default:
      /* Unknown escape sequences really should be an error, but just
         ignore them, and let the real compiler complain.  */
      phase3_ungetc (c);
      return '\\';

    case '"':
    case '\'':
    case '?':
    case '\\':
      return c;

    case 'a':
      return '\a';
    case 'b':
      return '\b';

      /* The \e escape is peculiar to gcc, and assumes an ASCII
         character set (or superset).  We don't provide support for it
         here.  */

    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case 'v':
      return '\v';

    case 'x':
      c = phase3_getc ();
      if (phase7_hex_digit_value (c) < 0)
        {
          /* "\x" without a digit: treat it like an unknown escape
             sequence and deliver its characters one by one.  */
          phase3_ungetc (c);
          phase3_ungetc ('x');
          return '\\';
        }
      n = 0;
      for (;;)
        {
          int digit = phase7_hex_digit_value (c);

          if (digit < 0)
            break;
          /* Keep only the low 8 bits.  An escape with many digits can
             then neither overflow the int accumulator nor produce a
             value that collides with EOF or one of the P7_* indicators
             (for instance \x40A is 1034, which is P7_QUOTES).
             NOTE(review): this assumes the callers store the result in
             a char and so never relied on the higher bits -- confirm
             against phase5_get.  */
          n = (n * 16 + digit) & 0xff;
          c = phase3_getc ();
        }
      phase3_ungetc (c);
      return n;

    case '0': case '1': case '2': case '3':
    case '4': case '5': case '6': case '7':
      /* Up to three octal digits; the value is at most 0777.  */
      n = 0;
      for (j = 0; j < 3; ++j)
        {
          n = n * 8 + c - '0';
          c = phase3_getc ();
          switch (c)
            {
            default:
              break;

            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
              continue;
            }
          break;
        }
      phase3_ungetc (c);
      return n;
    }
}
933
934
/* Give C back to phase 7.  The character goes onto the phase 3 pushback
   stack, from which phase7_getc reads.  */
static void
phase7_ungetc (int c)
{
  int pushed_back = c;

  phase3_ungetc (pushed_back);
}
940
941
942 /* Free the memory pointed to by a 'struct token_ty'. */
943 static inline void
free_token(token_ty * tp)944 free_token (token_ty *tp)
945 {
946 if (tp->type == token_type_name || tp->type == token_type_string_literal)
947 free (tp->string);
948 if (tp->type == token_type_string_literal
949 || tp->type == token_type_objc_special)
950 drop_reference (tp->comment);
951 }
952
953
954 /* 5. Parse each resulting logical line as preprocessing tokens and
955 white space. Preprocessing tokens and C tokens don't always match. */
956
957 static token_ty phase5_pushback[1];
958 static int phase5_pushback_length;
959
960
961 static void
phase5_get(token_ty * tp)962 phase5_get (token_ty *tp)
963 {
964 static char *buffer;
965 static int bufmax;
966 int bufpos;
967 int c;
968
969 if (phase5_pushback_length)
970 {
971 *tp = phase5_pushback[--phase5_pushback_length];
972 return;
973 }
974 tp->string = NULL;
975 tp->number = 0;
976 tp->line_number = line_number;
977 c = phase4_getc ();
978 switch (c)
979 {
980 case EOF:
981 tp->type = token_type_eof;
982 return;
983
984 case '\n':
985 tp->type = token_type_eoln;
986 return;
987
988 case ' ':
989 case '\f':
990 case '\t':
991 for (;;)
992 {
993 c = phase4_getc ();
994 switch (c)
995 {
996 case ' ':
997 case '\f':
998 case '\t':
999 continue;
1000
1001 default:
1002 phase4_ungetc (c);
1003 break;
1004 }
1005 break;
1006 }
1007 tp->type = token_type_white_space;
1008 return;
1009
1010 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1011 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
1012 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1013 case 'V': case 'W': case 'X': case 'Y': case 'Z':
1014 case '_':
1015 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1016 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1017 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1018 case 'v': case 'w': case 'x': case 'y': case 'z':
1019 bufpos = 0;
1020 for (;;)
1021 {
1022 if (bufpos >= bufmax)
1023 {
1024 bufmax = 2 * bufmax + 10;
1025 buffer = xrealloc (buffer, bufmax);
1026 }
1027 buffer[bufpos++] = c;
1028 c = phase4_getc ();
1029 switch (c)
1030 {
1031 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1032 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1033 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1034 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1035 case 'Y': case 'Z':
1036 case '_':
1037 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1038 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1039 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1040 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1041 case 'y': case 'z':
1042 case '0': case '1': case '2': case '3': case '4':
1043 case '5': case '6': case '7': case '8': case '9':
1044 continue;
1045
1046 default:
1047 phase4_ungetc (c);
1048 break;
1049 }
1050 break;
1051 }
1052 if (bufpos >= bufmax)
1053 {
1054 bufmax = 2 * bufmax + 10;
1055 buffer = xrealloc (buffer, bufmax);
1056 }
1057 buffer[bufpos] = 0;
1058 tp->string = xstrdup (buffer);
1059 tp->type = token_type_name;
1060 return;
1061
1062 case '.':
1063 c = phase4_getc ();
1064 phase4_ungetc (c);
1065 switch (c)
1066 {
1067 default:
1068 tp->type = token_type_symbol;
1069 return;
1070
1071 case '0': case '1': case '2': case '3': case '4':
1072 case '5': case '6': case '7': case '8': case '9':
1073 c = '.';
1074 break;
1075 }
1076 /* FALLTHROUGH */
1077
1078 case '0': case '1': case '2': case '3': case '4':
1079 case '5': case '6': case '7': case '8': case '9':
1080 /* The preprocessing number token is more "generous" than the C
1081 number tokens. This is mostly due to token pasting (another
1082 thing we can ignore here). */
1083 bufpos = 0;
1084 for (;;)
1085 {
1086 if (bufpos >= bufmax)
1087 {
1088 bufmax = 2 * bufmax + 10;
1089 buffer = xrealloc (buffer, bufmax);
1090 }
1091 buffer[bufpos++] = c;
1092 c = phase4_getc ();
1093 switch (c)
1094 {
1095 case 'e':
1096 case 'E':
1097 if (bufpos >= bufmax)
1098 {
1099 bufmax = 2 * bufmax + 10;
1100 buffer = xrealloc (buffer, bufmax);
1101 }
1102 buffer[bufpos++] = c;
1103 c = phase4_getc ();
1104 if (c != '+' || c != '-')
1105 {
1106 phase4_ungetc (c);
1107 break;
1108 }
1109 continue;
1110
1111 case 'A': case 'B': case 'C': case 'D': case 'F':
1112 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1113 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1114 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1115 case 'Y': case 'Z':
1116 case 'a': case 'b': case 'c': case 'd': case 'f':
1117 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1118 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1119 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1120 case 'y': case 'z':
1121 case '0': case '1': case '2': case '3': case '4':
1122 case '5': case '6': case '7': case '8': case '9':
1123 case '.':
1124 continue;
1125
1126 default:
1127 phase4_ungetc (c);
1128 break;
1129 }
1130 break;
1131 }
1132 if (bufpos >= bufmax)
1133 {
1134 bufmax = 2 * bufmax + 10;
1135 buffer = xrealloc (buffer, bufmax);
1136 }
1137 buffer[bufpos] = 0;
1138 tp->type = token_type_number;
1139 tp->number = atol (buffer);
1140 return;
1141
1142 case '\'':
1143 /* We could worry about the 'L' before wide character constants,
1144 but ignoring it has no effect unless one of the keywords is
1145 "L". Just pretend it won't happen. Also, we don't need to
1146 remember the character constant. */
1147 for (;;)
1148 {
1149 c = phase7_getc ();
1150 if (c == P7_NEWLINE)
1151 {
1152 error_with_progname = false;
1153 error (0, 0, _("%s:%d: warning: unterminated character constant"),
1154 logical_file_name, line_number - 1);
1155 error_with_progname = true;
1156 phase7_ungetc ('\n');
1157 break;
1158 }
1159 if (c == EOF || c == P7_QUOTE)
1160 break;
1161 }
1162 tp->type = token_type_character_constant;
1163 return;
1164
1165 case '"':
1166 /* We could worry about the 'L' before wide string constants,
1167 but since gettext's argument is not a wide character string,
1168 let the compiler complain about the argument not matching the
1169 prototype. Just pretend it won't happen. */
1170 bufpos = 0;
1171 for (;;)
1172 {
1173 c = phase7_getc ();
1174 if (c == P7_NEWLINE)
1175 {
1176 error_with_progname = false;
1177 error (0, 0, _("%s:%d: warning: unterminated string literal"),
1178 logical_file_name, line_number - 1);
1179 error_with_progname = true;
1180 phase7_ungetc ('\n');
1181 break;
1182 }
1183 if (c == EOF || c == P7_QUOTES)
1184 break;
1185 if (c == P7_QUOTE)
1186 c = '\'';
1187 if (bufpos >= bufmax)
1188 {
1189 bufmax = 2 * bufmax + 10;
1190 buffer = xrealloc (buffer, bufmax);
1191 }
1192 buffer[bufpos++] = c;
1193 }
1194 if (bufpos >= bufmax)
1195 {
1196 bufmax = 2 * bufmax + 10;
1197 buffer = xrealloc (buffer, bufmax);
1198 }
1199 buffer[bufpos] = 0;
1200 tp->type = token_type_string_literal;
1201 tp->string = xstrdup (buffer);
1202 tp->comment = add_reference (savable_comment);
1203 return;
1204
1205 case '(':
1206 tp->type = token_type_lparen;
1207 return;
1208
1209 case ')':
1210 tp->type = token_type_rparen;
1211 return;
1212
1213 case ',':
1214 tp->type = token_type_comma;
1215 return;
1216
1217 case '#':
1218 tp->type = token_type_hash;
1219 return;
1220
1221 case ':':
1222 tp->type = token_type_colon;
1223 return;
1224
1225 case '@':
1226 if (objc_extensions)
1227 {
1228 tp->type = token_type_objc_special;
1229 tp->comment = add_reference (savable_comment);
1230 return;
1231 }
1232 /* FALLTHROUGH */
1233
1234 default:
1235 /* We could carefully recognize each of the 2 and 3 character
1236 operators, but it is not necessary, as we only need to recognize
1237 gettext invocations. Don't bother. */
1238 tp->type = token_type_symbol;
1239 return;
1240 }
1241 }
1242
1243
1244 /* Supports only one pushback token. */
1245 static void
phase5_unget(token_ty * tp)1246 phase5_unget (token_ty *tp)
1247 {
1248 if (tp->type != token_type_eof)
1249 {
1250 if (phase5_pushback_length == SIZEOF (phase5_pushback))
1251 abort ();
1252 phase5_pushback[phase5_pushback_length++] = *tp;
1253 }
1254 }
1255
1256
/* X. Recognize a leading # symbol.  Leave a leading hash as a hash, but
   turn a hash in the middle of a line into a plain symbol token.  This
   makes phase 6 easier.  */
1260
1261 static void
phaseX_get(token_ty * tp)1262 phaseX_get (token_ty *tp)
1263 {
1264 static bool middle; /* false at the beginning of a line, true otherwise. */
1265
1266 phase5_get (tp);
1267
1268 if (tp->type == token_type_eoln || tp->type == token_type_eof)
1269 middle = false;
1270 else
1271 {
1272 if (middle)
1273 {
1274 /* Turn hash in the middle of a line into a plain symbol token. */
1275 if (tp->type == token_type_hash)
1276 tp->type = token_type_symbol;
1277 }
1278 else
1279 {
1280 /* When we see leading whitespace followed by a hash sign,
1281 discard the leading white space token. The hash is all
1282 phase 6 is interested in. */
1283 if (tp->type == token_type_white_space)
1284 {
1285 token_ty next;
1286
1287 phase5_get (&next);
1288 if (next.type == token_type_hash)
1289 *tp = next;
1290 else
1291 phase5_unget (&next);
1292 }
1293 middle = true;
1294 }
1295 }
1296 }
1297
1298
/* 6. Recognize and carry out directives (it also expands macros on
   non-directive lines, which we do not do here).  The only directives
   we care about are #line and #define.  We throw all the others
   away.  */
1303
1304 static token_ty phase6_pushback[2];
1305 static int phase6_pushback_length;
1306
1307
1308 static void
phase6_get(token_ty * tp)1309 phase6_get (token_ty *tp)
1310 {
1311 static token_ty *buf;
1312 static int bufmax;
1313 int bufpos;
1314 int j;
1315
1316 if (phase6_pushback_length)
1317 {
1318 *tp = phase6_pushback[--phase6_pushback_length];
1319 return;
1320 }
1321 for (;;)
1322 {
1323 /* Get the next token. If it is not a '#' at the beginning of a
1324 line (ignoring whitespace), return immediately. */
1325 phaseX_get (tp);
1326 if (tp->type != token_type_hash)
1327 return;
1328
1329 /* Accumulate the rest of the directive in a buffer, until the
1330 "define" keyword is seen or until end of line. */
1331 bufpos = 0;
1332 for (;;)
1333 {
1334 phaseX_get (tp);
1335 if (tp->type == token_type_eoln || tp->type == token_type_eof)
1336 break;
1337
1338 /* Before the "define" keyword and inside other directives
1339 white space is irrelevant. So just throw it away. */
1340 if (tp->type != token_type_white_space)
1341 {
1342 /* If it is a #define directive, return immediately,
1343 thus treating the body of the #define directive like
1344 normal input. */
1345 if (bufpos == 0
1346 && tp->type == token_type_name
1347 && strcmp (tp->string, "define") == 0)
1348 return;
1349
1350 /* Accumulate. */
1351 if (bufpos >= bufmax)
1352 {
1353 bufmax = 2 * bufmax + 10;
1354 buf = xrealloc (buf, bufmax * sizeof (buf[0]));
1355 }
1356 buf[bufpos++] = *tp;
1357 }
1358 }
1359
1360 /* If it is a #line directive, with no macros to expand, act on
1361 it. Ignore all other directives. */
1362 if (bufpos >= 3 && buf[0].type == token_type_name
1363 && strcmp (buf[0].string, "line") == 0
1364 && buf[1].type == token_type_number
1365 && buf[2].type == token_type_string_literal)
1366 {
1367 logical_file_name = xstrdup (buf[2].string);
1368 line_number = buf[1].number;
1369 }
1370 if (bufpos >= 2 && buf[0].type == token_type_number
1371 && buf[1].type == token_type_string_literal)
1372 {
1373 logical_file_name = xstrdup (buf[1].string);
1374 line_number = buf[0].number;
1375 }
1376
1377 /* Release the storage held by the directive. */
1378 for (j = 0; j < bufpos; ++j)
1379 free_token (&buf[j]);
1380
1381 /* We must reset the selected comments. */
1382 savable_comment_reset ();
1383 }
1384 }
1385
1386
1387 /* Supports 2 tokens of pushback. */
1388 static void
phase6_unget(token_ty * tp)1389 phase6_unget (token_ty *tp)
1390 {
1391 if (tp->type != token_type_eof)
1392 {
1393 if (phase6_pushback_length == SIZEOF (phase6_pushback))
1394 abort ();
1395 phase6_pushback[phase6_pushback_length++] = *tp;
1396 }
1397 }
1398
1399
1400 /* 8a. Convert ISO C 99 section 7.8.1 format string directives to string
1401 literal placeholders. */
1402
/* Test for an ISO C 99 section 7.8.1 format string directive.
   Return true if NAME, taken as a whole, matches the syntax
     P R I { d | i | o | u | x | X }
     { { | LEAST | FAST } { 8 | 16 | 32 | 64 } | MAX | PTR }
   and false otherwise.  */
static bool
is_inttypes_macro (const char *name)
{
  const char *rest;

  if (strncmp (name, "PRI", 3) != 0)
    return false;
  rest = name + 3;

  /* The conversion letter.  The explicit NUL test is required because
     strchr also finds the terminating NUL of the letter set.  */
  if (*rest == '\0' || strchr ("diouxX", *rest) == NULL)
    return false;
  rest++;

  if (strcmp (rest, "MAX") == 0 || strcmp (rest, "PTR") == 0)
    return true;

  /* Optional LEAST or FAST in front of the width.  */
  if (strncmp (rest, "LEAST", 5) == 0)
    rest += 5;
  else if (strncmp (rest, "FAST", 4) == 0)
    rest += 4;

  return (strcmp (rest, "8") == 0
          || strcmp (rest, "16") == 0
          || strcmp (rest, "32") == 0
          || strcmp (rest, "64") == 0);
}
1441
1442 static void
phase8a_get(token_ty * tp)1443 phase8a_get (token_ty *tp)
1444 {
1445 phase6_get (tp);
1446 if (tp->type == token_type_name && is_inttypes_macro (tp->string))
1447 {
1448 /* Turn PRIdXXX into "<PRIdXXX>". */
1449 char *new_string = xasprintf ("<%s>", tp->string);
1450 free (tp->string);
1451 tp->string = new_string;
1452 tp->comment = add_reference (savable_comment);
1453 tp->type = token_type_string_literal;
1454 }
1455 }
1456
1457 /* Supports 2 tokens of pushback. */
1458 static inline void
phase8a_unget(token_ty * tp)1459 phase8a_unget (token_ty *tp)
1460 {
1461 phase6_unget (tp);
1462 }
1463
1464
1465 /* 8b. Drop whitespace. */
1466 static void
phase8b_get(token_ty * tp)1467 phase8b_get (token_ty *tp)
1468 {
1469 for (;;)
1470 {
1471 phase8a_get (tp);
1472
1473 if (tp->type == token_type_white_space)
1474 continue;
1475 if (tp->type == token_type_eoln)
1476 {
1477 /* We have to track the last occurrence of a string. One
1478 mode of xgettext allows to group an extracted message
1479 with a comment for documentation. The rule which states
1480 which comment is assumed to be grouped with the message
1481 says it should immediately precede it. Our
1482 interpretation: between the last line of the comment and
1483 the line in which the keyword is found must be no line
1484 with non-white space tokens. */
1485 ++newline_count;
1486 if (last_non_comment_line > last_comment_line)
1487 savable_comment_reset ();
1488 continue;
1489 }
1490 break;
1491 }
1492 }
1493
1494 /* Supports 2 tokens of pushback. */
1495 static inline void
phase8b_unget(token_ty * tp)1496 phase8b_unget (token_ty *tp)
1497 {
1498 phase8a_unget (tp);
1499 }
1500
1501
1502 /* 8c. In ObjectiveC mode, drop '@' before a literal string. We need to
1503 do this before performing concatenation of adjacent string literals. */
1504 static void
phase8c_get(token_ty * tp)1505 phase8c_get (token_ty *tp)
1506 {
1507 token_ty tmp;
1508
1509 phase8b_get (tp);
1510 if (tp->type != token_type_objc_special)
1511 return;
1512 phase8b_get (&tmp);
1513 if (tmp.type != token_type_string_literal)
1514 {
1515 phase8b_unget (&tmp);
1516 return;
1517 }
1518 /* Drop the '@' token and return immediately the following string. */
1519 drop_reference (tmp.comment);
1520 tmp.comment = tp->comment;
1521 *tp = tmp;
1522 }
1523
1524 /* Supports only one pushback token. */
1525 static inline void
phase8c_unget(token_ty * tp)1526 phase8c_unget (token_ty *tp)
1527 {
1528 phase8b_unget (tp);
1529 }
1530
1531
1532 /* 8. Concatenate adjacent string literals to form single string
1533 literals (because we don't expand macros, there are a few things we
1534 will miss). */
1535
1536 static void
phase8_get(token_ty * tp)1537 phase8_get (token_ty *tp)
1538 {
1539 phase8c_get (tp);
1540 if (tp->type != token_type_string_literal)
1541 return;
1542 for (;;)
1543 {
1544 token_ty tmp;
1545 size_t len;
1546
1547 phase8c_get (&tmp);
1548 if (tmp.type != token_type_string_literal)
1549 {
1550 phase8c_unget (&tmp);
1551 return;
1552 }
1553 len = strlen (tp->string);
1554 tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
1555 strcpy (tp->string + len, tmp.string);
1556 free (tmp.string);
1557 }
1558 }
1559
1560
1561 /* ===================== Reading of high-level tokens. ==================== */
1562
1563
/* Kinds of high-level tokens produced by x_c_lex.  */
typedef enum xgettext_token_type_ty
{
  xgettext_token_type_eof,              /* end of input */
  xgettext_token_type_keyword,          /* name found in the keyword table */
  xgettext_token_type_symbol,           /* any other name */
  xgettext_token_type_lparen,           /* ( */
  xgettext_token_type_rparen,           /* ) */
  xgettext_token_type_comma,            /* , */
  xgettext_token_type_colon,            /* : */
  xgettext_token_type_string_literal,   /* string literal */
  xgettext_token_type_other             /* everything else */
} xgettext_token_type_ty;
1577
1578 typedef struct xgettext_token_ty xgettext_token_ty;
1579 struct xgettext_token_ty
1580 {
1581 xgettext_token_type_ty type;
1582
1583 /* This field is used only for xgettext_token_type_keyword. */
1584 const struct callshapes *shapes;
1585
1586 /* This field is used only for xgettext_token_type_string_literal,
1587 xgettext_token_type_keyword, xgettext_token_type_symbol. */
1588 char *string;
1589
1590 /* This field is used only for xgettext_token_type_string_literal. */
1591 refcounted_string_list_ty *comment;
1592
1593 /* These fields are only for
1594 xgettext_token_type_keyword,
1595 xgettext_token_type_string_literal. */
1596 lex_pos_ty pos;
1597 };
1598
1599
/* 9. Convert the remaining preprocessing tokens to C tokens and
   discard any white space from the translation unit.  */
1602
1603 static void
x_c_lex(xgettext_token_ty * tp)1604 x_c_lex (xgettext_token_ty *tp)
1605 {
1606 for (;;)
1607 {
1608 token_ty token;
1609 void *keyword_value;
1610
1611 phase8_get (&token);
1612 switch (token.type)
1613 {
1614 case token_type_eof:
1615 tp->type = xgettext_token_type_eof;
1616 return;
1617
1618 case token_type_name:
1619 last_non_comment_line = newline_count;
1620
1621 if (hash_find_entry (objc_extensions ? &objc_keywords : &c_keywords,
1622 token.string, strlen (token.string),
1623 &keyword_value)
1624 == 0)
1625 {
1626 tp->type = xgettext_token_type_keyword;
1627 tp->shapes = (const struct callshapes *) keyword_value;
1628 tp->pos.file_name = logical_file_name;
1629 tp->pos.line_number = token.line_number;
1630 }
1631 else
1632 tp->type = xgettext_token_type_symbol;
1633 tp->string = token.string;
1634 return;
1635
1636 case token_type_lparen:
1637 last_non_comment_line = newline_count;
1638
1639 tp->type = xgettext_token_type_lparen;
1640 return;
1641
1642 case token_type_rparen:
1643 last_non_comment_line = newline_count;
1644
1645 tp->type = xgettext_token_type_rparen;
1646 return;
1647
1648 case token_type_comma:
1649 last_non_comment_line = newline_count;
1650
1651 tp->type = xgettext_token_type_comma;
1652 return;
1653
1654 case token_type_colon:
1655 last_non_comment_line = newline_count;
1656
1657 tp->type = xgettext_token_type_colon;
1658 return;
1659
1660 case token_type_string_literal:
1661 last_non_comment_line = newline_count;
1662
1663 tp->type = xgettext_token_type_string_literal;
1664 tp->string = token.string;
1665 tp->comment = token.comment;
1666 tp->pos.file_name = logical_file_name;
1667 tp->pos.line_number = token.line_number;
1668 return;
1669
1670 case token_type_objc_special:
1671 drop_reference (token.comment);
1672 /* FALLTHROUGH */
1673
1674 default:
1675 last_non_comment_line = newline_count;
1676
1677 tp->type = xgettext_token_type_other;
1678 return;
1679 }
1680 }
1681 }
1682
1683
1684 /* ========================= Extracting strings. ========================== */
1685
1686
1687 /* Context lookup table. */
1688 static flag_context_list_table_ty *flag_context_list_table;
1689
1690
1691 /* The file is broken into tokens. Scan the token stream, looking for
1692 a keyword, followed by a left paren, followed by a string. When we
1693 see this sequence, we have something to remember. We assume we are
1694 looking at a valid C or C++ program, and leave the complaints about
1695 the grammar to the compiler.
1696
1697 Normal handling: Look for
1698 keyword ( ... msgid ... )
1699 Plural handling: Look for
1700 keyword ( ... msgid ... msgid_plural ... )
1701
1702 We use recursion because the arguments before msgid or between msgid
1703 and msgid_plural can contain subexpressions of the same form. */
1704
1705
1706 /* Extract messages until the next balanced closing parenthesis.
1707 Extracted messages are added to MLP.
1708 Return true upon eof, false upon closing parenthesis. */
1709 static bool
extract_parenthesized(message_list_ty * mlp,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1710 extract_parenthesized (message_list_ty *mlp,
1711 flag_context_ty outer_context,
1712 flag_context_list_iterator_ty context_iter,
1713 struct arglist_parser *argparser)
1714 {
1715 /* Current argument number. */
1716 int arg = 1;
1717 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1718 int state;
1719 /* Parameters of the keyword just seen. Defined only in state 1. */
1720 const struct callshapes *next_shapes = NULL;
1721 /* Context iterator that will be used if the next token is a '('. */
1722 flag_context_list_iterator_ty next_context_iter =
1723 passthrough_context_list_iterator;
1724 /* Context iterator that will be used if the next token is a ':'.
1725 (Objective C selector syntax.) */
1726 flag_context_list_iterator_ty selectorcall_context_iter =
1727 passthrough_context_list_iterator;
1728 /* Current context. */
1729 flag_context_ty inner_context =
1730 inherited_context (outer_context,
1731 flag_context_list_iterator_advance (&context_iter));
1732
1733 /* Start state is 0. */
1734 state = 0;
1735
1736 for (;;)
1737 {
1738 xgettext_token_ty token;
1739
1740 x_c_lex (&token);
1741 switch (token.type)
1742 {
1743 case xgettext_token_type_keyword:
1744 next_shapes = token.shapes;
1745 state = 1;
1746 goto keyword_or_symbol;
1747
1748 case xgettext_token_type_symbol:
1749 state = 0;
1750 keyword_or_symbol:
1751 next_context_iter =
1752 flag_context_list_iterator (
1753 flag_context_list_table_lookup (
1754 flag_context_list_table,
1755 token.string, strlen (token.string)));
1756 if (objc_extensions)
1757 {
1758 size_t token_string_len = strlen (token.string);
1759 token.string = xrealloc (token.string, token_string_len + 2);
1760 token.string[token_string_len] = ':';
1761 token.string[token_string_len + 1] = '\0';
1762 selectorcall_context_iter =
1763 flag_context_list_iterator (
1764 flag_context_list_table_lookup (
1765 flag_context_list_table,
1766 token.string, token_string_len + 1));
1767 }
1768 free (token.string);
1769 continue;
1770
1771 case xgettext_token_type_lparen:
1772 if (extract_parenthesized (mlp, inner_context, next_context_iter,
1773 arglist_parser_alloc (mlp,
1774 state ? next_shapes : NULL)))
1775 {
1776 arglist_parser_done (argparser, arg);
1777 return true;
1778 }
1779 next_context_iter = null_context_list_iterator;
1780 selectorcall_context_iter = null_context_list_iterator;
1781 state = 0;
1782 continue;
1783
1784 case xgettext_token_type_rparen:
1785 arglist_parser_done (argparser, arg);
1786 return false;
1787
1788 case xgettext_token_type_comma:
1789 arg++;
1790 inner_context =
1791 inherited_context (outer_context,
1792 flag_context_list_iterator_advance (
1793 &context_iter));
1794 next_context_iter = passthrough_context_list_iterator;
1795 selectorcall_context_iter = passthrough_context_list_iterator;
1796 state = 0;
1797 continue;
1798
1799 case xgettext_token_type_colon:
1800 if (objc_extensions)
1801 {
1802 context_iter = selectorcall_context_iter;
1803 inner_context =
1804 inherited_context (inner_context,
1805 flag_context_list_iterator_advance (
1806 &context_iter));
1807 next_context_iter = passthrough_context_list_iterator;
1808 selectorcall_context_iter = passthrough_context_list_iterator;
1809 }
1810 else
1811 {
1812 next_context_iter = null_context_list_iterator;
1813 selectorcall_context_iter = null_context_list_iterator;
1814 }
1815 state = 0;
1816 continue;
1817
1818 case xgettext_token_type_string_literal:
1819 if (extract_all)
1820 remember_a_message (mlp, NULL, token.string, inner_context,
1821 &token.pos, token.comment);
1822 else
1823 arglist_parser_remember (argparser, arg, token.string,
1824 inner_context,
1825 token.pos.file_name, token.pos.line_number,
1826 token.comment);
1827 drop_reference (token.comment);
1828 next_context_iter = null_context_list_iterator;
1829 selectorcall_context_iter = null_context_list_iterator;
1830 state = 0;
1831 continue;
1832
1833 case xgettext_token_type_other:
1834 next_context_iter = null_context_list_iterator;
1835 selectorcall_context_iter = null_context_list_iterator;
1836 state = 0;
1837 continue;
1838
1839 case xgettext_token_type_eof:
1840 arglist_parser_done (argparser, arg);
1841 return true;
1842
1843 default:
1844 abort ();
1845 }
1846 }
1847 }
1848
1849
1850 static void
extract_whole_file(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1851 extract_whole_file (FILE *f,
1852 const char *real_filename, const char *logical_filename,
1853 flag_context_list_table_ty *flag_table,
1854 msgdomain_list_ty *mdlp)
1855 {
1856 message_list_ty *mlp = mdlp->item[0]->messages;
1857
1858 fp = f;
1859 real_file_name = real_filename;
1860 logical_file_name = xstrdup (logical_filename);
1861 line_number = 1;
1862
1863 newline_count = 0;
1864 last_comment_line = -1;
1865 last_non_comment_line = -1;
1866
1867 flag_context_list_table = flag_table;
1868
1869 init_keywords ();
1870
1871 /* Eat tokens until eof is seen. When extract_parenthesized returns
1872 due to an unbalanced closing parenthesis, just restart it. */
1873 while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1874 arglist_parser_alloc (mlp, NULL)))
1875 ;
1876
1877 /* Close scanner. */
1878 fp = NULL;
1879 real_file_name = NULL;
1880 logical_file_name = NULL;
1881 line_number = 0;
1882 }
1883
1884
1885 void
extract_c(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1886 extract_c (FILE *f,
1887 const char *real_filename, const char *logical_filename,
1888 flag_context_list_table_ty *flag_table,
1889 msgdomain_list_ty *mdlp)
1890 {
1891 objc_extensions = false;
1892 extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
1893 }
1894
1895 void
extract_objc(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1896 extract_objc (FILE *f,
1897 const char *real_filename, const char *logical_filename,
1898 flag_context_list_table_ty *flag_table,
1899 msgdomain_list_ty *mdlp)
1900 {
1901 objc_extensions = true;
1902 extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
1903 }
1904