xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/x-c.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* xgettext C/C++/ObjectiveC backend.
2    Copyright (C) 1995-1998, 2000-2006 Free Software Foundation, Inc.
3 
4    This file was written by Peter Miller <millerp@canb.auug.org.au>
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-c.h"
33 #include "error.h"
34 #include "error-progname.h"
35 #include "xalloc.h"
36 #include "xvasprintf.h"
37 #include "exit.h"
38 #include "hash.h"
39 #include "gettext.h"
40 
/* Shorthand for marking and translating a user-visible message.  */
#define _(s) gettext(s)

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized before it is subscripted, so
   that an argument which is a cast or unary expression is measured as a
   whole.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
44 
45 
46 /* The ANSI C standard defines several phases of translation:
47 
48    1. Terminate line by \n, regardless of the external representation
49       of a text line.  Stdio does this for us.
50 
51    2. Convert trigraphs to their single character equivalents.
52 
53    3. Concatenate each line ending in backslash (\) with the following
54       line.
55 
56    4. Replace each comment with a space character.
57 
   5. Parse each resulting logical line as preprocessing tokens and
      white space.
60 
61    6. Recognize and carry out directives (it also expands macros on
62       non-directive lines, which we do not do here).
63 
64    7. Replaces escape sequences within character strings with their
65       single character equivalents (we do this in step 5, because we
66       don't have to worry about the #include argument).
67 
68    8. Concatenates adjacent string literals to form single string
69       literals (because we don't expand macros, there are a few things
70       we will miss).
71 
72    9. Converts the remaining preprocessing tokens to C tokens and
73       discards any white space from the translation unit.
74 
75    This lexer implements the above, and presents the scanner (in
76    xgettext.c) with a stream of C tokens.  The comments are
77    accumulated in a buffer, and given to xgettext when asked for.  */
78 
79 
80 /* ========================= Lexer customization.  ========================= */
81 
/* Whether phase 2 of the lexer translates trigraph sequences.
   Off unless x_c_trigraphs () has been called.  */
static bool trigraphs = false;

/* Enable trigraph translation for all input read from now on.  */
void
x_c_trigraphs ()
{
  trigraphs = true;
}
89 
90 
91 /* ====================== Keyword set customization.  ====================== */
92 
/* If true extract all strings.  Switched on by x_c_extract_all ().  */
static bool extract_all = false;

/* Keyword tables for C/C++ and for ObjectiveC sources.  Each table is
   initialized by add_keyword () the first time a keyword is added to it.  */
static hash_table c_keywords;
static hash_table objc_keywords;
/* True as long as init_keywords () still has to install the built-in
   keywords.  Cleared by add_keyword () when it is given a NULL name, and by
   init_keywords () once the built-in keywords have been installed.  */
static bool default_keywords = true;
99 
100 
101 void
x_c_extract_all()102 x_c_extract_all ()
103 {
104   extract_all = true;
105 }
106 
107 
108 static void
add_keyword(const char * name,hash_table * keywords)109 add_keyword (const char *name, hash_table *keywords)
110 {
111   if (name == NULL)
112     default_keywords = false;
113   else
114     {
115       const char *end;
116       struct callshape shape;
117       const char *colon;
118 
119       if (keywords->table == NULL)
120 	hash_init (keywords, 100);
121 
122       split_keywordspec (name, &end, &shape);
123 
124       /* The characters between name and end should form a valid C identifier.
125 	 A colon means an invalid parse in split_keywordspec().  */
126       colon = strchr (name, ':');
127       if (colon == NULL || colon >= end)
128 	insert_keyword_callshape (keywords, name, end - name, &shape);
129     }
130 }
131 
132 void
x_c_keyword(const char * name)133 x_c_keyword (const char *name)
134 {
135   add_keyword (name, &c_keywords);
136 }
137 
138 void
x_objc_keyword(const char * name)139 x_objc_keyword (const char *name)
140 {
141   add_keyword (name, &objc_keywords);
142 }
143 
144 /* Finish initializing the keywords hash tables.
145    Called after argument processing, before each file is processed.  */
146 static void
init_keywords()147 init_keywords ()
148 {
149   if (default_keywords)
150     {
151       /* When adding new keywords here, also update the documentation in
152 	 xgettext.texi!  */
153       x_c_keyword ("gettext");
154       x_c_keyword ("dgettext:2");
155       x_c_keyword ("dcgettext:2");
156       x_c_keyword ("ngettext:1,2");
157       x_c_keyword ("dngettext:2,3");
158       x_c_keyword ("dcngettext:2,3");
159       x_c_keyword ("gettext_noop");
160       x_c_keyword ("pgettext:1c,2");
161       x_c_keyword ("dpgettext:2c,3");
162       x_c_keyword ("dcpgettext:2c,3");
163       x_c_keyword ("npgettext:1c,2,3");
164       x_c_keyword ("dnpgettext:2c,3,4");
165       x_c_keyword ("dcnpgettext:2c,3,4");
166 
167       x_objc_keyword ("gettext");
168       x_objc_keyword ("dgettext:2");
169       x_objc_keyword ("dcgettext:2");
170       x_objc_keyword ("ngettext:1,2");
171       x_objc_keyword ("dngettext:2,3");
172       x_objc_keyword ("dcngettext:2,3");
173       x_objc_keyword ("gettext_noop");
174       x_objc_keyword ("pgettext:1c,2");
175       x_objc_keyword ("dpgettext:2c,3");
176       x_objc_keyword ("dcpgettext:2c,3");
177       x_objc_keyword ("npgettext:1c,2,3");
178       x_objc_keyword ("dnpgettext:2c,3,4");
179       x_objc_keyword ("dcnpgettext:2c,3,4");
180       x_objc_keyword ("NSLocalizedString");	  /* similar to gettext */
181       x_objc_keyword ("_");			  /* similar to gettext */
182       x_objc_keyword ("NSLocalizedStaticString"); /* similar to gettext_noop */
183       x_objc_keyword ("__");			  /* similar to gettext_noop */
184 
185       default_keywords = false;
186     }
187 }
188 
/* Record the format flags that apply to C and C++ sources.
   Each specification in the table is handed to xgettext_record_flag (),
   in table order.  */
void
init_flag_table_c ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-c-format",
      "dgettext:2:pass-c-format",
      "dcgettext:2:pass-c-format",
      "ngettext:1:pass-c-format",
      "ngettext:2:pass-c-format",
      "dngettext:2:pass-c-format",
      "dngettext:3:pass-c-format",
      "dcngettext:2:pass-c-format",
      "dcngettext:3:pass-c-format",
      "gettext_noop:1:pass-c-format",
      "pgettext:2:pass-c-format",
      "dpgettext:3:pass-c-format",
      "dcpgettext:3:pass-c-format",
      "npgettext:2:pass-c-format",
      "npgettext:3:pass-c-format",
      "dnpgettext:3:pass-c-format",
      "dnpgettext:4:pass-c-format",
      "dcnpgettext:3:pass-c-format",
      "dcnpgettext:4:pass-c-format",

      /* <stdio.h> */
      "fprintf:2:c-format",
      "vfprintf:2:c-format",
      "printf:1:c-format",
      "vprintf:1:c-format",
      "sprintf:2:c-format",
      "vsprintf:2:c-format",
      "snprintf:3:c-format",
      "vsnprintf:3:c-format",
#if 0 /* These functions are not standard.  */
      /* <stdio.h> */
      "asprintf:2:c-format",
      "vasprintf:2:c-format",
      "dprintf:2:c-format",
      "vdprintf:2:c-format",
      "obstack_printf:2:c-format",
      "obstack_vprintf:2:c-format",
      /* <error.h> */
      "error:3:c-format",
      "error_at_line:5:c-format",
      /* <argp.h> */
      "argp_error:2:c-format",
      "argp_failure:2:c-format",
#endif

      "gettext:1:pass-boost-format",
      "dgettext:2:pass-boost-format",
      "dcgettext:2:pass-boost-format",
      "ngettext:1:pass-boost-format",
      "ngettext:2:pass-boost-format",
      "dngettext:2:pass-boost-format",
      "dngettext:3:pass-boost-format",
      "dcngettext:2:pass-boost-format",
      "dcngettext:3:pass-boost-format",
      "gettext_noop:1:pass-boost-format",
      "pgettext:2:pass-boost-format",
      "dpgettext:3:pass-boost-format",
      "dcpgettext:3:pass-boost-format",
      "npgettext:2:pass-boost-format",
      "npgettext:3:pass-boost-format",
      "dnpgettext:3:pass-boost-format",
      "dnpgettext:4:pass-boost-format",
      "dcnpgettext:3:pass-boost-format",
      "dcnpgettext:4:pass-boost-format",

      /* <boost/format.hpp> */
      "format:1:boost-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
260 
/* Record the format flags that are specific to ObjectiveC sources.
   Each specification in the table is handed to xgettext_record_flag (),
   in table order.  */
void
init_flag_table_objc ()
{
  /* Since the settings done in init_flag_table_c() also have an effect for
     the ObjectiveC parser, we don't have to repeat them here.  */
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-objc-format",
      "dgettext:2:pass-objc-format",
      "dcgettext:2:pass-objc-format",
      "ngettext:1:pass-objc-format",
      "ngettext:2:pass-objc-format",
      "dngettext:2:pass-objc-format",
      "dngettext:3:pass-objc-format",
      "dcngettext:2:pass-objc-format",
      "dcngettext:3:pass-objc-format",
      "gettext_noop:1:pass-objc-format",
      "pgettext:2:pass-objc-format",
      "dpgettext:3:pass-objc-format",
      "dcpgettext:3:pass-objc-format",
      "npgettext:2:pass-objc-format",
      "npgettext:3:pass-objc-format",
      "dnpgettext:3:pass-objc-format",
      "dnpgettext:4:pass-objc-format",
      "dcnpgettext:3:pass-objc-format",
      "dcnpgettext:4:pass-objc-format",
      "NSLocalizedString:1:pass-c-format",
      "NSLocalizedString:1:pass-objc-format",
      "_:1:pass-c-format",
      "_:1:pass-objc-format",
      "stringWithFormat::1:objc-format",
      "initWithFormat::1:objc-format",
      "stringByAppendingFormat::1:objc-format",
      "localizedStringWithFormat::1:objc-format",
      "appendFormat::1:objc-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
295 
/* Record the format flags for the GCC internal format.
   Each active specification in the table is handed to
   xgettext_record_flag (), in table order.  The entries under "#if 0"
   are kept for reference only and are not compiled in.  */
void
init_flag_table_gcc_internal ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-gcc-internal-format",
      "dgettext:2:pass-gcc-internal-format",
      "dcgettext:2:pass-gcc-internal-format",
      "ngettext:1:pass-gcc-internal-format",
      "ngettext:2:pass-gcc-internal-format",
      "dngettext:2:pass-gcc-internal-format",
      "dngettext:3:pass-gcc-internal-format",
      "dcngettext:2:pass-gcc-internal-format",
      "dcngettext:3:pass-gcc-internal-format",
      "gettext_noop:1:pass-gcc-internal-format",
      "pgettext:2:pass-gcc-internal-format",
      "dpgettext:3:pass-gcc-internal-format",
      "dcpgettext:3:pass-gcc-internal-format",
      "npgettext:2:pass-gcc-internal-format",
      "npgettext:3:pass-gcc-internal-format",
      "dnpgettext:3:pass-gcc-internal-format",
      "dnpgettext:4:pass-gcc-internal-format",
      "dcnpgettext:3:pass-gcc-internal-format",
      "dcnpgettext:4:pass-gcc-internal-format",
#if 0 /* This should better be done inside GCC.  */
      /* grepping for ATTRIBUTE_PRINTF in gcc-3.3/gcc/?*.h */
      /* c-format.c */
      "status_warning:2:gcc-internal-format",
      /* c-tree.h */
      "pedwarn_c99:1:pass-gcc-internal-format",
      /* collect2.h */
      //"error:1:c-format", // 3 different versions
      "notice:1:c-format",
      //"fatal:1:c-format", // 2 different versions
      "fatal_perror:1:c-format",
      /* cpplib.h */
      "cpp_error:3:c-format",
      "cpp_error_with_line:5:c-format",
      /* diagnostic.h */
      "diagnostic_set_info:2:pass-gcc-internal-format",
      "output_printf:2:gcc-internal-format",
      "output_verbatim:2:pass-gcc-internal-format",
      "verbatim:1:gcc-internal-format",
      "inform:1:pass-gcc-internal-format",
      /* gcc.h */
      //"fatal:1:c-format", // 2 different versions
      //"error:1:c-format", // 3 different versions
      /* genattrtab.h */
      "attr_printf:2:pass-c-format",
      /* gengtype.h */
      "error_at_line:2:pass-c-format",
      "xvasprintf:2:pass-c-format",
      "xasprintf:1:pass-c-format",
      "oprintf:2:pass-c-format",
      /* gensupport.h */
      "message_with_line:2:pass-c-format",
      /* output.h */
      "output_operand_lossage:1:c-format",
      /* ra.h */
      "ra_debug_msg:2:pass-c-format",
      /* toplev.h */
      "fnotice:2:c-format",
      "fatal_io_error:2:gcc-internal-format",
      "error_for_asm:2:pass-gcc-internal-format",
      "warning_for_asm:2:pass-gcc-internal-format",
      "error_with_file_and_line:3:pass-gcc-internal-format",
      "error_with_decl:2:pass-gcc-internal-format",
      "pedwarn:1:gcc-internal-format",
      "pedwarn_with_file_and_line:3:gcc-internal-format",
      "pedwarn_with_decl:2:gcc-internal-format",
      "sorry:1:gcc-internal-format",
      "error:1:pass-gcc-internal-format",
      "fatal_error:1:pass-gcc-internal-format",
      "internal_error:1:pass-gcc-internal-format",
      "warning:1:pass-gcc-internal-format",
      "warning_with_file_and_line:3:pass-gcc-internal-format",
      "warning_with_decl:2:pass-gcc-internal-format",
      /* f/com.h */
      "ffecom_get_invented_identifier:1:pass-c-format",
      /* f/sts.h */
      "ffests_printf:2:pass-c-format",
      /* java/java-tree.h */
      "parse_error_context:2:pass-c-format",
#endif
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
379 
380 
381 /* ======================== Reading of characters.  ======================== */
382 
/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number is kept up to date by phase1_getc () and phase1_ungetc (),
   which count the newlines that are consumed and pushed back.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  All raw reads (phase0_getc) go through it.  */
static FILE *fp;
392 
393 
394 /* 0. Terminate line by \n, regardless whether the external representation of
395    a line terminator is LF (Unix), CR (Mac) or CR/LF (DOS/Windows).
396    It is debatable whether supporting CR/LF line terminators in C sources
397    on Unix is ISO C or POSIX compliant, but since GCC 3.3 now supports it
398    unconditionally, it must be OK.
399    The so-called "text mode" in stdio on DOS/Windows translates CR/LF to \n
400    automatically, but here we also need this conversion on Unix.  As a side
401    effect, on DOS/Windows we also parse CR/CR/LF into a single \n, but this
402    is not a problem.  */
403 
404 
405 static int
phase0_getc()406 phase0_getc ()
407 {
408   int c;
409 
410   c = getc (fp);
411   if (c == EOF)
412     {
413       if (ferror (fp))
414 	error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
415 	       real_file_name);
416       return EOF;
417     }
418 
419   if (c == '\r')
420     {
421       int c1 = getc (fp);
422 
423       if (c1 != EOF && c1 != '\n')
424 	ungetc (c1, fp);
425 
426       /* Seen line terminator CR or CR/LF.  */
427       return '\n';
428     }
429 
430   return c;
431 }
432 
433 
434 /* Supports only one pushback character, and not '\n'.  */
435 static inline void
phase0_ungetc(int c)436 phase0_ungetc (int c)
437 {
438   if (c != EOF)
439     ungetc (c, fp);
440 }
441 
442 
443 /* 1. line_number handling.  Combine backslash-newline to nothing.  */
444 
/* Pushback stack of phase 1: the character pushed last is read first.
   Two slots are needed because phase2_getc () pushes back two characters
   after a "??" that does not start a trigraph.  */
static unsigned char phase1_pushback[2];
/* Number of characters currently stored in phase1_pushback.  */
static int phase1_pushback_length;
447 
448 
449 static int
phase1_getc()450 phase1_getc ()
451 {
452   int c;
453 
454   if (phase1_pushback_length)
455     {
456       c = phase1_pushback[--phase1_pushback_length];
457       if (c == '\n')
458 	++line_number;
459       return c;
460     }
461   for (;;)
462     {
463       c = phase0_getc ();
464       switch (c)
465 	{
466 	case '\n':
467 	  ++line_number;
468 	  return '\n';
469 
470 	case '\\':
471 	  c = phase0_getc ();
472 	  if (c != '\n')
473 	    {
474 	      phase0_ungetc (c);
475 	      return '\\';
476 	    }
477 	  ++line_number;
478 	  break;
479 
480 	default:
481 	  return c;
482 	}
483     }
484 }
485 
486 
487 /* Supports 2 characters of pushback.  */
488 static void
phase1_ungetc(int c)489 phase1_ungetc (int c)
490 {
491   switch (c)
492     {
493     case EOF:
494       break;
495 
496     case '\n':
497       --line_number;
498       /* FALLTHROUGH */
499 
500     default:
501       if (phase1_pushback_length == SIZEOF (phase1_pushback))
502 	abort ();
503       phase1_pushback[phase1_pushback_length++] = c;
504       break;
505     }
506 }
507 
508 
509 /* 2. Convert trigraphs to their single character equivalents.  Most
510    sane human beings vomit copiously at the mention of trigraphs, which
511    is why they are an option.  */
512 
/* Pushback buffer of phase 2 (a single slot), filled by phase2_ungetc ()
   and drained by phase2_getc ().  */
static unsigned char phase2_pushback[1];
/* Number of characters currently stored in phase2_pushback.  */
static int phase2_pushback_length;
515 
516 
517 static int
phase2_getc()518 phase2_getc ()
519 {
520   int c;
521 
522   if (phase2_pushback_length)
523     return phase2_pushback[--phase2_pushback_length];
524   if (!trigraphs)
525     return phase1_getc ();
526 
527   c = phase1_getc ();
528   if (c != '?')
529     return c;
530   c = phase1_getc ();
531   if (c != '?')
532     {
533       phase1_ungetc (c);
534       return '?';
535     }
536   c = phase1_getc ();
537   switch (c)
538     {
539     case '(':
540       return '[';
541     case '/':
542       return '\\';
543     case ')':
544       return ']';
545     case '\'':
546       return '^';
547     case '<':
548       return '{';
549     case '!':
550       return '|';
551     case '>':
552       return '}';
553     case '-':
554       return '~';
555     case '#':
556       return '=';
557     }
558   phase1_ungetc (c);
559   phase1_ungetc ('?');
560   return '?';
561 }
562 
563 
564 /* Supports only one pushback character.  */
565 static void
phase2_ungetc(int c)566 phase2_ungetc (int c)
567 {
568   if (c != EOF)
569     {
570       if (phase2_pushback_length == SIZEOF (phase2_pushback))
571 	abort ();
572       phase2_pushback[phase2_pushback_length++] = c;
573     }
574 }
575 
576 
577 /* 3. Concatenate each line ending in backslash (\) with the following
578    line.  Basically, all you need to do is elide "\\\n" sequences from
579    the input.  */
580 
/* Pushback stack of phase 3: the character pushed last is read first.
   Two slots are needed because phase7_getc () pushes back two characters
   after a "\x" that is not followed by a hexadecimal digit.  */
static unsigned char phase3_pushback[2];
/* Number of characters currently stored in phase3_pushback.  */
static int phase3_pushback_length;
583 
584 
585 static int
phase3_getc()586 phase3_getc ()
587 {
588   if (phase3_pushback_length)
589     return phase3_pushback[--phase3_pushback_length];
590   for (;;)
591     {
592       int c = phase2_getc ();
593       if (c != '\\')
594 	return c;
595       c = phase2_getc ();
596       if (c != '\n')
597 	{
598 	  phase2_ungetc (c);
599 	  return '\\';
600 	}
601     }
602 }
603 
604 
605 /* Supports 2 characters of pushback.  */
606 static void
phase3_ungetc(int c)607 phase3_ungetc (int c)
608 {
609   if (c != EOF)
610     {
611       if (phase3_pushback_length == SIZEOF (phase3_pushback))
612 	abort ();
613       phase3_pushback[phase3_pushback_length++] = c;
614     }
615 }
616 
617 
618 /* Accumulating comments.  */
619 
/* Text of the comment line being accumulated.  buffer has room for bufmax
   bytes, of which the first buflen are in use.  It is grown on demand by
   comment_add () and comment_line_end ().  */
static char *buffer;
static size_t bufmax;
static size_t buflen;
623 
624 static inline void
comment_start()625 comment_start ()
626 {
627   buflen = 0;
628 }
629 
630 static inline void
comment_add(int c)631 comment_add (int c)
632 {
633   if (buflen >= bufmax)
634     {
635       bufmax = 2 * bufmax + 10;
636       buffer = xrealloc (buffer, bufmax);
637     }
638   buffer[buflen++] = c;
639 }
640 
641 static inline void
comment_line_end(size_t chars_to_remove)642 comment_line_end (size_t chars_to_remove)
643 {
644   buflen -= chars_to_remove;
645   while (buflen >= 1
646 	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
647     --buflen;
648   if (chars_to_remove == 0 && buflen >= bufmax)
649     {
650       bufmax = 2 * bufmax + 10;
651       buffer = xrealloc (buffer, bufmax);
652     }
653   buffer[buflen] = '\0';
654   savable_comment_add (buffer);
655 }
656 
657 
658 /* These are for tracking whether comments count as immediately before
659    keyword.  */
/* last_comment_line is set by phase4_getc () to the value of newline_count
   each time a comment has been consumed.
   NOTE(review): the code that updates last_non_comment_line and
   newline_count is not part of the section reviewed here.  */
static int last_comment_line;
static int last_non_comment_line;
static int newline_count;
663 
664 
665 /* 4. Replace each comment that is not inside a character constant or
666    string literal with a space character.  We need to remember the
667    comment for later, because it may be attached to a keyword string.
668    We also optionally understand C++ comments.  */
669 
/* Read one character, with comments replaced.

   Characters from phase3_getc () are returned unchanged unless a '/'
   starts a comment:
     - a C comment is consumed and ' ' is returned in its place;
     - a C++ style comment is consumed up to the end of the line (or up to
       EOF) and '\n' is returned in its place.
   The text of the comment is passed on line by line through
   comment_line_end (), with leading spaces and tabs of each line skipped.
   After a comment, last_comment_line is set to newline_count.  */
static int
phase4_getc ()
{
  int c;
  bool last_was_star;

  c = phase3_getc ();
  if (c != '/')
    return c;
  c = phase3_getc ();
  switch (c)
    {
    default:
      /* A '/' that does not start a comment.  */
      phase3_ungetc (c);
      return '/';

    case '*':
      /* C comment.  */
      comment_start ();
      last_was_star = false;
      for (;;)
	{
	  c = phase3_getc ();
	  /* EOF inside the comment: stop.  The line accumulated so far is
	     not passed on.  */
	  if (c == EOF)
	    break;
	  /* We skip all leading white space, but not EOLs.  */
	  if (!(buflen == 0 && (c == ' ' || c == '\t')))
	    comment_add (c);
	  switch (c)
	    {
	    case '\n':
	      /* End of a comment line: drop the newline just added.  */
	      comment_line_end (1);
	      comment_start ();
	      last_was_star = false;
	      continue;

	    case '*':
	      last_was_star = true;
	      continue;

	    case '/':
	      if (last_was_star)
		{
		  /* End of the comment: drop the two terminator characters
		     just added.  This break leaves the inner switch only;
		     the break after the switch leaves the loop.  */
		  comment_line_end (2);
		  break;
		}
	      /* FALLTHROUGH */

	    default:
	      last_was_star = false;
	      continue;
	    }
	  break;
	}
      last_comment_line = newline_count;
      return ' ';

    case '/':
      /* C++ or ISO C 99 comment.  */
      comment_start ();
      for (;;)
	{
	  c = phase3_getc ();
	  if (c == '\n' || c == EOF)
	    break;
	  /* We skip all leading white space, but not EOLs.  */
	  if (!(buflen == 0 && (c == ' ' || c == '\t')))
	    comment_add (c);
	}
      /* The terminator was not added to the buffer, so nothing is to be
	 removed.  */
      comment_line_end (0);
      last_comment_line = newline_count;
      return '\n';
    }
}
744 
745 
/* Push C back so that the next phase4_getc () returns it.  The character
   is stored in the phase 3 pushback buffer.
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
752 
753 
754 /* ========================== Reading of tokens.  ========================== */
755 
756 
/* True if ObjectiveC extensions are recognized.
   NOTE(review): the code that sets and reads this flag is not part of
   the section reviewed here.  */
static bool objc_extensions;
759 
/* The kinds of token delivered by phase 5.  */
enum token_type_ty
{
  token_type_character_constant,	/* 'x' */
  token_type_eof,
  token_type_eoln,
  token_type_hash,			/* # */
  token_type_lparen,			/* ( */
  token_type_rparen,			/* ) */
  token_type_comma,			/* , */
  token_type_colon,			/* : */
  token_type_name,			/* abc */
  token_type_number,			/* 2.7 */
  token_type_string_literal,		/* "abc" */
  token_type_symbol,			/* < > = etc. */
  token_type_objc_special,		/* @ */
  token_type_white_space
};
typedef enum token_type_ty token_type_ty;

/* A token.  Which members are meaningful depends on the type; free_token ()
   releases string and comment for exactly the types named below.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;		/* for token_type_name, token_type_string_literal */
  refcounted_string_list_ty *comment;	/* for token_type_string_literal,
					   token_type_objc_special */
  long number;		/* set to 0 by phase5_get () for a new token;
			   NOTE(review): other uses are not part of the
			   section reviewed here */
  int line_number;	/* value of the global line_number when
			   phase5_get () started reading the token */
};
789 
790 
791 /* 7. Replace escape sequences within character strings with their
792    single character equivalents.  This is called from phase 5, because
793    we don't have to worry about the #include argument.  There are
794    pathological cases which could bite us (like the DOS directory
795    separator), but just pretend it can't happen.  */
796 
/* Special return values of phase7_getc () for an unescaped double quote,
   single quote and newline.  The offset of 1000 keeps them apart from the
   plain character values '"', '\'' and '\n', which phase7_getc () returns
   for the corresponding escape sequences.  */
#define P7_QUOTES (1000 + '"')
#define P7_QUOTE (1000 + '\'')
#define P7_NEWLINE (1000 + '\n')
800 
/* Read one character of a string literal or character constant, with
   escape sequences replaced by the character they denote.

   Returns:
     - P7_NEWLINE, P7_QUOTES or P7_QUOTE for an unescaped newline, double
       quote or single quote;
     - the denoted value for a recognized escape sequence: one of the
       simple escapes, "\x" followed by hexadecimal digits, or a backslash
       followed by one to three octal digits;
     - '\\' for a backslash that does not start a recognized escape
       sequence; what was read after it is pushed back to phase 3;
     - any other character, or EOF, as delivered by phase3_getc ().  */
static int
phase7_getc ()
{
  int c, n, j;

  /* Use phase 3, because phase 4 elides comments.  */
  c = phase3_getc ();

  /* Return a magic newline indicator, so that we can distinguish
     between the user requesting a newline in the string (e.g. using
     "\n" or "\012") from the user failing to terminate the string or
     character constant.  The ANSI C standard says: 3.1.3.4 Character
     Constants contain ``any character except single quote, backslash or
     newline; or an escape sequence'' and 3.1.4 String Literals contain
     ``any character except double quote, backslash or newline; or an
     escape sequence''.

     Most compilers give a fatal error in this case, however gcc is
     stupidly silent, even though this is a very common typo.  OK, so
     gcc --pedantic will tell me, but that gripes about too much other
     stuff.  Could I have a ``gcc -Wnewline-in-string'' option, or
     better yet a ``gcc -fno-newline-in-string'' option, please?  Gcc is
     also inconsistent between string literals and character constants:
     you may not embed newlines in character constants; try it, you get
     a useful diagnostic.  --PMiller  */
  if (c == '\n')
    return P7_NEWLINE;

  if (c == '"')
    return P7_QUOTES;
  if (c == '\'')
    return P7_QUOTE;
  if (c != '\\')
    return c;
  /* A backslash: decode the escape sequence it introduces.  */
  c = phase3_getc ();
  switch (c)
    {
    default:
      /* Unknown escape sequences really should be an error, but just
	 ignore them, and let the real compiler complain.  */
      phase3_ungetc (c);
      return '\\';

    case '"':
    case '\'':
    case '?':
    case '\\':
      return c;

    case 'a':
      return '\a';
    case 'b':
      return '\b';

      /* The \e escape is peculiar to gcc, and assumes an ASCII
	 character set (or superset).  We don't provide support for it
	 here.  */

    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case 'v':
      return '\v';

    case 'x':
      c = phase3_getc ();
      switch (c)
	{
	default:
	  /* No hexadecimal digit after "\x": give both characters back
	     (this is what needs the second phase 3 pushback slot) and
	     return the backslash alone.  */
	  phase3_ungetc (c);
	  phase3_ungetc ('x');
	  return '\\';

	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
	  break;
	}
      /* Accumulate hexadecimal digits until the first character that is
	 not one; that character is pushed back.
	 NOTE(review): the number of digits is not limited, so a long
	 digit sequence can overflow the int n.  */
      n = 0;
      for (;;)
	{
	  switch (c)
	    {
	    default:
	      phase3_ungetc (c);
	      return n;

	    case '0': case '1': case '2': case '3': case '4':
	    case '5': case '6': case '7': case '8': case '9':
	      n = n * 16 + c - '0';
	      break;

	    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
	      n = n * 16 + 10 + c - 'A';
	      break;

	    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
	      n = n * 16 + 10 + c - 'a';
	      break;
	    }
	  c = phase3_getc ();
	}
      /* Not reached: the loop above is only left through its return.  */
      return n;

    case '0': case '1': case '2': case '3':
    case '4': case '5': case '6': case '7':
      /* Octal escape: at most three digits.  */
      n = 0;
      for (j = 0; j < 3; ++j)
	{
	  n = n * 8 + c - '0';
	  c = phase3_getc ();
	  switch (c)
	    {
	    default:
	      break;

	    case '0': case '1': case '2': case '3':
	    case '4': case '5': case '6': case '7':
	      continue;
	    }
	  break;
	}
      /* c is the first character that is not part of the escape.  */
      phase3_ungetc (c);
      return n;
    }
}
933 
934 
/* Push C back for phase 7.  The character is stored in the phase 3
   pushback buffer, from which phase7_getc () reads.  */
static void
phase7_ungetc (int c)
{
  phase3_ungetc (c);
}
940 
941 
942 /* Free the memory pointed to by a 'struct token_ty'.  */
943 static inline void
free_token(token_ty * tp)944 free_token (token_ty *tp)
945 {
946   if (tp->type == token_type_name || tp->type == token_type_string_literal)
947     free (tp->string);
948   if (tp->type == token_type_string_literal
949       || tp->type == token_type_objc_special)
950     drop_reference (tp->comment);
951 }
952 
953 
954 /* 5. Parse each resulting logical line as preprocessing tokens and
955    white space.  Preprocessing tokens and C tokens don't always match.  */
956 
/* Pushback buffer of phase 5 (a single token).  phase5_get () returns a
   token stored here before it reads new input.  */
static token_ty phase5_pushback[1];
/* Number of tokens currently stored in phase5_pushback.  */
static int phase5_pushback_length;
959 
960 
961 static void
phase5_get(token_ty * tp)962 phase5_get (token_ty *tp)
963 {
964   static char *buffer;
965   static int bufmax;
966   int bufpos;
967   int c;
968 
969   if (phase5_pushback_length)
970     {
971       *tp = phase5_pushback[--phase5_pushback_length];
972       return;
973     }
974   tp->string = NULL;
975   tp->number = 0;
976   tp->line_number = line_number;
977   c = phase4_getc ();
978   switch (c)
979     {
980     case EOF:
981       tp->type = token_type_eof;
982       return;
983 
984     case '\n':
985       tp->type = token_type_eoln;
986       return;
987 
988     case ' ':
989     case '\f':
990     case '\t':
991       for (;;)
992 	{
993 	  c = phase4_getc ();
994 	  switch (c)
995 	    {
996 	    case ' ':
997 	    case '\f':
998 	    case '\t':
999 	      continue;
1000 
1001 	    default:
1002 	      phase4_ungetc (c);
1003 	      break;
1004 	    }
1005 	  break;
1006 	}
1007       tp->type = token_type_white_space;
1008       return;
1009 
1010     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1011     case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
1012     case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1013     case 'V': case 'W': case 'X': case 'Y': case 'Z':
1014     case '_':
1015     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1016     case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1017     case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1018     case 'v': case 'w': case 'x': case 'y': case 'z':
1019       bufpos = 0;
1020       for (;;)
1021 	{
1022 	  if (bufpos >= bufmax)
1023 	    {
1024 	      bufmax = 2 * bufmax + 10;
1025 	      buffer = xrealloc (buffer, bufmax);
1026 	    }
1027 	  buffer[bufpos++] = c;
1028 	  c = phase4_getc ();
1029 	  switch (c)
1030 	    {
1031 	    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1032 	    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1033 	    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1034 	    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1035 	    case 'Y': case 'Z':
1036 	    case '_':
1037 	    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1038 	    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1039 	    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1040 	    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1041 	    case 'y': case 'z':
1042 	    case '0': case '1': case '2': case '3': case '4':
1043 	    case '5': case '6': case '7': case '8': case '9':
1044 	      continue;
1045 
1046 	    default:
1047 	      phase4_ungetc (c);
1048 	      break;
1049 	    }
1050 	  break;
1051 	}
1052       if (bufpos >= bufmax)
1053 	{
1054 	  bufmax = 2 * bufmax + 10;
1055 	  buffer = xrealloc (buffer, bufmax);
1056 	}
1057       buffer[bufpos] = 0;
1058       tp->string = xstrdup (buffer);
1059       tp->type = token_type_name;
1060       return;
1061 
1062     case '.':
1063       c = phase4_getc ();
1064       phase4_ungetc (c);
1065       switch (c)
1066 	{
1067 	default:
1068 	  tp->type = token_type_symbol;
1069 	  return;
1070 
1071 	case '0': case '1': case '2': case '3': case '4':
1072 	case '5': case '6': case '7': case '8': case '9':
1073 	  c = '.';
1074 	  break;
1075 	}
1076       /* FALLTHROUGH */
1077 
1078     case '0': case '1': case '2': case '3': case '4':
1079     case '5': case '6': case '7': case '8': case '9':
1080       /* The preprocessing number token is more "generous" than the C
1081 	 number tokens.  This is mostly due to token pasting (another
1082 	 thing we can ignore here).  */
1083       bufpos = 0;
1084       for (;;)
1085 	{
1086 	  if (bufpos >= bufmax)
1087 	    {
1088 	      bufmax = 2 * bufmax + 10;
1089 	      buffer = xrealloc (buffer, bufmax);
1090 	    }
1091 	  buffer[bufpos++] = c;
1092 	  c = phase4_getc ();
1093 	  switch (c)
1094 	    {
1095 	    case 'e':
1096 	    case 'E':
1097 	      if (bufpos >= bufmax)
1098 		{
1099 		  bufmax = 2 * bufmax + 10;
1100 		  buffer = xrealloc (buffer, bufmax);
1101 		}
1102 	      buffer[bufpos++] = c;
1103 	      c = phase4_getc ();
1104 	      if (c != '+' || c != '-')
1105 		{
1106 		  phase4_ungetc (c);
1107 		  break;
1108 		}
1109 	      continue;
1110 
1111 	    case 'A': case 'B': case 'C': case 'D':           case 'F':
1112 	    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1113 	    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1114 	    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1115 	    case 'Y': case 'Z':
1116 	    case 'a': case 'b': case 'c': case 'd':           case 'f':
1117 	    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1118 	    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1119 	    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1120 	    case 'y': case 'z':
1121 	    case '0': case '1': case '2': case '3': case '4':
1122 	    case '5': case '6': case '7': case '8': case '9':
1123 	    case '.':
1124 	      continue;
1125 
1126 	    default:
1127 	      phase4_ungetc (c);
1128 	      break;
1129 	    }
1130 	  break;
1131 	}
1132       if (bufpos >= bufmax)
1133 	{
1134 	  bufmax = 2 * bufmax + 10;
1135 	  buffer = xrealloc (buffer, bufmax);
1136 	}
1137       buffer[bufpos] = 0;
1138       tp->type = token_type_number;
1139       tp->number = atol (buffer);
1140       return;
1141 
1142     case '\'':
1143       /* We could worry about the 'L' before wide character constants,
1144 	 but ignoring it has no effect unless one of the keywords is
1145 	 "L".  Just pretend it won't happen.  Also, we don't need to
1146 	 remember the character constant.  */
1147       for (;;)
1148 	{
1149 	  c = phase7_getc ();
1150 	  if (c == P7_NEWLINE)
1151 	    {
1152 	      error_with_progname = false;
1153 	      error (0, 0, _("%s:%d: warning: unterminated character constant"),
1154 		     logical_file_name, line_number - 1);
1155 	      error_with_progname = true;
1156 	      phase7_ungetc ('\n');
1157 	      break;
1158 	    }
1159 	  if (c == EOF || c == P7_QUOTE)
1160 	    break;
1161 	}
1162       tp->type = token_type_character_constant;
1163       return;
1164 
1165     case '"':
1166       /* We could worry about the 'L' before wide string constants,
1167 	 but since gettext's argument is not a wide character string,
1168 	 let the compiler complain about the argument not matching the
1169 	 prototype.  Just pretend it won't happen.  */
1170       bufpos = 0;
1171       for (;;)
1172 	{
1173 	  c = phase7_getc ();
1174 	  if (c == P7_NEWLINE)
1175 	    {
1176 	      error_with_progname = false;
1177 	      error (0, 0, _("%s:%d: warning: unterminated string literal"),
1178 		     logical_file_name, line_number - 1);
1179 	      error_with_progname = true;
1180 	      phase7_ungetc ('\n');
1181 	      break;
1182 	    }
1183 	  if (c == EOF || c == P7_QUOTES)
1184 	    break;
1185 	  if (c == P7_QUOTE)
1186 	    c = '\'';
1187 	  if (bufpos >= bufmax)
1188 	    {
1189 	      bufmax = 2 * bufmax + 10;
1190 	      buffer = xrealloc (buffer, bufmax);
1191 	    }
1192 	  buffer[bufpos++] = c;
1193 	}
1194       if (bufpos >= bufmax)
1195 	{
1196 	  bufmax = 2 * bufmax + 10;
1197 	  buffer = xrealloc (buffer, bufmax);
1198 	}
1199       buffer[bufpos] = 0;
1200       tp->type = token_type_string_literal;
1201       tp->string = xstrdup (buffer);
1202       tp->comment = add_reference (savable_comment);
1203       return;
1204 
1205     case '(':
1206       tp->type = token_type_lparen;
1207       return;
1208 
1209     case ')':
1210       tp->type = token_type_rparen;
1211       return;
1212 
1213     case ',':
1214       tp->type = token_type_comma;
1215       return;
1216 
1217     case '#':
1218       tp->type = token_type_hash;
1219       return;
1220 
1221     case ':':
1222       tp->type = token_type_colon;
1223       return;
1224 
1225     case '@':
1226       if (objc_extensions)
1227 	{
1228 	  tp->type = token_type_objc_special;
1229 	  tp->comment = add_reference (savable_comment);
1230 	  return;
1231 	}
1232       /* FALLTHROUGH */
1233 
1234     default:
1235       /* We could carefully recognize each of the 2 and 3 character
1236 	operators, but it is not necessary, as we only need to recognize
1237 	gettext invocations.  Don't bother.  */
1238       tp->type = token_type_symbol;
1239       return;
1240     }
1241 }
1242 
1243 
1244 /* Supports only one pushback token.  */
1245 static void
phase5_unget(token_ty * tp)1246 phase5_unget (token_ty *tp)
1247 {
1248   if (tp->type != token_type_eof)
1249     {
1250       if (phase5_pushback_length == SIZEOF (phase5_pushback))
1251 	abort ();
1252       phase5_pushback[phase5_pushback_length++] = *tp;
1253     }
1254 }
1255 
1256 
1257 /* X. Recognize a leading # symbol.  Leave leading hash as a hash, but
1258    turn hash in the middle of a line into a plain symbol token.  This
1259    makes the phase 6 easier.  */
1260 
1261 static void
phaseX_get(token_ty * tp)1262 phaseX_get (token_ty *tp)
1263 {
1264   static bool middle;	/* false at the beginning of a line, true otherwise.  */
1265 
1266   phase5_get (tp);
1267 
1268   if (tp->type == token_type_eoln || tp->type == token_type_eof)
1269     middle = false;
1270   else
1271     {
1272       if (middle)
1273 	{
1274 	  /* Turn hash in the middle of a line into a plain symbol token.  */
1275 	  if (tp->type == token_type_hash)
1276 	    tp->type = token_type_symbol;
1277 	}
1278       else
1279 	{
1280 	  /* When we see leading whitespace followed by a hash sign,
1281 	     discard the leading white space token.  The hash is all
1282 	     phase 6 is interested in.  */
1283 	  if (tp->type == token_type_white_space)
1284 	    {
1285 	      token_ty next;
1286 
1287 	      phase5_get (&next);
1288 	      if (next.type == token_type_hash)
1289 		*tp = next;
1290 	      else
1291 		phase5_unget (&next);
1292 	    }
1293 	  middle = true;
1294 	}
1295     }
1296 }
1297 
1298 
1299 /* 6. Recognize and carry out directives (it also expands macros on
1300    non-directive lines, which we do not do here).  The only directive
1301    we care about are the #line and #define directive.  We throw all the
1302    others away.  */
1303 
1304 static token_ty phase6_pushback[2];
1305 static int phase6_pushback_length;
1306 
1307 
1308 static void
phase6_get(token_ty * tp)1309 phase6_get (token_ty *tp)
1310 {
1311   static token_ty *buf;
1312   static int bufmax;
1313   int bufpos;
1314   int j;
1315 
1316   if (phase6_pushback_length)
1317     {
1318       *tp = phase6_pushback[--phase6_pushback_length];
1319       return;
1320     }
1321   for (;;)
1322     {
1323       /* Get the next token.  If it is not a '#' at the beginning of a
1324 	 line (ignoring whitespace), return immediately.  */
1325       phaseX_get (tp);
1326       if (tp->type != token_type_hash)
1327 	return;
1328 
1329       /* Accumulate the rest of the directive in a buffer, until the
1330 	 "define" keyword is seen or until end of line.  */
1331       bufpos = 0;
1332       for (;;)
1333 	{
1334 	  phaseX_get (tp);
1335 	  if (tp->type == token_type_eoln || tp->type == token_type_eof)
1336 	    break;
1337 
1338 	  /* Before the "define" keyword and inside other directives
1339 	     white space is irrelevant.  So just throw it away.  */
1340 	  if (tp->type != token_type_white_space)
1341 	    {
1342 	      /* If it is a #define directive, return immediately,
1343 		 thus treating the body of the #define directive like
1344 		 normal input.  */
1345 	      if (bufpos == 0
1346 		  && tp->type == token_type_name
1347 		  && strcmp (tp->string, "define") == 0)
1348 		return;
1349 
1350 	      /* Accumulate.  */
1351 	      if (bufpos >= bufmax)
1352 		{
1353 		  bufmax = 2 * bufmax + 10;
1354 		  buf = xrealloc (buf, bufmax * sizeof (buf[0]));
1355 		}
1356 	      buf[bufpos++] = *tp;
1357 	    }
1358 	}
1359 
1360       /* If it is a #line directive, with no macros to expand, act on
1361 	 it.  Ignore all other directives.  */
1362       if (bufpos >= 3 && buf[0].type == token_type_name
1363 	  && strcmp (buf[0].string, "line") == 0
1364 	  && buf[1].type == token_type_number
1365 	  && buf[2].type == token_type_string_literal)
1366 	{
1367 	  logical_file_name = xstrdup (buf[2].string);
1368 	  line_number = buf[1].number;
1369 	}
1370       if (bufpos >= 2 && buf[0].type == token_type_number
1371 	  && buf[1].type == token_type_string_literal)
1372 	{
1373 	  logical_file_name = xstrdup (buf[1].string);
1374 	  line_number = buf[0].number;
1375 	}
1376 
1377       /* Release the storage held by the directive.  */
1378       for (j = 0; j < bufpos; ++j)
1379 	free_token (&buf[j]);
1380 
1381       /* We must reset the selected comments.  */
1382       savable_comment_reset ();
1383     }
1384 }
1385 
1386 
1387 /* Supports 2 tokens of pushback.  */
1388 static void
phase6_unget(token_ty * tp)1389 phase6_unget (token_ty *tp)
1390 {
1391   if (tp->type != token_type_eof)
1392     {
1393       if (phase6_pushback_length == SIZEOF (phase6_pushback))
1394 	abort ();
1395       phase6_pushback[phase6_pushback_length++] = *tp;
1396     }
1397 }
1398 
1399 
1400 /* 8a. Convert ISO C 99 section 7.8.1 format string directives to string
1401    literal placeholders.  */
1402 
/* Return true if NAME is spelled like an ISO C 99 section 7.8.1 format
   string directive macro, i.e. matches
     P R I { d | i | o | u | x | X }
     { { | LEAST | FAST } { 8 | 16 | 32 | 64 } | MAX | PTR }
   and false otherwise.  */
static bool
is_inttypes_macro (const char *name)
{
  static const char *const widths[] = { "8", "16", "32", "64" };
  const char *rest;
  size_t i;

  /* Mandatory "PRI" prefix.  */
  if (strncmp (name, "PRI", 3) != 0)
    return false;

  /* Exactly one conversion letter.  The explicit NUL test is needed
     because strchr would otherwise match the terminator of the set.  */
  if (name[3] == '\0' || strchr ("diouxX", name[3]) == NULL)
    return false;
  rest = name + 4;

  /* The two suffixes that carry no width.  */
  if (strcmp (rest, "MAX") == 0 || strcmp (rest, "PTR") == 0)
    return true;

  /* Optional LEAST or FAST qualifier.  */
  if (strncmp (rest, "LEAST", 5) == 0)
    rest += 5;
  else if (strncmp (rest, "FAST", 4) == 0)
    rest += 4;

  /* What remains must be exactly one of the known widths.  */
  for (i = 0; i < sizeof widths / sizeof widths[0]; i++)
    if (strcmp (rest, widths[i]) == 0)
      return true;

  return false;
}
1441 
1442 static void
phase8a_get(token_ty * tp)1443 phase8a_get (token_ty *tp)
1444 {
1445   phase6_get (tp);
1446   if (tp->type == token_type_name && is_inttypes_macro (tp->string))
1447     {
1448       /* Turn PRIdXXX into "<PRIdXXX>".  */
1449       char *new_string = xasprintf ("<%s>", tp->string);
1450       free (tp->string);
1451       tp->string = new_string;
1452       tp->comment = add_reference (savable_comment);
1453       tp->type = token_type_string_literal;
1454     }
1455 }
1456 
1457 /* Supports 2 tokens of pushback.  */
1458 static inline void
phase8a_unget(token_ty * tp)1459 phase8a_unget (token_ty *tp)
1460 {
1461   phase6_unget (tp);
1462 }
1463 
1464 
1465 /* 8b. Drop whitespace.  */
1466 static void
phase8b_get(token_ty * tp)1467 phase8b_get (token_ty *tp)
1468 {
1469   for (;;)
1470     {
1471       phase8a_get (tp);
1472 
1473       if (tp->type == token_type_white_space)
1474 	continue;
1475       if (tp->type == token_type_eoln)
1476 	{
1477 	  /* We have to track the last occurrence of a string.  One
1478 	     mode of xgettext allows to group an extracted message
1479 	     with a comment for documentation.  The rule which states
1480 	     which comment is assumed to be grouped with the message
1481 	     says it should immediately precede it.  Our
1482 	     interpretation: between the last line of the comment and
1483 	     the line in which the keyword is found must be no line
1484 	     with non-white space tokens.  */
1485 	  ++newline_count;
1486 	  if (last_non_comment_line > last_comment_line)
1487 	    savable_comment_reset ();
1488 	  continue;
1489 	}
1490       break;
1491     }
1492 }
1493 
1494 /* Supports 2 tokens of pushback.  */
1495 static inline void
phase8b_unget(token_ty * tp)1496 phase8b_unget (token_ty *tp)
1497 {
1498   phase8a_unget (tp);
1499 }
1500 
1501 
1502 /* 8c. In ObjectiveC mode, drop '@' before a literal string.  We need to
1503    do this before performing concatenation of adjacent string literals.  */
1504 static void
phase8c_get(token_ty * tp)1505 phase8c_get (token_ty *tp)
1506 {
1507   token_ty tmp;
1508 
1509   phase8b_get (tp);
1510   if (tp->type != token_type_objc_special)
1511     return;
1512   phase8b_get (&tmp);
1513   if (tmp.type != token_type_string_literal)
1514     {
1515       phase8b_unget (&tmp);
1516       return;
1517     }
1518   /* Drop the '@' token and return immediately the following string.  */
1519   drop_reference (tmp.comment);
1520   tmp.comment = tp->comment;
1521   *tp = tmp;
1522 }
1523 
1524 /* Supports only one pushback token.  */
1525 static inline void
phase8c_unget(token_ty * tp)1526 phase8c_unget (token_ty *tp)
1527 {
1528   phase8b_unget (tp);
1529 }
1530 
1531 
1532 /* 8. Concatenate adjacent string literals to form single string
1533    literals (because we don't expand macros, there are a few things we
1534    will miss).  */
1535 
1536 static void
phase8_get(token_ty * tp)1537 phase8_get (token_ty *tp)
1538 {
1539   phase8c_get (tp);
1540   if (tp->type != token_type_string_literal)
1541     return;
1542   for (;;)
1543     {
1544       token_ty tmp;
1545       size_t len;
1546 
1547       phase8c_get (&tmp);
1548       if (tmp.type != token_type_string_literal)
1549 	{
1550 	  phase8c_unget (&tmp);
1551 	  return;
1552 	}
1553       len = strlen (tp->string);
1554       tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
1555       strcpy (tp->string + len, tmp.string);
1556       free (tmp.string);
1557     }
1558 }
1559 
1560 
1561 /* ===================== Reading of high-level tokens.  ==================== */
1562 
1563 
/* Kinds of high-level tokens delivered by x_c_lex.  */
typedef enum xgettext_token_type_ty
{
  xgettext_token_type_eof,              /* end of input */
  xgettext_token_type_keyword,          /* name found in the keyword table */
  xgettext_token_type_symbol,           /* any other name */
  xgettext_token_type_lparen,           /* '(' */
  xgettext_token_type_rparen,           /* ')' */
  xgettext_token_type_comma,            /* ',' */
  xgettext_token_type_colon,            /* ':' */
  xgettext_token_type_string_literal,   /* string literal */
  xgettext_token_type_other             /* everything else */
} xgettext_token_type_ty;
1577 
1578 typedef struct xgettext_token_ty xgettext_token_ty;
1579 struct xgettext_token_ty
1580 {
1581   xgettext_token_type_ty type;
1582 
1583   /* This field is used only for xgettext_token_type_keyword.  */
1584   const struct callshapes *shapes;
1585 
1586   /* This field is used only for xgettext_token_type_string_literal,
1587      xgettext_token_type_keyword, xgettext_token_type_symbol.  */
1588   char *string;
1589 
1590   /* This field is used only for xgettext_token_type_string_literal.  */
1591   refcounted_string_list_ty *comment;
1592 
1593   /* These fields are only for
1594        xgettext_token_type_keyword,
1595        xgettext_token_type_string_literal.  */
1596   lex_pos_ty pos;
1597 };
1598 
1599 
1600 /* 9. Convert the remaining preprocessing tokens to C tokens and
1601    discards any white space from the translation unit.  */
1602 
1603 static void
x_c_lex(xgettext_token_ty * tp)1604 x_c_lex (xgettext_token_ty *tp)
1605 {
1606   for (;;)
1607     {
1608       token_ty token;
1609       void *keyword_value;
1610 
1611       phase8_get (&token);
1612       switch (token.type)
1613 	{
1614 	case token_type_eof:
1615 	  tp->type = xgettext_token_type_eof;
1616 	  return;
1617 
1618 	case token_type_name:
1619 	  last_non_comment_line = newline_count;
1620 
1621 	  if (hash_find_entry (objc_extensions ? &objc_keywords : &c_keywords,
1622 			       token.string, strlen (token.string),
1623 			       &keyword_value)
1624 	      == 0)
1625 	    {
1626 	      tp->type = xgettext_token_type_keyword;
1627 	      tp->shapes = (const struct callshapes *) keyword_value;
1628 	      tp->pos.file_name = logical_file_name;
1629 	      tp->pos.line_number = token.line_number;
1630 	    }
1631 	  else
1632 	    tp->type = xgettext_token_type_symbol;
1633 	  tp->string = token.string;
1634 	  return;
1635 
1636 	case token_type_lparen:
1637 	  last_non_comment_line = newline_count;
1638 
1639 	  tp->type = xgettext_token_type_lparen;
1640 	  return;
1641 
1642 	case token_type_rparen:
1643 	  last_non_comment_line = newline_count;
1644 
1645 	  tp->type = xgettext_token_type_rparen;
1646 	  return;
1647 
1648 	case token_type_comma:
1649 	  last_non_comment_line = newline_count;
1650 
1651 	  tp->type = xgettext_token_type_comma;
1652 	  return;
1653 
1654 	case token_type_colon:
1655 	  last_non_comment_line = newline_count;
1656 
1657 	  tp->type = xgettext_token_type_colon;
1658 	  return;
1659 
1660 	case token_type_string_literal:
1661 	  last_non_comment_line = newline_count;
1662 
1663 	  tp->type = xgettext_token_type_string_literal;
1664 	  tp->string = token.string;
1665 	  tp->comment = token.comment;
1666 	  tp->pos.file_name = logical_file_name;
1667 	  tp->pos.line_number = token.line_number;
1668 	  return;
1669 
1670 	case token_type_objc_special:
1671 	  drop_reference (token.comment);
1672 	  /* FALLTHROUGH */
1673 
1674 	default:
1675 	  last_non_comment_line = newline_count;
1676 
1677 	  tp->type = xgettext_token_type_other;
1678 	  return;
1679 	}
1680     }
1681 }
1682 
1683 
1684 /* ========================= Extracting strings.  ========================== */
1685 
1686 
1687 /* Context lookup table.  */
1688 static flag_context_list_table_ty *flag_context_list_table;
1689 
1690 
1691 /* The file is broken into tokens.  Scan the token stream, looking for
1692    a keyword, followed by a left paren, followed by a string.  When we
1693    see this sequence, we have something to remember.  We assume we are
1694    looking at a valid C or C++ program, and leave the complaints about
1695    the grammar to the compiler.
1696 
1697      Normal handling: Look for
1698        keyword ( ... msgid ... )
1699      Plural handling: Look for
1700        keyword ( ... msgid ... msgid_plural ... )
1701 
1702    We use recursion because the arguments before msgid or between msgid
1703    and msgid_plural can contain subexpressions of the same form.  */
1704 
1705 
1706 /* Extract messages until the next balanced closing parenthesis.
1707    Extracted messages are added to MLP.
1708    Return true upon eof, false upon closing parenthesis.  */
1709 static bool
extract_parenthesized(message_list_ty * mlp,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1710 extract_parenthesized (message_list_ty *mlp,
1711 		       flag_context_ty outer_context,
1712 		       flag_context_list_iterator_ty context_iter,
1713 		       struct arglist_parser *argparser)
1714 {
1715   /* Current argument number.  */
1716   int arg = 1;
1717   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1718   int state;
1719   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1720   const struct callshapes *next_shapes = NULL;
1721   /* Context iterator that will be used if the next token is a '('.  */
1722   flag_context_list_iterator_ty next_context_iter =
1723     passthrough_context_list_iterator;
1724   /* Context iterator that will be used if the next token is a ':'.
1725      (Objective C selector syntax.)  */
1726   flag_context_list_iterator_ty selectorcall_context_iter =
1727     passthrough_context_list_iterator;
1728   /* Current context.  */
1729   flag_context_ty inner_context =
1730     inherited_context (outer_context,
1731 		       flag_context_list_iterator_advance (&context_iter));
1732 
1733   /* Start state is 0.  */
1734   state = 0;
1735 
1736   for (;;)
1737     {
1738       xgettext_token_ty token;
1739 
1740       x_c_lex (&token);
1741       switch (token.type)
1742 	{
1743 	case xgettext_token_type_keyword:
1744 	  next_shapes = token.shapes;
1745 	  state = 1;
1746 	  goto keyword_or_symbol;
1747 
1748 	case xgettext_token_type_symbol:
1749 	  state = 0;
1750 	keyword_or_symbol:
1751 	  next_context_iter =
1752 	    flag_context_list_iterator (
1753 	      flag_context_list_table_lookup (
1754 		flag_context_list_table,
1755 		token.string, strlen (token.string)));
1756 	  if (objc_extensions)
1757 	    {
1758 	      size_t token_string_len = strlen (token.string);
1759 	      token.string = xrealloc (token.string, token_string_len + 2);
1760 	      token.string[token_string_len] = ':';
1761 	      token.string[token_string_len + 1] = '\0';
1762 	      selectorcall_context_iter =
1763 		flag_context_list_iterator (
1764 		  flag_context_list_table_lookup (
1765 		    flag_context_list_table,
1766 		    token.string, token_string_len + 1));
1767 	    }
1768 	  free (token.string);
1769 	  continue;
1770 
1771 	case xgettext_token_type_lparen:
1772 	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
1773 				     arglist_parser_alloc (mlp,
1774 							   state ? next_shapes : NULL)))
1775 	    {
1776 	      arglist_parser_done (argparser, arg);
1777 	      return true;
1778 	    }
1779 	  next_context_iter = null_context_list_iterator;
1780 	  selectorcall_context_iter = null_context_list_iterator;
1781 	  state = 0;
1782 	  continue;
1783 
1784 	case xgettext_token_type_rparen:
1785 	  arglist_parser_done (argparser, arg);
1786 	  return false;
1787 
1788 	case xgettext_token_type_comma:
1789 	  arg++;
1790 	  inner_context =
1791 	    inherited_context (outer_context,
1792 			       flag_context_list_iterator_advance (
1793 				 &context_iter));
1794 	  next_context_iter = passthrough_context_list_iterator;
1795 	  selectorcall_context_iter = passthrough_context_list_iterator;
1796 	  state = 0;
1797 	  continue;
1798 
1799 	case xgettext_token_type_colon:
1800 	  if (objc_extensions)
1801 	    {
1802 	      context_iter = selectorcall_context_iter;
1803 	      inner_context =
1804 		inherited_context (inner_context,
1805 				   flag_context_list_iterator_advance (
1806 				     &context_iter));
1807 	      next_context_iter = passthrough_context_list_iterator;
1808 	      selectorcall_context_iter = passthrough_context_list_iterator;
1809 	    }
1810 	  else
1811 	    {
1812 	      next_context_iter = null_context_list_iterator;
1813 	      selectorcall_context_iter = null_context_list_iterator;
1814 	    }
1815 	  state = 0;
1816 	  continue;
1817 
1818 	case xgettext_token_type_string_literal:
1819 	  if (extract_all)
1820 	    remember_a_message (mlp, NULL, token.string, inner_context,
1821 				&token.pos, token.comment);
1822 	  else
1823 	    arglist_parser_remember (argparser, arg, token.string,
1824 				     inner_context,
1825 				     token.pos.file_name, token.pos.line_number,
1826 				     token.comment);
1827 	  drop_reference (token.comment);
1828 	  next_context_iter = null_context_list_iterator;
1829 	  selectorcall_context_iter = null_context_list_iterator;
1830 	  state = 0;
1831 	  continue;
1832 
1833 	case xgettext_token_type_other:
1834 	  next_context_iter = null_context_list_iterator;
1835 	  selectorcall_context_iter = null_context_list_iterator;
1836 	  state = 0;
1837 	  continue;
1838 
1839 	case xgettext_token_type_eof:
1840 	  arglist_parser_done (argparser, arg);
1841 	  return true;
1842 
1843 	default:
1844 	  abort ();
1845 	}
1846     }
1847 }
1848 
1849 
1850 static void
extract_whole_file(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1851 extract_whole_file (FILE *f,
1852 		    const char *real_filename, const char *logical_filename,
1853 		    flag_context_list_table_ty *flag_table,
1854 		    msgdomain_list_ty *mdlp)
1855 {
1856   message_list_ty *mlp = mdlp->item[0]->messages;
1857 
1858   fp = f;
1859   real_file_name = real_filename;
1860   logical_file_name = xstrdup (logical_filename);
1861   line_number = 1;
1862 
1863   newline_count = 0;
1864   last_comment_line = -1;
1865   last_non_comment_line = -1;
1866 
1867   flag_context_list_table = flag_table;
1868 
1869   init_keywords ();
1870 
1871   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1872      due to an unbalanced closing parenthesis, just restart it.  */
1873   while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1874 				 arglist_parser_alloc (mlp, NULL)))
1875     ;
1876 
1877   /* Close scanner.  */
1878   fp = NULL;
1879   real_file_name = NULL;
1880   logical_file_name = NULL;
1881   line_number = 0;
1882 }
1883 
1884 
1885 void
extract_c(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1886 extract_c (FILE *f,
1887 	   const char *real_filename, const char *logical_filename,
1888 	   flag_context_list_table_ty *flag_table,
1889 	   msgdomain_list_ty *mdlp)
1890 {
1891   objc_extensions = false;
1892   extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
1893 }
1894 
1895 void
extract_objc(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1896 extract_objc (FILE *f,
1897 	      const char *real_filename, const char *logical_filename,
1898 	      flag_context_list_table_ty *flag_table,
1899 	      msgdomain_list_ty *mdlp)
1900 {
1901   objc_extensions = true;
1902   extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
1903 }
1904