1 /* xgettext Java backend.
2 Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 #include <errno.h>
24 #include <stdbool.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28
29 #include "message.h"
30 #include "xgettext.h"
31 #include "x-java.h"
32 #include "error.h"
33 #include "xalloc.h"
34 #include "exit.h"
35 #include "hash.h"
36 #include "po-charset.h"
37 #include "utf16-ucs4.h"
38 #include "ucs4-utf8.h"
39 #include "gettext.h"
40
41 #define _(s) gettext(s)
42
43 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
44
45
46 /* The Java syntax is defined in the
47 Java Language Specification, Second Edition,
48 (available from http://java.sun.com/),
49 chapter 3 "Lexical Structure". */
50
51
52 /* ====================== Keyword set customization. ====================== */
53
54 /* If true extract all strings. */
55 static bool extract_all = false;
56
57 static hash_table keywords;
58 static bool default_keywords = true;
59
60
61 void
x_java_extract_all()62 x_java_extract_all ()
63 {
64 extract_all = true;
65 }
66
67
68 void
x_java_keyword(const char * name)69 x_java_keyword (const char *name)
70 {
71 if (name == NULL)
72 default_keywords = false;
73 else
74 {
75 const char *end;
76 struct callshape shape;
77 const char *colon;
78
79 if (keywords.table == NULL)
80 hash_init (&keywords, 100);
81
82 split_keywordspec (name, &end, &shape);
83
84 /* The characters between name and end should form a valid Java
85 identifier sequence with dots.
86 A colon means an invalid parse in split_keywordspec(). */
87 colon = strchr (name, ':');
88 if (colon == NULL || colon >= end)
89 insert_keyword_callshape (&keywords, name, end - name, &shape);
90 }
91 }
92
93 /* Finish initializing the keywords hash table.
94 Called after argument processing, before each file is processed. */
95 static void
init_keywords()96 init_keywords ()
97 {
98 if (default_keywords)
99 {
100 /* When adding new keywords here, also update the documentation in
101 xgettext.texi! */
102 x_java_keyword ("GettextResource.gettext:2"); /* static method */
103 x_java_keyword ("GettextResource.ngettext:2,3"); /* static method */
104 x_java_keyword ("gettext");
105 x_java_keyword ("ngettext:1,2");
106 x_java_keyword ("getString"); /* ResourceBundle.getString */
107 default_keywords = false;
108 }
109 }
110
/* Record the built-in format-string flags for the Java backend by
   passing each specification to xgettext_record_flag (), in the order
   listed below.  */
void
init_flag_table_java ()
{
  static const char *const flag_specs[] =
    {
      "GettextResource.gettext:2:pass-java-format",
      "GettextResource.ngettext:2:pass-java-format",
      "GettextResource.ngettext:3:pass-java-format",
      "gettext:1:pass-java-format",
      "ngettext:1:pass-java-format",
      "ngettext:2:pass-java-format",
      "getString:1:pass-java-format",
      "MessageFormat:1:java-format",
      "MessageFormat.format:1:java-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
124
125
126 /* ======================== Reading of characters. ======================== */
127
128 /* Real filename, used in error messages about the input file. */
129 static const char *real_file_name;
130
131 /* Logical filename and line number, used to label the extracted messages. */
132 static char *logical_file_name;
133 static int line_number;
134
135 /* The input file stream. */
136 static FILE *fp;
137
138
139 /* Fetch the next single-byte character from the input file.
140 Pushback can consist of an unlimited number of 'u' followed by up to 4
141 other characters. */
142
143 /* Special coding of multiple 'u's in the pushback buffer. */
144 #define MULTIPLE_U(count) (0x1000 + (count))
145
146 static int phase1_pushback[5];
147 static unsigned int phase1_pushback_length;
148
149 static int
phase1_getc()150 phase1_getc ()
151 {
152 int c;
153
154 if (phase1_pushback_length)
155 {
156 c = phase1_pushback[--phase1_pushback_length];
157 if (c >= MULTIPLE_U (0))
158 {
159 if (c > MULTIPLE_U (1))
160 phase1_pushback[phase1_pushback_length++] = c - 1;
161 return 'u';
162 }
163 else
164 return c;
165 }
166
167 c = getc (fp);
168
169 if (c == EOF)
170 {
171 if (ferror (fp))
172 error (EXIT_FAILURE, errno, _("\
173 error while reading \"%s\""), real_file_name);
174 }
175
176 return c;
177 }
178
179 /* Supports any number of 'u' and up to 4 arbitrary characters of pushback. */
180 static void
phase1_ungetc(int c)181 phase1_ungetc (int c)
182 {
183 if (c != EOF)
184 {
185 if (c == 'u')
186 {
187 if (phase1_pushback_length > 0
188 && phase1_pushback[phase1_pushback_length - 1] >= MULTIPLE_U (0))
189 phase1_pushback[phase1_pushback_length - 1]++;
190 else
191 {
192 if (phase1_pushback_length == SIZEOF (phase1_pushback))
193 abort ();
194 phase1_pushback[phase1_pushback_length++] = MULTIPLE_U (1);
195 }
196 }
197 else
198 {
199 if (phase1_pushback_length == SIZEOF (phase1_pushback))
200 abort ();
201 phase1_pushback[phase1_pushback_length++] = c;
202 }
203 }
204 }
205
206
207 /* Fetch the next single-byte character or Unicode character from the file.
208 (Here, as in the Java Language Specification, when we say "Unicode
209 character", we actually mean "UTF-16 encoding unit".) */
210
211 /* Return value of phase 2, 3, 4 when EOF is reached. */
212 #define P2_EOF 0xffff
213
214 /* Convert an UTF-16 code point to a return value that can be distinguished
215 from a single-byte return value. */
216 #define UNICODE(code) (0x10000 + (code))
217
218 /* Test a return value of phase 2, 3, 4 whether it designates an UTF-16 code
219 point. */
220 #define IS_UNICODE(p2_result) ((p2_result) >= 0x10000)
221
222 /* Extract the UTF-16 code of a return value that satisfies IS_UNICODE. */
223 #define UTF16_VALUE(p2_result) ((p2_result) - 0x10000)
224
225 /* Reduces a return value of phase 2, 3, 4 by unmasking the UNICODE bit,
226 so that it can be more easily compared against an ASCII character.
227 (RED (c) == 'x') is equivalent to (c == 'x' || c == UNICODE ('x')). */
228 #define RED(p2_result) ((p2_result) & 0xffff)
229
230 static int phase2_pushback[1];
231 static int phase2_pushback_length;
232
233 static int
phase2_getc()234 phase2_getc ()
235 {
236 int c;
237
238 if (phase2_pushback_length)
239 return phase2_pushback[--phase2_pushback_length];
240
241 c = phase1_getc ();
242 if (c == EOF)
243 return P2_EOF;
244 if (c == '\\')
245 {
246 c = phase1_getc ();
247 if (c == 'u')
248 {
249 unsigned int u_count = 1;
250 unsigned char buf[4];
251 unsigned int n;
252 int i;
253
254 for (;;)
255 {
256 c = phase1_getc ();
257 if (c != 'u')
258 break;
259 u_count++;
260 }
261 phase1_ungetc (c);
262
263 n = 0;
264 for (i = 0; i < 4; i++)
265 {
266 c = phase1_getc ();
267
268 if (c >= '0' && c <= '9')
269 n = (n << 4) + (c - '0');
270 else if (c >= 'A' && c <= 'F')
271 n = (n << 4) + (c - 'A' + 10);
272 else if (c >= 'a' && c <= 'f')
273 n = (n << 4) + (c - 'a' + 10);
274 else
275 {
276 phase1_ungetc (c);
277 while (--i >= 0)
278 phase1_ungetc (buf[i]);
279 for (; u_count > 0; u_count--)
280 phase1_ungetc ('u');
281 return '\\';
282 }
283
284 buf[i] = c;
285 }
286 return UNICODE (n);
287 }
288 phase1_ungetc (c);
289 return '\\';
290 }
291 return c;
292 }
293
294 /* Supports only one pushback character. */
295 static void
phase2_ungetc(int c)296 phase2_ungetc (int c)
297 {
298 if (c != P2_EOF)
299 {
300 if (phase2_pushback_length == SIZEOF (phase2_pushback))
301 abort ();
302 phase2_pushback[phase2_pushback_length++] = c;
303 }
304 }
305
306
307 /* Fetch the next single-byte character or Unicode character from the file.
308 With line number handling.
309 Convert line terminators to '\n' or UNICODE ('\n'). */
310
311 static int phase3_pushback[2];
312 static int phase3_pushback_length;
313
314 static int
phase3_getc()315 phase3_getc ()
316 {
317 int c;
318
319 if (phase3_pushback_length)
320 {
321 c = phase3_pushback[--phase3_pushback_length];
322 if (c == '\n')
323 ++line_number;
324 return c;
325 }
326
327 c = phase2_getc ();
328
329 /* Handle line terminators. */
330 if (RED (c) == '\r')
331 {
332 int c1 = phase2_getc ();
333
334 if (RED (c1) != '\n')
335 phase2_ungetc (c1);
336
337 /* Seen line terminator CR or CR/LF. */
338 if (c == '\r' || c1 == '\n')
339 {
340 ++line_number;
341 return '\n';
342 }
343 else
344 return UNICODE ('\n');
345 }
346 else if (RED (c) == '\n')
347 {
348 /* Seen line terminator LF. */
349 if (c == '\n')
350 {
351 ++line_number;
352 return '\n';
353 }
354 else
355 return UNICODE ('\n');
356 }
357
358 return c;
359 }
360
361 /* Supports 2 characters of pushback. */
362 static void
phase3_ungetc(int c)363 phase3_ungetc (int c)
364 {
365 if (c != P2_EOF)
366 {
367 if (c == '\n')
368 --line_number;
369 if (phase3_pushback_length == SIZEOF (phase3_pushback))
370 abort ();
371 phase3_pushback[phase3_pushback_length++] = c;
372 }
373 }
374
375
376 /* ========================= Accumulating strings. ======================== */
377
/* An accumulator for text that arrives as a mixture of bytes in the
   xgettext_current_source_encoding and Unicode (UTF-16) code units.
   The accumulated string is delivered as a whole in UTF-8.  */

struct string_buffer
{
  /* Already converted part of the string, in UTF-8:
     the storage, the number of bytes in use, and the allocated size.  */
  char *utf8_buffer;
  size_t utf8_buflen;
  size_t utf8_allocated;

  /* A pending first half of an UTF-16 surrogate pair, or 0 if none.  */
  unsigned short utf16_surr;

  /* Not yet converted part of the string, still in the source encoding:
     the storage, the number of bytes in use, and the allocated size.  */
  char *curr_buffer;
  size_t curr_buflen;
  size_t curr_allocated;
};
395
396 /* Initialize a 'struct string_buffer' to empty. */
397 static inline void
init_string_buffer(struct string_buffer * bp)398 init_string_buffer (struct string_buffer *bp)
399 {
400 bp->utf8_buffer = NULL;
401 bp->utf8_buflen = 0;
402 bp->utf8_allocated = 0;
403 bp->utf16_surr = 0;
404 bp->curr_buffer = NULL;
405 bp->curr_buflen = 0;
406 bp->curr_allocated = 0;
407 }
408
409 /* Auxiliary function: Append a byte to bp->curr. */
410 static inline void
string_buffer_append_byte(struct string_buffer * bp,unsigned char c)411 string_buffer_append_byte (struct string_buffer *bp, unsigned char c)
412 {
413 if (bp->curr_buflen == bp->curr_allocated)
414 {
415 bp->curr_allocated = 2 * bp->curr_allocated + 10;
416 bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
417 }
418 bp->curr_buffer[bp->curr_buflen++] = c;
419 }
420
421 /* Auxiliary function: Ensure count more bytes are available in bp->utf8. */
422 static inline void
string_buffer_append_unicode_grow(struct string_buffer * bp,size_t count)423 string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count)
424 {
425 if (bp->utf8_buflen + count > bp->utf8_allocated)
426 {
427 size_t new_allocated = 2 * bp->utf8_allocated + 10;
428 if (new_allocated < bp->utf8_buflen + count)
429 new_allocated = bp->utf8_buflen + count;
430 bp->utf8_allocated = new_allocated;
431 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
432 }
433 }
434
435 /* Auxiliary function: Append a Unicode character to bp->utf8.
436 uc must be < 0x110000. */
437 static inline void
string_buffer_append_unicode(struct string_buffer * bp,unsigned int uc)438 string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc)
439 {
440 unsigned char utf8buf[6];
441 int count = u8_uctomb (utf8buf, uc, 6);
442
443 if (count < 0)
444 /* The caller should have ensured that uc is not out-of-range. */
445 abort ();
446
447 string_buffer_append_unicode_grow (bp, count);
448 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
449 bp->utf8_buflen += count;
450 }
451
452 /* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */
453 static inline void
string_buffer_flush_utf16_surr(struct string_buffer * bp)454 string_buffer_flush_utf16_surr (struct string_buffer *bp)
455 {
456 if (bp->utf16_surr != 0)
457 {
458 /* A half surrogate is invalid, therefore use U+FFFD instead. */
459 string_buffer_append_unicode (bp, 0xfffd);
460 bp->utf16_surr = 0;
461 }
462 }
463
464 /* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */
465 static inline void
string_buffer_flush_curr_buffer(struct string_buffer * bp,int lineno)466 string_buffer_flush_curr_buffer (struct string_buffer *bp, int lineno)
467 {
468 if (bp->curr_buflen > 0)
469 {
470 char *curr;
471 size_t count;
472
473 string_buffer_append_byte (bp, '\0');
474
475 /* Convert from the source encoding to UTF-8. */
476 curr = from_current_source_encoding (bp->curr_buffer,
477 logical_file_name, lineno);
478
479 /* Append it to bp->utf8_buffer. */
480 count = strlen (curr);
481 string_buffer_append_unicode_grow (bp, count);
482 memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
483 bp->utf8_buflen += count;
484
485 if (curr != bp->curr_buffer)
486 free (curr);
487 bp->curr_buflen = 0;
488 }
489 }
490
491 /* Append a character or Unicode character to a 'struct string_buffer'. */
492 static void
string_buffer_append(struct string_buffer * bp,int c)493 string_buffer_append (struct string_buffer *bp, int c)
494 {
495 if (IS_UNICODE (c))
496 {
497 /* Append a Unicode character. */
498
499 /* Switch from multibyte character mode to Unicode character mode. */
500 string_buffer_flush_curr_buffer (bp, line_number);
501
502 /* Test whether this character and the previous one form a Unicode
503 surrogate character pair. */
504 if (bp->utf16_surr != 0
505 && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
506 {
507 unsigned short utf16buf[2];
508 unsigned int uc;
509
510 utf16buf[0] = bp->utf16_surr;
511 utf16buf[1] = UTF16_VALUE (c);
512 if (u16_mbtouc_aux (&uc, utf16buf, 2) != 2)
513 abort ();
514
515 string_buffer_append_unicode (bp, uc);
516 bp->utf16_surr = 0;
517 }
518 else
519 {
520 string_buffer_flush_utf16_surr (bp);
521
522 if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
523 bp->utf16_surr = UTF16_VALUE (c);
524 else
525 string_buffer_append_unicode (bp, UTF16_VALUE (c));
526 }
527 }
528 else
529 {
530 /* Append a single byte. */
531
532 /* Switch from Unicode character mode to multibyte character mode. */
533 string_buffer_flush_utf16_surr (bp);
534
535 /* When a newline is seen, convert the accumulated multibyte sequence.
536 This ensures a correct line number in the error message in case of
537 a conversion error. The "- 1" is to account for the newline. */
538 if (c == '\n')
539 string_buffer_flush_curr_buffer (bp, line_number - 1);
540
541 string_buffer_append_byte (bp, (unsigned char) c);
542 }
543 }
544
545 /* Return the string buffer's contents. */
546 static char *
string_buffer_result(struct string_buffer * bp)547 string_buffer_result (struct string_buffer *bp)
548 {
549 /* Flush all into bp->utf8_buffer. */
550 string_buffer_flush_utf16_surr (bp);
551 string_buffer_flush_curr_buffer (bp, line_number);
552 /* NUL-terminate it. */
553 string_buffer_append_unicode_grow (bp, 1);
554 bp->utf8_buffer[bp->utf8_buflen] = '\0';
555 /* Return it. */
556 return bp->utf8_buffer;
557 }
558
559 /* Free the memory pointed to by a 'struct string_buffer'. */
560 static inline void
free_string_buffer(struct string_buffer * bp)561 free_string_buffer (struct string_buffer *bp)
562 {
563 free (bp->utf8_buffer);
564 free (bp->curr_buffer);
565 }
566
567
568 /* ======================== Accumulating comments. ======================== */
569
570
571 /* Accumulating a single comment line. */
572
573 static struct string_buffer comment_buffer;
574
575 static inline void
comment_start()576 comment_start ()
577 {
578 comment_buffer.utf8_buflen = 0;
579 comment_buffer.utf16_surr = 0;
580 comment_buffer.curr_buflen = 0;
581 }
582
583 static inline bool
comment_at_start()584 comment_at_start ()
585 {
586 return (comment_buffer.utf8_buflen == 0 && comment_buffer.utf16_surr == 0
587 && comment_buffer.curr_buflen == 0);
588 }
589
590 static inline void
comment_add(int c)591 comment_add (int c)
592 {
593 string_buffer_append (&comment_buffer, c);
594 }
595
596 static inline void
comment_line_end(size_t chars_to_remove)597 comment_line_end (size_t chars_to_remove)
598 {
599 char *buffer = string_buffer_result (&comment_buffer);
600 size_t buflen = strlen (buffer);
601
602 buflen -= chars_to_remove;
603 while (buflen >= 1
604 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
605 --buflen;
606 buffer[buflen] = '\0';
607 savable_comment_add (buffer);
608 }
609
610
611 /* These are for tracking whether comments count as immediately before
612 keyword. */
613 static int last_comment_line;
614 static int last_non_comment_line;
615
616
617 /* Replace each comment that is not inside a character constant or string
618 literal with a space or newline character. */
619
620 static int
phase4_getc()621 phase4_getc ()
622 {
623 int c0;
624 int c;
625 bool last_was_star;
626
627 c0 = phase3_getc ();
628 if (RED (c0) != '/')
629 return c0;
630 c = phase3_getc ();
631 switch (RED (c))
632 {
633 default:
634 phase3_ungetc (c);
635 return c0;
636
637 case '*':
638 /* C style comment. */
639 comment_start ();
640 last_was_star = false;
641 for (;;)
642 {
643 c = phase3_getc ();
644 if (c == P2_EOF)
645 break;
646 /* We skip all leading white space, but not EOLs. */
647 if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
648 comment_add (c);
649 switch (RED (c))
650 {
651 case '\n':
652 comment_line_end (1);
653 comment_start ();
654 last_was_star = false;
655 continue;
656
657 case '*':
658 last_was_star = true;
659 continue;
660
661 case '/':
662 if (last_was_star)
663 {
664 comment_line_end (2);
665 break;
666 }
667 /* FALLTHROUGH */
668
669 default:
670 last_was_star = false;
671 continue;
672 }
673 break;
674 }
675 last_comment_line = line_number;
676 return ' ';
677
678 case '/':
679 /* C++ style comment. */
680 last_comment_line = line_number;
681 comment_start ();
682 for (;;)
683 {
684 c = phase3_getc ();
685 if (RED (c) == '\n' || c == P2_EOF)
686 break;
687 /* We skip all leading white space, but not EOLs. */
688 if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
689 comment_add (c);
690 }
691 phase3_ungetc (c); /* push back the newline, to decrement line_number */
692 comment_line_end (0);
693 phase3_getc (); /* read the newline again */
694 return '\n';
695 }
696 }
697
/* Push C back onto the phase 4 input, by way of the phase 3 pushback
   buffer.  Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
704
705
706 /* ========================== Reading of tokens. ========================== */
707
/* The kinds of tokens that the tokenizer distinguishes.  */
typedef enum token_type_ty
{
  token_type_eof,
  token_type_lparen,		/* ( */
  token_type_rparen,		/* ) */
  token_type_lbrace,		/* { */
  token_type_rbrace,		/* } */
  token_type_comma,		/* , */
  token_type_dot,		/* . */
  token_type_string_literal,	/* "abc" */
  token_type_number,		/* 1.23 */
  token_type_symbol,		/* identifier, keyword, null */
  token_type_plus,		/* + */
  token_type_other		/* character literal, misc. operator */
} token_type_ty;
724
725 typedef struct token_ty token_ty;
726 struct token_ty
727 {
728 token_type_ty type;
729 char *string; /* for token_type_string_literal, token_type_symbol */
730 refcounted_string_list_ty *comment; /* for token_type_string_literal */
731 int line_number;
732 };
733
734
735 /* Free the memory pointed to by a 'struct token_ty'. */
736 static inline void
free_token(token_ty * tp)737 free_token (token_ty *tp)
738 {
739 if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
740 free (tp->string);
741 if (tp->type == token_type_string_literal)
742 drop_reference (tp->comment);
743 }
744
745
746 /* Read an escape sequence inside a string literal or character literal. */
747 static inline int
do_getc_escaped()748 do_getc_escaped ()
749 {
750 int c;
751
752 /* Use phase 3, because phase 4 elides comments. */
753 c = phase3_getc ();
754 if (c == P2_EOF)
755 return UNICODE ('\\');
756 switch (RED (c))
757 {
758 case 'b':
759 return UNICODE (0x08);
760 case 't':
761 return UNICODE (0x09);
762 case 'n':
763 return UNICODE (0x0a);
764 case 'f':
765 return UNICODE (0x0c);
766 case 'r':
767 return UNICODE (0x0d);
768 case '"':
769 return UNICODE ('"');
770 case '\'':
771 return UNICODE ('\'');
772 case '\\':
773 return UNICODE ('\\');
774 case '0': case '1': case '2': case '3':
775 case '4': case '5': case '6': case '7':
776 {
777 int n = RED (c) - '0';
778 bool maybe3digits = (n < 4);
779
780 c = phase3_getc ();
781 if (RED (c) >= '0' && RED (c) <= '7')
782 {
783 n = (n << 3) + (RED (c) - '0');
784 if (maybe3digits)
785 {
786 c = phase3_getc ();
787 if (RED (c) >= '0' && RED (c) <= '7')
788 n = (n << 3) + (RED (c) - '0');
789 else
790 phase3_ungetc (c);
791 }
792 }
793 else
794 phase3_ungetc (c);
795
796 return UNICODE (n);
797 }
798 default:
799 /* Invalid escape sequence. */
800 phase3_ungetc (c);
801 return UNICODE ('\\');
802 }
803 }
804
805 /* Read a string literal or character literal. */
806 static void
accumulate_escaped(struct string_buffer * literal,int delimiter)807 accumulate_escaped (struct string_buffer *literal, int delimiter)
808 {
809 int c;
810
811 for (;;)
812 {
813 /* Use phase 3, because phase 4 elides comments. */
814 c = phase3_getc ();
815 if (c == P2_EOF || RED (c) == delimiter)
816 break;
817 if (RED (c) == '\n')
818 {
819 phase3_ungetc (c);
820 error_with_progname = false;
821 if (delimiter == '\'')
822 error (0, 0, _("%s:%d: warning: unterminated character constant"),
823 logical_file_name, line_number);
824 else
825 error (0, 0, _("%s:%d: warning: unterminated string constant"),
826 logical_file_name, line_number);
827 error_with_progname = true;
828 break;
829 }
830 if (RED (c) == '\\')
831 c = do_getc_escaped ();
832 string_buffer_append (literal, c);
833 }
834 }
835
836
837 /* Combine characters into tokens. Discard whitespace. */
838
839 static token_ty phase5_pushback[3];
840 static int phase5_pushback_length;
841
842 static void
phase5_get(token_ty * tp)843 phase5_get (token_ty *tp)
844 {
845 int c;
846
847 if (phase5_pushback_length)
848 {
849 *tp = phase5_pushback[--phase5_pushback_length];
850 return;
851 }
852 tp->string = NULL;
853
854 for (;;)
855 {
856 tp->line_number = line_number;
857 c = phase4_getc ();
858
859 if (c == P2_EOF)
860 {
861 tp->type = token_type_eof;
862 return;
863 }
864
865 switch (RED (c))
866 {
867 case '\n':
868 if (last_non_comment_line > last_comment_line)
869 savable_comment_reset ();
870 /* FALLTHROUGH */
871 case ' ':
872 case '\t':
873 case '\f':
874 /* Ignore whitespace and comments. */
875 continue;
876 }
877
878 last_non_comment_line = tp->line_number;
879
880 switch (RED (c))
881 {
882 case '(':
883 tp->type = token_type_lparen;
884 return;
885
886 case ')':
887 tp->type = token_type_rparen;
888 return;
889
890 case '{':
891 tp->type = token_type_lbrace;
892 return;
893
894 case '}':
895 tp->type = token_type_rbrace;
896 return;
897
898 case ',':
899 tp->type = token_type_comma;
900 return;
901
902 case '.':
903 c = phase4_getc ();
904 if (!(RED (c) >= '0' && RED (c) <= '9'))
905 {
906 phase4_ungetc (c);
907 tp->type = token_type_dot;
908 return;
909 }
910 /* FALLTHROUGH */
911
912 case '0': case '1': case '2': case '3': case '4':
913 case '5': case '6': case '7': case '8': case '9':
914 {
915 /* Don't need to verify the complicated syntax of integers and
916 floating-point numbers. We assume a valid Java input.
917 The simplified syntax that we recognize as number is: any
918 sequence of alphanumeric characters, additionally '+' and '-'
919 immediately after 'e' or 'E' except in hexadecimal numbers. */
920 bool hexadecimal = false;
921
922 for (;;)
923 {
924 c = phase4_getc ();
925 if (RED (c) >= '0' && RED (c) <= '9')
926 continue;
927 if ((RED (c) >= 'A' && RED (c) <= 'Z')
928 || (RED (c) >= 'a' && RED (c) <= 'z'))
929 {
930 if (RED (c) == 'X' || RED (c) == 'x')
931 hexadecimal = true;
932 if ((RED (c) == 'E' || RED (c) == 'e') && !hexadecimal)
933 {
934 c = phase4_getc ();
935 if (!(RED (c) == '+' || RED (c) == '-'))
936 phase4_ungetc (c);
937 }
938 continue;
939 }
940 if (RED (c) == '.')
941 continue;
942 break;
943 }
944 phase4_ungetc (c);
945 tp->type = token_type_number;
946 return;
947 }
948
949 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
950 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
951 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
952 case 'V': case 'W': case 'X': case 'Y': case 'Z':
953 case '_':
954 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
955 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
956 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
957 case 'v': case 'w': case 'x': case 'y': case 'z':
958 /* Although Java allows identifiers containing many Unicode
959 characters, we recognize only identifiers consisting of ASCII
960 characters. This avoids conversion hassles w.r.t. the --keyword
961 arguments, and shouldn't be a big problem in practice. */
962 {
963 static char *buffer;
964 static int bufmax;
965 int bufpos = 0;
966 for (;;)
967 {
968 if (bufpos >= bufmax)
969 {
970 bufmax = 2 * bufmax + 10;
971 buffer = xrealloc (buffer, bufmax);
972 }
973 buffer[bufpos++] = RED (c);
974 c = phase4_getc ();
975 if (!((RED (c) >= 'A' && RED (c) <= 'Z')
976 || (RED (c) >= 'a' && RED (c) <= 'z')
977 || (RED (c) >= '0' && RED (c) <= '9')
978 || RED (c) == '_'))
979 break;
980 }
981 phase4_ungetc (c);
982 if (bufpos >= bufmax)
983 {
984 bufmax = 2 * bufmax + 10;
985 buffer = xrealloc (buffer, bufmax);
986 }
987 buffer[bufpos] = '\0';
988 tp->string = xstrdup (buffer);
989 tp->type = token_type_symbol;
990 return;
991 }
992
993 case '"':
994 /* String literal. */
995 {
996 struct string_buffer literal;
997
998 init_string_buffer (&literal);
999 accumulate_escaped (&literal, '"');
1000 tp->string = xstrdup (string_buffer_result (&literal));
1001 free_string_buffer (&literal);
1002 tp->comment = add_reference (savable_comment);
1003 tp->type = token_type_string_literal;
1004 return;
1005 }
1006
1007 case '\'':
1008 /* Character literal. */
1009 {
1010 struct string_buffer literal;
1011
1012 init_string_buffer (&literal);
1013 accumulate_escaped (&literal, '\'');
1014 free_string_buffer (&literal);
1015 tp->type = token_type_other;
1016 return;
1017 }
1018
1019 case '+':
1020 c = phase4_getc ();
1021 if (RED (c) == '+')
1022 /* Operator ++ */
1023 tp->type = token_type_other;
1024 else if (RED (c) == '=')
1025 /* Operator += */
1026 tp->type = token_type_other;
1027 else
1028 {
1029 /* Operator + */
1030 phase4_ungetc (c);
1031 tp->type = token_type_plus;
1032 }
1033 return;
1034
1035 default:
1036 /* Misc. operator. */
1037 tp->type = token_type_other;
1038 return;
1039 }
1040 }
1041 }
1042
1043 /* Supports 3 tokens of pushback. */
1044 static void
phase5_unget(token_ty * tp)1045 phase5_unget (token_ty *tp)
1046 {
1047 if (tp->type != token_type_eof)
1048 {
1049 if (phase5_pushback_length == SIZEOF (phase5_pushback))
1050 abort ();
1051 phase5_pushback[phase5_pushback_length++] = *tp;
1052 }
1053 }
1054
1055
1056 /* Compile-time optimization of string literal concatenation.
1057 Combine "string1" + ... + "stringN" to the concatenated string if
1058 - the token before this expression is not ')' (because then the first
1059 string could be part of a cast expression),
1060 - the token after this expression is not '.' (because then the last
1061 string could be part of a method call expression). */
1062
1063 static token_ty phase6_pushback[2];
1064 static int phase6_pushback_length;
1065
1066 static token_type_ty phase6_last;
1067
1068 static void
phase6_get(token_ty * tp)1069 phase6_get (token_ty *tp)
1070 {
1071 if (phase6_pushback_length)
1072 {
1073 *tp = phase6_pushback[--phase6_pushback_length];
1074 return;
1075 }
1076
1077 phase5_get (tp);
1078 if (tp->type == token_type_string_literal && phase6_last != token_type_rparen)
1079 {
1080 char *sum = tp->string;
1081 size_t sum_len = strlen (sum);
1082
1083 for (;;)
1084 {
1085 token_ty token2;
1086
1087 phase5_get (&token2);
1088 if (token2.type == token_type_plus)
1089 {
1090 token_ty token3;
1091
1092 phase5_get (&token3);
1093 if (token3.type == token_type_string_literal)
1094 {
1095 token_ty token_after;
1096
1097 phase5_get (&token_after);
1098 if (token_after.type != token_type_dot)
1099 {
1100 char *addend = token3.string;
1101 size_t addend_len = strlen (addend);
1102
1103 sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1104 memcpy (sum + sum_len, addend, addend_len + 1);
1105 sum_len += addend_len;
1106
1107 phase5_unget (&token_after);
1108 free_token (&token3);
1109 free_token (&token2);
1110 continue;
1111 }
1112 phase5_unget (&token_after);
1113 }
1114 phase5_unget (&token3);
1115 }
1116 phase5_unget (&token2);
1117 break;
1118 }
1119 tp->string = sum;
1120 }
1121 phase6_last = tp->type;
1122 }
1123
1124 /* Supports 2 tokens of pushback. */
1125 static void
phase6_unget(token_ty * tp)1126 phase6_unget (token_ty *tp)
1127 {
1128 if (tp->type != token_type_eof)
1129 {
1130 if (phase6_pushback_length == SIZEOF (phase6_pushback))
1131 abort ();
1132 phase6_pushback[phase6_pushback_length++] = *tp;
1133 }
1134 }
1135
1136
1137 static void
x_java_lex(token_ty * tp)1138 x_java_lex (token_ty *tp)
1139 {
1140 phase6_get (tp);
1141 }
1142
1143 /* Supports 2 tokens of pushback. */
1144 static void
x_java_unlex(token_ty * tp)1145 x_java_unlex (token_ty *tp)
1146 {
1147 phase6_unget (tp);
1148 }
1149
1150
1151 /* ========================= Extracting strings. ========================== */
1152
1153
1154 /* Context lookup table. */
1155 static flag_context_list_table_ty *flag_context_list_table;
1156
1157
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Java program, and leave the complaints about
   the grammar to the compiler.
1163
1164 Normal handling: Look for
1165 keyword ( ... msgid ... )
1166 Plural handling: Look for
1167 keyword ( ... msgid ... msgid_plural ... )
1168
1169 We use recursion because the arguments before msgid or between msgid
1170 and msgid_plural can contain subexpressions of the same form. */
1171
1172
1173 /* Extract messages until the next balanced closing parenthesis or brace,
1174 depending on TERMINATOR.
1175 Extracted messages are added to MLP.
1176 Return true upon eof, false upon closing parenthesis or brace. */
1177 static bool
extract_parenthesized(message_list_ty * mlp,token_type_ty terminator,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1178 extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1179 flag_context_ty outer_context,
1180 flag_context_list_iterator_ty context_iter,
1181 struct arglist_parser *argparser)
1182 {
1183 /* Current argument number. */
1184 int arg = 1;
1185 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1186 int state;
1187 /* Parameters of the keyword just seen. Defined only in state 1. */
1188 const struct callshapes *next_shapes = NULL;
1189 /* Context iterator that will be used if the next token is a '('. */
1190 flag_context_list_iterator_ty next_context_iter =
1191 passthrough_context_list_iterator;
1192 /* Current context. */
1193 flag_context_ty inner_context =
1194 inherited_context (outer_context,
1195 flag_context_list_iterator_advance (&context_iter));
1196
1197 /* Start state is 0. */
1198 state = 0;
1199
1200 for (;;)
1201 {
1202 token_ty token;
1203
1204 x_java_lex (&token);
1205 switch (token.type)
1206 {
1207 case token_type_symbol:
1208 {
1209 /* Combine symbol1 . ... . symbolN to a single strings, so that
1210 we can recognize static function calls like
1211 GettextResource.gettext. The information present for
1212 symbolI.....symbolN has precedence over the information for
1213 symbolJ.....symbolN with J > I. */
1214 char *sum = token.string;
1215 size_t sum_len = strlen (sum);
1216 const char *dottedname;
1217 flag_context_list_ty *context_list;
1218
1219 for (;;)
1220 {
1221 token_ty token2;
1222
1223 x_java_lex (&token2);
1224 if (token2.type == token_type_dot)
1225 {
1226 token_ty token3;
1227
1228 x_java_lex (&token3);
1229 if (token3.type == token_type_symbol)
1230 {
1231 char *addend = token3.string;
1232 size_t addend_len = strlen (addend);
1233
1234 sum =
1235 (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1236 sum[sum_len] = '.';
1237 memcpy (sum + sum_len + 1, addend, addend_len + 1);
1238 sum_len += 1 + addend_len;
1239
1240 free_token (&token3);
1241 free_token (&token2);
1242 continue;
1243 }
1244 x_java_unlex (&token3);
1245 }
1246 x_java_unlex (&token2);
1247 break;
1248 }
1249
1250 for (dottedname = sum;;)
1251 {
1252 void *keyword_value;
1253
1254 if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
1255 &keyword_value)
1256 == 0)
1257 {
1258 next_shapes = (const struct callshapes *) keyword_value;
1259 state = 1;
1260 break;
1261 }
1262
1263 dottedname = strchr (dottedname, '.');
1264 if (dottedname == NULL)
1265 {
1266 state = 0;
1267 break;
1268 }
1269 dottedname++;
1270 }
1271
1272 for (dottedname = sum;;)
1273 {
1274 context_list =
1275 flag_context_list_table_lookup (
1276 flag_context_list_table,
1277 dottedname, strlen (dottedname));
1278 if (context_list != NULL)
1279 break;
1280
1281 dottedname = strchr (dottedname, '.');
1282 if (dottedname == NULL)
1283 break;
1284 dottedname++;
1285 }
1286 next_context_iter = flag_context_list_iterator (context_list);
1287
1288 free (sum);
1289 continue;
1290 }
1291
1292 case token_type_lparen:
1293 if (extract_parenthesized (mlp, token_type_rparen,
1294 inner_context, next_context_iter,
1295 arglist_parser_alloc (mlp,
1296 state ? next_shapes : NULL)))
1297 {
1298 xgettext_current_source_encoding = po_charset_utf8;
1299 arglist_parser_done (argparser, arg);
1300 xgettext_current_source_encoding = xgettext_global_source_encoding;
1301 return true;
1302 }
1303 next_context_iter = null_context_list_iterator;
1304 state = 0;
1305 continue;
1306
1307 case token_type_rparen:
1308 if (terminator == token_type_rparen)
1309 {
1310 xgettext_current_source_encoding = po_charset_utf8;
1311 arglist_parser_done (argparser, arg);
1312 xgettext_current_source_encoding = xgettext_global_source_encoding;
1313 return false;
1314 }
1315 if (terminator == token_type_rbrace)
1316 {
1317 error_with_progname = false;
1318 error (0, 0,
1319 _("%s:%d: warning: ')' found where '}' was expected"),
1320 logical_file_name, token.line_number);
1321 error_with_progname = true;
1322 }
1323 next_context_iter = null_context_list_iterator;
1324 state = 0;
1325 continue;
1326
1327 case token_type_lbrace:
1328 if (extract_parenthesized (mlp, token_type_rbrace,
1329 null_context, null_context_list_iterator,
1330 arglist_parser_alloc (mlp, NULL)))
1331 {
1332 xgettext_current_source_encoding = po_charset_utf8;
1333 arglist_parser_done (argparser, arg);
1334 xgettext_current_source_encoding = xgettext_global_source_encoding;
1335 return true;
1336 }
1337 next_context_iter = null_context_list_iterator;
1338 state = 0;
1339 continue;
1340
1341 case token_type_rbrace:
1342 if (terminator == token_type_rbrace)
1343 {
1344 xgettext_current_source_encoding = po_charset_utf8;
1345 arglist_parser_done (argparser, arg);
1346 xgettext_current_source_encoding = xgettext_global_source_encoding;
1347 return false;
1348 }
1349 if (terminator == token_type_rparen)
1350 {
1351 error_with_progname = false;
1352 error (0, 0,
1353 _("%s:%d: warning: '}' found where ')' was expected"),
1354 logical_file_name, token.line_number);
1355 error_with_progname = true;
1356 }
1357 next_context_iter = null_context_list_iterator;
1358 state = 0;
1359 continue;
1360
1361 case token_type_comma:
1362 arg++;
1363 inner_context =
1364 inherited_context (outer_context,
1365 flag_context_list_iterator_advance (
1366 &context_iter));
1367 next_context_iter = passthrough_context_list_iterator;
1368 state = 0;
1369 continue;
1370
1371 case token_type_string_literal:
1372 {
1373 lex_pos_ty pos;
1374 pos.file_name = logical_file_name;
1375 pos.line_number = token.line_number;
1376
1377 xgettext_current_source_encoding = po_charset_utf8;
1378 if (extract_all)
1379 remember_a_message (mlp, NULL, token.string, inner_context,
1380 &pos, token.comment);
1381 else
1382 arglist_parser_remember (argparser, arg, token.string,
1383 inner_context,
1384 pos.file_name, pos.line_number,
1385 token.comment);
1386 xgettext_current_source_encoding = xgettext_global_source_encoding;
1387 }
1388 drop_reference (token.comment);
1389 next_context_iter = null_context_list_iterator;
1390 state = 0;
1391 continue;
1392
1393 case token_type_eof:
1394 xgettext_current_source_encoding = po_charset_utf8;
1395 arglist_parser_done (argparser, arg);
1396 xgettext_current_source_encoding = xgettext_global_source_encoding;
1397 return true;
1398
1399 case token_type_dot:
1400 case token_type_number:
1401 case token_type_plus:
1402 case token_type_other:
1403 next_context_iter = null_context_list_iterator;
1404 state = 0;
1405 continue;
1406
1407 default:
1408 abort ();
1409 }
1410 }
1411 }
1412
1413
1414 void
extract_java(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1415 extract_java (FILE *f,
1416 const char *real_filename, const char *logical_filename,
1417 flag_context_list_table_ty *flag_table,
1418 msgdomain_list_ty *mdlp)
1419 {
1420 message_list_ty *mlp = mdlp->item[0]->messages;
1421
1422 fp = f;
1423 real_file_name = real_filename;
1424 logical_file_name = xstrdup (logical_filename);
1425 line_number = 1;
1426
1427 last_comment_line = -1;
1428 last_non_comment_line = -1;
1429
1430 phase6_last = token_type_eof;
1431
1432 flag_context_list_table = flag_table;
1433
1434 init_keywords ();
1435
1436 /* Eat tokens until eof is seen. When extract_parenthesized returns
1437 due to an unbalanced closing parenthesis, just restart it. */
1438 while (!extract_parenthesized (mlp, token_type_eof,
1439 null_context, null_context_list_iterator,
1440 arglist_parser_alloc (mlp, NULL)))
1441 ;
1442
1443 fp = NULL;
1444 real_file_name = NULL;
1445 logical_file_name = NULL;
1446 line_number = 0;
1447 }
1448