xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/x-java.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* xgettext Java backend.
2    Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2003.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 2, or (at your option)
8    any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, write to the Free Software Foundation,
17    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 
23 #include <errno.h>
24 #include <stdbool.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 
29 #include "message.h"
30 #include "xgettext.h"
31 #include "x-java.h"
32 #include "error.h"
33 #include "xalloc.h"
34 #include "exit.h"
35 #include "hash.h"
36 #include "po-charset.h"
37 #include "utf16-ucs4.h"
38 #include "ucs4-utf8.h"
39 #include "gettext.h"
40 
/* Shorthand for looking up the translation of the message S.  */
#define _(s) gettext(s)

/* Number of elements of the array A.  A must be an actual array, not a
   pointer.  The argument is parenthesized at both uses, so that any
   array-valued expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
44 
45 
46 /* The Java syntax is defined in the
47      Java Language Specification, Second Edition,
48      (available from http://java.sun.com/),
49      chapter 3 "Lexical Structure".  */
50 
51 
52 /* ====================== Keyword set customization.  ====================== */
53 
/* If true extract all strings.  Set by x_java_extract_all ().  */
static bool extract_all = false;

/* Table of the registered keywords.  It is created lazily by
   x_java_keyword (), the first time a keyword is registered.  */
static hash_table keywords;
/* True as long as the built-in keywords still have to be installed by
   init_keywords ().  Cleared by x_java_keyword (NULL) and by
   init_keywords () itself.  */
static bool default_keywords = true;
59 
60 
61 void
x_java_extract_all()62 x_java_extract_all ()
63 {
64   extract_all = true;
65 }
66 
67 
68 void
x_java_keyword(const char * name)69 x_java_keyword (const char *name)
70 {
71   if (name == NULL)
72     default_keywords = false;
73   else
74     {
75       const char *end;
76       struct callshape shape;
77       const char *colon;
78 
79       if (keywords.table == NULL)
80 	hash_init (&keywords, 100);
81 
82       split_keywordspec (name, &end, &shape);
83 
84       /* The characters between name and end should form a valid Java
85 	 identifier sequence with dots.
86 	 A colon means an invalid parse in split_keywordspec().  */
87       colon = strchr (name, ':');
88       if (colon == NULL || colon >= end)
89 	insert_keyword_callshape (&keywords, name, end - name, &shape);
90     }
91 }
92 
93 /* Finish initializing the keywords hash table.
94    Called after argument processing, before each file is processed.  */
95 static void
init_keywords()96 init_keywords ()
97 {
98   if (default_keywords)
99     {
100       /* When adding new keywords here, also update the documentation in
101 	 xgettext.texi!  */
102       x_java_keyword ("GettextResource.gettext:2");	/* static method */
103       x_java_keyword ("GettextResource.ngettext:2,3");	/* static method */
104       x_java_keyword ("gettext");
105       x_java_keyword ("ngettext:1,2");
106       x_java_keyword ("getString");	/* ResourceBundle.getString */
107       default_keywords = false;
108     }
109 }
110 
/* Record the format string flags of the built-in Java functions.
   Each specification has the form "name:argument number:flag".  */
void
init_flag_table_java (void)
{
  xgettext_record_flag ("GettextResource.gettext:2:pass-java-format");
  xgettext_record_flag ("GettextResource.ngettext:2:pass-java-format");
  xgettext_record_flag ("GettextResource.ngettext:3:pass-java-format");
  xgettext_record_flag ("gettext:1:pass-java-format");
  xgettext_record_flag ("ngettext:1:pass-java-format");
  xgettext_record_flag ("ngettext:2:pass-java-format");
  xgettext_record_flag ("getString:1:pass-java-format");
  xgettext_record_flag ("MessageFormat:1:java-format");
  xgettext_record_flag ("MessageFormat.format:1:java-format");
}
124 
125 
126 /* ======================== Reading of characters.  ======================== */
127 
/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number is kept up to date by phase3_getc () and phase3_ungetc ().  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  It is read by phase1_getc ().  */
static FILE *fp;
137 
138 
139 /* Fetch the next single-byte character from the input file.
140    Pushback can consist of an unlimited number of 'u' followed by up to 4
141    other characters.  */
142 
143 /* Special coding of multiple 'u's in the pushback buffer.  */
144 #define MULTIPLE_U(count) (0x1000 + (count))
145 
146 static int phase1_pushback[5];
147 static unsigned int phase1_pushback_length;
148 
149 static int
phase1_getc()150 phase1_getc ()
151 {
152   int c;
153 
154   if (phase1_pushback_length)
155     {
156       c = phase1_pushback[--phase1_pushback_length];
157       if (c >= MULTIPLE_U (0))
158 	{
159 	  if (c > MULTIPLE_U (1))
160 	    phase1_pushback[phase1_pushback_length++] = c - 1;
161 	  return 'u';
162 	}
163       else
164 	return c;
165     }
166 
167   c = getc (fp);
168 
169   if (c == EOF)
170     {
171       if (ferror (fp))
172 	error (EXIT_FAILURE, errno, _("\
173 error while reading \"%s\""), real_file_name);
174     }
175 
176   return c;
177 }
178 
179 /* Supports any number of 'u' and up to 4 arbitrary characters of pushback.  */
180 static void
phase1_ungetc(int c)181 phase1_ungetc (int c)
182 {
183   if (c != EOF)
184     {
185       if (c == 'u')
186 	{
187 	  if (phase1_pushback_length > 0
188 	      && phase1_pushback[phase1_pushback_length - 1] >= MULTIPLE_U (0))
189 	    phase1_pushback[phase1_pushback_length - 1]++;
190 	  else
191 	    {
192 	      if (phase1_pushback_length == SIZEOF (phase1_pushback))
193 		abort ();
194 	      phase1_pushback[phase1_pushback_length++] = MULTIPLE_U (1);
195 	    }
196 	}
197       else
198 	{
199 	  if (phase1_pushback_length == SIZEOF (phase1_pushback))
200 	    abort ();
201 	  phase1_pushback[phase1_pushback_length++] = c;
202 	}
203     }
204 }
205 
206 
207 /* Fetch the next single-byte character or Unicode character from the file.
208    (Here, as in the Java Language Specification, when we say "Unicode
209    character", we actually mean "UTF-16 encoding unit".)  */
210 
/* Return value of phase 2, 3, 4 when EOF is reached.  */
#define P2_EOF 0xffff

/* Convert a UTF-16 code point to a return value that can be distinguished
   from a single-byte return value.  */
#define UNICODE(code) (0x10000 + (code))

/* Test whether a return value of phase 2, 3, 4 designates a UTF-16 code
   point.  */
#define IS_UNICODE(p2_result) ((p2_result) >= 0x10000)

/* Extract the UTF-16 code of a return value that satisfies IS_UNICODE.  */
#define UTF16_VALUE(p2_result) ((p2_result) - 0x10000)

/* Reduces a return value of phase 2, 3, 4 by unmasking the UNICODE bit,
   so that it can be more easily compared against an ASCII character.
   (RED (c) == 'x')  is equivalent to  (c == 'x' || c == UNICODE ('x')).  */
#define RED(p2_result) ((p2_result) & 0xffff)
229 
230 static int phase2_pushback[1];
231 static int phase2_pushback_length;
232 
233 static int
phase2_getc()234 phase2_getc ()
235 {
236   int c;
237 
238   if (phase2_pushback_length)
239     return phase2_pushback[--phase2_pushback_length];
240 
241   c = phase1_getc ();
242   if (c == EOF)
243     return P2_EOF;
244   if (c == '\\')
245     {
246       c = phase1_getc ();
247       if (c == 'u')
248 	{
249 	  unsigned int u_count = 1;
250 	  unsigned char buf[4];
251 	  unsigned int n;
252 	  int i;
253 
254 	  for (;;)
255 	    {
256 	      c = phase1_getc ();
257 	      if (c != 'u')
258 		break;
259 	      u_count++;
260 	    }
261 	  phase1_ungetc (c);
262 
263 	  n = 0;
264 	  for (i = 0; i < 4; i++)
265 	    {
266 	      c = phase1_getc ();
267 
268 	      if (c >= '0' && c <= '9')
269 		n = (n << 4) + (c - '0');
270 	      else if (c >= 'A' && c <= 'F')
271 		n = (n << 4) + (c - 'A' + 10);
272 	      else if (c >= 'a' && c <= 'f')
273 		n = (n << 4) + (c - 'a' + 10);
274 	      else
275 		{
276 		  phase1_ungetc (c);
277 		  while (--i >= 0)
278 		    phase1_ungetc (buf[i]);
279 		  for (; u_count > 0; u_count--)
280 		    phase1_ungetc ('u');
281 		  return '\\';
282 		}
283 
284 	      buf[i] = c;
285 	    }
286 	  return UNICODE (n);
287 	}
288       phase1_ungetc (c);
289       return '\\';
290     }
291   return c;
292 }
293 
294 /* Supports only one pushback character.  */
295 static void
phase2_ungetc(int c)296 phase2_ungetc (int c)
297 {
298   if (c != P2_EOF)
299     {
300       if (phase2_pushback_length == SIZEOF (phase2_pushback))
301 	abort ();
302       phase2_pushback[phase2_pushback_length++] = c;
303     }
304 }
305 
306 
307 /* Fetch the next single-byte character or Unicode character from the file.
308    With line number handling.
309    Convert line terminators to '\n' or UNICODE ('\n').  */
310 
311 static int phase3_pushback[2];
312 static int phase3_pushback_length;
313 
314 static int
phase3_getc()315 phase3_getc ()
316 {
317   int c;
318 
319   if (phase3_pushback_length)
320     {
321       c = phase3_pushback[--phase3_pushback_length];
322       if (c == '\n')
323 	++line_number;
324       return c;
325     }
326 
327   c = phase2_getc ();
328 
329   /* Handle line terminators.  */
330   if (RED (c) == '\r')
331     {
332       int c1 = phase2_getc ();
333 
334       if (RED (c1) != '\n')
335 	phase2_ungetc (c1);
336 
337       /* Seen line terminator CR or CR/LF.  */
338       if (c == '\r' || c1 == '\n')
339 	{
340 	  ++line_number;
341 	  return '\n';
342 	}
343       else
344 	return UNICODE ('\n');
345     }
346   else if (RED (c) == '\n')
347     {
348       /* Seen line terminator LF.  */
349       if (c == '\n')
350 	{
351 	  ++line_number;
352 	  return '\n';
353 	}
354       else
355 	return UNICODE ('\n');
356     }
357 
358   return c;
359 }
360 
361 /* Supports 2 characters of pushback.  */
362 static void
phase3_ungetc(int c)363 phase3_ungetc (int c)
364 {
365   if (c != P2_EOF)
366     {
367       if (c == '\n')
368 	--line_number;
369       if (phase3_pushback_length == SIZEOF (phase3_pushback))
370 	abort ();
371       phase3_pushback[phase3_pushback_length++] = c;
372     }
373 }
374 
375 
376 /* ========================= Accumulating strings.  ======================== */
377 
/* A string buffer type that allows appending bytes (in the
   xgettext_current_source_encoding) or Unicode characters.
   Returns the entire string in UTF-8 encoding.
   Appended bytes are collected in curr_buffer and converted to UTF-8 in
   one go when they are flushed into utf8_buffer.  */

struct string_buffer
{
  /* The part of the string that has already been converted to UTF-8.
     utf8_buflen bytes are in use, utf8_allocated bytes are allocated.  */
  char *utf8_buffer;
  size_t utf8_buflen;
  size_t utf8_allocated;
  /* The first half of a UTF-16 surrogate character that is still waiting
     for its second half, or 0 if there is none.  */
  unsigned short utf16_surr;
  /* The part of the string that is still in the source encoding.
     curr_buflen bytes are in use, curr_allocated bytes are allocated.  */
  char *curr_buffer;
  size_t curr_buflen;
  size_t curr_allocated;
};
395 
396 /* Initialize a 'struct string_buffer' to empty.  */
397 static inline void
init_string_buffer(struct string_buffer * bp)398 init_string_buffer (struct string_buffer *bp)
399 {
400   bp->utf8_buffer = NULL;
401   bp->utf8_buflen = 0;
402   bp->utf8_allocated = 0;
403   bp->utf16_surr = 0;
404   bp->curr_buffer = NULL;
405   bp->curr_buflen = 0;
406   bp->curr_allocated = 0;
407 }
408 
409 /* Auxiliary function: Append a byte to bp->curr.  */
410 static inline void
string_buffer_append_byte(struct string_buffer * bp,unsigned char c)411 string_buffer_append_byte (struct string_buffer *bp, unsigned char c)
412 {
413   if (bp->curr_buflen == bp->curr_allocated)
414     {
415       bp->curr_allocated = 2 * bp->curr_allocated + 10;
416       bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
417     }
418   bp->curr_buffer[bp->curr_buflen++] = c;
419 }
420 
421 /* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
422 static inline void
string_buffer_append_unicode_grow(struct string_buffer * bp,size_t count)423 string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count)
424 {
425   if (bp->utf8_buflen + count > bp->utf8_allocated)
426     {
427       size_t new_allocated = 2 * bp->utf8_allocated + 10;
428       if (new_allocated < bp->utf8_buflen + count)
429 	new_allocated = bp->utf8_buflen + count;
430       bp->utf8_allocated = new_allocated;
431       bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
432     }
433 }
434 
435 /* Auxiliary function: Append a Unicode character to bp->utf8.
436    uc must be < 0x110000.  */
437 static inline void
string_buffer_append_unicode(struct string_buffer * bp,unsigned int uc)438 string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc)
439 {
440   unsigned char utf8buf[6];
441   int count = u8_uctomb (utf8buf, uc, 6);
442 
443   if (count < 0)
444     /* The caller should have ensured that uc is not out-of-range.  */
445     abort ();
446 
447   string_buffer_append_unicode_grow (bp, count);
448   memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
449   bp->utf8_buflen += count;
450 }
451 
452 /* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer.  */
453 static inline void
string_buffer_flush_utf16_surr(struct string_buffer * bp)454 string_buffer_flush_utf16_surr (struct string_buffer *bp)
455 {
456   if (bp->utf16_surr != 0)
457     {
458       /* A half surrogate is invalid, therefore use U+FFFD instead.  */
459       string_buffer_append_unicode (bp, 0xfffd);
460       bp->utf16_surr = 0;
461     }
462 }
463 
464 /* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer.  */
465 static inline void
string_buffer_flush_curr_buffer(struct string_buffer * bp,int lineno)466 string_buffer_flush_curr_buffer (struct string_buffer *bp, int lineno)
467 {
468   if (bp->curr_buflen > 0)
469     {
470       char *curr;
471       size_t count;
472 
473       string_buffer_append_byte (bp, '\0');
474 
475       /* Convert from the source encoding to UTF-8.  */
476       curr = from_current_source_encoding (bp->curr_buffer,
477 					   logical_file_name, lineno);
478 
479       /* Append it to bp->utf8_buffer.  */
480       count = strlen (curr);
481       string_buffer_append_unicode_grow (bp, count);
482       memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
483       bp->utf8_buflen += count;
484 
485       if (curr != bp->curr_buffer)
486 	free (curr);
487       bp->curr_buflen = 0;
488     }
489 }
490 
491 /* Append a character or Unicode character to a 'struct string_buffer'.  */
492 static void
string_buffer_append(struct string_buffer * bp,int c)493 string_buffer_append (struct string_buffer *bp, int c)
494 {
495   if (IS_UNICODE (c))
496     {
497       /* Append a Unicode character.  */
498 
499       /* Switch from multibyte character mode to Unicode character mode.  */
500       string_buffer_flush_curr_buffer (bp, line_number);
501 
502       /* Test whether this character and the previous one form a Unicode
503 	 surrogate character pair.  */
504       if (bp->utf16_surr != 0
505 	  && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
506 	{
507 	  unsigned short utf16buf[2];
508 	  unsigned int uc;
509 
510 	  utf16buf[0] = bp->utf16_surr;
511 	  utf16buf[1] = UTF16_VALUE (c);
512 	  if (u16_mbtouc_aux (&uc, utf16buf, 2) != 2)
513 	    abort ();
514 
515 	  string_buffer_append_unicode (bp, uc);
516 	  bp->utf16_surr = 0;
517 	}
518       else
519 	{
520 	  string_buffer_flush_utf16_surr (bp);
521 
522 	  if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
523 	    bp->utf16_surr = UTF16_VALUE (c);
524 	  else
525 	    string_buffer_append_unicode (bp, UTF16_VALUE (c));
526 	}
527     }
528   else
529     {
530       /* Append a single byte.  */
531 
532       /* Switch from Unicode character mode to multibyte character mode.  */
533       string_buffer_flush_utf16_surr (bp);
534 
535       /* When a newline is seen, convert the accumulated multibyte sequence.
536 	 This ensures a correct line number in the error message in case of
537 	 a conversion error.  The "- 1" is to account for the newline.  */
538       if (c == '\n')
539 	string_buffer_flush_curr_buffer (bp, line_number - 1);
540 
541       string_buffer_append_byte (bp, (unsigned char) c);
542     }
543 }
544 
545 /* Return the string buffer's contents.  */
546 static char *
string_buffer_result(struct string_buffer * bp)547 string_buffer_result (struct string_buffer *bp)
548 {
549   /* Flush all into bp->utf8_buffer.  */
550   string_buffer_flush_utf16_surr (bp);
551   string_buffer_flush_curr_buffer (bp, line_number);
552   /* NUL-terminate it.  */
553   string_buffer_append_unicode_grow (bp, 1);
554   bp->utf8_buffer[bp->utf8_buflen] = '\0';
555   /* Return it.  */
556   return bp->utf8_buffer;
557 }
558 
559 /* Free the memory pointed to by a 'struct string_buffer'.  */
560 static inline void
free_string_buffer(struct string_buffer * bp)561 free_string_buffer (struct string_buffer *bp)
562 {
563   free (bp->utf8_buffer);
564   free (bp->curr_buffer);
565 }
566 
567 
568 /* ======================== Accumulating comments.  ======================== */
569 
570 
571 /* Accumulating a single comment line.  */
572 
573 static struct string_buffer comment_buffer;
574 
575 static inline void
comment_start()576 comment_start ()
577 {
578   comment_buffer.utf8_buflen = 0;
579   comment_buffer.utf16_surr = 0;
580   comment_buffer.curr_buflen = 0;
581 }
582 
583 static inline bool
comment_at_start()584 comment_at_start ()
585 {
586   return (comment_buffer.utf8_buflen == 0 && comment_buffer.utf16_surr == 0
587 	  && comment_buffer.curr_buflen == 0);
588 }
589 
/* Append the phase 3 character C (a single byte or a UNICODE value) to the
   current comment line.  */
static inline void
comment_add (int c)
{
  string_buffer_append (&comment_buffer, c);
}
595 
596 static inline void
comment_line_end(size_t chars_to_remove)597 comment_line_end (size_t chars_to_remove)
598 {
599   char *buffer = string_buffer_result (&comment_buffer);
600   size_t buflen = strlen (buffer);
601 
602   buflen -= chars_to_remove;
603   while (buflen >= 1
604 	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
605     --buflen;
606   buffer[buflen] = '\0';
607   savable_comment_add (buffer);
608 }
609 
610 
/* These are for tracking whether comments count as immediately before
   keyword.  last_comment_line is set by phase4_getc () when it has seen a
   comment.  last_non_comment_line is set by phase5_get () to the line of
   the token it is reading.  */
static int last_comment_line;
static int last_non_comment_line;
615 
616 
617 /* Replace each comment that is not inside a character constant or string
618    literal with a space or newline character.  */
619 
620 static int
phase4_getc()621 phase4_getc ()
622 {
623   int c0;
624   int c;
625   bool last_was_star;
626 
627   c0 = phase3_getc ();
628   if (RED (c0) != '/')
629     return c0;
630   c = phase3_getc ();
631   switch (RED (c))
632     {
633     default:
634       phase3_ungetc (c);
635       return c0;
636 
637     case '*':
638       /* C style comment.  */
639       comment_start ();
640       last_was_star = false;
641       for (;;)
642 	{
643 	  c = phase3_getc ();
644 	  if (c == P2_EOF)
645 	    break;
646 	  /* We skip all leading white space, but not EOLs.  */
647 	  if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
648 	    comment_add (c);
649 	  switch (RED (c))
650 	    {
651 	    case '\n':
652 	      comment_line_end (1);
653 	      comment_start ();
654 	      last_was_star = false;
655 	      continue;
656 
657 	    case '*':
658 	      last_was_star = true;
659 	      continue;
660 
661 	    case '/':
662 	      if (last_was_star)
663 		{
664 		  comment_line_end (2);
665 		  break;
666 		}
667 	      /* FALLTHROUGH */
668 
669 	    default:
670 	      last_was_star = false;
671 	      continue;
672 	    }
673 	  break;
674 	}
675       last_comment_line = line_number;
676       return ' ';
677 
678     case '/':
679       /* C++ style comment.  */
680       last_comment_line = line_number;
681       comment_start ();
682       for (;;)
683 	{
684 	  c = phase3_getc ();
685 	  if (RED (c) == '\n' || c == P2_EOF)
686 	    break;
687 	  /* We skip all leading white space, but not EOLs.  */
688 	  if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
689 	    comment_add (c);
690 	}
691       phase3_ungetc (c); /* push back the newline, to decrement line_number */
692       comment_line_end (0);
693       phase3_getc (); /* read the newline again */
694       return '\n';
695     }
696 }
697 
/* Push the phase 4 character C back.  Phase 4 has no pushback buffer of
   its own; the character is handed to phase3_ungetc ().
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
704 
705 
706 /* ========================== Reading of tokens.  ========================== */
707 
/* The kinds of tokens that phase5_get () distinguishes.  */
enum token_type_ty
{
  token_type_eof,		/* end of the input */
  token_type_lparen,		/* ( */
  token_type_rparen,		/* ) */
  token_type_lbrace,		/* { */
  token_type_rbrace,		/* } */
  token_type_comma,		/* , */
  token_type_dot,		/* . */
  token_type_string_literal,	/* "abc" */
  token_type_number,		/* 1.23 */
  token_type_symbol,		/* identifier, keyword, null */
  token_type_plus,		/* + */
  token_type_other		/* character literal, misc. operator */
};
typedef enum token_type_ty token_type_ty;
724 
/* A token, as produced by phase5_get ().  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  /* Allocated string; released by free_token ().  */
  char *string;		/* for token_type_string_literal, token_type_symbol */
  /* Reference to the comments that were current when the string literal
     was read; dropped by free_token ().  */
  refcounted_string_list_ty *comment;	/* for token_type_string_literal */
  /* Value of line_number at the time the token was read.  */
  int line_number;
};
733 
734 
735 /* Free the memory pointed to by a 'struct token_ty'.  */
736 static inline void
free_token(token_ty * tp)737 free_token (token_ty *tp)
738 {
739   if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
740     free (tp->string);
741   if (tp->type == token_type_string_literal)
742     drop_reference (tp->comment);
743 }
744 
745 
746 /* Read an escape sequence inside a string literal or character literal.  */
747 static inline int
do_getc_escaped()748 do_getc_escaped ()
749 {
750   int c;
751 
752   /* Use phase 3, because phase 4 elides comments.  */
753   c = phase3_getc ();
754   if (c == P2_EOF)
755     return UNICODE ('\\');
756   switch (RED (c))
757     {
758     case 'b':
759       return UNICODE (0x08);
760     case 't':
761       return UNICODE (0x09);
762     case 'n':
763       return UNICODE (0x0a);
764     case 'f':
765       return UNICODE (0x0c);
766     case 'r':
767       return UNICODE (0x0d);
768     case '"':
769       return UNICODE ('"');
770     case '\'':
771       return UNICODE ('\'');
772     case '\\':
773       return UNICODE ('\\');
774     case '0': case '1': case '2': case '3':
775     case '4': case '5': case '6': case '7':
776       {
777 	int n = RED (c) - '0';
778 	bool maybe3digits = (n < 4);
779 
780 	c = phase3_getc ();
781 	if (RED (c) >= '0' && RED (c) <= '7')
782 	  {
783 	    n = (n << 3) + (RED (c) - '0');
784 	    if (maybe3digits)
785 	      {
786 		c = phase3_getc ();
787 		if (RED (c) >= '0' && RED (c) <= '7')
788 		  n = (n << 3) + (RED (c) - '0');
789 		else
790 		  phase3_ungetc (c);
791 	      }
792 	  }
793 	else
794 	  phase3_ungetc (c);
795 
796 	return UNICODE (n);
797       }
798     default:
799       /* Invalid escape sequence.  */
800       phase3_ungetc (c);
801       return UNICODE ('\\');
802     }
803 }
804 
805 /* Read a string literal or character literal.  */
806 static void
accumulate_escaped(struct string_buffer * literal,int delimiter)807 accumulate_escaped (struct string_buffer *literal, int delimiter)
808 {
809   int c;
810 
811   for (;;)
812     {
813       /* Use phase 3, because phase 4 elides comments.  */
814       c = phase3_getc ();
815       if (c == P2_EOF || RED (c) == delimiter)
816 	break;
817       if (RED (c) == '\n')
818 	{
819 	  phase3_ungetc (c);
820 	  error_with_progname = false;
821 	  if (delimiter == '\'')
822 	    error (0, 0, _("%s:%d: warning: unterminated character constant"),
823 		   logical_file_name, line_number);
824 	  else
825 	    error (0, 0, _("%s:%d: warning: unterminated string constant"),
826 		   logical_file_name, line_number);
827 	  error_with_progname = true;
828 	  break;
829 	}
830       if (RED (c) == '\\')
831 	c = do_getc_escaped ();
832       string_buffer_append (literal, c);
833     }
834 }
835 
836 
/* Combine characters into tokens.  Discard whitespace.  */

/* Pushback stack of phase 5, filled by phase5_unget ().  */
static token_ty phase5_pushback[3];
static int phase5_pushback_length;

/* Read the next token into *TP.  A token that was pushed back with
   phase5_unget () is returned first.
   tp->line_number is the value of line_number before the first character
   of the token was read.  For a symbol and for a string literal,
   tp->string is a freshly allocated string.  For a string literal,
   tp->comment additionally holds a reference to savable_comment.  */
static void
phase5_get (token_ty *tp)
{
  int c;

  if (phase5_pushback_length)
    {
      *tp = phase5_pushback[--phase5_pushback_length];
      return;
    }
  tp->string = NULL;

  for (;;)
    {
      tp->line_number = line_number;
      c = phase4_getc ();

      if (c == P2_EOF)
	{
	  tp->type = token_type_eof;
	  return;
	}

      switch (RED (c))
	{
	case '\n':
	  /* Forget the saved comments at the end of a line, if a token was
	     seen on a later line than the last comment.  */
	  if (last_non_comment_line > last_comment_line)
	    savable_comment_reset ();
	  /* FALLTHROUGH */
	case ' ':
	case '\t':
	case '\f':
	  /* Ignore whitespace and comments.  */
	  continue;
	}

      last_non_comment_line = tp->line_number;

      switch (RED (c))
	{
	case '(':
	  tp->type = token_type_lparen;
	  return;

	case ')':
	  tp->type = token_type_rparen;
	  return;

	case '{':
	  tp->type = token_type_lbrace;
	  return;

	case '}':
	  tp->type = token_type_rbrace;
	  return;

	case ',':
	  tp->type = token_type_comma;
	  return;

	case '.':
	  /* A dot that is followed by a digit starts a number.  */
	  c = phase4_getc ();
	  if (!(RED (c) >= '0' && RED (c) <= '9'))
	    {
	      phase4_ungetc (c);
	      tp->type = token_type_dot;
	      return;
	    }
	  /* FALLTHROUGH */

	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	  {
	    /* Don't need to verify the complicated syntax of integers and
	       floating-point numbers.  We assume a valid Java input.
	       The simplified syntax that we recognize as number is: any
	       sequence of alphanumeric characters, additionally '+' and '-'
	       immediately after 'e' or 'E' except in hexadecimal numbers.  */
	    bool hexadecimal = false;

	    for (;;)
	      {
		c = phase4_getc ();
		if (RED (c) >= '0' && RED (c) <= '9')
		  continue;
		if ((RED (c) >= 'A' && RED (c) <= 'Z')
		    || (RED (c) >= 'a' && RED (c) <= 'z'))
		  {
		    if (RED (c) == 'X' || RED (c) == 'x')
		      hexadecimal = true;
		    if ((RED (c) == 'E' || RED (c) == 'e') && !hexadecimal)
		      {
			/* Swallow the sign of the exponent.  */
			c = phase4_getc ();
			if (!(RED (c) == '+' || RED (c) == '-'))
			  phase4_ungetc (c);
		      }
		    continue;
		  }
		if (RED (c) == '.')
		  continue;
		break;
	      }
	    phase4_ungetc (c);
	    tp->type = token_type_number;
	    return;
	  }

	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
	case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
	case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
	case 'V': case 'W': case 'X': case 'Y': case 'Z':
	case '_':
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
	case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
	case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
	case 'v': case 'w': case 'x': case 'y': case 'z':
	  /* Although Java allows identifiers containing many Unicode
	     characters, we recognize only identifiers consisting of ASCII
	     characters.  This avoids conversion hassles w.r.t. the --keyword
	     arguments, and shouldn't be a big problem in practice.  */
	  {
	    /* The buffer is static, so that it is reused from one
	       identifier to the next.  */
	    static char *buffer;
	    static int bufmax;
	    int bufpos = 0;
	    for (;;)
	      {
		if (bufpos >= bufmax)
		  {
		    bufmax = 2 * bufmax + 10;
		    buffer = xrealloc (buffer, bufmax);
		  }
		buffer[bufpos++] = RED (c);
		c = phase4_getc ();
		if (!((RED (c) >= 'A' && RED (c) <= 'Z')
		      || (RED (c) >= 'a' && RED (c) <= 'z')
		      || (RED (c) >= '0' && RED (c) <= '9')
		      || RED (c) == '_'))
		  break;
	      }
	    phase4_ungetc (c);
	    /* Make room for the terminating NUL.  */
	    if (bufpos >= bufmax)
	      {
		bufmax = 2 * bufmax + 10;
		buffer = xrealloc (buffer, bufmax);
	      }
	    buffer[bufpos] = '\0';
	    tp->string = xstrdup (buffer);
	    tp->type = token_type_symbol;
	    return;
	  }

	case '"':
	  /* String literal.  */
	  {
	    struct string_buffer literal;

	    init_string_buffer (&literal);
	    accumulate_escaped (&literal, '"');
	    /* The result belongs to the string buffer, therefore copy it.  */
	    tp->string = xstrdup (string_buffer_result (&literal));
	    free_string_buffer (&literal);
	    tp->comment = add_reference (savable_comment);
	    tp->type = token_type_string_literal;
	    return;
	  }

	case '\'':
	  /* Character literal.  */
	  {
	    struct string_buffer literal;

	    /* The contents are read only in order to skip them.  */
	    init_string_buffer (&literal);
	    accumulate_escaped (&literal, '\'');
	    free_string_buffer (&literal);
	    tp->type = token_type_other;
	    return;
	  }

	case '+':
	  c = phase4_getc ();
	  if (RED (c) == '+')
	    /* Operator ++ */
	    tp->type = token_type_other;
	  else if (RED (c) == '=')
	    /* Operator += */
	    tp->type = token_type_other;
	  else
	    {
	      /* Operator + */
	      phase4_ungetc (c);
	      tp->type = token_type_plus;
	    }
	  return;

	default:
	  /* Misc. operator.  */
	  tp->type = token_type_other;
	  return;
	}
    }
}
1042 
1043 /* Supports 3 tokens of pushback.  */
1044 static void
phase5_unget(token_ty * tp)1045 phase5_unget (token_ty *tp)
1046 {
1047   if (tp->type != token_type_eof)
1048     {
1049       if (phase5_pushback_length == SIZEOF (phase5_pushback))
1050 	abort ();
1051       phase5_pushback[phase5_pushback_length++] = *tp;
1052     }
1053 }
1054 
1055 
/* Compile-time optimization of string literal concatenation.
   Combine "string1" + ... + "stringN" to the concatenated string if
     - the token before this expression is not ')' (because then the first
       string could be part of a cast expression),
     - the token after this expression is not '.' (because then the last
       string could be part of a method call expression).  */

/* Pushback stack of phase 6, filled by phase6_unget ().  */
static token_ty phase6_pushback[2];
static int phase6_pushback_length;

/* Type of the token that phase6_get () has read most recently from
   phase 5.  */
static token_type_ty phase6_last;
1067 
1068 static void
phase6_get(token_ty * tp)1069 phase6_get (token_ty *tp)
1070 {
1071   if (phase6_pushback_length)
1072     {
1073       *tp = phase6_pushback[--phase6_pushback_length];
1074       return;
1075     }
1076 
1077   phase5_get (tp);
1078   if (tp->type == token_type_string_literal && phase6_last != token_type_rparen)
1079     {
1080       char *sum = tp->string;
1081       size_t sum_len = strlen (sum);
1082 
1083       for (;;)
1084 	{
1085 	  token_ty token2;
1086 
1087 	  phase5_get (&token2);
1088 	  if (token2.type == token_type_plus)
1089 	    {
1090 	      token_ty token3;
1091 
1092 	      phase5_get (&token3);
1093 	      if (token3.type == token_type_string_literal)
1094 		{
1095 		  token_ty token_after;
1096 
1097 		  phase5_get (&token_after);
1098 		  if (token_after.type != token_type_dot)
1099 		    {
1100 		      char *addend = token3.string;
1101 		      size_t addend_len = strlen (addend);
1102 
1103 		      sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1104 		      memcpy (sum + sum_len, addend, addend_len + 1);
1105 		      sum_len += addend_len;
1106 
1107 		      phase5_unget (&token_after);
1108 		      free_token (&token3);
1109 		      free_token (&token2);
1110 		      continue;
1111 		    }
1112 		  phase5_unget (&token_after);
1113 		}
1114 	      phase5_unget (&token3);
1115 	    }
1116 	  phase5_unget (&token2);
1117 	  break;
1118 	}
1119       tp->string = sum;
1120     }
1121   phase6_last = tp->type;
1122 }
1123 
1124 /* Supports 2 tokens of pushback.  */
1125 static void
phase6_unget(token_ty * tp)1126 phase6_unget (token_ty *tp)
1127 {
1128   if (tp->type != token_type_eof)
1129     {
1130       if (phase6_pushback_length == SIZEOF (phase6_pushback))
1131 	abort ();
1132       phase6_pushback[phase6_pushback_length++] = *tp;
1133     }
1134 }
1135 
1136 
1137 static void
x_java_lex(token_ty * tp)1138 x_java_lex (token_ty *tp)
1139 {
1140   phase6_get (tp);
1141 }
1142 
1143 /* Supports 2 tokens of pushback.  */
1144 static void
x_java_unlex(token_ty * tp)1145 x_java_unlex (token_ty *tp)
1146 {
1147   phase6_unget (tp);
1148 }
1149 
1150 
1151 /* ========================= Extracting strings.  ========================== */
1152 
1153 
1154 /* Context lookup table.  */
1155 static flag_context_list_table_ty *flag_context_list_table;
1156 
1157 
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Java program, and leave the complaints about
   the grammar to the compiler.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */
1171 
1172 
1173 /* Extract messages until the next balanced closing parenthesis or brace,
1174    depending on TERMINATOR.
1175    Extracted messages are added to MLP.
1176    Return true upon eof, false upon closing parenthesis or brace.  */
1177 static bool
extract_parenthesized(message_list_ty * mlp,token_type_ty terminator,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1178 extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1179 		       flag_context_ty outer_context,
1180 		       flag_context_list_iterator_ty context_iter,
1181 		       struct arglist_parser *argparser)
1182 {
1183   /* Current argument number.  */
1184   int arg = 1;
1185   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1186   int state;
1187   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1188   const struct callshapes *next_shapes = NULL;
1189   /* Context iterator that will be used if the next token is a '('.  */
1190   flag_context_list_iterator_ty next_context_iter =
1191     passthrough_context_list_iterator;
1192   /* Current context.  */
1193   flag_context_ty inner_context =
1194     inherited_context (outer_context,
1195 		       flag_context_list_iterator_advance (&context_iter));
1196 
1197   /* Start state is 0.  */
1198   state = 0;
1199 
1200   for (;;)
1201     {
1202       token_ty token;
1203 
1204       x_java_lex (&token);
1205       switch (token.type)
1206 	{
1207 	case token_type_symbol:
1208 	  {
1209 	    /* Combine symbol1 . ... . symbolN to a single strings, so that
1210 	       we can recognize static function calls like
1211 	       GettextResource.gettext.  The information present for
1212 	       symbolI.....symbolN has precedence over the information for
1213 	       symbolJ.....symbolN with J > I.  */
1214 	    char *sum = token.string;
1215 	    size_t sum_len = strlen (sum);
1216 	    const char *dottedname;
1217 	    flag_context_list_ty *context_list;
1218 
1219 	    for (;;)
1220 	      {
1221 		token_ty token2;
1222 
1223 		x_java_lex (&token2);
1224 		if (token2.type == token_type_dot)
1225 		  {
1226 		    token_ty token3;
1227 
1228 		    x_java_lex (&token3);
1229 		    if (token3.type == token_type_symbol)
1230 		      {
1231 			char *addend = token3.string;
1232 			size_t addend_len = strlen (addend);
1233 
1234 			sum =
1235 			  (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1236 			sum[sum_len] = '.';
1237 			memcpy (sum + sum_len + 1, addend, addend_len + 1);
1238 			sum_len += 1 + addend_len;
1239 
1240 			free_token (&token3);
1241 			free_token (&token2);
1242 			continue;
1243 		      }
1244 		    x_java_unlex (&token3);
1245 		  }
1246 		x_java_unlex (&token2);
1247 		break;
1248 	      }
1249 
1250 	    for (dottedname = sum;;)
1251 	      {
1252 		void *keyword_value;
1253 
1254 		if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
1255 				     &keyword_value)
1256 		    == 0)
1257 		  {
1258 		    next_shapes = (const struct callshapes *) keyword_value;
1259 		    state = 1;
1260 		    break;
1261 		  }
1262 
1263 		dottedname = strchr (dottedname, '.');
1264 		if (dottedname == NULL)
1265 		  {
1266 		    state = 0;
1267 		    break;
1268 		  }
1269 		dottedname++;
1270 	      }
1271 
1272 	    for (dottedname = sum;;)
1273 	      {
1274 		context_list =
1275 		  flag_context_list_table_lookup (
1276 		    flag_context_list_table,
1277 		    dottedname, strlen (dottedname));
1278 		if (context_list != NULL)
1279 		  break;
1280 
1281 		dottedname = strchr (dottedname, '.');
1282 		if (dottedname == NULL)
1283 		  break;
1284 		dottedname++;
1285 	      }
1286 	    next_context_iter = flag_context_list_iterator (context_list);
1287 
1288 	    free (sum);
1289 	    continue;
1290 	  }
1291 
1292 	case token_type_lparen:
1293 	  if (extract_parenthesized (mlp, token_type_rparen,
1294 				     inner_context, next_context_iter,
1295 				     arglist_parser_alloc (mlp,
1296 							   state ? next_shapes : NULL)))
1297 	    {
1298 	      xgettext_current_source_encoding = po_charset_utf8;
1299 	      arglist_parser_done (argparser, arg);
1300 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1301 	      return true;
1302 	    }
1303 	  next_context_iter = null_context_list_iterator;
1304 	  state = 0;
1305 	  continue;
1306 
1307 	case token_type_rparen:
1308 	  if (terminator == token_type_rparen)
1309 	    {
1310 	      xgettext_current_source_encoding = po_charset_utf8;
1311 	      arglist_parser_done (argparser, arg);
1312 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1313 	      return false;
1314 	    }
1315 	  if (terminator == token_type_rbrace)
1316 	    {
1317 	      error_with_progname = false;
1318 	      error (0, 0,
1319 		     _("%s:%d: warning: ')' found where '}' was expected"),
1320 		     logical_file_name, token.line_number);
1321 	      error_with_progname = true;
1322 	    }
1323 	  next_context_iter = null_context_list_iterator;
1324 	  state = 0;
1325 	  continue;
1326 
1327 	case token_type_lbrace:
1328 	  if (extract_parenthesized (mlp, token_type_rbrace,
1329 				     null_context, null_context_list_iterator,
1330 				     arglist_parser_alloc (mlp, NULL)))
1331 	    {
1332 	      xgettext_current_source_encoding = po_charset_utf8;
1333 	      arglist_parser_done (argparser, arg);
1334 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1335 	      return true;
1336 	    }
1337 	  next_context_iter = null_context_list_iterator;
1338 	  state = 0;
1339 	  continue;
1340 
1341 	case token_type_rbrace:
1342 	  if (terminator == token_type_rbrace)
1343 	    {
1344 	      xgettext_current_source_encoding = po_charset_utf8;
1345 	      arglist_parser_done (argparser, arg);
1346 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1347 	      return false;
1348 	    }
1349 	  if (terminator == token_type_rparen)
1350 	    {
1351 	      error_with_progname = false;
1352 	      error (0, 0,
1353 		     _("%s:%d: warning: '}' found where ')' was expected"),
1354 		     logical_file_name, token.line_number);
1355 	      error_with_progname = true;
1356 	    }
1357 	  next_context_iter = null_context_list_iterator;
1358 	  state = 0;
1359 	  continue;
1360 
1361 	case token_type_comma:
1362 	  arg++;
1363 	  inner_context =
1364 	    inherited_context (outer_context,
1365 			       flag_context_list_iterator_advance (
1366 				 &context_iter));
1367 	  next_context_iter = passthrough_context_list_iterator;
1368 	  state = 0;
1369 	  continue;
1370 
1371 	case token_type_string_literal:
1372 	  {
1373 	    lex_pos_ty pos;
1374 	    pos.file_name = logical_file_name;
1375 	    pos.line_number = token.line_number;
1376 
1377 	    xgettext_current_source_encoding = po_charset_utf8;
1378 	    if (extract_all)
1379 	      remember_a_message (mlp, NULL, token.string, inner_context,
1380 				  &pos, token.comment);
1381 	    else
1382 	      arglist_parser_remember (argparser, arg, token.string,
1383 				       inner_context,
1384 				       pos.file_name, pos.line_number,
1385 				       token.comment);
1386 	    xgettext_current_source_encoding = xgettext_global_source_encoding;
1387 	  }
1388 	  drop_reference (token.comment);
1389 	  next_context_iter = null_context_list_iterator;
1390 	  state = 0;
1391 	  continue;
1392 
1393 	case token_type_eof:
1394 	  xgettext_current_source_encoding = po_charset_utf8;
1395 	  arglist_parser_done (argparser, arg);
1396 	  xgettext_current_source_encoding = xgettext_global_source_encoding;
1397 	  return true;
1398 
1399 	case token_type_dot:
1400 	case token_type_number:
1401 	case token_type_plus:
1402 	case token_type_other:
1403 	  next_context_iter = null_context_list_iterator;
1404 	  state = 0;
1405 	  continue;
1406 
1407 	default:
1408 	  abort ();
1409 	}
1410     }
1411 }
1412 
1413 
1414 void
extract_java(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1415 extract_java (FILE *f,
1416 	      const char *real_filename, const char *logical_filename,
1417 	      flag_context_list_table_ty *flag_table,
1418 	      msgdomain_list_ty *mdlp)
1419 {
1420   message_list_ty *mlp = mdlp->item[0]->messages;
1421 
1422   fp = f;
1423   real_file_name = real_filename;
1424   logical_file_name = xstrdup (logical_filename);
1425   line_number = 1;
1426 
1427   last_comment_line = -1;
1428   last_non_comment_line = -1;
1429 
1430   phase6_last = token_type_eof;
1431 
1432   flag_context_list_table = flag_table;
1433 
1434   init_keywords ();
1435 
1436   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1437      due to an unbalanced closing parenthesis, just restart it.  */
1438   while (!extract_parenthesized (mlp, token_type_eof,
1439 				 null_context, null_context_list_iterator,
1440 				 arglist_parser_alloc (mlp, NULL)))
1441     ;
1442 
1443   fp = NULL;
1444   real_file_name = NULL;
1445   logical_file_name = NULL;
1446   line_number = 0;
1447 }
1448