xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/read-properties.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* Reading Java .properties files.
2    Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2003.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 2, or (at your option)
8    any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, write to the Free Software Foundation,
17    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
22 
23 /* Specification.  */
24 #include "read-properties.h"
25 
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 
33 #include "error.h"
34 #include "error-progname.h"
35 #include "message.h"
36 #include "read-catalog-abstract.h"
37 #include "xalloc.h"
38 #include "xvasprintf.h"
39 #include "po-xerror.h"
40 #include "msgl-ascii.h"
41 #include "utf16-ucs4.h"
42 #include "ucs4-utf8.h"
43 #include "gettext.h"
44 
45 #define _(str) gettext (str)
46 
47 /* The format of the Java .properties files is documented in the JDK
48    documentation for class java.util.Properties.  In the case of .properties
49    files for PropertyResourceBundle, each non-comment line contains a
50    key/value pair in the form "key = value" or "key : value" or "key value",
51    where the key is the msgid and the value is the msgstr.  Messages with
52    plurals are not supported in this format.  */
53 
54 /* Handling of comments: We copy all comments from the .properties file to
55    the PO file. This is not really needed; it's a service for translators
56    who don't like PO files and prefer to maintain the .properties file.  */
57 
58 /* Real filename, used in error messages about the input file.  */
59 static const char *real_file_name;
60 
61 /* File name and line number.  */
62 extern lex_pos_ty gram_pos;
63 
64 /* The input file stream.  */
65 static FILE *fp;
66 
67 
68 /* Phase 1: Read an ISO-8859-1 character.
69    Max. 1 pushback character.  */
70 
71 static int
phase1_getc()72 phase1_getc ()
73 {
74   int c;
75 
76   c = getc (fp);
77 
78   if (c == EOF)
79     {
80       if (ferror (fp))
81 	{
82 	  const char *errno_description = strerror (errno);
83 	  po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
84 		     xasprintf ("%s: %s",
85 				xasprintf (_("error while reading \"%s\""),
86 					   real_file_name),
87 				errno_description));
88 	}
89       return EOF;
90     }
91 
92   return c;
93 }
94 
95 static inline void
phase1_ungetc(int c)96 phase1_ungetc (int c)
97 {
98   if (c != EOF)
99     ungetc (c, fp);
100 }
101 
102 
103 /* Phase 2: Read an ISO-8859-1 character, treating CR/LF like a single LF.
104    Max. 2 pushback characters.  */
105 
106 static unsigned char phase2_pushback[2];
107 static int phase2_pushback_length;
108 
109 static int
phase2_getc()110 phase2_getc ()
111 {
112   int c;
113 
114   if (phase2_pushback_length)
115     c = phase2_pushback[--phase2_pushback_length];
116   else
117     {
118       c = phase1_getc ();
119 
120       if (c == '\r')
121 	{
122 	  int c2 = phase1_getc ();
123 	  if (c2 == '\n')
124 	    c = c2;
125 	  else
126 	    phase1_ungetc (c2);
127 	}
128     }
129 
130   if (c == '\n')
131     gram_pos.line_number++;
132 
133   return c;
134 }
135 
136 static void
phase2_ungetc(int c)137 phase2_ungetc (int c)
138 {
139   if (c == '\n')
140     --gram_pos.line_number;
141   if (c != EOF)
142     phase2_pushback[phase2_pushback_length++] = c;
143 }
144 
145 
/* Phase 3: Read an ISO-8859-1 character, treating CR/LF like a single LF,
   with handling of continuation lines.
   Max. 1 pushback character.  */

/* Return the next character after joining continuation lines.
   A backslash directly followed by a newline is dropped together with the
   spaces, tabs, carriage returns and form feeds that start the following
   line.  A backslash followed by anything else is returned as '\\', and
   the character after it is left for the next read.  */
static int
phase3_getc ()
{
  int ch = phase2_getc ();

  while (ch == '\\')
    {
      int next = phase2_getc ();

      if (next != '\n')
        {
          /* Ordinary backslash: hand the follower back and return the
             backslash itself (CH still holds it).  */
          phase2_ungetc (next);
          break;
        }

      /* Skip the backslash-newline and all whitespace that follows it.  */
      for (;;)
        {
          ch = phase2_getc ();
          if (!(ch == ' ' || ch == '\t' || ch == '\r' || ch == '\f'))
            break;
        }
    }

  return ch;
}
173 
/* Push C back for the next phase3_getc.  Phase 3 keeps no pushback
   storage of its own; the character goes onto the phase 2 stack.  */
static inline void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
179 
180 
181 /* Phase 4: Read an UTF-16 codepoint, treating CR/LF like a single LF,
182    with handling of continuation lines and of \uxxxx sequences.  */
183 
184 static int
phase4_getuc()185 phase4_getuc ()
186 {
187   int c = phase3_getc ();
188 
189   if (c == EOF)
190     return -1;
191   if (c == '\\')
192     {
193       int c2 = phase3_getc ();
194 
195       if (c2 == 't')
196 	return '\t';
197       if (c2 == 'n')
198 	return '\n';
199       if (c2 == 'r')
200 	return '\r';
201       if (c2 == 'f')
202 	return '\f';
203       if (c2 == 'u')
204 	{
205 	  unsigned int n = 0;
206 	  int i;
207 
208 	  for (i = 0; i < 4; i++)
209 	    {
210 	      int c1 = phase3_getc ();
211 
212 	      if (c1 >= '0' && c1 <= '9')
213 		n = (n << 4) + (c1 - '0');
214 	      else if (c1 >= 'A' && c1 <= 'F')
215 		n = (n << 4) + (c1 - 'A' + 10);
216 	      else if (c1 >= 'a' && c1 <= 'f')
217 		n = (n << 4) + (c1 - 'a' + 10);
218 	      else
219 		{
220 		  phase3_ungetc (c1);
221 		  po_xerror (PO_SEVERITY_ERROR, NULL,
222 			     real_file_name, gram_pos.line_number, (size_t)(-1),
223 			     false, _("warning: invalid \\uxxxx syntax for Unicode character"));
224 		  return 'u';
225 		}
226 	    }
227 	  return n;
228 	}
229 
230       return c2;
231     }
232   else
233     return c;
234 }
235 
236 
/* Converts a string from ISO-8859-1 encoding to UTF-8 encoding.
   A pure ASCII STRING is returned unchanged (the very same pointer).
   Otherwise the result is a freshly allocated string and STRING itself is
   left untouched.  */
static char *
conv_from_iso_8859_1 (char *string)
{
  size_t length;
  size_t i;
  unsigned char *converted;
  unsigned char *out;

  if (is_ascii_string (string))
    return string;

  length = strlen (string);
  /* Each ISO-8859-1 character needs 2 bytes at worst.  */
  converted = (unsigned char *) xmalloc (2 * length + 1);
  out = converted;

  for (i = 0; i < length; i++)
    {
      unsigned int uc = (unsigned char) string[i];
      int written = u8_uctomb (out, uc, 6);

      assert (written > 0);
      out += written;
    }
  *out = '\0';
  assert (out - converted <= 2 * length);

  return (char *) converted;
}
265 
266 
/* Parse a "\uXXXX" escape (backslash, 'u', exactly four hexadecimal
   digits) at the start of S.  On success store the 16-bit value in *VALUEP
   and return true; otherwise return false and leave *VALUEP alone.
   Reading stops at the first character that does not fit, so S is never
   read beyond its terminating NUL.  */
static bool
java_escape_value (const char *s, unsigned int *valuep)
{
  unsigned int value = 0;
  int i;

  if (!(s[0] == '\\' && s[1] == 'u'))
    return false;

  for (i = 0; i < 4; i++)
    {
      int c1 = (unsigned char) s[2 + i];

      if (c1 >= '0' && c1 <= '9')
        value = (value << 4) + (c1 - '0');
      else if (c1 >= 'A' && c1 <= 'F')
        value = (value << 4) + (c1 - 'A' + 10);
      else if (c1 >= 'a' && c1 <= 'f')
        value = (value << 4) + (c1 - 'a' + 10);
      else
        return false;
    }

  *valuep = value;
  return true;
}

/* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
   encoding.  May destructively modify the argument string.

   The conversion is done in place and STRING is returned.  A high
   surrogate escape immediately followed by a low surrogate escape is
   combined into one character.  Everything that cannot be converted is
   copied through literally: malformed escapes, an unpaired high or low
   surrogate escape, and any escape that u8_uctomb refuses to encode.  */
static char *
conv_from_java (char *string)
{
  /* This conversion can only shrink the string, never increase its size.
     So there is no need to xmalloc the result freshly.  */
  const char *p = string;
  unsigned char *q = (unsigned char *) string;

  while (*p != '\0')
    {
      unsigned int n;

      if (java_escape_value (p, &n))
        {
          unsigned int uc = n;
          size_t consumed = 6;
          bool convertible = true;

          if (n >= 0xd800 && n < 0xdc00)
            {
              /* High surrogate: only meaningful together with a directly
                 following low surrogate escape.  p[6] is within the string
                 (possibly its NUL) because p[0..5] are all non-NUL.  */
              unsigned int m;

              if (java_escape_value (p + 6, &m)
                  && m >= 0xdc00 && m < 0xe000)
                {
                  /* Combine two UTF-16 words to a character.  */
                  uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
                  consumed = 12;
                }
              else
                convertible = false;
            }
          else if (n >= 0xdc00 && n < 0xe000)
            /* A low surrogate without a preceding high surrogate is not a
               character; keep the escape text, like for an unpaired high
               surrogate.  */
            convertible = false;

          if (convertible)
            {
              /* The escape has been fully decoded into UC already, so it
                 does not matter that Q may overwrite its text.
                 NOTE(review): assumes u8_uctomb stores nothing when it
                 fails -- confirm against the ucs4-utf8.h in use.  */
              int count = u8_uctomb (q, uc, 6);

              if (count > 0)
                {
                  q += count;
                  p += consumed;
                  continue;
                }
            }
        }

      /* Not a convertible escape: copy a single byte unchanged.  */
      *q++ = (unsigned char) *p++;
    }
  *q = '\0';
  return string;
}
350 
351 
/* Tell whether C is one of the blank characters that may surround keys and
   values: space, tab, carriage return or form feed.  */
static inline bool
is_properties_whitespace (int c)
{
  return c == ' ' || c == '\t' || c == '\r' || c == '\f';
}

/* Reads a key or value string.
   Returns the string in UTF-8 encoding, or NULL if the end of the logical
   line is reached.  A non-NULL result is freshly allocated.
   Parsing ends:
     - when returning NULL, after the end of the logical line,
     - otherwise, if in_key is true, after the whitespace and possibly the
       separator that follows after the string,
     - otherwise, if in_key is false, after the end of the logical line. */

static char *
read_escaped_string (bool in_key)
{
  /* Scratch storage for the UTF-16 form; kept across calls and only ever
     grown.  */
  static unsigned short *units;
  static size_t units_allocated;
  static size_t units_count;
  int c;

  /* Skip whitespace before the string.  */
  for (;;)
    {
      c = phase3_getc ();
      if (!is_properties_whitespace (c))
        break;
    }

  if (c == EOF || c == '\n')
    /* Empty string.  */
    return NULL;

  /* Start accumulating the string.  We store the string in UTF-16 before
     converting it to UTF-8.  Why not converting every character directly to
     UTF-8? Because a string can contain surrogates like \uD800\uDF00, and
     we must combine them to a single UTF-8 character.  */
  units_count = 0;
  for (;;)
    {
      bool key_ends_here =
        in_key && (c == '=' || c == ':' || is_properties_whitespace (c));

      if (key_ends_here)
        {
          /* Skip whitespace after the string.  */
          while (is_properties_whitespace (c))
            c = phase3_getc ();
          /* Skip '=' or ':' separator; anything else belongs to the
             value and is handed back.  */
          if (c != '=' && c != ':')
            phase3_ungetc (c);
          break;
        }

      /* C was only a lookahead; return it so that phase 4 sees it.  */
      phase3_ungetc (c);

      /* Read the next UTF-16 codepoint.  */
      c = phase4_getuc ();
      if (c < 0)
        break;

      /* Append it to the buffer.  */
      if (units_count >= units_allocated)
        {
          units_allocated += 100;
          units = xrealloc (units, units_allocated * sizeof (unsigned short));
        }
      units[units_count++] = c;

      /* Look ahead for the end of the logical line.  */
      c = phase3_getc ();
      if (c == EOF || c == '\n')
        {
          /* After a key, leave the line end in place for the read of the
             value that follows.  */
          if (in_key)
            phase3_ungetc (c);
          break;
        }
    }

  /* Now convert from UTF-16 to UTF-8.  */
  {
    /* Each UTF-16 word needs 3 bytes at worst.  */
    unsigned char *result = (unsigned char *) xmalloc (3 * units_count + 1);
    unsigned char *out = result;
    size_t index = 0;

    while (index < units_count)
      {
        unsigned int uc;
        int written;

        index += u16_mbtouc (&uc, units + index, units_count - index);
        written = u8_uctomb (out, uc, 6);
        assert (written > 0);
        out += written;
      }
    *out = '\0';
    assert (out - result <= 3 * units_count);

    return (char *) result;
  }
}
444 
445 
446 /* Read a .properties file from a stream, and dispatch to the various
447    abstract_catalog_reader_class_ty methods.  */
448 static void
properties_parse(abstract_catalog_reader_ty * this,FILE * file,const char * real_filename,const char * logical_filename)449 properties_parse (abstract_catalog_reader_ty *this, FILE *file,
450 		  const char *real_filename, const char *logical_filename)
451 {
452   fp = file;
453   real_file_name = real_filename;
454   gram_pos.file_name = xstrdup (real_file_name);
455   gram_pos.line_number = 1;
456 
457   for (;;)
458     {
459       int c;
460       bool comment;
461       bool hidden;
462 
463       c = phase2_getc ();
464 
465       if (c == EOF)
466 	break;
467 
468       comment = false;
469       hidden = false;
470       if (c == '#')
471 	comment = true;
472       else if (c == '!')
473 	{
474 	  /* For compatibility with write-properties.c, we treat '!' not
475 	     followed by space as a fuzzy or untranslated message.  */
476 	  int c2 = phase2_getc ();
477 	  if (c2 == ' ' || c2 == '\n' || c2 == EOF)
478 	    comment = true;
479 	  else
480 	    hidden = true;
481 	  phase2_ungetc (c2);
482 	}
483       else
484 	phase2_ungetc (c);
485 
486       if (comment)
487 	{
488 	  /* A comment line.  */
489 	  static char *buffer;
490 	  static size_t bufmax;
491 	  static size_t buflen;
492 
493 	  buflen = 0;
494 	  for (;;)
495 	    {
496 	      c = phase2_getc ();
497 
498 	      if (buflen >= bufmax)
499 		{
500 		  bufmax += 100;
501 		  buffer = xrealloc (buffer, bufmax);
502 		}
503 
504 	      if (c == EOF || c == '\n')
505 		break;
506 
507 	      buffer[buflen++] = c;
508 	    }
509 	  buffer[buflen] = '\0';
510 
511 	  po_callback_comment_dispatcher (conv_from_java (conv_from_iso_8859_1 (buffer)));
512 	}
513       else
514 	{
515 	  /* A key/value pair.  */
516 	  char *msgid;
517 	  lex_pos_ty msgid_pos;
518 
519 	  msgid_pos = gram_pos;
520 	  msgid = read_escaped_string (true);
521 	  if (msgid == NULL)
522 	    /* Skip blank line.  */
523 	    ;
524 	  else
525 	    {
526 	      char *msgstr;
527 	      lex_pos_ty msgstr_pos;
528 	      bool force_fuzzy;
529 
530 	      msgstr_pos = gram_pos;
531 	      msgstr = read_escaped_string (false);
532 	      if (msgstr == NULL)
533 		msgstr = xstrdup ("");
534 
535 	      /* Be sure to make the message fuzzy if it was commented out
536 		 and if it is not already header/fuzzy/untranslated.  */
537 	      force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');
538 
539 	      po_callback_message (NULL, msgid, &msgid_pos, NULL,
540 				   msgstr, strlen (msgstr) + 1, &msgstr_pos,
541 				   NULL, NULL, NULL,
542 				   force_fuzzy, false);
543 	    }
544 	}
545     }
546 
547   fp = NULL;
548   real_file_name = NULL;
549   gram_pos.line_number = 0;
550 }
551 
552 const struct catalog_input_format input_format_properties =
553 {
554   properties_parse,			/* parse */
555   true					/* produces_utf8 */
556 };
557