1 /* Reading Java .properties files.
2 Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
22
23 /* Specification. */
24 #include "read-properties.h"
25
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32
33 #include "error.h"
34 #include "error-progname.h"
35 #include "message.h"
36 #include "read-catalog-abstract.h"
37 #include "xalloc.h"
38 #include "xvasprintf.h"
39 #include "po-xerror.h"
40 #include "msgl-ascii.h"
41 #include "utf16-ucs4.h"
42 #include "ucs4-utf8.h"
43 #include "gettext.h"
44
45 #define _(str) gettext (str)
46
47 /* The format of the Java .properties files is documented in the JDK
48 documentation for class java.util.Properties. In the case of .properties
49 files for PropertyResourceBundle, each non-comment line contains a
50 key/value pair in the form "key = value" or "key : value" or "key value",
51 where the key is the msgid and the value is the msgstr. Messages with
52 plurals are not supported in this format. */
53
54 /* Handling of comments: We copy all comments from the .properties file to
55 the PO file. This is not really needed; it's a service for translators
56 who don't like PO files and prefer to maintain the .properties file. */
57
58 /* Real filename, used in error messages about the input file. */
59 static const char *real_file_name;
60
61 /* File name and line number. */
62 extern lex_pos_ty gram_pos;
63
64 /* The input file stream. */
65 static FILE *fp;
66
67
68 /* Phase 1: Read an ISO-8859-1 character.
69 Max. 1 pushback character. */
70
71 static int
phase1_getc()72 phase1_getc ()
73 {
74 int c;
75
76 c = getc (fp);
77
78 if (c == EOF)
79 {
80 if (ferror (fp))
81 {
82 const char *errno_description = strerror (errno);
83 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
84 xasprintf ("%s: %s",
85 xasprintf (_("error while reading \"%s\""),
86 real_file_name),
87 errno_description));
88 }
89 return EOF;
90 }
91
92 return c;
93 }
94
95 static inline void
phase1_ungetc(int c)96 phase1_ungetc (int c)
97 {
98 if (c != EOF)
99 ungetc (c, fp);
100 }
101
102
103 /* Phase 2: Read an ISO-8859-1 character, treating CR/LF like a single LF.
104 Max. 2 pushback characters. */
105
106 static unsigned char phase2_pushback[2];
107 static int phase2_pushback_length;
108
109 static int
phase2_getc()110 phase2_getc ()
111 {
112 int c;
113
114 if (phase2_pushback_length)
115 c = phase2_pushback[--phase2_pushback_length];
116 else
117 {
118 c = phase1_getc ();
119
120 if (c == '\r')
121 {
122 int c2 = phase1_getc ();
123 if (c2 == '\n')
124 c = c2;
125 else
126 phase1_ungetc (c2);
127 }
128 }
129
130 if (c == '\n')
131 gram_pos.line_number++;
132
133 return c;
134 }
135
136 static void
phase2_ungetc(int c)137 phase2_ungetc (int c)
138 {
139 if (c == '\n')
140 --gram_pos.line_number;
141 if (c != EOF)
142 phase2_pushback[phase2_pushback_length++] = c;
143 }
144
145
/* Phase 3: Read an ISO-8859-1 character as phase 2 does, and additionally
   join continuation lines: a backslash directly followed by a newline is
   dropped, together with any whitespace that starts the following line.
   A backslash followed by anything else is returned as is.
   Max. 1 pushback character.  */

static int
phase3_getc ()
{
  int ch = phase2_getc ();

  while (ch == '\\')
    {
      int next = phase2_getc ();

      if (next != '\n')
        {
          /* An ordinary backslash: hand it out and keep what follows for
             the next call.  */
          phase2_ungetc (next);
          return '\\';
        }

      /* Backslash-newline: swallow it and the leading whitespace of the
         continuation line, then examine the first remaining character.  */
      do
        ch = phase2_getc ();
      while (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\f');
    }

  return ch;
}
173
/* Push C back so that the next phase3_getc call sees it again.
   This forwards to phase2_ungetc, i.e. phase 3 shares the pushback buffer
   of phase 2.  */
static inline void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
179
180
181 /* Phase 4: Read an UTF-16 codepoint, treating CR/LF like a single LF,
182 with handling of continuation lines and of \uxxxx sequences. */
183
184 static int
phase4_getuc()185 phase4_getuc ()
186 {
187 int c = phase3_getc ();
188
189 if (c == EOF)
190 return -1;
191 if (c == '\\')
192 {
193 int c2 = phase3_getc ();
194
195 if (c2 == 't')
196 return '\t';
197 if (c2 == 'n')
198 return '\n';
199 if (c2 == 'r')
200 return '\r';
201 if (c2 == 'f')
202 return '\f';
203 if (c2 == 'u')
204 {
205 unsigned int n = 0;
206 int i;
207
208 for (i = 0; i < 4; i++)
209 {
210 int c1 = phase3_getc ();
211
212 if (c1 >= '0' && c1 <= '9')
213 n = (n << 4) + (c1 - '0');
214 else if (c1 >= 'A' && c1 <= 'F')
215 n = (n << 4) + (c1 - 'A' + 10);
216 else if (c1 >= 'a' && c1 <= 'f')
217 n = (n << 4) + (c1 - 'a' + 10);
218 else
219 {
220 phase3_ungetc (c1);
221 po_xerror (PO_SEVERITY_ERROR, NULL,
222 real_file_name, gram_pos.line_number, (size_t)(-1),
223 false, _("warning: invalid \\uxxxx syntax for Unicode character"));
224 return 'u';
225 }
226 }
227 return n;
228 }
229
230 return c2;
231 }
232 else
233 return c;
234 }
235
236
/* Converts a NUL-terminated string from ISO-8859-1 encoding to UTF-8
   encoding.
   If STRING is pure ASCII, STRING itself is returned.  Otherwise the result
   is a freshly xmalloc'ed string and STRING is left unmodified.  */
static char *
conv_from_iso_8859_1 (char *string)
{
  size_t length;
  size_t i;
  unsigned char *result;
  unsigned char *out;

  if (is_ascii_string (string))
    return string;

  length = strlen (string);
  /* Worst case: every input byte expands to a 2-byte sequence.  */
  result = (unsigned char *) xmalloc (2 * length + 1);
  out = result;

  for (i = 0; i < length; i++)
    {
      unsigned int uc = (unsigned char) string[i];
      int count = u8_uctomb (out, uc, 6);

      assert (count > 0);
      out += count;
    }
  *out = '\0';
  assert (out - result <= 2 * length);

  return (char *) result;
}
265
266
/* Returns the value (0..15) of the hexadecimal digit C, or -1 if C is not
   a hexadecimal digit.  */
static int
conv_from_java_hexval (int c)
{
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  return -1;
}

/* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
   encoding.  Destructively modifies the argument string and returns it.
   A \uxxxx sequence with four hexadecimal digits is replaced by the UTF-8
   encoding of that character.  A high surrogate \uD800..\uDBFF must be
   directly followed by a low surrogate \uDC00..\uDFFF; the pair is combined
   into one character.
   Everything that cannot be converted - an incomplete escape, an unpaired
   high surrogate, or a code point that u8_uctomb refuses to encode - is
   copied through literally.  */
static char *
conv_from_java (char *string)
{
  /* This conversion can only shrink the string, never increase its size:
     a 6-byte escape yields at most 3 bytes and a 12-byte surrogate pair
     yields 4 bytes.  So there is no need to xmalloc the result freshly,
     and the write pointer never overtakes the read pointer.  */
  const char *p = string;
  unsigned char *q = (unsigned char *) string;

  while (*p != '\0')
    {
      if (p[0] == '\\' && p[1] == 'u')
        {
          unsigned int n = 0;
          unsigned int uc;
          size_t consumed;
          int count;
          int i;

          /* The scan stops at the first non-digit, so it never reads past
             the terminating NUL.  */
          for (i = 0; i < 4; i++)
            {
              int v = conv_from_java_hexval ((unsigned char) p[2 + i]);

              if (v < 0)
                goto just_one_byte;
              n = (n << 4) + v;
            }

          if (n >= 0xd800 && n < 0xdc00)
            {
              /* High surrogate: needs a following low surrogate escape.  */
              unsigned int m = 0;

              if (!(p[6] == '\\' && p[7] == 'u'))
                goto just_one_byte;

              for (i = 0; i < 4; i++)
                {
                  int v = conv_from_java_hexval ((unsigned char) p[8 + i]);

                  if (v < 0)
                    goto just_one_byte;
                  m = (m << 4) + v;
                }

              if (!(m >= 0xdc00 && m < 0xe000))
                goto just_one_byte;

              /* Combine two UTF-16 words to a character.  */
              uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
              consumed = 12;
            }
          else
            {
              uc = n;
              consumed = 6;
            }

          /* Only advance when the character was really encoded.  Adding a
             non-positive result (returned e.g. for a lone low surrogate by
             u8_uctomb implementations that reject surrogates) to q would
             move the write pointer backwards.  */
          count = u8_uctomb (q, uc, 6);
          if (count > 0)
            {
              q += count;
              p += consumed;
              continue;
            }
        }
    just_one_byte:
      *q++ = (unsigned char) *p++;
    }
  *q = '\0';
  return string;
}
350
351
/* Reads a key (IN_KEY true) or a value (IN_KEY false) string.
   Returns the string in UTF-8 encoding, freshly xmalloc'ed, or NULL if the
   end of the logical line is reached before any string character.
   Parsing ends:
     - when returning NULL, after the end of the logical line,
     - otherwise, if in_key is true, after the whitespace and possibly the
       separator ('=' or ':') that follows after the string,
     - otherwise, if in_key is false, after the end of the logical line.  */

static char *
read_escaped_string (bool in_key)
{
  /* Accumulator of UTF-16 code units; kept across calls so that the
     allocation is reused.  */
  static unsigned short *units;
  static size_t units_allocated;
  static size_t units_count;
  int c;

  /* Skip whitespace before the string.  */
  do
    c = phase3_getc ();
  while (c == ' ' || c == '\t' || c == '\r' || c == '\f');

  /* Empty string.  */
  if (c == EOF || c == '\n')
    return NULL;

  /* Start accumulating the string.  The string is stored as UTF-16 first
     and converted to UTF-8 afterwards, because it can contain surrogates
     like \uD800\uDF00 which must be combined into a single UTF-8
     character.  */
  units_count = 0;
  for (;;)
    {
      if (in_key)
        {
          bool is_blank = (c == ' ' || c == '\t' || c == '\r' || c == '\f');

          if (is_blank || c == '=' || c == ':')
            {
              /* Skip whitespace after the string.  */
              while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
                c = phase3_getc ();
              /* Skip '=' or ':' separator; anything else belongs to the
                 value and is put back.  */
              if (c != '=' && c != ':')
                phase3_ungetc (c);
              break;
            }
        }

      /* C was only a lookahead; give it back and let phase 4 decode it,
         including a possible escape sequence.  */
      phase3_ungetc (c);

      /* Read the next UTF-16 codepoint.  */
      c = phase4_getuc ();
      if (c < 0)
        break;

      /* Append it to the buffer.  */
      if (units_count >= units_allocated)
        {
          units_allocated += 100;
          units = xrealloc (units, units_allocated * sizeof (unsigned short));
        }
      units[units_count] = c;
      units_count++;

      c = phase3_getc ();
      if (c == EOF || c == '\n')
        {
          /* After a key, leave the line end for the value reader.  */
          if (in_key)
            phase3_ungetc (c);
          break;
        }
    }

  /* Now convert from UTF-16 to UTF-8.  */
  {
    /* Each UTF-16 word needs 3 bytes at worst.  */
    unsigned char *utf8_string = (unsigned char *) xmalloc (3 * units_count + 1);
    unsigned char *out = utf8_string;
    size_t pos = 0;

    while (pos < units_count)
      {
        unsigned int uc;
        int count;

        pos += u16_mbtouc (&uc, units + pos, units_count - pos);
        count = u8_uctomb (out, uc, 6);
        assert (count > 0);
        out += count;
      }
    *out = '\0';
    assert (out - utf8_string <= 3 * units_count);

    return (char *) utf8_string;
  }
}
444
445
446 /* Read a .properties file from a stream, and dispatch to the various
447 abstract_catalog_reader_class_ty methods. */
448 static void
properties_parse(abstract_catalog_reader_ty * this,FILE * file,const char * real_filename,const char * logical_filename)449 properties_parse (abstract_catalog_reader_ty *this, FILE *file,
450 const char *real_filename, const char *logical_filename)
451 {
452 fp = file;
453 real_file_name = real_filename;
454 gram_pos.file_name = xstrdup (real_file_name);
455 gram_pos.line_number = 1;
456
457 for (;;)
458 {
459 int c;
460 bool comment;
461 bool hidden;
462
463 c = phase2_getc ();
464
465 if (c == EOF)
466 break;
467
468 comment = false;
469 hidden = false;
470 if (c == '#')
471 comment = true;
472 else if (c == '!')
473 {
474 /* For compatibility with write-properties.c, we treat '!' not
475 followed by space as a fuzzy or untranslated message. */
476 int c2 = phase2_getc ();
477 if (c2 == ' ' || c2 == '\n' || c2 == EOF)
478 comment = true;
479 else
480 hidden = true;
481 phase2_ungetc (c2);
482 }
483 else
484 phase2_ungetc (c);
485
486 if (comment)
487 {
488 /* A comment line. */
489 static char *buffer;
490 static size_t bufmax;
491 static size_t buflen;
492
493 buflen = 0;
494 for (;;)
495 {
496 c = phase2_getc ();
497
498 if (buflen >= bufmax)
499 {
500 bufmax += 100;
501 buffer = xrealloc (buffer, bufmax);
502 }
503
504 if (c == EOF || c == '\n')
505 break;
506
507 buffer[buflen++] = c;
508 }
509 buffer[buflen] = '\0';
510
511 po_callback_comment_dispatcher (conv_from_java (conv_from_iso_8859_1 (buffer)));
512 }
513 else
514 {
515 /* A key/value pair. */
516 char *msgid;
517 lex_pos_ty msgid_pos;
518
519 msgid_pos = gram_pos;
520 msgid = read_escaped_string (true);
521 if (msgid == NULL)
522 /* Skip blank line. */
523 ;
524 else
525 {
526 char *msgstr;
527 lex_pos_ty msgstr_pos;
528 bool force_fuzzy;
529
530 msgstr_pos = gram_pos;
531 msgstr = read_escaped_string (false);
532 if (msgstr == NULL)
533 msgstr = xstrdup ("");
534
535 /* Be sure to make the message fuzzy if it was commented out
536 and if it is not already header/fuzzy/untranslated. */
537 force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');
538
539 po_callback_message (NULL, msgid, &msgid_pos, NULL,
540 msgstr, strlen (msgstr) + 1, &msgstr_pos,
541 NULL, NULL, NULL,
542 force_fuzzy, false);
543 }
544 }
545 }
546
547 fp = NULL;
548 real_file_name = NULL;
549 gram_pos.line_number = 0;
550 }
551
/* Descriptor of the Java .properties input format: the parser entry point
   and a flag stating that the parser produces UTF-8 strings.  */
const struct catalog_input_format input_format_properties =
{
  properties_parse,                     /* parse */
  true                                  /* produces_utf8 */
};
557