1 /* Reading Java .properties files. 2 Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2003. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software Foundation, 17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 18 19 #ifdef HAVE_CONFIG_H 20 # include <config.h> 21 #endif 22 23 /* Specification. */ 24 #include "read-properties.h" 25 26 #include <assert.h> 27 #include <errno.h> 28 #include <stdbool.h> 29 #include <stdio.h> 30 #include <stdlib.h> 31 #include <string.h> 32 33 #include "error.h" 34 #include "error-progname.h" 35 #include "message.h" 36 #include "read-catalog-abstract.h" 37 #include "xalloc.h" 38 #include "xvasprintf.h" 39 #include "po-xerror.h" 40 #include "msgl-ascii.h" 41 #include "utf16-ucs4.h" 42 #include "ucs4-utf8.h" 43 #include "gettext.h" 44 45 #define _(str) gettext (str) 46 47 /* The format of the Java .properties files is documented in the JDK 48 documentation for class java.util.Properties. In the case of .properties 49 files for PropertyResourceBundle, each non-comment line contains a 50 key/value pair in the form "key = value" or "key : value" or "key value", 51 where the key is the msgid and the value is the msgstr. Messages with 52 plurals are not supported in this format. */ 53 54 /* Handling of comments: We copy all comments from the .properties file to 55 the PO file. This is not really needed; it's a service for translators 56 who don't like PO files and prefer to maintain the .properties file. */ 57 58 /* Real filename, used in error messages about the input file. */ 59 static const char *real_file_name; 60 61 /* File name and line number. */ 62 extern lex_pos_ty gram_pos; 63 64 /* The input file stream. */ 65 static FILE *fp; 66 67 68 /* Phase 1: Read an ISO-8859-1 character. 69 Max. 1 pushback character. */ 70 71 static int 72 phase1_getc () 73 { 74 int c; 75 76 c = getc (fp); 77 78 if (c == EOF) 79 { 80 if (ferror (fp)) 81 { 82 const char *errno_description = strerror (errno); 83 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false, 84 xasprintf ("%s: %s", 85 xasprintf (_("error while reading \"%s\""), 86 real_file_name), 87 errno_description)); 88 } 89 return EOF; 90 } 91 92 return c; 93 } 94 95 static inline void 96 phase1_ungetc (int c) 97 { 98 if (c != EOF) 99 ungetc (c, fp); 100 } 101 102 103 /* Phase 2: Read an ISO-8859-1 character, treating CR/LF like a single LF. 104 Max. 2 pushback characters. */ 105 106 static unsigned char phase2_pushback[2]; 107 static int phase2_pushback_length; 108 109 static int 110 phase2_getc () 111 { 112 int c; 113 114 if (phase2_pushback_length) 115 c = phase2_pushback[--phase2_pushback_length]; 116 else 117 { 118 c = phase1_getc (); 119 120 if (c == '\r') 121 { 122 int c2 = phase1_getc (); 123 if (c2 == '\n') 124 c = c2; 125 else 126 phase1_ungetc (c2); 127 } 128 } 129 130 if (c == '\n') 131 gram_pos.line_number++; 132 133 return c; 134 } 135 136 static void 137 phase2_ungetc (int c) 138 { 139 if (c == '\n') 140 --gram_pos.line_number; 141 if (c != EOF) 142 phase2_pushback[phase2_pushback_length++] = c; 143 } 144 145 146 /* Phase 3: Read an ISO-8859-1 character, treating CR/LF like a single LF, 147 with handling of continuation lines. 148 Max. 1 pushback character. */ 149 150 static int 151 phase3_getc () 152 { 153 int c = phase2_getc (); 154 155 for (;;) 156 { 157 if (c != '\\') 158 return c; 159 160 c = phase2_getc (); 161 if (c != '\n') 162 { 163 phase2_ungetc (c); 164 return '\\'; 165 } 166 167 /* Skip the backslash-newline and all whitespace that follows it. */ 168 do 169 c = phase2_getc (); 170 while (c == ' ' || c == '\t' || c == '\r' || c == '\f'); 171 } 172 } 173 174 static inline void 175 phase3_ungetc (int c) 176 { 177 phase2_ungetc (c); 178 } 179 180 181 /* Phase 4: Read an UTF-16 codepoint, treating CR/LF like a single LF, 182 with handling of continuation lines and of \uxxxx sequences. */ 183 184 static int 185 phase4_getuc () 186 { 187 int c = phase3_getc (); 188 189 if (c == EOF) 190 return -1; 191 if (c == '\\') 192 { 193 int c2 = phase3_getc (); 194 195 if (c2 == 't') 196 return '\t'; 197 if (c2 == 'n') 198 return '\n'; 199 if (c2 == 'r') 200 return '\r'; 201 if (c2 == 'f') 202 return '\f'; 203 if (c2 == 'u') 204 { 205 unsigned int n = 0; 206 int i; 207 208 for (i = 0; i < 4; i++) 209 { 210 int c1 = phase3_getc (); 211 212 if (c1 >= '0' && c1 <= '9') 213 n = (n << 4) + (c1 - '0'); 214 else if (c1 >= 'A' && c1 <= 'F') 215 n = (n << 4) + (c1 - 'A' + 10); 216 else if (c1 >= 'a' && c1 <= 'f') 217 n = (n << 4) + (c1 - 'a' + 10); 218 else 219 { 220 phase3_ungetc (c1); 221 po_xerror (PO_SEVERITY_ERROR, NULL, 222 real_file_name, gram_pos.line_number, (size_t)(-1), 223 false, _("warning: invalid \\uxxxx syntax for Unicode character")); 224 return 'u'; 225 } 226 } 227 return n; 228 } 229 230 return c2; 231 } 232 else 233 return c; 234 } 235 236 237 /* Converts a string from ISO-8859-1 encoding to UTF-8 encoding. */ 238 static char * 239 conv_from_iso_8859_1 (char *string) 240 { 241 if (is_ascii_string (string)) 242 return string; 243 else 244 { 245 size_t length = strlen (string); 246 /* Each ISO-8859-1 character needs 2 bytes at worst. */ 247 unsigned char *utf8_string = (unsigned char *) xmalloc (2 * length + 1); 248 unsigned char *q = utf8_string; 249 const char *str = string; 250 const char *str_limit = str + length; 251 252 while (str < str_limit) 253 { 254 unsigned int uc = (unsigned char) *str++; 255 int n = u8_uctomb (q, uc, 6); 256 assert (n > 0); 257 q += n; 258 } 259 *q = '\0'; 260 assert (q - utf8_string <= 2 * length); 261 262 return (char *) utf8_string; 263 } 264 } 265 266 267 /* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8 268 encoding. May destructively modify the argument string. */ 269 static char * 270 conv_from_java (char *string) 271 { 272 /* This conversion can only shrink the string, never increase its size. 273 So there is no need to xmalloc the result freshly. */ 274 const char *p = string; 275 unsigned char *q = (unsigned char *) string; 276 277 while (*p != '\0') 278 { 279 if (p[0] == '\\' && p[1] == 'u') 280 { 281 unsigned int n = 0; 282 int i; 283 284 for (i = 0; i < 4; i++) 285 { 286 int c1 = (unsigned char) p[2 + i]; 287 288 if (c1 >= '0' && c1 <= '9') 289 n = (n << 4) + (c1 - '0'); 290 else if (c1 >= 'A' && c1 <= 'F') 291 n = (n << 4) + (c1 - 'A' + 10); 292 else if (c1 >= 'a' && c1 <= 'f') 293 n = (n << 4) + (c1 - 'a' + 10); 294 else 295 goto just_one_byte; 296 } 297 298 if (i == 4) 299 { 300 unsigned int uc; 301 302 if (n >= 0xd800 && n < 0xdc00) 303 { 304 if (p[6] == '\\' && p[7] == 'u') 305 { 306 unsigned int m = 0; 307 308 for (i = 0; i < 4; i++) 309 { 310 int c1 = (unsigned char) p[8 + i]; 311 312 if (c1 >= '0' && c1 <= '9') 313 m = (m << 4) + (c1 - '0'); 314 else if (c1 >= 'A' && c1 <= 'F') 315 m = (m << 4) + (c1 - 'A' + 10); 316 else if (c1 >= 'a' && c1 <= 'f') 317 m = (m << 4) + (c1 - 'a' + 10); 318 else 319 goto just_one_byte; 320 } 321 322 if (i == 4 && (m >= 0xdc00 && m < 0xe000)) 323 { 324 /* Combine two UTF-16 words to a character. */ 325 uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00); 326 p += 12; 327 } 328 else 329 goto just_one_byte; 330 } 331 else 332 goto just_one_byte; 333 } 334 else 335 { 336 uc = n; 337 p += 6; 338 } 339 340 q += u8_uctomb (q, uc, 6); 341 continue; 342 } 343 } 344 just_one_byte: 345 *q++ = (unsigned char) *p++; 346 } 347 *q = '\0'; 348 return string; 349 } 350 351 352 /* Reads a key or value string. 353 Returns the string in UTF-8 encoding, or NULL if the end of the logical 354 line is reached. 355 Parsing ends: 356 - when returning NULL, after the end of the logical line, 357 - otherwise, if in_key is true, after the whitespace and possibly the 358 separator that follows after the string, 359 - otherwise, if in_key is false, after the end of the logical line. */ 360 361 static char * 362 read_escaped_string (bool in_key) 363 { 364 static unsigned short *buffer; 365 static size_t bufmax; 366 static size_t buflen; 367 int c; 368 369 /* Skip whitespace before the string. */ 370 do 371 c = phase3_getc (); 372 while (c == ' ' || c == '\t' || c == '\r' || c == '\f'); 373 374 if (c == EOF || c == '\n') 375 /* Empty string. */ 376 return NULL; 377 378 /* Start accumulating the string. We store the string in UTF-16 before 379 converting it to UTF-8. Why not converting every character directly to 380 UTF-8? Because a string can contain surrogates like \uD800\uDF00, and 381 we must combine them to a single UTF-8 character. */ 382 buflen = 0; 383 for (;;) 384 { 385 if (in_key && (c == '=' || c == ':' 386 || c == ' ' || c == '\t' || c == '\r' || c == '\f')) 387 { 388 /* Skip whitespace after the string. */ 389 while (c == ' ' || c == '\t' || c == '\r' || c == '\f') 390 c = phase3_getc (); 391 /* Skip '=' or ':' separator. */ 392 if (!(c == '=' || c == ':')) 393 phase3_ungetc (c); 394 break; 395 } 396 397 phase3_ungetc (c); 398 399 /* Read the next UTF-16 codepoint. */ 400 c = phase4_getuc (); 401 if (c < 0) 402 break; 403 /* Append it to the buffer. */ 404 if (buflen >= bufmax) 405 { 406 bufmax += 100; 407 buffer = xrealloc (buffer, bufmax * sizeof (unsigned short)); 408 } 409 buffer[buflen++] = c; 410 411 c = phase3_getc (); 412 if (c == EOF || c == '\n') 413 { 414 if (in_key) 415 phase3_ungetc (c); 416 break; 417 } 418 } 419 420 /* Now convert from UTF-16 to UTF-8. */ 421 { 422 size_t pos; 423 unsigned char *utf8_string; 424 unsigned char *q; 425 426 /* Each UTF-16 word needs 3 bytes at worst. */ 427 utf8_string = (unsigned char *) xmalloc (3 * buflen + 1); 428 for (pos = 0, q = utf8_string; pos < buflen; ) 429 { 430 unsigned int uc; 431 int n; 432 433 pos += u16_mbtouc (&uc, buffer + pos, buflen - pos); 434 n = u8_uctomb (q, uc, 6); 435 assert (n > 0); 436 q += n; 437 } 438 *q = '\0'; 439 assert (q - utf8_string <= 3 * buflen); 440 441 return (char *) utf8_string; 442 } 443 } 444 445 446 /* Read a .properties file from a stream, and dispatch to the various 447 abstract_catalog_reader_class_ty methods. */ 448 static void 449 properties_parse (abstract_catalog_reader_ty *this, FILE *file, 450 const char *real_filename, const char *logical_filename) 451 { 452 fp = file; 453 real_file_name = real_filename; 454 gram_pos.file_name = xstrdup (real_file_name); 455 gram_pos.line_number = 1; 456 457 for (;;) 458 { 459 int c; 460 bool comment; 461 bool hidden; 462 463 c = phase2_getc (); 464 465 if (c == EOF) 466 break; 467 468 comment = false; 469 hidden = false; 470 if (c == '#') 471 comment = true; 472 else if (c == '!') 473 { 474 /* For compatibility with write-properties.c, we treat '!' not 475 followed by space as a fuzzy or untranslated message. */ 476 int c2 = phase2_getc (); 477 if (c2 == ' ' || c2 == '\n' || c2 == EOF) 478 comment = true; 479 else 480 hidden = true; 481 phase2_ungetc (c2); 482 } 483 else 484 phase2_ungetc (c); 485 486 if (comment) 487 { 488 /* A comment line. */ 489 static char *buffer; 490 static size_t bufmax; 491 static size_t buflen; 492 493 buflen = 0; 494 for (;;) 495 { 496 c = phase2_getc (); 497 498 if (buflen >= bufmax) 499 { 500 bufmax += 100; 501 buffer = xrealloc (buffer, bufmax); 502 } 503 504 if (c == EOF || c == '\n') 505 break; 506 507 buffer[buflen++] = c; 508 } 509 buffer[buflen] = '\0'; 510 511 po_callback_comment_dispatcher (conv_from_java (conv_from_iso_8859_1 (buffer))); 512 } 513 else 514 { 515 /* A key/value pair. */ 516 char *msgid; 517 lex_pos_ty msgid_pos; 518 519 msgid_pos = gram_pos; 520 msgid = read_escaped_string (true); 521 if (msgid == NULL) 522 /* Skip blank line. */ 523 ; 524 else 525 { 526 char *msgstr; 527 lex_pos_ty msgstr_pos; 528 bool force_fuzzy; 529 530 msgstr_pos = gram_pos; 531 msgstr = read_escaped_string (false); 532 if (msgstr == NULL) 533 msgstr = xstrdup (""); 534 535 /* Be sure to make the message fuzzy if it was commented out 536 and if it is not already header/fuzzy/untranslated. */ 537 force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0'); 538 539 po_callback_message (NULL, msgid, &msgid_pos, NULL, 540 msgstr, strlen (msgstr) + 1, &msgstr_pos, 541 NULL, NULL, NULL, 542 force_fuzzy, false); 543 } 544 } 545 } 546 547 fp = NULL; 548 real_file_name = NULL; 549 gram_pos.line_number = 0; 550 } 551 552 const struct catalog_input_format input_format_properties = 553 { 554 properties_parse, /* parse */ 555 true /* produces_utf8 */ 556 }; 557