1 /* xgettext Smalltalk backend. 2 Copyright (C) 2002-2003, 2005 Free Software Foundation, Inc. 3 4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program; if not, write to the Free Software Foundation, 18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 19 20 #ifdef HAVE_CONFIG_H 21 # include "config.h" 22 #endif 23 24 #include <errno.h> 25 #include <stdio.h> 26 #include <stdlib.h> 27 28 #include "message.h" 29 #include "xgettext.h" 30 #include "x-smalltalk.h" 31 #include "error.h" 32 #include "xalloc.h" 33 #include "exit.h" 34 #include "gettext.h" 35 36 #define _(s) gettext(s) 37 38 #define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 39 40 41 /* The relevant parts of the Smalltalk syntax are: 42 43 stringliteral ::= string | stringconst | symconst 44 stringconst ::= "#"string 45 string ::= "'"[char]*"'" 46 symconst ::= "#"symbol 47 symbol ::= id | binsel | keysel[keysel]* 48 keysel ::= id":" 49 id ::= letter[letter|digit]* 50 letter ::= "A".."Z" | "a".."z" 51 digit ::= "0".."9" 52 binsel ::= selchar[selchar] 53 selchar ::= "+" | "-" | "*" | "/" | "~" | "|" | "," | "<" | ">" 54 | "=" | "&" | "@" | "?" | "%" | "\" 55 56 Strings can contain any characters; to include the string delimiter itself, 57 it must be duplicated. 58 59 Character constants are written "$"char 60 61 Comments are enclosed within double quotes. 62 63 In well-formed expressions, {} and [] and () are balanced. 64 */ 65 66 67 /* ======================== Reading of characters. ======================== */ 68 69 70 /* Real filename, used in error messages about the input file. */ 71 static const char *real_file_name; 72 73 /* Logical filename and line number, used to label the extracted messages. */ 74 static char *logical_file_name; 75 static int line_number; 76 77 /* The input file stream. */ 78 static FILE *fp; 79 80 81 /* 1. line_number handling. */ 82 83 static int 84 phase1_getc () 85 { 86 int c = getc (fp); 87 88 if (c == EOF) 89 { 90 if (ferror (fp)) 91 error (EXIT_FAILURE, errno, _("error while reading \"%s\""), 92 real_file_name); 93 return EOF; 94 } 95 96 if (c == '\n') 97 line_number++; 98 99 return c; 100 } 101 102 /* Supports only one pushback character. */ 103 static void 104 phase1_ungetc (int c) 105 { 106 if (c != EOF) 107 { 108 if (c == '\n') 109 --line_number; 110 111 ungetc (c, fp); 112 } 113 } 114 115 116 /* Accumulating comments. */ 117 118 static char *buffer; 119 static size_t bufmax; 120 static size_t buflen; 121 122 static inline void 123 comment_start () 124 { 125 buflen = 0; 126 } 127 128 static inline void 129 comment_add (int c) 130 { 131 if (buflen >= bufmax) 132 { 133 bufmax = 2 * bufmax + 10; 134 buffer = xrealloc (buffer, bufmax); 135 } 136 buffer[buflen++] = c; 137 } 138 139 static inline void 140 comment_line_end () 141 { 142 while (buflen >= 1 143 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 144 --buflen; 145 if (buflen >= bufmax) 146 { 147 bufmax = 2 * bufmax + 10; 148 buffer = xrealloc (buffer, bufmax); 149 } 150 buffer[buflen] = '\0'; 151 savable_comment_add (buffer); 152 } 153 154 155 /* These are for tracking whether comments count as immediately before 156 keyword. */ 157 static int last_comment_line; 158 static int last_non_comment_line; 159 160 161 /* ========================== Reading of tokens. ========================== */ 162 163 164 enum token_type_ty 165 { 166 token_type_eof, 167 token_type_uniq, /* # */ 168 token_type_symbol, /* symbol */ 169 token_type_string_literal, /* string, stringconst, symbolconst */ 170 token_type_other /* misc. operator */ 171 }; 172 typedef enum token_type_ty token_type_ty; 173 174 typedef struct token_ty token_ty; 175 struct token_ty 176 { 177 token_type_ty type; 178 char *string; /* for token_type_string_literal, token_type_symbol */ 179 int line_number; 180 }; 181 182 183 /* 2. Combine characters into tokens. Discard comments and whitespace. */ 184 185 static token_ty phase2_pushback[1]; 186 static int phase2_pushback_length; 187 188 static void 189 phase2_get (token_ty *tp) 190 { 191 static char *buffer; 192 static int bufmax; 193 int bufpos; 194 int c; 195 196 if (phase2_pushback_length) 197 { 198 *tp = phase2_pushback[--phase2_pushback_length]; 199 return; 200 } 201 202 tp->string = NULL; 203 204 for (;;) 205 { 206 tp->line_number = line_number; 207 c = phase1_getc (); 208 switch (c) 209 { 210 case EOF: 211 tp->type = token_type_eof; 212 return; 213 214 case '"': 215 { 216 /* Comment. */ 217 int lineno; 218 219 comment_start (); 220 lineno = line_number; 221 for (;;) 222 { 223 c = phase1_getc (); 224 if (c == '"' || c == EOF) 225 break; 226 if (c == '\n') 227 { 228 comment_line_end (); 229 comment_start (); 230 } 231 else 232 { 233 /* We skip all leading white space, but not EOLs. */ 234 if (!(buflen == 0 && (c == ' ' || c == '\t'))) 235 comment_add (c); 236 } 237 } 238 comment_line_end (); 239 last_comment_line = lineno; 240 continue; 241 } 242 243 case '\n': 244 if (last_non_comment_line > last_comment_line) 245 savable_comment_reset (); 246 /* FALLTHROUGH */ 247 case ' ': 248 case '\t': 249 case '\r': 250 /* Ignore whitespace. */ 251 continue; 252 } 253 254 last_non_comment_line = tp->line_number; 255 256 switch (c) 257 { 258 case '\'': 259 /* String literal. */ 260 bufpos = 0; 261 for (;;) 262 { 263 c = phase1_getc (); 264 if (c == EOF) 265 break; 266 if (c == '\'') 267 { 268 c = phase1_getc (); 269 if (c != '\'') 270 { 271 phase1_ungetc (c); 272 break; 273 } 274 } 275 if (bufpos >= bufmax) 276 { 277 bufmax = 2 * bufmax + 10; 278 buffer = xrealloc (buffer, bufmax); 279 } 280 buffer[bufpos++] = c; 281 } 282 if (bufpos >= bufmax) 283 { 284 bufmax = 2 * bufmax + 10; 285 buffer = xrealloc (buffer, bufmax); 286 } 287 buffer[bufpos] = 0; 288 tp->type = token_type_string_literal; 289 tp->string = xstrdup (buffer); 290 return; 291 292 case '+': 293 case '-': 294 case '*': 295 case '/': 296 case '~': 297 case '|': 298 case ',': 299 case '<': 300 case '>': 301 case '=': 302 case '&': 303 case '@': 304 case '?': 305 case '%': 306 case '\\': 307 { 308 char *name; 309 int c2 = phase1_getc (); 310 switch (c2) 311 { 312 case '+': 313 case '-': 314 case '*': 315 case '/': 316 case '~': 317 case '|': 318 case ',': 319 case '<': 320 case '>': 321 case '=': 322 case '&': 323 case '@': 324 case '?': 325 case '%': 326 name = xmalloc (3); 327 name[0] = c; 328 name[1] = c2; 329 name[2] = '\0'; 330 tp->type = token_type_symbol; 331 tp->string = name; 332 return; 333 default: 334 phase1_ungetc (c2); 335 break; 336 } 337 name = xmalloc (2); 338 name[0] = c; 339 name[1] = '\0'; 340 tp->type = token_type_symbol; 341 tp->string = name; 342 return; 343 } 344 345 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 346 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 347 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 348 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 349 case 'Y': case 'Z': 350 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 351 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 352 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 353 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 354 case 'y': case 'z': 355 /* Recognize id or id":"[id":"]* or id":"[id":"]*id. */ 356 bufpos = 0; 357 for (;;) 358 { 359 if (bufpos >= bufmax) 360 { 361 bufmax = 2 * bufmax + 10; 362 buffer = xrealloc (buffer, bufmax); 363 } 364 buffer[bufpos++] = c; 365 c = phase1_getc (); 366 switch (c) 367 { 368 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 369 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 370 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 371 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 372 case 'Y': case 'Z': 373 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 374 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 375 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 376 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 377 case 'y': case 'z': 378 case '0': case '1': case '2': case '3': case '4': 379 case '5': case '6': case '7': case '8': case '9': 380 continue; 381 case ':': 382 if (bufpos >= bufmax) 383 { 384 bufmax = 2 * bufmax + 10; 385 buffer = xrealloc (buffer, bufmax); 386 } 387 buffer[bufpos++] = c; 388 c = phase1_getc (); 389 switch (c) 390 { 391 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 392 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 393 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 394 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 395 case 'Y': case 'Z': 396 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 397 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 398 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 399 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 400 case 'y': case 'z': 401 continue; 402 default: 403 phase1_ungetc (c); 404 break; 405 } 406 break; 407 default: 408 phase1_ungetc (c); 409 break; 410 } 411 break; 412 } 413 if (bufpos >= bufmax) 414 { 415 bufmax = 2 * bufmax + 10; 416 buffer = xrealloc (buffer, bufmax); 417 } 418 buffer[bufpos] = '\0'; 419 tp->string = xstrdup (buffer); 420 tp->type = token_type_symbol; 421 return; 422 423 case '#': 424 /* Uniquification operator. */ 425 tp->type = token_type_uniq; 426 return; 427 428 case '$': 429 c = phase1_getc (); 430 tp->type = token_type_other; 431 return; 432 433 default: 434 tp->type = token_type_other; 435 return; 436 } 437 } 438 } 439 440 /* Supports only one pushback token. */ 441 static void 442 phase2_unget (token_ty *tp) 443 { 444 if (tp->type != token_type_eof) 445 { 446 if (phase2_pushback_length == SIZEOF (phase2_pushback)) 447 abort (); 448 phase2_pushback[phase2_pushback_length++] = *tp; 449 } 450 } 451 452 453 /* 3. Combine "# string_literal" and "# symbol" to a single token. */ 454 455 static void 456 x_smalltalk_lex (token_ty *tp) 457 { 458 phase2_get (tp); 459 if (tp->type == token_type_uniq) 460 { 461 token_ty token2; 462 463 phase2_get (&token2); 464 if (token2.type == token_type_symbol 465 || token2.type == token_type_string_literal) 466 { 467 tp->type = token_type_string_literal; 468 tp->string = token2.string; 469 } 470 else 471 phase2_unget (&token2); 472 } 473 } 474 475 476 /* ========================= Extracting strings. ========================== */ 477 478 /* The file is broken into tokens. Scan the token stream, looking for the 479 following patterns 480 NLS ? <string> 481 NLS at: <string> 482 NLS at: <string> plural: <string> 483 where <string> is one of 484 string_literal 485 # string_literal 486 # symbol 487 */ 488 489 void 490 extract_smalltalk (FILE *f, 491 const char *real_filename, const char *logical_filename, 492 flag_context_list_table_ty *flag_table, 493 msgdomain_list_ty *mdlp) 494 { 495 message_list_ty *mlp = mdlp->item[0]->messages; 496 497 fp = f; 498 real_file_name = real_filename; 499 logical_file_name = xstrdup (logical_filename); 500 line_number = 1; 501 502 last_comment_line = -1; 503 last_non_comment_line = -1; 504 505 /* Eat tokens until eof is seen. */ 506 { 507 /* 0 when no "NLS" has been seen. 508 1 after "NLS". 509 2 after "NLS ?". 510 3 after "NLS at:". 511 4 after "NLS at: <string>". 512 5 after "NLS at: <string> plural:". */ 513 int state; 514 /* Remember the message containing the msgid, for msgid_plural. 515 Non-NULL in states 4, 5. */ 516 message_ty *plural_mp = NULL; 517 518 /* Start state is 0. */ 519 state = 0; 520 521 for (;;) 522 { 523 token_ty token; 524 525 x_smalltalk_lex (&token); 526 527 switch (token.type) 528 { 529 case token_type_symbol: 530 state = (strcmp (token.string, "NLS") == 0 ? 1 : 531 strcmp (token.string, "?") == 0 && state == 1 ? 2 : 532 strcmp (token.string, "at:") == 0 && state == 1 ? 3 : 533 strcmp (token.string, "plural:") == 0 && state == 4 ? 5 : 534 0); 535 free (token.string); 536 break; 537 538 case token_type_string_literal: 539 if (state == 2) 540 { 541 lex_pos_ty pos; 542 pos.file_name = logical_file_name; 543 pos.line_number = token.line_number; 544 remember_a_message (mlp, NULL, token.string, null_context, 545 &pos, savable_comment); 546 state = 0; 547 break; 548 } 549 if (state == 3) 550 { 551 lex_pos_ty pos; 552 pos.file_name = logical_file_name; 553 pos.line_number = token.line_number; 554 plural_mp = remember_a_message (mlp, NULL, token.string, 555 null_context, &pos, 556 savable_comment); 557 state = 4; 558 break; 559 } 560 if (state == 5) 561 { 562 lex_pos_ty pos; 563 pos.file_name = logical_file_name; 564 pos.line_number = token.line_number; 565 remember_a_message_plural (plural_mp, token.string, 566 null_context, &pos, 567 savable_comment); 568 state = 0; 569 break; 570 } 571 state = 0; 572 free (token.string); 573 break; 574 575 case token_type_uniq: 576 case token_type_other: 577 state = 0; 578 break; 579 580 case token_type_eof: 581 break; 582 583 default: 584 abort (); 585 } 586 587 if (token.type == token_type_eof) 588 break; 589 } 590 } 591 592 /* Close scanner. */ 593 fp = NULL; 594 real_file_name = NULL; 595 logical_file_name = NULL; 596 line_number = 0; 597 } 598