xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/x-ycp.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* xgettext YCP backend.
2    Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.
3 
4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 
24 #include <errno.h>
25 #include <limits.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-ycp.h"
33 #include "error.h"
34 #include "xalloc.h"
35 #include "exit.h"
36 #include "gettext.h"
37 
38 #define _(s) gettext(s)
39 
40 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
41 
42 
43 /* The YCP syntax is defined in libycp/doc/syntax.html.
44    See also libycp/src/scanner.ll.
45    Both are part of the yast2-core package in SuSE Linux distributions.  */
46 
47 
/* Register the built-in YCP format-string flags.
   Every specification below has the shape NAME:1:ycp-format and is passed
   unchanged, in the order listed, to xgettext_record_flag.  */
void
init_flag_table_ycp ()
{
  static const char *const flag_specs[] =
    {
      "sformat:1:ycp-format",
      "y2debug:1:ycp-format",
      "y2milestone:1:ycp-format",
      "y2warning:1:ycp-format",
      "y2error:1:ycp-format",
      "y2security:1:ycp-format",
      "y2internal:1:ycp-format"
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
59 
60 
61 /* ======================== Reading of characters.  ======================== */
62 
63 
/* The stream currently being scanned.  */
static FILE *fp;

/* Name of the file actually being read; it appears in the read-error
   diagnostic.  */
static const char *real_file_name;

/* Name recorded as the source position of each extracted message.  */
static char *logical_file_name;

/* Current line, and the number of characters already consumed on it.
   Both are maintained by phase1_getc and phase1_ungetc.  */
static int line_number;
static int char_in_line;

/* Line of the most recent comment and of the most recent non-comment token.
   Comparing the two decides whether saved comments still count as standing
   immediately before a keyword.  */
static int last_comment_line;
static int last_non_comment_line;
79 
80 
81 /* 1. line_number handling.  */
82 
83 static int
phase1_getc()84 phase1_getc ()
85 {
86   int c = getc (fp);
87 
88   if (c == EOF)
89     {
90       if (ferror (fp))
91 	error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
92 	       real_file_name);
93       return EOF;
94     }
95 
96   if (c == '\n')
97     {
98       line_number++;
99       char_in_line = 0;
100     }
101   else
102     char_in_line++;
103 
104   return c;
105 }
106 
107 /* Supports only one pushback character.  */
108 static void
phase1_ungetc(int c)109 phase1_ungetc (int c)
110 {
111   if (c != EOF)
112     {
113       if (c == '\n')
114 	{
115 	  --line_number;
116 	  char_in_line = INT_MAX;
117 	}
118       else
119 	--char_in_line;
120 
121       ungetc (c, fp);
122     }
123 }
124 
125 
126 /* 2. Replace each comment that is not inside a character constant or
127    string literal with a space character.  We need to remember the
128    comment for later, because it may be attached to a keyword string.
129    YCP comments can be in C comment syntax, C++ comment syntax or sh
130    comment syntax.  */
131 
132 static unsigned char phase2_pushback[1];
133 static int phase2_pushback_length;
134 
135 static int
phase2_getc()136 phase2_getc ()
137 {
138   static char *buffer;
139   static size_t bufmax;
140   size_t buflen;
141   int lineno;
142   int c;
143   bool last_was_star;
144 
145   if (phase2_pushback_length)
146     return phase2_pushback[--phase2_pushback_length];
147 
148   if (char_in_line == 0)
149     {
150       /* Eat whitespace, to recognize ^[\t ]*# pattern.  */
151       do
152 	c = phase1_getc ();
153       while (c == '\t' || c == ' ');
154 
155       if (c == '#')
156 	{
157 	  /* sh comment.  */
158 	  buflen = 0;
159 	  lineno = line_number;
160 	  for (;;)
161 	    {
162 	      c = phase1_getc ();
163 	      if (c == '\n' || c == EOF)
164 		break;
165 	      /* We skip all leading white space, but not EOLs.  */
166 	      if (!(buflen == 0 && (c == ' ' || c == '\t')))
167 		{
168 		  if (buflen >= bufmax)
169 		    {
170 		      bufmax = 2 * bufmax + 10;
171 		      buffer = xrealloc (buffer, bufmax);
172 		    }
173 		  buffer[buflen++] = c;
174 		}
175 	    }
176 	  if (buflen >= bufmax)
177 	    {
178 	      bufmax = 2 * bufmax + 10;
179 	      buffer = xrealloc (buffer, bufmax);
180 	    }
181 	  buffer[buflen] = '\0';
182 	  savable_comment_add (buffer);
183 	  last_comment_line = lineno;
184 	  return '\n';
185 	}
186     }
187   else
188     c = phase1_getc ();
189 
190   if (c == '/')
191     {
192       c = phase1_getc ();
193 
194       switch (c)
195 	{
196 	default:
197 	  phase1_ungetc (c);
198 	  return '/';
199 
200 	case '*':
201 	  /* C comment.  */
202 	  buflen = 0;
203 	  lineno = line_number;
204 	  last_was_star = false;
205 	  for (;;)
206 	    {
207 	      c = phase1_getc ();
208 	      if (c == EOF)
209 		break;
210 	      /* We skip all leading white space, but not EOLs.  */
211 	      if (buflen == 0 && (c == ' ' || c == '\t'))
212 		continue;
213 	      if (buflen >= bufmax)
214 		{
215 		  bufmax = 2 * bufmax + 10;
216 		  buffer = xrealloc (buffer, bufmax);
217 	        }
218 	      buffer[buflen++] = c;
219 	      switch (c)
220 		{
221 		case '\n':
222 		  --buflen;
223 		  while (buflen >= 1
224 			 && (buffer[buflen - 1] == ' '
225 			     || buffer[buflen - 1] == '\t'))
226 		    --buflen;
227 		  buffer[buflen] = '\0';
228 		  savable_comment_add (buffer);
229 		  buflen = 0;
230 		  lineno = line_number;
231 		  last_was_star = false;
232 		  continue;
233 
234 		case '*':
235 		  last_was_star = true;
236 		  continue;
237 
238 		case '/':
239 		  if (last_was_star)
240 		    {
241 		      buflen -= 2;
242 		      while (buflen >= 1
243 			     && (buffer[buflen - 1] == ' '
244 				 || buffer[buflen - 1] == '\t'))
245 			--buflen;
246 		      buffer[buflen] = '\0';
247 		      savable_comment_add (buffer);
248 		      break;
249 		    }
250 		  /* FALLTHROUGH */
251 
252 		default:
253 		  last_was_star = false;
254 		  continue;
255 		}
256 	      break;
257 	    }
258 	  last_comment_line = lineno;
259 	  return ' ';
260 
261 	case '/':
262 	  /* C++ comment.  */
263 	  buflen = 0;
264 	  lineno = line_number;
265 	  for (;;)
266 	    {
267 	      c = phase1_getc ();
268 	      if (c == '\n' || c == EOF)
269 		break;
270 	      /* We skip all leading white space, but not EOLs.  */
271 	      if (!(buflen == 0 && (c == ' ' || c == '\t')))
272 		{
273 		  if (buflen >= bufmax)
274 		    {
275 		      bufmax = 2 * bufmax + 10;
276 		      buffer = xrealloc (buffer, bufmax);
277 		    }
278 		  buffer[buflen++] = c;
279 		}
280 	    }
281 	  if (buflen >= bufmax)
282 	    {
283 	      bufmax = 2 * bufmax + 10;
284 	      buffer = xrealloc (buffer, bufmax);
285 	    }
286 	  buffer[buflen] = '\0';
287 	  savable_comment_add (buffer);
288 	  last_comment_line = lineno;
289 	  return '\n';
290 	}
291     }
292   else
293     return c;
294 }
295 
296 /* Supports only one pushback character.  */
297 static void
phase2_ungetc(int c)298 phase2_ungetc (int c)
299 {
300   if (c != EOF)
301     {
302       if (phase2_pushback_length == SIZEOF (phase2_pushback))
303 	abort ();
304       phase2_pushback[phase2_pushback_length++] = c;
305     }
306 }
307 
308 
309 /* ========================== Reading of tokens.  ========================== */
310 
311 
/* The kinds of token that the tokenizer distinguishes.  */
typedef enum token_type_ty
{
  /* End of input.  */
  token_type_eof,
  /* ( */
  token_type_lparen,
  /* ) */
  token_type_rparen,
  /* , */
  token_type_comma,
  /* _( */
  token_type_i18n,
  /* "abc" */
  token_type_string_literal,
  /* symbol, number */
  token_type_symbol,
  /* misc. operator */
  token_type_other
} token_type_ty;

/* One token together with the line on which it was found.  */
typedef struct token_ty
{
  token_type_ty type;
  /* Text of the token; used for token_type_string_literal and
     token_type_symbol only.  */
  char *string;
  int line_number;
} token_ty;
332 
333 
334 /* 7. Replace escape sequences within character strings with their
335    single character equivalents.  */
336 
/* Value returned by phase7_getc for an unescaped double quote, so that it
   can be told apart from an escaped one.  */
#define P7_QUOTES (1000 + '"')

/* Return the next character of a string literal with escape sequences
   resolved.  An unescaped double quote is reported as P7_QUOTES, a
   backslash followed by a newline is dropped altogether, the escapes
   \b \f \n \r \t yield the corresponding control character, and any other
   escaped character (or EOF) is returned as is.  */
static int
phase7_getc ()
{
  for (;;)
    {
      /* Use phase 1, because phase 2 elides comments.  */
      int c = phase1_getc ();

      if (c == '"')
        return P7_QUOTES;
      if (c != '\\')
        return c;

      /* A backslash: look at the character it escapes.  */
      c = phase1_getc ();
      if (c == '\n')
        /* Backslash-newline produces nothing; read on.  */
        continue;

      switch (c)
        {
        case 'b':
          return '\b';
        case 'f':
          return '\f';
        case 'n':
          return '\n';
        case 'r':
          return '\r';
        case 't':
          return '\t';

        /* FIXME: What is the octal escape syntax?
           syntax.html says: [0] [0-7]+
           scanner.ll says:  [0-7] [0-7] [0-7]
         */
#if 0
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
          {
            int n, j;

            n = 0;
            for (j = 0; j < 3; ++j)
              {
                n = n * 8 + c - '0';
                c = phase1_getc ();
                switch (c)
                  {
                  default:
                    break;

                  case '0': case '1': case '2': case '3':
                  case '4': case '5': case '6': case '7':
                    continue;
                  }
                break;
              }
            phase1_ungetc (c);
            return n;
          }
#endif

        default:
          return c;
        }
    }
}
404 
405 
406 /* Combine characters into tokens.  Discard whitespace.  */
407 
408 static token_ty phase5_pushback[1];
409 static int phase5_pushback_length;
410 
411 static void
phase5_get(token_ty * tp)412 phase5_get (token_ty *tp)
413 {
414   static char *buffer;
415   static int bufmax;
416   int bufpos;
417   int c;
418 
419   if (phase5_pushback_length)
420     {
421       *tp = phase5_pushback[--phase5_pushback_length];
422       return;
423     }
424   for (;;)
425     {
426       tp->line_number = line_number;
427       c = phase2_getc ();
428 
429       switch (c)
430 	{
431 	case EOF:
432 	  tp->type = token_type_eof;
433 	  return;
434 
435 	case '\n':
436 	  if (last_non_comment_line > last_comment_line)
437 	    savable_comment_reset ();
438 	  /* FALLTHROUGH */
439 	case '\r':
440 	case '\t':
441 	case ' ':
442 	  /* Ignore whitespace and comments.  */
443 	  continue;
444 	}
445 
446       last_non_comment_line = tp->line_number;
447 
448       switch (c)
449 	{
450 	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
451 	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
452 	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
453 	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
454 	case 'Y': case 'Z':
455 	case '_':
456 	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
457 	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
458 	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
459 	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
460 	case 'y': case 'z':
461 	case '0': case '1': case '2': case '3': case '4':
462 	case '5': case '6': case '7': case '8': case '9':
463 	  /* Symbol, or part of a number.  */
464 	  bufpos = 0;
465 	  for (;;)
466 	    {
467 	      if (bufpos >= bufmax)
468 		{
469 		  bufmax = 2 * bufmax + 10;
470 		  buffer = xrealloc (buffer, bufmax);
471 		}
472 	      buffer[bufpos++] = c;
473 	      c = phase2_getc ();
474 	      switch (c)
475 		{
476 		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
477 		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
478 		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
479 		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
480 		case 'Y': case 'Z':
481 		case '_':
482 		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
483 		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
484 		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
485 		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
486 		case 'y': case 'z':
487 		case '0': case '1': case '2': case '3': case '4':
488 		case '5': case '6': case '7': case '8': case '9':
489 		  continue;
490 		default:
491 		  if (bufpos == 1 && buffer[0] == '_' && c == '(')
492 		    {
493 		      tp->type = token_type_i18n;
494 		      return;
495 		    }
496 		  phase2_ungetc (c);
497 		  break;
498 		}
499 	      break;
500 	    }
501 	  if (bufpos >= bufmax)
502 	    {
503 	      bufmax = 2 * bufmax + 10;
504 	      buffer = xrealloc (buffer, bufmax);
505 	    }
506 	  buffer[bufpos] = '\0';
507 	  tp->string = xstrdup (buffer);
508 	  tp->type = token_type_symbol;
509 	  return;
510 
511 	case '"':
512 	  bufpos = 0;
513 	  for (;;)
514 	    {
515 	      c = phase7_getc ();
516 	      if (c == EOF || c == P7_QUOTES)
517 		break;
518 	      if (bufpos >= bufmax)
519 		{
520 		  bufmax = 2 * bufmax + 10;
521 		  buffer = xrealloc (buffer, bufmax);
522 		}
523 	      buffer[bufpos++] = c;
524 	    }
525 	  if (bufpos >= bufmax)
526 	    {
527 	      bufmax = 2 * bufmax + 10;
528 	      buffer = xrealloc (buffer, bufmax);
529 	    }
530 	  buffer[bufpos] = '\0';
531 	  tp->string = xstrdup (buffer);
532 	  tp->type = token_type_string_literal;
533 	  return;
534 
535 	case '(':
536 	  tp->type = token_type_lparen;
537 	  return;
538 
539 	case ')':
540 	  tp->type = token_type_rparen;
541 	  return;
542 
543 	case ',':
544 	  tp->type = token_type_comma;
545 	  return;
546 
547 	default:
548 	  /* We could carefully recognize each of the 2 and 3 character
549 	     operators, but it is not necessary, as we only need to recognize
550 	     gettext invocations.  Don't bother.  */
551 	  tp->type = token_type_other;
552 	  return;
553 	}
554     }
555 }
556 
557 /* Supports only one pushback token.  */
558 static void
phase5_unget(token_ty * tp)559 phase5_unget (token_ty *tp)
560 {
561   if (tp->type != token_type_eof)
562     {
563       if (phase5_pushback_length == SIZEOF (phase5_pushback))
564 	abort ();
565       phase5_pushback[phase5_pushback_length++] = *tp;
566     }
567 }
568 
569 
570 /* Concatenate adjacent string literals to form single string literals.
571    (See libycp/src/parser.yy, rule 'string' vs. terminal 'STRING'.)  */
572 
573 static void
phase8_get(token_ty * tp)574 phase8_get (token_ty *tp)
575 {
576   phase5_get (tp);
577   if (tp->type != token_type_string_literal)
578     return;
579   for (;;)
580     {
581       token_ty tmp;
582       size_t len;
583 
584       phase5_get (&tmp);
585       if (tmp.type != token_type_string_literal)
586 	{
587 	  phase5_unget (&tmp);
588 	  return;
589 	}
590       len = strlen (tp->string);
591       tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
592       strcpy (tp->string + len, tmp.string);
593       free (tmp.string);
594     }
595 }
596 
597 
598 /* ========================= Extracting strings.  ========================== */
599 
600 
601 /* Context lookup table.  */
602 static flag_context_list_table_ty *flag_context_list_table;
603 
604 
605 /* The file is broken into tokens.
606 
607      Normal handling: Look for
608        [A] _( [B] msgid ... )
609      Plural handling: Look for
610        [A] _( [B] msgid [C] , [D] msgid_plural ... )
611      At point [A]: state == 0.
612      At point [B]: state == 1, plural_mp == NULL.
613      At point [C]: state == 2, plural_mp != NULL.
614      At point [D]: state == 1, plural_mp != NULL.
615 
616    We use recursion because we have to set the context according to the given
617    flags.  */
618 
619 
620 /* Extract messages until the next balanced closing parenthesis.
621    Extracted messages are added to MLP.
622    Return true upon eof, false upon closing parenthesis.  */
623 static bool
extract_parenthesized(message_list_ty * mlp,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,bool in_i18n)624 extract_parenthesized (message_list_ty *mlp,
625 		       flag_context_ty outer_context,
626 		       flag_context_list_iterator_ty context_iter,
627 		       bool in_i18n)
628 {
629   int state; /* 1 or 2 inside _( ... ), otherwise 0 */
630   message_ty *plural_mp = NULL;	/* defined only when in states 1 and 2 */
631   /* Context iterator that will be used if the next token is a '('.  */
632   flag_context_list_iterator_ty next_context_iter =
633     passthrough_context_list_iterator;
634   /* Current context.  */
635   flag_context_ty inner_context =
636     inherited_context (outer_context,
637 		       flag_context_list_iterator_advance (&context_iter));
638 
639   /* Start state is 0 or 1.  */
640   state = (in_i18n ? 1 : 0);
641 
642   for (;;)
643     {
644       token_ty token;
645 
646       if (in_i18n)
647 	phase8_get (&token);
648       else
649 	phase5_get (&token);
650 
651       switch (token.type)
652 	{
653 	case token_type_i18n:
654 	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
655 				     true))
656 	    return true;
657 	  next_context_iter = null_context_list_iterator;
658 	  state = 0;
659 	  continue;
660 
661 	case token_type_string_literal:
662 	  if (state == 1)
663 	    {
664 	      lex_pos_ty pos;
665 	      pos.file_name = logical_file_name;
666 	      pos.line_number = token.line_number;
667 
668 	      if (plural_mp == NULL)
669 		{
670 		  /* Seen an msgid.  */
671 		  plural_mp = remember_a_message (mlp, NULL, token.string,
672 						  inner_context, &pos,
673 						  savable_comment);
674 		  state = 2;
675 		}
676 	      else
677 		{
678 		  /* Seen an msgid_plural.  */
679 		  remember_a_message_plural (plural_mp, token.string,
680 					     inner_context, &pos,
681 					     savable_comment);
682 		  state = 0;
683 		}
684 	    }
685 	  else
686 	    {
687 	      free (token.string);
688 	      state = 0;
689 	    }
690 	  next_context_iter = null_context_list_iterator;
691 	  continue;
692 
693 	case token_type_symbol:
694 	  next_context_iter =
695 	    flag_context_list_iterator (
696 	      flag_context_list_table_lookup (
697 		flag_context_list_table,
698 		token.string, strlen (token.string)));
699 	  free (token.string);
700 	  state = 0;
701 	  continue;
702 
703 	case token_type_lparen:
704 	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
705 				     false))
706 	    return true;
707 	  next_context_iter = null_context_list_iterator;
708 	  state = 0;
709 	  continue;
710 
711 	case token_type_rparen:
712 	  return false;
713 
714 	case token_type_comma:
715 	  if (state == 2)
716 	    state = 1;
717 	  else
718 	    state = 0;
719 	  inner_context =
720 	    inherited_context (outer_context,
721 			       flag_context_list_iterator_advance (
722 				 &context_iter));
723 	  next_context_iter = passthrough_context_list_iterator;
724 	  continue;
725 
726 	case token_type_other:
727 	  next_context_iter = null_context_list_iterator;
728 	  state = 0;
729 	  continue;
730 
731 	case token_type_eof:
732 	  return true;
733 
734 	default:
735 	  abort ();
736 	}
737     }
738 }
739 
740 
741 void
extract_ycp(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)742 extract_ycp (FILE *f,
743 	     const char *real_filename, const char *logical_filename,
744 	     flag_context_list_table_ty *flag_table,
745 	     msgdomain_list_ty *mdlp)
746 {
747   message_list_ty *mlp = mdlp->item[0]->messages;
748 
749   fp = f;
750   real_file_name = real_filename;
751   logical_file_name = xstrdup (logical_filename);
752   line_number = 1;
753   char_in_line = 0;
754 
755   last_comment_line = -1;
756   last_non_comment_line = -1;
757 
758   flag_context_list_table = flag_table;
759 
760   /* Eat tokens until eof is seen.  When extract_parenthesized returns
761      due to an unbalanced closing parenthesis, just restart it.  */
762   while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
763 				 false))
764     ;
765 
766   fp = NULL;
767   real_file_name = NULL;
768   logical_file_name = NULL;
769   line_number = 0;
770   char_in_line = 0;
771 }
772