xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/x-lisp.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* xgettext Lisp backend.
2    Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.
3 
4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-lisp.h"
33 #include "error.h"
34 #include "xalloc.h"
35 #include "exit.h"
36 #include "hash.h"
37 #include "gettext.h"
38 
39 #define _(s) gettext(s)
40 
41 
42 /* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2.
43    Since we are interested only in strings and in forms similar to
44         (gettext msgid ...)
45    or   (ngettext msgid msgid_plural ...)
46    we make the following simplifications:
47 
48    - Assume the keywords and strings are in an ASCII compatible encoding.
49      This means we can read the input file one byte at a time, instead of
50      one character at a time.  No need to worry about multibyte characters:
51      If they occur as part of identifiers, they most probably act as
52      constituent characters, and the byte based approach will do the same.
53 
54    - Assume the read table is the standard Common Lisp read table.
55      Non-standard read tables are mostly used to read data, not programs.
56 
57    - Assume the read table case is :UPCASE, and *READ-BASE* is 10.
58 
59    - Don't interpret #n= and #n#, they usually don't appear in programs.
60 
61    - Don't interpret #+, #-, they are unlikely to appear in a gettext form.
62 
63    The remaining syntax rules are:
64 
65    - The syntax code assigned to each character, and how tokens are built
66      up from characters (single escape, multiple escape etc.).
67 
68    - Comment syntax: ';' and '#| ... |#'.
69 
70    - String syntax: "..." with single escapes.
71 
72    - Read macros and dispatch macro character '#'.  Needed to be able to
73      tell which is the n-th argument of a function call.
74 
75  */
76 
77 
78 /* ========================= Lexer customization.  ========================= */
79 
80 /* 'readtable_case' is the case conversion that is applied to non-escaped
81     parts of symbol tokens.  In Common Lisp: (readtable-case *readtable*).  */
82 
83 enum rtcase
84 {
85   case_upcase,
86   case_downcase,
87   case_preserve,
88   case_invert
89 };
90 
91 static enum rtcase readtable_case = case_upcase;
92 
93 /* 'read_base' is the assumed radix of integers and rational numbers.
94    In Common Lisp: *read-base*.  */
95 static int read_base = 10;
96 
97 /* 'read_preserve_whitespace' specifies whether a whitespace character
98    that terminates a token must be pushed back on the input stream.
99    We set it to true, because the special newline side effect in read_object()
100    requires that read_object() sees every newline not inside a token.  */
101 static bool read_preserve_whitespace = true;
102 
103 
104 /* ====================== Keyword set customization.  ====================== */
105 
106 /* If true extract all strings.  */
107 static bool extract_all = false;
108 
109 static hash_table keywords;
110 static bool default_keywords = true;
111 
112 
113 void
x_lisp_extract_all()114 x_lisp_extract_all ()
115 {
116   extract_all = true;
117 }
118 
119 
120 void
x_lisp_keyword(const char * name)121 x_lisp_keyword (const char *name)
122 {
123   if (name == NULL)
124     default_keywords = false;
125   else
126     {
127       const char *end;
128       struct callshape shape;
129       const char *colon;
130       size_t len;
131       char *symname;
132       size_t i;
133 
134       if (keywords.table == NULL)
135 	hash_init (&keywords, 100);
136 
137       split_keywordspec (name, &end, &shape);
138 
139       /* The characters between name and end should form a valid Lisp symbol.
140 	 Extract the symbol name part.  */
141       colon = strchr (name, ':');
142       if (colon != NULL && colon < end)
143 	{
144 	  name = colon + 1;
145 	  if (name < end && *name == ':')
146 	    name++;
147 	  colon = strchr (name, ':');
148 	  if (colon != NULL && colon < end)
149 	    return;
150 	}
151 
152       /* Uppercase it.  */
153       len = end - name;
154       symname = (char *) xmalloc (len);
155       for (i = 0; i < len; i++)
156 	symname[i] =
157 	  (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]);
158 
159       insert_keyword_callshape (&keywords, symname, len, &shape);
160     }
161 }
162 
163 /* Finish initializing the keywords hash table.
164    Called after argument processing, before each file is processed.  */
165 static void
init_keywords()166 init_keywords ()
167 {
168   if (default_keywords)
169     {
170       /* When adding new keywords here, also update the documentation in
171 	 xgettext.texi!  */
172       x_lisp_keyword ("gettext");	/* I18N:GETTEXT */
173       x_lisp_keyword ("ngettext:1,2");	/* I18N:NGETTEXT */
174       x_lisp_keyword ("gettext-noop");
175       default_keywords = false;
176     }
177 }
178 
179 void
init_flag_table_lisp()180 init_flag_table_lisp ()
181 {
182   xgettext_record_flag ("gettext:1:pass-lisp-format");
183   xgettext_record_flag ("ngettext:1:pass-lisp-format");
184   xgettext_record_flag ("ngettext:2:pass-lisp-format");
185   xgettext_record_flag ("gettext-noop:1:pass-lisp-format");
186   xgettext_record_flag ("format:2:lisp-format");
187 }
188 
189 
190 /* ======================== Reading of characters.  ======================== */
191 
192 /* Real filename, used in error messages about the input file.  */
193 static const char *real_file_name;
194 
195 /* Logical filename and line number, used to label the extracted messages.  */
196 static char *logical_file_name;
197 static int line_number;
198 
199 /* The input file stream.  */
200 static FILE *fp;
201 
202 
203 /* Fetch the next character from the input file.  */
204 static int
do_getc()205 do_getc ()
206 {
207   int c = getc (fp);
208 
209   if (c == EOF)
210     {
211       if (ferror (fp))
212 	error (EXIT_FAILURE, errno, _("\
213 error while reading \"%s\""), real_file_name);
214     }
215   else if (c == '\n')
216    line_number++;
217 
218   return c;
219 }
220 
221 /* Put back the last fetched character, not EOF.  */
222 static void
do_ungetc(int c)223 do_ungetc (int c)
224 {
225   if (c == '\n')
226     line_number--;
227   ungetc (c, fp);
228 }
229 
230 
231 /* ========= Reading of tokens.  See CLHS 2.2 "Reader Algorithm".  ========= */
232 
233 
234 /* Syntax code.  See CLHS 2.1.4 "Character Syntax Types".  */
235 
236 enum syntax_code
237 {
238   syntax_illegal,	/* non-printable, except whitespace	*/
239   syntax_single_esc,	/* '\' (single escape)			*/
240   syntax_multi_esc,	/* '|' (multiple escape)		*/
241   syntax_constituent,	/* everything else (constituent)	*/
242   syntax_whitespace,	/* TAB,LF,FF,CR,' ' (whitespace)	*/
243   syntax_eof,		/* EOF					*/
244   syntax_t_macro,	/* '()'"' (terminating macro)		*/
245   syntax_nt_macro	/* '#' (non-terminating macro)		*/
246 };
247 
248 /* Returns the syntax code of a character.  */
249 static enum syntax_code
syntax_code_of(unsigned char c)250 syntax_code_of (unsigned char c)
251 {
252   switch (c)
253     {
254     case '\\':
255       return syntax_single_esc;
256     case '|':
257       return syntax_multi_esc;
258     case '\t': case '\n': case '\f': case '\r': case ' ':
259       return syntax_whitespace;
260     case '(': case ')': case '\'': case '"': case ',': case ';': case '`':
261       return syntax_t_macro;
262     case '#':
263       return syntax_nt_macro;
264     default:
265       if (c < ' ' && c != '\b')
266 	return syntax_illegal;
267       else
268 	return syntax_constituent;
269     }
270 }
271 
272 struct char_syntax
273 {
274   int ch;			/* character */
275   enum syntax_code scode;	/* syntax code */
276 };
277 
278 /* Returns the next character and its syntax code.  */
279 static void
read_char_syntax(struct char_syntax * p)280 read_char_syntax (struct char_syntax *p)
281 {
282   int c = do_getc ();
283 
284   p->ch = c;
285   p->scode = (c == EOF ? syntax_eof : syntax_code_of (c));
286 }
287 
288 /* Every character in a token has an attribute assigned.  The attributes
289    help during interpretation of the token.  See
290    CLHS 2.3 "Interpretation of Tokens" for the possible interpretations,
291    and CLHS 2.1.4.2 "Constituent Traits".  */
292 
293 enum attribute
294 {
295   a_illg,	/* invalid constituent */
296   a_pack_m,	/* ':' package marker */
297   a_alpha,	/* normal alphabetic */
298   a_escaped,	/* alphabetic but not subject to case conversion */
299   a_ratio,	/* '/' */
300   a_dot,	/* '.' */
301   a_sign,	/* '+-' */
302   a_extens,	/* '_^' extension characters */
303   a_digit,	/* '0123456789' */
304   a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */
305   a_expodigit,	/* 'esfdlESFDL' below base */
306   a_letter,	/* 'A'-'Z','a'-'z', except 'esfdlESFDL' */
307   a_expo	/* 'esfdlESFDL' */
308 };
309 
310 #define is_letter_attribute(a) ((a) >= a_letter)
311 #define is_number_attribute(a) ((a) >= a_ratio)
312 
313 /* Returns the attribute of a character, assuming base 10.  */
314 static enum attribute
attribute_of(unsigned char c)315 attribute_of (unsigned char c)
316 {
317   switch (c)
318     {
319     case ':':
320       return a_pack_m;
321     case '/':
322       return a_ratio;
323     case '.':
324       return a_dot;
325     case '+': case '-':
326       return a_sign;
327     case '_': case '^':
328       return a_extens;
329     case '0': case '1': case '2': case '3': case '4':
330     case '5': case '6': case '7': case '8': case '9':
331       return a_digit;
332     case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j':
333     case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
334     case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
335     case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J':
336     case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
337     case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
338       return a_letter;
339     case 'e': case 's': case 'd': case 'f': case 'l':
340     case 'E': case 'S': case 'D': case 'F': case 'L':
341       return a_expo;
342     default:
343       /* Treat everything as valid.  Never return a_illg.  */
344       return a_alpha;
345     }
346 }
347 
348 struct token_char
349 {
350   unsigned char ch;		/* character */
351   unsigned char attribute;	/* attribute */
352 };
353 
354 /* A token consists of a sequence of characters with associated attribute.  */
355 struct token
356 {
357   int allocated;		/* number of allocated 'token_char's */
358   int charcount;		/* number of used 'token_char's */
359   struct token_char *chars;	/* the token's constituents */
360   bool with_escape;		/* whether single-escape or multiple escape occurs */
361 };
362 
363 /* Initialize a 'struct token'.  */
364 static inline void
init_token(struct token * tp)365 init_token (struct token *tp)
366 {
367   tp->allocated = 10;
368   tp->chars =
369     (struct token_char *) xmalloc (tp->allocated * sizeof (struct token_char));
370   tp->charcount = 0;
371 }
372 
373 /* Free the memory pointed to by a 'struct token'.  */
374 static inline void
free_token(struct token * tp)375 free_token (struct token *tp)
376 {
377   free (tp->chars);
378 }
379 
380 /* Ensure there is enough room in the token for one more character.  */
381 static inline void
grow_token(struct token * tp)382 grow_token (struct token *tp)
383 {
384   if (tp->charcount == tp->allocated)
385     {
386       tp->allocated *= 2;
387       tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char));
388     }
389 }
390 
391 /* Read the next token.  If 'first' is given, it points to the first
392    character, which has already been read.
393    The algorithm follows CLHS 2.2 "Reader Algorithm".  */
394 static void
read_token(struct token * tp,const struct char_syntax * first)395 read_token (struct token *tp, const struct char_syntax *first)
396 {
397   bool multiple_escape_flag;
398   struct char_syntax curr;
399 
400   init_token (tp);
401   tp->with_escape = false;
402 
403   multiple_escape_flag = false;
404   if (first)
405     curr = *first;
406   else
407     read_char_syntax (&curr);
408 
409   for (;; read_char_syntax (&curr))
410     {
411       switch (curr.scode)
412 	{
413 	case syntax_illegal:
414 	  /* Invalid input.  Be tolerant, no error message.  */
415 	  do_ungetc (curr.ch);
416 	  return;
417 
418 	case syntax_single_esc:
419 	  tp->with_escape = true;
420 	  read_char_syntax (&curr);
421 	  if (curr.scode == syntax_eof)
422 	    /* Invalid input.  Be tolerant, no error message.  */
423 	    return;
424 	  grow_token (tp);
425 	  tp->chars[tp->charcount].ch = curr.ch;
426 	  tp->chars[tp->charcount].attribute = a_escaped;
427 	  tp->charcount++;
428 	  break;
429 
430 	case syntax_multi_esc:
431 	  multiple_escape_flag = !multiple_escape_flag;
432 	  tp->with_escape = true;
433 	  break;
434 
435 	case syntax_constituent:
436 	case syntax_nt_macro:
437 	  grow_token (tp);
438 	  if (multiple_escape_flag)
439 	    {
440 	      tp->chars[tp->charcount].ch = curr.ch;
441 	      tp->chars[tp->charcount].attribute = a_escaped;
442 	      tp->charcount++;
443 	    }
444 	  else
445 	    {
446 	      tp->chars[tp->charcount].ch = curr.ch;
447 	      tp->chars[tp->charcount].attribute = attribute_of (curr.ch);
448 	      tp->charcount++;
449 	    }
450 	  break;
451 
452 	case syntax_whitespace:
453 	case syntax_t_macro:
454 	  if (multiple_escape_flag)
455 	    {
456 	      grow_token (tp);
457 	      tp->chars[tp->charcount].ch = curr.ch;
458 	      tp->chars[tp->charcount].attribute = a_escaped;
459 	      tp->charcount++;
460 	    }
461 	  else
462 	    {
463 	      if (curr.scode != syntax_whitespace || read_preserve_whitespace)
464 		do_ungetc (curr.ch);
465 	      return;
466 	    }
467 	  break;
468 
469 	case syntax_eof:
470 	  if (multiple_escape_flag)
471 	    /* Invalid input.  Be tolerant, no error message.  */
472 	    ;
473 	  return;
474 	}
475     }
476 }
477 
478 /* A potential number is a token which
479    1. consists only of digits, '+','-','/','^','_','.' and number markers.
480       The base for digits is context dependent, but always 10 if a dot '.'
481       occurs. A number marker is a non-digit letter which is not adjacent
482       to a non-digit letter.
483    2. has at least one digit.
484    3. starts with a digit, '+','-','.','^' or '_'.
485    4. does not end with '+' or '-'.
486    See CLHS 2.3.1.1 "Potential Numbers as Tokens".
487  */
488 
489 static inline bool
has_a_dot(const struct token * tp)490 has_a_dot (const struct token *tp)
491 {
492   int n = tp->charcount;
493   int i;
494 
495   for (i = 0; i < n; i++)
496     if (tp->chars[i].attribute == a_dot)
497       return true;
498   return false;
499 }
500 
501 static inline bool
all_a_number(const struct token * tp)502 all_a_number (const struct token *tp)
503 {
504   int n = tp->charcount;
505   int i;
506 
507   for (i = 0; i < n; i++)
508     if (!is_number_attribute (tp->chars[i].attribute))
509       return false;
510   return true;
511 }
512 
513 static inline void
a_letter_to_digit(const struct token * tp,int base)514 a_letter_to_digit (const struct token *tp, int base)
515 {
516   int n = tp->charcount;
517   int i;
518 
519   for (i = 0; i < n; i++)
520     if (is_letter_attribute (tp->chars[i].attribute))
521       {
522 	int c = tp->chars[i].ch;
523 
524 	if (c >= 'a')
525 	  c -= 'a' - 'A';
526 	if (c - 'A' + 10 < base)
527 	  tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit,
528 					  a_expo -> a_expodigit */
529       }
530 }
531 
532 static inline bool
has_a_digit(const struct token * tp)533 has_a_digit (const struct token *tp)
534 {
535   int n = tp->charcount;
536   int i;
537 
538   for (i = 0; i < n; i++)
539     if (tp->chars[i].attribute == a_digit
540 	|| tp->chars[i].attribute == a_letterdigit
541 	|| tp->chars[i].attribute == a_expodigit)
542       return true;
543   return false;
544 }
545 
546 static inline bool
has_adjacent_letters(const struct token * tp)547 has_adjacent_letters (const struct token *tp)
548 {
549   int n = tp->charcount;
550   int i;
551 
552   for (i = 1; i < n; i++)
553     if (is_letter_attribute (tp->chars[i-1].attribute)
554 	&& is_letter_attribute (tp->chars[i].attribute))
555       return true;
556   return false;
557 }
558 
559 static bool
is_potential_number(const struct token * tp,int * basep)560 is_potential_number (const struct token *tp, int *basep)
561 {
562   /* CLHS 2.3.1.1.1:
563      "A potential number cannot contain any escape characters."  */
564   if (tp->with_escape)
565     return false;
566 
567   if (has_a_dot (tp))
568     *basep = 10;
569 
570   if (!all_a_number (tp))
571     return false;
572 
573   a_letter_to_digit (tp, *basep);
574 
575   if (!has_a_digit (tp))
576     return false;
577 
578   if (has_adjacent_letters (tp))
579     return false;
580 
581   if (!(tp->chars[0].attribute >= a_dot
582 	&& tp->chars[0].attribute <= a_expodigit))
583     return false;
584 
585   if (tp->chars[tp->charcount - 1].attribute == a_sign)
586     return false;
587 
588   return true;
589 }
590 
591 /* A number is one of integer, ratio, float.  Each has a particular syntax.
592    See CLHS 2.3.1 "Numbers as Tokens".
593    But note a mistake: The exponent rule should read:
594        exponent ::= exponent-marker [sign] {decimal-digit}+
595    (see 22.1.3.1.3 "Printing Floats").  */
596 
597 enum number_type
598 {
599   n_none,
600   n_integer,
601   n_ratio,
602   n_float
603 };
604 
605 static enum number_type
is_number(const struct token * tp,int * basep)606 is_number (const struct token *tp, int *basep)
607 {
608   struct token_char *ptr_limit;
609   struct token_char *ptr1;
610 
611   if (!is_potential_number (tp, basep))
612     return n_none;
613 
614   /* is_potential_number guarantees
615      - all attributes are >= a_ratio,
616      - there is at least one a_digit or a_letterdigit or a_expodigit, and
617      - if there is an a_dot, then *basep = 10.  */
618 
619   ptr1 = &tp->chars[0];
620   ptr_limit = &tp->chars[tp->charcount];
621 
622   if (ptr1->attribute == a_sign)
623     ptr1++;
624 
625   /* Test for syntax
626    * { a_sign | }
627    * { a_digit < base }+ { a_ratio { a_digit < base }+ | }
628    */
629   {
630     bool seen_a_ratio = false;
631     bool seen_a_digit = false;	/* seen a digit in last digit block? */
632     struct token_char *ptr;
633 
634     for (ptr = ptr1;; ptr++)
635       {
636 	if (ptr >= ptr_limit)
637 	  {
638 	    if (!seen_a_digit)
639 	      break;
640 	    if (seen_a_ratio)
641 	      return n_ratio;
642 	    else
643 	      return n_integer;
644 	  }
645 	if (ptr->attribute == a_digit
646 	    || ptr->attribute == a_letterdigit
647 	    || ptr->attribute == a_expodigit)
648 	  {
649 	    int c = ptr->ch;
650 
651 	    c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10);
652 	    if (c >= *basep)
653 	      break;
654 	    seen_a_digit = true;
655 	  }
656 	else if (ptr->attribute == a_ratio)
657 	  {
658 	    if (seen_a_ratio || !seen_a_digit)
659 	      break;
660 	    seen_a_ratio = true;
661 	    seen_a_digit = false;
662 	  }
663 	else
664 	  break;
665       }
666   }
667 
668   /* Test for syntax
669    * { a_sign | }
670    * { a_digit }* { a_dot { a_digit }* | }
671    * { a_expo { a_sign | } { a_digit }+ | }
672    *
673    * If there is an exponent part, there must be digits before the dot or
674    * after the dot. The result is a float.
675    * If there is no exponen:
676    *   If there is no dot, it would an integer in base 10, but is has already
677    *   been verified to not be an integer in the current base.
678    *   If there is a dot:
679    *     If there are digits after the dot, it's a float.
680    *     Otherwise, if there are digits before the dot, it's an integer.
681    */
682   *basep = 10;
683   {
684     bool seen_a_dot = false;
685     bool seen_a_dot_with_leading_digits = false;
686     bool seen_a_digit = false;	/* seen a digit in last digit block? */
687     struct token_char *ptr;
688 
689     for (ptr = ptr1;; ptr++)
690       {
691 	if (ptr >= ptr_limit)
692 	  {
693 	    /* no exponent */
694 	    if (!seen_a_dot)
695 	      return n_none;
696 	    if (seen_a_digit)
697 	      return n_float;
698 	    if (seen_a_dot_with_leading_digits)
699 	      return n_integer;
700 	    else
701 	      return n_none;
702 	  }
703 	if (ptr->attribute == a_digit)
704 	  {
705 	    seen_a_digit = true;
706 	  }
707 	else if (ptr->attribute == a_dot)
708 	  {
709 	    if (seen_a_dot)
710 	      return n_none;
711 	    seen_a_dot = true;
712 	    if (seen_a_digit)
713 	      seen_a_dot_with_leading_digits = true;
714 	    seen_a_digit = false;
715 	  }
716 	else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit)
717 	  break;
718 	else
719 	  return n_none;
720       }
721     ptr++;
722     if (!seen_a_dot_with_leading_digits || !seen_a_digit)
723       return n_none;
724     if (ptr >= ptr_limit)
725       return n_none;
726     if (ptr->attribute == a_sign)
727       ptr++;
728     seen_a_digit = false;
729     for (;; ptr++)
730       {
731 	if (ptr >= ptr_limit)
732 	  break;
733 	if (ptr->attribute != a_digit)
734 	  return n_none;
735 	seen_a_digit = true;
736       }
737     if (!seen_a_digit)
738       return n_none;
739     return n_float;
740   }
741 }
742 
743 /* A token representing a symbol must be case converted.
744    For portability, we convert only ASCII characters here.  */
745 
746 static void
upcase_token(struct token * tp)747 upcase_token (struct token *tp)
748 {
749   int n = tp->charcount;
750   int i;
751 
752   for (i = 0; i < n; i++)
753     if (tp->chars[i].attribute != a_escaped)
754       {
755 	unsigned char c = tp->chars[i].ch;
756 	if (c >= 'a' && c <= 'z')
757 	  tp->chars[i].ch = c - 'a' + 'A';
758       }
759 }
760 
761 static void
downcase_token(struct token * tp)762 downcase_token (struct token *tp)
763 {
764   int n = tp->charcount;
765   int i;
766 
767   for (i = 0; i < n; i++)
768     if (tp->chars[i].attribute != a_escaped)
769       {
770 	unsigned char c = tp->chars[i].ch;
771 	if (c >= 'A' && c <= 'Z')
772 	  tp->chars[i].ch = c - 'A' + 'a';
773       }
774 }
775 
776 static void
case_convert_token(struct token * tp)777 case_convert_token (struct token *tp)
778 {
779   int n = tp->charcount;
780   int i;
781 
782   switch (readtable_case)
783     {
784     case case_upcase:
785       upcase_token (tp);
786       break;
787 
788     case case_downcase:
789       downcase_token (tp);
790       break;
791 
792     case case_preserve:
793       break;
794 
795     case case_invert:
796       {
797 	bool seen_uppercase = false;
798 	bool seen_lowercase = false;
799 	for (i = 0; i < n; i++)
800 	  if (tp->chars[i].attribute != a_escaped)
801 	    {
802 	      unsigned char c = tp->chars[i].ch;
803 	      if (c >= 'a' && c <= 'z')
804 		seen_lowercase = true;
805 	      if (c >= 'A' && c <= 'Z')
806 		seen_uppercase = true;
807 	    }
808 	if (seen_uppercase)
809 	  {
810 	    if (!seen_lowercase)
811 	      downcase_token (tp);
812 	  }
813 	else
814 	  {
815 	    if (seen_lowercase)
816 	      upcase_token (tp);
817 	  }
818       }
819       break;
820     }
821 }
822 
823 
824 /* ========================= Accumulating comments ========================= */
825 
826 
827 static char *buffer;
828 static size_t bufmax;
829 static size_t buflen;
830 
831 static inline void
comment_start()832 comment_start ()
833 {
834   buflen = 0;
835 }
836 
837 static inline void
comment_add(int c)838 comment_add (int c)
839 {
840   if (buflen >= bufmax)
841     {
842       bufmax = 2 * bufmax + 10;
843       buffer = xrealloc (buffer, bufmax);
844     }
845   buffer[buflen++] = c;
846 }
847 
848 static inline void
comment_line_end(size_t chars_to_remove)849 comment_line_end (size_t chars_to_remove)
850 {
851   buflen -= chars_to_remove;
852   while (buflen >= 1
853 	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
854     --buflen;
855   if (chars_to_remove == 0 && buflen >= bufmax)
856     {
857       bufmax = 2 * bufmax + 10;
858       buffer = xrealloc (buffer, bufmax);
859     }
860   buffer[buflen] = '\0';
861   savable_comment_add (buffer);
862 }
863 
864 
865 /* These are for tracking whether comments count as immediately before
866    keyword.  */
867 static int last_comment_line;
868 static int last_non_comment_line;
869 
870 
871 /* ========================= Accumulating messages ========================= */
872 
873 
874 static message_list_ty *mlp;
875 
876 
877 /* ============== Reading of objects.  See CLHS 2 "Syntax".  ============== */
878 
879 
880 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
881    Other objects need not to be represented precisely.  */
882 enum object_type
883 {
884   t_symbol,	/* symbol */
885   t_string,	/* string */
886   t_other,	/* other kind of real object */
887   t_dot,	/* '.' pseudo object */
888   t_close,	/* ')' pseudo object */
889   t_eof		/* EOF marker */
890 };
891 
892 struct object
893 {
894   enum object_type type;
895   struct token *token;		/* for t_symbol and t_string */
896   int line_number_at_start;	/* for t_string */
897 };
898 
899 /* Free the memory pointed to by a 'struct object'.  */
900 static inline void
free_object(struct object * op)901 free_object (struct object *op)
902 {
903   if (op->type == t_symbol || op->type == t_string)
904     {
905       free_token (op->token);
906       free (op->token);
907     }
908 }
909 
910 /* Convert a t_symbol/t_string token to a char*.  */
911 static char *
string_of_object(const struct object * op)912 string_of_object (const struct object *op)
913 {
914   char *str;
915   const struct token_char *p;
916   char *q;
917   int n;
918 
919   if (!(op->type == t_symbol || op->type == t_string))
920     abort ();
921   n = op->token->charcount;
922   str = (char *) xmalloc (n + 1);
923   q = str;
924   for (p = op->token->chars; n > 0; p++, n--)
925     *q++ = p->ch;
926   *q = '\0';
927   return str;
928 }
929 
930 /* Context lookup table.  */
931 static flag_context_list_table_ty *flag_context_list_table;
932 
933 /* Read the next object.  */
934 static void
read_object(struct object * op,flag_context_ty outer_context)935 read_object (struct object *op, flag_context_ty outer_context)
936 {
937   for (;;)
938     {
939       struct char_syntax curr;
940 
941       read_char_syntax (&curr);
942 
943       switch (curr.scode)
944 	{
945 	case syntax_eof:
946 	  op->type = t_eof;
947 	  return;
948 
949 	case syntax_whitespace:
950 	  if (curr.ch == '\n')
951 	    /* Comments assumed to be grouped with a message must immediately
952 	       precede it, with no non-whitespace token on a line between
953 	       both.  */
954 	    if (last_non_comment_line > last_comment_line)
955 	      savable_comment_reset ();
956 	  continue;
957 
958 	case syntax_illegal:
959 	  op->type = t_other;
960 	  return;
961 
962 	case syntax_single_esc:
963 	case syntax_multi_esc:
964 	case syntax_constituent:
965 	  /* Start reading a token.  */
966 	  op->token = (struct token *) xmalloc (sizeof (struct token));
967 	  read_token (op->token, &curr);
968 	  last_non_comment_line = line_number;
969 
970 	  /* Interpret the token.  */
971 
972 	  /* Dots.  */
973 	  if (!op->token->with_escape
974 	      && op->token->charcount == 1
975 	      && op->token->chars[0].attribute == a_dot)
976 	    {
977 	      free_token (op->token);
978 	      free (op->token);
979 	      op->type = t_dot;
980 	      return;
981 	    }
982 	  /* Tokens consisting entirely of dots are illegal, but be tolerant
983 	     here.  */
984 
985 	  /* Number.  */
986 	  {
987 	    int base = read_base;
988 
989 	    if (is_number (op->token, &base) != n_none)
990 	      {
991 		free_token (op->token);
992 		free (op->token);
993 		op->type = t_other;
994 		return;
995 	      }
996 	  }
997 
998 	  /* We interpret all other tokens as symbols (including 'reserved
999 	     tokens', i.e. potential numbers which are not numbers).  */
1000 	  case_convert_token (op->token);
1001 	  op->type = t_symbol;
1002 	  return;
1003 
1004 	case syntax_t_macro:
1005 	case syntax_nt_macro:
1006 	  /* Read a macro.  */
1007 	  switch (curr.ch)
1008 	    {
1009 	    case '(':
1010 	      {
1011 		int arg = 0;		/* Current argument number.  */
1012 		flag_context_list_iterator_ty context_iter;
1013 		const struct callshapes *shapes = NULL;
1014 		struct arglist_parser *argparser = NULL;
1015 
1016 		for (;; arg++)
1017 		  {
1018 		    struct object inner;
1019 		    flag_context_ty inner_context;
1020 
1021 		    if (arg == 0)
1022 		      inner_context = null_context;
1023 		    else
1024 		      inner_context =
1025 			inherited_context (outer_context,
1026 					   flag_context_list_iterator_advance (
1027 					     &context_iter));
1028 
1029 		    read_object (&inner, inner_context);
1030 
1031 		    /* Recognize end of list.  */
1032 		    if (inner.type == t_close)
1033 		      {
1034 			op->type = t_other;
1035 			/* Don't bother converting "()" to "NIL".  */
1036 			last_non_comment_line = line_number;
1037 			if (argparser != NULL)
1038 			  arglist_parser_done (argparser, arg);
1039 			return;
1040 		      }
1041 
1042 		    /* Dots are not allowed in every position.
1043 		       But be tolerant.  */
1044 
1045 		    /* EOF inside list is illegal.
1046 		       But be tolerant.  */
1047 		    if (inner.type == t_eof)
1048 		      break;
1049 
1050 		    if (arg == 0)
1051 		      {
1052 			/* This is the function position.  */
1053 			if (inner.type == t_symbol)
1054 			  {
1055 			    char *symbol_name = string_of_object (&inner);
1056 			    int i;
1057 			    int prefix_len;
1058 			    void *keyword_value;
1059 
1060 			    /* Omit any package name.  */
1061 			    i = inner.token->charcount;
1062 			    while (i > 0
1063 				   && inner.token->chars[i-1].attribute != a_pack_m)
1064 			      i--;
1065 			    prefix_len = i;
1066 
1067 			    if (hash_find_entry (&keywords,
1068 						 symbol_name + prefix_len,
1069 						 strlen (symbol_name + prefix_len),
1070 						 &keyword_value)
1071 				== 0)
1072 			      shapes = (const struct callshapes *) keyword_value;
1073 
1074 			    argparser = arglist_parser_alloc (mlp, shapes);
1075 
1076 			    context_iter =
1077 			      flag_context_list_iterator (
1078 				flag_context_list_table_lookup (
1079 				  flag_context_list_table,
1080 				  symbol_name, strlen (symbol_name)));
1081 
1082 			    free (symbol_name);
1083 			  }
1084 			else
1085 			  context_iter = null_context_list_iterator;
1086 		      }
1087 		    else
1088 		      {
1089 			/* These are the argument positions.  */
1090 			if (argparser != NULL && inner.type == t_string)
1091 			  arglist_parser_remember (argparser, arg,
1092 						   string_of_object (&inner),
1093 						   inner_context,
1094 						   logical_file_name,
1095 						   inner.line_number_at_start,
1096 						   savable_comment);
1097 		      }
1098 
1099 		    free_object (&inner);
1100 		  }
1101 
1102 		if (argparser != NULL)
1103 		  arglist_parser_done (argparser, arg);
1104 	      }
1105 	      op->type = t_other;
1106 	      last_non_comment_line = line_number;
1107 	      return;
1108 
1109 	    case ')':
1110 	      /* Tell the caller about the end of list.
1111 		 Unmatched closing parenthesis is illegal.
1112 		 But be tolerant.  */
1113 	      op->type = t_close;
1114 	      last_non_comment_line = line_number;
1115 	      return;
1116 
1117 	    case ',':
1118 	      {
1119 		int c = do_getc ();
1120 		/* The ,@ handling inside lists is wrong anyway, because
1121 		   ,@form expands to an unknown number of elements.  */
1122 		if (c != EOF && c != '@' && c != '.')
1123 		  do_ungetc (c);
1124 	      }
1125 	      /*FALLTHROUGH*/
1126 	    case '\'':
1127 	    case '`':
1128 	      {
1129 		struct object inner;
1130 
1131 		read_object (&inner, null_context);
1132 
1133 		/* Dots and EOF are not allowed here.  But be tolerant.  */
1134 
1135 		free_object (&inner);
1136 
1137 		op->type = t_other;
1138 		last_non_comment_line = line_number;
1139 		return;
1140 	      }
1141 
1142 	    case ';':
1143 	      {
1144 		bool all_semicolons = true;
1145 
1146 		last_comment_line = line_number;
1147 		comment_start ();
1148 		for (;;)
1149 		  {
1150 		    int c = do_getc ();
1151 		    if (c == EOF || c == '\n')
1152 		      break;
1153 		    if (c != ';')
1154 		      all_semicolons = false;
1155 		    if (!all_semicolons)
1156 		      {
1157 			/* We skip all leading white space, but not EOLs.  */
1158 			if (!(buflen == 0 && (c == ' ' || c == '\t')))
1159 			  comment_add (c);
1160 		      }
1161 		  }
1162 		comment_line_end (0);
1163 		continue;
1164 	      }
1165 
1166 	    case '"':
1167 	      {
1168 		op->token = (struct token *) xmalloc (sizeof (struct token));
1169 		init_token (op->token);
1170 		op->line_number_at_start = line_number;
1171 		for (;;)
1172 		  {
1173 		    int c = do_getc ();
1174 		    if (c == EOF)
1175 		      /* Invalid input.  Be tolerant, no error message.  */
1176 		      break;
1177 		    if (c == '"')
1178 		      break;
1179 		    if (c == '\\') /* syntax_single_esc */
1180 		      {
1181 			c = do_getc ();
1182 			if (c == EOF)
1183 			  /* Invalid input.  Be tolerant, no error message.  */
1184 			  break;
1185 		      }
1186 		    grow_token (op->token);
1187 		    op->token->chars[op->token->charcount++].ch = c;
1188 		  }
1189 		op->type = t_string;
1190 
1191 		if (extract_all)
1192 		  {
1193 		    lex_pos_ty pos;
1194 
1195 		    pos.file_name = logical_file_name;
1196 		    pos.line_number = op->line_number_at_start;
1197 		    remember_a_message (mlp, NULL, string_of_object (op),
1198 					null_context, &pos, savable_comment);
1199 		  }
1200 		last_non_comment_line = line_number;
1201 		return;
1202 	      }
1203 
1204 	    case '#':
1205 	      /* Dispatch macro handling.  */
1206 	      {
1207 		int c;
1208 
1209 		for (;;)
1210 		  {
1211 		    c = do_getc ();
1212 		    if (c == EOF)
1213 		      /* Invalid input.  Be tolerant, no error message.  */
1214 		      {
1215 			op->type = t_other;
1216 			return;
1217 		      }
1218 		    if (!(c >= '0' && c <= '9'))
1219 		      break;
1220 		  }
1221 
1222 		switch (c)
1223 		  {
1224 		  case '(':
1225 		  case '"':
1226 		    do_ungetc (c);
1227 		    /*FALLTHROUGH*/
1228 		  case '\'':
1229 		  case ':':
1230 		  case '.':
1231 		  case ',':
1232 		  case 'A': case 'a':
1233 		  case 'C': case 'c':
1234 		  case 'P': case 'p':
1235 		  case 'S': case 's':
1236 		    {
1237 		      struct object inner;
1238 		      read_object (&inner, null_context);
1239 		      /* Dots and EOF are not allowed here.
1240 			 But be tolerant.  */
1241 		      free_object (&inner);
1242 		      op->type = t_other;
1243 		      last_non_comment_line = line_number;
1244 		      return;
1245 		    }
1246 
1247 		  case '|':
1248 		    {
1249 		      int depth = 0;
1250 		      int c;
1251 
1252 		      comment_start ();
1253 		      c = do_getc ();
1254 		      for (;;)
1255 			{
1256 			  if (c == EOF)
1257 			    break;
1258 			  if (c == '|')
1259 			    {
1260 			      c = do_getc ();
1261 			      if (c == EOF)
1262 				break;
1263 			      if (c == '#')
1264 				{
1265 				  if (depth == 0)
1266 				    {
1267 				      comment_line_end (0);
1268 				      break;
1269 				    }
1270 				  depth--;
1271 				  comment_add ('|');
1272 				  comment_add ('#');
1273 				  c = do_getc ();
1274 				}
1275 			      else
1276 				comment_add ('|');
1277 			    }
1278 			  else if (c == '#')
1279 			    {
1280 			      c = do_getc ();
1281 			      if (c == EOF)
1282 				break;
1283 			      comment_add ('#');
1284 			      if (c == '|')
1285 				{
1286 				  depth++;
1287 				  comment_add ('|');
1288 				  c = do_getc ();
1289 				}
1290 			    }
1291 			  else
1292 			    {
1293 			      /* We skip all leading white space.  */
1294 			      if (!(buflen == 0 && (c == ' ' || c == '\t')))
1295 				comment_add (c);
1296 			      if (c == '\n')
1297 				{
1298 				  comment_line_end (1);
1299 				  comment_start ();
1300 				}
1301 			      c = do_getc ();
1302 			    }
1303 			}
1304 		      if (c == EOF)
1305 			{
1306 			  /* EOF not allowed here.  But be tolerant.  */
1307 			  op->type = t_eof;
1308 			  return;
1309 			}
1310 		      last_comment_line = line_number;
1311 		      continue;
1312 		    }
1313 
1314 		  case '\\':
1315 		    {
1316 		      struct token token;
1317 		      struct char_syntax first;
1318 		      first.ch = '\\';
1319 		      first.scode = syntax_single_esc;
1320 		      read_token (&token, &first);
1321 		      free_token (&token);
1322 		      op->type = t_other;
1323 		      last_non_comment_line = line_number;
1324 		      return;
1325 		    }
1326 
1327 		  case 'B': case 'b':
1328 		  case 'O': case 'o':
1329 		  case 'X': case 'x':
1330 		  case 'R': case 'r':
1331 		  case '*':
1332 		    {
1333 		      struct token token;
1334 		      read_token (&token, NULL);
1335 		      free_token (&token);
1336 		      op->type = t_other;
1337 		      last_non_comment_line = line_number;
1338 		      return;
1339 		    }
1340 
1341 		  case '=':
1342 		    /* Ignore read labels.  */
1343 		    continue;
1344 
1345 		  case '#':
1346 		    /* Don't bother looking up the corresponding object.  */
1347 		    op->type = t_other;
1348 		    last_non_comment_line = line_number;
1349 		    return;
1350 
1351 		  case '+':
1352 		  case '-':
1353 		    /* Simply assume every feature expression is true.  */
1354 		    {
1355 		      struct object inner;
1356 		      read_object (&inner, null_context);
1357 		      /* Dots and EOF are not allowed here.
1358 			 But be tolerant.  */
1359 		      free_object (&inner);
1360 		      continue;
1361 		    }
1362 
1363 		  default:
1364 		    op->type = t_other;
1365 		    last_non_comment_line = line_number;
1366 		    return;
1367 		  }
1368 		/*NOTREACHED*/
1369 		abort ();
1370 	      }
1371 
1372 	    default:
1373 	      /*NOTREACHED*/
1374 	      abort ();
1375 	    }
1376 
1377 	default:
1378 	  /*NOTREACHED*/
1379 	  abort ();
1380 	}
1381     }
1382 }
1383 
1384 
1385 void
extract_lisp(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1386 extract_lisp (FILE *f,
1387 	      const char *real_filename, const char *logical_filename,
1388 	      flag_context_list_table_ty *flag_table,
1389 	      msgdomain_list_ty *mdlp)
1390 {
1391   mlp = mdlp->item[0]->messages;
1392 
1393   fp = f;
1394   real_file_name = real_filename;
1395   logical_file_name = xstrdup (logical_filename);
1396   line_number = 1;
1397 
1398   last_comment_line = -1;
1399   last_non_comment_line = -1;
1400 
1401   flag_context_list_table = flag_table;
1402 
1403   init_keywords ();
1404 
1405   /* Eat tokens until eof is seen.  When read_object returns
1406      due to an unbalanced closing parenthesis, just restart it.  */
1407   do
1408     {
1409       struct object toplevel_object;
1410 
1411       read_object (&toplevel_object, null_context);
1412 
1413       if (toplevel_object.type == t_eof)
1414 	break;
1415 
1416       free_object (&toplevel_object);
1417     }
1418   while (!feof (fp));
1419 
1420   /* Close scanner.  */
1421   fp = NULL;
1422   real_file_name = NULL;
1423   logical_file_name = NULL;
1424   line_number = 0;
1425 }
1426