1 /* xgettext Lisp backend.
2 Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.
3
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-lisp.h"
33 #include "error.h"
34 #include "xalloc.h"
35 #include "exit.h"
36 #include "hash.h"
37 #include "gettext.h"
38
39 #define _(s) gettext(s)
40
41
42 /* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2.
43 Since we are interested only in strings and in forms similar to
44 (gettext msgid ...)
45 or (ngettext msgid msgid_plural ...)
46 we make the following simplifications:
47
48 - Assume the keywords and strings are in an ASCII compatible encoding.
49 This means we can read the input file one byte at a time, instead of
50 one character at a time. No need to worry about multibyte characters:
51 If they occur as part of identifiers, they most probably act as
52 constituent characters, and the byte based approach will do the same.
53
54 - Assume the read table is the standard Common Lisp read table.
55 Non-standard read tables are mostly used to read data, not programs.
56
57 - Assume the read table case is :UPCASE, and *READ-BASE* is 10.
58
59 - Don't interpret #n= and #n#, they usually don't appear in programs.
60
61 - Don't interpret #+, #-, they are unlikely to appear in a gettext form.
62
63 The remaining syntax rules are:
64
65 - The syntax code assigned to each character, and how tokens are built
66 up from characters (single escape, multiple escape etc.).
67
68 - Comment syntax: ';' and '#| ... |#'.
69
70 - String syntax: "..." with single escapes.
71
72 - Read macros and dispatch macro character '#'. Needed to be able to
73 tell which is the n-th argument of a function call.
74
75 */
76
77
78 /* ========================= Lexer customization. ========================= */
79
80 /* 'readtable_case' is the case conversion that is applied to non-escaped
81 parts of symbol tokens. In Common Lisp: (readtable-case *readtable*). */
82
83 enum rtcase
84 {
85 case_upcase,
86 case_downcase,
87 case_preserve,
88 case_invert
89 };
90
91 static enum rtcase readtable_case = case_upcase;
92
93 /* 'read_base' is the assumed radix of integers and rational numbers.
94 In Common Lisp: *read-base*. */
95 static int read_base = 10;
96
97 /* 'read_preserve_whitespace' specifies whether a whitespace character
98 that terminates a token must be pushed back on the input stream.
99 We set it to true, because the special newline side effect in read_object()
100 requires that read_object() sees every newline not inside a token. */
101 static bool read_preserve_whitespace = true;
102
103
104 /* ====================== Keyword set customization. ====================== */
105
106 /* If true extract all strings. */
107 static bool extract_all = false;
108
109 static hash_table keywords;
110 static bool default_keywords = true;
111
112
113 void
x_lisp_extract_all()114 x_lisp_extract_all ()
115 {
116 extract_all = true;
117 }
118
119
120 void
x_lisp_keyword(const char * name)121 x_lisp_keyword (const char *name)
122 {
123 if (name == NULL)
124 default_keywords = false;
125 else
126 {
127 const char *end;
128 struct callshape shape;
129 const char *colon;
130 size_t len;
131 char *symname;
132 size_t i;
133
134 if (keywords.table == NULL)
135 hash_init (&keywords, 100);
136
137 split_keywordspec (name, &end, &shape);
138
139 /* The characters between name and end should form a valid Lisp symbol.
140 Extract the symbol name part. */
141 colon = strchr (name, ':');
142 if (colon != NULL && colon < end)
143 {
144 name = colon + 1;
145 if (name < end && *name == ':')
146 name++;
147 colon = strchr (name, ':');
148 if (colon != NULL && colon < end)
149 return;
150 }
151
152 /* Uppercase it. */
153 len = end - name;
154 symname = (char *) xmalloc (len);
155 for (i = 0; i < len; i++)
156 symname[i] =
157 (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]);
158
159 insert_keyword_callshape (&keywords, symname, len, &shape);
160 }
161 }
162
163 /* Finish initializing the keywords hash table.
164 Called after argument processing, before each file is processed. */
165 static void
init_keywords()166 init_keywords ()
167 {
168 if (default_keywords)
169 {
170 /* When adding new keywords here, also update the documentation in
171 xgettext.texi! */
172 x_lisp_keyword ("gettext"); /* I18N:GETTEXT */
173 x_lisp_keyword ("ngettext:1,2"); /* I18N:NGETTEXT */
174 x_lisp_keyword ("gettext-noop");
175 default_keywords = false;
176 }
177 }
178
179 void
init_flag_table_lisp()180 init_flag_table_lisp ()
181 {
182 xgettext_record_flag ("gettext:1:pass-lisp-format");
183 xgettext_record_flag ("ngettext:1:pass-lisp-format");
184 xgettext_record_flag ("ngettext:2:pass-lisp-format");
185 xgettext_record_flag ("gettext-noop:1:pass-lisp-format");
186 xgettext_record_flag ("format:2:lisp-format");
187 }
188
189
190 /* ======================== Reading of characters. ======================== */
191
192 /* Real filename, used in error messages about the input file. */
193 static const char *real_file_name;
194
195 /* Logical filename and line number, used to label the extracted messages. */
196 static char *logical_file_name;
197 static int line_number;
198
199 /* The input file stream. */
200 static FILE *fp;
201
202
203 /* Fetch the next character from the input file. */
204 static int
do_getc()205 do_getc ()
206 {
207 int c = getc (fp);
208
209 if (c == EOF)
210 {
211 if (ferror (fp))
212 error (EXIT_FAILURE, errno, _("\
213 error while reading \"%s\""), real_file_name);
214 }
215 else if (c == '\n')
216 line_number++;
217
218 return c;
219 }
220
221 /* Put back the last fetched character, not EOF. */
222 static void
do_ungetc(int c)223 do_ungetc (int c)
224 {
225 if (c == '\n')
226 line_number--;
227 ungetc (c, fp);
228 }
229
230
231 /* ========= Reading of tokens. See CLHS 2.2 "Reader Algorithm". ========= */
232
233
234 /* Syntax code. See CLHS 2.1.4 "Character Syntax Types". */
235
236 enum syntax_code
237 {
238 syntax_illegal, /* non-printable, except whitespace */
239 syntax_single_esc, /* '\' (single escape) */
240 syntax_multi_esc, /* '|' (multiple escape) */
241 syntax_constituent, /* everything else (constituent) */
242 syntax_whitespace, /* TAB,LF,FF,CR,' ' (whitespace) */
243 syntax_eof, /* EOF */
244 syntax_t_macro, /* '()'"' (terminating macro) */
245 syntax_nt_macro /* '#' (non-terminating macro) */
246 };
247
248 /* Returns the syntax code of a character. */
249 static enum syntax_code
syntax_code_of(unsigned char c)250 syntax_code_of (unsigned char c)
251 {
252 switch (c)
253 {
254 case '\\':
255 return syntax_single_esc;
256 case '|':
257 return syntax_multi_esc;
258 case '\t': case '\n': case '\f': case '\r': case ' ':
259 return syntax_whitespace;
260 case '(': case ')': case '\'': case '"': case ',': case ';': case '`':
261 return syntax_t_macro;
262 case '#':
263 return syntax_nt_macro;
264 default:
265 if (c < ' ' && c != '\b')
266 return syntax_illegal;
267 else
268 return syntax_constituent;
269 }
270 }
271
272 struct char_syntax
273 {
274 int ch; /* character */
275 enum syntax_code scode; /* syntax code */
276 };
277
278 /* Returns the next character and its syntax code. */
279 static void
read_char_syntax(struct char_syntax * p)280 read_char_syntax (struct char_syntax *p)
281 {
282 int c = do_getc ();
283
284 p->ch = c;
285 p->scode = (c == EOF ? syntax_eof : syntax_code_of (c));
286 }
287
288 /* Every character in a token has an attribute assigned. The attributes
289 help during interpretation of the token. See
290 CLHS 2.3 "Interpretation of Tokens" for the possible interpretations,
291 and CLHS 2.1.4.2 "Constituent Traits". */
292
293 enum attribute
294 {
295 a_illg, /* invalid constituent */
296 a_pack_m, /* ':' package marker */
297 a_alpha, /* normal alphabetic */
298 a_escaped, /* alphabetic but not subject to case conversion */
299 a_ratio, /* '/' */
300 a_dot, /* '.' */
301 a_sign, /* '+-' */
302 a_extens, /* '_^' extension characters */
303 a_digit, /* '0123456789' */
304 a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */
305 a_expodigit, /* 'esfdlESFDL' below base */
306 a_letter, /* 'A'-'Z','a'-'z', except 'esfdlESFDL' */
307 a_expo /* 'esfdlESFDL' */
308 };
309
310 #define is_letter_attribute(a) ((a) >= a_letter)
311 #define is_number_attribute(a) ((a) >= a_ratio)
312
313 /* Returns the attribute of a character, assuming base 10. */
314 static enum attribute
attribute_of(unsigned char c)315 attribute_of (unsigned char c)
316 {
317 switch (c)
318 {
319 case ':':
320 return a_pack_m;
321 case '/':
322 return a_ratio;
323 case '.':
324 return a_dot;
325 case '+': case '-':
326 return a_sign;
327 case '_': case '^':
328 return a_extens;
329 case '0': case '1': case '2': case '3': case '4':
330 case '5': case '6': case '7': case '8': case '9':
331 return a_digit;
332 case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j':
333 case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
334 case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
335 case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J':
336 case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
337 case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
338 return a_letter;
339 case 'e': case 's': case 'd': case 'f': case 'l':
340 case 'E': case 'S': case 'D': case 'F': case 'L':
341 return a_expo;
342 default:
343 /* Treat everything as valid. Never return a_illg. */
344 return a_alpha;
345 }
346 }
347
348 struct token_char
349 {
350 unsigned char ch; /* character */
351 unsigned char attribute; /* attribute */
352 };
353
354 /* A token consists of a sequence of characters with associated attribute. */
355 struct token
356 {
357 int allocated; /* number of allocated 'token_char's */
358 int charcount; /* number of used 'token_char's */
359 struct token_char *chars; /* the token's constituents */
360 bool with_escape; /* whether single-escape or multiple escape occurs */
361 };
362
363 /* Initialize a 'struct token'. */
364 static inline void
init_token(struct token * tp)365 init_token (struct token *tp)
366 {
367 tp->allocated = 10;
368 tp->chars =
369 (struct token_char *) xmalloc (tp->allocated * sizeof (struct token_char));
370 tp->charcount = 0;
371 }
372
373 /* Free the memory pointed to by a 'struct token'. */
374 static inline void
free_token(struct token * tp)375 free_token (struct token *tp)
376 {
377 free (tp->chars);
378 }
379
380 /* Ensure there is enough room in the token for one more character. */
381 static inline void
grow_token(struct token * tp)382 grow_token (struct token *tp)
383 {
384 if (tp->charcount == tp->allocated)
385 {
386 tp->allocated *= 2;
387 tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char));
388 }
389 }
390
391 /* Read the next token. If 'first' is given, it points to the first
392 character, which has already been read.
393 The algorithm follows CLHS 2.2 "Reader Algorithm". */
394 static void
read_token(struct token * tp,const struct char_syntax * first)395 read_token (struct token *tp, const struct char_syntax *first)
396 {
397 bool multiple_escape_flag;
398 struct char_syntax curr;
399
400 init_token (tp);
401 tp->with_escape = false;
402
403 multiple_escape_flag = false;
404 if (first)
405 curr = *first;
406 else
407 read_char_syntax (&curr);
408
409 for (;; read_char_syntax (&curr))
410 {
411 switch (curr.scode)
412 {
413 case syntax_illegal:
414 /* Invalid input. Be tolerant, no error message. */
415 do_ungetc (curr.ch);
416 return;
417
418 case syntax_single_esc:
419 tp->with_escape = true;
420 read_char_syntax (&curr);
421 if (curr.scode == syntax_eof)
422 /* Invalid input. Be tolerant, no error message. */
423 return;
424 grow_token (tp);
425 tp->chars[tp->charcount].ch = curr.ch;
426 tp->chars[tp->charcount].attribute = a_escaped;
427 tp->charcount++;
428 break;
429
430 case syntax_multi_esc:
431 multiple_escape_flag = !multiple_escape_flag;
432 tp->with_escape = true;
433 break;
434
435 case syntax_constituent:
436 case syntax_nt_macro:
437 grow_token (tp);
438 if (multiple_escape_flag)
439 {
440 tp->chars[tp->charcount].ch = curr.ch;
441 tp->chars[tp->charcount].attribute = a_escaped;
442 tp->charcount++;
443 }
444 else
445 {
446 tp->chars[tp->charcount].ch = curr.ch;
447 tp->chars[tp->charcount].attribute = attribute_of (curr.ch);
448 tp->charcount++;
449 }
450 break;
451
452 case syntax_whitespace:
453 case syntax_t_macro:
454 if (multiple_escape_flag)
455 {
456 grow_token (tp);
457 tp->chars[tp->charcount].ch = curr.ch;
458 tp->chars[tp->charcount].attribute = a_escaped;
459 tp->charcount++;
460 }
461 else
462 {
463 if (curr.scode != syntax_whitespace || read_preserve_whitespace)
464 do_ungetc (curr.ch);
465 return;
466 }
467 break;
468
469 case syntax_eof:
470 if (multiple_escape_flag)
471 /* Invalid input. Be tolerant, no error message. */
472 ;
473 return;
474 }
475 }
476 }
477
478 /* A potential number is a token which
479 1. consists only of digits, '+','-','/','^','_','.' and number markers.
480 The base for digits is context dependent, but always 10 if a dot '.'
481 occurs. A number marker is a non-digit letter which is not adjacent
482 to a non-digit letter.
483 2. has at least one digit.
484 3. starts with a digit, '+','-','.','^' or '_'.
485 4. does not end with '+' or '-'.
486 See CLHS 2.3.1.1 "Potential Numbers as Tokens".
487 */
488
489 static inline bool
has_a_dot(const struct token * tp)490 has_a_dot (const struct token *tp)
491 {
492 int n = tp->charcount;
493 int i;
494
495 for (i = 0; i < n; i++)
496 if (tp->chars[i].attribute == a_dot)
497 return true;
498 return false;
499 }
500
501 static inline bool
all_a_number(const struct token * tp)502 all_a_number (const struct token *tp)
503 {
504 int n = tp->charcount;
505 int i;
506
507 for (i = 0; i < n; i++)
508 if (!is_number_attribute (tp->chars[i].attribute))
509 return false;
510 return true;
511 }
512
513 static inline void
a_letter_to_digit(const struct token * tp,int base)514 a_letter_to_digit (const struct token *tp, int base)
515 {
516 int n = tp->charcount;
517 int i;
518
519 for (i = 0; i < n; i++)
520 if (is_letter_attribute (tp->chars[i].attribute))
521 {
522 int c = tp->chars[i].ch;
523
524 if (c >= 'a')
525 c -= 'a' - 'A';
526 if (c - 'A' + 10 < base)
527 tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit,
528 a_expo -> a_expodigit */
529 }
530 }
531
532 static inline bool
has_a_digit(const struct token * tp)533 has_a_digit (const struct token *tp)
534 {
535 int n = tp->charcount;
536 int i;
537
538 for (i = 0; i < n; i++)
539 if (tp->chars[i].attribute == a_digit
540 || tp->chars[i].attribute == a_letterdigit
541 || tp->chars[i].attribute == a_expodigit)
542 return true;
543 return false;
544 }
545
546 static inline bool
has_adjacent_letters(const struct token * tp)547 has_adjacent_letters (const struct token *tp)
548 {
549 int n = tp->charcount;
550 int i;
551
552 for (i = 1; i < n; i++)
553 if (is_letter_attribute (tp->chars[i-1].attribute)
554 && is_letter_attribute (tp->chars[i].attribute))
555 return true;
556 return false;
557 }
558
559 static bool
is_potential_number(const struct token * tp,int * basep)560 is_potential_number (const struct token *tp, int *basep)
561 {
562 /* CLHS 2.3.1.1.1:
563 "A potential number cannot contain any escape characters." */
564 if (tp->with_escape)
565 return false;
566
567 if (has_a_dot (tp))
568 *basep = 10;
569
570 if (!all_a_number (tp))
571 return false;
572
573 a_letter_to_digit (tp, *basep);
574
575 if (!has_a_digit (tp))
576 return false;
577
578 if (has_adjacent_letters (tp))
579 return false;
580
581 if (!(tp->chars[0].attribute >= a_dot
582 && tp->chars[0].attribute <= a_expodigit))
583 return false;
584
585 if (tp->chars[tp->charcount - 1].attribute == a_sign)
586 return false;
587
588 return true;
589 }
590
591 /* A number is one of integer, ratio, float. Each has a particular syntax.
592 See CLHS 2.3.1 "Numbers as Tokens".
593 But note a mistake: The exponent rule should read:
594 exponent ::= exponent-marker [sign] {decimal-digit}+
595 (see 22.1.3.1.3 "Printing Floats"). */
596
597 enum number_type
598 {
599 n_none,
600 n_integer,
601 n_ratio,
602 n_float
603 };
604
605 static enum number_type
is_number(const struct token * tp,int * basep)606 is_number (const struct token *tp, int *basep)
607 {
608 struct token_char *ptr_limit;
609 struct token_char *ptr1;
610
611 if (!is_potential_number (tp, basep))
612 return n_none;
613
614 /* is_potential_number guarantees
615 - all attributes are >= a_ratio,
616 - there is at least one a_digit or a_letterdigit or a_expodigit, and
617 - if there is an a_dot, then *basep = 10. */
618
619 ptr1 = &tp->chars[0];
620 ptr_limit = &tp->chars[tp->charcount];
621
622 if (ptr1->attribute == a_sign)
623 ptr1++;
624
625 /* Test for syntax
626 * { a_sign | }
627 * { a_digit < base }+ { a_ratio { a_digit < base }+ | }
628 */
629 {
630 bool seen_a_ratio = false;
631 bool seen_a_digit = false; /* seen a digit in last digit block? */
632 struct token_char *ptr;
633
634 for (ptr = ptr1;; ptr++)
635 {
636 if (ptr >= ptr_limit)
637 {
638 if (!seen_a_digit)
639 break;
640 if (seen_a_ratio)
641 return n_ratio;
642 else
643 return n_integer;
644 }
645 if (ptr->attribute == a_digit
646 || ptr->attribute == a_letterdigit
647 || ptr->attribute == a_expodigit)
648 {
649 int c = ptr->ch;
650
651 c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10);
652 if (c >= *basep)
653 break;
654 seen_a_digit = true;
655 }
656 else if (ptr->attribute == a_ratio)
657 {
658 if (seen_a_ratio || !seen_a_digit)
659 break;
660 seen_a_ratio = true;
661 seen_a_digit = false;
662 }
663 else
664 break;
665 }
666 }
667
668 /* Test for syntax
669 * { a_sign | }
670 * { a_digit }* { a_dot { a_digit }* | }
671 * { a_expo { a_sign | } { a_digit }+ | }
672 *
673 * If there is an exponent part, there must be digits before the dot or
674 * after the dot. The result is a float.
675 * If there is no exponen:
676 * If there is no dot, it would an integer in base 10, but is has already
677 * been verified to not be an integer in the current base.
678 * If there is a dot:
679 * If there are digits after the dot, it's a float.
680 * Otherwise, if there are digits before the dot, it's an integer.
681 */
682 *basep = 10;
683 {
684 bool seen_a_dot = false;
685 bool seen_a_dot_with_leading_digits = false;
686 bool seen_a_digit = false; /* seen a digit in last digit block? */
687 struct token_char *ptr;
688
689 for (ptr = ptr1;; ptr++)
690 {
691 if (ptr >= ptr_limit)
692 {
693 /* no exponent */
694 if (!seen_a_dot)
695 return n_none;
696 if (seen_a_digit)
697 return n_float;
698 if (seen_a_dot_with_leading_digits)
699 return n_integer;
700 else
701 return n_none;
702 }
703 if (ptr->attribute == a_digit)
704 {
705 seen_a_digit = true;
706 }
707 else if (ptr->attribute == a_dot)
708 {
709 if (seen_a_dot)
710 return n_none;
711 seen_a_dot = true;
712 if (seen_a_digit)
713 seen_a_dot_with_leading_digits = true;
714 seen_a_digit = false;
715 }
716 else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit)
717 break;
718 else
719 return n_none;
720 }
721 ptr++;
722 if (!seen_a_dot_with_leading_digits || !seen_a_digit)
723 return n_none;
724 if (ptr >= ptr_limit)
725 return n_none;
726 if (ptr->attribute == a_sign)
727 ptr++;
728 seen_a_digit = false;
729 for (;; ptr++)
730 {
731 if (ptr >= ptr_limit)
732 break;
733 if (ptr->attribute != a_digit)
734 return n_none;
735 seen_a_digit = true;
736 }
737 if (!seen_a_digit)
738 return n_none;
739 return n_float;
740 }
741 }
742
743 /* A token representing a symbol must be case converted.
744 For portability, we convert only ASCII characters here. */
745
746 static void
upcase_token(struct token * tp)747 upcase_token (struct token *tp)
748 {
749 int n = tp->charcount;
750 int i;
751
752 for (i = 0; i < n; i++)
753 if (tp->chars[i].attribute != a_escaped)
754 {
755 unsigned char c = tp->chars[i].ch;
756 if (c >= 'a' && c <= 'z')
757 tp->chars[i].ch = c - 'a' + 'A';
758 }
759 }
760
761 static void
downcase_token(struct token * tp)762 downcase_token (struct token *tp)
763 {
764 int n = tp->charcount;
765 int i;
766
767 for (i = 0; i < n; i++)
768 if (tp->chars[i].attribute != a_escaped)
769 {
770 unsigned char c = tp->chars[i].ch;
771 if (c >= 'A' && c <= 'Z')
772 tp->chars[i].ch = c - 'A' + 'a';
773 }
774 }
775
776 static void
case_convert_token(struct token * tp)777 case_convert_token (struct token *tp)
778 {
779 int n = tp->charcount;
780 int i;
781
782 switch (readtable_case)
783 {
784 case case_upcase:
785 upcase_token (tp);
786 break;
787
788 case case_downcase:
789 downcase_token (tp);
790 break;
791
792 case case_preserve:
793 break;
794
795 case case_invert:
796 {
797 bool seen_uppercase = false;
798 bool seen_lowercase = false;
799 for (i = 0; i < n; i++)
800 if (tp->chars[i].attribute != a_escaped)
801 {
802 unsigned char c = tp->chars[i].ch;
803 if (c >= 'a' && c <= 'z')
804 seen_lowercase = true;
805 if (c >= 'A' && c <= 'Z')
806 seen_uppercase = true;
807 }
808 if (seen_uppercase)
809 {
810 if (!seen_lowercase)
811 downcase_token (tp);
812 }
813 else
814 {
815 if (seen_lowercase)
816 upcase_token (tp);
817 }
818 }
819 break;
820 }
821 }
822
823
824 /* ========================= Accumulating comments ========================= */
825
826
827 static char *buffer;
828 static size_t bufmax;
829 static size_t buflen;
830
831 static inline void
comment_start()832 comment_start ()
833 {
834 buflen = 0;
835 }
836
837 static inline void
comment_add(int c)838 comment_add (int c)
839 {
840 if (buflen >= bufmax)
841 {
842 bufmax = 2 * bufmax + 10;
843 buffer = xrealloc (buffer, bufmax);
844 }
845 buffer[buflen++] = c;
846 }
847
848 static inline void
comment_line_end(size_t chars_to_remove)849 comment_line_end (size_t chars_to_remove)
850 {
851 buflen -= chars_to_remove;
852 while (buflen >= 1
853 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
854 --buflen;
855 if (chars_to_remove == 0 && buflen >= bufmax)
856 {
857 bufmax = 2 * bufmax + 10;
858 buffer = xrealloc (buffer, bufmax);
859 }
860 buffer[buflen] = '\0';
861 savable_comment_add (buffer);
862 }
863
864
865 /* These are for tracking whether comments count as immediately before
866 keyword. */
867 static int last_comment_line;
868 static int last_non_comment_line;
869
870
871 /* ========================= Accumulating messages ========================= */
872
873
874 static message_list_ty *mlp;
875
876
877 /* ============== Reading of objects. See CLHS 2 "Syntax". ============== */
878
879
880 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
881 Other objects need not to be represented precisely. */
882 enum object_type
883 {
884 t_symbol, /* symbol */
885 t_string, /* string */
886 t_other, /* other kind of real object */
887 t_dot, /* '.' pseudo object */
888 t_close, /* ')' pseudo object */
889 t_eof /* EOF marker */
890 };
891
892 struct object
893 {
894 enum object_type type;
895 struct token *token; /* for t_symbol and t_string */
896 int line_number_at_start; /* for t_string */
897 };
898
899 /* Free the memory pointed to by a 'struct object'. */
900 static inline void
free_object(struct object * op)901 free_object (struct object *op)
902 {
903 if (op->type == t_symbol || op->type == t_string)
904 {
905 free_token (op->token);
906 free (op->token);
907 }
908 }
909
910 /* Convert a t_symbol/t_string token to a char*. */
911 static char *
string_of_object(const struct object * op)912 string_of_object (const struct object *op)
913 {
914 char *str;
915 const struct token_char *p;
916 char *q;
917 int n;
918
919 if (!(op->type == t_symbol || op->type == t_string))
920 abort ();
921 n = op->token->charcount;
922 str = (char *) xmalloc (n + 1);
923 q = str;
924 for (p = op->token->chars; n > 0; p++, n--)
925 *q++ = p->ch;
926 *q = '\0';
927 return str;
928 }
929
930 /* Context lookup table. */
931 static flag_context_list_table_ty *flag_context_list_table;
932
933 /* Read the next object. */
934 static void
read_object(struct object * op,flag_context_ty outer_context)935 read_object (struct object *op, flag_context_ty outer_context)
936 {
937 for (;;)
938 {
939 struct char_syntax curr;
940
941 read_char_syntax (&curr);
942
943 switch (curr.scode)
944 {
945 case syntax_eof:
946 op->type = t_eof;
947 return;
948
949 case syntax_whitespace:
950 if (curr.ch == '\n')
951 /* Comments assumed to be grouped with a message must immediately
952 precede it, with no non-whitespace token on a line between
953 both. */
954 if (last_non_comment_line > last_comment_line)
955 savable_comment_reset ();
956 continue;
957
958 case syntax_illegal:
959 op->type = t_other;
960 return;
961
962 case syntax_single_esc:
963 case syntax_multi_esc:
964 case syntax_constituent:
965 /* Start reading a token. */
966 op->token = (struct token *) xmalloc (sizeof (struct token));
967 read_token (op->token, &curr);
968 last_non_comment_line = line_number;
969
970 /* Interpret the token. */
971
972 /* Dots. */
973 if (!op->token->with_escape
974 && op->token->charcount == 1
975 && op->token->chars[0].attribute == a_dot)
976 {
977 free_token (op->token);
978 free (op->token);
979 op->type = t_dot;
980 return;
981 }
982 /* Tokens consisting entirely of dots are illegal, but be tolerant
983 here. */
984
985 /* Number. */
986 {
987 int base = read_base;
988
989 if (is_number (op->token, &base) != n_none)
990 {
991 free_token (op->token);
992 free (op->token);
993 op->type = t_other;
994 return;
995 }
996 }
997
998 /* We interpret all other tokens as symbols (including 'reserved
999 tokens', i.e. potential numbers which are not numbers). */
1000 case_convert_token (op->token);
1001 op->type = t_symbol;
1002 return;
1003
1004 case syntax_t_macro:
1005 case syntax_nt_macro:
1006 /* Read a macro. */
1007 switch (curr.ch)
1008 {
1009 case '(':
1010 {
1011 int arg = 0; /* Current argument number. */
1012 flag_context_list_iterator_ty context_iter;
1013 const struct callshapes *shapes = NULL;
1014 struct arglist_parser *argparser = NULL;
1015
1016 for (;; arg++)
1017 {
1018 struct object inner;
1019 flag_context_ty inner_context;
1020
1021 if (arg == 0)
1022 inner_context = null_context;
1023 else
1024 inner_context =
1025 inherited_context (outer_context,
1026 flag_context_list_iterator_advance (
1027 &context_iter));
1028
1029 read_object (&inner, inner_context);
1030
1031 /* Recognize end of list. */
1032 if (inner.type == t_close)
1033 {
1034 op->type = t_other;
1035 /* Don't bother converting "()" to "NIL". */
1036 last_non_comment_line = line_number;
1037 if (argparser != NULL)
1038 arglist_parser_done (argparser, arg);
1039 return;
1040 }
1041
1042 /* Dots are not allowed in every position.
1043 But be tolerant. */
1044
1045 /* EOF inside list is illegal.
1046 But be tolerant. */
1047 if (inner.type == t_eof)
1048 break;
1049
1050 if (arg == 0)
1051 {
1052 /* This is the function position. */
1053 if (inner.type == t_symbol)
1054 {
1055 char *symbol_name = string_of_object (&inner);
1056 int i;
1057 int prefix_len;
1058 void *keyword_value;
1059
1060 /* Omit any package name. */
1061 i = inner.token->charcount;
1062 while (i > 0
1063 && inner.token->chars[i-1].attribute != a_pack_m)
1064 i--;
1065 prefix_len = i;
1066
1067 if (hash_find_entry (&keywords,
1068 symbol_name + prefix_len,
1069 strlen (symbol_name + prefix_len),
1070 &keyword_value)
1071 == 0)
1072 shapes = (const struct callshapes *) keyword_value;
1073
1074 argparser = arglist_parser_alloc (mlp, shapes);
1075
1076 context_iter =
1077 flag_context_list_iterator (
1078 flag_context_list_table_lookup (
1079 flag_context_list_table,
1080 symbol_name, strlen (symbol_name)));
1081
1082 free (symbol_name);
1083 }
1084 else
1085 context_iter = null_context_list_iterator;
1086 }
1087 else
1088 {
1089 /* These are the argument positions. */
1090 if (argparser != NULL && inner.type == t_string)
1091 arglist_parser_remember (argparser, arg,
1092 string_of_object (&inner),
1093 inner_context,
1094 logical_file_name,
1095 inner.line_number_at_start,
1096 savable_comment);
1097 }
1098
1099 free_object (&inner);
1100 }
1101
1102 if (argparser != NULL)
1103 arglist_parser_done (argparser, arg);
1104 }
1105 op->type = t_other;
1106 last_non_comment_line = line_number;
1107 return;
1108
1109 case ')':
1110 /* Tell the caller about the end of list.
1111 Unmatched closing parenthesis is illegal.
1112 But be tolerant. */
1113 op->type = t_close;
1114 last_non_comment_line = line_number;
1115 return;
1116
1117 case ',':
1118 {
1119 int c = do_getc ();
1120 /* The ,@ handling inside lists is wrong anyway, because
1121 ,@form expands to an unknown number of elements. */
1122 if (c != EOF && c != '@' && c != '.')
1123 do_ungetc (c);
1124 }
1125 /*FALLTHROUGH*/
1126 case '\'':
1127 case '`':
1128 {
1129 struct object inner;
1130
1131 read_object (&inner, null_context);
1132
1133 /* Dots and EOF are not allowed here. But be tolerant. */
1134
1135 free_object (&inner);
1136
1137 op->type = t_other;
1138 last_non_comment_line = line_number;
1139 return;
1140 }
1141
1142 case ';':
1143 {
1144 bool all_semicolons = true;
1145
1146 last_comment_line = line_number;
1147 comment_start ();
1148 for (;;)
1149 {
1150 int c = do_getc ();
1151 if (c == EOF || c == '\n')
1152 break;
1153 if (c != ';')
1154 all_semicolons = false;
1155 if (!all_semicolons)
1156 {
1157 /* We skip all leading white space, but not EOLs. */
1158 if (!(buflen == 0 && (c == ' ' || c == '\t')))
1159 comment_add (c);
1160 }
1161 }
1162 comment_line_end (0);
1163 continue;
1164 }
1165
1166 case '"':
1167 {
1168 op->token = (struct token *) xmalloc (sizeof (struct token));
1169 init_token (op->token);
1170 op->line_number_at_start = line_number;
1171 for (;;)
1172 {
1173 int c = do_getc ();
1174 if (c == EOF)
1175 /* Invalid input. Be tolerant, no error message. */
1176 break;
1177 if (c == '"')
1178 break;
1179 if (c == '\\') /* syntax_single_esc */
1180 {
1181 c = do_getc ();
1182 if (c == EOF)
1183 /* Invalid input. Be tolerant, no error message. */
1184 break;
1185 }
1186 grow_token (op->token);
1187 op->token->chars[op->token->charcount++].ch = c;
1188 }
1189 op->type = t_string;
1190
1191 if (extract_all)
1192 {
1193 lex_pos_ty pos;
1194
1195 pos.file_name = logical_file_name;
1196 pos.line_number = op->line_number_at_start;
1197 remember_a_message (mlp, NULL, string_of_object (op),
1198 null_context, &pos, savable_comment);
1199 }
1200 last_non_comment_line = line_number;
1201 return;
1202 }
1203
1204 case '#':
1205 /* Dispatch macro handling. */
1206 {
1207 int c;
1208
1209 for (;;)
1210 {
1211 c = do_getc ();
1212 if (c == EOF)
1213 /* Invalid input. Be tolerant, no error message. */
1214 {
1215 op->type = t_other;
1216 return;
1217 }
1218 if (!(c >= '0' && c <= '9'))
1219 break;
1220 }
1221
1222 switch (c)
1223 {
1224 case '(':
1225 case '"':
1226 do_ungetc (c);
1227 /*FALLTHROUGH*/
1228 case '\'':
1229 case ':':
1230 case '.':
1231 case ',':
1232 case 'A': case 'a':
1233 case 'C': case 'c':
1234 case 'P': case 'p':
1235 case 'S': case 's':
1236 {
1237 struct object inner;
1238 read_object (&inner, null_context);
1239 /* Dots and EOF are not allowed here.
1240 But be tolerant. */
1241 free_object (&inner);
1242 op->type = t_other;
1243 last_non_comment_line = line_number;
1244 return;
1245 }
1246
1247 case '|':
1248 {
1249 int depth = 0;
1250 int c;
1251
1252 comment_start ();
1253 c = do_getc ();
1254 for (;;)
1255 {
1256 if (c == EOF)
1257 break;
1258 if (c == '|')
1259 {
1260 c = do_getc ();
1261 if (c == EOF)
1262 break;
1263 if (c == '#')
1264 {
1265 if (depth == 0)
1266 {
1267 comment_line_end (0);
1268 break;
1269 }
1270 depth--;
1271 comment_add ('|');
1272 comment_add ('#');
1273 c = do_getc ();
1274 }
1275 else
1276 comment_add ('|');
1277 }
1278 else if (c == '#')
1279 {
1280 c = do_getc ();
1281 if (c == EOF)
1282 break;
1283 comment_add ('#');
1284 if (c == '|')
1285 {
1286 depth++;
1287 comment_add ('|');
1288 c = do_getc ();
1289 }
1290 }
1291 else
1292 {
1293 /* We skip all leading white space. */
1294 if (!(buflen == 0 && (c == ' ' || c == '\t')))
1295 comment_add (c);
1296 if (c == '\n')
1297 {
1298 comment_line_end (1);
1299 comment_start ();
1300 }
1301 c = do_getc ();
1302 }
1303 }
1304 if (c == EOF)
1305 {
1306 /* EOF not allowed here. But be tolerant. */
1307 op->type = t_eof;
1308 return;
1309 }
1310 last_comment_line = line_number;
1311 continue;
1312 }
1313
1314 case '\\':
1315 {
1316 struct token token;
1317 struct char_syntax first;
1318 first.ch = '\\';
1319 first.scode = syntax_single_esc;
1320 read_token (&token, &first);
1321 free_token (&token);
1322 op->type = t_other;
1323 last_non_comment_line = line_number;
1324 return;
1325 }
1326
1327 case 'B': case 'b':
1328 case 'O': case 'o':
1329 case 'X': case 'x':
1330 case 'R': case 'r':
1331 case '*':
1332 {
1333 struct token token;
1334 read_token (&token, NULL);
1335 free_token (&token);
1336 op->type = t_other;
1337 last_non_comment_line = line_number;
1338 return;
1339 }
1340
1341 case '=':
1342 /* Ignore read labels. */
1343 continue;
1344
1345 case '#':
1346 /* Don't bother looking up the corresponding object. */
1347 op->type = t_other;
1348 last_non_comment_line = line_number;
1349 return;
1350
1351 case '+':
1352 case '-':
1353 /* Simply assume every feature expression is true. */
1354 {
1355 struct object inner;
1356 read_object (&inner, null_context);
1357 /* Dots and EOF are not allowed here.
1358 But be tolerant. */
1359 free_object (&inner);
1360 continue;
1361 }
1362
1363 default:
1364 op->type = t_other;
1365 last_non_comment_line = line_number;
1366 return;
1367 }
1368 /*NOTREACHED*/
1369 abort ();
1370 }
1371
1372 default:
1373 /*NOTREACHED*/
1374 abort ();
1375 }
1376
1377 default:
1378 /*NOTREACHED*/
1379 abort ();
1380 }
1381 }
1382 }
1383
1384
1385 void
extract_lisp(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1386 extract_lisp (FILE *f,
1387 const char *real_filename, const char *logical_filename,
1388 flag_context_list_table_ty *flag_table,
1389 msgdomain_list_ty *mdlp)
1390 {
1391 mlp = mdlp->item[0]->messages;
1392
1393 fp = f;
1394 real_file_name = real_filename;
1395 logical_file_name = xstrdup (logical_filename);
1396 line_number = 1;
1397
1398 last_comment_line = -1;
1399 last_non_comment_line = -1;
1400
1401 flag_context_list_table = flag_table;
1402
1403 init_keywords ();
1404
1405 /* Eat tokens until eof is seen. When read_object returns
1406 due to an unbalanced closing parenthesis, just restart it. */
1407 do
1408 {
1409 struct object toplevel_object;
1410
1411 read_object (&toplevel_object, null_context);
1412
1413 if (toplevel_object.type == t_eof)
1414 break;
1415
1416 free_object (&toplevel_object);
1417 }
1418 while (!feof (fp));
1419
1420 /* Close scanner. */
1421 fp = NULL;
1422 real_file_name = NULL;
1423 logical_file_name = NULL;
1424 line_number = 0;
1425 }
1426