1 /* xgettext Emacs Lisp backend.
2 Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.
3
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2001-2002.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-elisp.h"
33 #include "error.h"
34 #include "xalloc.h"
35 #include "exit.h"
36 #include "hash.h"
37 #include "c-ctype.h"
38 #include "gettext.h"
39
40 #define _(s) gettext(s)
41
42
43 /* Summary of Emacs Lisp syntax:
44 - ';' starts a comment until end of line.
45 - '#@nn' starts a comment of nn bytes.
46 - Integers are constituted of an optional prefix (#b, #B for binary,
47 #o, #O for octal, #x, #X for hexadecimal, #nnr, #nnR for any radix),
48 an optional sign (+ or -), the digits, and an optional trailing dot.
49 - Characters are written as '?' followed by the character, possibly
50 with an escape sequence, for examples '?a', '?\n', '?\177'.
51 - Strings are delimited by double quotes. Backslash introduces an escape
52 sequence. The following are understood: '\n', '\r', '\f', '\t', '\a',
53 '\\', '\^C', '\012' (octal), '\x12' (hexadecimal).
54 - Symbols: can contain meta-characters if preceded by backslash.
55 - Uninterned symbols: written as #:SYMBOL.
56 - () delimit lists.
57 - [] delimit vectors.
58 The reader is implemented in emacs-21.1/src/lread.c. */
59
60
61 /* ====================== Keyword set customization. ====================== */
62
63 /* If true extract all strings. */
64 static bool extract_all = false;
65
66 static hash_table keywords;
67 static bool default_keywords = true;
68
69
70 void
x_elisp_extract_all()71 x_elisp_extract_all ()
72 {
73 extract_all = true;
74 }
75
76
77 void
x_elisp_keyword(const char * name)78 x_elisp_keyword (const char *name)
79 {
80 if (name == NULL)
81 default_keywords = false;
82 else
83 {
84 const char *end;
85 struct callshape shape;
86 const char *colon;
87
88 if (keywords.table == NULL)
89 hash_init (&keywords, 100);
90
91 split_keywordspec (name, &end, &shape);
92
93 /* The characters between name and end should form a valid Lisp
94 symbol. */
95 colon = strchr (name, ':');
96 if (colon == NULL || colon >= end)
97 insert_keyword_callshape (&keywords, name, end - name, &shape);
98 }
99 }
100
101 /* Finish initializing the keywords hash table.
102 Called after argument processing, before each file is processed. */
103 static void
init_keywords()104 init_keywords ()
105 {
106 if (default_keywords)
107 {
108 /* When adding new keywords here, also update the documentation in
109 xgettext.texi! */
110 x_elisp_keyword ("_");
111 default_keywords = false;
112 }
113 }
114
115 void
init_flag_table_elisp()116 init_flag_table_elisp ()
117 {
118 xgettext_record_flag ("_:1:pass-elisp-format");
119 xgettext_record_flag ("format:1:elisp-format");
120 }
121
122
123 /* ======================== Reading of characters. ======================== */
124
125 /* Real filename, used in error messages about the input file. */
126 static const char *real_file_name;
127
128 /* Logical filename and line number, used to label the extracted messages. */
129 static char *logical_file_name;
130 static int line_number;
131
132 /* The input file stream. */
133 static FILE *fp;
134
135
136 /* Fetch the next character from the input file. */
137 static int
do_getc()138 do_getc ()
139 {
140 int c = getc (fp);
141
142 if (c == EOF)
143 {
144 if (ferror (fp))
145 error (EXIT_FAILURE, errno, _("\
146 error while reading \"%s\""), real_file_name);
147 }
148 else if (c == '\n')
149 line_number++;
150
151 return c;
152 }
153
154 /* Put back the last fetched character, not EOF. */
155 static void
do_ungetc(int c)156 do_ungetc (int c)
157 {
158 if (c == '\n')
159 line_number--;
160 ungetc (c, fp);
161 }
162
163
164 /* ========================== Reading of tokens. ========================== */
165
166
167 /* A token consists of a sequence of characters. */
168 struct token
169 {
170 int allocated; /* number of allocated 'token_char's */
171 int charcount; /* number of used 'token_char's */
172 char *chars; /* the token's constituents */
173 };
174
175 /* Initialize a 'struct token'. */
176 static inline void
init_token(struct token * tp)177 init_token (struct token *tp)
178 {
179 tp->allocated = 10;
180 tp->chars = (char *) xmalloc (tp->allocated * sizeof (char));
181 tp->charcount = 0;
182 }
183
184 /* Free the memory pointed to by a 'struct token'. */
185 static inline void
free_token(struct token * tp)186 free_token (struct token *tp)
187 {
188 free (tp->chars);
189 }
190
191 /* Ensure there is enough room in the token for one more character. */
192 static inline void
grow_token(struct token * tp)193 grow_token (struct token *tp)
194 {
195 if (tp->charcount == tp->allocated)
196 {
197 tp->allocated *= 2;
198 tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
199 }
200 }
201
202 /* Test whether a token has integer syntax. */
203 static inline bool
is_integer(const char * p)204 is_integer (const char *p)
205 {
206 /* NB: Yes, '+.' and '-.' both designate the integer 0. */
207 const char *p_start = p;
208
209 if (*p == '+' || *p == '-')
210 p++;
211 if (*p == '\0')
212 return false;
213 while (*p >= '0' && *p <= '9')
214 p++;
215 if (p > p_start && *p == '.')
216 p++;
217 return (*p == '\0');
218 }
219
220 /* Test whether a token has float syntax. */
221 static inline bool
is_float(const char * p)222 is_float (const char *p)
223 {
224 enum { LEAD_INT = 1, DOT_CHAR = 2, TRAIL_INT = 4, E_CHAR = 8, EXP_INT = 16 };
225 int state;
226
227 state = 0;
228 if (*p == '+' || *p == '-')
229 p++;
230 if (*p >= '0' && *p <= '9')
231 {
232 state |= LEAD_INT;
233 do
234 p++;
235 while (*p >= '0' && *p <= '9');
236 }
237 if (*p == '.')
238 {
239 state |= DOT_CHAR;
240 p++;
241 }
242 if (*p >= '0' && *p <= '9')
243 {
244 state |= TRAIL_INT;
245 do
246 p++;
247 while (*p >= '0' && *p <= '9');
248 }
249 if (*p == 'e' || *p == 'E')
250 {
251 state |= E_CHAR;
252 p++;
253 if (*p == '+' || *p == '-')
254 p++;
255 if (*p >= '0' && *p <= '9')
256 {
257 state |= EXP_INT;
258 do
259 p++;
260 while (*p >= '0' && *p <= '9');
261 }
262 else if (p[-1] == '+'
263 && ((p[0] == 'I' && p[1] == 'N' && p[2] == 'F')
264 || (p[0] == 'N' && p[1] == 'a' && p[2] == 'N')))
265 {
266 state |= EXP_INT;
267 p += 3;
268 }
269 }
270 return (*p == '\0')
271 && (state == (LEAD_INT | DOT_CHAR | TRAIL_INT)
272 || state == (DOT_CHAR | TRAIL_INT)
273 || state == (LEAD_INT | E_CHAR | EXP_INT)
274 || state == (LEAD_INT | DOT_CHAR | TRAIL_INT | E_CHAR | EXP_INT)
275 || state == (DOT_CHAR | TRAIL_INT | E_CHAR | EXP_INT));
276 }
277
278 /* Read the next token. 'first' is the first character, which has already
279 been read. Returns true for a symbol, false for a number. */
280 static bool
read_token(struct token * tp,int first)281 read_token (struct token *tp, int first)
282 {
283 int c;
284 bool quoted = false;
285
286 init_token (tp);
287
288 c = first;
289
290 for (;; c = do_getc ())
291 {
292 if (c == EOF)
293 break;
294 if (c <= ' ') /* FIXME: Assumes ASCII compatible encoding */
295 break;
296 if (c == '\"' || c == '\'' || c == ';' || c == '(' || c == ')'
297 || c == '[' || c == ']' || c == '#')
298 break;
299 if (c == '\\')
300 {
301 quoted = true;
302 c = do_getc ();
303 if (c == EOF)
304 /* Invalid, but be tolerant. */
305 break;
306 }
307 grow_token (tp);
308 tp->chars[tp->charcount++] = c;
309 }
310 if (c != EOF)
311 do_ungetc (c);
312
313 if (quoted)
314 return true; /* symbol */
315
316 /* Add a NUL byte at the end, for is_integer and is_float. */
317 grow_token (tp);
318 tp->chars[tp->charcount] = '\0';
319
320 if (is_integer (tp->chars) || is_float (tp->chars))
321 return false; /* number */
322 else
323 return true; /* symbol */
324 }
325
326
327 /* ========================= Accumulating comments ========================= */
328
329
330 static char *buffer;
331 static size_t bufmax;
332 static size_t buflen;
333
334 static inline void
comment_start()335 comment_start ()
336 {
337 buflen = 0;
338 }
339
340 static inline void
comment_add(int c)341 comment_add (int c)
342 {
343 if (buflen >= bufmax)
344 {
345 bufmax = 2 * bufmax + 10;
346 buffer = xrealloc (buffer, bufmax);
347 }
348 buffer[buflen++] = c;
349 }
350
351 static inline void
comment_line_end(size_t chars_to_remove)352 comment_line_end (size_t chars_to_remove)
353 {
354 buflen -= chars_to_remove;
355 while (buflen >= 1
356 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
357 --buflen;
358 if (chars_to_remove == 0 && buflen >= bufmax)
359 {
360 bufmax = 2 * bufmax + 10;
361 buffer = xrealloc (buffer, bufmax);
362 }
363 buffer[buflen] = '\0';
364 savable_comment_add (buffer);
365 }
366
367
368 /* These are for tracking whether comments count as immediately before
369 keyword. */
370 static int last_comment_line;
371 static int last_non_comment_line;
372
373
374 /* ========================= Accumulating messages ========================= */
375
376
377 static message_list_ty *mlp;
378
379
380 /* ============== Reading of objects. See CLHS 2 "Syntax". ============== */
381
382
383 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
384 Other objects need not to be represented precisely. */
385 enum object_type
386 {
387 t_symbol, /* symbol */
388 t_string, /* string */
389 t_other, /* other kind of real object */
390 t_dot, /* '.' pseudo object */
391 t_listclose, /* ')' pseudo object */
392 t_vectorclose,/* ']' pseudo object */
393 t_eof /* EOF marker */
394 };
395
396 struct object
397 {
398 enum object_type type;
399 struct token *token; /* for t_symbol and t_string */
400 int line_number_at_start; /* for t_string */
401 };
402
403 /* Free the memory pointed to by a 'struct object'. */
404 static inline void
free_object(struct object * op)405 free_object (struct object *op)
406 {
407 if (op->type == t_symbol || op->type == t_string)
408 {
409 free_token (op->token);
410 free (op->token);
411 }
412 }
413
414 /* Convert a t_symbol/t_string token to a char*. */
415 static char *
string_of_object(const struct object * op)416 string_of_object (const struct object *op)
417 {
418 char *str;
419 int n;
420
421 if (!(op->type == t_symbol || op->type == t_string))
422 abort ();
423 n = op->token->charcount;
424 str = (char *) xmalloc (n + 1);
425 memcpy (str, op->token->chars, n);
426 str[n] = '\0';
427 return str;
428 }
429
430 /* Context lookup table. */
431 static flag_context_list_table_ty *flag_context_list_table;
432
433 /* Returns the character represented by an escape sequence. */
434 #define IGNORABLE_ESCAPE (EOF - 1)
435 static int
do_getc_escaped(int c,bool in_string)436 do_getc_escaped (int c, bool in_string)
437 {
438 switch (c)
439 {
440 case 'a':
441 return '\a';
442 case 'b':
443 return '\b';
444 case 'd':
445 return 0x7F;
446 case 'e':
447 return 0x1B;
448 case 'f':
449 return '\f';
450 case 'n':
451 return '\n';
452 case 'r':
453 return '\r';
454 case 't':
455 return '\t';
456 case 'v':
457 return '\v';
458
459 case '\n':
460 return IGNORABLE_ESCAPE;
461
462 case ' ':
463 return (in_string ? IGNORABLE_ESCAPE : ' ');
464
465 case 'M': /* meta */
466 c = do_getc ();
467 if (c == EOF)
468 return EOF;
469 if (c != '-')
470 /* Invalid input. But be tolerant. */
471 return c;
472 c = do_getc ();
473 if (c == EOF)
474 return EOF;
475 if (c == '\\')
476 {
477 c = do_getc ();
478 if (c == EOF)
479 return EOF;
480 c = do_getc_escaped (c, false);
481 }
482 return c | 0x80;
483
484 case 'S': /* shift */
485 c = do_getc ();
486 if (c == EOF)
487 return EOF;
488 if (c != '-')
489 /* Invalid input. But be tolerant. */
490 return c;
491 c = do_getc ();
492 if (c == EOF)
493 return EOF;
494 if (c == '\\')
495 {
496 c = do_getc ();
497 if (c == EOF)
498 return EOF;
499 c = do_getc_escaped (c, false);
500 }
501 return (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
502
503 case 'H': /* hyper */
504 case 'A': /* alt */
505 case 's': /* super */
506 c = do_getc ();
507 if (c == EOF)
508 return EOF;
509 if (c != '-')
510 /* Invalid input. But be tolerant. */
511 return c;
512 c = do_getc ();
513 if (c == EOF)
514 return EOF;
515 if (c == '\\')
516 {
517 c = do_getc ();
518 if (c == EOF)
519 return EOF;
520 c = do_getc_escaped (c, false);
521 }
522 return c;
523
524 case 'C': /* ctrl */
525 c = do_getc ();
526 if (c == EOF)
527 return EOF;
528 if (c != '-')
529 /* Invalid input. But be tolerant. */
530 return c;
531 /*FALLTHROUGH*/
532 case '^':
533 c = do_getc ();
534 if (c == EOF)
535 return EOF;
536 if (c == '\\')
537 {
538 c = do_getc ();
539 if (c == EOF)
540 return EOF;
541 c = do_getc_escaped (c, false);
542 }
543 if (c == '?')
544 return 0x7F;
545 if ((c & 0x5F) >= 0x41 && (c & 0x5F) <= 0x5A)
546 return c & 0x9F;
547 if ((c & 0x7F) >= 0x40 && (c & 0x7F) <= 0x5F)
548 return c & 0x9F;
549 #if 0 /* We cannot handle NUL bytes in strings. */
550 if (c == ' ')
551 return 0x00;
552 #endif
553 return c;
554
555 case '0': case '1': case '2': case '3': case '4':
556 case '5': case '6': case '7':
557 /* An octal escape, as in ANSI C. */
558 {
559 int n = c - '0';
560
561 c = do_getc ();
562 if (c != EOF)
563 {
564 if (c >= '0' && c <= '7')
565 {
566 n = (n << 3) + (c - '0');
567 c = do_getc ();
568 if (c != EOF)
569 {
570 if (c >= '0' && c <= '7')
571 n = (n << 3) + (c - '0');
572 else
573 do_ungetc (c);
574 }
575 }
576 else
577 do_ungetc (c);
578 }
579 return (unsigned char) n;
580 }
581
582 case 'x':
583 /* A hexadecimal escape, as in ANSI C. */
584 {
585 int n = 0;
586
587 for (;;)
588 {
589 c = do_getc ();
590 if (c == EOF)
591 break;
592 else if (c >= '0' && c <= '9')
593 n = (n << 4) + (c - '0');
594 else if (c >= 'A' && c <= 'F')
595 n = (n << 4) + (c - 'A' + 10);
596 else if (c >= 'a' && c <= 'f')
597 n = (n << 4) + (c - 'a' + 10);
598 else
599 {
600 do_ungetc (c);
601 break;
602 }
603 }
604 return (unsigned char) n;
605 }
606
607 default:
608 /* Ignore Emacs multibyte character stuff. All the strings we are
609 interested in are ASCII strings. */
610 return c;
611 }
612 }
613
614 /* Read the next object.
615 'first_in_list' and 'new_backquote_flag' are used for reading old
616 backquote syntax and new backquote syntax. */
617 static void
read_object(struct object * op,bool first_in_list,bool new_backquote_flag,flag_context_ty outer_context)618 read_object (struct object *op, bool first_in_list, bool new_backquote_flag,
619 flag_context_ty outer_context)
620 {
621 for (;;)
622 {
623 int c;
624
625 c = do_getc ();
626
627 switch (c)
628 {
629 case EOF:
630 op->type = t_eof;
631 return;
632
633 case '\n':
634 /* Comments assumed to be grouped with a message must immediately
635 precede it, with no non-whitespace token on a line between
636 both. */
637 if (last_non_comment_line > last_comment_line)
638 savable_comment_reset ();
639 continue;
640
641 case '(':
642 {
643 int arg = 0; /* Current argument number. */
644 flag_context_list_iterator_ty context_iter;
645 const struct callshapes *shapes = NULL;
646 struct arglist_parser *argparser = NULL;
647
648 for (;; arg++)
649 {
650 struct object inner;
651 flag_context_ty inner_context;
652
653 if (arg == 0)
654 inner_context = null_context;
655 else
656 inner_context =
657 inherited_context (outer_context,
658 flag_context_list_iterator_advance (
659 &context_iter));
660
661 read_object (&inner, arg == 0, new_backquote_flag,
662 inner_context);
663
664 /* Recognize end of list. */
665 if (inner.type == t_listclose)
666 {
667 op->type = t_other;
668 /* Don't bother converting "()" to "NIL". */
669 last_non_comment_line = line_number;
670 if (argparser != NULL)
671 arglist_parser_done (argparser, arg);
672 return;
673 }
674
675 /* Dots are not allowed in every position. ']' is not allowed.
676 But be tolerant. */
677
678 /* EOF inside list is illegal. But be tolerant. */
679 if (inner.type == t_eof)
680 break;
681
682 if (arg == 0)
683 {
684 /* This is the function position. */
685 if (inner.type == t_symbol)
686 {
687 char *symbol_name = string_of_object (&inner);
688 void *keyword_value;
689
690 if (hash_find_entry (&keywords,
691 symbol_name, strlen (symbol_name),
692 &keyword_value)
693 == 0)
694 shapes = (const struct callshapes *) keyword_value;
695
696 argparser = arglist_parser_alloc (mlp, shapes);
697
698 context_iter =
699 flag_context_list_iterator (
700 flag_context_list_table_lookup (
701 flag_context_list_table,
702 symbol_name, strlen (symbol_name)));
703
704 free (symbol_name);
705 }
706 else
707 context_iter = null_context_list_iterator;
708 }
709 else
710 {
711 /* These are the argument positions. */
712 if (argparser != NULL && inner.type == t_string)
713 arglist_parser_remember (argparser, arg,
714 string_of_object (&inner),
715 inner_context,
716 logical_file_name,
717 inner.line_number_at_start,
718 savable_comment);
719 }
720
721 free_object (&inner);
722 }
723
724 if (argparser != NULL)
725 arglist_parser_done (argparser, arg);
726 }
727 op->type = t_other;
728 last_non_comment_line = line_number;
729 return;
730
731 case ')':
732 /* Tell the caller about the end of list.
733 Unmatched closing parenthesis is illegal. But be tolerant. */
734 op->type = t_listclose;
735 last_non_comment_line = line_number;
736 return;
737
738 case '[':
739 {
740 for (;;)
741 {
742 struct object inner;
743
744 read_object (&inner, false, new_backquote_flag, null_context);
745
746 /* Recognize end of vector. */
747 if (inner.type == t_vectorclose)
748 {
749 op->type = t_other;
750 last_non_comment_line = line_number;
751 return;
752 }
753
754 /* Dots and ')' are not allowed. But be tolerant. */
755
756 /* EOF inside vector is illegal. But be tolerant. */
757 if (inner.type == t_eof)
758 break;
759
760 free_object (&inner);
761 }
762 }
763 op->type = t_other;
764 last_non_comment_line = line_number;
765 return;
766
767 case ']':
768 /* Tell the caller about the end of vector.
769 Unmatched closing bracket is illegal. But be tolerant. */
770 op->type = t_vectorclose;
771 last_non_comment_line = line_number;
772 return;
773
774 case '\'':
775 {
776 struct object inner;
777
778 read_object (&inner, false, new_backquote_flag, null_context);
779
780 /* Dots and EOF are not allowed here. But be tolerant. */
781
782 free_object (&inner);
783
784 op->type = t_other;
785 last_non_comment_line = line_number;
786 return;
787 }
788
789 case '`':
790 if (first_in_list)
791 goto default_label;
792 {
793 struct object inner;
794
795 read_object (&inner, false, true, null_context);
796
797 /* Dots and EOF are not allowed here. But be tolerant. */
798
799 free_object (&inner);
800
801 op->type = t_other;
802 last_non_comment_line = line_number;
803 return;
804 }
805
806 case ',':
807 if (!new_backquote_flag)
808 goto default_label;
809 {
810 int c = do_getc ();
811 /* The ,@ handling inside lists is wrong anyway, because
812 ,@form expands to an unknown number of elements. */
813 if (c != EOF && c != '@' && c != '.')
814 do_ungetc (c);
815 }
816 {
817 struct object inner;
818
819 read_object (&inner, false, false, null_context);
820
821 /* Dots and EOF are not allowed here. But be tolerant. */
822
823 free_object (&inner);
824
825 op->type = t_other;
826 last_non_comment_line = line_number;
827 return;
828 }
829
830 case ';':
831 {
832 bool all_semicolons = true;
833
834 last_comment_line = line_number;
835 comment_start ();
836 for (;;)
837 {
838 int c = do_getc ();
839 if (c == EOF || c == '\n')
840 break;
841 if (c != ';')
842 all_semicolons = false;
843 if (!all_semicolons)
844 {
845 /* We skip all leading white space, but not EOLs. */
846 if (!(buflen == 0 && (c == ' ' || c == '\t')))
847 comment_add (c);
848 }
849 }
850 comment_line_end (0);
851 continue;
852 }
853
854 case '"':
855 {
856 op->token = (struct token *) xmalloc (sizeof (struct token));
857 init_token (op->token);
858 op->line_number_at_start = line_number;
859 for (;;)
860 {
861 int c = do_getc ();
862 if (c == EOF)
863 /* Invalid input. Be tolerant, no error message. */
864 break;
865 if (c == '"')
866 break;
867 if (c == '\\')
868 {
869 c = do_getc ();
870 if (c == EOF)
871 /* Invalid input. Be tolerant, no error message. */
872 break;
873 c = do_getc_escaped (c, true);
874 if (c == EOF)
875 /* Invalid input. Be tolerant, no error message. */
876 break;
877 if (c == IGNORABLE_ESCAPE)
878 /* Ignore escaped newline and escaped space. */
879 ;
880 else
881 {
882 grow_token (op->token);
883 op->token->chars[op->token->charcount++] = c;
884 }
885 }
886 else
887 {
888 grow_token (op->token);
889 op->token->chars[op->token->charcount++] = c;
890 }
891 }
892 op->type = t_string;
893
894 if (extract_all)
895 {
896 lex_pos_ty pos;
897
898 pos.file_name = logical_file_name;
899 pos.line_number = op->line_number_at_start;
900 remember_a_message (mlp, NULL, string_of_object (op),
901 null_context, &pos, savable_comment);
902 }
903 last_non_comment_line = line_number;
904 return;
905 }
906
907 case '?':
908 c = do_getc ();
909 if (c == EOF)
910 /* Invalid input. Be tolerant, no error message. */
911 ;
912 else if (c == '\\')
913 {
914 c = do_getc ();
915 if (c == EOF)
916 /* Invalid input. Be tolerant, no error message. */
917 ;
918 else
919 {
920 c = do_getc_escaped (c, false);
921 if (c == EOF)
922 /* Invalid input. Be tolerant, no error message. */
923 ;
924 }
925 }
926 /* Impossible to deal with Emacs multibyte character stuff here. */
927 op->type = t_other;
928 last_non_comment_line = line_number;
929 return;
930
931 case '#':
932 /* Dispatch macro handling. */
933 c = do_getc ();
934 if (c == EOF)
935 /* Invalid input. Be tolerant, no error message. */
936 {
937 op->type = t_other;
938 return;
939 }
940
941 switch (c)
942 {
943 case '^':
944 c = do_getc ();
945 if (c == '^')
946 c = do_getc ();
947 if (c == '[')
948 {
949 /* Read a char table, same syntax as a vector. */
950 for (;;)
951 {
952 struct object inner;
953
954 read_object (&inner, false, new_backquote_flag,
955 null_context);
956
957 /* Recognize end of vector. */
958 if (inner.type == t_vectorclose)
959 {
960 op->type = t_other;
961 last_non_comment_line = line_number;
962 return;
963 }
964
965 /* Dots and ')' are not allowed. But be tolerant. */
966
967 /* EOF inside vector is illegal. But be tolerant. */
968 if (inner.type == t_eof)
969 break;
970
971 free_object (&inner);
972 }
973 op->type = t_other;
974 last_non_comment_line = line_number;
975 return;
976 }
977 else
978 /* Invalid input. Be tolerant, no error message. */
979 {
980 op->type = t_other;
981 if (c != EOF)
982 last_non_comment_line = line_number;
983 return;
984 }
985
986 case '&':
987 /* Read a bit vector. */
988 {
989 struct object length;
990 read_object (&length, first_in_list, new_backquote_flag,
991 null_context);
992 /* Dots and EOF are not allowed here.
993 But be tolerant. */
994 free_object (&length);
995 }
996 c = do_getc ();
997 if (c == '"')
998 {
999 struct object string;
1000 read_object (&string, first_in_list, new_backquote_flag,
1001 null_context);
1002 free_object (&string);
1003 }
1004 else
1005 /* Invalid input. Be tolerant, no error message. */
1006 do_ungetc (c);
1007 op->type = t_other;
1008 last_non_comment_line = line_number;
1009 return;
1010
1011 case '[':
1012 /* Read a compiled function, same syntax as a vector. */
1013 case '(':
1014 /* Read a string with properties, same syntax as a list. */
1015 {
1016 struct object inner;
1017 do_ungetc (c);
1018 read_object (&inner, false, new_backquote_flag, null_context);
1019 /* Dots and EOF are not allowed here.
1020 But be tolerant. */
1021 free_object (&inner);
1022 op->type = t_other;
1023 last_non_comment_line = line_number;
1024 return;
1025 }
1026
1027 case '@':
1028 /* Read a comment consisting of a given number of bytes. */
1029 {
1030 unsigned int nskip = 0;
1031
1032 for (;;)
1033 {
1034 c = do_getc ();
1035 if (!(c >= '0' && c <= '9'))
1036 break;
1037 nskip = 10 * nskip + (c - '0');
1038 }
1039 if (c != EOF)
1040 {
1041 do_ungetc (c);
1042 for (; nskip > 0; nskip--)
1043 if (do_getc () == EOF)
1044 break;
1045 }
1046 continue;
1047 }
1048
1049 case '$':
1050 op->type = t_other;
1051 last_non_comment_line = line_number;
1052 return;
1053
1054 case '\'':
1055 case ':':
1056 case 'S': case 's': /* XEmacs only */
1057 {
1058 struct object inner;
1059 read_object (&inner, false, new_backquote_flag, null_context);
1060 /* Dots and EOF are not allowed here.
1061 But be tolerant. */
1062 free_object (&inner);
1063 op->type = t_other;
1064 last_non_comment_line = line_number;
1065 return;
1066 }
1067
1068 case '0': case '1': case '2': case '3': case '4':
1069 case '5': case '6': case '7': case '8': case '9':
1070 /* Read Common Lisp style #n# or #n=. */
1071 for (;;)
1072 {
1073 c = do_getc ();
1074 if (!(c >= '0' && c <= '9'))
1075 break;
1076 }
1077 if (c == EOF)
1078 /* Invalid input. Be tolerant, no error message. */
1079 {
1080 op->type = t_other;
1081 return;
1082 }
1083 if (c == '=')
1084 {
1085 read_object (op, false, new_backquote_flag, outer_context);
1086 last_non_comment_line = line_number;
1087 return;
1088 }
1089 if (c == '#')
1090 {
1091 op->type = t_other;
1092 last_non_comment_line = line_number;
1093 return;
1094 }
1095 if (c == 'R' || c == 'r')
1096 {
1097 /* Read an integer. */
1098 c = do_getc ();
1099 if (c == '+' || c == '-')
1100 c = do_getc ();
1101 for (; c != EOF; c = do_getc ())
1102 if (!c_isalnum (c))
1103 {
1104 do_ungetc (c);
1105 break;
1106 }
1107 op->type = t_other;
1108 last_non_comment_line = line_number;
1109 return;
1110 }
1111 /* Invalid input. Be tolerant, no error message. */
1112 op->type = t_other;
1113 last_non_comment_line = line_number;
1114 return;
1115
1116 case 'X': case 'x':
1117 case 'O': case 'o':
1118 case 'B': case 'b':
1119 {
1120 /* Read an integer. */
1121 c = do_getc ();
1122 if (c == '+' || c == '-')
1123 c = do_getc ();
1124 for (; c != EOF; c = do_getc ())
1125 if (!c_isalnum (c))
1126 {
1127 do_ungetc (c);
1128 break;
1129 }
1130 op->type = t_other;
1131 last_non_comment_line = line_number;
1132 return;
1133 }
1134
1135 case '*': /* XEmacs only */
1136 {
1137 /* Read a bit-vector. */
1138 do
1139 c = do_getc ();
1140 while (c == '0' || c == '1');
1141 if (c != EOF)
1142 do_ungetc (c);
1143 op->type = t_other;
1144 last_non_comment_line = line_number;
1145 return;
1146 }
1147
1148 case '+': /* XEmacs only */
1149 case '-': /* XEmacs only */
1150 /* Simply assume every feature expression is true. */
1151 {
1152 struct object inner;
1153 read_object (&inner, false, new_backquote_flag, null_context);
1154 /* Dots and EOF are not allowed here.
1155 But be tolerant. */
1156 free_object (&inner);
1157 continue;
1158 }
1159
1160 default:
1161 /* Invalid input. Be tolerant, no error message. */
1162 op->type = t_other;
1163 last_non_comment_line = line_number;
1164 return;
1165 }
1166
1167 /*NOTREACHED*/
1168 abort ();
1169
1170 case '.':
1171 c = do_getc ();
1172 if (c != EOF)
1173 {
1174 do_ungetc (c);
1175 if (c <= ' ' /* FIXME: Assumes ASCII compatible encoding */
1176 || strchr ("\"'`,(", c) != NULL)
1177 {
1178 op->type = t_dot;
1179 last_non_comment_line = line_number;
1180 return;
1181 }
1182 }
1183 c = '.';
1184 /*FALLTHROUGH*/
1185 default:
1186 default_label:
1187 if (c <= ' ') /* FIXME: Assumes ASCII compatible encoding */
1188 continue;
1189 /* Read a token. */
1190 {
1191 bool symbol;
1192
1193 op->token = (struct token *) xmalloc (sizeof (struct token));
1194 symbol = read_token (op->token, c);
1195 if (symbol)
1196 {
1197 op->type = t_symbol;
1198 last_non_comment_line = line_number;
1199 return;
1200 }
1201 else
1202 {
1203 free_token (op->token);
1204 free (op->token);
1205 op->type = t_other;
1206 last_non_comment_line = line_number;
1207 return;
1208 }
1209 }
1210 }
1211 }
1212 }
1213
1214
1215 void
extract_elisp(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1216 extract_elisp (FILE *f,
1217 const char *real_filename, const char *logical_filename,
1218 flag_context_list_table_ty *flag_table,
1219 msgdomain_list_ty *mdlp)
1220 {
1221 mlp = mdlp->item[0]->messages;
1222
1223 fp = f;
1224 real_file_name = real_filename;
1225 logical_file_name = xstrdup (logical_filename);
1226 line_number = 1;
1227
1228 last_comment_line = -1;
1229 last_non_comment_line = -1;
1230
1231 flag_context_list_table = flag_table;
1232
1233 init_keywords ();
1234
1235 /* Eat tokens until eof is seen. When read_object returns
1236 due to an unbalanced closing parenthesis, just restart it. */
1237 do
1238 {
1239 struct object toplevel_object;
1240
1241 read_object (&toplevel_object, false, false, null_context);
1242
1243 if (toplevel_object.type == t_eof)
1244 break;
1245
1246 free_object (&toplevel_object);
1247 }
1248 while (!feof (fp));
1249
1250 /* Close scanner. */
1251 fp = NULL;
1252 real_file_name = NULL;
1253 logical_file_name = NULL;
1254 line_number = 0;
1255 }
1256