xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/x-elisp.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* xgettext Emacs Lisp backend.
2    Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.
3 
4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2001-2002.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-elisp.h"
33 #include "error.h"
34 #include "xalloc.h"
35 #include "exit.h"
36 #include "hash.h"
37 #include "c-ctype.h"
38 #include "gettext.h"
39 
40 #define _(s) gettext(s)
41 
42 
43 /* Summary of Emacs Lisp syntax:
44    - ';' starts a comment until end of line.
45    - '#@nn' starts a comment of nn bytes.
46    - Integers are constituted of an optional prefix (#b, #B for binary,
47      #o, #O for octal, #x, #X for hexadecimal, #nnr, #nnR for any radix),
48      an optional sign (+ or -), the digits, and an optional trailing dot.
49    - Characters are written as '?' followed by the character, possibly
50      with an escape sequence, for examples '?a', '?\n', '?\177'.
51    - Strings are delimited by double quotes. Backslash introduces an escape
52      sequence. The following are understood: '\n', '\r', '\f', '\t', '\a',
53      '\\', '\^C', '\012' (octal), '\x12' (hexadecimal).
54    - Symbols: can contain meta-characters if preceded by backslash.
55    - Uninterned symbols: written as #:SYMBOL.
56    - () delimit lists.
57    - [] delimit vectors.
58    The reader is implemented in emacs-21.1/src/lread.c.  */
59 
60 
61 /* ====================== Keyword set customization.  ====================== */
62 
63 /* If true extract all strings.  */
64 static bool extract_all = false;
65 
66 static hash_table keywords;
67 static bool default_keywords = true;
68 
69 
70 void
x_elisp_extract_all()71 x_elisp_extract_all ()
72 {
73   extract_all = true;
74 }
75 
76 
77 void
x_elisp_keyword(const char * name)78 x_elisp_keyword (const char *name)
79 {
80   if (name == NULL)
81     default_keywords = false;
82   else
83     {
84       const char *end;
85       struct callshape shape;
86       const char *colon;
87 
88       if (keywords.table == NULL)
89 	hash_init (&keywords, 100);
90 
91       split_keywordspec (name, &end, &shape);
92 
93       /* The characters between name and end should form a valid Lisp
94 	 symbol.  */
95       colon = strchr (name, ':');
96       if (colon == NULL || colon >= end)
97 	insert_keyword_callshape (&keywords, name, end - name, &shape);
98     }
99 }
100 
101 /* Finish initializing the keywords hash table.
102    Called after argument processing, before each file is processed.  */
103 static void
init_keywords()104 init_keywords ()
105 {
106   if (default_keywords)
107     {
108       /* When adding new keywords here, also update the documentation in
109 	 xgettext.texi!  */
110       x_elisp_keyword ("_");
111       default_keywords = false;
112     }
113 }
114 
115 void
init_flag_table_elisp()116 init_flag_table_elisp ()
117 {
118   xgettext_record_flag ("_:1:pass-elisp-format");
119   xgettext_record_flag ("format:1:elisp-format");
120 }
121 
122 
123 /* ======================== Reading of characters.  ======================== */
124 
125 /* Real filename, used in error messages about the input file.  */
126 static const char *real_file_name;
127 
128 /* Logical filename and line number, used to label the extracted messages.  */
129 static char *logical_file_name;
130 static int line_number;
131 
132 /* The input file stream.  */
133 static FILE *fp;
134 
135 
136 /* Fetch the next character from the input file.  */
137 static int
do_getc()138 do_getc ()
139 {
140   int c = getc (fp);
141 
142   if (c == EOF)
143     {
144       if (ferror (fp))
145 	error (EXIT_FAILURE, errno, _("\
146 error while reading \"%s\""), real_file_name);
147     }
148   else if (c == '\n')
149    line_number++;
150 
151   return c;
152 }
153 
154 /* Put back the last fetched character, not EOF.  */
155 static void
do_ungetc(int c)156 do_ungetc (int c)
157 {
158   if (c == '\n')
159     line_number--;
160   ungetc (c, fp);
161 }
162 
163 
164 /* ========================== Reading of tokens.  ========================== */
165 
166 
167 /* A token consists of a sequence of characters.  */
168 struct token
169 {
170   int allocated;		/* number of allocated 'token_char's */
171   int charcount;		/* number of used 'token_char's */
172   char *chars;			/* the token's constituents */
173 };
174 
175 /* Initialize a 'struct token'.  */
176 static inline void
init_token(struct token * tp)177 init_token (struct token *tp)
178 {
179   tp->allocated = 10;
180   tp->chars = (char *) xmalloc (tp->allocated * sizeof (char));
181   tp->charcount = 0;
182 }
183 
184 /* Free the memory pointed to by a 'struct token'.  */
185 static inline void
free_token(struct token * tp)186 free_token (struct token *tp)
187 {
188   free (tp->chars);
189 }
190 
191 /* Ensure there is enough room in the token for one more character.  */
192 static inline void
grow_token(struct token * tp)193 grow_token (struct token *tp)
194 {
195   if (tp->charcount == tp->allocated)
196     {
197       tp->allocated *= 2;
198       tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
199     }
200 }
201 
202 /* Test whether a token has integer syntax.  */
203 static inline bool
is_integer(const char * p)204 is_integer (const char *p)
205 {
206   /* NB: Yes, '+.' and '-.' both designate the integer 0.  */
207   const char *p_start = p;
208 
209   if (*p == '+' || *p == '-')
210     p++;
211   if (*p == '\0')
212     return false;
213   while (*p >= '0' && *p <= '9')
214     p++;
215   if (p > p_start && *p == '.')
216     p++;
217   return (*p == '\0');
218 }
219 
220 /* Test whether a token has float syntax.  */
221 static inline bool
is_float(const char * p)222 is_float (const char *p)
223 {
224   enum { LEAD_INT = 1, DOT_CHAR = 2, TRAIL_INT = 4, E_CHAR = 8, EXP_INT = 16 };
225   int state;
226 
227   state = 0;
228   if (*p == '+' || *p == '-')
229     p++;
230   if (*p >= '0' && *p <= '9')
231     {
232       state |= LEAD_INT;
233       do
234 	p++;
235       while (*p >= '0' && *p <= '9');
236     }
237   if (*p == '.')
238     {
239       state |= DOT_CHAR;
240       p++;
241     }
242   if (*p >= '0' && *p <= '9')
243     {
244       state |= TRAIL_INT;
245       do
246 	p++;
247       while (*p >= '0' && *p <= '9');
248     }
249   if (*p == 'e' || *p == 'E')
250     {
251       state |= E_CHAR;
252       p++;
253       if (*p == '+' || *p == '-')
254 	p++;
255       if (*p >= '0' && *p <= '9')
256 	{
257 	  state |= EXP_INT;
258 	  do
259 	    p++;
260 	  while (*p >= '0' && *p <= '9');
261 	}
262       else if (p[-1] == '+'
263 	       && ((p[0] == 'I' && p[1] == 'N' && p[2] == 'F')
264 		   || (p[0] == 'N' && p[1] == 'a' && p[2] == 'N')))
265 	{
266 	  state |= EXP_INT;
267 	  p += 3;
268 	}
269     }
270   return (*p == '\0')
271 	 && (state == (LEAD_INT | DOT_CHAR | TRAIL_INT)
272 	     || state == (DOT_CHAR | TRAIL_INT)
273 	     || state == (LEAD_INT | E_CHAR | EXP_INT)
274 	     || state == (LEAD_INT | DOT_CHAR | TRAIL_INT | E_CHAR | EXP_INT)
275 	     || state == (DOT_CHAR | TRAIL_INT | E_CHAR | EXP_INT));
276 }
277 
278 /* Read the next token.  'first' is the first character, which has already
279    been read.  Returns true for a symbol, false for a number.  */
280 static bool
read_token(struct token * tp,int first)281 read_token (struct token *tp, int first)
282 {
283   int c;
284   bool quoted = false;
285 
286   init_token (tp);
287 
288   c = first;
289 
290   for (;; c = do_getc ())
291     {
292       if (c == EOF)
293 	break;
294       if (c <= ' ') /* FIXME: Assumes ASCII compatible encoding */
295 	break;
296       if (c == '\"' || c == '\'' || c == ';' || c == '(' || c == ')'
297 	  || c == '[' || c == ']' || c == '#')
298 	break;
299       if (c == '\\')
300 	{
301 	  quoted = true;
302 	  c = do_getc ();
303 	  if (c == EOF)
304 	    /* Invalid, but be tolerant.  */
305 	    break;
306 	}
307       grow_token (tp);
308       tp->chars[tp->charcount++] = c;
309     }
310   if (c != EOF)
311     do_ungetc (c);
312 
313   if (quoted)
314     return true; /* symbol */
315 
316   /* Add a NUL byte at the end, for is_integer and is_float.  */
317   grow_token (tp);
318   tp->chars[tp->charcount] = '\0';
319 
320   if (is_integer (tp->chars) || is_float (tp->chars))
321     return false; /* number */
322   else
323     return true; /* symbol */
324 }
325 
326 
327 /* ========================= Accumulating comments ========================= */
328 
329 
330 static char *buffer;
331 static size_t bufmax;
332 static size_t buflen;
333 
334 static inline void
comment_start()335 comment_start ()
336 {
337   buflen = 0;
338 }
339 
340 static inline void
comment_add(int c)341 comment_add (int c)
342 {
343   if (buflen >= bufmax)
344     {
345       bufmax = 2 * bufmax + 10;
346       buffer = xrealloc (buffer, bufmax);
347     }
348   buffer[buflen++] = c;
349 }
350 
351 static inline void
comment_line_end(size_t chars_to_remove)352 comment_line_end (size_t chars_to_remove)
353 {
354   buflen -= chars_to_remove;
355   while (buflen >= 1
356 	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
357     --buflen;
358   if (chars_to_remove == 0 && buflen >= bufmax)
359     {
360       bufmax = 2 * bufmax + 10;
361       buffer = xrealloc (buffer, bufmax);
362     }
363   buffer[buflen] = '\0';
364   savable_comment_add (buffer);
365 }
366 
367 
368 /* These are for tracking whether comments count as immediately before
369    keyword.  */
370 static int last_comment_line;
371 static int last_non_comment_line;
372 
373 
374 /* ========================= Accumulating messages ========================= */
375 
376 
377 static message_list_ty *mlp;
378 
379 
380 /* ============== Reading of objects.  See CLHS 2 "Syntax".  ============== */
381 
382 
383 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
384    Other objects need not to be represented precisely.  */
385 enum object_type
386 {
387   t_symbol,	/* symbol */
388   t_string,	/* string */
389   t_other,	/* other kind of real object */
390   t_dot,	/* '.' pseudo object */
391   t_listclose,	/* ')' pseudo object */
392   t_vectorclose,/* ']' pseudo object */
393   t_eof		/* EOF marker */
394 };
395 
396 struct object
397 {
398   enum object_type type;
399   struct token *token;		/* for t_symbol and t_string */
400   int line_number_at_start;	/* for t_string */
401 };
402 
403 /* Free the memory pointed to by a 'struct object'.  */
404 static inline void
free_object(struct object * op)405 free_object (struct object *op)
406 {
407   if (op->type == t_symbol || op->type == t_string)
408     {
409       free_token (op->token);
410       free (op->token);
411     }
412 }
413 
414 /* Convert a t_symbol/t_string token to a char*.  */
415 static char *
string_of_object(const struct object * op)416 string_of_object (const struct object *op)
417 {
418   char *str;
419   int n;
420 
421   if (!(op->type == t_symbol || op->type == t_string))
422     abort ();
423   n = op->token->charcount;
424   str = (char *) xmalloc (n + 1);
425   memcpy (str, op->token->chars, n);
426   str[n] = '\0';
427   return str;
428 }
429 
430 /* Context lookup table.  */
431 static flag_context_list_table_ty *flag_context_list_table;
432 
433 /* Returns the character represented by an escape sequence.  */
434 #define IGNORABLE_ESCAPE (EOF - 1)
435 static int
do_getc_escaped(int c,bool in_string)436 do_getc_escaped (int c, bool in_string)
437 {
438   switch (c)
439     {
440     case 'a':
441       return '\a';
442     case 'b':
443       return '\b';
444     case 'd':
445       return 0x7F;
446     case 'e':
447       return 0x1B;
448     case 'f':
449       return '\f';
450     case 'n':
451       return '\n';
452     case 'r':
453       return '\r';
454     case 't':
455       return '\t';
456     case 'v':
457       return '\v';
458 
459     case '\n':
460       return IGNORABLE_ESCAPE;
461 
462     case ' ':
463       return (in_string ? IGNORABLE_ESCAPE : ' ');
464 
465     case 'M': /* meta */
466       c = do_getc ();
467       if (c == EOF)
468 	return EOF;
469       if (c != '-')
470 	/* Invalid input.  But be tolerant.  */
471 	return c;
472       c = do_getc ();
473       if (c == EOF)
474 	return EOF;
475       if (c == '\\')
476 	{
477 	  c = do_getc ();
478 	  if (c == EOF)
479 	    return EOF;
480 	  c = do_getc_escaped (c, false);
481 	}
482       return c | 0x80;
483 
484     case 'S': /* shift */
485       c = do_getc ();
486       if (c == EOF)
487 	return EOF;
488       if (c != '-')
489 	/* Invalid input.  But be tolerant.  */
490 	return c;
491       c = do_getc ();
492       if (c == EOF)
493 	return EOF;
494       if (c == '\\')
495 	{
496 	  c = do_getc ();
497 	  if (c == EOF)
498 	    return EOF;
499 	  c = do_getc_escaped (c, false);
500 	}
501       return (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
502 
503     case 'H': /* hyper */
504     case 'A': /* alt */
505     case 's': /* super */
506       c = do_getc ();
507       if (c == EOF)
508 	return EOF;
509       if (c != '-')
510 	/* Invalid input.  But be tolerant.  */
511 	return c;
512       c = do_getc ();
513       if (c == EOF)
514 	return EOF;
515       if (c == '\\')
516 	{
517 	  c = do_getc ();
518 	  if (c == EOF)
519 	    return EOF;
520 	  c = do_getc_escaped (c, false);
521 	}
522       return c;
523 
524     case 'C': /* ctrl */
525       c = do_getc ();
526       if (c == EOF)
527 	return EOF;
528       if (c != '-')
529 	/* Invalid input.  But be tolerant.  */
530 	return c;
531       /*FALLTHROUGH*/
532     case '^':
533       c = do_getc ();
534       if (c == EOF)
535 	return EOF;
536       if (c == '\\')
537 	{
538 	  c = do_getc ();
539 	  if (c == EOF)
540 	    return EOF;
541 	  c = do_getc_escaped (c, false);
542 	}
543       if (c == '?')
544 	return 0x7F;
545       if ((c & 0x5F) >= 0x41 && (c & 0x5F) <= 0x5A)
546 	return c & 0x9F;
547       if ((c & 0x7F) >= 0x40 && (c & 0x7F) <= 0x5F)
548 	return c & 0x9F;
549 #if 0 /* We cannot handle NUL bytes in strings.  */
550       if (c == ' ')
551 	return 0x00;
552 #endif
553       return c;
554 
555     case '0': case '1': case '2': case '3': case '4':
556     case '5': case '6': case '7':
557       /* An octal escape, as in ANSI C.  */
558       {
559 	int n = c - '0';
560 
561 	c = do_getc ();
562 	if (c != EOF)
563 	  {
564 	    if (c >= '0' && c <= '7')
565 	      {
566 		n = (n << 3) + (c - '0');
567 		c = do_getc ();
568 		if (c != EOF)
569 		  {
570 		    if (c >= '0' && c <= '7')
571 		      n = (n << 3) + (c - '0');
572 		    else
573 		      do_ungetc (c);
574 		  }
575 	      }
576 	    else
577 	      do_ungetc (c);
578 	  }
579 	return (unsigned char) n;
580       }
581 
582     case 'x':
583       /* A hexadecimal escape, as in ANSI C.  */
584       {
585 	int n = 0;
586 
587 	for (;;)
588 	  {
589 	    c = do_getc ();
590 	    if (c == EOF)
591 	      break;
592 	    else if (c >= '0' && c <= '9')
593 	      n = (n << 4) + (c - '0');
594 	    else if (c >= 'A' && c <= 'F')
595 	      n = (n << 4) + (c - 'A' + 10);
596 	    else if (c >= 'a' && c <= 'f')
597 	      n = (n << 4) + (c - 'a' + 10);
598 	    else
599 	      {
600 		do_ungetc (c);
601 		break;
602 	      }
603 	  }
604 	return (unsigned char) n;
605       }
606 
607     default:
608       /* Ignore Emacs multibyte character stuff.  All the strings we are
609 	 interested in are ASCII strings.  */
610       return c;
611     }
612 }
613 
614 /* Read the next object.
615    'first_in_list' and 'new_backquote_flag' are used for reading old
616    backquote syntax and new backquote syntax.  */
617 static void
read_object(struct object * op,bool first_in_list,bool new_backquote_flag,flag_context_ty outer_context)618 read_object (struct object *op, bool first_in_list, bool new_backquote_flag,
619 	     flag_context_ty outer_context)
620 {
621   for (;;)
622     {
623       int c;
624 
625       c = do_getc ();
626 
627       switch (c)
628 	{
629 	case EOF:
630 	  op->type = t_eof;
631 	  return;
632 
633 	case '\n':
634 	  /* Comments assumed to be grouped with a message must immediately
635 	     precede it, with no non-whitespace token on a line between
636 	     both.  */
637 	  if (last_non_comment_line > last_comment_line)
638 	    savable_comment_reset ();
639 	  continue;
640 
641 	case '(':
642 	  {
643 	    int arg = 0;		/* Current argument number.  */
644 	    flag_context_list_iterator_ty context_iter;
645 	    const struct callshapes *shapes = NULL;
646 	    struct arglist_parser *argparser = NULL;
647 
648 	    for (;; arg++)
649 	      {
650 		struct object inner;
651 		flag_context_ty inner_context;
652 
653 		if (arg == 0)
654 		  inner_context = null_context;
655 		else
656 		  inner_context =
657 		    inherited_context (outer_context,
658 				       flag_context_list_iterator_advance (
659 					 &context_iter));
660 
661 		read_object (&inner, arg == 0, new_backquote_flag,
662 			     inner_context);
663 
664 		/* Recognize end of list.  */
665 		if (inner.type == t_listclose)
666 		  {
667 		    op->type = t_other;
668 		    /* Don't bother converting "()" to "NIL".  */
669 		    last_non_comment_line = line_number;
670 		    if (argparser != NULL)
671 		      arglist_parser_done (argparser, arg);
672 		    return;
673 		  }
674 
675 		/* Dots are not allowed in every position. ']' is not allowed.
676 		   But be tolerant.  */
677 
678 		/* EOF inside list is illegal.  But be tolerant.  */
679 		if (inner.type == t_eof)
680 		  break;
681 
682 		if (arg == 0)
683 		  {
684 		    /* This is the function position.  */
685 		    if (inner.type == t_symbol)
686 		      {
687 			char *symbol_name = string_of_object (&inner);
688 			void *keyword_value;
689 
690 			if (hash_find_entry (&keywords,
691 					     symbol_name, strlen (symbol_name),
692 					     &keyword_value)
693 			    == 0)
694 			  shapes = (const struct callshapes *) keyword_value;
695 
696 			argparser = arglist_parser_alloc (mlp, shapes);
697 
698 			context_iter =
699 			  flag_context_list_iterator (
700 			    flag_context_list_table_lookup (
701 			      flag_context_list_table,
702 			      symbol_name, strlen (symbol_name)));
703 
704 			free (symbol_name);
705 		      }
706 		    else
707 		      context_iter = null_context_list_iterator;
708 		  }
709 		else
710 		  {
711 		    /* These are the argument positions.  */
712 		    if (argparser != NULL && inner.type == t_string)
713 		      arglist_parser_remember (argparser, arg,
714 					       string_of_object (&inner),
715 					       inner_context,
716 					       logical_file_name,
717 					       inner.line_number_at_start,
718 					       savable_comment);
719 		  }
720 
721 		free_object (&inner);
722 	      }
723 
724 	    if (argparser != NULL)
725 	      arglist_parser_done (argparser, arg);
726 	  }
727 	  op->type = t_other;
728 	  last_non_comment_line = line_number;
729 	  return;
730 
731 	case ')':
732 	  /* Tell the caller about the end of list.
733 	     Unmatched closing parenthesis is illegal.  But be tolerant.  */
734 	  op->type = t_listclose;
735 	  last_non_comment_line = line_number;
736 	  return;
737 
738 	case '[':
739 	  {
740 	    for (;;)
741 	      {
742 		struct object inner;
743 
744 		read_object (&inner, false, new_backquote_flag, null_context);
745 
746 		/* Recognize end of vector.  */
747 		if (inner.type == t_vectorclose)
748 		  {
749 		    op->type = t_other;
750 		    last_non_comment_line = line_number;
751 		    return;
752 		  }
753 
754 		/* Dots and ')' are not allowed.  But be tolerant.  */
755 
756 		/* EOF inside vector is illegal.  But be tolerant.  */
757 		if (inner.type == t_eof)
758 		  break;
759 
760 		free_object (&inner);
761 	      }
762 	  }
763 	  op->type = t_other;
764 	  last_non_comment_line = line_number;
765 	  return;
766 
767 	case ']':
768 	  /* Tell the caller about the end of vector.
769 	     Unmatched closing bracket is illegal.  But be tolerant.  */
770 	  op->type = t_vectorclose;
771 	  last_non_comment_line = line_number;
772 	  return;
773 
774 	case '\'':
775 	  {
776 	    struct object inner;
777 
778 	    read_object (&inner, false, new_backquote_flag, null_context);
779 
780 	    /* Dots and EOF are not allowed here.  But be tolerant.  */
781 
782 	    free_object (&inner);
783 
784 	    op->type = t_other;
785 	    last_non_comment_line = line_number;
786 	    return;
787 	  }
788 
789 	case '`':
790 	  if (first_in_list)
791 	    goto default_label;
792 	  {
793 	    struct object inner;
794 
795 	    read_object (&inner, false, true, null_context);
796 
797 	    /* Dots and EOF are not allowed here.  But be tolerant.  */
798 
799 	    free_object (&inner);
800 
801 	    op->type = t_other;
802 	    last_non_comment_line = line_number;
803 	    return;
804 	  }
805 
806 	case ',':
807 	  if (!new_backquote_flag)
808 	    goto default_label;
809 	  {
810 	    int c = do_getc ();
811 	    /* The ,@ handling inside lists is wrong anyway, because
812 	       ,@form expands to an unknown number of elements.  */
813 	    if (c != EOF && c != '@' && c != '.')
814 	      do_ungetc (c);
815 	  }
816 	  {
817 	    struct object inner;
818 
819 	    read_object (&inner, false, false, null_context);
820 
821 	    /* Dots and EOF are not allowed here.  But be tolerant.  */
822 
823 	    free_object (&inner);
824 
825 	    op->type = t_other;
826 	    last_non_comment_line = line_number;
827 	    return;
828 	  }
829 
830 	case ';':
831 	  {
832 	    bool all_semicolons = true;
833 
834 	    last_comment_line = line_number;
835 	    comment_start ();
836 	    for (;;)
837 	      {
838 		int c = do_getc ();
839 		if (c == EOF || c == '\n')
840 		  break;
841 		if (c != ';')
842 		  all_semicolons = false;
843 		if (!all_semicolons)
844 		  {
845 		    /* We skip all leading white space, but not EOLs.  */
846 		    if (!(buflen == 0 && (c == ' ' || c == '\t')))
847 		      comment_add (c);
848 		  }
849 	      }
850 	    comment_line_end (0);
851 	    continue;
852 	  }
853 
854 	case '"':
855 	  {
856 	    op->token = (struct token *) xmalloc (sizeof (struct token));
857 	    init_token (op->token);
858 	    op->line_number_at_start = line_number;
859 	    for (;;)
860 	      {
861 		int c = do_getc ();
862 		if (c == EOF)
863 		  /* Invalid input.  Be tolerant, no error message.  */
864 		  break;
865 		if (c == '"')
866 		  break;
867 		if (c == '\\')
868 		  {
869 		    c = do_getc ();
870 		    if (c == EOF)
871 		      /* Invalid input.  Be tolerant, no error message.  */
872 		      break;
873 		    c = do_getc_escaped (c, true);
874 		    if (c == EOF)
875 		      /* Invalid input.  Be tolerant, no error message.  */
876 		      break;
877 		    if (c == IGNORABLE_ESCAPE)
878 		      /* Ignore escaped newline and escaped space.  */
879 		      ;
880 		    else
881 		      {
882 			grow_token (op->token);
883 			op->token->chars[op->token->charcount++] = c;
884 		      }
885 		  }
886 		else
887 		  {
888 		    grow_token (op->token);
889 		    op->token->chars[op->token->charcount++] = c;
890 		  }
891 	      }
892 	    op->type = t_string;
893 
894 	    if (extract_all)
895 	      {
896 		lex_pos_ty pos;
897 
898 		pos.file_name = logical_file_name;
899 		pos.line_number = op->line_number_at_start;
900 		remember_a_message (mlp, NULL, string_of_object (op),
901 				    null_context, &pos, savable_comment);
902 	      }
903 	    last_non_comment_line = line_number;
904 	    return;
905 	  }
906 
907 	case '?':
908 	  c = do_getc ();
909 	  if (c == EOF)
910 	    /* Invalid input.  Be tolerant, no error message.  */
911 	    ;
912 	  else if (c == '\\')
913 	    {
914 	      c = do_getc ();
915 	      if (c == EOF)
916 		/* Invalid input.  Be tolerant, no error message.  */
917 		;
918 	      else
919 		{
920 		  c = do_getc_escaped (c, false);
921 		  if (c == EOF)
922 		    /* Invalid input.  Be tolerant, no error message.  */
923 		    ;
924 		}
925 	    }
926 	  /* Impossible to deal with Emacs multibyte character stuff here.  */
927 	  op->type = t_other;
928 	  last_non_comment_line = line_number;
929 	  return;
930 
931 	case '#':
932 	  /* Dispatch macro handling.  */
933 	  c = do_getc ();
934 	  if (c == EOF)
935 	    /* Invalid input.  Be tolerant, no error message.  */
936 	    {
937 	      op->type = t_other;
938 	      return;
939 	    }
940 
941 	  switch (c)
942 	    {
943 	    case '^':
944 	      c = do_getc ();
945 	      if (c == '^')
946 		c = do_getc ();
947 	      if (c == '[')
948 		{
949 		  /* Read a char table, same syntax as a vector.  */
950 		  for (;;)
951 		    {
952 		      struct object inner;
953 
954 		      read_object (&inner, false, new_backquote_flag,
955 				   null_context);
956 
957 		      /* Recognize end of vector.  */
958 		      if (inner.type == t_vectorclose)
959 			{
960 			  op->type = t_other;
961 			  last_non_comment_line = line_number;
962 			  return;
963 			}
964 
965 		      /* Dots and ')' are not allowed.  But be tolerant.  */
966 
967 		      /* EOF inside vector is illegal.  But be tolerant.  */
968 		      if (inner.type == t_eof)
969 			break;
970 
971 		      free_object (&inner);
972 		    }
973 		  op->type = t_other;
974 		  last_non_comment_line = line_number;
975 		  return;
976 		}
977 	      else
978 		/* Invalid input.  Be tolerant, no error message.  */
979 		{
980 		  op->type = t_other;
981 		  if (c != EOF)
982 		    last_non_comment_line = line_number;
983 		  return;
984 		}
985 
986 	    case '&':
987 	      /* Read a bit vector.  */
988 	      {
989 		struct object length;
990 		read_object (&length, first_in_list, new_backquote_flag,
991 			     null_context);
992 		/* Dots and EOF are not allowed here.
993 		   But be tolerant.  */
994 		free_object (&length);
995 	      }
996 	      c = do_getc ();
997 	      if (c == '"')
998 		{
999 		  struct object string;
1000 		  read_object (&string, first_in_list, new_backquote_flag,
1001 			       null_context);
1002 		  free_object (&string);
1003 		}
1004 	      else
1005 		/* Invalid input.  Be tolerant, no error message.  */
1006 		do_ungetc (c);
1007 	      op->type = t_other;
1008 	      last_non_comment_line = line_number;
1009 	      return;
1010 
1011 	    case '[':
1012 	      /* Read a compiled function, same syntax as a vector.  */
1013 	    case '(':
1014 	      /* Read a string with properties, same syntax as a list.  */
1015 	      {
1016 		struct object inner;
1017 		do_ungetc (c);
1018 		read_object (&inner, false, new_backquote_flag, null_context);
1019 		/* Dots and EOF are not allowed here.
1020 		   But be tolerant.  */
1021 		free_object (&inner);
1022 		op->type = t_other;
1023 		last_non_comment_line = line_number;
1024 		return;
1025 	      }
1026 
1027 	    case '@':
1028 	      /* Read a comment consisting of a given number of bytes.  */
1029 	      {
1030 		unsigned int nskip = 0;
1031 
1032 		for (;;)
1033 		  {
1034 		    c = do_getc ();
1035 		    if (!(c >= '0' && c <= '9'))
1036 		      break;
1037 		    nskip = 10 * nskip + (c - '0');
1038 		  }
1039 		if (c != EOF)
1040 		  {
1041 		    do_ungetc (c);
1042 		    for (; nskip > 0; nskip--)
1043 		      if (do_getc () == EOF)
1044 			break;
1045 		  }
1046 		continue;
1047 	      }
1048 
1049 	    case '$':
1050 	      op->type = t_other;
1051 	      last_non_comment_line = line_number;
1052 	      return;
1053 
1054 	    case '\'':
1055 	    case ':':
1056 	    case 'S': case 's': /* XEmacs only */
1057 	      {
1058 		struct object inner;
1059 		read_object (&inner, false, new_backquote_flag, null_context);
1060 		/* Dots and EOF are not allowed here.
1061 		   But be tolerant.  */
1062 		free_object (&inner);
1063 		op->type = t_other;
1064 		last_non_comment_line = line_number;
1065 		return;
1066 	      }
1067 
1068 	    case '0': case '1': case '2': case '3': case '4':
1069 	    case '5': case '6': case '7': case '8': case '9':
1070 	      /* Read Common Lisp style #n# or #n=.  */
1071 	      for (;;)
1072 		{
1073 		  c = do_getc ();
1074 		  if (!(c >= '0' && c <= '9'))
1075 		    break;
1076 		}
1077 	      if (c == EOF)
1078 		/* Invalid input.  Be tolerant, no error message.  */
1079 		{
1080 		  op->type = t_other;
1081 		  return;
1082 		}
1083 	      if (c == '=')
1084 		{
1085 		  read_object (op, false, new_backquote_flag, outer_context);
1086 		  last_non_comment_line = line_number;
1087 		  return;
1088 		}
1089 	      if (c == '#')
1090 		{
1091 		  op->type = t_other;
1092 		  last_non_comment_line = line_number;
1093 		  return;
1094 		}
1095 	      if (c == 'R' || c == 'r')
1096 		{
1097 		  /* Read an integer.  */
1098 		  c = do_getc ();
1099 		  if (c == '+' || c == '-')
1100 		    c = do_getc ();
1101 		  for (; c != EOF; c = do_getc ())
1102 		    if (!c_isalnum (c))
1103 		      {
1104 			do_ungetc (c);
1105 			break;
1106 		      }
1107 		  op->type = t_other;
1108 		  last_non_comment_line = line_number;
1109 		  return;
1110 		}
1111 	      /* Invalid input.  Be tolerant, no error message.  */
1112 	      op->type = t_other;
1113 	      last_non_comment_line = line_number;
1114 	      return;
1115 
1116 	    case 'X': case 'x':
1117 	    case 'O': case 'o':
1118 	    case 'B': case 'b':
1119 	      {
1120 		/* Read an integer.  */
1121 		c = do_getc ();
1122 		if (c == '+' || c == '-')
1123 		  c = do_getc ();
1124 		for (; c != EOF; c = do_getc ())
1125 		  if (!c_isalnum (c))
1126 		    {
1127 		      do_ungetc (c);
1128 		      break;
1129 		    }
1130 		op->type = t_other;
1131 		last_non_comment_line = line_number;
1132 		return;
1133 	      }
1134 
1135 	    case '*': /* XEmacs only */
1136 	      {
1137 		/* Read a bit-vector.  */
1138 		do
1139 		  c = do_getc ();
1140 		while (c == '0' || c == '1');
1141 		if (c != EOF)
1142 		  do_ungetc (c);
1143 		op->type = t_other;
1144 		last_non_comment_line = line_number;
1145 		return;
1146 	      }
1147 
1148 	    case '+': /* XEmacs only */
1149 	    case '-': /* XEmacs only */
1150 	      /* Simply assume every feature expression is true.  */
1151 	      {
1152 		struct object inner;
1153 		read_object (&inner, false, new_backquote_flag, null_context);
1154 		/* Dots and EOF are not allowed here.
1155 		   But be tolerant.  */
1156 		free_object (&inner);
1157 		continue;
1158 	      }
1159 
1160 	    default:
1161 	      /* Invalid input.  Be tolerant, no error message.  */
1162 	      op->type = t_other;
1163 	      last_non_comment_line = line_number;
1164 	      return;
1165 	    }
1166 
1167 	  /*NOTREACHED*/
1168 	  abort ();
1169 
1170 	case '.':
1171 	  c = do_getc ();
1172 	  if (c != EOF)
1173 	    {
1174 	      do_ungetc (c);
1175 	      if (c <= ' ' /* FIXME: Assumes ASCII compatible encoding */
1176 		  || strchr ("\"'`,(", c) != NULL)
1177 		{
1178 		  op->type = t_dot;
1179 		  last_non_comment_line = line_number;
1180 		  return;
1181 		}
1182 	    }
1183 	  c = '.';
1184 	  /*FALLTHROUGH*/
1185 	default:
1186 	default_label:
1187 	  if (c <= ' ') /* FIXME: Assumes ASCII compatible encoding */
1188 	    continue;
1189 	  /* Read a token.  */
1190 	  {
1191 	    bool symbol;
1192 
1193 	    op->token = (struct token *) xmalloc (sizeof (struct token));
1194 	    symbol = read_token (op->token, c);
1195 	    if (symbol)
1196 	      {
1197 		op->type = t_symbol;
1198 		last_non_comment_line = line_number;
1199 		return;
1200 	      }
1201 	    else
1202 	      {
1203 		free_token (op->token);
1204 		free (op->token);
1205 		op->type = t_other;
1206 		last_non_comment_line = line_number;
1207 		return;
1208 	      }
1209 	  }
1210 	}
1211     }
1212 }
1213 
1214 
1215 void
extract_elisp(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1216 extract_elisp (FILE *f,
1217 	       const char *real_filename, const char *logical_filename,
1218 	       flag_context_list_table_ty *flag_table,
1219 	       msgdomain_list_ty *mdlp)
1220 {
1221   mlp = mdlp->item[0]->messages;
1222 
1223   fp = f;
1224   real_file_name = real_filename;
1225   logical_file_name = xstrdup (logical_filename);
1226   line_number = 1;
1227 
1228   last_comment_line = -1;
1229   last_non_comment_line = -1;
1230 
1231   flag_context_list_table = flag_table;
1232 
1233   init_keywords ();
1234 
1235   /* Eat tokens until eof is seen.  When read_object returns
1236      due to an unbalanced closing parenthesis, just restart it.  */
1237   do
1238     {
1239       struct object toplevel_object;
1240 
1241       read_object (&toplevel_object, false, false, null_context);
1242 
1243       if (toplevel_object.type == t_eof)
1244 	break;
1245 
1246       free_object (&toplevel_object);
1247     }
1248   while (!feof (fp));
1249 
1250   /* Close scanner.  */
1251   fp = NULL;
1252   real_file_name = NULL;
1253   logical_file_name = NULL;
1254   line_number = 0;
1255 }
1256