xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/x-csharp.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* xgettext C# backend.
2    Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2003.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 2, or (at your option)
8    any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, write to the Free Software Foundation,
17    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 
23 #include <errno.h>
24 #include <stdbool.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 
29 #include "message.h"
30 #include "xgettext.h"
31 #include "x-csharp.h"
32 #include "c-ctype.h"
33 #include "error.h"
34 #include "error-progname.h"
35 #include "xalloc.h"
36 #include "xerror.h"
37 #include "xvasprintf.h"
38 #include "exit.h"
39 #include "hash.h"
40 #include "po-charset.h"
41 #include "utf8-ucs4.h"
42 #include "ucs4-utf8.h"
43 #include "gettext.h"
44 
/* Shorthand for gettext (s); used in this file to wrap the user-visible
   diagnostic messages.  */
#define _(s) gettext(s)
46 
/* Number of elements of the array A.  A must be a real array, not a pointer.
   The argument is parenthesized so that any expression of array type can be
   passed without precedence surprises.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
48 
49 
50 /* The C# syntax is defined in ECMA-334, second edition.  */
51 
52 
53 /* ====================== Keyword set customization.  ====================== */
54 
/* If true extract all strings.  */
static bool extract_all = false;

/* Table of recognized keywords; filled by x_csharp_keyword().  */
static hash_table keywords;
/* True as long as the built-in keywords still have to be registered.
   Cleared by x_csharp_keyword (NULL) and by init_keywords().  */
static bool default_keywords = true;
60 
61 
62 void
x_csharp_extract_all()63 x_csharp_extract_all ()
64 {
65   extract_all = true;
66 }
67 
68 
69 /* Processes a --keyword option.
70    Non-ASCII function names can be used if given in UTF-8 encoding.  */
71 void
x_csharp_keyword(const char * name)72 x_csharp_keyword (const char *name)
73 {
74   if (name == NULL)
75     default_keywords = false;
76   else
77     {
78       const char *end;
79       struct callshape shape;
80       const char *colon;
81 
82       if (keywords.table == NULL)
83 	hash_init (&keywords, 100);
84 
85       split_keywordspec (name, &end, &shape);
86 
87       /* The characters between name and end should form a valid C#
88 	 identifier sequence with dots.
89 	 A colon means an invalid parse in split_keywordspec().  */
90       colon = strchr (name, ':');
91       if (colon == NULL || colon >= end)
92 	insert_keyword_callshape (&keywords, name, end - name, &shape);
93     }
94 }
95 
96 /* Finish initializing the keywords hash table.
97    Called after argument processing, before each file is processed.  */
98 static void
init_keywords()99 init_keywords ()
100 {
101   if (default_keywords)
102     {
103       /* When adding new keywords here, also update the documentation in
104 	 xgettext.texi!  */
105       x_csharp_keyword ("GetString");	/* Resource{Manager,Set}.GetString */
106       x_csharp_keyword ("GetPluralString:1,2");	/* GettextResource{Manager,Set}.GetPluralString */
107       default_keywords = false;
108     }
109 }
110 
/* Register the C# format-string flags through xgettext_record_flag(),
   one call per specification, in the order listed below.  */
void
init_flag_table_csharp (void)
{
  static const char *const flag_specs[] =
    {
      "GetString:1:pass-csharp-format",
      "GetPluralString:1:pass-csharp-format",
      "GetPluralString:2:pass-csharp-format",
      "String.Format:1:csharp-format"
    };
  size_t i;

  for (i = 0; i < sizeof (flag_specs) / sizeof (flag_specs[0]); i++)
    xgettext_record_flag (flag_specs[i]);
}
119 
120 
121 /* ======================== Reading of characters.  ======================== */
122 
/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number is maintained by phase1_getc() and phase1_ungetc().  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  */
static FILE *fp;
132 
133 
134 /* Phase 1: line_number handling.  */
135 
/* Maximum used, roughly a safer MB_LEN_MAX.  */
#define MAX_PHASE1_PUSHBACK 16
/* Stack of pushed-back bytes; the most recently pushed byte is read first.  */
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
/* Number of bytes currently stored in phase1_pushback.  */
static int phase1_pushback_length;
140 
141 /* Read the next single byte from the input file.  */
142 static int
phase1_getc()143 phase1_getc ()
144 {
145   int c;
146 
147   if (phase1_pushback_length)
148     {
149       c = phase1_pushback[--phase1_pushback_length];
150       if (c == '\n')
151 	++line_number;
152       return c;
153     }
154 
155   c = getc (fp);
156   if (c == EOF)
157     {
158       if (ferror (fp))
159 	error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
160 	       real_file_name);
161       return EOF;
162     }
163 
164   if (c == '\n')
165     ++line_number;
166   return c;
167 }
168 
169 /* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
170 static void
phase1_ungetc(int c)171 phase1_ungetc (int c)
172 {
173   if (c != EOF)
174     {
175       if (c == '\n')
176 	--line_number;
177       if (phase1_pushback_length == SIZEOF (phase1_pushback))
178 	abort ();
179       phase1_pushback[phase1_pushback_length++] = c;
180     }
181 }
182 
183 
184 /* Phase 2: Conversion to Unicode.
185    This is done early because ECMA-334 section 9.1. says that the source is
186    "an ordered sequence of Unicode characters", and because the recognition
187    of the line terminators (ECMA-334 section 9.3.1) is hardly possible without
188    prior conversion to Unicode.  */
189 
/* End-of-file indicator for functions returning an UCS-4 character.  */
#define UEOF -1

/* Newline Unicode character.  */
#define UNL 0x000a

/* Pushback storage of phase 2: room for a single character, and the number
   of characters currently stored in it.  */
static int phase2_pushback[1];
static int phase2_pushback_length;
198 
199 /* Read the next Unicode UCS-4 character from the input file.  */
200 static int
phase2_getc()201 phase2_getc ()
202 {
203   if (phase2_pushback_length)
204     return phase2_pushback[--phase2_pushback_length];
205 
206   if (xgettext_current_source_encoding == po_charset_ascii)
207     {
208       int c = phase1_getc ();
209       if (c == EOF)
210 	return UEOF;
211       if (!c_isascii (c))
212 	{
213 	  char buffer[21];
214 	  sprintf (buffer, ":%ld", (long) line_number);
215 	  multiline_error (xstrdup (""),
216 			   xasprintf (_("\
217 Non-ASCII string at %s%s.\n\
218 Please specify the source encoding through --from-code.\n"),
219 			   real_file_name, buffer));
220 	  exit (EXIT_FAILURE);
221 	}
222       return c;
223     }
224   else if (xgettext_current_source_encoding != po_charset_utf8)
225     {
226 #if HAVE_ICONV
227       /* Use iconv on an increasing number of bytes.  Read only as many bytes
228 	 through phase1_getc as needed.  This is needed to give reasonable
229 	 interactive behaviour when fp is connected to an interactive tty.  */
230       unsigned char buf[MAX_PHASE1_PUSHBACK];
231       size_t bufcount;
232       int c = phase1_getc ();
233       if (c == EOF)
234 	return UEOF;
235       buf[0] = (unsigned char) c;
236       bufcount = 1;
237 
238       for (;;)
239 	{
240 	  unsigned char scratchbuf[6];
241 	  const char *inptr = (const char *) &buf[0];
242 	  size_t insize = bufcount;
243 	  char *outptr = (char *) &scratchbuf[0];
244 	  size_t outsize = sizeof (scratchbuf);
245 
246 	  size_t res = iconv (xgettext_current_source_iconv,
247 			      (ICONV_CONST char **) &inptr, &insize,
248 			      &outptr, &outsize);
249 	  /* We expect that a character has been produced if and only if
250 	     some input bytes have been consumed.  */
251 	  if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
252 	    abort ();
253 	  if (outsize == sizeof (scratchbuf))
254 	    {
255 	      /* No character has been produced.  Must be an error.  */
256 	      if (res != (size_t)(-1))
257 		abort ();
258 
259 	      if (errno == EILSEQ)
260 		{
261 		  /* An invalid multibyte sequence was encountered.  */
262 		  multiline_error (xstrdup (""),
263 				   xasprintf (_("\
264 %s:%d: Invalid multibyte sequence.\n\
265 Please specify the correct source encoding through --from-code.\n"),
266 				   real_file_name, line_number));
267 		  exit (EXIT_FAILURE);
268 		}
269 	      else if (errno == EINVAL)
270 		{
271 		  /* An incomplete multibyte character.  */
272 		  int c;
273 
274 		  if (bufcount == MAX_PHASE1_PUSHBACK)
275 		    {
276 		      /* An overlong incomplete multibyte sequence was
277 			 encountered.  */
278 		      multiline_error (xstrdup (""),
279 				       xasprintf (_("\
280 %s:%d: Long incomplete multibyte sequence.\n\
281 Please specify the correct source encoding through --from-code.\n"),
282 				       real_file_name, line_number));
283 		      exit (EXIT_FAILURE);
284 		    }
285 
286 		  /* Read one more byte and retry iconv.  */
287 		  c = phase1_getc ();
288 		  if (c == EOF)
289 		    {
290 		      multiline_error (xstrdup (""),
291 				       xasprintf (_("\
292 %s:%d: Incomplete multibyte sequence at end of file.\n\
293 Please specify the correct source encoding through --from-code.\n"),
294 				       real_file_name, line_number));
295 		      exit (EXIT_FAILURE);
296 		    }
297 		  if (c == '\n')
298 		    {
299 		      multiline_error (xstrdup (""),
300 				       xasprintf (_("\
301 %s:%d: Incomplete multibyte sequence at end of line.\n\
302 Please specify the correct source encoding through --from-code.\n"),
303 				       real_file_name, line_number - 1));
304 		      exit (EXIT_FAILURE);
305 		    }
306 		  buf[bufcount++] = (unsigned char) c;
307 		}
308 	      else
309 		error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
310 		       real_file_name, line_number);
311 	    }
312 	  else
313 	    {
314 	      size_t outbytes = sizeof (scratchbuf) - outsize;
315 	      size_t bytes = bufcount - insize;
316 	      unsigned int uc;
317 
318 	      /* We expect that one character has been produced.  */
319 	      if (bytes == 0)
320 		abort ();
321 	      if (outbytes == 0)
322 		abort ();
323 	      /* Push back the unused bytes.  */
324 	      while (insize > 0)
325 		phase1_ungetc (buf[--insize]);
326 	      /* Convert the character from UTF-8 to UCS-4.  */
327 	      if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes)
328 		{
329 		  /* scratchbuf contains an out-of-range Unicode character
330 		     (> 0x10ffff).  */
331 		  multiline_error (xstrdup (""),
332 				   xasprintf (_("\
333 %s:%d: Invalid multibyte sequence.\n\
334 Please specify the source encoding through --from-code.\n"),
335 				   real_file_name, line_number));
336 		  exit (EXIT_FAILURE);
337 		}
338 	      return uc;
339 	    }
340 	}
341 #else
342       /* If we don't have iconv(), the only supported values for
343 	 xgettext_global_source_encoding and thus also for
344 	 xgettext_current_source_encoding are ASCII and UTF-8.  */
345       abort ();
346 #endif
347     }
348   else
349     {
350       /* Read an UTF-8 encoded character.  */
351       unsigned char buf[6];
352       unsigned int count;
353       int c;
354       unsigned int uc;
355 
356       c = phase1_getc ();
357       if (c == EOF)
358 	return UEOF;
359       buf[0] = c;
360       count = 1;
361 
362       if (buf[0] >= 0xc0)
363 	{
364 	  c = phase1_getc ();
365 	  if (c == EOF)
366 	    return UEOF;
367 	  buf[1] = c;
368 	  count = 2;
369 	}
370 
371       if (buf[0] >= 0xe0
372 	  && ((buf[1] ^ 0x80) < 0x40))
373 	{
374 	  c = phase1_getc ();
375 	  if (c == EOF)
376 	    return UEOF;
377 	  buf[2] = c;
378 	  count = 3;
379 	}
380 
381       if (buf[0] >= 0xf0
382 	  && ((buf[1] ^ 0x80) < 0x40)
383 	  && ((buf[2] ^ 0x80) < 0x40))
384 	{
385 	  c = phase1_getc ();
386 	  if (c == EOF)
387 	    return UEOF;
388 	  buf[3] = c;
389 	  count = 4;
390 	}
391 
392       if (buf[0] >= 0xf8
393 	  && ((buf[1] ^ 0x80) < 0x40)
394 	  && ((buf[2] ^ 0x80) < 0x40)
395 	  && ((buf[3] ^ 0x80) < 0x40))
396 	{
397 	  c = phase1_getc ();
398 	  if (c == EOF)
399 	    return UEOF;
400 	  buf[4] = c;
401 	  count = 5;
402 	}
403 
404       if (buf[0] >= 0xfc
405 	  && ((buf[1] ^ 0x80) < 0x40)
406 	  && ((buf[2] ^ 0x80) < 0x40)
407 	  && ((buf[3] ^ 0x80) < 0x40)
408 	  && ((buf[4] ^ 0x80) < 0x40))
409 	{
410 	  c = phase1_getc ();
411 	  if (c == EOF)
412 	    return UEOF;
413 	  buf[5] = c;
414 	  count = 6;
415 	}
416 
417       u8_mbtouc (&uc, buf, count);
418       return uc;
419     }
420 }
421 
422 /* Supports only one pushback character.  */
423 static void
phase2_ungetc(int c)424 phase2_ungetc (int c)
425 {
426   if (c != UEOF)
427     {
428       if (phase2_pushback_length == SIZEOF (phase2_pushback))
429 	abort ();
430       phase2_pushback[phase2_pushback_length++] = c;
431     }
432 }
433 
434 
435 /* Phase 3: Convert all line terminators to LF.
436    See ECMA-334 section 9.3.1.  */
437 
/* Line number defined in terms of phase3.
   Maintained by phase3_getc() and phase3_ungetc().  */
static int logical_line_number;

/* Pushback storage of phase 3: room for 9 characters, and the number of
   characters currently stored in it.  */
static int phase3_pushback[9];
static int phase3_pushback_length;
443 
444 /* Read the next Unicode UCS-4 character from the input file, mapping
445    all line terminators to U+000A, and dropping U+001A at the end of file.  */
446 static int
phase3_getc()447 phase3_getc ()
448 {
449   int c;
450 
451   if (phase3_pushback_length)
452     {
453       c = phase3_pushback[--phase3_pushback_length];
454       if (c == UNL)
455 	++logical_line_number;
456       return c;
457     }
458 
459   c = phase2_getc ();
460 
461   if (c == 0x000d)
462     {
463       int c1 = phase2_getc ();
464 
465       if (c1 != UEOF && c1 != 0x000a)
466 	phase2_ungetc (c1);
467 
468       /* Seen line terminator CR or CR/LF.  */
469       ++logical_line_number;
470       return UNL;
471     }
472 
473   if (c == 0x0085 || c == 0x2028 || c == 0x2029)
474     {
475       /* Seen Unicode word processor newline.  */
476       ++logical_line_number;
477       return UNL;
478     }
479 
480   if (c == 0x001a)
481     {
482       int c1 = phase2_getc ();
483 
484       if (c1 == UEOF)
485 	/* Seen U+001A right before the end of file.  */
486 	return UEOF;
487 
488       phase2_ungetc (c1);
489     }
490 
491   if (c == UNL)
492     ++logical_line_number;
493   return c;
494 }
495 
496 /* Supports 9 characters of pushback.  */
497 static void
phase3_ungetc(int c)498 phase3_ungetc (int c)
499 {
500   if (c != UEOF)
501     {
502       if (c == UNL)
503 	--logical_line_number;
504       if (phase3_pushback_length == SIZEOF (phase3_pushback))
505 	abort ();
506       phase3_pushback[phase3_pushback_length++] = c;
507     }
508 }
509 
510 
511 /* ========================= Accumulating strings.  ======================== */
512 
/* A string buffer type that allows appending Unicode characters.
   Returns the entire string in UTF-8 encoding.  */

struct string_buffer
{
  /* The part of the string that has already been converted to UTF-8.  */
  char *utf8_buffer;		/* Storage; NULL as long as nothing is allocated.  */
  size_t utf8_buflen;		/* Number of bytes in use.  */
  size_t utf8_allocated;	/* Number of bytes allocated.  */
};
523 
524 /* Initialize a 'struct string_buffer' to empty.  */
525 static inline void
init_string_buffer(struct string_buffer * bp)526 init_string_buffer (struct string_buffer *bp)
527 {
528   bp->utf8_buffer = NULL;
529   bp->utf8_buflen = 0;
530   bp->utf8_allocated = 0;
531 }
532 
533 /* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
534 static inline void
string_buffer_append_unicode_grow(struct string_buffer * bp,size_t count)535 string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count)
536 {
537   if (bp->utf8_buflen + count > bp->utf8_allocated)
538     {
539       size_t new_allocated = 2 * bp->utf8_allocated + 10;
540       if (new_allocated < bp->utf8_buflen + count)
541 	new_allocated = bp->utf8_buflen + count;
542       bp->utf8_allocated = new_allocated;
543       bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
544     }
545 }
546 
547 /* Auxiliary function: Append a Unicode character to bp->utf8.
548    uc must be < 0x110000.  */
549 static inline void
string_buffer_append_unicode(struct string_buffer * bp,unsigned int uc)550 string_buffer_append_unicode (struct string_buffer *bp, unsigned int uc)
551 {
552   unsigned char utf8buf[6];
553   int count = u8_uctomb (utf8buf, uc, 6);
554 
555   if (count < 0)
556     /* The caller should have ensured that uc is not out-of-range.  */
557     abort ();
558 
559   string_buffer_append_unicode_grow (bp, count);
560   memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
561   bp->utf8_buflen += count;
562 }
563 
564 /* Return the string buffer's contents.  */
565 static char *
string_buffer_result(struct string_buffer * bp)566 string_buffer_result (struct string_buffer *bp)
567 {
568   /* NUL-terminate it.  */
569   string_buffer_append_unicode_grow (bp, 1);
570   bp->utf8_buffer[bp->utf8_buflen] = '\0';
571   /* Return it.  */
572   return bp->utf8_buffer;
573 }
574 
575 /* Free the memory pointed to by a 'struct string_buffer'.  */
576 static inline void
free_string_buffer(struct string_buffer * bp)577 free_string_buffer (struct string_buffer *bp)
578 {
579   free (bp->utf8_buffer);
580 }
581 
582 
583 /* ======================== Accumulating comments.  ======================== */
584 
585 
/* Accumulating a single comment line.  */

/* Holds the text of the comment line currently being read.  */
static struct string_buffer comment_buffer;
589 
590 static inline void
comment_start()591 comment_start ()
592 {
593   comment_buffer.utf8_buflen = 0;
594 }
595 
596 static inline bool
comment_at_start()597 comment_at_start ()
598 {
599   return (comment_buffer.utf8_buflen == 0);
600 }
601 
602 static inline void
comment_add(int c)603 comment_add (int c)
604 {
605   string_buffer_append_unicode (&comment_buffer, c);
606 }
607 
608 static inline void
comment_line_end(size_t chars_to_remove)609 comment_line_end (size_t chars_to_remove)
610 {
611   char *buffer = string_buffer_result (&comment_buffer);
612   size_t buflen = strlen (buffer);
613 
614   buflen -= chars_to_remove;
615   while (buflen >= 1
616 	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
617     --buflen;
618   buffer[buflen] = '\0';
619   savable_comment_add (buffer);
620 }
621 
622 
/* These are for tracking whether comments count as immediately before
   keyword.  last_comment_line is set by phase4_getc() from
   logical_line_number whenever it has read a comment.  */
static int last_comment_line;
static int last_non_comment_line;
627 
628 
629 /* Phase 4: Replace each comment that is not inside a character constant or
630    string literal with a space or newline character.
631    See ECMA-334 section 9.3.2.  */
632 
633 static int
phase4_getc()634 phase4_getc ()
635 {
636   int c0;
637   int c;
638   bool last_was_star;
639 
640   c0 = phase3_getc ();
641   if (c0 != '/')
642     return c0;
643   c = phase3_getc ();
644   switch (c)
645     {
646     default:
647       phase3_ungetc (c);
648       return c0;
649 
650     case '*':
651       /* C style comment.  */
652       comment_start ();
653       last_was_star = false;
654       for (;;)
655 	{
656 	  c = phase3_getc ();
657 	  if (c == UEOF)
658 	    break;
659 	  /* We skip all leading white space, but not EOLs.  */
660 	  if (!(comment_at_start () && (c == ' ' || c == '\t')))
661 	    comment_add (c);
662 	  switch (c)
663 	    {
664 	    case UNL:
665 	      comment_line_end (1);
666 	      comment_start ();
667 	      last_was_star = false;
668 	      continue;
669 
670 	    case '*':
671 	      last_was_star = true;
672 	      continue;
673 
674 	    case '/':
675 	      if (last_was_star)
676 		{
677 		  comment_line_end (2);
678 		  break;
679 		}
680 	      /* FALLTHROUGH */
681 
682 	    default:
683 	      last_was_star = false;
684 	      continue;
685 	    }
686 	  break;
687 	}
688       last_comment_line = logical_line_number;
689       return ' ';
690 
691     case '/':
692       /* C++ style comment.  */
693       last_comment_line = logical_line_number;
694       comment_start ();
695       for (;;)
696 	{
697 	  c = phase3_getc ();
698 	  if (c == UNL || c == UEOF)
699 	    break;
700 	  /* We skip all leading white space, but not EOLs.  */
701 	  if (!(comment_at_start () && (c == ' ' || c == '\t')))
702 	    comment_add (c);
703 	}
704       phase3_ungetc (c); /* push back the newline, to decrement logical_line_number */
705       comment_line_end (0);
706       phase3_getc (); /* read the newline again */
707       return UNL;
708     }
709 }
710 
/* Push the character C back; this simply forwards to phase3_ungetc().
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
717 
718 
719 /* ======================= Character classification.  ====================== */
720 
721 
/* Return true if a given character is white space.
   See ECMA-334 section 9.3.3.
   C is a Unicode code point; any other value, including negative ones such
   as an end-of-file indicator, yields false.  */
static bool
is_whitespace (int c)
{
  /* Unicode character class Zs, as of Unicode 4.0.  */
  /* grep '^[^;]*;[^;]*;Zs;' UnicodeData-4.0.0.txt */
  /* Plain comparisons are used, so that a negative C is never shifted.  */
  return (c == 0x0020
	  || c == 0x00a0
	  || c == 0x1680
	  || c == 0x180e
	  || (c >= 0x2000 && c <= 0x200b)
	  || c == 0x202f
	  || c == 0x205f
	  || c == 0x3000);
}
745 
746 
747 /* C# allows identifiers containing many Unicode characters.  We recognize
748    them; to use an identifier with Unicode characters in a --keyword option,
749    it must be specified in UTF-8.  */
750 
/* Look up the bit for the character UC in a three-level bitmap TABLE.
   TABLE is read as an array of int:
     - element 0 is the number of level 1 entries;
     - the level 1 entries follow, indexed by UC >> 16; each is the index of
       a block of 128 level 2 entries, or negative if absent;
     - a level 2 entry, indexed by bits 9..15 of UC, is the index of a block
       of 16 level 3 words, or negative if absent;
     - a level 3 word, indexed by bits 5..8 of UC, holds one bit for each of
       32 consecutive characters.
   Returns 1 if the bit is set, 0 if it is clear or not covered.  */
static inline int
bitmap_lookup (const void *table, unsigned int uc)
{
  const int *words = (const int *) table;
  unsigned int index1 = uc >> 16;
  int level2_base;
  int level3_base;
  unsigned int bits;

  /* Compare in unsigned arithmetic only after ruling out a negative
     count.  */
  if (words[0] < 0 || index1 >= (unsigned int) words[0])
    return 0;

  level2_base = words[1 + index1];
  if (level2_base < 0)
    return 0;

  level3_base = words[level2_base + ((uc >> 9) & 0x7f)];
  if (level3_base < 0)
    return 0;

  bits = words[level3_base + ((uc >> 5) & 0xf)];
  return (bits >> (uc & 0x1f)) & 1;
}
773 
/* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, as of Unicode 4.0,
   plus the underscore.
   Three-level bitmap in the layout that bitmap_lookup() reads.  All offsets
   are counted in ints from the start of the structure: level1 starts at
   offset 1, level2 at offset 4, level3 at offset 388.  An entry of -1 means
   that no character of the corresponding range is in the set.  */
static const
struct
  {
    int header[1];		/* Number of level1 entries.  */
    int level1[3];		/* Per 0x10000 characters: offset into level2.  */
    int level2[3 << 7];		/* Per 0x200 characters: offset into level3.  */
    /*unsigned*/ int level3[34 << 4];	/* One bit per character.  */
  }
table_identifier_start =
{
  { 3 },
  {     4,   132,   260 },
  {
      388,   404,   420,   436,   452,   468,   484,   500,
      516,   532,   548,   564,   580,    -1,   596,   612,
      628,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
      644,    -1,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   676,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   692,
      660,   660,   708,    -1,    -1,    -1,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   724,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,   740,   756,   772,   788,
      804,   820,   836,    -1,   852,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,   868,   884,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   900,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,   660,   916,    -1,    -1
  },
  {
    0x00000000, 0x00000000, 0x87FFFFFE, 0x07FFFFFE,
    0x00000000, 0x04200400, 0xFF7FFFFF, 0xFF7FFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
    0x00000000, 0x00000000, 0x00000000, 0x04000000,
    0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFC03, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
    0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
    0x000000FF, 0x00000000, 0xFFFF0000, 0x000707FF,
    0x00000000, 0x07FFFFFE, 0x000007FF, 0xFFFEC000,
    0xFFFFFFFF, 0xFFFFFFFF, 0x002FFFFF, 0x9C00C060,
    0xFFFD0000, 0x0000FFFF, 0x0000E000, 0x00000000,
    0xFFFFFFFF, 0x0002003F, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFF0, 0x23FFFFFF, 0xFF010000, 0x00000003,
    0xFFF99FE0, 0x23C5FDFF, 0xB0000000, 0x00030003,
    0xFFF987E0, 0x036DFDFF, 0x5E000000, 0x001C0000,
    0xFFFBBFE0, 0x23EDFDFF, 0x00010000, 0x00000003,
    0xFFF99FE0, 0x23EDFDFF, 0xB0000000, 0x00020003,
    0xD63DC7E8, 0x03BFC718, 0x00000000, 0x00000000,
    0xFFFDDFE0, 0x03EFFDFF, 0x00000000, 0x00000003,
    0xFFFDDFE0, 0x23EFFDFF, 0x40000000, 0x00000003,
    0xFFFDDFE0, 0x03FFFDFF, 0x00000000, 0x00000003,
    0xFC7FFFE0, 0x2FFBFFFF, 0x0000007F, 0x00000000,
    0xFFFFFFFE, 0x000DFFFF, 0x0000007F, 0x00000000,
    0xFEF02596, 0x200DECAE, 0x3000005F, 0x00000000,
    0x00000001, 0x00000000, 0xFFFFFEFF, 0x000007FF,
    0x00000F00, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0x000006FB, 0x003F0000, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
    0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
    0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
    0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
    0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
    0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
    0x0003DFFF, 0x0003FFFF, 0x0003FFFF, 0x0001DFFF,
    0xFFFFFFFF, 0x000FFFFF, 0x10800000, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
    0xFFFFFFFF, 0x000001FF, 0x00000000, 0x00000000,
    0x1FFFFFFF, 0x00000000, 0xFFFF0000, 0x001F3FFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
    0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
    0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
    0x00000000, 0x00000000, 0x00000000, 0x80020000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
    0x0000000F, 0x00000000, 0x00000000, 0x00000000,
    0x000000E0, 0x1F3E03FE, 0xFFFFFFFE, 0xFFFFFFFF,
    0xE07FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xF7FFFFFF,
    0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xA0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
    0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
    0x00000000, 0x00000000, 0x00000000, 0xFFDF0000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x1FFFFFFF,
    0x00000000, 0x07FFFFFE, 0x07FFFFFE, 0xFFFFFFC0,
    0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x00000000,
    0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
    0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
    0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
    0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
    0xFFFFFDFF, 0xFFFFFDFF, 0x000003F7, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000
  }
};
977 
978 /* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, Nd, Pc, Mn, Mc, Cf,
979    as of Unicode 4.0.  */
980 static const
981 struct
982   {
983     int header[1];
984     int level1[15];
985     int level2[4 << 7];
986     /*unsigned*/ int level3[36 << 4];
987   }
988 table_identifier_part =
989 {
990   { 15 },
991   {
992        16,   144,   272,    -1,    -1,    -1,    -1,    -1,
993        -1,    -1,    -1,    -1,    -1,    -1,   400
994   },
995   {
996       528,   544,   560,   576,   592,   608,   624,   640,
997       656,   672,   688,   704,   720,    -1,   736,   752,
998       768,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
999       784,    -1,   800,   800,   800,   800,   800,   800,
1000       800,   800,   800,   800,   800,   800,   816,   800,
1001       800,   800,   800,   800,   800,   800,   800,   800,
1002       800,   800,   800,   800,   800,   800,   800,   800,
1003       800,   800,   800,   800,   800,   800,   800,   800,
1004       800,   800,   800,   800,   800,   800,   800,   800,
1005       800,   800,   800,   800,   800,   800,   800,   832,
1006       800,   800,   848,    -1,    -1,    -1,   800,   800,
1007       800,   800,   800,   800,   800,   800,   800,   800,
1008       800,   800,   800,   800,   800,   800,   800,   800,
1009       800,   800,   800,   864,    -1,    -1,    -1,    -1,
1010        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1011        -1,    -1,    -1,    -1,   880,   896,   912,   928,
1012       944,   960,   976,    -1,   992,    -1,    -1,    -1,
1013        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1014        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1015        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1016        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1017        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1018        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1019        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1020        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1021        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1022        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1023        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1024        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1025      1008,    -1,  1024,  1040,    -1,    -1,    -1,    -1,
1026        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1027        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1028       800,   800,   800,   800,   800,   800,   800,   800,
1029       800,   800,   800,   800,   800,   800,   800,   800,
1030       800,   800,   800,   800,   800,   800,   800,   800,
1031       800,   800,   800,   800,   800,   800,   800,   800,
1032       800,   800,   800,   800,   800,   800,   800,   800,
1033       800,   800,   800,   800,   800,   800,   800,   800,
1034       800,   800,   800,   800,   800,   800,   800,   800,
1035       800,   800,   800,   800,   800,   800,   800,   800,
1036       800,   800,   800,   800,   800,   800,   800,   800,
1037       800,   800,   800,   800,   800,   800,   800,   800,
1038       800,   800,   800,  1056,    -1,    -1,    -1,    -1,
1039        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1040        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1041        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1042        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1043        -1,    -1,    -1,    -1,   800,  1072,    -1,    -1,
1044      1088,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1045        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1046        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1047        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1048        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1049        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1050        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1051        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1052        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1053        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1054        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1055        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1056        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1057        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1058        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1059        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1
1060   },
1061   {
1062     0x00000000, 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE,
1063     0x00000000, 0x04202400, 0xFF7FFFFF, 0xFF7FFFFF,
1064     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1065     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1066     0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
1067     0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
1068     0xFFFFFFFF, 0xFFFFFFFF, 0xE0FFFFFF, 0x0400FFFF,
1069     0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
1070     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1071     0xFFFFFC7B, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
1072     0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
1073     0xFFFE00FF, 0xBBFFFFFB, 0xFFFF0016, 0x000707FF,
1074     0x003F000F, 0x07FFFFFE, 0x01FFFFFF, 0xFFFFC3FF,
1075     0xFFFFFFFF, 0xFFFFFFFF, 0xBFEFFFFF, 0x9FFFFDFF,
1076     0xFFFF8000, 0xFFFFFFFF, 0x0000E7FF, 0x00000000,
1077     0xFFFFFFFF, 0x0003FFFF, 0x00000000, 0x00000000,
1078     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1079     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1080     0xFFFFFFFE, 0xF3FFFFFF, 0xFF1F3FFF, 0x0000FFCF,
1081     0xFFF99FEE, 0xF3C5FDFF, 0xB080399F, 0x0003FFCF,
1082     0xFFF987EE, 0xD36DFDFF, 0x5E003987, 0x001FFFC0,
1083     0xFFFBBFEE, 0xF3EDFDFF, 0x00013BBF, 0x0000FFCF,
1084     0xFFF99FEE, 0xF3EDFDFF, 0xB0C0398F, 0x0002FFC3,
1085     0xD63DC7EC, 0xC3BFC718, 0x00803DC7, 0x0000FF80,
1086     0xFFFDDFEE, 0xC3EFFDFF, 0x00603DDF, 0x0000FFC3,
1087     0xFFFDDFEC, 0xF3EFFDFF, 0x40603DDF, 0x0000FFC3,
1088     0xFFFDDFEC, 0xC3FFFDFF, 0x00803DCF, 0x0000FFC3,
1089     0xFC7FFFEC, 0x2FFBFFFF, 0xFF5F847F, 0x000C0000,
1090     0xFFFFFFFE, 0x07FFFFFF, 0x03FF7FFF, 0x00000000,
1091     0xFEF02596, 0x3BFFECAE, 0x33FF3F5F, 0x00000000,
1092     0x03000001, 0xC2A003FF, 0xFFFFFEFF, 0xFFFE07FF,
1093     0xFEFF0FDF, 0x1FFFFFFF, 0x00000040, 0x00000000,
1094     0xFFFFFFFF, 0x03C7F6FB, 0x03FF03FF, 0x00000000,
1095     0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
1096     0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
1097     0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
1098     0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
1099     0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
1100     0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x0003FE00,
1101     0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
1102     0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1103     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1104     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1105     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1106     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
1107     0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
1108     0x001FDFFF, 0x001FFFFF, 0x000FFFFF, 0x000DDFFF,
1109     0xFFFFFFFF, 0xFFFFFFFF, 0x308FFFFF, 0x000003FF,
1110     0x03FF3800, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
1111     0xFFFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
1112     0x1FFFFFFF, 0x0FFF0FFF, 0xFFFFFFC0, 0x001F3FFF,
1113     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1114     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1115     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1116     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
1117     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1118     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1119     0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
1120     0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
1121     0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
1122     0x0000F000, 0x80007C00, 0x00100001, 0x8002FC0F,
1123     0x00000000, 0x00000000, 0x1FFF0000, 0x000007E2,
1124     0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
1125     0x0000000F, 0x00000000, 0x00000000, 0x00000000,
1126     0x000000E0, 0x1F3EFFFE, 0xFFFFFFFE, 0xFFFFFFFF,
1127     0xE67FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF,
1128     0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
1129     0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
1130     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1131     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1132     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1133     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1134     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1135     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1136     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1137     0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
1138     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1139     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1140     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1141     0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
1142     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1143     0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
1144     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1145     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1146     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1147     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1148     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1149     0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
1150     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1151     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1152     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1153     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1154     0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
1155     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1156     0xE0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
1157     0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
1158     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1159     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1160     0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
1161     0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
1162     0x0000FFFF, 0x0018000F, 0x0000E000, 0xFFDF0000,
1163     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x9FFFFFFF,
1164     0x03FF0000, 0x87FFFFFE, 0x07FFFFFE, 0xFFFFFFE0,
1165     0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x0E000000,
1166     0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
1167     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
1168     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1169     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1170     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1171     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1172     0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
1173     0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
1174     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1175     0x3FFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
1176     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1177     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1178     0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
1179     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1180     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1181     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1182     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1183     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1184     0x00000000, 0x00000000, 0x00000000, 0xFFFFE3E0,
1185     0x00000FE7, 0x00003C00, 0x00000000, 0x00000000,
1186     0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
1187     0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
1188     0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
1189     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1190     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1191     0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
1192     0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
1193     0xFFFFFDFF, 0xFFFFFDFF, 0xFFFFC3F7, 0xFFFFFFFF,
1194     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1195     0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
1196     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1197     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1198     0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
1199     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1200     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1201     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1202     0x00000002, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1203     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1204     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1205     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF
1206   }
1207 };
1208 
1209 /* Return true if a given character can occur as first character of an
1210    identifier.  See ECMA-334 section 9.4.2.  */
1211 static bool
is_identifier_start(int c)1212 is_identifier_start (int c)
1213 {
1214   return bitmap_lookup (&table_identifier_start, c);
1215   /* In ASCII only this would be:
1216      return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_');
1217    */
1218 }
1219 
1220 /* Return true if a given character can occur as character of an identifier.
1221    See ECMA-334 section 9.4.2.  */
1222 static bool
is_identifier_part(int c)1223 is_identifier_part (int c)
1224 {
1225   return bitmap_lookup (&table_identifier_part, c);
1226   /* In ASCII only this would be:
1227      return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1228              || (c >= '0' && c <= '9') || c == '_');
1229    */
1230 }
1231 
/* Predicate that accepts every character.  It is handed to
   do_getc_unicode_escaped when a Unicode escape must not be filtered
   (escapes inside string and character literals).  */
static bool
is_any_character (int c)
{
  /* The argument is deliberately ignored.  */
  (void) c;
  return true;
}
1237 
1238 
1239 /* ======================= Preprocessor directives.  ======================= */
1240 
1241 
1242 /* Phase 5: Remove preprocessor lines.  See ECMA-334 section 9.5.
1243    As a side effect, this also removes initial whitespace on every line;
1244    this whitespace doesn't matter.  */
1245 
1246 static int phase5_pushback[10];
1247 static int phase5_pushback_length;
1248 
1249 static int
phase5_getc()1250 phase5_getc ()
1251 {
1252   int c;
1253 
1254   if (phase5_pushback_length)
1255     return phase5_pushback[--phase5_pushback_length];
1256 
1257   c = phase4_getc ();
1258   if (c != UNL)
1259     return c;
1260 
1261   do
1262     c = phase3_getc ();
1263   while (c != UEOF && is_whitespace (c));
1264 
1265   if (c == '#')
1266     {
1267       /* Ignore the entire line containing the preprocessor directive
1268 	 (including the // comment if it contains one).  */
1269       do
1270 	c = phase3_getc ();
1271       while (c != UEOF && c != UNL);
1272       return c;
1273     }
1274   else
1275     {
1276       phase3_ungetc (c);
1277       return UNL;
1278     }
1279 }
1280 
#ifdef unused
/* Push the character C back onto the phase 5 pushback stack, so that the
   next phase5_getc call returns it.  UEOF is never stored.  Aborts when
   the stack is already full.  */
static void
phase5_ungetc (int c)
{
  if (c == UEOF)
    return;

  if (phase5_pushback_length == SIZEOF (phase5_pushback))
    abort ();

  phase5_pushback[phase5_pushback_length] = c;
  phase5_pushback_length++;
}
#endif
1293 
1294 
1295 /* ========================== Reading of tokens.  ========================== */
1296 
/* The kinds of token delivered by the tokenizer.  */
typedef enum token_type_ty
{
  token_type_eof,               /* end of input */
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_lbrace,            /* { */
  token_type_rbrace,            /* } */
  token_type_comma,             /* , */
  token_type_dot,               /* . */
  token_type_string_literal,    /* "abc", @"abc" */
  token_type_number,            /* 1.23 */
  token_type_symbol,            /* identifier, keyword, null */
  token_type_plus,              /* + */
  token_type_other              /* character literal, misc. operator */
} token_type_ty;
1313 
1314 typedef struct token_ty token_ty;
1315 struct token_ty
1316 {
1317   token_type_ty type;
1318   char *string;		/* for token_type_string_literal, token_type_symbol */
1319   refcounted_string_list_ty *comment;	/* for token_type_string_literal */
1320   int line_number;
1321   int logical_line_number;
1322 };
1323 
1324 
1325 /* Free the memory pointed to by a 'struct token_ty'.  */
1326 static inline void
free_token(token_ty * tp)1327 free_token (token_ty *tp)
1328 {
1329   if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
1330     free (tp->string);
1331   if (tp->type == token_type_string_literal)
1332     drop_reference (tp->comment);
1333 }
1334 
1335 
1336 /* Read a Unicode escape sequence outside string/character literals.
1337    Reject Unicode escapes that don't fulfill the given predicate.
1338    See ECMA-334 section 9.4.2.  */
1339 static int
do_getc_unicode_escaped(bool (* predicate)(int))1340 do_getc_unicode_escaped (bool (*predicate) (int))
1341 {
1342   int c;
1343 
1344   /* Use phase 3, because phase 4 elides comments.  */
1345   c = phase3_getc ();
1346   if (c == UEOF)
1347     return '\\';
1348   if (c == 'u' || c == 'U')
1349     {
1350       unsigned char buf[8];
1351       int expect;
1352       unsigned int n;
1353       int i;
1354 
1355       expect = (c == 'U' ? 8 : 4);
1356       n = 0;
1357       for (i = 0; i < expect; i++)
1358 	{
1359 	  int c1 = phase3_getc ();
1360 
1361 	  if (c1 >= '0' && c1 <= '9')
1362 	    n = (n << 4) + (c1 - '0');
1363 	  else if (c1 >= 'A' && c1 <= 'F')
1364 	    n = (n << 4) + (c1 - 'A' + 10);
1365 	  else if (c1 >= 'a' && c1 <= 'f')
1366 	    n = (n << 4) + (c1 - 'a' + 10);
1367 	  else
1368 	    {
1369 	      phase3_ungetc (c1);
1370 	      while (--i >= 0)
1371 		phase3_ungetc (buf[i]);
1372 	      phase3_ungetc (c);
1373 	      return '\\';
1374 	    }
1375 
1376 	  buf[i] = c1;
1377 	}
1378 
1379       if (n >= 0x110000)
1380 	{
1381 	  error_with_progname = false;
1382 	  error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1383 		 logical_file_name, line_number);
1384 	  error_with_progname = true;
1385 	}
1386       else if (predicate (n))
1387 	return n;
1388 
1389       while (--i >= 0)
1390 	phase3_ungetc (buf[i]);
1391     }
1392   phase3_ungetc (c);
1393   return '\\';
1394 }
1395 
1396 
1397 /* Read an escape sequence inside a string literal or character literal.
1398    See ECMA-334 sections 9.4.4.4., 9.4.4.5.  */
1399 static int
do_getc_escaped()1400 do_getc_escaped ()
1401 {
1402   int c;
1403   int n;
1404   int i;
1405 
1406   /* Use phase 3, because phase 4 elides comments.  */
1407   c = phase3_getc ();
1408   if (c == UEOF)
1409     return '\\';
1410   switch (c)
1411     {
1412     case 'a':
1413       return 0x0007;
1414     case 'b':
1415       return 0x0008;
1416     case 't':
1417       return 0x0009;
1418     case 'n':
1419       return 0x000a;
1420     case 'v':
1421       return 0x000b;
1422     case 'f':
1423       return 0x000c;
1424     case 'r':
1425       return 0x000d;
1426     case '"':
1427       return '"';
1428     case '\'':
1429       return '\'';
1430     case '\\':
1431       return '\\';
1432     case '0':
1433       return 0x0000;
1434     case 'x':
1435       c = phase3_getc ();
1436       switch (c)
1437 	{
1438 	default:
1439 	  phase3_ungetc (c);
1440 	  phase3_ungetc ('x');
1441 	  return '\\';
1442 
1443 	case '0': case '1': case '2': case '3': case '4':
1444 	case '5': case '6': case '7': case '8': case '9':
1445 	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1446 	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1447 	  break;
1448 	}
1449       n = 0;
1450       for (i = 0;; i++)
1451 	{
1452 	  switch (c)
1453 	    {
1454 	    default:
1455 	      phase3_ungetc (c);
1456 	      return n;
1457 	    case '0': case '1': case '2': case '3': case '4':
1458 	    case '5': case '6': case '7': case '8': case '9':
1459 	      n = n * 16 + c - '0';
1460 	      break;
1461 	    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1462 	      n = n * 16 + 10 + c - 'A';
1463 	      break;
1464 	    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1465 	      n = n * 16 + 10 + c - 'a';
1466 	      break;
1467 	    }
1468 	  if (i == 3)
1469 	    break;
1470 	  c = phase3_getc ();
1471 	}
1472       return n;
1473     case 'u': case 'U':
1474       phase3_ungetc (c);
1475       return do_getc_unicode_escaped (is_any_character);
1476     default:
1477       /* Invalid escape sequence.  */
1478       phase3_ungetc (c);
1479       return '\\';
1480     }
1481 }
1482 
1483 /* Read a regular string literal or character literal.
1484    See ECMA-334 sections 9.4.4.4., 9.4.4.5.  */
1485 static void
accumulate_escaped(struct string_buffer * literal,int delimiter)1486 accumulate_escaped (struct string_buffer *literal, int delimiter)
1487 {
1488   int c;
1489 
1490   for (;;)
1491     {
1492       /* Use phase 3, because phase 4 elides comments.  */
1493       c = phase3_getc ();
1494       if (c == UEOF || c == delimiter)
1495 	break;
1496       if (c == UNL)
1497 	{
1498 	  phase3_ungetc (c);
1499 	  error_with_progname = false;
1500 	  if (delimiter == '\'')
1501 	    error (0, 0, _("%s:%d: warning: unterminated character constant"),
1502 		   logical_file_name, line_number);
1503 	  else
1504 	    error (0, 0, _("%s:%d: warning: unterminated string constant"),
1505 		   logical_file_name, line_number);
1506 	  error_with_progname = true;
1507 	  break;
1508 	}
1509       if (c == '\\')
1510 	c = do_getc_escaped ();
1511       string_buffer_append_unicode (literal, c);
1512     }
1513 }
1514 
1515 
1516 /* Combine characters into tokens.  Discard whitespace.  */
1517 
1518 /* Maximum used guaranteed to be < 4.  */
1519 static token_ty phase6_pushback[4];
1520 static int phase6_pushback_length;
1521 
1522 static void
phase6_get(token_ty * tp)1523 phase6_get (token_ty *tp)
1524 {
1525   int c;
1526 
1527   if (phase6_pushback_length)
1528     {
1529       *tp = phase6_pushback[--phase6_pushback_length];
1530       return;
1531     }
1532   tp->string = NULL;
1533 
1534   for (;;)
1535     {
1536       tp->line_number = line_number;
1537       tp->logical_line_number = logical_line_number;
1538       c = phase5_getc ();
1539 
1540       if (c == UEOF)
1541 	{
1542 	  tp->type = token_type_eof;
1543 	  return;
1544 	}
1545 
1546       switch (c)
1547 	{
1548 	case UNL:
1549 	  if (last_non_comment_line > last_comment_line)
1550 	    savable_comment_reset ();
1551 	  /* FALLTHROUGH */
1552 	case ' ':
1553 	case '\t':
1554 	case '\f':
1555 	  /* Ignore whitespace and comments.  */
1556 	  continue;
1557 	}
1558 
1559       last_non_comment_line = tp->logical_line_number;
1560 
1561       switch (c)
1562 	{
1563 	case '(':
1564 	  tp->type = token_type_lparen;
1565 	  return;
1566 
1567 	case ')':
1568 	  tp->type = token_type_rparen;
1569 	  return;
1570 
1571 	case '{':
1572 	  tp->type = token_type_lbrace;
1573 	  return;
1574 
1575 	case '}':
1576 	  tp->type = token_type_rbrace;
1577 	  return;
1578 
1579 	case ',':
1580 	  tp->type = token_type_comma;
1581 	  return;
1582 
1583 	case '.':
1584 	  c = phase4_getc ();
1585 	  if (!(c >= '0' && c <= '9'))
1586 	    {
1587 	      phase4_ungetc (c);
1588 	      tp->type = token_type_dot;
1589 	      return;
1590 	    }
1591 	  /* FALLTHROUGH */
1592 
1593 	case '0': case '1': case '2': case '3': case '4':
1594 	case '5': case '6': case '7': case '8': case '9':
1595 	  {
1596 	    /* Don't need to verify the complicated syntax of integers and
1597 	       floating-point numbers.  We assume a valid C# input.
1598 	       The simplified syntax that we recognize as number is: any
1599 	       sequence of alphanumeric characters, additionally '+' and '-'
1600 	       immediately after 'e' or 'E' except in hexadecimal numbers.  */
1601 	    bool hexadecimal = false;
1602 
1603 	    for (;;)
1604 	      {
1605 		c = phase4_getc ();
1606 		if (c >= '0' && c <= '9')
1607 		  continue;
1608 		if ((c >= 'A' && c <= 'Z') || (c >= 'a' &&c <= 'z'))
1609 		  {
1610 		    if (c == 'X' || c == 'x')
1611 		      hexadecimal = true;
1612 		    if ((c == 'E' || c == 'e') && !hexadecimal)
1613 		      {
1614 			c = phase4_getc ();
1615 			if (!(c == '+' || c == '-'))
1616 			  phase4_ungetc (c);
1617 		      }
1618 		    continue;
1619 		  }
1620 		if (c == '.')
1621 		  continue;
1622 		break;
1623 	      }
1624 	    phase4_ungetc (c);
1625 	    tp->type = token_type_number;
1626 	    return;
1627 	  }
1628 
1629 	case '"':
1630 	  /* Regular string literal.  */
1631 	  {
1632 	    struct string_buffer literal;
1633 
1634 	    init_string_buffer (&literal);
1635 	    accumulate_escaped (&literal, '"');
1636 	    tp->string = xstrdup (string_buffer_result (&literal));
1637 	    free_string_buffer (&literal);
1638 	    tp->comment = add_reference (savable_comment);
1639 	    tp->type = token_type_string_literal;
1640 	    return;
1641 	  }
1642 
1643 	case '\'':
1644 	  /* Character literal.  */
1645 	  {
1646 	    struct string_buffer literal;
1647 
1648 	    init_string_buffer (&literal);
1649 	    accumulate_escaped (&literal, '\'');
1650 	    free_string_buffer (&literal);
1651 	    tp->type = token_type_other;
1652 	    return;
1653 	  }
1654 
1655 	case '+':
1656 	  c = phase4_getc ();
1657 	  if (c == '+')
1658 	    /* Operator ++ */
1659 	    tp->type = token_type_other;
1660 	  else if (c == '=')
1661 	    /* Operator += */
1662 	    tp->type = token_type_other;
1663 	  else
1664 	    {
1665 	      /* Operator + */
1666 	      phase4_ungetc (c);
1667 	      tp->type = token_type_plus;
1668 	    }
1669 	  return;
1670 
1671 	case '@':
1672 	  c = phase4_getc ();
1673 	  if (c == '"')
1674 	    {
1675 	      /* Verbatim string literal.  */
1676 	      struct string_buffer literal;
1677 
1678 	      init_string_buffer (&literal);
1679 	      for (;;)
1680 		{
1681 		  /* Use phase 2, because phase 4 elides comments and phase 3
1682 		     mixes up the newline characters.  */
1683 		  c = phase2_getc ();
1684 		  if (c == UEOF)
1685 		    break;
1686 		  if (c == '"')
1687 		    {
1688 		      c = phase2_getc ();
1689 		      if (c != '"')
1690 			{
1691 			  phase2_ungetc (c);
1692 			  break;
1693 			}
1694 		    }
1695 		  /* No special treatment of newline and backslash here.  */
1696 		  string_buffer_append_unicode (&literal, c);
1697 		}
1698 	      tp->string = xstrdup (string_buffer_result (&literal));
1699 	      free_string_buffer (&literal);
1700 	      tp->comment = add_reference (savable_comment);
1701 	      tp->type = token_type_string_literal;
1702 	      return;
1703 	    }
1704 	  /* FALLTHROUGH, so that @identifier is recognized.  */
1705 
1706 	default:
1707 	  if (c == '\\')
1708 	    c = do_getc_unicode_escaped (is_identifier_start);
1709 	  if (is_identifier_start (c))
1710 	    {
1711 	      static struct string_buffer buffer;
1712 	      buffer.utf8_buflen = 0;
1713 	      for (;;)
1714 		{
1715 		  string_buffer_append_unicode (&buffer, c);
1716 		  c = phase4_getc ();
1717 		  if (c == '\\')
1718 		    c = do_getc_unicode_escaped (is_identifier_part);
1719 		  if (!is_identifier_part (c))
1720 		    break;
1721 		}
1722 	      phase4_ungetc (c);
1723 	      tp->string = xstrdup (string_buffer_result (&buffer));
1724 	      tp->type = token_type_symbol;
1725 	      return;
1726 	    }
1727 	  else
1728 	    {
1729 	      /* Misc. operator.  */
1730 	      tp->type = token_type_other;
1731 	      return;
1732 	    }
1733 	}
1734     }
1735 }
1736 
1737 /* Supports 3 tokens of pushback.  */
1738 static void
phase6_unget(token_ty * tp)1739 phase6_unget (token_ty *tp)
1740 {
1741   if (tp->type != token_type_eof)
1742     {
1743       if (phase6_pushback_length == SIZEOF (phase6_pushback))
1744 	abort ();
1745       phase6_pushback[phase6_pushback_length++] = *tp;
1746     }
1747 }
1748 
1749 
1750 /* Compile-time optimization of string literal concatenation.
1751    Combine "string1" + ... + "stringN" to the concatenated string if
1752      - the token after this expression is not '.' (because then the last
1753        string could be part of a method call expression).  */
1754 
1755 static token_ty phase7_pushback[2];
1756 static int phase7_pushback_length;
1757 
1758 static void
phase7_get(token_ty * tp)1759 phase7_get (token_ty *tp)
1760 {
1761   if (phase7_pushback_length)
1762     {
1763       *tp = phase7_pushback[--phase7_pushback_length];
1764       return;
1765     }
1766 
1767   phase6_get (tp);
1768   if (tp->type == token_type_string_literal)
1769     {
1770       char *sum = tp->string;
1771       size_t sum_len = strlen (sum);
1772 
1773       for (;;)
1774 	{
1775 	  token_ty token2;
1776 
1777 	  phase6_get (&token2);
1778 	  if (token2.type == token_type_plus)
1779 	    {
1780 	      token_ty token3;
1781 
1782 	      phase6_get (&token3);
1783 	      if (token3.type == token_type_string_literal)
1784 		{
1785 		  token_ty token_after;
1786 
1787 		  phase6_get (&token_after);
1788 		  if (token_after.type != token_type_dot)
1789 		    {
1790 		      char *addend = token3.string;
1791 		      size_t addend_len = strlen (addend);
1792 
1793 		      sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1794 		      memcpy (sum + sum_len, addend, addend_len + 1);
1795 		      sum_len += addend_len;
1796 
1797 		      phase6_unget (&token_after);
1798 		      free_token (&token3);
1799 		      free_token (&token2);
1800 		      continue;
1801 		    }
1802 		  phase6_unget (&token_after);
1803 		}
1804 	      phase6_unget (&token3);
1805 	    }
1806 	  phase6_unget (&token2);
1807 	  break;
1808 	}
1809       tp->string = sum;
1810     }
1811 }
1812 
1813 /* Supports 2 tokens of pushback.  */
1814 static void
phase7_unget(token_ty * tp)1815 phase7_unget (token_ty *tp)
1816 {
1817   if (tp->type != token_type_eof)
1818     {
1819       if (phase7_pushback_length == SIZEOF (phase7_pushback))
1820 	abort ();
1821       phase7_pushback[phase7_pushback_length++] = *tp;
1822     }
1823 }
1824 
1825 
1826 static void
x_csharp_lex(token_ty * tp)1827 x_csharp_lex (token_ty *tp)
1828 {
1829   phase7_get (tp);
1830 }
1831 
1832 /* Supports 2 tokens of pushback.  */
1833 static void
x_csharp_unlex(token_ty * tp)1834 x_csharp_unlex (token_ty *tp)
1835 {
1836   phase7_unget (tp);
1837 }
1838 
1839 
1840 /* ========================= Extracting strings.  ========================== */
1841 
1842 
1843 /* Context lookup table.  */
1844 static flag_context_list_table_ty *flag_context_list_table;
1845 
1846 
1847 /* The file is broken into tokens.  Scan the token stream, looking for
1848    a keyword, followed by a left paren, followed by a string.  When we
1849    see this sequence, we have something to remember.  We assume we are
1850    looking at a valid C# program, and leave the complaints about
1851    the grammar to the compiler.
1852 
1853      Normal handling: Look for
1854        keyword ( ... msgid ... )
1855      Plural handling: Look for
1856        keyword ( ... msgid ... msgid_plural ... )
1857 
1858    We use recursion because the arguments before msgid or between msgid
1859    and msgid_plural can contain subexpressions of the same form.  */
1860 
1861 
1862 /* Extract messages until the next balanced closing parenthesis or brace,
1863    depending on TERMINATOR.
1864    Extracted messages are added to MLP.
1865    Return true upon eof, false upon closing parenthesis or brace.  */
1866 static bool
extract_parenthesized(message_list_ty * mlp,token_type_ty terminator,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1867 extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1868 		       flag_context_ty outer_context,
1869 		       flag_context_list_iterator_ty context_iter,
1870 		       struct arglist_parser *argparser)
1871 {
1872   /* Current argument number.  */
1873   int arg = 1;
1874   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1875   int state;
1876   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1877   const struct callshapes *next_shapes = NULL;
1878   /* Context iterator that will be used if the next token is a '('.  */
1879   flag_context_list_iterator_ty next_context_iter =
1880     passthrough_context_list_iterator;
1881   /* Current context.  */
1882   flag_context_ty inner_context =
1883     inherited_context (outer_context,
1884 		       flag_context_list_iterator_advance (&context_iter));
1885 
1886   /* Start state is 0.  */
1887   state = 0;
1888 
1889   for (;;)
1890     {
1891       token_ty token;
1892 
1893       x_csharp_lex (&token);
1894       switch (token.type)
1895 	{
1896 	case token_type_symbol:
1897 	  {
1898 	    /* Combine symbol1 . ... . symbolN to a single strings, so that
1899 	       we can recognize static function calls like
1900 	       GettextResource.gettext.  The information present for
1901 	       symbolI.....symbolN has precedence over the information for
1902 	       symbolJ.....symbolN with J > I.  */
1903 	    char *sum = token.string;
1904 	    size_t sum_len = strlen (sum);
1905 	    const char *dottedname;
1906 	    flag_context_list_ty *context_list;
1907 
1908 	    for (;;)
1909 	      {
1910 		token_ty token2;
1911 
1912 		x_csharp_lex (&token2);
1913 		if (token2.type == token_type_dot)
1914 		  {
1915 		    token_ty token3;
1916 
1917 		    x_csharp_lex (&token3);
1918 		    if (token3.type == token_type_symbol)
1919 		      {
1920 			char *addend = token3.string;
1921 			size_t addend_len = strlen (addend);
1922 
1923 			sum =
1924 			  (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1925 			sum[sum_len] = '.';
1926 			memcpy (sum + sum_len + 1, addend, addend_len + 1);
1927 			sum_len += 1 + addend_len;
1928 
1929 			free_token (&token3);
1930 			free_token (&token2);
1931 			continue;
1932 		      }
1933 		    x_csharp_unlex (&token3);
1934 		  }
1935 		x_csharp_unlex (&token2);
1936 		break;
1937 	      }
1938 
1939 	    for (dottedname = sum;;)
1940 	      {
1941 		void *keyword_value;
1942 
1943 		if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
1944 				     &keyword_value)
1945 		    == 0)
1946 		  {
1947 		    next_shapes = (const struct callshapes *) keyword_value;
1948 		    state = 1;
1949 		    break;
1950 		  }
1951 
1952 		dottedname = strchr (dottedname, '.');
1953 		if (dottedname == NULL)
1954 		  {
1955 		    state = 0;
1956 		    break;
1957 		  }
1958 		dottedname++;
1959 	      }
1960 
1961 	    for (dottedname = sum;;)
1962 	      {
1963 		context_list =
1964 		  flag_context_list_table_lookup (
1965 		    flag_context_list_table,
1966 		    dottedname, strlen (dottedname));
1967 		if (context_list != NULL)
1968 		  break;
1969 
1970 		dottedname = strchr (dottedname, '.');
1971 		if (dottedname == NULL)
1972 		  break;
1973 		dottedname++;
1974 	      }
1975 	    next_context_iter = flag_context_list_iterator (context_list);
1976 
1977 	    free (sum);
1978 	    continue;
1979 	  }
1980 
1981 	case token_type_lparen:
1982 	  if (extract_parenthesized (mlp, token_type_rparen,
1983 				     inner_context, next_context_iter,
1984 				     arglist_parser_alloc (mlp,
1985 							   state ? next_shapes : NULL)))
1986 	    {
1987 	      xgettext_current_source_encoding = po_charset_utf8;
1988 	      arglist_parser_done (argparser, arg);
1989 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1990 	      return true;
1991 	    }
1992 	  next_context_iter = null_context_list_iterator;
1993 	  state = 0;
1994 	  continue;
1995 
1996 	case token_type_rparen:
1997 	  if (terminator == token_type_rparen)
1998 	    {
1999 	      xgettext_current_source_encoding = po_charset_utf8;
2000 	      arglist_parser_done (argparser, arg);
2001 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
2002 	      return false;
2003 	    }
2004 	  if (terminator == token_type_rbrace)
2005 	    {
2006 	      error_with_progname = false;
2007 	      error (0, 0,
2008 		     _("%s:%d: warning: ')' found where '}' was expected"),
2009 		     logical_file_name, token.line_number);
2010 	      error_with_progname = true;
2011 	    }
2012 	  next_context_iter = null_context_list_iterator;
2013 	  state = 0;
2014 	  continue;
2015 
2016 	case token_type_lbrace:
2017 	  if (extract_parenthesized (mlp, token_type_rbrace,
2018 				     null_context, null_context_list_iterator,
2019 				     arglist_parser_alloc (mlp, NULL)))
2020 	    {
2021 	      xgettext_current_source_encoding = po_charset_utf8;
2022 	      arglist_parser_done (argparser, arg);
2023 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
2024 	      return true;
2025 	    }
2026 	  next_context_iter = null_context_list_iterator;
2027 	  state = 0;
2028 	  continue;
2029 
2030 	case token_type_rbrace:
2031 	  if (terminator == token_type_rbrace)
2032 	    {
2033 	      xgettext_current_source_encoding = po_charset_utf8;
2034 	      arglist_parser_done (argparser, arg);
2035 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
2036 	      return false;
2037 	    }
2038 	  if (terminator == token_type_rparen)
2039 	    {
2040 	      error_with_progname = false;
2041 	      error (0, 0,
2042 		     _("%s:%d: warning: '}' found where ')' was expected"),
2043 		     logical_file_name, token.line_number);
2044 	      error_with_progname = true;
2045 	    }
2046 	  next_context_iter = null_context_list_iterator;
2047 	  state = 0;
2048 	  continue;
2049 
2050 	case token_type_comma:
2051 	  arg++;
2052 	  inner_context =
2053 	    inherited_context (outer_context,
2054 			       flag_context_list_iterator_advance (
2055 				 &context_iter));
2056 	  next_context_iter = passthrough_context_list_iterator;
2057 	  state = 0;
2058 	  continue;
2059 
2060 	case token_type_string_literal:
2061 	  {
2062 	    lex_pos_ty pos;
2063 	    pos.file_name = logical_file_name;
2064 	    pos.line_number = token.line_number;
2065 
2066 	    xgettext_current_source_encoding = po_charset_utf8;
2067 	    if (extract_all)
2068 	      remember_a_message (mlp, NULL, token.string, inner_context,
2069 				  &pos, token.comment);
2070 	    else
2071 	      arglist_parser_remember (argparser, arg, token.string,
2072 				       inner_context,
2073 				       pos.file_name, pos.line_number,
2074 				       token.comment);
2075 	    xgettext_current_source_encoding = xgettext_global_source_encoding;
2076 	  }
2077 	  drop_reference (token.comment);
2078 	  next_context_iter = null_context_list_iterator;
2079 	  state = 0;
2080 	  continue;
2081 
2082 	case token_type_eof:
2083 	  xgettext_current_source_encoding = po_charset_utf8;
2084 	  arglist_parser_done (argparser, arg);
2085 	  xgettext_current_source_encoding = xgettext_global_source_encoding;
2086 	  return true;
2087 
2088 	case token_type_dot:
2089 	case token_type_number:
2090 	case token_type_plus:
2091 	case token_type_other:
2092 	  next_context_iter = null_context_list_iterator;
2093 	  state = 0;
2094 	  continue;
2095 
2096 	default:
2097 	  abort ();
2098 	}
2099     }
2100 }
2101 
2102 
2103 void
extract_csharp(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)2104 extract_csharp (FILE *f,
2105 		const char *real_filename, const char *logical_filename,
2106 		flag_context_list_table_ty *flag_table,
2107 		msgdomain_list_ty *mdlp)
2108 {
2109   message_list_ty *mlp = mdlp->item[0]->messages;
2110 
2111   fp = f;
2112   real_file_name = real_filename;
2113   logical_file_name = xstrdup (logical_filename);
2114   line_number = 1;
2115 
2116   logical_line_number = 1;
2117   last_comment_line = -1;
2118   last_non_comment_line = -1;
2119 
2120   flag_context_list_table = flag_table;
2121 
2122   init_keywords ();
2123 
2124   /* Eat tokens until eof is seen.  When extract_parenthesized returns
2125      due to an unbalanced closing parenthesis, just restart it.  */
2126   while (!extract_parenthesized (mlp, token_type_eof,
2127 				 null_context, null_context_list_iterator,
2128 				 arglist_parser_alloc (mlp, NULL)))
2129     ;
2130 
2131   fp = NULL;
2132   real_file_name = NULL;
2133   logical_file_name = NULL;
2134   line_number = 0;
2135 }
2136