xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/libgrep/dfa.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* dfa.c - deterministic extended regexp routines for GNU
2    Copyright 1988, 1998, 2000, 2005 Free Software Foundation, Inc.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 2, or (at your option)
7    any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software Foundation,
16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-13017, USA */
17 
18 /* Written June, 1988 by Mike Haertel
19    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
20 
21 #ifdef HAVE_CONFIG_H
22 #include <config.h>
23 #endif
24 
25 #include <assert.h>
26 #include <ctype.h>
27 #include <stdio.h>
28 
29 #include <sys/types.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <locale.h>
33 
34 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
35 /* We can handle multibyte string.  */
36 # define MBS_SUPPORT
37 #endif
38 
39 #ifdef MBS_SUPPORT
40 # include <wchar.h>
41 # include <wctype.h>
42 #endif
43 
44 #ifndef DEBUG	/* use the same approach as regex.c */
45 #undef assert
46 #define assert(e)
47 #endif /* DEBUG */
48 
49 #ifndef isgraph
50 #define isgraph(C) (isprint(C) && !isspace(C))
51 #endif
52 
53 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
54 #define ISALPHA(C) isalpha(C)
55 #define ISUPPER(C) isupper(C)
56 #define ISLOWER(C) islower(C)
57 #define ISDIGIT(C) isdigit(C)
58 #define ISXDIGIT(C) isxdigit(C)
59 #define ISSPACE(C) isspace(C)
60 #define ISPUNCT(C) ispunct(C)
61 #define ISALNUM(C) isalnum(C)
62 #define ISPRINT(C) isprint(C)
63 #define ISGRAPH(C) isgraph(C)
64 #define ISCNTRL(C) iscntrl(C)
65 #else
66 #define ISALPHA(C) (isascii(C) && isalpha(C))
67 #define ISUPPER(C) (isascii(C) && isupper(C))
68 #define ISLOWER(C) (isascii(C) && islower(C))
69 #define ISDIGIT(C) (isascii(C) && isdigit(C))
70 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
71 #define ISSPACE(C) (isascii(C) && isspace(C))
72 #define ISPUNCT(C) (isascii(C) && ispunct(C))
73 #define ISALNUM(C) (isascii(C) && isalnum(C))
74 #define ISPRINT(C) (isascii(C) && isprint(C))
75 #define ISGRAPH(C) (isascii(C) && isgraph(C))
76 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
77 #endif
78 
79 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
80    - Its arg may be any int or unsigned int; it need not be an unsigned char.
81    - It's guaranteed to evaluate its argument exactly once.
82    - It's typically faster.
83    Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
84    only '0' through '9' are digits.  Prefer ISASCIIDIGIT to ISDIGIT unless
85    it's important to use the locale's definition of `digit' even when the
86    host does not conform to Posix.  */
87 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
88 
89 #include "regex.h"
90 #include "dfa.h"
91 #include "hard-locale.h"
92 #include "gettext.h"
93 #define _(str) gettext (str)
94 
95 /* HPUX, define those as macros in sys/param.h */
96 #ifdef setbit
97 # undef setbit
98 #endif
99 #ifdef clrbit
100 # undef clrbit
101 #endif
102 
103 static void dfamust (struct dfa *dfa);
104 static void regexp (int toplevel);
105 
106 static ptr_t
xcalloc(size_t n,size_t s)107 xcalloc (size_t n, size_t s)
108 {
109   ptr_t r = calloc(n, s);
110 
111   if (!r)
112     dfaerror(_("Memory exhausted"));
113   return r;
114 }
115 
116 static ptr_t
xmalloc(size_t n)117 xmalloc (size_t n)
118 {
119   ptr_t r = malloc(n);
120 
121   assert(n != 0);
122   if (!r)
123     dfaerror(_("Memory exhausted"));
124   return r;
125 }
126 
127 static ptr_t
xrealloc(ptr_t p,size_t n)128 xrealloc (ptr_t p, size_t n)
129 {
130   ptr_t r = realloc(p, n);
131 
132   assert(n != 0);
133   if (!r)
134     dfaerror(_("Memory exhausted"));
135   return r;
136 }
137 
138 #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
139 #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
140 #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
141 
142 /* Reallocate an array of type t if nalloc is too small for index. */
143 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
144   if ((index) >= (nalloc))			  \
145     {						  \
146       do					  \
147 	(nalloc) *= 2;				  \
148       while ((index) >= (nalloc));		  \
149       REALLOC(p, t, nalloc);			  \
150     }
151 
152 #ifdef DEBUG
153 
154 static void
prtok(token t)155 prtok (token t)
156 {
157   char const *s;
158 
159   if (t < 0)
160     fprintf(stderr, "END");
161   else if (t < NOTCHAR)
162     fprintf(stderr, "%c", t);
163   else
164     {
165       switch (t)
166 	{
167 	case EMPTY: s = "EMPTY"; break;
168 	case BACKREF: s = "BACKREF"; break;
169 	case BEGLINE: s = "BEGLINE"; break;
170 	case ENDLINE: s = "ENDLINE"; break;
171 	case BEGWORD: s = "BEGWORD"; break;
172 	case ENDWORD: s = "ENDWORD"; break;
173 	case LIMWORD: s = "LIMWORD"; break;
174 	case NOTLIMWORD: s = "NOTLIMWORD"; break;
175 	case QMARK: s = "QMARK"; break;
176 	case STAR: s = "STAR"; break;
177 	case PLUS: s = "PLUS"; break;
178 	case CAT: s = "CAT"; break;
179 	case OR: s = "OR"; break;
180 	case ORTOP: s = "ORTOP"; break;
181 	case LPAREN: s = "LPAREN"; break;
182 	case RPAREN: s = "RPAREN"; break;
183 	case CRANGE: s = "CRANGE"; break;
184 #ifdef MBS_SUPPORT
185 	case ANYCHAR: s = "ANYCHAR"; break;
186 	case MBCSET: s = "MBCSET"; break;
187 #endif /* MBS_SUPPORT */
188 	default: s = "CSET"; break;
189 	}
190       fprintf(stderr, "%s", s);
191     }
192 }
193 #endif /* DEBUG */
194 
195 /* Stuff pertaining to charclasses. */
196 
197 static int
tstbit(unsigned b,charclass c)198 tstbit (unsigned b, charclass c)
199 {
200   return c[b / INTBITS] & 1 << b % INTBITS;
201 }
202 
203 static void
setbit(unsigned b,charclass c)204 setbit (unsigned b, charclass c)
205 {
206   c[b / INTBITS] |= 1 << b % INTBITS;
207 }
208 
209 static void
clrbit(unsigned b,charclass c)210 clrbit (unsigned b, charclass c)
211 {
212   c[b / INTBITS] &= ~(1 << b % INTBITS);
213 }
214 
215 static void
copyset(charclass src,charclass dst)216 copyset (charclass src, charclass dst)
217 {
218   memcpy (dst, src, sizeof (charclass));
219 }
220 
221 static void
zeroset(charclass s)222 zeroset (charclass s)
223 {
224   memset (s, 0, sizeof (charclass));
225 }
226 
227 static void
notset(charclass s)228 notset (charclass s)
229 {
230   int i;
231 
232   for (i = 0; i < CHARCLASS_INTS; ++i)
233     s[i] = ~s[i];
234 }
235 
236 static int
equal(charclass s1,charclass s2)237 equal (charclass s1, charclass s2)
238 {
239   return memcmp (s1, s2, sizeof (charclass)) == 0;
240 }
241 
242 /* A pointer to the current dfa is kept here during parsing. */
243 static struct dfa *dfa;
244 
245 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
246 static int
charclass_index(charclass s)247 charclass_index (charclass s)
248 {
249   int i;
250 
251   for (i = 0; i < dfa->cindex; ++i)
252     if (equal(s, dfa->charclasses[i]))
253       return i;
254   REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
255   ++dfa->cindex;
256   copyset(s, dfa->charclasses[i]);
257   return i;
258 }
259 
260 /* Syntax bits controlling the behavior of the lexical analyzer. */
261 static reg_syntax_t syntax_bits, syntax_bits_set;
262 
263 /* Flag for case-folding letters into sets. */
264 static int case_fold;
265 
266 /* End-of-line byte in data.  */
267 static unsigned char eolbyte;
268 
269 /* Entry point to set syntax options. */
270 void
dfasyntax(reg_syntax_t bits,int fold,unsigned char eol)271 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
272 {
273   syntax_bits_set = 1;
274   syntax_bits = bits;
275   case_fold = fold;
276   eolbyte = eol;
277 }
278 
279 /* Like setbit, but if case is folded, set both cases of a letter.  */
280 static void
setbit_case_fold(unsigned b,charclass c)281 setbit_case_fold (unsigned b, charclass c)
282 {
283   setbit (b, c);
284   if (case_fold)
285     {
286       if (ISUPPER (b))
287 	setbit (tolower (b), c);
288       else if (ISLOWER (b))
289 	setbit (toupper (b), c);
290     }
291 }
292 
293 /* Lexical analyzer.  All the dross that deals with the obnoxious
294    GNU Regex syntax bits is located here.  The poor, suffering
295    reader is referred to the GNU Regex documentation for the
296    meaning of the @#%!@#%^!@ syntax bits. */
297 
298 static char const *lexstart;	/* Pointer to beginning of input string. */
299 static char const *lexptr;	/* Pointer to next input character. */
300 static int lexleft;		/* Number of characters remaining. */
301 static token lasttok;		/* Previous token returned; initially END. */
302 static int laststart;		/* True if we're separated from beginning or (, |
303 				   only by zero-width characters. */
304 static int parens;		/* Count of outstanding left parens. */
305 static int minrep, maxrep;	/* Repeat counts for {m,n}. */
306 static int hard_LC_COLLATE;	/* Nonzero if LC_COLLATE is hard.  */
307 
308 #ifdef MBS_SUPPORT
309 /* These variables are used only if (MB_CUR_MAX > 1).  */
310 static mbstate_t mbs;		/* Mbstate for mbrlen().  */
311 static int cur_mb_len;		/* Byte length of the current scanning
312 				   multibyte character.  */
313 static int cur_mb_index;        /* Byte index of the current scanning multibyte
314                                    character.
315 
316 				   singlebyte character : cur_mb_index = 0
317 				   multibyte character
318 				       1st byte : cur_mb_index = 1
319 				       2nd byte : cur_mb_index = 2
320 				         ...
321 				       nth byte : cur_mb_index = n  */
322 static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
323                                   Each element store the amount of remain
324                                   byte of corresponding multibyte character
325                                   in the input string.  A element's value
326                                   is 0 if corresponding character is a
327                                   singlebyte chracter.
328                                   e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
329                                    mblen_buf :   0,       3,       2,       1
330                                */
331 static wchar_t *inputwcs;	/* Wide character representation of input
332 				   string in dfaexec().
333 				   The length of this array is same as
334 				   the length of input string(char array).
335 				   inputstring[i] is a single-byte char,
336 				   or 1st byte of a multibyte char.
337 				   And inputwcs[i] is the codepoint.  */
338 static unsigned char const *buf_begin;/* refference to begin in dfaexec().  */
339 static unsigned char const *buf_end;	/* refference to end in dfaexec().  */
340 #endif /* MBS_SUPPORT  */
341 
342 #ifdef MBS_SUPPORT
343 /* This function update cur_mb_len, and cur_mb_index.
344    p points current lexptr, len is the remaining buffer length.  */
345 static void
update_mb_len_index(char const * p,int len)346 update_mb_len_index (char const *p, int len)
347 {
348   /* If last character is a part of a multibyte character,
349      we update cur_mb_index.  */
350   if (cur_mb_index)
351     cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
352 			: cur_mb_index + 1;
353 
354   /* If last character is a single byte character, or the
355      last portion of a multibyte character, we check whether
356      next character is a multibyte character or not.  */
357   if (! cur_mb_index)
358     {
359       cur_mb_len = mbrlen(p, len, &mbs);
360       if (cur_mb_len > 1)
361 	/* It is a multibyte character.
362 	   cur_mb_len was already set by mbrlen().  */
363 	cur_mb_index = 1;
364       else if (cur_mb_len < 1)
365 	/* Invalid sequence.  We treat it as a singlebyte character.
366 	   cur_mb_index is aleady 0.  */
367 	cur_mb_len = 1;
368       /* Otherwise, cur_mb_len == 1, it is a singlebyte character.
369 	 cur_mb_index is aleady 0.  */
370     }
371 }
372 #endif /* MBS_SUPPORT */
373 
374 #ifdef MBS_SUPPORT
375 /* Note that characters become unsigned here. */
376 # define FETCH(c, eoferr)			\
377   {						\
378     if (! lexleft)				\
379      {						\
380 	if (eoferr != 0)			\
381 	  dfaerror (eoferr);			\
382 	else					\
383 	  return lasttok = END;			\
384       }						\
385     if (MB_CUR_MAX > 1)				\
386       update_mb_len_index(lexptr, lexleft);	\
387     (c) = (unsigned char) *lexptr++;		\
388     --lexleft;					\
389   }
390 
391 /* This function fetch a wide character, and update cur_mb_len,
392    used only if the current locale is a multibyte environment.  */
393 static wchar_t
fetch_wc(char const * eoferr)394 fetch_wc (char const *eoferr)
395 {
396   wchar_t wc;
397   if (! lexleft)
398     {
399       if (eoferr != 0)
400 	dfaerror (eoferr);
401       else
402 	return -1;
403     }
404 
405   cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
406   if (cur_mb_len <= 0)
407    {
408       cur_mb_len = 1;
409       wc = *lexptr;
410     }
411   lexptr += cur_mb_len;
412   lexleft -= cur_mb_len;
413   return wc;
414 }
415 #else
416 /* Note that characters become unsigned here. */
417 # define FETCH(c, eoferr)   	      \
418   {			   	      \
419     if (! lexleft)	   	      \
420       {				      \
421 	if (eoferr != 0)	      \
422 	  dfaerror (eoferr);	      \
423 	else		   	      \
424 	  return lasttok = END;	      \
425       }				      \
426     (c) = (unsigned char) *lexptr++;  \
427     --lexleft;		   	      \
428   }
429 #endif /* MBS_SUPPORT */
430 
431 #ifdef MBS_SUPPORT
432 /* Multibyte character handling sub-routin for lex.
433    This function  parse a bracket expression and build a struct
434    mb_char_classes.  */
435 static void
parse_bracket_exp_mb()436 parse_bracket_exp_mb ()
437 {
438   wchar_t wc, wc1, wc2;
439 
440   /* Work area to build a mb_char_classes.  */
441   struct mb_char_classes *work_mbc;
442   int chars_al, range_sts_al, range_ends_al, ch_classes_al,
443     equivs_al, coll_elems_al;
444 
445   REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
446 		       dfa->mbcsets_alloc, dfa->nmbcsets + 1);
447   /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
448      We will update dfa->multibyte_prop in addtok(), because we can't
449      decide the index in dfa->tokens[].  */
450 
451   /* Initialize work are */
452   work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
453 
454   chars_al = 1;
455   range_sts_al = range_ends_al = 0;
456   ch_classes_al = equivs_al = coll_elems_al = 0;
457   MALLOC(work_mbc->chars, wchar_t, chars_al);
458 
459   work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
460   work_mbc->nequivs = work_mbc->ncoll_elems = 0;
461   work_mbc->chars = NULL;
462   work_mbc->ch_classes = NULL;
463   work_mbc->range_sts = work_mbc->range_ends = NULL;
464   work_mbc->equivs = work_mbc->coll_elems = NULL;
465 
466   wc = fetch_wc(_("Unbalanced ["));
467   if (wc == L'^')
468     {
469       wc = fetch_wc(_("Unbalanced ["));
470       work_mbc->invert = 1;
471     }
472   else
473     work_mbc->invert = 0;
474   do
475     {
476       wc1 = -1; /* mark wc1 is not initialized".  */
477 
478       /* Note that if we're looking at some other [:...:] construct,
479 	 we just treat it as a bunch of ordinary characters.  We can do
480 	 this because we assume regex has checked for syntax errors before
481 	 dfa is ever called. */
482       if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
483 	{
484 #define BRACKET_BUFFER_SIZE 128
485 	  char str[BRACKET_BUFFER_SIZE];
486 	  wc1 = wc;
487 	  wc = fetch_wc(_("Unbalanced ["));
488 
489 	  /* If pattern contains `[[:', `[[.', or `[[='.  */
490 	  if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
491 	    {
492 	      unsigned char c;
493 	      unsigned char delim = (unsigned char)wc;
494 	      int len = 0;
495 	      for (;;)
496 		{
497 		  if (! lexleft)
498 		    dfaerror (_("Unbalanced ["));
499 		  c = (unsigned char) *lexptr++;
500 		  --lexleft;
501 
502 		  if ((c == delim && *lexptr == ']') || lexleft == 0)
503 		    break;
504 		  if (len < BRACKET_BUFFER_SIZE)
505 		    str[len++] = c;
506 		  else
507 		    /* This is in any case an invalid class name.  */
508 		    str[0] = '\0';
509 		}
510 	      str[len] = '\0';
511 
512 	      if (lexleft == 0)
513 		{
514 		  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
515 				       work_mbc->nchars + 2);
516 		  work_mbc->chars[work_mbc->nchars++] = L'[';
517 		  work_mbc->chars[work_mbc->nchars++] = delim;
518 		  break;
519 		}
520 
521 	      if (--lexleft, *lexptr++ != ']')
522 		dfaerror (_("Unbalanced ["));
523 	      if (delim == ':')
524 		/* build character class.  */
525 		{
526 		  wctype_t wt;
527 		  /* Query the character class as wctype_t.  */
528 		  wt = wctype (str);
529 
530 		  if (ch_classes_al == 0)
531 		    MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
532 		  REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
533 				       ch_classes_al,
534 				       work_mbc->nch_classes + 1);
535 		  work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
536 
537  		}
538 	      else if (delim == '=' || delim == '.')
539 		{
540 		  char *elem;
541 		  MALLOC(elem, char, len + 1);
542 		  strncpy(elem, str, len + 1);
543 
544 		  if (delim == '=')
545 		    /* build equivalent class.  */
546 		    {
547 		      if (equivs_al == 0)
548 			MALLOC(work_mbc->equivs, char*, ++equivs_al);
549 		      REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
550 					   equivs_al,
551 					   work_mbc->nequivs + 1);
552 		      work_mbc->equivs[work_mbc->nequivs++] = elem;
553 		    }
554 
555 		  if (delim == '.')
556 		    /* build collating element.  */
557 		    {
558 		      if (coll_elems_al == 0)
559 			MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
560 		      REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
561 					   coll_elems_al,
562 					   work_mbc->ncoll_elems + 1);
563 		      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
564 		    }
565  		}
566 	      wc = -1;
567 	    }
568 	  else
569 	    /* We treat '[' as a normal character here.  */
570 	    {
571 	      wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
572 	    }
573 	}
574       else
575 	{
576 	  if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
577 	    wc = fetch_wc(("Unbalanced ["));
578 	}
579 
580       if (wc1 == -1)
581 	wc1 = fetch_wc(_("Unbalanced ["));
582 
583       if (wc1 == L'-')
584 	/* build range characters.  */
585 	{
586 	  wc2 = fetch_wc(_("Unbalanced ["));
587 	  if (wc2 == L']')
588 	    {
589 	      /* In the case [x-], the - is an ordinary hyphen,
590 		 which is left in c1, the lookahead character. */
591 	      lexptr -= cur_mb_len;
592 	      lexleft += cur_mb_len;
593 	      wc2 = wc;
594 	    }
595 	  else
596 	    {
597 	      if (wc2 == L'\\'
598 		  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
599 		wc2 = fetch_wc(_("Unbalanced ["));
600 	      wc1 = fetch_wc(_("Unbalanced ["));
601 	    }
602 
603 	  if (range_sts_al == 0)
604 	    {
605 	      MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
606 	      MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
607 	    }
608 	  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
609 			       range_sts_al, work_mbc->nranges + 1);
610 	  work_mbc->range_sts[work_mbc->nranges] = wc;
611 	  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
612 			       range_ends_al, work_mbc->nranges + 1);
613 	  work_mbc->range_ends[work_mbc->nranges++] = wc2;
614 	}
615       else if (wc != -1)
616 	/* build normal characters.  */
617 	{
618 	  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
619 			       work_mbc->nchars + 1);
620 	  work_mbc->chars[work_mbc->nchars++] = wc;
621 	}
622     }
623   while ((wc = wc1) != L']');
624 }
625 #endif /* MBS_SUPPORT */
626 
627 #ifdef __STDC__
628 #define FUNC(F, P) static int F(int c) { return P(c); }
629 #else
630 #define FUNC(F, P) static int F(c) int c; { return P(c); }
631 #endif
632 
FUNC(is_alpha,ISALPHA)633 FUNC(is_alpha, ISALPHA)
634 FUNC(is_upper, ISUPPER)
635 FUNC(is_lower, ISLOWER)
636 FUNC(is_digit, ISDIGIT)
637 FUNC(is_xdigit, ISXDIGIT)
638 FUNC(is_space, ISSPACE)
639 FUNC(is_punct, ISPUNCT)
640 FUNC(is_alnum, ISALNUM)
641 FUNC(is_print, ISPRINT)
642 FUNC(is_graph, ISGRAPH)
643 FUNC(is_cntrl, ISCNTRL)
644 
645 static int
646 is_blank (int c)
647 {
648    return (c == ' ' || c == '\t');
649 }
650 
651 /* The following list maps the names of the Posix named character classes
652    to predicate functions that determine whether a given character is in
653    the class.  The leading [ has already been eaten by the lexical analyzer. */
654 static struct {
655   const char *name;
656   int (*pred) (int);
657 } const prednames[] = {
658   { ":alpha:]", is_alpha },
659   { ":upper:]", is_upper },
660   { ":lower:]", is_lower },
661   { ":digit:]", is_digit },
662   { ":xdigit:]", is_xdigit },
663   { ":space:]", is_space },
664   { ":punct:]", is_punct },
665   { ":alnum:]", is_alnum },
666   { ":print:]", is_print },
667   { ":graph:]", is_graph },
668   { ":cntrl:]", is_cntrl },
669   { ":blank:]", is_blank },
670   { 0 }
671 };
672 
673 /* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
674 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
675 
676 static int
looking_at(char const * s)677 looking_at (char const *s)
678 {
679   size_t len;
680 
681   len = strlen(s);
682   if (lexleft < len)
683     return 0;
684   return strncmp(s, lexptr, len) == 0;
685 }
686 
687 static token
lex(void)688 lex (void)
689 {
690   unsigned c, c1, c2;
691   int backslash = 0, invert;
692   charclass ccl;
693   int i;
694 
695   /* Basic plan: We fetch a character.  If it's a backslash,
696      we set the backslash flag and go through the loop again.
697      On the plus side, this avoids having a duplicate of the
698      main switch inside the backslash case.  On the minus side,
699      it means that just about every case begins with
700      "if (backslash) ...".  */
701   for (i = 0; i < 2; ++i)
702     {
703       FETCH(c, 0);
704 #ifdef MBS_SUPPORT
705       if (MB_CUR_MAX > 1 && cur_mb_index)
706 	/* If this is a part of a multi-byte character, we must treat
707 	   this byte data as a normal character.
708 	   e.g. In case of SJIS encoding, some character contains '\',
709 	        but they must not be backslash.  */
710 	goto normal_char;
711 #endif /* MBS_SUPPORT  */
712       switch (c)
713 	{
714 	case '\\':
715 	  if (backslash)
716 	    goto normal_char;
717 	  if (lexleft == 0)
718 	    dfaerror(_("Unfinished \\ escape"));
719 	  backslash = 1;
720 	  break;
721 
722 	case '^':
723 	  if (backslash)
724 	    goto normal_char;
725 	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
726 	      || lasttok == END
727 	      || lasttok == LPAREN
728 	      || lasttok == OR)
729 	    return lasttok = BEGLINE;
730 	  goto normal_char;
731 
732 	case '$':
733 	  if (backslash)
734 	    goto normal_char;
735 	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
736 	      || lexleft == 0
737 	      || (syntax_bits & RE_NO_BK_PARENS
738 		  ? lexleft > 0 && *lexptr == ')'
739 		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
740 	      || (syntax_bits & RE_NO_BK_VBAR
741 		  ? lexleft > 0 && *lexptr == '|'
742 		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
743 	      || ((syntax_bits & RE_NEWLINE_ALT)
744 	          && lexleft > 0 && *lexptr == '\n'))
745 	    return lasttok = ENDLINE;
746 	  goto normal_char;
747 
748 	case '1':
749 	case '2':
750 	case '3':
751 	case '4':
752 	case '5':
753 	case '6':
754 	case '7':
755 	case '8':
756 	case '9':
757 	  if (backslash && !(syntax_bits & RE_NO_BK_REFS))
758 	    {
759 	      laststart = 0;
760 	      return lasttok = BACKREF;
761 	    }
762 	  goto normal_char;
763 
764 	case '`':
765 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
766 	    return lasttok = BEGLINE;	/* FIXME: should be beginning of string */
767 	  goto normal_char;
768 
769 	case '\'':
770 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
771 	    return lasttok = ENDLINE;	/* FIXME: should be end of string */
772 	  goto normal_char;
773 
774 	case '<':
775 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
776 	    return lasttok = BEGWORD;
777 	  goto normal_char;
778 
779 	case '>':
780 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
781 	    return lasttok = ENDWORD;
782 	  goto normal_char;
783 
784 	case 'b':
785 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
786 	    return lasttok = LIMWORD;
787 	  goto normal_char;
788 
789 	case 'B':
790 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
791 	    return lasttok = NOTLIMWORD;
792 	  goto normal_char;
793 
794 	case '?':
795 	  if (syntax_bits & RE_LIMITED_OPS)
796 	    goto normal_char;
797 	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
798 	    goto normal_char;
799 	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
800 	    goto normal_char;
801 	  return lasttok = QMARK;
802 
803 	case '*':
804 	  if (backslash)
805 	    goto normal_char;
806 	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
807 	    goto normal_char;
808 	  return lasttok = STAR;
809 
810 	case '+':
811 	  if (syntax_bits & RE_LIMITED_OPS)
812 	    goto normal_char;
813 	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
814 	    goto normal_char;
815 	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
816 	    goto normal_char;
817 	  return lasttok = PLUS;
818 
819 	case '{':
820 	  if (!(syntax_bits & RE_INTERVALS))
821 	    goto normal_char;
822 	  if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
823 	    goto normal_char;
824 	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
825 	    goto normal_char;
826 
827 	  if (syntax_bits & RE_NO_BK_BRACES)
828 	    {
829 	      /* Scan ahead for a valid interval; if it's not valid,
830 		 treat it as a literal '{'.  */
831 	      int lo = -1, hi = -1;
832 	      char const *p = lexptr;
833 	      char const *lim = p + lexleft;
834 	      for (;  p != lim && ISASCIIDIGIT (*p);  p++)
835 		lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
836 	      if (p != lim && *p == ',')
837 		while (++p != lim && ISASCIIDIGIT (*p))
838 		  hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
839 	      else
840 		hi = lo;
841 	      if (p == lim || *p != '}'
842 		  || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
843 		goto normal_char;
844 	    }
845 
846 	  minrep = 0;
847 	  /* Cases:
848 	     {M} - exact count
849 	     {M,} - minimum count, maximum is infinity
850 	     {M,N} - M through N */
851 	  FETCH(c, _("unfinished repeat count"));
852 	  if (ISASCIIDIGIT (c))
853 	    {
854 	      minrep = c - '0';
855 	      for (;;)
856 		{
857 		  FETCH(c, _("unfinished repeat count"));
858 		  if (! ISASCIIDIGIT (c))
859 		    break;
860 		  minrep = 10 * minrep + c - '0';
861 		}
862 	    }
863 	  else
864 	    dfaerror(_("malformed repeat count"));
865 	  if (c == ',')
866 	    {
867 	      FETCH (c, _("unfinished repeat count"));
868 	      if (! ISASCIIDIGIT (c))
869 		maxrep = -1;
870 	      else
871 		{
872 		  maxrep = c - '0';
873 		  for (;;)
874 		    {
875 		      FETCH (c, _("unfinished repeat count"));
876 		      if (! ISASCIIDIGIT (c))
877 			break;
878 		      maxrep = 10 * maxrep + c - '0';
879 		    }
880 		  if (0 <= maxrep && maxrep < minrep)
881 		    dfaerror (_("malformed repeat count"));
882 		}
883 	    }
884 	  else
885 	    maxrep = minrep;
886 	  if (!(syntax_bits & RE_NO_BK_BRACES))
887 	    {
888 	      if (c != '\\')
889 		dfaerror(_("malformed repeat count"));
890 	      FETCH(c, _("unfinished repeat count"));
891 	    }
892 	  if (c != '}')
893 	    dfaerror(_("malformed repeat count"));
894 	  laststart = 0;
895 	  return lasttok = REPMN;
896 
897 	case '|':
898 	  if (syntax_bits & RE_LIMITED_OPS)
899 	    goto normal_char;
900 	  if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
901 	    goto normal_char;
902 	  laststart = 1;
903 	  return lasttok = OR;
904 
905 	case '\n':
906 	  if (syntax_bits & RE_LIMITED_OPS
907 	      || backslash
908 	      || !(syntax_bits & RE_NEWLINE_ALT))
909 	    goto normal_char;
910 	  laststart = 1;
911 	  return lasttok = OR;
912 
913 	case '(':
914 	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
915 	    goto normal_char;
916 	  ++parens;
917 	  laststart = 1;
918 	  return lasttok = LPAREN;
919 
920 	case ')':
921 	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
922 	    goto normal_char;
923 	  if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
924 	    goto normal_char;
925 	  --parens;
926 	  laststart = 0;
927 	  return lasttok = RPAREN;
928 
929 	case '.':
930 	  if (backslash)
931 	    goto normal_char;
932 #ifdef MBS_SUPPORT
933 	  if (MB_CUR_MAX > 1)
934 	    {
935 	      /* In multibyte environment period must match with a single
936 		 character not a byte.  So we use ANYCHAR.  */
937 	      laststart = 0;
938 	      return lasttok = ANYCHAR;
939 	    }
940 #endif /* MBS_SUPPORT */
941 	  zeroset(ccl);
942 	  notset(ccl);
943 	  if (!(syntax_bits & RE_DOT_NEWLINE))
944 	    clrbit(eolbyte, ccl);
945 	  if (syntax_bits & RE_DOT_NOT_NULL)
946 	    clrbit('\0', ccl);
947 	  laststart = 0;
948 	  return lasttok = CSET + charclass_index(ccl);
949 
950 	case 'w':
951 	case 'W':
952 	  if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
953 	    goto normal_char;
954 	  zeroset(ccl);
955 	  for (c2 = 0; c2 < NOTCHAR; ++c2)
956 	    if (IS_WORD_CONSTITUENT(c2))
957 	      setbit(c2, ccl);
958 	  if (c == 'W')
959 	    notset(ccl);
960 	  laststart = 0;
961 	  return lasttok = CSET + charclass_index(ccl);
962 
963 	case '[':
964 	  if (backslash)
965 	    goto normal_char;
966 	  laststart = 0;
967 #ifdef MBS_SUPPORT
968 	  if (MB_CUR_MAX > 1)
969 	    {
970 	      /* In multibyte environment a bracket expression may contain
971 		 multibyte characters, which must be treated as characters
972 		 (not bytes).  So we parse it by parse_bracket_exp_mb().  */
973 	      parse_bracket_exp_mb();
974 	      return lasttok = MBCSET;
975 	    }
976 #endif
977 	  zeroset(ccl);
978 	  FETCH(c, _("Unbalanced ["));
979 	  if (c == '^')
980 	    {
981 	      FETCH(c, _("Unbalanced ["));
982 	      invert = 1;
983 	    }
984 	  else
985 	    invert = 0;
986 	  do
987 	    {
988 	      /* Nobody ever said this had to be fast. :-)
989 		 Note that if we're looking at some other [:...:]
990 		 construct, we just treat it as a bunch of ordinary
991 		 characters.  We can do this because we assume
992 		 regex has checked for syntax errors before
993 		 dfa is ever called. */
994 	      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
995 		for (c1 = 0; prednames[c1].name; ++c1)
996 		  if (looking_at(prednames[c1].name))
997 		    {
998 		      int (*pred) (int) = prednames[c1].pred;
999 
1000 		      for (c2 = 0; c2 < NOTCHAR; ++c2)
1001 			if ((*pred)(c2))
1002 			  setbit_case_fold (c2, ccl);
1003 		      lexptr += strlen(prednames[c1].name);
1004 		      lexleft -= strlen(prednames[c1].name);
1005 		      FETCH(c1, _("Unbalanced ["));
1006 		      goto skip;
1007 		    }
1008 	      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1009 		FETCH(c, _("Unbalanced ["));
1010 	      FETCH(c1, _("Unbalanced ["));
1011 	      if (c1 == '-')
1012 		{
1013 		  FETCH(c2, _("Unbalanced ["));
1014 		  if (c2 == ']')
1015 		    {
1016 		      /* In the case [x-], the - is an ordinary hyphen,
1017 			 which is left in c1, the lookahead character. */
1018 		      --lexptr;
1019 		      ++lexleft;
1020 		    }
1021 		  else
1022 		    {
1023 		      if (c2 == '\\'
1024 			  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1025 			FETCH(c2, _("Unbalanced ["));
1026 		      FETCH(c1, _("Unbalanced ["));
1027 		      if (!hard_LC_COLLATE) {
1028 		        for (; c <= c2; c++)
1029 			  setbit_case_fold (c, ccl);
1030 		      } else {
1031 			/* POSIX locales are painful - leave the decision to libc */
1032 			regex_t re;
1033 			char expr[6]; /* = { '[', c, '-', c2, ']', '\0' }; */
1034 
1035 			expr[0] = '['; expr[1] = c; expr[2] = '-';
1036 			expr[3] = c2; expr[4] = ']'; expr[5] = '\0';
1037 			if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1038 			  for (c = 0; c < NOTCHAR; ++c) {
1039 			    regmatch_t mat;
1040 			    char buf[2]; /* = { c, '\0' }; */
1041 
1042 			    buf[0] = c; buf[1] = '\0';
1043 			    if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1044                                && mat.rm_so == 0 && mat.rm_eo == 1)
1045                               setbit_case_fold (c, ccl);
1046 			  }
1047 			  regfree (&re);
1048 			}
1049 		      }
1050 		      continue;
1051 		    }
1052 		}
1053 
1054 	      setbit_case_fold (c, ccl);
1055 
1056 	    skip:
1057 	      ;
1058 	    }
1059 	  while ((c = c1) != ']');
1060 	  if (invert)
1061 	    {
1062 	      notset(ccl);
1063 	      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1064 		clrbit(eolbyte, ccl);
1065 	    }
1066 	  return lasttok = CSET + charclass_index(ccl);
1067 
1068 	default:
1069 	normal_char:
1070 	  laststart = 0;
1071 	  if (case_fold && ISALPHA(c))
1072 	    {
1073 	      zeroset(ccl);
1074 	      setbit_case_fold (c, ccl);
1075 	      return lasttok = CSET + charclass_index(ccl);
1076 	    }
1077 	  return c;
1078 	}
1079     }
1080 
1081   /* The above loop should consume at most a backslash
1082      and some other character. */
1083   abort();
1084   return END;	/* keeps pedantic compilers happy. */
1085 }
1086 
1087 /* Recursive descent parser for regular expressions. */
1088 
1089 static token tok;		/* Lookahead token. */
1090 static int depth;		/* Current depth of a hypothetical stack
1091 				   holding deferred productions.  This is
1092 				   used to determine the depth that will be
1093 				   required of the real stack later on in
1094 				   dfaanalyze(). */
1095 
1096 /* Add the given token to the parse tree, maintaining the depth count and
1097    updating the maximum depth if necessary. */
1098 static void
addtok(token t)1099 addtok (token t)
1100 {
1101 #ifdef MBS_SUPPORT
1102   if (MB_CUR_MAX > 1)
1103     {
1104       REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1105 			   dfa->tindex);
1106       /* Set dfa->multibyte_prop.  See struct dfa in dfa.h.  */
1107       if (t == MBCSET)
1108 	dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1109       else if (t < NOTCHAR)
1110 	dfa->multibyte_prop[dfa->tindex]
1111 	  = (cur_mb_len == 1)? 3 /* single-byte char */
1112 	  : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1113 	     + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1114       else
1115 	/* It may be unnecesssary, but it is safer to treat other
1116 	   symbols as singlebyte characters.  */
1117 	dfa->multibyte_prop[dfa->tindex] = 3;
1118     }
1119 #endif
1120 
1121   REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1122   dfa->tokens[dfa->tindex++] = t;
1123 
1124   switch (t)
1125     {
1126     case QMARK:
1127     case STAR:
1128     case PLUS:
1129       break;
1130 
1131     case CAT:
1132     case OR:
1133     case ORTOP:
1134       --depth;
1135       break;
1136 
1137     default:
1138       ++dfa->nleaves;
1139     case EMPTY:
1140       ++depth;
1141       break;
1142     }
1143   if (depth > dfa->depth)
1144     dfa->depth = depth;
1145 }
1146 
1147 /* The grammar understood by the parser is as follows.
1148 
1149    regexp:
1150      regexp OR branch
1151      branch
1152 
1153    branch:
1154      branch closure
1155      closure
1156 
1157    closure:
1158      closure QMARK
1159      closure STAR
1160      closure PLUS
1161      closure REPMN
1162      atom
1163 
1164    atom:
1165      <normal character>
1166      <multibyte character>
1167      ANYCHAR
1168      MBCSET
1169      CSET
1170      BACKREF
1171      BEGLINE
1172      ENDLINE
1173      BEGWORD
1174      ENDWORD
1175      LIMWORD
1176      NOTLIMWORD
1177      CRANGE
1178      LPAREN regexp RPAREN
1179      <empty>
1180 
1181    The parser builds a parse tree in postfix form in an array of tokens. */
1182 
1183 static void
atom(void)1184 atom (void)
1185 {
1186   if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1187       || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1188 #ifdef MBS_SUPPORT
1189       || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1190 #endif /* MBS_SUPPORT */
1191       || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1192     {
1193       addtok(tok);
1194       tok = lex();
1195 #ifdef MBS_SUPPORT
1196       /* We treat a multibyte character as a single atom, so that DFA
1197 	 can treat a multibyte character as a single expression.
1198 
1199          e.g. We construct following tree from "<mb1><mb2>".
1200               <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1201               <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1202       */
1203       if (MB_CUR_MAX > 1)
1204 	{
1205 	  while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1206 	    {
1207 	      addtok(tok);
1208 	      addtok(CAT);
1209 	      tok = lex();
1210 	    }
1211 	}
1212 #endif /* MBS_SUPPORT  */
1213     }
1214   else if (tok == CRANGE)
1215     {
1216       /* A character range like "[a-z]" in a locale other than "C" or
1217 	 "POSIX".  This range might any sequence of one or more
1218 	 characters.  Unfortunately the POSIX locale primitives give
1219 	 us no practical way to find what character sequences might be
1220 	 matched.  Treat this approximately like "(.\1)" -- i.e. match
1221 	 one character, and then punt to the full matcher.  */
1222       charclass ccl;
1223       zeroset (ccl);
1224       notset (ccl);
1225       addtok (CSET + charclass_index (ccl));
1226       addtok (BACKREF);
1227       addtok (CAT);
1228       tok = lex ();
1229     }
1230   else if (tok == LPAREN)
1231     {
1232       tok = lex();
1233       regexp(0);
1234       if (tok != RPAREN)
1235 	dfaerror(_("Unbalanced ("));
1236       tok = lex();
1237     }
1238   else
1239     addtok(EMPTY);
1240 }
1241 
1242 /* Return the number of tokens in the given subexpression. */
1243 static int
nsubtoks(int tindex)1244 nsubtoks (int tindex)
1245 {
1246   int ntoks1;
1247 
1248   switch (dfa->tokens[tindex - 1])
1249     {
1250     default:
1251       return 1;
1252     case QMARK:
1253     case STAR:
1254     case PLUS:
1255       return 1 + nsubtoks(tindex - 1);
1256     case CAT:
1257     case OR:
1258     case ORTOP:
1259       ntoks1 = nsubtoks(tindex - 1);
1260       return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1261     }
1262 }
1263 
1264 /* Copy the given subexpression to the top of the tree. */
1265 static void
copytoks(int tindex,int ntokens)1266 copytoks (int tindex, int ntokens)
1267 {
1268   int i;
1269 
1270   for (i = 0; i < ntokens; ++i)
1271     addtok(dfa->tokens[tindex + i]);
1272 }
1273 
1274 static void
closure(void)1275 closure (void)
1276 {
1277   int tindex, ntokens, i;
1278 
1279   atom();
1280   while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1281     if (tok == REPMN)
1282       {
1283 	ntokens = nsubtoks(dfa->tindex);
1284 	tindex = dfa->tindex - ntokens;
1285 	if (maxrep < 0)
1286 	  addtok(PLUS);
1287 	if (minrep == 0)
1288 	  addtok(QMARK);
1289 	for (i = 1; i < minrep; ++i)
1290 	  {
1291 	    copytoks(tindex, ntokens);
1292 	    addtok(CAT);
1293 	  }
1294 	for (; i < maxrep; ++i)
1295 	  {
1296 	    copytoks(tindex, ntokens);
1297 	    addtok(QMARK);
1298 	    addtok(CAT);
1299 	  }
1300 	tok = lex();
1301       }
1302     else
1303       {
1304 	addtok(tok);
1305 	tok = lex();
1306       }
1307 }
1308 
1309 static void
branch(void)1310 branch (void)
1311 {
1312   closure();
1313   while (tok != RPAREN && tok != OR && tok >= 0)
1314     {
1315       closure();
1316       addtok(CAT);
1317     }
1318 }
1319 
1320 static void
regexp(int toplevel)1321 regexp (int toplevel)
1322 {
1323   branch();
1324   while (tok == OR)
1325     {
1326       tok = lex();
1327       branch();
1328       if (toplevel)
1329 	addtok(ORTOP);
1330       else
1331 	addtok(OR);
1332     }
1333 }
1334 
1335 /* Main entry point for the parser.  S is a string to be parsed, len is the
1336    length of the string, so s can include NUL characters.  D is a pointer to
1337    the struct dfa to parse into. */
1338 void
dfaparse(char const * s,size_t len,struct dfa * d)1339 dfaparse (char const *s, size_t len, struct dfa *d)
1340 {
1341   dfa = d;
1342   lexstart = lexptr = s;
1343   lexleft = len;
1344   lasttok = END;
1345   laststart = 1;
1346   parens = 0;
1347 #if ENABLE_NLS
1348   hard_LC_COLLATE = hard_locale (LC_COLLATE);
1349 #endif
1350 #ifdef MBS_SUPPORT
1351   if (MB_CUR_MAX > 1)
1352     {
1353       cur_mb_index = 0;
1354       cur_mb_len = 0;
1355       memset(&mbs, 0, sizeof(mbstate_t));
1356     }
1357 #endif /* MBS_SUPPORT  */
1358 
1359   if (! syntax_bits_set)
1360     dfaerror(_("No syntax specified"));
1361 
1362   tok = lex();
1363   depth = d->depth;
1364 
1365   regexp(1);
1366 
1367   if (tok != END)
1368     dfaerror(_("Unbalanced )"));
1369 
1370   addtok(END - d->nregexps);
1371   addtok(CAT);
1372 
1373   if (d->nregexps)
1374     addtok(ORTOP);
1375 
1376   ++d->nregexps;
1377 }
1378 
1379 /* Some primitives for operating on sets of positions. */
1380 
1381 /* Copy one set to another; the destination must be large enough. */
1382 static void
copy(position_set const * src,position_set * dst)1383 copy (position_set const *src, position_set *dst)
1384 {
1385   int i;
1386 
1387   for (i = 0; i < src->nelem; ++i)
1388     dst->elems[i] = src->elems[i];
1389   dst->nelem = src->nelem;
1390 }
1391 
1392 /* Insert a position in a set.  Position sets are maintained in sorted
1393    order according to index.  If position already exists in the set with
1394    the same index then their constraints are logically or'd together.
1395    S->elems must point to an array large enough to hold the resulting set. */
1396 static void
insert(position p,position_set * s)1397 insert (position p, position_set *s)
1398 {
1399   int i;
1400   position t1, t2;
1401 
1402   for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1403     continue;
1404   if (i < s->nelem && p.index == s->elems[i].index)
1405     s->elems[i].constraint |= p.constraint;
1406   else
1407     {
1408       t1 = p;
1409       ++s->nelem;
1410       while (i < s->nelem)
1411 	{
1412 	  t2 = s->elems[i];
1413 	  s->elems[i++] = t1;
1414 	  t1 = t2;
1415 	}
1416     }
1417 }
1418 
1419 /* Merge two sets of positions into a third.  The result is exactly as if
1420    the positions of both sets were inserted into an initially empty set. */
1421 static void
merge(position_set const * s1,position_set const * s2,position_set * m)1422 merge (position_set const *s1, position_set const *s2, position_set *m)
1423 {
1424   int i = 0, j = 0;
1425 
1426   m->nelem = 0;
1427   while (i < s1->nelem && j < s2->nelem)
1428     if (s1->elems[i].index > s2->elems[j].index)
1429       m->elems[m->nelem++] = s1->elems[i++];
1430     else if (s1->elems[i].index < s2->elems[j].index)
1431       m->elems[m->nelem++] = s2->elems[j++];
1432     else
1433       {
1434 	m->elems[m->nelem] = s1->elems[i++];
1435 	m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1436       }
1437   while (i < s1->nelem)
1438     m->elems[m->nelem++] = s1->elems[i++];
1439   while (j < s2->nelem)
1440     m->elems[m->nelem++] = s2->elems[j++];
1441 }
1442 
1443 /* Delete a position from a set. */
1444 static void
delete(position p,position_set * s)1445 delete (position p, position_set *s)
1446 {
1447   int i;
1448 
1449   for (i = 0; i < s->nelem; ++i)
1450     if (p.index == s->elems[i].index)
1451       break;
1452   if (i < s->nelem)
1453     for (--s->nelem; i < s->nelem; ++i)
1454       s->elems[i] = s->elems[i + 1];
1455 }
1456 
1457 /* Find the index of the state corresponding to the given position set with
1458    the given preceding context, or create a new state if there is no such
1459    state.  Newline and letter tell whether we got here on a newline or
1460    letter, respectively. */
1461 static int
state_index(struct dfa * d,position_set const * s,int newline,int letter)1462 state_index (struct dfa *d, position_set const *s, int newline, int letter)
1463 {
1464   int hash = 0;
1465   int constraint;
1466   int i, j;
1467 
1468   newline = newline ? 1 : 0;
1469   letter = letter ? 1 : 0;
1470 
1471   for (i = 0; i < s->nelem; ++i)
1472     hash ^= s->elems[i].index + s->elems[i].constraint;
1473 
1474   /* Try to find a state that exactly matches the proposed one. */
1475   for (i = 0; i < d->sindex; ++i)
1476     {
1477       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1478 	  || newline != d->states[i].newline || letter != d->states[i].letter)
1479 	continue;
1480       for (j = 0; j < s->nelem; ++j)
1481 	if (s->elems[j].constraint
1482 	    != d->states[i].elems.elems[j].constraint
1483 	    || s->elems[j].index != d->states[i].elems.elems[j].index)
1484 	  break;
1485       if (j == s->nelem)
1486 	return i;
1487     }
1488 
1489   /* We'll have to create a new state. */
1490   REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1491   d->states[i].hash = hash;
1492   MALLOC(d->states[i].elems.elems, position, s->nelem);
1493   copy(s, &d->states[i].elems);
1494   d->states[i].newline = newline;
1495   d->states[i].letter = letter;
1496   d->states[i].backref = 0;
1497   d->states[i].constraint = 0;
1498   d->states[i].first_end = 0;
1499 #ifdef MBS_SUPPORT
1500   if (MB_CUR_MAX > 1)
1501     d->states[i].mbps.nelem = 0;
1502 #endif
1503   for (j = 0; j < s->nelem; ++j)
1504     if (d->tokens[s->elems[j].index] < 0)
1505       {
1506 	constraint = s->elems[j].constraint;
1507 	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1508 	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1509 	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1510 	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1511 	  d->states[i].constraint |= constraint;
1512 	if (! d->states[i].first_end)
1513 	  d->states[i].first_end = d->tokens[s->elems[j].index];
1514       }
1515     else if (d->tokens[s->elems[j].index] == BACKREF)
1516       {
1517 	d->states[i].constraint = NO_CONSTRAINT;
1518 	d->states[i].backref = 1;
1519       }
1520 
1521   ++d->sindex;
1522 
1523   return i;
1524 }
1525 
1526 /* Find the epsilon closure of a set of positions.  If any position of the set
1527    contains a symbol that matches the empty string in some context, replace
1528    that position with the elements of its follow labeled with an appropriate
1529    constraint.  Repeat exhaustively until no funny positions are left.
1530    S->elems must be large enough to hold the result. */
1531 static void
epsclosure(position_set * s,struct dfa const * d)1532 epsclosure (position_set *s, struct dfa const *d)
1533 {
1534   int i, j;
1535   int *visited;
1536   position p, old;
1537 
1538   MALLOC(visited, int, d->tindex);
1539   for (i = 0; i < d->tindex; ++i)
1540     visited[i] = 0;
1541 
1542   for (i = 0; i < s->nelem; ++i)
1543     if (d->tokens[s->elems[i].index] >= NOTCHAR
1544 	&& d->tokens[s->elems[i].index] != BACKREF
1545 #ifdef MBS_SUPPORT
1546 	&& d->tokens[s->elems[i].index] != ANYCHAR
1547 	&& d->tokens[s->elems[i].index] != MBCSET
1548 #endif
1549 	&& d->tokens[s->elems[i].index] < CSET)
1550       {
1551 	old = s->elems[i];
1552 	p.constraint = old.constraint;
1553 	delete(s->elems[i], s);
1554 	if (visited[old.index])
1555 	  {
1556 	    --i;
1557 	    continue;
1558 	  }
1559 	visited[old.index] = 1;
1560 	switch (d->tokens[old.index])
1561 	  {
1562 	  case BEGLINE:
1563 	    p.constraint &= BEGLINE_CONSTRAINT;
1564 	    break;
1565 	  case ENDLINE:
1566 	    p.constraint &= ENDLINE_CONSTRAINT;
1567 	    break;
1568 	  case BEGWORD:
1569 	    p.constraint &= BEGWORD_CONSTRAINT;
1570 	    break;
1571 	  case ENDWORD:
1572 	    p.constraint &= ENDWORD_CONSTRAINT;
1573 	    break;
1574 	  case LIMWORD:
1575 	    p.constraint &= LIMWORD_CONSTRAINT;
1576 	    break;
1577 	  case NOTLIMWORD:
1578 	    p.constraint &= NOTLIMWORD_CONSTRAINT;
1579 	    break;
1580 	  default:
1581 	    break;
1582 	  }
1583 	for (j = 0; j < d->follows[old.index].nelem; ++j)
1584 	  {
1585 	    p.index = d->follows[old.index].elems[j].index;
1586 	    insert(p, s);
1587 	  }
1588 	/* Force rescan to start at the beginning. */
1589 	i = -1;
1590       }
1591 
1592   free(visited);
1593 }
1594 
1595 /* Perform bottom-up analysis on the parse tree, computing various functions.
1596    Note that at this point, we're pretending constructs like \< are real
1597    characters rather than constraints on what can follow them.
1598 
1599    Nullable:  A node is nullable if it is at the root of a regexp that can
1600    match the empty string.
1601    *  EMPTY leaves are nullable.
1602    * No other leaf is nullable.
1603    * A QMARK or STAR node is nullable.
1604    * A PLUS node is nullable if its argument is nullable.
1605    * A CAT node is nullable if both its arguments are nullable.
1606    * An OR node is nullable if either argument is nullable.
1607 
1608    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
1609    that could correspond to the first character of a string matching the
1610    regexp rooted at the given node.
1611    * EMPTY leaves have empty firstpos.
1612    * The firstpos of a nonempty leaf is that leaf itself.
1613    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1614      argument.
1615    * The firstpos of a CAT node is the firstpos of the left argument, union
1616      the firstpos of the right if the left argument is nullable.
1617    * The firstpos of an OR node is the union of firstpos of each argument.
1618 
1619    Lastpos:  The lastpos of a node is the set of positions that could
1620    correspond to the last character of a string matching the regexp at
1621    the given node.
1622    * EMPTY leaves have empty lastpos.
1623    * The lastpos of a nonempty leaf is that leaf itself.
1624    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1625      argument.
1626    * The lastpos of a CAT node is the lastpos of its right argument, union
1627      the lastpos of the left if the right argument is nullable.
1628    * The lastpos of an OR node is the union of the lastpos of each argument.
1629 
1630    Follow:  The follow of a position is the set of positions that could
1631    correspond to the character following a character matching the node in
1632    a string matching the regexp.  At this point we consider special symbols
1633    that match the empty string in some context to be just normal characters.
1634    Later, if we find that a special symbol is in a follow set, we will
1635    replace it with the elements of its follow, labeled with an appropriate
1636    constraint.
1637    * Every node in the firstpos of the argument of a STAR or PLUS node is in
1638      the follow of every node in the lastpos.
1639    * Every node in the firstpos of the second argument of a CAT node is in
1640      the follow of every node in the lastpos of the first argument.
1641 
1642    Because of the postfix representation of the parse tree, the depth-first
1643    analysis is conveniently done by a linear scan with the aid of a stack.
1644    Sets are stored as arrays of the elements, obeying a stack-like allocation
1645    scheme; the number of elements in each set deeper in the stack can be
1646    used to determine the address of a particular set's array. */
1647 void
dfaanalyze(struct dfa * d,int searchflag)1648 dfaanalyze (struct dfa *d, int searchflag)
1649 {
1650   int *nullable;		/* Nullable stack. */
1651   int *nfirstpos;		/* Element count stack for firstpos sets. */
1652   position *firstpos;		/* Array where firstpos elements are stored. */
1653   int *nlastpos;		/* Element count stack for lastpos sets. */
1654   position *lastpos;		/* Array where lastpos elements are stored. */
1655   int *nalloc;			/* Sizes of arrays allocated to follow sets. */
1656   position_set tmp;		/* Temporary set for merging sets. */
1657   position_set merged;		/* Result of merging sets. */
1658   int wants_newline;		/* True if some position wants newline info. */
1659   int *o_nullable;
1660   int *o_nfirst, *o_nlast;
1661   position *o_firstpos, *o_lastpos;
1662   int i, j;
1663   position *pos;
1664 
1665 #ifdef DEBUG
1666   fprintf(stderr, "dfaanalyze:\n");
1667   for (i = 0; i < d->tindex; ++i)
1668     {
1669       fprintf(stderr, " %d:", i);
1670       prtok(d->tokens[i]);
1671     }
1672   putc('\n', stderr);
1673 #endif
1674 
1675   d->searchflag = searchflag;
1676 
1677   MALLOC(nullable, int, d->depth);
1678   o_nullable = nullable;
1679   MALLOC(nfirstpos, int, d->depth);
1680   o_nfirst = nfirstpos;
1681   MALLOC(firstpos, position, d->nleaves);
1682   o_firstpos = firstpos, firstpos += d->nleaves;
1683   MALLOC(nlastpos, int, d->depth);
1684   o_nlast = nlastpos;
1685   MALLOC(lastpos, position, d->nleaves);
1686   o_lastpos = lastpos, lastpos += d->nleaves;
1687   MALLOC(nalloc, int, d->tindex);
1688   for (i = 0; i < d->tindex; ++i)
1689     nalloc[i] = 0;
1690   MALLOC(merged.elems, position, d->nleaves);
1691 
1692   CALLOC(d->follows, position_set, d->tindex);
1693 
1694   for (i = 0; i < d->tindex; ++i)
1695 #ifdef DEBUG
1696     {				/* Nonsyntactic #ifdef goo... */
1697 #endif
1698     switch (d->tokens[i])
1699       {
1700       case EMPTY:
1701 	/* The empty set is nullable. */
1702 	*nullable++ = 1;
1703 
1704 	/* The firstpos and lastpos of the empty leaf are both empty. */
1705 	*nfirstpos++ = *nlastpos++ = 0;
1706 	break;
1707 
1708       case STAR:
1709       case PLUS:
1710 	/* Every element in the firstpos of the argument is in the follow
1711 	   of every element in the lastpos. */
1712 	tmp.nelem = nfirstpos[-1];
1713 	tmp.elems = firstpos;
1714 	pos = lastpos;
1715 	for (j = 0; j < nlastpos[-1]; ++j)
1716 	  {
1717 	    merge(&tmp, &d->follows[pos[j].index], &merged);
1718 	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1719 				 nalloc[pos[j].index], merged.nelem - 1);
1720 	    copy(&merged, &d->follows[pos[j].index]);
1721 	  }
1722 
1723       case QMARK:
1724 	/* A QMARK or STAR node is automatically nullable. */
1725 	if (d->tokens[i] != PLUS)
1726 	  nullable[-1] = 1;
1727 	break;
1728 
1729       case CAT:
1730 	/* Every element in the firstpos of the second argument is in the
1731 	   follow of every element in the lastpos of the first argument. */
1732 	tmp.nelem = nfirstpos[-1];
1733 	tmp.elems = firstpos;
1734 	pos = lastpos + nlastpos[-1];
1735 	for (j = 0; j < nlastpos[-2]; ++j)
1736 	  {
1737 	    merge(&tmp, &d->follows[pos[j].index], &merged);
1738 	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1739 				 nalloc[pos[j].index], merged.nelem - 1);
1740 	    copy(&merged, &d->follows[pos[j].index]);
1741 	  }
1742 
1743 	/* The firstpos of a CAT node is the firstpos of the first argument,
1744 	   union that of the second argument if the first is nullable. */
1745 	if (nullable[-2])
1746 	  nfirstpos[-2] += nfirstpos[-1];
1747 	else
1748 	  firstpos += nfirstpos[-1];
1749 	--nfirstpos;
1750 
1751 	/* The lastpos of a CAT node is the lastpos of the second argument,
1752 	   union that of the first argument if the second is nullable. */
1753 	if (nullable[-1])
1754 	  nlastpos[-2] += nlastpos[-1];
1755 	else
1756 	  {
1757 	    pos = lastpos + nlastpos[-2];
1758 	    for (j = nlastpos[-1] - 1; j >= 0; --j)
1759 	      pos[j] = lastpos[j];
1760 	    lastpos += nlastpos[-2];
1761 	    nlastpos[-2] = nlastpos[-1];
1762 	  }
1763 	--nlastpos;
1764 
1765 	/* A CAT node is nullable if both arguments are nullable. */
1766 	nullable[-2] = nullable[-1] && nullable[-2];
1767 	--nullable;
1768 	break;
1769 
1770       case OR:
1771       case ORTOP:
1772 	/* The firstpos is the union of the firstpos of each argument. */
1773 	nfirstpos[-2] += nfirstpos[-1];
1774 	--nfirstpos;
1775 
1776 	/* The lastpos is the union of the lastpos of each argument. */
1777 	nlastpos[-2] += nlastpos[-1];
1778 	--nlastpos;
1779 
1780 	/* An OR node is nullable if either argument is nullable. */
1781 	nullable[-2] = nullable[-1] || nullable[-2];
1782 	--nullable;
1783 	break;
1784 
1785       default:
1786 	/* Anything else is a nonempty position.  (Note that special
1787 	   constructs like \< are treated as nonempty strings here;
1788 	   an "epsilon closure" effectively makes them nullable later.
1789 	   Backreferences have to get a real position so we can detect
1790 	   transitions on them later.  But they are nullable. */
1791 	*nullable++ = d->tokens[i] == BACKREF;
1792 
1793 	/* This position is in its own firstpos and lastpos. */
1794 	*nfirstpos++ = *nlastpos++ = 1;
1795 	--firstpos, --lastpos;
1796 	firstpos->index = lastpos->index = i;
1797 	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1798 
1799 	/* Allocate the follow set for this position. */
1800 	nalloc[i] = 1;
1801 	MALLOC(d->follows[i].elems, position, nalloc[i]);
1802 	break;
1803       }
1804 #ifdef DEBUG
1805     /* ... balance the above nonsyntactic #ifdef goo... */
1806       fprintf(stderr, "node %d:", i);
1807       prtok(d->tokens[i]);
1808       putc('\n', stderr);
1809       fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1810       fprintf(stderr, " firstpos:");
1811       for (j = nfirstpos[-1] - 1; j >= 0; --j)
1812 	{
1813 	  fprintf(stderr, " %d:", firstpos[j].index);
1814 	  prtok(d->tokens[firstpos[j].index]);
1815 	}
1816       fprintf(stderr, "\n lastpos:");
1817       for (j = nlastpos[-1] - 1; j >= 0; --j)
1818 	{
1819 	  fprintf(stderr, " %d:", lastpos[j].index);
1820 	  prtok(d->tokens[lastpos[j].index]);
1821 	}
1822       putc('\n', stderr);
1823     }
1824 #endif
1825 
1826   /* For each follow set that is the follow set of a real position, replace
1827      it with its epsilon closure. */
1828   for (i = 0; i < d->tindex; ++i)
1829     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1830 #ifdef MBS_SUPPORT
1831         || d->tokens[i] == ANYCHAR
1832         || d->tokens[i] == MBCSET
1833 #endif
1834 	|| d->tokens[i] >= CSET)
1835       {
1836 #ifdef DEBUG
1837 	fprintf(stderr, "follows(%d:", i);
1838 	prtok(d->tokens[i]);
1839 	fprintf(stderr, "):");
1840 	for (j = d->follows[i].nelem - 1; j >= 0; --j)
1841 	  {
1842 	    fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1843 	    prtok(d->tokens[d->follows[i].elems[j].index]);
1844 	  }
1845 	putc('\n', stderr);
1846 #endif
1847 	copy(&d->follows[i], &merged);
1848 	epsclosure(&merged, d);
1849 	if (d->follows[i].nelem < merged.nelem)
1850 	  REALLOC(d->follows[i].elems, position, merged.nelem);
1851 	copy(&merged, &d->follows[i]);
1852       }
1853 
1854   /* Get the epsilon closure of the firstpos of the regexp.  The result will
1855      be the set of positions of state 0. */
1856   merged.nelem = 0;
1857   for (i = 0; i < nfirstpos[-1]; ++i)
1858     insert(firstpos[i], &merged);
1859   epsclosure(&merged, d);
1860 
1861   /* Check if any of the positions of state 0 will want newline context. */
1862   wants_newline = 0;
1863   for (i = 0; i < merged.nelem; ++i)
1864     if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1865       wants_newline = 1;
1866 
1867   /* Build the initial state. */
1868   d->salloc = 1;
1869   d->sindex = 0;
1870   MALLOC(d->states, dfa_state, d->salloc);
1871   state_index(d, &merged, wants_newline, 0);
1872 
1873   free(o_nullable);
1874   free(o_nfirst);
1875   free(o_firstpos);
1876   free(o_nlast);
1877   free(o_lastpos);
1878   free(nalloc);
1879   free(merged.elems);
1880 }
1881 
1882 /* Find, for each character, the transition out of state s of d, and store
1883    it in the appropriate slot of trans.
1884 
1885    We divide the positions of s into groups (positions can appear in more
1886    than one group).  Each group is labeled with a set of characters that
1887    every position in the group matches (taking into account, if necessary,
1888    preceding context information of s).  For each group, find the union
1889    of the its elements' follows.  This set is the set of positions of the
1890    new state.  For each character in the group's label, set the transition
1891    on this character to be to a state corresponding to the set's positions,
1892    and its associated backward context information, if necessary.
1893 
1894    If we are building a searching matcher, we include the positions of state
1895    0 in every state.
1896 
1897    The collection of groups is constructed by building an equivalence-class
1898    partition of the positions of s.
1899 
1900    For each position, find the set of characters C that it matches.  Eliminate
1901    any characters from C that fail on grounds of backward context.
1902 
1903    Search through the groups, looking for a group whose label L has nonempty
1904    intersection with C.  If L - C is nonempty, create a new group labeled
1905    L - C and having the same positions as the current group, and set L to
1906    the intersection of L and C.  Insert the position in this group, set
1907    C = C - L, and resume scanning.
1908 
1909    If after comparing with every group there are characters remaining in C,
1910    create a new group labeled with the characters of C and insert this
1911    position in that group. */
1912 void
dfastate(int s,struct dfa * d,int trans[])1913 dfastate (int s, struct dfa *d, int trans[])
1914 {
1915   position_set grps[NOTCHAR];	/* As many as will ever be needed. */
1916   charclass labels[NOTCHAR];	/* Labels corresponding to the groups. */
1917   int ngrps = 0;		/* Number of groups actually used. */
1918   position pos;			/* Current position being considered. */
1919   charclass matches;		/* Set of matching characters. */
1920   int matchesf;			/* True if matches is nonempty. */
1921   charclass intersect;		/* Intersection with some label set. */
1922   int intersectf;		/* True if intersect is nonempty. */
1923   charclass leftovers;		/* Stuff in the label that didn't match. */
1924   int leftoversf;		/* True if leftovers is nonempty. */
1925   static charclass letters;	/* Set of characters considered letters. */
1926   static charclass newline;	/* Set of characters that aren't newline. */
1927   position_set follows;		/* Union of the follows of some group. */
1928   position_set tmp;		/* Temporary space for merging sets. */
1929   int state;			/* New state. */
1930   int wants_newline;		/* New state wants to know newline context. */
1931   int state_newline;		/* New state on a newline transition. */
1932   int wants_letter;		/* New state wants to know letter context. */
1933   int state_letter;		/* New state on a letter transition. */
1934   static int initialized;	/* Flag for static initialization. */
1935 #ifdef MBS_SUPPORT
1936   int next_isnt_1st_byte = 0;	/* Flag If we can't add state0.  */
1937 #endif
1938   int i, j, k;
1939 
1940   /* Initialize the set of letters, if necessary. */
1941   if (! initialized)
1942     {
1943       initialized = 1;
1944       for (i = 0; i < NOTCHAR; ++i)
1945 	if (IS_WORD_CONSTITUENT(i))
1946 	  setbit(i, letters);
1947       setbit(eolbyte, newline);
1948     }
1949 
1950   zeroset(matches);
1951 
1952   for (i = 0; i < d->states[s].elems.nelem; ++i)
1953     {
1954       pos = d->states[s].elems.elems[i];
1955       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1956 	setbit(d->tokens[pos.index], matches);
1957       else if (d->tokens[pos.index] >= CSET)
1958 	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1959 #ifdef MBS_SUPPORT
1960       else if (d->tokens[pos.index] == ANYCHAR
1961                || d->tokens[pos.index] == MBCSET)
1962       /* MB_CUR_MAX > 1  */
1963 	{
1964 	  /* ANYCHAR and MBCSET must match with a single character, so we
1965 	     must put it to d->states[s].mbps, which contains the positions
1966 	     which can match with a single character not a byte.  */
1967 	  if (d->states[s].mbps.nelem == 0)
1968 	    {
1969 	      MALLOC(d->states[s].mbps.elems, position,
1970 		     d->states[s].elems.nelem);
1971 	    }
1972 	  insert(pos, &(d->states[s].mbps));
1973 	  continue;
1974 	}
1975 #endif /* MBS_SUPPORT */
1976       else
1977 	continue;
1978 
1979       /* Some characters may need to be eliminated from matches because
1980 	 they fail in the current context. */
1981       if (pos.constraint != 0xFF)
1982 	{
1983 	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1984 					 d->states[s].newline, 1))
1985 	    clrbit(eolbyte, matches);
1986 	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1987 					 d->states[s].newline, 0))
1988 	    for (j = 0; j < CHARCLASS_INTS; ++j)
1989 	      matches[j] &= newline[j];
1990 	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1991 					d->states[s].letter, 1))
1992 	    for (j = 0; j < CHARCLASS_INTS; ++j)
1993 	      matches[j] &= ~letters[j];
1994 	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1995 					d->states[s].letter, 0))
1996 	    for (j = 0; j < CHARCLASS_INTS; ++j)
1997 	      matches[j] &= letters[j];
1998 
1999 	  /* If there are no characters left, there's no point in going on. */
2000 	  for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2001 	    continue;
2002 	  if (j == CHARCLASS_INTS)
2003 	    continue;
2004 	}
2005 
2006       for (j = 0; j < ngrps; ++j)
2007 	{
2008 	  /* If matches contains a single character only, and the current
2009 	     group's label doesn't contain that character, go on to the
2010 	     next group. */
2011 	  if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2012 	      && !tstbit(d->tokens[pos.index], labels[j]))
2013 	    continue;
2014 
2015 	  /* Check if this group's label has a nonempty intersection with
2016 	     matches. */
2017 	  intersectf = 0;
2018 	  for (k = 0; k < CHARCLASS_INTS; ++k)
2019 	    (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2020 	  if (! intersectf)
2021 	    continue;
2022 
2023 	  /* It does; now find the set differences both ways. */
2024 	  leftoversf = matchesf = 0;
2025 	  for (k = 0; k < CHARCLASS_INTS; ++k)
2026 	    {
2027 	      /* Even an optimizing compiler can't know this for sure. */
2028 	      int match = matches[k], label = labels[j][k];
2029 
2030 	      (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2031 	      (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2032 	    }
2033 
2034 	  /* If there were leftovers, create a new group labeled with them. */
2035 	  if (leftoversf)
2036 	    {
2037 	      copyset(leftovers, labels[ngrps]);
2038 	      copyset(intersect, labels[j]);
2039 	      MALLOC(grps[ngrps].elems, position, d->nleaves);
2040 	      copy(&grps[j], &grps[ngrps]);
2041 	      ++ngrps;
2042 	    }
2043 
2044 	  /* Put the position in the current group.  Note that there is no
2045 	     reason to call insert() here. */
2046 	  grps[j].elems[grps[j].nelem++] = pos;
2047 
2048 	  /* If every character matching the current position has been
2049 	     accounted for, we're done. */
2050 	  if (! matchesf)
2051 	    break;
2052 	}
2053 
2054       /* If we've passed the last group, and there are still characters
2055 	 unaccounted for, then we'll have to create a new group. */
2056       if (j == ngrps)
2057 	{
2058 	  copyset(matches, labels[ngrps]);
2059 	  zeroset(matches);
2060 	  MALLOC(grps[ngrps].elems, position, d->nleaves);
2061 	  grps[ngrps].nelem = 1;
2062 	  grps[ngrps].elems[0] = pos;
2063 	  ++ngrps;
2064 	}
2065     }
2066 
2067   MALLOC(follows.elems, position, d->nleaves);
2068   MALLOC(tmp.elems, position, d->nleaves);
2069 
2070   /* If we are a searching matcher, the default transition is to a state
2071      containing the positions of state 0, otherwise the default transition
2072      is to fail miserably. */
2073   if (d->searchflag)
2074     {
2075       wants_newline = 0;
2076       wants_letter = 0;
2077       for (i = 0; i < d->states[0].elems.nelem; ++i)
2078 	{
2079 	  if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2080 	    wants_newline = 1;
2081 	  if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2082 	    wants_letter = 1;
2083 	}
2084       copy(&d->states[0].elems, &follows);
2085       state = state_index(d, &follows, 0, 0);
2086       if (wants_newline)
2087 	state_newline = state_index(d, &follows, 1, 0);
2088       else
2089 	state_newline = state;
2090       if (wants_letter)
2091 	state_letter = state_index(d, &follows, 0, 1);
2092       else
2093 	state_letter = state;
2094       for (i = 0; i < NOTCHAR; ++i)
2095 	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2096       trans[eolbyte] = state_newline;
2097     }
2098   else
2099     for (i = 0; i < NOTCHAR; ++i)
2100       trans[i] = -1;
2101 
2102   for (i = 0; i < ngrps; ++i)
2103     {
2104       follows.nelem = 0;
2105 
2106       /* Find the union of the follows of the positions of the group.
2107 	 This is a hideously inefficient loop.  Fix it someday. */
2108       for (j = 0; j < grps[i].nelem; ++j)
2109 	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2110 	  insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2111 
2112 #ifdef MBS_SUPPORT
2113       if (MB_CUR_MAX > 1)
2114 	{
2115 	  /* If a token in follows.elems is not 1st byte of a multibyte
2116 	     character, or the states of follows must accept the bytes
2117 	     which are not 1st byte of the multibyte character.
2118 	     Then, if a state of follows encounter a byte, it must not be
2119 	     a 1st byte of a multibyte character nor singlebyte character.
2120 	     We cansel to add state[0].follows to next state, because
2121 	     state[0] must accept 1st-byte
2122 
2123 	     For example, we assume <sb a> is a certain singlebyte
2124 	     character, <mb A> is a certain multibyte character, and the
2125 	     codepoint of <sb a> equals the 2nd byte of the codepoint of
2126 	     <mb A>.
2127 	     When state[0] accepts <sb a>, state[i] transit to state[i+1]
2128 	     by accepting accepts 1st byte of <mb A>, and state[i+1]
2129 	     accepts 2nd byte of <mb A>, if state[i+1] encounter the
2130 	     codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2131 	     <mb A>, so we can not add state[0].  */
2132 
2133 	  next_isnt_1st_byte = 0;
2134 	  for (j = 0; j < follows.nelem; ++j)
2135 	    {
2136 	      if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2137 		{
2138 		  next_isnt_1st_byte = 1;
2139 		  break;
2140 		}
2141 	    }
2142 	}
2143 #endif
2144 
2145       /* If we are building a searching matcher, throw in the positions
2146 	 of state 0 as well. */
2147 #ifdef MBS_SUPPORT
2148       if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
2149 #else
2150       if (d->searchflag)
2151 #endif
2152 	for (j = 0; j < d->states[0].elems.nelem; ++j)
2153 	  insert(d->states[0].elems.elems[j], &follows);
2154 
2155       /* Find out if the new state will want any context information. */
2156       wants_newline = 0;
2157       if (tstbit(eolbyte, labels[i]))
2158 	for (j = 0; j < follows.nelem; ++j)
2159 	  if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2160 	    wants_newline = 1;
2161 
2162       wants_letter = 0;
2163       for (j = 0; j < CHARCLASS_INTS; ++j)
2164 	if (labels[i][j] & letters[j])
2165 	  break;
2166       if (j < CHARCLASS_INTS)
2167 	for (j = 0; j < follows.nelem; ++j)
2168 	  if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2169 	    wants_letter = 1;
2170 
2171       /* Find the state(s) corresponding to the union of the follows. */
2172       state = state_index(d, &follows, 0, 0);
2173       if (wants_newline)
2174 	state_newline = state_index(d, &follows, 1, 0);
2175       else
2176 	state_newline = state;
2177       if (wants_letter)
2178 	state_letter = state_index(d, &follows, 0, 1);
2179       else
2180 	state_letter = state;
2181 
2182       /* Set the transitions for each character in the current label. */
2183       for (j = 0; j < CHARCLASS_INTS; ++j)
2184 	for (k = 0; k < INTBITS; ++k)
2185 	  if (labels[i][j] & 1 << k)
2186 	    {
2187 	      int c = j * INTBITS + k;
2188 
2189 	      if (c == eolbyte)
2190 		trans[c] = state_newline;
2191 	      else if (IS_WORD_CONSTITUENT(c))
2192 		trans[c] = state_letter;
2193 	      else if (c < NOTCHAR)
2194 		trans[c] = state;
2195 	    }
2196     }
2197 
2198   for (i = 0; i < ngrps; ++i)
2199     free(grps[i].elems);
2200   free(follows.elems);
2201   free(tmp.elems);
2202 }
2203 
2204 /* Some routines for manipulating a compiled dfa's transition tables.
2205    Each state may or may not have a transition table; if it does, and it
2206    is a non-accepting state, then d->trans[state] points to its table.
2207    If it is an accepting state then d->fails[state] points to its table.
2208    If it has no table at all, then d->trans[state] is NULL.
2209    TODO: Improve this comment, get rid of the unnecessary redundancy. */
2210 
2211 static void
build_state(int s,struct dfa * d)2212 build_state (int s, struct dfa *d)
2213 {
2214   int *trans;			/* The new transition table. */
2215   int i;
2216 
2217   /* Set an upper limit on the number of transition tables that will ever
2218      exist at once.  1024 is arbitrary.  The idea is that the frequently
2219      used transition tables will be quickly rebuilt, whereas the ones that
2220      were only needed once or twice will be cleared away. */
2221   if (d->trcount >= 1024)
2222     {
2223       for (i = 0; i < d->tralloc; ++i)
2224 	if (d->trans[i])
2225 	  {
2226 	    free((ptr_t) d->trans[i]);
2227 	    d->trans[i] = NULL;
2228 	  }
2229 	else if (d->fails[i])
2230 	  {
2231 	    free((ptr_t) d->fails[i]);
2232 	    d->fails[i] = NULL;
2233 	  }
2234       d->trcount = 0;
2235     }
2236 
2237   ++d->trcount;
2238 
2239   /* Set up the success bits for this state. */
2240   d->success[s] = 0;
2241   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2242       s, *d))
2243     d->success[s] |= 4;
2244   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2245       s, *d))
2246     d->success[s] |= 2;
2247   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2248       s, *d))
2249     d->success[s] |= 1;
2250 
2251   MALLOC(trans, int, NOTCHAR);
2252   dfastate(s, d, trans);
2253 
2254   /* Now go through the new transition table, and make sure that the trans
2255      and fail arrays are allocated large enough to hold a pointer for the
2256      largest state mentioned in the table. */
2257   for (i = 0; i < NOTCHAR; ++i)
2258     if (trans[i] >= d->tralloc)
2259       {
2260 	int oldalloc = d->tralloc;
2261 
2262 	while (trans[i] >= d->tralloc)
2263 	  d->tralloc *= 2;
2264 	REALLOC(d->realtrans, int *, d->tralloc + 1);
2265 	d->trans = d->realtrans + 1;
2266 	REALLOC(d->fails, int *, d->tralloc);
2267 	REALLOC(d->success, int, d->tralloc);
2268 	while (oldalloc < d->tralloc)
2269 	  {
2270 	    d->trans[oldalloc] = NULL;
2271 	    d->fails[oldalloc++] = NULL;
2272 	  }
2273       }
2274 
2275   /* Newline is a sentinel.  */
2276   trans[eolbyte] = -1;
2277 
2278   if (ACCEPTING(s, *d))
2279     d->fails[s] = trans;
2280   else
2281     d->trans[s] = trans;
2282 }
2283 
2284 static void
build_state_zero(struct dfa * d)2285 build_state_zero (struct dfa *d)
2286 {
2287   d->tralloc = 1;
2288   d->trcount = 0;
2289   CALLOC(d->realtrans, int *, d->tralloc + 1);
2290   d->trans = d->realtrans + 1;
2291   CALLOC(d->fails, int *, d->tralloc);
2292   MALLOC(d->success, int, d->tralloc);
2293   build_state(0, d);
2294 }
2295 
2296 #ifdef MBS_SUPPORT
2297 /* Multibyte character handling sub-routins for dfaexec.  */
2298 
2299 /* Initial state may encounter the byte which is not a singlebyte character
2300    nor 1st byte of a multibyte character.  But it is incorrect for initial
2301    state to accept such a byte.
2302    For example, in sjis encoding the regular expression like "\\" accepts
2303    the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2304    0x815c. Then Initial state must skip the bytes which are not a singlebyte
2305    character nor 1st byte of a multibyte character.  */
2306 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)		\
2307   if (s == 0)						\
2308     {							\
2309       while (inputwcs[p - buf_begin] == 0		\
2310             && mblen_buf[p - buf_begin] > 0		\
2311 	    && p < buf_end)				\
2312         ++p;						\
2313       if (p >= end)					\
2314 	{						\
2315           free(mblen_buf);				\
2316           free(inputwcs);				\
2317 	  return (size_t) -1;				\
2318 	}						\
2319     }
2320 
2321 static void
realloc_trans_if_necessary(struct dfa * d,int new_state)2322 realloc_trans_if_necessary(struct dfa *d, int new_state)
2323 {
2324   /* Make sure that the trans and fail arrays are allocated large enough
2325      to hold a pointer for the new state. */
2326   if (new_state >= d->tralloc)
2327     {
2328       int oldalloc = d->tralloc;
2329 
2330       while (new_state >= d->tralloc)
2331 	d->tralloc *= 2;
2332       REALLOC(d->realtrans, int *, d->tralloc + 1);
2333       d->trans = d->realtrans + 1;
2334       REALLOC(d->fails, int *, d->tralloc);
2335       REALLOC(d->success, int, d->tralloc);
2336       while (oldalloc < d->tralloc)
2337 	{
2338 	  d->trans[oldalloc] = NULL;
2339 	  d->fails[oldalloc++] = NULL;
2340 	}
2341     }
2342 }
2343 
2344 /* Return values of transit_state_singlebyte(), and
2345    transit_state_consume_1char.  */
2346 typedef enum
2347 {
2348   TRANSIT_STATE_IN_PROGRESS,	/* State transition has not finished.  */
2349   TRANSIT_STATE_DONE,		/* State transition has finished.  */
2350   TRANSIT_STATE_END_BUFFER	/* Reach the end of the buffer.  */
2351 } status_transit_state;
2352 
2353 /* Consume a single byte and transit state from 's' to '*next_state'.
2354    This function is almost same as the state transition routin in dfaexec().
2355    But state transition is done just once, otherwise matching succeed or
2356    reach the end of the buffer.  */
2357 static status_transit_state
transit_state_singlebyte(struct dfa * d,int s,unsigned char const * p,int * next_state)2358 transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2359 				  int *next_state)
2360 {
2361   int *t;
2362   int works = s;
2363 
2364   status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2365 
2366   while (rval == TRANSIT_STATE_IN_PROGRESS)
2367     {
2368       if ((t = d->trans[works]) != NULL)
2369 	{
2370 	  works = t[*p];
2371 	  rval = TRANSIT_STATE_DONE;
2372 	  if (works < 0)
2373 	    works = 0;
2374 	}
2375       else if (works < 0)
2376 	{
2377 	  if (p == buf_end)
2378 	    /* At the moment, it must not happen.  */
2379 	    return TRANSIT_STATE_END_BUFFER;
2380 	  works = 0;
2381 	}
2382       else if (d->fails[works])
2383 	{
2384 	  works = d->fails[works][*p];
2385 	  rval = TRANSIT_STATE_DONE;
2386 	}
2387       else
2388 	{
2389 	  build_state(works, d);
2390 	}
2391     }
2392   *next_state = works;
2393   return rval;
2394 }
2395 
2396 /* Check whether period can match or not in the current context.  If it can,
2397    return the amount of the bytes with which period can match, otherwise
2398    return 0.
2399    `pos' is the position of the period.  `index' is the index from the
2400    buf_begin, and it is the current position in the buffer.  */
2401 static int
match_anychar(struct dfa * d,int s,position pos,int index)2402 match_anychar (struct dfa *d, int s, position pos, int index)
2403 {
2404   int newline = 0;
2405   int letter = 0;
2406   wchar_t wc;
2407   int mbclen;
2408 
2409   wc = inputwcs[index];
2410   mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2411 
2412   /* Check context.  */
2413   if (wc == (wchar_t)eolbyte)
2414     {
2415       if (!(syntax_bits & RE_DOT_NEWLINE))
2416 	return 0;
2417       newline = 1;
2418     }
2419   else if (wc == (wchar_t)'\0')
2420     {
2421       if (syntax_bits & RE_DOT_NOT_NULL)
2422 	return 0;
2423       newline = 1;
2424     }
2425 
2426   if (iswalnum(wc) || wc == L'_')
2427     letter = 1;
2428 
2429   if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2430 			   newline, d->states[s].letter, letter))
2431     return 0;
2432 
2433   return mbclen;
2434 }
2435 
2436 /* Check whether bracket expression can match or not in the current context.
2437    If it can, return the amount of the bytes with which expression can match,
2438    otherwise return 0.
2439    `pos' is the position of the bracket expression.  `index' is the index
2440    from the buf_begin, and it is the current position in the buffer.  */
2441 int
match_mb_charset(struct dfa * d,int s,position pos,int index)2442 match_mb_charset (struct dfa *d, int s, position pos, int index)
2443 {
2444   int i;
2445   int match;		/* Flag which represent that matching succeed.  */
2446   int match_len;	/* Length of the character (or collating element)
2447 			   with which this operator match.  */
2448   int op_len;		/* Length of the operator.  */
2449   char buffer[128];
2450   wchar_t wcbuf[6];
2451 
2452   /* Pointer to the structure to which we are currently reffering.  */
2453   struct mb_char_classes *work_mbc;
2454 
2455   int newline = 0;
2456   int letter = 0;
2457   wchar_t wc;		/* Current reffering character.  */
2458 
2459   wc = inputwcs[index];
2460 
2461   /* Check context.  */
2462   if (wc == (wchar_t)eolbyte)
2463     {
2464       if (!(syntax_bits & RE_DOT_NEWLINE))
2465 	return 0;
2466       newline = 1;
2467     }
2468   else if (wc == (wchar_t)'\0')
2469     {
2470       if (syntax_bits & RE_DOT_NOT_NULL)
2471 	return 0;
2472       newline = 1;
2473     }
2474   if (iswalnum(wc) || wc == L'_')
2475     letter = 1;
2476   if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2477 			   newline, d->states[s].letter, letter))
2478     return 0;
2479 
2480   /* Assign the current reffering operator to work_mbc.  */
2481   work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2482   match = !work_mbc->invert;
2483   match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2484 
2485   /* match with a character class?  */
2486   for (i = 0; i<work_mbc->nch_classes; i++)
2487     {
2488       if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2489 	goto charset_matched;
2490     }
2491 
2492   strncpy(buffer, buf_begin + index, match_len);
2493   buffer[match_len] = '\0';
2494 
2495   /* match with an equivalent class?  */
2496   for (i = 0; i<work_mbc->nequivs; i++)
2497     {
2498       op_len = strlen(work_mbc->equivs[i]);
2499       strncpy(buffer, buf_begin + index, op_len);
2500       buffer[op_len] = '\0';
2501       if (strcoll(work_mbc->equivs[i], buffer) == 0)
2502 	{
2503 	  match_len = op_len;
2504 	  goto charset_matched;
2505 	}
2506     }
2507 
2508   /* match with a collating element?  */
2509   for (i = 0; i<work_mbc->ncoll_elems; i++)
2510     {
2511       op_len = strlen(work_mbc->coll_elems[i]);
2512       strncpy(buffer, buf_begin + index, op_len);
2513       buffer[op_len] = '\0';
2514 
2515       if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2516 	{
2517 	  match_len = op_len;
2518 	  goto charset_matched;
2519 	}
2520     }
2521 
2522   wcbuf[0] = wc;
2523   wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2524 
2525   /* match with a range?  */
2526   for (i = 0; i<work_mbc->nranges; i++)
2527     {
2528       wcbuf[2] = work_mbc->range_sts[i];
2529       wcbuf[4] = work_mbc->range_ends[i];
2530 
2531       if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2532 	  wcscoll(wcbuf+4, wcbuf) >= 0)
2533 	goto charset_matched;
2534     }
2535 
2536   /* match with a character?  */
2537   for (i = 0; i<work_mbc->nchars; i++)
2538     {
2539       if (wc == work_mbc->chars[i])
2540 	goto charset_matched;
2541     }
2542 
2543   match = !match;
2544 
2545  charset_matched:
2546   return match ? match_len : 0;
2547 }
2548 
2549 /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2550    array which corresponds to `d->states[s].mbps.elem' and each element of
2551    the array contains the amount of the bytes with which the element can
2552    match.
2553    `index' is the index from the buf_begin, and it is the current position
2554    in the buffer.
2555    Caller MUST free the array which this function return.  */
2556 static int*
check_matching_with_multibyte_ops(struct dfa * d,int s,int index)2557 check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2558 {
2559   int i;
2560   int* rarray;
2561 
2562   MALLOC(rarray, int, d->states[s].mbps.nelem);
2563   for (i = 0; i < d->states[s].mbps.nelem; ++i)
2564     {
2565       position pos = d->states[s].mbps.elems[i];
2566       switch(d->tokens[pos.index])
2567 	{
2568 	case ANYCHAR:
2569 	  rarray[i] = match_anychar(d, s, pos, index);
2570 	  break;
2571 	case MBCSET:
2572 	  rarray[i] = match_mb_charset(d, s, pos, index);
2573 	  break;
2574 	default:
2575 	  break; /* can not happen.  */
2576 	}
2577     }
2578   return rarray;
2579 }
2580 
2581 /* Consume a single character and enumerate all of the positions which can
2582    be next position from the state `s'.
2583    `match_lens' is the input. It can be NULL, but it can also be the output
2584    of check_matching_with_multibyte_ops() for optimization.
2585    `mbclen' and `pps' are the output.  `mbclen' is the length of the
2586    character consumed, and `pps' is the set this function enumerate.  */
2587 static status_transit_state
transit_state_consume_1char(struct dfa * d,int s,unsigned char const ** pp,int * match_lens,int * mbclen,position_set * pps)2588 transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2589 			     int *match_lens, int *mbclen, position_set *pps)
2590 {
2591   int i, j;
2592   int s1, s2;
2593   int* work_mbls;
2594   status_transit_state rs = TRANSIT_STATE_DONE;
2595 
2596   /* Calculate the length of the (single/multi byte) character
2597      to which p points.  */
2598   *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2599     : mblen_buf[*pp - buf_begin];
2600 
2601   /* Calculate the state which can be reached from the state `s' by
2602      consuming `*mbclen' single bytes from the buffer.  */
2603   s1 = s;
2604   for (i = 0; i < *mbclen; i++)
2605     {
2606       s2 = s1;
2607       rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2608     }
2609   /* Copy the positions contained by `s1' to the set `pps'.  */
2610   copy(&(d->states[s1].elems), pps);
2611 
2612   /* Check (inputed)match_lens, and initialize if it is NULL.  */
2613   if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2614     work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2615   else
2616     work_mbls = match_lens;
2617 
2618   /* Add all of the positions which can be reached from `s' by consuming
2619      a single character.  */
2620   for (i = 0; i < d->states[s].mbps.nelem ; i++)
2621    {
2622       if (work_mbls[i] == *mbclen)
2623 	for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2624 	     j++)
2625 	  insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2626 		 pps);
2627     }
2628 
2629   if (match_lens == NULL && work_mbls != NULL)
2630     free(work_mbls);
2631   return rs;
2632 }
2633 
2634 /* Transit state from s, then return new state and update the pointer of the
2635    buffer.  This function is for some operator which can match with a multi-
2636    byte character or a collating element(which may be multi characters).  */
2637 static int
transit_state(struct dfa * d,int s,unsigned char const ** pp)2638 transit_state (struct dfa *d, int s, unsigned char const **pp)
2639 {
2640   int s1;
2641   int mbclen;		/* The length of current input multibyte character. */
2642   int maxlen = 0;
2643   int i, j;
2644   int *match_lens = NULL;
2645   int nelem = d->states[s].mbps.nelem; /* Just a alias.  */
2646   position_set follows;
2647   unsigned char const *p1 = *pp;
2648   status_transit_state rs;
2649   wchar_t wc;
2650 
2651   if (nelem > 0)
2652     /* This state has (a) multibyte operator(s).
2653        We check whether each of them can match or not.  */
2654     {
2655       /* Note: caller must free the return value of this function.  */
2656       match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2657 
2658       for (i = 0; i < nelem; i++)
2659 	/* Search the operator which match the longest string,
2660 	   in this state.  */
2661 	{
2662 	  if (match_lens[i] > maxlen)
2663 	    maxlen = match_lens[i];
2664 	}
2665     }
2666 
2667   if (nelem == 0 || maxlen == 0)
2668     /* This state has no multibyte operator which can match.
2669        We need to  check only one singlebyte character.  */
2670     {
2671       status_transit_state rs;
2672       rs = transit_state_singlebyte(d, s, *pp, &s1);
2673 
2674       /* We must update the pointer if state transition succeeded.  */
2675       if (rs == TRANSIT_STATE_DONE)
2676 	++*pp;
2677 
2678       if (match_lens != NULL)
2679 	free(match_lens);
2680       return s1;
2681     }
2682 
2683   /* This state has some operators which can match a multibyte character.  */
2684   follows.nelem = 0;
2685   MALLOC(follows.elems, position, d->nleaves);
2686 
2687   /* `maxlen' may be longer than the length of a character, because it may
2688      not be a character but a (multi character) collating element.
2689      We enumerate all of the positions which `s' can reach by consuming
2690      `maxlen' bytes.  */
2691   rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2692 
2693   wc = inputwcs[*pp - mbclen - buf_begin];
2694   s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2695   realloc_trans_if_necessary(d, s1);
2696 
2697   while (*pp - p1 < maxlen)
2698     {
2699       follows.nelem = 0;
2700       rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2701 
2702       for (i = 0; i < nelem ; i++)
2703 	{
2704 	  if (match_lens[i] == *pp - p1)
2705 	    for (j = 0;
2706 		 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2707 	      insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2708 		     &follows);
2709 	}
2710 
2711       wc = inputwcs[*pp - mbclen - buf_begin];
2712       s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2713       realloc_trans_if_necessary(d, s1);
2714     }
2715   free(match_lens);
2716   free(follows.elems);
2717   return s1;
2718 }
2719 
2720 #endif
2721 
2722 /* Search through a buffer looking for a match to the given struct dfa.
2723    Find the first occurrence of a string matching the regexp in the buffer,
2724    and the shortest possible version thereof.  Return the offset of the first
2725    character after the match, or (size_t) -1 if none is found.  BEGIN points to
2726    the beginning of the buffer, and SIZE is the size of the buffer.  If SIZE
2727    is nonzero, BEGIN[SIZE - 1] must be a newline.  BACKREF points to a place
2728    where we're supposed to store a 1 if backreferencing happened and the
2729    match needs to be verified by a backtracking matcher.  Otherwise
2730    we store a 0 in *backref. */
2731 size_t
dfaexec(struct dfa * d,char const * begin,size_t size,int * backref)2732 dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
2733 {
2734   register int s;	/* Current state. */
2735   register unsigned char const *p; /* Current input character. */
2736   register unsigned char const *end; /* One past the last input character.  */
2737   register int **trans, *t;	/* Copy of d->trans so it can be optimized
2738 				   into a register. */
2739   register unsigned char eol = eolbyte;	/* Likewise for eolbyte.  */
2740   static int sbit[NOTCHAR];	/* Table for anding with d->success. */
2741   static int sbit_init;
2742 
2743   if (! sbit_init)
2744     {
2745       int i;
2746 
2747       sbit_init = 1;
2748       for (i = 0; i < NOTCHAR; ++i)
2749 	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2750       sbit[eol] = 4;
2751     }
2752 
2753   if (! d->tralloc)
2754     build_state_zero(d);
2755 
2756   s = 0;
2757   p = (unsigned char const *) begin;
2758   end = p + size;
2759   trans = d->trans;
2760 
2761 #ifdef MBS_SUPPORT
2762   if (MB_CUR_MAX > 1)
2763     {
2764       int remain_bytes, i;
2765       buf_begin = begin;
2766       buf_end = end;
2767 
2768       /* initialize mblen_buf, and inputwcs.  */
2769       MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
2770       MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
2771       memset(&mbs, 0, sizeof(mbstate_t));
2772       remain_bytes = 0;
2773       for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
2774 	{
2775 	  if (remain_bytes == 0)
2776 	    {
2777 	      remain_bytes
2778 		= mbrtowc(inputwcs + i, begin + i,
2779 			  end - (unsigned char const *)begin - i + 1, &mbs);
2780 	      if (remain_bytes <= 1)
2781 		{
2782 		  remain_bytes = 0;
2783 		  inputwcs[i] = (wchar_t)begin[i];
2784 		  mblen_buf[i] = 0;
2785 		}
2786 	      else
2787 		{
2788 		  mblen_buf[i] = remain_bytes;
2789 		  remain_bytes--;
2790 		}
2791 	    }
2792 	  else
2793 	    {
2794 	      mblen_buf[i] = remain_bytes;
2795 	      inputwcs[i] = 0;
2796 	      remain_bytes--;
2797 	    }
2798 	}
2799       mblen_buf[i] = 0;
2800       inputwcs[i] = 0; /* sentinel */
2801     }
2802 #endif /* MBS_SUPPORT */
2803 
2804   for (;;)
2805     {
2806 #ifdef MBS_SUPPORT
2807       if (MB_CUR_MAX > 1)
2808 	while ((t = trans[s]))
2809 	  {
2810 	    if (p == end)
2811 	      {
2812 		free(mblen_buf);
2813 		free(inputwcs);
2814 		return (size_t) -1;
2815 	      }
2816 	    if (d->states[s].mbps.nelem != 0)
2817 	      {
2818 		/* Can match with a multibyte character( and multi character
2819 		   collating element).  */
2820 		unsigned char const *nextp;
2821 
2822 		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2823 
2824 		nextp = p;
2825 		s = transit_state(d, s, &nextp);
2826 		p = nextp;
2827 
2828 		/* Trans table might be updated.  */
2829 		trans = d->trans;
2830 	      }
2831 	    else
2832 	      {
2833 		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2834 		s = t[*p++];
2835 	      }
2836 	  }
2837       else
2838 #endif /* MBS_SUPPORT */
2839         while ((t = trans[s]))
2840 	  {
2841 	    if (p == end)
2842 	      return (size_t) -1;
2843 	    s = t[*p++];
2844 	  }
2845 
2846       if (s < 0)
2847 	{
2848 	  if (p == end)
2849 	    {
2850 #ifdef MBS_SUPPORT
2851 	      if (MB_CUR_MAX > 1)
2852 		{
2853 		  free(mblen_buf);
2854 		  free(inputwcs);
2855 		}
2856 #endif /* MBS_SUPPORT */
2857 	      return (size_t) -1;
2858 	    }
2859 	  s = 0;
2860 	}
2861       else if ((t = d->fails[s]))
2862 	{
2863 	  if (d->success[s] & sbit[*p])
2864 	    {
2865 	      if (backref)
2866 		*backref = (d->states[s].backref != 0);
2867 #ifdef MBS_SUPPORT
2868 	      if (MB_CUR_MAX > 1)
2869 		{
2870 		  free(mblen_buf);
2871 		  free(inputwcs);
2872 		}
2873 #endif /* MBS_SUPPORT */
2874 	      return (char const *) p - begin;
2875 	    }
2876 
2877 #ifdef MBS_SUPPORT
2878 	  if (MB_CUR_MAX > 1)
2879 	    {
2880 		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2881 		if (d->states[s].mbps.nelem != 0)
2882 		  {
2883 		    /* Can match with a multibyte character( and multi
2884 		       character collating element).  */
2885 		    unsigned char const *nextp;
2886 		    nextp = p;
2887 		    s = transit_state(d, s, &nextp);
2888 		    p = nextp;
2889 
2890 		    /* Trans table might be updated.  */
2891 		    trans = d->trans;
2892 		  }
2893 		else
2894 		s = t[*p++];
2895 	    }
2896 	  else
2897 #endif /* MBS_SUPPORT */
2898 	  s = t[*p++];
2899 	}
2900       else
2901 	{
2902 	  build_state(s, d);
2903 	  trans = d->trans;
2904 	}
2905     }
2906 }
2907 
2908 /* Initialize the components of a dfa that the other routines don't
2909    initialize for themselves. */
2910 void
dfainit(struct dfa * d)2911 dfainit (struct dfa *d)
2912 {
2913   d->calloc = 1;
2914   MALLOC(d->charclasses, charclass, d->calloc);
2915   d->cindex = 0;
2916 
2917   d->talloc = 1;
2918   MALLOC(d->tokens, token, d->talloc);
2919   d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2920 #ifdef MBS_SUPPORT
2921   if (MB_CUR_MAX > 1)
2922     {
2923       d->nmultibyte_prop = 1;
2924       MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2925       d->nmbcsets = 0;
2926       d->mbcsets_alloc = 1;
2927       MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2928     }
2929 #endif
2930 
2931   d->searchflag = 0;
2932   d->tralloc = 0;
2933 
2934   d->musts = 0;
2935 }
2936 
2937 /* Parse and analyze a single string of the given length. */
2938 void
dfacomp(char const * s,size_t len,struct dfa * d,int searchflag)2939 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
2940 {
2941   if (case_fold)	/* dummy folding in service of dfamust() */
2942     {
2943       char *lcopy;
2944       int i;
2945 
2946       lcopy = malloc(len);
2947       if (!lcopy)
2948 	dfaerror(_("out of memory"));
2949 
2950       /* This is a kludge. */
2951       case_fold = 0;
2952       for (i = 0; i < len; ++i)
2953 	if (ISUPPER ((unsigned char) s[i]))
2954 	  lcopy[i] = tolower ((unsigned char) s[i]);
2955 	else
2956 	  lcopy[i] = s[i];
2957 
2958       dfainit(d);
2959       dfaparse(lcopy, len, d);
2960       free(lcopy);
2961       dfamust(d);
2962       d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2963       case_fold = 1;
2964       dfaparse(s, len, d);
2965       dfaanalyze(d, searchflag);
2966     }
2967   else
2968     {
2969         dfainit(d);
2970         dfaparse(s, len, d);
2971 	dfamust(d);
2972         dfaanalyze(d, searchflag);
2973     }
2974 }
2975 
2976 /* Free the storage held by the components of a dfa. */
2977 void
dfafree(struct dfa * d)2978 dfafree (struct dfa *d)
2979 {
2980   int i;
2981   struct dfamust *dm, *ndm;
2982 
2983   free((ptr_t) d->charclasses);
2984   free((ptr_t) d->tokens);
2985 
2986 #ifdef MBS_SUPPORT
2987   if (MB_CUR_MAX > 1)
2988     {
2989       free((ptr_t) d->multibyte_prop);
2990       for (i = 0; i < d->nmbcsets; ++i)
2991 	{
2992 	  int j;
2993 	  struct mb_char_classes *p = &(d->mbcsets[i]);
2994 	  if (p->chars != NULL)
2995 	    free(p->chars);
2996 	  if (p->ch_classes != NULL)
2997 	    free(p->ch_classes);
2998 	  if (p->range_sts != NULL)
2999 	    free(p->range_sts);
3000 	  if (p->range_ends != NULL)
3001 	    free(p->range_ends);
3002 
3003 	  for (j = 0; j < p->nequivs; ++j)
3004 	    free(p->equivs[j]);
3005 	  if (p->equivs != NULL)
3006 	    free(p->equivs);
3007 
3008 	  for (j = 0; j < p->ncoll_elems; ++j)
3009 	    free(p->coll_elems[j]);
3010 	  if (p->coll_elems != NULL)
3011 	    free(p->coll_elems);
3012 	}
3013       free((ptr_t) d->mbcsets);
3014     }
3015 #endif /* MBS_SUPPORT */
3016 
3017   for (i = 0; i < d->sindex; ++i)
3018     free((ptr_t) d->states[i].elems.elems);
3019   free((ptr_t) d->states);
3020   for (i = 0; i < d->tindex; ++i)
3021     if (d->follows[i].elems)
3022       free((ptr_t) d->follows[i].elems);
3023   free((ptr_t) d->follows);
3024   for (i = 0; i < d->tralloc; ++i)
3025     if (d->trans[i])
3026       free((ptr_t) d->trans[i]);
3027     else if (d->fails[i])
3028       free((ptr_t) d->fails[i]);
3029   if (d->realtrans) free((ptr_t) d->realtrans);
3030   if (d->fails) free((ptr_t) d->fails);
3031   if (d->success) free((ptr_t) d->success);
3032   for (dm = d->musts; dm; dm = ndm)
3033     {
3034       ndm = dm->next;
3035       free(dm->must);
3036       free((ptr_t) dm);
3037     }
3038 }
3039 
3040 /* Having found the postfix representation of the regular expression,
3041    try to find a long sequence of characters that must appear in any line
3042    containing the r.e.
3043    Finding a "longest" sequence is beyond the scope here;
3044    we take an easy way out and hope for the best.
3045    (Take "(ab|a)b"--please.)
3046 
3047    We do a bottom-up calculation of sequences of characters that must appear
3048    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3049    representation:
3050 	sequences that must appear at the left of the match ("left")
3051 	sequences that must appear at the right of the match ("right")
3052 	lists of sequences that must appear somewhere in the match ("in")
3053 	sequences that must constitute the match ("is")
3054 
3055    When we get to the root of the tree, we use one of the longest of its
3056    calculated "in" sequences as our answer.  The sequence we find is returned in
3057    d->must (where "d" is the single argument passed to "dfamust");
3058    the length of the sequence is returned in d->mustn.
3059 
3060    The sequences calculated for the various types of node (in pseudo ANSI c)
3061    are shown below.  "p" is the operand of unary operators (and the left-hand
3062    operand of binary operators); "q" is the right-hand operand of binary
3063    operators.
3064 
3065    "ZERO" means "a zero-length sequence" below.
3066 
3067 	Type	left		right		is		in
3068 	----	----		-----		--		--
3069 	char c	# c		# c		# c		# c
3070 
3071 	ANYCHAR	ZERO		ZERO		ZERO		ZERO
3072 
3073 	MBCSET	ZERO		ZERO		ZERO		ZERO
3074 
3075 	CSET	ZERO		ZERO		ZERO		ZERO
3076 
3077 	STAR	ZERO		ZERO		ZERO		ZERO
3078 
3079 	QMARK	ZERO		ZERO		ZERO		ZERO
3080 
3081 	PLUS	p->left		p->right	ZERO		p->in
3082 
3083 	CAT	(p->is==ZERO)?	(q->is==ZERO)?	(p->is!=ZERO &&	p->in plus
3084 		p->left :	q->right :	q->is!=ZERO) ?	q->in plus
3085 		p->is##q->left	p->right##q->is	p->is##q->is :	p->right##q->left
3086 						ZERO
3087 
3088 	OR	longest common	longest common	(do p->is and	substrings common to
3089 		leading		trailing	q->is have same	p->in and q->in
3090 		(sub)sequence	(sub)sequence	length and
3091 		of p->left	of p->right	content) ?
3092 		and q->left	and q->right	p->is : NULL
3093 
3094    If there's anything else we recognize in the tree, all four sequences get set
3095    to zero-length sequences.  If there's something we don't recognize in the tree,
3096    we just return a zero-length sequence.
3097 
3098    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3099    'aaa')?
3100 
3101    And. . .is it here or someplace that we might ponder "optimizations" such as
3102 	egrep 'psi|epsilon'	->	egrep 'psi'
3103 	egrep 'pepsi|epsilon'	->	egrep 'epsi'
3104 					(Yes, we now find "epsi" as a "string
3105 					that must occur", but we might also
3106 					simplify the *entire* r.e. being sought)
3107 	grep '[c]'		->	grep 'c'
3108 	grep '(ab|a)b'		->	grep 'ab'
3109 	grep 'ab*'		->	grep 'a'
3110 	grep 'a*b'		->	grep 'b'
3111 
3112    There are several issues:
3113 
3114    Is optimization easy (enough)?
3115 
3116    Does optimization actually accomplish anything,
3117    or is the automaton you get from "psi|epsilon" (for example)
3118    the same as the one you get from "psi" (for example)?
3119 
3120    Are optimizable r.e.'s likely to be used in real-life situations
3121    (something like 'ab*' is probably unlikely; something like is
3122    'psi|epsilon' is likelier)? */
3123 
3124 static char *
icatalloc(char * old,char * new)3125 icatalloc (char *old, char *new)
3126 {
3127   char *result;
3128   size_t oldsize, newsize;
3129 
3130   newsize = (new == NULL) ? 0 : strlen(new);
3131   if (old == NULL)
3132     oldsize = 0;
3133   else if (newsize == 0)
3134     return old;
3135   else	oldsize = strlen(old);
3136   if (old == NULL)
3137     result = (char *) malloc(newsize + 1);
3138   else
3139     result = (char *) realloc((void *) old, oldsize + newsize + 1);
3140   if (result != NULL && new != NULL)
3141     (void) strcpy(result + oldsize, new);
3142   return result;
3143 }
3144 
3145 static char *
icpyalloc(char * string)3146 icpyalloc (char *string)
3147 {
3148   return icatalloc((char *) NULL, string);
3149 }
3150 
3151 static char *
istrstr(char * lookin,char * lookfor)3152 istrstr (char *lookin, char *lookfor)
3153 {
3154   char *cp;
3155   size_t len;
3156 
3157   len = strlen(lookfor);
3158   for (cp = lookin; *cp != '\0'; ++cp)
3159     if (strncmp(cp, lookfor, len) == 0)
3160       return cp;
3161   return NULL;
3162 }
3163 
3164 static void
ifree(char * cp)3165 ifree (char *cp)
3166 {
3167   if (cp != NULL)
3168     free(cp);
3169 }
3170 
3171 static void
freelist(char ** cpp)3172 freelist (char **cpp)
3173 {
3174   int i;
3175 
3176   if (cpp == NULL)
3177     return;
3178   for (i = 0; cpp[i] != NULL; ++i)
3179     {
3180       free(cpp[i]);
3181       cpp[i] = NULL;
3182     }
3183 }
3184 
3185 static char **
enlist(char ** cpp,char * new,size_t len)3186 enlist (char **cpp, char *new, size_t len)
3187 {
3188   int i, j;
3189 
3190   if (cpp == NULL)
3191     return NULL;
3192   if ((new = icpyalloc(new)) == NULL)
3193     {
3194       freelist(cpp);
3195       return NULL;
3196     }
3197   new[len] = '\0';
3198   /* Is there already something in the list that's new (or longer)? */
3199   for (i = 0; cpp[i] != NULL; ++i)
3200     if (istrstr(cpp[i], new) != NULL)
3201       {
3202 	free(new);
3203 	return cpp;
3204       }
3205   /* Eliminate any obsoleted strings. */
3206   j = 0;
3207   while (cpp[j] != NULL)
3208     if (istrstr(new, cpp[j]) == NULL)
3209       ++j;
3210     else
3211       {
3212 	free(cpp[j]);
3213 	if (--i == j)
3214 	  break;
3215 	cpp[j] = cpp[i];
3216 	cpp[i] = NULL;
3217       }
3218   /* Add the new string. */
3219   cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
3220   if (cpp == NULL)
3221     return NULL;
3222   cpp[i] = new;
3223   cpp[i + 1] = NULL;
3224   return cpp;
3225 }
3226 
3227 /* Given pointers to two strings, return a pointer to an allocated
3228    list of their distinct common substrings. Return NULL if something
3229    seems wild. */
3230 static char **
comsubs(char * left,char * right)3231 comsubs (char *left, char *right)
3232 {
3233   char **cpp;
3234   char *lcp;
3235   char *rcp;
3236   size_t i, len;
3237 
3238   if (left == NULL || right == NULL)
3239     return NULL;
3240   cpp = (char **) malloc(sizeof *cpp);
3241   if (cpp == NULL)
3242     return NULL;
3243   cpp[0] = NULL;
3244   for (lcp = left; *lcp != '\0'; ++lcp)
3245     {
3246       len = 0;
3247       rcp = strchr (right, *lcp);
3248       while (rcp != NULL)
3249 	{
3250 	  for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3251 	    continue;
3252 	  if (i > len)
3253 	    len = i;
3254 	  rcp = strchr (rcp + 1, *lcp);
3255 	}
3256       if (len == 0)
3257 	continue;
3258       if ((cpp = enlist(cpp, lcp, len)) == NULL)
3259 	break;
3260     }
3261   return cpp;
3262 }
3263 
3264 static char **
addlists(char ** old,char ** new)3265 addlists (char **old, char **new)
3266 {
3267   int i;
3268 
3269   if (old == NULL || new == NULL)
3270     return NULL;
3271   for (i = 0; new[i] != NULL; ++i)
3272     {
3273       old = enlist(old, new[i], strlen(new[i]));
3274       if (old == NULL)
3275 	break;
3276     }
3277   return old;
3278 }
3279 
3280 /* Given two lists of substrings, return a new list giving substrings
3281    common to both. */
3282 static char **
inboth(char ** left,char ** right)3283 inboth (char **left, char **right)
3284 {
3285   char **both;
3286   char **temp;
3287   int lnum, rnum;
3288 
3289   if (left == NULL || right == NULL)
3290     return NULL;
3291   both = (char **) malloc(sizeof *both);
3292   if (both == NULL)
3293     return NULL;
3294   both[0] = NULL;
3295   for (lnum = 0; left[lnum] != NULL; ++lnum)
3296     {
3297       for (rnum = 0; right[rnum] != NULL; ++rnum)
3298 	{
3299 	  temp = comsubs(left[lnum], right[rnum]);
3300 	  if (temp == NULL)
3301 	    {
3302 	      freelist(both);
3303 	      return NULL;
3304 	    }
3305 	  both = addlists(both, temp);
3306 	  freelist(temp);
3307 	  free(temp);
3308 	  if (both == NULL)
3309 	    return NULL;
3310 	}
3311     }
3312   return both;
3313 }
3314 
3315 typedef struct
3316 {
3317   char **in;
3318   char *left;
3319   char *right;
3320   char *is;
3321 } must;
3322 
3323 static void
resetmust(must * mp)3324 resetmust (must *mp)
3325 {
3326   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3327   freelist(mp->in);
3328 }
3329 
3330 static void
dfamust(struct dfa * dfa)3331 dfamust (struct dfa *dfa)
3332 {
3333   must *musts;
3334   must *mp;
3335   char *result;
3336   int ri;
3337   int i;
3338   int exact;
3339   token t;
3340   static must must0;
3341   struct dfamust *dm;
3342   static char empty_string[] = "";
3343 
3344   result = empty_string;
3345   exact = 0;
3346   musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
3347   if (musts == NULL)
3348     return;
3349   mp = musts;
3350   for (i = 0; i <= dfa->tindex; ++i)
3351     mp[i] = must0;
3352   for (i = 0; i <= dfa->tindex; ++i)
3353     {
3354       mp[i].in = (char **) malloc(sizeof *mp[i].in);
3355       mp[i].left = malloc(2);
3356       mp[i].right = malloc(2);
3357       mp[i].is = malloc(2);
3358       if (mp[i].in == NULL || mp[i].left == NULL ||
3359 	  mp[i].right == NULL || mp[i].is == NULL)
3360 	goto done;
3361       mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3362       mp[i].in[0] = NULL;
3363     }
3364 #ifdef DEBUG
3365   fprintf(stderr, "dfamust:\n");
3366   for (i = 0; i < dfa->tindex; ++i)
3367     {
3368       fprintf(stderr, " %d:", i);
3369       prtok(dfa->tokens[i]);
3370     }
3371   putc('\n', stderr);
3372 #endif
3373   for (ri = 0; ri < dfa->tindex; ++ri)
3374     {
3375       switch (t = dfa->tokens[ri])
3376 	{
3377 	case LPAREN:
3378 	case RPAREN:
3379 	  goto done;		/* "cannot happen" */
3380 	case EMPTY:
3381 	case BEGLINE:
3382 	case ENDLINE:
3383 	case BEGWORD:
3384 	case ENDWORD:
3385 	case LIMWORD:
3386 	case NOTLIMWORD:
3387 	case BACKREF:
3388 	  resetmust(mp);
3389 	  break;
3390 	case STAR:
3391 	case QMARK:
3392 	  if (mp <= musts)
3393 	    goto done;		/* "cannot happen" */
3394 	  --mp;
3395 	  resetmust(mp);
3396 	  break;
3397 	case OR:
3398 	case ORTOP:
3399 	  if (mp < &musts[2])
3400 	    goto done;		/* "cannot happen" */
3401 	  {
3402 	    char **new;
3403 	    must *lmp;
3404 	    must *rmp;
3405 	    int j, ln, rn, n;
3406 
3407 	    rmp = --mp;
3408 	    lmp = --mp;
3409 	    /* Guaranteed to be.  Unlikely, but. . . */
3410 	    if (strcmp(lmp->is, rmp->is) != 0)
3411 	      lmp->is[0] = '\0';
3412 	    /* Left side--easy */
3413 	    i = 0;
3414 	    while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3415 	      ++i;
3416 	    lmp->left[i] = '\0';
3417 	    /* Right side */
3418 	    ln = strlen(lmp->right);
3419 	    rn = strlen(rmp->right);
3420 	    n = ln;
3421 	    if (n > rn)
3422 	      n = rn;
3423 	    for (i = 0; i < n; ++i)
3424 	      if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3425 		break;
3426 	    for (j = 0; j < i; ++j)
3427 	      lmp->right[j] = lmp->right[(ln - i) + j];
3428 	    lmp->right[j] = '\0';
3429 	    new = inboth(lmp->in, rmp->in);
3430 	    if (new == NULL)
3431 	      goto done;
3432 	    freelist(lmp->in);
3433 	    free((char *) lmp->in);
3434 	    lmp->in = new;
3435 	  }
3436 	  break;
3437 	case PLUS:
3438 	  if (mp <= musts)
3439 	    goto done;		/* "cannot happen" */
3440 	  --mp;
3441 	  mp->is[0] = '\0';
3442 	  break;
3443 	case END:
3444 	  if (mp != &musts[1])
3445 	    goto done;		/* "cannot happen" */
3446 	  for (i = 0; musts[0].in[i] != NULL; ++i)
3447 	    if (strlen(musts[0].in[i]) > strlen(result))
3448 	      result = musts[0].in[i];
3449 	  if (strcmp(result, musts[0].is) == 0)
3450 	    exact = 1;
3451 	  goto done;
3452 	case CAT:
3453 	  if (mp < &musts[2])
3454 	    goto done;		/* "cannot happen" */
3455 	  {
3456 	    must *lmp;
3457 	    must *rmp;
3458 
3459 	    rmp = --mp;
3460 	    lmp = --mp;
3461 	    /* In.  Everything in left, plus everything in
3462 	       right, plus catenation of
3463 	       left's right and right's left. */
3464 	    lmp->in = addlists(lmp->in, rmp->in);
3465 	    if (lmp->in == NULL)
3466 	      goto done;
3467 	    if (lmp->right[0] != '\0' &&
3468 		rmp->left[0] != '\0')
3469 	      {
3470 		char *tp;
3471 
3472 		tp = icpyalloc(lmp->right);
3473 		if (tp == NULL)
3474 		  goto done;
3475 		tp = icatalloc(tp, rmp->left);
3476 		if (tp == NULL)
3477 		  goto done;
3478 		lmp->in = enlist(lmp->in, tp,
3479 				 strlen(tp));
3480 		free(tp);
3481 		if (lmp->in == NULL)
3482 		  goto done;
3483 	      }
3484 	    /* Left-hand */
3485 	    if (lmp->is[0] != '\0')
3486 	      {
3487 		lmp->left = icatalloc(lmp->left,
3488 				      rmp->left);
3489 		if (lmp->left == NULL)
3490 		  goto done;
3491 	      }
3492 	    /* Right-hand */
3493 	    if (rmp->is[0] == '\0')
3494 	      lmp->right[0] = '\0';
3495 	    lmp->right = icatalloc(lmp->right, rmp->right);
3496 	    if (lmp->right == NULL)
3497 	      goto done;
3498 	    /* Guaranteed to be */
3499 	    if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3500 	      {
3501 		lmp->is = icatalloc(lmp->is, rmp->is);
3502 		if (lmp->is == NULL)
3503 		  goto done;
3504 	      }
3505 	    else
3506 	      lmp->is[0] = '\0';
3507 	  }
3508 	  break;
3509 	default:
3510 	  if (t < END)
3511 	    {
3512 	      /* "cannot happen" */
3513 	      goto done;
3514 	    }
3515 	  else if (t == '\0')
3516 	    {
3517 	      /* not on *my* shift */
3518 	      goto done;
3519 	    }
3520 	  else if (t >= CSET
3521 #ifdef MBS_SUPPORT
3522 		   || t == ANYCHAR
3523 		   || t == MBCSET
3524 #endif /* MBS_SUPPORT */
3525 		   )
3526 	    {
3527 	      /* easy enough */
3528 	      resetmust(mp);
3529 	    }
3530 	  else
3531 	    {
3532 	      /* plain character */
3533 	      resetmust(mp);
3534 	      mp->is[0] = mp->left[0] = mp->right[0] = t;
3535 	      mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3536 	      mp->in = enlist(mp->in, mp->is, (size_t)1);
3537 	      if (mp->in == NULL)
3538 		goto done;
3539 	    }
3540 	  break;
3541 	}
3542 #ifdef DEBUG
3543       fprintf(stderr, " node: %d:", ri);
3544       prtok(dfa->tokens[ri]);
3545       fprintf(stderr, "\n  in:");
3546       for (i = 0; mp->in[i]; ++i)
3547 	fprintf(stderr, " \"%s\"", mp->in[i]);
3548       fprintf(stderr, "\n  is: \"%s\"\n", mp->is);
3549       fprintf(stderr, "  left: \"%s\"\n", mp->left);
3550       fprintf(stderr, "  right: \"%s\"\n", mp->right);
3551 #endif
3552       ++mp;
3553     }
3554  done:
3555   if (strlen(result))
3556     {
3557       dm = (struct dfamust *) malloc(sizeof (struct dfamust));
3558       dm->exact = exact;
3559       dm->must = malloc(strlen(result) + 1);
3560       strcpy(dm->must, result);
3561       dm->next = dfa->musts;
3562       dfa->musts = dm;
3563     }
3564   mp = musts;
3565   for (i = 0; i <= dfa->tindex; ++i)
3566     {
3567       freelist(mp[i].in);
3568       ifree((char *) mp[i].in);
3569       ifree(mp[i].left);
3570       ifree(mp[i].right);
3571       ifree(mp[i].is);
3572     }
3573   free((char *) mp);
3574 }
3575 /* vim:set shiftwidth=2: */
3576