xref: /netbsd-src/external/gpl2/grep/dist/src/dfa.c (revision 1ec328561f11503113febedadb5064276f8d8017)
1 /*	$NetBSD: dfa.c,v 1.3 2018/06/12 21:22:47 kamil Exp $	*/
2 
3 /* dfa.c - deterministic extended regexp routines for GNU
4    Copyright 1988, 1998, 2000 Free Software Foundation, Inc.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software
18    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA */
19 
20 /* Written June, 1988 by Mike Haertel
21    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
22 
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26 
27 #include <assert.h>
28 #include <ctype.h>
29 #include <stdio.h>
30 
31 #include <sys/types.h>
32 #ifdef STDC_HEADERS
33 #include <stdlib.h>
34 #else
35 extern char *calloc(), *malloc(), *realloc();
36 extern void free();
37 #endif
38 
39 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
40 #include <string.h>
41 #else
42 #include <strings.h>
43 #endif
44 
45 #if HAVE_SETLOCALE
46 # include <locale.h>
47 #endif
48 
49 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC && defined HAVE_WCTYPE
50 /* We can handle multibyte string.  */
51 # define MBS_SUPPORT
52 #endif
53 
54 #ifdef MBS_SUPPORT
55 # include <wchar.h>
56 # include <wctype.h>
57 #endif
58 
59 #ifndef DEBUG	/* use the same approach as regex.c */
60 #undef assert
61 #define assert(e)
62 #endif /* DEBUG */
63 
64 #ifndef isgraph
65 #define isgraph(C) (isprint(C) && !isspace(C))
66 #endif
67 
68 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
69 #define ISALPHA(C) isalpha(C)
70 #define ISUPPER(C) isupper(C)
71 #define ISLOWER(C) islower(C)
72 #define ISDIGIT(C) isdigit(C)
73 #define ISXDIGIT(C) isxdigit(C)
74 #define ISSPACE(C) isspace(C)
75 #define ISPUNCT(C) ispunct(C)
76 #define ISALNUM(C) isalnum(C)
77 #define ISPRINT(C) isprint(C)
78 #define ISGRAPH(C) isgraph(C)
79 #define ISCNTRL(C) iscntrl(C)
80 #else
81 #define ISALPHA(C) (isascii(C) && isalpha(C))
82 #define ISUPPER(C) (isascii(C) && isupper(C))
83 #define ISLOWER(C) (isascii(C) && islower(C))
84 #define ISDIGIT(C) (isascii(C) && isdigit(C))
85 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
86 #define ISSPACE(C) (isascii(C) && isspace(C))
87 #define ISPUNCT(C) (isascii(C) && ispunct(C))
88 #define ISALNUM(C) (isascii(C) && isalnum(C))
89 #define ISPRINT(C) (isascii(C) && isprint(C))
90 #define ISGRAPH(C) (isascii(C) && isgraph(C))
91 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
92 #endif
93 
94 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
95    - Its arg may be any int or unsigned int; it need not be an unsigned char.
96    - It's guaranteed to evaluate its argument exactly once.
97    - It's typically faster.
98    Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
99    only '0' through '9' are digits.  Prefer ISASCIIDIGIT to ISDIGIT unless
100    it's important to use the locale's definition of `digit' even when the
101    host does not conform to Posix.  */
102 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
103 
104 /* If we (don't) have I18N.  */
105 /* glibc defines _ */
106 #ifndef _
107 # ifdef HAVE_LIBINTL_H
108 #  include <libintl.h>
109 #  ifndef _
110 #   define _(Str) gettext (Str)
111 #  endif
112 # else
113 #  define _(Str) (Str)
114 # endif
115 #endif
116 
117 #include "regex.h"
118 #include "dfa.h"
119 #include "hard-locale.h"
120 
121 /* HPUX, define those as macros in sys/param.h */
122 #ifdef setbit
123 # undef setbit
124 #endif
125 #ifdef clrbit
126 # undef clrbit
127 #endif
128 
129 static void dfamust PARAMS ((struct dfa *dfa));
130 static void regexp PARAMS ((int toplevel));
131 
132 static ptr_t
xcalloc(size_t n,size_t s)133 xcalloc (size_t n, size_t s)
134 {
135   ptr_t r = calloc(n, s);
136 
137   if (!r)
138     dfaerror(_("Memory exhausted"));
139   return r;
140 }
141 
142 static ptr_t
xmalloc(size_t n)143 xmalloc (size_t n)
144 {
145   ptr_t r = malloc(n);
146 
147   assert(n != 0);
148   if (!r)
149     dfaerror(_("Memory exhausted"));
150   return r;
151 }
152 
153 static ptr_t
xrealloc(ptr_t p,size_t n)154 xrealloc (ptr_t p, size_t n)
155 {
156   ptr_t r = realloc(p, n);
157 
158   assert(n != 0);
159   if (!r)
160     dfaerror(_("Memory exhausted"));
161   return r;
162 }
163 
164 #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
165 #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
166 #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
167 
168 /* Reallocate an array of type t if nalloc is too small for index. */
169 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
170   if ((index) >= (nalloc))			  \
171     {						  \
172       do					  \
173 	(nalloc) *= 2;				  \
174       while ((index) >= (nalloc));		  \
175       REALLOC(p, t, nalloc);			  \
176     }
177 
178 #ifdef DEBUG
179 
180 static void
prtok(token t)181 prtok (token t)
182 {
183   char const *s;
184 
185   if (t < 0)
186     fprintf(stderr, "END");
187   else if (t < NOTCHAR)
188     fprintf(stderr, "%c", t);
189   else
190     {
191       switch (t)
192 	{
193 	case EMPTY: s = "EMPTY"; break;
194 	case BACKREF: s = "BACKREF"; break;
195 	case BEGLINE: s = "BEGLINE"; break;
196 	case ENDLINE: s = "ENDLINE"; break;
197 	case BEGWORD: s = "BEGWORD"; break;
198 	case ENDWORD: s = "ENDWORD"; break;
199 	case LIMWORD: s = "LIMWORD"; break;
200 	case NOTLIMWORD: s = "NOTLIMWORD"; break;
201 	case QMARK: s = "QMARK"; break;
202 	case STAR: s = "STAR"; break;
203 	case PLUS: s = "PLUS"; break;
204 	case CAT: s = "CAT"; break;
205 	case OR: s = "OR"; break;
206 	case ORTOP: s = "ORTOP"; break;
207 	case LPAREN: s = "LPAREN"; break;
208 	case RPAREN: s = "RPAREN"; break;
209 	case CRANGE: s = "CRANGE"; break;
210 #ifdef MBS_SUPPORT
211 	case ANYCHAR: s = "ANYCHAR"; break;
212 	case MBCSET: s = "MBCSET"; break;
213 #endif /* MBS_SUPPORT */
214 	default: s = "CSET"; break;
215 	}
216       fprintf(stderr, "%s", s);
217     }
218 }
219 #endif /* DEBUG */
220 
221 /* Stuff pertaining to charclasses. */
222 
223 static int
tstbit(unsigned b,charclass c)224 tstbit (unsigned b, charclass c)
225 {
226   return c[b / INTBITS] & 1U << b % INTBITS;
227 }
228 
229 static void
setbit(unsigned b,charclass c)230 setbit (unsigned b, charclass c)
231 {
232   c[b / INTBITS] |= 1U << b % INTBITS;
233 }
234 
235 static void
clrbit(unsigned b,charclass c)236 clrbit (unsigned b, charclass c)
237 {
238   c[b / INTBITS] &= ~(1U << b % INTBITS);
239 }
240 
241 static void
copyset(charclass src,charclass dst)242 copyset (charclass src, charclass dst)
243 {
244   memcpy (dst, src, sizeof (charclass));
245 }
246 
247 static void
zeroset(charclass s)248 zeroset (charclass s)
249 {
250   memset (s, 0, sizeof (charclass));
251 }
252 
253 static void
notset(charclass s)254 notset (charclass s)
255 {
256   int i;
257 
258   for (i = 0; i < CHARCLASS_INTS; ++i)
259     s[i] = ~s[i];
260 }
261 
262 static int
equal(charclass s1,charclass s2)263 equal (charclass s1, charclass s2)
264 {
265   return memcmp (s1, s2, sizeof (charclass)) == 0;
266 }
267 
268 /* A pointer to the current dfa is kept here during parsing. */
269 static struct dfa *dfa;
270 
271 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
272 static int
charclass_index(charclass s)273 charclass_index (charclass s)
274 {
275   int i;
276 
277   for (i = 0; i < dfa->cindex; ++i)
278     if (equal(s, dfa->charclasses[i]))
279       return i;
280   REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
281   ++dfa->cindex;
282   copyset(s, dfa->charclasses[i]);
283   return i;
284 }
285 
286 /* Syntax bits controlling the behavior of the lexical analyzer. */
287 static reg_syntax_t syntax_bits, syntax_bits_set;
288 
289 /* Flag for case-folding letters into sets. */
290 static int case_fold;
291 
292 /* End-of-line byte in data.  */
293 static unsigned char eolbyte;
294 
295 /* Entry point to set syntax options. */
296 void
dfasyntax(reg_syntax_t bits,int fold,unsigned char eol)297 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
298 {
299   syntax_bits_set = 1;
300   syntax_bits = bits;
301   case_fold = fold;
302   eolbyte = eol;
303 }
304 
305 /* Like setbit, but if case is folded, set both cases of a letter.  */
306 static void
setbit_case_fold(unsigned b,charclass c)307 setbit_case_fold (unsigned b, charclass c)
308 {
309   setbit (b, c);
310   if (case_fold)
311     {
312       if (ISUPPER (b))
313 	setbit (tolower (b), c);
314       else if (ISLOWER (b))
315 	setbit (toupper (b), c);
316     }
317 }
318 
319 /* Lexical analyzer.  All the dross that deals with the obnoxious
320    GNU Regex syntax bits is located here.  The poor, suffering
321    reader is referred to the GNU Regex documentation for the
322    meaning of the @#%!@#%^!@ syntax bits. */
323 
324 static char const *lexstart;	/* Pointer to beginning of input string. */
325 static char const *lexptr;	/* Pointer to next input character. */
326 static int lexleft;		/* Number of characters remaining. */
327 static token lasttok;		/* Previous token returned; initially END. */
328 static int laststart;		/* True if we're separated from beginning or (, |
329 				   only by zero-width characters. */
330 static int parens;		/* Count of outstanding left parens. */
331 static int minrep, maxrep;	/* Repeat counts for {m,n}. */
332 static int hard_LC_COLLATE;	/* Nonzero if LC_COLLATE is hard.  */
333 
334 #ifdef MBS_SUPPORT
335 /* These variables are used only if (MB_CUR_MAX > 1).  */
336 static mbstate_t mbs;		/* Mbstate for mbrlen().  */
337 static ssize_t cur_mb_len;		/* Byte length of the current scanning
338 				   multibyte character.  */
339 static ssize_t cur_mb_index;        /* Byte index of the current scanning multibyte
340                                    character.
341 
342 				   singlebyte character : cur_mb_index = 0
343 				   multibyte character
344 				       1st byte : cur_mb_index = 1
345 				       2nd byte : cur_mb_index = 2
346 				         ...
347 				       nth byte : cur_mb_index = n  */
348 static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
349                                   Each element store the amount of remain
350                                   byte of corresponding multibyte character
351                                   in the input string.  A element's value
352                                   is 0 if corresponding character is a
353                                   singlebyte chracter.
354                                   e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
355                                    mblen_buf :   0,       3,       2,       1
356                                */
357 static wchar_t *inputwcs;	/* Wide character representation of input
358 				   string in dfaexec().
359 				   The length of this array is same as
360 				   the length of input string(char array).
361 				   inputstring[i] is a single-byte char,
362 				   or 1st byte of a multibyte char.
363 				   And inputwcs[i] is the codepoint.  */
364 static unsigned char const *buf_begin;/* refference to begin in dfaexec().  */
365 static unsigned char const *buf_end;	/* refference to end in dfaexec().  */
366 #endif /* MBS_SUPPORT  */
367 
368 #ifdef MBS_SUPPORT
369 /* This function update cur_mb_len, and cur_mb_index.
370    p points current lexptr, len is the remaining buffer length.  */
371 static void
update_mb_len_index(unsigned char const * p,size_t len)372 update_mb_len_index (unsigned char const *p, size_t len)
373 {
374   /* If last character is a part of a multibyte character,
375      we update cur_mb_index.  */
376   if (cur_mb_index)
377     cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
378 			: cur_mb_index + 1;
379 
380   /* If last character is a single byte character, or the
381      last portion of a multibyte character, we check whether
382      next character is a multibyte character or not.  */
383   if (! cur_mb_index)
384     {
385       cur_mb_len = mbrlen(p, len, &mbs);
386       if (cur_mb_len > 1)
387 	/* It is a multibyte character.
388 	   cur_mb_len was already set by mbrlen().  */
389 	cur_mb_index = 1;
390       else if (cur_mb_len < 1)
391 	/* Invalid sequence.  We treat it as a singlebyte character.
392 	   cur_mb_index is aleady 0.  */
393 	cur_mb_len = 1;
394       /* Otherwise, cur_mb_len == 1, it is a singlebyte character.
395 	 cur_mb_index is aleady 0.  */
396     }
397 }
398 #endif /* MBS_SUPPORT */
399 
400 #ifdef MBS_SUPPORT
401 /* Note that characters become unsigned here. */
402 # define FETCH(c, eoferr)			\
403   {						\
404     if (! lexleft)				\
405      {						\
406 	if (eoferr != 0)			\
407 	  dfaerror (eoferr);			\
408 	else					\
409 	  return lasttok = END;			\
410       }						\
411     if (MB_CUR_MAX > 1)				\
412       update_mb_len_index(lexptr, lexleft);	\
413     (c) = (unsigned char) *lexptr++;		\
414     --lexleft;					\
415   }
416 
417 /* This function fetch a wide character, and update cur_mb_len,
418    used only if the current locale is a multibyte environment.  */
419 static wchar_t
fetch_wc(char const * eoferr)420 fetch_wc (char const *eoferr)
421 {
422   wchar_t wc;
423   if (! lexleft)
424     {
425       if (eoferr != 0)
426 	dfaerror (eoferr);
427       else
428 	return -1;
429     }
430 
431   cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
432   if (cur_mb_len <= 0)
433    {
434       cur_mb_len = 1;
435       wc = *lexptr;
436     }
437   lexptr += cur_mb_len;
438   lexleft -= cur_mb_len;
439   return wc;
440 }
441 #else
442 /* Note that characters become unsigned here. */
443 # define FETCH(c, eoferr)   	      \
444   {			   	      \
445     if (! lexleft)	   	      \
446       {				      \
447 	if (eoferr != 0)	      \
448 	  dfaerror (eoferr);	      \
449 	else		   	      \
450 	  return lasttok = END;	      \
451       }				      \
452     (c) = (unsigned char) *lexptr++;  \
453     --lexleft;		   	      \
454   }
455 #endif /* MBS_SUPPORT */
456 
457 #ifdef MBS_SUPPORT
458 /* Multibyte character handling sub-routin for lex.
459    This function  parse a bracket expression and build a struct
460    mb_char_classes.  */
461 static void
parse_bracket_exp_mb()462 parse_bracket_exp_mb ()
463 {
464   wchar_t wc, wc1, wc2;
465 
466   /* Work area to build a mb_char_classes.  */
467   struct mb_char_classes *work_mbc;
468   int chars_al, range_sts_al, range_ends_al, ch_classes_al,
469     equivs_al, coll_elems_al;
470 
471   REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
472 		       dfa->mbcsets_alloc, dfa->nmbcsets + 1);
473   /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
474      We will update dfa->multibyte_prop in addtok(), because we can't
475      decide the index in dfa->tokens[].  */
476 
477   /* Initialize work are */
478   work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
479 
480   chars_al = 1;
481   range_sts_al = range_ends_al = 0;
482   ch_classes_al = equivs_al = coll_elems_al = 0;
483   MALLOC(work_mbc->chars, wchar_t, chars_al);
484 
485   work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
486   work_mbc->nequivs = work_mbc->ncoll_elems = 0;
487   work_mbc->chars = NULL;
488   work_mbc->ch_classes = NULL;
489   work_mbc->range_sts = work_mbc->range_ends = NULL;
490   work_mbc->equivs = work_mbc->coll_elems = NULL;
491 
492   wc = fetch_wc(_("Unbalanced ["));
493   if (wc == L'^')
494     {
495       wc = fetch_wc(_("Unbalanced ["));
496       work_mbc->invert = 1;
497     }
498   else
499     work_mbc->invert = 0;
500   do
501     {
502       wc1 = -1; /* mark wc1 is not initialized".  */
503 
504       /* Note that if we're looking at some other [:...:] construct,
505 	 we just treat it as a bunch of ordinary characters.  We can do
506 	 this because we assume regex has checked for syntax errors before
507 	 dfa is ever called. */
508       if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
509 	{
510 #define BRACKET_BUFFER_SIZE 128
511 	  char str[BRACKET_BUFFER_SIZE];
512 	  wc1 = wc;
513 	  wc = fetch_wc(_("Unbalanced ["));
514 
515 	  /* If pattern contains `[[:', `[[.', or `[[='.  */
516 	  if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
517 	    {
518 	      unsigned char c;
519 	      unsigned char delim = (unsigned char)wc;
520 	      int len = 0;
521 	      for (;;)
522 		{
523 		  if (! lexleft)
524 		    dfaerror (_("Unbalanced ["));
525 		  c = (unsigned char) *lexptr++;
526 		  --lexleft;
527 
528 		  if ((c == delim && *lexptr == ']') || lexleft == 0)
529 		    break;
530 		  if (len < BRACKET_BUFFER_SIZE)
531 		    str[len++] = c;
532 		  else
533 		    /* This is in any case an invalid class name.  */
534 		    str[0] = '\0';
535 		}
536 	      str[len] = '\0';
537 
538 	      if (lexleft == 0)
539 		{
540 		  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
541 				       work_mbc->nchars + 2);
542 		  work_mbc->chars[work_mbc->nchars++] = L'[';
543 		  work_mbc->chars[work_mbc->nchars++] = delim;
544 		  break;
545 		}
546 
547 	      if (--lexleft, *lexptr++ != ']')
548 		dfaerror (_("Unbalanced ["));
549 	      if (delim == ':')
550 		/* build character class.  */
551 		{
552 		  wctype_t wt;
553 		  /* Query the character class as wctype_t.  */
554 		  wt = wctype (str);
555 
556 		  if (ch_classes_al == 0)
557 		    MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
558 		  REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
559 				       ch_classes_al,
560 				       work_mbc->nch_classes + 1);
561 		  work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
562 
563  		}
564 	      else if (delim == '=' || delim == '.')
565 		{
566 		  char *elem;
567 		  MALLOC(elem, char, len + 1);
568 		  strncpy(elem, str, len + 1);
569 
570 		  if (delim == '=')
571 		    /* build equivalent class.  */
572 		    {
573 		      if (equivs_al == 0)
574 			MALLOC(work_mbc->equivs, char*, ++equivs_al);
575 		      REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
576 					   equivs_al,
577 					   work_mbc->nequivs + 1);
578 		      work_mbc->equivs[work_mbc->nequivs++] = elem;
579 		    }
580 
581 		  if (delim == '.')
582 		    /* build collating element.  */
583 		    {
584 		      if (coll_elems_al == 0)
585 			MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
586 		      REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
587 					   coll_elems_al,
588 					   work_mbc->ncoll_elems + 1);
589 		      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
590 		    }
591  		}
592 	      wc = -1;
593 	    }
594 	  else
595 	    /* We treat '[' as a normal character here.  */
596 	    {
597 	      wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
598 	    }
599 	}
600       else
601 	{
602 	  if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
603 	    wc = fetch_wc(("Unbalanced ["));
604 	}
605 
606       if (wc1 == -1)
607 	wc1 = fetch_wc(_("Unbalanced ["));
608 
609       if (wc1 == L'-')
610 	/* build range characters.  */
611 	{
612 	  wc2 = fetch_wc(_("Unbalanced ["));
613 	  if (wc2 == L']')
614 	    {
615 	      /* In the case [x-], the - is an ordinary hyphen,
616 		 which is left in c1, the lookahead character. */
617 	      lexptr -= cur_mb_len;
618 	      lexleft += cur_mb_len;
619 	      wc2 = wc;
620 	    }
621 	  else
622 	    {
623 	      if (wc2 == L'\\'
624 		  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
625 		wc2 = fetch_wc(_("Unbalanced ["));
626 	      wc1 = fetch_wc(_("Unbalanced ["));
627 	    }
628 
629 	  if (range_sts_al == 0)
630 	    {
631 	      MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
632 	      MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
633 	    }
634 	  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
635 			       range_sts_al, work_mbc->nranges + 1);
636 	  work_mbc->range_sts[work_mbc->nranges] = wc;
637 	  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
638 			       range_ends_al, work_mbc->nranges + 1);
639 	  work_mbc->range_ends[work_mbc->nranges++] = wc2;
640 	}
641       else if (wc != -1)
642 	/* build normal characters.  */
643 	{
644 	  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
645 			       work_mbc->nchars + 1);
646 	  work_mbc->chars[work_mbc->nchars++] = wc;
647 	}
648     }
649   while ((wc = wc1) != L']');
650 }
651 #endif /* MBS_SUPPORT */
652 
653 #ifdef __STDC__
654 #define FUNC(F, P) static int F(int c) { return P(c); }
655 #else
656 #define FUNC(F, P) static int F(c) int c; { return P(c); }
657 #endif
658 
FUNC(is_alpha,ISALPHA)659 FUNC(is_alpha, ISALPHA)
660 FUNC(is_upper, ISUPPER)
661 FUNC(is_lower, ISLOWER)
662 FUNC(is_digit, ISDIGIT)
663 FUNC(is_xdigit, ISXDIGIT)
664 FUNC(is_space, ISSPACE)
665 FUNC(is_punct, ISPUNCT)
666 FUNC(is_alnum, ISALNUM)
667 FUNC(is_print, ISPRINT)
668 FUNC(is_graph, ISGRAPH)
669 FUNC(is_cntrl, ISCNTRL)
670 
671 static int
672 is_blank (int c)
673 {
674    return (c == ' ' || c == '\t');
675 }
676 
677 /* The following list maps the names of the Posix named character classes
678    to predicate functions that determine whether a given character is in
679    the class.  The leading [ has already been eaten by the lexical analyzer. */
680 static struct {
681   const char *name;
682   int (*pred) PARAMS ((int));
683 } const prednames[] = {
684   { ":alpha:]", is_alpha },
685   { ":upper:]", is_upper },
686   { ":lower:]", is_lower },
687   { ":digit:]", is_digit },
688   { ":xdigit:]", is_xdigit },
689   { ":space:]", is_space },
690   { ":punct:]", is_punct },
691   { ":alnum:]", is_alnum },
692   { ":print:]", is_print },
693   { ":graph:]", is_graph },
694   { ":cntrl:]", is_cntrl },
695   { ":blank:]", is_blank },
696   { 0 }
697 };
698 
699 /* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */
700 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
701 
702 static int
looking_at(char const * s)703 looking_at (char const *s)
704 {
705   size_t len;
706 
707   len = strlen(s);
708   if (lexleft < len)
709     return 0;
710   return strncmp(s, lexptr, len) == 0;
711 }
712 
713 static token
lex(void)714 lex (void)
715 {
716   unsigned c, c1, c2;
717   int backslash = 0, invert;
718   charclass ccl;
719   int i;
720 
721   /* Basic plan: We fetch a character.  If it's a backslash,
722      we set the backslash flag and go through the loop again.
723      On the plus side, this avoids having a duplicate of the
724      main switch inside the backslash case.  On the minus side,
725      it means that just about every case begins with
726      "if (backslash) ...".  */
727   for (i = 0; i < 2; ++i)
728     {
729       FETCH(c, 0);
730 #ifdef MBS_SUPPORT
731       if (MB_CUR_MAX > 1 && cur_mb_index)
732 	/* If this is a part of a multi-byte character, we must treat
733 	   this byte data as a normal character.
734 	   e.g. In case of SJIS encoding, some character contains '\',
735 	        but they must not be backslash.  */
736 	goto normal_char;
737 #endif /* MBS_SUPPORT  */
738       switch (c)
739 	{
740 	case '\\':
741 	  if (backslash)
742 	    goto normal_char;
743 	  if (lexleft == 0)
744 	    dfaerror(_("Unfinished \\ escape"));
745 	  backslash = 1;
746 	  break;
747 
748 	case '^':
749 	  if (backslash)
750 	    goto normal_char;
751 	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
752 	      || lasttok == END
753 	      || lasttok == LPAREN
754 	      || lasttok == OR)
755 	    return lasttok = BEGLINE;
756 	  goto normal_char;
757 
758 	case '$':
759 	  if (backslash)
760 	    goto normal_char;
761 	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
762 	      || lexleft == 0
763 	      || (syntax_bits & RE_NO_BK_PARENS
764 		  ? lexleft > 0 && *lexptr == ')'
765 		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
766 	      || (syntax_bits & RE_NO_BK_VBAR
767 		  ? lexleft > 0 && *lexptr == '|'
768 		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
769 	      || ((syntax_bits & RE_NEWLINE_ALT)
770 	          && lexleft > 0 && *lexptr == '\n'))
771 	    return lasttok = ENDLINE;
772 	  goto normal_char;
773 
774 	case '1':
775 	case '2':
776 	case '3':
777 	case '4':
778 	case '5':
779 	case '6':
780 	case '7':
781 	case '8':
782 	case '9':
783 	  if (backslash && !(syntax_bits & RE_NO_BK_REFS))
784 	    {
785 	      laststart = 0;
786 	      return lasttok = BACKREF;
787 	    }
788 	  goto normal_char;
789 
790 	case '`':
791 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
792 	    return lasttok = BEGLINE;	/* FIXME: should be beginning of string */
793 	  goto normal_char;
794 
795 	case '\'':
796 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
797 	    return lasttok = ENDLINE;	/* FIXME: should be end of string */
798 	  goto normal_char;
799 
800 	case '<':
801 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
802 	    return lasttok = BEGWORD;
803 	  goto normal_char;
804 
805 	case '>':
806 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
807 	    return lasttok = ENDWORD;
808 	  goto normal_char;
809 
810 	case 'b':
811 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
812 	    return lasttok = LIMWORD;
813 	  goto normal_char;
814 
815 	case 'B':
816 	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
817 	    return lasttok = NOTLIMWORD;
818 	  goto normal_char;
819 
820 	case '?':
821 	  if (syntax_bits & RE_LIMITED_OPS)
822 	    goto normal_char;
823 	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
824 	    goto normal_char;
825 	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
826 	    goto normal_char;
827 	  return lasttok = QMARK;
828 
829 	case '*':
830 	  if (backslash)
831 	    goto normal_char;
832 	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
833 	    goto normal_char;
834 	  return lasttok = STAR;
835 
836 	case '+':
837 	  if (syntax_bits & RE_LIMITED_OPS)
838 	    goto normal_char;
839 	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
840 	    goto normal_char;
841 	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
842 	    goto normal_char;
843 	  return lasttok = PLUS;
844 
845 	case '{':
846 	  if (!(syntax_bits & RE_INTERVALS))
847 	    goto normal_char;
848 	  if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
849 	    goto normal_char;
850 	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
851 	    goto normal_char;
852 
853 	  if (syntax_bits & RE_NO_BK_BRACES)
854 	    {
855 	      /* Scan ahead for a valid interval; if it's not valid,
856 		 treat it as a literal '{'.  */
857 	      int lo = -1, hi = -1;
858 	      char const *p = lexptr;
859 	      char const *lim = p + lexleft;
860 	      for (;  p != lim && ISASCIIDIGIT (*p);  p++)
861 		lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
862 	      if (p != lim && *p == ',')
863 		while (++p != lim && ISASCIIDIGIT (*p))
864 		  hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
865 	      else
866 		hi = lo;
867 	      if (p == lim || *p != '}'
868 		  || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
869 		goto normal_char;
870 	    }
871 
872 	  minrep = 0;
873 	  /* Cases:
874 	     {M} - exact count
875 	     {M,} - minimum count, maximum is infinity
876 	     {M,N} - M through N */
877 	  FETCH(c, _("unfinished repeat count"));
878 	  if (ISASCIIDIGIT (c))
879 	    {
880 	      minrep = c - '0';
881 	      for (;;)
882 		{
883 		  FETCH(c, _("unfinished repeat count"));
884 		  if (! ISASCIIDIGIT (c))
885 		    break;
886 		  minrep = 10 * minrep + c - '0';
887 		}
888 	    }
889 	  else
890 	    dfaerror(_("malformed repeat count"));
891 	  if (c == ',')
892 	    {
893 	      FETCH (c, _("unfinished repeat count"));
894 	      if (! ISASCIIDIGIT (c))
895 		maxrep = -1;
896 	      else
897 		{
898 		  maxrep = c - '0';
899 		  for (;;)
900 		    {
901 		      FETCH (c, _("unfinished repeat count"));
902 		      if (! ISASCIIDIGIT (c))
903 			break;
904 		      maxrep = 10 * maxrep + c - '0';
905 		    }
906 		  if (0 <= maxrep && maxrep < minrep)
907 		    dfaerror (_("malformed repeat count"));
908 		}
909 	    }
910 	  else
911 	    maxrep = minrep;
912 	  if (!(syntax_bits & RE_NO_BK_BRACES))
913 	    {
914 	      if (c != '\\')
915 		dfaerror(_("malformed repeat count"));
916 	      FETCH(c, _("unfinished repeat count"));
917 	    }
918 	  if (c != '}')
919 	    dfaerror(_("malformed repeat count"));
920 	  laststart = 0;
921 	  return lasttok = REPMN;
922 
923 	case '|':
924 	  if (syntax_bits & RE_LIMITED_OPS)
925 	    goto normal_char;
926 	  if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
927 	    goto normal_char;
928 	  laststart = 1;
929 	  return lasttok = OR;
930 
931 	case '\n':
932 	  if (syntax_bits & RE_LIMITED_OPS
933 	      || backslash
934 	      || !(syntax_bits & RE_NEWLINE_ALT))
935 	    goto normal_char;
936 	  laststart = 1;
937 	  return lasttok = OR;
938 
939 	case '(':
940 	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
941 	    goto normal_char;
942 	  ++parens;
943 	  laststart = 1;
944 	  return lasttok = LPAREN;
945 
946 	case ')':
947 	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
948 	    goto normal_char;
949 	  if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
950 	    goto normal_char;
951 	  --parens;
952 	  laststart = 0;
953 	  return lasttok = RPAREN;
954 
955 	case '.':
956 	  if (backslash)
957 	    goto normal_char;
958 #ifdef MBS_SUPPORT
959 	  if (MB_CUR_MAX > 1)
960 	    {
961 	      /* In multibyte environment period must match with a single
962 		 character not a byte.  So we use ANYCHAR.  */
963 	      laststart = 0;
964 	      return lasttok = ANYCHAR;
965 	    }
966 #endif /* MBS_SUPPORT */
967 	  zeroset(ccl);
968 	  notset(ccl);
969 	  if (!(syntax_bits & RE_DOT_NEWLINE))
970 	    clrbit(eolbyte, ccl);
971 	  if (syntax_bits & RE_DOT_NOT_NULL)
972 	    clrbit('\0', ccl);
973 	  laststart = 0;
974 	  return lasttok = CSET + charclass_index(ccl);
975 
976 	case 'w':
977 	case 'W':
978 	  if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
979 	    goto normal_char;
980 	  zeroset(ccl);
981 	  for (c2 = 0; c2 < NOTCHAR; ++c2)
982 	    if (IS_WORD_CONSTITUENT(c2))
983 	      setbit(c2, ccl);
984 	  if (c == 'W')
985 	    notset(ccl);
986 	  laststart = 0;
987 	  return lasttok = CSET + charclass_index(ccl);
988 
989 	case '[':
990 	  if (backslash)
991 	    goto normal_char;
992 	  laststart = 0;
993 #ifdef MBS_SUPPORT
994 	  if (MB_CUR_MAX > 1)
995 	    {
996 	      /* In multibyte environment a bracket expression may contain
997 		 multibyte characters, which must be treated as characters
998 		 (not bytes).  So we parse it by parse_bracket_exp_mb().  */
999 	      parse_bracket_exp_mb();
1000 	      return lasttok = MBCSET;
1001 	    }
1002 #endif
1003 	  zeroset(ccl);
1004 	  FETCH(c, _("Unbalanced ["));
1005 	  if (c == '^')
1006 	    {
1007 	      FETCH(c, _("Unbalanced ["));
1008 	      invert = 1;
1009 	    }
1010 	  else
1011 	    invert = 0;
1012 	  do
1013 	    {
1014 	      /* Nobody ever said this had to be fast. :-)
1015 		 Note that if we're looking at some other [:...:]
1016 		 construct, we just treat it as a bunch of ordinary
1017 		 characters.  We can do this because we assume
1018 		 regex has checked for syntax errors before
1019 		 dfa is ever called. */
1020 	      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
1021 		for (c1 = 0; prednames[c1].name; ++c1)
1022 		  if (looking_at(prednames[c1].name))
1023 		    {
1024 		      int (*pred) PARAMS ((int)) = prednames[c1].pred;
1025 
1026 		      for (c2 = 0; c2 < NOTCHAR; ++c2)
1027 			if ((*pred)(c2))
1028 			  setbit_case_fold (c2, ccl);
1029 		      lexptr += strlen(prednames[c1].name);
1030 		      lexleft -= strlen(prednames[c1].name);
1031 		      FETCH(c1, _("Unbalanced ["));
1032 		      goto skip;
1033 		    }
1034 	      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1035 		FETCH(c, _("Unbalanced ["));
1036 	      FETCH(c1, _("Unbalanced ["));
1037 	      if (c1 == '-')
1038 		{
1039 		  FETCH(c2, _("Unbalanced ["));
1040 		  if (c2 == ']')
1041 		    {
1042 		      /* In the case [x-], the - is an ordinary hyphen,
1043 			 which is left in c1, the lookahead character. */
1044 		      --lexptr;
1045 		      ++lexleft;
1046 		    }
1047 		  else
1048 		    {
1049 		      if (c2 == '\\'
1050 			  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1051 			FETCH(c2, _("Unbalanced ["));
1052 		      FETCH(c1, _("Unbalanced ["));
1053 		      if (!hard_LC_COLLATE) {
1054 		        for (; c <= c2; c++)
1055 			  setbit_case_fold (c, ccl);
1056 		      } else {
1057 			/* POSIX locales are painful - leave the decision to libc */
1058 			char expr[6] = { '[', c, '-', c2, ']', '\0' };
1059 			regex_t re;
1060 			if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1061 			  for (c = 0; c < NOTCHAR; ++c) {
1062 			    char buf[2] = { c, '\0' };
1063 			    regmatch_t mat;
1064 			    if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1065                                && mat.rm_so == 0 && mat.rm_eo == 1)
1066                               setbit_case_fold (c, ccl);
1067 			  }
1068 			  regfree (&re);
1069 			}
1070 		      }
1071 		      continue;
1072 		    }
1073 		}
1074 
1075 	      setbit_case_fold (c, ccl);
1076 
1077 	    skip:
1078 	      ;
1079 	    }
1080 	  while ((c = c1) != ']');
1081 	  if (invert)
1082 	    {
1083 	      notset(ccl);
1084 	      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1085 		clrbit(eolbyte, ccl);
1086 	    }
1087 	  return lasttok = CSET + charclass_index(ccl);
1088 
1089 	default:
1090 	normal_char:
1091 	  laststart = 0;
1092 	  if (case_fold && ISALPHA(c))
1093 	    {
1094 	      zeroset(ccl);
1095 	      setbit_case_fold (c, ccl);
1096 	      return lasttok = CSET + charclass_index(ccl);
1097 	    }
1098 	  return c;
1099 	}
1100     }
1101 
1102   /* The above loop should consume at most a backslash
1103      and some other character. */
1104   abort();
1105   return END;	/* keeps pedantic compilers happy. */
1106 }
1107 
1108 /* Recursive descent parser for regular expressions. */
1109 
1110 static token tok;		/* Lookahead token. */
1111 static int depth;		/* Current depth of a hypothetical stack
1112 				   holding deferred productions.  This is
1113 				   used to determine the depth that will be
1114 				   required of the real stack later on in
1115 				   dfaanalyze(). */
1116 
1117 /* Add the given token to the parse tree, maintaining the depth count and
1118    updating the maximum depth if necessary. */
1119 static void
addtok(token t)1120 addtok (token t)
1121 {
1122 #ifdef MBS_SUPPORT
1123   if (MB_CUR_MAX > 1)
1124     {
1125       REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1126 			   dfa->tindex);
1127       /* Set dfa->multibyte_prop.  See struct dfa in dfa.h.  */
1128       if (t == MBCSET)
1129 	dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1130       else if (t < NOTCHAR)
1131 	dfa->multibyte_prop[dfa->tindex]
1132 	  = (cur_mb_len == 1)? 3 /* single-byte char */
1133 	  : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1134 	     + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1135       else
1136 	/* It may be unnecesssary, but it is safer to treat other
1137 	   symbols as singlebyte characters.  */
1138 	dfa->multibyte_prop[dfa->tindex] = 3;
1139     }
1140 #endif
1141 
1142   REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1143   dfa->tokens[dfa->tindex++] = t;
1144 
1145   switch (t)
1146     {
1147     case QMARK:
1148     case STAR:
1149     case PLUS:
1150       break;
1151 
1152     case CAT:
1153     case OR:
1154     case ORTOP:
1155       --depth;
1156       break;
1157 
1158     default:
1159       ++dfa->nleaves;
1160     case EMPTY:
1161       ++depth;
1162       break;
1163     }
1164   if (depth > dfa->depth)
1165     dfa->depth = depth;
1166 }
1167 
1168 /* The grammar understood by the parser is as follows.
1169 
1170    regexp:
1171      regexp OR branch
1172      branch
1173 
1174    branch:
1175      branch closure
1176      closure
1177 
1178    closure:
1179      closure QMARK
1180      closure STAR
1181      closure PLUS
1182      closure REPMN
1183      atom
1184 
1185    atom:
1186      <normal character>
1187      <multibyte character>
1188      ANYCHAR
1189      MBCSET
1190      CSET
1191      BACKREF
1192      BEGLINE
1193      ENDLINE
1194      BEGWORD
1195      ENDWORD
1196      LIMWORD
1197      NOTLIMWORD
1198      CRANGE
1199      LPAREN regexp RPAREN
1200      <empty>
1201 
1202    The parser builds a parse tree in postfix form in an array of tokens. */
1203 
1204 static void
atom(void)1205 atom (void)
1206 {
1207   if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1208       || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1209 #ifdef MBS_SUPPORT
1210       || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1211 #endif /* MBS_SUPPORT */
1212       || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1213     {
1214       addtok(tok);
1215       tok = lex();
1216 #ifdef MBS_SUPPORT
1217       /* We treat a multibyte character as a single atom, so that DFA
1218 	 can treat a multibyte character as a single expression.
1219 
1220          e.g. We construct following tree from "<mb1><mb2>".
1221               <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1222               <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1223       */
1224       if (MB_CUR_MAX > 1)
1225 	{
1226 	  while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1227 	    {
1228 	      addtok(tok);
1229 	      addtok(CAT);
1230 	      tok = lex();
1231 	    }
1232 	}
1233 #endif /* MBS_SUPPORT  */
1234     }
1235   else if (tok == CRANGE)
1236     {
1237       /* A character range like "[a-z]" in a locale other than "C" or
1238 	 "POSIX".  This range might any sequence of one or more
1239 	 characters.  Unfortunately the POSIX locale primitives give
1240 	 us no practical way to find what character sequences might be
1241 	 matched.  Treat this approximately like "(.\1)" -- i.e. match
1242 	 one character, and then punt to the full matcher.  */
1243       charclass ccl;
1244       zeroset (ccl);
1245       notset (ccl);
1246       addtok (CSET + charclass_index (ccl));
1247       addtok (BACKREF);
1248       addtok (CAT);
1249       tok = lex ();
1250     }
1251   else if (tok == LPAREN)
1252     {
1253       tok = lex();
1254       regexp(0);
1255       if (tok != RPAREN)
1256 	dfaerror(_("Unbalanced ("));
1257       tok = lex();
1258     }
1259   else
1260     addtok(EMPTY);
1261 }
1262 
1263 /* Return the number of tokens in the given subexpression. */
1264 static int
nsubtoks(int tindex)1265 nsubtoks (int tindex)
1266 {
1267   int ntoks1;
1268 
1269   switch (dfa->tokens[tindex - 1])
1270     {
1271     default:
1272       return 1;
1273     case QMARK:
1274     case STAR:
1275     case PLUS:
1276       return 1 + nsubtoks(tindex - 1);
1277     case CAT:
1278     case OR:
1279     case ORTOP:
1280       ntoks1 = nsubtoks(tindex - 1);
1281       return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1282     }
1283 }
1284 
1285 /* Copy the given subexpression to the top of the tree. */
1286 static void
copytoks(int tindex,int ntokens)1287 copytoks (int tindex, int ntokens)
1288 {
1289   int i;
1290 
1291   for (i = 0; i < ntokens; ++i)
1292     addtok(dfa->tokens[tindex + i]);
1293 }
1294 
1295 static void
closure(void)1296 closure (void)
1297 {
1298   int tindex, ntokens, i;
1299 
1300   atom();
1301   while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1302     if (tok == REPMN)
1303       {
1304 	ntokens = nsubtoks(dfa->tindex);
1305 	tindex = dfa->tindex - ntokens;
1306 	if (maxrep < 0)
1307 	  addtok(PLUS);
1308 	if (minrep == 0)
1309 	  addtok(QMARK);
1310 	for (i = 1; i < minrep; ++i)
1311 	  {
1312 	    copytoks(tindex, ntokens);
1313 	    addtok(CAT);
1314 	  }
1315 	for (; i < maxrep; ++i)
1316 	  {
1317 	    copytoks(tindex, ntokens);
1318 	    addtok(QMARK);
1319 	    addtok(CAT);
1320 	  }
1321 	tok = lex();
1322       }
1323     else
1324       {
1325 	addtok(tok);
1326 	tok = lex();
1327       }
1328 }
1329 
1330 static void
branch(void)1331 branch (void)
1332 {
1333   closure();
1334   while (tok != RPAREN && tok != OR && tok >= 0)
1335     {
1336       closure();
1337       addtok(CAT);
1338     }
1339 }
1340 
1341 static void
regexp(int toplevel)1342 regexp (int toplevel)
1343 {
1344   branch();
1345   while (tok == OR)
1346     {
1347       tok = lex();
1348       branch();
1349       if (toplevel)
1350 	addtok(ORTOP);
1351       else
1352 	addtok(OR);
1353     }
1354 }
1355 
1356 /* Main entry point for the parser.  S is a string to be parsed, len is the
1357    length of the string, so s can include NUL characters.  D is a pointer to
1358    the struct dfa to parse into. */
1359 void
dfaparse(char const * s,size_t len,struct dfa * d)1360 dfaparse (char const *s, size_t len, struct dfa *d)
1361 {
1362   dfa = d;
1363   lexstart = lexptr = s;
1364   lexleft = len;
1365   lasttok = END;
1366   laststart = 1;
1367   parens = 0;
1368 #if ENABLE_NLS
1369   hard_LC_COLLATE = hard_locale (LC_COLLATE);
1370 #endif
1371 #ifdef MBS_SUPPORT
1372   if (MB_CUR_MAX > 1)
1373     {
1374       cur_mb_index = 0;
1375       cur_mb_len = 0;
1376       memset(&mbs, 0, sizeof(mbstate_t));
1377     }
1378 #endif /* MBS_SUPPORT  */
1379 
1380   if (! syntax_bits_set)
1381     dfaerror(_("No syntax specified"));
1382 
1383   tok = lex();
1384   depth = d->depth;
1385 
1386   regexp(1);
1387 
1388   if (tok != END)
1389     dfaerror(_("Unbalanced )"));
1390 
1391   addtok(END - d->nregexps);
1392   addtok(CAT);
1393 
1394   if (d->nregexps)
1395     addtok(ORTOP);
1396 
1397   ++d->nregexps;
1398 }
1399 
1400 /* Some primitives for operating on sets of positions. */
1401 
1402 /* Copy one set to another; the destination must be large enough. */
1403 static void
copy(position_set const * src,position_set * dst)1404 copy (position_set const *src, position_set *dst)
1405 {
1406   int i;
1407 
1408   for (i = 0; i < src->nelem; ++i)
1409     dst->elems[i] = src->elems[i];
1410   dst->nelem = src->nelem;
1411 }
1412 
1413 /* Insert a position in a set.  Position sets are maintained in sorted
1414    order according to index.  If position already exists in the set with
1415    the same index then their constraints are logically or'd together.
1416    S->elems must point to an array large enough to hold the resulting set. */
1417 static void
insert(position p,position_set * s)1418 insert (position p, position_set *s)
1419 {
1420   int i;
1421   position t1, t2;
1422 
1423   for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1424     continue;
1425   if (i < s->nelem && p.index == s->elems[i].index)
1426     s->elems[i].constraint |= p.constraint;
1427   else
1428     {
1429       t1 = p;
1430       ++s->nelem;
1431       while (i < s->nelem)
1432 	{
1433 	  t2 = s->elems[i];
1434 	  s->elems[i++] = t1;
1435 	  t1 = t2;
1436 	}
1437     }
1438 }
1439 
1440 /* Merge two sets of positions into a third.  The result is exactly as if
1441    the positions of both sets were inserted into an initially empty set. */
1442 static void
merge(position_set const * s1,position_set const * s2,position_set * m)1443 merge (position_set const *s1, position_set const *s2, position_set *m)
1444 {
1445   int i = 0, j = 0;
1446 
1447   m->nelem = 0;
1448   while (i < s1->nelem && j < s2->nelem)
1449     if (s1->elems[i].index > s2->elems[j].index)
1450       m->elems[m->nelem++] = s1->elems[i++];
1451     else if (s1->elems[i].index < s2->elems[j].index)
1452       m->elems[m->nelem++] = s2->elems[j++];
1453     else
1454       {
1455 	m->elems[m->nelem] = s1->elems[i++];
1456 	m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1457       }
1458   while (i < s1->nelem)
1459     m->elems[m->nelem++] = s1->elems[i++];
1460   while (j < s2->nelem)
1461     m->elems[m->nelem++] = s2->elems[j++];
1462 }
1463 
1464 /* Delete a position from a set. */
1465 static void
delete(position p,position_set * s)1466 delete (position p, position_set *s)
1467 {
1468   int i;
1469 
1470   for (i = 0; i < s->nelem; ++i)
1471     if (p.index == s->elems[i].index)
1472       break;
1473   if (i < s->nelem)
1474     for (--s->nelem; i < s->nelem; ++i)
1475       s->elems[i] = s->elems[i + 1];
1476 }
1477 
1478 /* Find the index of the state corresponding to the given position set with
1479    the given preceding context, or create a new state if there is no such
1480    state.  Newline and letter tell whether we got here on a newline or
1481    letter, respectively. */
1482 static int
state_index(struct dfa * d,position_set const * s,int newline,int letter)1483 state_index (struct dfa *d, position_set const *s, int newline, int letter)
1484 {
1485   int hash = 0;
1486   int constraint;
1487   int i, j;
1488 
1489   newline = newline ? 1 : 0;
1490   letter = letter ? 1 : 0;
1491 
1492   for (i = 0; i < s->nelem; ++i)
1493     hash ^= s->elems[i].index + s->elems[i].constraint;
1494 
1495   /* Try to find a state that exactly matches the proposed one. */
1496   for (i = 0; i < d->sindex; ++i)
1497     {
1498       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1499 	  || newline != d->states[i].newline || letter != d->states[i].letter)
1500 	continue;
1501       for (j = 0; j < s->nelem; ++j)
1502 	if (s->elems[j].constraint
1503 	    != d->states[i].elems.elems[j].constraint
1504 	    || s->elems[j].index != d->states[i].elems.elems[j].index)
1505 	  break;
1506       if (j == s->nelem)
1507 	return i;
1508     }
1509 
1510   /* We'll have to create a new state. */
1511   REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1512   d->states[i].hash = hash;
1513   MALLOC(d->states[i].elems.elems, position, s->nelem);
1514   copy(s, &d->states[i].elems);
1515   d->states[i].newline = newline;
1516   d->states[i].letter = letter;
1517   d->states[i].backref = 0;
1518   d->states[i].constraint = 0;
1519   d->states[i].first_end = 0;
1520 #ifdef MBS_SUPPORT
1521   if (MB_CUR_MAX > 1)
1522     d->states[i].mbps.nelem = 0;
1523 #endif
1524   for (j = 0; j < s->nelem; ++j)
1525     if (d->tokens[s->elems[j].index] < 0)
1526       {
1527 	constraint = s->elems[j].constraint;
1528 	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1529 	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1530 	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1531 	    || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1532 	  d->states[i].constraint |= constraint;
1533 	if (! d->states[i].first_end)
1534 	  d->states[i].first_end = d->tokens[s->elems[j].index];
1535       }
1536     else if (d->tokens[s->elems[j].index] == BACKREF)
1537       {
1538 	d->states[i].constraint = NO_CONSTRAINT;
1539 	d->states[i].backref = 1;
1540       }
1541 
1542   ++d->sindex;
1543 
1544   return i;
1545 }
1546 
1547 /* Find the epsilon closure of a set of positions.  If any position of the set
1548    contains a symbol that matches the empty string in some context, replace
1549    that position with the elements of its follow labeled with an appropriate
1550    constraint.  Repeat exhaustively until no funny positions are left.
1551    S->elems must be large enough to hold the result. */
1552 static void
epsclosure(position_set * s,struct dfa const * d)1553 epsclosure (position_set *s, struct dfa const *d)
1554 {
1555   int i, j;
1556   int *visited;
1557   position p, old;
1558 
1559   MALLOC(visited, int, d->tindex);
1560   for (i = 0; i < d->tindex; ++i)
1561     visited[i] = 0;
1562 
1563   for (i = 0; i < s->nelem; ++i)
1564     if (d->tokens[s->elems[i].index] >= NOTCHAR
1565 	&& d->tokens[s->elems[i].index] != BACKREF
1566 #ifdef MBS_SUPPORT
1567 	&& d->tokens[s->elems[i].index] != ANYCHAR
1568 	&& d->tokens[s->elems[i].index] != MBCSET
1569 #endif
1570 	&& d->tokens[s->elems[i].index] < CSET)
1571       {
1572 	old = s->elems[i];
1573 	p.constraint = old.constraint;
1574 	delete(s->elems[i], s);
1575 	if (visited[old.index])
1576 	  {
1577 	    --i;
1578 	    continue;
1579 	  }
1580 	visited[old.index] = 1;
1581 	switch (d->tokens[old.index])
1582 	  {
1583 	  case BEGLINE:
1584 	    p.constraint &= BEGLINE_CONSTRAINT;
1585 	    break;
1586 	  case ENDLINE:
1587 	    p.constraint &= ENDLINE_CONSTRAINT;
1588 	    break;
1589 	  case BEGWORD:
1590 	    p.constraint &= BEGWORD_CONSTRAINT;
1591 	    break;
1592 	  case ENDWORD:
1593 	    p.constraint &= ENDWORD_CONSTRAINT;
1594 	    break;
1595 	  case LIMWORD:
1596 	    p.constraint &= LIMWORD_CONSTRAINT;
1597 	    break;
1598 	  case NOTLIMWORD:
1599 	    p.constraint &= NOTLIMWORD_CONSTRAINT;
1600 	    break;
1601 	  default:
1602 	    break;
1603 	  }
1604 	for (j = 0; j < d->follows[old.index].nelem; ++j)
1605 	  {
1606 	    p.index = d->follows[old.index].elems[j].index;
1607 	    insert(p, s);
1608 	  }
1609 	/* Force rescan to start at the beginning. */
1610 	i = -1;
1611       }
1612 
1613   free(visited);
1614 }
1615 
1616 /* Perform bottom-up analysis on the parse tree, computing various functions.
1617    Note that at this point, we're pretending constructs like \< are real
1618    characters rather than constraints on what can follow them.
1619 
1620    Nullable:  A node is nullable if it is at the root of a regexp that can
1621    match the empty string.
1622    *  EMPTY leaves are nullable.
1623    * No other leaf is nullable.
1624    * A QMARK or STAR node is nullable.
1625    * A PLUS node is nullable if its argument is nullable.
1626    * A CAT node is nullable if both its arguments are nullable.
1627    * An OR node is nullable if either argument is nullable.
1628 
1629    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
1630    that could correspond to the first character of a string matching the
1631    regexp rooted at the given node.
1632    * EMPTY leaves have empty firstpos.
1633    * The firstpos of a nonempty leaf is that leaf itself.
1634    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1635      argument.
1636    * The firstpos of a CAT node is the firstpos of the left argument, union
1637      the firstpos of the right if the left argument is nullable.
1638    * The firstpos of an OR node is the union of firstpos of each argument.
1639 
1640    Lastpos:  The lastpos of a node is the set of positions that could
1641    correspond to the last character of a string matching the regexp at
1642    the given node.
1643    * EMPTY leaves have empty lastpos.
1644    * The lastpos of a nonempty leaf is that leaf itself.
1645    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1646      argument.
1647    * The lastpos of a CAT node is the lastpos of its right argument, union
1648      the lastpos of the left if the right argument is nullable.
1649    * The lastpos of an OR node is the union of the lastpos of each argument.
1650 
1651    Follow:  The follow of a position is the set of positions that could
1652    correspond to the character following a character matching the node in
1653    a string matching the regexp.  At this point we consider special symbols
1654    that match the empty string in some context to be just normal characters.
1655    Later, if we find that a special symbol is in a follow set, we will
1656    replace it with the elements of its follow, labeled with an appropriate
1657    constraint.
1658    * Every node in the firstpos of the argument of a STAR or PLUS node is in
1659      the follow of every node in the lastpos.
1660    * Every node in the firstpos of the second argument of a CAT node is in
1661      the follow of every node in the lastpos of the first argument.
1662 
1663    Because of the postfix representation of the parse tree, the depth-first
1664    analysis is conveniently done by a linear scan with the aid of a stack.
1665    Sets are stored as arrays of the elements, obeying a stack-like allocation
1666    scheme; the number of elements in each set deeper in the stack can be
1667    used to determine the address of a particular set's array. */
1668 void
dfaanalyze(struct dfa * d,int searchflag)1669 dfaanalyze (struct dfa *d, int searchflag)
1670 {
1671   int *nullable;		/* Nullable stack. */
1672   int *nfirstpos;		/* Element count stack for firstpos sets. */
1673   position *firstpos;		/* Array where firstpos elements are stored. */
1674   int *nlastpos;		/* Element count stack for lastpos sets. */
1675   position *lastpos;		/* Array where lastpos elements are stored. */
1676   int *nalloc;			/* Sizes of arrays allocated to follow sets. */
1677   position_set tmp;		/* Temporary set for merging sets. */
1678   position_set merged;		/* Result of merging sets. */
1679   int wants_newline;		/* True if some position wants newline info. */
1680   int *o_nullable;
1681   int *o_nfirst, *o_nlast;
1682   position *o_firstpos, *o_lastpos;
1683   int i, j;
1684   position *pos;
1685 
1686 #ifdef DEBUG
1687   fprintf(stderr, "dfaanalyze:\n");
1688   for (i = 0; i < d->tindex; ++i)
1689     {
1690       fprintf(stderr, " %d:", i);
1691       prtok(d->tokens[i]);
1692     }
1693   putc('\n', stderr);
1694 #endif
1695 
1696   d->searchflag = searchflag;
1697 
1698   MALLOC(nullable, int, d->depth);
1699   o_nullable = nullable;
1700   MALLOC(nfirstpos, int, d->depth);
1701   o_nfirst = nfirstpos;
1702   MALLOC(firstpos, position, d->nleaves);
1703   o_firstpos = firstpos, firstpos += d->nleaves;
1704   MALLOC(nlastpos, int, d->depth);
1705   o_nlast = nlastpos;
1706   MALLOC(lastpos, position, d->nleaves);
1707   o_lastpos = lastpos, lastpos += d->nleaves;
1708   MALLOC(nalloc, int, d->tindex);
1709   for (i = 0; i < d->tindex; ++i)
1710     nalloc[i] = 0;
1711   MALLOC(merged.elems, position, d->nleaves);
1712 
1713   CALLOC(d->follows, position_set, d->tindex);
1714 
1715   for (i = 0; i < d->tindex; ++i)
1716 #ifdef DEBUG
1717     {				/* Nonsyntactic #ifdef goo... */
1718 #endif
1719     switch (d->tokens[i])
1720       {
1721       case EMPTY:
1722 	/* The empty set is nullable. */
1723 	*nullable++ = 1;
1724 
1725 	/* The firstpos and lastpos of the empty leaf are both empty. */
1726 	*nfirstpos++ = *nlastpos++ = 0;
1727 	break;
1728 
1729       case STAR:
1730       case PLUS:
1731 	/* Every element in the firstpos of the argument is in the follow
1732 	   of every element in the lastpos. */
1733 	tmp.nelem = nfirstpos[-1];
1734 	tmp.elems = firstpos;
1735 	pos = lastpos;
1736 	for (j = 0; j < nlastpos[-1]; ++j)
1737 	  {
1738 	    merge(&tmp, &d->follows[pos[j].index], &merged);
1739 	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1740 				 nalloc[pos[j].index], merged.nelem - 1);
1741 	    copy(&merged, &d->follows[pos[j].index]);
1742 	  }
1743 
1744       case QMARK:
1745 	/* A QMARK or STAR node is automatically nullable. */
1746 	if (d->tokens[i] != PLUS)
1747 	  nullable[-1] = 1;
1748 	break;
1749 
1750       case CAT:
1751 	/* Every element in the firstpos of the second argument is in the
1752 	   follow of every element in the lastpos of the first argument. */
1753 	tmp.nelem = nfirstpos[-1];
1754 	tmp.elems = firstpos;
1755 	pos = lastpos + nlastpos[-1];
1756 	for (j = 0; j < nlastpos[-2]; ++j)
1757 	  {
1758 	    merge(&tmp, &d->follows[pos[j].index], &merged);
1759 	    REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1760 				 nalloc[pos[j].index], merged.nelem - 1);
1761 	    copy(&merged, &d->follows[pos[j].index]);
1762 	  }
1763 
1764 	/* The firstpos of a CAT node is the firstpos of the first argument,
1765 	   union that of the second argument if the first is nullable. */
1766 	if (nullable[-2])
1767 	  nfirstpos[-2] += nfirstpos[-1];
1768 	else
1769 	  firstpos += nfirstpos[-1];
1770 	--nfirstpos;
1771 
1772 	/* The lastpos of a CAT node is the lastpos of the second argument,
1773 	   union that of the first argument if the second is nullable. */
1774 	if (nullable[-1])
1775 	  nlastpos[-2] += nlastpos[-1];
1776 	else
1777 	  {
1778 	    pos = lastpos + nlastpos[-2];
1779 	    for (j = nlastpos[-1] - 1; j >= 0; --j)
1780 	      pos[j] = lastpos[j];
1781 	    lastpos += nlastpos[-2];
1782 	    nlastpos[-2] = nlastpos[-1];
1783 	  }
1784 	--nlastpos;
1785 
1786 	/* A CAT node is nullable if both arguments are nullable. */
1787 	nullable[-2] = nullable[-1] && nullable[-2];
1788 	--nullable;
1789 	break;
1790 
1791       case OR:
1792       case ORTOP:
1793 	/* The firstpos is the union of the firstpos of each argument. */
1794 	nfirstpos[-2] += nfirstpos[-1];
1795 	--nfirstpos;
1796 
1797 	/* The lastpos is the union of the lastpos of each argument. */
1798 	nlastpos[-2] += nlastpos[-1];
1799 	--nlastpos;
1800 
1801 	/* An OR node is nullable if either argument is nullable. */
1802 	nullable[-2] = nullable[-1] || nullable[-2];
1803 	--nullable;
1804 	break;
1805 
1806       default:
1807 	/* Anything else is a nonempty position.  (Note that special
1808 	   constructs like \< are treated as nonempty strings here;
1809 	   an "epsilon closure" effectively makes them nullable later.
1810 	   Backreferences have to get a real position so we can detect
1811 	   transitions on them later.  But they are nullable. */
1812 	*nullable++ = d->tokens[i] == BACKREF;
1813 
1814 	/* This position is in its own firstpos and lastpos. */
1815 	*nfirstpos++ = *nlastpos++ = 1;
1816 	--firstpos, --lastpos;
1817 	firstpos->index = lastpos->index = i;
1818 	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1819 
1820 	/* Allocate the follow set for this position. */
1821 	nalloc[i] = 1;
1822 	MALLOC(d->follows[i].elems, position, nalloc[i]);
1823 	break;
1824       }
1825 #ifdef DEBUG
1826     /* ... balance the above nonsyntactic #ifdef goo... */
1827       fprintf(stderr, "node %d:", i);
1828       prtok(d->tokens[i]);
1829       putc('\n', stderr);
1830       fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1831       fprintf(stderr, " firstpos:");
1832       for (j = nfirstpos[-1] - 1; j >= 0; --j)
1833 	{
1834 	  fprintf(stderr, " %d:", firstpos[j].index);
1835 	  prtok(d->tokens[firstpos[j].index]);
1836 	}
1837       fprintf(stderr, "\n lastpos:");
1838       for (j = nlastpos[-1] - 1; j >= 0; --j)
1839 	{
1840 	  fprintf(stderr, " %d:", lastpos[j].index);
1841 	  prtok(d->tokens[lastpos[j].index]);
1842 	}
1843       putc('\n', stderr);
1844     }
1845 #endif
1846 
1847   /* For each follow set that is the follow set of a real position, replace
1848      it with its epsilon closure. */
1849   for (i = 0; i < d->tindex; ++i)
1850     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1851 #ifdef MBS_SUPPORT
1852         || d->tokens[i] == ANYCHAR
1853         || d->tokens[i] == MBCSET
1854 #endif
1855 	|| d->tokens[i] >= CSET)
1856       {
1857 #ifdef DEBUG
1858 	fprintf(stderr, "follows(%d:", i);
1859 	prtok(d->tokens[i]);
1860 	fprintf(stderr, "):");
1861 	for (j = d->follows[i].nelem - 1; j >= 0; --j)
1862 	  {
1863 	    fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1864 	    prtok(d->tokens[d->follows[i].elems[j].index]);
1865 	  }
1866 	putc('\n', stderr);
1867 #endif
1868 	copy(&d->follows[i], &merged);
1869 	epsclosure(&merged, d);
1870 	if (d->follows[i].nelem < merged.nelem)
1871 	  REALLOC(d->follows[i].elems, position, merged.nelem);
1872 	copy(&merged, &d->follows[i]);
1873       }
1874 
1875   /* Get the epsilon closure of the firstpos of the regexp.  The result will
1876      be the set of positions of state 0. */
1877   merged.nelem = 0;
1878   for (i = 0; i < nfirstpos[-1]; ++i)
1879     insert(firstpos[i], &merged);
1880   epsclosure(&merged, d);
1881 
1882   /* Check if any of the positions of state 0 will want newline context. */
1883   wants_newline = 0;
1884   for (i = 0; i < merged.nelem; ++i)
1885     if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1886       wants_newline = 1;
1887 
1888   /* Build the initial state. */
1889   d->salloc = 1;
1890   d->sindex = 0;
1891   MALLOC(d->states, dfa_state, d->salloc);
1892   state_index(d, &merged, wants_newline, 0);
1893 
1894   free(o_nullable);
1895   free(o_nfirst);
1896   free(o_firstpos);
1897   free(o_nlast);
1898   free(o_lastpos);
1899   free(nalloc);
1900   free(merged.elems);
1901 }
1902 
1903 /* Find, for each character, the transition out of state s of d, and store
1904    it in the appropriate slot of trans.
1905 
1906    We divide the positions of s into groups (positions can appear in more
1907    than one group).  Each group is labeled with a set of characters that
1908    every position in the group matches (taking into account, if necessary,
1909    preceding context information of s).  For each group, find the union
1910    of the its elements' follows.  This set is the set of positions of the
1911    new state.  For each character in the group's label, set the transition
1912    on this character to be to a state corresponding to the set's positions,
1913    and its associated backward context information, if necessary.
1914 
1915    If we are building a searching matcher, we include the positions of state
1916    0 in every state.
1917 
1918    The collection of groups is constructed by building an equivalence-class
1919    partition of the positions of s.
1920 
1921    For each position, find the set of characters C that it matches.  Eliminate
1922    any characters from C that fail on grounds of backward context.
1923 
1924    Search through the groups, looking for a group whose label L has nonempty
1925    intersection with C.  If L - C is nonempty, create a new group labeled
1926    L - C and having the same positions as the current group, and set L to
1927    the intersection of L and C.  Insert the position in this group, set
1928    C = C - L, and resume scanning.
1929 
1930    If after comparing with every group there are characters remaining in C,
1931    create a new group labeled with the characters of C and insert this
1932    position in that group. */
1933 void
dfastate(int s,struct dfa * d,int trans[])1934 dfastate (int s, struct dfa *d, int trans[])
1935 {
1936   position_set grps[NOTCHAR];	/* As many as will ever be needed. */
1937   charclass labels[NOTCHAR];	/* Labels corresponding to the groups. */
1938   int ngrps = 0;		/* Number of groups actually used. */
1939   position pos;			/* Current position being considered. */
1940   charclass matches;		/* Set of matching characters. */
1941   int matchesf;			/* True if matches is nonempty. */
1942   charclass intersect;		/* Intersection with some label set. */
1943   int intersectf;		/* True if intersect is nonempty. */
1944   charclass leftovers;		/* Stuff in the label that didn't match. */
1945   int leftoversf;		/* True if leftovers is nonempty. */
1946   static charclass letters;	/* Set of characters considered letters. */
1947   static charclass newline;	/* Set of characters that aren't newline. */
1948   position_set follows;		/* Union of the follows of some group. */
1949   position_set tmp;		/* Temporary space for merging sets. */
1950   int state;			/* New state. */
1951   int wants_newline;		/* New state wants to know newline context. */
1952   int state_newline;		/* New state on a newline transition. */
1953   int wants_letter;		/* New state wants to know letter context. */
1954   int state_letter;		/* New state on a letter transition. */
1955   static int initialized;	/* Flag for static initialization. */
1956 #ifdef MBS_SUPPORT
1957   int next_isnt_1st_byte = 0;	/* Flag If we can't add state0.  */
1958 #endif
1959   int i, j, k;
1960 
1961   /* Initialize the set of letters, if necessary. */
1962   if (! initialized)
1963     {
1964       initialized = 1;
1965       for (i = 0; i < NOTCHAR; ++i)
1966 	if (IS_WORD_CONSTITUENT(i))
1967 	  setbit(i, letters);
1968       setbit(eolbyte, newline);
1969     }
1970 
1971   zeroset(matches);
1972 
1973   for (i = 0; i < d->states[s].elems.nelem; ++i)
1974     {
1975       pos = d->states[s].elems.elems[i];
1976       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1977 	setbit(d->tokens[pos.index], matches);
1978       else if (d->tokens[pos.index] >= CSET)
1979 	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1980 #ifdef MBS_SUPPORT
1981       else if (d->tokens[pos.index] == ANYCHAR
1982                || d->tokens[pos.index] == MBCSET)
1983       /* MB_CUR_MAX > 1  */
1984 	{
1985 	  /* ANYCHAR and MBCSET must match with a single character, so we
1986 	     must put it to d->states[s].mbps, which contains the positions
1987 	     which can match with a single character not a byte.  */
1988 	  if (d->states[s].mbps.nelem == 0)
1989 	    {
1990 	      MALLOC(d->states[s].mbps.elems, position,
1991 		     d->states[s].elems.nelem);
1992 	    }
1993 	  insert(pos, &(d->states[s].mbps));
1994 	  continue;
1995 	}
1996 #endif /* MBS_SUPPORT */
1997       else
1998 	continue;
1999 
2000       /* Some characters may need to be eliminated from matches because
2001 	 they fail in the current context. */
2002       if (pos.constraint != 0xFF)
2003 	{
2004 	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2005 					 d->states[s].newline, 1))
2006 	    clrbit(eolbyte, matches);
2007 	  if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2008 					 d->states[s].newline, 0))
2009 	    for (j = 0; j < CHARCLASS_INTS; ++j)
2010 	      matches[j] &= newline[j];
2011 	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2012 					d->states[s].letter, 1))
2013 	    for (j = 0; j < CHARCLASS_INTS; ++j)
2014 	      matches[j] &= ~letters[j];
2015 	  if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2016 					d->states[s].letter, 0))
2017 	    for (j = 0; j < CHARCLASS_INTS; ++j)
2018 	      matches[j] &= letters[j];
2019 
2020 	  /* If there are no characters left, there's no point in going on. */
2021 	  for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2022 	    continue;
2023 	  if (j == CHARCLASS_INTS)
2024 	    continue;
2025 	}
2026 
2027       for (j = 0; j < ngrps; ++j)
2028 	{
2029 	  /* If matches contains a single character only, and the current
2030 	     group's label doesn't contain that character, go on to the
2031 	     next group. */
2032 	  if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2033 	      && !tstbit(d->tokens[pos.index], labels[j]))
2034 	    continue;
2035 
2036 	  /* Check if this group's label has a nonempty intersection with
2037 	     matches. */
2038 	  intersectf = 0;
2039 	  for (k = 0; k < CHARCLASS_INTS; ++k)
2040 	    (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2041 	  if (! intersectf)
2042 	    continue;
2043 
2044 	  /* It does; now find the set differences both ways. */
2045 	  leftoversf = matchesf = 0;
2046 	  for (k = 0; k < CHARCLASS_INTS; ++k)
2047 	    {
2048 	      /* Even an optimizing compiler can't know this for sure. */
2049 	      int match = matches[k], label = labels[j][k];
2050 
2051 	      (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2052 	      (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2053 	    }
2054 
2055 	  /* If there were leftovers, create a new group labeled with them. */
2056 	  if (leftoversf)
2057 	    {
2058 	      copyset(leftovers, labels[ngrps]);
2059 	      copyset(intersect, labels[j]);
2060 	      MALLOC(grps[ngrps].elems, position, d->nleaves);
2061 	      copy(&grps[j], &grps[ngrps]);
2062 	      ++ngrps;
2063 	    }
2064 
2065 	  /* Put the position in the current group.  Note that there is no
2066 	     reason to call insert() here. */
2067 	  grps[j].elems[grps[j].nelem++] = pos;
2068 
2069 	  /* If every character matching the current position has been
2070 	     accounted for, we're done. */
2071 	  if (! matchesf)
2072 	    break;
2073 	}
2074 
2075       /* If we've passed the last group, and there are still characters
2076 	 unaccounted for, then we'll have to create a new group. */
2077       if (j == ngrps)
2078 	{
2079 	  copyset(matches, labels[ngrps]);
2080 	  zeroset(matches);
2081 	  MALLOC(grps[ngrps].elems, position, d->nleaves);
2082 	  grps[ngrps].nelem = 1;
2083 	  grps[ngrps].elems[0] = pos;
2084 	  ++ngrps;
2085 	}
2086     }
2087 
2088   MALLOC(follows.elems, position, d->nleaves);
2089   MALLOC(tmp.elems, position, d->nleaves);
2090 
2091   /* If we are a searching matcher, the default transition is to a state
2092      containing the positions of state 0, otherwise the default transition
2093      is to fail miserably. */
2094   if (d->searchflag)
2095     {
2096       wants_newline = 0;
2097       wants_letter = 0;
2098       for (i = 0; i < d->states[0].elems.nelem; ++i)
2099 	{
2100 	  if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2101 	    wants_newline = 1;
2102 	  if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2103 	    wants_letter = 1;
2104 	}
2105       copy(&d->states[0].elems, &follows);
2106       state = state_index(d, &follows, 0, 0);
2107       if (wants_newline)
2108 	state_newline = state_index(d, &follows, 1, 0);
2109       else
2110 	state_newline = state;
2111       if (wants_letter)
2112 	state_letter = state_index(d, &follows, 0, 1);
2113       else
2114 	state_letter = state;
2115       for (i = 0; i < NOTCHAR; ++i)
2116 	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2117       trans[eolbyte] = state_newline;
2118     }
2119   else
2120     for (i = 0; i < NOTCHAR; ++i)
2121       trans[i] = -1;
2122 
2123   for (i = 0; i < ngrps; ++i)
2124     {
2125       follows.nelem = 0;
2126 
2127       /* Find the union of the follows of the positions of the group.
2128 	 This is a hideously inefficient loop.  Fix it someday. */
2129       for (j = 0; j < grps[i].nelem; ++j)
2130 	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2131 	  insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2132 
2133 #ifdef MBS_SUPPORT
2134       if (MB_CUR_MAX > 1)
2135 	{
2136 	  /* If a token in follows.elems is not 1st byte of a multibyte
2137 	     character, or the states of follows must accept the bytes
2138 	     which are not 1st byte of the multibyte character.
2139 	     Then, if a state of follows encounter a byte, it must not be
2140 	     a 1st byte of a multibyte character nor singlebyte character.
2141 	     We cansel to add state[0].follows to next state, because
2142 	     state[0] must accept 1st-byte
2143 
2144 	     For example, we assume <sb a> is a certain singlebyte
2145 	     character, <mb A> is a certain multibyte character, and the
2146 	     codepoint of <sb a> equals the 2nd byte of the codepoint of
2147 	     <mb A>.
2148 	     When state[0] accepts <sb a>, state[i] transit to state[i+1]
2149 	     by accepting accepts 1st byte of <mb A>, and state[i+1]
2150 	     accepts 2nd byte of <mb A>, if state[i+1] encounter the
2151 	     codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2152 	     <mb A>, so we can not add state[0].  */
2153 
2154 	  next_isnt_1st_byte = 0;
2155 	  for (j = 0; j < follows.nelem; ++j)
2156 	    {
2157 	      if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2158 		{
2159 		  next_isnt_1st_byte = 1;
2160 		  break;
2161 		}
2162 	    }
2163 	}
2164 #endif
2165 
2166       /* If we are building a searching matcher, throw in the positions
2167 	 of state 0 as well. */
2168 #ifdef MBS_SUPPORT
2169       if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
2170 #else
2171       if (d->searchflag)
2172 #endif
2173 	for (j = 0; j < d->states[0].elems.nelem; ++j)
2174 	  insert(d->states[0].elems.elems[j], &follows);
2175 
2176       /* Find out if the new state will want any context information. */
2177       wants_newline = 0;
2178       if (tstbit(eolbyte, labels[i]))
2179 	for (j = 0; j < follows.nelem; ++j)
2180 	  if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2181 	    wants_newline = 1;
2182 
2183       wants_letter = 0;
2184       for (j = 0; j < CHARCLASS_INTS; ++j)
2185 	if (labels[i][j] & letters[j])
2186 	  break;
2187       if (j < CHARCLASS_INTS)
2188 	for (j = 0; j < follows.nelem; ++j)
2189 	  if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2190 	    wants_letter = 1;
2191 
2192       /* Find the state(s) corresponding to the union of the follows. */
2193       state = state_index(d, &follows, 0, 0);
2194       if (wants_newline)
2195 	state_newline = state_index(d, &follows, 1, 0);
2196       else
2197 	state_newline = state;
2198       if (wants_letter)
2199 	state_letter = state_index(d, &follows, 0, 1);
2200       else
2201 	state_letter = state;
2202 
2203       /* Set the transitions for each character in the current label. */
2204       for (j = 0; j < CHARCLASS_INTS; ++j)
2205 	for (k = 0; k < INTBITS; ++k)
2206 	  if (labels[i][j] & 1U << k)
2207 	    {
2208 	      int c = j * INTBITS + k;
2209 
2210 	      if (c == eolbyte)
2211 		trans[c] = state_newline;
2212 	      else if (IS_WORD_CONSTITUENT(c))
2213 		trans[c] = state_letter;
2214 	      else if (c < NOTCHAR)
2215 		trans[c] = state;
2216 	    }
2217     }
2218 
2219   for (i = 0; i < ngrps; ++i)
2220     free(grps[i].elems);
2221   free(follows.elems);
2222   free(tmp.elems);
2223 }
2224 
2225 /* Some routines for manipulating a compiled dfa's transition tables.
2226    Each state may or may not have a transition table; if it does, and it
2227    is a non-accepting state, then d->trans[state] points to its table.
2228    If it is an accepting state then d->fails[state] points to its table.
2229    If it has no table at all, then d->trans[state] is NULL.
2230    TODO: Improve this comment, get rid of the unnecessary redundancy. */
2231 
2232 static void
build_state(int s,struct dfa * d)2233 build_state (int s, struct dfa *d)
2234 {
2235   int *trans;			/* The new transition table. */
2236   int i;
2237 
2238   /* Set an upper limit on the number of transition tables that will ever
2239      exist at once.  1024 is arbitrary.  The idea is that the frequently
2240      used transition tables will be quickly rebuilt, whereas the ones that
2241      were only needed once or twice will be cleared away. */
2242   if (d->trcount >= 1024)
2243     {
2244       for (i = 0; i < d->tralloc; ++i)
2245 	if (d->trans[i])
2246 	  {
2247 	    free((ptr_t) d->trans[i]);
2248 	    d->trans[i] = NULL;
2249 	  }
2250 	else if (d->fails[i])
2251 	  {
2252 	    free((ptr_t) d->fails[i]);
2253 	    d->fails[i] = NULL;
2254 	  }
2255       d->trcount = 0;
2256     }
2257 
2258   ++d->trcount;
2259 
2260   /* Set up the success bits for this state. */
2261   d->success[s] = 0;
2262   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2263       s, *d))
2264     d->success[s] |= 4;
2265   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2266       s, *d))
2267     d->success[s] |= 2;
2268   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2269       s, *d))
2270     d->success[s] |= 1;
2271 
2272   MALLOC(trans, int, NOTCHAR);
2273   dfastate(s, d, trans);
2274 
2275   /* Now go through the new transition table, and make sure that the trans
2276      and fail arrays are allocated large enough to hold a pointer for the
2277      largest state mentioned in the table. */
2278   for (i = 0; i < NOTCHAR; ++i)
2279     if (trans[i] >= d->tralloc)
2280       {
2281 	int oldalloc = d->tralloc;
2282 
2283 	while (trans[i] >= d->tralloc)
2284 	  d->tralloc *= 2;
2285 	REALLOC(d->realtrans, int *, d->tralloc + 1);
2286 	d->trans = d->realtrans + 1;
2287 	REALLOC(d->fails, int *, d->tralloc);
2288 	REALLOC(d->success, int, d->tralloc);
2289 	while (oldalloc < d->tralloc)
2290 	  {
2291 	    d->trans[oldalloc] = NULL;
2292 	    d->fails[oldalloc++] = NULL;
2293 	  }
2294       }
2295 
2296   /* Newline is a sentinel.  */
2297   trans[eolbyte] = -1;
2298 
2299   if (ACCEPTING(s, *d))
2300     d->fails[s] = trans;
2301   else
2302     d->trans[s] = trans;
2303 }
2304 
2305 static void
build_state_zero(struct dfa * d)2306 build_state_zero (struct dfa *d)
2307 {
2308   d->tralloc = 1;
2309   d->trcount = 0;
2310   CALLOC(d->realtrans, int *, d->tralloc + 1);
2311   d->trans = d->realtrans + 1;
2312   CALLOC(d->fails, int *, d->tralloc);
2313   MALLOC(d->success, int, d->tralloc);
2314   build_state(0, d);
2315 }
2316 
2317 #ifdef MBS_SUPPORT
2318 /* Multibyte character handling sub-routins for dfaexec.  */
2319 
2320 /* Initial state may encounter the byte which is not a singlebyte character
2321    nor 1st byte of a multibyte character.  But it is incorrect for initial
2322    state to accept such a byte.
2323    For example, in sjis encoding the regular expression like "\\" accepts
2324    the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2325    0x815c. Then Initial state must skip the bytes which are not a singlebyte
2326    character nor 1st byte of a multibyte character.  */
2327 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)		\
2328   if (s == 0)						\
2329     {							\
2330       while (inputwcs[p - buf_begin] == 0		\
2331             && mblen_buf[p - buf_begin] > 0		\
2332 	    && p < buf_end)				\
2333         ++p;						\
2334       if (p >= end)					\
2335 	{						\
2336           free(mblen_buf);				\
2337           free(inputwcs);				\
2338 	  return (size_t) -1;				\
2339 	}						\
2340     }
2341 
2342 static void
realloc_trans_if_necessary(struct dfa * d,int new_state)2343 realloc_trans_if_necessary(struct dfa *d, int new_state)
2344 {
2345   /* Make sure that the trans and fail arrays are allocated large enough
2346      to hold a pointer for the new state. */
2347   if (new_state >= d->tralloc)
2348     {
2349       int oldalloc = d->tralloc;
2350 
2351       while (new_state >= d->tralloc)
2352 	d->tralloc *= 2;
2353       REALLOC(d->realtrans, int *, d->tralloc + 1);
2354       d->trans = d->realtrans + 1;
2355       REALLOC(d->fails, int *, d->tralloc);
2356       REALLOC(d->success, int, d->tralloc);
2357       while (oldalloc < d->tralloc)
2358 	{
2359 	  d->trans[oldalloc] = NULL;
2360 	  d->fails[oldalloc++] = NULL;
2361 	}
2362     }
2363 }
2364 
2365 /* Return values of transit_state_singlebyte(), and
2366    transit_state_consume_1char.  */
2367 typedef enum
2368 {
2369   TRANSIT_STATE_IN_PROGRESS,	/* State transition has not finished.  */
2370   TRANSIT_STATE_DONE,		/* State transition has finished.  */
2371   TRANSIT_STATE_END_BUFFER	/* Reach the end of the buffer.  */
2372 } status_transit_state;
2373 
2374 /* Consume a single byte and transit state from 's' to '*next_state'.
2375    This function is almost same as the state transition routin in dfaexec().
2376    But state transition is done just once, otherwise matching succeed or
2377    reach the end of the buffer.  */
2378 static status_transit_state
transit_state_singlebyte(struct dfa * d,int s,unsigned char const * p,int * next_state)2379 transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2380 				  int *next_state)
2381 {
2382   int *t;
2383   int works = s;
2384 
2385   status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2386 
2387   while (rval == TRANSIT_STATE_IN_PROGRESS)
2388     {
2389       if ((t = d->trans[works]) != NULL)
2390 	{
2391 	  works = t[*p];
2392 	  rval = TRANSIT_STATE_DONE;
2393 	  if (works < 0)
2394 	    works = 0;
2395 	}
2396       else if (works < 0)
2397 	{
2398 	  if (p == buf_end)
2399 	    /* At the moment, it must not happen.  */
2400 	    return TRANSIT_STATE_END_BUFFER;
2401 	  works = 0;
2402 	}
2403       else if (d->fails[works])
2404 	{
2405 	  works = d->fails[works][*p];
2406 	  rval = TRANSIT_STATE_DONE;
2407 	}
2408       else
2409 	{
2410 	  build_state(works, d);
2411 	}
2412     }
2413   *next_state = works;
2414   return rval;
2415 }
2416 
2417 /* Check whether period can match or not in the current context.  If it can,
2418    return the amount of the bytes with which period can match, otherwise
2419    return 0.
2420    `pos' is the position of the period.  `index' is the index from the
2421    buf_begin, and it is the current position in the buffer.  */
2422 static int
match_anychar(struct dfa * d,int s,position pos,int index)2423 match_anychar (struct dfa *d, int s, position pos, int index)
2424 {
2425   int newline = 0;
2426   int letter = 0;
2427   wchar_t wc;
2428   int mbclen;
2429 
2430   wc = inputwcs[index];
2431   mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2432 
2433   /* Check context.  */
2434   if (wc == (wchar_t)eolbyte)
2435     {
2436       if (!(syntax_bits & RE_DOT_NEWLINE))
2437 	return 0;
2438       newline = 1;
2439     }
2440   else if (wc == (wchar_t)'\0')
2441     {
2442       if (syntax_bits & RE_DOT_NOT_NULL)
2443 	return 0;
2444       newline = 1;
2445     }
2446 
2447   if (iswalnum(wc) || wc == L'_')
2448     letter = 1;
2449 
2450   if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2451 			   newline, d->states[s].letter, letter))
2452     return 0;
2453 
2454   return mbclen;
2455 }
2456 
2457 /* Check whether bracket expression can match or not in the current context.
2458    If it can, return the amount of the bytes with which expression can match,
2459    otherwise return 0.
2460    `pos' is the position of the bracket expression.  `index' is the index
2461    from the buf_begin, and it is the current position in the buffer.  */
2462 int
match_mb_charset(struct dfa * d,int s,position pos,int index)2463 match_mb_charset (struct dfa *d, int s, position pos, int index)
2464 {
2465   int i;
2466   int match;		/* Flag which represent that matching succeed.  */
2467   int match_len;	/* Length of the character (or collating element)
2468 			   with which this operator match.  */
2469   size_t op_len;		/* Length of the operator.  */
2470   char buffer[128];
2471   wchar_t wcbuf[6];
2472 
2473   /* Pointer to the structure to which we are currently reffering.  */
2474   struct mb_char_classes *work_mbc;
2475 
2476   int newline = 0;
2477   int letter = 0;
2478   wchar_t wc;		/* Current reffering character.  */
2479 
2480   wc = inputwcs[index];
2481 
2482   /* Check context.  */
2483   if (wc == (wchar_t)eolbyte)
2484     {
2485       if (!(syntax_bits & RE_DOT_NEWLINE))
2486 	return 0;
2487       newline = 1;
2488     }
2489   else if (wc == (wchar_t)'\0')
2490     {
2491       if (syntax_bits & RE_DOT_NOT_NULL)
2492 	return 0;
2493       newline = 1;
2494     }
2495   if (iswalnum(wc) || wc == L'_')
2496     letter = 1;
2497   if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2498 			   newline, d->states[s].letter, letter))
2499     return 0;
2500 
2501   /* Assign the current reffering operator to work_mbc.  */
2502   work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2503   match = !work_mbc->invert;
2504   match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2505 
2506   /* match with a character class?  */
2507   for (i = 0; i<work_mbc->nch_classes; i++)
2508     {
2509       if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2510 	goto charset_matched;
2511     }
2512 
2513   strncpy(buffer, buf_begin + index, match_len);
2514   buffer[match_len] = '\0';
2515 
2516   /* match with an equivalent class?  */
2517   for (i = 0; i<work_mbc->nequivs; i++)
2518     {
2519       op_len = strlen(work_mbc->equivs[i]);
2520       strncpy(buffer, buf_begin + index, op_len);
2521       buffer[op_len] = '\0';
2522       if (strcoll(work_mbc->equivs[i], buffer) == 0)
2523 	{
2524 	  match_len = op_len;
2525 	  goto charset_matched;
2526 	}
2527     }
2528 
2529   /* match with a collating element?  */
2530   for (i = 0; i<work_mbc->ncoll_elems; i++)
2531     {
2532       op_len = strlen(work_mbc->coll_elems[i]);
2533       strncpy(buffer, buf_begin + index, op_len);
2534       buffer[op_len] = '\0';
2535 
2536       if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2537 	{
2538 	  match_len = op_len;
2539 	  goto charset_matched;
2540 	}
2541     }
2542 
2543   wcbuf[0] = wc;
2544   wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2545 
2546   /* match with a range?  */
2547   for (i = 0; i<work_mbc->nranges; i++)
2548     {
2549       wcbuf[2] = work_mbc->range_sts[i];
2550       wcbuf[4] = work_mbc->range_ends[i];
2551 
2552       if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2553 	  wcscoll(wcbuf+4, wcbuf) >= 0)
2554 	goto charset_matched;
2555     }
2556 
2557   /* match with a character?  */
2558   for (i = 0; i<work_mbc->nchars; i++)
2559     {
2560       if (wc == work_mbc->chars[i])
2561 	goto charset_matched;
2562     }
2563 
2564   match = !match;
2565 
2566  charset_matched:
2567   return match ? match_len : 0;
2568 }
2569 
2570 /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2571    array which corresponds to `d->states[s].mbps.elem' and each element of
2572    the array contains the amount of the bytes with which the element can
2573    match.
2574    `index' is the index from the buf_begin, and it is the current position
2575    in the buffer.
2576    Caller MUST free the array which this function return.  */
2577 static int*
check_matching_with_multibyte_ops(struct dfa * d,int s,int index)2578 check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2579 {
2580   int i;
2581   int* rarray;
2582 
2583   MALLOC(rarray, int, d->states[s].mbps.nelem);
2584   for (i = 0; i < d->states[s].mbps.nelem; ++i)
2585     {
2586       position pos = d->states[s].mbps.elems[i];
2587       switch(d->tokens[pos.index])
2588 	{
2589 	case ANYCHAR:
2590 	  rarray[i] = match_anychar(d, s, pos, index);
2591 	  break;
2592 	case MBCSET:
2593 	  rarray[i] = match_mb_charset(d, s, pos, index);
2594 	  break;
2595 	default:
2596 	  break; /* can not happen.  */
2597 	}
2598     }
2599   return rarray;
2600 }
2601 
2602 /* Consume a single character and enumerate all of the positions which can
2603    be next position from the state `s'.
2604    `match_lens' is the input. It can be NULL, but it can also be the output
2605    of check_matching_with_multibyte_ops() for optimization.
2606    `mbclen' and `pps' are the output.  `mbclen' is the length of the
2607    character consumed, and `pps' is the set this function enumerate.  */
2608 static status_transit_state
transit_state_consume_1char(struct dfa * d,int s,unsigned char const ** pp,int * match_lens,int * mbclen,position_set * pps)2609 transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2610 			     int *match_lens, int *mbclen, position_set *pps)
2611 {
2612   int i, j;
2613   int s1, s2;
2614   int* work_mbls;
2615   status_transit_state rs = TRANSIT_STATE_DONE;
2616 
2617   /* Calculate the length of the (single/multi byte) character
2618      to which p points.  */
2619   *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2620     : mblen_buf[*pp - buf_begin];
2621 
2622   /* Calculate the state which can be reached from the state `s' by
2623      consuming `*mbclen' single bytes from the buffer.  */
2624   s1 = s;
2625   for (i = 0; i < *mbclen; i++)
2626     {
2627       s2 = s1;
2628       rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2629     }
2630   /* Copy the positions contained by `s1' to the set `pps'.  */
2631   copy(&(d->states[s1].elems), pps);
2632 
2633   /* Check (inputed)match_lens, and initialize if it is NULL.  */
2634   if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2635     work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2636   else
2637     work_mbls = match_lens;
2638 
2639   /* Add all of the positions which can be reached from `s' by consuming
2640      a single character.  */
2641   for (i = 0; i < d->states[s].mbps.nelem ; i++)
2642    {
2643       if (work_mbls[i] == *mbclen)
2644 	for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2645 	     j++)
2646 	  insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2647 		 pps);
2648     }
2649 
2650   if (match_lens == NULL && work_mbls != NULL)
2651     free(work_mbls);
2652   return rs;
2653 }
2654 
2655 /* Transit state from s, then return new state and update the pointer of the
2656    buffer.  This function is for some operator which can match with a multi-
2657    byte character or a collating element(which may be multi characters).  */
2658 static int
transit_state(struct dfa * d,int s,unsigned char const ** pp)2659 transit_state (struct dfa *d, int s, unsigned char const **pp)
2660 {
2661   int s1;
2662   int mbclen;		/* The length of current input multibyte character. */
2663   int maxlen = 0;
2664   int i, j;
2665   int *match_lens = NULL;
2666   int nelem = d->states[s].mbps.nelem; /* Just a alias.  */
2667   position_set follows;
2668   unsigned char const *p1 = *pp;
2669   status_transit_state rs;
2670   wchar_t wc;
2671 
2672   if (nelem > 0)
2673     /* This state has (a) multibyte operator(s).
2674        We check whether each of them can match or not.  */
2675     {
2676       /* Note: caller must free the return value of this function.  */
2677       match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2678 
2679       for (i = 0; i < nelem; i++)
2680 	/* Search the operator which match the longest string,
2681 	   in this state.  */
2682 	{
2683 	  if (match_lens[i] > maxlen)
2684 	    maxlen = match_lens[i];
2685 	}
2686     }
2687 
2688   if (nelem == 0 || maxlen == 0)
2689     /* This state has no multibyte operator which can match.
2690        We need to  check only one singlebyte character.  */
2691     {
2692       status_transit_state rs;
2693       rs = transit_state_singlebyte(d, s, *pp, &s1);
2694 
2695       /* We must update the pointer if state transition succeeded.  */
2696       if (rs == TRANSIT_STATE_DONE)
2697 	++*pp;
2698 
2699       if (match_lens != NULL)
2700 	free(match_lens);
2701       return s1;
2702     }
2703 
2704   /* This state has some operators which can match a multibyte character.  */
2705   follows.nelem = 0;
2706   MALLOC(follows.elems, position, d->nleaves);
2707 
2708   /* `maxlen' may be longer than the length of a character, because it may
2709      not be a character but a (multi character) collating element.
2710      We enumerate all of the positions which `s' can reach by consuming
2711      `maxlen' bytes.  */
2712   rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2713 
2714   wc = inputwcs[*pp - mbclen - buf_begin];
2715   s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2716   realloc_trans_if_necessary(d, s1);
2717 
2718   while (*pp - p1 < maxlen)
2719     {
2720       follows.nelem = 0;
2721       rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2722 
2723       for (i = 0; i < nelem ; i++)
2724 	{
2725 	  if (match_lens[i] == *pp - p1)
2726 	    for (j = 0;
2727 		 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2728 	      insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2729 		     &follows);
2730 	}
2731 
2732       wc = inputwcs[*pp - mbclen - buf_begin];
2733       s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2734       realloc_trans_if_necessary(d, s1);
2735     }
2736   free(match_lens);
2737   free(follows.elems);
2738   return s1;
2739 }
2740 
2741 #endif
2742 
2743 /* Search through a buffer looking for a match to the given struct dfa.
2744    Find the first occurrence of a string matching the regexp in the buffer,
2745    and the shortest possible version thereof.  Return the offset of the first
2746    character after the match, or (size_t) -1 if none is found.  BEGIN points to
2747    the beginning of the buffer, and SIZE is the size of the buffer.  If SIZE
2748    is nonzero, BEGIN[SIZE - 1] must be a newline.  BACKREF points to a place
2749    where we're supposed to store a 1 if backreferencing happened and the
2750    match needs to be verified by a backtracking matcher.  Otherwise
2751    we store a 0 in *backref. */
2752 size_t
dfaexec(struct dfa * d,char const * begin,size_t size,int * backref)2753 dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
2754 {
2755   register int s;	/* Current state. */
2756   register unsigned char const *p; /* Current input character. */
2757   register unsigned char const *end; /* One past the last input character.  */
2758   register int **trans, *t;	/* Copy of d->trans so it can be optimized
2759 				   into a register. */
2760   register unsigned char eol = eolbyte;	/* Likewise for eolbyte.  */
2761   static int sbit[NOTCHAR];	/* Table for anding with d->success. */
2762   static int sbit_init;
2763 
2764   if (! sbit_init)
2765     {
2766       int i;
2767 
2768       sbit_init = 1;
2769       for (i = 0; i < NOTCHAR; ++i)
2770 	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2771       sbit[eol] = 4;
2772     }
2773 
2774   if (! d->tralloc)
2775     build_state_zero(d);
2776 
2777   s = 0;
2778   p = (unsigned char const *) begin;
2779   end = p + size;
2780   trans = d->trans;
2781 
2782 #ifdef MBS_SUPPORT
2783   if (MB_CUR_MAX > 1)
2784     {
2785       int remain_bytes, i;
2786       buf_begin = begin;
2787       buf_end = end;
2788 
2789       /* initialize mblen_buf, and inputwcs.  */
2790       MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
2791       MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
2792       memset(&mbs, 0, sizeof(mbstate_t));
2793       remain_bytes = 0;
2794       for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
2795 	{
2796 	  if (remain_bytes == 0)
2797 	    {
2798 	      remain_bytes
2799 		= mbrtowc(inputwcs + i, begin + i,
2800 			  end - (unsigned char const *)begin - i + 1, &mbs);
2801 	      if (remain_bytes <= 1)
2802 		{
2803 		  remain_bytes = 0;
2804 		  inputwcs[i] = (wchar_t)begin[i];
2805 		  mblen_buf[i] = 0;
2806 		}
2807 	      else
2808 		{
2809 		  mblen_buf[i] = remain_bytes;
2810 		  remain_bytes--;
2811 		}
2812 	    }
2813 	  else
2814 	    {
2815 	      mblen_buf[i] = remain_bytes;
2816 	      inputwcs[i] = 0;
2817 	      remain_bytes--;
2818 	    }
2819 	}
2820       mblen_buf[i] = 0;
2821       inputwcs[i] = 0; /* sentinel */
2822     }
2823 #endif /* MBS_SUPPORT */
2824 
2825   for (;;)
2826     {
2827 #ifdef MBS_SUPPORT
2828       if (MB_CUR_MAX > 1)
2829 	while ((t = trans[s]))
2830 	  {
2831 	    if (d->states[s].mbps.nelem != 0)
2832 	      {
2833 		/* Can match with a multibyte character( and multi character
2834 		   collating element).  */
2835 		unsigned char const *nextp;
2836 
2837 		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2838 
2839 		nextp = p;
2840 		s = transit_state(d, s, &nextp);
2841 		p = nextp;
2842 
2843 		/* Trans table might be updated.  */
2844 		trans = d->trans;
2845 	      }
2846 	    else
2847 	      {
2848 		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2849 		s = t[*p++];
2850 	      }
2851 	  }
2852       else
2853 #endif /* MBS_SUPPORT */
2854         while ((t = trans[s]))
2855 	  s = t[*p++];
2856 
2857       if (s < 0)
2858 	{
2859 	  if (p == end)
2860 	    {
2861 #ifdef MBS_SUPPORT
2862 	      if (MB_CUR_MAX > 1)
2863 		{
2864 		  free(mblen_buf);
2865 		  free(inputwcs);
2866 		}
2867 #endif /* MBS_SUPPORT */
2868 	      return (size_t) -1;
2869 	    }
2870 	  s = 0;
2871 	}
2872       else if ((t = d->fails[s]))
2873 	{
2874 	  if (d->success[s] & sbit[*p])
2875 	    {
2876 	      if (backref)
2877 		*backref = (d->states[s].backref != 0);
2878 #ifdef MBS_SUPPORT
2879 	      if (MB_CUR_MAX > 1)
2880 		{
2881 		  free(mblen_buf);
2882 		  free(inputwcs);
2883 		}
2884 #endif /* MBS_SUPPORT */
2885 	      return (char const *) p - begin;
2886 	    }
2887 
2888 #ifdef MBS_SUPPORT
2889 	  if (MB_CUR_MAX > 1)
2890 	    {
2891 		SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2892 		if (d->states[s].mbps.nelem != 0)
2893 		  {
2894 		    /* Can match with a multibyte character( and multi
2895 		       character collating element).  */
2896 		    unsigned char const *nextp;
2897 		    nextp = p;
2898 		    s = transit_state(d, s, &nextp);
2899 		    p = nextp;
2900 
2901 		    /* Trans table might be updated.  */
2902 		    trans = d->trans;
2903 		  }
2904 		else
2905 		s = t[*p++];
2906 	    }
2907 	  else
2908 #endif /* MBS_SUPPORT */
2909 	  s = t[*p++];
2910 	}
2911       else
2912 	{
2913 	  build_state(s, d);
2914 	  trans = d->trans;
2915 	}
2916     }
2917 }
2918 
2919 /* Initialize the components of a dfa that the other routines don't
2920    initialize for themselves. */
2921 void
dfainit(struct dfa * d)2922 dfainit (struct dfa *d)
2923 {
2924   d->calloc = 1;
2925   MALLOC(d->charclasses, charclass, d->calloc);
2926   d->cindex = 0;
2927 
2928   d->talloc = 1;
2929   MALLOC(d->tokens, token, d->talloc);
2930   d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2931 #ifdef MBS_SUPPORT
2932   if (MB_CUR_MAX > 1)
2933     {
2934       d->nmultibyte_prop = 1;
2935       MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2936       d->nmbcsets = 0;
2937       d->mbcsets_alloc = 1;
2938       MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2939     }
2940 #endif
2941 
2942   d->searchflag = 0;
2943   d->tralloc = 0;
2944 
2945   d->musts = 0;
2946 }
2947 
2948 /* Parse and analyze a single string of the given length. */
2949 void
dfacomp(char const * s,size_t len,struct dfa * d,int searchflag)2950 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
2951 {
2952   if (case_fold)	/* dummy folding in service of dfamust() */
2953     {
2954       char *lcopy;
2955       int i;
2956 
2957       lcopy = malloc(len);
2958       if (!lcopy)
2959 	dfaerror(_("out of memory"));
2960 
2961       /* This is a kludge. */
2962       case_fold = 0;
2963       for (i = 0; i < len; ++i)
2964 	if (ISUPPER ((unsigned char) s[i]))
2965 	  lcopy[i] = tolower ((unsigned char) s[i]);
2966 	else
2967 	  lcopy[i] = s[i];
2968 
2969       dfainit(d);
2970       dfaparse(lcopy, len, d);
2971       free(lcopy);
2972       dfamust(d);
2973       d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2974       case_fold = 1;
2975       dfaparse(s, len, d);
2976       dfaanalyze(d, searchflag);
2977     }
2978   else
2979     {
2980         dfainit(d);
2981         dfaparse(s, len, d);
2982 	dfamust(d);
2983         dfaanalyze(d, searchflag);
2984     }
2985 }
2986 
2987 /* Free the storage held by the components of a dfa. */
2988 void
dfafree(struct dfa * d)2989 dfafree (struct dfa *d)
2990 {
2991   int i;
2992   struct dfamust *dm, *ndm;
2993 
2994   free((ptr_t) d->charclasses);
2995   free((ptr_t) d->tokens);
2996 
2997 #ifdef MBS_SUPPORT
2998   if (MB_CUR_MAX > 1)
2999     {
3000       free((ptr_t) d->multibyte_prop);
3001       for (i = 0; i < d->nmbcsets; ++i)
3002 	{
3003 	  int j;
3004 	  struct mb_char_classes *p = &(d->mbcsets[i]);
3005 	  if (p->chars != NULL)
3006 	    free(p->chars);
3007 	  if (p->ch_classes != NULL)
3008 	    free(p->ch_classes);
3009 	  if (p->range_sts != NULL)
3010 	    free(p->range_sts);
3011 	  if (p->range_ends != NULL)
3012 	    free(p->range_ends);
3013 
3014 	  for (j = 0; j < p->nequivs; ++j)
3015 	    free(p->equivs[j]);
3016 	  if (p->equivs != NULL)
3017 	    free(p->equivs);
3018 
3019 	  for (j = 0; j < p->ncoll_elems; ++j)
3020 	    free(p->coll_elems[j]);
3021 	  if (p->coll_elems != NULL)
3022 	    free(p->coll_elems);
3023 	}
3024       free((ptr_t) d->mbcsets);
3025     }
3026 #endif /* MBS_SUPPORT */
3027 
3028   for (i = 0; i < d->sindex; ++i)
3029     free((ptr_t) d->states[i].elems.elems);
3030   free((ptr_t) d->states);
3031   for (i = 0; i < d->tindex; ++i)
3032     if (d->follows[i].elems)
3033       free((ptr_t) d->follows[i].elems);
3034   free((ptr_t) d->follows);
3035   for (i = 0; i < d->tralloc; ++i)
3036     if (d->trans[i])
3037       free((ptr_t) d->trans[i]);
3038     else if (d->fails[i])
3039       free((ptr_t) d->fails[i]);
3040   if (d->realtrans) free((ptr_t) d->realtrans);
3041   if (d->fails) free((ptr_t) d->fails);
3042   if (d->success) free((ptr_t) d->success);
3043   for (dm = d->musts; dm; dm = ndm)
3044     {
3045       ndm = dm->next;
3046       free(dm->must);
3047       free((ptr_t) dm);
3048     }
3049 }
3050 
3051 /* Having found the postfix representation of the regular expression,
3052    try to find a long sequence of characters that must appear in any line
3053    containing the r.e.
3054    Finding a "longest" sequence is beyond the scope here;
3055    we take an easy way out and hope for the best.
3056    (Take "(ab|a)b"--please.)
3057 
3058    We do a bottom-up calculation of sequences of characters that must appear
3059    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3060    representation:
3061 	sequences that must appear at the left of the match ("left")
3062 	sequences that must appear at the right of the match ("right")
3063 	lists of sequences that must appear somewhere in the match ("in")
3064 	sequences that must constitute the match ("is")
3065 
3066    When we get to the root of the tree, we use one of the longest of its
3067    calculated "in" sequences as our answer.  The sequence we find is returned in
3068    d->must (where "d" is the single argument passed to "dfamust");
3069    the length of the sequence is returned in d->mustn.
3070 
3071    The sequences calculated for the various types of node (in pseudo ANSI c)
3072    are shown below.  "p" is the operand of unary operators (and the left-hand
3073    operand of binary operators); "q" is the right-hand operand of binary
3074    operators.
3075 
3076    "ZERO" means "a zero-length sequence" below.
3077 
3078 	Type	left		right		is		in
3079 	----	----		-----		--		--
3080 	char c	# c		# c		# c		# c
3081 
3082 	ANYCHAR	ZERO		ZERO		ZERO		ZERO
3083 
3084 	MBCSET	ZERO		ZERO		ZERO		ZERO
3085 
3086 	CSET	ZERO		ZERO		ZERO		ZERO
3087 
3088 	STAR	ZERO		ZERO		ZERO		ZERO
3089 
3090 	QMARK	ZERO		ZERO		ZERO		ZERO
3091 
3092 	PLUS	p->left		p->right	ZERO		p->in
3093 
3094 	CAT	(p->is==ZERO)?	(q->is==ZERO)?	(p->is!=ZERO &&	p->in plus
3095 		p->left :	q->right :	q->is!=ZERO) ?	q->in plus
3096 		p->is##q->left	p->right##q->is	p->is##q->is :	p->right##q->left
3097 						ZERO
3098 
3099 	OR	longest common	longest common	(do p->is and	substrings common to
3100 		leading		trailing	q->is have same	p->in and q->in
3101 		(sub)sequence	(sub)sequence	length and
3102 		of p->left	of p->right	content) ?
3103 		and q->left	and q->right	p->is : NULL
3104 
3105    If there's anything else we recognize in the tree, all four sequences get set
3106    to zero-length sequences.  If there's something we don't recognize in the tree,
3107    we just return a zero-length sequence.
3108 
3109    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3110    'aaa')?
3111 
3112    And. . .is it here or someplace that we might ponder "optimizations" such as
3113 	egrep 'psi|epsilon'	->	egrep 'psi'
3114 	egrep 'pepsi|epsilon'	->	egrep 'epsi'
3115 					(Yes, we now find "epsi" as a "string
3116 					that must occur", but we might also
3117 					simplify the *entire* r.e. being sought)
3118 	grep '[c]'		->	grep 'c'
3119 	grep '(ab|a)b'		->	grep 'ab'
3120 	grep 'ab*'		->	grep 'a'
3121 	grep 'a*b'		->	grep 'b'
3122 
3123    There are several issues:
3124 
3125    Is optimization easy (enough)?
3126 
3127    Does optimization actually accomplish anything,
3128    or is the automaton you get from "psi|epsilon" (for example)
3129    the same as the one you get from "psi" (for example)?
3130 
3131    Are optimizable r.e.'s likely to be used in real-life situations
3132    (something like 'ab*' is probably unlikely; something like is
3133    'psi|epsilon' is likelier)? */
3134 
3135 static char *
icatalloc(char * old,char * new)3136 icatalloc (char *old, char *new)
3137 {
3138   char *result;
3139   size_t oldsize, newsize;
3140 
3141   newsize = (new == NULL) ? 0 : strlen(new);
3142   if (old == NULL)
3143     oldsize = 0;
3144   else if (newsize == 0)
3145     return old;
3146   else	oldsize = strlen(old);
3147   if (old == NULL)
3148     result = (char *) malloc(newsize + 1);
3149   else
3150     result = (char *) realloc((void *) old, oldsize + newsize + 1);
3151   if (result != NULL && new != NULL)
3152     (void) strcpy(result + oldsize, new);
3153   return result;
3154 }
3155 
3156 static char *
icpyalloc(char * string)3157 icpyalloc (char *string)
3158 {
3159   return icatalloc((char *) NULL, string);
3160 }
3161 
3162 static char *
istrstr(char * lookin,char * lookfor)3163 istrstr (char *lookin, char *lookfor)
3164 {
3165   char *cp;
3166   size_t len;
3167 
3168   len = strlen(lookfor);
3169   for (cp = lookin; *cp != '\0'; ++cp)
3170     if (strncmp(cp, lookfor, len) == 0)
3171       return cp;
3172   return NULL;
3173 }
3174 
3175 static void
ifree(char * cp)3176 ifree (char *cp)
3177 {
3178   if (cp != NULL)
3179     free(cp);
3180 }
3181 
3182 static void
freelist(char ** cpp)3183 freelist (char **cpp)
3184 {
3185   int i;
3186 
3187   if (cpp == NULL)
3188     return;
3189   for (i = 0; cpp[i] != NULL; ++i)
3190     {
3191       free(cpp[i]);
3192       cpp[i] = NULL;
3193     }
3194 }
3195 
3196 static char **
enlist(char ** cpp,char * new,size_t len)3197 enlist (char **cpp, char *new, size_t len)
3198 {
3199   int i, j;
3200 
3201   if (cpp == NULL)
3202     return NULL;
3203   if ((new = icpyalloc(new)) == NULL)
3204     {
3205       freelist(cpp);
3206       return NULL;
3207     }
3208   new[len] = '\0';
3209   /* Is there already something in the list that's new (or longer)? */
3210   for (i = 0; cpp[i] != NULL; ++i)
3211     if (istrstr(cpp[i], new) != NULL)
3212       {
3213 	free(new);
3214 	return cpp;
3215       }
3216   /* Eliminate any obsoleted strings. */
3217   j = 0;
3218   while (cpp[j] != NULL)
3219     if (istrstr(new, cpp[j]) == NULL)
3220       ++j;
3221     else
3222       {
3223 	free(cpp[j]);
3224 	if (--i == j)
3225 	  break;
3226 	cpp[j] = cpp[i];
3227 	cpp[i] = NULL;
3228       }
3229   /* Add the new string. */
3230   cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
3231   if (cpp == NULL)
3232     return NULL;
3233   cpp[i] = new;
3234   cpp[i + 1] = NULL;
3235   return cpp;
3236 }
3237 
3238 /* Given pointers to two strings, return a pointer to an allocated
3239    list of their distinct common substrings. Return NULL if something
3240    seems wild. */
3241 static char **
comsubs(char * left,char * right)3242 comsubs (char *left, char *right)
3243 {
3244   char **cpp;
3245   char *lcp;
3246   char *rcp;
3247   size_t i, len;
3248 
3249   if (left == NULL || right == NULL)
3250     return NULL;
3251   cpp = (char **) malloc(sizeof *cpp);
3252   if (cpp == NULL)
3253     return NULL;
3254   cpp[0] = NULL;
3255   for (lcp = left; *lcp != '\0'; ++lcp)
3256     {
3257       len = 0;
3258       rcp = strchr (right, *lcp);
3259       while (rcp != NULL)
3260 	{
3261 	  for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3262 	    continue;
3263 	  if (i > len)
3264 	    len = i;
3265 	  rcp = strchr (rcp + 1, *lcp);
3266 	}
3267       if (len == 0)
3268 	continue;
3269       if ((cpp = enlist(cpp, lcp, len)) == NULL)
3270 	break;
3271     }
3272   return cpp;
3273 }
3274 
3275 static char **
addlists(char ** old,char ** new)3276 addlists (char **old, char **new)
3277 {
3278   int i;
3279 
3280   if (old == NULL || new == NULL)
3281     return NULL;
3282   for (i = 0; new[i] != NULL; ++i)
3283     {
3284       old = enlist(old, new[i], strlen(new[i]));
3285       if (old == NULL)
3286 	break;
3287     }
3288   return old;
3289 }
3290 
3291 /* Given two lists of substrings, return a new list giving substrings
3292    common to both. */
3293 static char **
inboth(char ** left,char ** right)3294 inboth (char **left, char **right)
3295 {
3296   char **both;
3297   char **temp;
3298   int lnum, rnum;
3299 
3300   if (left == NULL || right == NULL)
3301     return NULL;
3302   both = (char **) malloc(sizeof *both);
3303   if (both == NULL)
3304     return NULL;
3305   both[0] = NULL;
3306   for (lnum = 0; left[lnum] != NULL; ++lnum)
3307     {
3308       for (rnum = 0; right[rnum] != NULL; ++rnum)
3309 	{
3310 	  temp = comsubs(left[lnum], right[rnum]);
3311 	  if (temp == NULL)
3312 	    {
3313 	      freelist(both);
3314 	      return NULL;
3315 	    }
3316 	  both = addlists(both, temp);
3317 	  freelist(temp);
3318 	  free(temp);
3319 	  if (both == NULL)
3320 	    return NULL;
3321 	}
3322     }
3323   return both;
3324 }
3325 
3326 typedef struct
3327 {
3328   char **in;
3329   char *left;
3330   char *right;
3331   char *is;
3332 } must;
3333 
3334 static void
resetmust(must * mp)3335 resetmust (must *mp)
3336 {
3337   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3338   freelist(mp->in);
3339 }
3340 
3341 static void
dfamust(struct dfa * dfa)3342 dfamust (struct dfa *dfa)
3343 {
3344   must *musts;
3345   must *mp;
3346   char *result;
3347   int ri;
3348   int i;
3349   int exact;
3350   token t;
3351   static must must0;
3352   struct dfamust *dm;
3353   static char empty_string[] = "";
3354 
3355   result = empty_string;
3356   exact = 0;
3357   musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
3358   if (musts == NULL)
3359     return;
3360   mp = musts;
3361   for (i = 0; i <= dfa->tindex; ++i)
3362     mp[i] = must0;
3363   for (i = 0; i <= dfa->tindex; ++i)
3364     {
3365       mp[i].in = (char **) malloc(sizeof *mp[i].in);
3366       mp[i].left = malloc(2);
3367       mp[i].right = malloc(2);
3368       mp[i].is = malloc(2);
3369       if (mp[i].in == NULL || mp[i].left == NULL ||
3370 	  mp[i].right == NULL || mp[i].is == NULL)
3371 	goto done;
3372       mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3373       mp[i].in[0] = NULL;
3374     }
3375 #ifdef DEBUG
3376   fprintf(stderr, "dfamust:\n");
3377   for (i = 0; i < dfa->tindex; ++i)
3378     {
3379       fprintf(stderr, " %d:", i);
3380       prtok(dfa->tokens[i]);
3381     }
3382   putc('\n', stderr);
3383 #endif
3384   for (ri = 0; ri < dfa->tindex; ++ri)
3385     {
3386       switch (t = dfa->tokens[ri])
3387 	{
3388 	case LPAREN:
3389 	case RPAREN:
3390 	  goto done;		/* "cannot happen" */
3391 	case EMPTY:
3392 	case BEGLINE:
3393 	case ENDLINE:
3394 	case BEGWORD:
3395 	case ENDWORD:
3396 	case LIMWORD:
3397 	case NOTLIMWORD:
3398 	case BACKREF:
3399 	  resetmust(mp);
3400 	  break;
3401 	case STAR:
3402 	case QMARK:
3403 	  if (mp <= musts)
3404 	    goto done;		/* "cannot happen" */
3405 	  --mp;
3406 	  resetmust(mp);
3407 	  break;
3408 	case OR:
3409 	case ORTOP:
3410 	  if (mp < &musts[2])
3411 	    goto done;		/* "cannot happen" */
3412 	  {
3413 	    char **new;
3414 	    must *lmp;
3415 	    must *rmp;
3416 	    int j, ln, rn, n;
3417 
3418 	    rmp = --mp;
3419 	    lmp = --mp;
3420 	    /* Guaranteed to be.  Unlikely, but. . . */
3421 	    if (strcmp(lmp->is, rmp->is) != 0)
3422 	      lmp->is[0] = '\0';
3423 	    /* Left side--easy */
3424 	    i = 0;
3425 	    while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3426 	      ++i;
3427 	    lmp->left[i] = '\0';
3428 	    /* Right side */
3429 	    ln = strlen(lmp->right);
3430 	    rn = strlen(rmp->right);
3431 	    n = ln;
3432 	    if (n > rn)
3433 	      n = rn;
3434 	    for (i = 0; i < n; ++i)
3435 	      if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3436 		break;
3437 	    for (j = 0; j < i; ++j)
3438 	      lmp->right[j] = lmp->right[(ln - i) + j];
3439 	    lmp->right[j] = '\0';
3440 	    new = inboth(lmp->in, rmp->in);
3441 	    if (new == NULL)
3442 	      goto done;
3443 	    freelist(lmp->in);
3444 	    free((char *) lmp->in);
3445 	    lmp->in = new;
3446 	  }
3447 	  break;
3448 	case PLUS:
3449 	  if (mp <= musts)
3450 	    goto done;		/* "cannot happen" */
3451 	  --mp;
3452 	  mp->is[0] = '\0';
3453 	  break;
3454 	case END:
3455 	  if (mp != &musts[1])
3456 	    goto done;		/* "cannot happen" */
3457 	  for (i = 0; musts[0].in[i] != NULL; ++i)
3458 	    if (strlen(musts[0].in[i]) > strlen(result))
3459 	      result = musts[0].in[i];
3460 	  if (strcmp(result, musts[0].is) == 0)
3461 	    exact = 1;
3462 	  goto done;
3463 	case CAT:
3464 	  if (mp < &musts[2])
3465 	    goto done;		/* "cannot happen" */
3466 	  {
3467 	    must *lmp;
3468 	    must *rmp;
3469 
3470 	    rmp = --mp;
3471 	    lmp = --mp;
3472 	    /* In.  Everything in left, plus everything in
3473 	       right, plus catenation of
3474 	       left's right and right's left. */
3475 	    lmp->in = addlists(lmp->in, rmp->in);
3476 	    if (lmp->in == NULL)
3477 	      goto done;
3478 	    if (lmp->right[0] != '\0' &&
3479 		rmp->left[0] != '\0')
3480 	      {
3481 		char *tp;
3482 
3483 		tp = icpyalloc(lmp->right);
3484 		if (tp == NULL)
3485 		  goto done;
3486 		tp = icatalloc(tp, rmp->left);
3487 		if (tp == NULL)
3488 		  goto done;
3489 		lmp->in = enlist(lmp->in, tp,
3490 				 strlen(tp));
3491 		free(tp);
3492 		if (lmp->in == NULL)
3493 		  goto done;
3494 	      }
3495 	    /* Left-hand */
3496 	    if (lmp->is[0] != '\0')
3497 	      {
3498 		lmp->left = icatalloc(lmp->left,
3499 				      rmp->left);
3500 		if (lmp->left == NULL)
3501 		  goto done;
3502 	      }
3503 	    /* Right-hand */
3504 	    if (rmp->is[0] == '\0')
3505 	      lmp->right[0] = '\0';
3506 	    lmp->right = icatalloc(lmp->right, rmp->right);
3507 	    if (lmp->right == NULL)
3508 	      goto done;
3509 	    /* Guaranteed to be */
3510 	    if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3511 	      {
3512 		lmp->is = icatalloc(lmp->is, rmp->is);
3513 		if (lmp->is == NULL)
3514 		  goto done;
3515 	      }
3516 	    else
3517 	      lmp->is[0] = '\0';
3518 	  }
3519 	  break;
3520 	default:
3521 	  if (t < END)
3522 	    {
3523 	      /* "cannot happen" */
3524 	      goto done;
3525 	    }
3526 	  else if (t == '\0')
3527 	    {
3528 	      /* not on *my* shift */
3529 	      goto done;
3530 	    }
3531 	  else if (t >= CSET
3532 #ifdef MBS_SUPPORT
3533 		   || t == ANYCHAR
3534 		   || t == MBCSET
3535 #endif /* MBS_SUPPORT */
3536 		   )
3537 	    {
3538 	      /* easy enough */
3539 	      resetmust(mp);
3540 	    }
3541 	  else
3542 	    {
3543 	      /* plain character */
3544 	      resetmust(mp);
3545 	      mp->is[0] = mp->left[0] = mp->right[0] = t;
3546 	      mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3547 	      mp->in = enlist(mp->in, mp->is, (size_t)1);
3548 	      if (mp->in == NULL)
3549 		goto done;
3550 	    }
3551 	  break;
3552 	}
3553 #ifdef DEBUG
3554       fprintf(stderr, " node: %d:", ri);
3555       prtok(dfa->tokens[ri]);
3556       fprintf(stderr, "\n  in:");
3557       for (i = 0; mp->in[i]; ++i)
3558 	fprintf(stderr, " \"%s\"", mp->in[i]);
3559       fprintf(stderr, "\n  is: \"%s\"\n", mp->is);
3560       fprintf(stderr, "  left: \"%s\"\n", mp->left);
3561       fprintf(stderr, "  right: \"%s\"\n", mp->right);
3562 #endif
3563       ++mp;
3564     }
3565  done:
3566   if (strlen(result))
3567     {
3568       dm = (struct dfamust *) malloc(sizeof (struct dfamust));
3569       dm->exact = exact;
3570       dm->must = malloc(strlen(result) + 1);
3571       strcpy(dm->must, result);
3572       dm->next = dfa->musts;
3573       dfa->musts = dm;
3574     }
3575   mp = musts;
3576   for (i = 0; i <= dfa->tindex; ++i)
3577     {
3578       freelist(mp[i].in);
3579       ifree((char *) mp[i].in);
3580       ifree(mp[i].left);
3581       ifree(mp[i].right);
3582       ifree(mp[i].is);
3583     }
3584   free((char *) mp);
3585 }
3586 /* vim:set shiftwidth=2: */
3587