1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright 1988, 1998, 2000, 2005 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-13017, USA */
17
18 /* Written June, 1988 by Mike Haertel
19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
20
21 #ifdef HAVE_CONFIG_H
22 #include <config.h>
23 #endif
24
25 #include <assert.h>
26 #include <ctype.h>
27 #include <stdio.h>
28
29 #include <sys/types.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <locale.h>
33
34 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
35 /* We can handle multibyte string. */
36 # define MBS_SUPPORT
37 #endif
38
39 #ifdef MBS_SUPPORT
40 # include <wchar.h>
41 # include <wctype.h>
42 #endif
43
44 #ifndef DEBUG /* use the same approach as regex.c */
45 #undef assert
46 #define assert(e)
47 #endif /* DEBUG */
48
49 #ifndef isgraph
50 #define isgraph(C) (isprint(C) && !isspace(C))
51 #endif
52
53 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
54 #define ISALPHA(C) isalpha(C)
55 #define ISUPPER(C) isupper(C)
56 #define ISLOWER(C) islower(C)
57 #define ISDIGIT(C) isdigit(C)
58 #define ISXDIGIT(C) isxdigit(C)
59 #define ISSPACE(C) isspace(C)
60 #define ISPUNCT(C) ispunct(C)
61 #define ISALNUM(C) isalnum(C)
62 #define ISPRINT(C) isprint(C)
63 #define ISGRAPH(C) isgraph(C)
64 #define ISCNTRL(C) iscntrl(C)
65 #else
66 #define ISALPHA(C) (isascii(C) && isalpha(C))
67 #define ISUPPER(C) (isascii(C) && isupper(C))
68 #define ISLOWER(C) (isascii(C) && islower(C))
69 #define ISDIGIT(C) (isascii(C) && isdigit(C))
70 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
71 #define ISSPACE(C) (isascii(C) && isspace(C))
72 #define ISPUNCT(C) (isascii(C) && ispunct(C))
73 #define ISALNUM(C) (isascii(C) && isalnum(C))
74 #define ISPRINT(C) (isascii(C) && isprint(C))
75 #define ISGRAPH(C) (isascii(C) && isgraph(C))
76 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
77 #endif
78
79 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
80 - Its arg may be any int or unsigned int; it need not be an unsigned char.
81 - It's guaranteed to evaluate its argument exactly once.
82 - It's typically faster.
83 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
84 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
85 it's important to use the locale's definition of `digit' even when the
86 host does not conform to Posix. */
87 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
88
89 #include "regex.h"
90 #include "dfa.h"
91 #include "hard-locale.h"
92 #include "gettext.h"
93 #define _(str) gettext (str)
94
95 /* HPUX, define those as macros in sys/param.h */
96 #ifdef setbit
97 # undef setbit
98 #endif
99 #ifdef clrbit
100 # undef clrbit
101 #endif
102
103 static void dfamust (struct dfa *dfa);
104 static void regexp (int toplevel);
105
106 static ptr_t
xcalloc(size_t n,size_t s)107 xcalloc (size_t n, size_t s)
108 {
109 ptr_t r = calloc(n, s);
110
111 if (!r)
112 dfaerror(_("Memory exhausted"));
113 return r;
114 }
115
116 static ptr_t
xmalloc(size_t n)117 xmalloc (size_t n)
118 {
119 ptr_t r = malloc(n);
120
121 assert(n != 0);
122 if (!r)
123 dfaerror(_("Memory exhausted"));
124 return r;
125 }
126
127 static ptr_t
xrealloc(ptr_t p,size_t n)128 xrealloc (ptr_t p, size_t n)
129 {
130 ptr_t r = realloc(p, n);
131
132 assert(n != 0);
133 if (!r)
134 dfaerror(_("Memory exhausted"));
135 return r;
136 }
137
138 #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
139 #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
140 #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
141
142 /* Reallocate an array of type t if nalloc is too small for index. */
143 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
144 if ((index) >= (nalloc)) \
145 { \
146 do \
147 (nalloc) *= 2; \
148 while ((index) >= (nalloc)); \
149 REALLOC(p, t, nalloc); \
150 }
151
152 #ifdef DEBUG
153
154 static void
prtok(token t)155 prtok (token t)
156 {
157 char const *s;
158
159 if (t < 0)
160 fprintf(stderr, "END");
161 else if (t < NOTCHAR)
162 fprintf(stderr, "%c", t);
163 else
164 {
165 switch (t)
166 {
167 case EMPTY: s = "EMPTY"; break;
168 case BACKREF: s = "BACKREF"; break;
169 case BEGLINE: s = "BEGLINE"; break;
170 case ENDLINE: s = "ENDLINE"; break;
171 case BEGWORD: s = "BEGWORD"; break;
172 case ENDWORD: s = "ENDWORD"; break;
173 case LIMWORD: s = "LIMWORD"; break;
174 case NOTLIMWORD: s = "NOTLIMWORD"; break;
175 case QMARK: s = "QMARK"; break;
176 case STAR: s = "STAR"; break;
177 case PLUS: s = "PLUS"; break;
178 case CAT: s = "CAT"; break;
179 case OR: s = "OR"; break;
180 case ORTOP: s = "ORTOP"; break;
181 case LPAREN: s = "LPAREN"; break;
182 case RPAREN: s = "RPAREN"; break;
183 case CRANGE: s = "CRANGE"; break;
184 #ifdef MBS_SUPPORT
185 case ANYCHAR: s = "ANYCHAR"; break;
186 case MBCSET: s = "MBCSET"; break;
187 #endif /* MBS_SUPPORT */
188 default: s = "CSET"; break;
189 }
190 fprintf(stderr, "%s", s);
191 }
192 }
193 #endif /* DEBUG */
194
195 /* Stuff pertaining to charclasses. */
196
197 static int
tstbit(unsigned b,charclass c)198 tstbit (unsigned b, charclass c)
199 {
200 return c[b / INTBITS] & 1 << b % INTBITS;
201 }
202
203 static void
setbit(unsigned b,charclass c)204 setbit (unsigned b, charclass c)
205 {
206 c[b / INTBITS] |= 1 << b % INTBITS;
207 }
208
209 static void
clrbit(unsigned b,charclass c)210 clrbit (unsigned b, charclass c)
211 {
212 c[b / INTBITS] &= ~(1 << b % INTBITS);
213 }
214
215 static void
copyset(charclass src,charclass dst)216 copyset (charclass src, charclass dst)
217 {
218 memcpy (dst, src, sizeof (charclass));
219 }
220
221 static void
zeroset(charclass s)222 zeroset (charclass s)
223 {
224 memset (s, 0, sizeof (charclass));
225 }
226
227 static void
notset(charclass s)228 notset (charclass s)
229 {
230 int i;
231
232 for (i = 0; i < CHARCLASS_INTS; ++i)
233 s[i] = ~s[i];
234 }
235
236 static int
equal(charclass s1,charclass s2)237 equal (charclass s1, charclass s2)
238 {
239 return memcmp (s1, s2, sizeof (charclass)) == 0;
240 }
241
242 /* A pointer to the current dfa is kept here during parsing. */
243 static struct dfa *dfa;
244
245 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
246 static int
charclass_index(charclass s)247 charclass_index (charclass s)
248 {
249 int i;
250
251 for (i = 0; i < dfa->cindex; ++i)
252 if (equal(s, dfa->charclasses[i]))
253 return i;
254 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
255 ++dfa->cindex;
256 copyset(s, dfa->charclasses[i]);
257 return i;
258 }
259
260 /* Syntax bits controlling the behavior of the lexical analyzer. */
261 static reg_syntax_t syntax_bits, syntax_bits_set;
262
263 /* Flag for case-folding letters into sets. */
264 static int case_fold;
265
266 /* End-of-line byte in data. */
267 static unsigned char eolbyte;
268
269 /* Entry point to set syntax options. */
270 void
dfasyntax(reg_syntax_t bits,int fold,unsigned char eol)271 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
272 {
273 syntax_bits_set = 1;
274 syntax_bits = bits;
275 case_fold = fold;
276 eolbyte = eol;
277 }
278
279 /* Like setbit, but if case is folded, set both cases of a letter. */
280 static void
setbit_case_fold(unsigned b,charclass c)281 setbit_case_fold (unsigned b, charclass c)
282 {
283 setbit (b, c);
284 if (case_fold)
285 {
286 if (ISUPPER (b))
287 setbit (tolower (b), c);
288 else if (ISLOWER (b))
289 setbit (toupper (b), c);
290 }
291 }
292
293 /* Lexical analyzer. All the dross that deals with the obnoxious
294 GNU Regex syntax bits is located here. The poor, suffering
295 reader is referred to the GNU Regex documentation for the
296 meaning of the @#%!@#%^!@ syntax bits. */
297
298 static char const *lexstart; /* Pointer to beginning of input string. */
299 static char const *lexptr; /* Pointer to next input character. */
300 static int lexleft; /* Number of characters remaining. */
301 static token lasttok; /* Previous token returned; initially END. */
302 static int laststart; /* True if we're separated from beginning or (, |
303 only by zero-width characters. */
304 static int parens; /* Count of outstanding left parens. */
305 static int minrep, maxrep; /* Repeat counts for {m,n}. */
306 static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
307
308 #ifdef MBS_SUPPORT
309 /* These variables are used only if (MB_CUR_MAX > 1). */
310 static mbstate_t mbs; /* Mbstate for mbrlen(). */
311 static int cur_mb_len; /* Byte length of the current scanning
312 multibyte character. */
313 static int cur_mb_index; /* Byte index of the current scanning multibyte
314 character.
315
316 singlebyte character : cur_mb_index = 0
317 multibyte character
318 1st byte : cur_mb_index = 1
319 2nd byte : cur_mb_index = 2
320 ...
321 nth byte : cur_mb_index = n */
322 static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
323 Each element store the amount of remain
324 byte of corresponding multibyte character
325 in the input string. A element's value
326 is 0 if corresponding character is a
327 singlebyte chracter.
328 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
329 mblen_buf : 0, 3, 2, 1
330 */
331 static wchar_t *inputwcs; /* Wide character representation of input
332 string in dfaexec().
333 The length of this array is same as
334 the length of input string(char array).
335 inputstring[i] is a single-byte char,
336 or 1st byte of a multibyte char.
337 And inputwcs[i] is the codepoint. */
338 static unsigned char const *buf_begin;/* refference to begin in dfaexec(). */
339 static unsigned char const *buf_end; /* refference to end in dfaexec(). */
340 #endif /* MBS_SUPPORT */
341
342 #ifdef MBS_SUPPORT
343 /* This function update cur_mb_len, and cur_mb_index.
344 p points current lexptr, len is the remaining buffer length. */
345 static void
update_mb_len_index(char const * p,int len)346 update_mb_len_index (char const *p, int len)
347 {
348 /* If last character is a part of a multibyte character,
349 we update cur_mb_index. */
350 if (cur_mb_index)
351 cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
352 : cur_mb_index + 1;
353
354 /* If last character is a single byte character, or the
355 last portion of a multibyte character, we check whether
356 next character is a multibyte character or not. */
357 if (! cur_mb_index)
358 {
359 cur_mb_len = mbrlen(p, len, &mbs);
360 if (cur_mb_len > 1)
361 /* It is a multibyte character.
362 cur_mb_len was already set by mbrlen(). */
363 cur_mb_index = 1;
364 else if (cur_mb_len < 1)
365 /* Invalid sequence. We treat it as a singlebyte character.
366 cur_mb_index is aleady 0. */
367 cur_mb_len = 1;
368 /* Otherwise, cur_mb_len == 1, it is a singlebyte character.
369 cur_mb_index is aleady 0. */
370 }
371 }
372 #endif /* MBS_SUPPORT */
373
374 #ifdef MBS_SUPPORT
375 /* Note that characters become unsigned here. */
376 # define FETCH(c, eoferr) \
377 { \
378 if (! lexleft) \
379 { \
380 if (eoferr != 0) \
381 dfaerror (eoferr); \
382 else \
383 return lasttok = END; \
384 } \
385 if (MB_CUR_MAX > 1) \
386 update_mb_len_index(lexptr, lexleft); \
387 (c) = (unsigned char) *lexptr++; \
388 --lexleft; \
389 }
390
391 /* This function fetch a wide character, and update cur_mb_len,
392 used only if the current locale is a multibyte environment. */
393 static wchar_t
fetch_wc(char const * eoferr)394 fetch_wc (char const *eoferr)
395 {
396 wchar_t wc;
397 if (! lexleft)
398 {
399 if (eoferr != 0)
400 dfaerror (eoferr);
401 else
402 return -1;
403 }
404
405 cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
406 if (cur_mb_len <= 0)
407 {
408 cur_mb_len = 1;
409 wc = *lexptr;
410 }
411 lexptr += cur_mb_len;
412 lexleft -= cur_mb_len;
413 return wc;
414 }
415 #else
416 /* Note that characters become unsigned here. */
417 # define FETCH(c, eoferr) \
418 { \
419 if (! lexleft) \
420 { \
421 if (eoferr != 0) \
422 dfaerror (eoferr); \
423 else \
424 return lasttok = END; \
425 } \
426 (c) = (unsigned char) *lexptr++; \
427 --lexleft; \
428 }
429 #endif /* MBS_SUPPORT */
430
431 #ifdef MBS_SUPPORT
432 /* Multibyte character handling sub-routin for lex.
433 This function parse a bracket expression and build a struct
434 mb_char_classes. */
435 static void
parse_bracket_exp_mb()436 parse_bracket_exp_mb ()
437 {
438 wchar_t wc, wc1, wc2;
439
440 /* Work area to build a mb_char_classes. */
441 struct mb_char_classes *work_mbc;
442 int chars_al, range_sts_al, range_ends_al, ch_classes_al,
443 equivs_al, coll_elems_al;
444
445 REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
446 dfa->mbcsets_alloc, dfa->nmbcsets + 1);
447 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
448 We will update dfa->multibyte_prop in addtok(), because we can't
449 decide the index in dfa->tokens[]. */
450
451 /* Initialize work are */
452 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
453
454 chars_al = 1;
455 range_sts_al = range_ends_al = 0;
456 ch_classes_al = equivs_al = coll_elems_al = 0;
457 MALLOC(work_mbc->chars, wchar_t, chars_al);
458
459 work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
460 work_mbc->nequivs = work_mbc->ncoll_elems = 0;
461 work_mbc->chars = NULL;
462 work_mbc->ch_classes = NULL;
463 work_mbc->range_sts = work_mbc->range_ends = NULL;
464 work_mbc->equivs = work_mbc->coll_elems = NULL;
465
466 wc = fetch_wc(_("Unbalanced ["));
467 if (wc == L'^')
468 {
469 wc = fetch_wc(_("Unbalanced ["));
470 work_mbc->invert = 1;
471 }
472 else
473 work_mbc->invert = 0;
474 do
475 {
476 wc1 = -1; /* mark wc1 is not initialized". */
477
478 /* Note that if we're looking at some other [:...:] construct,
479 we just treat it as a bunch of ordinary characters. We can do
480 this because we assume regex has checked for syntax errors before
481 dfa is ever called. */
482 if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
483 {
484 #define BRACKET_BUFFER_SIZE 128
485 char str[BRACKET_BUFFER_SIZE];
486 wc1 = wc;
487 wc = fetch_wc(_("Unbalanced ["));
488
489 /* If pattern contains `[[:', `[[.', or `[[='. */
490 if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
491 {
492 unsigned char c;
493 unsigned char delim = (unsigned char)wc;
494 int len = 0;
495 for (;;)
496 {
497 if (! lexleft)
498 dfaerror (_("Unbalanced ["));
499 c = (unsigned char) *lexptr++;
500 --lexleft;
501
502 if ((c == delim && *lexptr == ']') || lexleft == 0)
503 break;
504 if (len < BRACKET_BUFFER_SIZE)
505 str[len++] = c;
506 else
507 /* This is in any case an invalid class name. */
508 str[0] = '\0';
509 }
510 str[len] = '\0';
511
512 if (lexleft == 0)
513 {
514 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
515 work_mbc->nchars + 2);
516 work_mbc->chars[work_mbc->nchars++] = L'[';
517 work_mbc->chars[work_mbc->nchars++] = delim;
518 break;
519 }
520
521 if (--lexleft, *lexptr++ != ']')
522 dfaerror (_("Unbalanced ["));
523 if (delim == ':')
524 /* build character class. */
525 {
526 wctype_t wt;
527 /* Query the character class as wctype_t. */
528 wt = wctype (str);
529
530 if (ch_classes_al == 0)
531 MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
532 REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
533 ch_classes_al,
534 work_mbc->nch_classes + 1);
535 work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
536
537 }
538 else if (delim == '=' || delim == '.')
539 {
540 char *elem;
541 MALLOC(elem, char, len + 1);
542 strncpy(elem, str, len + 1);
543
544 if (delim == '=')
545 /* build equivalent class. */
546 {
547 if (equivs_al == 0)
548 MALLOC(work_mbc->equivs, char*, ++equivs_al);
549 REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
550 equivs_al,
551 work_mbc->nequivs + 1);
552 work_mbc->equivs[work_mbc->nequivs++] = elem;
553 }
554
555 if (delim == '.')
556 /* build collating element. */
557 {
558 if (coll_elems_al == 0)
559 MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
560 REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
561 coll_elems_al,
562 work_mbc->ncoll_elems + 1);
563 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
564 }
565 }
566 wc = -1;
567 }
568 else
569 /* We treat '[' as a normal character here. */
570 {
571 wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
572 }
573 }
574 else
575 {
576 if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
577 wc = fetch_wc(("Unbalanced ["));
578 }
579
580 if (wc1 == -1)
581 wc1 = fetch_wc(_("Unbalanced ["));
582
583 if (wc1 == L'-')
584 /* build range characters. */
585 {
586 wc2 = fetch_wc(_("Unbalanced ["));
587 if (wc2 == L']')
588 {
589 /* In the case [x-], the - is an ordinary hyphen,
590 which is left in c1, the lookahead character. */
591 lexptr -= cur_mb_len;
592 lexleft += cur_mb_len;
593 wc2 = wc;
594 }
595 else
596 {
597 if (wc2 == L'\\'
598 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
599 wc2 = fetch_wc(_("Unbalanced ["));
600 wc1 = fetch_wc(_("Unbalanced ["));
601 }
602
603 if (range_sts_al == 0)
604 {
605 MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
606 MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
607 }
608 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
609 range_sts_al, work_mbc->nranges + 1);
610 work_mbc->range_sts[work_mbc->nranges] = wc;
611 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
612 range_ends_al, work_mbc->nranges + 1);
613 work_mbc->range_ends[work_mbc->nranges++] = wc2;
614 }
615 else if (wc != -1)
616 /* build normal characters. */
617 {
618 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
619 work_mbc->nchars + 1);
620 work_mbc->chars[work_mbc->nchars++] = wc;
621 }
622 }
623 while ((wc = wc1) != L']');
624 }
625 #endif /* MBS_SUPPORT */
626
627 #ifdef __STDC__
628 #define FUNC(F, P) static int F(int c) { return P(c); }
629 #else
630 #define FUNC(F, P) static int F(c) int c; { return P(c); }
631 #endif
632
FUNC(is_alpha,ISALPHA)633 FUNC(is_alpha, ISALPHA)
634 FUNC(is_upper, ISUPPER)
635 FUNC(is_lower, ISLOWER)
636 FUNC(is_digit, ISDIGIT)
637 FUNC(is_xdigit, ISXDIGIT)
638 FUNC(is_space, ISSPACE)
639 FUNC(is_punct, ISPUNCT)
640 FUNC(is_alnum, ISALNUM)
641 FUNC(is_print, ISPRINT)
642 FUNC(is_graph, ISGRAPH)
643 FUNC(is_cntrl, ISCNTRL)
644
645 static int
646 is_blank (int c)
647 {
648 return (c == ' ' || c == '\t');
649 }
650
651 /* The following list maps the names of the Posix named character classes
652 to predicate functions that determine whether a given character is in
653 the class. The leading [ has already been eaten by the lexical analyzer. */
654 static struct {
655 const char *name;
656 int (*pred) (int);
657 } const prednames[] = {
658 { ":alpha:]", is_alpha },
659 { ":upper:]", is_upper },
660 { ":lower:]", is_lower },
661 { ":digit:]", is_digit },
662 { ":xdigit:]", is_xdigit },
663 { ":space:]", is_space },
664 { ":punct:]", is_punct },
665 { ":alnum:]", is_alnum },
666 { ":print:]", is_print },
667 { ":graph:]", is_graph },
668 { ":cntrl:]", is_cntrl },
669 { ":blank:]", is_blank },
670 { 0 }
671 };
672
673 /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
674 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
675
676 static int
looking_at(char const * s)677 looking_at (char const *s)
678 {
679 size_t len;
680
681 len = strlen(s);
682 if (lexleft < len)
683 return 0;
684 return strncmp(s, lexptr, len) == 0;
685 }
686
687 static token
lex(void)688 lex (void)
689 {
690 unsigned c, c1, c2;
691 int backslash = 0, invert;
692 charclass ccl;
693 int i;
694
695 /* Basic plan: We fetch a character. If it's a backslash,
696 we set the backslash flag and go through the loop again.
697 On the plus side, this avoids having a duplicate of the
698 main switch inside the backslash case. On the minus side,
699 it means that just about every case begins with
700 "if (backslash) ...". */
701 for (i = 0; i < 2; ++i)
702 {
703 FETCH(c, 0);
704 #ifdef MBS_SUPPORT
705 if (MB_CUR_MAX > 1 && cur_mb_index)
706 /* If this is a part of a multi-byte character, we must treat
707 this byte data as a normal character.
708 e.g. In case of SJIS encoding, some character contains '\',
709 but they must not be backslash. */
710 goto normal_char;
711 #endif /* MBS_SUPPORT */
712 switch (c)
713 {
714 case '\\':
715 if (backslash)
716 goto normal_char;
717 if (lexleft == 0)
718 dfaerror(_("Unfinished \\ escape"));
719 backslash = 1;
720 break;
721
722 case '^':
723 if (backslash)
724 goto normal_char;
725 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
726 || lasttok == END
727 || lasttok == LPAREN
728 || lasttok == OR)
729 return lasttok = BEGLINE;
730 goto normal_char;
731
732 case '$':
733 if (backslash)
734 goto normal_char;
735 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
736 || lexleft == 0
737 || (syntax_bits & RE_NO_BK_PARENS
738 ? lexleft > 0 && *lexptr == ')'
739 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
740 || (syntax_bits & RE_NO_BK_VBAR
741 ? lexleft > 0 && *lexptr == '|'
742 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
743 || ((syntax_bits & RE_NEWLINE_ALT)
744 && lexleft > 0 && *lexptr == '\n'))
745 return lasttok = ENDLINE;
746 goto normal_char;
747
748 case '1':
749 case '2':
750 case '3':
751 case '4':
752 case '5':
753 case '6':
754 case '7':
755 case '8':
756 case '9':
757 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
758 {
759 laststart = 0;
760 return lasttok = BACKREF;
761 }
762 goto normal_char;
763
764 case '`':
765 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
766 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
767 goto normal_char;
768
769 case '\'':
770 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
771 return lasttok = ENDLINE; /* FIXME: should be end of string */
772 goto normal_char;
773
774 case '<':
775 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
776 return lasttok = BEGWORD;
777 goto normal_char;
778
779 case '>':
780 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
781 return lasttok = ENDWORD;
782 goto normal_char;
783
784 case 'b':
785 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
786 return lasttok = LIMWORD;
787 goto normal_char;
788
789 case 'B':
790 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
791 return lasttok = NOTLIMWORD;
792 goto normal_char;
793
794 case '?':
795 if (syntax_bits & RE_LIMITED_OPS)
796 goto normal_char;
797 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
798 goto normal_char;
799 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
800 goto normal_char;
801 return lasttok = QMARK;
802
803 case '*':
804 if (backslash)
805 goto normal_char;
806 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
807 goto normal_char;
808 return lasttok = STAR;
809
810 case '+':
811 if (syntax_bits & RE_LIMITED_OPS)
812 goto normal_char;
813 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
814 goto normal_char;
815 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
816 goto normal_char;
817 return lasttok = PLUS;
818
819 case '{':
820 if (!(syntax_bits & RE_INTERVALS))
821 goto normal_char;
822 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
823 goto normal_char;
824 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
825 goto normal_char;
826
827 if (syntax_bits & RE_NO_BK_BRACES)
828 {
829 /* Scan ahead for a valid interval; if it's not valid,
830 treat it as a literal '{'. */
831 int lo = -1, hi = -1;
832 char const *p = lexptr;
833 char const *lim = p + lexleft;
834 for (; p != lim && ISASCIIDIGIT (*p); p++)
835 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
836 if (p != lim && *p == ',')
837 while (++p != lim && ISASCIIDIGIT (*p))
838 hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
839 else
840 hi = lo;
841 if (p == lim || *p != '}'
842 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
843 goto normal_char;
844 }
845
846 minrep = 0;
847 /* Cases:
848 {M} - exact count
849 {M,} - minimum count, maximum is infinity
850 {M,N} - M through N */
851 FETCH(c, _("unfinished repeat count"));
852 if (ISASCIIDIGIT (c))
853 {
854 minrep = c - '0';
855 for (;;)
856 {
857 FETCH(c, _("unfinished repeat count"));
858 if (! ISASCIIDIGIT (c))
859 break;
860 minrep = 10 * minrep + c - '0';
861 }
862 }
863 else
864 dfaerror(_("malformed repeat count"));
865 if (c == ',')
866 {
867 FETCH (c, _("unfinished repeat count"));
868 if (! ISASCIIDIGIT (c))
869 maxrep = -1;
870 else
871 {
872 maxrep = c - '0';
873 for (;;)
874 {
875 FETCH (c, _("unfinished repeat count"));
876 if (! ISASCIIDIGIT (c))
877 break;
878 maxrep = 10 * maxrep + c - '0';
879 }
880 if (0 <= maxrep && maxrep < minrep)
881 dfaerror (_("malformed repeat count"));
882 }
883 }
884 else
885 maxrep = minrep;
886 if (!(syntax_bits & RE_NO_BK_BRACES))
887 {
888 if (c != '\\')
889 dfaerror(_("malformed repeat count"));
890 FETCH(c, _("unfinished repeat count"));
891 }
892 if (c != '}')
893 dfaerror(_("malformed repeat count"));
894 laststart = 0;
895 return lasttok = REPMN;
896
897 case '|':
898 if (syntax_bits & RE_LIMITED_OPS)
899 goto normal_char;
900 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
901 goto normal_char;
902 laststart = 1;
903 return lasttok = OR;
904
905 case '\n':
906 if (syntax_bits & RE_LIMITED_OPS
907 || backslash
908 || !(syntax_bits & RE_NEWLINE_ALT))
909 goto normal_char;
910 laststart = 1;
911 return lasttok = OR;
912
913 case '(':
914 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
915 goto normal_char;
916 ++parens;
917 laststart = 1;
918 return lasttok = LPAREN;
919
920 case ')':
921 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
922 goto normal_char;
923 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
924 goto normal_char;
925 --parens;
926 laststart = 0;
927 return lasttok = RPAREN;
928
929 case '.':
930 if (backslash)
931 goto normal_char;
932 #ifdef MBS_SUPPORT
933 if (MB_CUR_MAX > 1)
934 {
935 /* In multibyte environment period must match with a single
936 character not a byte. So we use ANYCHAR. */
937 laststart = 0;
938 return lasttok = ANYCHAR;
939 }
940 #endif /* MBS_SUPPORT */
941 zeroset(ccl);
942 notset(ccl);
943 if (!(syntax_bits & RE_DOT_NEWLINE))
944 clrbit(eolbyte, ccl);
945 if (syntax_bits & RE_DOT_NOT_NULL)
946 clrbit('\0', ccl);
947 laststart = 0;
948 return lasttok = CSET + charclass_index(ccl);
949
950 case 'w':
951 case 'W':
952 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
953 goto normal_char;
954 zeroset(ccl);
955 for (c2 = 0; c2 < NOTCHAR; ++c2)
956 if (IS_WORD_CONSTITUENT(c2))
957 setbit(c2, ccl);
958 if (c == 'W')
959 notset(ccl);
960 laststart = 0;
961 return lasttok = CSET + charclass_index(ccl);
962
963 case '[':
964 if (backslash)
965 goto normal_char;
966 laststart = 0;
967 #ifdef MBS_SUPPORT
968 if (MB_CUR_MAX > 1)
969 {
970 /* In multibyte environment a bracket expression may contain
971 multibyte characters, which must be treated as characters
972 (not bytes). So we parse it by parse_bracket_exp_mb(). */
973 parse_bracket_exp_mb();
974 return lasttok = MBCSET;
975 }
976 #endif
977 zeroset(ccl);
978 FETCH(c, _("Unbalanced ["));
979 if (c == '^')
980 {
981 FETCH(c, _("Unbalanced ["));
982 invert = 1;
983 }
984 else
985 invert = 0;
986 do
987 {
988 /* Nobody ever said this had to be fast. :-)
989 Note that if we're looking at some other [:...:]
990 construct, we just treat it as a bunch of ordinary
991 characters. We can do this because we assume
992 regex has checked for syntax errors before
993 dfa is ever called. */
994 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
995 for (c1 = 0; prednames[c1].name; ++c1)
996 if (looking_at(prednames[c1].name))
997 {
998 int (*pred) (int) = prednames[c1].pred;
999
1000 for (c2 = 0; c2 < NOTCHAR; ++c2)
1001 if ((*pred)(c2))
1002 setbit_case_fold (c2, ccl);
1003 lexptr += strlen(prednames[c1].name);
1004 lexleft -= strlen(prednames[c1].name);
1005 FETCH(c1, _("Unbalanced ["));
1006 goto skip;
1007 }
1008 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1009 FETCH(c, _("Unbalanced ["));
1010 FETCH(c1, _("Unbalanced ["));
1011 if (c1 == '-')
1012 {
1013 FETCH(c2, _("Unbalanced ["));
1014 if (c2 == ']')
1015 {
1016 /* In the case [x-], the - is an ordinary hyphen,
1017 which is left in c1, the lookahead character. */
1018 --lexptr;
1019 ++lexleft;
1020 }
1021 else
1022 {
1023 if (c2 == '\\'
1024 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1025 FETCH(c2, _("Unbalanced ["));
1026 FETCH(c1, _("Unbalanced ["));
1027 if (!hard_LC_COLLATE) {
1028 for (; c <= c2; c++)
1029 setbit_case_fold (c, ccl);
1030 } else {
1031 /* POSIX locales are painful - leave the decision to libc */
1032 regex_t re;
1033 char expr[6]; /* = { '[', c, '-', c2, ']', '\0' }; */
1034
1035 expr[0] = '['; expr[1] = c; expr[2] = '-';
1036 expr[3] = c2; expr[4] = ']'; expr[5] = '\0';
1037 if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1038 for (c = 0; c < NOTCHAR; ++c) {
1039 regmatch_t mat;
1040 char buf[2]; /* = { c, '\0' }; */
1041
1042 buf[0] = c; buf[1] = '\0';
1043 if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1044 && mat.rm_so == 0 && mat.rm_eo == 1)
1045 setbit_case_fold (c, ccl);
1046 }
1047 regfree (&re);
1048 }
1049 }
1050 continue;
1051 }
1052 }
1053
1054 setbit_case_fold (c, ccl);
1055
1056 skip:
1057 ;
1058 }
1059 while ((c = c1) != ']');
1060 if (invert)
1061 {
1062 notset(ccl);
1063 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1064 clrbit(eolbyte, ccl);
1065 }
1066 return lasttok = CSET + charclass_index(ccl);
1067
1068 default:
1069 normal_char:
1070 laststart = 0;
1071 if (case_fold && ISALPHA(c))
1072 {
1073 zeroset(ccl);
1074 setbit_case_fold (c, ccl);
1075 return lasttok = CSET + charclass_index(ccl);
1076 }
1077 return c;
1078 }
1079 }
1080
1081 /* The above loop should consume at most a backslash
1082 and some other character. */
1083 abort();
1084 return END; /* keeps pedantic compilers happy. */
1085 }
1086
1087 /* Recursive descent parser for regular expressions. */
1088
1089 static token tok; /* Lookahead token. */
1090 static int depth; /* Current depth of a hypothetical stack
1091 holding deferred productions. This is
1092 used to determine the depth that will be
1093 required of the real stack later on in
1094 dfaanalyze(). */
1095
1096 /* Add the given token to the parse tree, maintaining the depth count and
1097 updating the maximum depth if necessary. */
1098 static void
addtok(token t)1099 addtok (token t)
1100 {
1101 #ifdef MBS_SUPPORT
1102 if (MB_CUR_MAX > 1)
1103 {
1104 REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1105 dfa->tindex);
1106 /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */
1107 if (t == MBCSET)
1108 dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1109 else if (t < NOTCHAR)
1110 dfa->multibyte_prop[dfa->tindex]
1111 = (cur_mb_len == 1)? 3 /* single-byte char */
1112 : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1113 + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1114 else
1115 /* It may be unnecesssary, but it is safer to treat other
1116 symbols as singlebyte characters. */
1117 dfa->multibyte_prop[dfa->tindex] = 3;
1118 }
1119 #endif
1120
1121 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1122 dfa->tokens[dfa->tindex++] = t;
1123
1124 switch (t)
1125 {
1126 case QMARK:
1127 case STAR:
1128 case PLUS:
1129 break;
1130
1131 case CAT:
1132 case OR:
1133 case ORTOP:
1134 --depth;
1135 break;
1136
1137 default:
1138 ++dfa->nleaves;
1139 case EMPTY:
1140 ++depth;
1141 break;
1142 }
1143 if (depth > dfa->depth)
1144 dfa->depth = depth;
1145 }
1146
1147 /* The grammar understood by the parser is as follows.
1148
1149 regexp:
1150 regexp OR branch
1151 branch
1152
1153 branch:
1154 branch closure
1155 closure
1156
1157 closure:
1158 closure QMARK
1159 closure STAR
1160 closure PLUS
1161 closure REPMN
1162 atom
1163
1164 atom:
1165 <normal character>
1166 <multibyte character>
1167 ANYCHAR
1168 MBCSET
1169 CSET
1170 BACKREF
1171 BEGLINE
1172 ENDLINE
1173 BEGWORD
1174 ENDWORD
1175 LIMWORD
1176 NOTLIMWORD
1177 CRANGE
1178 LPAREN regexp RPAREN
1179 <empty>
1180
1181 The parser builds a parse tree in postfix form in an array of tokens. */
1182
1183 static void
atom(void)1184 atom (void)
1185 {
1186 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1187 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1188 #ifdef MBS_SUPPORT
1189 || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1190 #endif /* MBS_SUPPORT */
1191 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1192 {
1193 addtok(tok);
1194 tok = lex();
1195 #ifdef MBS_SUPPORT
1196 /* We treat a multibyte character as a single atom, so that DFA
1197 can treat a multibyte character as a single expression.
1198
1199 e.g. We construct following tree from "<mb1><mb2>".
1200 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1201 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1202 */
1203 if (MB_CUR_MAX > 1)
1204 {
1205 while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1206 {
1207 addtok(tok);
1208 addtok(CAT);
1209 tok = lex();
1210 }
1211 }
1212 #endif /* MBS_SUPPORT */
1213 }
1214 else if (tok == CRANGE)
1215 {
1216 /* A character range like "[a-z]" in a locale other than "C" or
1217 "POSIX". This range might any sequence of one or more
1218 characters. Unfortunately the POSIX locale primitives give
1219 us no practical way to find what character sequences might be
1220 matched. Treat this approximately like "(.\1)" -- i.e. match
1221 one character, and then punt to the full matcher. */
1222 charclass ccl;
1223 zeroset (ccl);
1224 notset (ccl);
1225 addtok (CSET + charclass_index (ccl));
1226 addtok (BACKREF);
1227 addtok (CAT);
1228 tok = lex ();
1229 }
1230 else if (tok == LPAREN)
1231 {
1232 tok = lex();
1233 regexp(0);
1234 if (tok != RPAREN)
1235 dfaerror(_("Unbalanced ("));
1236 tok = lex();
1237 }
1238 else
1239 addtok(EMPTY);
1240 }
1241
1242 /* Return the number of tokens in the given subexpression. */
1243 static int
nsubtoks(int tindex)1244 nsubtoks (int tindex)
1245 {
1246 int ntoks1;
1247
1248 switch (dfa->tokens[tindex - 1])
1249 {
1250 default:
1251 return 1;
1252 case QMARK:
1253 case STAR:
1254 case PLUS:
1255 return 1 + nsubtoks(tindex - 1);
1256 case CAT:
1257 case OR:
1258 case ORTOP:
1259 ntoks1 = nsubtoks(tindex - 1);
1260 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1261 }
1262 }
1263
1264 /* Copy the given subexpression to the top of the tree. */
1265 static void
copytoks(int tindex,int ntokens)1266 copytoks (int tindex, int ntokens)
1267 {
1268 int i;
1269
1270 for (i = 0; i < ntokens; ++i)
1271 addtok(dfa->tokens[tindex + i]);
1272 }
1273
1274 static void
closure(void)1275 closure (void)
1276 {
1277 int tindex, ntokens, i;
1278
1279 atom();
1280 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1281 if (tok == REPMN)
1282 {
1283 ntokens = nsubtoks(dfa->tindex);
1284 tindex = dfa->tindex - ntokens;
1285 if (maxrep < 0)
1286 addtok(PLUS);
1287 if (minrep == 0)
1288 addtok(QMARK);
1289 for (i = 1; i < minrep; ++i)
1290 {
1291 copytoks(tindex, ntokens);
1292 addtok(CAT);
1293 }
1294 for (; i < maxrep; ++i)
1295 {
1296 copytoks(tindex, ntokens);
1297 addtok(QMARK);
1298 addtok(CAT);
1299 }
1300 tok = lex();
1301 }
1302 else
1303 {
1304 addtok(tok);
1305 tok = lex();
1306 }
1307 }
1308
1309 static void
branch(void)1310 branch (void)
1311 {
1312 closure();
1313 while (tok != RPAREN && tok != OR && tok >= 0)
1314 {
1315 closure();
1316 addtok(CAT);
1317 }
1318 }
1319
1320 static void
regexp(int toplevel)1321 regexp (int toplevel)
1322 {
1323 branch();
1324 while (tok == OR)
1325 {
1326 tok = lex();
1327 branch();
1328 if (toplevel)
1329 addtok(ORTOP);
1330 else
1331 addtok(OR);
1332 }
1333 }
1334
1335 /* Main entry point for the parser. S is a string to be parsed, len is the
1336 length of the string, so s can include NUL characters. D is a pointer to
1337 the struct dfa to parse into. */
1338 void
dfaparse(char const * s,size_t len,struct dfa * d)1339 dfaparse (char const *s, size_t len, struct dfa *d)
1340 {
1341 dfa = d;
1342 lexstart = lexptr = s;
1343 lexleft = len;
1344 lasttok = END;
1345 laststart = 1;
1346 parens = 0;
1347 #if ENABLE_NLS
1348 hard_LC_COLLATE = hard_locale (LC_COLLATE);
1349 #endif
1350 #ifdef MBS_SUPPORT
1351 if (MB_CUR_MAX > 1)
1352 {
1353 cur_mb_index = 0;
1354 cur_mb_len = 0;
1355 memset(&mbs, 0, sizeof(mbstate_t));
1356 }
1357 #endif /* MBS_SUPPORT */
1358
1359 if (! syntax_bits_set)
1360 dfaerror(_("No syntax specified"));
1361
1362 tok = lex();
1363 depth = d->depth;
1364
1365 regexp(1);
1366
1367 if (tok != END)
1368 dfaerror(_("Unbalanced )"));
1369
1370 addtok(END - d->nregexps);
1371 addtok(CAT);
1372
1373 if (d->nregexps)
1374 addtok(ORTOP);
1375
1376 ++d->nregexps;
1377 }
1378
1379 /* Some primitives for operating on sets of positions. */
1380
1381 /* Copy one set to another; the destination must be large enough. */
1382 static void
copy(position_set const * src,position_set * dst)1383 copy (position_set const *src, position_set *dst)
1384 {
1385 int i;
1386
1387 for (i = 0; i < src->nelem; ++i)
1388 dst->elems[i] = src->elems[i];
1389 dst->nelem = src->nelem;
1390 }
1391
1392 /* Insert a position in a set. Position sets are maintained in sorted
1393 order according to index. If position already exists in the set with
1394 the same index then their constraints are logically or'd together.
1395 S->elems must point to an array large enough to hold the resulting set. */
1396 static void
insert(position p,position_set * s)1397 insert (position p, position_set *s)
1398 {
1399 int i;
1400 position t1, t2;
1401
1402 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1403 continue;
1404 if (i < s->nelem && p.index == s->elems[i].index)
1405 s->elems[i].constraint |= p.constraint;
1406 else
1407 {
1408 t1 = p;
1409 ++s->nelem;
1410 while (i < s->nelem)
1411 {
1412 t2 = s->elems[i];
1413 s->elems[i++] = t1;
1414 t1 = t2;
1415 }
1416 }
1417 }
1418
1419 /* Merge two sets of positions into a third. The result is exactly as if
1420 the positions of both sets were inserted into an initially empty set. */
1421 static void
merge(position_set const * s1,position_set const * s2,position_set * m)1422 merge (position_set const *s1, position_set const *s2, position_set *m)
1423 {
1424 int i = 0, j = 0;
1425
1426 m->nelem = 0;
1427 while (i < s1->nelem && j < s2->nelem)
1428 if (s1->elems[i].index > s2->elems[j].index)
1429 m->elems[m->nelem++] = s1->elems[i++];
1430 else if (s1->elems[i].index < s2->elems[j].index)
1431 m->elems[m->nelem++] = s2->elems[j++];
1432 else
1433 {
1434 m->elems[m->nelem] = s1->elems[i++];
1435 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1436 }
1437 while (i < s1->nelem)
1438 m->elems[m->nelem++] = s1->elems[i++];
1439 while (j < s2->nelem)
1440 m->elems[m->nelem++] = s2->elems[j++];
1441 }
1442
1443 /* Delete a position from a set. */
1444 static void
delete(position p,position_set * s)1445 delete (position p, position_set *s)
1446 {
1447 int i;
1448
1449 for (i = 0; i < s->nelem; ++i)
1450 if (p.index == s->elems[i].index)
1451 break;
1452 if (i < s->nelem)
1453 for (--s->nelem; i < s->nelem; ++i)
1454 s->elems[i] = s->elems[i + 1];
1455 }
1456
1457 /* Find the index of the state corresponding to the given position set with
1458 the given preceding context, or create a new state if there is no such
1459 state. Newline and letter tell whether we got here on a newline or
1460 letter, respectively. */
1461 static int
state_index(struct dfa * d,position_set const * s,int newline,int letter)1462 state_index (struct dfa *d, position_set const *s, int newline, int letter)
1463 {
1464 int hash = 0;
1465 int constraint;
1466 int i, j;
1467
1468 newline = newline ? 1 : 0;
1469 letter = letter ? 1 : 0;
1470
1471 for (i = 0; i < s->nelem; ++i)
1472 hash ^= s->elems[i].index + s->elems[i].constraint;
1473
1474 /* Try to find a state that exactly matches the proposed one. */
1475 for (i = 0; i < d->sindex; ++i)
1476 {
1477 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1478 || newline != d->states[i].newline || letter != d->states[i].letter)
1479 continue;
1480 for (j = 0; j < s->nelem; ++j)
1481 if (s->elems[j].constraint
1482 != d->states[i].elems.elems[j].constraint
1483 || s->elems[j].index != d->states[i].elems.elems[j].index)
1484 break;
1485 if (j == s->nelem)
1486 return i;
1487 }
1488
1489 /* We'll have to create a new state. */
1490 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1491 d->states[i].hash = hash;
1492 MALLOC(d->states[i].elems.elems, position, s->nelem);
1493 copy(s, &d->states[i].elems);
1494 d->states[i].newline = newline;
1495 d->states[i].letter = letter;
1496 d->states[i].backref = 0;
1497 d->states[i].constraint = 0;
1498 d->states[i].first_end = 0;
1499 #ifdef MBS_SUPPORT
1500 if (MB_CUR_MAX > 1)
1501 d->states[i].mbps.nelem = 0;
1502 #endif
1503 for (j = 0; j < s->nelem; ++j)
1504 if (d->tokens[s->elems[j].index] < 0)
1505 {
1506 constraint = s->elems[j].constraint;
1507 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1508 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1509 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1510 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1511 d->states[i].constraint |= constraint;
1512 if (! d->states[i].first_end)
1513 d->states[i].first_end = d->tokens[s->elems[j].index];
1514 }
1515 else if (d->tokens[s->elems[j].index] == BACKREF)
1516 {
1517 d->states[i].constraint = NO_CONSTRAINT;
1518 d->states[i].backref = 1;
1519 }
1520
1521 ++d->sindex;
1522
1523 return i;
1524 }
1525
1526 /* Find the epsilon closure of a set of positions. If any position of the set
1527 contains a symbol that matches the empty string in some context, replace
1528 that position with the elements of its follow labeled with an appropriate
1529 constraint. Repeat exhaustively until no funny positions are left.
1530 S->elems must be large enough to hold the result. */
1531 static void
epsclosure(position_set * s,struct dfa const * d)1532 epsclosure (position_set *s, struct dfa const *d)
1533 {
1534 int i, j;
1535 int *visited;
1536 position p, old;
1537
1538 MALLOC(visited, int, d->tindex);
1539 for (i = 0; i < d->tindex; ++i)
1540 visited[i] = 0;
1541
1542 for (i = 0; i < s->nelem; ++i)
1543 if (d->tokens[s->elems[i].index] >= NOTCHAR
1544 && d->tokens[s->elems[i].index] != BACKREF
1545 #ifdef MBS_SUPPORT
1546 && d->tokens[s->elems[i].index] != ANYCHAR
1547 && d->tokens[s->elems[i].index] != MBCSET
1548 #endif
1549 && d->tokens[s->elems[i].index] < CSET)
1550 {
1551 old = s->elems[i];
1552 p.constraint = old.constraint;
1553 delete(s->elems[i], s);
1554 if (visited[old.index])
1555 {
1556 --i;
1557 continue;
1558 }
1559 visited[old.index] = 1;
1560 switch (d->tokens[old.index])
1561 {
1562 case BEGLINE:
1563 p.constraint &= BEGLINE_CONSTRAINT;
1564 break;
1565 case ENDLINE:
1566 p.constraint &= ENDLINE_CONSTRAINT;
1567 break;
1568 case BEGWORD:
1569 p.constraint &= BEGWORD_CONSTRAINT;
1570 break;
1571 case ENDWORD:
1572 p.constraint &= ENDWORD_CONSTRAINT;
1573 break;
1574 case LIMWORD:
1575 p.constraint &= LIMWORD_CONSTRAINT;
1576 break;
1577 case NOTLIMWORD:
1578 p.constraint &= NOTLIMWORD_CONSTRAINT;
1579 break;
1580 default:
1581 break;
1582 }
1583 for (j = 0; j < d->follows[old.index].nelem; ++j)
1584 {
1585 p.index = d->follows[old.index].elems[j].index;
1586 insert(p, s);
1587 }
1588 /* Force rescan to start at the beginning. */
1589 i = -1;
1590 }
1591
1592 free(visited);
1593 }
1594
1595 /* Perform bottom-up analysis on the parse tree, computing various functions.
1596 Note that at this point, we're pretending constructs like \< are real
1597 characters rather than constraints on what can follow them.
1598
1599 Nullable: A node is nullable if it is at the root of a regexp that can
1600 match the empty string.
1601 * EMPTY leaves are nullable.
1602 * No other leaf is nullable.
1603 * A QMARK or STAR node is nullable.
1604 * A PLUS node is nullable if its argument is nullable.
1605 * A CAT node is nullable if both its arguments are nullable.
1606 * An OR node is nullable if either argument is nullable.
1607
1608 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1609 that could correspond to the first character of a string matching the
1610 regexp rooted at the given node.
1611 * EMPTY leaves have empty firstpos.
1612 * The firstpos of a nonempty leaf is that leaf itself.
1613 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1614 argument.
1615 * The firstpos of a CAT node is the firstpos of the left argument, union
1616 the firstpos of the right if the left argument is nullable.
1617 * The firstpos of an OR node is the union of firstpos of each argument.
1618
1619 Lastpos: The lastpos of a node is the set of positions that could
1620 correspond to the last character of a string matching the regexp at
1621 the given node.
1622 * EMPTY leaves have empty lastpos.
1623 * The lastpos of a nonempty leaf is that leaf itself.
1624 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1625 argument.
1626 * The lastpos of a CAT node is the lastpos of its right argument, union
1627 the lastpos of the left if the right argument is nullable.
1628 * The lastpos of an OR node is the union of the lastpos of each argument.
1629
1630 Follow: The follow of a position is the set of positions that could
1631 correspond to the character following a character matching the node in
1632 a string matching the regexp. At this point we consider special symbols
1633 that match the empty string in some context to be just normal characters.
1634 Later, if we find that a special symbol is in a follow set, we will
1635 replace it with the elements of its follow, labeled with an appropriate
1636 constraint.
1637 * Every node in the firstpos of the argument of a STAR or PLUS node is in
1638 the follow of every node in the lastpos.
1639 * Every node in the firstpos of the second argument of a CAT node is in
1640 the follow of every node in the lastpos of the first argument.
1641
1642 Because of the postfix representation of the parse tree, the depth-first
1643 analysis is conveniently done by a linear scan with the aid of a stack.
1644 Sets are stored as arrays of the elements, obeying a stack-like allocation
1645 scheme; the number of elements in each set deeper in the stack can be
1646 used to determine the address of a particular set's array. */
1647 void
dfaanalyze(struct dfa * d,int searchflag)1648 dfaanalyze (struct dfa *d, int searchflag)
1649 {
1650 int *nullable; /* Nullable stack. */
1651 int *nfirstpos; /* Element count stack for firstpos sets. */
1652 position *firstpos; /* Array where firstpos elements are stored. */
1653 int *nlastpos; /* Element count stack for lastpos sets. */
1654 position *lastpos; /* Array where lastpos elements are stored. */
1655 int *nalloc; /* Sizes of arrays allocated to follow sets. */
1656 position_set tmp; /* Temporary set for merging sets. */
1657 position_set merged; /* Result of merging sets. */
1658 int wants_newline; /* True if some position wants newline info. */
1659 int *o_nullable;
1660 int *o_nfirst, *o_nlast;
1661 position *o_firstpos, *o_lastpos;
1662 int i, j;
1663 position *pos;
1664
1665 #ifdef DEBUG
1666 fprintf(stderr, "dfaanalyze:\n");
1667 for (i = 0; i < d->tindex; ++i)
1668 {
1669 fprintf(stderr, " %d:", i);
1670 prtok(d->tokens[i]);
1671 }
1672 putc('\n', stderr);
1673 #endif
1674
1675 d->searchflag = searchflag;
1676
1677 MALLOC(nullable, int, d->depth);
1678 o_nullable = nullable;
1679 MALLOC(nfirstpos, int, d->depth);
1680 o_nfirst = nfirstpos;
1681 MALLOC(firstpos, position, d->nleaves);
1682 o_firstpos = firstpos, firstpos += d->nleaves;
1683 MALLOC(nlastpos, int, d->depth);
1684 o_nlast = nlastpos;
1685 MALLOC(lastpos, position, d->nleaves);
1686 o_lastpos = lastpos, lastpos += d->nleaves;
1687 MALLOC(nalloc, int, d->tindex);
1688 for (i = 0; i < d->tindex; ++i)
1689 nalloc[i] = 0;
1690 MALLOC(merged.elems, position, d->nleaves);
1691
1692 CALLOC(d->follows, position_set, d->tindex);
1693
1694 for (i = 0; i < d->tindex; ++i)
1695 #ifdef DEBUG
1696 { /* Nonsyntactic #ifdef goo... */
1697 #endif
1698 switch (d->tokens[i])
1699 {
1700 case EMPTY:
1701 /* The empty set is nullable. */
1702 *nullable++ = 1;
1703
1704 /* The firstpos and lastpos of the empty leaf are both empty. */
1705 *nfirstpos++ = *nlastpos++ = 0;
1706 break;
1707
1708 case STAR:
1709 case PLUS:
1710 /* Every element in the firstpos of the argument is in the follow
1711 of every element in the lastpos. */
1712 tmp.nelem = nfirstpos[-1];
1713 tmp.elems = firstpos;
1714 pos = lastpos;
1715 for (j = 0; j < nlastpos[-1]; ++j)
1716 {
1717 merge(&tmp, &d->follows[pos[j].index], &merged);
1718 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1719 nalloc[pos[j].index], merged.nelem - 1);
1720 copy(&merged, &d->follows[pos[j].index]);
1721 }
1722
1723 case QMARK:
1724 /* A QMARK or STAR node is automatically nullable. */
1725 if (d->tokens[i] != PLUS)
1726 nullable[-1] = 1;
1727 break;
1728
1729 case CAT:
1730 /* Every element in the firstpos of the second argument is in the
1731 follow of every element in the lastpos of the first argument. */
1732 tmp.nelem = nfirstpos[-1];
1733 tmp.elems = firstpos;
1734 pos = lastpos + nlastpos[-1];
1735 for (j = 0; j < nlastpos[-2]; ++j)
1736 {
1737 merge(&tmp, &d->follows[pos[j].index], &merged);
1738 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1739 nalloc[pos[j].index], merged.nelem - 1);
1740 copy(&merged, &d->follows[pos[j].index]);
1741 }
1742
1743 /* The firstpos of a CAT node is the firstpos of the first argument,
1744 union that of the second argument if the first is nullable. */
1745 if (nullable[-2])
1746 nfirstpos[-2] += nfirstpos[-1];
1747 else
1748 firstpos += nfirstpos[-1];
1749 --nfirstpos;
1750
1751 /* The lastpos of a CAT node is the lastpos of the second argument,
1752 union that of the first argument if the second is nullable. */
1753 if (nullable[-1])
1754 nlastpos[-2] += nlastpos[-1];
1755 else
1756 {
1757 pos = lastpos + nlastpos[-2];
1758 for (j = nlastpos[-1] - 1; j >= 0; --j)
1759 pos[j] = lastpos[j];
1760 lastpos += nlastpos[-2];
1761 nlastpos[-2] = nlastpos[-1];
1762 }
1763 --nlastpos;
1764
1765 /* A CAT node is nullable if both arguments are nullable. */
1766 nullable[-2] = nullable[-1] && nullable[-2];
1767 --nullable;
1768 break;
1769
1770 case OR:
1771 case ORTOP:
1772 /* The firstpos is the union of the firstpos of each argument. */
1773 nfirstpos[-2] += nfirstpos[-1];
1774 --nfirstpos;
1775
1776 /* The lastpos is the union of the lastpos of each argument. */
1777 nlastpos[-2] += nlastpos[-1];
1778 --nlastpos;
1779
1780 /* An OR node is nullable if either argument is nullable. */
1781 nullable[-2] = nullable[-1] || nullable[-2];
1782 --nullable;
1783 break;
1784
1785 default:
1786 /* Anything else is a nonempty position. (Note that special
1787 constructs like \< are treated as nonempty strings here;
1788 an "epsilon closure" effectively makes them nullable later.
1789 Backreferences have to get a real position so we can detect
1790 transitions on them later. But they are nullable. */
1791 *nullable++ = d->tokens[i] == BACKREF;
1792
1793 /* This position is in its own firstpos and lastpos. */
1794 *nfirstpos++ = *nlastpos++ = 1;
1795 --firstpos, --lastpos;
1796 firstpos->index = lastpos->index = i;
1797 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1798
1799 /* Allocate the follow set for this position. */
1800 nalloc[i] = 1;
1801 MALLOC(d->follows[i].elems, position, nalloc[i]);
1802 break;
1803 }
1804 #ifdef DEBUG
1805 /* ... balance the above nonsyntactic #ifdef goo... */
1806 fprintf(stderr, "node %d:", i);
1807 prtok(d->tokens[i]);
1808 putc('\n', stderr);
1809 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1810 fprintf(stderr, " firstpos:");
1811 for (j = nfirstpos[-1] - 1; j >= 0; --j)
1812 {
1813 fprintf(stderr, " %d:", firstpos[j].index);
1814 prtok(d->tokens[firstpos[j].index]);
1815 }
1816 fprintf(stderr, "\n lastpos:");
1817 for (j = nlastpos[-1] - 1; j >= 0; --j)
1818 {
1819 fprintf(stderr, " %d:", lastpos[j].index);
1820 prtok(d->tokens[lastpos[j].index]);
1821 }
1822 putc('\n', stderr);
1823 }
1824 #endif
1825
1826 /* For each follow set that is the follow set of a real position, replace
1827 it with its epsilon closure. */
1828 for (i = 0; i < d->tindex; ++i)
1829 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1830 #ifdef MBS_SUPPORT
1831 || d->tokens[i] == ANYCHAR
1832 || d->tokens[i] == MBCSET
1833 #endif
1834 || d->tokens[i] >= CSET)
1835 {
1836 #ifdef DEBUG
1837 fprintf(stderr, "follows(%d:", i);
1838 prtok(d->tokens[i]);
1839 fprintf(stderr, "):");
1840 for (j = d->follows[i].nelem - 1; j >= 0; --j)
1841 {
1842 fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1843 prtok(d->tokens[d->follows[i].elems[j].index]);
1844 }
1845 putc('\n', stderr);
1846 #endif
1847 copy(&d->follows[i], &merged);
1848 epsclosure(&merged, d);
1849 if (d->follows[i].nelem < merged.nelem)
1850 REALLOC(d->follows[i].elems, position, merged.nelem);
1851 copy(&merged, &d->follows[i]);
1852 }
1853
1854 /* Get the epsilon closure of the firstpos of the regexp. The result will
1855 be the set of positions of state 0. */
1856 merged.nelem = 0;
1857 for (i = 0; i < nfirstpos[-1]; ++i)
1858 insert(firstpos[i], &merged);
1859 epsclosure(&merged, d);
1860
1861 /* Check if any of the positions of state 0 will want newline context. */
1862 wants_newline = 0;
1863 for (i = 0; i < merged.nelem; ++i)
1864 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1865 wants_newline = 1;
1866
1867 /* Build the initial state. */
1868 d->salloc = 1;
1869 d->sindex = 0;
1870 MALLOC(d->states, dfa_state, d->salloc);
1871 state_index(d, &merged, wants_newline, 0);
1872
1873 free(o_nullable);
1874 free(o_nfirst);
1875 free(o_firstpos);
1876 free(o_nlast);
1877 free(o_lastpos);
1878 free(nalloc);
1879 free(merged.elems);
1880 }
1881
1882 /* Find, for each character, the transition out of state s of d, and store
1883 it in the appropriate slot of trans.
1884
1885 We divide the positions of s into groups (positions can appear in more
1886 than one group). Each group is labeled with a set of characters that
1887 every position in the group matches (taking into account, if necessary,
1888 preceding context information of s). For each group, find the union
1889 of the its elements' follows. This set is the set of positions of the
1890 new state. For each character in the group's label, set the transition
1891 on this character to be to a state corresponding to the set's positions,
1892 and its associated backward context information, if necessary.
1893
1894 If we are building a searching matcher, we include the positions of state
1895 0 in every state.
1896
1897 The collection of groups is constructed by building an equivalence-class
1898 partition of the positions of s.
1899
1900 For each position, find the set of characters C that it matches. Eliminate
1901 any characters from C that fail on grounds of backward context.
1902
1903 Search through the groups, looking for a group whose label L has nonempty
1904 intersection with C. If L - C is nonempty, create a new group labeled
1905 L - C and having the same positions as the current group, and set L to
1906 the intersection of L and C. Insert the position in this group, set
1907 C = C - L, and resume scanning.
1908
1909 If after comparing with every group there are characters remaining in C,
1910 create a new group labeled with the characters of C and insert this
1911 position in that group. */
1912 void
dfastate(int s,struct dfa * d,int trans[])1913 dfastate (int s, struct dfa *d, int trans[])
1914 {
1915 position_set grps[NOTCHAR]; /* As many as will ever be needed. */
1916 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
1917 int ngrps = 0; /* Number of groups actually used. */
1918 position pos; /* Current position being considered. */
1919 charclass matches; /* Set of matching characters. */
1920 int matchesf; /* True if matches is nonempty. */
1921 charclass intersect; /* Intersection with some label set. */
1922 int intersectf; /* True if intersect is nonempty. */
1923 charclass leftovers; /* Stuff in the label that didn't match. */
1924 int leftoversf; /* True if leftovers is nonempty. */
1925 static charclass letters; /* Set of characters considered letters. */
1926 static charclass newline; /* Set of characters that aren't newline. */
1927 position_set follows; /* Union of the follows of some group. */
1928 position_set tmp; /* Temporary space for merging sets. */
1929 int state; /* New state. */
1930 int wants_newline; /* New state wants to know newline context. */
1931 int state_newline; /* New state on a newline transition. */
1932 int wants_letter; /* New state wants to know letter context. */
1933 int state_letter; /* New state on a letter transition. */
1934 static int initialized; /* Flag for static initialization. */
1935 #ifdef MBS_SUPPORT
1936 int next_isnt_1st_byte = 0; /* Flag If we can't add state0. */
1937 #endif
1938 int i, j, k;
1939
1940 /* Initialize the set of letters, if necessary. */
1941 if (! initialized)
1942 {
1943 initialized = 1;
1944 for (i = 0; i < NOTCHAR; ++i)
1945 if (IS_WORD_CONSTITUENT(i))
1946 setbit(i, letters);
1947 setbit(eolbyte, newline);
1948 }
1949
1950 zeroset(matches);
1951
1952 for (i = 0; i < d->states[s].elems.nelem; ++i)
1953 {
1954 pos = d->states[s].elems.elems[i];
1955 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1956 setbit(d->tokens[pos.index], matches);
1957 else if (d->tokens[pos.index] >= CSET)
1958 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1959 #ifdef MBS_SUPPORT
1960 else if (d->tokens[pos.index] == ANYCHAR
1961 || d->tokens[pos.index] == MBCSET)
1962 /* MB_CUR_MAX > 1 */
1963 {
1964 /* ANYCHAR and MBCSET must match with a single character, so we
1965 must put it to d->states[s].mbps, which contains the positions
1966 which can match with a single character not a byte. */
1967 if (d->states[s].mbps.nelem == 0)
1968 {
1969 MALLOC(d->states[s].mbps.elems, position,
1970 d->states[s].elems.nelem);
1971 }
1972 insert(pos, &(d->states[s].mbps));
1973 continue;
1974 }
1975 #endif /* MBS_SUPPORT */
1976 else
1977 continue;
1978
1979 /* Some characters may need to be eliminated from matches because
1980 they fail in the current context. */
1981 if (pos.constraint != 0xFF)
1982 {
1983 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1984 d->states[s].newline, 1))
1985 clrbit(eolbyte, matches);
1986 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1987 d->states[s].newline, 0))
1988 for (j = 0; j < CHARCLASS_INTS; ++j)
1989 matches[j] &= newline[j];
1990 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1991 d->states[s].letter, 1))
1992 for (j = 0; j < CHARCLASS_INTS; ++j)
1993 matches[j] &= ~letters[j];
1994 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1995 d->states[s].letter, 0))
1996 for (j = 0; j < CHARCLASS_INTS; ++j)
1997 matches[j] &= letters[j];
1998
1999 /* If there are no characters left, there's no point in going on. */
2000 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2001 continue;
2002 if (j == CHARCLASS_INTS)
2003 continue;
2004 }
2005
2006 for (j = 0; j < ngrps; ++j)
2007 {
2008 /* If matches contains a single character only, and the current
2009 group's label doesn't contain that character, go on to the
2010 next group. */
2011 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2012 && !tstbit(d->tokens[pos.index], labels[j]))
2013 continue;
2014
2015 /* Check if this group's label has a nonempty intersection with
2016 matches. */
2017 intersectf = 0;
2018 for (k = 0; k < CHARCLASS_INTS; ++k)
2019 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2020 if (! intersectf)
2021 continue;
2022
2023 /* It does; now find the set differences both ways. */
2024 leftoversf = matchesf = 0;
2025 for (k = 0; k < CHARCLASS_INTS; ++k)
2026 {
2027 /* Even an optimizing compiler can't know this for sure. */
2028 int match = matches[k], label = labels[j][k];
2029
2030 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2031 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2032 }
2033
2034 /* If there were leftovers, create a new group labeled with them. */
2035 if (leftoversf)
2036 {
2037 copyset(leftovers, labels[ngrps]);
2038 copyset(intersect, labels[j]);
2039 MALLOC(grps[ngrps].elems, position, d->nleaves);
2040 copy(&grps[j], &grps[ngrps]);
2041 ++ngrps;
2042 }
2043
2044 /* Put the position in the current group. Note that there is no
2045 reason to call insert() here. */
2046 grps[j].elems[grps[j].nelem++] = pos;
2047
2048 /* If every character matching the current position has been
2049 accounted for, we're done. */
2050 if (! matchesf)
2051 break;
2052 }
2053
2054 /* If we've passed the last group, and there are still characters
2055 unaccounted for, then we'll have to create a new group. */
2056 if (j == ngrps)
2057 {
2058 copyset(matches, labels[ngrps]);
2059 zeroset(matches);
2060 MALLOC(grps[ngrps].elems, position, d->nleaves);
2061 grps[ngrps].nelem = 1;
2062 grps[ngrps].elems[0] = pos;
2063 ++ngrps;
2064 }
2065 }
2066
2067 MALLOC(follows.elems, position, d->nleaves);
2068 MALLOC(tmp.elems, position, d->nleaves);
2069
2070 /* If we are a searching matcher, the default transition is to a state
2071 containing the positions of state 0, otherwise the default transition
2072 is to fail miserably. */
2073 if (d->searchflag)
2074 {
2075 wants_newline = 0;
2076 wants_letter = 0;
2077 for (i = 0; i < d->states[0].elems.nelem; ++i)
2078 {
2079 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2080 wants_newline = 1;
2081 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2082 wants_letter = 1;
2083 }
2084 copy(&d->states[0].elems, &follows);
2085 state = state_index(d, &follows, 0, 0);
2086 if (wants_newline)
2087 state_newline = state_index(d, &follows, 1, 0);
2088 else
2089 state_newline = state;
2090 if (wants_letter)
2091 state_letter = state_index(d, &follows, 0, 1);
2092 else
2093 state_letter = state;
2094 for (i = 0; i < NOTCHAR; ++i)
2095 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2096 trans[eolbyte] = state_newline;
2097 }
2098 else
2099 for (i = 0; i < NOTCHAR; ++i)
2100 trans[i] = -1;
2101
2102 for (i = 0; i < ngrps; ++i)
2103 {
2104 follows.nelem = 0;
2105
2106 /* Find the union of the follows of the positions of the group.
2107 This is a hideously inefficient loop. Fix it someday. */
2108 for (j = 0; j < grps[i].nelem; ++j)
2109 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2110 insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2111
2112 #ifdef MBS_SUPPORT
2113 if (MB_CUR_MAX > 1)
2114 {
2115 /* If a token in follows.elems is not 1st byte of a multibyte
2116 character, or the states of follows must accept the bytes
2117 which are not 1st byte of the multibyte character.
2118 Then, if a state of follows encounter a byte, it must not be
2119 a 1st byte of a multibyte character nor singlebyte character.
2120 We cansel to add state[0].follows to next state, because
2121 state[0] must accept 1st-byte
2122
2123 For example, we assume <sb a> is a certain singlebyte
2124 character, <mb A> is a certain multibyte character, and the
2125 codepoint of <sb a> equals the 2nd byte of the codepoint of
2126 <mb A>.
2127 When state[0] accepts <sb a>, state[i] transit to state[i+1]
2128 by accepting accepts 1st byte of <mb A>, and state[i+1]
2129 accepts 2nd byte of <mb A>, if state[i+1] encounter the
2130 codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2131 <mb A>, so we can not add state[0]. */
2132
2133 next_isnt_1st_byte = 0;
2134 for (j = 0; j < follows.nelem; ++j)
2135 {
2136 if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2137 {
2138 next_isnt_1st_byte = 1;
2139 break;
2140 }
2141 }
2142 }
2143 #endif
2144
2145 /* If we are building a searching matcher, throw in the positions
2146 of state 0 as well. */
2147 #ifdef MBS_SUPPORT
2148 if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
2149 #else
2150 if (d->searchflag)
2151 #endif
2152 for (j = 0; j < d->states[0].elems.nelem; ++j)
2153 insert(d->states[0].elems.elems[j], &follows);
2154
2155 /* Find out if the new state will want any context information. */
2156 wants_newline = 0;
2157 if (tstbit(eolbyte, labels[i]))
2158 for (j = 0; j < follows.nelem; ++j)
2159 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2160 wants_newline = 1;
2161
2162 wants_letter = 0;
2163 for (j = 0; j < CHARCLASS_INTS; ++j)
2164 if (labels[i][j] & letters[j])
2165 break;
2166 if (j < CHARCLASS_INTS)
2167 for (j = 0; j < follows.nelem; ++j)
2168 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2169 wants_letter = 1;
2170
2171 /* Find the state(s) corresponding to the union of the follows. */
2172 state = state_index(d, &follows, 0, 0);
2173 if (wants_newline)
2174 state_newline = state_index(d, &follows, 1, 0);
2175 else
2176 state_newline = state;
2177 if (wants_letter)
2178 state_letter = state_index(d, &follows, 0, 1);
2179 else
2180 state_letter = state;
2181
2182 /* Set the transitions for each character in the current label. */
2183 for (j = 0; j < CHARCLASS_INTS; ++j)
2184 for (k = 0; k < INTBITS; ++k)
2185 if (labels[i][j] & 1 << k)
2186 {
2187 int c = j * INTBITS + k;
2188
2189 if (c == eolbyte)
2190 trans[c] = state_newline;
2191 else if (IS_WORD_CONSTITUENT(c))
2192 trans[c] = state_letter;
2193 else if (c < NOTCHAR)
2194 trans[c] = state;
2195 }
2196 }
2197
2198 for (i = 0; i < ngrps; ++i)
2199 free(grps[i].elems);
2200 free(follows.elems);
2201 free(tmp.elems);
2202 }
2203
2204 /* Some routines for manipulating a compiled dfa's transition tables.
2205 Each state may or may not have a transition table; if it does, and it
2206 is a non-accepting state, then d->trans[state] points to its table.
2207 If it is an accepting state then d->fails[state] points to its table.
2208 If it has no table at all, then d->trans[state] is NULL.
2209 TODO: Improve this comment, get rid of the unnecessary redundancy. */
2210
2211 static void
build_state(int s,struct dfa * d)2212 build_state (int s, struct dfa *d)
2213 {
2214 int *trans; /* The new transition table. */
2215 int i;
2216
2217 /* Set an upper limit on the number of transition tables that will ever
2218 exist at once. 1024 is arbitrary. The idea is that the frequently
2219 used transition tables will be quickly rebuilt, whereas the ones that
2220 were only needed once or twice will be cleared away. */
2221 if (d->trcount >= 1024)
2222 {
2223 for (i = 0; i < d->tralloc; ++i)
2224 if (d->trans[i])
2225 {
2226 free((ptr_t) d->trans[i]);
2227 d->trans[i] = NULL;
2228 }
2229 else if (d->fails[i])
2230 {
2231 free((ptr_t) d->fails[i]);
2232 d->fails[i] = NULL;
2233 }
2234 d->trcount = 0;
2235 }
2236
2237 ++d->trcount;
2238
2239 /* Set up the success bits for this state. */
2240 d->success[s] = 0;
2241 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2242 s, *d))
2243 d->success[s] |= 4;
2244 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2245 s, *d))
2246 d->success[s] |= 2;
2247 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2248 s, *d))
2249 d->success[s] |= 1;
2250
2251 MALLOC(trans, int, NOTCHAR);
2252 dfastate(s, d, trans);
2253
2254 /* Now go through the new transition table, and make sure that the trans
2255 and fail arrays are allocated large enough to hold a pointer for the
2256 largest state mentioned in the table. */
2257 for (i = 0; i < NOTCHAR; ++i)
2258 if (trans[i] >= d->tralloc)
2259 {
2260 int oldalloc = d->tralloc;
2261
2262 while (trans[i] >= d->tralloc)
2263 d->tralloc *= 2;
2264 REALLOC(d->realtrans, int *, d->tralloc + 1);
2265 d->trans = d->realtrans + 1;
2266 REALLOC(d->fails, int *, d->tralloc);
2267 REALLOC(d->success, int, d->tralloc);
2268 while (oldalloc < d->tralloc)
2269 {
2270 d->trans[oldalloc] = NULL;
2271 d->fails[oldalloc++] = NULL;
2272 }
2273 }
2274
2275 /* Newline is a sentinel. */
2276 trans[eolbyte] = -1;
2277
2278 if (ACCEPTING(s, *d))
2279 d->fails[s] = trans;
2280 else
2281 d->trans[s] = trans;
2282 }
2283
2284 static void
build_state_zero(struct dfa * d)2285 build_state_zero (struct dfa *d)
2286 {
2287 d->tralloc = 1;
2288 d->trcount = 0;
2289 CALLOC(d->realtrans, int *, d->tralloc + 1);
2290 d->trans = d->realtrans + 1;
2291 CALLOC(d->fails, int *, d->tralloc);
2292 MALLOC(d->success, int, d->tralloc);
2293 build_state(0, d);
2294 }
2295
2296 #ifdef MBS_SUPPORT
2297 /* Multibyte character handling sub-routins for dfaexec. */
2298
2299 /* Initial state may encounter the byte which is not a singlebyte character
2300 nor 1st byte of a multibyte character. But it is incorrect for initial
2301 state to accept such a byte.
2302 For example, in sjis encoding the regular expression like "\\" accepts
2303 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2304 0x815c. Then Initial state must skip the bytes which are not a singlebyte
2305 character nor 1st byte of a multibyte character. */
2306 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2307 if (s == 0) \
2308 { \
2309 while (inputwcs[p - buf_begin] == 0 \
2310 && mblen_buf[p - buf_begin] > 0 \
2311 && p < buf_end) \
2312 ++p; \
2313 if (p >= end) \
2314 { \
2315 free(mblen_buf); \
2316 free(inputwcs); \
2317 return (size_t) -1; \
2318 } \
2319 }
2320
2321 static void
realloc_trans_if_necessary(struct dfa * d,int new_state)2322 realloc_trans_if_necessary(struct dfa *d, int new_state)
2323 {
2324 /* Make sure that the trans and fail arrays are allocated large enough
2325 to hold a pointer for the new state. */
2326 if (new_state >= d->tralloc)
2327 {
2328 int oldalloc = d->tralloc;
2329
2330 while (new_state >= d->tralloc)
2331 d->tralloc *= 2;
2332 REALLOC(d->realtrans, int *, d->tralloc + 1);
2333 d->trans = d->realtrans + 1;
2334 REALLOC(d->fails, int *, d->tralloc);
2335 REALLOC(d->success, int, d->tralloc);
2336 while (oldalloc < d->tralloc)
2337 {
2338 d->trans[oldalloc] = NULL;
2339 d->fails[oldalloc++] = NULL;
2340 }
2341 }
2342 }
2343
2344 /* Return values of transit_state_singlebyte(), and
2345 transit_state_consume_1char. */
2346 typedef enum
2347 {
2348 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2349 TRANSIT_STATE_DONE, /* State transition has finished. */
2350 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2351 } status_transit_state;
2352
2353 /* Consume a single byte and transit state from 's' to '*next_state'.
2354 This function is almost same as the state transition routin in dfaexec().
2355 But state transition is done just once, otherwise matching succeed or
2356 reach the end of the buffer. */
2357 static status_transit_state
transit_state_singlebyte(struct dfa * d,int s,unsigned char const * p,int * next_state)2358 transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2359 int *next_state)
2360 {
2361 int *t;
2362 int works = s;
2363
2364 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2365
2366 while (rval == TRANSIT_STATE_IN_PROGRESS)
2367 {
2368 if ((t = d->trans[works]) != NULL)
2369 {
2370 works = t[*p];
2371 rval = TRANSIT_STATE_DONE;
2372 if (works < 0)
2373 works = 0;
2374 }
2375 else if (works < 0)
2376 {
2377 if (p == buf_end)
2378 /* At the moment, it must not happen. */
2379 return TRANSIT_STATE_END_BUFFER;
2380 works = 0;
2381 }
2382 else if (d->fails[works])
2383 {
2384 works = d->fails[works][*p];
2385 rval = TRANSIT_STATE_DONE;
2386 }
2387 else
2388 {
2389 build_state(works, d);
2390 }
2391 }
2392 *next_state = works;
2393 return rval;
2394 }
2395
2396 /* Check whether period can match or not in the current context. If it can,
2397 return the amount of the bytes with which period can match, otherwise
2398 return 0.
2399 `pos' is the position of the period. `index' is the index from the
2400 buf_begin, and it is the current position in the buffer. */
2401 static int
match_anychar(struct dfa * d,int s,position pos,int index)2402 match_anychar (struct dfa *d, int s, position pos, int index)
2403 {
2404 int newline = 0;
2405 int letter = 0;
2406 wchar_t wc;
2407 int mbclen;
2408
2409 wc = inputwcs[index];
2410 mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2411
2412 /* Check context. */
2413 if (wc == (wchar_t)eolbyte)
2414 {
2415 if (!(syntax_bits & RE_DOT_NEWLINE))
2416 return 0;
2417 newline = 1;
2418 }
2419 else if (wc == (wchar_t)'\0')
2420 {
2421 if (syntax_bits & RE_DOT_NOT_NULL)
2422 return 0;
2423 newline = 1;
2424 }
2425
2426 if (iswalnum(wc) || wc == L'_')
2427 letter = 1;
2428
2429 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2430 newline, d->states[s].letter, letter))
2431 return 0;
2432
2433 return mbclen;
2434 }
2435
2436 /* Check whether bracket expression can match or not in the current context.
2437 If it can, return the amount of the bytes with which expression can match,
2438 otherwise return 0.
2439 `pos' is the position of the bracket expression. `index' is the index
2440 from the buf_begin, and it is the current position in the buffer. */
2441 int
match_mb_charset(struct dfa * d,int s,position pos,int index)2442 match_mb_charset (struct dfa *d, int s, position pos, int index)
2443 {
2444 int i;
2445 int match; /* Flag which represent that matching succeed. */
2446 int match_len; /* Length of the character (or collating element)
2447 with which this operator match. */
2448 int op_len; /* Length of the operator. */
2449 char buffer[128];
2450 wchar_t wcbuf[6];
2451
2452 /* Pointer to the structure to which we are currently reffering. */
2453 struct mb_char_classes *work_mbc;
2454
2455 int newline = 0;
2456 int letter = 0;
2457 wchar_t wc; /* Current reffering character. */
2458
2459 wc = inputwcs[index];
2460
2461 /* Check context. */
2462 if (wc == (wchar_t)eolbyte)
2463 {
2464 if (!(syntax_bits & RE_DOT_NEWLINE))
2465 return 0;
2466 newline = 1;
2467 }
2468 else if (wc == (wchar_t)'\0')
2469 {
2470 if (syntax_bits & RE_DOT_NOT_NULL)
2471 return 0;
2472 newline = 1;
2473 }
2474 if (iswalnum(wc) || wc == L'_')
2475 letter = 1;
2476 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2477 newline, d->states[s].letter, letter))
2478 return 0;
2479
2480 /* Assign the current reffering operator to work_mbc. */
2481 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2482 match = !work_mbc->invert;
2483 match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2484
2485 /* match with a character class? */
2486 for (i = 0; i<work_mbc->nch_classes; i++)
2487 {
2488 if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2489 goto charset_matched;
2490 }
2491
2492 strncpy(buffer, buf_begin + index, match_len);
2493 buffer[match_len] = '\0';
2494
2495 /* match with an equivalent class? */
2496 for (i = 0; i<work_mbc->nequivs; i++)
2497 {
2498 op_len = strlen(work_mbc->equivs[i]);
2499 strncpy(buffer, buf_begin + index, op_len);
2500 buffer[op_len] = '\0';
2501 if (strcoll(work_mbc->equivs[i], buffer) == 0)
2502 {
2503 match_len = op_len;
2504 goto charset_matched;
2505 }
2506 }
2507
2508 /* match with a collating element? */
2509 for (i = 0; i<work_mbc->ncoll_elems; i++)
2510 {
2511 op_len = strlen(work_mbc->coll_elems[i]);
2512 strncpy(buffer, buf_begin + index, op_len);
2513 buffer[op_len] = '\0';
2514
2515 if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2516 {
2517 match_len = op_len;
2518 goto charset_matched;
2519 }
2520 }
2521
2522 wcbuf[0] = wc;
2523 wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2524
2525 /* match with a range? */
2526 for (i = 0; i<work_mbc->nranges; i++)
2527 {
2528 wcbuf[2] = work_mbc->range_sts[i];
2529 wcbuf[4] = work_mbc->range_ends[i];
2530
2531 if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2532 wcscoll(wcbuf+4, wcbuf) >= 0)
2533 goto charset_matched;
2534 }
2535
2536 /* match with a character? */
2537 for (i = 0; i<work_mbc->nchars; i++)
2538 {
2539 if (wc == work_mbc->chars[i])
2540 goto charset_matched;
2541 }
2542
2543 match = !match;
2544
2545 charset_matched:
2546 return match ? match_len : 0;
2547 }
2548
2549 /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2550 array which corresponds to `d->states[s].mbps.elem' and each element of
2551 the array contains the amount of the bytes with which the element can
2552 match.
2553 `index' is the index from the buf_begin, and it is the current position
2554 in the buffer.
2555 Caller MUST free the array which this function return. */
2556 static int*
check_matching_with_multibyte_ops(struct dfa * d,int s,int index)2557 check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2558 {
2559 int i;
2560 int* rarray;
2561
2562 MALLOC(rarray, int, d->states[s].mbps.nelem);
2563 for (i = 0; i < d->states[s].mbps.nelem; ++i)
2564 {
2565 position pos = d->states[s].mbps.elems[i];
2566 switch(d->tokens[pos.index])
2567 {
2568 case ANYCHAR:
2569 rarray[i] = match_anychar(d, s, pos, index);
2570 break;
2571 case MBCSET:
2572 rarray[i] = match_mb_charset(d, s, pos, index);
2573 break;
2574 default:
2575 break; /* can not happen. */
2576 }
2577 }
2578 return rarray;
2579 }
2580
2581 /* Consume a single character and enumerate all of the positions which can
2582 be next position from the state `s'.
2583 `match_lens' is the input. It can be NULL, but it can also be the output
2584 of check_matching_with_multibyte_ops() for optimization.
2585 `mbclen' and `pps' are the output. `mbclen' is the length of the
2586 character consumed, and `pps' is the set this function enumerate. */
2587 static status_transit_state
transit_state_consume_1char(struct dfa * d,int s,unsigned char const ** pp,int * match_lens,int * mbclen,position_set * pps)2588 transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2589 int *match_lens, int *mbclen, position_set *pps)
2590 {
2591 int i, j;
2592 int s1, s2;
2593 int* work_mbls;
2594 status_transit_state rs = TRANSIT_STATE_DONE;
2595
2596 /* Calculate the length of the (single/multi byte) character
2597 to which p points. */
2598 *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2599 : mblen_buf[*pp - buf_begin];
2600
2601 /* Calculate the state which can be reached from the state `s' by
2602 consuming `*mbclen' single bytes from the buffer. */
2603 s1 = s;
2604 for (i = 0; i < *mbclen; i++)
2605 {
2606 s2 = s1;
2607 rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2608 }
2609 /* Copy the positions contained by `s1' to the set `pps'. */
2610 copy(&(d->states[s1].elems), pps);
2611
2612 /* Check (inputed)match_lens, and initialize if it is NULL. */
2613 if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2614 work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2615 else
2616 work_mbls = match_lens;
2617
2618 /* Add all of the positions which can be reached from `s' by consuming
2619 a single character. */
2620 for (i = 0; i < d->states[s].mbps.nelem ; i++)
2621 {
2622 if (work_mbls[i] == *mbclen)
2623 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2624 j++)
2625 insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2626 pps);
2627 }
2628
2629 if (match_lens == NULL && work_mbls != NULL)
2630 free(work_mbls);
2631 return rs;
2632 }
2633
2634 /* Transit state from s, then return new state and update the pointer of the
2635 buffer. This function is for some operator which can match with a multi-
2636 byte character or a collating element(which may be multi characters). */
2637 static int
transit_state(struct dfa * d,int s,unsigned char const ** pp)2638 transit_state (struct dfa *d, int s, unsigned char const **pp)
2639 {
2640 int s1;
2641 int mbclen; /* The length of current input multibyte character. */
2642 int maxlen = 0;
2643 int i, j;
2644 int *match_lens = NULL;
2645 int nelem = d->states[s].mbps.nelem; /* Just a alias. */
2646 position_set follows;
2647 unsigned char const *p1 = *pp;
2648 status_transit_state rs;
2649 wchar_t wc;
2650
2651 if (nelem > 0)
2652 /* This state has (a) multibyte operator(s).
2653 We check whether each of them can match or not. */
2654 {
2655 /* Note: caller must free the return value of this function. */
2656 match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2657
2658 for (i = 0; i < nelem; i++)
2659 /* Search the operator which match the longest string,
2660 in this state. */
2661 {
2662 if (match_lens[i] > maxlen)
2663 maxlen = match_lens[i];
2664 }
2665 }
2666
2667 if (nelem == 0 || maxlen == 0)
2668 /* This state has no multibyte operator which can match.
2669 We need to check only one singlebyte character. */
2670 {
2671 status_transit_state rs;
2672 rs = transit_state_singlebyte(d, s, *pp, &s1);
2673
2674 /* We must update the pointer if state transition succeeded. */
2675 if (rs == TRANSIT_STATE_DONE)
2676 ++*pp;
2677
2678 if (match_lens != NULL)
2679 free(match_lens);
2680 return s1;
2681 }
2682
2683 /* This state has some operators which can match a multibyte character. */
2684 follows.nelem = 0;
2685 MALLOC(follows.elems, position, d->nleaves);
2686
2687 /* `maxlen' may be longer than the length of a character, because it may
2688 not be a character but a (multi character) collating element.
2689 We enumerate all of the positions which `s' can reach by consuming
2690 `maxlen' bytes. */
2691 rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2692
2693 wc = inputwcs[*pp - mbclen - buf_begin];
2694 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2695 realloc_trans_if_necessary(d, s1);
2696
2697 while (*pp - p1 < maxlen)
2698 {
2699 follows.nelem = 0;
2700 rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2701
2702 for (i = 0; i < nelem ; i++)
2703 {
2704 if (match_lens[i] == *pp - p1)
2705 for (j = 0;
2706 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2707 insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2708 &follows);
2709 }
2710
2711 wc = inputwcs[*pp - mbclen - buf_begin];
2712 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2713 realloc_trans_if_necessary(d, s1);
2714 }
2715 free(match_lens);
2716 free(follows.elems);
2717 return s1;
2718 }
2719
2720 #endif
2721
2722 /* Search through a buffer looking for a match to the given struct dfa.
2723 Find the first occurrence of a string matching the regexp in the buffer,
2724 and the shortest possible version thereof. Return the offset of the first
2725 character after the match, or (size_t) -1 if none is found. BEGIN points to
2726 the beginning of the buffer, and SIZE is the size of the buffer. If SIZE
2727 is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place
2728 where we're supposed to store a 1 if backreferencing happened and the
2729 match needs to be verified by a backtracking matcher. Otherwise
2730 we store a 0 in *backref. */
2731 size_t
dfaexec(struct dfa * d,char const * begin,size_t size,int * backref)2732 dfaexec (struct dfa *d, char const *begin, size_t size, int *backref)
2733 {
2734 register int s; /* Current state. */
2735 register unsigned char const *p; /* Current input character. */
2736 register unsigned char const *end; /* One past the last input character. */
2737 register int **trans, *t; /* Copy of d->trans so it can be optimized
2738 into a register. */
2739 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
2740 static int sbit[NOTCHAR]; /* Table for anding with d->success. */
2741 static int sbit_init;
2742
2743 if (! sbit_init)
2744 {
2745 int i;
2746
2747 sbit_init = 1;
2748 for (i = 0; i < NOTCHAR; ++i)
2749 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2750 sbit[eol] = 4;
2751 }
2752
2753 if (! d->tralloc)
2754 build_state_zero(d);
2755
2756 s = 0;
2757 p = (unsigned char const *) begin;
2758 end = p + size;
2759 trans = d->trans;
2760
2761 #ifdef MBS_SUPPORT
2762 if (MB_CUR_MAX > 1)
2763 {
2764 int remain_bytes, i;
2765 buf_begin = begin;
2766 buf_end = end;
2767
2768 /* initialize mblen_buf, and inputwcs. */
2769 MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
2770 MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
2771 memset(&mbs, 0, sizeof(mbstate_t));
2772 remain_bytes = 0;
2773 for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
2774 {
2775 if (remain_bytes == 0)
2776 {
2777 remain_bytes
2778 = mbrtowc(inputwcs + i, begin + i,
2779 end - (unsigned char const *)begin - i + 1, &mbs);
2780 if (remain_bytes <= 1)
2781 {
2782 remain_bytes = 0;
2783 inputwcs[i] = (wchar_t)begin[i];
2784 mblen_buf[i] = 0;
2785 }
2786 else
2787 {
2788 mblen_buf[i] = remain_bytes;
2789 remain_bytes--;
2790 }
2791 }
2792 else
2793 {
2794 mblen_buf[i] = remain_bytes;
2795 inputwcs[i] = 0;
2796 remain_bytes--;
2797 }
2798 }
2799 mblen_buf[i] = 0;
2800 inputwcs[i] = 0; /* sentinel */
2801 }
2802 #endif /* MBS_SUPPORT */
2803
2804 for (;;)
2805 {
2806 #ifdef MBS_SUPPORT
2807 if (MB_CUR_MAX > 1)
2808 while ((t = trans[s]))
2809 {
2810 if (p == end)
2811 {
2812 free(mblen_buf);
2813 free(inputwcs);
2814 return (size_t) -1;
2815 }
2816 if (d->states[s].mbps.nelem != 0)
2817 {
2818 /* Can match with a multibyte character( and multi character
2819 collating element). */
2820 unsigned char const *nextp;
2821
2822 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2823
2824 nextp = p;
2825 s = transit_state(d, s, &nextp);
2826 p = nextp;
2827
2828 /* Trans table might be updated. */
2829 trans = d->trans;
2830 }
2831 else
2832 {
2833 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2834 s = t[*p++];
2835 }
2836 }
2837 else
2838 #endif /* MBS_SUPPORT */
2839 while ((t = trans[s]))
2840 {
2841 if (p == end)
2842 return (size_t) -1;
2843 s = t[*p++];
2844 }
2845
2846 if (s < 0)
2847 {
2848 if (p == end)
2849 {
2850 #ifdef MBS_SUPPORT
2851 if (MB_CUR_MAX > 1)
2852 {
2853 free(mblen_buf);
2854 free(inputwcs);
2855 }
2856 #endif /* MBS_SUPPORT */
2857 return (size_t) -1;
2858 }
2859 s = 0;
2860 }
2861 else if ((t = d->fails[s]))
2862 {
2863 if (d->success[s] & sbit[*p])
2864 {
2865 if (backref)
2866 *backref = (d->states[s].backref != 0);
2867 #ifdef MBS_SUPPORT
2868 if (MB_CUR_MAX > 1)
2869 {
2870 free(mblen_buf);
2871 free(inputwcs);
2872 }
2873 #endif /* MBS_SUPPORT */
2874 return (char const *) p - begin;
2875 }
2876
2877 #ifdef MBS_SUPPORT
2878 if (MB_CUR_MAX > 1)
2879 {
2880 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2881 if (d->states[s].mbps.nelem != 0)
2882 {
2883 /* Can match with a multibyte character( and multi
2884 character collating element). */
2885 unsigned char const *nextp;
2886 nextp = p;
2887 s = transit_state(d, s, &nextp);
2888 p = nextp;
2889
2890 /* Trans table might be updated. */
2891 trans = d->trans;
2892 }
2893 else
2894 s = t[*p++];
2895 }
2896 else
2897 #endif /* MBS_SUPPORT */
2898 s = t[*p++];
2899 }
2900 else
2901 {
2902 build_state(s, d);
2903 trans = d->trans;
2904 }
2905 }
2906 }
2907
2908 /* Initialize the components of a dfa that the other routines don't
2909 initialize for themselves. */
2910 void
dfainit(struct dfa * d)2911 dfainit (struct dfa *d)
2912 {
2913 d->calloc = 1;
2914 MALLOC(d->charclasses, charclass, d->calloc);
2915 d->cindex = 0;
2916
2917 d->talloc = 1;
2918 MALLOC(d->tokens, token, d->talloc);
2919 d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2920 #ifdef MBS_SUPPORT
2921 if (MB_CUR_MAX > 1)
2922 {
2923 d->nmultibyte_prop = 1;
2924 MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2925 d->nmbcsets = 0;
2926 d->mbcsets_alloc = 1;
2927 MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2928 }
2929 #endif
2930
2931 d->searchflag = 0;
2932 d->tralloc = 0;
2933
2934 d->musts = 0;
2935 }
2936
2937 /* Parse and analyze a single string of the given length. */
2938 void
dfacomp(char const * s,size_t len,struct dfa * d,int searchflag)2939 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
2940 {
2941 if (case_fold) /* dummy folding in service of dfamust() */
2942 {
2943 char *lcopy;
2944 int i;
2945
2946 lcopy = malloc(len);
2947 if (!lcopy)
2948 dfaerror(_("out of memory"));
2949
2950 /* This is a kludge. */
2951 case_fold = 0;
2952 for (i = 0; i < len; ++i)
2953 if (ISUPPER ((unsigned char) s[i]))
2954 lcopy[i] = tolower ((unsigned char) s[i]);
2955 else
2956 lcopy[i] = s[i];
2957
2958 dfainit(d);
2959 dfaparse(lcopy, len, d);
2960 free(lcopy);
2961 dfamust(d);
2962 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2963 case_fold = 1;
2964 dfaparse(s, len, d);
2965 dfaanalyze(d, searchflag);
2966 }
2967 else
2968 {
2969 dfainit(d);
2970 dfaparse(s, len, d);
2971 dfamust(d);
2972 dfaanalyze(d, searchflag);
2973 }
2974 }
2975
2976 /* Free the storage held by the components of a dfa. */
2977 void
dfafree(struct dfa * d)2978 dfafree (struct dfa *d)
2979 {
2980 int i;
2981 struct dfamust *dm, *ndm;
2982
2983 free((ptr_t) d->charclasses);
2984 free((ptr_t) d->tokens);
2985
2986 #ifdef MBS_SUPPORT
2987 if (MB_CUR_MAX > 1)
2988 {
2989 free((ptr_t) d->multibyte_prop);
2990 for (i = 0; i < d->nmbcsets; ++i)
2991 {
2992 int j;
2993 struct mb_char_classes *p = &(d->mbcsets[i]);
2994 if (p->chars != NULL)
2995 free(p->chars);
2996 if (p->ch_classes != NULL)
2997 free(p->ch_classes);
2998 if (p->range_sts != NULL)
2999 free(p->range_sts);
3000 if (p->range_ends != NULL)
3001 free(p->range_ends);
3002
3003 for (j = 0; j < p->nequivs; ++j)
3004 free(p->equivs[j]);
3005 if (p->equivs != NULL)
3006 free(p->equivs);
3007
3008 for (j = 0; j < p->ncoll_elems; ++j)
3009 free(p->coll_elems[j]);
3010 if (p->coll_elems != NULL)
3011 free(p->coll_elems);
3012 }
3013 free((ptr_t) d->mbcsets);
3014 }
3015 #endif /* MBS_SUPPORT */
3016
3017 for (i = 0; i < d->sindex; ++i)
3018 free((ptr_t) d->states[i].elems.elems);
3019 free((ptr_t) d->states);
3020 for (i = 0; i < d->tindex; ++i)
3021 if (d->follows[i].elems)
3022 free((ptr_t) d->follows[i].elems);
3023 free((ptr_t) d->follows);
3024 for (i = 0; i < d->tralloc; ++i)
3025 if (d->trans[i])
3026 free((ptr_t) d->trans[i]);
3027 else if (d->fails[i])
3028 free((ptr_t) d->fails[i]);
3029 if (d->realtrans) free((ptr_t) d->realtrans);
3030 if (d->fails) free((ptr_t) d->fails);
3031 if (d->success) free((ptr_t) d->success);
3032 for (dm = d->musts; dm; dm = ndm)
3033 {
3034 ndm = dm->next;
3035 free(dm->must);
3036 free((ptr_t) dm);
3037 }
3038 }
3039
3040 /* Having found the postfix representation of the regular expression,
3041 try to find a long sequence of characters that must appear in any line
3042 containing the r.e.
3043 Finding a "longest" sequence is beyond the scope here;
3044 we take an easy way out and hope for the best.
3045 (Take "(ab|a)b"--please.)
3046
3047 We do a bottom-up calculation of sequences of characters that must appear
3048 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3049 representation:
3050 sequences that must appear at the left of the match ("left")
3051 sequences that must appear at the right of the match ("right")
3052 lists of sequences that must appear somewhere in the match ("in")
3053 sequences that must constitute the match ("is")
3054
3055 When we get to the root of the tree, we use one of the longest of its
3056 calculated "in" sequences as our answer. The sequence we find is returned in
3057 d->must (where "d" is the single argument passed to "dfamust");
3058 the length of the sequence is returned in d->mustn.
3059
3060 The sequences calculated for the various types of node (in pseudo ANSI c)
3061 are shown below. "p" is the operand of unary operators (and the left-hand
3062 operand of binary operators); "q" is the right-hand operand of binary
3063 operators.
3064
3065 "ZERO" means "a zero-length sequence" below.
3066
3067 Type left right is in
3068 ---- ---- ----- -- --
3069 char c # c # c # c # c
3070
3071 ANYCHAR ZERO ZERO ZERO ZERO
3072
3073 MBCSET ZERO ZERO ZERO ZERO
3074
3075 CSET ZERO ZERO ZERO ZERO
3076
3077 STAR ZERO ZERO ZERO ZERO
3078
3079 QMARK ZERO ZERO ZERO ZERO
3080
3081 PLUS p->left p->right ZERO p->in
3082
3083 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3084 p->left : q->right : q->is!=ZERO) ? q->in plus
3085 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3086 ZERO
3087
3088 OR longest common longest common (do p->is and substrings common to
3089 leading trailing q->is have same p->in and q->in
3090 (sub)sequence (sub)sequence length and
3091 of p->left of p->right content) ?
3092 and q->left and q->right p->is : NULL
3093
3094 If there's anything else we recognize in the tree, all four sequences get set
3095 to zero-length sequences. If there's something we don't recognize in the tree,
3096 we just return a zero-length sequence.
3097
3098 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3099 'aaa')?
3100
3101 And. . .is it here or someplace that we might ponder "optimizations" such as
3102 egrep 'psi|epsilon' -> egrep 'psi'
3103 egrep 'pepsi|epsilon' -> egrep 'epsi'
3104 (Yes, we now find "epsi" as a "string
3105 that must occur", but we might also
3106 simplify the *entire* r.e. being sought)
3107 grep '[c]' -> grep 'c'
3108 grep '(ab|a)b' -> grep 'ab'
3109 grep 'ab*' -> grep 'a'
3110 grep 'a*b' -> grep 'b'
3111
3112 There are several issues:
3113
3114 Is optimization easy (enough)?
3115
3116 Does optimization actually accomplish anything,
3117 or is the automaton you get from "psi|epsilon" (for example)
3118 the same as the one you get from "psi" (for example)?
3119
3120 Are optimizable r.e.'s likely to be used in real-life situations
3121 (something like 'ab*' is probably unlikely; something like is
3122 'psi|epsilon' is likelier)? */
3123
3124 static char *
icatalloc(char * old,char * new)3125 icatalloc (char *old, char *new)
3126 {
3127 char *result;
3128 size_t oldsize, newsize;
3129
3130 newsize = (new == NULL) ? 0 : strlen(new);
3131 if (old == NULL)
3132 oldsize = 0;
3133 else if (newsize == 0)
3134 return old;
3135 else oldsize = strlen(old);
3136 if (old == NULL)
3137 result = (char *) malloc(newsize + 1);
3138 else
3139 result = (char *) realloc((void *) old, oldsize + newsize + 1);
3140 if (result != NULL && new != NULL)
3141 (void) strcpy(result + oldsize, new);
3142 return result;
3143 }
3144
3145 static char *
icpyalloc(char * string)3146 icpyalloc (char *string)
3147 {
3148 return icatalloc((char *) NULL, string);
3149 }
3150
3151 static char *
istrstr(char * lookin,char * lookfor)3152 istrstr (char *lookin, char *lookfor)
3153 {
3154 char *cp;
3155 size_t len;
3156
3157 len = strlen(lookfor);
3158 for (cp = lookin; *cp != '\0'; ++cp)
3159 if (strncmp(cp, lookfor, len) == 0)
3160 return cp;
3161 return NULL;
3162 }
3163
3164 static void
ifree(char * cp)3165 ifree (char *cp)
3166 {
3167 if (cp != NULL)
3168 free(cp);
3169 }
3170
3171 static void
freelist(char ** cpp)3172 freelist (char **cpp)
3173 {
3174 int i;
3175
3176 if (cpp == NULL)
3177 return;
3178 for (i = 0; cpp[i] != NULL; ++i)
3179 {
3180 free(cpp[i]);
3181 cpp[i] = NULL;
3182 }
3183 }
3184
3185 static char **
enlist(char ** cpp,char * new,size_t len)3186 enlist (char **cpp, char *new, size_t len)
3187 {
3188 int i, j;
3189
3190 if (cpp == NULL)
3191 return NULL;
3192 if ((new = icpyalloc(new)) == NULL)
3193 {
3194 freelist(cpp);
3195 return NULL;
3196 }
3197 new[len] = '\0';
3198 /* Is there already something in the list that's new (or longer)? */
3199 for (i = 0; cpp[i] != NULL; ++i)
3200 if (istrstr(cpp[i], new) != NULL)
3201 {
3202 free(new);
3203 return cpp;
3204 }
3205 /* Eliminate any obsoleted strings. */
3206 j = 0;
3207 while (cpp[j] != NULL)
3208 if (istrstr(new, cpp[j]) == NULL)
3209 ++j;
3210 else
3211 {
3212 free(cpp[j]);
3213 if (--i == j)
3214 break;
3215 cpp[j] = cpp[i];
3216 cpp[i] = NULL;
3217 }
3218 /* Add the new string. */
3219 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
3220 if (cpp == NULL)
3221 return NULL;
3222 cpp[i] = new;
3223 cpp[i + 1] = NULL;
3224 return cpp;
3225 }
3226
3227 /* Given pointers to two strings, return a pointer to an allocated
3228 list of their distinct common substrings. Return NULL if something
3229 seems wild. */
3230 static char **
comsubs(char * left,char * right)3231 comsubs (char *left, char *right)
3232 {
3233 char **cpp;
3234 char *lcp;
3235 char *rcp;
3236 size_t i, len;
3237
3238 if (left == NULL || right == NULL)
3239 return NULL;
3240 cpp = (char **) malloc(sizeof *cpp);
3241 if (cpp == NULL)
3242 return NULL;
3243 cpp[0] = NULL;
3244 for (lcp = left; *lcp != '\0'; ++lcp)
3245 {
3246 len = 0;
3247 rcp = strchr (right, *lcp);
3248 while (rcp != NULL)
3249 {
3250 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3251 continue;
3252 if (i > len)
3253 len = i;
3254 rcp = strchr (rcp + 1, *lcp);
3255 }
3256 if (len == 0)
3257 continue;
3258 if ((cpp = enlist(cpp, lcp, len)) == NULL)
3259 break;
3260 }
3261 return cpp;
3262 }
3263
3264 static char **
addlists(char ** old,char ** new)3265 addlists (char **old, char **new)
3266 {
3267 int i;
3268
3269 if (old == NULL || new == NULL)
3270 return NULL;
3271 for (i = 0; new[i] != NULL; ++i)
3272 {
3273 old = enlist(old, new[i], strlen(new[i]));
3274 if (old == NULL)
3275 break;
3276 }
3277 return old;
3278 }
3279
3280 /* Given two lists of substrings, return a new list giving substrings
3281 common to both. */
3282 static char **
inboth(char ** left,char ** right)3283 inboth (char **left, char **right)
3284 {
3285 char **both;
3286 char **temp;
3287 int lnum, rnum;
3288
3289 if (left == NULL || right == NULL)
3290 return NULL;
3291 both = (char **) malloc(sizeof *both);
3292 if (both == NULL)
3293 return NULL;
3294 both[0] = NULL;
3295 for (lnum = 0; left[lnum] != NULL; ++lnum)
3296 {
3297 for (rnum = 0; right[rnum] != NULL; ++rnum)
3298 {
3299 temp = comsubs(left[lnum], right[rnum]);
3300 if (temp == NULL)
3301 {
3302 freelist(both);
3303 return NULL;
3304 }
3305 both = addlists(both, temp);
3306 freelist(temp);
3307 free(temp);
3308 if (both == NULL)
3309 return NULL;
3310 }
3311 }
3312 return both;
3313 }
3314
3315 typedef struct
3316 {
3317 char **in;
3318 char *left;
3319 char *right;
3320 char *is;
3321 } must;
3322
3323 static void
resetmust(must * mp)3324 resetmust (must *mp)
3325 {
3326 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3327 freelist(mp->in);
3328 }
3329
3330 static void
dfamust(struct dfa * dfa)3331 dfamust (struct dfa *dfa)
3332 {
3333 must *musts;
3334 must *mp;
3335 char *result;
3336 int ri;
3337 int i;
3338 int exact;
3339 token t;
3340 static must must0;
3341 struct dfamust *dm;
3342 static char empty_string[] = "";
3343
3344 result = empty_string;
3345 exact = 0;
3346 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
3347 if (musts == NULL)
3348 return;
3349 mp = musts;
3350 for (i = 0; i <= dfa->tindex; ++i)
3351 mp[i] = must0;
3352 for (i = 0; i <= dfa->tindex; ++i)
3353 {
3354 mp[i].in = (char **) malloc(sizeof *mp[i].in);
3355 mp[i].left = malloc(2);
3356 mp[i].right = malloc(2);
3357 mp[i].is = malloc(2);
3358 if (mp[i].in == NULL || mp[i].left == NULL ||
3359 mp[i].right == NULL || mp[i].is == NULL)
3360 goto done;
3361 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3362 mp[i].in[0] = NULL;
3363 }
3364 #ifdef DEBUG
3365 fprintf(stderr, "dfamust:\n");
3366 for (i = 0; i < dfa->tindex; ++i)
3367 {
3368 fprintf(stderr, " %d:", i);
3369 prtok(dfa->tokens[i]);
3370 }
3371 putc('\n', stderr);
3372 #endif
3373 for (ri = 0; ri < dfa->tindex; ++ri)
3374 {
3375 switch (t = dfa->tokens[ri])
3376 {
3377 case LPAREN:
3378 case RPAREN:
3379 goto done; /* "cannot happen" */
3380 case EMPTY:
3381 case BEGLINE:
3382 case ENDLINE:
3383 case BEGWORD:
3384 case ENDWORD:
3385 case LIMWORD:
3386 case NOTLIMWORD:
3387 case BACKREF:
3388 resetmust(mp);
3389 break;
3390 case STAR:
3391 case QMARK:
3392 if (mp <= musts)
3393 goto done; /* "cannot happen" */
3394 --mp;
3395 resetmust(mp);
3396 break;
3397 case OR:
3398 case ORTOP:
3399 if (mp < &musts[2])
3400 goto done; /* "cannot happen" */
3401 {
3402 char **new;
3403 must *lmp;
3404 must *rmp;
3405 int j, ln, rn, n;
3406
3407 rmp = --mp;
3408 lmp = --mp;
3409 /* Guaranteed to be. Unlikely, but. . . */
3410 if (strcmp(lmp->is, rmp->is) != 0)
3411 lmp->is[0] = '\0';
3412 /* Left side--easy */
3413 i = 0;
3414 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3415 ++i;
3416 lmp->left[i] = '\0';
3417 /* Right side */
3418 ln = strlen(lmp->right);
3419 rn = strlen(rmp->right);
3420 n = ln;
3421 if (n > rn)
3422 n = rn;
3423 for (i = 0; i < n; ++i)
3424 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3425 break;
3426 for (j = 0; j < i; ++j)
3427 lmp->right[j] = lmp->right[(ln - i) + j];
3428 lmp->right[j] = '\0';
3429 new = inboth(lmp->in, rmp->in);
3430 if (new == NULL)
3431 goto done;
3432 freelist(lmp->in);
3433 free((char *) lmp->in);
3434 lmp->in = new;
3435 }
3436 break;
3437 case PLUS:
3438 if (mp <= musts)
3439 goto done; /* "cannot happen" */
3440 --mp;
3441 mp->is[0] = '\0';
3442 break;
3443 case END:
3444 if (mp != &musts[1])
3445 goto done; /* "cannot happen" */
3446 for (i = 0; musts[0].in[i] != NULL; ++i)
3447 if (strlen(musts[0].in[i]) > strlen(result))
3448 result = musts[0].in[i];
3449 if (strcmp(result, musts[0].is) == 0)
3450 exact = 1;
3451 goto done;
3452 case CAT:
3453 if (mp < &musts[2])
3454 goto done; /* "cannot happen" */
3455 {
3456 must *lmp;
3457 must *rmp;
3458
3459 rmp = --mp;
3460 lmp = --mp;
3461 /* In. Everything in left, plus everything in
3462 right, plus catenation of
3463 left's right and right's left. */
3464 lmp->in = addlists(lmp->in, rmp->in);
3465 if (lmp->in == NULL)
3466 goto done;
3467 if (lmp->right[0] != '\0' &&
3468 rmp->left[0] != '\0')
3469 {
3470 char *tp;
3471
3472 tp = icpyalloc(lmp->right);
3473 if (tp == NULL)
3474 goto done;
3475 tp = icatalloc(tp, rmp->left);
3476 if (tp == NULL)
3477 goto done;
3478 lmp->in = enlist(lmp->in, tp,
3479 strlen(tp));
3480 free(tp);
3481 if (lmp->in == NULL)
3482 goto done;
3483 }
3484 /* Left-hand */
3485 if (lmp->is[0] != '\0')
3486 {
3487 lmp->left = icatalloc(lmp->left,
3488 rmp->left);
3489 if (lmp->left == NULL)
3490 goto done;
3491 }
3492 /* Right-hand */
3493 if (rmp->is[0] == '\0')
3494 lmp->right[0] = '\0';
3495 lmp->right = icatalloc(lmp->right, rmp->right);
3496 if (lmp->right == NULL)
3497 goto done;
3498 /* Guaranteed to be */
3499 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3500 {
3501 lmp->is = icatalloc(lmp->is, rmp->is);
3502 if (lmp->is == NULL)
3503 goto done;
3504 }
3505 else
3506 lmp->is[0] = '\0';
3507 }
3508 break;
3509 default:
3510 if (t < END)
3511 {
3512 /* "cannot happen" */
3513 goto done;
3514 }
3515 else if (t == '\0')
3516 {
3517 /* not on *my* shift */
3518 goto done;
3519 }
3520 else if (t >= CSET
3521 #ifdef MBS_SUPPORT
3522 || t == ANYCHAR
3523 || t == MBCSET
3524 #endif /* MBS_SUPPORT */
3525 )
3526 {
3527 /* easy enough */
3528 resetmust(mp);
3529 }
3530 else
3531 {
3532 /* plain character */
3533 resetmust(mp);
3534 mp->is[0] = mp->left[0] = mp->right[0] = t;
3535 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3536 mp->in = enlist(mp->in, mp->is, (size_t)1);
3537 if (mp->in == NULL)
3538 goto done;
3539 }
3540 break;
3541 }
3542 #ifdef DEBUG
3543 fprintf(stderr, " node: %d:", ri);
3544 prtok(dfa->tokens[ri]);
3545 fprintf(stderr, "\n in:");
3546 for (i = 0; mp->in[i]; ++i)
3547 fprintf(stderr, " \"%s\"", mp->in[i]);
3548 fprintf(stderr, "\n is: \"%s\"\n", mp->is);
3549 fprintf(stderr, " left: \"%s\"\n", mp->left);
3550 fprintf(stderr, " right: \"%s\"\n", mp->right);
3551 #endif
3552 ++mp;
3553 }
3554 done:
3555 if (strlen(result))
3556 {
3557 dm = (struct dfamust *) malloc(sizeof (struct dfamust));
3558 dm->exact = exact;
3559 dm->must = malloc(strlen(result) + 1);
3560 strcpy(dm->must, result);
3561 dm->next = dfa->musts;
3562 dfa->musts = dm;
3563 }
3564 mp = musts;
3565 for (i = 0; i <= dfa->tindex; ++i)
3566 {
3567 freelist(mp[i].in);
3568 ifree((char *) mp[i].in);
3569 ifree(mp[i].left);
3570 ifree(mp[i].right);
3571 ifree(mp[i].is);
3572 }
3573 free((char *) mp);
3574 }
3575 /* vim:set shiftwidth=2: */
3576