/* dfa.c - deterministic extended regexp routines for GNU
   Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2020 Free Software
   Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc.,
   51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */

/* Written June, 1988 by Mike Haertel
   Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
22*09d4459fSDaniel Fojt
23*09d4459fSDaniel Fojt #include <config.h>
24*09d4459fSDaniel Fojt
25*09d4459fSDaniel Fojt #include "dfa.h"
26*09d4459fSDaniel Fojt
27*09d4459fSDaniel Fojt #include "flexmember.h"
28*09d4459fSDaniel Fojt
29*09d4459fSDaniel Fojt #include <assert.h>
30*09d4459fSDaniel Fojt #include <ctype.h>
31*09d4459fSDaniel Fojt #include <stdint.h>
32*09d4459fSDaniel Fojt #include <stdio.h>
33*09d4459fSDaniel Fojt #include <stdlib.h>
34*09d4459fSDaniel Fojt #include <limits.h>
35*09d4459fSDaniel Fojt #include <string.h>
36*09d4459fSDaniel Fojt
/* idx_t is another name for ptrdiff_t, used for sizes of objects and
   for nonnegative indexes into objects.  It is a signed type, which
   helps catch integer overflow, and it has its own name to signal
   that only nonnegative values are intended.  */
typedef ptrdiff_t idx_t;

/* The largest value an idx_t can hold.  */
static idx_t const IDX_MAX = PTRDIFF_MAX;
42*09d4459fSDaniel Fojt
/* Return true if the NUL-terminated strings A and B compare equal
   according to strcmp.  */
static bool
streq (char const *a, char const *b)
{
  return strcmp (a, b) == 0;
}
48*09d4459fSDaniel Fojt
/* Return true if C lies in the range '0' through '9'.  The test is a
   plain range comparison on the character value.  */
static bool
isasciidigit (char c)
{
  return '0' <= c && c <= '9';
}
54*09d4459fSDaniel Fojt
55*09d4459fSDaniel Fojt #include "gettext.h"
56*09d4459fSDaniel Fojt #define _(str) gettext (str)
57*09d4459fSDaniel Fojt
58*09d4459fSDaniel Fojt #include <wchar.h>
59*09d4459fSDaniel Fojt
60*09d4459fSDaniel Fojt #include "intprops.h"
61*09d4459fSDaniel Fojt #include "xalloc.h"
62*09d4459fSDaniel Fojt #include "localeinfo.h"
63*09d4459fSDaniel Fojt
/* FALLTHROUGH marks an intentional fall through from one switch case
   into the next.  If nothing has defined it already, it expands to
   the __fallthrough__ statement attribute when __GNUC__ is at least 7
   and to a no-op expression otherwise.  */
#ifndef FALLTHROUGH
# if __GNUC__ < 7
#  define FALLTHROUGH ((void) 0)
# else
#  define FALLTHROUGH __attribute__ ((__fallthrough__))
# endif
#endif
71*09d4459fSDaniel Fojt
/* MIN (A, B) yields the smaller of A and B.  Each argument may be
   evaluated more than once, so avoid arguments with side effects.  */
#ifndef MIN
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
75*09d4459fSDaniel Fojt
/* HPUX defines these as macros in sys/param.h.  Remove any such
   definitions; #undef of a name that is not currently a macro is
   simply ignored, so no #ifdef guard is needed.  */
#undef setbit
#undef clrbit
83*09d4459fSDaniel Fojt
/* First integer value that is greater than any character code.  */
enum { NOTCHAR = 1 << CHAR_BIT };

/* Number of bits used in a charclass word.  */
enum { CHARCLASS_WORD_BITS = 64 };

/* This represents part of a character class.  It must be unsigned and
   at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
typedef uint_least64_t charclass_word;

/* An initializer for a charclass whose 64-bit words are A through D.  */
#define CHARCLASS_INIT(a, b, c, d) {{a, b, c, d}}

/* The maximum useful value of a charclass_word; all used bits are 1.
   The shift is done in two steps so that the shift count stays below
   CHARCLASS_WORD_BITS.  */
static charclass_word const CHARCLASS_WORD_MASK
  = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;

/* Number of words required to hold a bit for every character
   (NOTCHAR bits, rounded up to a whole number of words).  */
enum
{
  CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
};

/* Sets of unsigned characters are stored as bit vectors in arrays of
   charclass_word.  */
typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
109*09d4459fSDaniel Fojt
/* Convert a possibly-signed character to an unsigned character.  This
   is a bit safer than casting to unsigned char, since a function call
   lets the compiler catch some type errors that a cast would hide.  */
static unsigned char
to_uchar (char ch)
{
  return ch;
}
118*09d4459fSDaniel Fojt
/* Contexts tell us whether a character is a newline or a word constituent.
   Word-constituent characters are those that satisfy iswalnum, plus '_'.
   Each character has a single CTX_* value; bitmasks of CTX_* values denote
   a particular character class.

   A state also stores a context value, which is a bitmask of CTX_* values.
   A state's context represents a set of characters that the state's
   predecessors must match.  For example, a state whose context does not
   include CTX_LETTER will never have transitions where the previous
   character is a word constituent.  A state whose context is CTX_ANY
   might have transitions from any character.  */

enum
{
  CTX_NONE = 1,
  CTX_LETTER = 2,
  CTX_NEWLINE = 4,
  CTX_ANY = 7                   /* CTX_NONE | CTX_LETTER | CTX_NEWLINE.  */
};
138*09d4459fSDaniel Fojt
/* Sometimes characters can only be matched depending on the surrounding
   context.  Such context decisions depend on what the previous character
   was, and the value of the current (lookahead) character.  Context
   dependent constraints are encoded as 9-bit integers.  Each bit that
   is set indicates that the constraint succeeds in the corresponding
   context.

   bit 6-8  - valid contexts when next character is CTX_NEWLINE
   bit 3-5  - valid contexts when next character is CTX_LETTER
   bit 0-2  - valid contexts when next character is CTX_NONE

   succeeds_in_context determines whether a given constraint
   succeeds in a particular context.  Prev is a bitmask of possible
   context values for the previous character, curr is the (single-bit)
   context value for the lookahead character.  */

/* Return bits 6-8 of CONSTRAINT: the previous-character contexts that
   are valid when the next character is CTX_NEWLINE.  */
static int
newline_constraint (int constraint)
{
  return (constraint >> 6) & 7;
}

/* Return bits 3-5 of CONSTRAINT: the previous-character contexts that
   are valid when the next character is CTX_LETTER.  */
static int
letter_constraint (int constraint)
{
  return (constraint >> 3) & 7;
}

/* Return bits 0-2 of CONSTRAINT: the previous-character contexts that
   are valid when the next character is CTX_NONE.  */
static int
other_constraint (int constraint)
{
  return constraint & 7;
}
169*09d4459fSDaniel Fojt
170*09d4459fSDaniel Fojt static bool
succeeds_in_context(int constraint,int prev,int curr)171*09d4459fSDaniel Fojt succeeds_in_context (int constraint, int prev, int curr)
172*09d4459fSDaniel Fojt {
173*09d4459fSDaniel Fojt return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
174*09d4459fSDaniel Fojt | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
175*09d4459fSDaniel Fojt | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
176*09d4459fSDaniel Fojt & prev);
177*09d4459fSDaniel Fojt }
178*09d4459fSDaniel Fojt
/* The following describe what a constraint depends on.  */

/* Return true if CONSTRAINT depends on whether the previous character
   was a newline: in at least one of the three 3-bit groups, the bit
   for a previous newline (bit 2 of the group) differs from the bit for
   a previous ordinary character (bit 0 of the group).  */
static bool
prev_newline_dependent (int constraint)
{
  return ((constraint ^ constraint >> 2) & 0111) != 0;
}
/* Return true if CONSTRAINT depends on whether the previous character
   was a word constituent: in at least one of the three 3-bit groups,
   the bit for a previous letter (bit 1 of the group) differs from the
   bit for a previous ordinary character (bit 0 of the group).  */
static bool
prev_letter_dependent (int constraint)
{
  return ((constraint ^ constraint >> 1) & 0111) != 0;
}
190*09d4459fSDaniel Fojt
/* Tokens that match the empty string subject to some constraint actually
   work by applying that constraint to determine what may follow them,
   taking into account what has gone before.  The following values are
   the constraints corresponding to the special tokens previously defined.
   Each is written in octal, one digit per lookahead context (newline,
   letter, other, from most to least significant).  */
enum
{
  NO_CONSTRAINT = 0777,
  BEGLINE_CONSTRAINT = 0444,
  ENDLINE_CONSTRAINT = 0700,
  BEGWORD_CONSTRAINT = 0050,
  ENDWORD_CONSTRAINT = 0202,
  LIMWORD_CONSTRAINT = 0252,
  NOTLIMWORD_CONSTRAINT = 0525
};
205*09d4459fSDaniel Fojt
/* The regexp is parsed into an array of tokens in postfix form.  Some tokens
   are operators and others are terminal symbols.  Most (but not all) of these
   codes are returned by the lexical analyzer.  */

typedef ptrdiff_t token;

/* The largest value a token can hold.  */
static token const TOKEN_MAX = PTRDIFF_MAX;
212*09d4459fSDaniel Fojt
/* States are indexed by state_num values.  These are normally
   nonnegative but -1 is used as a special value.  */
typedef ptrdiff_t state_num;
216*09d4459fSDaniel Fojt
217*09d4459fSDaniel Fojt /* Predefined token values. */
218*09d4459fSDaniel Fojt enum
219*09d4459fSDaniel Fojt {
220*09d4459fSDaniel Fojt END = -1, /* END is a terminal symbol that matches the
221*09d4459fSDaniel Fojt end of input; any value of END or less in
222*09d4459fSDaniel Fojt the parse tree is such a symbol. Accepting
223*09d4459fSDaniel Fojt states of the DFA are those that would have
224*09d4459fSDaniel Fojt a transition on END. This is -1, not some
225*09d4459fSDaniel Fojt more-negative value, to tweak the speed of
226*09d4459fSDaniel Fojt comparisons to END. */
227*09d4459fSDaniel Fojt
228*09d4459fSDaniel Fojt /* Ordinary character values are terminal symbols that match themselves. */
229*09d4459fSDaniel Fojt
230*09d4459fSDaniel Fojt /* CSET must come last in the following list of special tokens. Otherwise,
231*09d4459fSDaniel Fojt the list order matters only for performance. Related special tokens
232*09d4459fSDaniel Fojt should have nearby values so that code like (t == ANYCHAR || t == MBCSET
233*09d4459fSDaniel Fojt || CSET <= t) can be done with a single machine-level comparison. */
234*09d4459fSDaniel Fojt
235*09d4459fSDaniel Fojt EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
236*09d4459fSDaniel Fojt the empty string. */
237*09d4459fSDaniel Fojt
238*09d4459fSDaniel Fojt QMARK, /* QMARK is an operator of one argument that
239*09d4459fSDaniel Fojt matches zero or one occurrences of its
240*09d4459fSDaniel Fojt argument. */
241*09d4459fSDaniel Fojt
242*09d4459fSDaniel Fojt STAR, /* STAR is an operator of one argument that
243*09d4459fSDaniel Fojt matches the Kleene closure (zero or more
244*09d4459fSDaniel Fojt occurrences) of its argument. */
245*09d4459fSDaniel Fojt
246*09d4459fSDaniel Fojt PLUS, /* PLUS is an operator of one argument that
247*09d4459fSDaniel Fojt matches the positive closure (one or more
248*09d4459fSDaniel Fojt occurrences) of its argument. */
249*09d4459fSDaniel Fojt
250*09d4459fSDaniel Fojt REPMN, /* REPMN is a lexical token corresponding
251*09d4459fSDaniel Fojt to the {m,n} construct. REPMN never
252*09d4459fSDaniel Fojt appears in the compiled token vector. */
253*09d4459fSDaniel Fojt
254*09d4459fSDaniel Fojt CAT, /* CAT is an operator of two arguments that
255*09d4459fSDaniel Fojt matches the concatenation of its
256*09d4459fSDaniel Fojt arguments. CAT is never returned by the
257*09d4459fSDaniel Fojt lexical analyzer. */
258*09d4459fSDaniel Fojt
259*09d4459fSDaniel Fojt OR, /* OR is an operator of two arguments that
260*09d4459fSDaniel Fojt matches either of its arguments. */
261*09d4459fSDaniel Fojt
262*09d4459fSDaniel Fojt LPAREN, /* LPAREN never appears in the parse tree,
263*09d4459fSDaniel Fojt it is only a lexeme. */
264*09d4459fSDaniel Fojt
265*09d4459fSDaniel Fojt RPAREN, /* RPAREN never appears in the parse tree. */
266*09d4459fSDaniel Fojt
267*09d4459fSDaniel Fojt WCHAR, /* Only returned by lex. wctok contains
268*09d4459fSDaniel Fojt the wide character representation. */
269*09d4459fSDaniel Fojt
270*09d4459fSDaniel Fojt ANYCHAR, /* ANYCHAR is a terminal symbol that matches
271*09d4459fSDaniel Fojt a valid multibyte (or single byte) character.
272*09d4459fSDaniel Fojt It is used only if MB_CUR_MAX > 1. */
273*09d4459fSDaniel Fojt
274*09d4459fSDaniel Fojt BEG, /* BEG is an initial symbol that matches the
275*09d4459fSDaniel Fojt beginning of input. */
276*09d4459fSDaniel Fojt
277*09d4459fSDaniel Fojt BEGLINE, /* BEGLINE is a terminal symbol that matches
278*09d4459fSDaniel Fojt the empty string at the beginning of a
279*09d4459fSDaniel Fojt line. */
280*09d4459fSDaniel Fojt
281*09d4459fSDaniel Fojt ENDLINE, /* ENDLINE is a terminal symbol that matches
282*09d4459fSDaniel Fojt the empty string at the end of a line. */
283*09d4459fSDaniel Fojt
284*09d4459fSDaniel Fojt BEGWORD, /* BEGWORD is a terminal symbol that matches
285*09d4459fSDaniel Fojt the empty string at the beginning of a
286*09d4459fSDaniel Fojt word. */
287*09d4459fSDaniel Fojt
288*09d4459fSDaniel Fojt ENDWORD, /* ENDWORD is a terminal symbol that matches
289*09d4459fSDaniel Fojt the empty string at the end of a word. */
290*09d4459fSDaniel Fojt
291*09d4459fSDaniel Fojt LIMWORD, /* LIMWORD is a terminal symbol that matches
292*09d4459fSDaniel Fojt the empty string at the beginning or the
293*09d4459fSDaniel Fojt end of a word. */
294*09d4459fSDaniel Fojt
295*09d4459fSDaniel Fojt NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
296*09d4459fSDaniel Fojt matches the empty string not at
297*09d4459fSDaniel Fojt the beginning or end of a word. */
298*09d4459fSDaniel Fojt
299*09d4459fSDaniel Fojt BACKREF, /* BACKREF is generated by \<digit>
300*09d4459fSDaniel Fojt or by any other construct that
301*09d4459fSDaniel Fojt is not completely handled. If the scanner
302*09d4459fSDaniel Fojt detects a transition on backref, it returns
303*09d4459fSDaniel Fojt a kind of "semi-success" indicating that
304*09d4459fSDaniel Fojt the match will have to be verified with
305*09d4459fSDaniel Fojt a backtracking matcher. */
306*09d4459fSDaniel Fojt
307*09d4459fSDaniel Fojt MBCSET, /* MBCSET is similar to CSET, but for
308*09d4459fSDaniel Fojt multibyte characters. */
309*09d4459fSDaniel Fojt
310*09d4459fSDaniel Fojt CSET /* CSET and (and any value greater) is a
311*09d4459fSDaniel Fojt terminal symbol that matches any of a
312*09d4459fSDaniel Fojt class of characters. */
313*09d4459fSDaniel Fojt };
314*09d4459fSDaniel Fojt
315*09d4459fSDaniel Fojt
316*09d4459fSDaniel Fojt /* States of the recognizer correspond to sets of positions in the parse
317*09d4459fSDaniel Fojt tree, together with the constraints under which they may be matched.
318*09d4459fSDaniel Fojt So a position is encoded as an index into the parse tree together with
319*09d4459fSDaniel Fojt a constraint. */
320*09d4459fSDaniel Fojt typedef struct
321*09d4459fSDaniel Fojt {
322*09d4459fSDaniel Fojt idx_t index; /* Index into the parse array. */
323*09d4459fSDaniel Fojt unsigned int constraint; /* Constraint for matching this position. */
324*09d4459fSDaniel Fojt } position;
325*09d4459fSDaniel Fojt
326*09d4459fSDaniel Fojt /* Sets of positions are stored as arrays. */
327*09d4459fSDaniel Fojt typedef struct
328*09d4459fSDaniel Fojt {
329*09d4459fSDaniel Fojt position *elems; /* Elements of this position set. */
330*09d4459fSDaniel Fojt idx_t nelem; /* Number of elements in this set. */
331*09d4459fSDaniel Fojt idx_t alloc; /* Number of elements allocated in ELEMS. */
332*09d4459fSDaniel Fojt } position_set;
333*09d4459fSDaniel Fojt
334*09d4459fSDaniel Fojt /* A state of the dfa consists of a set of positions, some flags,
335*09d4459fSDaniel Fojt and the token value of the lowest-numbered position of the state that
336*09d4459fSDaniel Fojt contains an END token. */
337*09d4459fSDaniel Fojt typedef struct
338*09d4459fSDaniel Fojt {
339*09d4459fSDaniel Fojt size_t hash; /* Hash of the positions of this state. */
340*09d4459fSDaniel Fojt position_set elems; /* Positions this state could match. */
341*09d4459fSDaniel Fojt unsigned char context; /* Context from previous state. */
342*09d4459fSDaniel Fojt unsigned short constraint; /* Constraint for this state to accept. */
343*09d4459fSDaniel Fojt token first_end; /* Token value of the first END in elems. */
344*09d4459fSDaniel Fojt position_set mbps; /* Positions which can match multibyte
345*09d4459fSDaniel Fojt characters or the follows, e.g., period.
346*09d4459fSDaniel Fojt Used only if MB_CUR_MAX > 1. */
347*09d4459fSDaniel Fojt state_num mb_trindex; /* Index of this state in MB_TRANS, or
348*09d4459fSDaniel Fojt negative if the state does not have
349*09d4459fSDaniel Fojt ANYCHAR. */
350*09d4459fSDaniel Fojt } dfa_state;
351*09d4459fSDaniel Fojt
/* Maximum for any transition table count.  This should be at least 3,
   for the initial state setup.  */
enum { MAX_TRCOUNT = 1024 };
355*09d4459fSDaniel Fojt
356*09d4459fSDaniel Fojt /* A bracket operator.
357*09d4459fSDaniel Fojt e.g., [a-c], [[:alpha:]], etc. */
358*09d4459fSDaniel Fojt struct mb_char_classes
359*09d4459fSDaniel Fojt {
360*09d4459fSDaniel Fojt ptrdiff_t cset;
361*09d4459fSDaniel Fojt bool invert;
362*09d4459fSDaniel Fojt wchar_t *chars; /* Normal characters. */
363*09d4459fSDaniel Fojt idx_t nchars;
364*09d4459fSDaniel Fojt idx_t nchars_alloc;
365*09d4459fSDaniel Fojt };
366*09d4459fSDaniel Fojt
367*09d4459fSDaniel Fojt struct regex_syntax
368*09d4459fSDaniel Fojt {
369*09d4459fSDaniel Fojt /* Syntax bits controlling the behavior of the lexical analyzer. */
370*09d4459fSDaniel Fojt reg_syntax_t syntax_bits;
371*09d4459fSDaniel Fojt bool syntax_bits_set;
372*09d4459fSDaniel Fojt
373*09d4459fSDaniel Fojt /* Flag for case-folding letters into sets. */
374*09d4459fSDaniel Fojt bool case_fold;
375*09d4459fSDaniel Fojt
376*09d4459fSDaniel Fojt /* True if ^ and $ match only the start and end of data, and do not match
377*09d4459fSDaniel Fojt end-of-line within data. */
378*09d4459fSDaniel Fojt bool anchor;
379*09d4459fSDaniel Fojt
380*09d4459fSDaniel Fojt /* End-of-line byte in data. */
381*09d4459fSDaniel Fojt unsigned char eolbyte;
382*09d4459fSDaniel Fojt
383*09d4459fSDaniel Fojt /* Cache of char-context values. */
384*09d4459fSDaniel Fojt char sbit[NOTCHAR];
385*09d4459fSDaniel Fojt
386*09d4459fSDaniel Fojt /* If never_trail[B], the byte B cannot be a non-initial byte in a
387*09d4459fSDaniel Fojt multibyte character. */
388*09d4459fSDaniel Fojt bool never_trail[NOTCHAR];
389*09d4459fSDaniel Fojt
390*09d4459fSDaniel Fojt /* Set of characters considered letters. */
391*09d4459fSDaniel Fojt charclass letters;
392*09d4459fSDaniel Fojt
393*09d4459fSDaniel Fojt /* Set of characters that are newline. */
394*09d4459fSDaniel Fojt charclass newline;
395*09d4459fSDaniel Fojt };
396*09d4459fSDaniel Fojt
397*09d4459fSDaniel Fojt /* Lexical analyzer. All the dross that deals with the obnoxious
398*09d4459fSDaniel Fojt GNU Regex syntax bits is located here. The poor, suffering
399*09d4459fSDaniel Fojt reader is referred to the GNU Regex documentation for the
400*09d4459fSDaniel Fojt meaning of the @#%!@#%^!@ syntax bits. */
401*09d4459fSDaniel Fojt struct lexer_state
402*09d4459fSDaniel Fojt {
403*09d4459fSDaniel Fojt char const *ptr; /* Pointer to next input character. */
404*09d4459fSDaniel Fojt idx_t left; /* Number of characters remaining. */
405*09d4459fSDaniel Fojt token lasttok; /* Previous token returned; initially END. */
406*09d4459fSDaniel Fojt idx_t parens; /* Count of outstanding left parens. */
407*09d4459fSDaniel Fojt int minrep, maxrep; /* Repeat counts for {m,n}. */
408*09d4459fSDaniel Fojt
409*09d4459fSDaniel Fojt /* Wide character representation of the current multibyte character,
410*09d4459fSDaniel Fojt or WEOF if there was an encoding error. Used only if
411*09d4459fSDaniel Fojt MB_CUR_MAX > 1. */
412*09d4459fSDaniel Fojt wint_t wctok;
413*09d4459fSDaniel Fojt
414*09d4459fSDaniel Fojt /* The most recently analyzed multibyte bracket expression. */
415*09d4459fSDaniel Fojt struct mb_char_classes brack;
416*09d4459fSDaniel Fojt
417*09d4459fSDaniel Fojt /* We're separated from beginning or (, | only by zero-width characters. */
418*09d4459fSDaniel Fojt bool laststart;
419*09d4459fSDaniel Fojt };
420*09d4459fSDaniel Fojt
421*09d4459fSDaniel Fojt /* Recursive descent parser for regular expressions. */
422*09d4459fSDaniel Fojt
423*09d4459fSDaniel Fojt struct parser_state
424*09d4459fSDaniel Fojt {
425*09d4459fSDaniel Fojt token tok; /* Lookahead token. */
426*09d4459fSDaniel Fojt idx_t depth; /* Current depth of a hypothetical stack
427*09d4459fSDaniel Fojt holding deferred productions. This is
428*09d4459fSDaniel Fojt used to determine the depth that will be
429*09d4459fSDaniel Fojt required of the real stack later on in
430*09d4459fSDaniel Fojt dfaanalyze. */
431*09d4459fSDaniel Fojt };
432*09d4459fSDaniel Fojt
433*09d4459fSDaniel Fojt /* A compiled regular expression. */
434*09d4459fSDaniel Fojt struct dfa
435*09d4459fSDaniel Fojt {
436*09d4459fSDaniel Fojt /* Fields filled by the scanner. */
437*09d4459fSDaniel Fojt charclass *charclasses; /* Array of character sets for CSET tokens. */
438*09d4459fSDaniel Fojt idx_t cindex; /* Index for adding new charclasses. */
439*09d4459fSDaniel Fojt idx_t calloc; /* Number of charclasses allocated. */
440*09d4459fSDaniel Fojt ptrdiff_t canychar; /* Index of anychar class, or -1. */
441*09d4459fSDaniel Fojt
442*09d4459fSDaniel Fojt /* Scanner state */
443*09d4459fSDaniel Fojt struct lexer_state lex;
444*09d4459fSDaniel Fojt
445*09d4459fSDaniel Fojt /* Parser state */
446*09d4459fSDaniel Fojt struct parser_state parse;
447*09d4459fSDaniel Fojt
448*09d4459fSDaniel Fojt /* Fields filled by the parser. */
449*09d4459fSDaniel Fojt token *tokens; /* Postfix parse array. */
450*09d4459fSDaniel Fojt idx_t tindex; /* Index for adding new tokens. */
451*09d4459fSDaniel Fojt idx_t talloc; /* Number of tokens currently allocated. */
452*09d4459fSDaniel Fojt idx_t depth; /* Depth required of an evaluation stack
453*09d4459fSDaniel Fojt used for depth-first traversal of the
454*09d4459fSDaniel Fojt parse tree. */
455*09d4459fSDaniel Fojt idx_t nleaves; /* Number of leaves on the parse tree. */
456*09d4459fSDaniel Fojt idx_t nregexps; /* Count of parallel regexps being built
457*09d4459fSDaniel Fojt with dfaparse. */
458*09d4459fSDaniel Fojt bool fast; /* The DFA is fast. */
459*09d4459fSDaniel Fojt token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
460*09d4459fSDaniel Fojt mbstate_t mbs; /* Multibyte conversion state. */
461*09d4459fSDaniel Fojt
462*09d4459fSDaniel Fojt /* The following are valid only if MB_CUR_MAX > 1. */
463*09d4459fSDaniel Fojt
464*09d4459fSDaniel Fojt /* The value of multibyte_prop[i] is defined by following rule.
465*09d4459fSDaniel Fojt if tokens[i] < NOTCHAR
466*09d4459fSDaniel Fojt bit 0 : tokens[i] is the first byte of a character, including
467*09d4459fSDaniel Fojt single-byte characters.
468*09d4459fSDaniel Fojt bit 1 : tokens[i] is the last byte of a character, including
469*09d4459fSDaniel Fojt single-byte characters.
470*09d4459fSDaniel Fojt
471*09d4459fSDaniel Fojt e.g.
472*09d4459fSDaniel Fojt tokens
473*09d4459fSDaniel Fojt = 'single_byte_a', 'multi_byte_A', single_byte_b'
474*09d4459fSDaniel Fojt = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
475*09d4459fSDaniel Fojt multibyte_prop
476*09d4459fSDaniel Fojt = 3 , 1 , 0 , 2 , 3
477*09d4459fSDaniel Fojt */
478*09d4459fSDaniel Fojt char *multibyte_prop;
479*09d4459fSDaniel Fojt
480*09d4459fSDaniel Fojt /* Fields filled by the superset. */
481*09d4459fSDaniel Fojt struct dfa *superset; /* Hint of the dfa. */
482*09d4459fSDaniel Fojt
483*09d4459fSDaniel Fojt /* Fields filled by the state builder. */
484*09d4459fSDaniel Fojt dfa_state *states; /* States of the dfa. */
485*09d4459fSDaniel Fojt state_num sindex; /* Index for adding new states. */
486*09d4459fSDaniel Fojt idx_t salloc; /* Number of states currently allocated. */
487*09d4459fSDaniel Fojt
488*09d4459fSDaniel Fojt /* Fields filled by the parse tree->NFA conversion. */
489*09d4459fSDaniel Fojt position_set *follows; /* Array of follow sets, indexed by position
490*09d4459fSDaniel Fojt index. The follow of a position is the set
491*09d4459fSDaniel Fojt of positions containing characters that
492*09d4459fSDaniel Fojt could conceivably follow a character
493*09d4459fSDaniel Fojt matching the given position in a string
494*09d4459fSDaniel Fojt matching the regexp. Allocated to the
495*09d4459fSDaniel Fojt maximum possible position index. */
496*09d4459fSDaniel Fojt bool searchflag; /* We are supposed to build a searching
497*09d4459fSDaniel Fojt as opposed to an exact matcher. A searching
498*09d4459fSDaniel Fojt matcher finds the first and shortest string
499*09d4459fSDaniel Fojt matching a regexp anywhere in the buffer,
500*09d4459fSDaniel Fojt whereas an exact matcher finds the longest
501*09d4459fSDaniel Fojt string matching, but anchored to the
502*09d4459fSDaniel Fojt beginning of the buffer. */
503*09d4459fSDaniel Fojt
504*09d4459fSDaniel Fojt /* Fields filled by dfaanalyze. */
505*09d4459fSDaniel Fojt int *constraints; /* Array of union of accepting constraints
506*09d4459fSDaniel Fojt in the follow of a position. */
507*09d4459fSDaniel Fojt int *separates; /* Array of contexts on follow of a
508*09d4459fSDaniel Fojt position. */
509*09d4459fSDaniel Fojt
510*09d4459fSDaniel Fojt /* Fields filled by dfaexec. */
511*09d4459fSDaniel Fojt state_num tralloc; /* Number of transition tables that have
512*09d4459fSDaniel Fojt slots so far, not counting trans[-1] and
513*09d4459fSDaniel Fojt trans[-2]. */
514*09d4459fSDaniel Fojt int trcount; /* Number of transition tables that have
515*09d4459fSDaniel Fojt been built, other than for initial
516*09d4459fSDaniel Fojt states. */
517*09d4459fSDaniel Fojt int min_trcount; /* Number of initial states. Equivalently,
518*09d4459fSDaniel Fojt the minimum state number for which trcount
519*09d4459fSDaniel Fojt counts transitions. */
520*09d4459fSDaniel Fojt state_num **trans; /* Transition tables for states that can
521*09d4459fSDaniel Fojt never accept. If the transitions for a
522*09d4459fSDaniel Fojt state have not yet been computed, or the
523*09d4459fSDaniel Fojt state could possibly accept, its entry in
524*09d4459fSDaniel Fojt this table is NULL. This points to two
525*09d4459fSDaniel Fojt past the start of the allocated array,
526*09d4459fSDaniel Fojt and trans[-1] and trans[-2] are always
527*09d4459fSDaniel Fojt NULL. */
528*09d4459fSDaniel Fojt state_num **fails; /* Transition tables after failing to accept
529*09d4459fSDaniel Fojt on a state that potentially could do so.
530*09d4459fSDaniel Fojt If trans[i] is non-null, fails[i] must
531*09d4459fSDaniel Fojt be null. */
532*09d4459fSDaniel Fojt char *success; /* Table of acceptance conditions used in
533*09d4459fSDaniel Fojt dfaexec and computed in build_state. */
534*09d4459fSDaniel Fojt state_num *newlines; /* Transitions on newlines. The entry for a
535*09d4459fSDaniel Fojt newline in any transition table is always
536*09d4459fSDaniel Fojt -1 so we can count lines without wasting
537*09d4459fSDaniel Fojt too many cycles. The transition for a
538*09d4459fSDaniel Fojt newline is stored separately and handled
539*09d4459fSDaniel Fojt as a special case. Newline is also used
540*09d4459fSDaniel Fojt as a sentinel at the end of the buffer. */
541*09d4459fSDaniel Fojt state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
542*09d4459fSDaniel Fojt context in multibyte locales, in which we
543*09d4459fSDaniel Fojt do not distinguish between their contexts,
544*09d4459fSDaniel Fojt as not supported word. */
545*09d4459fSDaniel Fojt position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
546*09d4459fSDaniel Fojt state_num **mb_trans; /* Transition tables for states with
547*09d4459fSDaniel Fojt ANYCHAR. */
548*09d4459fSDaniel Fojt state_num mb_trcount; /* Number of transition tables for states with
549*09d4459fSDaniel Fojt ANYCHAR that have actually been built. */
550*09d4459fSDaniel Fojt
551*09d4459fSDaniel Fojt /* Syntax configuration. This is near the end so that dfacopysyntax
552*09d4459fSDaniel Fojt can memset up to here. */
553*09d4459fSDaniel Fojt struct regex_syntax syntax;
554*09d4459fSDaniel Fojt
555*09d4459fSDaniel Fojt /* Information derived from the locale. This is at the end so that
556*09d4459fSDaniel Fojt a quick memset need not clear it specially. */
557*09d4459fSDaniel Fojt
558*09d4459fSDaniel Fojt /* dfaexec implementation. */
559*09d4459fSDaniel Fojt char *(*dfaexec) (struct dfa *, char const *, char *,
560*09d4459fSDaniel Fojt bool, ptrdiff_t *, bool *);
561*09d4459fSDaniel Fojt
562*09d4459fSDaniel Fojt /* Other cached information derived from the locale. */
563*09d4459fSDaniel Fojt struct localeinfo localeinfo;
564*09d4459fSDaniel Fojt };
565*09d4459fSDaniel Fojt
566*09d4459fSDaniel Fojt /* User access to dfa internals. */
567*09d4459fSDaniel Fojt
568*09d4459fSDaniel Fojt /* S could possibly be an accepting state of R. */
569*09d4459fSDaniel Fojt static bool
accepting(state_num s,struct dfa const * r)570*09d4459fSDaniel Fojt accepting (state_num s, struct dfa const *r)
571*09d4459fSDaniel Fojt {
572*09d4459fSDaniel Fojt return r->states[s].constraint != 0;
573*09d4459fSDaniel Fojt }
574*09d4459fSDaniel Fojt
575*09d4459fSDaniel Fojt /* STATE accepts in the specified context. */
576*09d4459fSDaniel Fojt static bool
accepts_in_context(int prev,int curr,state_num state,struct dfa const * dfa)577*09d4459fSDaniel Fojt accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
578*09d4459fSDaniel Fojt {
579*09d4459fSDaniel Fojt return succeeds_in_context (dfa->states[state].constraint, prev, curr);
580*09d4459fSDaniel Fojt }
581*09d4459fSDaniel Fojt
582*09d4459fSDaniel Fojt static void regexp (struct dfa *dfa);
583*09d4459fSDaniel Fojt
584*09d4459fSDaniel Fojt /* Store into *PWC the result of converting the leading bytes of the
585*09d4459fSDaniel Fojt multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
586*09d4459fSDaniel Fojt and updating the conversion state in *D. On conversion error,
587*09d4459fSDaniel Fojt convert just a single byte, to WEOF. Return the number of bytes
588*09d4459fSDaniel Fojt converted.
589*09d4459fSDaniel Fojt
590*09d4459fSDaniel Fojt This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
591*09d4459fSDaniel Fojt
592*09d4459fSDaniel Fojt * PWC points to wint_t, not to wchar_t.
593*09d4459fSDaniel Fojt * The last arg is a dfa *D instead of merely a multibyte conversion
594*09d4459fSDaniel Fojt state D->mbs.
595*09d4459fSDaniel Fojt * N must be at least 1.
596*09d4459fSDaniel Fojt * S[N - 1] must be a sentinel byte.
597*09d4459fSDaniel Fojt * Shift encodings are not supported.
598*09d4459fSDaniel Fojt * The return value is always in the range 1..N.
599*09d4459fSDaniel Fojt * D->mbs is always valid afterwards.
600*09d4459fSDaniel Fojt * *PWC is always set to something. */
601*09d4459fSDaniel Fojt static int
mbs_to_wchar(wint_t * pwc,char const * s,size_t n,struct dfa * d)602*09d4459fSDaniel Fojt mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
603*09d4459fSDaniel Fojt {
604*09d4459fSDaniel Fojt unsigned char uc = s[0];
605*09d4459fSDaniel Fojt wint_t wc = d->localeinfo.sbctowc[uc];
606*09d4459fSDaniel Fojt
607*09d4459fSDaniel Fojt if (wc == WEOF)
608*09d4459fSDaniel Fojt {
609*09d4459fSDaniel Fojt wchar_t wch;
610*09d4459fSDaniel Fojt size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
611*09d4459fSDaniel Fojt if (0 < nbytes && nbytes < (size_t) -2)
612*09d4459fSDaniel Fojt {
613*09d4459fSDaniel Fojt *pwc = wch;
614*09d4459fSDaniel Fojt return nbytes;
615*09d4459fSDaniel Fojt }
616*09d4459fSDaniel Fojt memset (&d->mbs, 0, sizeof d->mbs);
617*09d4459fSDaniel Fojt }
618*09d4459fSDaniel Fojt
619*09d4459fSDaniel Fojt *pwc = wc;
620*09d4459fSDaniel Fojt return 1;
621*09d4459fSDaniel Fojt }
622*09d4459fSDaniel Fojt
#ifdef DEBUG

/* Debugging aid: write a readable form of token T to stderr.
   Tokens not greater than END print as "END", tokens in the range
   0..NOTCHAR-1 print as a hexadecimal byte value, the named operator
   tokens print their name, and everything else prints as "CSET".  */
static void
prtok (token t)
{
  static struct
  {
    token tok;
    char const *name;
  } const names[] = {
    { BEG, "BEG" },
    { EMPTY, "EMPTY" },
    { BACKREF, "BACKREF" },
    { BEGLINE, "BEGLINE" },
    { ENDLINE, "ENDLINE" },
    { BEGWORD, "BEGWORD" },
    { ENDWORD, "ENDWORD" },
    { LIMWORD, "LIMWORD" },
    { NOTLIMWORD, "NOTLIMWORD" },
    { QMARK, "QMARK" },
    { STAR, "STAR" },
    { PLUS, "PLUS" },
    { CAT, "CAT" },
    { OR, "OR" },
    { LPAREN, "LPAREN" },
    { RPAREN, "RPAREN" },
    { ANYCHAR, "ANYCHAR" },
    { MBCSET, "MBCSET" }
  };

  if (t <= END)
    {
      fprintf (stderr, "END");
      return;
    }

  if (0 <= t && t < NOTCHAR)
    {
      unsigned int byte = t;
      fprintf (stderr, "0x%02x", byte);
      return;
    }

  /* Any token without an entry in the table is reported as a CSET.  */
  char const *label = "CSET";
  for (size_t k = 0; k < sizeof names / sizeof names[0]; k++)
    if (names[k].tok == t)
      {
        label = names[k].name;
        break;
      }
  fprintf (stderr, "%s", label);
}
#endif /* DEBUG */
702*09d4459fSDaniel Fojt
703*09d4459fSDaniel Fojt /* Stuff pertaining to charclasses. */
704*09d4459fSDaniel Fojt
705*09d4459fSDaniel Fojt static bool
tstbit(unsigned int b,charclass const * c)706*09d4459fSDaniel Fojt tstbit (unsigned int b, charclass const *c)
707*09d4459fSDaniel Fojt {
708*09d4459fSDaniel Fojt return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
709*09d4459fSDaniel Fojt }
710*09d4459fSDaniel Fojt
711*09d4459fSDaniel Fojt static void
setbit(unsigned int b,charclass * c)712*09d4459fSDaniel Fojt setbit (unsigned int b, charclass *c)
713*09d4459fSDaniel Fojt {
714*09d4459fSDaniel Fojt charclass_word one = 1;
715*09d4459fSDaniel Fojt c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
716*09d4459fSDaniel Fojt }
717*09d4459fSDaniel Fojt
718*09d4459fSDaniel Fojt static void
clrbit(unsigned int b,charclass * c)719*09d4459fSDaniel Fojt clrbit (unsigned int b, charclass *c)
720*09d4459fSDaniel Fojt {
721*09d4459fSDaniel Fojt charclass_word one = 1;
722*09d4459fSDaniel Fojt c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
723*09d4459fSDaniel Fojt }
724*09d4459fSDaniel Fojt
725*09d4459fSDaniel Fojt static void
zeroset(charclass * s)726*09d4459fSDaniel Fojt zeroset (charclass *s)
727*09d4459fSDaniel Fojt {
728*09d4459fSDaniel Fojt memset (s, 0, sizeof *s);
729*09d4459fSDaniel Fojt }
730*09d4459fSDaniel Fojt
731*09d4459fSDaniel Fojt static void
fillset(charclass * s)732*09d4459fSDaniel Fojt fillset (charclass *s)
733*09d4459fSDaniel Fojt {
734*09d4459fSDaniel Fojt for (int i = 0; i < CHARCLASS_WORDS; i++)
735*09d4459fSDaniel Fojt s->w[i] = CHARCLASS_WORD_MASK;
736*09d4459fSDaniel Fojt }
737*09d4459fSDaniel Fojt
738*09d4459fSDaniel Fojt static void
notset(charclass * s)739*09d4459fSDaniel Fojt notset (charclass *s)
740*09d4459fSDaniel Fojt {
741*09d4459fSDaniel Fojt for (int i = 0; i < CHARCLASS_WORDS; ++i)
742*09d4459fSDaniel Fojt s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
743*09d4459fSDaniel Fojt }
744*09d4459fSDaniel Fojt
745*09d4459fSDaniel Fojt static bool
equal(charclass const * s1,charclass const * s2)746*09d4459fSDaniel Fojt equal (charclass const *s1, charclass const *s2)
747*09d4459fSDaniel Fojt {
748*09d4459fSDaniel Fojt charclass_word w = 0;
749*09d4459fSDaniel Fojt for (int i = 0; i < CHARCLASS_WORDS; i++)
750*09d4459fSDaniel Fojt w |= s1->w[i] ^ s2->w[i];
751*09d4459fSDaniel Fojt return w == 0;
752*09d4459fSDaniel Fojt }
753*09d4459fSDaniel Fojt
754*09d4459fSDaniel Fojt static bool
emptyset(charclass const * s)755*09d4459fSDaniel Fojt emptyset (charclass const *s)
756*09d4459fSDaniel Fojt {
757*09d4459fSDaniel Fojt charclass_word w = 0;
758*09d4459fSDaniel Fojt for (int i = 0; i < CHARCLASS_WORDS; i++)
759*09d4459fSDaniel Fojt w |= s->w[i];
760*09d4459fSDaniel Fojt return w == 0;
761*09d4459fSDaniel Fojt }
762*09d4459fSDaniel Fojt
763*09d4459fSDaniel Fojt /* Grow PA, which points to an array of *NITEMS items, and return the
764*09d4459fSDaniel Fojt location of the reallocated array, updating *NITEMS to reflect its
765*09d4459fSDaniel Fojt new size. The new array will contain at least NITEMS_INCR_MIN more
766*09d4459fSDaniel Fojt items, but will not contain more than NITEMS_MAX items total.
767*09d4459fSDaniel Fojt ITEM_SIZE is the size of each item, in bytes.
768*09d4459fSDaniel Fojt
769*09d4459fSDaniel Fojt ITEM_SIZE and NITEMS_INCR_MIN must be positive. *NITEMS must be
770*09d4459fSDaniel Fojt nonnegative. If NITEMS_MAX is -1, it is treated as if it were
771*09d4459fSDaniel Fojt infinity.
772*09d4459fSDaniel Fojt
773*09d4459fSDaniel Fojt If PA is null, then allocate a new array instead of reallocating
774*09d4459fSDaniel Fojt the old one.
775*09d4459fSDaniel Fojt
776*09d4459fSDaniel Fojt Thus, to grow an array A without saving its old contents, do
777*09d4459fSDaniel Fojt { free (A); A = xpalloc (NULL, &AITEMS, ...); }. */
778*09d4459fSDaniel Fojt
779*09d4459fSDaniel Fojt static void *
xpalloc(void * pa,idx_t * nitems,idx_t nitems_incr_min,ptrdiff_t nitems_max,idx_t item_size)780*09d4459fSDaniel Fojt xpalloc (void *pa, idx_t *nitems, idx_t nitems_incr_min,
781*09d4459fSDaniel Fojt ptrdiff_t nitems_max, idx_t item_size)
782*09d4459fSDaniel Fojt {
783*09d4459fSDaniel Fojt idx_t n0 = *nitems;
784*09d4459fSDaniel Fojt
785*09d4459fSDaniel Fojt /* The approximate size to use for initial small allocation
786*09d4459fSDaniel Fojt requests. This is the largest "small" request for the GNU C
787*09d4459fSDaniel Fojt library malloc. */
788*09d4459fSDaniel Fojt enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 };
789*09d4459fSDaniel Fojt
790*09d4459fSDaniel Fojt /* If the array is tiny, grow it to about (but no greater than)
791*09d4459fSDaniel Fojt DEFAULT_MXFAST bytes. Otherwise, grow it by about 50%.
792*09d4459fSDaniel Fojt Adjust the growth according to three constraints: NITEMS_INCR_MIN,
793*09d4459fSDaniel Fojt NITEMS_MAX, and what the C language can represent safely. */
794*09d4459fSDaniel Fojt
795*09d4459fSDaniel Fojt idx_t n, nbytes;
796*09d4459fSDaniel Fojt if (INT_ADD_WRAPV (n0, n0 >> 1, &n))
797*09d4459fSDaniel Fojt n = IDX_MAX;
798*09d4459fSDaniel Fojt if (0 <= nitems_max && nitems_max < n)
799*09d4459fSDaniel Fojt n = nitems_max;
800*09d4459fSDaniel Fojt
801*09d4459fSDaniel Fojt idx_t adjusted_nbytes
802*09d4459fSDaniel Fojt = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes)
803*09d4459fSDaniel Fojt ? MIN (IDX_MAX, SIZE_MAX)
804*09d4459fSDaniel Fojt : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0);
805*09d4459fSDaniel Fojt if (adjusted_nbytes)
806*09d4459fSDaniel Fojt {
807*09d4459fSDaniel Fojt n = adjusted_nbytes / item_size;
808*09d4459fSDaniel Fojt nbytes = adjusted_nbytes - adjusted_nbytes % item_size;
809*09d4459fSDaniel Fojt }
810*09d4459fSDaniel Fojt
811*09d4459fSDaniel Fojt if (! pa)
812*09d4459fSDaniel Fojt *nitems = 0;
813*09d4459fSDaniel Fojt if (n - n0 < nitems_incr_min
814*09d4459fSDaniel Fojt && (INT_ADD_WRAPV (n0, nitems_incr_min, &n)
815*09d4459fSDaniel Fojt || (0 <= nitems_max && nitems_max < n)
816*09d4459fSDaniel Fojt || INT_MULTIPLY_WRAPV (n, item_size, &nbytes)))
817*09d4459fSDaniel Fojt xalloc_die ();
818*09d4459fSDaniel Fojt pa = xrealloc (pa, nbytes);
819*09d4459fSDaniel Fojt *nitems = n;
820*09d4459fSDaniel Fojt return pa;
821*09d4459fSDaniel Fojt }
822*09d4459fSDaniel Fojt
823*09d4459fSDaniel Fojt /* Ensure that the array addressed by PA holds at least I + 1 items.
824*09d4459fSDaniel Fojt Either return PA, or reallocate the array and return its new address.
825*09d4459fSDaniel Fojt Although PA may be null, the returned value is never null.
826*09d4459fSDaniel Fojt
827*09d4459fSDaniel Fojt The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
828*09d4459fSDaniel Fojt is updated on reallocation. If PA is null, *NITEMS must be zero.
829*09d4459fSDaniel Fojt Do not allocate more than NITEMS_MAX items total; -1 means no limit.
830*09d4459fSDaniel Fojt ITEM_SIZE is the size of one item; it must be positive.
831*09d4459fSDaniel Fojt Avoid O(N**2) behavior on arrays growing linearly. */
832*09d4459fSDaniel Fojt static void *
maybe_realloc(void * pa,idx_t i,idx_t * nitems,ptrdiff_t nitems_max,idx_t item_size)833*09d4459fSDaniel Fojt maybe_realloc (void *pa, idx_t i, idx_t *nitems,
834*09d4459fSDaniel Fojt ptrdiff_t nitems_max, idx_t item_size)
835*09d4459fSDaniel Fojt {
836*09d4459fSDaniel Fojt if (i < *nitems)
837*09d4459fSDaniel Fojt return pa;
838*09d4459fSDaniel Fojt return xpalloc (pa, nitems, 1, nitems_max, item_size);
839*09d4459fSDaniel Fojt }
840*09d4459fSDaniel Fojt
841*09d4459fSDaniel Fojt /* In DFA D, find the index of charclass S, or allocate a new one. */
842*09d4459fSDaniel Fojt static idx_t
charclass_index(struct dfa * d,charclass const * s)843*09d4459fSDaniel Fojt charclass_index (struct dfa *d, charclass const *s)
844*09d4459fSDaniel Fojt {
845*09d4459fSDaniel Fojt idx_t i;
846*09d4459fSDaniel Fojt
847*09d4459fSDaniel Fojt for (i = 0; i < d->cindex; ++i)
848*09d4459fSDaniel Fojt if (equal (s, &d->charclasses[i]))
849*09d4459fSDaniel Fojt return i;
850*09d4459fSDaniel Fojt d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
851*09d4459fSDaniel Fojt TOKEN_MAX - CSET, sizeof *d->charclasses);
852*09d4459fSDaniel Fojt ++d->cindex;
853*09d4459fSDaniel Fojt d->charclasses[i] = *s;
854*09d4459fSDaniel Fojt return i;
855*09d4459fSDaniel Fojt }
856*09d4459fSDaniel Fojt
857*09d4459fSDaniel Fojt static bool
unibyte_word_constituent(struct dfa const * dfa,unsigned char c)858*09d4459fSDaniel Fojt unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
859*09d4459fSDaniel Fojt {
860*09d4459fSDaniel Fojt return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
861*09d4459fSDaniel Fojt }
862*09d4459fSDaniel Fojt
863*09d4459fSDaniel Fojt static int
char_context(struct dfa const * dfa,unsigned char c)864*09d4459fSDaniel Fojt char_context (struct dfa const *dfa, unsigned char c)
865*09d4459fSDaniel Fojt {
866*09d4459fSDaniel Fojt if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
867*09d4459fSDaniel Fojt return CTX_NEWLINE;
868*09d4459fSDaniel Fojt if (unibyte_word_constituent (dfa, c))
869*09d4459fSDaniel Fojt return CTX_LETTER;
870*09d4459fSDaniel Fojt return CTX_NONE;
871*09d4459fSDaniel Fojt }
872*09d4459fSDaniel Fojt
873*09d4459fSDaniel Fojt /* Set a bit in the charclass for the given wchar_t. Do nothing if WC
874*09d4459fSDaniel Fojt is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
875*09d4459fSDaniel Fojt this may happen when folding case in weird Turkish locales where
876*09d4459fSDaniel Fojt dotless i/dotted I are not included in the chosen character set.
877*09d4459fSDaniel Fojt Return whether a bit was set in the charclass. */
878*09d4459fSDaniel Fojt static bool
setbit_wc(wint_t wc,charclass * c)879*09d4459fSDaniel Fojt setbit_wc (wint_t wc, charclass *c)
880*09d4459fSDaniel Fojt {
881*09d4459fSDaniel Fojt int b = wctob (wc);
882*09d4459fSDaniel Fojt if (b < 0)
883*09d4459fSDaniel Fojt return false;
884*09d4459fSDaniel Fojt
885*09d4459fSDaniel Fojt setbit (b, c);
886*09d4459fSDaniel Fojt return true;
887*09d4459fSDaniel Fojt }
888*09d4459fSDaniel Fojt
889*09d4459fSDaniel Fojt /* Set a bit for B and its case variants in the charclass C.
890*09d4459fSDaniel Fojt MB_CUR_MAX must be 1. */
891*09d4459fSDaniel Fojt static void
setbit_case_fold_c(int b,charclass * c)892*09d4459fSDaniel Fojt setbit_case_fold_c (int b, charclass *c)
893*09d4459fSDaniel Fojt {
894*09d4459fSDaniel Fojt int ub = toupper (b);
895*09d4459fSDaniel Fojt for (int i = 0; i < NOTCHAR; i++)
896*09d4459fSDaniel Fojt if (toupper (i) == ub)
897*09d4459fSDaniel Fojt setbit (i, c);
898*09d4459fSDaniel Fojt }
899*09d4459fSDaniel Fojt
900*09d4459fSDaniel Fojt /* Fetch the next lexical input character from the pattern. There
901*09d4459fSDaniel Fojt must at least one byte of pattern input. Set DFA->lex.wctok to the
902*09d4459fSDaniel Fojt value of the character or to WEOF depending on whether the input is
903*09d4459fSDaniel Fojt a valid multibyte character (possibly of length 1). Then return
904*09d4459fSDaniel Fojt the next input byte value, except return EOF if the input is a
905*09d4459fSDaniel Fojt multibyte character of length greater than 1. */
906*09d4459fSDaniel Fojt static int
fetch_wc(struct dfa * dfa)907*09d4459fSDaniel Fojt fetch_wc (struct dfa *dfa)
908*09d4459fSDaniel Fojt {
909*09d4459fSDaniel Fojt int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
910*09d4459fSDaniel Fojt dfa);
911*09d4459fSDaniel Fojt int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
912*09d4459fSDaniel Fojt dfa->lex.ptr += nbytes;
913*09d4459fSDaniel Fojt dfa->lex.left -= nbytes;
914*09d4459fSDaniel Fojt return c;
915*09d4459fSDaniel Fojt }
916*09d4459fSDaniel Fojt
917*09d4459fSDaniel Fojt /* If there is no more input, report an error about unbalanced brackets.
918*09d4459fSDaniel Fojt Otherwise, behave as with fetch_wc (DFA). */
919*09d4459fSDaniel Fojt static int
bracket_fetch_wc(struct dfa * dfa)920*09d4459fSDaniel Fojt bracket_fetch_wc (struct dfa *dfa)
921*09d4459fSDaniel Fojt {
922*09d4459fSDaniel Fojt if (! dfa->lex.left)
923*09d4459fSDaniel Fojt dfaerror (_("unbalanced ["));
924*09d4459fSDaniel Fojt return fetch_wc (dfa);
925*09d4459fSDaniel Fojt }
926*09d4459fSDaniel Fojt
/* Type of a <ctype.h>-style character predicate.  */
typedef int predicate (int);

/* One entry of the table of Posix named character classes: the class
   NAME, the predicate FUNC that tells whether a given character
   belongs to the class, and SINGLE_BYTE_ONLY, which parse_bracket_exp
   consults in multibyte locales to decide whether the class can be
   expanded byte by byte.  */
struct dfa_ctype
{
  const char *name;
  predicate *func;
  bool single_byte_only;
};

/* The named character classes, terminated by an entry whose name is
   null.  */
static const struct dfa_ctype prednames[] = {
  { .name = "alpha", .func = isalpha, .single_byte_only = false },
  { .name = "upper", .func = isupper, .single_byte_only = false },
  { .name = "lower", .func = islower, .single_byte_only = false },
  { .name = "digit", .func = isdigit, .single_byte_only = true },
  { .name = "xdigit", .func = isxdigit, .single_byte_only = false },
  { .name = "space", .func = isspace, .single_byte_only = false },
  { .name = "punct", .func = ispunct, .single_byte_only = false },
  { .name = "alnum", .func = isalnum, .single_byte_only = false },
  { .name = "print", .func = isprint, .single_byte_only = false },
  { .name = "graph", .func = isgraph, .single_byte_only = false },
  { .name = "cntrl", .func = iscntrl, .single_byte_only = false },
  { .name = "blank", .func = isblank, .single_byte_only = false },
  { .name = NULL, .func = NULL, .single_byte_only = false }
};
955*09d4459fSDaniel Fojt
956*09d4459fSDaniel Fojt static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
find_pred(const char * str)957*09d4459fSDaniel Fojt find_pred (const char *str)
958*09d4459fSDaniel Fojt {
959*09d4459fSDaniel Fojt for (int i = 0; prednames[i].name; i++)
960*09d4459fSDaniel Fojt if (streq (str, prednames[i].name))
961*09d4459fSDaniel Fojt return &prednames[i];
962*09d4459fSDaniel Fojt return NULL;
963*09d4459fSDaniel Fojt }
964*09d4459fSDaniel Fojt
965*09d4459fSDaniel Fojt /* Parse a bracket expression, which possibly includes multibyte
966*09d4459fSDaniel Fojt characters. */
967*09d4459fSDaniel Fojt static token
parse_bracket_exp(struct dfa * dfa)968*09d4459fSDaniel Fojt parse_bracket_exp (struct dfa *dfa)
969*09d4459fSDaniel Fojt {
970*09d4459fSDaniel Fojt /* This is a bracket expression that dfaexec is known to
971*09d4459fSDaniel Fojt process correctly. */
972*09d4459fSDaniel Fojt bool known_bracket_exp = true;
973*09d4459fSDaniel Fojt
974*09d4459fSDaniel Fojt /* Used to warn about [:space:].
975*09d4459fSDaniel Fojt Bit 0 = first character is a colon.
976*09d4459fSDaniel Fojt Bit 1 = last character is a colon.
977*09d4459fSDaniel Fojt Bit 2 = includes any other character but a colon.
978*09d4459fSDaniel Fojt Bit 3 = includes ranges, char/equiv classes or collation elements. */
979*09d4459fSDaniel Fojt int colon_warning_state;
980*09d4459fSDaniel Fojt
981*09d4459fSDaniel Fojt dfa->lex.brack.nchars = 0;
982*09d4459fSDaniel Fojt charclass ccl;
983*09d4459fSDaniel Fojt zeroset (&ccl);
984*09d4459fSDaniel Fojt int c = bracket_fetch_wc (dfa);
985*09d4459fSDaniel Fojt bool invert = c == '^';
986*09d4459fSDaniel Fojt if (invert)
987*09d4459fSDaniel Fojt {
988*09d4459fSDaniel Fojt c = bracket_fetch_wc (dfa);
989*09d4459fSDaniel Fojt known_bracket_exp = dfa->localeinfo.simple;
990*09d4459fSDaniel Fojt }
991*09d4459fSDaniel Fojt wint_t wc = dfa->lex.wctok;
992*09d4459fSDaniel Fojt int c1;
993*09d4459fSDaniel Fojt wint_t wc1;
994*09d4459fSDaniel Fojt colon_warning_state = (c == ':');
995*09d4459fSDaniel Fojt do
996*09d4459fSDaniel Fojt {
997*09d4459fSDaniel Fojt c1 = NOTCHAR; /* Mark c1 as not initialized. */
998*09d4459fSDaniel Fojt colon_warning_state &= ~2;
999*09d4459fSDaniel Fojt
1000*09d4459fSDaniel Fojt /* Note that if we're looking at some other [:...:] construct,
1001*09d4459fSDaniel Fojt we just treat it as a bunch of ordinary characters. We can do
1002*09d4459fSDaniel Fojt this because we assume regex has checked for syntax errors before
1003*09d4459fSDaniel Fojt dfa is ever called. */
1004*09d4459fSDaniel Fojt if (c == '[')
1005*09d4459fSDaniel Fojt {
1006*09d4459fSDaniel Fojt c1 = bracket_fetch_wc (dfa);
1007*09d4459fSDaniel Fojt wc1 = dfa->lex.wctok;
1008*09d4459fSDaniel Fojt
1009*09d4459fSDaniel Fojt if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
1010*09d4459fSDaniel Fojt || c1 == '.' || c1 == '=')
1011*09d4459fSDaniel Fojt {
1012*09d4459fSDaniel Fojt enum { MAX_BRACKET_STRING_LEN = 32 };
1013*09d4459fSDaniel Fojt char str[MAX_BRACKET_STRING_LEN + 1];
1014*09d4459fSDaniel Fojt int len = 0;
1015*09d4459fSDaniel Fojt for (;;)
1016*09d4459fSDaniel Fojt {
1017*09d4459fSDaniel Fojt c = bracket_fetch_wc (dfa);
1018*09d4459fSDaniel Fojt if (dfa->lex.left == 0
1019*09d4459fSDaniel Fojt || (c == c1 && dfa->lex.ptr[0] == ']'))
1020*09d4459fSDaniel Fojt break;
1021*09d4459fSDaniel Fojt if (len < MAX_BRACKET_STRING_LEN)
1022*09d4459fSDaniel Fojt str[len++] = c;
1023*09d4459fSDaniel Fojt else
1024*09d4459fSDaniel Fojt /* This is in any case an invalid class name. */
1025*09d4459fSDaniel Fojt str[0] = '\0';
1026*09d4459fSDaniel Fojt }
1027*09d4459fSDaniel Fojt str[len] = '\0';
1028*09d4459fSDaniel Fojt
1029*09d4459fSDaniel Fojt /* Fetch bracket. */
1030*09d4459fSDaniel Fojt c = bracket_fetch_wc (dfa);
1031*09d4459fSDaniel Fojt wc = dfa->lex.wctok;
1032*09d4459fSDaniel Fojt if (c1 == ':')
1033*09d4459fSDaniel Fojt /* Build character class. POSIX allows character
1034*09d4459fSDaniel Fojt classes to match multicharacter collating elements,
1035*09d4459fSDaniel Fojt but the regex code does not support that, so do not
1036*09d4459fSDaniel Fojt worry about that possibility. */
1037*09d4459fSDaniel Fojt {
1038*09d4459fSDaniel Fojt char const *class
1039*09d4459fSDaniel Fojt = (dfa->syntax.case_fold && (streq (str, "upper")
1040*09d4459fSDaniel Fojt || streq (str, "lower"))
1041*09d4459fSDaniel Fojt ? "alpha" : str);
1042*09d4459fSDaniel Fojt const struct dfa_ctype *pred = find_pred (class);
1043*09d4459fSDaniel Fojt if (!pred)
1044*09d4459fSDaniel Fojt dfaerror (_("invalid character class"));
1045*09d4459fSDaniel Fojt
1046*09d4459fSDaniel Fojt if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1047*09d4459fSDaniel Fojt known_bracket_exp = false;
1048*09d4459fSDaniel Fojt else
1049*09d4459fSDaniel Fojt for (int c2 = 0; c2 < NOTCHAR; ++c2)
1050*09d4459fSDaniel Fojt if (pred->func (c2))
1051*09d4459fSDaniel Fojt setbit (c2, &ccl);
1052*09d4459fSDaniel Fojt }
1053*09d4459fSDaniel Fojt else
1054*09d4459fSDaniel Fojt known_bracket_exp = false;
1055*09d4459fSDaniel Fojt
1056*09d4459fSDaniel Fojt colon_warning_state |= 8;
1057*09d4459fSDaniel Fojt
1058*09d4459fSDaniel Fojt /* Fetch new lookahead character. */
1059*09d4459fSDaniel Fojt c1 = bracket_fetch_wc (dfa);
1060*09d4459fSDaniel Fojt wc1 = dfa->lex.wctok;
1061*09d4459fSDaniel Fojt continue;
1062*09d4459fSDaniel Fojt }
1063*09d4459fSDaniel Fojt
1064*09d4459fSDaniel Fojt /* We treat '[' as a normal character here. c/c1/wc/wc1
1065*09d4459fSDaniel Fojt are already set up. */
1066*09d4459fSDaniel Fojt }
1067*09d4459fSDaniel Fojt
1068*09d4459fSDaniel Fojt if (c == '\\'
1069*09d4459fSDaniel Fojt && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1070*09d4459fSDaniel Fojt {
1071*09d4459fSDaniel Fojt c = bracket_fetch_wc (dfa);
1072*09d4459fSDaniel Fojt wc = dfa->lex.wctok;
1073*09d4459fSDaniel Fojt }
1074*09d4459fSDaniel Fojt
1075*09d4459fSDaniel Fojt if (c1 == NOTCHAR)
1076*09d4459fSDaniel Fojt {
1077*09d4459fSDaniel Fojt c1 = bracket_fetch_wc (dfa);
1078*09d4459fSDaniel Fojt wc1 = dfa->lex.wctok;
1079*09d4459fSDaniel Fojt }
1080*09d4459fSDaniel Fojt
1081*09d4459fSDaniel Fojt if (c1 == '-')
1082*09d4459fSDaniel Fojt /* build range characters. */
1083*09d4459fSDaniel Fojt {
1084*09d4459fSDaniel Fojt int c2 = bracket_fetch_wc (dfa);
1085*09d4459fSDaniel Fojt wint_t wc2 = dfa->lex.wctok;
1086*09d4459fSDaniel Fojt
1087*09d4459fSDaniel Fojt /* A bracket expression like [a-[.aa.]] matches an unknown set.
1088*09d4459fSDaniel Fojt Treat it like [-a[.aa.]] while parsing it, and
1089*09d4459fSDaniel Fojt remember that the set is unknown. */
1090*09d4459fSDaniel Fojt if (c2 == '[' && dfa->lex.ptr[0] == '.')
1091*09d4459fSDaniel Fojt {
1092*09d4459fSDaniel Fojt known_bracket_exp = false;
1093*09d4459fSDaniel Fojt c2 = ']';
1094*09d4459fSDaniel Fojt }
1095*09d4459fSDaniel Fojt
1096*09d4459fSDaniel Fojt if (c2 == ']')
1097*09d4459fSDaniel Fojt {
1098*09d4459fSDaniel Fojt /* In the case [x-], the - is an ordinary hyphen,
1099*09d4459fSDaniel Fojt which is left in c1, the lookahead character. */
1100*09d4459fSDaniel Fojt dfa->lex.ptr--;
1101*09d4459fSDaniel Fojt dfa->lex.left++;
1102*09d4459fSDaniel Fojt }
1103*09d4459fSDaniel Fojt else
1104*09d4459fSDaniel Fojt {
1105*09d4459fSDaniel Fojt if (c2 == '\\' && (dfa->syntax.syntax_bits
1106*09d4459fSDaniel Fojt & RE_BACKSLASH_ESCAPE_IN_LISTS))
1107*09d4459fSDaniel Fojt {
1108*09d4459fSDaniel Fojt c2 = bracket_fetch_wc (dfa);
1109*09d4459fSDaniel Fojt wc2 = dfa->lex.wctok;
1110*09d4459fSDaniel Fojt }
1111*09d4459fSDaniel Fojt
1112*09d4459fSDaniel Fojt colon_warning_state |= 8;
1113*09d4459fSDaniel Fojt c1 = bracket_fetch_wc (dfa);
1114*09d4459fSDaniel Fojt wc1 = dfa->lex.wctok;
1115*09d4459fSDaniel Fojt
1116*09d4459fSDaniel Fojt /* Treat [x-y] as a range if x != y. */
1117*09d4459fSDaniel Fojt if (wc != wc2 || wc == WEOF)
1118*09d4459fSDaniel Fojt {
1119*09d4459fSDaniel Fojt if (dfa->localeinfo.simple
1120*09d4459fSDaniel Fojt || (isasciidigit (c) & isasciidigit (c2)))
1121*09d4459fSDaniel Fojt {
1122*09d4459fSDaniel Fojt for (int ci = c; ci <= c2; ci++)
1123*09d4459fSDaniel Fojt if (dfa->syntax.case_fold && isalpha (ci))
1124*09d4459fSDaniel Fojt setbit_case_fold_c (ci, &ccl);
1125*09d4459fSDaniel Fojt else
1126*09d4459fSDaniel Fojt setbit (ci, &ccl);
1127*09d4459fSDaniel Fojt }
1128*09d4459fSDaniel Fojt else
1129*09d4459fSDaniel Fojt known_bracket_exp = false;
1130*09d4459fSDaniel Fojt
1131*09d4459fSDaniel Fojt continue;
1132*09d4459fSDaniel Fojt }
1133*09d4459fSDaniel Fojt }
1134*09d4459fSDaniel Fojt }
1135*09d4459fSDaniel Fojt
1136*09d4459fSDaniel Fojt colon_warning_state |= (c == ':') ? 2 : 4;
1137*09d4459fSDaniel Fojt
1138*09d4459fSDaniel Fojt if (!dfa->localeinfo.multibyte)
1139*09d4459fSDaniel Fojt {
1140*09d4459fSDaniel Fojt if (dfa->syntax.case_fold && isalpha (c))
1141*09d4459fSDaniel Fojt setbit_case_fold_c (c, &ccl);
1142*09d4459fSDaniel Fojt else
1143*09d4459fSDaniel Fojt setbit (c, &ccl);
1144*09d4459fSDaniel Fojt continue;
1145*09d4459fSDaniel Fojt }
1146*09d4459fSDaniel Fojt
1147*09d4459fSDaniel Fojt if (wc == WEOF)
1148*09d4459fSDaniel Fojt known_bracket_exp = false;
1149*09d4459fSDaniel Fojt else
1150*09d4459fSDaniel Fojt {
1151*09d4459fSDaniel Fojt wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1152*09d4459fSDaniel Fojt int n = (dfa->syntax.case_fold
1153*09d4459fSDaniel Fojt ? case_folded_counterparts (wc, folded + 1) + 1
1154*09d4459fSDaniel Fojt : 1);
1155*09d4459fSDaniel Fojt folded[0] = wc;
1156*09d4459fSDaniel Fojt for (int i = 0; i < n; i++)
1157*09d4459fSDaniel Fojt if (!setbit_wc (folded[i], &ccl))
1158*09d4459fSDaniel Fojt {
1159*09d4459fSDaniel Fojt dfa->lex.brack.chars
1160*09d4459fSDaniel Fojt = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1161*09d4459fSDaniel Fojt &dfa->lex.brack.nchars_alloc, -1,
1162*09d4459fSDaniel Fojt sizeof *dfa->lex.brack.chars);
1163*09d4459fSDaniel Fojt dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1164*09d4459fSDaniel Fojt }
1165*09d4459fSDaniel Fojt }
1166*09d4459fSDaniel Fojt }
1167*09d4459fSDaniel Fojt while ((wc = wc1, (c = c1) != ']'));
1168*09d4459fSDaniel Fojt
1169*09d4459fSDaniel Fojt if (colon_warning_state == 7)
1170*09d4459fSDaniel Fojt dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1171*09d4459fSDaniel Fojt
1172*09d4459fSDaniel Fojt if (! known_bracket_exp)
1173*09d4459fSDaniel Fojt return BACKREF;
1174*09d4459fSDaniel Fojt
1175*09d4459fSDaniel Fojt if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1176*09d4459fSDaniel Fojt {
1177*09d4459fSDaniel Fojt dfa->lex.brack.invert = invert;
1178*09d4459fSDaniel Fojt dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1179*09d4459fSDaniel Fojt return MBCSET;
1180*09d4459fSDaniel Fojt }
1181*09d4459fSDaniel Fojt
1182*09d4459fSDaniel Fojt if (invert)
1183*09d4459fSDaniel Fojt {
1184*09d4459fSDaniel Fojt notset (&ccl);
1185*09d4459fSDaniel Fojt if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1186*09d4459fSDaniel Fojt clrbit ('\n', &ccl);
1187*09d4459fSDaniel Fojt }
1188*09d4459fSDaniel Fojt
1189*09d4459fSDaniel Fojt return CSET + charclass_index (dfa, &ccl);
1190*09d4459fSDaniel Fojt }
1191*09d4459fSDaniel Fojt
1192*09d4459fSDaniel Fojt struct lexptr
1193*09d4459fSDaniel Fojt {
1194*09d4459fSDaniel Fojt char const *ptr;
1195*09d4459fSDaniel Fojt idx_t left;
1196*09d4459fSDaniel Fojt };
1197*09d4459fSDaniel Fojt
1198*09d4459fSDaniel Fojt static void
push_lex_state(struct dfa * dfa,struct lexptr * ls,char const * s)1199*09d4459fSDaniel Fojt push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1200*09d4459fSDaniel Fojt {
1201*09d4459fSDaniel Fojt ls->ptr = dfa->lex.ptr;
1202*09d4459fSDaniel Fojt ls->left = dfa->lex.left;
1203*09d4459fSDaniel Fojt dfa->lex.ptr = s;
1204*09d4459fSDaniel Fojt dfa->lex.left = strlen (s);
1205*09d4459fSDaniel Fojt }
1206*09d4459fSDaniel Fojt
1207*09d4459fSDaniel Fojt static void
pop_lex_state(struct dfa * dfa,struct lexptr const * ls)1208*09d4459fSDaniel Fojt pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1209*09d4459fSDaniel Fojt {
1210*09d4459fSDaniel Fojt dfa->lex.ptr = ls->ptr;
1211*09d4459fSDaniel Fojt dfa->lex.left = ls->left;
1212*09d4459fSDaniel Fojt }
1213*09d4459fSDaniel Fojt
1214*09d4459fSDaniel Fojt static token
lex(struct dfa * dfa)1215*09d4459fSDaniel Fojt lex (struct dfa *dfa)
1216*09d4459fSDaniel Fojt {
1217*09d4459fSDaniel Fojt bool backslash = false;
1218*09d4459fSDaniel Fojt
1219*09d4459fSDaniel Fojt /* Basic plan: We fetch a character. If it's a backslash,
1220*09d4459fSDaniel Fojt we set the backslash flag and go through the loop again.
1221*09d4459fSDaniel Fojt On the plus side, this avoids having a duplicate of the
1222*09d4459fSDaniel Fojt main switch inside the backslash case. On the minus side,
1223*09d4459fSDaniel Fojt it means that just about every case begins with
1224*09d4459fSDaniel Fojt "if (backslash) ...". */
1225*09d4459fSDaniel Fojt for (int i = 0; i < 2; ++i)
1226*09d4459fSDaniel Fojt {
1227*09d4459fSDaniel Fojt if (! dfa->lex.left)
1228*09d4459fSDaniel Fojt return dfa->lex.lasttok = END;
1229*09d4459fSDaniel Fojt int c = fetch_wc (dfa);
1230*09d4459fSDaniel Fojt
1231*09d4459fSDaniel Fojt switch (c)
1232*09d4459fSDaniel Fojt {
1233*09d4459fSDaniel Fojt case '\\':
1234*09d4459fSDaniel Fojt if (backslash)
1235*09d4459fSDaniel Fojt goto normal_char;
1236*09d4459fSDaniel Fojt if (dfa->lex.left == 0)
1237*09d4459fSDaniel Fojt dfaerror (_("unfinished \\ escape"));
1238*09d4459fSDaniel Fojt backslash = true;
1239*09d4459fSDaniel Fojt break;
1240*09d4459fSDaniel Fojt
1241*09d4459fSDaniel Fojt case '^':
1242*09d4459fSDaniel Fojt if (backslash)
1243*09d4459fSDaniel Fojt goto normal_char;
1244*09d4459fSDaniel Fojt if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1245*09d4459fSDaniel Fojt || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1246*09d4459fSDaniel Fojt || dfa->lex.lasttok == OR)
1247*09d4459fSDaniel Fojt return dfa->lex.lasttok = BEGLINE;
1248*09d4459fSDaniel Fojt goto normal_char;
1249*09d4459fSDaniel Fojt
1250*09d4459fSDaniel Fojt case '$':
1251*09d4459fSDaniel Fojt if (backslash)
1252*09d4459fSDaniel Fojt goto normal_char;
1253*09d4459fSDaniel Fojt if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1254*09d4459fSDaniel Fojt || dfa->lex.left == 0
1255*09d4459fSDaniel Fojt || ((dfa->lex.left
1256*09d4459fSDaniel Fojt > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1257*09d4459fSDaniel Fojt && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1258*09d4459fSDaniel Fojt & (dfa->lex.ptr[0] == '\\')]
1259*09d4459fSDaniel Fojt == ')'))
1260*09d4459fSDaniel Fojt || ((dfa->lex.left
1261*09d4459fSDaniel Fojt > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1262*09d4459fSDaniel Fojt && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1263*09d4459fSDaniel Fojt & (dfa->lex.ptr[0] == '\\')]
1264*09d4459fSDaniel Fojt == '|'))
1265*09d4459fSDaniel Fojt || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1266*09d4459fSDaniel Fojt && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1267*09d4459fSDaniel Fojt return dfa->lex.lasttok = ENDLINE;
1268*09d4459fSDaniel Fojt goto normal_char;
1269*09d4459fSDaniel Fojt
1270*09d4459fSDaniel Fojt case '1':
1271*09d4459fSDaniel Fojt case '2':
1272*09d4459fSDaniel Fojt case '3':
1273*09d4459fSDaniel Fojt case '4':
1274*09d4459fSDaniel Fojt case '5':
1275*09d4459fSDaniel Fojt case '6':
1276*09d4459fSDaniel Fojt case '7':
1277*09d4459fSDaniel Fojt case '8':
1278*09d4459fSDaniel Fojt case '9':
1279*09d4459fSDaniel Fojt if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1280*09d4459fSDaniel Fojt {
1281*09d4459fSDaniel Fojt dfa->lex.laststart = false;
1282*09d4459fSDaniel Fojt return dfa->lex.lasttok = BACKREF;
1283*09d4459fSDaniel Fojt }
1284*09d4459fSDaniel Fojt goto normal_char;
1285*09d4459fSDaniel Fojt
1286*09d4459fSDaniel Fojt case '`':
1287*09d4459fSDaniel Fojt if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1288*09d4459fSDaniel Fojt {
1289*09d4459fSDaniel Fojt /* FIXME: should be beginning of string */
1290*09d4459fSDaniel Fojt return dfa->lex.lasttok = BEGLINE;
1291*09d4459fSDaniel Fojt }
1292*09d4459fSDaniel Fojt goto normal_char;
1293*09d4459fSDaniel Fojt
1294*09d4459fSDaniel Fojt case '\'':
1295*09d4459fSDaniel Fojt if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1296*09d4459fSDaniel Fojt {
1297*09d4459fSDaniel Fojt /* FIXME: should be end of string */
1298*09d4459fSDaniel Fojt return dfa->lex.lasttok = ENDLINE;
1299*09d4459fSDaniel Fojt }
1300*09d4459fSDaniel Fojt goto normal_char;
1301*09d4459fSDaniel Fojt
1302*09d4459fSDaniel Fojt case '<':
1303*09d4459fSDaniel Fojt if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1304*09d4459fSDaniel Fojt return dfa->lex.lasttok = BEGWORD;
1305*09d4459fSDaniel Fojt goto normal_char;
1306*09d4459fSDaniel Fojt
1307*09d4459fSDaniel Fojt case '>':
1308*09d4459fSDaniel Fojt if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1309*09d4459fSDaniel Fojt return dfa->lex.lasttok = ENDWORD;
1310*09d4459fSDaniel Fojt goto normal_char;
1311*09d4459fSDaniel Fojt
1312*09d4459fSDaniel Fojt case 'b':
1313*09d4459fSDaniel Fojt if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1314*09d4459fSDaniel Fojt return dfa->lex.lasttok = LIMWORD;
1315*09d4459fSDaniel Fojt goto normal_char;
1316*09d4459fSDaniel Fojt
1317*09d4459fSDaniel Fojt case 'B':
1318*09d4459fSDaniel Fojt if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1319*09d4459fSDaniel Fojt return dfa->lex.lasttok = NOTLIMWORD;
1320*09d4459fSDaniel Fojt goto normal_char;
1321*09d4459fSDaniel Fojt
1322*09d4459fSDaniel Fojt case '?':
1323*09d4459fSDaniel Fojt if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1324*09d4459fSDaniel Fojt goto normal_char;
1325*09d4459fSDaniel Fojt if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1326*09d4459fSDaniel Fojt goto normal_char;
1327*09d4459fSDaniel Fojt if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1328*09d4459fSDaniel Fojt && dfa->lex.laststart)
1329*09d4459fSDaniel Fojt goto normal_char;
1330*09d4459fSDaniel Fojt return dfa->lex.lasttok = QMARK;
1331*09d4459fSDaniel Fojt
1332*09d4459fSDaniel Fojt case '*':
1333*09d4459fSDaniel Fojt if (backslash)
1334*09d4459fSDaniel Fojt goto normal_char;
1335*09d4459fSDaniel Fojt if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1336*09d4459fSDaniel Fojt && dfa->lex.laststart)
1337*09d4459fSDaniel Fojt goto normal_char;
1338*09d4459fSDaniel Fojt return dfa->lex.lasttok = STAR;
1339*09d4459fSDaniel Fojt
1340*09d4459fSDaniel Fojt case '+':
1341*09d4459fSDaniel Fojt if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1342*09d4459fSDaniel Fojt goto normal_char;
1343*09d4459fSDaniel Fojt if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1344*09d4459fSDaniel Fojt goto normal_char;
1345*09d4459fSDaniel Fojt if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1346*09d4459fSDaniel Fojt && dfa->lex.laststart)
1347*09d4459fSDaniel Fojt goto normal_char;
1348*09d4459fSDaniel Fojt return dfa->lex.lasttok = PLUS;
1349*09d4459fSDaniel Fojt
1350*09d4459fSDaniel Fojt case '{':
1351*09d4459fSDaniel Fojt if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1352*09d4459fSDaniel Fojt goto normal_char;
1353*09d4459fSDaniel Fojt if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1354*09d4459fSDaniel Fojt goto normal_char;
1355*09d4459fSDaniel Fojt if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1356*09d4459fSDaniel Fojt && dfa->lex.laststart)
1357*09d4459fSDaniel Fojt goto normal_char;
1358*09d4459fSDaniel Fojt
1359*09d4459fSDaniel Fojt /* Cases:
1360*09d4459fSDaniel Fojt {M} - exact count
1361*09d4459fSDaniel Fojt {M,} - minimum count, maximum is infinity
1362*09d4459fSDaniel Fojt {,N} - 0 through N
1363*09d4459fSDaniel Fojt {,} - 0 to infinity (same as '*')
1364*09d4459fSDaniel Fojt {M,N} - M through N */
1365*09d4459fSDaniel Fojt {
1366*09d4459fSDaniel Fojt char const *p = dfa->lex.ptr;
1367*09d4459fSDaniel Fojt char const *lim = p + dfa->lex.left;
1368*09d4459fSDaniel Fojt dfa->lex.minrep = dfa->lex.maxrep = -1;
1369*09d4459fSDaniel Fojt for (; p != lim && isasciidigit (*p); p++)
1370*09d4459fSDaniel Fojt dfa->lex.minrep = (dfa->lex.minrep < 0
1371*09d4459fSDaniel Fojt ? *p - '0'
1372*09d4459fSDaniel Fojt : MIN (RE_DUP_MAX + 1,
1373*09d4459fSDaniel Fojt dfa->lex.minrep * 10 + *p - '0'));
1374*09d4459fSDaniel Fojt if (p != lim)
1375*09d4459fSDaniel Fojt {
1376*09d4459fSDaniel Fojt if (*p != ',')
1377*09d4459fSDaniel Fojt dfa->lex.maxrep = dfa->lex.minrep;
1378*09d4459fSDaniel Fojt else
1379*09d4459fSDaniel Fojt {
1380*09d4459fSDaniel Fojt if (dfa->lex.minrep < 0)
1381*09d4459fSDaniel Fojt dfa->lex.minrep = 0;
1382*09d4459fSDaniel Fojt while (++p != lim && isasciidigit (*p))
1383*09d4459fSDaniel Fojt dfa->lex.maxrep
1384*09d4459fSDaniel Fojt = (dfa->lex.maxrep < 0
1385*09d4459fSDaniel Fojt ? *p - '0'
1386*09d4459fSDaniel Fojt : MIN (RE_DUP_MAX + 1,
1387*09d4459fSDaniel Fojt dfa->lex.maxrep * 10 + *p - '0'));
1388*09d4459fSDaniel Fojt }
1389*09d4459fSDaniel Fojt }
1390*09d4459fSDaniel Fojt if (! ((! backslash || (p != lim && *p++ == '\\'))
1391*09d4459fSDaniel Fojt && p != lim && *p++ == '}'
1392*09d4459fSDaniel Fojt && 0 <= dfa->lex.minrep
1393*09d4459fSDaniel Fojt && (dfa->lex.maxrep < 0
1394*09d4459fSDaniel Fojt || dfa->lex.minrep <= dfa->lex.maxrep)))
1395*09d4459fSDaniel Fojt {
1396*09d4459fSDaniel Fojt if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1397*09d4459fSDaniel Fojt goto normal_char;
1398*09d4459fSDaniel Fojt dfaerror (_("invalid content of \\{\\}"));
1399*09d4459fSDaniel Fojt }
1400*09d4459fSDaniel Fojt if (RE_DUP_MAX < dfa->lex.maxrep)
1401*09d4459fSDaniel Fojt dfaerror (_("regular expression too big"));
1402*09d4459fSDaniel Fojt dfa->lex.ptr = p;
1403*09d4459fSDaniel Fojt dfa->lex.left = lim - p;
1404*09d4459fSDaniel Fojt }
1405*09d4459fSDaniel Fojt dfa->lex.laststart = false;
1406*09d4459fSDaniel Fojt return dfa->lex.lasttok = REPMN;
1407*09d4459fSDaniel Fojt
1408*09d4459fSDaniel Fojt case '|':
1409*09d4459fSDaniel Fojt if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1410*09d4459fSDaniel Fojt goto normal_char;
1411*09d4459fSDaniel Fojt if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1412*09d4459fSDaniel Fojt goto normal_char;
1413*09d4459fSDaniel Fojt dfa->lex.laststart = true;
1414*09d4459fSDaniel Fojt return dfa->lex.lasttok = OR;
1415*09d4459fSDaniel Fojt
1416*09d4459fSDaniel Fojt case '\n':
1417*09d4459fSDaniel Fojt if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1418*09d4459fSDaniel Fojt || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1419*09d4459fSDaniel Fojt goto normal_char;
1420*09d4459fSDaniel Fojt dfa->lex.laststart = true;
1421*09d4459fSDaniel Fojt return dfa->lex.lasttok = OR;
1422*09d4459fSDaniel Fojt
1423*09d4459fSDaniel Fojt case '(':
1424*09d4459fSDaniel Fojt if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1425*09d4459fSDaniel Fojt goto normal_char;
1426*09d4459fSDaniel Fojt dfa->lex.parens++;
1427*09d4459fSDaniel Fojt dfa->lex.laststart = true;
1428*09d4459fSDaniel Fojt return dfa->lex.lasttok = LPAREN;
1429*09d4459fSDaniel Fojt
1430*09d4459fSDaniel Fojt case ')':
1431*09d4459fSDaniel Fojt if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1432*09d4459fSDaniel Fojt goto normal_char;
1433*09d4459fSDaniel Fojt if (dfa->lex.parens == 0
1434*09d4459fSDaniel Fojt && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1435*09d4459fSDaniel Fojt goto normal_char;
1436*09d4459fSDaniel Fojt dfa->lex.parens--;
1437*09d4459fSDaniel Fojt dfa->lex.laststart = false;
1438*09d4459fSDaniel Fojt return dfa->lex.lasttok = RPAREN;
1439*09d4459fSDaniel Fojt
1440*09d4459fSDaniel Fojt case '.':
1441*09d4459fSDaniel Fojt if (backslash)
1442*09d4459fSDaniel Fojt goto normal_char;
1443*09d4459fSDaniel Fojt if (dfa->canychar < 0)
1444*09d4459fSDaniel Fojt {
1445*09d4459fSDaniel Fojt charclass ccl;
1446*09d4459fSDaniel Fojt fillset (&ccl);
1447*09d4459fSDaniel Fojt if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1448*09d4459fSDaniel Fojt clrbit ('\n', &ccl);
1449*09d4459fSDaniel Fojt if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1450*09d4459fSDaniel Fojt clrbit ('\0', &ccl);
1451*09d4459fSDaniel Fojt if (dfa->localeinfo.multibyte)
1452*09d4459fSDaniel Fojt for (int c2 = 0; c2 < NOTCHAR; c2++)
1453*09d4459fSDaniel Fojt if (dfa->localeinfo.sbctowc[c2] == WEOF)
1454*09d4459fSDaniel Fojt clrbit (c2, &ccl);
1455*09d4459fSDaniel Fojt dfa->canychar = charclass_index (dfa, &ccl);
1456*09d4459fSDaniel Fojt }
1457*09d4459fSDaniel Fojt dfa->lex.laststart = false;
1458*09d4459fSDaniel Fojt return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1459*09d4459fSDaniel Fojt ? ANYCHAR
1460*09d4459fSDaniel Fojt : CSET + dfa->canychar);
1461*09d4459fSDaniel Fojt
1462*09d4459fSDaniel Fojt case 's':
1463*09d4459fSDaniel Fojt case 'S':
1464*09d4459fSDaniel Fojt if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1465*09d4459fSDaniel Fojt goto normal_char;
1466*09d4459fSDaniel Fojt if (!dfa->localeinfo.multibyte)
1467*09d4459fSDaniel Fojt {
1468*09d4459fSDaniel Fojt charclass ccl;
1469*09d4459fSDaniel Fojt zeroset (&ccl);
1470*09d4459fSDaniel Fojt for (int c2 = 0; c2 < NOTCHAR; ++c2)
1471*09d4459fSDaniel Fojt if (isspace (c2))
1472*09d4459fSDaniel Fojt setbit (c2, &ccl);
1473*09d4459fSDaniel Fojt if (c == 'S')
1474*09d4459fSDaniel Fojt notset (&ccl);
1475*09d4459fSDaniel Fojt dfa->lex.laststart = false;
1476*09d4459fSDaniel Fojt return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1477*09d4459fSDaniel Fojt }
1478*09d4459fSDaniel Fojt
1479*09d4459fSDaniel Fojt /* FIXME: see if optimizing this, as is done with ANYCHAR and
1480*09d4459fSDaniel Fojt add_utf8_anychar, makes sense. */
1481*09d4459fSDaniel Fojt
1482*09d4459fSDaniel Fojt /* \s and \S are documented to be equivalent to [[:space:]] and
1483*09d4459fSDaniel Fojt [^[:space:]] respectively, so tell the lexer to process those
1484*09d4459fSDaniel Fojt strings, each minus its "already processed" '['. */
1485*09d4459fSDaniel Fojt {
1486*09d4459fSDaniel Fojt struct lexptr ls;
1487*09d4459fSDaniel Fojt push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1488*09d4459fSDaniel Fojt dfa->lex.lasttok = parse_bracket_exp (dfa);
1489*09d4459fSDaniel Fojt pop_lex_state (dfa, &ls);
1490*09d4459fSDaniel Fojt }
1491*09d4459fSDaniel Fojt
1492*09d4459fSDaniel Fojt dfa->lex.laststart = false;
1493*09d4459fSDaniel Fojt return dfa->lex.lasttok;
1494*09d4459fSDaniel Fojt
1495*09d4459fSDaniel Fojt case 'w':
1496*09d4459fSDaniel Fojt case 'W':
1497*09d4459fSDaniel Fojt if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1498*09d4459fSDaniel Fojt goto normal_char;
1499*09d4459fSDaniel Fojt
1500*09d4459fSDaniel Fojt if (!dfa->localeinfo.multibyte)
1501*09d4459fSDaniel Fojt {
1502*09d4459fSDaniel Fojt charclass ccl;
1503*09d4459fSDaniel Fojt zeroset (&ccl);
1504*09d4459fSDaniel Fojt for (int c2 = 0; c2 < NOTCHAR; ++c2)
1505*09d4459fSDaniel Fojt if (dfa->syntax.sbit[c2] == CTX_LETTER)
1506*09d4459fSDaniel Fojt setbit (c2, &ccl);
1507*09d4459fSDaniel Fojt if (c == 'W')
1508*09d4459fSDaniel Fojt notset (&ccl);
1509*09d4459fSDaniel Fojt dfa->lex.laststart = false;
1510*09d4459fSDaniel Fojt return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1511*09d4459fSDaniel Fojt }
1512*09d4459fSDaniel Fojt
1513*09d4459fSDaniel Fojt /* FIXME: see if optimizing this, as is done with ANYCHAR and
1514*09d4459fSDaniel Fojt add_utf8_anychar, makes sense. */
1515*09d4459fSDaniel Fojt
1516*09d4459fSDaniel Fojt /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1517*09d4459fSDaniel Fojt [^_[:alnum:]] respectively, so tell the lexer to process those
1518*09d4459fSDaniel Fojt strings, each minus its "already processed" '['. */
1519*09d4459fSDaniel Fojt {
1520*09d4459fSDaniel Fojt struct lexptr ls;
1521*09d4459fSDaniel Fojt push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1522*09d4459fSDaniel Fojt dfa->lex.lasttok = parse_bracket_exp (dfa);
1523*09d4459fSDaniel Fojt pop_lex_state (dfa, &ls);
1524*09d4459fSDaniel Fojt }
1525*09d4459fSDaniel Fojt
1526*09d4459fSDaniel Fojt dfa->lex.laststart = false;
1527*09d4459fSDaniel Fojt return dfa->lex.lasttok;
1528*09d4459fSDaniel Fojt
1529*09d4459fSDaniel Fojt case '[':
1530*09d4459fSDaniel Fojt if (backslash)
1531*09d4459fSDaniel Fojt goto normal_char;
1532*09d4459fSDaniel Fojt dfa->lex.laststart = false;
1533*09d4459fSDaniel Fojt return dfa->lex.lasttok = parse_bracket_exp (dfa);
1534*09d4459fSDaniel Fojt
1535*09d4459fSDaniel Fojt default:
1536*09d4459fSDaniel Fojt normal_char:
1537*09d4459fSDaniel Fojt dfa->lex.laststart = false;
1538*09d4459fSDaniel Fojt /* For multibyte character sets, folding is done in atom. Always
1539*09d4459fSDaniel Fojt return WCHAR. */
1540*09d4459fSDaniel Fojt if (dfa->localeinfo.multibyte)
1541*09d4459fSDaniel Fojt return dfa->lex.lasttok = WCHAR;
1542*09d4459fSDaniel Fojt
1543*09d4459fSDaniel Fojt if (dfa->syntax.case_fold && isalpha (c))
1544*09d4459fSDaniel Fojt {
1545*09d4459fSDaniel Fojt charclass ccl;
1546*09d4459fSDaniel Fojt zeroset (&ccl);
1547*09d4459fSDaniel Fojt setbit_case_fold_c (c, &ccl);
1548*09d4459fSDaniel Fojt return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1549*09d4459fSDaniel Fojt }
1550*09d4459fSDaniel Fojt
1551*09d4459fSDaniel Fojt return dfa->lex.lasttok = c;
1552*09d4459fSDaniel Fojt }
1553*09d4459fSDaniel Fojt }
1554*09d4459fSDaniel Fojt
1555*09d4459fSDaniel Fojt /* The above loop should consume at most a backslash
1556*09d4459fSDaniel Fojt and some other character. */
1557*09d4459fSDaniel Fojt abort ();
1558*09d4459fSDaniel Fojt return END; /* keeps pedantic compilers happy. */
1559*09d4459fSDaniel Fojt }
1560*09d4459fSDaniel Fojt
1561*09d4459fSDaniel Fojt static void
addtok_mb(struct dfa * dfa,token t,char mbprop)1562*09d4459fSDaniel Fojt addtok_mb (struct dfa *dfa, token t, char mbprop)
1563*09d4459fSDaniel Fojt {
1564*09d4459fSDaniel Fojt if (dfa->talloc == dfa->tindex)
1565*09d4459fSDaniel Fojt {
1566*09d4459fSDaniel Fojt dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1567*09d4459fSDaniel Fojt sizeof *dfa->tokens);
1568*09d4459fSDaniel Fojt if (dfa->localeinfo.multibyte)
1569*09d4459fSDaniel Fojt dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1570*09d4459fSDaniel Fojt sizeof *dfa->multibyte_prop);
1571*09d4459fSDaniel Fojt }
1572*09d4459fSDaniel Fojt if (dfa->localeinfo.multibyte)
1573*09d4459fSDaniel Fojt dfa->multibyte_prop[dfa->tindex] = mbprop;
1574*09d4459fSDaniel Fojt dfa->tokens[dfa->tindex++] = t;
1575*09d4459fSDaniel Fojt
1576*09d4459fSDaniel Fojt switch (t)
1577*09d4459fSDaniel Fojt {
1578*09d4459fSDaniel Fojt case QMARK:
1579*09d4459fSDaniel Fojt case STAR:
1580*09d4459fSDaniel Fojt case PLUS:
1581*09d4459fSDaniel Fojt break;
1582*09d4459fSDaniel Fojt
1583*09d4459fSDaniel Fojt case CAT:
1584*09d4459fSDaniel Fojt case OR:
1585*09d4459fSDaniel Fojt dfa->parse.depth--;
1586*09d4459fSDaniel Fojt break;
1587*09d4459fSDaniel Fojt
1588*09d4459fSDaniel Fojt case BACKREF:
1589*09d4459fSDaniel Fojt dfa->fast = false;
1590*09d4459fSDaniel Fojt FALLTHROUGH;
1591*09d4459fSDaniel Fojt default:
1592*09d4459fSDaniel Fojt dfa->nleaves++;
1593*09d4459fSDaniel Fojt FALLTHROUGH;
1594*09d4459fSDaniel Fojt case EMPTY:
1595*09d4459fSDaniel Fojt dfa->parse.depth++;
1596*09d4459fSDaniel Fojt break;
1597*09d4459fSDaniel Fojt }
1598*09d4459fSDaniel Fojt if (dfa->parse.depth > dfa->depth)
1599*09d4459fSDaniel Fojt dfa->depth = dfa->parse.depth;
1600*09d4459fSDaniel Fojt }
1601*09d4459fSDaniel Fojt
1602*09d4459fSDaniel Fojt static void addtok_wc (struct dfa *dfa, wint_t wc);
1603*09d4459fSDaniel Fojt
1604*09d4459fSDaniel Fojt /* Add the given token to the parse tree, maintaining the depth count and
1605*09d4459fSDaniel Fojt updating the maximum depth if necessary. */
1606*09d4459fSDaniel Fojt static void
addtok(struct dfa * dfa,token t)1607*09d4459fSDaniel Fojt addtok (struct dfa *dfa, token t)
1608*09d4459fSDaniel Fojt {
1609*09d4459fSDaniel Fojt if (dfa->localeinfo.multibyte && t == MBCSET)
1610*09d4459fSDaniel Fojt {
1611*09d4459fSDaniel Fojt bool need_or = false;
1612*09d4459fSDaniel Fojt
1613*09d4459fSDaniel Fojt /* Extract wide characters into alternations for better performance.
1614*09d4459fSDaniel Fojt This does not require UTF-8. */
1615*09d4459fSDaniel Fojt for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1616*09d4459fSDaniel Fojt {
1617*09d4459fSDaniel Fojt addtok_wc (dfa, dfa->lex.brack.chars[i]);
1618*09d4459fSDaniel Fojt if (need_or)
1619*09d4459fSDaniel Fojt addtok (dfa, OR);
1620*09d4459fSDaniel Fojt need_or = true;
1621*09d4459fSDaniel Fojt }
1622*09d4459fSDaniel Fojt dfa->lex.brack.nchars = 0;
1623*09d4459fSDaniel Fojt
1624*09d4459fSDaniel Fojt /* Wide characters have been handled above, so it is possible
1625*09d4459fSDaniel Fojt that the set is empty now. Do nothing in that case. */
1626*09d4459fSDaniel Fojt if (dfa->lex.brack.cset != -1)
1627*09d4459fSDaniel Fojt {
1628*09d4459fSDaniel Fojt addtok (dfa, CSET + dfa->lex.brack.cset);
1629*09d4459fSDaniel Fojt if (need_or)
1630*09d4459fSDaniel Fojt addtok (dfa, OR);
1631*09d4459fSDaniel Fojt }
1632*09d4459fSDaniel Fojt }
1633*09d4459fSDaniel Fojt else
1634*09d4459fSDaniel Fojt {
1635*09d4459fSDaniel Fojt addtok_mb (dfa, t, 3);
1636*09d4459fSDaniel Fojt }
1637*09d4459fSDaniel Fojt }
1638*09d4459fSDaniel Fojt
1639*09d4459fSDaniel Fojt /* We treat a multibyte character as a single atom, so that DFA
1640*09d4459fSDaniel Fojt can treat a multibyte character as a single expression.
1641*09d4459fSDaniel Fojt
1642*09d4459fSDaniel Fojt e.g., we construct the following tree from "<mb1><mb2>".
1643*09d4459fSDaniel Fojt <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1644*09d4459fSDaniel Fojt <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1645*09d4459fSDaniel Fojt static void
addtok_wc(struct dfa * dfa,wint_t wc)1646*09d4459fSDaniel Fojt addtok_wc (struct dfa *dfa, wint_t wc)
1647*09d4459fSDaniel Fojt {
1648*09d4459fSDaniel Fojt unsigned char buf[MB_LEN_MAX];
1649*09d4459fSDaniel Fojt mbstate_t s = { 0 };
1650*09d4459fSDaniel Fojt size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1651*09d4459fSDaniel Fojt int buflen;
1652*09d4459fSDaniel Fojt
1653*09d4459fSDaniel Fojt if (stored_bytes != (size_t) -1)
1654*09d4459fSDaniel Fojt buflen = stored_bytes;
1655*09d4459fSDaniel Fojt else
1656*09d4459fSDaniel Fojt {
1657*09d4459fSDaniel Fojt /* This is merely stop-gap. buf[0] is undefined, yet skipping
1658*09d4459fSDaniel Fojt the addtok_mb call altogether can corrupt the heap. */
1659*09d4459fSDaniel Fojt buflen = 1;
1660*09d4459fSDaniel Fojt buf[0] = 0;
1661*09d4459fSDaniel Fojt }
1662*09d4459fSDaniel Fojt
1663*09d4459fSDaniel Fojt addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1664*09d4459fSDaniel Fojt for (int i = 1; i < buflen; i++)
1665*09d4459fSDaniel Fojt {
1666*09d4459fSDaniel Fojt addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1667*09d4459fSDaniel Fojt addtok (dfa, CAT);
1668*09d4459fSDaniel Fojt }
1669*09d4459fSDaniel Fojt }
1670*09d4459fSDaniel Fojt
1671*09d4459fSDaniel Fojt static void
add_utf8_anychar(struct dfa * dfa)1672*09d4459fSDaniel Fojt add_utf8_anychar (struct dfa *dfa)
1673*09d4459fSDaniel Fojt {
1674*09d4459fSDaniel Fojt /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1675*09d4459fSDaniel Fojt UTF-8 byte sequence has been defined as follows:
1676*09d4459fSDaniel Fojt
1677*09d4459fSDaniel Fojt ([\x00-\x7f]
1678*09d4459fSDaniel Fojt |[\xc2-\xdf][\x80-\xbf]
1679*09d4459fSDaniel Fojt |[\xe0][\xa0-\xbf][\x80-\xbf]
1680*09d4459fSDaniel Fojt |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1681*09d4459fSDaniel Fojt |[\xed][\x80-\x9f][\x80-\xbf]
1682*09d4459fSDaniel Fojt |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1683*09d4459fSDaniel Fojt |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1684*09d4459fSDaniel Fojt |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1685*09d4459fSDaniel Fojt
1686*09d4459fSDaniel Fojt which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1687*09d4459fSDaniel Fojt where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1688*09d4459fSDaniel Fojt D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1689*09d4459fSDaniel Fojt H = [\x80-\x9f], I = [\xf0],
1690*09d4459fSDaniel Fojt J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1691*09d4459fSDaniel Fojt
1692*09d4459fSDaniel Fojt This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C". */
1693*09d4459fSDaniel Fojt
1694*09d4459fSDaniel Fojt /* Mnemonics for classes containing two or more bytes. */
1695*09d4459fSDaniel Fojt enum { A, B, C, E, F, H, J, K, M };
1696*09d4459fSDaniel Fojt
1697*09d4459fSDaniel Fojt /* Mnemonics for single-byte tokens. */
1698*09d4459fSDaniel Fojt enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1699*09d4459fSDaniel Fojt
1700*09d4459fSDaniel Fojt static charclass const utf8_classes[] = {
1701*09d4459fSDaniel Fojt /* A. 00-7f: 1-byte sequence. */
1702*09d4459fSDaniel Fojt CHARCLASS_INIT (0xffffffffffffffff, 0xffffffffffffffff, 0, 0),
1703*09d4459fSDaniel Fojt
1704*09d4459fSDaniel Fojt /* B. c2-df: 1st byte of a 2-byte sequence. */
1705*09d4459fSDaniel Fojt CHARCLASS_INIT (0, 0, 0, 0x00000000fffffffc),
1706*09d4459fSDaniel Fojt
1707*09d4459fSDaniel Fojt /* C. 80-bf: non-leading bytes. */
1708*09d4459fSDaniel Fojt CHARCLASS_INIT (0, 0, 0xffffffffffffffff, 0),
1709*09d4459fSDaniel Fojt
1710*09d4459fSDaniel Fojt /* D. e0 (just a token). */
1711*09d4459fSDaniel Fojt
1712*09d4459fSDaniel Fojt /* E. a0-bf: 2nd byte of a "DEC" sequence. */
1713*09d4459fSDaniel Fojt CHARCLASS_INIT (0, 0, 0xffffffff00000000, 0),
1714*09d4459fSDaniel Fojt
1715*09d4459fSDaniel Fojt /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1716*09d4459fSDaniel Fojt CHARCLASS_INIT (0, 0, 0, 0x0000dffe00000000),
1717*09d4459fSDaniel Fojt
1718*09d4459fSDaniel Fojt /* G. ed (just a token). */
1719*09d4459fSDaniel Fojt
1720*09d4459fSDaniel Fojt /* H. 80-9f: 2nd byte of a "GHC" sequence. */
1721*09d4459fSDaniel Fojt CHARCLASS_INIT (0, 0, 0x000000000000ffff, 0),
1722*09d4459fSDaniel Fojt
1723*09d4459fSDaniel Fojt /* I. f0 (just a token). */
1724*09d4459fSDaniel Fojt
1725*09d4459fSDaniel Fojt /* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1726*09d4459fSDaniel Fojt CHARCLASS_INIT (0, 0, 0xffffffffffff0000, 0),
1727*09d4459fSDaniel Fojt
1728*09d4459fSDaniel Fojt /* K. f1-f3: 1st byte of a "KCCC" sequence. */
1729*09d4459fSDaniel Fojt CHARCLASS_INIT (0, 0, 0, 0x000e000000000000),
1730*09d4459fSDaniel Fojt
1731*09d4459fSDaniel Fojt /* L. f4 (just a token). */
1732*09d4459fSDaniel Fojt
1733*09d4459fSDaniel Fojt /* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1734*09d4459fSDaniel Fojt CHARCLASS_INIT (0, 0, 0x00000000000000ff, 0),
1735*09d4459fSDaniel Fojt };
1736*09d4459fSDaniel Fojt
1737*09d4459fSDaniel Fojt /* Define the character classes that are needed below. */
1738*09d4459fSDaniel Fojt if (dfa->utf8_anychar_classes[0] == 0)
1739*09d4459fSDaniel Fojt {
1740*09d4459fSDaniel Fojt charclass c = utf8_classes[0];
1741*09d4459fSDaniel Fojt if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1742*09d4459fSDaniel Fojt clrbit ('\n', &c);
1743*09d4459fSDaniel Fojt if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1744*09d4459fSDaniel Fojt clrbit ('\0', &c);
1745*09d4459fSDaniel Fojt dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1746*09d4459fSDaniel Fojt
1747*09d4459fSDaniel Fojt for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1748*09d4459fSDaniel Fojt dfa->utf8_anychar_classes[i]
1749*09d4459fSDaniel Fojt = CSET + charclass_index (dfa, &utf8_classes[i]);
1750*09d4459fSDaniel Fojt }
1751*09d4459fSDaniel Fojt
1752*09d4459fSDaniel Fojt /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1753*09d4459fSDaniel Fojt The token buffer is in reverse Polish order, so we get
1754*09d4459fSDaniel Fojt "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1755*09d4459fSDaniel Fojt C CAT OR C CAT OR C CAT OR". */
1756*09d4459fSDaniel Fojt addtok (dfa, dfa->utf8_anychar_classes[A]);
1757*09d4459fSDaniel Fojt addtok (dfa, dfa->utf8_anychar_classes[B]);
1758*09d4459fSDaniel Fojt addtok (dfa, D_token);
1759*09d4459fSDaniel Fojt addtok (dfa, dfa->utf8_anychar_classes[E]);
1760*09d4459fSDaniel Fojt addtok (dfa, CAT);
1761*09d4459fSDaniel Fojt addtok (dfa, OR);
1762*09d4459fSDaniel Fojt addtok (dfa, G_token);
1763*09d4459fSDaniel Fojt addtok (dfa, dfa->utf8_anychar_classes[H]);
1764*09d4459fSDaniel Fojt addtok (dfa, CAT);
1765*09d4459fSDaniel Fojt addtok (dfa, OR);
1766*09d4459fSDaniel Fojt addtok (dfa, dfa->utf8_anychar_classes[F]);
1767*09d4459fSDaniel Fojt addtok (dfa, I_token);
1768*09d4459fSDaniel Fojt addtok (dfa, dfa->utf8_anychar_classes[J]);
1769*09d4459fSDaniel Fojt addtok (dfa, CAT);
1770*09d4459fSDaniel Fojt addtok (dfa, OR);
1771*09d4459fSDaniel Fojt addtok (dfa, L_token);
1772*09d4459fSDaniel Fojt addtok (dfa, dfa->utf8_anychar_classes[M]);
1773*09d4459fSDaniel Fojt addtok (dfa, CAT);
1774*09d4459fSDaniel Fojt addtok (dfa, OR);
1775*09d4459fSDaniel Fojt addtok (dfa, dfa->utf8_anychar_classes[K]);
1776*09d4459fSDaniel Fojt for (int i = 0; i < 3; i++)
1777*09d4459fSDaniel Fojt {
1778*09d4459fSDaniel Fojt addtok (dfa, dfa->utf8_anychar_classes[C]);
1779*09d4459fSDaniel Fojt addtok (dfa, CAT);
1780*09d4459fSDaniel Fojt addtok (dfa, OR);
1781*09d4459fSDaniel Fojt }
1782*09d4459fSDaniel Fojt }
1783*09d4459fSDaniel Fojt
1784*09d4459fSDaniel Fojt /* The grammar understood by the parser is as follows.
1785*09d4459fSDaniel Fojt
1786*09d4459fSDaniel Fojt regexp:
1787*09d4459fSDaniel Fojt regexp OR branch
1788*09d4459fSDaniel Fojt branch
1789*09d4459fSDaniel Fojt
1790*09d4459fSDaniel Fojt branch:
1791*09d4459fSDaniel Fojt branch closure
1792*09d4459fSDaniel Fojt closure
1793*09d4459fSDaniel Fojt
1794*09d4459fSDaniel Fojt closure:
1795*09d4459fSDaniel Fojt closure QMARK
1796*09d4459fSDaniel Fojt closure STAR
1797*09d4459fSDaniel Fojt closure PLUS
1798*09d4459fSDaniel Fojt closure REPMN
1799*09d4459fSDaniel Fojt atom
1800*09d4459fSDaniel Fojt
1801*09d4459fSDaniel Fojt atom:
1802*09d4459fSDaniel Fojt <normal character>
1803*09d4459fSDaniel Fojt <multibyte character>
1804*09d4459fSDaniel Fojt ANYCHAR
1805*09d4459fSDaniel Fojt MBCSET
1806*09d4459fSDaniel Fojt CSET
1807*09d4459fSDaniel Fojt BACKREF
1808*09d4459fSDaniel Fojt BEGLINE
1809*09d4459fSDaniel Fojt ENDLINE
1810*09d4459fSDaniel Fojt BEGWORD
1811*09d4459fSDaniel Fojt ENDWORD
1812*09d4459fSDaniel Fojt LIMWORD
1813*09d4459fSDaniel Fojt NOTLIMWORD
1814*09d4459fSDaniel Fojt LPAREN regexp RPAREN
1815*09d4459fSDaniel Fojt <empty>
1816*09d4459fSDaniel Fojt
1817*09d4459fSDaniel Fojt The parser builds a parse tree in postfix form in an array of tokens. */
1818*09d4459fSDaniel Fojt
1819*09d4459fSDaniel Fojt static void
atom(struct dfa * dfa)1820*09d4459fSDaniel Fojt atom (struct dfa *dfa)
1821*09d4459fSDaniel Fojt {
1822*09d4459fSDaniel Fojt if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1823*09d4459fSDaniel Fojt || dfa->parse.tok >= CSET
1824*09d4459fSDaniel Fojt || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1825*09d4459fSDaniel Fojt || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1826*09d4459fSDaniel Fojt || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1827*09d4459fSDaniel Fojt || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1828*09d4459fSDaniel Fojt || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1829*09d4459fSDaniel Fojt {
1830*09d4459fSDaniel Fojt if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1831*09d4459fSDaniel Fojt {
1832*09d4459fSDaniel Fojt /* For UTF-8 expand the period to a series of CSETs that define a
1833*09d4459fSDaniel Fojt valid UTF-8 character. This avoids using the slow multibyte
1834*09d4459fSDaniel Fojt path. I'm pretty sure it would be both profitable and correct to
1835*09d4459fSDaniel Fojt do it for any encoding; however, the optimization must be done
1836*09d4459fSDaniel Fojt manually as it is done above in add_utf8_anychar. So, let's
1837*09d4459fSDaniel Fojt start with UTF-8: it is the most used, and the structure of the
1838*09d4459fSDaniel Fojt encoding makes the correctness more obvious. */
1839*09d4459fSDaniel Fojt add_utf8_anychar (dfa);
1840*09d4459fSDaniel Fojt }
1841*09d4459fSDaniel Fojt else
1842*09d4459fSDaniel Fojt addtok (dfa, dfa->parse.tok);
1843*09d4459fSDaniel Fojt dfa->parse.tok = lex (dfa);
1844*09d4459fSDaniel Fojt }
1845*09d4459fSDaniel Fojt else if (dfa->parse.tok == WCHAR)
1846*09d4459fSDaniel Fojt {
1847*09d4459fSDaniel Fojt if (dfa->lex.wctok == WEOF)
1848*09d4459fSDaniel Fojt addtok (dfa, BACKREF);
1849*09d4459fSDaniel Fojt else
1850*09d4459fSDaniel Fojt {
1851*09d4459fSDaniel Fojt addtok_wc (dfa, dfa->lex.wctok);
1852*09d4459fSDaniel Fojt
1853*09d4459fSDaniel Fojt if (dfa->syntax.case_fold)
1854*09d4459fSDaniel Fojt {
1855*09d4459fSDaniel Fojt wchar_t folded[CASE_FOLDED_BUFSIZE];
1856*09d4459fSDaniel Fojt int n = case_folded_counterparts (dfa->lex.wctok, folded);
1857*09d4459fSDaniel Fojt for (int i = 0; i < n; i++)
1858*09d4459fSDaniel Fojt {
1859*09d4459fSDaniel Fojt addtok_wc (dfa, folded[i]);
1860*09d4459fSDaniel Fojt addtok (dfa, OR);
1861*09d4459fSDaniel Fojt }
1862*09d4459fSDaniel Fojt }
1863*09d4459fSDaniel Fojt }
1864*09d4459fSDaniel Fojt
1865*09d4459fSDaniel Fojt dfa->parse.tok = lex (dfa);
1866*09d4459fSDaniel Fojt }
1867*09d4459fSDaniel Fojt else if (dfa->parse.tok == LPAREN)
1868*09d4459fSDaniel Fojt {
1869*09d4459fSDaniel Fojt dfa->parse.tok = lex (dfa);
1870*09d4459fSDaniel Fojt regexp (dfa);
1871*09d4459fSDaniel Fojt if (dfa->parse.tok != RPAREN)
1872*09d4459fSDaniel Fojt dfaerror (_("unbalanced ("));
1873*09d4459fSDaniel Fojt dfa->parse.tok = lex (dfa);
1874*09d4459fSDaniel Fojt }
1875*09d4459fSDaniel Fojt else
1876*09d4459fSDaniel Fojt addtok (dfa, EMPTY);
1877*09d4459fSDaniel Fojt }
1878*09d4459fSDaniel Fojt
1879*09d4459fSDaniel Fojt /* Return the number of tokens in the given subexpression. */
1880*09d4459fSDaniel Fojt static idx_t _GL_ATTRIBUTE_PURE
nsubtoks(struct dfa const * dfa,idx_t tindex)1881*09d4459fSDaniel Fojt nsubtoks (struct dfa const *dfa, idx_t tindex)
1882*09d4459fSDaniel Fojt {
1883*09d4459fSDaniel Fojt switch (dfa->tokens[tindex - 1])
1884*09d4459fSDaniel Fojt {
1885*09d4459fSDaniel Fojt default:
1886*09d4459fSDaniel Fojt return 1;
1887*09d4459fSDaniel Fojt case QMARK:
1888*09d4459fSDaniel Fojt case STAR:
1889*09d4459fSDaniel Fojt case PLUS:
1890*09d4459fSDaniel Fojt return 1 + nsubtoks (dfa, tindex - 1);
1891*09d4459fSDaniel Fojt case CAT:
1892*09d4459fSDaniel Fojt case OR:
1893*09d4459fSDaniel Fojt {
1894*09d4459fSDaniel Fojt idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1895*09d4459fSDaniel Fojt return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1896*09d4459fSDaniel Fojt }
1897*09d4459fSDaniel Fojt }
1898*09d4459fSDaniel Fojt }
1899*09d4459fSDaniel Fojt
1900*09d4459fSDaniel Fojt /* Copy the given subexpression to the top of the tree. */
1901*09d4459fSDaniel Fojt static void
copytoks(struct dfa * dfa,idx_t tindex,idx_t ntokens)1902*09d4459fSDaniel Fojt copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1903*09d4459fSDaniel Fojt {
1904*09d4459fSDaniel Fojt if (dfa->localeinfo.multibyte)
1905*09d4459fSDaniel Fojt for (idx_t i = 0; i < ntokens; i++)
1906*09d4459fSDaniel Fojt addtok_mb (dfa, dfa->tokens[tindex + i],
1907*09d4459fSDaniel Fojt dfa->multibyte_prop[tindex + i]);
1908*09d4459fSDaniel Fojt else
1909*09d4459fSDaniel Fojt for (idx_t i = 0; i < ntokens; i++)
1910*09d4459fSDaniel Fojt addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1911*09d4459fSDaniel Fojt }
1912*09d4459fSDaniel Fojt
1913*09d4459fSDaniel Fojt static void
closure(struct dfa * dfa)1914*09d4459fSDaniel Fojt closure (struct dfa *dfa)
1915*09d4459fSDaniel Fojt {
1916*09d4459fSDaniel Fojt atom (dfa);
1917*09d4459fSDaniel Fojt while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1918*09d4459fSDaniel Fojt || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1919*09d4459fSDaniel Fojt if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1920*09d4459fSDaniel Fojt {
1921*09d4459fSDaniel Fojt idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1922*09d4459fSDaniel Fojt idx_t tindex = dfa->tindex - ntokens;
1923*09d4459fSDaniel Fojt if (dfa->lex.maxrep < 0)
1924*09d4459fSDaniel Fojt addtok (dfa, PLUS);
1925*09d4459fSDaniel Fojt if (dfa->lex.minrep == 0)
1926*09d4459fSDaniel Fojt addtok (dfa, QMARK);
1927*09d4459fSDaniel Fojt int i;
1928*09d4459fSDaniel Fojt for (i = 1; i < dfa->lex.minrep; i++)
1929*09d4459fSDaniel Fojt {
1930*09d4459fSDaniel Fojt copytoks (dfa, tindex, ntokens);
1931*09d4459fSDaniel Fojt addtok (dfa, CAT);
1932*09d4459fSDaniel Fojt }
1933*09d4459fSDaniel Fojt for (; i < dfa->lex.maxrep; i++)
1934*09d4459fSDaniel Fojt {
1935*09d4459fSDaniel Fojt copytoks (dfa, tindex, ntokens);
1936*09d4459fSDaniel Fojt addtok (dfa, QMARK);
1937*09d4459fSDaniel Fojt addtok (dfa, CAT);
1938*09d4459fSDaniel Fojt }
1939*09d4459fSDaniel Fojt dfa->parse.tok = lex (dfa);
1940*09d4459fSDaniel Fojt }
1941*09d4459fSDaniel Fojt else if (dfa->parse.tok == REPMN)
1942*09d4459fSDaniel Fojt {
1943*09d4459fSDaniel Fojt dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1944*09d4459fSDaniel Fojt dfa->parse.tok = lex (dfa);
1945*09d4459fSDaniel Fojt closure (dfa);
1946*09d4459fSDaniel Fojt }
1947*09d4459fSDaniel Fojt else
1948*09d4459fSDaniel Fojt {
1949*09d4459fSDaniel Fojt addtok (dfa, dfa->parse.tok);
1950*09d4459fSDaniel Fojt dfa->parse.tok = lex (dfa);
1951*09d4459fSDaniel Fojt }
1952*09d4459fSDaniel Fojt }
1953*09d4459fSDaniel Fojt
1954*09d4459fSDaniel Fojt static void
branch(struct dfa * dfa)1955*09d4459fSDaniel Fojt branch (struct dfa* dfa)
1956*09d4459fSDaniel Fojt {
1957*09d4459fSDaniel Fojt closure (dfa);
1958*09d4459fSDaniel Fojt while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1959*09d4459fSDaniel Fojt && dfa->parse.tok >= 0)
1960*09d4459fSDaniel Fojt {
1961*09d4459fSDaniel Fojt closure (dfa);
1962*09d4459fSDaniel Fojt addtok (dfa, CAT);
1963*09d4459fSDaniel Fojt }
1964*09d4459fSDaniel Fojt }
1965*09d4459fSDaniel Fojt
1966*09d4459fSDaniel Fojt static void
regexp(struct dfa * dfa)1967*09d4459fSDaniel Fojt regexp (struct dfa *dfa)
1968*09d4459fSDaniel Fojt {
1969*09d4459fSDaniel Fojt branch (dfa);
1970*09d4459fSDaniel Fojt while (dfa->parse.tok == OR)
1971*09d4459fSDaniel Fojt {
1972*09d4459fSDaniel Fojt dfa->parse.tok = lex (dfa);
1973*09d4459fSDaniel Fojt branch (dfa);
1974*09d4459fSDaniel Fojt addtok (dfa, OR);
1975*09d4459fSDaniel Fojt }
1976*09d4459fSDaniel Fojt }
1977*09d4459fSDaniel Fojt
1978*09d4459fSDaniel Fojt /* Parse a string S of length LEN into D. S can include NUL characters.
1979*09d4459fSDaniel Fojt This is the main entry point for the parser. */
1980*09d4459fSDaniel Fojt void
dfaparse(char const * s,idx_t len,struct dfa * d)1981*09d4459fSDaniel Fojt dfaparse (char const *s, idx_t len, struct dfa *d)
1982*09d4459fSDaniel Fojt {
1983*09d4459fSDaniel Fojt d->lex.ptr = s;
1984*09d4459fSDaniel Fojt d->lex.left = len;
1985*09d4459fSDaniel Fojt d->lex.lasttok = END;
1986*09d4459fSDaniel Fojt d->lex.laststart = true;
1987*09d4459fSDaniel Fojt
1988*09d4459fSDaniel Fojt if (!d->syntax.syntax_bits_set)
1989*09d4459fSDaniel Fojt dfaerror (_("no syntax specified"));
1990*09d4459fSDaniel Fojt
1991*09d4459fSDaniel Fojt if (!d->nregexps)
1992*09d4459fSDaniel Fojt addtok (d, BEG);
1993*09d4459fSDaniel Fojt
1994*09d4459fSDaniel Fojt d->parse.tok = lex (d);
1995*09d4459fSDaniel Fojt d->parse.depth = d->depth;
1996*09d4459fSDaniel Fojt
1997*09d4459fSDaniel Fojt regexp (d);
1998*09d4459fSDaniel Fojt
1999*09d4459fSDaniel Fojt if (d->parse.tok != END)
2000*09d4459fSDaniel Fojt dfaerror (_("unbalanced )"));
2001*09d4459fSDaniel Fojt
2002*09d4459fSDaniel Fojt addtok (d, END - d->nregexps);
2003*09d4459fSDaniel Fojt addtok (d, CAT);
2004*09d4459fSDaniel Fojt
2005*09d4459fSDaniel Fojt if (d->nregexps)
2006*09d4459fSDaniel Fojt addtok (d, OR);
2007*09d4459fSDaniel Fojt
2008*09d4459fSDaniel Fojt ++d->nregexps;
2009*09d4459fSDaniel Fojt }
2010*09d4459fSDaniel Fojt
2011*09d4459fSDaniel Fojt /* Some primitives for operating on sets of positions. */
2012*09d4459fSDaniel Fojt
2013*09d4459fSDaniel Fojt /* Copy one set to another. */
2014*09d4459fSDaniel Fojt static void
copy(position_set const * src,position_set * dst)2015*09d4459fSDaniel Fojt copy (position_set const *src, position_set *dst)
2016*09d4459fSDaniel Fojt {
2017*09d4459fSDaniel Fojt if (dst->alloc < src->nelem)
2018*09d4459fSDaniel Fojt {
2019*09d4459fSDaniel Fojt free (dst->elems);
2020*09d4459fSDaniel Fojt dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2021*09d4459fSDaniel Fojt sizeof *dst->elems);
2022*09d4459fSDaniel Fojt }
2023*09d4459fSDaniel Fojt dst->nelem = src->nelem;
2024*09d4459fSDaniel Fojt if (src->nelem != 0)
2025*09d4459fSDaniel Fojt memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2026*09d4459fSDaniel Fojt }
2027*09d4459fSDaniel Fojt
2028*09d4459fSDaniel Fojt static void
alloc_position_set(position_set * s,idx_t size)2029*09d4459fSDaniel Fojt alloc_position_set (position_set *s, idx_t size)
2030*09d4459fSDaniel Fojt {
2031*09d4459fSDaniel Fojt s->elems = xnmalloc (size, sizeof *s->elems);
2032*09d4459fSDaniel Fojt s->alloc = size;
2033*09d4459fSDaniel Fojt s->nelem = 0;
2034*09d4459fSDaniel Fojt }
2035*09d4459fSDaniel Fojt
2036*09d4459fSDaniel Fojt /* Insert position P in set S. S is maintained in sorted order on
2037*09d4459fSDaniel Fojt decreasing index. If there is already an entry in S with P.index
2038*09d4459fSDaniel Fojt then merge (logically-OR) P's constraints into the one in S.
2039*09d4459fSDaniel Fojt S->elems must point to an array large enough to hold the resulting set. */
2040*09d4459fSDaniel Fojt static void
insert(position p,position_set * s)2041*09d4459fSDaniel Fojt insert (position p, position_set *s)
2042*09d4459fSDaniel Fojt {
2043*09d4459fSDaniel Fojt idx_t count = s->nelem;
2044*09d4459fSDaniel Fojt idx_t lo = 0, hi = count;
2045*09d4459fSDaniel Fojt while (lo < hi)
2046*09d4459fSDaniel Fojt {
2047*09d4459fSDaniel Fojt idx_t mid = (lo + hi) >> 1;
2048*09d4459fSDaniel Fojt if (s->elems[mid].index < p.index)
2049*09d4459fSDaniel Fojt lo = mid + 1;
2050*09d4459fSDaniel Fojt else if (s->elems[mid].index == p.index)
2051*09d4459fSDaniel Fojt {
2052*09d4459fSDaniel Fojt s->elems[mid].constraint |= p.constraint;
2053*09d4459fSDaniel Fojt return;
2054*09d4459fSDaniel Fojt }
2055*09d4459fSDaniel Fojt else
2056*09d4459fSDaniel Fojt hi = mid;
2057*09d4459fSDaniel Fojt }
2058*09d4459fSDaniel Fojt
2059*09d4459fSDaniel Fojt s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2060*09d4459fSDaniel Fojt for (idx_t i = count; i > lo; i--)
2061*09d4459fSDaniel Fojt s->elems[i] = s->elems[i - 1];
2062*09d4459fSDaniel Fojt s->elems[lo] = p;
2063*09d4459fSDaniel Fojt ++s->nelem;
2064*09d4459fSDaniel Fojt }
2065*09d4459fSDaniel Fojt
2066*09d4459fSDaniel Fojt static void
append(position p,position_set * s)2067*09d4459fSDaniel Fojt append (position p, position_set *s)
2068*09d4459fSDaniel Fojt {
2069*09d4459fSDaniel Fojt idx_t count = s->nelem;
2070*09d4459fSDaniel Fojt s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2071*09d4459fSDaniel Fojt s->elems[s->nelem++] = p;
2072*09d4459fSDaniel Fojt }
2073*09d4459fSDaniel Fojt
2074*09d4459fSDaniel Fojt /* Merge S1 and S2 (with the additional constraint C2) into M. The
2075*09d4459fSDaniel Fojt result is as if the positions of S1, and of S2 with the additional
2076*09d4459fSDaniel Fojt constraint C2, were inserted into an initially empty set. */
2077*09d4459fSDaniel Fojt static void
merge_constrained(position_set const * s1,position_set const * s2,unsigned int c2,position_set * m)2078*09d4459fSDaniel Fojt merge_constrained (position_set const *s1, position_set const *s2,
2079*09d4459fSDaniel Fojt unsigned int c2, position_set *m)
2080*09d4459fSDaniel Fojt {
2081*09d4459fSDaniel Fojt idx_t i = 0, j = 0;
2082*09d4459fSDaniel Fojt
2083*09d4459fSDaniel Fojt if (m->alloc - s1->nelem < s2->nelem)
2084*09d4459fSDaniel Fojt {
2085*09d4459fSDaniel Fojt free (m->elems);
2086*09d4459fSDaniel Fojt m->alloc = s1->nelem;
2087*09d4459fSDaniel Fojt m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2088*09d4459fSDaniel Fojt }
2089*09d4459fSDaniel Fojt m->nelem = 0;
2090*09d4459fSDaniel Fojt while (i < s1->nelem || j < s2->nelem)
2091*09d4459fSDaniel Fojt if (! (j < s2->nelem)
2092*09d4459fSDaniel Fojt || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2093*09d4459fSDaniel Fojt {
2094*09d4459fSDaniel Fojt unsigned int c = ((i < s1->nelem && j < s2->nelem
2095*09d4459fSDaniel Fojt && s1->elems[i].index == s2->elems[j].index)
2096*09d4459fSDaniel Fojt ? s2->elems[j++].constraint & c2
2097*09d4459fSDaniel Fojt : 0);
2098*09d4459fSDaniel Fojt m->elems[m->nelem].index = s1->elems[i].index;
2099*09d4459fSDaniel Fojt m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2100*09d4459fSDaniel Fojt }
2101*09d4459fSDaniel Fojt else
2102*09d4459fSDaniel Fojt {
2103*09d4459fSDaniel Fojt if (s2->elems[j].constraint & c2)
2104*09d4459fSDaniel Fojt {
2105*09d4459fSDaniel Fojt m->elems[m->nelem].index = s2->elems[j].index;
2106*09d4459fSDaniel Fojt m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2107*09d4459fSDaniel Fojt }
2108*09d4459fSDaniel Fojt j++;
2109*09d4459fSDaniel Fojt }
2110*09d4459fSDaniel Fojt }
2111*09d4459fSDaniel Fojt
2112*09d4459fSDaniel Fojt /* Merge two sets of positions into a third. The result is exactly as if
2113*09d4459fSDaniel Fojt the positions of both sets were inserted into an initially empty set. */
2114*09d4459fSDaniel Fojt static void
merge(position_set const * s1,position_set const * s2,position_set * m)2115*09d4459fSDaniel Fojt merge (position_set const *s1, position_set const *s2, position_set *m)
2116*09d4459fSDaniel Fojt {
2117*09d4459fSDaniel Fojt merge_constrained (s1, s2, -1, m);
2118*09d4459fSDaniel Fojt }
2119*09d4459fSDaniel Fojt
2120*09d4459fSDaniel Fojt static void
merge2(position_set * dst,position_set const * src,position_set * m)2121*09d4459fSDaniel Fojt merge2 (position_set *dst, position_set const *src, position_set *m)
2122*09d4459fSDaniel Fojt {
2123*09d4459fSDaniel Fojt if (src->nelem < 4)
2124*09d4459fSDaniel Fojt {
2125*09d4459fSDaniel Fojt for (idx_t i = 0; i < src->nelem; i++)
2126*09d4459fSDaniel Fojt insert (src->elems[i], dst);
2127*09d4459fSDaniel Fojt }
2128*09d4459fSDaniel Fojt else
2129*09d4459fSDaniel Fojt {
2130*09d4459fSDaniel Fojt merge (src, dst, m);
2131*09d4459fSDaniel Fojt copy (m, dst);
2132*09d4459fSDaniel Fojt }
2133*09d4459fSDaniel Fojt }
2134*09d4459fSDaniel Fojt
2135*09d4459fSDaniel Fojt /* Delete a position from a set. Return the nonzero constraint of the
2136*09d4459fSDaniel Fojt deleted position, or zero if there was no such position. */
2137*09d4459fSDaniel Fojt static unsigned int
delete(idx_t del,position_set * s)2138*09d4459fSDaniel Fojt delete (idx_t del, position_set *s)
2139*09d4459fSDaniel Fojt {
2140*09d4459fSDaniel Fojt idx_t count = s->nelem;
2141*09d4459fSDaniel Fojt idx_t lo = 0, hi = count;
2142*09d4459fSDaniel Fojt while (lo < hi)
2143*09d4459fSDaniel Fojt {
2144*09d4459fSDaniel Fojt idx_t mid = (lo + hi) >> 1;
2145*09d4459fSDaniel Fojt if (s->elems[mid].index < del)
2146*09d4459fSDaniel Fojt lo = mid + 1;
2147*09d4459fSDaniel Fojt else if (s->elems[mid].index == del)
2148*09d4459fSDaniel Fojt {
2149*09d4459fSDaniel Fojt unsigned int c = s->elems[mid].constraint;
2150*09d4459fSDaniel Fojt idx_t i;
2151*09d4459fSDaniel Fojt for (i = mid; i + 1 < count; i++)
2152*09d4459fSDaniel Fojt s->elems[i] = s->elems[i + 1];
2153*09d4459fSDaniel Fojt s->nelem = i;
2154*09d4459fSDaniel Fojt return c;
2155*09d4459fSDaniel Fojt }
2156*09d4459fSDaniel Fojt else
2157*09d4459fSDaniel Fojt hi = mid;
2158*09d4459fSDaniel Fojt }
2159*09d4459fSDaniel Fojt return 0;
2160*09d4459fSDaniel Fojt }
2161*09d4459fSDaniel Fojt
2162*09d4459fSDaniel Fojt /* Replace a position with the followed set. */
2163*09d4459fSDaniel Fojt static void
replace(position_set * dst,idx_t del,position_set * add,unsigned int constraint,position_set * tmp)2164*09d4459fSDaniel Fojt replace (position_set *dst, idx_t del, position_set *add,
2165*09d4459fSDaniel Fojt unsigned int constraint, position_set *tmp)
2166*09d4459fSDaniel Fojt {
2167*09d4459fSDaniel Fojt unsigned int c = delete (del, dst) & constraint;
2168*09d4459fSDaniel Fojt
2169*09d4459fSDaniel Fojt if (c)
2170*09d4459fSDaniel Fojt {
2171*09d4459fSDaniel Fojt copy (dst, tmp);
2172*09d4459fSDaniel Fojt merge_constrained (tmp, add, c, dst);
2173*09d4459fSDaniel Fojt }
2174*09d4459fSDaniel Fojt }
2175*09d4459fSDaniel Fojt
2176*09d4459fSDaniel Fojt /* Find the index of the state corresponding to the given position set with
2177*09d4459fSDaniel Fojt the given preceding context, or create a new state if there is no such
2178*09d4459fSDaniel Fojt state. Context tells whether we got here on a newline or letter. */
2179*09d4459fSDaniel Fojt static state_num
state_index(struct dfa * d,position_set const * s,int context)2180*09d4459fSDaniel Fojt state_index (struct dfa *d, position_set const *s, int context)
2181*09d4459fSDaniel Fojt {
2182*09d4459fSDaniel Fojt size_t hash = 0;
2183*09d4459fSDaniel Fojt int constraint = 0;
2184*09d4459fSDaniel Fojt state_num i;
2185*09d4459fSDaniel Fojt token first_end = 0;
2186*09d4459fSDaniel Fojt
2187*09d4459fSDaniel Fojt for (i = 0; i < s->nelem; ++i)
2188*09d4459fSDaniel Fojt {
2189*09d4459fSDaniel Fojt size_t ind = s->elems[i].index;
2190*09d4459fSDaniel Fojt hash ^= ind + s->elems[i].constraint;
2191*09d4459fSDaniel Fojt }
2192*09d4459fSDaniel Fojt
2193*09d4459fSDaniel Fojt /* Try to find a state that exactly matches the proposed one. */
2194*09d4459fSDaniel Fojt for (i = 0; i < d->sindex; ++i)
2195*09d4459fSDaniel Fojt {
2196*09d4459fSDaniel Fojt if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2197*09d4459fSDaniel Fojt || context != d->states[i].context)
2198*09d4459fSDaniel Fojt continue;
2199*09d4459fSDaniel Fojt state_num j;
2200*09d4459fSDaniel Fojt for (j = 0; j < s->nelem; ++j)
2201*09d4459fSDaniel Fojt if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2202*09d4459fSDaniel Fojt || s->elems[j].index != d->states[i].elems.elems[j].index)
2203*09d4459fSDaniel Fojt break;
2204*09d4459fSDaniel Fojt if (j == s->nelem)
2205*09d4459fSDaniel Fojt return i;
2206*09d4459fSDaniel Fojt }
2207*09d4459fSDaniel Fojt
2208*09d4459fSDaniel Fojt #ifdef DEBUG
2209*09d4459fSDaniel Fojt fprintf (stderr, "new state %td\n nextpos:", i);
2210*09d4459fSDaniel Fojt for (state_num j = 0; j < s->nelem; j++)
2211*09d4459fSDaniel Fojt {
2212*09d4459fSDaniel Fojt fprintf (stderr, " %td:", s->elems[j].index);
2213*09d4459fSDaniel Fojt prtok (d->tokens[s->elems[j].index]);
2214*09d4459fSDaniel Fojt }
2215*09d4459fSDaniel Fojt fprintf (stderr, "\n context:");
2216*09d4459fSDaniel Fojt if (context ^ CTX_ANY)
2217*09d4459fSDaniel Fojt {
2218*09d4459fSDaniel Fojt if (context & CTX_NONE)
2219*09d4459fSDaniel Fojt fprintf (stderr, " CTX_NONE");
2220*09d4459fSDaniel Fojt if (context & CTX_LETTER)
2221*09d4459fSDaniel Fojt fprintf (stderr, " CTX_LETTER");
2222*09d4459fSDaniel Fojt if (context & CTX_NEWLINE)
2223*09d4459fSDaniel Fojt fprintf (stderr, " CTX_NEWLINE");
2224*09d4459fSDaniel Fojt }
2225*09d4459fSDaniel Fojt else
2226*09d4459fSDaniel Fojt fprintf (stderr, " CTX_ANY");
2227*09d4459fSDaniel Fojt fprintf (stderr, "\n");
2228*09d4459fSDaniel Fojt #endif
2229*09d4459fSDaniel Fojt
2230*09d4459fSDaniel Fojt for (state_num j = 0; j < s->nelem; j++)
2231*09d4459fSDaniel Fojt {
2232*09d4459fSDaniel Fojt int c = d->constraints[s->elems[j].index];
2233*09d4459fSDaniel Fojt
2234*09d4459fSDaniel Fojt if (c != 0)
2235*09d4459fSDaniel Fojt {
2236*09d4459fSDaniel Fojt if (succeeds_in_context (c, context, CTX_ANY))
2237*09d4459fSDaniel Fojt constraint |= c;
2238*09d4459fSDaniel Fojt if (!first_end)
2239*09d4459fSDaniel Fojt first_end = d->tokens[s->elems[j].index];
2240*09d4459fSDaniel Fojt }
2241*09d4459fSDaniel Fojt else if (d->tokens[s->elems[j].index] == BACKREF)
2242*09d4459fSDaniel Fojt constraint = NO_CONSTRAINT;
2243*09d4459fSDaniel Fojt }
2244*09d4459fSDaniel Fojt
2245*09d4459fSDaniel Fojt
2246*09d4459fSDaniel Fojt /* Create a new state. */
2247*09d4459fSDaniel Fojt d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2248*09d4459fSDaniel Fojt sizeof *d->states);
2249*09d4459fSDaniel Fojt d->states[i].hash = hash;
2250*09d4459fSDaniel Fojt alloc_position_set (&d->states[i].elems, s->nelem);
2251*09d4459fSDaniel Fojt copy (s, &d->states[i].elems);
2252*09d4459fSDaniel Fojt d->states[i].context = context;
2253*09d4459fSDaniel Fojt d->states[i].constraint = constraint;
2254*09d4459fSDaniel Fojt d->states[i].first_end = first_end;
2255*09d4459fSDaniel Fojt d->states[i].mbps.nelem = 0;
2256*09d4459fSDaniel Fojt d->states[i].mbps.elems = NULL;
2257*09d4459fSDaniel Fojt d->states[i].mb_trindex = -1;
2258*09d4459fSDaniel Fojt
2259*09d4459fSDaniel Fojt ++d->sindex;
2260*09d4459fSDaniel Fojt
2261*09d4459fSDaniel Fojt return i;
2262*09d4459fSDaniel Fojt }
2263*09d4459fSDaniel Fojt
2264*09d4459fSDaniel Fojt /* Find the epsilon closure of a set of positions. If any position of the set
2265*09d4459fSDaniel Fojt contains a symbol that matches the empty string in some context, replace
2266*09d4459fSDaniel Fojt that position with the elements of its follow labeled with an appropriate
2267*09d4459fSDaniel Fojt constraint. Repeat exhaustively until no funny positions are left.
2268*09d4459fSDaniel Fojt S->elems must be large enough to hold the result. */
2269*09d4459fSDaniel Fojt static void
epsclosure(struct dfa const * d)2270*09d4459fSDaniel Fojt epsclosure (struct dfa const *d)
2271*09d4459fSDaniel Fojt {
2272*09d4459fSDaniel Fojt position_set tmp;
2273*09d4459fSDaniel Fojt alloc_position_set (&tmp, d->nleaves);
2274*09d4459fSDaniel Fojt for (idx_t i = 0; i < d->tindex; i++)
2275*09d4459fSDaniel Fojt if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
2276*09d4459fSDaniel Fojt && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
2277*09d4459fSDaniel Fojt && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
2278*09d4459fSDaniel Fojt {
2279*09d4459fSDaniel Fojt unsigned int constraint;
2280*09d4459fSDaniel Fojt switch (d->tokens[i])
2281*09d4459fSDaniel Fojt {
2282*09d4459fSDaniel Fojt case BEGLINE:
2283*09d4459fSDaniel Fojt constraint = BEGLINE_CONSTRAINT;
2284*09d4459fSDaniel Fojt break;
2285*09d4459fSDaniel Fojt case ENDLINE:
2286*09d4459fSDaniel Fojt constraint = ENDLINE_CONSTRAINT;
2287*09d4459fSDaniel Fojt break;
2288*09d4459fSDaniel Fojt case BEGWORD:
2289*09d4459fSDaniel Fojt constraint = BEGWORD_CONSTRAINT;
2290*09d4459fSDaniel Fojt break;
2291*09d4459fSDaniel Fojt case ENDWORD:
2292*09d4459fSDaniel Fojt constraint = ENDWORD_CONSTRAINT;
2293*09d4459fSDaniel Fojt break;
2294*09d4459fSDaniel Fojt case LIMWORD:
2295*09d4459fSDaniel Fojt constraint = LIMWORD_CONSTRAINT;
2296*09d4459fSDaniel Fojt break;
2297*09d4459fSDaniel Fojt case NOTLIMWORD:
2298*09d4459fSDaniel Fojt constraint = NOTLIMWORD_CONSTRAINT;
2299*09d4459fSDaniel Fojt break;
2300*09d4459fSDaniel Fojt default:
2301*09d4459fSDaniel Fojt constraint = NO_CONSTRAINT;
2302*09d4459fSDaniel Fojt break;
2303*09d4459fSDaniel Fojt }
2304*09d4459fSDaniel Fojt
2305*09d4459fSDaniel Fojt delete (i, &d->follows[i]);
2306*09d4459fSDaniel Fojt
2307*09d4459fSDaniel Fojt for (idx_t j = 0; j < d->tindex; j++)
2308*09d4459fSDaniel Fojt if (i != j && d->follows[j].nelem > 0)
2309*09d4459fSDaniel Fojt replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
2310*09d4459fSDaniel Fojt }
2311*09d4459fSDaniel Fojt free (tmp.elems);
2312*09d4459fSDaniel Fojt }
2313*09d4459fSDaniel Fojt
2314*09d4459fSDaniel Fojt /* Returns the set of contexts for which there is at least one
2315*09d4459fSDaniel Fojt character included in C. */
2316*09d4459fSDaniel Fojt
2317*09d4459fSDaniel Fojt static int
charclass_context(struct dfa const * dfa,charclass const * c)2318*09d4459fSDaniel Fojt charclass_context (struct dfa const *dfa, charclass const *c)
2319*09d4459fSDaniel Fojt {
2320*09d4459fSDaniel Fojt int context = 0;
2321*09d4459fSDaniel Fojt
2322*09d4459fSDaniel Fojt for (int j = 0; j < CHARCLASS_WORDS; j++)
2323*09d4459fSDaniel Fojt {
2324*09d4459fSDaniel Fojt if (c->w[j] & dfa->syntax.newline.w[j])
2325*09d4459fSDaniel Fojt context |= CTX_NEWLINE;
2326*09d4459fSDaniel Fojt if (c->w[j] & dfa->syntax.letters.w[j])
2327*09d4459fSDaniel Fojt context |= CTX_LETTER;
2328*09d4459fSDaniel Fojt if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2329*09d4459fSDaniel Fojt context |= CTX_NONE;
2330*09d4459fSDaniel Fojt }
2331*09d4459fSDaniel Fojt
2332*09d4459fSDaniel Fojt return context;
2333*09d4459fSDaniel Fojt }
2334*09d4459fSDaniel Fojt
2335*09d4459fSDaniel Fojt /* Returns the contexts on which the position set S depends. Each context
2336*09d4459fSDaniel Fojt in the set of returned contexts (let's call it SC) may have a different
2337*09d4459fSDaniel Fojt follow set than other contexts in SC, and also different from the
2338*09d4459fSDaniel Fojt follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2339*09d4459fSDaniel Fojt in the complement set will have the same follow set. */
2340*09d4459fSDaniel Fojt
2341*09d4459fSDaniel Fojt static int _GL_ATTRIBUTE_PURE
state_separate_contexts(struct dfa * d,position_set const * s)2342*09d4459fSDaniel Fojt state_separate_contexts (struct dfa *d, position_set const *s)
2343*09d4459fSDaniel Fojt {
2344*09d4459fSDaniel Fojt int separate_contexts = 0;
2345*09d4459fSDaniel Fojt
2346*09d4459fSDaniel Fojt for (idx_t j = 0; j < s->nelem; j++)
2347*09d4459fSDaniel Fojt separate_contexts |= d->separates[s->elems[j].index];
2348*09d4459fSDaniel Fojt
2349*09d4459fSDaniel Fojt return separate_contexts;
2350*09d4459fSDaniel Fojt }
2351*09d4459fSDaniel Fojt
/* Per-token flag bits used while optimizing the follow sets.  They are
   kept in a char array with one entry per token.  */
enum
{
  /* The token follows itself, i.e., a single token is repeated.  Such
     a token is kept apart from non-repeated ones.  */
  OPT_REPEAT = 0x01,

  /* The token heads a repeated run of several tokens.  It is never
     merged.  */
  OPT_LPAREN = 0x02,

  /* Several branches join at this token.  It is never merged.  */
  OPT_RPAREN = 0x04,

  /* The token has been walked once.  Meeting it again while walking
     turns on OPT_RPAREN.  */
  OPT_WALKED = 0x08,

  /* The token has been queued and must not be queued again.  */
  OPT_QUEUED = 0x10
};
2371*09d4459fSDaniel Fojt
2372*09d4459fSDaniel Fojt static void
merge_nfa_state(struct dfa * d,idx_t tindex,char * flags,position_set * merged)2373*09d4459fSDaniel Fojt merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2374*09d4459fSDaniel Fojt position_set *merged)
2375*09d4459fSDaniel Fojt {
2376*09d4459fSDaniel Fojt position_set *follows = d->follows;
2377*09d4459fSDaniel Fojt idx_t nelem = 0;
2378*09d4459fSDaniel Fojt
2379*09d4459fSDaniel Fojt d->constraints[tindex] = 0;
2380*09d4459fSDaniel Fojt
2381*09d4459fSDaniel Fojt for (idx_t i = 0; i < follows[tindex].nelem; i++)
2382*09d4459fSDaniel Fojt {
2383*09d4459fSDaniel Fojt idx_t sindex = follows[tindex].elems[i].index;
2384*09d4459fSDaniel Fojt
2385*09d4459fSDaniel Fojt /* Skip the node as pruned in future. */
2386*09d4459fSDaniel Fojt unsigned int iconstraint = follows[tindex].elems[i].constraint;
2387*09d4459fSDaniel Fojt if (iconstraint == 0)
2388*09d4459fSDaniel Fojt continue;
2389*09d4459fSDaniel Fojt
2390*09d4459fSDaniel Fojt if (d->tokens[follows[tindex].elems[i].index] <= END)
2391*09d4459fSDaniel Fojt {
2392*09d4459fSDaniel Fojt d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2393*09d4459fSDaniel Fojt continue;
2394*09d4459fSDaniel Fojt }
2395*09d4459fSDaniel Fojt
2396*09d4459fSDaniel Fojt if (!(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2397*09d4459fSDaniel Fojt {
2398*09d4459fSDaniel Fojt idx_t j;
2399*09d4459fSDaniel Fojt
2400*09d4459fSDaniel Fojt for (j = 0; j < nelem; j++)
2401*09d4459fSDaniel Fojt {
2402*09d4459fSDaniel Fojt idx_t dindex = follows[tindex].elems[j].index;
2403*09d4459fSDaniel Fojt
2404*09d4459fSDaniel Fojt if (follows[tindex].elems[j].constraint != iconstraint)
2405*09d4459fSDaniel Fojt continue;
2406*09d4459fSDaniel Fojt
2407*09d4459fSDaniel Fojt if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2408*09d4459fSDaniel Fojt continue;
2409*09d4459fSDaniel Fojt
2410*09d4459fSDaniel Fojt if (d->tokens[sindex] != d->tokens[dindex])
2411*09d4459fSDaniel Fojt continue;
2412*09d4459fSDaniel Fojt
2413*09d4459fSDaniel Fojt if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2414*09d4459fSDaniel Fojt continue;
2415*09d4459fSDaniel Fojt
2416*09d4459fSDaniel Fojt if (flags[sindex] & OPT_REPEAT)
2417*09d4459fSDaniel Fojt delete (sindex, &follows[sindex]);
2418*09d4459fSDaniel Fojt
2419*09d4459fSDaniel Fojt merge2 (&follows[dindex], &follows[sindex], merged);
2420*09d4459fSDaniel Fojt
2421*09d4459fSDaniel Fojt break;
2422*09d4459fSDaniel Fojt }
2423*09d4459fSDaniel Fojt
2424*09d4459fSDaniel Fojt if (j < nelem)
2425*09d4459fSDaniel Fojt continue;
2426*09d4459fSDaniel Fojt }
2427*09d4459fSDaniel Fojt
2428*09d4459fSDaniel Fojt follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2429*09d4459fSDaniel Fojt flags[sindex] |= OPT_QUEUED;
2430*09d4459fSDaniel Fojt }
2431*09d4459fSDaniel Fojt
2432*09d4459fSDaniel Fojt follows[tindex].nelem = nelem;
2433*09d4459fSDaniel Fojt }
2434*09d4459fSDaniel Fojt
2435*09d4459fSDaniel Fojt static int
compare(const void * a,const void * b)2436*09d4459fSDaniel Fojt compare (const void *a, const void *b)
2437*09d4459fSDaniel Fojt {
2438*09d4459fSDaniel Fojt position const *p = a, *q = b;
2439*09d4459fSDaniel Fojt return p->index < q->index ? -1 : p->index > q->index;
2440*09d4459fSDaniel Fojt }
2441*09d4459fSDaniel Fojt
2442*09d4459fSDaniel Fojt static void
reorder_tokens(struct dfa * d)2443*09d4459fSDaniel Fojt reorder_tokens (struct dfa *d)
2444*09d4459fSDaniel Fojt {
2445*09d4459fSDaniel Fojt idx_t nleaves;
2446*09d4459fSDaniel Fojt ptrdiff_t *map;
2447*09d4459fSDaniel Fojt token *tokens;
2448*09d4459fSDaniel Fojt position_set *follows;
2449*09d4459fSDaniel Fojt int *constraints;
2450*09d4459fSDaniel Fojt char *multibyte_prop;
2451*09d4459fSDaniel Fojt
2452*09d4459fSDaniel Fojt nleaves = 0;
2453*09d4459fSDaniel Fojt
2454*09d4459fSDaniel Fojt map = xnmalloc (d->tindex, sizeof *map);
2455*09d4459fSDaniel Fojt
2456*09d4459fSDaniel Fojt map[0] = nleaves++;
2457*09d4459fSDaniel Fojt
2458*09d4459fSDaniel Fojt for (idx_t i = 1; i < d->tindex; i++)
2459*09d4459fSDaniel Fojt map[i] = -1;
2460*09d4459fSDaniel Fojt
2461*09d4459fSDaniel Fojt tokens = xnmalloc (d->nleaves, sizeof *tokens);
2462*09d4459fSDaniel Fojt follows = xnmalloc (d->nleaves, sizeof *follows);
2463*09d4459fSDaniel Fojt constraints = xnmalloc (d->nleaves, sizeof *constraints);
2464*09d4459fSDaniel Fojt
2465*09d4459fSDaniel Fojt if (d->localeinfo.multibyte)
2466*09d4459fSDaniel Fojt multibyte_prop = xnmalloc (d->nleaves, sizeof *multibyte_prop);
2467*09d4459fSDaniel Fojt else
2468*09d4459fSDaniel Fojt multibyte_prop = NULL;
2469*09d4459fSDaniel Fojt
2470*09d4459fSDaniel Fojt for (idx_t i = 0; i < d->tindex; i++)
2471*09d4459fSDaniel Fojt {
2472*09d4459fSDaniel Fojt if (map[i] == -1)
2473*09d4459fSDaniel Fojt {
2474*09d4459fSDaniel Fojt free (d->follows[i].elems);
2475*09d4459fSDaniel Fojt d->follows[i].elems = NULL;
2476*09d4459fSDaniel Fojt d->follows[i].nelem = 0;
2477*09d4459fSDaniel Fojt continue;
2478*09d4459fSDaniel Fojt }
2479*09d4459fSDaniel Fojt
2480*09d4459fSDaniel Fojt tokens[map[i]] = d->tokens[i];
2481*09d4459fSDaniel Fojt follows[map[i]] = d->follows[i];
2482*09d4459fSDaniel Fojt constraints[map[i]] = d->constraints[i];
2483*09d4459fSDaniel Fojt
2484*09d4459fSDaniel Fojt if (multibyte_prop != NULL)
2485*09d4459fSDaniel Fojt multibyte_prop[map[i]] = d->multibyte_prop[i];
2486*09d4459fSDaniel Fojt
2487*09d4459fSDaniel Fojt for (idx_t j = 0; j < d->follows[i].nelem; j++)
2488*09d4459fSDaniel Fojt {
2489*09d4459fSDaniel Fojt if (map[d->follows[i].elems[j].index] == -1)
2490*09d4459fSDaniel Fojt map[d->follows[i].elems[j].index] = nleaves++;
2491*09d4459fSDaniel Fojt
2492*09d4459fSDaniel Fojt d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2493*09d4459fSDaniel Fojt }
2494*09d4459fSDaniel Fojt
2495*09d4459fSDaniel Fojt qsort (d->follows[i].elems, d->follows[i].nelem,
2496*09d4459fSDaniel Fojt sizeof *d->follows[i].elems, compare);
2497*09d4459fSDaniel Fojt }
2498*09d4459fSDaniel Fojt
2499*09d4459fSDaniel Fojt for (idx_t i = 0; i < nleaves; i++)
2500*09d4459fSDaniel Fojt {
2501*09d4459fSDaniel Fojt d->tokens[i] = tokens[i];
2502*09d4459fSDaniel Fojt d->follows[i] = follows[i];
2503*09d4459fSDaniel Fojt d->constraints[i] = constraints[i];
2504*09d4459fSDaniel Fojt
2505*09d4459fSDaniel Fojt if (multibyte_prop != NULL)
2506*09d4459fSDaniel Fojt d->multibyte_prop[i] = multibyte_prop[i];
2507*09d4459fSDaniel Fojt }
2508*09d4459fSDaniel Fojt
2509*09d4459fSDaniel Fojt d->tindex = d->nleaves = nleaves;
2510*09d4459fSDaniel Fojt
2511*09d4459fSDaniel Fojt free (tokens);
2512*09d4459fSDaniel Fojt free (follows);
2513*09d4459fSDaniel Fojt free (constraints);
2514*09d4459fSDaniel Fojt free (multibyte_prop);
2515*09d4459fSDaniel Fojt free (map);
2516*09d4459fSDaniel Fojt }
2517*09d4459fSDaniel Fojt
2518*09d4459fSDaniel Fojt static void
dfaoptimize(struct dfa * d)2519*09d4459fSDaniel Fojt dfaoptimize (struct dfa *d)
2520*09d4459fSDaniel Fojt {
2521*09d4459fSDaniel Fojt char *flags = xzalloc (d->tindex);
2522*09d4459fSDaniel Fojt
2523*09d4459fSDaniel Fojt for (idx_t i = 0; i < d->tindex; i++)
2524*09d4459fSDaniel Fojt {
2525*09d4459fSDaniel Fojt for (idx_t j = 0; j < d->follows[i].nelem; j++)
2526*09d4459fSDaniel Fojt {
2527*09d4459fSDaniel Fojt if (d->follows[i].elems[j].index == i)
2528*09d4459fSDaniel Fojt flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2529*09d4459fSDaniel Fojt else if (d->follows[i].elems[j].index < i)
2530*09d4459fSDaniel Fojt flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2531*09d4459fSDaniel Fojt else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2532*09d4459fSDaniel Fojt flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2533*09d4459fSDaniel Fojt else
2534*09d4459fSDaniel Fojt flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2535*09d4459fSDaniel Fojt }
2536*09d4459fSDaniel Fojt }
2537*09d4459fSDaniel Fojt
2538*09d4459fSDaniel Fojt flags[0] |= OPT_QUEUED;
2539*09d4459fSDaniel Fojt
2540*09d4459fSDaniel Fojt position_set merged0;
2541*09d4459fSDaniel Fojt position_set *merged = &merged0;
2542*09d4459fSDaniel Fojt alloc_position_set (merged, d->nleaves);
2543*09d4459fSDaniel Fojt
2544*09d4459fSDaniel Fojt d->constraints = xnmalloc (d->tindex, sizeof *d->constraints);
2545*09d4459fSDaniel Fojt
2546*09d4459fSDaniel Fojt for (idx_t i = 0; i < d->tindex; i++)
2547*09d4459fSDaniel Fojt if (flags[i] & OPT_QUEUED)
2548*09d4459fSDaniel Fojt merge_nfa_state (d, i, flags, merged);
2549*09d4459fSDaniel Fojt
2550*09d4459fSDaniel Fojt reorder_tokens (d);
2551*09d4459fSDaniel Fojt
2552*09d4459fSDaniel Fojt free (merged->elems);
2553*09d4459fSDaniel Fojt free (flags);
2554*09d4459fSDaniel Fojt }
2555*09d4459fSDaniel Fojt
2556*09d4459fSDaniel Fojt /* Perform bottom-up analysis on the parse tree, computing various functions.
2557*09d4459fSDaniel Fojt Note that at this point, we're pretending constructs like \< are real
2558*09d4459fSDaniel Fojt characters rather than constraints on what can follow them.
2559*09d4459fSDaniel Fojt
2560*09d4459fSDaniel Fojt Nullable: A node is nullable if it is at the root of a regexp that can
2561*09d4459fSDaniel Fojt match the empty string.
2562*09d4459fSDaniel Fojt * EMPTY leaves are nullable.
2563*09d4459fSDaniel Fojt * No other leaf is nullable.
2564*09d4459fSDaniel Fojt * A QMARK or STAR node is nullable.
2565*09d4459fSDaniel Fojt * A PLUS node is nullable if its argument is nullable.
2566*09d4459fSDaniel Fojt * A CAT node is nullable if both its arguments are nullable.
2567*09d4459fSDaniel Fojt * An OR node is nullable if either argument is nullable.
2568*09d4459fSDaniel Fojt
2569*09d4459fSDaniel Fojt Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2570*09d4459fSDaniel Fojt that could correspond to the first character of a string matching the
2571*09d4459fSDaniel Fojt regexp rooted at the given node.
2572*09d4459fSDaniel Fojt * EMPTY leaves have empty firstpos.
2573*09d4459fSDaniel Fojt * The firstpos of a nonempty leaf is that leaf itself.
2574*09d4459fSDaniel Fojt * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2575*09d4459fSDaniel Fojt argument.
2576*09d4459fSDaniel Fojt * The firstpos of a CAT node is the firstpos of the left argument, union
2577*09d4459fSDaniel Fojt the firstpos of the right if the left argument is nullable.
2578*09d4459fSDaniel Fojt * The firstpos of an OR node is the union of firstpos of each argument.
2579*09d4459fSDaniel Fojt
2580*09d4459fSDaniel Fojt Lastpos: The lastpos of a node is the set of positions that could
2581*09d4459fSDaniel Fojt correspond to the last character of a string matching the regexp at
2582*09d4459fSDaniel Fojt the given node.
2583*09d4459fSDaniel Fojt * EMPTY leaves have empty lastpos.
2584*09d4459fSDaniel Fojt * The lastpos of a nonempty leaf is that leaf itself.
2585*09d4459fSDaniel Fojt * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2586*09d4459fSDaniel Fojt argument.
2587*09d4459fSDaniel Fojt * The lastpos of a CAT node is the lastpos of its right argument, union
2588*09d4459fSDaniel Fojt the lastpos of the left if the right argument is nullable.
2589*09d4459fSDaniel Fojt * The lastpos of an OR node is the union of the lastpos of each argument.
2590*09d4459fSDaniel Fojt
2591*09d4459fSDaniel Fojt Follow: The follow of a position is the set of positions that could
2592*09d4459fSDaniel Fojt correspond to the character following a character matching the node in
2593*09d4459fSDaniel Fojt a string matching the regexp. At this point we consider special symbols
2594*09d4459fSDaniel Fojt that match the empty string in some context to be just normal characters.
2595*09d4459fSDaniel Fojt Later, if we find that a special symbol is in a follow set, we will
2596*09d4459fSDaniel Fojt replace it with the elements of its follow, labeled with an appropriate
2597*09d4459fSDaniel Fojt constraint.
2598*09d4459fSDaniel Fojt * Every node in the firstpos of the argument of a STAR or PLUS node is in
2599*09d4459fSDaniel Fojt the follow of every node in the lastpos.
2600*09d4459fSDaniel Fojt * Every node in the firstpos of the second argument of a CAT node is in
2601*09d4459fSDaniel Fojt the follow of every node in the lastpos of the first argument.
2602*09d4459fSDaniel Fojt
2603*09d4459fSDaniel Fojt Because of the postfix representation of the parse tree, the depth-first
2604*09d4459fSDaniel Fojt analysis is conveniently done by a linear scan with the aid of a stack.
2605*09d4459fSDaniel Fojt Sets are stored as arrays of the elements, obeying a stack-like allocation
2606*09d4459fSDaniel Fojt scheme; the number of elements in each set deeper in the stack can be
2607*09d4459fSDaniel Fojt used to determine the address of a particular set's array. */
2608*09d4459fSDaniel Fojt static void
dfaanalyze(struct dfa * d,bool searchflag)2609*09d4459fSDaniel Fojt dfaanalyze (struct dfa *d, bool searchflag)
2610*09d4459fSDaniel Fojt {
2611*09d4459fSDaniel Fojt /* Array allocated to hold position sets. */
2612*09d4459fSDaniel Fojt position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2613*09d4459fSDaniel Fojt /* Firstpos and lastpos elements. */
2614*09d4459fSDaniel Fojt position *firstpos = posalloc;
2615*09d4459fSDaniel Fojt position *lastpos = firstpos + d->nleaves;
2616*09d4459fSDaniel Fojt position pos;
2617*09d4459fSDaniel Fojt position_set tmp;
2618*09d4459fSDaniel Fojt
2619*09d4459fSDaniel Fojt /* Stack for element counts and nullable flags. */
2620*09d4459fSDaniel Fojt struct
2621*09d4459fSDaniel Fojt {
2622*09d4459fSDaniel Fojt /* Whether the entry is nullable. */
2623*09d4459fSDaniel Fojt bool nullable;
2624*09d4459fSDaniel Fojt
2625*09d4459fSDaniel Fojt /* Counts of firstpos and lastpos sets. */
2626*09d4459fSDaniel Fojt idx_t nfirstpos;
2627*09d4459fSDaniel Fojt idx_t nlastpos;
2628*09d4459fSDaniel Fojt } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2629*09d4459fSDaniel Fojt
2630*09d4459fSDaniel Fojt position_set merged; /* Result of merging sets. */
2631*09d4459fSDaniel Fojt
2632*09d4459fSDaniel Fojt addtok (d, CAT);
2633*09d4459fSDaniel Fojt
2634*09d4459fSDaniel Fojt #ifdef DEBUG
2635*09d4459fSDaniel Fojt fprintf (stderr, "dfaanalyze:\n");
2636*09d4459fSDaniel Fojt for (idx_t i = 0; i < d->tindex; i++)
2637*09d4459fSDaniel Fojt {
2638*09d4459fSDaniel Fojt fprintf (stderr, " %td:", i);
2639*09d4459fSDaniel Fojt prtok (d->tokens[i]);
2640*09d4459fSDaniel Fojt }
2641*09d4459fSDaniel Fojt putc ('\n', stderr);
2642*09d4459fSDaniel Fojt #endif
2643*09d4459fSDaniel Fojt
2644*09d4459fSDaniel Fojt d->searchflag = searchflag;
2645*09d4459fSDaniel Fojt alloc_position_set (&merged, d->nleaves);
2646*09d4459fSDaniel Fojt d->follows = xcalloc (d->tindex, sizeof *d->follows);
2647*09d4459fSDaniel Fojt
2648*09d4459fSDaniel Fojt for (idx_t i = 0; i < d->tindex; i++)
2649*09d4459fSDaniel Fojt {
2650*09d4459fSDaniel Fojt switch (d->tokens[i])
2651*09d4459fSDaniel Fojt {
2652*09d4459fSDaniel Fojt case EMPTY:
2653*09d4459fSDaniel Fojt /* The empty set is nullable. */
2654*09d4459fSDaniel Fojt stk->nullable = true;
2655*09d4459fSDaniel Fojt
2656*09d4459fSDaniel Fojt /* The firstpos and lastpos of the empty leaf are both empty. */
2657*09d4459fSDaniel Fojt stk->nfirstpos = stk->nlastpos = 0;
2658*09d4459fSDaniel Fojt stk++;
2659*09d4459fSDaniel Fojt break;
2660*09d4459fSDaniel Fojt
2661*09d4459fSDaniel Fojt case STAR:
2662*09d4459fSDaniel Fojt case PLUS:
2663*09d4459fSDaniel Fojt /* Every element in the firstpos of the argument is in the follow
2664*09d4459fSDaniel Fojt of every element in the lastpos. */
2665*09d4459fSDaniel Fojt {
2666*09d4459fSDaniel Fojt tmp.elems = firstpos - stk[-1].nfirstpos;
2667*09d4459fSDaniel Fojt tmp.nelem = stk[-1].nfirstpos;
2668*09d4459fSDaniel Fojt position *p = lastpos - stk[-1].nlastpos;
2669*09d4459fSDaniel Fojt for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2670*09d4459fSDaniel Fojt {
2671*09d4459fSDaniel Fojt merge (&tmp, &d->follows[p[j].index], &merged);
2672*09d4459fSDaniel Fojt copy (&merged, &d->follows[p[j].index]);
2673*09d4459fSDaniel Fojt }
2674*09d4459fSDaniel Fojt }
2675*09d4459fSDaniel Fojt FALLTHROUGH;
2676*09d4459fSDaniel Fojt case QMARK:
2677*09d4459fSDaniel Fojt /* A QMARK or STAR node is automatically nullable. */
2678*09d4459fSDaniel Fojt if (d->tokens[i] != PLUS)
2679*09d4459fSDaniel Fojt stk[-1].nullable = true;
2680*09d4459fSDaniel Fojt break;
2681*09d4459fSDaniel Fojt
2682*09d4459fSDaniel Fojt case CAT:
2683*09d4459fSDaniel Fojt /* Every element in the firstpos of the second argument is in the
2684*09d4459fSDaniel Fojt follow of every element in the lastpos of the first argument. */
2685*09d4459fSDaniel Fojt {
2686*09d4459fSDaniel Fojt tmp.nelem = stk[-1].nfirstpos;
2687*09d4459fSDaniel Fojt tmp.elems = firstpos - stk[-1].nfirstpos;
2688*09d4459fSDaniel Fojt position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2689*09d4459fSDaniel Fojt for (idx_t j = 0; j < stk[-2].nlastpos; j++)
2690*09d4459fSDaniel Fojt {
2691*09d4459fSDaniel Fojt merge (&tmp, &d->follows[p[j].index], &merged);
2692*09d4459fSDaniel Fojt copy (&merged, &d->follows[p[j].index]);
2693*09d4459fSDaniel Fojt }
2694*09d4459fSDaniel Fojt }
2695*09d4459fSDaniel Fojt
2696*09d4459fSDaniel Fojt /* The firstpos of a CAT node is the firstpos of the first argument,
2697*09d4459fSDaniel Fojt union that of the second argument if the first is nullable. */
2698*09d4459fSDaniel Fojt if (stk[-2].nullable)
2699*09d4459fSDaniel Fojt stk[-2].nfirstpos += stk[-1].nfirstpos;
2700*09d4459fSDaniel Fojt else
2701*09d4459fSDaniel Fojt firstpos -= stk[-1].nfirstpos;
2702*09d4459fSDaniel Fojt
2703*09d4459fSDaniel Fojt /* The lastpos of a CAT node is the lastpos of the second argument,
2704*09d4459fSDaniel Fojt union that of the first argument if the second is nullable. */
2705*09d4459fSDaniel Fojt if (stk[-1].nullable)
2706*09d4459fSDaniel Fojt stk[-2].nlastpos += stk[-1].nlastpos;
2707*09d4459fSDaniel Fojt else
2708*09d4459fSDaniel Fojt {
2709*09d4459fSDaniel Fojt position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2710*09d4459fSDaniel Fojt for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2711*09d4459fSDaniel Fojt p[j] = p[j + stk[-2].nlastpos];
2712*09d4459fSDaniel Fojt lastpos -= stk[-2].nlastpos;
2713*09d4459fSDaniel Fojt stk[-2].nlastpos = stk[-1].nlastpos;
2714*09d4459fSDaniel Fojt }
2715*09d4459fSDaniel Fojt
2716*09d4459fSDaniel Fojt /* A CAT node is nullable if both arguments are nullable. */
2717*09d4459fSDaniel Fojt stk[-2].nullable &= stk[-1].nullable;
2718*09d4459fSDaniel Fojt stk--;
2719*09d4459fSDaniel Fojt break;
2720*09d4459fSDaniel Fojt
2721*09d4459fSDaniel Fojt case OR:
2722*09d4459fSDaniel Fojt /* The firstpos is the union of the firstpos of each argument. */
2723*09d4459fSDaniel Fojt stk[-2].nfirstpos += stk[-1].nfirstpos;
2724*09d4459fSDaniel Fojt
2725*09d4459fSDaniel Fojt /* The lastpos is the union of the lastpos of each argument. */
2726*09d4459fSDaniel Fojt stk[-2].nlastpos += stk[-1].nlastpos;
2727*09d4459fSDaniel Fojt
2728*09d4459fSDaniel Fojt /* An OR node is nullable if either argument is nullable. */
2729*09d4459fSDaniel Fojt stk[-2].nullable |= stk[-1].nullable;
2730*09d4459fSDaniel Fojt stk--;
2731*09d4459fSDaniel Fojt break;
2732*09d4459fSDaniel Fojt
2733*09d4459fSDaniel Fojt default:
2734*09d4459fSDaniel Fojt /* Anything else is a nonempty position. (Note that special
2735*09d4459fSDaniel Fojt constructs like \< are treated as nonempty strings here;
2736*09d4459fSDaniel Fojt an "epsilon closure" effectively makes them nullable later.
2737*09d4459fSDaniel Fojt Backreferences have to get a real position so we can detect
2738*09d4459fSDaniel Fojt transitions on them later. But they are nullable. */
2739*09d4459fSDaniel Fojt stk->nullable = d->tokens[i] == BACKREF;
2740*09d4459fSDaniel Fojt
2741*09d4459fSDaniel Fojt /* This position is in its own firstpos and lastpos. */
2742*09d4459fSDaniel Fojt stk->nfirstpos = stk->nlastpos = 1;
2743*09d4459fSDaniel Fojt stk++;
2744*09d4459fSDaniel Fojt
2745*09d4459fSDaniel Fojt firstpos->index = lastpos->index = i;
2746*09d4459fSDaniel Fojt firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2747*09d4459fSDaniel Fojt firstpos++, lastpos++;
2748*09d4459fSDaniel Fojt
2749*09d4459fSDaniel Fojt break;
2750*09d4459fSDaniel Fojt }
2751*09d4459fSDaniel Fojt #ifdef DEBUG
2752*09d4459fSDaniel Fojt /* ... balance the above nonsyntactic #ifdef goo... */
2753*09d4459fSDaniel Fojt fprintf (stderr, "node %td:", i);
2754*09d4459fSDaniel Fojt prtok (d->tokens[i]);
2755*09d4459fSDaniel Fojt putc ('\n', stderr);
2756*09d4459fSDaniel Fojt fprintf (stderr,
2757*09d4459fSDaniel Fojt stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2758*09d4459fSDaniel Fojt fprintf (stderr, " firstpos:");
2759*09d4459fSDaniel Fojt for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2760*09d4459fSDaniel Fojt {
2761*09d4459fSDaniel Fojt fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2762*09d4459fSDaniel Fojt prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2763*09d4459fSDaniel Fojt }
2764*09d4459fSDaniel Fojt fprintf (stderr, "\n lastpos:");
2765*09d4459fSDaniel Fojt for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2766*09d4459fSDaniel Fojt {
2767*09d4459fSDaniel Fojt fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2768*09d4459fSDaniel Fojt prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2769*09d4459fSDaniel Fojt }
2770*09d4459fSDaniel Fojt putc ('\n', stderr);
2771*09d4459fSDaniel Fojt #endif
2772*09d4459fSDaniel Fojt }
2773*09d4459fSDaniel Fojt
2774*09d4459fSDaniel Fojt /* For each follow set that is the follow set of a real position, replace
2775*09d4459fSDaniel Fojt it with its epsilon closure. */
2776*09d4459fSDaniel Fojt epsclosure (d);
2777*09d4459fSDaniel Fojt
2778*09d4459fSDaniel Fojt dfaoptimize (d);
2779*09d4459fSDaniel Fojt
2780*09d4459fSDaniel Fojt #ifdef DEBUG
2781*09d4459fSDaniel Fojt for (idx_t i = 0; i < d->tindex; i++)
2782*09d4459fSDaniel Fojt if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2783*09d4459fSDaniel Fojt || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2784*09d4459fSDaniel Fojt || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2785*09d4459fSDaniel Fojt {
2786*09d4459fSDaniel Fojt fprintf (stderr, "follows(%td:", i);
2787*09d4459fSDaniel Fojt prtok (d->tokens[i]);
2788*09d4459fSDaniel Fojt fprintf (stderr, "):");
2789*09d4459fSDaniel Fojt for (idx_t j = 0; j < d->follows[i].nelem; j++)
2790*09d4459fSDaniel Fojt {
2791*09d4459fSDaniel Fojt fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2792*09d4459fSDaniel Fojt prtok (d->tokens[d->follows[i].elems[j].index]);
2793*09d4459fSDaniel Fojt }
2794*09d4459fSDaniel Fojt putc ('\n', stderr);
2795*09d4459fSDaniel Fojt }
2796*09d4459fSDaniel Fojt #endif
2797*09d4459fSDaniel Fojt
2798*09d4459fSDaniel Fojt pos.index = 0;
2799*09d4459fSDaniel Fojt pos.constraint = NO_CONSTRAINT;
2800*09d4459fSDaniel Fojt
2801*09d4459fSDaniel Fojt alloc_position_set (&tmp, 1);
2802*09d4459fSDaniel Fojt
2803*09d4459fSDaniel Fojt append (pos, &tmp);
2804*09d4459fSDaniel Fojt
2805*09d4459fSDaniel Fojt d->separates = xnmalloc (d->tindex, sizeof *d->separates);
2806*09d4459fSDaniel Fojt
2807*09d4459fSDaniel Fojt for (idx_t i = 0; i < d->tindex; i++)
2808*09d4459fSDaniel Fojt {
2809*09d4459fSDaniel Fojt d->separates[i] = 0;
2810*09d4459fSDaniel Fojt
2811*09d4459fSDaniel Fojt if (prev_newline_dependent (d->constraints[i]))
2812*09d4459fSDaniel Fojt d->separates[i] |= CTX_NEWLINE;
2813*09d4459fSDaniel Fojt if (prev_letter_dependent (d->constraints[i]))
2814*09d4459fSDaniel Fojt d->separates[i] |= CTX_LETTER;
2815*09d4459fSDaniel Fojt
2816*09d4459fSDaniel Fojt for (idx_t j = 0; j < d->follows[i].nelem; j++)
2817*09d4459fSDaniel Fojt {
2818*09d4459fSDaniel Fojt if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2819*09d4459fSDaniel Fojt d->separates[i] |= CTX_NEWLINE;
2820*09d4459fSDaniel Fojt if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2821*09d4459fSDaniel Fojt d->separates[i] |= CTX_LETTER;
2822*09d4459fSDaniel Fojt }
2823*09d4459fSDaniel Fojt }
2824*09d4459fSDaniel Fojt
2825*09d4459fSDaniel Fojt /* Context wanted by some position. */
2826*09d4459fSDaniel Fojt int separate_contexts = state_separate_contexts (d, &tmp);
2827*09d4459fSDaniel Fojt
2828*09d4459fSDaniel Fojt /* Build the initial state. */
2829*09d4459fSDaniel Fojt if (separate_contexts & CTX_NEWLINE)
2830*09d4459fSDaniel Fojt state_index (d, &tmp, CTX_NEWLINE);
2831*09d4459fSDaniel Fojt d->initstate_notbol = d->min_trcount
2832*09d4459fSDaniel Fojt = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2833*09d4459fSDaniel Fojt if (separate_contexts & CTX_LETTER)
2834*09d4459fSDaniel Fojt d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2835*09d4459fSDaniel Fojt d->min_trcount++;
2836*09d4459fSDaniel Fojt d->trcount = 0;
2837*09d4459fSDaniel Fojt
2838*09d4459fSDaniel Fojt free (posalloc);
2839*09d4459fSDaniel Fojt free (stkalloc);
2840*09d4459fSDaniel Fojt free (merged.elems);
2841*09d4459fSDaniel Fojt free (tmp.elems);
2842*09d4459fSDaniel Fojt }
2843*09d4459fSDaniel Fojt
2844*09d4459fSDaniel Fojt /* Make sure D's state arrays are large enough to hold NEW_STATE. */
2845*09d4459fSDaniel Fojt static void
realloc_trans_if_necessary(struct dfa * d)2846*09d4459fSDaniel Fojt realloc_trans_if_necessary (struct dfa *d)
2847*09d4459fSDaniel Fojt {
2848*09d4459fSDaniel Fojt state_num oldalloc = d->tralloc;
2849*09d4459fSDaniel Fojt if (oldalloc < d->sindex)
2850*09d4459fSDaniel Fojt {
2851*09d4459fSDaniel Fojt state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2852*09d4459fSDaniel Fojt idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2853*09d4459fSDaniel Fojt realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2854*09d4459fSDaniel Fojt -1, sizeof *realtrans);
2855*09d4459fSDaniel Fojt realtrans[0] = realtrans[1] = NULL;
2856*09d4459fSDaniel Fojt d->trans = realtrans + 2;
2857*09d4459fSDaniel Fojt idx_t newalloc = d->tralloc = newalloc1 - 2;
2858*09d4459fSDaniel Fojt d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2859*09d4459fSDaniel Fojt d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2860*09d4459fSDaniel Fojt d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2861*09d4459fSDaniel Fojt if (d->localeinfo.multibyte)
2862*09d4459fSDaniel Fojt {
2863*09d4459fSDaniel Fojt realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2864*09d4459fSDaniel Fojt realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
2865*09d4459fSDaniel Fojt if (oldalloc == 0)
2866*09d4459fSDaniel Fojt realtrans[0] = realtrans[1] = NULL;
2867*09d4459fSDaniel Fojt d->mb_trans = realtrans + 2;
2868*09d4459fSDaniel Fojt }
2869*09d4459fSDaniel Fojt for (; oldalloc < newalloc; oldalloc++)
2870*09d4459fSDaniel Fojt {
2871*09d4459fSDaniel Fojt d->trans[oldalloc] = NULL;
2872*09d4459fSDaniel Fojt d->fails[oldalloc] = NULL;
2873*09d4459fSDaniel Fojt if (d->localeinfo.multibyte)
2874*09d4459fSDaniel Fojt d->mb_trans[oldalloc] = NULL;
2875*09d4459fSDaniel Fojt }
2876*09d4459fSDaniel Fojt }
2877*09d4459fSDaniel Fojt }
2878*09d4459fSDaniel Fojt
2879*09d4459fSDaniel Fojt /*
2880*09d4459fSDaniel Fojt Calculate the transition table for a new state derived from state s
2881*09d4459fSDaniel Fojt for a compiled dfa d after input character uc, and return the new
2882*09d4459fSDaniel Fojt state number.
2883*09d4459fSDaniel Fojt
2884*09d4459fSDaniel Fojt Do not worry about all possible input characters; calculate just the group
2885*09d4459fSDaniel Fojt of positions that match uc. Label it with the set of characters that
2886*09d4459fSDaniel Fojt every position in the group matches (taking into account, if necessary,
2887*09d4459fSDaniel Fojt preceding context information of s). Then find the union
2888*09d4459fSDaniel Fojt of these positions' follows, i.e., the set of positions of the
2889*09d4459fSDaniel Fojt new state. For each character in the group's label, set the transition
2890*09d4459fSDaniel Fojt on this character to be to a state corresponding to the set's positions,
2891*09d4459fSDaniel Fojt and its associated backward context information, if necessary.
2892*09d4459fSDaniel Fojt
2893*09d4459fSDaniel Fojt When building a searching matcher, include the positions of state
2894*09d4459fSDaniel Fojt 0 in every state.
2895*09d4459fSDaniel Fojt
2896*09d4459fSDaniel Fojt The group is constructed by building an equivalence-class
2897*09d4459fSDaniel Fojt partition of the positions of s.
2898*09d4459fSDaniel Fojt
2899*09d4459fSDaniel Fojt For each position, find the set of characters C that it matches. Eliminate
2900*09d4459fSDaniel Fojt any characters from C that fail on grounds of backward context.
2901*09d4459fSDaniel Fojt
2902*09d4459fSDaniel Fojt Check whether the group's label L has nonempty
2903*09d4459fSDaniel Fojt intersection with C. If L - C is nonempty, create a new group labeled
2904*09d4459fSDaniel Fojt L - C and having the same positions as the current group, and set L to
2905*09d4459fSDaniel Fojt the intersection of L and C. Insert the position in the group, set
2906*09d4459fSDaniel Fojt C = C - L, and resume scanning.
2907*09d4459fSDaniel Fojt
2908*09d4459fSDaniel Fojt If after comparing with every group there are characters remaining in C,
2909*09d4459fSDaniel Fojt create a new group labeled with the characters of C and insert this
2910*09d4459fSDaniel Fojt position in that group. */
2911*09d4459fSDaniel Fojt
2912*09d4459fSDaniel Fojt static state_num
build_state(state_num s,struct dfa * d,unsigned char uc)2913*09d4459fSDaniel Fojt build_state (state_num s, struct dfa *d, unsigned char uc)
2914*09d4459fSDaniel Fojt {
2915*09d4459fSDaniel Fojt position_set follows; /* Union of the follows for each
2916*09d4459fSDaniel Fojt position of the current state. */
2917*09d4459fSDaniel Fojt position_set group; /* Positions that match the input char. */
2918*09d4459fSDaniel Fojt position_set tmp; /* Temporary space for merging sets. */
2919*09d4459fSDaniel Fojt state_num state; /* New state. */
2920*09d4459fSDaniel Fojt state_num state_newline; /* New state on a newline transition. */
2921*09d4459fSDaniel Fojt state_num state_letter; /* New state on a letter transition. */
2922*09d4459fSDaniel Fojt
2923*09d4459fSDaniel Fojt #ifdef DEBUG
2924*09d4459fSDaniel Fojt fprintf (stderr, "build state %td\n", s);
2925*09d4459fSDaniel Fojt #endif
2926*09d4459fSDaniel Fojt
2927*09d4459fSDaniel Fojt /* A pointer to the new transition table, and the table itself. */
2928*09d4459fSDaniel Fojt state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2929*09d4459fSDaniel Fojt state_num *trans = *ptrans;
2930*09d4459fSDaniel Fojt
2931*09d4459fSDaniel Fojt if (!trans)
2932*09d4459fSDaniel Fojt {
2933*09d4459fSDaniel Fojt /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2934*09d4459fSDaniel Fojt transition tables that can exist at once, other than for
2935*09d4459fSDaniel Fojt initial states. Often-used transition tables are quickly
2936*09d4459fSDaniel Fojt rebuilt, whereas rarely-used ones are cleared away. */
2937*09d4459fSDaniel Fojt if (MAX_TRCOUNT <= d->trcount)
2938*09d4459fSDaniel Fojt {
2939*09d4459fSDaniel Fojt for (state_num i = d->min_trcount; i < d->tralloc; i++)
2940*09d4459fSDaniel Fojt {
2941*09d4459fSDaniel Fojt free (d->trans[i]);
2942*09d4459fSDaniel Fojt free (d->fails[i]);
2943*09d4459fSDaniel Fojt d->trans[i] = d->fails[i] = NULL;
2944*09d4459fSDaniel Fojt }
2945*09d4459fSDaniel Fojt d->trcount = 0;
2946*09d4459fSDaniel Fojt }
2947*09d4459fSDaniel Fojt
2948*09d4459fSDaniel Fojt d->trcount++;
2949*09d4459fSDaniel Fojt *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
2950*09d4459fSDaniel Fojt
2951*09d4459fSDaniel Fojt /* Fill transition table with a default value which means that the
2952*09d4459fSDaniel Fojt transited state has not been calculated yet. */
2953*09d4459fSDaniel Fojt for (int i = 0; i < NOTCHAR; i++)
2954*09d4459fSDaniel Fojt trans[i] = -2;
2955*09d4459fSDaniel Fojt }
2956*09d4459fSDaniel Fojt
2957*09d4459fSDaniel Fojt /* Set up the success bits for this state. */
2958*09d4459fSDaniel Fojt d->success[s] = 0;
2959*09d4459fSDaniel Fojt if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2960*09d4459fSDaniel Fojt d->success[s] |= CTX_NEWLINE;
2961*09d4459fSDaniel Fojt if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2962*09d4459fSDaniel Fojt d->success[s] |= CTX_LETTER;
2963*09d4459fSDaniel Fojt if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2964*09d4459fSDaniel Fojt d->success[s] |= CTX_NONE;
2965*09d4459fSDaniel Fojt
2966*09d4459fSDaniel Fojt alloc_position_set (&follows, d->nleaves);
2967*09d4459fSDaniel Fojt
2968*09d4459fSDaniel Fojt /* Find the union of the follows of the positions of the group.
2969*09d4459fSDaniel Fojt This is a hideously inefficient loop. Fix it someday. */
2970*09d4459fSDaniel Fojt for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
2971*09d4459fSDaniel Fojt for (idx_t k = 0;
2972*09d4459fSDaniel Fojt k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
2973*09d4459fSDaniel Fojt insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
2974*09d4459fSDaniel Fojt &follows);
2975*09d4459fSDaniel Fojt
2976*09d4459fSDaniel Fojt /* Positions that match the input char. */
2977*09d4459fSDaniel Fojt alloc_position_set (&group, d->nleaves);
2978*09d4459fSDaniel Fojt
2979*09d4459fSDaniel Fojt /* The group's label. */
2980*09d4459fSDaniel Fojt charclass label;
2981*09d4459fSDaniel Fojt fillset (&label);
2982*09d4459fSDaniel Fojt
2983*09d4459fSDaniel Fojt for (idx_t i = 0; i < follows.nelem; i++)
2984*09d4459fSDaniel Fojt {
2985*09d4459fSDaniel Fojt charclass matches; /* Set of matching characters. */
2986*09d4459fSDaniel Fojt position pos = follows.elems[i];
2987*09d4459fSDaniel Fojt bool matched = false;
2988*09d4459fSDaniel Fojt if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2989*09d4459fSDaniel Fojt {
2990*09d4459fSDaniel Fojt zeroset (&matches);
2991*09d4459fSDaniel Fojt setbit (d->tokens[pos.index], &matches);
2992*09d4459fSDaniel Fojt if (d->tokens[pos.index] == uc)
2993*09d4459fSDaniel Fojt matched = true;
2994*09d4459fSDaniel Fojt }
2995*09d4459fSDaniel Fojt else if (d->tokens[pos.index] >= CSET)
2996*09d4459fSDaniel Fojt {
2997*09d4459fSDaniel Fojt matches = d->charclasses[d->tokens[pos.index] - CSET];
2998*09d4459fSDaniel Fojt if (tstbit (uc, &matches))
2999*09d4459fSDaniel Fojt matched = true;
3000*09d4459fSDaniel Fojt }
3001*09d4459fSDaniel Fojt else if (d->tokens[pos.index] == ANYCHAR)
3002*09d4459fSDaniel Fojt {
3003*09d4459fSDaniel Fojt matches = d->charclasses[d->canychar];
3004*09d4459fSDaniel Fojt if (tstbit (uc, &matches))
3005*09d4459fSDaniel Fojt matched = true;
3006*09d4459fSDaniel Fojt
3007*09d4459fSDaniel Fojt /* ANYCHAR must match with a single character, so we must put
3008*09d4459fSDaniel Fojt it to D->states[s].mbps which contains the positions which
3009*09d4459fSDaniel Fojt can match with a single character not a byte. If all
3010*09d4459fSDaniel Fojt positions which has ANYCHAR does not depend on context of
3011*09d4459fSDaniel Fojt next character, we put the follows instead of it to
3012*09d4459fSDaniel Fojt D->states[s].mbps to optimize. */
3013*09d4459fSDaniel Fojt if (succeeds_in_context (pos.constraint, d->states[s].context,
3014*09d4459fSDaniel Fojt CTX_NONE))
3015*09d4459fSDaniel Fojt {
3016*09d4459fSDaniel Fojt if (d->states[s].mbps.nelem == 0)
3017*09d4459fSDaniel Fojt alloc_position_set (&d->states[s].mbps, 1);
3018*09d4459fSDaniel Fojt insert (pos, &d->states[s].mbps);
3019*09d4459fSDaniel Fojt }
3020*09d4459fSDaniel Fojt }
3021*09d4459fSDaniel Fojt else
3022*09d4459fSDaniel Fojt continue;
3023*09d4459fSDaniel Fojt
3024*09d4459fSDaniel Fojt /* Some characters may need to be eliminated from matches because
3025*09d4459fSDaniel Fojt they fail in the current context. */
3026*09d4459fSDaniel Fojt if (pos.constraint != NO_CONSTRAINT)
3027*09d4459fSDaniel Fojt {
3028*09d4459fSDaniel Fojt if (!succeeds_in_context (pos.constraint,
3029*09d4459fSDaniel Fojt d->states[s].context, CTX_NEWLINE))
3030*09d4459fSDaniel Fojt for (int j = 0; j < CHARCLASS_WORDS; j++)
3031*09d4459fSDaniel Fojt matches.w[j] &= ~d->syntax.newline.w[j];
3032*09d4459fSDaniel Fojt if (!succeeds_in_context (pos.constraint,
3033*09d4459fSDaniel Fojt d->states[s].context, CTX_LETTER))
3034*09d4459fSDaniel Fojt for (int j = 0; j < CHARCLASS_WORDS; ++j)
3035*09d4459fSDaniel Fojt matches.w[j] &= ~d->syntax.letters.w[j];
3036*09d4459fSDaniel Fojt if (!succeeds_in_context (pos.constraint,
3037*09d4459fSDaniel Fojt d->states[s].context, CTX_NONE))
3038*09d4459fSDaniel Fojt for (int j = 0; j < CHARCLASS_WORDS; ++j)
3039*09d4459fSDaniel Fojt matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3040*09d4459fSDaniel Fojt
3041*09d4459fSDaniel Fojt /* If there are no characters left, there's no point in going on. */
3042*09d4459fSDaniel Fojt if (emptyset (&matches))
3043*09d4459fSDaniel Fojt continue;
3044*09d4459fSDaniel Fojt
3045*09d4459fSDaniel Fojt /* If we have reset the bit that made us declare "matched", reset
3046*09d4459fSDaniel Fojt that indicator, too. This is required to avoid an infinite loop
3047*09d4459fSDaniel Fojt with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
3048*09d4459fSDaniel Fojt if (!tstbit (uc, &matches))
3049*09d4459fSDaniel Fojt matched = false;
3050*09d4459fSDaniel Fojt }
3051*09d4459fSDaniel Fojt
3052*09d4459fSDaniel Fojt #ifdef DEBUG
3053*09d4459fSDaniel Fojt fprintf (stderr, " nextpos %td:", pos.index);
3054*09d4459fSDaniel Fojt prtok (d->tokens[pos.index]);
3055*09d4459fSDaniel Fojt fprintf (stderr, " of");
3056*09d4459fSDaniel Fojt for (unsigned j = 0; j < NOTCHAR; j++)
3057*09d4459fSDaniel Fojt if (tstbit (j, &matches))
3058*09d4459fSDaniel Fojt fprintf (stderr, " 0x%02x", j);
3059*09d4459fSDaniel Fojt fprintf (stderr, "\n");
3060*09d4459fSDaniel Fojt #endif
3061*09d4459fSDaniel Fojt
3062*09d4459fSDaniel Fojt if (matched)
3063*09d4459fSDaniel Fojt {
3064*09d4459fSDaniel Fojt for (int k = 0; k < CHARCLASS_WORDS; ++k)
3065*09d4459fSDaniel Fojt label.w[k] &= matches.w[k];
3066*09d4459fSDaniel Fojt append (pos, &group);
3067*09d4459fSDaniel Fojt }
3068*09d4459fSDaniel Fojt else
3069*09d4459fSDaniel Fojt {
3070*09d4459fSDaniel Fojt for (int k = 0; k < CHARCLASS_WORDS; ++k)
3071*09d4459fSDaniel Fojt label.w[k] &= ~matches.w[k];
3072*09d4459fSDaniel Fojt }
3073*09d4459fSDaniel Fojt }
3074*09d4459fSDaniel Fojt
3075*09d4459fSDaniel Fojt alloc_position_set (&tmp, d->nleaves);
3076*09d4459fSDaniel Fojt
3077*09d4459fSDaniel Fojt if (group.nelem > 0)
3078*09d4459fSDaniel Fojt {
3079*09d4459fSDaniel Fojt /* If we are building a searching matcher, throw in the positions
3080*09d4459fSDaniel Fojt of state 0 as well, if possible. */
3081*09d4459fSDaniel Fojt if (d->searchflag)
3082*09d4459fSDaniel Fojt {
3083*09d4459fSDaniel Fojt /* If a token in follows.elems is not 1st byte of a multibyte
3084*09d4459fSDaniel Fojt character, or the states of follows must accept the bytes
3085*09d4459fSDaniel Fojt which are not 1st byte of the multibyte character.
3086*09d4459fSDaniel Fojt Then, if a state of follows encounters a byte, it must not be
3087*09d4459fSDaniel Fojt a 1st byte of a multibyte character nor a single byte character.
3088*09d4459fSDaniel Fojt In this case, do not add state[0].follows to next state, because
3089*09d4459fSDaniel Fojt state[0] must accept 1st-byte.
3090*09d4459fSDaniel Fojt
3091*09d4459fSDaniel Fojt For example, suppose <sb a> is a certain single byte character,
3092*09d4459fSDaniel Fojt <mb A> is a certain multibyte character, and the codepoint of
3093*09d4459fSDaniel Fojt <sb a> equals the 2nd byte of the codepoint of <mb A>. When
3094*09d4459fSDaniel Fojt state[0] accepts <sb a>, state[i] transits to state[i+1] by
3095*09d4459fSDaniel Fojt accepting the 1st byte of <mb A>, and state[i+1] accepts the
3096*09d4459fSDaniel Fojt 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3097*09d4459fSDaniel Fojt <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3098*09d4459fSDaniel Fojt not add state[0]. */
3099*09d4459fSDaniel Fojt
3100*09d4459fSDaniel Fojt bool mergeit = !d->localeinfo.multibyte;
3101*09d4459fSDaniel Fojt if (!mergeit)
3102*09d4459fSDaniel Fojt {
3103*09d4459fSDaniel Fojt mergeit = true;
3104*09d4459fSDaniel Fojt for (idx_t j = 0; mergeit && j < group.nelem; j++)
3105*09d4459fSDaniel Fojt mergeit &= d->multibyte_prop[group.elems[j].index];
3106*09d4459fSDaniel Fojt }
3107*09d4459fSDaniel Fojt if (mergeit)
3108*09d4459fSDaniel Fojt {
3109*09d4459fSDaniel Fojt merge (&d->states[0].elems, &group, &tmp);
3110*09d4459fSDaniel Fojt copy (&tmp, &group);
3111*09d4459fSDaniel Fojt }
3112*09d4459fSDaniel Fojt }
3113*09d4459fSDaniel Fojt
3114*09d4459fSDaniel Fojt /* Find out if the new state will want any context information,
3115*09d4459fSDaniel Fojt by calculating possible contexts that the group can match,
3116*09d4459fSDaniel Fojt and separate contexts that the new state wants to know. */
3117*09d4459fSDaniel Fojt int possible_contexts = charclass_context (d, &label);
3118*09d4459fSDaniel Fojt int separate_contexts = state_separate_contexts (d, &group);
3119*09d4459fSDaniel Fojt
3120*09d4459fSDaniel Fojt /* Find the state(s) corresponding to the union of the follows. */
3121*09d4459fSDaniel Fojt if (possible_contexts & ~separate_contexts)
3122*09d4459fSDaniel Fojt state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3123*09d4459fSDaniel Fojt else
3124*09d4459fSDaniel Fojt state = -1;
3125*09d4459fSDaniel Fojt if (separate_contexts & possible_contexts & CTX_NEWLINE)
3126*09d4459fSDaniel Fojt state_newline = state_index (d, &group, CTX_NEWLINE);
3127*09d4459fSDaniel Fojt else
3128*09d4459fSDaniel Fojt state_newline = state;
3129*09d4459fSDaniel Fojt if (separate_contexts & possible_contexts & CTX_LETTER)
3130*09d4459fSDaniel Fojt state_letter = state_index (d, &group, CTX_LETTER);
3131*09d4459fSDaniel Fojt else
3132*09d4459fSDaniel Fojt state_letter = state;
3133*09d4459fSDaniel Fojt
3134*09d4459fSDaniel Fojt /* Reallocate now, to reallocate any newline transition properly. */
3135*09d4459fSDaniel Fojt realloc_trans_if_necessary (d);
3136*09d4459fSDaniel Fojt }
3137*09d4459fSDaniel Fojt
3138*09d4459fSDaniel Fojt /* If we are a searching matcher, the default transition is to a state
3139*09d4459fSDaniel Fojt containing the positions of state 0, otherwise the default transition
3140*09d4459fSDaniel Fojt is to fail miserably. */
3141*09d4459fSDaniel Fojt else if (d->searchflag)
3142*09d4459fSDaniel Fojt {
3143*09d4459fSDaniel Fojt state_newline = 0;
3144*09d4459fSDaniel Fojt state_letter = d->min_trcount - 1;
3145*09d4459fSDaniel Fojt state = d->initstate_notbol;
3146*09d4459fSDaniel Fojt }
3147*09d4459fSDaniel Fojt else
3148*09d4459fSDaniel Fojt {
3149*09d4459fSDaniel Fojt state_newline = -1;
3150*09d4459fSDaniel Fojt state_letter = -1;
3151*09d4459fSDaniel Fojt state = -1;
3152*09d4459fSDaniel Fojt }
3153*09d4459fSDaniel Fojt
3154*09d4459fSDaniel Fojt /* Set the transitions for each character in the label. */
3155*09d4459fSDaniel Fojt for (int i = 0; i < NOTCHAR; i++)
3156*09d4459fSDaniel Fojt if (tstbit (i, &label))
3157*09d4459fSDaniel Fojt switch (d->syntax.sbit[i])
3158*09d4459fSDaniel Fojt {
3159*09d4459fSDaniel Fojt case CTX_NEWLINE:
3160*09d4459fSDaniel Fojt trans[i] = state_newline;
3161*09d4459fSDaniel Fojt break;
3162*09d4459fSDaniel Fojt case CTX_LETTER:
3163*09d4459fSDaniel Fojt trans[i] = state_letter;
3164*09d4459fSDaniel Fojt break;
3165*09d4459fSDaniel Fojt default:
3166*09d4459fSDaniel Fojt trans[i] = state;
3167*09d4459fSDaniel Fojt break;
3168*09d4459fSDaniel Fojt }
3169*09d4459fSDaniel Fojt
3170*09d4459fSDaniel Fojt #ifdef DEBUG
3171*09d4459fSDaniel Fojt fprintf (stderr, "trans table %td", s);
3172*09d4459fSDaniel Fojt for (int i = 0; i < NOTCHAR; ++i)
3173*09d4459fSDaniel Fojt {
3174*09d4459fSDaniel Fojt if (!(i & 0xf))
3175*09d4459fSDaniel Fojt fprintf (stderr, "\n");
3176*09d4459fSDaniel Fojt fprintf (stderr, " %2td", trans[i]);
3177*09d4459fSDaniel Fojt }
3178*09d4459fSDaniel Fojt fprintf (stderr, "\n");
3179*09d4459fSDaniel Fojt #endif
3180*09d4459fSDaniel Fojt
3181*09d4459fSDaniel Fojt free (group.elems);
3182*09d4459fSDaniel Fojt free (follows.elems);
3183*09d4459fSDaniel Fojt free (tmp.elems);
3184*09d4459fSDaniel Fojt
3185*09d4459fSDaniel Fojt /* Keep the newline transition in a special place so we can use it as
3186*09d4459fSDaniel Fojt a sentinel. */
3187*09d4459fSDaniel Fojt if (tstbit (d->syntax.eolbyte, &label))
3188*09d4459fSDaniel Fojt {
3189*09d4459fSDaniel Fojt d->newlines[s] = trans[d->syntax.eolbyte];
3190*09d4459fSDaniel Fojt trans[d->syntax.eolbyte] = -1;
3191*09d4459fSDaniel Fojt }
3192*09d4459fSDaniel Fojt
3193*09d4459fSDaniel Fojt return trans[uc];
3194*09d4459fSDaniel Fojt }
3195*09d4459fSDaniel Fojt
3196*09d4459fSDaniel Fojt /* Multibyte character handling sub-routines for dfaexec. */
3197*09d4459fSDaniel Fojt
3198*09d4459fSDaniel Fojt /* Consume a single byte and transit state from 's' to '*next_state'.
3199*09d4459fSDaniel Fojt This function is almost same as the state transition routin in dfaexec.
3200*09d4459fSDaniel Fojt But state transition is done just once, otherwise matching succeed or
3201*09d4459fSDaniel Fojt reach the end of the buffer. */
3202*09d4459fSDaniel Fojt static state_num
transit_state_singlebyte(struct dfa * d,state_num s,unsigned char const ** pp)3203*09d4459fSDaniel Fojt transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3204*09d4459fSDaniel Fojt {
3205*09d4459fSDaniel Fojt state_num *t;
3206*09d4459fSDaniel Fojt
3207*09d4459fSDaniel Fojt if (d->trans[s])
3208*09d4459fSDaniel Fojt t = d->trans[s];
3209*09d4459fSDaniel Fojt else if (d->fails[s])
3210*09d4459fSDaniel Fojt t = d->fails[s];
3211*09d4459fSDaniel Fojt else
3212*09d4459fSDaniel Fojt {
3213*09d4459fSDaniel Fojt build_state (s, d, **pp);
3214*09d4459fSDaniel Fojt if (d->trans[s])
3215*09d4459fSDaniel Fojt t = d->trans[s];
3216*09d4459fSDaniel Fojt else
3217*09d4459fSDaniel Fojt {
3218*09d4459fSDaniel Fojt t = d->fails[s];
3219*09d4459fSDaniel Fojt assert (t);
3220*09d4459fSDaniel Fojt }
3221*09d4459fSDaniel Fojt }
3222*09d4459fSDaniel Fojt
3223*09d4459fSDaniel Fojt if (t[**pp] == -2)
3224*09d4459fSDaniel Fojt build_state (s, d, **pp);
3225*09d4459fSDaniel Fojt
3226*09d4459fSDaniel Fojt return t[*(*pp)++];
3227*09d4459fSDaniel Fojt }
3228*09d4459fSDaniel Fojt
3229*09d4459fSDaniel Fojt /* Transit state from s, then return new state and update the pointer of
3230*09d4459fSDaniel Fojt the buffer. This function is for a period operator which can match a
3231*09d4459fSDaniel Fojt multi-byte character. */
3232*09d4459fSDaniel Fojt static state_num
transit_state(struct dfa * d,state_num s,unsigned char const ** pp,unsigned char const * end)3233*09d4459fSDaniel Fojt transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3234*09d4459fSDaniel Fojt unsigned char const *end)
3235*09d4459fSDaniel Fojt {
3236*09d4459fSDaniel Fojt wint_t wc;
3237*09d4459fSDaniel Fojt
3238*09d4459fSDaniel Fojt int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3239*09d4459fSDaniel Fojt
3240*09d4459fSDaniel Fojt /* This state has some operators which can match a multibyte character. */
3241*09d4459fSDaniel Fojt d->mb_follows.nelem = 0;
3242*09d4459fSDaniel Fojt
3243*09d4459fSDaniel Fojt /* Calculate the state which can be reached from the state 's' by
3244*09d4459fSDaniel Fojt consuming 'mbclen' single bytes from the buffer. */
3245*09d4459fSDaniel Fojt state_num s1 = s;
3246*09d4459fSDaniel Fojt int mbci;
3247*09d4459fSDaniel Fojt for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3248*09d4459fSDaniel Fojt s = transit_state_singlebyte (d, s, pp);
3249*09d4459fSDaniel Fojt *pp += mbclen - mbci;
3250*09d4459fSDaniel Fojt
3251*09d4459fSDaniel Fojt if (wc == WEOF)
3252*09d4459fSDaniel Fojt {
3253*09d4459fSDaniel Fojt /* It is an invalid character, so ANYCHAR is not accepted. */
3254*09d4459fSDaniel Fojt return s;
3255*09d4459fSDaniel Fojt }
3256*09d4459fSDaniel Fojt
3257*09d4459fSDaniel Fojt /* If all positions which have ANYCHAR do not depend on the context
3258*09d4459fSDaniel Fojt of the next character, calculate the next state with
3259*09d4459fSDaniel Fojt pre-calculated follows and cache the result. */
3260*09d4459fSDaniel Fojt if (d->states[s1].mb_trindex < 0)
3261*09d4459fSDaniel Fojt {
3262*09d4459fSDaniel Fojt if (MAX_TRCOUNT <= d->mb_trcount)
3263*09d4459fSDaniel Fojt {
3264*09d4459fSDaniel Fojt state_num s3;
3265*09d4459fSDaniel Fojt for (s3 = -1; s3 < d->tralloc; s3++)
3266*09d4459fSDaniel Fojt {
3267*09d4459fSDaniel Fojt free (d->mb_trans[s3]);
3268*09d4459fSDaniel Fojt d->mb_trans[s3] = NULL;
3269*09d4459fSDaniel Fojt }
3270*09d4459fSDaniel Fojt
3271*09d4459fSDaniel Fojt for (state_num i = 0; i < d->sindex; i++)
3272*09d4459fSDaniel Fojt d->states[i].mb_trindex = -1;
3273*09d4459fSDaniel Fojt d->mb_trcount = 0;
3274*09d4459fSDaniel Fojt }
3275*09d4459fSDaniel Fojt d->states[s1].mb_trindex = d->mb_trcount++;
3276*09d4459fSDaniel Fojt }
3277*09d4459fSDaniel Fojt
3278*09d4459fSDaniel Fojt if (! d->mb_trans[s])
3279*09d4459fSDaniel Fojt {
3280*09d4459fSDaniel Fojt enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3281*09d4459fSDaniel Fojt enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3282*09d4459fSDaniel Fojt d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3283*09d4459fSDaniel Fojt for (int i = 0; i < MAX_TRCOUNT; i++)
3284*09d4459fSDaniel Fojt d->mb_trans[s][i] = -1;
3285*09d4459fSDaniel Fojt }
3286*09d4459fSDaniel Fojt else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3287*09d4459fSDaniel Fojt return d->mb_trans[s][d->states[s1].mb_trindex];
3288*09d4459fSDaniel Fojt
3289*09d4459fSDaniel Fojt if (s == -1)
3290*09d4459fSDaniel Fojt copy (&d->states[s1].mbps, &d->mb_follows);
3291*09d4459fSDaniel Fojt else
3292*09d4459fSDaniel Fojt merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3293*09d4459fSDaniel Fojt
3294*09d4459fSDaniel Fojt int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3295*09d4459fSDaniel Fojt state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3296*09d4459fSDaniel Fojt realloc_trans_if_necessary (d);
3297*09d4459fSDaniel Fojt
3298*09d4459fSDaniel Fojt d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3299*09d4459fSDaniel Fojt
3300*09d4459fSDaniel Fojt return s2;
3301*09d4459fSDaniel Fojt }
3302*09d4459fSDaniel Fojt
3303*09d4459fSDaniel Fojt /* The initial state may encounter a byte which is not a single byte character
3304*09d4459fSDaniel Fojt nor the first byte of a multibyte character. But it is incorrect for the
3305*09d4459fSDaniel Fojt initial state to accept such a byte. For example, in Shift JIS the regular
3306*09d4459fSDaniel Fojt expression "\\" accepts the codepoint 0x5c, but should not accept the second
3307*09d4459fSDaniel Fojt byte of the codepoint 0x815c. Then the initial state must skip the bytes
3308*09d4459fSDaniel Fojt that are not a single byte character nor the first byte of a multibyte
3309*09d4459fSDaniel Fojt character.
3310*09d4459fSDaniel Fojt
3311*09d4459fSDaniel Fojt Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3312*09d4459fSDaniel Fojt or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3313*09d4459fSDaniel Fojt the result is greater than P, set *WCP to the final wide character
3314*09d4459fSDaniel Fojt processed, or to WEOF if no wide character is processed. Otherwise,
3315*09d4459fSDaniel Fojt if WCP is non-NULL, *WCP may or may not be updated.
3316*09d4459fSDaniel Fojt
3317*09d4459fSDaniel Fojt Both P and MBP must be no larger than END. */
3318*09d4459fSDaniel Fojt static unsigned char const *
skip_remains_mb(struct dfa * d,unsigned char const * p,unsigned char const * mbp,char const * end)3319*09d4459fSDaniel Fojt skip_remains_mb (struct dfa *d, unsigned char const *p,
3320*09d4459fSDaniel Fojt unsigned char const *mbp, char const *end)
3321*09d4459fSDaniel Fojt {
3322*09d4459fSDaniel Fojt if (d->syntax.never_trail[*p])
3323*09d4459fSDaniel Fojt return p;
3324*09d4459fSDaniel Fojt while (mbp < p)
3325*09d4459fSDaniel Fojt {
3326*09d4459fSDaniel Fojt wint_t wc;
3327*09d4459fSDaniel Fojt mbp += mbs_to_wchar (&wc, (char const *) mbp,
3328*09d4459fSDaniel Fojt end - (char const *) mbp, d);
3329*09d4459fSDaniel Fojt }
3330*09d4459fSDaniel Fojt return mbp;
3331*09d4459fSDaniel Fojt }
3332*09d4459fSDaniel Fojt
3333*09d4459fSDaniel Fojt /* Search through a buffer looking for a match to the struct dfa *D.
3334*09d4459fSDaniel Fojt Find the first occurrence of a string matching the regexp in the
3335*09d4459fSDaniel Fojt buffer, and the shortest possible version thereof. Return a pointer to
3336*09d4459fSDaniel Fojt the first character after the match, or NULL if none is found. BEGIN
3337*09d4459fSDaniel Fojt points to the beginning of the buffer, and END points to the first byte
3338*09d4459fSDaniel Fojt after its end. Note however that we store a sentinel byte (usually
3339*09d4459fSDaniel Fojt newline) in *END, so the actual buffer must be one byte longer.
3340*09d4459fSDaniel Fojt When ALLOW_NL, newlines may appear in the matching string.
3341*09d4459fSDaniel Fojt If COUNT is non-NULL, increment *COUNT once for each newline processed.
3342*09d4459fSDaniel Fojt If MULTIBYTE, the input consists of multibyte characters and/or
3343*09d4459fSDaniel Fojt encoding-error bytes. Otherwise, it consists of single-byte characters.
3344*09d4459fSDaniel Fojt Here is the list of features that make this DFA matcher punt:
3345*09d4459fSDaniel Fojt - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3346*09d4459fSDaniel Fojt - [^...] in non-simple locale
3347*09d4459fSDaniel Fojt - [[=foo=]] or [[.foo.]]
3348*09d4459fSDaniel Fojt - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3349*09d4459fSDaniel Fojt - back-reference: (.)\1
3350*09d4459fSDaniel Fojt - word-delimiter in multibyte locale: \<, \>, \b, \B
3351*09d4459fSDaniel Fojt See struct localeinfo.simple for the definition of "simple locale". */
3352*09d4459fSDaniel Fojt
3353*09d4459fSDaniel Fojt static inline char *
dfaexec_main(struct dfa * d,char const * begin,char * end,bool allow_nl,ptrdiff_t * count,bool multibyte)3354*09d4459fSDaniel Fojt dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3355*09d4459fSDaniel Fojt ptrdiff_t *count, bool multibyte)
3356*09d4459fSDaniel Fojt {
3357*09d4459fSDaniel Fojt if (MAX_TRCOUNT <= d->sindex)
3358*09d4459fSDaniel Fojt {
3359*09d4459fSDaniel Fojt for (state_num s = d->min_trcount; s < d->sindex; s++)
3360*09d4459fSDaniel Fojt {
3361*09d4459fSDaniel Fojt free (d->states[s].elems.elems);
3362*09d4459fSDaniel Fojt free (d->states[s].mbps.elems);
3363*09d4459fSDaniel Fojt }
3364*09d4459fSDaniel Fojt d->sindex = d->min_trcount;
3365*09d4459fSDaniel Fojt
3366*09d4459fSDaniel Fojt if (d->trans)
3367*09d4459fSDaniel Fojt {
3368*09d4459fSDaniel Fojt for (state_num s = 0; s < d->tralloc; s++)
3369*09d4459fSDaniel Fojt {
3370*09d4459fSDaniel Fojt free (d->trans[s]);
3371*09d4459fSDaniel Fojt free (d->fails[s]);
3372*09d4459fSDaniel Fojt d->trans[s] = d->fails[s] = NULL;
3373*09d4459fSDaniel Fojt }
3374*09d4459fSDaniel Fojt d->trcount = 0;
3375*09d4459fSDaniel Fojt }
3376*09d4459fSDaniel Fojt
3377*09d4459fSDaniel Fojt if (d->localeinfo.multibyte && d->mb_trans)
3378*09d4459fSDaniel Fojt {
3379*09d4459fSDaniel Fojt for (state_num s = -1; s < d->tralloc; s++)
3380*09d4459fSDaniel Fojt {
3381*09d4459fSDaniel Fojt free (d->mb_trans[s]);
3382*09d4459fSDaniel Fojt d->mb_trans[s] = NULL;
3383*09d4459fSDaniel Fojt }
3384*09d4459fSDaniel Fojt for (state_num s = 0; s < d->min_trcount; s++)
3385*09d4459fSDaniel Fojt d->states[s].mb_trindex = -1;
3386*09d4459fSDaniel Fojt d->mb_trcount = 0;
3387*09d4459fSDaniel Fojt }
3388*09d4459fSDaniel Fojt }
3389*09d4459fSDaniel Fojt
3390*09d4459fSDaniel Fojt if (!d->tralloc)
3391*09d4459fSDaniel Fojt realloc_trans_if_necessary (d);
3392*09d4459fSDaniel Fojt
3393*09d4459fSDaniel Fojt /* Current state. */
3394*09d4459fSDaniel Fojt state_num s = 0, s1 = 0;
3395*09d4459fSDaniel Fojt
3396*09d4459fSDaniel Fojt /* Current input character. */
3397*09d4459fSDaniel Fojt unsigned char const *p = (unsigned char const *) begin;
3398*09d4459fSDaniel Fojt unsigned char const *mbp = p;
3399*09d4459fSDaniel Fojt
3400*09d4459fSDaniel Fojt /* Copy of d->trans so it can be optimized into a register. */
3401*09d4459fSDaniel Fojt state_num **trans = d->trans;
3402*09d4459fSDaniel Fojt unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3403*09d4459fSDaniel Fojt unsigned char saved_end = *(unsigned char *) end;
3404*09d4459fSDaniel Fojt *end = eol;
3405*09d4459fSDaniel Fojt
3406*09d4459fSDaniel Fojt if (multibyte)
3407*09d4459fSDaniel Fojt {
3408*09d4459fSDaniel Fojt memset (&d->mbs, 0, sizeof d->mbs);
3409*09d4459fSDaniel Fojt if (d->mb_follows.alloc == 0)
3410*09d4459fSDaniel Fojt alloc_position_set (&d->mb_follows, d->nleaves);
3411*09d4459fSDaniel Fojt }
3412*09d4459fSDaniel Fojt
3413*09d4459fSDaniel Fojt idx_t nlcount = 0;
3414*09d4459fSDaniel Fojt for (;;)
3415*09d4459fSDaniel Fojt {
3416*09d4459fSDaniel Fojt state_num *t;
3417*09d4459fSDaniel Fojt while ((t = trans[s]) != NULL)
3418*09d4459fSDaniel Fojt {
3419*09d4459fSDaniel Fojt if (s < d->min_trcount)
3420*09d4459fSDaniel Fojt {
3421*09d4459fSDaniel Fojt if (!multibyte || d->states[s].mbps.nelem == 0)
3422*09d4459fSDaniel Fojt {
3423*09d4459fSDaniel Fojt while (t[*p] == s)
3424*09d4459fSDaniel Fojt p++;
3425*09d4459fSDaniel Fojt }
3426*09d4459fSDaniel Fojt if (multibyte)
3427*09d4459fSDaniel Fojt p = mbp = skip_remains_mb (d, p, mbp, end);
3428*09d4459fSDaniel Fojt }
3429*09d4459fSDaniel Fojt
3430*09d4459fSDaniel Fojt if (multibyte)
3431*09d4459fSDaniel Fojt {
3432*09d4459fSDaniel Fojt s1 = s;
3433*09d4459fSDaniel Fojt
3434*09d4459fSDaniel Fojt if (d->states[s].mbps.nelem == 0
3435*09d4459fSDaniel Fojt || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3436*09d4459fSDaniel Fojt {
3437*09d4459fSDaniel Fojt /* If an input character does not match ANYCHAR, do it
3438*09d4459fSDaniel Fojt like a single-byte character. */
3439*09d4459fSDaniel Fojt s = t[*p++];
3440*09d4459fSDaniel Fojt }
3441*09d4459fSDaniel Fojt else
3442*09d4459fSDaniel Fojt {
3443*09d4459fSDaniel Fojt s = transit_state (d, s, &p, (unsigned char *) end);
3444*09d4459fSDaniel Fojt mbp = p;
3445*09d4459fSDaniel Fojt trans = d->trans;
3446*09d4459fSDaniel Fojt }
3447*09d4459fSDaniel Fojt }
3448*09d4459fSDaniel Fojt else
3449*09d4459fSDaniel Fojt {
3450*09d4459fSDaniel Fojt s1 = t[*p++];
3451*09d4459fSDaniel Fojt t = trans[s1];
3452*09d4459fSDaniel Fojt if (! t)
3453*09d4459fSDaniel Fojt {
3454*09d4459fSDaniel Fojt state_num tmp = s;
3455*09d4459fSDaniel Fojt s = s1;
3456*09d4459fSDaniel Fojt s1 = tmp; /* swap */
3457*09d4459fSDaniel Fojt break;
3458*09d4459fSDaniel Fojt }
3459*09d4459fSDaniel Fojt if (s < d->min_trcount)
3460*09d4459fSDaniel Fojt {
3461*09d4459fSDaniel Fojt while (t[*p] == s1)
3462*09d4459fSDaniel Fojt p++;
3463*09d4459fSDaniel Fojt }
3464*09d4459fSDaniel Fojt s = t[*p++];
3465*09d4459fSDaniel Fojt }
3466*09d4459fSDaniel Fojt }
3467*09d4459fSDaniel Fojt
3468*09d4459fSDaniel Fojt if (s < 0)
3469*09d4459fSDaniel Fojt {
3470*09d4459fSDaniel Fojt if (s == -2)
3471*09d4459fSDaniel Fojt {
3472*09d4459fSDaniel Fojt s = build_state (s1, d, p[-1]);
3473*09d4459fSDaniel Fojt trans = d->trans;
3474*09d4459fSDaniel Fojt }
3475*09d4459fSDaniel Fojt else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3476*09d4459fSDaniel Fojt {
3477*09d4459fSDaniel Fojt /* The previous character was a newline. Count it, and skip
3478*09d4459fSDaniel Fojt checking of multibyte character boundary until here. */
3479*09d4459fSDaniel Fojt nlcount++;
3480*09d4459fSDaniel Fojt mbp = p;
3481*09d4459fSDaniel Fojt
3482*09d4459fSDaniel Fojt s = (allow_nl ? d->newlines[s1]
3483*09d4459fSDaniel Fojt : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3484*09d4459fSDaniel Fojt : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3485*09d4459fSDaniel Fojt : d->initstate_notbol);
3486*09d4459fSDaniel Fojt }
3487*09d4459fSDaniel Fojt else
3488*09d4459fSDaniel Fojt {
3489*09d4459fSDaniel Fojt p = NULL;
3490*09d4459fSDaniel Fojt goto done;
3491*09d4459fSDaniel Fojt }
3492*09d4459fSDaniel Fojt }
3493*09d4459fSDaniel Fojt else if (d->fails[s])
3494*09d4459fSDaniel Fojt {
3495*09d4459fSDaniel Fojt if ((d->success[s] & d->syntax.sbit[*p])
3496*09d4459fSDaniel Fojt || ((char *) p == end
3497*09d4459fSDaniel Fojt && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3498*09d4459fSDaniel Fojt d)))
3499*09d4459fSDaniel Fojt goto done;
3500*09d4459fSDaniel Fojt
3501*09d4459fSDaniel Fojt if (multibyte && s < d->min_trcount)
3502*09d4459fSDaniel Fojt p = mbp = skip_remains_mb (d, p, mbp, end);
3503*09d4459fSDaniel Fojt
3504*09d4459fSDaniel Fojt s1 = s;
3505*09d4459fSDaniel Fojt if (!multibyte || d->states[s].mbps.nelem == 0
3506*09d4459fSDaniel Fojt || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3507*09d4459fSDaniel Fojt {
3508*09d4459fSDaniel Fojt /* If a input character does not match ANYCHAR, do it
3509*09d4459fSDaniel Fojt like a single-byte character. */
3510*09d4459fSDaniel Fojt s = d->fails[s][*p++];
3511*09d4459fSDaniel Fojt }
3512*09d4459fSDaniel Fojt else
3513*09d4459fSDaniel Fojt {
3514*09d4459fSDaniel Fojt s = transit_state (d, s, &p, (unsigned char *) end);
3515*09d4459fSDaniel Fojt mbp = p;
3516*09d4459fSDaniel Fojt trans = d->trans;
3517*09d4459fSDaniel Fojt }
3518*09d4459fSDaniel Fojt }
3519*09d4459fSDaniel Fojt else
3520*09d4459fSDaniel Fojt {
3521*09d4459fSDaniel Fojt build_state (s, d, p[0]);
3522*09d4459fSDaniel Fojt trans = d->trans;
3523*09d4459fSDaniel Fojt }
3524*09d4459fSDaniel Fojt }
3525*09d4459fSDaniel Fojt
3526*09d4459fSDaniel Fojt done:
3527*09d4459fSDaniel Fojt if (count)
3528*09d4459fSDaniel Fojt *count += nlcount;
3529*09d4459fSDaniel Fojt *end = saved_end;
3530*09d4459fSDaniel Fojt return (char *) p;
3531*09d4459fSDaniel Fojt }
3532*09d4459fSDaniel Fojt
/* dfaexec_mb and dfaexec_sb are versions of dfaexec specialized for the
   multibyte and single-byte cases.  They exist for performance, since
   dfaexec_main is an inline function.  */

/* Run dfaexec_main on D with its MULTIBYTE argument true.
   BACKREF is accepted but never written.  */
static char *
dfaexec_mb (struct dfa *d, char const *begin, char *end,
            bool allow_nl, ptrdiff_t *count, bool *backref)
{
  bool const multibyte = true;
  return dfaexec_main (d, begin, end, allow_nl, count, multibyte);
}
3542*09d4459fSDaniel Fojt
/* Run dfaexec_main on D with its MULTIBYTE argument false.
   BACKREF is accepted but never written.  */
static char *
dfaexec_sb (struct dfa *d, char const *begin, char *end,
            bool allow_nl, ptrdiff_t *count, bool *backref)
{
  bool const multibyte = false;
  return dfaexec_main (d, begin, end, allow_nl, count, multibyte);
}
3549*09d4459fSDaniel Fojt
/* Matcher used for any regexp that contains a construct not supported by
   this code.  No scanning is done: store true in *BACKREF to tell the
   caller that another matcher is needed, and return BEGIN.
   D, END, ALLOW_NL and COUNT are ignored.

   BACKREF may be null, in which case nothing is stored.
   NOTE(review): dfaexec's public documentation is believed to describe
   BACKREF as optional; confirm against dfa.h.  */
static char *
dfaexec_noop (struct dfa *d, char const *begin, char *end,
              bool allow_nl, ptrdiff_t *count, bool *backref)
{
  /* Guard the store so that a null BACKREF is not dereferenced.  */
  if (backref)
    *backref = true;
  return (char *) begin;
}
3559*09d4459fSDaniel Fojt
3560*09d4459fSDaniel Fojt /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3561*09d4459fSDaniel Fojt but faster and set *BACKREF if the DFA code does not support this
3562*09d4459fSDaniel Fojt regexp usage. */
3563*09d4459fSDaniel Fojt
3564*09d4459fSDaniel Fojt char *
dfaexec(struct dfa * d,char const * begin,char * end,bool allow_nl,ptrdiff_t * count,bool * backref)3565*09d4459fSDaniel Fojt dfaexec (struct dfa *d, char const *begin, char *end,
3566*09d4459fSDaniel Fojt bool allow_nl, ptrdiff_t *count, bool *backref)
3567*09d4459fSDaniel Fojt {
3568*09d4459fSDaniel Fojt return d->dfaexec (d, begin, end, allow_nl, count, backref);
3569*09d4459fSDaniel Fojt }
3570*09d4459fSDaniel Fojt
3571*09d4459fSDaniel Fojt struct dfa *
dfasuperset(struct dfa const * d)3572*09d4459fSDaniel Fojt dfasuperset (struct dfa const *d)
3573*09d4459fSDaniel Fojt {
3574*09d4459fSDaniel Fojt return d->superset;
3575*09d4459fSDaniel Fojt }
3576*09d4459fSDaniel Fojt
3577*09d4459fSDaniel Fojt bool
dfaisfast(struct dfa const * d)3578*09d4459fSDaniel Fojt dfaisfast (struct dfa const *d)
3579*09d4459fSDaniel Fojt {
3580*09d4459fSDaniel Fojt return d->fast;
3581*09d4459fSDaniel Fojt }
3582*09d4459fSDaniel Fojt
3583*09d4459fSDaniel Fojt static void
free_mbdata(struct dfa * d)3584*09d4459fSDaniel Fojt free_mbdata (struct dfa *d)
3585*09d4459fSDaniel Fojt {
3586*09d4459fSDaniel Fojt free (d->multibyte_prop);
3587*09d4459fSDaniel Fojt free (d->lex.brack.chars);
3588*09d4459fSDaniel Fojt free (d->mb_follows.elems);
3589*09d4459fSDaniel Fojt
3590*09d4459fSDaniel Fojt if (d->mb_trans)
3591*09d4459fSDaniel Fojt {
3592*09d4459fSDaniel Fojt state_num s;
3593*09d4459fSDaniel Fojt for (s = -1; s < d->tralloc; s++)
3594*09d4459fSDaniel Fojt free (d->mb_trans[s]);
3595*09d4459fSDaniel Fojt free (d->mb_trans - 2);
3596*09d4459fSDaniel Fojt }
3597*09d4459fSDaniel Fojt }
3598*09d4459fSDaniel Fojt
3599*09d4459fSDaniel Fojt /* Return true if every construct in D is supported by this DFA matcher. */
3600*09d4459fSDaniel Fojt static bool _GL_ATTRIBUTE_PURE
dfa_supported(struct dfa const * d)3601*09d4459fSDaniel Fojt dfa_supported (struct dfa const *d)
3602*09d4459fSDaniel Fojt {
3603*09d4459fSDaniel Fojt for (idx_t i = 0; i < d->tindex; i++)
3604*09d4459fSDaniel Fojt {
3605*09d4459fSDaniel Fojt switch (d->tokens[i])
3606*09d4459fSDaniel Fojt {
3607*09d4459fSDaniel Fojt case BEGWORD:
3608*09d4459fSDaniel Fojt case ENDWORD:
3609*09d4459fSDaniel Fojt case LIMWORD:
3610*09d4459fSDaniel Fojt case NOTLIMWORD:
3611*09d4459fSDaniel Fojt if (!d->localeinfo.multibyte)
3612*09d4459fSDaniel Fojt continue;
3613*09d4459fSDaniel Fojt FALLTHROUGH;
3614*09d4459fSDaniel Fojt case BACKREF:
3615*09d4459fSDaniel Fojt case MBCSET:
3616*09d4459fSDaniel Fojt return false;
3617*09d4459fSDaniel Fojt }
3618*09d4459fSDaniel Fojt }
3619*09d4459fSDaniel Fojt return true;
3620*09d4459fSDaniel Fojt }
3621*09d4459fSDaniel Fojt
3622*09d4459fSDaniel Fojt /* Disable use of the superset DFA if it is not likely to help
3623*09d4459fSDaniel Fojt performance. */
3624*09d4459fSDaniel Fojt static void
maybe_disable_superset_dfa(struct dfa * d)3625*09d4459fSDaniel Fojt maybe_disable_superset_dfa (struct dfa *d)
3626*09d4459fSDaniel Fojt {
3627*09d4459fSDaniel Fojt if (!d->localeinfo.using_utf8)
3628*09d4459fSDaniel Fojt return;
3629*09d4459fSDaniel Fojt
3630*09d4459fSDaniel Fojt bool have_backref = false;
3631*09d4459fSDaniel Fojt for (idx_t i = 0; i < d->tindex; i++)
3632*09d4459fSDaniel Fojt {
3633*09d4459fSDaniel Fojt switch (d->tokens[i])
3634*09d4459fSDaniel Fojt {
3635*09d4459fSDaniel Fojt case ANYCHAR:
3636*09d4459fSDaniel Fojt /* Lowered. */
3637*09d4459fSDaniel Fojt abort ();
3638*09d4459fSDaniel Fojt case BACKREF:
3639*09d4459fSDaniel Fojt have_backref = true;
3640*09d4459fSDaniel Fojt break;
3641*09d4459fSDaniel Fojt case MBCSET:
3642*09d4459fSDaniel Fojt /* Requires multi-byte algorithm. */
3643*09d4459fSDaniel Fojt return;
3644*09d4459fSDaniel Fojt default:
3645*09d4459fSDaniel Fojt break;
3646*09d4459fSDaniel Fojt }
3647*09d4459fSDaniel Fojt }
3648*09d4459fSDaniel Fojt
3649*09d4459fSDaniel Fojt if (!have_backref && d->superset)
3650*09d4459fSDaniel Fojt {
3651*09d4459fSDaniel Fojt /* The superset DFA is not likely to be much faster, so remove it. */
3652*09d4459fSDaniel Fojt dfafree (d->superset);
3653*09d4459fSDaniel Fojt free (d->superset);
3654*09d4459fSDaniel Fojt d->superset = NULL;
3655*09d4459fSDaniel Fojt }
3656*09d4459fSDaniel Fojt
3657*09d4459fSDaniel Fojt free_mbdata (d);
3658*09d4459fSDaniel Fojt d->localeinfo.multibyte = false;
3659*09d4459fSDaniel Fojt d->dfaexec = dfaexec_sb;
3660*09d4459fSDaniel Fojt d->fast = true;
3661*09d4459fSDaniel Fojt }
3662*09d4459fSDaniel Fojt
3663*09d4459fSDaniel Fojt static void
dfassbuild(struct dfa * d)3664*09d4459fSDaniel Fojt dfassbuild (struct dfa *d)
3665*09d4459fSDaniel Fojt {
3666*09d4459fSDaniel Fojt struct dfa *sup = dfaalloc ();
3667*09d4459fSDaniel Fojt
3668*09d4459fSDaniel Fojt *sup = *d;
3669*09d4459fSDaniel Fojt sup->localeinfo.multibyte = false;
3670*09d4459fSDaniel Fojt sup->dfaexec = dfaexec_sb;
3671*09d4459fSDaniel Fojt sup->multibyte_prop = NULL;
3672*09d4459fSDaniel Fojt sup->superset = NULL;
3673*09d4459fSDaniel Fojt sup->states = NULL;
3674*09d4459fSDaniel Fojt sup->sindex = 0;
3675*09d4459fSDaniel Fojt sup->constraints = NULL;
3676*09d4459fSDaniel Fojt sup->separates = NULL;
3677*09d4459fSDaniel Fojt sup->follows = NULL;
3678*09d4459fSDaniel Fojt sup->tralloc = 0;
3679*09d4459fSDaniel Fojt sup->trans = NULL;
3680*09d4459fSDaniel Fojt sup->fails = NULL;
3681*09d4459fSDaniel Fojt sup->success = NULL;
3682*09d4459fSDaniel Fojt sup->newlines = NULL;
3683*09d4459fSDaniel Fojt
3684*09d4459fSDaniel Fojt sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3685*09d4459fSDaniel Fojt if (d->cindex)
3686*09d4459fSDaniel Fojt {
3687*09d4459fSDaniel Fojt memcpy (sup->charclasses, d->charclasses,
3688*09d4459fSDaniel Fojt d->cindex * sizeof *sup->charclasses);
3689*09d4459fSDaniel Fojt }
3690*09d4459fSDaniel Fojt
3691*09d4459fSDaniel Fojt sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3692*09d4459fSDaniel Fojt sup->talloc = d->tindex * 2;
3693*09d4459fSDaniel Fojt
3694*09d4459fSDaniel Fojt bool have_achar = false;
3695*09d4459fSDaniel Fojt bool have_nchar = false;
3696*09d4459fSDaniel Fojt idx_t j;
3697*09d4459fSDaniel Fojt for (idx_t i = j = 0; i < d->tindex; i++)
3698*09d4459fSDaniel Fojt {
3699*09d4459fSDaniel Fojt switch (d->tokens[i])
3700*09d4459fSDaniel Fojt {
3701*09d4459fSDaniel Fojt case ANYCHAR:
3702*09d4459fSDaniel Fojt case MBCSET:
3703*09d4459fSDaniel Fojt case BACKREF:
3704*09d4459fSDaniel Fojt {
3705*09d4459fSDaniel Fojt charclass ccl;
3706*09d4459fSDaniel Fojt fillset (&ccl);
3707*09d4459fSDaniel Fojt sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3708*09d4459fSDaniel Fojt sup->tokens[j++] = STAR;
3709*09d4459fSDaniel Fojt if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3710*09d4459fSDaniel Fojt || d->tokens[i + 1] == PLUS)
3711*09d4459fSDaniel Fojt i++;
3712*09d4459fSDaniel Fojt have_achar = true;
3713*09d4459fSDaniel Fojt }
3714*09d4459fSDaniel Fojt break;
3715*09d4459fSDaniel Fojt case BEGWORD:
3716*09d4459fSDaniel Fojt case ENDWORD:
3717*09d4459fSDaniel Fojt case LIMWORD:
3718*09d4459fSDaniel Fojt case NOTLIMWORD:
3719*09d4459fSDaniel Fojt if (d->localeinfo.multibyte)
3720*09d4459fSDaniel Fojt {
3721*09d4459fSDaniel Fojt /* These constraints aren't supported in a multibyte locale.
3722*09d4459fSDaniel Fojt Ignore them in the superset DFA. */
3723*09d4459fSDaniel Fojt sup->tokens[j++] = EMPTY;
3724*09d4459fSDaniel Fojt break;
3725*09d4459fSDaniel Fojt }
3726*09d4459fSDaniel Fojt FALLTHROUGH;
3727*09d4459fSDaniel Fojt default:
3728*09d4459fSDaniel Fojt sup->tokens[j++] = d->tokens[i];
3729*09d4459fSDaniel Fojt if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3730*09d4459fSDaniel Fojt || d->tokens[i] >= CSET)
3731*09d4459fSDaniel Fojt have_nchar = true;
3732*09d4459fSDaniel Fojt break;
3733*09d4459fSDaniel Fojt }
3734*09d4459fSDaniel Fojt }
3735*09d4459fSDaniel Fojt sup->tindex = j;
3736*09d4459fSDaniel Fojt
3737*09d4459fSDaniel Fojt if (have_nchar && (have_achar || d->localeinfo.multibyte))
3738*09d4459fSDaniel Fojt d->superset = sup;
3739*09d4459fSDaniel Fojt else
3740*09d4459fSDaniel Fojt {
3741*09d4459fSDaniel Fojt dfafree (sup);
3742*09d4459fSDaniel Fojt free (sup);
3743*09d4459fSDaniel Fojt }
3744*09d4459fSDaniel Fojt }
3745*09d4459fSDaniel Fojt
3746*09d4459fSDaniel Fojt /* Parse a string S of length LEN into D (but skip this step if S is null).
3747*09d4459fSDaniel Fojt Then analyze D and build a matcher for it.
3748*09d4459fSDaniel Fojt SEARCHFLAG says whether to build a searching or an exact matcher. */
3749*09d4459fSDaniel Fojt void
dfacomp(char const * s,idx_t len,struct dfa * d,bool searchflag)3750*09d4459fSDaniel Fojt dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3751*09d4459fSDaniel Fojt {
3752*09d4459fSDaniel Fojt if (s != NULL)
3753*09d4459fSDaniel Fojt dfaparse (s, len, d);
3754*09d4459fSDaniel Fojt
3755*09d4459fSDaniel Fojt dfassbuild (d);
3756*09d4459fSDaniel Fojt
3757*09d4459fSDaniel Fojt if (dfa_supported (d))
3758*09d4459fSDaniel Fojt {
3759*09d4459fSDaniel Fojt maybe_disable_superset_dfa (d);
3760*09d4459fSDaniel Fojt dfaanalyze (d, searchflag);
3761*09d4459fSDaniel Fojt }
3762*09d4459fSDaniel Fojt else
3763*09d4459fSDaniel Fojt {
3764*09d4459fSDaniel Fojt d->dfaexec = dfaexec_noop;
3765*09d4459fSDaniel Fojt }
3766*09d4459fSDaniel Fojt
3767*09d4459fSDaniel Fojt if (d->superset)
3768*09d4459fSDaniel Fojt {
3769*09d4459fSDaniel Fojt d->fast = true;
3770*09d4459fSDaniel Fojt dfaanalyze (d->superset, searchflag);
3771*09d4459fSDaniel Fojt }
3772*09d4459fSDaniel Fojt }
3773*09d4459fSDaniel Fojt
3774*09d4459fSDaniel Fojt /* Free the storage held by the components of a dfa. */
3775*09d4459fSDaniel Fojt void
dfafree(struct dfa * d)3776*09d4459fSDaniel Fojt dfafree (struct dfa *d)
3777*09d4459fSDaniel Fojt {
3778*09d4459fSDaniel Fojt free (d->charclasses);
3779*09d4459fSDaniel Fojt free (d->tokens);
3780*09d4459fSDaniel Fojt
3781*09d4459fSDaniel Fojt if (d->localeinfo.multibyte)
3782*09d4459fSDaniel Fojt free_mbdata (d);
3783*09d4459fSDaniel Fojt
3784*09d4459fSDaniel Fojt free (d->constraints);
3785*09d4459fSDaniel Fojt free (d->separates);
3786*09d4459fSDaniel Fojt
3787*09d4459fSDaniel Fojt for (idx_t i = 0; i < d->sindex; i++)
3788*09d4459fSDaniel Fojt {
3789*09d4459fSDaniel Fojt free (d->states[i].elems.elems);
3790*09d4459fSDaniel Fojt free (d->states[i].mbps.elems);
3791*09d4459fSDaniel Fojt }
3792*09d4459fSDaniel Fojt free (d->states);
3793*09d4459fSDaniel Fojt
3794*09d4459fSDaniel Fojt if (d->follows)
3795*09d4459fSDaniel Fojt {
3796*09d4459fSDaniel Fojt for (idx_t i = 0; i < d->tindex; i++)
3797*09d4459fSDaniel Fojt free (d->follows[i].elems);
3798*09d4459fSDaniel Fojt free (d->follows);
3799*09d4459fSDaniel Fojt }
3800*09d4459fSDaniel Fojt
3801*09d4459fSDaniel Fojt if (d->trans)
3802*09d4459fSDaniel Fojt {
3803*09d4459fSDaniel Fojt for (idx_t i = 0; i < d->tralloc; i++)
3804*09d4459fSDaniel Fojt {
3805*09d4459fSDaniel Fojt free (d->trans[i]);
3806*09d4459fSDaniel Fojt free (d->fails[i]);
3807*09d4459fSDaniel Fojt }
3808*09d4459fSDaniel Fojt
3809*09d4459fSDaniel Fojt free (d->trans - 2);
3810*09d4459fSDaniel Fojt free (d->fails);
3811*09d4459fSDaniel Fojt free (d->newlines);
3812*09d4459fSDaniel Fojt free (d->success);
3813*09d4459fSDaniel Fojt }
3814*09d4459fSDaniel Fojt
3815*09d4459fSDaniel Fojt if (d->superset)
3816*09d4459fSDaniel Fojt {
3817*09d4459fSDaniel Fojt dfafree (d->superset);
3818*09d4459fSDaniel Fojt free (d->superset);
3819*09d4459fSDaniel Fojt }
3820*09d4459fSDaniel Fojt }
3821*09d4459fSDaniel Fojt
3822*09d4459fSDaniel Fojt /* Having found the postfix representation of the regular expression,
3823*09d4459fSDaniel Fojt try to find a long sequence of characters that must appear in any line
3824*09d4459fSDaniel Fojt containing the r.e.
3825*09d4459fSDaniel Fojt Finding a "longest" sequence is beyond the scope here;
3826*09d4459fSDaniel Fojt we take an easy way out and hope for the best.
3827*09d4459fSDaniel Fojt (Take "(ab|a)b"--please.)
3828*09d4459fSDaniel Fojt
3829*09d4459fSDaniel Fojt We do a bottom-up calculation of sequences of characters that must appear
3830*09d4459fSDaniel Fojt in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3831*09d4459fSDaniel Fojt representation:
3832*09d4459fSDaniel Fojt sequences that must appear at the left of the match ("left")
3833*09d4459fSDaniel Fojt sequences that must appear at the right of the match ("right")
3834*09d4459fSDaniel Fojt lists of sequences that must appear somewhere in the match ("in")
3835*09d4459fSDaniel Fojt sequences that must constitute the match ("is")
3836*09d4459fSDaniel Fojt
3837*09d4459fSDaniel Fojt When we get to the root of the tree, we use one of the longest of its
3838*09d4459fSDaniel Fojt calculated "in" sequences as our answer.
3839*09d4459fSDaniel Fojt
3840*09d4459fSDaniel Fojt The sequences calculated for the various types of node (in pseudo ANSI c)
3841*09d4459fSDaniel Fojt are shown below. "p" is the operand of unary operators (and the left-hand
3842*09d4459fSDaniel Fojt operand of binary operators); "q" is the right-hand operand of binary
3843*09d4459fSDaniel Fojt operators.
3844*09d4459fSDaniel Fojt
3845*09d4459fSDaniel Fojt "ZERO" means "a zero-length sequence" below.
3846*09d4459fSDaniel Fojt
   Type         left            right           is              in
   ----         ----            -----           --              --
   char c       # c             # c             # c             # c

   ANYCHAR      ZERO            ZERO            ZERO            ZERO

   MBCSET       ZERO            ZERO            ZERO            ZERO

   CSET         ZERO            ZERO            ZERO            ZERO

   STAR         ZERO            ZERO            ZERO            ZERO

   QMARK        ZERO            ZERO            ZERO            ZERO

   PLUS         p->left         p->right        ZERO            p->in

   CAT          (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
                p->left :       q->right :      q->is!=ZERO) ?  q->in plus
                p->is##q->left  p->right##q->is p->is##q->is :  p->right##q->left
                                                ZERO

   OR           longest common  longest common  (do p->is and   substrings common
                leading         trailing        q->is have same to p->in and
                (sub)sequence   (sub)sequence   length and      q->in
                of p->left      of p->right     content) ?
                and q->left     and q->right    p->is : NULL
3873*09d4459fSDaniel Fojt
3874*09d4459fSDaniel Fojt If there's anything else we recognize in the tree, all four sequences get set
3875*09d4459fSDaniel Fojt to zero-length sequences. If there's something we don't recognize in the
3876*09d4459fSDaniel Fojt tree, we just return a zero-length sequence.
3877*09d4459fSDaniel Fojt
3878*09d4459fSDaniel Fojt Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3879*09d4459fSDaniel Fojt 'aaa')?
3880*09d4459fSDaniel Fojt
3881*09d4459fSDaniel Fojt And ... is it here or someplace that we might ponder "optimizations" such as
3882*09d4459fSDaniel Fojt egrep 'psi|epsilon' -> egrep 'psi'
3883*09d4459fSDaniel Fojt egrep 'pepsi|epsilon' -> egrep 'epsi'
3884*09d4459fSDaniel Fojt (Yes, we now find "epsi" as a "string
3885*09d4459fSDaniel Fojt that must occur", but we might also
3886*09d4459fSDaniel Fojt simplify the *entire* r.e. being sought)
3887*09d4459fSDaniel Fojt grep '[c]' -> grep 'c'
3888*09d4459fSDaniel Fojt grep '(ab|a)b' -> grep 'ab'
3889*09d4459fSDaniel Fojt grep 'ab*' -> grep 'a'
3890*09d4459fSDaniel Fojt grep 'a*b' -> grep 'b'
3891*09d4459fSDaniel Fojt
3892*09d4459fSDaniel Fojt There are several issues:
3893*09d4459fSDaniel Fojt
3894*09d4459fSDaniel Fojt Is optimization easy (enough)?
3895*09d4459fSDaniel Fojt
3896*09d4459fSDaniel Fojt Does optimization actually accomplish anything,
3897*09d4459fSDaniel Fojt or is the automaton you get from "psi|epsilon" (for example)
3898*09d4459fSDaniel Fojt the same as the one you get from "psi" (for example)?
3899*09d4459fSDaniel Fojt
   Are optimizable r.e.'s likely to be used in real-life situations
   (something like 'ab*' is probably unlikely; something like
   'psi|epsilon' is likelier)?  */
3903*09d4459fSDaniel Fojt
3904*09d4459fSDaniel Fojt static char *
icatalloc(char * old,char const * new)3905*09d4459fSDaniel Fojt icatalloc (char *old, char const *new)
3906*09d4459fSDaniel Fojt {
3907*09d4459fSDaniel Fojt idx_t newsize = strlen (new);
3908*09d4459fSDaniel Fojt if (newsize == 0)
3909*09d4459fSDaniel Fojt return old;
3910*09d4459fSDaniel Fojt idx_t oldsize = strlen (old);
3911*09d4459fSDaniel Fojt char *result = xrealloc (old, oldsize + newsize + 1);
3912*09d4459fSDaniel Fojt memcpy (result + oldsize, new, newsize + 1);
3913*09d4459fSDaniel Fojt return result;
3914*09d4459fSDaniel Fojt }
3915*09d4459fSDaniel Fojt
/* Free every string in the null-terminated list CPP.  The array
   holding the pointers is neither freed nor modified.  */
static void
freelist (char **cpp)
{
  for (char **entry = cpp; *entry != NULL; entry++)
    free (*entry);
}
3922*09d4459fSDaniel Fojt
3923*09d4459fSDaniel Fojt static char **
enlist(char ** cpp,char * new,idx_t len)3924*09d4459fSDaniel Fojt enlist (char **cpp, char *new, idx_t len)
3925*09d4459fSDaniel Fojt {
3926*09d4459fSDaniel Fojt new = memcpy (xmalloc (len + 1), new, len);
3927*09d4459fSDaniel Fojt new[len] = '\0';
3928*09d4459fSDaniel Fojt /* Is there already something in the list that's new (or longer)? */
3929*09d4459fSDaniel Fojt idx_t i;
3930*09d4459fSDaniel Fojt for (i = 0; cpp[i] != NULL; i++)
3931*09d4459fSDaniel Fojt if (strstr (cpp[i], new) != NULL)
3932*09d4459fSDaniel Fojt {
3933*09d4459fSDaniel Fojt free (new);
3934*09d4459fSDaniel Fojt return cpp;
3935*09d4459fSDaniel Fojt }
3936*09d4459fSDaniel Fojt /* Eliminate any obsoleted strings. */
3937*09d4459fSDaniel Fojt for (idx_t j = 0; cpp[j] != NULL; )
3938*09d4459fSDaniel Fojt if (strstr (new, cpp[j]) == NULL)
3939*09d4459fSDaniel Fojt ++j;
3940*09d4459fSDaniel Fojt else
3941*09d4459fSDaniel Fojt {
3942*09d4459fSDaniel Fojt free (cpp[j]);
3943*09d4459fSDaniel Fojt if (--i == j)
3944*09d4459fSDaniel Fojt break;
3945*09d4459fSDaniel Fojt cpp[j] = cpp[i];
3946*09d4459fSDaniel Fojt cpp[i] = NULL;
3947*09d4459fSDaniel Fojt }
3948*09d4459fSDaniel Fojt /* Add the new string. */
3949*09d4459fSDaniel Fojt cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
3950*09d4459fSDaniel Fojt cpp[i] = new;
3951*09d4459fSDaniel Fojt cpp[i + 1] = NULL;
3952*09d4459fSDaniel Fojt return cpp;
3953*09d4459fSDaniel Fojt }
3954*09d4459fSDaniel Fojt
3955*09d4459fSDaniel Fojt /* Given pointers to two strings, return a pointer to an allocated
3956*09d4459fSDaniel Fojt list of their distinct common substrings. */
3957*09d4459fSDaniel Fojt static char **
comsubs(char * left,char const * right)3958*09d4459fSDaniel Fojt comsubs (char *left, char const *right)
3959*09d4459fSDaniel Fojt {
3960*09d4459fSDaniel Fojt char **cpp = xzalloc (sizeof *cpp);
3961*09d4459fSDaniel Fojt
3962*09d4459fSDaniel Fojt for (char *lcp = left; *lcp != '\0'; lcp++)
3963*09d4459fSDaniel Fojt {
3964*09d4459fSDaniel Fojt idx_t len = 0;
3965*09d4459fSDaniel Fojt char *rcp = strchr (right, *lcp);
3966*09d4459fSDaniel Fojt while (rcp != NULL)
3967*09d4459fSDaniel Fojt {
3968*09d4459fSDaniel Fojt idx_t i;
3969*09d4459fSDaniel Fojt for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3970*09d4459fSDaniel Fojt continue;
3971*09d4459fSDaniel Fojt if (i > len)
3972*09d4459fSDaniel Fojt len = i;
3973*09d4459fSDaniel Fojt rcp = strchr (rcp + 1, *lcp);
3974*09d4459fSDaniel Fojt }
3975*09d4459fSDaniel Fojt if (len != 0)
3976*09d4459fSDaniel Fojt cpp = enlist (cpp, lcp, len);
3977*09d4459fSDaniel Fojt }
3978*09d4459fSDaniel Fojt return cpp;
3979*09d4459fSDaniel Fojt }
3980*09d4459fSDaniel Fojt
3981*09d4459fSDaniel Fojt static char **
addlists(char ** old,char ** new)3982*09d4459fSDaniel Fojt addlists (char **old, char **new)
3983*09d4459fSDaniel Fojt {
3984*09d4459fSDaniel Fojt for (; *new; new++)
3985*09d4459fSDaniel Fojt old = enlist (old, *new, strlen (*new));
3986*09d4459fSDaniel Fojt return old;
3987*09d4459fSDaniel Fojt }
3988*09d4459fSDaniel Fojt
/* Given two null-terminated lists of strings, return a newly
   allocated list of the substrings common to some string of LEFT and
   some string of RIGHT.  Neither input list is modified.  */
static char **
inboth (char **left, char **right)
{
  char **common = xzalloc (sizeof *common);

  for (char **lp = left; *lp != NULL; lp++)
    for (char **rp = right; *rp != NULL; rp++)
      {
        /* Common substrings of this one pair, merged then discarded.  */
        char **pair = comsubs (*lp, *rp);
        common = addlists (common, pair);
        freelist (pair);
        free (pair);
      }

  return common;
}
4008*09d4459fSDaniel Fojt
/* One entry of the stack used by dfamust.  Each entry describes the
   strings that must appear in matches of one subexpression.  */
typedef struct must
{
  char **in;            /* Null-terminated list of strings that must
                           appear somewhere in the match.  */
  char *left;           /* String that must appear at the left of the
                           match.  */
  char *right;          /* String that must appear at the right of the
                           match.  */
  char *is;             /* String that must constitute the match.  */
  bool begline;         /* Begin-of-line flag for this entry.  */
  bool endline;         /* End-of-line flag for this entry.  */
  struct must *prev;    /* Entry below this one on the stack.  */
} must;
4021*09d4459fSDaniel Fojt
4022*09d4459fSDaniel Fojt static must *
allocmust(must * mp,idx_t size)4023*09d4459fSDaniel Fojt allocmust (must *mp, idx_t size)
4024*09d4459fSDaniel Fojt {
4025*09d4459fSDaniel Fojt must *new_mp = xmalloc (sizeof *new_mp);
4026*09d4459fSDaniel Fojt new_mp->in = xzalloc (sizeof *new_mp->in);
4027*09d4459fSDaniel Fojt new_mp->left = xzalloc (size);
4028*09d4459fSDaniel Fojt new_mp->right = xzalloc (size);
4029*09d4459fSDaniel Fojt new_mp->is = xzalloc (size);
4030*09d4459fSDaniel Fojt new_mp->begline = false;
4031*09d4459fSDaniel Fojt new_mp->endline = false;
4032*09d4459fSDaniel Fojt new_mp->prev = mp;
4033*09d4459fSDaniel Fojt return new_mp;
4034*09d4459fSDaniel Fojt }
4035*09d4459fSDaniel Fojt
4036*09d4459fSDaniel Fojt static void
resetmust(must * mp)4037*09d4459fSDaniel Fojt resetmust (must *mp)
4038*09d4459fSDaniel Fojt {
4039*09d4459fSDaniel Fojt freelist (mp->in);
4040*09d4459fSDaniel Fojt mp->in[0] = NULL;
4041*09d4459fSDaniel Fojt mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4042*09d4459fSDaniel Fojt mp->begline = false;
4043*09d4459fSDaniel Fojt mp->endline = false;
4044*09d4459fSDaniel Fojt }
4045*09d4459fSDaniel Fojt
4046*09d4459fSDaniel Fojt static void
freemust(must * mp)4047*09d4459fSDaniel Fojt freemust (must *mp)
4048*09d4459fSDaniel Fojt {
4049*09d4459fSDaniel Fojt freelist (mp->in);
4050*09d4459fSDaniel Fojt free (mp->in);
4051*09d4459fSDaniel Fojt free (mp->left);
4052*09d4459fSDaniel Fojt free (mp->right);
4053*09d4459fSDaniel Fojt free (mp->is);
4054*09d4459fSDaniel Fojt free (mp);
4055*09d4459fSDaniel Fojt }
4056*09d4459fSDaniel Fojt
4057*09d4459fSDaniel Fojt struct dfamust *
dfamust(struct dfa const * d)4058*09d4459fSDaniel Fojt dfamust (struct dfa const *d)
4059*09d4459fSDaniel Fojt {
4060*09d4459fSDaniel Fojt must *mp = NULL;
4061*09d4459fSDaniel Fojt char const *result = "";
4062*09d4459fSDaniel Fojt bool exact = false;
4063*09d4459fSDaniel Fojt bool begline = false;
4064*09d4459fSDaniel Fojt bool endline = false;
4065*09d4459fSDaniel Fojt bool need_begline = false;
4066*09d4459fSDaniel Fojt bool need_endline = false;
4067*09d4459fSDaniel Fojt bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4068*09d4459fSDaniel Fojt
4069*09d4459fSDaniel Fojt for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4070*09d4459fSDaniel Fojt {
4071*09d4459fSDaniel Fojt token t = d->tokens[ri];
4072*09d4459fSDaniel Fojt switch (t)
4073*09d4459fSDaniel Fojt {
4074*09d4459fSDaniel Fojt case BEGLINE:
4075*09d4459fSDaniel Fojt mp = allocmust (mp, 2);
4076*09d4459fSDaniel Fojt mp->begline = true;
4077*09d4459fSDaniel Fojt need_begline = true;
4078*09d4459fSDaniel Fojt break;
4079*09d4459fSDaniel Fojt case ENDLINE:
4080*09d4459fSDaniel Fojt mp = allocmust (mp, 2);
4081*09d4459fSDaniel Fojt mp->endline = true;
4082*09d4459fSDaniel Fojt need_endline = true;
4083*09d4459fSDaniel Fojt break;
4084*09d4459fSDaniel Fojt case LPAREN:
4085*09d4459fSDaniel Fojt case RPAREN:
4086*09d4459fSDaniel Fojt assert (!"neither LPAREN nor RPAREN may appear here");
4087*09d4459fSDaniel Fojt
4088*09d4459fSDaniel Fojt case EMPTY:
4089*09d4459fSDaniel Fojt case BEGWORD:
4090*09d4459fSDaniel Fojt case ENDWORD:
4091*09d4459fSDaniel Fojt case LIMWORD:
4092*09d4459fSDaniel Fojt case NOTLIMWORD:
4093*09d4459fSDaniel Fojt case BACKREF:
4094*09d4459fSDaniel Fojt case ANYCHAR:
4095*09d4459fSDaniel Fojt case MBCSET:
4096*09d4459fSDaniel Fojt mp = allocmust (mp, 2);
4097*09d4459fSDaniel Fojt break;
4098*09d4459fSDaniel Fojt
4099*09d4459fSDaniel Fojt case STAR:
4100*09d4459fSDaniel Fojt case QMARK:
4101*09d4459fSDaniel Fojt resetmust (mp);
4102*09d4459fSDaniel Fojt break;
4103*09d4459fSDaniel Fojt
4104*09d4459fSDaniel Fojt case OR:
4105*09d4459fSDaniel Fojt {
4106*09d4459fSDaniel Fojt char **new;
4107*09d4459fSDaniel Fojt must *rmp = mp;
4108*09d4459fSDaniel Fojt must *lmp = mp = mp->prev;
4109*09d4459fSDaniel Fojt idx_t j, ln, rn, n;
4110*09d4459fSDaniel Fojt
4111*09d4459fSDaniel Fojt /* Guaranteed to be. Unlikely, but ... */
4112*09d4459fSDaniel Fojt if (streq (lmp->is, rmp->is))
4113*09d4459fSDaniel Fojt {
4114*09d4459fSDaniel Fojt lmp->begline &= rmp->begline;
4115*09d4459fSDaniel Fojt lmp->endline &= rmp->endline;
4116*09d4459fSDaniel Fojt }
4117*09d4459fSDaniel Fojt else
4118*09d4459fSDaniel Fojt {
4119*09d4459fSDaniel Fojt lmp->is[0] = '\0';
4120*09d4459fSDaniel Fojt lmp->begline = false;
4121*09d4459fSDaniel Fojt lmp->endline = false;
4122*09d4459fSDaniel Fojt }
4123*09d4459fSDaniel Fojt /* Left side--easy */
4124*09d4459fSDaniel Fojt idx_t i = 0;
4125*09d4459fSDaniel Fojt while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4126*09d4459fSDaniel Fojt ++i;
4127*09d4459fSDaniel Fojt lmp->left[i] = '\0';
4128*09d4459fSDaniel Fojt /* Right side */
4129*09d4459fSDaniel Fojt ln = strlen (lmp->right);
4130*09d4459fSDaniel Fojt rn = strlen (rmp->right);
4131*09d4459fSDaniel Fojt n = ln;
4132*09d4459fSDaniel Fojt if (n > rn)
4133*09d4459fSDaniel Fojt n = rn;
4134*09d4459fSDaniel Fojt for (i = 0; i < n; ++i)
4135*09d4459fSDaniel Fojt if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4136*09d4459fSDaniel Fojt break;
4137*09d4459fSDaniel Fojt for (j = 0; j < i; ++j)
4138*09d4459fSDaniel Fojt lmp->right[j] = lmp->right[(ln - i) + j];
4139*09d4459fSDaniel Fojt lmp->right[j] = '\0';
4140*09d4459fSDaniel Fojt new = inboth (lmp->in, rmp->in);
4141*09d4459fSDaniel Fojt freelist (lmp->in);
4142*09d4459fSDaniel Fojt free (lmp->in);
4143*09d4459fSDaniel Fojt lmp->in = new;
4144*09d4459fSDaniel Fojt freemust (rmp);
4145*09d4459fSDaniel Fojt }
4146*09d4459fSDaniel Fojt break;
4147*09d4459fSDaniel Fojt
4148*09d4459fSDaniel Fojt case PLUS:
4149*09d4459fSDaniel Fojt mp->is[0] = '\0';
4150*09d4459fSDaniel Fojt break;
4151*09d4459fSDaniel Fojt
4152*09d4459fSDaniel Fojt case END:
4153*09d4459fSDaniel Fojt assert (!mp->prev);
4154*09d4459fSDaniel Fojt for (idx_t i = 0; mp->in[i] != NULL; i++)
4155*09d4459fSDaniel Fojt if (strlen (mp->in[i]) > strlen (result))
4156*09d4459fSDaniel Fojt result = mp->in[i];
4157*09d4459fSDaniel Fojt if (streq (result, mp->is))
4158*09d4459fSDaniel Fojt {
4159*09d4459fSDaniel Fojt if ((!need_begline || mp->begline) && (!need_endline
4160*09d4459fSDaniel Fojt || mp->endline))
4161*09d4459fSDaniel Fojt exact = true;
4162*09d4459fSDaniel Fojt begline = mp->begline;
4163*09d4459fSDaniel Fojt endline = mp->endline;
4164*09d4459fSDaniel Fojt }
4165*09d4459fSDaniel Fojt goto done;
4166*09d4459fSDaniel Fojt
4167*09d4459fSDaniel Fojt case CAT:
4168*09d4459fSDaniel Fojt {
4169*09d4459fSDaniel Fojt must *rmp = mp;
4170*09d4459fSDaniel Fojt must *lmp = mp = mp->prev;
4171*09d4459fSDaniel Fojt
4172*09d4459fSDaniel Fojt /* In. Everything in left, plus everything in
4173*09d4459fSDaniel Fojt right, plus concatenation of
4174*09d4459fSDaniel Fojt left's right and right's left. */
4175*09d4459fSDaniel Fojt lmp->in = addlists (lmp->in, rmp->in);
4176*09d4459fSDaniel Fojt if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4177*09d4459fSDaniel Fojt {
4178*09d4459fSDaniel Fojt idx_t lrlen = strlen (lmp->right);
4179*09d4459fSDaniel Fojt idx_t rllen = strlen (rmp->left);
4180*09d4459fSDaniel Fojt char *tp = xmalloc (lrlen + rllen);
4181*09d4459fSDaniel Fojt memcpy (tp, lmp->right, lrlen);
4182*09d4459fSDaniel Fojt memcpy (tp + lrlen, rmp->left, rllen);
4183*09d4459fSDaniel Fojt lmp->in = enlist (lmp->in, tp, lrlen + rllen);
4184*09d4459fSDaniel Fojt free (tp);
4185*09d4459fSDaniel Fojt }
4186*09d4459fSDaniel Fojt /* Left-hand */
4187*09d4459fSDaniel Fojt if (lmp->is[0] != '\0')
4188*09d4459fSDaniel Fojt lmp->left = icatalloc (lmp->left, rmp->left);
4189*09d4459fSDaniel Fojt /* Right-hand */
4190*09d4459fSDaniel Fojt if (rmp->is[0] == '\0')
4191*09d4459fSDaniel Fojt lmp->right[0] = '\0';
4192*09d4459fSDaniel Fojt lmp->right = icatalloc (lmp->right, rmp->right);
4193*09d4459fSDaniel Fojt /* Guaranteed to be */
4194*09d4459fSDaniel Fojt if ((lmp->is[0] != '\0' || lmp->begline)
4195*09d4459fSDaniel Fojt && (rmp->is[0] != '\0' || rmp->endline))
4196*09d4459fSDaniel Fojt {
4197*09d4459fSDaniel Fojt lmp->is = icatalloc (lmp->is, rmp->is);
4198*09d4459fSDaniel Fojt lmp->endline = rmp->endline;
4199*09d4459fSDaniel Fojt }
4200*09d4459fSDaniel Fojt else
4201*09d4459fSDaniel Fojt {
4202*09d4459fSDaniel Fojt lmp->is[0] = '\0';
4203*09d4459fSDaniel Fojt lmp->begline = false;
4204*09d4459fSDaniel Fojt lmp->endline = false;
4205*09d4459fSDaniel Fojt }
4206*09d4459fSDaniel Fojt freemust (rmp);
4207*09d4459fSDaniel Fojt }
4208*09d4459fSDaniel Fojt break;
4209*09d4459fSDaniel Fojt
4210*09d4459fSDaniel Fojt case '\0':
4211*09d4459fSDaniel Fojt /* Not on *my* shift. */
4212*09d4459fSDaniel Fojt goto done;
4213*09d4459fSDaniel Fojt
4214*09d4459fSDaniel Fojt default:
4215*09d4459fSDaniel Fojt if (CSET <= t)
4216*09d4459fSDaniel Fojt {
4217*09d4459fSDaniel Fojt /* If T is a singleton, or if case-folding in a unibyte
4218*09d4459fSDaniel Fojt locale and T's members all case-fold to the same char,
4219*09d4459fSDaniel Fojt convert T to one of its members. Otherwise, do
4220*09d4459fSDaniel Fojt nothing further with T. */
4221*09d4459fSDaniel Fojt charclass *ccl = &d->charclasses[t - CSET];
4222*09d4459fSDaniel Fojt int j;
4223*09d4459fSDaniel Fojt for (j = 0; j < NOTCHAR; j++)
4224*09d4459fSDaniel Fojt if (tstbit (j, ccl))
4225*09d4459fSDaniel Fojt break;
4226*09d4459fSDaniel Fojt if (! (j < NOTCHAR))
4227*09d4459fSDaniel Fojt {
4228*09d4459fSDaniel Fojt mp = allocmust (mp, 2);
4229*09d4459fSDaniel Fojt break;
4230*09d4459fSDaniel Fojt }
4231*09d4459fSDaniel Fojt t = j;
4232*09d4459fSDaniel Fojt while (++j < NOTCHAR)
4233*09d4459fSDaniel Fojt if (tstbit (j, ccl)
4234*09d4459fSDaniel Fojt && ! (case_fold_unibyte
4235*09d4459fSDaniel Fojt && toupper (j) == toupper (t)))
4236*09d4459fSDaniel Fojt break;
4237*09d4459fSDaniel Fojt if (j < NOTCHAR)
4238*09d4459fSDaniel Fojt {
4239*09d4459fSDaniel Fojt mp = allocmust (mp, 2);
4240*09d4459fSDaniel Fojt break;
4241*09d4459fSDaniel Fojt }
4242*09d4459fSDaniel Fojt }
4243*09d4459fSDaniel Fojt
4244*09d4459fSDaniel Fojt idx_t rj = ri + 2;
4245*09d4459fSDaniel Fojt if (d->tokens[ri + 1] == CAT)
4246*09d4459fSDaniel Fojt {
4247*09d4459fSDaniel Fojt for (; rj < d->tindex - 1; rj += 2)
4248*09d4459fSDaniel Fojt {
4249*09d4459fSDaniel Fojt if ((rj != ri && (d->tokens[rj] <= 0
4250*09d4459fSDaniel Fojt || NOTCHAR <= d->tokens[rj]))
4251*09d4459fSDaniel Fojt || d->tokens[rj + 1] != CAT)
4252*09d4459fSDaniel Fojt break;
4253*09d4459fSDaniel Fojt }
4254*09d4459fSDaniel Fojt }
4255*09d4459fSDaniel Fojt mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4256*09d4459fSDaniel Fojt mp->is[0] = mp->left[0] = mp->right[0]
4257*09d4459fSDaniel Fojt = case_fold_unibyte ? toupper (t) : t;
4258*09d4459fSDaniel Fojt
4259*09d4459fSDaniel Fojt idx_t i;
4260*09d4459fSDaniel Fojt for (i = 1; ri + 2 < rj; i++)
4261*09d4459fSDaniel Fojt {
4262*09d4459fSDaniel Fojt ri += 2;
4263*09d4459fSDaniel Fojt t = d->tokens[ri];
4264*09d4459fSDaniel Fojt mp->is[i] = mp->left[i] = mp->right[i]
4265*09d4459fSDaniel Fojt = case_fold_unibyte ? toupper (t) : t;
4266*09d4459fSDaniel Fojt }
4267*09d4459fSDaniel Fojt mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4268*09d4459fSDaniel Fojt mp->in = enlist (mp->in, mp->is, i);
4269*09d4459fSDaniel Fojt break;
4270*09d4459fSDaniel Fojt }
4271*09d4459fSDaniel Fojt }
4272*09d4459fSDaniel Fojt done:;
4273*09d4459fSDaniel Fojt
4274*09d4459fSDaniel Fojt struct dfamust *dm = NULL;
4275*09d4459fSDaniel Fojt if (*result)
4276*09d4459fSDaniel Fojt {
4277*09d4459fSDaniel Fojt dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4278*09d4459fSDaniel Fojt dm->exact = exact;
4279*09d4459fSDaniel Fojt dm->begline = begline;
4280*09d4459fSDaniel Fojt dm->endline = endline;
4281*09d4459fSDaniel Fojt strcpy (dm->must, result);
4282*09d4459fSDaniel Fojt }
4283*09d4459fSDaniel Fojt
4284*09d4459fSDaniel Fojt while (mp)
4285*09d4459fSDaniel Fojt {
4286*09d4459fSDaniel Fojt must *prev = mp->prev;
4287*09d4459fSDaniel Fojt freemust (mp);
4288*09d4459fSDaniel Fojt mp = prev;
4289*09d4459fSDaniel Fojt }
4290*09d4459fSDaniel Fojt
4291*09d4459fSDaniel Fojt return dm;
4292*09d4459fSDaniel Fojt }
4293*09d4459fSDaniel Fojt
/* Release DM, a value previously returned by dfamust.  A null DM is
   accepted, since it is passed straight to free.  */
void
dfamustfree (struct dfamust *dm)
{
  free (dm);
}
4299*09d4459fSDaniel Fojt
4300*09d4459fSDaniel Fojt struct dfa *
dfaalloc(void)4301*09d4459fSDaniel Fojt dfaalloc (void)
4302*09d4459fSDaniel Fojt {
4303*09d4459fSDaniel Fojt return xmalloc (sizeof (struct dfa));
4304*09d4459fSDaniel Fojt }
4305*09d4459fSDaniel Fojt
4306*09d4459fSDaniel Fojt /* Initialize DFA. */
4307*09d4459fSDaniel Fojt void
dfasyntax(struct dfa * dfa,struct localeinfo const * linfo,reg_syntax_t bits,int dfaopts)4308*09d4459fSDaniel Fojt dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4309*09d4459fSDaniel Fojt reg_syntax_t bits, int dfaopts)
4310*09d4459fSDaniel Fojt {
4311*09d4459fSDaniel Fojt memset (dfa, 0, offsetof (struct dfa, dfaexec));
4312*09d4459fSDaniel Fojt dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4313*09d4459fSDaniel Fojt dfa->localeinfo = *linfo;
4314*09d4459fSDaniel Fojt
4315*09d4459fSDaniel Fojt dfa->fast = !dfa->localeinfo.multibyte;
4316*09d4459fSDaniel Fojt
4317*09d4459fSDaniel Fojt dfa->canychar = -1;
4318*09d4459fSDaniel Fojt dfa->syntax.syntax_bits_set = true;
4319*09d4459fSDaniel Fojt dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4320*09d4459fSDaniel Fojt dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4321*09d4459fSDaniel Fojt dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4322*09d4459fSDaniel Fojt dfa->syntax.syntax_bits = bits;
4323*09d4459fSDaniel Fojt
4324*09d4459fSDaniel Fojt for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4325*09d4459fSDaniel Fojt {
4326*09d4459fSDaniel Fojt unsigned char uc = i;
4327*09d4459fSDaniel Fojt
4328*09d4459fSDaniel Fojt dfa->syntax.sbit[uc] = char_context (dfa, uc);
4329*09d4459fSDaniel Fojt switch (dfa->syntax.sbit[uc])
4330*09d4459fSDaniel Fojt {
4331*09d4459fSDaniel Fojt case CTX_LETTER:
4332*09d4459fSDaniel Fojt setbit (uc, &dfa->syntax.letters);
4333*09d4459fSDaniel Fojt break;
4334*09d4459fSDaniel Fojt case CTX_NEWLINE:
4335*09d4459fSDaniel Fojt setbit (uc, &dfa->syntax.newline);
4336*09d4459fSDaniel Fojt break;
4337*09d4459fSDaniel Fojt }
4338*09d4459fSDaniel Fojt
4339*09d4459fSDaniel Fojt /* POSIX requires that the five bytes in "\n\r./" (including the
4340*09d4459fSDaniel Fojt terminating NUL) cannot occur inside a multibyte character. */
4341*09d4459fSDaniel Fojt dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4342*09d4459fSDaniel Fojt ? (uc & 0xc0) != 0x80
4343*09d4459fSDaniel Fojt : strchr ("\n\r./", uc) != NULL);
4344*09d4459fSDaniel Fojt }
4345*09d4459fSDaniel Fojt }
4346*09d4459fSDaniel Fojt
4347*09d4459fSDaniel Fojt /* Initialize TO by copying FROM's syntax settings. */
4348*09d4459fSDaniel Fojt void
dfacopysyntax(struct dfa * to,struct dfa const * from)4349*09d4459fSDaniel Fojt dfacopysyntax (struct dfa *to, struct dfa const *from)
4350*09d4459fSDaniel Fojt {
4351*09d4459fSDaniel Fojt memset (to, 0, offsetof (struct dfa, syntax));
4352*09d4459fSDaniel Fojt to->canychar = -1;
4353*09d4459fSDaniel Fojt to->fast = from->fast;
4354*09d4459fSDaniel Fojt to->syntax = from->syntax;
4355*09d4459fSDaniel Fojt to->dfaexec = from->dfaexec;
4356*09d4459fSDaniel Fojt to->localeinfo = from->localeinfo;
4357*09d4459fSDaniel Fojt }
4358*09d4459fSDaniel Fojt
4359*09d4459fSDaniel Fojt /* vim:set shiftwidth=2: */
4360