xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/libgrep/dfa.h (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* dfa.h - declarations for GNU deterministic regexp compiler
2    Copyright (C) 1988, 1998, 2005 Free Software Foundation, Inc.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 2, or (at your option)
7    any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software Foundation,
16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
17 
18 /* Written June, 1988 by Mike Haertel */
19 
20 /* FIXME:
21    2.  We should not export so much of the DFA internals.
22    In addition to clobbering modularity, we eat up valuable
23    name space. */
24 
25 #include "regex.h"
26 
27 typedef void * ptr_t;
28 
29 /* Number of bits in an unsigned char. */
30 #ifndef CHARBITS
31 #define CHARBITS 8
32 #endif
33 
34 /* First integer value that is greater than any character code. */
35 #define NOTCHAR (1 << CHARBITS)
36 
37 /* INTBITS need not be exact, just a lower bound. */
38 #ifndef INTBITS
39 #define INTBITS (CHARBITS * sizeof (int))
40 #endif
41 
42 /* Number of ints required to hold a bit for every character. */
43 #define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
44 
45 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
46 typedef int charclass[CHARCLASS_INTS];
47 
48 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
49    are operators and others are terminal symbols.  Most (but not all) of these
50    codes are returned by the lexical analyzer. */
51 
52 typedef enum
53 {
54   END = -1,			/* END is a terminal symbol that matches the
55 				   end of input; any value of END or less in
56 				   the parse tree is such a symbol.  Accepting
57 				   states of the DFA are those that would have
58 				   a transition on END. */
59 
60   /* Ordinary character values are terminal symbols that match themselves. */
61 
62   EMPTY = NOTCHAR,		/* EMPTY is a terminal symbol that matches
63 				   the empty string. */
64 
65   BACKREF,			/* BACKREF is generated by \<digit>; it
66 				   it not completely handled.  If the scanner
67 				   detects a transition on backref, it returns
68 				   a kind of "semi-success" indicating that
69 				   the match will have to be verified with
70 				   a backtracking matcher. */
71 
72   BEGLINE,			/* BEGLINE is a terminal symbol that matches
73 				   the empty string if it is at the beginning
74 				   of a line. */
75 
76   ENDLINE,			/* ENDLINE is a terminal symbol that matches
77 				   the empty string if it is at the end of
78 				   a line. */
79 
80   BEGWORD,			/* BEGWORD is a terminal symbol that matches
81 				   the empty string if it is at the beginning
82 				   of a word. */
83 
84   ENDWORD,			/* ENDWORD is a terminal symbol that matches
85 				   the empty string if it is at the end of
86 				   a word. */
87 
88   LIMWORD,			/* LIMWORD is a terminal symbol that matches
89 				   the empty string if it is at the beginning
90 				   or the end of a word. */
91 
92   NOTLIMWORD,			/* NOTLIMWORD is a terminal symbol that
93 				   matches the empty string if it is not at
94 				   the beginning or end of a word. */
95 
96   QMARK,			/* QMARK is an operator of one argument that
97 				   matches zero or one occurences of its
98 				   argument. */
99 
100   STAR,				/* STAR is an operator of one argument that
101 				   matches the Kleene closure (zero or more
102 				   occurrences) of its argument. */
103 
104   PLUS,				/* PLUS is an operator of one argument that
105 				   matches the positive closure (one or more
106 				   occurrences) of its argument. */
107 
108   REPMN,			/* REPMN is a lexical token corresponding
109 				   to the {m,n} construct.  REPMN never
110 				   appears in the compiled token vector. */
111 
112   CAT,				/* CAT is an operator of two arguments that
113 				   matches the concatenation of its
114 				   arguments.  CAT is never returned by the
115 				   lexical analyzer. */
116 
117   OR,				/* OR is an operator of two arguments that
118 				   matches either of its arguments. */
119 
120   ORTOP,			/* OR at the toplevel in the parse tree.
121 				   This is used for a boyer-moore heuristic. */
122 
123   LPAREN,			/* LPAREN never appears in the parse tree,
124 				   it is only a lexeme. */
125 
126   RPAREN,			/* RPAREN never appears in the parse tree. */
127 
128   CRANGE,			/* CRANGE never appears in the parse tree.
129 				   It stands for a character range that can
130 				   match a string of one or more characters.
131 				   For example, [a-z] can match "ch" in
132 				   a Spanish locale.  */
133 
134 #ifdef MBS_SUPPORT
135   ANYCHAR,                     /* ANYCHAR is a terminal symbol that matches
136                                   any multibyte(or singlebyte) characters.
137 			          It is used only if MB_CUR_MAX > 1.  */
138 
139   MBCSET,			/* MBCSET is similar to CSET, but for
140 				   multibyte characters.  */
141 #endif /* MBS_SUPPORT */
142 
143   CSET				/* CSET and (and any value greater) is a
144 				   terminal symbol that matches any of a
145 				   class of characters. */
146 } token;
147 
148 /* Sets are stored in an array in the compiled dfa; the index of the
149    array corresponding to a given set token is given by SET_INDEX(t). */
150 #define SET_INDEX(t) ((t) - CSET)
151 
152 /* Sometimes characters can only be matched depending on the surrounding
153    context.  Such context decisions depend on what the previous character
154    was, and the value of the current (lookahead) character.  Context
155    dependent constraints are encoded as 8 bit integers.  Each bit that
156    is set indicates that the constraint succeeds in the corresponding
157    context.
158 
159    bit 7 - previous and current are newlines
160    bit 6 - previous was newline, current isn't
161    bit 5 - previous wasn't newline, current is
162    bit 4 - neither previous nor current is a newline
163    bit 3 - previous and current are word-constituents
164    bit 2 - previous was word-constituent, current isn't
165    bit 1 - previous wasn't word-constituent, current is
166    bit 0 - neither previous nor current is word-constituent
167 
168    Word-constituent characters are those that satisfy isalnum().
169 
170    The macro SUCCEEDS_IN_CONTEXT determines whether a a given constraint
171    succeeds in a particular context.  Prevn is true if the previous character
172    was a newline, currn is true if the lookahead character is a newline.
173    Prevl and currl similarly depend upon whether the previous and current
174    characters are word-constituent letters. */
175 #define MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \
176   ((constraint) & 1 << (((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4))
177 #define MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \
178   ((constraint) & 1 << (((prevl) ? 2 : 0) + ((currl) ? 1 : 0)))
179 #define SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \
180   (MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn)		     \
181    && MATCHES_LETTER_CONTEXT(constraint, prevl, currl))
182 
183 /* The following macros give information about what a constraint depends on. */
184 #define PREV_NEWLINE_DEPENDENT(constraint) \
185   (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30))
186 #define PREV_LETTER_DEPENDENT(constraint) \
187   (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03))
188 
189 /* Tokens that match the empty string subject to some constraint actually
190    work by applying that constraint to determine what may follow them,
191    taking into account what has gone before.  The following values are
192    the constraints corresponding to the special tokens previously defined. */
193 #define NO_CONSTRAINT 0xff
194 #define BEGLINE_CONSTRAINT 0xcf
195 #define ENDLINE_CONSTRAINT 0xaf
196 #define BEGWORD_CONSTRAINT 0xf2
197 #define ENDWORD_CONSTRAINT 0xf4
198 #define LIMWORD_CONSTRAINT 0xf6
199 #define NOTLIMWORD_CONSTRAINT 0xf9
200 
201 /* States of the recognizer correspond to sets of positions in the parse
202    tree, together with the constraints under which they may be matched.
203    So a position is encoded as an index into the parse tree together with
204    a constraint. */
205 typedef struct
206 {
207   unsigned index;		/* Index into the parse array. */
208   unsigned constraint;		/* Constraint for matching this position. */
209 } position;
210 
211 /* Sets of positions are stored as arrays. */
212 typedef struct
213 {
214   position *elems;		/* Elements of this position set. */
215   int nelem;			/* Number of elements in this set. */
216 } position_set;
217 
218 /* A state of the dfa consists of a set of positions, some flags,
219    and the token value of the lowest-numbered position of the state that
220    contains an END token. */
221 typedef struct
222 {
223   int hash;			/* Hash of the positions of this state. */
224   position_set elems;		/* Positions this state could match. */
225   char newline;			/* True if previous state matched newline. */
226   char letter;			/* True if previous state matched a letter. */
227   char backref;			/* True if this state matches a \<digit>. */
228   unsigned char constraint;	/* Constraint for this state to accept. */
229   int first_end;		/* Token value of the first END in elems. */
230 #ifdef MBS_SUPPORT
231   position_set mbps;           /* Positions which can match multibyte
232                                   characters.  e.g. period.
233 				  These staff are used only if
234 				  MB_CUR_MAX > 1.  */
235 #endif
236 } dfa_state;
237 
238 /* Element of a list of strings, at least one of which is known to
239    appear in any R.E. matching the DFA. */
240 struct dfamust
241 {
242   int exact;
243   char *must;
244   struct dfamust *next;
245 };
246 
247 #ifdef MBS_SUPPORT
248 /* A bracket operator.
249    e.g. [a-c], [[:alpha:]], etc.  */
250 struct mb_char_classes
251 {
252   int invert;
253   wchar_t *chars;		/* Normal characters.  */
254   int nchars;
255   wctype_t *ch_classes;		/* Character classes.  */
256   int nch_classes;
257   wchar_t *range_sts;		/* Range characters (start of the range).  */
258   wchar_t *range_ends;		/* Range characters (end of the range).  */
259   int nranges;
260   char **equivs;		/* Equivalent classes.  */
261   int nequivs;
262   char **coll_elems;
263   int ncoll_elems;		/* Collating elements.  */
264 };
265 #endif
266 
267 /* A compiled regular expression. */
268 struct dfa
269 {
270   /* Stuff built by the scanner. */
271   charclass *charclasses;	/* Array of character sets for CSET tokens. */
272   int cindex;			/* Index for adding new charclasses. */
273   int calloc;			/* Number of charclasses currently allocated. */
274 
275   /* Stuff built by the parser. */
276   token *tokens;		/* Postfix parse array. */
277   int tindex;			/* Index for adding new tokens. */
278   int talloc;			/* Number of tokens currently allocated. */
279   int depth;			/* Depth required of an evaluation stack
280 				   used for depth-first traversal of the
281 				   parse tree. */
282   int nleaves;			/* Number of leaves on the parse tree. */
283   int nregexps;			/* Count of parallel regexps being built
284 				   with dfaparse(). */
285 #ifdef MBS_SUPPORT
286   /* These stuff are used only if MB_CUR_MAX > 1 or multibyte environments.  */
287   int nmultibyte_prop;
288   int *multibyte_prop;
289   /* The value of multibyte_prop[i] is defined by following rule.
290        if tokens[i] < NOTCHAR
291          bit 1 : tokens[i] is a singlebyte character, or the last-byte of
292 	         a multibyte character.
293 	 bit 0 : tokens[i] is a singlebyte character, or the 1st-byte of
294 	         a multibyte character.
295        if tokens[i] = MBCSET
296          ("the index of mbcsets correspnd to this operator" << 2) + 3
297 
298      e.g.
299      tokens
300         = 'single_byte_a', 'multi_byte_A', single_byte_b'
301         = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
302      multibyte_prop
303         = 3     , 1               ,  0              ,  2              , 3
304   */
305 
306   /* Array of the bracket expressoin in the DFA.  */
307   struct mb_char_classes *mbcsets;
308   int nmbcsets;
309   int mbcsets_alloc;
310 #endif
311 
312   /* Stuff owned by the state builder. */
313   dfa_state *states;		/* States of the dfa. */
314   int sindex;			/* Index for adding new states. */
315   int salloc;			/* Number of states currently allocated. */
316 
317   /* Stuff built by the structure analyzer. */
318   position_set *follows;	/* Array of follow sets, indexed by position
319 				   index.  The follow of a position is the set
320 				   of positions containing characters that
321 				   could conceivably follow a character
322 				   matching the given position in a string
323 				   matching the regexp.  Allocated to the
324 				   maximum possible position index. */
325   int searchflag;		/* True if we are supposed to build a searching
326 				   as opposed to an exact matcher.  A searching
327 				   matcher finds the first and shortest string
328 				   matching a regexp anywhere in the buffer,
329 				   whereas an exact matcher finds the longest
330 				   string matching, but anchored to the
331 				   beginning of the buffer. */
332 
333   /* Stuff owned by the executor. */
334   int tralloc;			/* Number of transition tables that have
335 				   slots so far. */
336   int trcount;			/* Number of transition tables that have
337 				   actually been built. */
338   int **trans;			/* Transition tables for states that can
339 				   never accept.  If the transitions for a
340 				   state have not yet been computed, or the
341 				   state could possibly accept, its entry in
342 				   this table is NULL. */
343   int **realtrans;		/* Trans always points to realtrans + 1; this
344 				   is so trans[-1] can contain NULL. */
345   int **fails;			/* Transition tables after failing to accept
346 				   on a state that potentially could do so. */
347   int *success;			/* Table of acceptance conditions used in
348 				   dfaexec and computed in build_state. */
349   struct dfamust *musts;	/* List of strings, at least one of which
350 				   is known to appear in any r.e. matching
351 				   the dfa. */
352 };
353 
354 /* Some macros for user access to dfa internals. */
355 
356 /* ACCEPTING returns true if s could possibly be an accepting state of r. */
357 #define ACCEPTING(s, r) ((r).states[s].constraint)
358 
359 /* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
360    specified context. */
361 #define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, dfa) \
362   SUCCEEDS_IN_CONTEXT((dfa).states[state].constraint,		   \
363 		       prevn, currn, prevl, currl)
364 
365 /* FIRST_MATCHING_REGEXP returns the index number of the first of parallel
366    regexps that a given state could accept.  Parallel regexps are numbered
367    starting at 1. */
368 #define FIRST_MATCHING_REGEXP(state, dfa) (-(dfa).states[state].first_end)
369 
370 /* Entry points. */
371 
372 /* dfasyntax() takes three arguments; the first sets the syntax bits described
373    earlier in this file, the second sets the case-folding flag, and the
374    third specifies the line terminator. */
375 extern void dfasyntax (reg_syntax_t, int, unsigned char);
376 
377 /* Compile the given string of the given length into the given struct dfa.
378    Final argument is a flag specifying whether to build a searching or an
379    exact matcher. */
380 extern void dfacomp (char const *, size_t, struct dfa *, int);
381 
382 /* Execute the given struct dfa on the buffer of characters.  The
383    last byte of the buffer must equal the end-of-line byte.
384    The final argument points to a flag that will
385    be set if further examination by a backtracking matcher is needed in
386    order to verify backreferencing; otherwise the flag will be cleared.
387    Returns (size_t) -1 if no match is found, or the offset of the first
388    character after the first & shortest matching string in the buffer. */
389 extern size_t dfaexec (struct dfa *, char const *, size_t, int *);
390 
391 /* Free the storage held by the components of a struct dfa. */
392 extern void dfafree (struct dfa *);
393 
394 /* Entry points for people who know what they're doing. */
395 
396 /* Initialize the components of a struct dfa. */
397 extern void dfainit (struct dfa *);
398 
399 /* Incrementally parse a string of given length into a struct dfa. */
400 extern void dfaparse (char const *, size_t, struct dfa *);
401 
402 /* Analyze a parsed regexp; second argument tells whether to build a searching
403    or an exact matcher. */
404 extern void dfaanalyze (struct dfa *, int);
405 
406 /* Compute, for each possible character, the transitions out of a given
407    state, storing them in an array of integers. */
408 extern void dfastate (int, struct dfa *, int []);
409 
410 /* Error handling. */
411 
412 /* dfaerror() is called by the regexp routines whenever an error occurs.  It
413    takes a single argument, a NUL-terminated string describing the error.
414    The user must supply a dfaerror.  */
415 extern void dfaerror (const char *);
416