1 /* Extended regular expression matching and search library. 2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License along 17 with this program; if not, write to the Free Software Foundation, 18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 19 #include <sys/cdefs.h> 20 __RCSID("$NetBSD: regcomp.c,v 1.2 2016/05/17 14:00:09 christos Exp $"); 21 22 23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 24 Idx length, reg_syntax_t syntax); 25 static void re_compile_fastmap_iter (regex_t *bufp, 26 const re_dfastate_t *init_state, 27 char *fastmap); 28 static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len); 29 #ifdef RE_ENABLE_I18N 30 static void free_charset (re_charset_t *cset); 31 #endif /* RE_ENABLE_I18N */ 32 static void free_workarea_compile (regex_t *preg); 33 static reg_errcode_t create_initial_state (re_dfa_t *dfa); 34 #ifdef RE_ENABLE_I18N 35 static void optimize_utf8 (re_dfa_t *dfa); 36 #endif 37 static reg_errcode_t analyze (regex_t *preg); 38 static reg_errcode_t preorder (bin_tree_t *root, 39 reg_errcode_t (fn (void *, bin_tree_t *)), 40 void *extra); 41 static reg_errcode_t postorder (bin_tree_t *root, 42 reg_errcode_t (fn (void *, bin_tree_t *)), 43 void *extra); 44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 47 bin_tree_t *node); 48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 51 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint); 52 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 53 unsigned int constraint); 54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 56 Idx node, bool root); 57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 58 static Idx fetch_number (re_string_t *input, re_token_t *token, 59 reg_syntax_t syntax); 60 static int peek_token (re_token_t *token, re_string_t *input, 61 reg_syntax_t syntax); 62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 63 reg_syntax_t syntax, reg_errcode_t *err); 64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 65 re_token_t *token, reg_syntax_t syntax, 66 Idx nest, reg_errcode_t *err); 67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 68 re_token_t *token, reg_syntax_t syntax, 69 Idx nest, reg_errcode_t *err); 70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 71 re_token_t *token, reg_syntax_t syntax, 72 Idx nest, reg_errcode_t *err); 73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 74 re_token_t *token, reg_syntax_t syntax, 75 Idx nest, reg_errcode_t *err); 76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 77 re_dfa_t *dfa, re_token_t *token, 78 reg_syntax_t syntax, reg_errcode_t *err); 79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 80 re_token_t *token, reg_syntax_t syntax, 81 reg_errcode_t *err); 82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 83 re_string_t *regexp, 84 re_token_t *token, int token_len, 85 re_dfa_t *dfa, 86 reg_syntax_t syntax, 87 bool accept_hyphen); 88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 89 re_string_t *regexp, 90 re_token_t *token); 91 #ifdef RE_ENABLE_I18N 92 static reg_errcode_t build_equiv_class (bitset sbcset, 93 re_charset_t *mbcset, 94 Idx *equiv_class_alloc, 95 const unsigned char *name); 96 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans, 97 bitset sbcset, 98 re_charset_t *mbcset, 99 Idx *char_class_alloc, 100 const unsigned char *class_name, 101 reg_syntax_t syntax); 102 #else /* not RE_ENABLE_I18N */ 103 static reg_errcode_t build_equiv_class (bitset sbcset, 104 const unsigned char *name); 105 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans, 106 bitset sbcset, 107 const unsigned char *class_name, 108 reg_syntax_t syntax); 109 #endif /* not RE_ENABLE_I18N */ 110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 111 unsigned REG_TRANSLATE_TYPE trans, 112 const unsigned char *class_name, 113 const unsigned char *extra, 114 bool non_match, reg_errcode_t *err); 115 static bin_tree_t *create_tree (re_dfa_t *dfa, 116 bin_tree_t *left, bin_tree_t *right, 117 re_token_type_t type); 118 static bin_tree_t *create_token_tree (re_dfa_t *dfa, 119 bin_tree_t *left, bin_tree_t *right, 120 const re_token_t *token); 121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 122 static void free_token (re_token_t *node); 123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 125 126 /* This table gives an error message for each of the error codes listed 127 in regex.h. Obviously the order here has to be same as there. 128 POSIX doesn't require that we do anything for REG_NOERROR, 129 but why not be nice? */ 130 131 const char __re_error_msgid[] attribute_hidden = 132 { 133 #define REG_NOERROR_IDX 0 134 gettext_noop ("Success") /* REG_NOERROR */ 135 "\0" 136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 137 gettext_noop ("No match") /* REG_NOMATCH */ 138 "\0" 139 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 140 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 141 "\0" 142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 143 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 144 "\0" 145 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 146 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 147 "\0" 148 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 149 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 150 "\0" 151 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 152 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 153 "\0" 154 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 155 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 156 "\0" 157 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 158 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 159 "\0" 160 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 161 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 162 "\0" 163 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 164 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 165 "\0" 166 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 167 gettext_noop ("Invalid range end") /* REG_ERANGE */ 168 "\0" 169 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 170 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 171 "\0" 172 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 173 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 174 "\0" 175 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 176 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 177 "\0" 178 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 179 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 180 "\0" 181 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 182 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 183 }; 184 185 const size_t __re_error_msgid_idx[] attribute_hidden = 186 { 187 REG_NOERROR_IDX, 188 REG_NOMATCH_IDX, 189 REG_BADPAT_IDX, 190 REG_ECOLLATE_IDX, 191 REG_ECTYPE_IDX, 192 REG_EESCAPE_IDX, 193 REG_ESUBREG_IDX, 194 REG_EBRACK_IDX, 195 REG_EPAREN_IDX, 196 REG_EBRACE_IDX, 197 REG_BADBR_IDX, 198 REG_ERANGE_IDX, 199 REG_ESPACE_IDX, 200 REG_BADRPT_IDX, 201 REG_EEND_IDX, 202 REG_ESIZE_IDX, 203 REG_ERPAREN_IDX 204 }; 205 206 /* Entry points for GNU code. */ 207 208 /* re_compile_pattern is the GNU regular expression compiler: it 209 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 210 Returns 0 if the pattern was valid, otherwise an error string. 211 212 Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields 213 are set in BUFP on entry. */ 214 215 const char * 216 re_compile_pattern (const char *pattern, size_t length, 217 struct re_pattern_buffer *bufp) 218 { 219 reg_errcode_t ret; 220 221 /* And GNU code determines whether or not to get register information 222 by passing null for the REGS argument to re_match, etc., not by 223 setting re_no_sub, unless REG_NO_SUB is set. */ 224 bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB); 225 226 /* Match anchors at newline. */ 227 bufp->re_newline_anchor = 1; 228 229 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 230 231 if (!ret) 232 return NULL; 233 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 234 } 235 #ifdef _LIBC 236 weak_alias (__re_compile_pattern, re_compile_pattern) 237 #endif 238 239 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 240 also be assigned to arbitrarily: each pattern buffer stores its own 241 syntax, so it can be changed between regex compilations. */ 242 /* This has no initializer because initialized variables in Emacs 243 become read-only after dumping. */ 244 reg_syntax_t re_syntax_options; 245 246 247 /* Specify the precise syntax of regexps for compilation. This provides 248 for compatibility for various utilities which historically have 249 different, incompatible syntaxes. 250 251 The argument SYNTAX is a bit mask comprised of the various bits 252 defined in regex.h. We return the old syntax. */ 253 254 reg_syntax_t 255 re_set_syntax (reg_syntax_t syntax) 256 { 257 reg_syntax_t ret = re_syntax_options; 258 259 re_syntax_options = syntax; 260 return ret; 261 } 262 #ifdef _LIBC 263 weak_alias (__re_set_syntax, re_set_syntax) 264 #endif 265 266 int 267 re_compile_fastmap (struct re_pattern_buffer *bufp) 268 { 269 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer; 270 char *fastmap = bufp->re_fastmap; 271 272 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 273 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 274 if (dfa->init_state != dfa->init_state_word) 275 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 276 if (dfa->init_state != dfa->init_state_nl) 277 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 278 if (dfa->init_state != dfa->init_state_begbuf) 279 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 280 bufp->re_fastmap_accurate = 1; 281 return 0; 282 } 283 #ifdef _LIBC 284 weak_alias (__re_compile_fastmap, re_compile_fastmap) 285 #endif 286 287 static inline void 288 __attribute ((always_inline)) 289 re_set_fastmap (char *fastmap, bool icase, int ch) 290 { 291 fastmap[ch] = 1; 292 if (icase) 293 fastmap[tolower (ch)] = 1; 294 } 295 296 /* Helper function for re_compile_fastmap. 297 Compile fastmap for the initial_state INIT_STATE. */ 298 299 static void 300 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 301 char *fastmap) 302 { 303 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer; 304 Idx node_cnt; 305 bool icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE)); 306 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 307 { 308 Idx node = init_state->nodes.elems[node_cnt]; 309 re_token_type_t type = dfa->nodes[node].type; 310 311 if (type == CHARACTER) 312 { 313 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 314 #ifdef RE_ENABLE_I18N 315 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1) 316 { 317 unsigned char buf[MB_LEN_MAX]; 318 unsigned char *p; 319 wchar_t wc; 320 mbstate_t state; 321 322 p = buf; 323 *p++ = dfa->nodes[node].opr.c; 324 while (++node < dfa->nodes_len 325 && dfa->nodes[node].type == CHARACTER 326 && dfa->nodes[node].mb_partial) 327 *p++ = dfa->nodes[node].opr.c; 328 memset (&state, 0, sizeof (state)); 329 if (mbrtowc (&wc, (const char *) buf, p - buf, 330 &state) == p - buf 331 && (__wcrtomb ((char *) buf, towlower (wc), &state) 332 != (size_t) -1)) 333 re_set_fastmap (fastmap, false, buf[0]); 334 } 335 #endif 336 } 337 else if (type == SIMPLE_BRACKET) 338 { 339 int i, j, ch; 340 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 341 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 342 if (dfa->nodes[node].opr.sbcset[i] & ((bitset_word) 1 << j)) 343 re_set_fastmap (fastmap, icase, ch); 344 } 345 #ifdef RE_ENABLE_I18N 346 else if (type == COMPLEX_BRACKET) 347 { 348 Idx i; 349 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 350 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes 351 || cset->nranges || cset->nchar_classes) 352 { 353 # ifdef _LIBC 354 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0) 355 { 356 /* In this case we want to catch the bytes which are 357 the first byte of any collation elements. 358 e.g. In da_DK, we want to catch 'a' since "aa" 359 is a valid collation element, and don't catch 360 'b' since 'b' is the only collation element 361 which starts from 'b'. */ 362 const int32_t *table = (const int32_t *) 363 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 364 for (i = 0; i < SBC_MAX; ++i) 365 if (table[i] < 0) 366 re_set_fastmap (fastmap, icase, i); 367 } 368 # else 369 if (dfa->mb_cur_max > 1) 370 for (i = 0; i < SBC_MAX; ++i) 371 if (__btowc (i) == WEOF) 372 re_set_fastmap (fastmap, icase, i); 373 # endif /* not _LIBC */ 374 } 375 for (i = 0; i < cset->nmbchars; ++i) 376 { 377 char buf[256]; 378 mbstate_t state; 379 memset (&state, '\0', sizeof (state)); 380 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 381 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 382 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1) 383 { 384 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 385 != (size_t) -1) 386 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 387 } 388 } 389 } 390 #endif /* RE_ENABLE_I18N */ 391 else if (type == OP_PERIOD 392 #ifdef RE_ENABLE_I18N 393 || type == OP_UTF8_PERIOD 394 #endif /* RE_ENABLE_I18N */ 395 || type == END_OF_RE) 396 { 397 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 398 if (type == END_OF_RE) 399 bufp->re_can_be_null = 1; 400 return; 401 } 402 } 403 } 404 405 /* Entry point for POSIX code. */ 406 /* regcomp takes a regular expression as a string and compiles it. 407 408 PREG is a regex_t *. We do not expect any fields to be initialized, 409 since POSIX says we shouldn't. Thus, we set 410 411 `re_buffer' to the compiled pattern; 412 `re_used' to the length of the compiled pattern; 413 `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the 414 REG_EXTENDED bit in CFLAGS is set; otherwise, to 415 REG_SYNTAX_POSIX_BASIC; 416 `re_newline_anchor' to REG_NEWLINE being set in CFLAGS; 417 `re_fastmap' to an allocated space for the fastmap; 418 `re_fastmap_accurate' to zero; 419 `re_nsub' to the number of subexpressions in PATTERN. 420 421 PATTERN is the address of the pattern string. 422 423 CFLAGS is a series of bits which affect compilation. 424 425 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 426 use POSIX basic syntax. 427 428 If REG_NEWLINE is set, then . and [^...] don't match newline. 429 Also, regexec will try a match beginning after every newline. 430 431 If REG_ICASE is set, then we considers upper- and lowercase 432 versions of letters to be equivalent when matching. 433 434 If REG_NOSUB is set, then when PREG is passed to regexec, that 435 routine will report only success or failure, and nothing about the 436 registers. 437 438 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 439 the return codes and their meanings.) */ 440 441 int 442 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags) 443 { 444 reg_errcode_t ret; 445 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED 446 : REG_SYNTAX_POSIX_BASIC); 447 448 preg->re_buffer = NULL; 449 preg->re_allocated = 0; 450 preg->re_used = 0; 451 452 /* Try to allocate space for the fastmap. */ 453 preg->re_fastmap = re_malloc (char, SBC_MAX); 454 if (BE (preg->re_fastmap == NULL, 0)) 455 return REG_ESPACE; 456 457 syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0; 458 459 /* If REG_NEWLINE is set, newlines are treated differently. */ 460 if (cflags & REG_NEWLINE) 461 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 462 syntax &= ~REG_DOT_NEWLINE; 463 syntax |= REG_HAT_LISTS_NOT_NEWLINE; 464 /* It also changes the matching behavior. */ 465 preg->re_newline_anchor = 1; 466 } 467 else 468 preg->re_newline_anchor = 0; 469 preg->re_no_sub = !!(cflags & REG_NOSUB); 470 preg->re_translate = NULL; 471 472 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 473 474 /* POSIX doesn't distinguish between an unmatched open-group and an 475 unmatched close-group: both are REG_EPAREN. */ 476 if (ret == REG_ERPAREN) 477 ret = REG_EPAREN; 478 479 /* We have already checked preg->re_fastmap != NULL. */ 480 if (BE (ret == REG_NOERROR, 1)) 481 /* Compute the fastmap now, since regexec cannot modify the pattern 482 buffer. This function never fails in this implementation. */ 483 (void) re_compile_fastmap (preg); 484 else 485 { 486 /* Some error occurred while compiling the expression. */ 487 re_free (preg->re_fastmap); 488 preg->re_fastmap = NULL; 489 } 490 491 return (int) ret; 492 } 493 #ifdef _LIBC 494 weak_alias (__regcomp, regcomp) 495 #endif 496 497 /* Returns a message corresponding to an error code, ERRCODE, returned 498 from either regcomp or regexec. We don't use PREG here. */ 499 500 size_t 501 regerror (int errcode, const regex_t *__restrict preg, 502 char *__restrict errbuf, size_t errbuf_size) 503 { 504 const char *msg; 505 size_t msg_size; 506 507 if (BE (errcode < 0 508 || errcode >= (int) (sizeof (__re_error_msgid_idx) 509 / sizeof (__re_error_msgid_idx[0])), 0)) 510 /* Only error codes returned by the rest of the code should be passed 511 to this routine. If we are given anything else, or if other regex 512 code generates an invalid error code, then the program has a bug. 513 Dump core so we can fix it. */ 514 abort (); 515 516 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 517 518 msg_size = strlen (msg) + 1; /* Includes the null. */ 519 520 if (BE (errbuf_size != 0, 1)) 521 { 522 if (BE (msg_size > errbuf_size, 0)) 523 { 524 #if defined HAVE_MEMPCPY || defined _LIBC 525 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; 526 #else 527 memcpy (errbuf, msg, errbuf_size - 1); 528 errbuf[errbuf_size - 1] = 0; 529 #endif 530 } 531 else 532 memcpy (errbuf, msg, msg_size); 533 } 534 535 return msg_size; 536 } 537 #ifdef _LIBC 538 weak_alias (__regerror, regerror) 539 #endif 540 541 542 #ifdef RE_ENABLE_I18N 543 /* This static array is used for the map to single-byte characters when 544 UTF-8 is used. Otherwise we would allocate memory just to initialize 545 it the same all the time. UTF-8 is the preferred encoding so this is 546 a worthwhile optimization. */ 547 static const bitset utf8_sb_map = 548 { 549 /* Set the first 128 bits. */ 550 # if 2 < BITSET_WORDS 551 BITSET_WORD_MAX, 552 # endif 553 # if 4 < BITSET_WORDS 554 BITSET_WORD_MAX, 555 # endif 556 # if 6 < BITSET_WORDS 557 BITSET_WORD_MAX, 558 # endif 559 # if 8 < BITSET_WORDS 560 # error "Invalid BITSET_WORDS" 561 # endif 562 (BITSET_WORD_MAX 563 >> (SBC_MAX % BITSET_WORD_BITS == 0 564 ? 0 565 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) 566 }; 567 #endif 568 569 570 static void 571 free_dfa_content (re_dfa_t *dfa) 572 { 573 Idx i, j; 574 575 if (dfa->nodes) 576 for (i = 0; i < dfa->nodes_len; ++i) 577 free_token (dfa->nodes + i); 578 re_free (dfa->nexts); 579 for (i = 0; i < dfa->nodes_len; ++i) 580 { 581 if (dfa->eclosures != NULL) 582 re_node_set_free (dfa->eclosures + i); 583 if (dfa->inveclosures != NULL) 584 re_node_set_free (dfa->inveclosures + i); 585 if (dfa->edests != NULL) 586 re_node_set_free (dfa->edests + i); 587 } 588 re_free (dfa->edests); 589 re_free (dfa->eclosures); 590 re_free (dfa->inveclosures); 591 re_free (dfa->nodes); 592 593 if (dfa->state_table) 594 for (i = 0; i <= dfa->state_hash_mask; ++i) 595 { 596 struct re_state_table_entry *entry = dfa->state_table + i; 597 for (j = 0; j < entry->num; ++j) 598 { 599 re_dfastate_t *state = entry->array[j]; 600 free_state (state); 601 } 602 re_free (entry->array); 603 } 604 re_free (dfa->state_table); 605 #ifdef RE_ENABLE_I18N 606 if (dfa->sb_char != utf8_sb_map) 607 re_free (dfa->sb_char); 608 #endif 609 re_free (dfa->subexp_map); 610 #ifdef DEBUG 611 re_free (dfa->re_str); 612 #endif 613 614 re_free (dfa); 615 } 616 617 618 /* Free dynamically allocated space used by PREG. */ 619 620 void 621 regfree (regex_t *preg) 622 { 623 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 624 if (BE (dfa != NULL, 1)) 625 free_dfa_content (dfa); 626 preg->re_buffer = NULL; 627 preg->re_allocated = 0; 628 629 re_free (preg->re_fastmap); 630 preg->re_fastmap = NULL; 631 632 re_free (preg->re_translate); 633 preg->re_translate = NULL; 634 } 635 #ifdef _LIBC 636 weak_alias (__regfree, regfree) 637 #endif 638 639 /* Entry points compatible with 4.2 BSD regex library. We don't define 640 them unless specifically requested. */ 641 642 #if defined _REGEX_RE_COMP || defined _LIBC 643 644 /* BSD has one and only one pattern buffer. */ 645 static struct re_pattern_buffer re_comp_buf; 646 647 char * 648 # ifdef _LIBC 649 /* Make these definitions weak in libc, so POSIX programs can redefine 650 these names if they don't use our functions, and still use 651 regcomp/regexec above without link errors. */ 652 weak_function 653 # endif 654 re_comp (const char *s) 655 { 656 reg_errcode_t ret; 657 char *fastmap; 658 659 if (!s) 660 { 661 if (!re_comp_buf.re_buffer) 662 return gettext ("No previous regular expression"); 663 return 0; 664 } 665 666 if (re_comp_buf.re_buffer) 667 { 668 fastmap = re_comp_buf.re_fastmap; 669 re_comp_buf.re_fastmap = NULL; 670 __regfree (&re_comp_buf); 671 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 672 re_comp_buf.re_fastmap = fastmap; 673 } 674 675 if (re_comp_buf.re_fastmap == NULL) 676 { 677 re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX); 678 if (re_comp_buf.re_fastmap == NULL) 679 return (char *) gettext (__re_error_msgid 680 + __re_error_msgid_idx[(int) REG_ESPACE]); 681 } 682 683 /* Since `re_exec' always passes NULL for the `regs' argument, we 684 don't need to initialize the pattern buffer fields which affect it. */ 685 686 /* Match anchors at newlines. */ 687 re_comp_buf.re_newline_anchor = 1; 688 689 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 690 691 if (!ret) 692 return NULL; 693 694 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 695 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 696 } 697 698 #ifdef _LIBC 699 libc_freeres_fn (free_mem) 700 { 701 __regfree (&re_comp_buf); 702 } 703 #endif 704 705 #endif /* _REGEX_RE_COMP */ 706 707 /* Internal entry point. 708 Compile the regular expression PATTERN, whose length is LENGTH. 709 SYNTAX indicate regular expression's syntax. */ 710 711 static reg_errcode_t 712 re_compile_internal (regex_t *preg, const char *pattern, Idx length, 713 reg_syntax_t syntax) 714 { 715 reg_errcode_t err = REG_NOERROR; 716 re_dfa_t *dfa; 717 re_string_t regexp; 718 719 /* Initialize the pattern buffer. */ 720 preg->re_fastmap_accurate = 0; 721 preg->re_syntax = syntax; 722 preg->re_not_bol = preg->re_not_eol = 0; 723 preg->re_used = 0; 724 preg->re_nsub = 0; 725 preg->re_can_be_null = 0; 726 preg->re_regs_allocated = REG_UNALLOCATED; 727 728 /* Initialize the dfa. */ 729 dfa = (re_dfa_t *) preg->re_buffer; 730 if (BE (preg->re_allocated < sizeof (re_dfa_t), 0)) 731 { 732 /* If zero allocated, but buffer is non-null, try to realloc 733 enough space. This loses if buffer's address is bogus, but 734 that is the user's responsibility. If buffer is null this 735 is a simple allocation. */ 736 dfa = re_realloc (preg->re_buffer, re_dfa_t, 1); 737 if (dfa == NULL) 738 return REG_ESPACE; 739 preg->re_allocated = sizeof (re_dfa_t); 740 preg->re_buffer = (unsigned char *) dfa; 741 } 742 preg->re_used = sizeof (re_dfa_t); 743 744 __libc_lock_init (dfa->lock); 745 746 err = init_dfa (dfa, length); 747 if (BE (err != REG_NOERROR, 0)) 748 { 749 free_dfa_content (dfa); 750 preg->re_buffer = NULL; 751 preg->re_allocated = 0; 752 return err; 753 } 754 #ifdef DEBUG 755 dfa->re_str = re_malloc (char, length + 1); 756 strncpy (dfa->re_str, pattern, length + 1); 757 #endif 758 759 err = re_string_construct (®exp, pattern, length, preg->re_translate, 760 syntax & REG_IGNORE_CASE, dfa); 761 if (BE (err != REG_NOERROR, 0)) 762 { 763 re_compile_internal_free_return: 764 free_workarea_compile (preg); 765 re_string_destruct (®exp); 766 free_dfa_content (dfa); 767 preg->re_buffer = NULL; 768 preg->re_allocated = 0; 769 return err; 770 } 771 772 /* Parse the regular expression, and build a structure tree. */ 773 preg->re_nsub = 0; 774 dfa->str_tree = parse (®exp, preg, syntax, &err); 775 if (BE (dfa->str_tree == NULL, 0)) 776 goto re_compile_internal_free_return; 777 778 /* Analyze the tree and create the nfa. */ 779 err = analyze (preg); 780 if (BE (err != REG_NOERROR, 0)) 781 goto re_compile_internal_free_return; 782 783 #ifdef RE_ENABLE_I18N 784 /* If possible, do searching in single byte encoding to speed things up. */ 785 if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL) 786 optimize_utf8 (dfa); 787 #endif 788 789 /* Then create the initial state of the dfa. */ 790 err = create_initial_state (dfa); 791 792 /* Release work areas. */ 793 free_workarea_compile (preg); 794 re_string_destruct (®exp); 795 796 if (BE (err != REG_NOERROR, 0)) 797 { 798 free_dfa_content (dfa); 799 preg->re_buffer = NULL; 800 preg->re_allocated = 0; 801 } 802 803 return err; 804 } 805 806 /* Initialize DFA. We use the length of the regular expression PAT_LEN 807 as the initial length of some arrays. */ 808 809 static reg_errcode_t 810 init_dfa (re_dfa_t *dfa, Idx pat_len) 811 { 812 __re_size_t table_size; 813 #ifndef _LIBC 814 char *codeset_name; 815 #endif 816 817 memset (dfa, '\0', sizeof (re_dfa_t)); 818 819 /* Force allocation of str_tree_storage the first time. */ 820 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 821 822 dfa->nodes_alloc = pat_len + 1; 823 dfa->nodes = re_xmalloc (re_token_t, dfa->nodes_alloc); 824 825 /* table_size = 2 ^ ceil(log pat_len) */ 826 for (table_size = 1; table_size <= pat_len; table_size <<= 1) 827 if (0 < (Idx) -1 && table_size == 0) 828 return REG_ESPACE; 829 830 dfa->state_table = re_calloc (struct re_state_table_entry, table_size); 831 dfa->state_hash_mask = table_size - 1; 832 833 dfa->mb_cur_max = MB_CUR_MAX; 834 #ifdef _LIBC 835 if (dfa->mb_cur_max == 6 836 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 837 dfa->is_utf8 = 1; 838 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 839 != 0); 840 #else 841 # ifdef HAVE_LANGINFO_CODESET 842 codeset_name = nl_langinfo (CODESET); 843 # else 844 codeset_name = getenv ("LC_ALL"); 845 if (codeset_name == NULL || codeset_name[0] == '\0') 846 codeset_name = getenv ("LC_CTYPE"); 847 if (codeset_name == NULL || codeset_name[0] == '\0') 848 codeset_name = getenv ("LANG"); 849 if (codeset_name == NULL) 850 codeset_name = ""; 851 else if (strchr (codeset_name, '.') != NULL) 852 codeset_name = strchr (codeset_name, '.') + 1; 853 # endif 854 855 if (strcasecmp (codeset_name, "UTF-8") == 0 856 || strcasecmp (codeset_name, "UTF8") == 0) 857 dfa->is_utf8 = 1; 858 859 /* We check exhaustively in the loop below if this charset is a 860 superset of ASCII. */ 861 dfa->map_notascii = 0; 862 #endif 863 864 #ifdef RE_ENABLE_I18N 865 if (dfa->mb_cur_max > 1) 866 { 867 if (dfa->is_utf8) 868 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 869 else 870 { 871 int i, j, ch; 872 873 dfa->sb_char = re_calloc (bitset_word, BITSET_WORDS); 874 if (BE (dfa->sb_char == NULL, 0)) 875 return REG_ESPACE; 876 877 /* Set the bits corresponding to single byte chars. */ 878 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 879 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 880 { 881 wint_t wch = __btowc (ch); 882 if (wch != WEOF) 883 dfa->sb_char[i] |= (bitset_word) 1 << j; 884 # ifndef _LIBC 885 if (isascii (ch) && wch != ch) 886 dfa->map_notascii = 1; 887 # endif 888 } 889 } 890 } 891 #endif 892 893 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 894 return REG_ESPACE; 895 return REG_NOERROR; 896 } 897 898 /* Initialize WORD_CHAR table, which indicate which character is 899 "word". In this case "word" means that it is the word construction 900 character used by some operators like "\<", "\>", etc. */ 901 902 static void 903 init_word_char (re_dfa_t *dfa) 904 { 905 int i, j, ch; 906 dfa->word_ops_used = 1; 907 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 908 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 909 if (isalnum (ch) || ch == '_') 910 dfa->word_char[i] |= (bitset_word) 1 << j; 911 } 912 913 /* Free the work area which are only used while compiling. */ 914 915 static void 916 free_workarea_compile (regex_t *preg) 917 { 918 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 919 bin_tree_storage_t *storage, *next; 920 for (storage = dfa->str_tree_storage; storage; storage = next) 921 { 922 next = storage->next; 923 re_free (storage); 924 } 925 dfa->str_tree_storage = NULL; 926 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 927 dfa->str_tree = NULL; 928 re_free (dfa->org_indices); 929 dfa->org_indices = NULL; 930 } 931 932 /* Create initial states for all contexts. */ 933 934 static reg_errcode_t 935 create_initial_state (re_dfa_t *dfa) 936 { 937 Idx first, i; 938 reg_errcode_t err; 939 re_node_set init_nodes; 940 941 /* Initial states have the epsilon closure of the node which is 942 the first node of the regular expression. */ 943 first = dfa->str_tree->first->node_idx; 944 dfa->init_node = first; 945 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 946 if (BE (err != REG_NOERROR, 0)) 947 return err; 948 949 /* The back-references which are in initial states can epsilon transit, 950 since in this case all of the subexpressions can be null. 951 Then we add epsilon closures of the nodes which are the next nodes of 952 the back-references. */ 953 if (dfa->nbackref > 0) 954 for (i = 0; i < init_nodes.nelem; ++i) 955 { 956 Idx node_idx = init_nodes.elems[i]; 957 re_token_type_t type = dfa->nodes[node_idx].type; 958 959 Idx clexp_idx; 960 if (type != OP_BACK_REF) 961 continue; 962 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 963 { 964 re_token_t *clexp_node; 965 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 966 if (clexp_node->type == OP_CLOSE_SUBEXP 967 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 968 break; 969 } 970 if (clexp_idx == init_nodes.nelem) 971 continue; 972 973 if (type == OP_BACK_REF) 974 { 975 Idx dest_idx = dfa->edests[node_idx].elems[0]; 976 if (!re_node_set_contains (&init_nodes, dest_idx)) 977 { 978 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 979 i = 0; 980 } 981 } 982 } 983 984 /* It must be the first time to invoke acquire_state. */ 985 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 986 /* We don't check ERR here, since the initial state must not be NULL. */ 987 if (BE (dfa->init_state == NULL, 0)) 988 return err; 989 if (dfa->init_state->has_constraint) 990 { 991 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 992 CONTEXT_WORD); 993 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 994 CONTEXT_NEWLINE); 995 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 996 &init_nodes, 997 CONTEXT_NEWLINE 998 | CONTEXT_BEGBUF); 999 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1000 || dfa->init_state_begbuf == NULL, 0)) 1001 return err; 1002 } 1003 else 1004 dfa->init_state_word = dfa->init_state_nl 1005 = dfa->init_state_begbuf = dfa->init_state; 1006 1007 re_node_set_free (&init_nodes); 1008 return REG_NOERROR; 1009 } 1010 1011 #ifdef RE_ENABLE_I18N 1012 /* If it is possible to do searching in single byte encoding instead of UTF-8 1013 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1014 DFA nodes where needed. */ 1015 1016 static void 1017 optimize_utf8 (re_dfa_t *dfa) 1018 { 1019 Idx node; 1020 int i; 1021 bool mb_chars = false; 1022 bool has_period = false; 1023 1024 for (node = 0; node < dfa->nodes_len; ++node) 1025 switch (dfa->nodes[node].type) 1026 { 1027 case CHARACTER: 1028 if (dfa->nodes[node].opr.c >= 0x80) 1029 mb_chars = true; 1030 break; 1031 case ANCHOR: 1032 switch (dfa->nodes[node].opr.idx) 1033 { 1034 case LINE_FIRST: 1035 case LINE_LAST: 1036 case BUF_FIRST: 1037 case BUF_LAST: 1038 break; 1039 default: 1040 /* Word anchors etc. cannot be handled. */ 1041 return; 1042 } 1043 break; 1044 case OP_PERIOD: 1045 has_period = true; 1046 break; 1047 case OP_BACK_REF: 1048 case OP_ALT: 1049 case END_OF_RE: 1050 case OP_DUP_ASTERISK: 1051 case OP_OPEN_SUBEXP: 1052 case OP_CLOSE_SUBEXP: 1053 break; 1054 case COMPLEX_BRACKET: 1055 return; 1056 case SIMPLE_BRACKET: 1057 /* Just double check. */ 1058 { 1059 int rshift = 1060 (SBC_MAX / 2 % BITSET_WORD_BITS == 0 1061 ? 0 1062 : BITSET_WORD_BITS - SBC_MAX / 2 % BITSET_WORD_BITS); 1063 for (i = SBC_MAX / 2 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1064 { 1065 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) 1066 return; 1067 rshift = 0; 1068 } 1069 } 1070 break; 1071 default: 1072 abort (); 1073 } 1074 1075 if (mb_chars || has_period) 1076 for (node = 0; node < dfa->nodes_len; ++node) 1077 { 1078 if (dfa->nodes[node].type == CHARACTER 1079 && dfa->nodes[node].opr.c >= 0x80) 1080 dfa->nodes[node].mb_partial = 0; 1081 else if (dfa->nodes[node].type == OP_PERIOD) 1082 dfa->nodes[node].type = OP_UTF8_PERIOD; 1083 } 1084 1085 /* The search can be in single byte locale. */ 1086 dfa->mb_cur_max = 1; 1087 dfa->is_utf8 = 0; 1088 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1089 } 1090 #endif 1091 1092 /* Analyze the structure tree, and calculate "first", "next", "edest", 1093 "eclosure", and "inveclosure". */ 1094 1095 static reg_errcode_t 1096 analyze (regex_t *preg) 1097 { 1098 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 1099 reg_errcode_t ret; 1100 1101 /* Allocate arrays. */ 1102 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc); 1103 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); 1104 dfa->edests = re_xmalloc (re_node_set, dfa->nodes_alloc); 1105 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1106 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1107 || dfa->eclosures == NULL, 0)) 1108 return REG_ESPACE; 1109 1110 dfa->subexp_map = re_xmalloc (Idx, preg->re_nsub); 1111 if (dfa->subexp_map != NULL) 1112 { 1113 Idx i; 1114 for (i = 0; i < preg->re_nsub; i++) 1115 dfa->subexp_map[i] = i; 1116 preorder (dfa->str_tree, optimize_subexps, dfa); 1117 for (i = 0; i < preg->re_nsub; i++) 1118 if (dfa->subexp_map[i] != i) 1119 break; 1120 if (i == preg->re_nsub) 1121 { 1122 free (dfa->subexp_map); 1123 dfa->subexp_map = NULL; 1124 } 1125 } 1126 1127 ret = postorder (dfa->str_tree, lower_subexps, preg); 1128 if (BE (ret != REG_NOERROR, 0)) 1129 return ret; 1130 ret = postorder (dfa->str_tree, calc_first, dfa); 1131 if (BE (ret != REG_NOERROR, 0)) 1132 return ret; 1133 preorder (dfa->str_tree, calc_next, dfa); 1134 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1135 if (BE (ret != REG_NOERROR, 0)) 1136 return ret; 1137 ret = calc_eclosure (dfa); 1138 if (BE (ret != REG_NOERROR, 0)) 1139 return ret; 1140 1141 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1142 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1143 if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1144 || dfa->nbackref) 1145 { 1146 dfa->inveclosures = re_xmalloc (re_node_set, dfa->nodes_len); 1147 if (BE (dfa->inveclosures == NULL, 0)) 1148 return REG_ESPACE; 1149 ret = calc_inveclosure (dfa); 1150 } 1151 1152 return ret; 1153 } 1154 1155 /* Our parse trees are very unbalanced, so we cannot use a stack to 1156 implement parse tree visits. Instead, we use parent pointers and 1157 some hairy code in these two functions. */ 1158 static reg_errcode_t 1159 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1160 void *extra) 1161 { 1162 bin_tree_t *node, *prev; 1163 1164 for (node = root; ; ) 1165 { 1166 /* Descend down the tree, preferably to the left (or to the right 1167 if that's the only child). */ 1168 while (node->left || node->right) 1169 if (node->left) 1170 node = node->left; 1171 else 1172 node = node->right; 1173 1174 do 1175 { 1176 reg_errcode_t err = fn (extra, node); 1177 if (BE (err != REG_NOERROR, 0)) 1178 return err; 1179 if (node->parent == NULL) 1180 return REG_NOERROR; 1181 prev = node; 1182 node = node->parent; 1183 } 1184 /* Go up while we have a node that is reached from the right. */ 1185 while (node->right == prev || node->right == NULL); 1186 node = node->right; 1187 } 1188 } 1189 1190 static reg_errcode_t 1191 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1192 void *extra) 1193 { 1194 bin_tree_t *node; 1195 1196 for (node = root; ; ) 1197 { 1198 reg_errcode_t err = fn (extra, node); 1199 if (BE (err != REG_NOERROR, 0)) 1200 return err; 1201 1202 /* Go to the left node, or up and to the right. */ 1203 if (node->left) 1204 node = node->left; 1205 else 1206 { 1207 bin_tree_t *prev = NULL; 1208 while (node->right == prev || node->right == NULL) 1209 { 1210 prev = node; 1211 node = node->parent; 1212 if (!node) 1213 return REG_NOERROR; 1214 } 1215 node = node->right; 1216 } 1217 } 1218 } 1219 1220 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1221 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1222 backreferences as well. Requires a preorder visit. */ 1223 static reg_errcode_t 1224 optimize_subexps (void *extra, bin_tree_t *node) 1225 { 1226 re_dfa_t *dfa = (re_dfa_t *) extra; 1227 1228 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1229 { 1230 int idx = node->token.opr.idx; 1231 node->token.opr.idx = dfa->subexp_map[idx]; 1232 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1233 } 1234 1235 else if (node->token.type == SUBEXP 1236 && node->left && node->left->token.type == SUBEXP) 1237 { 1238 Idx other_idx = node->left->token.opr.idx; 1239 1240 node->left = node->left->left; 1241 if (node->left) 1242 node->left->parent = node; 1243 1244 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1245 if (other_idx < BITSET_WORD_BITS) 1246 dfa->used_bkref_map &= ~ ((bitset_word) 1 << other_idx); 1247 } 1248 1249 return REG_NOERROR; 1250 } 1251 1252 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1253 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1254 static reg_errcode_t 1255 lower_subexps (void *extra, bin_tree_t *node) 1256 { 1257 regex_t *preg = (regex_t *) extra; 1258 reg_errcode_t err = REG_NOERROR; 1259 1260 if (node->left && node->left->token.type == SUBEXP) 1261 { 1262 node->left = lower_subexp (&err, preg, node->left); 1263 if (node->left) 1264 node->left->parent = node; 1265 } 1266 if (node->right && node->right->token.type == SUBEXP) 1267 { 1268 node->right = lower_subexp (&err, preg, node->right); 1269 if (node->right) 1270 node->right->parent = node; 1271 } 1272 1273 return err; 1274 } 1275 1276 static bin_tree_t * 1277 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1278 { 1279 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 1280 bin_tree_t *body = node->left; 1281 bin_tree_t *op, *cls, *tree1, *tree; 1282 1283 if (preg->re_no_sub 1284 /* We do not optimize empty subexpressions, because otherwise we may 1285 have bad CONCAT nodes with NULL children. This is obviously not 1286 very common, so we do not lose much. An example that triggers 1287 this case is the sed "script" /\(\)/x. */ 1288 && node->left != NULL 1289 && ! (node->token.opr.idx < BITSET_WORD_BITS 1290 && dfa->used_bkref_map & ((bitset_word) 1 << node->token.opr.idx))) 1291 return node->left; 1292 1293 /* Convert the SUBEXP node to the concatenation of an 1294 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1295 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1296 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1297 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1298 tree = create_tree (dfa, op, tree1, CONCAT); 1299 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1300 { 1301 *err = REG_ESPACE; 1302 return NULL; 1303 } 1304 1305 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1306 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1307 return tree; 1308 } 1309 1310 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1311 nodes. Requires a postorder visit. */ 1312 static reg_errcode_t 1313 calc_first (void *extra, bin_tree_t *node) 1314 { 1315 re_dfa_t *dfa = (re_dfa_t *) extra; 1316 if (node->token.type == CONCAT) 1317 { 1318 node->first = node->left->first; 1319 node->node_idx = node->left->node_idx; 1320 } 1321 else 1322 { 1323 node->first = node; 1324 node->node_idx = re_dfa_add_node (dfa, node->token); 1325 if (BE (node->node_idx == REG_MISSING, 0)) 1326 return REG_ESPACE; 1327 } 1328 return REG_NOERROR; 1329 } 1330 1331 /* Pass 2: compute NEXT on the tree. Preorder visit. */ 1332 static reg_errcode_t 1333 calc_next (void *extra, bin_tree_t *node) 1334 { 1335 switch (node->token.type) 1336 { 1337 case OP_DUP_ASTERISK: 1338 node->left->next = node; 1339 break; 1340 case CONCAT: 1341 node->left->next = node->right->first; 1342 node->right->next = node->next; 1343 break; 1344 default: 1345 if (node->left) 1346 node->left->next = node->next; 1347 if (node->right) 1348 node->right->next = node->next; 1349 break; 1350 } 1351 return REG_NOERROR; 1352 } 1353 1354 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1355 static reg_errcode_t 1356 link_nfa_nodes (void *extra, bin_tree_t *node) 1357 { 1358 re_dfa_t *dfa = (re_dfa_t *) extra; 1359 Idx idx = node->node_idx; 1360 reg_errcode_t err = REG_NOERROR; 1361 1362 switch (node->token.type) 1363 { 1364 case CONCAT: 1365 break; 1366 1367 case END_OF_RE: 1368 assert (node->next == NULL); 1369 break; 1370 1371 case OP_DUP_ASTERISK: 1372 case OP_ALT: 1373 { 1374 Idx left, right; 1375 dfa->has_plural_match = 1; 1376 if (node->left != NULL) 1377 left = node->left->first->node_idx; 1378 else 1379 left = node->next->node_idx; 1380 if (node->right != NULL) 1381 right = node->right->first->node_idx; 1382 else 1383 right = node->next->node_idx; 1384 assert (REG_VALID_INDEX (left)); 1385 assert (REG_VALID_INDEX (right)); 1386 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1387 } 1388 break; 1389 1390 case ANCHOR: 1391 case OP_OPEN_SUBEXP: 1392 case OP_CLOSE_SUBEXP: 1393 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1394 break; 1395 1396 case OP_BACK_REF: 1397 dfa->nexts[idx] = node->next->node_idx; 1398 if (node->token.type == OP_BACK_REF) 1399 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1400 break; 1401 1402 default: 1403 assert (!IS_EPSILON_NODE (node->token.type)); 1404 dfa->nexts[idx] = node->next->node_idx; 1405 break; 1406 } 1407 1408 return err; 1409 } 1410 1411 /* Duplicate the epsilon closure of the node ROOT_NODE. 1412 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1413 to their own constraint. */ 1414 1415 static reg_errcode_t 1416 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, 1417 Idx top_clone_node, Idx root_node, 1418 unsigned int init_constraint) 1419 { 1420 Idx org_node, clone_node; 1421 bool ok; 1422 unsigned int constraint = init_constraint; 1423 for (org_node = top_org_node, clone_node = top_clone_node;;) 1424 { 1425 Idx org_dest, clone_dest; 1426 if (dfa->nodes[org_node].type == OP_BACK_REF) 1427 { 1428 /* If the back reference epsilon-transit, its destination must 1429 also have the constraint. Then duplicate the epsilon closure 1430 of the destination of the back reference, and store it in 1431 edests of the back reference. */ 1432 org_dest = dfa->nexts[org_node]; 1433 re_node_set_empty (dfa->edests + clone_node); 1434 clone_dest = duplicate_node (dfa, org_dest, constraint); 1435 if (BE (clone_dest == REG_MISSING, 0)) 1436 return REG_ESPACE; 1437 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1438 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1439 if (BE (! ok, 0)) 1440 return REG_ESPACE; 1441 } 1442 else if (dfa->edests[org_node].nelem == 0) 1443 { 1444 /* In case of the node can't epsilon-transit, don't duplicate the 1445 destination and store the original destination as the 1446 destination of the node. */ 1447 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1448 break; 1449 } 1450 else if (dfa->edests[org_node].nelem == 1) 1451 { 1452 /* In case of the node can epsilon-transit, and it has only one 1453 destination. */ 1454 org_dest = dfa->edests[org_node].elems[0]; 1455 re_node_set_empty (dfa->edests + clone_node); 1456 if (dfa->nodes[org_node].type == ANCHOR) 1457 { 1458 /* In case of the node has another constraint, append it. */ 1459 if (org_node == root_node && clone_node != org_node) 1460 { 1461 /* ...but if the node is root_node itself, it means the 1462 epsilon closure have a loop, then tie it to the 1463 destination of the root_node. */ 1464 ok = re_node_set_insert (dfa->edests + clone_node, 1465 org_dest); 1466 if (BE (! ok, 0)) 1467 return REG_ESPACE; 1468 break; 1469 } 1470 constraint |= dfa->nodes[org_node].opr.ctx_type; 1471 } 1472 clone_dest = duplicate_node (dfa, org_dest, constraint); 1473 if (BE (clone_dest == REG_MISSING, 0)) 1474 return REG_ESPACE; 1475 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1476 if (BE (! ok, 0)) 1477 return REG_ESPACE; 1478 } 1479 else /* dfa->edests[org_node].nelem == 2 */ 1480 { 1481 /* In case of the node can epsilon-transit, and it has two 1482 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1483 org_dest = dfa->edests[org_node].elems[0]; 1484 re_node_set_empty (dfa->edests + clone_node); 1485 /* Search for a duplicated node which satisfies the constraint. */ 1486 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1487 if (clone_dest == REG_MISSING) 1488 { 1489 /* There are no such a duplicated node, create a new one. */ 1490 reg_errcode_t err; 1491 clone_dest = duplicate_node (dfa, org_dest, constraint); 1492 if (BE (clone_dest == REG_MISSING, 0)) 1493 return REG_ESPACE; 1494 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1495 if (BE (! ok, 0)) 1496 return REG_ESPACE; 1497 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1498 root_node, constraint); 1499 if (BE (err != REG_NOERROR, 0)) 1500 return err; 1501 } 1502 else 1503 { 1504 /* There are a duplicated node which satisfy the constraint, 1505 use it to avoid infinite loop. */ 1506 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1507 if (BE (! ok, 0)) 1508 return REG_ESPACE; 1509 } 1510 1511 org_dest = dfa->edests[org_node].elems[1]; 1512 clone_dest = duplicate_node (dfa, org_dest, constraint); 1513 if (BE (clone_dest == REG_MISSING, 0)) 1514 return REG_ESPACE; 1515 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1516 if (BE (! ok, 0)) 1517 return REG_ESPACE; 1518 } 1519 org_node = org_dest; 1520 clone_node = clone_dest; 1521 } 1522 return REG_NOERROR; 1523 } 1524 1525 /* Search for a node which is duplicated from the node ORG_NODE, and 1526 satisfies the constraint CONSTRAINT. */ 1527 1528 static Idx 1529 search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 1530 unsigned int constraint) 1531 { 1532 Idx idx; 1533 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1534 { 1535 if (org_node == dfa->org_indices[idx] 1536 && constraint == dfa->nodes[idx].constraint) 1537 return idx; /* Found. */ 1538 } 1539 return REG_MISSING; /* Not found. */ 1540 } 1541 1542 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1543 Return the index of the new node, or REG_MISSING if insufficient storage is 1544 available. */ 1545 1546 static Idx 1547 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) 1548 { 1549 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1550 if (BE (dup_idx != REG_MISSING, 1)) 1551 { 1552 dfa->nodes[dup_idx].constraint = constraint; 1553 if (dfa->nodes[org_idx].type == ANCHOR) 1554 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type; 1555 dfa->nodes[dup_idx].duplicated = 1; 1556 1557 /* Store the index of the original node. */ 1558 dfa->org_indices[dup_idx] = org_idx; 1559 } 1560 return dup_idx; 1561 } 1562 1563 static reg_errcode_t 1564 calc_inveclosure (re_dfa_t *dfa) 1565 { 1566 Idx src, idx; 1567 bool ok; 1568 for (idx = 0; idx < dfa->nodes_len; ++idx) 1569 re_node_set_init_empty (dfa->inveclosures + idx); 1570 1571 for (src = 0; src < dfa->nodes_len; ++src) 1572 { 1573 Idx *elems = dfa->eclosures[src].elems; 1574 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1575 { 1576 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1577 if (BE (! ok, 0)) 1578 return REG_ESPACE; 1579 } 1580 } 1581 1582 return REG_NOERROR; 1583 } 1584 1585 /* Calculate "eclosure" for all the node in DFA. */ 1586 1587 static reg_errcode_t 1588 calc_eclosure (re_dfa_t *dfa) 1589 { 1590 Idx node_idx; 1591 bool incomplete; 1592 #ifdef DEBUG 1593 assert (dfa->nodes_len > 0); 1594 #endif 1595 incomplete = false; 1596 /* For each nodes, calculate epsilon closure. */ 1597 for (node_idx = 0; ; ++node_idx) 1598 { 1599 reg_errcode_t err; 1600 re_node_set eclosure_elem; 1601 if (node_idx == dfa->nodes_len) 1602 { 1603 if (!incomplete) 1604 break; 1605 incomplete = false; 1606 node_idx = 0; 1607 } 1608 1609 #ifdef DEBUG 1610 assert (dfa->eclosures[node_idx].nelem != REG_MISSING); 1611 #endif 1612 1613 /* If we have already calculated, skip it. */ 1614 if (dfa->eclosures[node_idx].nelem != 0) 1615 continue; 1616 /* Calculate epsilon closure of `node_idx'. */ 1617 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); 1618 if (BE (err != REG_NOERROR, 0)) 1619 return err; 1620 1621 if (dfa->eclosures[node_idx].nelem == 0) 1622 { 1623 incomplete = true; 1624 re_node_set_free (&eclosure_elem); 1625 } 1626 } 1627 return REG_NOERROR; 1628 } 1629 1630 /* Calculate epsilon closure of NODE. */ 1631 1632 static reg_errcode_t 1633 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) 1634 { 1635 reg_errcode_t err; 1636 unsigned int constraint; 1637 Idx i; 1638 bool incomplete; 1639 bool ok; 1640 re_node_set eclosure; 1641 incomplete = false; 1642 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1643 if (BE (err != REG_NOERROR, 0)) 1644 return err; 1645 1646 /* This indicates that we are calculating this node now. 1647 We reference this value to avoid infinite loop. */ 1648 dfa->eclosures[node].nelem = REG_MISSING; 1649 1650 constraint = ((dfa->nodes[node].type == ANCHOR) 1651 ? dfa->nodes[node].opr.ctx_type : 0); 1652 /* If the current node has constraints, duplicate all nodes. 1653 Since they must inherit the constraints. */ 1654 if (constraint 1655 && dfa->edests[node].nelem 1656 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1657 { 1658 Idx org_node, cur_node; 1659 org_node = cur_node = node; 1660 err = duplicate_node_closure (dfa, node, node, node, constraint); 1661 if (BE (err != REG_NOERROR, 0)) 1662 return err; 1663 } 1664 1665 /* Expand each epsilon destination nodes. */ 1666 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1667 for (i = 0; i < dfa->edests[node].nelem; ++i) 1668 { 1669 re_node_set eclosure_elem; 1670 Idx edest = dfa->edests[node].elems[i]; 1671 /* If calculating the epsilon closure of `edest' is in progress, 1672 return intermediate result. */ 1673 if (dfa->eclosures[edest].nelem == REG_MISSING) 1674 { 1675 incomplete = true; 1676 continue; 1677 } 1678 /* If we haven't calculated the epsilon closure of `edest' yet, 1679 calculate now. Otherwise use calculated epsilon closure. */ 1680 if (dfa->eclosures[edest].nelem == 0) 1681 { 1682 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false); 1683 if (BE (err != REG_NOERROR, 0)) 1684 return err; 1685 } 1686 else 1687 eclosure_elem = dfa->eclosures[edest]; 1688 /* Merge the epsilon closure of `edest'. */ 1689 re_node_set_merge (&eclosure, &eclosure_elem); 1690 /* If the epsilon closure of `edest' is incomplete, 1691 the epsilon closure of this node is also incomplete. */ 1692 if (dfa->eclosures[edest].nelem == 0) 1693 { 1694 incomplete = true; 1695 re_node_set_free (&eclosure_elem); 1696 } 1697 } 1698 1699 /* Epsilon closures include itself. */ 1700 ok = re_node_set_insert (&eclosure, node); 1701 if (BE (! ok, 0)) 1702 return REG_ESPACE; 1703 if (incomplete && !root) 1704 dfa->eclosures[node].nelem = 0; 1705 else 1706 dfa->eclosures[node] = eclosure; 1707 *new_set = eclosure; 1708 return REG_NOERROR; 1709 } 1710 1711 /* Functions for token which are used in the parser. */ 1712 1713 /* Fetch a token from INPUT. 1714 We must not use this function inside bracket expressions. */ 1715 1716 static void 1717 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1718 { 1719 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1720 } 1721 1722 /* Peek a token from INPUT, and return the length of the token. 1723 We must not use this function inside bracket expressions. */ 1724 1725 static int 1726 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1727 { 1728 unsigned char c; 1729 1730 if (re_string_eoi (input)) 1731 { 1732 token->type = END_OF_RE; 1733 return 0; 1734 } 1735 1736 c = re_string_peek_byte (input, 0); 1737 token->opr.c = c; 1738 1739 token->word_char = 0; 1740 #ifdef RE_ENABLE_I18N 1741 token->mb_partial = 0; 1742 if (input->mb_cur_max > 1 && 1743 !re_string_first_byte (input, re_string_cur_idx (input))) 1744 { 1745 token->type = CHARACTER; 1746 token->mb_partial = 1; 1747 return 1; 1748 } 1749 #endif 1750 if (c == '\\') 1751 { 1752 unsigned char c2; 1753 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1754 { 1755 token->type = BACK_SLASH; 1756 return 1; 1757 } 1758 1759 c2 = re_string_peek_byte_case (input, 1); 1760 token->opr.c = c2; 1761 token->type = CHARACTER; 1762 #ifdef RE_ENABLE_I18N 1763 if (input->mb_cur_max > 1) 1764 { 1765 wint_t wc = re_string_wchar_at (input, 1766 re_string_cur_idx (input) + 1); 1767 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1768 } 1769 else 1770 #endif 1771 token->word_char = IS_WORD_CHAR (c2) != 0; 1772 1773 switch (c2) 1774 { 1775 case '|': 1776 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR)) 1777 token->type = OP_ALT; 1778 break; 1779 case '1': case '2': case '3': case '4': case '5': 1780 case '6': case '7': case '8': case '9': 1781 if (!(syntax & REG_NO_BK_REFS)) 1782 { 1783 token->type = OP_BACK_REF; 1784 token->opr.idx = c2 - '1'; 1785 } 1786 break; 1787 case '<': 1788 if (!(syntax & REG_NO_GNU_OPS)) 1789 { 1790 token->type = ANCHOR; 1791 token->opr.ctx_type = WORD_FIRST; 1792 } 1793 break; 1794 case '>': 1795 if (!(syntax & REG_NO_GNU_OPS)) 1796 { 1797 token->type = ANCHOR; 1798 token->opr.ctx_type = WORD_LAST; 1799 } 1800 break; 1801 case 'b': 1802 if (!(syntax & REG_NO_GNU_OPS)) 1803 { 1804 token->type = ANCHOR; 1805 token->opr.ctx_type = WORD_DELIM; 1806 } 1807 break; 1808 case 'B': 1809 if (!(syntax & REG_NO_GNU_OPS)) 1810 { 1811 token->type = ANCHOR; 1812 token->opr.ctx_type = NOT_WORD_DELIM; 1813 } 1814 break; 1815 case 'w': 1816 if (!(syntax & REG_NO_GNU_OPS)) 1817 token->type = OP_WORD; 1818 break; 1819 case 'W': 1820 if (!(syntax & REG_NO_GNU_OPS)) 1821 token->type = OP_NOTWORD; 1822 break; 1823 case 's': 1824 if (!(syntax & REG_NO_GNU_OPS)) 1825 token->type = OP_SPACE; 1826 break; 1827 case 'S': 1828 if (!(syntax & REG_NO_GNU_OPS)) 1829 token->type = OP_NOTSPACE; 1830 break; 1831 case '`': 1832 if (!(syntax & REG_NO_GNU_OPS)) 1833 { 1834 token->type = ANCHOR; 1835 token->opr.ctx_type = BUF_FIRST; 1836 } 1837 break; 1838 case '\'': 1839 if (!(syntax & REG_NO_GNU_OPS)) 1840 { 1841 token->type = ANCHOR; 1842 token->opr.ctx_type = BUF_LAST; 1843 } 1844 break; 1845 case '(': 1846 if (!(syntax & REG_NO_BK_PARENS)) 1847 token->type = OP_OPEN_SUBEXP; 1848 break; 1849 case ')': 1850 if (!(syntax & REG_NO_BK_PARENS)) 1851 token->type = OP_CLOSE_SUBEXP; 1852 break; 1853 case '+': 1854 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM)) 1855 token->type = OP_DUP_PLUS; 1856 break; 1857 case '?': 1858 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM)) 1859 token->type = OP_DUP_QUESTION; 1860 break; 1861 case '{': 1862 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES))) 1863 token->type = OP_OPEN_DUP_NUM; 1864 break; 1865 case '}': 1866 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES))) 1867 token->type = OP_CLOSE_DUP_NUM; 1868 break; 1869 default: 1870 break; 1871 } 1872 return 2; 1873 } 1874 1875 token->type = CHARACTER; 1876 #ifdef RE_ENABLE_I18N 1877 if (input->mb_cur_max > 1) 1878 { 1879 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1880 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1881 } 1882 else 1883 #endif 1884 token->word_char = IS_WORD_CHAR (token->opr.c); 1885 1886 switch (c) 1887 { 1888 case '\n': 1889 if (syntax & REG_NEWLINE_ALT) 1890 token->type = OP_ALT; 1891 break; 1892 case '|': 1893 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR)) 1894 token->type = OP_ALT; 1895 break; 1896 case '*': 1897 token->type = OP_DUP_ASTERISK; 1898 break; 1899 case '+': 1900 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM)) 1901 token->type = OP_DUP_PLUS; 1902 break; 1903 case '?': 1904 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM)) 1905 token->type = OP_DUP_QUESTION; 1906 break; 1907 case '{': 1908 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES)) 1909 token->type = OP_OPEN_DUP_NUM; 1910 break; 1911 case '}': 1912 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES)) 1913 token->type = OP_CLOSE_DUP_NUM; 1914 break; 1915 case '(': 1916 if (syntax & REG_NO_BK_PARENS) 1917 token->type = OP_OPEN_SUBEXP; 1918 break; 1919 case ')': 1920 if (syntax & REG_NO_BK_PARENS) 1921 token->type = OP_CLOSE_SUBEXP; 1922 break; 1923 case '[': 1924 token->type = OP_OPEN_BRACKET; 1925 break; 1926 case '.': 1927 token->type = OP_PERIOD; 1928 break; 1929 case '^': 1930 if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) && 1931 re_string_cur_idx (input) != 0) 1932 { 1933 char prev = re_string_peek_byte (input, -1); 1934 if (!(syntax & REG_NEWLINE_ALT) || prev != '\n') 1935 break; 1936 } 1937 token->type = ANCHOR; 1938 token->opr.ctx_type = LINE_FIRST; 1939 break; 1940 case '$': 1941 if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) && 1942 re_string_cur_idx (input) + 1 != re_string_length (input)) 1943 { 1944 re_token_t next; 1945 re_string_skip_bytes (input, 1); 1946 peek_token (&next, input, syntax); 1947 re_string_skip_bytes (input, -1); 1948 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 1949 break; 1950 } 1951 token->type = ANCHOR; 1952 token->opr.ctx_type = LINE_LAST; 1953 break; 1954 default: 1955 break; 1956 } 1957 return 1; 1958 } 1959 1960 /* Peek a token from INPUT, and return the length of the token. 1961 We must not use this function out of bracket expressions. */ 1962 1963 static int 1964 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1965 { 1966 unsigned char c; 1967 if (re_string_eoi (input)) 1968 { 1969 token->type = END_OF_RE; 1970 return 0; 1971 } 1972 c = re_string_peek_byte (input, 0); 1973 token->opr.c = c; 1974 1975 #ifdef RE_ENABLE_I18N 1976 if (input->mb_cur_max > 1 && 1977 !re_string_first_byte (input, re_string_cur_idx (input))) 1978 { 1979 token->type = CHARACTER; 1980 return 1; 1981 } 1982 #endif /* RE_ENABLE_I18N */ 1983 1984 if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS) 1985 && re_string_cur_idx (input) + 1 < re_string_length (input)) 1986 { 1987 /* In this case, '\' escape a character. */ 1988 unsigned char c2; 1989 re_string_skip_bytes (input, 1); 1990 c2 = re_string_peek_byte (input, 0); 1991 token->opr.c = c2; 1992 token->type = CHARACTER; 1993 return 1; 1994 } 1995 if (c == '[') /* '[' is a special char in a bracket exps. */ 1996 { 1997 unsigned char c2; 1998 int token_len; 1999 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2000 c2 = re_string_peek_byte (input, 1); 2001 else 2002 c2 = 0; 2003 token->opr.c = c2; 2004 token_len = 2; 2005 switch (c2) 2006 { 2007 case '.': 2008 token->type = OP_OPEN_COLL_ELEM; 2009 break; 2010 case '=': 2011 token->type = OP_OPEN_EQUIV_CLASS; 2012 break; 2013 case ':': 2014 if (syntax & REG_CHAR_CLASSES) 2015 { 2016 token->type = OP_OPEN_CHAR_CLASS; 2017 break; 2018 } 2019 /* else fall through. */ 2020 default: 2021 token->type = CHARACTER; 2022 token->opr.c = c; 2023 token_len = 1; 2024 break; 2025 } 2026 return token_len; 2027 } 2028 switch (c) 2029 { 2030 case '-': 2031 token->type = OP_CHARSET_RANGE; 2032 break; 2033 case ']': 2034 token->type = OP_CLOSE_BRACKET; 2035 break; 2036 case '^': 2037 token->type = OP_NON_MATCH_LIST; 2038 break; 2039 default: 2040 token->type = CHARACTER; 2041 } 2042 return 1; 2043 } 2044 2045 /* Functions for parser. */ 2046 2047 /* Entry point of the parser. 2048 Parse the regular expression REGEXP and return the structure tree. 2049 If an error is occured, ERR is set by error code, and return NULL. 2050 This function build the following tree, from regular expression <reg_exp>: 2051 CAT 2052 / \ 2053 / \ 2054 <reg_exp> EOR 2055 2056 CAT means concatenation. 2057 EOR means end of regular expression. */ 2058 2059 static bin_tree_t * 2060 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2061 reg_errcode_t *err) 2062 { 2063 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 2064 bin_tree_t *tree, *eor, *root; 2065 re_token_t current_token; 2066 dfa->syntax = syntax; 2067 fetch_token (¤t_token, regexp, syntax | REG_CARET_ANCHORS_HERE); 2068 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2069 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2070 return NULL; 2071 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2072 if (tree != NULL) 2073 root = create_tree (dfa, tree, eor, CONCAT); 2074 else 2075 root = eor; 2076 if (BE (eor == NULL || root == NULL, 0)) 2077 { 2078 *err = REG_ESPACE; 2079 return NULL; 2080 } 2081 return root; 2082 } 2083 2084 /* This function build the following tree, from regular expression 2085 <branch1>|<branch2>: 2086 ALT 2087 / \ 2088 / \ 2089 <branch1> <branch2> 2090 2091 ALT means alternative, which represents the operator `|'. */ 2092 2093 static bin_tree_t * 2094 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2095 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2096 { 2097 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 2098 bin_tree_t *tree, *branch = NULL; 2099 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2100 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2101 return NULL; 2102 2103 while (token->type == OP_ALT) 2104 { 2105 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE); 2106 if (token->type != OP_ALT && token->type != END_OF_RE 2107 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2108 { 2109 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2110 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2111 return NULL; 2112 } 2113 else 2114 branch = NULL; 2115 tree = create_tree (dfa, tree, branch, OP_ALT); 2116 if (BE (tree == NULL, 0)) 2117 { 2118 *err = REG_ESPACE; 2119 return NULL; 2120 } 2121 } 2122 return tree; 2123 } 2124 2125 /* This function build the following tree, from regular expression 2126 <exp1><exp2>: 2127 CAT 2128 / \ 2129 / \ 2130 <exp1> <exp2> 2131 2132 CAT means concatenation. */ 2133 2134 static bin_tree_t * 2135 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2136 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2137 { 2138 bin_tree_t *tree, *exp; 2139 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 2140 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2141 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2142 return NULL; 2143 2144 while (token->type != OP_ALT && token->type != END_OF_RE 2145 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2146 { 2147 exp = parse_expression (regexp, preg, token, syntax, nest, err); 2148 if (BE (*err != REG_NOERROR && exp == NULL, 0)) 2149 { 2150 return NULL; 2151 } 2152 if (tree != NULL && exp != NULL) 2153 { 2154 tree = create_tree (dfa, tree, exp, CONCAT); 2155 if (tree == NULL) 2156 { 2157 *err = REG_ESPACE; 2158 return NULL; 2159 } 2160 } 2161 else if (tree == NULL) 2162 tree = exp; 2163 /* Otherwise exp == NULL, we don't need to create new tree. */ 2164 } 2165 return tree; 2166 } 2167 2168 /* This function build the following tree, from regular expression a*: 2169 * 2170 | 2171 a 2172 */ 2173 2174 static bin_tree_t * 2175 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2176 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2177 { 2178 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 2179 bin_tree_t *tree; 2180 switch (token->type) 2181 { 2182 case CHARACTER: 2183 tree = create_token_tree (dfa, NULL, NULL, token); 2184 if (BE (tree == NULL, 0)) 2185 { 2186 *err = REG_ESPACE; 2187 return NULL; 2188 } 2189 #ifdef RE_ENABLE_I18N 2190 if (dfa->mb_cur_max > 1) 2191 { 2192 while (!re_string_eoi (regexp) 2193 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2194 { 2195 bin_tree_t *mbc_remain; 2196 fetch_token (token, regexp, syntax); 2197 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2198 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2199 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2200 { 2201 *err = REG_ESPACE; 2202 return NULL; 2203 } 2204 } 2205 } 2206 #endif 2207 break; 2208 case OP_OPEN_SUBEXP: 2209 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2210 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2211 return NULL; 2212 break; 2213 case OP_OPEN_BRACKET: 2214 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2215 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2216 return NULL; 2217 break; 2218 case OP_BACK_REF: 2219 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2220 { 2221 *err = REG_ESUBREG; 2222 return NULL; 2223 } 2224 dfa->used_bkref_map |= 1 << token->opr.idx; 2225 tree = create_token_tree (dfa, NULL, NULL, token); 2226 if (BE (tree == NULL, 0)) 2227 { 2228 *err = REG_ESPACE; 2229 return NULL; 2230 } 2231 ++dfa->nbackref; 2232 dfa->has_mb_node = 1; 2233 break; 2234 case OP_OPEN_DUP_NUM: 2235 if (syntax & REG_CONTEXT_INVALID_DUP) 2236 { 2237 *err = REG_BADRPT; 2238 return NULL; 2239 } 2240 /* FALLTHROUGH */ 2241 case OP_DUP_ASTERISK: 2242 case OP_DUP_PLUS: 2243 case OP_DUP_QUESTION: 2244 if (syntax & REG_CONTEXT_INVALID_OPS) 2245 { 2246 *err = REG_BADRPT; 2247 return NULL; 2248 } 2249 else if (syntax & REG_CONTEXT_INDEP_OPS) 2250 { 2251 fetch_token (token, regexp, syntax); 2252 return parse_expression (regexp, preg, token, syntax, nest, err); 2253 } 2254 /* else fall through */ 2255 case OP_CLOSE_SUBEXP: 2256 if ((token->type == OP_CLOSE_SUBEXP) && 2257 !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD)) 2258 { 2259 *err = REG_ERPAREN; 2260 return NULL; 2261 } 2262 /* else fall through */ 2263 case OP_CLOSE_DUP_NUM: 2264 /* We treat it as a normal character. */ 2265 2266 /* Then we can these characters as normal characters. */ 2267 token->type = CHARACTER; 2268 /* mb_partial and word_char bits should be initialized already 2269 by peek_token. */ 2270 tree = create_token_tree (dfa, NULL, NULL, token); 2271 if (BE (tree == NULL, 0)) 2272 { 2273 *err = REG_ESPACE; 2274 return NULL; 2275 } 2276 break; 2277 case ANCHOR: 2278 if ((token->opr.ctx_type 2279 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2280 && dfa->word_ops_used == 0) 2281 init_word_char (dfa); 2282 if (token->opr.ctx_type == WORD_DELIM 2283 || token->opr.ctx_type == NOT_WORD_DELIM) 2284 { 2285 bin_tree_t *tree_first, *tree_last; 2286 if (token->opr.ctx_type == WORD_DELIM) 2287 { 2288 token->opr.ctx_type = WORD_FIRST; 2289 tree_first = create_token_tree (dfa, NULL, NULL, token); 2290 token->opr.ctx_type = WORD_LAST; 2291 } 2292 else 2293 { 2294 token->opr.ctx_type = INSIDE_WORD; 2295 tree_first = create_token_tree (dfa, NULL, NULL, token); 2296 token->opr.ctx_type = INSIDE_NOTWORD; 2297 } 2298 tree_last = create_token_tree (dfa, NULL, NULL, token); 2299 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2300 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2301 { 2302 *err = REG_ESPACE; 2303 return NULL; 2304 } 2305 } 2306 else 2307 { 2308 tree = create_token_tree (dfa, NULL, NULL, token); 2309 if (BE (tree == NULL, 0)) 2310 { 2311 *err = REG_ESPACE; 2312 return NULL; 2313 } 2314 } 2315 /* We must return here, since ANCHORs can't be followed 2316 by repetition operators. 2317 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2318 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2319 fetch_token (token, regexp, syntax); 2320 return tree; 2321 case OP_PERIOD: 2322 tree = create_token_tree (dfa, NULL, NULL, token); 2323 if (BE (tree == NULL, 0)) 2324 { 2325 *err = REG_ESPACE; 2326 return NULL; 2327 } 2328 if (dfa->mb_cur_max > 1) 2329 dfa->has_mb_node = 1; 2330 break; 2331 case OP_WORD: 2332 case OP_NOTWORD: 2333 tree = build_charclass_op (dfa, regexp->trans, 2334 (const unsigned char *) "alnum", 2335 (const unsigned char *) "_", 2336 token->type == OP_NOTWORD, err); 2337 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2338 return NULL; 2339 break; 2340 case OP_SPACE: 2341 case OP_NOTSPACE: 2342 tree = build_charclass_op (dfa, regexp->trans, 2343 (const unsigned char *) "space", 2344 (const unsigned char *) "", 2345 token->type == OP_NOTSPACE, err); 2346 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2347 return NULL; 2348 break; 2349 case OP_ALT: 2350 case END_OF_RE: 2351 return NULL; 2352 case BACK_SLASH: 2353 *err = REG_EESCAPE; 2354 return NULL; 2355 default: 2356 /* Must not happen? */ 2357 #ifdef DEBUG 2358 assert (0); 2359 #endif 2360 return NULL; 2361 } 2362 fetch_token (token, regexp, syntax); 2363 2364 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2365 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2366 { 2367 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2368 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2369 return NULL; 2370 /* In BRE consecutive duplications are not allowed. */ 2371 if ((syntax & REG_CONTEXT_INVALID_DUP) 2372 && (token->type == OP_DUP_ASTERISK 2373 || token->type == OP_OPEN_DUP_NUM)) 2374 { 2375 *err = REG_BADRPT; 2376 return NULL; 2377 } 2378 } 2379 2380 return tree; 2381 } 2382 2383 /* This function build the following tree, from regular expression 2384 (<reg_exp>): 2385 SUBEXP 2386 | 2387 <reg_exp> 2388 */ 2389 2390 static bin_tree_t * 2391 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2392 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2393 { 2394 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 2395 bin_tree_t *tree; 2396 size_t cur_nsub; 2397 cur_nsub = preg->re_nsub++; 2398 2399 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE); 2400 2401 /* The subexpression may be a null string. */ 2402 if (token->type == OP_CLOSE_SUBEXP) 2403 tree = NULL; 2404 else 2405 { 2406 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2407 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2408 *err = REG_EPAREN; 2409 if (BE (*err != REG_NOERROR, 0)) 2410 return NULL; 2411 } 2412 2413 if (cur_nsub <= '9' - '1') 2414 dfa->completed_bkref_map |= 1 << cur_nsub; 2415 2416 tree = create_tree (dfa, tree, NULL, SUBEXP); 2417 if (BE (tree == NULL, 0)) 2418 { 2419 *err = REG_ESPACE; 2420 return NULL; 2421 } 2422 tree->token.opr.idx = cur_nsub; 2423 return tree; 2424 } 2425 2426 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2427 2428 static bin_tree_t * 2429 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2430 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2431 { 2432 bin_tree_t *tree = NULL, *old_tree = NULL; 2433 Idx i, start, end, start_idx = re_string_cur_idx (regexp); 2434 re_token_t start_token = *token; 2435 2436 if (token->type == OP_OPEN_DUP_NUM) 2437 { 2438 end = 0; 2439 start = fetch_number (regexp, token, syntax); 2440 if (start == REG_MISSING) 2441 { 2442 if (token->type == CHARACTER && token->opr.c == ',') 2443 start = 0; /* We treat "{,m}" as "{0,m}". */ 2444 else 2445 { 2446 *err = REG_BADBR; /* <re>{} is invalid. */ 2447 return NULL; 2448 } 2449 } 2450 if (BE (start != REG_ERROR, 1)) 2451 { 2452 /* We treat "{n}" as "{n,n}". */ 2453 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2454 : ((token->type == CHARACTER && token->opr.c == ',') 2455 ? fetch_number (regexp, token, syntax) : REG_ERROR)); 2456 } 2457 if (BE (start == REG_ERROR || end == REG_ERROR, 0)) 2458 { 2459 /* Invalid sequence. */ 2460 if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0)) 2461 { 2462 if (token->type == END_OF_RE) 2463 *err = REG_EBRACE; 2464 else 2465 *err = REG_BADBR; 2466 2467 return NULL; 2468 } 2469 2470 /* If the syntax bit is set, rollback. */ 2471 re_string_set_index (regexp, start_idx); 2472 *token = start_token; 2473 token->type = CHARACTER; 2474 /* mb_partial and word_char bits should be already initialized by 2475 peek_token. */ 2476 return elem; 2477 } 2478 2479 if (BE (end != REG_MISSING && start > end, 0)) 2480 { 2481 /* First number greater than second. */ 2482 *err = REG_BADBR; 2483 return NULL; 2484 } 2485 } 2486 else 2487 { 2488 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2489 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING; 2490 } 2491 2492 fetch_token (token, regexp, syntax); 2493 2494 if (BE (elem == NULL, 0)) 2495 return NULL; 2496 if (BE (start == 0 && end == 0, 0)) 2497 { 2498 postorder (elem, free_tree, NULL); 2499 return NULL; 2500 } 2501 2502 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2503 if (BE (start > 0, 0)) 2504 { 2505 tree = elem; 2506 for (i = 2; i <= start; ++i) 2507 { 2508 elem = duplicate_tree (elem, dfa); 2509 tree = create_tree (dfa, tree, elem, CONCAT); 2510 if (BE (elem == NULL || tree == NULL, 0)) 2511 goto parse_dup_op_espace; 2512 } 2513 2514 if (start == end) 2515 return tree; 2516 2517 /* Duplicate ELEM before it is marked optional. */ 2518 elem = duplicate_tree (elem, dfa); 2519 old_tree = tree; 2520 } 2521 else 2522 old_tree = NULL; 2523 2524 if (elem->token.type == SUBEXP) 2525 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); 2526 2527 tree = create_tree (dfa, elem, NULL, 2528 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); 2529 if (BE (tree == NULL, 0)) 2530 goto parse_dup_op_espace; 2531 2532 /* This loop is actually executed only when end != REG_MISSING, 2533 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2534 already created the start+1-th copy. */ 2535 if ((Idx) -1 < 0 || end != REG_MISSING) 2536 for (i = start + 2; i <= end; ++i) 2537 { 2538 elem = duplicate_tree (elem, dfa); 2539 tree = create_tree (dfa, tree, elem, CONCAT); 2540 if (BE (elem == NULL || tree == NULL, 0)) 2541 goto parse_dup_op_espace; 2542 2543 tree = create_tree (dfa, tree, NULL, OP_ALT); 2544 if (BE (tree == NULL, 0)) 2545 goto parse_dup_op_espace; 2546 } 2547 2548 if (old_tree) 2549 tree = create_tree (dfa, old_tree, tree, CONCAT); 2550 2551 return tree; 2552 2553 parse_dup_op_espace: 2554 *err = REG_ESPACE; 2555 return NULL; 2556 } 2557 2558 /* Size of the names for collating symbol/equivalence_class/character_class. 2559 I'm not sure, but maybe enough. */ 2560 #define BRACKET_NAME_BUF_SIZE 32 2561 2562 #ifndef _LIBC 2563 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2564 Build the range expression which starts from START_ELEM, and ends 2565 at END_ELEM. The result are written to MBCSET and SBCSET. 2566 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2567 mbcset->range_ends, is a pointer argument sinse we may 2568 update it. */ 2569 2570 static reg_errcode_t 2571 build_range_exp (bitset sbcset, 2572 # ifdef RE_ENABLE_I18N 2573 re_charset_t *mbcset, Idx *range_alloc, 2574 # endif 2575 bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2576 { 2577 unsigned int start_ch, end_ch; 2578 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2579 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2580 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2581 0)) 2582 return REG_ERANGE; 2583 2584 /* We can handle no multi character collating elements without libc 2585 support. */ 2586 if (BE ((start_elem->type == COLL_SYM 2587 && strlen ((char *) start_elem->opr.name) > 1) 2588 || (end_elem->type == COLL_SYM 2589 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2590 return REG_ECOLLATE; 2591 2592 # ifdef RE_ENABLE_I18N 2593 { 2594 wchar_t wc; 2595 wint_t start_wc, end_wc; 2596 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2597 2598 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2599 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2600 : 0)); 2601 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2602 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2603 : 0)); 2604 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2605 ? __btowc (start_ch) : start_elem->opr.wch); 2606 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2607 ? __btowc (end_ch) : end_elem->opr.wch); 2608 if (start_wc == WEOF || end_wc == WEOF) 2609 return REG_ECOLLATE; 2610 cmp_buf[0] = start_wc; 2611 cmp_buf[4] = end_wc; 2612 if (wcscoll (cmp_buf, cmp_buf + 4) > 0) 2613 return REG_ERANGE; 2614 2615 /* Got valid collation sequence values, add them as a new entry. 2616 However, for !_LIBC we have no collation elements: if the 2617 character set is single byte, the single byte character set 2618 that we build below suffices. parse_bracket_exp passes 2619 no MBCSET if dfa->mb_cur_max == 1. */ 2620 if (mbcset) 2621 { 2622 /* Check the space of the arrays. */ 2623 if (BE (*range_alloc == mbcset->nranges, 0)) 2624 { 2625 /* There is not enough space, need realloc. */ 2626 wchar_t *new_array_start, *new_array_end; 2627 Idx new_nranges; 2628 2629 new_nranges = mbcset->nranges; 2630 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2631 are NULL if *range_alloc == 0. */ 2632 new_array_start = re_x2realloc (mbcset->range_starts, wchar_t, 2633 &new_nranges); 2634 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2635 new_nranges); 2636 2637 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2638 return REG_ESPACE; 2639 2640 mbcset->range_starts = new_array_start; 2641 mbcset->range_ends = new_array_end; 2642 *range_alloc = new_nranges; 2643 } 2644 2645 mbcset->range_starts[mbcset->nranges] = start_wc; 2646 mbcset->range_ends[mbcset->nranges++] = end_wc; 2647 } 2648 2649 /* Build the table for single byte characters. */ 2650 for (wc = 0; wc < SBC_MAX; ++wc) 2651 { 2652 cmp_buf[2] = wc; 2653 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2654 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2655 bitset_set (sbcset, wc); 2656 } 2657 } 2658 # else /* not RE_ENABLE_I18N */ 2659 { 2660 unsigned int ch; 2661 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2662 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2663 : 0)); 2664 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2665 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2666 : 0)); 2667 if (start_ch > end_ch) 2668 return REG_ERANGE; 2669 /* Build the table for single byte characters. */ 2670 for (ch = 0; ch < SBC_MAX; ++ch) 2671 if (start_ch <= ch && ch <= end_ch) 2672 bitset_set (sbcset, ch); 2673 } 2674 # endif /* not RE_ENABLE_I18N */ 2675 return REG_NOERROR; 2676 } 2677 #endif /* not _LIBC */ 2678 2679 #ifndef _LIBC 2680 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2681 Build the collating element which is represented by NAME. 2682 The result are written to MBCSET and SBCSET. 2683 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2684 pointer argument since we may update it. */ 2685 2686 static reg_errcode_t 2687 build_collating_symbol (bitset sbcset, 2688 # ifdef RE_ENABLE_I18N 2689 re_charset_t *mbcset, Idx *coll_sym_alloc, 2690 # endif 2691 const unsigned char *name) 2692 { 2693 size_t name_len = strlen ((const char *) name); 2694 if (BE (name_len != 1, 0)) 2695 return REG_ECOLLATE; 2696 else 2697 { 2698 bitset_set (sbcset, name[0]); 2699 return REG_NOERROR; 2700 } 2701 } 2702 #endif /* not _LIBC */ 2703 2704 /* This function parse bracket expression like "[abc]", "[a-c]", 2705 "[[.a-a.]]" etc. */ 2706 2707 static bin_tree_t * 2708 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2709 reg_syntax_t syntax, reg_errcode_t *err) 2710 { 2711 #ifdef _LIBC 2712 const unsigned char *collseqmb; 2713 const char *collseqwc; 2714 uint32_t nrules; 2715 int32_t table_size; 2716 const int32_t *symb_table; 2717 const unsigned char *extra; 2718 2719 /* Local function for parse_bracket_exp used in _LIBC environement. 2720 Seek the collating symbol entry correspondings to NAME. 2721 Return the index of the symbol in the SYMB_TABLE. */ 2722 2723 auto inline int32_t 2724 __attribute ((always_inline)) 2725 seek_collating_symbol_entry (const unsigned char *name, size_t name_len) 2726 { 2727 int32_t hash = elem_hash ((const char *) name, name_len); 2728 int32_t elem = hash % table_size; 2729 int32_t second = hash % (table_size - 2); 2730 while (symb_table[2 * elem] != 0) 2731 { 2732 /* First compare the hashing value. */ 2733 if (symb_table[2 * elem] == hash 2734 /* Compare the length of the name. */ 2735 && name_len == extra[symb_table[2 * elem + 1]] 2736 /* Compare the name. */ 2737 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2738 name_len) == 0) 2739 { 2740 /* Yep, this is the entry. */ 2741 break; 2742 } 2743 2744 /* Next entry. */ 2745 elem += second; 2746 } 2747 return elem; 2748 } 2749 2750 /* Local function for parse_bracket_exp used in _LIBC environement. 2751 Look up the collation sequence value of BR_ELEM. 2752 Return the value if succeeded, UINT_MAX otherwise. */ 2753 2754 auto inline unsigned int 2755 __attribute ((always_inline)) 2756 lookup_collation_sequence_value (bracket_elem_t *br_elem) 2757 { 2758 if (br_elem->type == SB_CHAR) 2759 { 2760 /* 2761 if (MB_CUR_MAX == 1) 2762 */ 2763 if (nrules == 0) 2764 return collseqmb[br_elem->opr.ch]; 2765 else 2766 { 2767 wint_t wc = __btowc (br_elem->opr.ch); 2768 return __collseq_table_lookup (collseqwc, wc); 2769 } 2770 } 2771 else if (br_elem->type == MB_CHAR) 2772 { 2773 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2774 } 2775 else if (br_elem->type == COLL_SYM) 2776 { 2777 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2778 if (nrules != 0) 2779 { 2780 int32_t elem, idx; 2781 elem = seek_collating_symbol_entry (br_elem->opr.name, 2782 sym_name_len); 2783 if (symb_table[2 * elem] != 0) 2784 { 2785 /* We found the entry. */ 2786 idx = symb_table[2 * elem + 1]; 2787 /* Skip the name of collating element name. */ 2788 idx += 1 + extra[idx]; 2789 /* Skip the byte sequence of the collating element. */ 2790 idx += 1 + extra[idx]; 2791 /* Adjust for the alignment. */ 2792 idx = (idx + 3) & ~3; 2793 /* Skip the multibyte collation sequence value. */ 2794 idx += sizeof (unsigned int); 2795 /* Skip the wide char sequence of the collating element. */ 2796 idx += sizeof (unsigned int) * 2797 (1 + *(unsigned int *) (extra + idx)); 2798 /* Return the collation sequence value. */ 2799 return *(unsigned int *) (extra + idx); 2800 } 2801 else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2802 { 2803 /* No valid character. Match it as a single byte 2804 character. */ 2805 return collseqmb[br_elem->opr.name[0]]; 2806 } 2807 } 2808 else if (sym_name_len == 1) 2809 return collseqmb[br_elem->opr.name[0]]; 2810 } 2811 return UINT_MAX; 2812 } 2813 2814 /* Local function for parse_bracket_exp used in _LIBC environement. 2815 Build the range expression which starts from START_ELEM, and ends 2816 at END_ELEM. The result are written to MBCSET and SBCSET. 2817 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2818 mbcset->range_ends, is a pointer argument sinse we may 2819 update it. */ 2820 2821 auto inline reg_errcode_t 2822 __attribute ((always_inline)) 2823 build_range_exp (bitset sbcset, re_charset_t *mbcset, 2824 Idx *range_alloc, 2825 bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2826 { 2827 unsigned int ch; 2828 uint32_t start_collseq; 2829 uint32_t end_collseq; 2830 2831 /* Equivalence Classes and Character Classes can't be a range 2832 start/end. */ 2833 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2834 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2835 0)) 2836 return REG_ERANGE; 2837 2838 start_collseq = lookup_collation_sequence_value (start_elem); 2839 end_collseq = lookup_collation_sequence_value (end_elem); 2840 /* Check start/end collation sequence values. */ 2841 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2842 return REG_ECOLLATE; 2843 if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2844 return REG_ERANGE; 2845 2846 /* Got valid collation sequence values, add them as a new entry. 2847 However, if we have no collation elements, and the character set 2848 is single byte, the single byte character set that we 2849 build below suffices. */ 2850 if (nrules > 0 || dfa->mb_cur_max > 1) 2851 { 2852 /* Check the space of the arrays. */ 2853 if (BE (*range_alloc == mbcset->nranges, 0)) 2854 { 2855 /* There is not enough space, need realloc. */ 2856 uint32_t *new_array_start; 2857 uint32_t *new_array_end; 2858 Idx new_nranges; 2859 2860 new_nranges = mbcset->nranges; 2861 new_array_start = re_x2realloc (mbcset->range_starts, uint32_t, 2862 &new_nranges); 2863 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2864 new_nranges); 2865 2866 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2867 return REG_ESPACE; 2868 2869 mbcset->range_starts = new_array_start; 2870 mbcset->range_ends = new_array_end; 2871 *range_alloc = new_nranges; 2872 } 2873 2874 mbcset->range_starts[mbcset->nranges] = start_collseq; 2875 mbcset->range_ends[mbcset->nranges++] = end_collseq; 2876 } 2877 2878 /* Build the table for single byte characters. */ 2879 for (ch = 0; ch < SBC_MAX; ch++) 2880 { 2881 uint32_t ch_collseq; 2882 /* 2883 if (MB_CUR_MAX == 1) 2884 */ 2885 if (nrules == 0) 2886 ch_collseq = collseqmb[ch]; 2887 else 2888 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 2889 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2890 bitset_set (sbcset, ch); 2891 } 2892 return REG_NOERROR; 2893 } 2894 2895 /* Local function for parse_bracket_exp used in _LIBC environement. 2896 Build the collating element which is represented by NAME. 2897 The result are written to MBCSET and SBCSET. 2898 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2899 pointer argument sinse we may update it. */ 2900 2901 auto inline reg_errcode_t 2902 __attribute ((always_inline)) 2903 build_collating_symbol (bitset sbcset, re_charset_t *mbcset, 2904 Idx *coll_sym_alloc, const unsigned char *name) 2905 { 2906 int32_t elem, idx; 2907 size_t name_len = strlen ((const char *) name); 2908 if (nrules != 0) 2909 { 2910 elem = seek_collating_symbol_entry (name, name_len); 2911 if (symb_table[2 * elem] != 0) 2912 { 2913 /* We found the entry. */ 2914 idx = symb_table[2 * elem + 1]; 2915 /* Skip the name of collating element name. */ 2916 idx += 1 + extra[idx]; 2917 } 2918 else if (symb_table[2 * elem] == 0 && name_len == 1) 2919 { 2920 /* No valid character, treat it as a normal 2921 character. */ 2922 bitset_set (sbcset, name[0]); 2923 return REG_NOERROR; 2924 } 2925 else 2926 return REG_ECOLLATE; 2927 2928 /* Got valid collation sequence, add it as a new entry. */ 2929 /* Check the space of the arrays. */ 2930 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 2931 { 2932 /* Not enough, realloc it. */ 2933 Idx new_coll_sym_alloc = mbcset->ncoll_syms; 2934 /* Use realloc since mbcset->coll_syms is NULL 2935 if *alloc == 0. */ 2936 int32_t *new_coll_syms = re_x2realloc (mbcset->coll_syms, int32_t, 2937 &new_coll_sym_alloc); 2938 if (BE (new_coll_syms == NULL, 0)) 2939 return REG_ESPACE; 2940 mbcset->coll_syms = new_coll_syms; 2941 *coll_sym_alloc = new_coll_sym_alloc; 2942 } 2943 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 2944 return REG_NOERROR; 2945 } 2946 else 2947 { 2948 if (BE (name_len != 1, 0)) 2949 return REG_ECOLLATE; 2950 else 2951 { 2952 bitset_set (sbcset, name[0]); 2953 return REG_NOERROR; 2954 } 2955 } 2956 } 2957 #endif 2958 2959 re_token_t br_token; 2960 re_bitset_ptr_t sbcset; 2961 #ifdef RE_ENABLE_I18N 2962 re_charset_t *mbcset; 2963 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 2964 Idx equiv_class_alloc = 0, char_class_alloc = 0; 2965 #endif /* not RE_ENABLE_I18N */ 2966 bool non_match = false; 2967 bin_tree_t *work_tree; 2968 int token_len; 2969 bool first_round = true; 2970 #ifdef _LIBC 2971 collseqmb = (const unsigned char *) 2972 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 2973 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 2974 if (nrules) 2975 { 2976 /* 2977 if (MB_CUR_MAX > 1) 2978 */ 2979 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 2980 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 2981 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 2982 _NL_COLLATE_SYMB_TABLEMB); 2983 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 2984 _NL_COLLATE_SYMB_EXTRAMB); 2985 } 2986 #endif 2987 sbcset = re_calloc (bitset_word, BITSET_WORDS); 2988 #ifdef RE_ENABLE_I18N 2989 mbcset = re_calloc (re_charset_t, 1); 2990 #endif /* RE_ENABLE_I18N */ 2991 #ifdef RE_ENABLE_I18N 2992 if (BE (sbcset == NULL || mbcset == NULL, 0)) 2993 #else 2994 if (BE (sbcset == NULL, 0)) 2995 #endif /* RE_ENABLE_I18N */ 2996 { 2997 *err = REG_ESPACE; 2998 return NULL; 2999 } 3000 3001 token_len = peek_token_bracket (token, regexp, syntax); 3002 if (BE (token->type == END_OF_RE, 0)) 3003 { 3004 *err = REG_BADPAT; 3005 goto parse_bracket_exp_free_return; 3006 } 3007 if (token->type == OP_NON_MATCH_LIST) 3008 { 3009 #ifdef RE_ENABLE_I18N 3010 mbcset->non_match = 1; 3011 #endif /* not RE_ENABLE_I18N */ 3012 non_match = true; 3013 if (syntax & REG_HAT_LISTS_NOT_NEWLINE) 3014 bitset_set (sbcset, '\0'); 3015 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3016 token_len = peek_token_bracket (token, regexp, syntax); 3017 if (BE (token->type == END_OF_RE, 0)) 3018 { 3019 *err = REG_BADPAT; 3020 goto parse_bracket_exp_free_return; 3021 } 3022 } 3023 3024 /* We treat the first ']' as a normal character. */ 3025 if (token->type == OP_CLOSE_BRACKET) 3026 token->type = CHARACTER; 3027 3028 while (1) 3029 { 3030 bracket_elem_t start_elem, end_elem; 3031 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3032 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3033 reg_errcode_t ret; 3034 int token_len2 = 0; 3035 bool is_range_exp = false; 3036 re_token_t token2; 3037 3038 start_elem.opr.name = start_name_buf; 3039 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3040 syntax, first_round); 3041 if (BE (ret != REG_NOERROR, 0)) 3042 { 3043 *err = ret; 3044 goto parse_bracket_exp_free_return; 3045 } 3046 first_round = false; 3047 3048 /* Get information about the next token. We need it in any case. */ 3049 token_len = peek_token_bracket (token, regexp, syntax); 3050 3051 /* Do not check for ranges if we know they are not allowed. */ 3052 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3053 { 3054 if (BE (token->type == END_OF_RE, 0)) 3055 { 3056 *err = REG_EBRACK; 3057 goto parse_bracket_exp_free_return; 3058 } 3059 if (token->type == OP_CHARSET_RANGE) 3060 { 3061 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3062 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3063 if (BE (token2.type == END_OF_RE, 0)) 3064 { 3065 *err = REG_EBRACK; 3066 goto parse_bracket_exp_free_return; 3067 } 3068 if (token2.type == OP_CLOSE_BRACKET) 3069 { 3070 /* We treat the last '-' as a normal character. */ 3071 re_string_skip_bytes (regexp, -token_len); 3072 token->type = CHARACTER; 3073 } 3074 else 3075 is_range_exp = true; 3076 } 3077 } 3078 3079 if (is_range_exp == true) 3080 { 3081 end_elem.opr.name = end_name_buf; 3082 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3083 dfa, syntax, true); 3084 if (BE (ret != REG_NOERROR, 0)) 3085 { 3086 *err = ret; 3087 goto parse_bracket_exp_free_return; 3088 } 3089 3090 token_len = peek_token_bracket (token, regexp, syntax); 3091 3092 #ifdef _LIBC 3093 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3094 &start_elem, &end_elem); 3095 #else 3096 # ifdef RE_ENABLE_I18N 3097 *err = build_range_exp (sbcset, 3098 dfa->mb_cur_max > 1 ? mbcset : NULL, 3099 &range_alloc, &start_elem, &end_elem); 3100 # else 3101 *err = build_range_exp (sbcset, &start_elem, &end_elem); 3102 # endif 3103 #endif /* RE_ENABLE_I18N */ 3104 if (BE (*err != REG_NOERROR, 0)) 3105 goto parse_bracket_exp_free_return; 3106 } 3107 else 3108 { 3109 switch (start_elem.type) 3110 { 3111 case SB_CHAR: 3112 bitset_set (sbcset, start_elem.opr.ch); 3113 break; 3114 #ifdef RE_ENABLE_I18N 3115 case MB_CHAR: 3116 /* Check whether the array has enough space. */ 3117 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3118 { 3119 wchar_t *new_mbchars; 3120 /* Not enough, realloc it. */ 3121 mbchar_alloc = mbcset->nmbchars; 3122 /* Use realloc since array is NULL if *alloc == 0. */ 3123 new_mbchars = re_x2realloc (mbcset->mbchars, wchar_t, 3124 &mbchar_alloc); 3125 if (BE (new_mbchars == NULL, 0)) 3126 goto parse_bracket_exp_espace; 3127 mbcset->mbchars = new_mbchars; 3128 } 3129 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3130 break; 3131 #endif /* RE_ENABLE_I18N */ 3132 case EQUIV_CLASS: 3133 *err = build_equiv_class (sbcset, 3134 #ifdef RE_ENABLE_I18N 3135 mbcset, &equiv_class_alloc, 3136 #endif /* RE_ENABLE_I18N */ 3137 start_elem.opr.name); 3138 if (BE (*err != REG_NOERROR, 0)) 3139 goto parse_bracket_exp_free_return; 3140 break; 3141 case COLL_SYM: 3142 *err = build_collating_symbol (sbcset, 3143 #ifdef RE_ENABLE_I18N 3144 mbcset, &coll_sym_alloc, 3145 #endif /* RE_ENABLE_I18N */ 3146 start_elem.opr.name); 3147 if (BE (*err != REG_NOERROR, 0)) 3148 goto parse_bracket_exp_free_return; 3149 break; 3150 case CHAR_CLASS: 3151 *err = build_charclass (regexp->trans, sbcset, 3152 #ifdef RE_ENABLE_I18N 3153 mbcset, &char_class_alloc, 3154 #endif /* RE_ENABLE_I18N */ 3155 start_elem.opr.name, syntax); 3156 if (BE (*err != REG_NOERROR, 0)) 3157 goto parse_bracket_exp_free_return; 3158 break; 3159 default: 3160 assert (0); 3161 break; 3162 } 3163 } 3164 if (BE (token->type == END_OF_RE, 0)) 3165 { 3166 *err = REG_EBRACK; 3167 goto parse_bracket_exp_free_return; 3168 } 3169 if (token->type == OP_CLOSE_BRACKET) 3170 break; 3171 } 3172 3173 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3174 3175 /* If it is non-matching list. */ 3176 if (non_match) 3177 bitset_not (sbcset); 3178 3179 #ifdef RE_ENABLE_I18N 3180 /* Ensure only single byte characters are set. */ 3181 if (dfa->mb_cur_max > 1) 3182 bitset_mask (sbcset, dfa->sb_char); 3183 3184 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3185 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3186 || mbcset->non_match))) 3187 { 3188 bin_tree_t *mbc_tree; 3189 int sbc_idx; 3190 /* Build a tree for complex bracket. */ 3191 dfa->has_mb_node = 1; 3192 br_token.type = COMPLEX_BRACKET; 3193 br_token.opr.mbcset = mbcset; 3194 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3195 if (BE (mbc_tree == NULL, 0)) 3196 goto parse_bracket_exp_espace; 3197 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3198 if (sbcset[sbc_idx]) 3199 break; 3200 /* If there are no bits set in sbcset, there is no point 3201 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3202 if (sbc_idx < BITSET_WORDS) 3203 { 3204 /* Build a tree for simple bracket. */ 3205 br_token.type = SIMPLE_BRACKET; 3206 br_token.opr.sbcset = sbcset; 3207 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3208 if (BE (work_tree == NULL, 0)) 3209 goto parse_bracket_exp_espace; 3210 3211 /* Then join them by ALT node. */ 3212 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3213 if (BE (work_tree == NULL, 0)) 3214 goto parse_bracket_exp_espace; 3215 } 3216 else 3217 { 3218 re_free (sbcset); 3219 work_tree = mbc_tree; 3220 } 3221 } 3222 else 3223 #endif /* not RE_ENABLE_I18N */ 3224 { 3225 #ifdef RE_ENABLE_I18N 3226 free_charset (mbcset); 3227 #endif 3228 /* Build a tree for simple bracket. */ 3229 br_token.type = SIMPLE_BRACKET; 3230 br_token.opr.sbcset = sbcset; 3231 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3232 if (BE (work_tree == NULL, 0)) 3233 goto parse_bracket_exp_espace; 3234 } 3235 return work_tree; 3236 3237 parse_bracket_exp_espace: 3238 *err = REG_ESPACE; 3239 parse_bracket_exp_free_return: 3240 re_free (sbcset); 3241 #ifdef RE_ENABLE_I18N 3242 free_charset (mbcset); 3243 #endif /* RE_ENABLE_I18N */ 3244 return NULL; 3245 } 3246 3247 /* Parse an element in the bracket expression. */ 3248 3249 static reg_errcode_t 3250 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3251 re_token_t *token, int token_len, re_dfa_t *dfa, 3252 reg_syntax_t syntax, bool accept_hyphen) 3253 { 3254 #ifdef RE_ENABLE_I18N 3255 int cur_char_size; 3256 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3257 if (cur_char_size > 1) 3258 { 3259 elem->type = MB_CHAR; 3260 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3261 re_string_skip_bytes (regexp, cur_char_size); 3262 return REG_NOERROR; 3263 } 3264 #endif /* RE_ENABLE_I18N */ 3265 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3266 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3267 || token->type == OP_OPEN_EQUIV_CLASS) 3268 return parse_bracket_symbol (elem, regexp, token); 3269 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3270 { 3271 /* A '-' must only appear as anything but a range indicator before 3272 the closing bracket. Everything else is an error. */ 3273 re_token_t token2; 3274 (void) peek_token_bracket (&token2, regexp, syntax); 3275 if (token2.type != OP_CLOSE_BRACKET) 3276 /* The actual error value is not standardized since this whole 3277 case is undefined. But ERANGE makes good sense. */ 3278 return REG_ERANGE; 3279 } 3280 elem->type = SB_CHAR; 3281 elem->opr.ch = token->opr.c; 3282 return REG_NOERROR; 3283 } 3284 3285 /* Parse a bracket symbol in the bracket expression. Bracket symbols are 3286 such as [:<character_class>:], [.<collating_element>.], and 3287 [=<equivalent_class>=]. */ 3288 3289 static reg_errcode_t 3290 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3291 re_token_t *token) 3292 { 3293 unsigned char ch, delim = token->opr.c; 3294 int i = 0; 3295 if (re_string_eoi(regexp)) 3296 return REG_EBRACK; 3297 for (;; ++i) 3298 { 3299 if (i >= BRACKET_NAME_BUF_SIZE) 3300 return REG_EBRACK; 3301 if (token->type == OP_OPEN_CHAR_CLASS) 3302 ch = re_string_fetch_byte_case (regexp); 3303 else 3304 ch = re_string_fetch_byte (regexp); 3305 if (re_string_eoi(regexp)) 3306 return REG_EBRACK; 3307 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3308 break; 3309 elem->opr.name[i] = ch; 3310 } 3311 re_string_skip_bytes (regexp, 1); 3312 elem->opr.name[i] = '\0'; 3313 switch (token->type) 3314 { 3315 case OP_OPEN_COLL_ELEM: 3316 elem->type = COLL_SYM; 3317 break; 3318 case OP_OPEN_EQUIV_CLASS: 3319 elem->type = EQUIV_CLASS; 3320 break; 3321 case OP_OPEN_CHAR_CLASS: 3322 elem->type = CHAR_CLASS; 3323 break; 3324 default: 3325 break; 3326 } 3327 return REG_NOERROR; 3328 } 3329 3330 /* Helper function for parse_bracket_exp. 3331 Build the equivalence class which is represented by NAME. 3332 The result are written to MBCSET and SBCSET. 3333 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3334 is a pointer argument sinse we may update it. */ 3335 3336 static reg_errcode_t 3337 build_equiv_class (bitset sbcset, 3338 #ifdef RE_ENABLE_I18N 3339 re_charset_t *mbcset, Idx *equiv_class_alloc, 3340 #endif 3341 const unsigned char *name) 3342 { 3343 #if defined _LIBC 3344 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3345 if (nrules != 0) 3346 { 3347 const int32_t *table, *indirect; 3348 const unsigned char *weights, *extra, *cp; 3349 unsigned char char_buf[2]; 3350 int32_t idx1, idx2; 3351 unsigned int ch; 3352 size_t len; 3353 /* This #include defines a local function! */ 3354 # include <locale/weight.h> 3355 /* Calculate the index for equivalence class. */ 3356 cp = name; 3357 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3358 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3359 _NL_COLLATE_WEIGHTMB); 3360 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3361 _NL_COLLATE_EXTRAMB); 3362 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3363 _NL_COLLATE_INDIRECTMB); 3364 idx1 = findidx (&cp); 3365 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) 3366 /* This isn't a valid character. */ 3367 return REG_ECOLLATE; 3368 3369 /* Build single byte matcing table for this equivalence class. */ 3370 char_buf[1] = (unsigned char) '\0'; 3371 len = weights[idx1]; 3372 for (ch = 0; ch < SBC_MAX; ++ch) 3373 { 3374 char_buf[0] = ch; 3375 cp = char_buf; 3376 idx2 = findidx (&cp); 3377 /* 3378 idx2 = table[ch]; 3379 */ 3380 if (idx2 == 0) 3381 /* This isn't a valid character. */ 3382 continue; 3383 if (len == weights[idx2]) 3384 { 3385 int cnt = 0; 3386 while (cnt <= len && 3387 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) 3388 ++cnt; 3389 3390 if (cnt > len) 3391 bitset_set (sbcset, ch); 3392 } 3393 } 3394 /* Check whether the array has enough space. */ 3395 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3396 { 3397 /* Not enough, realloc it. */ 3398 Idx new_equiv_class_alloc = mbcset->nequiv_classes; 3399 /* Use realloc since the array is NULL if *alloc == 0. */ 3400 int32_t *new_equiv_classes = re_x2realloc (mbcset->equiv_classes, 3401 int32_t, 3402 &new_equiv_class_alloc); 3403 if (BE (new_equiv_classes == NULL, 0)) 3404 return REG_ESPACE; 3405 mbcset->equiv_classes = new_equiv_classes; 3406 *equiv_class_alloc = new_equiv_class_alloc; 3407 } 3408 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3409 } 3410 else 3411 #endif /* _LIBC */ 3412 { 3413 if (BE (strlen ((const char *) name) != 1, 0)) 3414 return REG_ECOLLATE; 3415 bitset_set (sbcset, *name); 3416 } 3417 return REG_NOERROR; 3418 } 3419 3420 /* Helper function for parse_bracket_exp. 3421 Build the character class which is represented by NAME. 3422 The result are written to MBCSET and SBCSET. 3423 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3424 is a pointer argument sinse we may update it. */ 3425 3426 static reg_errcode_t 3427 build_charclass (unsigned REG_TRANSLATE_TYPE trans, bitset sbcset, 3428 #ifdef RE_ENABLE_I18N 3429 re_charset_t *mbcset, Idx *char_class_alloc, 3430 #endif 3431 const unsigned char *class_name, reg_syntax_t syntax) 3432 { 3433 int i; 3434 const char *name = (const char *) class_name; 3435 3436 /* In case of REG_ICASE "upper" and "lower" match the both of 3437 upper and lower cases. */ 3438 if ((syntax & REG_IGNORE_CASE) 3439 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3440 name = "alpha"; 3441 3442 #ifdef RE_ENABLE_I18N 3443 /* Check the space of the arrays. */ 3444 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3445 { 3446 /* Not enough, realloc it. */ 3447 Idx new_char_class_alloc = mbcset->nchar_classes; 3448 /* Use realloc since array is NULL if *alloc == 0. */ 3449 wctype_t *new_char_classes = re_x2realloc (mbcset->char_classes, wctype_t, 3450 &new_char_class_alloc); 3451 if (BE (new_char_classes == NULL, 0)) 3452 return REG_ESPACE; 3453 mbcset->char_classes = new_char_classes; 3454 *char_class_alloc = new_char_class_alloc; 3455 } 3456 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3457 #endif /* RE_ENABLE_I18N */ 3458 3459 #define BUILD_CHARCLASS_LOOP(ctype_func) \ 3460 for (i = 0; i < SBC_MAX; ++i) \ 3461 { \ 3462 if (ctype_func (i)) \ 3463 { \ 3464 int ch = trans ? trans[i] : i; \ 3465 bitset_set (sbcset, ch); \ 3466 } \ 3467 } 3468 3469 if (strcmp (name, "alnum") == 0) 3470 BUILD_CHARCLASS_LOOP (isalnum) 3471 else if (strcmp (name, "cntrl") == 0) 3472 BUILD_CHARCLASS_LOOP (iscntrl) 3473 else if (strcmp (name, "lower") == 0) 3474 BUILD_CHARCLASS_LOOP (islower) 3475 else if (strcmp (name, "space") == 0) 3476 BUILD_CHARCLASS_LOOP (isspace) 3477 else if (strcmp (name, "alpha") == 0) 3478 BUILD_CHARCLASS_LOOP (isalpha) 3479 else if (strcmp (name, "digit") == 0) 3480 BUILD_CHARCLASS_LOOP (isdigit) 3481 else if (strcmp (name, "print") == 0) 3482 BUILD_CHARCLASS_LOOP (isprint) 3483 else if (strcmp (name, "upper") == 0) 3484 BUILD_CHARCLASS_LOOP (isupper) 3485 else if (strcmp (name, "blank") == 0) 3486 BUILD_CHARCLASS_LOOP (isblank) 3487 else if (strcmp (name, "graph") == 0) 3488 BUILD_CHARCLASS_LOOP (isgraph) 3489 else if (strcmp (name, "punct") == 0) 3490 BUILD_CHARCLASS_LOOP (ispunct) 3491 else if (strcmp (name, "xdigit") == 0) 3492 BUILD_CHARCLASS_LOOP (isxdigit) 3493 else 3494 return REG_ECTYPE; 3495 3496 return REG_NOERROR; 3497 } 3498 3499 static bin_tree_t * 3500 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans, 3501 const unsigned char *class_name, 3502 const unsigned char *extra, 3503 bool non_match, reg_errcode_t *err) 3504 { 3505 re_bitset_ptr_t sbcset; 3506 #ifdef RE_ENABLE_I18N 3507 re_charset_t *mbcset; 3508 Idx alloc = 0; 3509 #endif /* not RE_ENABLE_I18N */ 3510 reg_errcode_t ret; 3511 re_token_t br_token; 3512 bin_tree_t *tree; 3513 3514 sbcset = re_calloc (bitset_word, BITSET_WORDS); 3515 #ifdef RE_ENABLE_I18N 3516 mbcset = re_calloc (re_charset_t, 1); 3517 #endif /* RE_ENABLE_I18N */ 3518 3519 #ifdef RE_ENABLE_I18N 3520 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3521 #else /* not RE_ENABLE_I18N */ 3522 if (BE (sbcset == NULL, 0)) 3523 #endif /* not RE_ENABLE_I18N */ 3524 { 3525 *err = REG_ESPACE; 3526 return NULL; 3527 } 3528 3529 if (non_match) 3530 { 3531 #ifdef RE_ENABLE_I18N 3532 /* 3533 if (syntax & REG_HAT_LISTS_NOT_NEWLINE) 3534 bitset_set(cset->sbcset, '\0'); 3535 */ 3536 mbcset->non_match = 1; 3537 #endif /* not RE_ENABLE_I18N */ 3538 } 3539 3540 /* We don't care the syntax in this case. */ 3541 ret = build_charclass (trans, sbcset, 3542 #ifdef RE_ENABLE_I18N 3543 mbcset, &alloc, 3544 #endif /* RE_ENABLE_I18N */ 3545 class_name, 0); 3546 3547 if (BE (ret != REG_NOERROR, 0)) 3548 { 3549 re_free (sbcset); 3550 #ifdef RE_ENABLE_I18N 3551 free_charset (mbcset); 3552 #endif /* RE_ENABLE_I18N */ 3553 *err = ret; 3554 return NULL; 3555 } 3556 /* \w match '_' also. */ 3557 for (; *extra; extra++) 3558 bitset_set (sbcset, *extra); 3559 3560 /* If it is non-matching list. */ 3561 if (non_match) 3562 bitset_not (sbcset); 3563 3564 #ifdef RE_ENABLE_I18N 3565 /* Ensure only single byte characters are set. */ 3566 if (dfa->mb_cur_max > 1) 3567 bitset_mask (sbcset, dfa->sb_char); 3568 #endif 3569 3570 /* Build a tree for simple bracket. */ 3571 br_token.type = SIMPLE_BRACKET; 3572 br_token.opr.sbcset = sbcset; 3573 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3574 if (BE (tree == NULL, 0)) 3575 goto build_word_op_espace; 3576 3577 #ifdef RE_ENABLE_I18N 3578 if (dfa->mb_cur_max > 1) 3579 { 3580 bin_tree_t *mbc_tree; 3581 /* Build a tree for complex bracket. */ 3582 br_token.type = COMPLEX_BRACKET; 3583 br_token.opr.mbcset = mbcset; 3584 dfa->has_mb_node = 1; 3585 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3586 if (BE (mbc_tree == NULL, 0)) 3587 goto build_word_op_espace; 3588 /* Then join them by ALT node. */ 3589 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3590 if (BE (mbc_tree != NULL, 1)) 3591 return tree; 3592 } 3593 else 3594 { 3595 free_charset (mbcset); 3596 return tree; 3597 } 3598 #else /* not RE_ENABLE_I18N */ 3599 return tree; 3600 #endif /* not RE_ENABLE_I18N */ 3601 3602 build_word_op_espace: 3603 re_free (sbcset); 3604 #ifdef RE_ENABLE_I18N 3605 free_charset (mbcset); 3606 #endif /* RE_ENABLE_I18N */ 3607 *err = REG_ESPACE; 3608 return NULL; 3609 } 3610 3611 /* This is intended for the expressions like "a{1,3}". 3612 Fetch a number from `input', and return the number. 3613 Return REG_MISSING if the number field is empty like "{,1}". 3614 Return REG_ERROR if an error occurred. */ 3615 3616 static Idx 3617 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3618 { 3619 Idx num = REG_MISSING; 3620 unsigned char c; 3621 while (1) 3622 { 3623 fetch_token (token, input, syntax); 3624 c = token->opr.c; 3625 if (BE (token->type == END_OF_RE, 0)) 3626 return REG_ERROR; 3627 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3628 break; 3629 num = ((token->type != CHARACTER || c < '0' || '9' < c 3630 || num == REG_ERROR) 3631 ? REG_ERROR 3632 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0')); 3633 num = (num > REG_DUP_MAX) ? REG_ERROR : num; 3634 } 3635 return num; 3636 } 3637 3638 #ifdef RE_ENABLE_I18N 3639 static void 3640 free_charset (re_charset_t *cset) 3641 { 3642 re_free (cset->mbchars); 3643 # ifdef _LIBC 3644 re_free (cset->coll_syms); 3645 re_free (cset->equiv_classes); 3646 re_free (cset->range_starts); 3647 re_free (cset->range_ends); 3648 # endif 3649 re_free (cset->char_classes); 3650 re_free (cset); 3651 } 3652 #endif /* RE_ENABLE_I18N */ 3653 3654 /* Functions for binary tree operation. */ 3655 3656 /* Create a tree node. */ 3657 3658 static bin_tree_t * 3659 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3660 re_token_type_t type) 3661 { 3662 re_token_t t; 3663 t.type = type; 3664 return create_token_tree (dfa, left, right, &t); 3665 } 3666 3667 static bin_tree_t * 3668 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3669 const re_token_t *token) 3670 { 3671 bin_tree_t *tree; 3672 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3673 { 3674 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3675 3676 if (storage == NULL) 3677 return NULL; 3678 storage->next = dfa->str_tree_storage; 3679 dfa->str_tree_storage = storage; 3680 dfa->str_tree_storage_idx = 0; 3681 } 3682 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3683 3684 tree->parent = NULL; 3685 tree->left = left; 3686 tree->right = right; 3687 tree->token = *token; 3688 tree->token.duplicated = 0; 3689 tree->token.opt_subexp = 0; 3690 tree->first = NULL; 3691 tree->next = NULL; 3692 tree->node_idx = REG_MISSING; 3693 3694 if (left != NULL) 3695 left->parent = tree; 3696 if (right != NULL) 3697 right->parent = tree; 3698 return tree; 3699 } 3700 3701 /* Mark the tree SRC as an optional subexpression. 3702 To be called from preorder or postorder. */ 3703 3704 static reg_errcode_t 3705 mark_opt_subexp (void *extra, bin_tree_t *node) 3706 { 3707 Idx idx = (Idx) (long) extra; 3708 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3709 node->token.opt_subexp = 1; 3710 3711 return REG_NOERROR; 3712 } 3713 3714 /* Free the allocated memory inside NODE. */ 3715 3716 static void 3717 free_token (re_token_t *node) 3718 { 3719 #ifdef RE_ENABLE_I18N 3720 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3721 free_charset (node->opr.mbcset); 3722 else 3723 #endif /* RE_ENABLE_I18N */ 3724 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3725 re_free (node->opr.sbcset); 3726 } 3727 3728 /* Worker function for tree walking. Free the allocated memory inside NODE 3729 and its children. */ 3730 3731 static reg_errcode_t 3732 free_tree (void *extra, bin_tree_t *node) 3733 { 3734 free_token (&node->token); 3735 return REG_NOERROR; 3736 } 3737 3738 3739 /* Duplicate the node SRC, and return new node. This is a preorder 3740 visit similar to the one implemented by the generic visitor, but 3741 we need more infrastructure to maintain two parallel trees --- so, 3742 it's easier to duplicate. */ 3743 3744 static bin_tree_t * 3745 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3746 { 3747 const bin_tree_t *node; 3748 bin_tree_t *dup_root; 3749 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3750 3751 for (node = root; ; ) 3752 { 3753 /* Create a new tree and link it back to the current parent. */ 3754 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3755 if (*p_new == NULL) 3756 return NULL; 3757 (*p_new)->parent = dup_node; 3758 (*p_new)->token.duplicated = 1; 3759 dup_node = *p_new; 3760 3761 /* Go to the left node, or up and to the right. */ 3762 if (node->left) 3763 { 3764 node = node->left; 3765 p_new = &dup_node->left; 3766 } 3767 else 3768 { 3769 const bin_tree_t *prev = NULL; 3770 while (node->right == prev || node->right == NULL) 3771 { 3772 prev = node; 3773 node = node->parent; 3774 dup_node = dup_node->parent; 3775 if (!node) 3776 return dup_root; 3777 } 3778 node = node->right; 3779 p_new = &dup_node->right; 3780 } 3781 } 3782 } 3783