195b7b453SJohn Marino /* Extended regular expression matching and search library. 2*680a9cb8SJohn Marino Copyright (C) 2002-2014 Free Software Foundation, Inc. 395b7b453SJohn Marino This file is part of the GNU C Library. 495b7b453SJohn Marino Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 595b7b453SJohn Marino 6*680a9cb8SJohn Marino The GNU C Library is free software; you can redistribute it and/or 7*680a9cb8SJohn Marino modify it under the terms of the GNU General Public 8*680a9cb8SJohn Marino License as published by the Free Software Foundation; either 9*680a9cb8SJohn Marino version 3 of the License, or (at your option) any later version. 1095b7b453SJohn Marino 11*680a9cb8SJohn Marino The GNU C Library is distributed in the hope that it will be useful, 1295b7b453SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of 13*680a9cb8SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14*680a9cb8SJohn Marino General Public License for more details. 1595b7b453SJohn Marino 16*680a9cb8SJohn Marino You should have received a copy of the GNU General Public 17*680a9cb8SJohn Marino License along with the GNU C Library; if not, see 18*680a9cb8SJohn Marino <http://www.gnu.org/licenses/>. */ 1995b7b453SJohn Marino 2095b7b453SJohn Marino static void re_string_construct_common (const char *str, Idx len, 2195b7b453SJohn Marino re_string_t *pstr, 2295b7b453SJohn Marino RE_TRANSLATE_TYPE trans, bool icase, 2395b7b453SJohn Marino const re_dfa_t *dfa) internal_function; 2495b7b453SJohn Marino static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 2595b7b453SJohn Marino const re_node_set *nodes, 2695b7b453SJohn Marino re_hashval_t hash) internal_function; 2795b7b453SJohn Marino static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 2895b7b453SJohn Marino const re_node_set *nodes, 2995b7b453SJohn Marino unsigned int context, 3095b7b453SJohn Marino re_hashval_t hash) internal_function; 3195b7b453SJohn Marino 3295b7b453SJohn Marino /* Functions for string operation. */ 3395b7b453SJohn Marino 3495b7b453SJohn Marino /* This function allocate the buffers. It is necessary to call 3595b7b453SJohn Marino re_string_reconstruct before using the object. */ 3695b7b453SJohn Marino 3795b7b453SJohn Marino static reg_errcode_t 3895b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 3995b7b453SJohn Marino re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len, 4095b7b453SJohn Marino RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 4195b7b453SJohn Marino { 4295b7b453SJohn Marino reg_errcode_t ret; 4395b7b453SJohn Marino Idx init_buf_len; 4495b7b453SJohn Marino 4595b7b453SJohn Marino /* Ensure at least one character fits into the buffers. */ 4695b7b453SJohn Marino if (init_len < dfa->mb_cur_max) 4795b7b453SJohn Marino init_len = dfa->mb_cur_max; 4895b7b453SJohn Marino init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 4995b7b453SJohn Marino re_string_construct_common (str, len, pstr, trans, icase, dfa); 5095b7b453SJohn Marino 5195b7b453SJohn Marino ret = re_string_realloc_buffers (pstr, init_buf_len); 5295b7b453SJohn Marino if (BE (ret != REG_NOERROR, 0)) 5395b7b453SJohn Marino return ret; 5495b7b453SJohn Marino 5595b7b453SJohn Marino pstr->word_char = dfa->word_char; 5695b7b453SJohn Marino pstr->word_ops_used = dfa->word_ops_used; 5795b7b453SJohn Marino pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 5895b7b453SJohn Marino pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 5995b7b453SJohn Marino pstr->valid_raw_len = pstr->valid_len; 6095b7b453SJohn Marino return REG_NOERROR; 6195b7b453SJohn Marino } 6295b7b453SJohn Marino 6395b7b453SJohn Marino /* This function allocate the buffers, and initialize them. */ 6495b7b453SJohn Marino 6595b7b453SJohn Marino static reg_errcode_t 6695b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 6795b7b453SJohn Marino re_string_construct (re_string_t *pstr, const char *str, Idx len, 6895b7b453SJohn Marino RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 6995b7b453SJohn Marino { 7095b7b453SJohn Marino reg_errcode_t ret; 7195b7b453SJohn Marino memset (pstr, '\0', sizeof (re_string_t)); 7295b7b453SJohn Marino re_string_construct_common (str, len, pstr, trans, icase, dfa); 7395b7b453SJohn Marino 7495b7b453SJohn Marino if (len > 0) 7595b7b453SJohn Marino { 7695b7b453SJohn Marino ret = re_string_realloc_buffers (pstr, len + 1); 7795b7b453SJohn Marino if (BE (ret != REG_NOERROR, 0)) 7895b7b453SJohn Marino return ret; 7995b7b453SJohn Marino } 8095b7b453SJohn Marino pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 8195b7b453SJohn Marino 8295b7b453SJohn Marino if (icase) 8395b7b453SJohn Marino { 8495b7b453SJohn Marino #ifdef RE_ENABLE_I18N 8595b7b453SJohn Marino if (dfa->mb_cur_max > 1) 8695b7b453SJohn Marino { 8795b7b453SJohn Marino while (1) 8895b7b453SJohn Marino { 8995b7b453SJohn Marino ret = build_wcs_upper_buffer (pstr); 9095b7b453SJohn Marino if (BE (ret != REG_NOERROR, 0)) 9195b7b453SJohn Marino return ret; 9295b7b453SJohn Marino if (pstr->valid_raw_len >= len) 9395b7b453SJohn Marino break; 9495b7b453SJohn Marino if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 9595b7b453SJohn Marino break; 9695b7b453SJohn Marino ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 9795b7b453SJohn Marino if (BE (ret != REG_NOERROR, 0)) 9895b7b453SJohn Marino return ret; 9995b7b453SJohn Marino } 10095b7b453SJohn Marino } 10195b7b453SJohn Marino else 10295b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 10395b7b453SJohn Marino build_upper_buffer (pstr); 10495b7b453SJohn Marino } 10595b7b453SJohn Marino else 10695b7b453SJohn Marino { 10795b7b453SJohn Marino #ifdef RE_ENABLE_I18N 10895b7b453SJohn Marino if (dfa->mb_cur_max > 1) 10995b7b453SJohn Marino build_wcs_buffer (pstr); 11095b7b453SJohn Marino else 11195b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 11295b7b453SJohn Marino { 11395b7b453SJohn Marino if (trans != NULL) 11495b7b453SJohn Marino re_string_translate_buffer (pstr); 11595b7b453SJohn Marino else 11695b7b453SJohn Marino { 11795b7b453SJohn Marino pstr->valid_len = pstr->bufs_len; 11895b7b453SJohn Marino pstr->valid_raw_len = pstr->bufs_len; 11995b7b453SJohn Marino } 12095b7b453SJohn Marino } 12195b7b453SJohn Marino } 12295b7b453SJohn Marino 12395b7b453SJohn Marino return REG_NOERROR; 12495b7b453SJohn Marino } 12595b7b453SJohn Marino 12695b7b453SJohn Marino /* Helper functions for re_string_allocate, and re_string_construct. */ 12795b7b453SJohn Marino 12895b7b453SJohn Marino static reg_errcode_t 12995b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 13095b7b453SJohn Marino re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) 13195b7b453SJohn Marino { 13295b7b453SJohn Marino #ifdef RE_ENABLE_I18N 13395b7b453SJohn Marino if (pstr->mb_cur_max > 1) 13495b7b453SJohn Marino { 13595b7b453SJohn Marino wint_t *new_wcs; 13695b7b453SJohn Marino 137cf28ed85SJohn Marino /* Avoid overflow in realloc. */ 138cf28ed85SJohn Marino const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx)); 139cf28ed85SJohn Marino if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0)) 14095b7b453SJohn Marino return REG_ESPACE; 14195b7b453SJohn Marino 14295b7b453SJohn Marino new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); 14395b7b453SJohn Marino if (BE (new_wcs == NULL, 0)) 14495b7b453SJohn Marino return REG_ESPACE; 14595b7b453SJohn Marino pstr->wcs = new_wcs; 14695b7b453SJohn Marino if (pstr->offsets != NULL) 14795b7b453SJohn Marino { 14895b7b453SJohn Marino Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len); 14995b7b453SJohn Marino if (BE (new_offsets == NULL, 0)) 15095b7b453SJohn Marino return REG_ESPACE; 15195b7b453SJohn Marino pstr->offsets = new_offsets; 15295b7b453SJohn Marino } 15395b7b453SJohn Marino } 15495b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 15595b7b453SJohn Marino if (pstr->mbs_allocated) 15695b7b453SJohn Marino { 15795b7b453SJohn Marino unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 15895b7b453SJohn Marino new_buf_len); 15995b7b453SJohn Marino if (BE (new_mbs == NULL, 0)) 16095b7b453SJohn Marino return REG_ESPACE; 16195b7b453SJohn Marino pstr->mbs = new_mbs; 16295b7b453SJohn Marino } 16395b7b453SJohn Marino pstr->bufs_len = new_buf_len; 16495b7b453SJohn Marino return REG_NOERROR; 16595b7b453SJohn Marino } 16695b7b453SJohn Marino 16795b7b453SJohn Marino 16895b7b453SJohn Marino static void 16995b7b453SJohn Marino internal_function 17095b7b453SJohn Marino re_string_construct_common (const char *str, Idx len, re_string_t *pstr, 17195b7b453SJohn Marino RE_TRANSLATE_TYPE trans, bool icase, 17295b7b453SJohn Marino const re_dfa_t *dfa) 17395b7b453SJohn Marino { 17495b7b453SJohn Marino pstr->raw_mbs = (const unsigned char *) str; 17595b7b453SJohn Marino pstr->len = len; 17695b7b453SJohn Marino pstr->raw_len = len; 17795b7b453SJohn Marino pstr->trans = trans; 17895b7b453SJohn Marino pstr->icase = icase; 17995b7b453SJohn Marino pstr->mbs_allocated = (trans != NULL || icase); 18095b7b453SJohn Marino pstr->mb_cur_max = dfa->mb_cur_max; 18195b7b453SJohn Marino pstr->is_utf8 = dfa->is_utf8; 18295b7b453SJohn Marino pstr->map_notascii = dfa->map_notascii; 18395b7b453SJohn Marino pstr->stop = pstr->len; 18495b7b453SJohn Marino pstr->raw_stop = pstr->stop; 18595b7b453SJohn Marino } 18695b7b453SJohn Marino 18795b7b453SJohn Marino #ifdef RE_ENABLE_I18N 18895b7b453SJohn Marino 18995b7b453SJohn Marino /* Build wide character buffer PSTR->WCS. 19095b7b453SJohn Marino If the byte sequence of the string are: 19195b7b453SJohn Marino <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 19295b7b453SJohn Marino Then wide character buffer will be: 19395b7b453SJohn Marino <wc1> , WEOF , <wc2> , WEOF , <wc3> 19495b7b453SJohn Marino We use WEOF for padding, they indicate that the position isn't 19595b7b453SJohn Marino a first byte of a multibyte character. 19695b7b453SJohn Marino 19795b7b453SJohn Marino Note that this function assumes PSTR->VALID_LEN elements are already 19895b7b453SJohn Marino built and starts from PSTR->VALID_LEN. */ 19995b7b453SJohn Marino 20095b7b453SJohn Marino static void 20195b7b453SJohn Marino internal_function 20295b7b453SJohn Marino build_wcs_buffer (re_string_t *pstr) 20395b7b453SJohn Marino { 20495b7b453SJohn Marino #ifdef _LIBC 20595b7b453SJohn Marino unsigned char buf[MB_LEN_MAX]; 20695b7b453SJohn Marino assert (MB_LEN_MAX >= pstr->mb_cur_max); 20795b7b453SJohn Marino #else 20895b7b453SJohn Marino unsigned char buf[64]; 20995b7b453SJohn Marino #endif 21095b7b453SJohn Marino mbstate_t prev_st; 21195b7b453SJohn Marino Idx byte_idx, end_idx, remain_len; 21295b7b453SJohn Marino size_t mbclen; 21395b7b453SJohn Marino 21495b7b453SJohn Marino /* Build the buffers from pstr->valid_len to either pstr->len or 21595b7b453SJohn Marino pstr->bufs_len. */ 21695b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 21795b7b453SJohn Marino for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 21895b7b453SJohn Marino { 21995b7b453SJohn Marino wchar_t wc; 22095b7b453SJohn Marino const char *p; 22195b7b453SJohn Marino 22295b7b453SJohn Marino remain_len = end_idx - byte_idx; 22395b7b453SJohn Marino prev_st = pstr->cur_state; 22495b7b453SJohn Marino /* Apply the translation if we need. */ 22595b7b453SJohn Marino if (BE (pstr->trans != NULL, 0)) 22695b7b453SJohn Marino { 22795b7b453SJohn Marino int i, ch; 22895b7b453SJohn Marino 22995b7b453SJohn Marino for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 23095b7b453SJohn Marino { 23195b7b453SJohn Marino ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 23295b7b453SJohn Marino buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 23395b7b453SJohn Marino } 23495b7b453SJohn Marino p = (const char *) buf; 23595b7b453SJohn Marino } 23695b7b453SJohn Marino else 23795b7b453SJohn Marino p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 23895b7b453SJohn Marino mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 239cf28ed85SJohn Marino if (BE (mbclen == (size_t) -1 || mbclen == 0 240cf28ed85SJohn Marino || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0)) 24195b7b453SJohn Marino { 24295b7b453SJohn Marino /* We treat these cases as a singlebyte character. */ 24395b7b453SJohn Marino mbclen = 1; 24495b7b453SJohn Marino wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 24595b7b453SJohn Marino if (BE (pstr->trans != NULL, 0)) 24695b7b453SJohn Marino wc = pstr->trans[wc]; 24795b7b453SJohn Marino pstr->cur_state = prev_st; 24895b7b453SJohn Marino } 249cf28ed85SJohn Marino else if (BE (mbclen == (size_t) -2, 0)) 250cf28ed85SJohn Marino { 251cf28ed85SJohn Marino /* The buffer doesn't have enough space, finish to build. */ 252cf28ed85SJohn Marino pstr->cur_state = prev_st; 253cf28ed85SJohn Marino break; 254cf28ed85SJohn Marino } 25595b7b453SJohn Marino 25695b7b453SJohn Marino /* Write wide character and padding. */ 25795b7b453SJohn Marino pstr->wcs[byte_idx++] = wc; 25895b7b453SJohn Marino /* Write paddings. */ 25995b7b453SJohn Marino for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 26095b7b453SJohn Marino pstr->wcs[byte_idx++] = WEOF; 26195b7b453SJohn Marino } 26295b7b453SJohn Marino pstr->valid_len = byte_idx; 26395b7b453SJohn Marino pstr->valid_raw_len = byte_idx; 26495b7b453SJohn Marino } 26595b7b453SJohn Marino 26695b7b453SJohn Marino /* Build wide character buffer PSTR->WCS like build_wcs_buffer, 26795b7b453SJohn Marino but for REG_ICASE. */ 26895b7b453SJohn Marino 26995b7b453SJohn Marino static reg_errcode_t 27095b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 27195b7b453SJohn Marino build_wcs_upper_buffer (re_string_t *pstr) 27295b7b453SJohn Marino { 27395b7b453SJohn Marino mbstate_t prev_st; 27495b7b453SJohn Marino Idx src_idx, byte_idx, end_idx, remain_len; 27595b7b453SJohn Marino size_t mbclen; 27695b7b453SJohn Marino #ifdef _LIBC 27795b7b453SJohn Marino char buf[MB_LEN_MAX]; 27895b7b453SJohn Marino assert (MB_LEN_MAX >= pstr->mb_cur_max); 27995b7b453SJohn Marino #else 28095b7b453SJohn Marino char buf[64]; 28195b7b453SJohn Marino #endif 28295b7b453SJohn Marino 28395b7b453SJohn Marino byte_idx = pstr->valid_len; 28495b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 28595b7b453SJohn Marino 28695b7b453SJohn Marino /* The following optimization assumes that ASCII characters can be 28795b7b453SJohn Marino mapped to wide characters with a simple cast. */ 28895b7b453SJohn Marino if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 28995b7b453SJohn Marino { 29095b7b453SJohn Marino while (byte_idx < end_idx) 29195b7b453SJohn Marino { 29295b7b453SJohn Marino wchar_t wc; 29395b7b453SJohn Marino 29495b7b453SJohn Marino if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 29595b7b453SJohn Marino && mbsinit (&pstr->cur_state)) 29695b7b453SJohn Marino { 29795b7b453SJohn Marino /* In case of a singlebyte character. */ 29895b7b453SJohn Marino pstr->mbs[byte_idx] 29995b7b453SJohn Marino = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 30095b7b453SJohn Marino /* The next step uses the assumption that wchar_t is encoded 30195b7b453SJohn Marino ASCII-safe: all ASCII values can be converted like this. */ 30295b7b453SJohn Marino pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 30395b7b453SJohn Marino ++byte_idx; 30495b7b453SJohn Marino continue; 30595b7b453SJohn Marino } 30695b7b453SJohn Marino 30795b7b453SJohn Marino remain_len = end_idx - byte_idx; 30895b7b453SJohn Marino prev_st = pstr->cur_state; 30995b7b453SJohn Marino mbclen = __mbrtowc (&wc, 31095b7b453SJohn Marino ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 31195b7b453SJohn Marino + byte_idx), remain_len, &pstr->cur_state); 31295b7b453SJohn Marino if (BE (mbclen < (size_t) -2, 1)) 31395b7b453SJohn Marino { 314*680a9cb8SJohn Marino wchar_t wcu = towupper (wc); 315*680a9cb8SJohn Marino if (wcu != wc) 31695b7b453SJohn Marino { 31795b7b453SJohn Marino size_t mbcdlen; 31895b7b453SJohn Marino 31995b7b453SJohn Marino mbcdlen = wcrtomb (buf, wcu, &prev_st); 32095b7b453SJohn Marino if (BE (mbclen == mbcdlen, 1)) 32195b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, buf, mbclen); 32295b7b453SJohn Marino else 32395b7b453SJohn Marino { 32495b7b453SJohn Marino src_idx = byte_idx; 32595b7b453SJohn Marino goto offsets_needed; 32695b7b453SJohn Marino } 32795b7b453SJohn Marino } 32895b7b453SJohn Marino else 32995b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, 33095b7b453SJohn Marino pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 33195b7b453SJohn Marino pstr->wcs[byte_idx++] = wcu; 33295b7b453SJohn Marino /* Write paddings. */ 33395b7b453SJohn Marino for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 33495b7b453SJohn Marino pstr->wcs[byte_idx++] = WEOF; 33595b7b453SJohn Marino } 336cf28ed85SJohn Marino else if (mbclen == (size_t) -1 || mbclen == 0 337cf28ed85SJohn Marino || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len)) 33895b7b453SJohn Marino { 339cf28ed85SJohn Marino /* It is an invalid character, an incomplete character 340cf28ed85SJohn Marino at the end of the string, or '\0'. Just use the byte. */ 34195b7b453SJohn Marino int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 34295b7b453SJohn Marino pstr->mbs[byte_idx] = ch; 34395b7b453SJohn Marino /* And also cast it to wide char. */ 34495b7b453SJohn Marino pstr->wcs[byte_idx++] = (wchar_t) ch; 34595b7b453SJohn Marino if (BE (mbclen == (size_t) -1, 0)) 34695b7b453SJohn Marino pstr->cur_state = prev_st; 34795b7b453SJohn Marino } 34895b7b453SJohn Marino else 34995b7b453SJohn Marino { 35095b7b453SJohn Marino /* The buffer doesn't have enough space, finish to build. */ 35195b7b453SJohn Marino pstr->cur_state = prev_st; 35295b7b453SJohn Marino break; 35395b7b453SJohn Marino } 35495b7b453SJohn Marino } 35595b7b453SJohn Marino pstr->valid_len = byte_idx; 35695b7b453SJohn Marino pstr->valid_raw_len = byte_idx; 35795b7b453SJohn Marino return REG_NOERROR; 35895b7b453SJohn Marino } 35995b7b453SJohn Marino else 36095b7b453SJohn Marino for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 36195b7b453SJohn Marino { 36295b7b453SJohn Marino wchar_t wc; 36395b7b453SJohn Marino const char *p; 36495b7b453SJohn Marino offsets_needed: 36595b7b453SJohn Marino remain_len = end_idx - byte_idx; 36695b7b453SJohn Marino prev_st = pstr->cur_state; 36795b7b453SJohn Marino if (BE (pstr->trans != NULL, 0)) 36895b7b453SJohn Marino { 36995b7b453SJohn Marino int i, ch; 37095b7b453SJohn Marino 37195b7b453SJohn Marino for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 37295b7b453SJohn Marino { 37395b7b453SJohn Marino ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 37495b7b453SJohn Marino buf[i] = pstr->trans[ch]; 37595b7b453SJohn Marino } 37695b7b453SJohn Marino p = (const char *) buf; 37795b7b453SJohn Marino } 37895b7b453SJohn Marino else 37995b7b453SJohn Marino p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 38095b7b453SJohn Marino mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 38195b7b453SJohn Marino if (BE (mbclen < (size_t) -2, 1)) 38295b7b453SJohn Marino { 383*680a9cb8SJohn Marino wchar_t wcu = towupper (wc); 384*680a9cb8SJohn Marino if (wcu != wc) 38595b7b453SJohn Marino { 38695b7b453SJohn Marino size_t mbcdlen; 38795b7b453SJohn Marino 38895b7b453SJohn Marino mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 38995b7b453SJohn Marino if (BE (mbclen == mbcdlen, 1)) 39095b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, buf, mbclen); 39195b7b453SJohn Marino else if (mbcdlen != (size_t) -1) 39295b7b453SJohn Marino { 39395b7b453SJohn Marino size_t i; 39495b7b453SJohn Marino 39595b7b453SJohn Marino if (byte_idx + mbcdlen > pstr->bufs_len) 39695b7b453SJohn Marino { 39795b7b453SJohn Marino pstr->cur_state = prev_st; 39895b7b453SJohn Marino break; 39995b7b453SJohn Marino } 40095b7b453SJohn Marino 40195b7b453SJohn Marino if (pstr->offsets == NULL) 40295b7b453SJohn Marino { 40395b7b453SJohn Marino pstr->offsets = re_malloc (Idx, pstr->bufs_len); 40495b7b453SJohn Marino 40595b7b453SJohn Marino if (pstr->offsets == NULL) 40695b7b453SJohn Marino return REG_ESPACE; 40795b7b453SJohn Marino } 40895b7b453SJohn Marino if (!pstr->offsets_needed) 40995b7b453SJohn Marino { 41095b7b453SJohn Marino for (i = 0; i < (size_t) byte_idx; ++i) 41195b7b453SJohn Marino pstr->offsets[i] = i; 41295b7b453SJohn Marino pstr->offsets_needed = 1; 41395b7b453SJohn Marino } 41495b7b453SJohn Marino 41595b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 41695b7b453SJohn Marino pstr->wcs[byte_idx] = wcu; 41795b7b453SJohn Marino pstr->offsets[byte_idx] = src_idx; 41895b7b453SJohn Marino for (i = 1; i < mbcdlen; ++i) 41995b7b453SJohn Marino { 42095b7b453SJohn Marino pstr->offsets[byte_idx + i] 42195b7b453SJohn Marino = src_idx + (i < mbclen ? i : mbclen - 1); 42295b7b453SJohn Marino pstr->wcs[byte_idx + i] = WEOF; 42395b7b453SJohn Marino } 42495b7b453SJohn Marino pstr->len += mbcdlen - mbclen; 42595b7b453SJohn Marino if (pstr->raw_stop > src_idx) 42695b7b453SJohn Marino pstr->stop += mbcdlen - mbclen; 42795b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) 42895b7b453SJohn Marino ? pstr->len : pstr->bufs_len; 42995b7b453SJohn Marino byte_idx += mbcdlen; 43095b7b453SJohn Marino src_idx += mbclen; 43195b7b453SJohn Marino continue; 43295b7b453SJohn Marino } 43395b7b453SJohn Marino else 43495b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, p, mbclen); 43595b7b453SJohn Marino } 43695b7b453SJohn Marino else 43795b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, p, mbclen); 43895b7b453SJohn Marino 43995b7b453SJohn Marino if (BE (pstr->offsets_needed != 0, 0)) 44095b7b453SJohn Marino { 44195b7b453SJohn Marino size_t i; 44295b7b453SJohn Marino for (i = 0; i < mbclen; ++i) 44395b7b453SJohn Marino pstr->offsets[byte_idx + i] = src_idx + i; 44495b7b453SJohn Marino } 44595b7b453SJohn Marino src_idx += mbclen; 44695b7b453SJohn Marino 44795b7b453SJohn Marino pstr->wcs[byte_idx++] = wcu; 44895b7b453SJohn Marino /* Write paddings. */ 44995b7b453SJohn Marino for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 45095b7b453SJohn Marino pstr->wcs[byte_idx++] = WEOF; 45195b7b453SJohn Marino } 452cf28ed85SJohn Marino else if (mbclen == (size_t) -1 || mbclen == 0 453cf28ed85SJohn Marino || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len)) 45495b7b453SJohn Marino { 45595b7b453SJohn Marino /* It is an invalid character or '\0'. Just use the byte. */ 45695b7b453SJohn Marino int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 45795b7b453SJohn Marino 45895b7b453SJohn Marino if (BE (pstr->trans != NULL, 0)) 45995b7b453SJohn Marino ch = pstr->trans [ch]; 46095b7b453SJohn Marino pstr->mbs[byte_idx] = ch; 46195b7b453SJohn Marino 46295b7b453SJohn Marino if (BE (pstr->offsets_needed != 0, 0)) 46395b7b453SJohn Marino pstr->offsets[byte_idx] = src_idx; 46495b7b453SJohn Marino ++src_idx; 46595b7b453SJohn Marino 46695b7b453SJohn Marino /* And also cast it to wide char. */ 46795b7b453SJohn Marino pstr->wcs[byte_idx++] = (wchar_t) ch; 46895b7b453SJohn Marino if (BE (mbclen == (size_t) -1, 0)) 46995b7b453SJohn Marino pstr->cur_state = prev_st; 47095b7b453SJohn Marino } 47195b7b453SJohn Marino else 47295b7b453SJohn Marino { 47395b7b453SJohn Marino /* The buffer doesn't have enough space, finish to build. */ 47495b7b453SJohn Marino pstr->cur_state = prev_st; 47595b7b453SJohn Marino break; 47695b7b453SJohn Marino } 47795b7b453SJohn Marino } 47895b7b453SJohn Marino pstr->valid_len = byte_idx; 47995b7b453SJohn Marino pstr->valid_raw_len = src_idx; 48095b7b453SJohn Marino return REG_NOERROR; 48195b7b453SJohn Marino } 48295b7b453SJohn Marino 48395b7b453SJohn Marino /* Skip characters until the index becomes greater than NEW_RAW_IDX. 48495b7b453SJohn Marino Return the index. */ 48595b7b453SJohn Marino 48695b7b453SJohn Marino static Idx 48795b7b453SJohn Marino internal_function 48895b7b453SJohn Marino re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) 48995b7b453SJohn Marino { 49095b7b453SJohn Marino mbstate_t prev_st; 49195b7b453SJohn Marino Idx rawbuf_idx; 49295b7b453SJohn Marino size_t mbclen; 49395b7b453SJohn Marino wint_t wc = WEOF; 49495b7b453SJohn Marino 49595b7b453SJohn Marino /* Skip the characters which are not necessary to check. */ 49695b7b453SJohn Marino for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 49795b7b453SJohn Marino rawbuf_idx < new_raw_idx;) 49895b7b453SJohn Marino { 49995b7b453SJohn Marino wchar_t wc2; 500cf28ed85SJohn Marino Idx remain_len = pstr->raw_len - rawbuf_idx; 50195b7b453SJohn Marino prev_st = pstr->cur_state; 50295b7b453SJohn Marino mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, 50395b7b453SJohn Marino remain_len, &pstr->cur_state); 50495b7b453SJohn Marino if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) 50595b7b453SJohn Marino { 50695b7b453SJohn Marino /* We treat these cases as a single byte character. */ 50795b7b453SJohn Marino if (mbclen == 0 || remain_len == 0) 50895b7b453SJohn Marino wc = L'\0'; 50995b7b453SJohn Marino else 51095b7b453SJohn Marino wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); 51195b7b453SJohn Marino mbclen = 1; 51295b7b453SJohn Marino pstr->cur_state = prev_st; 51395b7b453SJohn Marino } 51495b7b453SJohn Marino else 51595b7b453SJohn Marino wc = wc2; 51695b7b453SJohn Marino /* Then proceed the next character. */ 51795b7b453SJohn Marino rawbuf_idx += mbclen; 51895b7b453SJohn Marino } 51995b7b453SJohn Marino *last_wc = wc; 52095b7b453SJohn Marino return rawbuf_idx; 52195b7b453SJohn Marino } 52295b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 52395b7b453SJohn Marino 52495b7b453SJohn Marino /* Build the buffer PSTR->MBS, and apply the translation if we need. 52595b7b453SJohn Marino This function is used in case of REG_ICASE. */ 52695b7b453SJohn Marino 52795b7b453SJohn Marino static void 52895b7b453SJohn Marino internal_function 52995b7b453SJohn Marino build_upper_buffer (re_string_t *pstr) 53095b7b453SJohn Marino { 53195b7b453SJohn Marino Idx char_idx, end_idx; 53295b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 53395b7b453SJohn Marino 53495b7b453SJohn Marino for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 53595b7b453SJohn Marino { 53695b7b453SJohn Marino int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 53795b7b453SJohn Marino if (BE (pstr->trans != NULL, 0)) 53895b7b453SJohn Marino ch = pstr->trans[ch]; 53995b7b453SJohn Marino pstr->mbs[char_idx] = toupper (ch); 54095b7b453SJohn Marino } 54195b7b453SJohn Marino pstr->valid_len = char_idx; 54295b7b453SJohn Marino pstr->valid_raw_len = char_idx; 54395b7b453SJohn Marino } 54495b7b453SJohn Marino 54595b7b453SJohn Marino /* Apply TRANS to the buffer in PSTR. */ 54695b7b453SJohn Marino 54795b7b453SJohn Marino static void 54895b7b453SJohn Marino internal_function 54995b7b453SJohn Marino re_string_translate_buffer (re_string_t *pstr) 55095b7b453SJohn Marino { 55195b7b453SJohn Marino Idx buf_idx, end_idx; 55295b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 55395b7b453SJohn Marino 55495b7b453SJohn Marino for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 55595b7b453SJohn Marino { 55695b7b453SJohn Marino int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 55795b7b453SJohn Marino pstr->mbs[buf_idx] = pstr->trans[ch]; 55895b7b453SJohn Marino } 55995b7b453SJohn Marino 56095b7b453SJohn Marino pstr->valid_len = buf_idx; 56195b7b453SJohn Marino pstr->valid_raw_len = buf_idx; 56295b7b453SJohn Marino } 56395b7b453SJohn Marino 56495b7b453SJohn Marino /* This function re-construct the buffers. 56595b7b453SJohn Marino Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 56695b7b453SJohn Marino convert to upper case in case of REG_ICASE, apply translation. */ 56795b7b453SJohn Marino 56895b7b453SJohn Marino static reg_errcode_t 56995b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 57095b7b453SJohn Marino re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) 57195b7b453SJohn Marino { 57295b7b453SJohn Marino Idx offset; 57395b7b453SJohn Marino 57495b7b453SJohn Marino if (BE (pstr->raw_mbs_idx <= idx, 0)) 57595b7b453SJohn Marino offset = idx - pstr->raw_mbs_idx; 57695b7b453SJohn Marino else 57795b7b453SJohn Marino { 57895b7b453SJohn Marino /* Reset buffer. */ 57995b7b453SJohn Marino #ifdef RE_ENABLE_I18N 58095b7b453SJohn Marino if (pstr->mb_cur_max > 1) 58195b7b453SJohn Marino memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 58295b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 58395b7b453SJohn Marino pstr->len = pstr->raw_len; 58495b7b453SJohn Marino pstr->stop = pstr->raw_stop; 58595b7b453SJohn Marino pstr->valid_len = 0; 58695b7b453SJohn Marino pstr->raw_mbs_idx = 0; 58795b7b453SJohn Marino pstr->valid_raw_len = 0; 58895b7b453SJohn Marino pstr->offsets_needed = 0; 58995b7b453SJohn Marino pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 59095b7b453SJohn Marino : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 59195b7b453SJohn Marino if (!pstr->mbs_allocated) 59295b7b453SJohn Marino pstr->mbs = (unsigned char *) pstr->raw_mbs; 59395b7b453SJohn Marino offset = idx; 59495b7b453SJohn Marino } 59595b7b453SJohn Marino 59695b7b453SJohn Marino if (BE (offset != 0, 1)) 59795b7b453SJohn Marino { 59895b7b453SJohn Marino /* Should the already checked characters be kept? */ 59995b7b453SJohn Marino if (BE (offset < pstr->valid_raw_len, 1)) 60095b7b453SJohn Marino { 60195b7b453SJohn Marino /* Yes, move them to the front of the buffer. */ 60295b7b453SJohn Marino #ifdef RE_ENABLE_I18N 60395b7b453SJohn Marino if (BE (pstr->offsets_needed, 0)) 60495b7b453SJohn Marino { 60595b7b453SJohn Marino Idx low = 0, high = pstr->valid_len, mid; 60695b7b453SJohn Marino do 60795b7b453SJohn Marino { 60895b7b453SJohn Marino mid = (high + low) / 2; 60995b7b453SJohn Marino if (pstr->offsets[mid] > offset) 61095b7b453SJohn Marino high = mid; 61195b7b453SJohn Marino else if (pstr->offsets[mid] < offset) 61295b7b453SJohn Marino low = mid + 1; 61395b7b453SJohn Marino else 61495b7b453SJohn Marino break; 61595b7b453SJohn Marino } 61695b7b453SJohn Marino while (low < high); 61795b7b453SJohn Marino if (pstr->offsets[mid] < offset) 61895b7b453SJohn Marino ++mid; 61995b7b453SJohn Marino pstr->tip_context = re_string_context_at (pstr, mid - 1, 62095b7b453SJohn Marino eflags); 62195b7b453SJohn Marino /* This can be quite complicated, so handle specially 62295b7b453SJohn Marino only the common and easy case where the character with 62395b7b453SJohn Marino different length representation of lower and upper 62495b7b453SJohn Marino case is present at or after offset. */ 62595b7b453SJohn Marino if (pstr->valid_len > offset 62695b7b453SJohn Marino && mid == offset && pstr->offsets[mid] == offset) 62795b7b453SJohn Marino { 62895b7b453SJohn Marino memmove (pstr->wcs, pstr->wcs + offset, 62995b7b453SJohn Marino (pstr->valid_len - offset) * sizeof (wint_t)); 63095b7b453SJohn Marino memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); 63195b7b453SJohn Marino pstr->valid_len -= offset; 63295b7b453SJohn Marino pstr->valid_raw_len -= offset; 63395b7b453SJohn Marino for (low = 0; low < pstr->valid_len; low++) 63495b7b453SJohn Marino pstr->offsets[low] = pstr->offsets[low + offset] - offset; 63595b7b453SJohn Marino } 63695b7b453SJohn Marino else 63795b7b453SJohn Marino { 63895b7b453SJohn Marino /* Otherwise, just find out how long the partial multibyte 63995b7b453SJohn Marino character at offset is and fill it with WEOF/255. */ 64095b7b453SJohn Marino pstr->len = pstr->raw_len - idx + offset; 64195b7b453SJohn Marino pstr->stop = pstr->raw_stop - idx + offset; 64295b7b453SJohn Marino pstr->offsets_needed = 0; 64395b7b453SJohn Marino while (mid > 0 && pstr->offsets[mid - 1] == offset) 64495b7b453SJohn Marino --mid; 64595b7b453SJohn Marino while (mid < pstr->valid_len) 64695b7b453SJohn Marino if (pstr->wcs[mid] != WEOF) 64795b7b453SJohn Marino break; 64895b7b453SJohn Marino else 64995b7b453SJohn Marino ++mid; 65095b7b453SJohn Marino if (mid == pstr->valid_len) 65195b7b453SJohn Marino pstr->valid_len = 0; 65295b7b453SJohn Marino else 65395b7b453SJohn Marino { 65495b7b453SJohn Marino pstr->valid_len = pstr->offsets[mid] - offset; 65595b7b453SJohn Marino if (pstr->valid_len) 65695b7b453SJohn Marino { 65795b7b453SJohn Marino for (low = 0; low < pstr->valid_len; ++low) 65895b7b453SJohn Marino pstr->wcs[low] = WEOF; 65995b7b453SJohn Marino memset (pstr->mbs, 255, pstr->valid_len); 66095b7b453SJohn Marino } 66195b7b453SJohn Marino } 66295b7b453SJohn Marino pstr->valid_raw_len = pstr->valid_len; 66395b7b453SJohn Marino } 66495b7b453SJohn Marino } 66595b7b453SJohn Marino else 66695b7b453SJohn Marino #endif 66795b7b453SJohn Marino { 66895b7b453SJohn Marino pstr->tip_context = re_string_context_at (pstr, offset - 1, 66995b7b453SJohn Marino eflags); 67095b7b453SJohn Marino #ifdef RE_ENABLE_I18N 67195b7b453SJohn Marino if (pstr->mb_cur_max > 1) 67295b7b453SJohn Marino memmove (pstr->wcs, pstr->wcs + offset, 67395b7b453SJohn Marino (pstr->valid_len - offset) * sizeof (wint_t)); 67495b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 67595b7b453SJohn Marino if (BE (pstr->mbs_allocated, 0)) 67695b7b453SJohn Marino memmove (pstr->mbs, pstr->mbs + offset, 67795b7b453SJohn Marino pstr->valid_len - offset); 67895b7b453SJohn Marino pstr->valid_len -= offset; 67995b7b453SJohn Marino pstr->valid_raw_len -= offset; 68095b7b453SJohn Marino #if DEBUG 68195b7b453SJohn Marino assert (pstr->valid_len > 0); 68295b7b453SJohn Marino #endif 68395b7b453SJohn Marino } 68495b7b453SJohn Marino } 68595b7b453SJohn Marino else 68695b7b453SJohn Marino { 68795b7b453SJohn Marino #ifdef RE_ENABLE_I18N 68895b7b453SJohn Marino /* No, skip all characters until IDX. */ 68995b7b453SJohn Marino Idx prev_valid_len = pstr->valid_len; 69095b7b453SJohn Marino 69195b7b453SJohn Marino if (BE (pstr->offsets_needed, 0)) 69295b7b453SJohn Marino { 69395b7b453SJohn Marino pstr->len = pstr->raw_len - idx + offset; 69495b7b453SJohn Marino pstr->stop = pstr->raw_stop - idx + offset; 69595b7b453SJohn Marino pstr->offsets_needed = 0; 69695b7b453SJohn Marino } 69795b7b453SJohn Marino #endif 69895b7b453SJohn Marino pstr->valid_len = 0; 69995b7b453SJohn Marino #ifdef RE_ENABLE_I18N 70095b7b453SJohn Marino if (pstr->mb_cur_max > 1) 70195b7b453SJohn Marino { 70295b7b453SJohn Marino Idx wcs_idx; 70395b7b453SJohn Marino wint_t wc = WEOF; 70495b7b453SJohn Marino 70595b7b453SJohn Marino if (pstr->is_utf8) 70695b7b453SJohn Marino { 70795b7b453SJohn Marino const unsigned char *raw, *p, *end; 70895b7b453SJohn Marino 70995b7b453SJohn Marino /* Special case UTF-8. Multi-byte chars start with any 71095b7b453SJohn Marino byte other than 0x80 - 0xbf. */ 71195b7b453SJohn Marino raw = pstr->raw_mbs + pstr->raw_mbs_idx; 71295b7b453SJohn Marino end = raw + (offset - pstr->mb_cur_max); 71395b7b453SJohn Marino if (end < pstr->raw_mbs) 71495b7b453SJohn Marino end = pstr->raw_mbs; 71595b7b453SJohn Marino p = raw + offset - 1; 71695b7b453SJohn Marino #ifdef _LIBC 71795b7b453SJohn Marino /* We know the wchar_t encoding is UCS4, so for the simple 71895b7b453SJohn Marino case, ASCII characters, skip the conversion step. */ 71995b7b453SJohn Marino if (isascii (*p) && BE (pstr->trans == NULL, 1)) 72095b7b453SJohn Marino { 72195b7b453SJohn Marino memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 72295b7b453SJohn Marino /* pstr->valid_len = 0; */ 72395b7b453SJohn Marino wc = (wchar_t) *p; 72495b7b453SJohn Marino } 72595b7b453SJohn Marino else 72695b7b453SJohn Marino #endif 72795b7b453SJohn Marino for (; p >= end; --p) 72895b7b453SJohn Marino if ((*p & 0xc0) != 0x80) 72995b7b453SJohn Marino { 73095b7b453SJohn Marino mbstate_t cur_state; 73195b7b453SJohn Marino wchar_t wc2; 73295b7b453SJohn Marino Idx mlen = raw + pstr->len - p; 733cf28ed85SJohn Marino unsigned char buf[6]; 73495b7b453SJohn Marino size_t mbclen; 73595b7b453SJohn Marino 736cf28ed85SJohn Marino const unsigned char *pp = p; 73795b7b453SJohn Marino if (BE (pstr->trans != NULL, 0)) 73895b7b453SJohn Marino { 73995b7b453SJohn Marino int i = mlen < 6 ? mlen : 6; 74095b7b453SJohn Marino while (--i >= 0) 74195b7b453SJohn Marino buf[i] = pstr->trans[p[i]]; 742cf28ed85SJohn Marino pp = buf; 74395b7b453SJohn Marino } 74495b7b453SJohn Marino /* XXX Don't use mbrtowc, we know which conversion 74595b7b453SJohn Marino to use (UTF-8 -> UCS4). */ 74695b7b453SJohn Marino memset (&cur_state, 0, sizeof (cur_state)); 747cf28ed85SJohn Marino mbclen = __mbrtowc (&wc2, (const char *) pp, mlen, 74895b7b453SJohn Marino &cur_state); 74995b7b453SJohn Marino if (raw + offset - p <= mbclen 75095b7b453SJohn Marino && mbclen < (size_t) -2) 75195b7b453SJohn Marino { 75295b7b453SJohn Marino memset (&pstr->cur_state, '\0', 75395b7b453SJohn Marino sizeof (mbstate_t)); 75495b7b453SJohn Marino pstr->valid_len = mbclen - (raw + offset - p); 75595b7b453SJohn Marino wc = wc2; 75695b7b453SJohn Marino } 75795b7b453SJohn Marino break; 75895b7b453SJohn Marino } 75995b7b453SJohn Marino } 76095b7b453SJohn Marino 76195b7b453SJohn Marino if (wc == WEOF) 76295b7b453SJohn Marino pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 76395b7b453SJohn Marino if (wc == WEOF) 76495b7b453SJohn Marino pstr->tip_context 76595b7b453SJohn Marino = re_string_context_at (pstr, prev_valid_len - 1, eflags); 76695b7b453SJohn Marino else 76795b7b453SJohn Marino pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 76895b7b453SJohn Marino && IS_WIDE_WORD_CHAR (wc)) 76995b7b453SJohn Marino ? CONTEXT_WORD 77095b7b453SJohn Marino : ((IS_WIDE_NEWLINE (wc) 77195b7b453SJohn Marino && pstr->newline_anchor) 77295b7b453SJohn Marino ? CONTEXT_NEWLINE : 0)); 77395b7b453SJohn Marino if (BE (pstr->valid_len, 0)) 77495b7b453SJohn Marino { 77595b7b453SJohn Marino for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 77695b7b453SJohn Marino pstr->wcs[wcs_idx] = WEOF; 77795b7b453SJohn Marino if (pstr->mbs_allocated) 77895b7b453SJohn Marino memset (pstr->mbs, 255, pstr->valid_len); 77995b7b453SJohn Marino } 78095b7b453SJohn Marino pstr->valid_raw_len = pstr->valid_len; 78195b7b453SJohn Marino } 78295b7b453SJohn Marino else 78395b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 78495b7b453SJohn Marino { 78595b7b453SJohn Marino int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 78695b7b453SJohn Marino pstr->valid_raw_len = 0; 78795b7b453SJohn Marino if (pstr->trans) 78895b7b453SJohn Marino c = pstr->trans[c]; 78995b7b453SJohn Marino pstr->tip_context = (bitset_contain (pstr->word_char, c) 79095b7b453SJohn Marino ? CONTEXT_WORD 79195b7b453SJohn Marino : ((IS_NEWLINE (c) && pstr->newline_anchor) 79295b7b453SJohn Marino ? CONTEXT_NEWLINE : 0)); 79395b7b453SJohn Marino } 79495b7b453SJohn Marino } 79595b7b453SJohn Marino if (!BE (pstr->mbs_allocated, 0)) 79695b7b453SJohn Marino pstr->mbs += offset; 79795b7b453SJohn Marino } 79895b7b453SJohn Marino pstr->raw_mbs_idx = idx; 79995b7b453SJohn Marino pstr->len -= offset; 80095b7b453SJohn Marino pstr->stop -= offset; 80195b7b453SJohn Marino 80295b7b453SJohn Marino /* Then build the buffers. */ 80395b7b453SJohn Marino #ifdef RE_ENABLE_I18N 80495b7b453SJohn Marino if (pstr->mb_cur_max > 1) 80595b7b453SJohn Marino { 80695b7b453SJohn Marino if (pstr->icase) 80795b7b453SJohn Marino { 80895b7b453SJohn Marino reg_errcode_t ret = build_wcs_upper_buffer (pstr); 80995b7b453SJohn Marino if (BE (ret != REG_NOERROR, 0)) 81095b7b453SJohn Marino return ret; 81195b7b453SJohn Marino } 81295b7b453SJohn Marino else 81395b7b453SJohn Marino build_wcs_buffer (pstr); 81495b7b453SJohn Marino } 81595b7b453SJohn Marino else 81695b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 81795b7b453SJohn Marino if (BE (pstr->mbs_allocated, 0)) 81895b7b453SJohn Marino { 81995b7b453SJohn Marino if (pstr->icase) 82095b7b453SJohn Marino build_upper_buffer (pstr); 82195b7b453SJohn Marino else if (pstr->trans != NULL) 82295b7b453SJohn Marino re_string_translate_buffer (pstr); 82395b7b453SJohn Marino } 82495b7b453SJohn Marino else 82595b7b453SJohn Marino pstr->valid_len = pstr->len; 82695b7b453SJohn Marino 82795b7b453SJohn Marino pstr->cur_idx = 0; 82895b7b453SJohn Marino return REG_NOERROR; 82995b7b453SJohn Marino } 83095b7b453SJohn Marino 83195b7b453SJohn Marino static unsigned char 832*680a9cb8SJohn Marino internal_function __attribute__ ((pure)) 83395b7b453SJohn Marino re_string_peek_byte_case (const re_string_t *pstr, Idx idx) 83495b7b453SJohn Marino { 83595b7b453SJohn Marino int ch; 83695b7b453SJohn Marino Idx off; 83795b7b453SJohn Marino 83895b7b453SJohn Marino /* Handle the common (easiest) cases first. */ 83995b7b453SJohn Marino if (BE (!pstr->mbs_allocated, 1)) 84095b7b453SJohn Marino return re_string_peek_byte (pstr, idx); 84195b7b453SJohn Marino 84295b7b453SJohn Marino #ifdef RE_ENABLE_I18N 84395b7b453SJohn Marino if (pstr->mb_cur_max > 1 84495b7b453SJohn Marino && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 84595b7b453SJohn Marino return re_string_peek_byte (pstr, idx); 84695b7b453SJohn Marino #endif 84795b7b453SJohn Marino 84895b7b453SJohn Marino off = pstr->cur_idx + idx; 84995b7b453SJohn Marino #ifdef RE_ENABLE_I18N 85095b7b453SJohn Marino if (pstr->offsets_needed) 85195b7b453SJohn Marino off = pstr->offsets[off]; 85295b7b453SJohn Marino #endif 85395b7b453SJohn Marino 85495b7b453SJohn Marino ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 85595b7b453SJohn Marino 85695b7b453SJohn Marino #ifdef RE_ENABLE_I18N 85795b7b453SJohn Marino /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 85895b7b453SJohn Marino this function returns CAPITAL LETTER I instead of first byte of 85995b7b453SJohn Marino DOTLESS SMALL LETTER I. The latter would confuse the parser, 86095b7b453SJohn Marino since peek_byte_case doesn't advance cur_idx in any way. */ 86195b7b453SJohn Marino if (pstr->offsets_needed && !isascii (ch)) 86295b7b453SJohn Marino return re_string_peek_byte (pstr, idx); 86395b7b453SJohn Marino #endif 86495b7b453SJohn Marino 86595b7b453SJohn Marino return ch; 86695b7b453SJohn Marino } 86795b7b453SJohn Marino 86895b7b453SJohn Marino static unsigned char 869cf28ed85SJohn Marino internal_function 87095b7b453SJohn Marino re_string_fetch_byte_case (re_string_t *pstr) 87195b7b453SJohn Marino { 87295b7b453SJohn Marino if (BE (!pstr->mbs_allocated, 1)) 87395b7b453SJohn Marino return re_string_fetch_byte (pstr); 87495b7b453SJohn Marino 87595b7b453SJohn Marino #ifdef RE_ENABLE_I18N 87695b7b453SJohn Marino if (pstr->offsets_needed) 87795b7b453SJohn Marino { 87895b7b453SJohn Marino Idx off; 87995b7b453SJohn Marino int ch; 88095b7b453SJohn Marino 88195b7b453SJohn Marino /* For tr_TR.UTF-8 [[:islower:]] there is 88295b7b453SJohn Marino [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 88395b7b453SJohn Marino in that case the whole multi-byte character and return 88495b7b453SJohn Marino the original letter. On the other side, with 88595b7b453SJohn Marino [[: DOTLESS SMALL LETTER I return [[:I, as doing 88695b7b453SJohn Marino anything else would complicate things too much. */ 88795b7b453SJohn Marino 88895b7b453SJohn Marino if (!re_string_first_byte (pstr, pstr->cur_idx)) 88995b7b453SJohn Marino return re_string_fetch_byte (pstr); 89095b7b453SJohn Marino 89195b7b453SJohn Marino off = pstr->offsets[pstr->cur_idx]; 89295b7b453SJohn Marino ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 89395b7b453SJohn Marino 89495b7b453SJohn Marino if (! isascii (ch)) 89595b7b453SJohn Marino return re_string_fetch_byte (pstr); 89695b7b453SJohn Marino 89795b7b453SJohn Marino re_string_skip_bytes (pstr, 89895b7b453SJohn Marino re_string_char_size_at (pstr, pstr->cur_idx)); 89995b7b453SJohn Marino return ch; 90095b7b453SJohn Marino } 90195b7b453SJohn Marino #endif 90295b7b453SJohn Marino 90395b7b453SJohn Marino return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 90495b7b453SJohn Marino } 90595b7b453SJohn Marino 90695b7b453SJohn Marino static void 90795b7b453SJohn Marino internal_function 90895b7b453SJohn Marino re_string_destruct (re_string_t *pstr) 90995b7b453SJohn Marino { 91095b7b453SJohn Marino #ifdef RE_ENABLE_I18N 91195b7b453SJohn Marino re_free (pstr->wcs); 91295b7b453SJohn Marino re_free (pstr->offsets); 91395b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 91495b7b453SJohn Marino if (pstr->mbs_allocated) 91595b7b453SJohn Marino re_free (pstr->mbs); 91695b7b453SJohn Marino } 91795b7b453SJohn Marino 91895b7b453SJohn Marino /* Return the context at IDX in INPUT. */ 91995b7b453SJohn Marino 92095b7b453SJohn Marino static unsigned int 92195b7b453SJohn Marino internal_function 92295b7b453SJohn Marino re_string_context_at (const re_string_t *input, Idx idx, int eflags) 92395b7b453SJohn Marino { 92495b7b453SJohn Marino int c; 92595b7b453SJohn Marino if (BE (! REG_VALID_INDEX (idx), 0)) 92695b7b453SJohn Marino /* In this case, we use the value stored in input->tip_context, 92795b7b453SJohn Marino since we can't know the character in input->mbs[-1] here. */ 92895b7b453SJohn Marino return input->tip_context; 92995b7b453SJohn Marino if (BE (idx == input->len, 0)) 93095b7b453SJohn Marino return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 93195b7b453SJohn Marino : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 93295b7b453SJohn Marino #ifdef RE_ENABLE_I18N 93395b7b453SJohn Marino if (input->mb_cur_max > 1) 93495b7b453SJohn Marino { 93595b7b453SJohn Marino wint_t wc; 93695b7b453SJohn Marino Idx wc_idx = idx; 93795b7b453SJohn Marino while(input->wcs[wc_idx] == WEOF) 93895b7b453SJohn Marino { 93995b7b453SJohn Marino #ifdef DEBUG 94095b7b453SJohn Marino /* It must not happen. */ 94195b7b453SJohn Marino assert (REG_VALID_INDEX (wc_idx)); 94295b7b453SJohn Marino #endif 94395b7b453SJohn Marino --wc_idx; 94495b7b453SJohn Marino if (! REG_VALID_INDEX (wc_idx)) 94595b7b453SJohn Marino return input->tip_context; 94695b7b453SJohn Marino } 94795b7b453SJohn Marino wc = input->wcs[wc_idx]; 94895b7b453SJohn Marino if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 94995b7b453SJohn Marino return CONTEXT_WORD; 95095b7b453SJohn Marino return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 95195b7b453SJohn Marino ? CONTEXT_NEWLINE : 0); 95295b7b453SJohn Marino } 95395b7b453SJohn Marino else 95495b7b453SJohn Marino #endif 95595b7b453SJohn Marino { 95695b7b453SJohn Marino c = re_string_byte_at (input, idx); 95795b7b453SJohn Marino if (bitset_contain (input->word_char, c)) 95895b7b453SJohn Marino return CONTEXT_WORD; 95995b7b453SJohn Marino return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 96095b7b453SJohn Marino } 96195b7b453SJohn Marino } 96295b7b453SJohn Marino 96395b7b453SJohn Marino /* Functions for set operation. */ 96495b7b453SJohn Marino 96595b7b453SJohn Marino static reg_errcode_t 96695b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 96795b7b453SJohn Marino re_node_set_alloc (re_node_set *set, Idx size) 96895b7b453SJohn Marino { 96995b7b453SJohn Marino set->alloc = size; 97095b7b453SJohn Marino set->nelem = 0; 97195b7b453SJohn Marino set->elems = re_malloc (Idx, size); 972*680a9cb8SJohn Marino if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0)) 97395b7b453SJohn Marino return REG_ESPACE; 97495b7b453SJohn Marino return REG_NOERROR; 97595b7b453SJohn Marino } 97695b7b453SJohn Marino 97795b7b453SJohn Marino static reg_errcode_t 97895b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 97995b7b453SJohn Marino re_node_set_init_1 (re_node_set *set, Idx elem) 98095b7b453SJohn Marino { 98195b7b453SJohn Marino set->alloc = 1; 98295b7b453SJohn Marino set->nelem = 1; 98395b7b453SJohn Marino set->elems = re_malloc (Idx, 1); 98495b7b453SJohn Marino if (BE (set->elems == NULL, 0)) 98595b7b453SJohn Marino { 98695b7b453SJohn Marino set->alloc = set->nelem = 0; 98795b7b453SJohn Marino return REG_ESPACE; 98895b7b453SJohn Marino } 98995b7b453SJohn Marino set->elems[0] = elem; 99095b7b453SJohn Marino return REG_NOERROR; 99195b7b453SJohn Marino } 99295b7b453SJohn Marino 99395b7b453SJohn Marino static reg_errcode_t 99495b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 99595b7b453SJohn Marino re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2) 99695b7b453SJohn Marino { 99795b7b453SJohn Marino set->alloc = 2; 99895b7b453SJohn Marino set->elems = re_malloc (Idx, 2); 99995b7b453SJohn Marino if (BE (set->elems == NULL, 0)) 100095b7b453SJohn Marino return REG_ESPACE; 100195b7b453SJohn Marino if (elem1 == elem2) 100295b7b453SJohn Marino { 100395b7b453SJohn Marino set->nelem = 1; 100495b7b453SJohn Marino set->elems[0] = elem1; 100595b7b453SJohn Marino } 100695b7b453SJohn Marino else 100795b7b453SJohn Marino { 100895b7b453SJohn Marino set->nelem = 2; 100995b7b453SJohn Marino if (elem1 < elem2) 101095b7b453SJohn Marino { 101195b7b453SJohn Marino set->elems[0] = elem1; 101295b7b453SJohn Marino set->elems[1] = elem2; 101395b7b453SJohn Marino } 101495b7b453SJohn Marino else 101595b7b453SJohn Marino { 101695b7b453SJohn Marino set->elems[0] = elem2; 101795b7b453SJohn Marino set->elems[1] = elem1; 101895b7b453SJohn Marino } 101995b7b453SJohn Marino } 102095b7b453SJohn Marino return REG_NOERROR; 102195b7b453SJohn Marino } 102295b7b453SJohn Marino 102395b7b453SJohn Marino static reg_errcode_t 102495b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 102595b7b453SJohn Marino re_node_set_init_copy (re_node_set *dest, const re_node_set *src) 102695b7b453SJohn Marino { 102795b7b453SJohn Marino dest->nelem = src->nelem; 102895b7b453SJohn Marino if (src->nelem > 0) 102995b7b453SJohn Marino { 103095b7b453SJohn Marino dest->alloc = dest->nelem; 103195b7b453SJohn Marino dest->elems = re_malloc (Idx, dest->alloc); 103295b7b453SJohn Marino if (BE (dest->elems == NULL, 0)) 103395b7b453SJohn Marino { 103495b7b453SJohn Marino dest->alloc = dest->nelem = 0; 103595b7b453SJohn Marino return REG_ESPACE; 103695b7b453SJohn Marino } 103795b7b453SJohn Marino memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 103895b7b453SJohn Marino } 103995b7b453SJohn Marino else 104095b7b453SJohn Marino re_node_set_init_empty (dest); 104195b7b453SJohn Marino return REG_NOERROR; 104295b7b453SJohn Marino } 104395b7b453SJohn Marino 104495b7b453SJohn Marino /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 104595b7b453SJohn Marino DEST. Return value indicate the error code or REG_NOERROR if succeeded. 104695b7b453SJohn Marino Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 104795b7b453SJohn Marino 104895b7b453SJohn Marino static reg_errcode_t 104995b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 105095b7b453SJohn Marino re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 105195b7b453SJohn Marino const re_node_set *src2) 105295b7b453SJohn Marino { 105395b7b453SJohn Marino Idx i1, i2, is, id, delta, sbase; 105495b7b453SJohn Marino if (src1->nelem == 0 || src2->nelem == 0) 105595b7b453SJohn Marino return REG_NOERROR; 105695b7b453SJohn Marino 105795b7b453SJohn Marino /* We need dest->nelem + 2 * elems_in_intersection; this is a 105895b7b453SJohn Marino conservative estimate. */ 105995b7b453SJohn Marino if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 106095b7b453SJohn Marino { 106195b7b453SJohn Marino Idx new_alloc = src1->nelem + src2->nelem + dest->alloc; 106295b7b453SJohn Marino Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc); 106395b7b453SJohn Marino if (BE (new_elems == NULL, 0)) 106495b7b453SJohn Marino return REG_ESPACE; 106595b7b453SJohn Marino dest->elems = new_elems; 106695b7b453SJohn Marino dest->alloc = new_alloc; 106795b7b453SJohn Marino } 106895b7b453SJohn Marino 106995b7b453SJohn Marino /* Find the items in the intersection of SRC1 and SRC2, and copy 107095b7b453SJohn Marino into the top of DEST those that are not already in DEST itself. */ 107195b7b453SJohn Marino sbase = dest->nelem + src1->nelem + src2->nelem; 107295b7b453SJohn Marino i1 = src1->nelem - 1; 107395b7b453SJohn Marino i2 = src2->nelem - 1; 107495b7b453SJohn Marino id = dest->nelem - 1; 107595b7b453SJohn Marino for (;;) 107695b7b453SJohn Marino { 107795b7b453SJohn Marino if (src1->elems[i1] == src2->elems[i2]) 107895b7b453SJohn Marino { 107995b7b453SJohn Marino /* Try to find the item in DEST. Maybe we could binary search? */ 108095b7b453SJohn Marino while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1]) 108195b7b453SJohn Marino --id; 108295b7b453SJohn Marino 108395b7b453SJohn Marino if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1]) 108495b7b453SJohn Marino dest->elems[--sbase] = src1->elems[i1]; 108595b7b453SJohn Marino 108695b7b453SJohn Marino if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2)) 108795b7b453SJohn Marino break; 108895b7b453SJohn Marino } 108995b7b453SJohn Marino 109095b7b453SJohn Marino /* Lower the highest of the two items. */ 109195b7b453SJohn Marino else if (src1->elems[i1] < src2->elems[i2]) 109295b7b453SJohn Marino { 109395b7b453SJohn Marino if (! REG_VALID_INDEX (--i2)) 109495b7b453SJohn Marino break; 109595b7b453SJohn Marino } 109695b7b453SJohn Marino else 109795b7b453SJohn Marino { 109895b7b453SJohn Marino if (! REG_VALID_INDEX (--i1)) 109995b7b453SJohn Marino break; 110095b7b453SJohn Marino } 110195b7b453SJohn Marino } 110295b7b453SJohn Marino 110395b7b453SJohn Marino id = dest->nelem - 1; 110495b7b453SJohn Marino is = dest->nelem + src1->nelem + src2->nelem - 1; 110595b7b453SJohn Marino delta = is - sbase + 1; 110695b7b453SJohn Marino 110795b7b453SJohn Marino /* Now copy. When DELTA becomes zero, the remaining 110895b7b453SJohn Marino DEST elements are already in place; this is more or 110995b7b453SJohn Marino less the same loop that is in re_node_set_merge. */ 111095b7b453SJohn Marino dest->nelem += delta; 111195b7b453SJohn Marino if (delta > 0 && REG_VALID_INDEX (id)) 111295b7b453SJohn Marino for (;;) 111395b7b453SJohn Marino { 111495b7b453SJohn Marino if (dest->elems[is] > dest->elems[id]) 111595b7b453SJohn Marino { 111695b7b453SJohn Marino /* Copy from the top. */ 111795b7b453SJohn Marino dest->elems[id + delta--] = dest->elems[is--]; 111895b7b453SJohn Marino if (delta == 0) 111995b7b453SJohn Marino break; 112095b7b453SJohn Marino } 112195b7b453SJohn Marino else 112295b7b453SJohn Marino { 112395b7b453SJohn Marino /* Slide from the bottom. */ 112495b7b453SJohn Marino dest->elems[id + delta] = dest->elems[id]; 112595b7b453SJohn Marino if (! REG_VALID_INDEX (--id)) 112695b7b453SJohn Marino break; 112795b7b453SJohn Marino } 112895b7b453SJohn Marino } 112995b7b453SJohn Marino 113095b7b453SJohn Marino /* Copy remaining SRC elements. */ 113195b7b453SJohn Marino memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx)); 113295b7b453SJohn Marino 113395b7b453SJohn Marino return REG_NOERROR; 113495b7b453SJohn Marino } 113595b7b453SJohn Marino 113695b7b453SJohn Marino /* Calculate the union set of the sets SRC1 and SRC2. And store it to 113795b7b453SJohn Marino DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 113895b7b453SJohn Marino 113995b7b453SJohn Marino static reg_errcode_t 114095b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 114195b7b453SJohn Marino re_node_set_init_union (re_node_set *dest, const re_node_set *src1, 114295b7b453SJohn Marino const re_node_set *src2) 114395b7b453SJohn Marino { 114495b7b453SJohn Marino Idx i1, i2, id; 114595b7b453SJohn Marino if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 114695b7b453SJohn Marino { 114795b7b453SJohn Marino dest->alloc = src1->nelem + src2->nelem; 114895b7b453SJohn Marino dest->elems = re_malloc (Idx, dest->alloc); 114995b7b453SJohn Marino if (BE (dest->elems == NULL, 0)) 115095b7b453SJohn Marino return REG_ESPACE; 115195b7b453SJohn Marino } 115295b7b453SJohn Marino else 115395b7b453SJohn Marino { 115495b7b453SJohn Marino if (src1 != NULL && src1->nelem > 0) 115595b7b453SJohn Marino return re_node_set_init_copy (dest, src1); 115695b7b453SJohn Marino else if (src2 != NULL && src2->nelem > 0) 115795b7b453SJohn Marino return re_node_set_init_copy (dest, src2); 115895b7b453SJohn Marino else 115995b7b453SJohn Marino re_node_set_init_empty (dest); 116095b7b453SJohn Marino return REG_NOERROR; 116195b7b453SJohn Marino } 116295b7b453SJohn Marino for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 116395b7b453SJohn Marino { 116495b7b453SJohn Marino if (src1->elems[i1] > src2->elems[i2]) 116595b7b453SJohn Marino { 116695b7b453SJohn Marino dest->elems[id++] = src2->elems[i2++]; 116795b7b453SJohn Marino continue; 116895b7b453SJohn Marino } 116995b7b453SJohn Marino if (src1->elems[i1] == src2->elems[i2]) 117095b7b453SJohn Marino ++i2; 117195b7b453SJohn Marino dest->elems[id++] = src1->elems[i1++]; 117295b7b453SJohn Marino } 117395b7b453SJohn Marino if (i1 < src1->nelem) 117495b7b453SJohn Marino { 117595b7b453SJohn Marino memcpy (dest->elems + id, src1->elems + i1, 117695b7b453SJohn Marino (src1->nelem - i1) * sizeof (Idx)); 117795b7b453SJohn Marino id += src1->nelem - i1; 117895b7b453SJohn Marino } 117995b7b453SJohn Marino else if (i2 < src2->nelem) 118095b7b453SJohn Marino { 118195b7b453SJohn Marino memcpy (dest->elems + id, src2->elems + i2, 118295b7b453SJohn Marino (src2->nelem - i2) * sizeof (Idx)); 118395b7b453SJohn Marino id += src2->nelem - i2; 118495b7b453SJohn Marino } 118595b7b453SJohn Marino dest->nelem = id; 118695b7b453SJohn Marino return REG_NOERROR; 118795b7b453SJohn Marino } 118895b7b453SJohn Marino 118995b7b453SJohn Marino /* Calculate the union set of the sets DEST and SRC. And store it to 119095b7b453SJohn Marino DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 119195b7b453SJohn Marino 119295b7b453SJohn Marino static reg_errcode_t 119395b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 119495b7b453SJohn Marino re_node_set_merge (re_node_set *dest, const re_node_set *src) 119595b7b453SJohn Marino { 119695b7b453SJohn Marino Idx is, id, sbase, delta; 119795b7b453SJohn Marino if (src == NULL || src->nelem == 0) 119895b7b453SJohn Marino return REG_NOERROR; 119995b7b453SJohn Marino if (dest->alloc < 2 * src->nelem + dest->nelem) 120095b7b453SJohn Marino { 120195b7b453SJohn Marino Idx new_alloc = 2 * (src->nelem + dest->alloc); 120295b7b453SJohn Marino Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc); 120395b7b453SJohn Marino if (BE (new_buffer == NULL, 0)) 120495b7b453SJohn Marino return REG_ESPACE; 120595b7b453SJohn Marino dest->elems = new_buffer; 120695b7b453SJohn Marino dest->alloc = new_alloc; 120795b7b453SJohn Marino } 120895b7b453SJohn Marino 120995b7b453SJohn Marino if (BE (dest->nelem == 0, 0)) 121095b7b453SJohn Marino { 121195b7b453SJohn Marino dest->nelem = src->nelem; 121295b7b453SJohn Marino memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 121395b7b453SJohn Marino return REG_NOERROR; 121495b7b453SJohn Marino } 121595b7b453SJohn Marino 121695b7b453SJohn Marino /* Copy into the top of DEST the items of SRC that are not 121795b7b453SJohn Marino found in DEST. Maybe we could binary search in DEST? */ 121895b7b453SJohn Marino for (sbase = dest->nelem + 2 * src->nelem, 121995b7b453SJohn Marino is = src->nelem - 1, id = dest->nelem - 1; 122095b7b453SJohn Marino REG_VALID_INDEX (is) && REG_VALID_INDEX (id); ) 122195b7b453SJohn Marino { 122295b7b453SJohn Marino if (dest->elems[id] == src->elems[is]) 122395b7b453SJohn Marino is--, id--; 122495b7b453SJohn Marino else if (dest->elems[id] < src->elems[is]) 122595b7b453SJohn Marino dest->elems[--sbase] = src->elems[is--]; 122695b7b453SJohn Marino else /* if (dest->elems[id] > src->elems[is]) */ 122795b7b453SJohn Marino --id; 122895b7b453SJohn Marino } 122995b7b453SJohn Marino 123095b7b453SJohn Marino if (REG_VALID_INDEX (is)) 123195b7b453SJohn Marino { 123295b7b453SJohn Marino /* If DEST is exhausted, the remaining items of SRC must be unique. */ 123395b7b453SJohn Marino sbase -= is + 1; 123495b7b453SJohn Marino memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx)); 123595b7b453SJohn Marino } 123695b7b453SJohn Marino 123795b7b453SJohn Marino id = dest->nelem - 1; 123895b7b453SJohn Marino is = dest->nelem + 2 * src->nelem - 1; 123995b7b453SJohn Marino delta = is - sbase + 1; 124095b7b453SJohn Marino if (delta == 0) 124195b7b453SJohn Marino return REG_NOERROR; 124295b7b453SJohn Marino 124395b7b453SJohn Marino /* Now copy. When DELTA becomes zero, the remaining 124495b7b453SJohn Marino DEST elements are already in place. */ 124595b7b453SJohn Marino dest->nelem += delta; 124695b7b453SJohn Marino for (;;) 124795b7b453SJohn Marino { 124895b7b453SJohn Marino if (dest->elems[is] > dest->elems[id]) 124995b7b453SJohn Marino { 125095b7b453SJohn Marino /* Copy from the top. */ 125195b7b453SJohn Marino dest->elems[id + delta--] = dest->elems[is--]; 125295b7b453SJohn Marino if (delta == 0) 125395b7b453SJohn Marino break; 125495b7b453SJohn Marino } 125595b7b453SJohn Marino else 125695b7b453SJohn Marino { 125795b7b453SJohn Marino /* Slide from the bottom. */ 125895b7b453SJohn Marino dest->elems[id + delta] = dest->elems[id]; 125995b7b453SJohn Marino if (! REG_VALID_INDEX (--id)) 126095b7b453SJohn Marino { 126195b7b453SJohn Marino /* Copy remaining SRC elements. */ 126295b7b453SJohn Marino memcpy (dest->elems, dest->elems + sbase, 126395b7b453SJohn Marino delta * sizeof (Idx)); 126495b7b453SJohn Marino break; 126595b7b453SJohn Marino } 126695b7b453SJohn Marino } 126795b7b453SJohn Marino } 126895b7b453SJohn Marino 126995b7b453SJohn Marino return REG_NOERROR; 127095b7b453SJohn Marino } 127195b7b453SJohn Marino 127295b7b453SJohn Marino /* Insert the new element ELEM to the re_node_set* SET. 127395b7b453SJohn Marino SET should not already have ELEM. 127495b7b453SJohn Marino Return true if successful. */ 127595b7b453SJohn Marino 127695b7b453SJohn Marino static bool 127795b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 127895b7b453SJohn Marino re_node_set_insert (re_node_set *set, Idx elem) 127995b7b453SJohn Marino { 128095b7b453SJohn Marino Idx idx; 128195b7b453SJohn Marino /* In case the set is empty. */ 128295b7b453SJohn Marino if (set->alloc == 0) 128395b7b453SJohn Marino return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1); 128495b7b453SJohn Marino 128595b7b453SJohn Marino if (BE (set->nelem, 0) == 0) 128695b7b453SJohn Marino { 128795b7b453SJohn Marino /* We already guaranteed above that set->alloc != 0. */ 128895b7b453SJohn Marino set->elems[0] = elem; 128995b7b453SJohn Marino ++set->nelem; 129095b7b453SJohn Marino return true; 129195b7b453SJohn Marino } 129295b7b453SJohn Marino 129395b7b453SJohn Marino /* Realloc if we need. */ 129495b7b453SJohn Marino if (set->alloc == set->nelem) 129595b7b453SJohn Marino { 129695b7b453SJohn Marino Idx *new_elems; 129795b7b453SJohn Marino set->alloc = set->alloc * 2; 129895b7b453SJohn Marino new_elems = re_realloc (set->elems, Idx, set->alloc); 129995b7b453SJohn Marino if (BE (new_elems == NULL, 0)) 130095b7b453SJohn Marino return false; 130195b7b453SJohn Marino set->elems = new_elems; 130295b7b453SJohn Marino } 130395b7b453SJohn Marino 130495b7b453SJohn Marino /* Move the elements which follows the new element. Test the 130595b7b453SJohn Marino first element separately to skip a check in the inner loop. */ 130695b7b453SJohn Marino if (elem < set->elems[0]) 130795b7b453SJohn Marino { 130895b7b453SJohn Marino idx = 0; 130995b7b453SJohn Marino for (idx = set->nelem; idx > 0; idx--) 131095b7b453SJohn Marino set->elems[idx] = set->elems[idx - 1]; 131195b7b453SJohn Marino } 131295b7b453SJohn Marino else 131395b7b453SJohn Marino { 131495b7b453SJohn Marino for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 131595b7b453SJohn Marino set->elems[idx] = set->elems[idx - 1]; 131695b7b453SJohn Marino } 131795b7b453SJohn Marino 131895b7b453SJohn Marino /* Insert the new element. */ 131995b7b453SJohn Marino set->elems[idx] = elem; 132095b7b453SJohn Marino ++set->nelem; 132195b7b453SJohn Marino return true; 132295b7b453SJohn Marino } 132395b7b453SJohn Marino 132495b7b453SJohn Marino /* Insert the new element ELEM to the re_node_set* SET. 132595b7b453SJohn Marino SET should not already have any element greater than or equal to ELEM. 132695b7b453SJohn Marino Return true if successful. */ 132795b7b453SJohn Marino 132895b7b453SJohn Marino static bool 132995b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 133095b7b453SJohn Marino re_node_set_insert_last (re_node_set *set, Idx elem) 133195b7b453SJohn Marino { 133295b7b453SJohn Marino /* Realloc if we need. */ 133395b7b453SJohn Marino if (set->alloc == set->nelem) 133495b7b453SJohn Marino { 133595b7b453SJohn Marino Idx *new_elems; 133695b7b453SJohn Marino set->alloc = (set->alloc + 1) * 2; 133795b7b453SJohn Marino new_elems = re_realloc (set->elems, Idx, set->alloc); 133895b7b453SJohn Marino if (BE (new_elems == NULL, 0)) 133995b7b453SJohn Marino return false; 134095b7b453SJohn Marino set->elems = new_elems; 134195b7b453SJohn Marino } 134295b7b453SJohn Marino 134395b7b453SJohn Marino /* Insert the new element. */ 134495b7b453SJohn Marino set->elems[set->nelem++] = elem; 134595b7b453SJohn Marino return true; 134695b7b453SJohn Marino } 134795b7b453SJohn Marino 134895b7b453SJohn Marino /* Compare two node sets SET1 and SET2. 134995b7b453SJohn Marino Return true if SET1 and SET2 are equivalent. */ 135095b7b453SJohn Marino 135195b7b453SJohn Marino static bool 1352*680a9cb8SJohn Marino internal_function __attribute__ ((pure)) 135395b7b453SJohn Marino re_node_set_compare (const re_node_set *set1, const re_node_set *set2) 135495b7b453SJohn Marino { 135595b7b453SJohn Marino Idx i; 135695b7b453SJohn Marino if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 135795b7b453SJohn Marino return false; 135895b7b453SJohn Marino for (i = set1->nelem ; REG_VALID_INDEX (--i) ; ) 135995b7b453SJohn Marino if (set1->elems[i] != set2->elems[i]) 136095b7b453SJohn Marino return false; 136195b7b453SJohn Marino return true; 136295b7b453SJohn Marino } 136395b7b453SJohn Marino 136495b7b453SJohn Marino /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 136595b7b453SJohn Marino 136695b7b453SJohn Marino static Idx 1367*680a9cb8SJohn Marino internal_function __attribute__ ((pure)) 136895b7b453SJohn Marino re_node_set_contains (const re_node_set *set, Idx elem) 136995b7b453SJohn Marino { 137095b7b453SJohn Marino __re_size_t idx, right, mid; 137195b7b453SJohn Marino if (! REG_VALID_NONZERO_INDEX (set->nelem)) 137295b7b453SJohn Marino return 0; 137395b7b453SJohn Marino 137495b7b453SJohn Marino /* Binary search the element. */ 137595b7b453SJohn Marino idx = 0; 137695b7b453SJohn Marino right = set->nelem - 1; 137795b7b453SJohn Marino while (idx < right) 137895b7b453SJohn Marino { 137995b7b453SJohn Marino mid = (idx + right) / 2; 138095b7b453SJohn Marino if (set->elems[mid] < elem) 138195b7b453SJohn Marino idx = mid + 1; 138295b7b453SJohn Marino else 138395b7b453SJohn Marino right = mid; 138495b7b453SJohn Marino } 138595b7b453SJohn Marino return set->elems[idx] == elem ? idx + 1 : 0; 138695b7b453SJohn Marino } 138795b7b453SJohn Marino 138895b7b453SJohn Marino static void 138995b7b453SJohn Marino internal_function 139095b7b453SJohn Marino re_node_set_remove_at (re_node_set *set, Idx idx) 139195b7b453SJohn Marino { 1392*680a9cb8SJohn Marino if (idx < 0 || idx >= set->nelem) 139395b7b453SJohn Marino return; 139495b7b453SJohn Marino --set->nelem; 139595b7b453SJohn Marino for (; idx < set->nelem; idx++) 139695b7b453SJohn Marino set->elems[idx] = set->elems[idx + 1]; 139795b7b453SJohn Marino } 139895b7b453SJohn Marino 139995b7b453SJohn Marino 140095b7b453SJohn Marino /* Add the token TOKEN to dfa->nodes, and return the index of the token. 140195b7b453SJohn Marino Or return REG_MISSING if an error occurred. */ 140295b7b453SJohn Marino 140395b7b453SJohn Marino static Idx 140495b7b453SJohn Marino internal_function 140595b7b453SJohn Marino re_dfa_add_node (re_dfa_t *dfa, re_token_t token) 140695b7b453SJohn Marino { 140795b7b453SJohn Marino if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 140895b7b453SJohn Marino { 140995b7b453SJohn Marino size_t new_nodes_alloc = dfa->nodes_alloc * 2; 141095b7b453SJohn Marino Idx *new_nexts, *new_indices; 141195b7b453SJohn Marino re_node_set *new_edests, *new_eclosures; 141295b7b453SJohn Marino re_token_t *new_nodes; 1413cf28ed85SJohn Marino 1414cf28ed85SJohn Marino /* Avoid overflows in realloc. */ 1415cf28ed85SJohn Marino const size_t max_object_size = MAX (sizeof (re_token_t), 141695b7b453SJohn Marino MAX (sizeof (re_node_set), 141795b7b453SJohn Marino sizeof (Idx))); 1418cf28ed85SJohn Marino if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0)) 141995b7b453SJohn Marino return REG_MISSING; 142095b7b453SJohn Marino 142195b7b453SJohn Marino new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); 142295b7b453SJohn Marino if (BE (new_nodes == NULL, 0)) 142395b7b453SJohn Marino return REG_MISSING; 142495b7b453SJohn Marino dfa->nodes = new_nodes; 142595b7b453SJohn Marino new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc); 142695b7b453SJohn Marino new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc); 142795b7b453SJohn Marino new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); 142895b7b453SJohn Marino new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 142995b7b453SJohn Marino if (BE (new_nexts == NULL || new_indices == NULL 143095b7b453SJohn Marino || new_edests == NULL || new_eclosures == NULL, 0)) 143195b7b453SJohn Marino return REG_MISSING; 143295b7b453SJohn Marino dfa->nexts = new_nexts; 143395b7b453SJohn Marino dfa->org_indices = new_indices; 143495b7b453SJohn Marino dfa->edests = new_edests; 143595b7b453SJohn Marino dfa->eclosures = new_eclosures; 143695b7b453SJohn Marino dfa->nodes_alloc = new_nodes_alloc; 143795b7b453SJohn Marino } 143895b7b453SJohn Marino dfa->nodes[dfa->nodes_len] = token; 143995b7b453SJohn Marino dfa->nodes[dfa->nodes_len].constraint = 0; 144095b7b453SJohn Marino #ifdef RE_ENABLE_I18N 144195b7b453SJohn Marino dfa->nodes[dfa->nodes_len].accept_mb = 1442*680a9cb8SJohn Marino ((token.type == OP_PERIOD && dfa->mb_cur_max > 1) 1443*680a9cb8SJohn Marino || token.type == COMPLEX_BRACKET); 144495b7b453SJohn Marino #endif 144595b7b453SJohn Marino dfa->nexts[dfa->nodes_len] = REG_MISSING; 144695b7b453SJohn Marino re_node_set_init_empty (dfa->edests + dfa->nodes_len); 144795b7b453SJohn Marino re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 144895b7b453SJohn Marino return dfa->nodes_len++; 144995b7b453SJohn Marino } 145095b7b453SJohn Marino 1451*680a9cb8SJohn Marino static re_hashval_t 145295b7b453SJohn Marino internal_function 145395b7b453SJohn Marino calc_state_hash (const re_node_set *nodes, unsigned int context) 145495b7b453SJohn Marino { 145595b7b453SJohn Marino re_hashval_t hash = nodes->nelem + context; 145695b7b453SJohn Marino Idx i; 145795b7b453SJohn Marino for (i = 0 ; i < nodes->nelem ; i++) 145895b7b453SJohn Marino hash += nodes->elems[i]; 145995b7b453SJohn Marino return hash; 146095b7b453SJohn Marino } 146195b7b453SJohn Marino 146295b7b453SJohn Marino /* Search for the state whose node_set is equivalent to NODES. 146395b7b453SJohn Marino Return the pointer to the state, if we found it in the DFA. 146495b7b453SJohn Marino Otherwise create the new one and return it. In case of an error 146595b7b453SJohn Marino return NULL and set the error code in ERR. 146695b7b453SJohn Marino Note: - We assume NULL as the invalid state, then it is possible that 146795b7b453SJohn Marino return value is NULL and ERR is REG_NOERROR. 146895b7b453SJohn Marino - We never return non-NULL value in case of any errors, it is for 146995b7b453SJohn Marino optimization. */ 147095b7b453SJohn Marino 147195b7b453SJohn Marino static re_dfastate_t * 147295b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 147395b7b453SJohn Marino re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa, 147495b7b453SJohn Marino const re_node_set *nodes) 147595b7b453SJohn Marino { 147695b7b453SJohn Marino re_hashval_t hash; 147795b7b453SJohn Marino re_dfastate_t *new_state; 147895b7b453SJohn Marino struct re_state_table_entry *spot; 147995b7b453SJohn Marino Idx i; 148095b7b453SJohn Marino #ifdef lint 148195b7b453SJohn Marino /* Suppress bogus uninitialized-variable warnings. */ 148295b7b453SJohn Marino *err = REG_NOERROR; 148395b7b453SJohn Marino #endif 148495b7b453SJohn Marino if (BE (nodes->nelem == 0, 0)) 148595b7b453SJohn Marino { 148695b7b453SJohn Marino *err = REG_NOERROR; 148795b7b453SJohn Marino return NULL; 148895b7b453SJohn Marino } 148995b7b453SJohn Marino hash = calc_state_hash (nodes, 0); 149095b7b453SJohn Marino spot = dfa->state_table + (hash & dfa->state_hash_mask); 149195b7b453SJohn Marino 149295b7b453SJohn Marino for (i = 0 ; i < spot->num ; i++) 149395b7b453SJohn Marino { 149495b7b453SJohn Marino re_dfastate_t *state = spot->array[i]; 149595b7b453SJohn Marino if (hash != state->hash) 149695b7b453SJohn Marino continue; 149795b7b453SJohn Marino if (re_node_set_compare (&state->nodes, nodes)) 149895b7b453SJohn Marino return state; 149995b7b453SJohn Marino } 150095b7b453SJohn Marino 150195b7b453SJohn Marino /* There are no appropriate state in the dfa, create the new one. */ 150295b7b453SJohn Marino new_state = create_ci_newstate (dfa, nodes, hash); 150395b7b453SJohn Marino if (BE (new_state == NULL, 0)) 150495b7b453SJohn Marino *err = REG_ESPACE; 150595b7b453SJohn Marino 150695b7b453SJohn Marino return new_state; 150795b7b453SJohn Marino } 150895b7b453SJohn Marino 150995b7b453SJohn Marino /* Search for the state whose node_set is equivalent to NODES and 151095b7b453SJohn Marino whose context is equivalent to CONTEXT. 151195b7b453SJohn Marino Return the pointer to the state, if we found it in the DFA. 151295b7b453SJohn Marino Otherwise create the new one and return it. In case of an error 151395b7b453SJohn Marino return NULL and set the error code in ERR. 151495b7b453SJohn Marino Note: - We assume NULL as the invalid state, then it is possible that 151595b7b453SJohn Marino return value is NULL and ERR is REG_NOERROR. 151695b7b453SJohn Marino - We never return non-NULL value in case of any errors, it is for 151795b7b453SJohn Marino optimization. */ 151895b7b453SJohn Marino 151995b7b453SJohn Marino static re_dfastate_t * 152095b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 152195b7b453SJohn Marino re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa, 152295b7b453SJohn Marino const re_node_set *nodes, unsigned int context) 152395b7b453SJohn Marino { 152495b7b453SJohn Marino re_hashval_t hash; 152595b7b453SJohn Marino re_dfastate_t *new_state; 152695b7b453SJohn Marino struct re_state_table_entry *spot; 152795b7b453SJohn Marino Idx i; 152895b7b453SJohn Marino #ifdef lint 152995b7b453SJohn Marino /* Suppress bogus uninitialized-variable warnings. */ 153095b7b453SJohn Marino *err = REG_NOERROR; 153195b7b453SJohn Marino #endif 153295b7b453SJohn Marino if (nodes->nelem == 0) 153395b7b453SJohn Marino { 153495b7b453SJohn Marino *err = REG_NOERROR; 153595b7b453SJohn Marino return NULL; 153695b7b453SJohn Marino } 153795b7b453SJohn Marino hash = calc_state_hash (nodes, context); 153895b7b453SJohn Marino spot = dfa->state_table + (hash & dfa->state_hash_mask); 153995b7b453SJohn Marino 154095b7b453SJohn Marino for (i = 0 ; i < spot->num ; i++) 154195b7b453SJohn Marino { 154295b7b453SJohn Marino re_dfastate_t *state = spot->array[i]; 154395b7b453SJohn Marino if (state->hash == hash 154495b7b453SJohn Marino && state->context == context 154595b7b453SJohn Marino && re_node_set_compare (state->entrance_nodes, nodes)) 154695b7b453SJohn Marino return state; 154795b7b453SJohn Marino } 1548cf28ed85SJohn Marino /* There are no appropriate state in 'dfa', create the new one. */ 154995b7b453SJohn Marino new_state = create_cd_newstate (dfa, nodes, context, hash); 155095b7b453SJohn Marino if (BE (new_state == NULL, 0)) 155195b7b453SJohn Marino *err = REG_ESPACE; 155295b7b453SJohn Marino 155395b7b453SJohn Marino return new_state; 155495b7b453SJohn Marino } 155595b7b453SJohn Marino 155695b7b453SJohn Marino /* Finish initialization of the new state NEWSTATE, and using its hash value 155795b7b453SJohn Marino HASH put in the appropriate bucket of DFA's state table. Return value 155895b7b453SJohn Marino indicates the error code if failed. */ 155995b7b453SJohn Marino 156095b7b453SJohn Marino static reg_errcode_t 156195b7b453SJohn Marino __attribute_warn_unused_result__ 156295b7b453SJohn Marino register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, 156395b7b453SJohn Marino re_hashval_t hash) 156495b7b453SJohn Marino { 156595b7b453SJohn Marino struct re_state_table_entry *spot; 156695b7b453SJohn Marino reg_errcode_t err; 156795b7b453SJohn Marino Idx i; 156895b7b453SJohn Marino 156995b7b453SJohn Marino newstate->hash = hash; 157095b7b453SJohn Marino err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 157195b7b453SJohn Marino if (BE (err != REG_NOERROR, 0)) 157295b7b453SJohn Marino return REG_ESPACE; 157395b7b453SJohn Marino for (i = 0; i < newstate->nodes.nelem; i++) 157495b7b453SJohn Marino { 157595b7b453SJohn Marino Idx elem = newstate->nodes.elems[i]; 157695b7b453SJohn Marino if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1577cf28ed85SJohn Marino if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem)) 157895b7b453SJohn Marino return REG_ESPACE; 157995b7b453SJohn Marino } 158095b7b453SJohn Marino 158195b7b453SJohn Marino spot = dfa->state_table + (hash & dfa->state_hash_mask); 158295b7b453SJohn Marino if (BE (spot->alloc <= spot->num, 0)) 158395b7b453SJohn Marino { 158495b7b453SJohn Marino Idx new_alloc = 2 * spot->num + 2; 158595b7b453SJohn Marino re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *, 158695b7b453SJohn Marino new_alloc); 158795b7b453SJohn Marino if (BE (new_array == NULL, 0)) 158895b7b453SJohn Marino return REG_ESPACE; 158995b7b453SJohn Marino spot->array = new_array; 159095b7b453SJohn Marino spot->alloc = new_alloc; 159195b7b453SJohn Marino } 159295b7b453SJohn Marino spot->array[spot->num++] = newstate; 159395b7b453SJohn Marino return REG_NOERROR; 159495b7b453SJohn Marino } 159595b7b453SJohn Marino 159695b7b453SJohn Marino static void 159795b7b453SJohn Marino free_state (re_dfastate_t *state) 159895b7b453SJohn Marino { 159995b7b453SJohn Marino re_node_set_free (&state->non_eps_nodes); 160095b7b453SJohn Marino re_node_set_free (&state->inveclosure); 160195b7b453SJohn Marino if (state->entrance_nodes != &state->nodes) 160295b7b453SJohn Marino { 160395b7b453SJohn Marino re_node_set_free (state->entrance_nodes); 160495b7b453SJohn Marino re_free (state->entrance_nodes); 160595b7b453SJohn Marino } 160695b7b453SJohn Marino re_node_set_free (&state->nodes); 160795b7b453SJohn Marino re_free (state->word_trtable); 160895b7b453SJohn Marino re_free (state->trtable); 160995b7b453SJohn Marino re_free (state); 161095b7b453SJohn Marino } 161195b7b453SJohn Marino 1612cf28ed85SJohn Marino /* Create the new state which is independent of contexts. 161395b7b453SJohn Marino Return the new state if succeeded, otherwise return NULL. */ 161495b7b453SJohn Marino 161595b7b453SJohn Marino static re_dfastate_t * 161695b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 161795b7b453SJohn Marino create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 161895b7b453SJohn Marino re_hashval_t hash) 161995b7b453SJohn Marino { 162095b7b453SJohn Marino Idx i; 162195b7b453SJohn Marino reg_errcode_t err; 162295b7b453SJohn Marino re_dfastate_t *newstate; 162395b7b453SJohn Marino 162495b7b453SJohn Marino newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 162595b7b453SJohn Marino if (BE (newstate == NULL, 0)) 162695b7b453SJohn Marino return NULL; 162795b7b453SJohn Marino err = re_node_set_init_copy (&newstate->nodes, nodes); 162895b7b453SJohn Marino if (BE (err != REG_NOERROR, 0)) 162995b7b453SJohn Marino { 163095b7b453SJohn Marino re_free (newstate); 163195b7b453SJohn Marino return NULL; 163295b7b453SJohn Marino } 163395b7b453SJohn Marino 163495b7b453SJohn Marino newstate->entrance_nodes = &newstate->nodes; 163595b7b453SJohn Marino for (i = 0 ; i < nodes->nelem ; i++) 163695b7b453SJohn Marino { 163795b7b453SJohn Marino re_token_t *node = dfa->nodes + nodes->elems[i]; 163895b7b453SJohn Marino re_token_type_t type = node->type; 163995b7b453SJohn Marino if (type == CHARACTER && !node->constraint) 164095b7b453SJohn Marino continue; 164195b7b453SJohn Marino #ifdef RE_ENABLE_I18N 164295b7b453SJohn Marino newstate->accept_mb |= node->accept_mb; 164395b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 164495b7b453SJohn Marino 164595b7b453SJohn Marino /* If the state has the halt node, the state is a halt state. */ 164695b7b453SJohn Marino if (type == END_OF_RE) 164795b7b453SJohn Marino newstate->halt = 1; 164895b7b453SJohn Marino else if (type == OP_BACK_REF) 164995b7b453SJohn Marino newstate->has_backref = 1; 165095b7b453SJohn Marino else if (type == ANCHOR || node->constraint) 165195b7b453SJohn Marino newstate->has_constraint = 1; 165295b7b453SJohn Marino } 165395b7b453SJohn Marino err = register_state (dfa, newstate, hash); 165495b7b453SJohn Marino if (BE (err != REG_NOERROR, 0)) 165595b7b453SJohn Marino { 165695b7b453SJohn Marino free_state (newstate); 165795b7b453SJohn Marino newstate = NULL; 165895b7b453SJohn Marino } 165995b7b453SJohn Marino return newstate; 166095b7b453SJohn Marino } 166195b7b453SJohn Marino 166295b7b453SJohn Marino /* Create the new state which is depend on the context CONTEXT. 166395b7b453SJohn Marino Return the new state if succeeded, otherwise return NULL. */ 166495b7b453SJohn Marino 166595b7b453SJohn Marino static re_dfastate_t * 166695b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 166795b7b453SJohn Marino create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 166895b7b453SJohn Marino unsigned int context, re_hashval_t hash) 166995b7b453SJohn Marino { 167095b7b453SJohn Marino Idx i, nctx_nodes = 0; 167195b7b453SJohn Marino reg_errcode_t err; 167295b7b453SJohn Marino re_dfastate_t *newstate; 167395b7b453SJohn Marino 167495b7b453SJohn Marino newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 167595b7b453SJohn Marino if (BE (newstate == NULL, 0)) 167695b7b453SJohn Marino return NULL; 167795b7b453SJohn Marino err = re_node_set_init_copy (&newstate->nodes, nodes); 167895b7b453SJohn Marino if (BE (err != REG_NOERROR, 0)) 167995b7b453SJohn Marino { 168095b7b453SJohn Marino re_free (newstate); 168195b7b453SJohn Marino return NULL; 168295b7b453SJohn Marino } 168395b7b453SJohn Marino 168495b7b453SJohn Marino newstate->context = context; 168595b7b453SJohn Marino newstate->entrance_nodes = &newstate->nodes; 168695b7b453SJohn Marino 168795b7b453SJohn Marino for (i = 0 ; i < nodes->nelem ; i++) 168895b7b453SJohn Marino { 168995b7b453SJohn Marino re_token_t *node = dfa->nodes + nodes->elems[i]; 169095b7b453SJohn Marino re_token_type_t type = node->type; 169195b7b453SJohn Marino unsigned int constraint = node->constraint; 169295b7b453SJohn Marino 169395b7b453SJohn Marino if (type == CHARACTER && !constraint) 169495b7b453SJohn Marino continue; 169595b7b453SJohn Marino #ifdef RE_ENABLE_I18N 169695b7b453SJohn Marino newstate->accept_mb |= node->accept_mb; 169795b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 169895b7b453SJohn Marino 169995b7b453SJohn Marino /* If the state has the halt node, the state is a halt state. */ 170095b7b453SJohn Marino if (type == END_OF_RE) 170195b7b453SJohn Marino newstate->halt = 1; 170295b7b453SJohn Marino else if (type == OP_BACK_REF) 170395b7b453SJohn Marino newstate->has_backref = 1; 170495b7b453SJohn Marino 170595b7b453SJohn Marino if (constraint) 170695b7b453SJohn Marino { 170795b7b453SJohn Marino if (newstate->entrance_nodes == &newstate->nodes) 170895b7b453SJohn Marino { 170995b7b453SJohn Marino newstate->entrance_nodes = re_malloc (re_node_set, 1); 171095b7b453SJohn Marino if (BE (newstate->entrance_nodes == NULL, 0)) 171195b7b453SJohn Marino { 171295b7b453SJohn Marino free_state (newstate); 171395b7b453SJohn Marino return NULL; 171495b7b453SJohn Marino } 171595b7b453SJohn Marino if (re_node_set_init_copy (newstate->entrance_nodes, nodes) 171695b7b453SJohn Marino != REG_NOERROR) 171795b7b453SJohn Marino return NULL; 171895b7b453SJohn Marino nctx_nodes = 0; 171995b7b453SJohn Marino newstate->has_constraint = 1; 172095b7b453SJohn Marino } 172195b7b453SJohn Marino 172295b7b453SJohn Marino if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 172395b7b453SJohn Marino { 172495b7b453SJohn Marino re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 172595b7b453SJohn Marino ++nctx_nodes; 172695b7b453SJohn Marino } 172795b7b453SJohn Marino } 172895b7b453SJohn Marino } 172995b7b453SJohn Marino err = register_state (dfa, newstate, hash); 173095b7b453SJohn Marino if (BE (err != REG_NOERROR, 0)) 173195b7b453SJohn Marino { 173295b7b453SJohn Marino free_state (newstate); 173395b7b453SJohn Marino newstate = NULL; 173495b7b453SJohn Marino } 173595b7b453SJohn Marino return newstate; 173695b7b453SJohn Marino } 1737