1*95b7b453SJohn Marino /* -*- buffer-read-only: t -*- vi: set ro: */ 2*95b7b453SJohn Marino /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3*95b7b453SJohn Marino /* Extended regular expression matching and search library. 4*95b7b453SJohn Marino Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free 5*95b7b453SJohn Marino Software Foundation, Inc. 6*95b7b453SJohn Marino This file is part of the GNU C Library. 7*95b7b453SJohn Marino Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 8*95b7b453SJohn Marino 9*95b7b453SJohn Marino This program is free software; you can redistribute it and/or modify 10*95b7b453SJohn Marino it under the terms of the GNU General Public License as published by 11*95b7b453SJohn Marino the Free Software Foundation; either version 3, or (at your option) 12*95b7b453SJohn Marino any later version. 13*95b7b453SJohn Marino 14*95b7b453SJohn Marino This program is distributed in the hope that it will be useful, 15*95b7b453SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of 16*95b7b453SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17*95b7b453SJohn Marino GNU General Public License for more details. 18*95b7b453SJohn Marino 19*95b7b453SJohn Marino You should have received a copy of the GNU General Public License along 20*95b7b453SJohn Marino with this program; if not, write to the Free Software Foundation, 21*95b7b453SJohn Marino Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 22*95b7b453SJohn Marino 23*95b7b453SJohn Marino #include "verify.h" 24*95b7b453SJohn Marino #include "intprops.h" 25*95b7b453SJohn Marino static void re_string_construct_common (const char *str, Idx len, 26*95b7b453SJohn Marino re_string_t *pstr, 27*95b7b453SJohn Marino RE_TRANSLATE_TYPE trans, bool icase, 28*95b7b453SJohn Marino const re_dfa_t *dfa) internal_function; 29*95b7b453SJohn Marino static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 30*95b7b453SJohn Marino const re_node_set *nodes, 31*95b7b453SJohn Marino re_hashval_t hash) internal_function; 32*95b7b453SJohn Marino static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 33*95b7b453SJohn Marino const re_node_set *nodes, 34*95b7b453SJohn Marino unsigned int context, 35*95b7b453SJohn Marino re_hashval_t hash) internal_function; 36*95b7b453SJohn Marino 37*95b7b453SJohn Marino /* Functions for string operation. */ 38*95b7b453SJohn Marino 39*95b7b453SJohn Marino /* This function allocate the buffers. It is necessary to call 40*95b7b453SJohn Marino re_string_reconstruct before using the object. */ 41*95b7b453SJohn Marino 42*95b7b453SJohn Marino static reg_errcode_t 43*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 44*95b7b453SJohn Marino re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len, 45*95b7b453SJohn Marino RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 46*95b7b453SJohn Marino { 47*95b7b453SJohn Marino reg_errcode_t ret; 48*95b7b453SJohn Marino Idx init_buf_len; 49*95b7b453SJohn Marino 50*95b7b453SJohn Marino /* Ensure at least one character fits into the buffers. */ 51*95b7b453SJohn Marino if (init_len < dfa->mb_cur_max) 52*95b7b453SJohn Marino init_len = dfa->mb_cur_max; 53*95b7b453SJohn Marino init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 54*95b7b453SJohn Marino re_string_construct_common (str, len, pstr, trans, icase, dfa); 55*95b7b453SJohn Marino 56*95b7b453SJohn Marino ret = re_string_realloc_buffers (pstr, init_buf_len); 57*95b7b453SJohn Marino if (BE (ret != REG_NOERROR, 0)) 58*95b7b453SJohn Marino return ret; 59*95b7b453SJohn Marino 60*95b7b453SJohn Marino pstr->word_char = dfa->word_char; 61*95b7b453SJohn Marino pstr->word_ops_used = dfa->word_ops_used; 62*95b7b453SJohn Marino pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 63*95b7b453SJohn Marino pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 64*95b7b453SJohn Marino pstr->valid_raw_len = pstr->valid_len; 65*95b7b453SJohn Marino return REG_NOERROR; 66*95b7b453SJohn Marino } 67*95b7b453SJohn Marino 68*95b7b453SJohn Marino /* This function allocate the buffers, and initialize them. */ 69*95b7b453SJohn Marino 70*95b7b453SJohn Marino static reg_errcode_t 71*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 72*95b7b453SJohn Marino re_string_construct (re_string_t *pstr, const char *str, Idx len, 73*95b7b453SJohn Marino RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 74*95b7b453SJohn Marino { 75*95b7b453SJohn Marino reg_errcode_t ret; 76*95b7b453SJohn Marino memset (pstr, '\0', sizeof (re_string_t)); 77*95b7b453SJohn Marino re_string_construct_common (str, len, pstr, trans, icase, dfa); 78*95b7b453SJohn Marino 79*95b7b453SJohn Marino if (len > 0) 80*95b7b453SJohn Marino { 81*95b7b453SJohn Marino ret = re_string_realloc_buffers (pstr, len + 1); 82*95b7b453SJohn Marino if (BE (ret != REG_NOERROR, 0)) 83*95b7b453SJohn Marino return ret; 84*95b7b453SJohn Marino } 85*95b7b453SJohn Marino pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 86*95b7b453SJohn Marino 87*95b7b453SJohn Marino if (icase) 88*95b7b453SJohn Marino { 89*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 90*95b7b453SJohn Marino if (dfa->mb_cur_max > 1) 91*95b7b453SJohn Marino { 92*95b7b453SJohn Marino while (1) 93*95b7b453SJohn Marino { 94*95b7b453SJohn Marino ret = build_wcs_upper_buffer (pstr); 95*95b7b453SJohn Marino if (BE (ret != REG_NOERROR, 0)) 96*95b7b453SJohn Marino return ret; 97*95b7b453SJohn Marino if (pstr->valid_raw_len >= len) 98*95b7b453SJohn Marino break; 99*95b7b453SJohn Marino if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 100*95b7b453SJohn Marino break; 101*95b7b453SJohn Marino ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 102*95b7b453SJohn Marino if (BE (ret != REG_NOERROR, 0)) 103*95b7b453SJohn Marino return ret; 104*95b7b453SJohn Marino } 105*95b7b453SJohn Marino } 106*95b7b453SJohn Marino else 107*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 108*95b7b453SJohn Marino build_upper_buffer (pstr); 109*95b7b453SJohn Marino } 110*95b7b453SJohn Marino else 111*95b7b453SJohn Marino { 112*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 113*95b7b453SJohn Marino if (dfa->mb_cur_max > 1) 114*95b7b453SJohn Marino build_wcs_buffer (pstr); 115*95b7b453SJohn Marino else 116*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 117*95b7b453SJohn Marino { 118*95b7b453SJohn Marino if (trans != NULL) 119*95b7b453SJohn Marino re_string_translate_buffer (pstr); 120*95b7b453SJohn Marino else 121*95b7b453SJohn Marino { 122*95b7b453SJohn Marino pstr->valid_len = pstr->bufs_len; 123*95b7b453SJohn Marino pstr->valid_raw_len = pstr->bufs_len; 124*95b7b453SJohn Marino } 125*95b7b453SJohn Marino } 126*95b7b453SJohn Marino } 127*95b7b453SJohn Marino 128*95b7b453SJohn Marino return REG_NOERROR; 129*95b7b453SJohn Marino } 130*95b7b453SJohn Marino 131*95b7b453SJohn Marino /* Helper functions for re_string_allocate, and re_string_construct. */ 132*95b7b453SJohn Marino 133*95b7b453SJohn Marino static reg_errcode_t 134*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 135*95b7b453SJohn Marino re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) 136*95b7b453SJohn Marino { 137*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 138*95b7b453SJohn Marino if (pstr->mb_cur_max > 1) 139*95b7b453SJohn Marino { 140*95b7b453SJohn Marino wint_t *new_wcs; 141*95b7b453SJohn Marino 142*95b7b453SJohn Marino /* Avoid overflow. */ 143*95b7b453SJohn Marino size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx)); 144*95b7b453SJohn Marino if (BE (SIZE_MAX / max_object_size < new_buf_len, 0)) 145*95b7b453SJohn Marino return REG_ESPACE; 146*95b7b453SJohn Marino 147*95b7b453SJohn Marino new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); 148*95b7b453SJohn Marino if (BE (new_wcs == NULL, 0)) 149*95b7b453SJohn Marino return REG_ESPACE; 150*95b7b453SJohn Marino pstr->wcs = new_wcs; 151*95b7b453SJohn Marino if (pstr->offsets != NULL) 152*95b7b453SJohn Marino { 153*95b7b453SJohn Marino Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len); 154*95b7b453SJohn Marino if (BE (new_offsets == NULL, 0)) 155*95b7b453SJohn Marino return REG_ESPACE; 156*95b7b453SJohn Marino pstr->offsets = new_offsets; 157*95b7b453SJohn Marino } 158*95b7b453SJohn Marino } 159*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 160*95b7b453SJohn Marino if (pstr->mbs_allocated) 161*95b7b453SJohn Marino { 162*95b7b453SJohn Marino unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 163*95b7b453SJohn Marino new_buf_len); 164*95b7b453SJohn Marino if (BE (new_mbs == NULL, 0)) 165*95b7b453SJohn Marino return REG_ESPACE; 166*95b7b453SJohn Marino pstr->mbs = new_mbs; 167*95b7b453SJohn Marino } 168*95b7b453SJohn Marino pstr->bufs_len = new_buf_len; 169*95b7b453SJohn Marino return REG_NOERROR; 170*95b7b453SJohn Marino } 171*95b7b453SJohn Marino 172*95b7b453SJohn Marino 173*95b7b453SJohn Marino static void 174*95b7b453SJohn Marino internal_function 175*95b7b453SJohn Marino re_string_construct_common (const char *str, Idx len, re_string_t *pstr, 176*95b7b453SJohn Marino RE_TRANSLATE_TYPE trans, bool icase, 177*95b7b453SJohn Marino const re_dfa_t *dfa) 178*95b7b453SJohn Marino { 179*95b7b453SJohn Marino pstr->raw_mbs = (const unsigned char *) str; 180*95b7b453SJohn Marino pstr->len = len; 181*95b7b453SJohn Marino pstr->raw_len = len; 182*95b7b453SJohn Marino pstr->trans = trans; 183*95b7b453SJohn Marino pstr->icase = icase; 184*95b7b453SJohn Marino pstr->mbs_allocated = (trans != NULL || icase); 185*95b7b453SJohn Marino pstr->mb_cur_max = dfa->mb_cur_max; 186*95b7b453SJohn Marino pstr->is_utf8 = dfa->is_utf8; 187*95b7b453SJohn Marino pstr->map_notascii = dfa->map_notascii; 188*95b7b453SJohn Marino pstr->stop = pstr->len; 189*95b7b453SJohn Marino pstr->raw_stop = pstr->stop; 190*95b7b453SJohn Marino } 191*95b7b453SJohn Marino 192*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 193*95b7b453SJohn Marino 194*95b7b453SJohn Marino /* Build wide character buffer PSTR->WCS. 195*95b7b453SJohn Marino If the byte sequence of the string are: 196*95b7b453SJohn Marino <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 197*95b7b453SJohn Marino Then wide character buffer will be: 198*95b7b453SJohn Marino <wc1> , WEOF , <wc2> , WEOF , <wc3> 199*95b7b453SJohn Marino We use WEOF for padding, they indicate that the position isn't 200*95b7b453SJohn Marino a first byte of a multibyte character. 201*95b7b453SJohn Marino 202*95b7b453SJohn Marino Note that this function assumes PSTR->VALID_LEN elements are already 203*95b7b453SJohn Marino built and starts from PSTR->VALID_LEN. */ 204*95b7b453SJohn Marino 205*95b7b453SJohn Marino static void 206*95b7b453SJohn Marino internal_function 207*95b7b453SJohn Marino build_wcs_buffer (re_string_t *pstr) 208*95b7b453SJohn Marino { 209*95b7b453SJohn Marino #ifdef _LIBC 210*95b7b453SJohn Marino unsigned char buf[MB_LEN_MAX]; 211*95b7b453SJohn Marino assert (MB_LEN_MAX >= pstr->mb_cur_max); 212*95b7b453SJohn Marino #else 213*95b7b453SJohn Marino unsigned char buf[64]; 214*95b7b453SJohn Marino #endif 215*95b7b453SJohn Marino mbstate_t prev_st; 216*95b7b453SJohn Marino Idx byte_idx, end_idx, remain_len; 217*95b7b453SJohn Marino size_t mbclen; 218*95b7b453SJohn Marino 219*95b7b453SJohn Marino /* Build the buffers from pstr->valid_len to either pstr->len or 220*95b7b453SJohn Marino pstr->bufs_len. */ 221*95b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 222*95b7b453SJohn Marino for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 223*95b7b453SJohn Marino { 224*95b7b453SJohn Marino wchar_t wc; 225*95b7b453SJohn Marino const char *p; 226*95b7b453SJohn Marino 227*95b7b453SJohn Marino remain_len = end_idx - byte_idx; 228*95b7b453SJohn Marino prev_st = pstr->cur_state; 229*95b7b453SJohn Marino /* Apply the translation if we need. */ 230*95b7b453SJohn Marino if (BE (pstr->trans != NULL, 0)) 231*95b7b453SJohn Marino { 232*95b7b453SJohn Marino int i, ch; 233*95b7b453SJohn Marino 234*95b7b453SJohn Marino for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 235*95b7b453SJohn Marino { 236*95b7b453SJohn Marino ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 237*95b7b453SJohn Marino buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 238*95b7b453SJohn Marino } 239*95b7b453SJohn Marino p = (const char *) buf; 240*95b7b453SJohn Marino } 241*95b7b453SJohn Marino else 242*95b7b453SJohn Marino p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 243*95b7b453SJohn Marino mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 244*95b7b453SJohn Marino if (BE (mbclen == (size_t) -2, 0)) 245*95b7b453SJohn Marino { 246*95b7b453SJohn Marino /* The buffer doesn't have enough space, finish to build. */ 247*95b7b453SJohn Marino pstr->cur_state = prev_st; 248*95b7b453SJohn Marino break; 249*95b7b453SJohn Marino } 250*95b7b453SJohn Marino else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) 251*95b7b453SJohn Marino { 252*95b7b453SJohn Marino /* We treat these cases as a singlebyte character. */ 253*95b7b453SJohn Marino mbclen = 1; 254*95b7b453SJohn Marino wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 255*95b7b453SJohn Marino if (BE (pstr->trans != NULL, 0)) 256*95b7b453SJohn Marino wc = pstr->trans[wc]; 257*95b7b453SJohn Marino pstr->cur_state = prev_st; 258*95b7b453SJohn Marino } 259*95b7b453SJohn Marino 260*95b7b453SJohn Marino /* Write wide character and padding. */ 261*95b7b453SJohn Marino pstr->wcs[byte_idx++] = wc; 262*95b7b453SJohn Marino /* Write paddings. */ 263*95b7b453SJohn Marino for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 264*95b7b453SJohn Marino pstr->wcs[byte_idx++] = WEOF; 265*95b7b453SJohn Marino } 266*95b7b453SJohn Marino pstr->valid_len = byte_idx; 267*95b7b453SJohn Marino pstr->valid_raw_len = byte_idx; 268*95b7b453SJohn Marino } 269*95b7b453SJohn Marino 270*95b7b453SJohn Marino /* Build wide character buffer PSTR->WCS like build_wcs_buffer, 271*95b7b453SJohn Marino but for REG_ICASE. */ 272*95b7b453SJohn Marino 273*95b7b453SJohn Marino static reg_errcode_t 274*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 275*95b7b453SJohn Marino build_wcs_upper_buffer (re_string_t *pstr) 276*95b7b453SJohn Marino { 277*95b7b453SJohn Marino mbstate_t prev_st; 278*95b7b453SJohn Marino Idx src_idx, byte_idx, end_idx, remain_len; 279*95b7b453SJohn Marino size_t mbclen; 280*95b7b453SJohn Marino #ifdef _LIBC 281*95b7b453SJohn Marino char buf[MB_LEN_MAX]; 282*95b7b453SJohn Marino assert (MB_LEN_MAX >= pstr->mb_cur_max); 283*95b7b453SJohn Marino #else 284*95b7b453SJohn Marino char buf[64]; 285*95b7b453SJohn Marino #endif 286*95b7b453SJohn Marino 287*95b7b453SJohn Marino byte_idx = pstr->valid_len; 288*95b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 289*95b7b453SJohn Marino 290*95b7b453SJohn Marino /* The following optimization assumes that ASCII characters can be 291*95b7b453SJohn Marino mapped to wide characters with a simple cast. */ 292*95b7b453SJohn Marino if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 293*95b7b453SJohn Marino { 294*95b7b453SJohn Marino while (byte_idx < end_idx) 295*95b7b453SJohn Marino { 296*95b7b453SJohn Marino wchar_t wc; 297*95b7b453SJohn Marino 298*95b7b453SJohn Marino if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 299*95b7b453SJohn Marino && mbsinit (&pstr->cur_state)) 300*95b7b453SJohn Marino { 301*95b7b453SJohn Marino /* In case of a singlebyte character. */ 302*95b7b453SJohn Marino pstr->mbs[byte_idx] 303*95b7b453SJohn Marino = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 304*95b7b453SJohn Marino /* The next step uses the assumption that wchar_t is encoded 305*95b7b453SJohn Marino ASCII-safe: all ASCII values can be converted like this. */ 306*95b7b453SJohn Marino pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 307*95b7b453SJohn Marino ++byte_idx; 308*95b7b453SJohn Marino continue; 309*95b7b453SJohn Marino } 310*95b7b453SJohn Marino 311*95b7b453SJohn Marino remain_len = end_idx - byte_idx; 312*95b7b453SJohn Marino prev_st = pstr->cur_state; 313*95b7b453SJohn Marino mbclen = __mbrtowc (&wc, 314*95b7b453SJohn Marino ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 315*95b7b453SJohn Marino + byte_idx), remain_len, &pstr->cur_state); 316*95b7b453SJohn Marino if (BE (mbclen < (size_t) -2, 1)) 317*95b7b453SJohn Marino { 318*95b7b453SJohn Marino wchar_t wcu = wc; 319*95b7b453SJohn Marino if (iswlower (wc)) 320*95b7b453SJohn Marino { 321*95b7b453SJohn Marino size_t mbcdlen; 322*95b7b453SJohn Marino 323*95b7b453SJohn Marino wcu = towupper (wc); 324*95b7b453SJohn Marino mbcdlen = wcrtomb (buf, wcu, &prev_st); 325*95b7b453SJohn Marino if (BE (mbclen == mbcdlen, 1)) 326*95b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, buf, mbclen); 327*95b7b453SJohn Marino else 328*95b7b453SJohn Marino { 329*95b7b453SJohn Marino src_idx = byte_idx; 330*95b7b453SJohn Marino goto offsets_needed; 331*95b7b453SJohn Marino } 332*95b7b453SJohn Marino } 333*95b7b453SJohn Marino else 334*95b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, 335*95b7b453SJohn Marino pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 336*95b7b453SJohn Marino pstr->wcs[byte_idx++] = wcu; 337*95b7b453SJohn Marino /* Write paddings. */ 338*95b7b453SJohn Marino for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 339*95b7b453SJohn Marino pstr->wcs[byte_idx++] = WEOF; 340*95b7b453SJohn Marino } 341*95b7b453SJohn Marino else if (mbclen == (size_t) -1 || mbclen == 0) 342*95b7b453SJohn Marino { 343*95b7b453SJohn Marino /* It is an invalid character or '\0'. Just use the byte. */ 344*95b7b453SJohn Marino int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 345*95b7b453SJohn Marino pstr->mbs[byte_idx] = ch; 346*95b7b453SJohn Marino /* And also cast it to wide char. */ 347*95b7b453SJohn Marino pstr->wcs[byte_idx++] = (wchar_t) ch; 348*95b7b453SJohn Marino if (BE (mbclen == (size_t) -1, 0)) 349*95b7b453SJohn Marino pstr->cur_state = prev_st; 350*95b7b453SJohn Marino } 351*95b7b453SJohn Marino else 352*95b7b453SJohn Marino { 353*95b7b453SJohn Marino /* The buffer doesn't have enough space, finish to build. */ 354*95b7b453SJohn Marino pstr->cur_state = prev_st; 355*95b7b453SJohn Marino break; 356*95b7b453SJohn Marino } 357*95b7b453SJohn Marino } 358*95b7b453SJohn Marino pstr->valid_len = byte_idx; 359*95b7b453SJohn Marino pstr->valid_raw_len = byte_idx; 360*95b7b453SJohn Marino return REG_NOERROR; 361*95b7b453SJohn Marino } 362*95b7b453SJohn Marino else 363*95b7b453SJohn Marino for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 364*95b7b453SJohn Marino { 365*95b7b453SJohn Marino wchar_t wc; 366*95b7b453SJohn Marino const char *p; 367*95b7b453SJohn Marino offsets_needed: 368*95b7b453SJohn Marino remain_len = end_idx - byte_idx; 369*95b7b453SJohn Marino prev_st = pstr->cur_state; 370*95b7b453SJohn Marino if (BE (pstr->trans != NULL, 0)) 371*95b7b453SJohn Marino { 372*95b7b453SJohn Marino int i, ch; 373*95b7b453SJohn Marino 374*95b7b453SJohn Marino for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 375*95b7b453SJohn Marino { 376*95b7b453SJohn Marino ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 377*95b7b453SJohn Marino buf[i] = pstr->trans[ch]; 378*95b7b453SJohn Marino } 379*95b7b453SJohn Marino p = (const char *) buf; 380*95b7b453SJohn Marino } 381*95b7b453SJohn Marino else 382*95b7b453SJohn Marino p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 383*95b7b453SJohn Marino mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 384*95b7b453SJohn Marino if (BE (mbclen < (size_t) -2, 1)) 385*95b7b453SJohn Marino { 386*95b7b453SJohn Marino wchar_t wcu = wc; 387*95b7b453SJohn Marino if (iswlower (wc)) 388*95b7b453SJohn Marino { 389*95b7b453SJohn Marino size_t mbcdlen; 390*95b7b453SJohn Marino 391*95b7b453SJohn Marino wcu = towupper (wc); 392*95b7b453SJohn Marino mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 393*95b7b453SJohn Marino if (BE (mbclen == mbcdlen, 1)) 394*95b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, buf, mbclen); 395*95b7b453SJohn Marino else if (mbcdlen != (size_t) -1) 396*95b7b453SJohn Marino { 397*95b7b453SJohn Marino size_t i; 398*95b7b453SJohn Marino 399*95b7b453SJohn Marino if (byte_idx + mbcdlen > pstr->bufs_len) 400*95b7b453SJohn Marino { 401*95b7b453SJohn Marino pstr->cur_state = prev_st; 402*95b7b453SJohn Marino break; 403*95b7b453SJohn Marino } 404*95b7b453SJohn Marino 405*95b7b453SJohn Marino if (pstr->offsets == NULL) 406*95b7b453SJohn Marino { 407*95b7b453SJohn Marino pstr->offsets = re_malloc (Idx, pstr->bufs_len); 408*95b7b453SJohn Marino 409*95b7b453SJohn Marino if (pstr->offsets == NULL) 410*95b7b453SJohn Marino return REG_ESPACE; 411*95b7b453SJohn Marino } 412*95b7b453SJohn Marino if (!pstr->offsets_needed) 413*95b7b453SJohn Marino { 414*95b7b453SJohn Marino for (i = 0; i < (size_t) byte_idx; ++i) 415*95b7b453SJohn Marino pstr->offsets[i] = i; 416*95b7b453SJohn Marino pstr->offsets_needed = 1; 417*95b7b453SJohn Marino } 418*95b7b453SJohn Marino 419*95b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 420*95b7b453SJohn Marino pstr->wcs[byte_idx] = wcu; 421*95b7b453SJohn Marino pstr->offsets[byte_idx] = src_idx; 422*95b7b453SJohn Marino for (i = 1; i < mbcdlen; ++i) 423*95b7b453SJohn Marino { 424*95b7b453SJohn Marino pstr->offsets[byte_idx + i] 425*95b7b453SJohn Marino = src_idx + (i < mbclen ? i : mbclen - 1); 426*95b7b453SJohn Marino pstr->wcs[byte_idx + i] = WEOF; 427*95b7b453SJohn Marino } 428*95b7b453SJohn Marino pstr->len += mbcdlen - mbclen; 429*95b7b453SJohn Marino if (pstr->raw_stop > src_idx) 430*95b7b453SJohn Marino pstr->stop += mbcdlen - mbclen; 431*95b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) 432*95b7b453SJohn Marino ? pstr->len : pstr->bufs_len; 433*95b7b453SJohn Marino byte_idx += mbcdlen; 434*95b7b453SJohn Marino src_idx += mbclen; 435*95b7b453SJohn Marino continue; 436*95b7b453SJohn Marino } 437*95b7b453SJohn Marino else 438*95b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, p, mbclen); 439*95b7b453SJohn Marino } 440*95b7b453SJohn Marino else 441*95b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, p, mbclen); 442*95b7b453SJohn Marino 443*95b7b453SJohn Marino if (BE (pstr->offsets_needed != 0, 0)) 444*95b7b453SJohn Marino { 445*95b7b453SJohn Marino size_t i; 446*95b7b453SJohn Marino for (i = 0; i < mbclen; ++i) 447*95b7b453SJohn Marino pstr->offsets[byte_idx + i] = src_idx + i; 448*95b7b453SJohn Marino } 449*95b7b453SJohn Marino src_idx += mbclen; 450*95b7b453SJohn Marino 451*95b7b453SJohn Marino pstr->wcs[byte_idx++] = wcu; 452*95b7b453SJohn Marino /* Write paddings. */ 453*95b7b453SJohn Marino for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 454*95b7b453SJohn Marino pstr->wcs[byte_idx++] = WEOF; 455*95b7b453SJohn Marino } 456*95b7b453SJohn Marino else if (mbclen == (size_t) -1 || mbclen == 0) 457*95b7b453SJohn Marino { 458*95b7b453SJohn Marino /* It is an invalid character or '\0'. Just use the byte. */ 459*95b7b453SJohn Marino int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 460*95b7b453SJohn Marino 461*95b7b453SJohn Marino if (BE (pstr->trans != NULL, 0)) 462*95b7b453SJohn Marino ch = pstr->trans [ch]; 463*95b7b453SJohn Marino pstr->mbs[byte_idx] = ch; 464*95b7b453SJohn Marino 465*95b7b453SJohn Marino if (BE (pstr->offsets_needed != 0, 0)) 466*95b7b453SJohn Marino pstr->offsets[byte_idx] = src_idx; 467*95b7b453SJohn Marino ++src_idx; 468*95b7b453SJohn Marino 469*95b7b453SJohn Marino /* And also cast it to wide char. */ 470*95b7b453SJohn Marino pstr->wcs[byte_idx++] = (wchar_t) ch; 471*95b7b453SJohn Marino if (BE (mbclen == (size_t) -1, 0)) 472*95b7b453SJohn Marino pstr->cur_state = prev_st; 473*95b7b453SJohn Marino } 474*95b7b453SJohn Marino else 475*95b7b453SJohn Marino { 476*95b7b453SJohn Marino /* The buffer doesn't have enough space, finish to build. */ 477*95b7b453SJohn Marino pstr->cur_state = prev_st; 478*95b7b453SJohn Marino break; 479*95b7b453SJohn Marino } 480*95b7b453SJohn Marino } 481*95b7b453SJohn Marino pstr->valid_len = byte_idx; 482*95b7b453SJohn Marino pstr->valid_raw_len = src_idx; 483*95b7b453SJohn Marino return REG_NOERROR; 484*95b7b453SJohn Marino } 485*95b7b453SJohn Marino 486*95b7b453SJohn Marino /* Skip characters until the index becomes greater than NEW_RAW_IDX. 487*95b7b453SJohn Marino Return the index. */ 488*95b7b453SJohn Marino 489*95b7b453SJohn Marino static Idx 490*95b7b453SJohn Marino internal_function 491*95b7b453SJohn Marino re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) 492*95b7b453SJohn Marino { 493*95b7b453SJohn Marino mbstate_t prev_st; 494*95b7b453SJohn Marino Idx rawbuf_idx; 495*95b7b453SJohn Marino size_t mbclen; 496*95b7b453SJohn Marino wint_t wc = WEOF; 497*95b7b453SJohn Marino 498*95b7b453SJohn Marino /* Skip the characters which are not necessary to check. */ 499*95b7b453SJohn Marino for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 500*95b7b453SJohn Marino rawbuf_idx < new_raw_idx;) 501*95b7b453SJohn Marino { 502*95b7b453SJohn Marino wchar_t wc2; 503*95b7b453SJohn Marino Idx remain_len; 504*95b7b453SJohn Marino remain_len = pstr->len - rawbuf_idx; 505*95b7b453SJohn Marino prev_st = pstr->cur_state; 506*95b7b453SJohn Marino mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, 507*95b7b453SJohn Marino remain_len, &pstr->cur_state); 508*95b7b453SJohn Marino if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) 509*95b7b453SJohn Marino { 510*95b7b453SJohn Marino /* We treat these cases as a single byte character. */ 511*95b7b453SJohn Marino if (mbclen == 0 || remain_len == 0) 512*95b7b453SJohn Marino wc = L'\0'; 513*95b7b453SJohn Marino else 514*95b7b453SJohn Marino wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); 515*95b7b453SJohn Marino mbclen = 1; 516*95b7b453SJohn Marino pstr->cur_state = prev_st; 517*95b7b453SJohn Marino } 518*95b7b453SJohn Marino else 519*95b7b453SJohn Marino wc = wc2; 520*95b7b453SJohn Marino /* Then proceed the next character. */ 521*95b7b453SJohn Marino rawbuf_idx += mbclen; 522*95b7b453SJohn Marino } 523*95b7b453SJohn Marino *last_wc = wc; 524*95b7b453SJohn Marino return rawbuf_idx; 525*95b7b453SJohn Marino } 526*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 527*95b7b453SJohn Marino 528*95b7b453SJohn Marino /* Build the buffer PSTR->MBS, and apply the translation if we need. 529*95b7b453SJohn Marino This function is used in case of REG_ICASE. */ 530*95b7b453SJohn Marino 531*95b7b453SJohn Marino static void 532*95b7b453SJohn Marino internal_function 533*95b7b453SJohn Marino build_upper_buffer (re_string_t *pstr) 534*95b7b453SJohn Marino { 535*95b7b453SJohn Marino Idx char_idx, end_idx; 536*95b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 537*95b7b453SJohn Marino 538*95b7b453SJohn Marino for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 539*95b7b453SJohn Marino { 540*95b7b453SJohn Marino int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 541*95b7b453SJohn Marino if (BE (pstr->trans != NULL, 0)) 542*95b7b453SJohn Marino ch = pstr->trans[ch]; 543*95b7b453SJohn Marino if (islower (ch)) 544*95b7b453SJohn Marino pstr->mbs[char_idx] = toupper (ch); 545*95b7b453SJohn Marino else 546*95b7b453SJohn Marino pstr->mbs[char_idx] = ch; 547*95b7b453SJohn Marino } 548*95b7b453SJohn Marino pstr->valid_len = char_idx; 549*95b7b453SJohn Marino pstr->valid_raw_len = char_idx; 550*95b7b453SJohn Marino } 551*95b7b453SJohn Marino 552*95b7b453SJohn Marino /* Apply TRANS to the buffer in PSTR. */ 553*95b7b453SJohn Marino 554*95b7b453SJohn Marino static void 555*95b7b453SJohn Marino internal_function 556*95b7b453SJohn Marino re_string_translate_buffer (re_string_t *pstr) 557*95b7b453SJohn Marino { 558*95b7b453SJohn Marino Idx buf_idx, end_idx; 559*95b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 560*95b7b453SJohn Marino 561*95b7b453SJohn Marino for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 562*95b7b453SJohn Marino { 563*95b7b453SJohn Marino int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 564*95b7b453SJohn Marino pstr->mbs[buf_idx] = pstr->trans[ch]; 565*95b7b453SJohn Marino } 566*95b7b453SJohn Marino 567*95b7b453SJohn Marino pstr->valid_len = buf_idx; 568*95b7b453SJohn Marino pstr->valid_raw_len = buf_idx; 569*95b7b453SJohn Marino } 570*95b7b453SJohn Marino 571*95b7b453SJohn Marino /* This function re-construct the buffers. 572*95b7b453SJohn Marino Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 573*95b7b453SJohn Marino convert to upper case in case of REG_ICASE, apply translation. */ 574*95b7b453SJohn Marino 575*95b7b453SJohn Marino static reg_errcode_t 576*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 577*95b7b453SJohn Marino re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) 578*95b7b453SJohn Marino { 579*95b7b453SJohn Marino Idx offset; 580*95b7b453SJohn Marino 581*95b7b453SJohn Marino if (BE (pstr->raw_mbs_idx <= idx, 0)) 582*95b7b453SJohn Marino offset = idx - pstr->raw_mbs_idx; 583*95b7b453SJohn Marino else 584*95b7b453SJohn Marino { 585*95b7b453SJohn Marino /* Reset buffer. */ 586*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 587*95b7b453SJohn Marino if (pstr->mb_cur_max > 1) 588*95b7b453SJohn Marino memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 589*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 590*95b7b453SJohn Marino pstr->len = pstr->raw_len; 591*95b7b453SJohn Marino pstr->stop = pstr->raw_stop; 592*95b7b453SJohn Marino pstr->valid_len = 0; 593*95b7b453SJohn Marino pstr->raw_mbs_idx = 0; 594*95b7b453SJohn Marino pstr->valid_raw_len = 0; 595*95b7b453SJohn Marino pstr->offsets_needed = 0; 596*95b7b453SJohn Marino pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 597*95b7b453SJohn Marino : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 598*95b7b453SJohn Marino if (!pstr->mbs_allocated) 599*95b7b453SJohn Marino pstr->mbs = (unsigned char *) pstr->raw_mbs; 600*95b7b453SJohn Marino offset = idx; 601*95b7b453SJohn Marino } 602*95b7b453SJohn Marino 603*95b7b453SJohn Marino if (BE (offset != 0, 1)) 604*95b7b453SJohn Marino { 605*95b7b453SJohn Marino /* Should the already checked characters be kept? */ 606*95b7b453SJohn Marino if (BE (offset < pstr->valid_raw_len, 1)) 607*95b7b453SJohn Marino { 608*95b7b453SJohn Marino /* Yes, move them to the front of the buffer. */ 609*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 610*95b7b453SJohn Marino if (BE (pstr->offsets_needed, 0)) 611*95b7b453SJohn Marino { 612*95b7b453SJohn Marino Idx low = 0, high = pstr->valid_len, mid; 613*95b7b453SJohn Marino do 614*95b7b453SJohn Marino { 615*95b7b453SJohn Marino mid = (high + low) / 2; 616*95b7b453SJohn Marino if (pstr->offsets[mid] > offset) 617*95b7b453SJohn Marino high = mid; 618*95b7b453SJohn Marino else if (pstr->offsets[mid] < offset) 619*95b7b453SJohn Marino low = mid + 1; 620*95b7b453SJohn Marino else 621*95b7b453SJohn Marino break; 622*95b7b453SJohn Marino } 623*95b7b453SJohn Marino while (low < high); 624*95b7b453SJohn Marino if (pstr->offsets[mid] < offset) 625*95b7b453SJohn Marino ++mid; 626*95b7b453SJohn Marino pstr->tip_context = re_string_context_at (pstr, mid - 1, 627*95b7b453SJohn Marino eflags); 628*95b7b453SJohn Marino /* This can be quite complicated, so handle specially 629*95b7b453SJohn Marino only the common and easy case where the character with 630*95b7b453SJohn Marino different length representation of lower and upper 631*95b7b453SJohn Marino case is present at or after offset. */ 632*95b7b453SJohn Marino if (pstr->valid_len > offset 633*95b7b453SJohn Marino && mid == offset && pstr->offsets[mid] == offset) 634*95b7b453SJohn Marino { 635*95b7b453SJohn Marino memmove (pstr->wcs, pstr->wcs + offset, 636*95b7b453SJohn Marino (pstr->valid_len - offset) * sizeof (wint_t)); 637*95b7b453SJohn Marino memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); 638*95b7b453SJohn Marino pstr->valid_len -= offset; 639*95b7b453SJohn Marino pstr->valid_raw_len -= offset; 640*95b7b453SJohn Marino for (low = 0; low < pstr->valid_len; low++) 641*95b7b453SJohn Marino pstr->offsets[low] = pstr->offsets[low + offset] - offset; 642*95b7b453SJohn Marino } 643*95b7b453SJohn Marino else 644*95b7b453SJohn Marino { 645*95b7b453SJohn Marino /* Otherwise, just find out how long the partial multibyte 646*95b7b453SJohn Marino character at offset is and fill it with WEOF/255. */ 647*95b7b453SJohn Marino pstr->len = pstr->raw_len - idx + offset; 648*95b7b453SJohn Marino pstr->stop = pstr->raw_stop - idx + offset; 649*95b7b453SJohn Marino pstr->offsets_needed = 0; 650*95b7b453SJohn Marino while (mid > 0 && pstr->offsets[mid - 1] == offset) 651*95b7b453SJohn Marino --mid; 652*95b7b453SJohn Marino while (mid < pstr->valid_len) 653*95b7b453SJohn Marino if (pstr->wcs[mid] != WEOF) 654*95b7b453SJohn Marino break; 655*95b7b453SJohn Marino else 656*95b7b453SJohn Marino ++mid; 657*95b7b453SJohn Marino if (mid == pstr->valid_len) 658*95b7b453SJohn Marino pstr->valid_len = 0; 659*95b7b453SJohn Marino else 660*95b7b453SJohn Marino { 661*95b7b453SJohn Marino pstr->valid_len = pstr->offsets[mid] - offset; 662*95b7b453SJohn Marino if (pstr->valid_len) 663*95b7b453SJohn Marino { 664*95b7b453SJohn Marino for (low = 0; low < pstr->valid_len; ++low) 665*95b7b453SJohn Marino pstr->wcs[low] = WEOF; 666*95b7b453SJohn Marino memset (pstr->mbs, 255, pstr->valid_len); 667*95b7b453SJohn Marino } 668*95b7b453SJohn Marino } 669*95b7b453SJohn Marino pstr->valid_raw_len = pstr->valid_len; 670*95b7b453SJohn Marino } 671*95b7b453SJohn Marino } 672*95b7b453SJohn Marino else 673*95b7b453SJohn Marino #endif 674*95b7b453SJohn Marino { 675*95b7b453SJohn Marino pstr->tip_context = re_string_context_at (pstr, offset - 1, 676*95b7b453SJohn Marino eflags); 677*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 678*95b7b453SJohn Marino if (pstr->mb_cur_max > 1) 679*95b7b453SJohn Marino memmove (pstr->wcs, pstr->wcs + offset, 680*95b7b453SJohn Marino (pstr->valid_len - offset) * sizeof (wint_t)); 681*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 682*95b7b453SJohn Marino if (BE (pstr->mbs_allocated, 0)) 683*95b7b453SJohn Marino memmove (pstr->mbs, pstr->mbs + offset, 684*95b7b453SJohn Marino pstr->valid_len - offset); 685*95b7b453SJohn Marino pstr->valid_len -= offset; 686*95b7b453SJohn Marino pstr->valid_raw_len -= offset; 687*95b7b453SJohn Marino #if DEBUG 688*95b7b453SJohn Marino assert (pstr->valid_len > 0); 689*95b7b453SJohn Marino #endif 690*95b7b453SJohn Marino } 691*95b7b453SJohn Marino } 692*95b7b453SJohn Marino else 693*95b7b453SJohn Marino { 694*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 695*95b7b453SJohn Marino /* No, skip all characters until IDX. */ 696*95b7b453SJohn Marino Idx prev_valid_len = pstr->valid_len; 697*95b7b453SJohn Marino 698*95b7b453SJohn Marino if (BE (pstr->offsets_needed, 0)) 699*95b7b453SJohn Marino { 700*95b7b453SJohn Marino pstr->len = pstr->raw_len - idx + offset; 701*95b7b453SJohn Marino pstr->stop = pstr->raw_stop - idx + offset; 702*95b7b453SJohn Marino pstr->offsets_needed = 0; 703*95b7b453SJohn Marino } 704*95b7b453SJohn Marino #endif 705*95b7b453SJohn Marino pstr->valid_len = 0; 706*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 707*95b7b453SJohn Marino if (pstr->mb_cur_max > 1) 708*95b7b453SJohn Marino { 709*95b7b453SJohn Marino Idx wcs_idx; 710*95b7b453SJohn Marino wint_t wc = WEOF; 711*95b7b453SJohn Marino 712*95b7b453SJohn Marino if (pstr->is_utf8) 713*95b7b453SJohn Marino { 714*95b7b453SJohn Marino const unsigned char *raw, *p, *end; 715*95b7b453SJohn Marino 716*95b7b453SJohn Marino /* Special case UTF-8. Multi-byte chars start with any 717*95b7b453SJohn Marino byte other than 0x80 - 0xbf. */ 718*95b7b453SJohn Marino raw = pstr->raw_mbs + pstr->raw_mbs_idx; 719*95b7b453SJohn Marino end = raw + (offset - pstr->mb_cur_max); 720*95b7b453SJohn Marino if (end < pstr->raw_mbs) 721*95b7b453SJohn Marino end = pstr->raw_mbs; 722*95b7b453SJohn Marino p = raw + offset - 1; 723*95b7b453SJohn Marino #ifdef _LIBC 724*95b7b453SJohn Marino /* We know the wchar_t encoding is UCS4, so for the simple 725*95b7b453SJohn Marino case, ASCII characters, skip the conversion step. */ 726*95b7b453SJohn Marino if (isascii (*p) && BE (pstr->trans == NULL, 1)) 727*95b7b453SJohn Marino { 728*95b7b453SJohn Marino memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 729*95b7b453SJohn Marino /* pstr->valid_len = 0; */ 730*95b7b453SJohn Marino wc = (wchar_t) *p; 731*95b7b453SJohn Marino } 732*95b7b453SJohn Marino else 733*95b7b453SJohn Marino #endif 734*95b7b453SJohn Marino for (; p >= end; --p) 735*95b7b453SJohn Marino if ((*p & 0xc0) != 0x80) 736*95b7b453SJohn Marino { 737*95b7b453SJohn Marino mbstate_t cur_state; 738*95b7b453SJohn Marino wchar_t wc2; 739*95b7b453SJohn Marino Idx mlen = raw + pstr->len - p; 740*95b7b453SJohn Marino size_t mbclen; 741*95b7b453SJohn Marino 742*95b7b453SJohn Marino #if 0 /* dead code: buf is set but never used */ 743*95b7b453SJohn Marino unsigned char buf[6]; 744*95b7b453SJohn Marino if (BE (pstr->trans != NULL, 0)) 745*95b7b453SJohn Marino { 746*95b7b453SJohn Marino int i = mlen < 6 ? mlen : 6; 747*95b7b453SJohn Marino while (--i >= 0) 748*95b7b453SJohn Marino buf[i] = pstr->trans[p[i]]; 749*95b7b453SJohn Marino } 750*95b7b453SJohn Marino #endif 751*95b7b453SJohn Marino /* XXX Don't use mbrtowc, we know which conversion 752*95b7b453SJohn Marino to use (UTF-8 -> UCS4). */ 753*95b7b453SJohn Marino memset (&cur_state, 0, sizeof (cur_state)); 754*95b7b453SJohn Marino mbclen = __mbrtowc (&wc2, (const char *) p, mlen, 755*95b7b453SJohn Marino &cur_state); 756*95b7b453SJohn Marino if (raw + offset - p <= mbclen 757*95b7b453SJohn Marino && mbclen < (size_t) -2) 758*95b7b453SJohn Marino { 759*95b7b453SJohn Marino memset (&pstr->cur_state, '\0', 760*95b7b453SJohn Marino sizeof (mbstate_t)); 761*95b7b453SJohn Marino pstr->valid_len = mbclen - (raw + offset - p); 762*95b7b453SJohn Marino wc = wc2; 763*95b7b453SJohn Marino } 764*95b7b453SJohn Marino break; 765*95b7b453SJohn Marino } 766*95b7b453SJohn Marino } 767*95b7b453SJohn Marino 768*95b7b453SJohn Marino if (wc == WEOF) 769*95b7b453SJohn Marino pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 770*95b7b453SJohn Marino if (wc == WEOF) 771*95b7b453SJohn Marino pstr->tip_context 772*95b7b453SJohn Marino = re_string_context_at (pstr, prev_valid_len - 1, eflags); 773*95b7b453SJohn Marino else 774*95b7b453SJohn Marino pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 775*95b7b453SJohn Marino && IS_WIDE_WORD_CHAR (wc)) 776*95b7b453SJohn Marino ? CONTEXT_WORD 777*95b7b453SJohn Marino : ((IS_WIDE_NEWLINE (wc) 778*95b7b453SJohn Marino && pstr->newline_anchor) 779*95b7b453SJohn Marino ? CONTEXT_NEWLINE : 0)); 780*95b7b453SJohn Marino if (BE (pstr->valid_len, 0)) 781*95b7b453SJohn Marino { 782*95b7b453SJohn Marino for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 783*95b7b453SJohn Marino pstr->wcs[wcs_idx] = WEOF; 784*95b7b453SJohn Marino if (pstr->mbs_allocated) 785*95b7b453SJohn Marino memset (pstr->mbs, 255, pstr->valid_len); 786*95b7b453SJohn Marino } 787*95b7b453SJohn Marino pstr->valid_raw_len = pstr->valid_len; 788*95b7b453SJohn Marino } 789*95b7b453SJohn Marino else 790*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 791*95b7b453SJohn Marino { 792*95b7b453SJohn Marino int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 793*95b7b453SJohn Marino pstr->valid_raw_len = 0; 794*95b7b453SJohn Marino if (pstr->trans) 795*95b7b453SJohn Marino c = pstr->trans[c]; 796*95b7b453SJohn Marino pstr->tip_context = (bitset_contain (pstr->word_char, c) 797*95b7b453SJohn Marino ? CONTEXT_WORD 798*95b7b453SJohn Marino : ((IS_NEWLINE (c) && pstr->newline_anchor) 799*95b7b453SJohn Marino ? CONTEXT_NEWLINE : 0)); 800*95b7b453SJohn Marino } 801*95b7b453SJohn Marino } 802*95b7b453SJohn Marino if (!BE (pstr->mbs_allocated, 0)) 803*95b7b453SJohn Marino pstr->mbs += offset; 804*95b7b453SJohn Marino } 805*95b7b453SJohn Marino pstr->raw_mbs_idx = idx; 806*95b7b453SJohn Marino pstr->len -= offset; 807*95b7b453SJohn Marino pstr->stop -= offset; 808*95b7b453SJohn Marino 809*95b7b453SJohn Marino /* Then build the buffers. */ 810*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 811*95b7b453SJohn Marino if (pstr->mb_cur_max > 1) 812*95b7b453SJohn Marino { 813*95b7b453SJohn Marino if (pstr->icase) 814*95b7b453SJohn Marino { 815*95b7b453SJohn Marino reg_errcode_t ret = build_wcs_upper_buffer (pstr); 816*95b7b453SJohn Marino if (BE (ret != REG_NOERROR, 0)) 817*95b7b453SJohn Marino return ret; 818*95b7b453SJohn Marino } 819*95b7b453SJohn Marino else 820*95b7b453SJohn Marino build_wcs_buffer (pstr); 821*95b7b453SJohn Marino } 822*95b7b453SJohn Marino else 823*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 824*95b7b453SJohn Marino if (BE (pstr->mbs_allocated, 0)) 825*95b7b453SJohn Marino { 826*95b7b453SJohn Marino if (pstr->icase) 827*95b7b453SJohn Marino build_upper_buffer (pstr); 828*95b7b453SJohn Marino else if (pstr->trans != NULL) 829*95b7b453SJohn Marino re_string_translate_buffer (pstr); 830*95b7b453SJohn Marino } 831*95b7b453SJohn Marino else 832*95b7b453SJohn Marino pstr->valid_len = pstr->len; 833*95b7b453SJohn Marino 834*95b7b453SJohn Marino pstr->cur_idx = 0; 835*95b7b453SJohn Marino return REG_NOERROR; 836*95b7b453SJohn Marino } 837*95b7b453SJohn Marino 838*95b7b453SJohn Marino static unsigned char 839*95b7b453SJohn Marino internal_function __attribute ((pure)) 840*95b7b453SJohn Marino re_string_peek_byte_case (const re_string_t *pstr, Idx idx) 841*95b7b453SJohn Marino { 842*95b7b453SJohn Marino int ch; 843*95b7b453SJohn Marino Idx off; 844*95b7b453SJohn Marino 845*95b7b453SJohn Marino /* Handle the common (easiest) cases first. */ 846*95b7b453SJohn Marino if (BE (!pstr->mbs_allocated, 1)) 847*95b7b453SJohn Marino return re_string_peek_byte (pstr, idx); 848*95b7b453SJohn Marino 849*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 850*95b7b453SJohn Marino if (pstr->mb_cur_max > 1 851*95b7b453SJohn Marino && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 852*95b7b453SJohn Marino return re_string_peek_byte (pstr, idx); 853*95b7b453SJohn Marino #endif 854*95b7b453SJohn Marino 855*95b7b453SJohn Marino off = pstr->cur_idx + idx; 856*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 857*95b7b453SJohn Marino if (pstr->offsets_needed) 858*95b7b453SJohn Marino off = pstr->offsets[off]; 859*95b7b453SJohn Marino #endif 860*95b7b453SJohn Marino 861*95b7b453SJohn Marino ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 862*95b7b453SJohn Marino 863*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 864*95b7b453SJohn Marino /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 865*95b7b453SJohn Marino this function returns CAPITAL LETTER I instead of first byte of 866*95b7b453SJohn Marino DOTLESS SMALL LETTER I. The latter would confuse the parser, 867*95b7b453SJohn Marino since peek_byte_case doesn't advance cur_idx in any way. */ 868*95b7b453SJohn Marino if (pstr->offsets_needed && !isascii (ch)) 869*95b7b453SJohn Marino return re_string_peek_byte (pstr, idx); 870*95b7b453SJohn Marino #endif 871*95b7b453SJohn Marino 872*95b7b453SJohn Marino return ch; 873*95b7b453SJohn Marino } 874*95b7b453SJohn Marino 875*95b7b453SJohn Marino static unsigned char 876*95b7b453SJohn Marino internal_function __attribute ((pure)) 877*95b7b453SJohn Marino re_string_fetch_byte_case (re_string_t *pstr) 878*95b7b453SJohn Marino { 879*95b7b453SJohn Marino if (BE (!pstr->mbs_allocated, 1)) 880*95b7b453SJohn Marino return re_string_fetch_byte (pstr); 881*95b7b453SJohn Marino 882*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 883*95b7b453SJohn Marino if (pstr->offsets_needed) 884*95b7b453SJohn Marino { 885*95b7b453SJohn Marino Idx off; 886*95b7b453SJohn Marino int ch; 887*95b7b453SJohn Marino 888*95b7b453SJohn Marino /* For tr_TR.UTF-8 [[:islower:]] there is 889*95b7b453SJohn Marino [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 890*95b7b453SJohn Marino in that case the whole multi-byte character and return 891*95b7b453SJohn Marino the original letter. On the other side, with 892*95b7b453SJohn Marino [[: DOTLESS SMALL LETTER I return [[:I, as doing 893*95b7b453SJohn Marino anything else would complicate things too much. */ 894*95b7b453SJohn Marino 895*95b7b453SJohn Marino if (!re_string_first_byte (pstr, pstr->cur_idx)) 896*95b7b453SJohn Marino return re_string_fetch_byte (pstr); 897*95b7b453SJohn Marino 898*95b7b453SJohn Marino off = pstr->offsets[pstr->cur_idx]; 899*95b7b453SJohn Marino ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 900*95b7b453SJohn Marino 901*95b7b453SJohn Marino if (! isascii (ch)) 902*95b7b453SJohn Marino return re_string_fetch_byte (pstr); 903*95b7b453SJohn Marino 904*95b7b453SJohn Marino re_string_skip_bytes (pstr, 905*95b7b453SJohn Marino re_string_char_size_at (pstr, pstr->cur_idx)); 906*95b7b453SJohn Marino return ch; 907*95b7b453SJohn Marino } 908*95b7b453SJohn Marino #endif 909*95b7b453SJohn Marino 910*95b7b453SJohn Marino return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 911*95b7b453SJohn Marino } 912*95b7b453SJohn Marino 913*95b7b453SJohn Marino static void 914*95b7b453SJohn Marino internal_function 915*95b7b453SJohn Marino re_string_destruct (re_string_t *pstr) 916*95b7b453SJohn Marino { 917*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 918*95b7b453SJohn Marino re_free (pstr->wcs); 919*95b7b453SJohn Marino re_free (pstr->offsets); 920*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 921*95b7b453SJohn Marino if (pstr->mbs_allocated) 922*95b7b453SJohn Marino re_free (pstr->mbs); 923*95b7b453SJohn Marino } 924*95b7b453SJohn Marino 925*95b7b453SJohn Marino /* Return the context at IDX in INPUT. */ 926*95b7b453SJohn Marino 927*95b7b453SJohn Marino static unsigned int 928*95b7b453SJohn Marino internal_function 929*95b7b453SJohn Marino re_string_context_at (const re_string_t *input, Idx idx, int eflags) 930*95b7b453SJohn Marino { 931*95b7b453SJohn Marino int c; 932*95b7b453SJohn Marino if (BE (! REG_VALID_INDEX (idx), 0)) 933*95b7b453SJohn Marino /* In this case, we use the value stored in input->tip_context, 934*95b7b453SJohn Marino since we can't know the character in input->mbs[-1] here. */ 935*95b7b453SJohn Marino return input->tip_context; 936*95b7b453SJohn Marino if (BE (idx == input->len, 0)) 937*95b7b453SJohn Marino return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 938*95b7b453SJohn Marino : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 939*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 940*95b7b453SJohn Marino if (input->mb_cur_max > 1) 941*95b7b453SJohn Marino { 942*95b7b453SJohn Marino wint_t wc; 943*95b7b453SJohn Marino Idx wc_idx = idx; 944*95b7b453SJohn Marino while(input->wcs[wc_idx] == WEOF) 945*95b7b453SJohn Marino { 946*95b7b453SJohn Marino #ifdef DEBUG 947*95b7b453SJohn Marino /* It must not happen. */ 948*95b7b453SJohn Marino assert (REG_VALID_INDEX (wc_idx)); 949*95b7b453SJohn Marino #endif 950*95b7b453SJohn Marino --wc_idx; 951*95b7b453SJohn Marino if (! REG_VALID_INDEX (wc_idx)) 952*95b7b453SJohn Marino return input->tip_context; 953*95b7b453SJohn Marino } 954*95b7b453SJohn Marino wc = input->wcs[wc_idx]; 955*95b7b453SJohn Marino if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 956*95b7b453SJohn Marino return CONTEXT_WORD; 957*95b7b453SJohn Marino return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 958*95b7b453SJohn Marino ? CONTEXT_NEWLINE : 0); 959*95b7b453SJohn Marino } 960*95b7b453SJohn Marino else 961*95b7b453SJohn Marino #endif 962*95b7b453SJohn Marino { 963*95b7b453SJohn Marino c = re_string_byte_at (input, idx); 964*95b7b453SJohn Marino if (bitset_contain (input->word_char, c)) 965*95b7b453SJohn Marino return CONTEXT_WORD; 966*95b7b453SJohn Marino return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 967*95b7b453SJohn Marino } 968*95b7b453SJohn Marino } 969*95b7b453SJohn Marino 970*95b7b453SJohn Marino /* Functions for set operation. */ 971*95b7b453SJohn Marino 972*95b7b453SJohn Marino static reg_errcode_t 973*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 974*95b7b453SJohn Marino re_node_set_alloc (re_node_set *set, Idx size) 975*95b7b453SJohn Marino { 976*95b7b453SJohn Marino set->alloc = size; 977*95b7b453SJohn Marino set->nelem = 0; 978*95b7b453SJohn Marino set->elems = re_malloc (Idx, size); 979*95b7b453SJohn Marino if (BE (set->elems == NULL, 0)) 980*95b7b453SJohn Marino return REG_ESPACE; 981*95b7b453SJohn Marino return REG_NOERROR; 982*95b7b453SJohn Marino } 983*95b7b453SJohn Marino 984*95b7b453SJohn Marino static reg_errcode_t 985*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 986*95b7b453SJohn Marino re_node_set_init_1 (re_node_set *set, Idx elem) 987*95b7b453SJohn Marino { 988*95b7b453SJohn Marino set->alloc = 1; 989*95b7b453SJohn Marino set->nelem = 1; 990*95b7b453SJohn Marino set->elems = re_malloc (Idx, 1); 991*95b7b453SJohn Marino if (BE (set->elems == NULL, 0)) 992*95b7b453SJohn Marino { 993*95b7b453SJohn Marino set->alloc = set->nelem = 0; 994*95b7b453SJohn Marino return REG_ESPACE; 995*95b7b453SJohn Marino } 996*95b7b453SJohn Marino set->elems[0] = elem; 997*95b7b453SJohn Marino return REG_NOERROR; 998*95b7b453SJohn Marino } 999*95b7b453SJohn Marino 1000*95b7b453SJohn Marino static reg_errcode_t 1001*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 1002*95b7b453SJohn Marino re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2) 1003*95b7b453SJohn Marino { 1004*95b7b453SJohn Marino set->alloc = 2; 1005*95b7b453SJohn Marino set->elems = re_malloc (Idx, 2); 1006*95b7b453SJohn Marino if (BE (set->elems == NULL, 0)) 1007*95b7b453SJohn Marino return REG_ESPACE; 1008*95b7b453SJohn Marino if (elem1 == elem2) 1009*95b7b453SJohn Marino { 1010*95b7b453SJohn Marino set->nelem = 1; 1011*95b7b453SJohn Marino set->elems[0] = elem1; 1012*95b7b453SJohn Marino } 1013*95b7b453SJohn Marino else 1014*95b7b453SJohn Marino { 1015*95b7b453SJohn Marino set->nelem = 2; 1016*95b7b453SJohn Marino if (elem1 < elem2) 1017*95b7b453SJohn Marino { 1018*95b7b453SJohn Marino set->elems[0] = elem1; 1019*95b7b453SJohn Marino set->elems[1] = elem2; 1020*95b7b453SJohn Marino } 1021*95b7b453SJohn Marino else 1022*95b7b453SJohn Marino { 1023*95b7b453SJohn Marino set->elems[0] = elem2; 1024*95b7b453SJohn Marino set->elems[1] = elem1; 1025*95b7b453SJohn Marino } 1026*95b7b453SJohn Marino } 1027*95b7b453SJohn Marino return REG_NOERROR; 1028*95b7b453SJohn Marino } 1029*95b7b453SJohn Marino 1030*95b7b453SJohn Marino static reg_errcode_t 1031*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 1032*95b7b453SJohn Marino re_node_set_init_copy (re_node_set *dest, const re_node_set *src) 1033*95b7b453SJohn Marino { 1034*95b7b453SJohn Marino dest->nelem = src->nelem; 1035*95b7b453SJohn Marino if (src->nelem > 0) 1036*95b7b453SJohn Marino { 1037*95b7b453SJohn Marino dest->alloc = dest->nelem; 1038*95b7b453SJohn Marino dest->elems = re_malloc (Idx, dest->alloc); 1039*95b7b453SJohn Marino if (BE (dest->elems == NULL, 0)) 1040*95b7b453SJohn Marino { 1041*95b7b453SJohn Marino dest->alloc = dest->nelem = 0; 1042*95b7b453SJohn Marino return REG_ESPACE; 1043*95b7b453SJohn Marino } 1044*95b7b453SJohn Marino memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1045*95b7b453SJohn Marino } 1046*95b7b453SJohn Marino else 1047*95b7b453SJohn Marino re_node_set_init_empty (dest); 1048*95b7b453SJohn Marino return REG_NOERROR; 1049*95b7b453SJohn Marino } 1050*95b7b453SJohn Marino 1051*95b7b453SJohn Marino /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 1052*95b7b453SJohn Marino DEST. Return value indicate the error code or REG_NOERROR if succeeded. 1053*95b7b453SJohn Marino Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 1054*95b7b453SJohn Marino 1055*95b7b453SJohn Marino static reg_errcode_t 1056*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 1057*95b7b453SJohn Marino re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 1058*95b7b453SJohn Marino const re_node_set *src2) 1059*95b7b453SJohn Marino { 1060*95b7b453SJohn Marino Idx i1, i2, is, id, delta, sbase; 1061*95b7b453SJohn Marino if (src1->nelem == 0 || src2->nelem == 0) 1062*95b7b453SJohn Marino return REG_NOERROR; 1063*95b7b453SJohn Marino 1064*95b7b453SJohn Marino /* We need dest->nelem + 2 * elems_in_intersection; this is a 1065*95b7b453SJohn Marino conservative estimate. */ 1066*95b7b453SJohn Marino if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 1067*95b7b453SJohn Marino { 1068*95b7b453SJohn Marino Idx new_alloc = src1->nelem + src2->nelem + dest->alloc; 1069*95b7b453SJohn Marino Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc); 1070*95b7b453SJohn Marino if (BE (new_elems == NULL, 0)) 1071*95b7b453SJohn Marino return REG_ESPACE; 1072*95b7b453SJohn Marino dest->elems = new_elems; 1073*95b7b453SJohn Marino dest->alloc = new_alloc; 1074*95b7b453SJohn Marino } 1075*95b7b453SJohn Marino 1076*95b7b453SJohn Marino /* Find the items in the intersection of SRC1 and SRC2, and copy 1077*95b7b453SJohn Marino into the top of DEST those that are not already in DEST itself. */ 1078*95b7b453SJohn Marino sbase = dest->nelem + src1->nelem + src2->nelem; 1079*95b7b453SJohn Marino i1 = src1->nelem - 1; 1080*95b7b453SJohn Marino i2 = src2->nelem - 1; 1081*95b7b453SJohn Marino id = dest->nelem - 1; 1082*95b7b453SJohn Marino for (;;) 1083*95b7b453SJohn Marino { 1084*95b7b453SJohn Marino if (src1->elems[i1] == src2->elems[i2]) 1085*95b7b453SJohn Marino { 1086*95b7b453SJohn Marino /* Try to find the item in DEST. Maybe we could binary search? */ 1087*95b7b453SJohn Marino while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1]) 1088*95b7b453SJohn Marino --id; 1089*95b7b453SJohn Marino 1090*95b7b453SJohn Marino if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1]) 1091*95b7b453SJohn Marino dest->elems[--sbase] = src1->elems[i1]; 1092*95b7b453SJohn Marino 1093*95b7b453SJohn Marino if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2)) 1094*95b7b453SJohn Marino break; 1095*95b7b453SJohn Marino } 1096*95b7b453SJohn Marino 1097*95b7b453SJohn Marino /* Lower the highest of the two items. */ 1098*95b7b453SJohn Marino else if (src1->elems[i1] < src2->elems[i2]) 1099*95b7b453SJohn Marino { 1100*95b7b453SJohn Marino if (! REG_VALID_INDEX (--i2)) 1101*95b7b453SJohn Marino break; 1102*95b7b453SJohn Marino } 1103*95b7b453SJohn Marino else 1104*95b7b453SJohn Marino { 1105*95b7b453SJohn Marino if (! REG_VALID_INDEX (--i1)) 1106*95b7b453SJohn Marino break; 1107*95b7b453SJohn Marino } 1108*95b7b453SJohn Marino } 1109*95b7b453SJohn Marino 1110*95b7b453SJohn Marino id = dest->nelem - 1; 1111*95b7b453SJohn Marino is = dest->nelem + src1->nelem + src2->nelem - 1; 1112*95b7b453SJohn Marino delta = is - sbase + 1; 1113*95b7b453SJohn Marino 1114*95b7b453SJohn Marino /* Now copy. When DELTA becomes zero, the remaining 1115*95b7b453SJohn Marino DEST elements are already in place; this is more or 1116*95b7b453SJohn Marino less the same loop that is in re_node_set_merge. */ 1117*95b7b453SJohn Marino dest->nelem += delta; 1118*95b7b453SJohn Marino if (delta > 0 && REG_VALID_INDEX (id)) 1119*95b7b453SJohn Marino for (;;) 1120*95b7b453SJohn Marino { 1121*95b7b453SJohn Marino if (dest->elems[is] > dest->elems[id]) 1122*95b7b453SJohn Marino { 1123*95b7b453SJohn Marino /* Copy from the top. */ 1124*95b7b453SJohn Marino dest->elems[id + delta--] = dest->elems[is--]; 1125*95b7b453SJohn Marino if (delta == 0) 1126*95b7b453SJohn Marino break; 1127*95b7b453SJohn Marino } 1128*95b7b453SJohn Marino else 1129*95b7b453SJohn Marino { 1130*95b7b453SJohn Marino /* Slide from the bottom. */ 1131*95b7b453SJohn Marino dest->elems[id + delta] = dest->elems[id]; 1132*95b7b453SJohn Marino if (! REG_VALID_INDEX (--id)) 1133*95b7b453SJohn Marino break; 1134*95b7b453SJohn Marino } 1135*95b7b453SJohn Marino } 1136*95b7b453SJohn Marino 1137*95b7b453SJohn Marino /* Copy remaining SRC elements. */ 1138*95b7b453SJohn Marino memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx)); 1139*95b7b453SJohn Marino 1140*95b7b453SJohn Marino return REG_NOERROR; 1141*95b7b453SJohn Marino } 1142*95b7b453SJohn Marino 1143*95b7b453SJohn Marino /* Calculate the union set of the sets SRC1 and SRC2. And store it to 1144*95b7b453SJohn Marino DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1145*95b7b453SJohn Marino 1146*95b7b453SJohn Marino static reg_errcode_t 1147*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 1148*95b7b453SJohn Marino re_node_set_init_union (re_node_set *dest, const re_node_set *src1, 1149*95b7b453SJohn Marino const re_node_set *src2) 1150*95b7b453SJohn Marino { 1151*95b7b453SJohn Marino Idx i1, i2, id; 1152*95b7b453SJohn Marino if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 1153*95b7b453SJohn Marino { 1154*95b7b453SJohn Marino dest->alloc = src1->nelem + src2->nelem; 1155*95b7b453SJohn Marino dest->elems = re_malloc (Idx, dest->alloc); 1156*95b7b453SJohn Marino if (BE (dest->elems == NULL, 0)) 1157*95b7b453SJohn Marino return REG_ESPACE; 1158*95b7b453SJohn Marino } 1159*95b7b453SJohn Marino else 1160*95b7b453SJohn Marino { 1161*95b7b453SJohn Marino if (src1 != NULL && src1->nelem > 0) 1162*95b7b453SJohn Marino return re_node_set_init_copy (dest, src1); 1163*95b7b453SJohn Marino else if (src2 != NULL && src2->nelem > 0) 1164*95b7b453SJohn Marino return re_node_set_init_copy (dest, src2); 1165*95b7b453SJohn Marino else 1166*95b7b453SJohn Marino re_node_set_init_empty (dest); 1167*95b7b453SJohn Marino return REG_NOERROR; 1168*95b7b453SJohn Marino } 1169*95b7b453SJohn Marino for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 1170*95b7b453SJohn Marino { 1171*95b7b453SJohn Marino if (src1->elems[i1] > src2->elems[i2]) 1172*95b7b453SJohn Marino { 1173*95b7b453SJohn Marino dest->elems[id++] = src2->elems[i2++]; 1174*95b7b453SJohn Marino continue; 1175*95b7b453SJohn Marino } 1176*95b7b453SJohn Marino if (src1->elems[i1] == src2->elems[i2]) 1177*95b7b453SJohn Marino ++i2; 1178*95b7b453SJohn Marino dest->elems[id++] = src1->elems[i1++]; 1179*95b7b453SJohn Marino } 1180*95b7b453SJohn Marino if (i1 < src1->nelem) 1181*95b7b453SJohn Marino { 1182*95b7b453SJohn Marino memcpy (dest->elems + id, src1->elems + i1, 1183*95b7b453SJohn Marino (src1->nelem - i1) * sizeof (Idx)); 1184*95b7b453SJohn Marino id += src1->nelem - i1; 1185*95b7b453SJohn Marino } 1186*95b7b453SJohn Marino else if (i2 < src2->nelem) 1187*95b7b453SJohn Marino { 1188*95b7b453SJohn Marino memcpy (dest->elems + id, src2->elems + i2, 1189*95b7b453SJohn Marino (src2->nelem - i2) * sizeof (Idx)); 1190*95b7b453SJohn Marino id += src2->nelem - i2; 1191*95b7b453SJohn Marino } 1192*95b7b453SJohn Marino dest->nelem = id; 1193*95b7b453SJohn Marino return REG_NOERROR; 1194*95b7b453SJohn Marino } 1195*95b7b453SJohn Marino 1196*95b7b453SJohn Marino /* Calculate the union set of the sets DEST and SRC. And store it to 1197*95b7b453SJohn Marino DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1198*95b7b453SJohn Marino 1199*95b7b453SJohn Marino static reg_errcode_t 1200*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 1201*95b7b453SJohn Marino re_node_set_merge (re_node_set *dest, const re_node_set *src) 1202*95b7b453SJohn Marino { 1203*95b7b453SJohn Marino Idx is, id, sbase, delta; 1204*95b7b453SJohn Marino if (src == NULL || src->nelem == 0) 1205*95b7b453SJohn Marino return REG_NOERROR; 1206*95b7b453SJohn Marino if (dest->alloc < 2 * src->nelem + dest->nelem) 1207*95b7b453SJohn Marino { 1208*95b7b453SJohn Marino Idx new_alloc = 2 * (src->nelem + dest->alloc); 1209*95b7b453SJohn Marino Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc); 1210*95b7b453SJohn Marino if (BE (new_buffer == NULL, 0)) 1211*95b7b453SJohn Marino return REG_ESPACE; 1212*95b7b453SJohn Marino dest->elems = new_buffer; 1213*95b7b453SJohn Marino dest->alloc = new_alloc; 1214*95b7b453SJohn Marino } 1215*95b7b453SJohn Marino 1216*95b7b453SJohn Marino if (BE (dest->nelem == 0, 0)) 1217*95b7b453SJohn Marino { 1218*95b7b453SJohn Marino dest->nelem = src->nelem; 1219*95b7b453SJohn Marino memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1220*95b7b453SJohn Marino return REG_NOERROR; 1221*95b7b453SJohn Marino } 1222*95b7b453SJohn Marino 1223*95b7b453SJohn Marino /* Copy into the top of DEST the items of SRC that are not 1224*95b7b453SJohn Marino found in DEST. Maybe we could binary search in DEST? */ 1225*95b7b453SJohn Marino for (sbase = dest->nelem + 2 * src->nelem, 1226*95b7b453SJohn Marino is = src->nelem - 1, id = dest->nelem - 1; 1227*95b7b453SJohn Marino REG_VALID_INDEX (is) && REG_VALID_INDEX (id); ) 1228*95b7b453SJohn Marino { 1229*95b7b453SJohn Marino if (dest->elems[id] == src->elems[is]) 1230*95b7b453SJohn Marino is--, id--; 1231*95b7b453SJohn Marino else if (dest->elems[id] < src->elems[is]) 1232*95b7b453SJohn Marino dest->elems[--sbase] = src->elems[is--]; 1233*95b7b453SJohn Marino else /* if (dest->elems[id] > src->elems[is]) */ 1234*95b7b453SJohn Marino --id; 1235*95b7b453SJohn Marino } 1236*95b7b453SJohn Marino 1237*95b7b453SJohn Marino if (REG_VALID_INDEX (is)) 1238*95b7b453SJohn Marino { 1239*95b7b453SJohn Marino /* If DEST is exhausted, the remaining items of SRC must be unique. */ 1240*95b7b453SJohn Marino sbase -= is + 1; 1241*95b7b453SJohn Marino memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx)); 1242*95b7b453SJohn Marino } 1243*95b7b453SJohn Marino 1244*95b7b453SJohn Marino id = dest->nelem - 1; 1245*95b7b453SJohn Marino is = dest->nelem + 2 * src->nelem - 1; 1246*95b7b453SJohn Marino delta = is - sbase + 1; 1247*95b7b453SJohn Marino if (delta == 0) 1248*95b7b453SJohn Marino return REG_NOERROR; 1249*95b7b453SJohn Marino 1250*95b7b453SJohn Marino /* Now copy. When DELTA becomes zero, the remaining 1251*95b7b453SJohn Marino DEST elements are already in place. */ 1252*95b7b453SJohn Marino dest->nelem += delta; 1253*95b7b453SJohn Marino for (;;) 1254*95b7b453SJohn Marino { 1255*95b7b453SJohn Marino if (dest->elems[is] > dest->elems[id]) 1256*95b7b453SJohn Marino { 1257*95b7b453SJohn Marino /* Copy from the top. */ 1258*95b7b453SJohn Marino dest->elems[id + delta--] = dest->elems[is--]; 1259*95b7b453SJohn Marino if (delta == 0) 1260*95b7b453SJohn Marino break; 1261*95b7b453SJohn Marino } 1262*95b7b453SJohn Marino else 1263*95b7b453SJohn Marino { 1264*95b7b453SJohn Marino /* Slide from the bottom. */ 1265*95b7b453SJohn Marino dest->elems[id + delta] = dest->elems[id]; 1266*95b7b453SJohn Marino if (! REG_VALID_INDEX (--id)) 1267*95b7b453SJohn Marino { 1268*95b7b453SJohn Marino /* Copy remaining SRC elements. */ 1269*95b7b453SJohn Marino memcpy (dest->elems, dest->elems + sbase, 1270*95b7b453SJohn Marino delta * sizeof (Idx)); 1271*95b7b453SJohn Marino break; 1272*95b7b453SJohn Marino } 1273*95b7b453SJohn Marino } 1274*95b7b453SJohn Marino } 1275*95b7b453SJohn Marino 1276*95b7b453SJohn Marino return REG_NOERROR; 1277*95b7b453SJohn Marino } 1278*95b7b453SJohn Marino 1279*95b7b453SJohn Marino /* Insert the new element ELEM to the re_node_set* SET. 1280*95b7b453SJohn Marino SET should not already have ELEM. 1281*95b7b453SJohn Marino Return true if successful. */ 1282*95b7b453SJohn Marino 1283*95b7b453SJohn Marino static bool 1284*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 1285*95b7b453SJohn Marino re_node_set_insert (re_node_set *set, Idx elem) 1286*95b7b453SJohn Marino { 1287*95b7b453SJohn Marino Idx idx; 1288*95b7b453SJohn Marino /* In case the set is empty. */ 1289*95b7b453SJohn Marino if (set->alloc == 0) 1290*95b7b453SJohn Marino return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1); 1291*95b7b453SJohn Marino 1292*95b7b453SJohn Marino if (BE (set->nelem, 0) == 0) 1293*95b7b453SJohn Marino { 1294*95b7b453SJohn Marino /* We already guaranteed above that set->alloc != 0. */ 1295*95b7b453SJohn Marino set->elems[0] = elem; 1296*95b7b453SJohn Marino ++set->nelem; 1297*95b7b453SJohn Marino return true; 1298*95b7b453SJohn Marino } 1299*95b7b453SJohn Marino 1300*95b7b453SJohn Marino /* Realloc if we need. */ 1301*95b7b453SJohn Marino if (set->alloc == set->nelem) 1302*95b7b453SJohn Marino { 1303*95b7b453SJohn Marino Idx *new_elems; 1304*95b7b453SJohn Marino set->alloc = set->alloc * 2; 1305*95b7b453SJohn Marino new_elems = re_realloc (set->elems, Idx, set->alloc); 1306*95b7b453SJohn Marino if (BE (new_elems == NULL, 0)) 1307*95b7b453SJohn Marino return false; 1308*95b7b453SJohn Marino set->elems = new_elems; 1309*95b7b453SJohn Marino } 1310*95b7b453SJohn Marino 1311*95b7b453SJohn Marino /* Move the elements which follows the new element. Test the 1312*95b7b453SJohn Marino first element separately to skip a check in the inner loop. */ 1313*95b7b453SJohn Marino if (elem < set->elems[0]) 1314*95b7b453SJohn Marino { 1315*95b7b453SJohn Marino idx = 0; 1316*95b7b453SJohn Marino for (idx = set->nelem; idx > 0; idx--) 1317*95b7b453SJohn Marino set->elems[idx] = set->elems[idx - 1]; 1318*95b7b453SJohn Marino } 1319*95b7b453SJohn Marino else 1320*95b7b453SJohn Marino { 1321*95b7b453SJohn Marino for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 1322*95b7b453SJohn Marino set->elems[idx] = set->elems[idx - 1]; 1323*95b7b453SJohn Marino } 1324*95b7b453SJohn Marino 1325*95b7b453SJohn Marino /* Insert the new element. */ 1326*95b7b453SJohn Marino set->elems[idx] = elem; 1327*95b7b453SJohn Marino ++set->nelem; 1328*95b7b453SJohn Marino return true; 1329*95b7b453SJohn Marino } 1330*95b7b453SJohn Marino 1331*95b7b453SJohn Marino /* Insert the new element ELEM to the re_node_set* SET. 1332*95b7b453SJohn Marino SET should not already have any element greater than or equal to ELEM. 1333*95b7b453SJohn Marino Return true if successful. */ 1334*95b7b453SJohn Marino 1335*95b7b453SJohn Marino static bool 1336*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 1337*95b7b453SJohn Marino re_node_set_insert_last (re_node_set *set, Idx elem) 1338*95b7b453SJohn Marino { 1339*95b7b453SJohn Marino /* Realloc if we need. */ 1340*95b7b453SJohn Marino if (set->alloc == set->nelem) 1341*95b7b453SJohn Marino { 1342*95b7b453SJohn Marino Idx *new_elems; 1343*95b7b453SJohn Marino set->alloc = (set->alloc + 1) * 2; 1344*95b7b453SJohn Marino new_elems = re_realloc (set->elems, Idx, set->alloc); 1345*95b7b453SJohn Marino if (BE (new_elems == NULL, 0)) 1346*95b7b453SJohn Marino return false; 1347*95b7b453SJohn Marino set->elems = new_elems; 1348*95b7b453SJohn Marino } 1349*95b7b453SJohn Marino 1350*95b7b453SJohn Marino /* Insert the new element. */ 1351*95b7b453SJohn Marino set->elems[set->nelem++] = elem; 1352*95b7b453SJohn Marino return true; 1353*95b7b453SJohn Marino } 1354*95b7b453SJohn Marino 1355*95b7b453SJohn Marino /* Compare two node sets SET1 and SET2. 1356*95b7b453SJohn Marino Return true if SET1 and SET2 are equivalent. */ 1357*95b7b453SJohn Marino 1358*95b7b453SJohn Marino static bool 1359*95b7b453SJohn Marino internal_function __attribute ((pure)) 1360*95b7b453SJohn Marino re_node_set_compare (const re_node_set *set1, const re_node_set *set2) 1361*95b7b453SJohn Marino { 1362*95b7b453SJohn Marino Idx i; 1363*95b7b453SJohn Marino if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 1364*95b7b453SJohn Marino return false; 1365*95b7b453SJohn Marino for (i = set1->nelem ; REG_VALID_INDEX (--i) ; ) 1366*95b7b453SJohn Marino if (set1->elems[i] != set2->elems[i]) 1367*95b7b453SJohn Marino return false; 1368*95b7b453SJohn Marino return true; 1369*95b7b453SJohn Marino } 1370*95b7b453SJohn Marino 1371*95b7b453SJohn Marino /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 1372*95b7b453SJohn Marino 1373*95b7b453SJohn Marino static Idx 1374*95b7b453SJohn Marino internal_function __attribute ((pure)) 1375*95b7b453SJohn Marino re_node_set_contains (const re_node_set *set, Idx elem) 1376*95b7b453SJohn Marino { 1377*95b7b453SJohn Marino __re_size_t idx, right, mid; 1378*95b7b453SJohn Marino if (! REG_VALID_NONZERO_INDEX (set->nelem)) 1379*95b7b453SJohn Marino return 0; 1380*95b7b453SJohn Marino 1381*95b7b453SJohn Marino /* Binary search the element. */ 1382*95b7b453SJohn Marino idx = 0; 1383*95b7b453SJohn Marino right = set->nelem - 1; 1384*95b7b453SJohn Marino while (idx < right) 1385*95b7b453SJohn Marino { 1386*95b7b453SJohn Marino mid = (idx + right) / 2; 1387*95b7b453SJohn Marino if (set->elems[mid] < elem) 1388*95b7b453SJohn Marino idx = mid + 1; 1389*95b7b453SJohn Marino else 1390*95b7b453SJohn Marino right = mid; 1391*95b7b453SJohn Marino } 1392*95b7b453SJohn Marino return set->elems[idx] == elem ? idx + 1 : 0; 1393*95b7b453SJohn Marino } 1394*95b7b453SJohn Marino 1395*95b7b453SJohn Marino static void 1396*95b7b453SJohn Marino internal_function 1397*95b7b453SJohn Marino re_node_set_remove_at (re_node_set *set, Idx idx) 1398*95b7b453SJohn Marino { 1399*95b7b453SJohn Marino verify (! TYPE_SIGNED (Idx)); 1400*95b7b453SJohn Marino /* if (idx < 0) 1401*95b7b453SJohn Marino return; */ 1402*95b7b453SJohn Marino if (idx >= set->nelem) 1403*95b7b453SJohn Marino return; 1404*95b7b453SJohn Marino --set->nelem; 1405*95b7b453SJohn Marino for (; idx < set->nelem; idx++) 1406*95b7b453SJohn Marino set->elems[idx] = set->elems[idx + 1]; 1407*95b7b453SJohn Marino } 1408*95b7b453SJohn Marino 1409*95b7b453SJohn Marino 1410*95b7b453SJohn Marino /* Add the token TOKEN to dfa->nodes, and return the index of the token. 1411*95b7b453SJohn Marino Or return REG_MISSING if an error occurred. */ 1412*95b7b453SJohn Marino 1413*95b7b453SJohn Marino static Idx 1414*95b7b453SJohn Marino internal_function 1415*95b7b453SJohn Marino re_dfa_add_node (re_dfa_t *dfa, re_token_t token) 1416*95b7b453SJohn Marino { 1417*95b7b453SJohn Marino if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 1418*95b7b453SJohn Marino { 1419*95b7b453SJohn Marino size_t new_nodes_alloc = dfa->nodes_alloc * 2; 1420*95b7b453SJohn Marino Idx *new_nexts, *new_indices; 1421*95b7b453SJohn Marino re_node_set *new_edests, *new_eclosures; 1422*95b7b453SJohn Marino re_token_t *new_nodes; 1423*95b7b453SJohn Marino size_t max_object_size = 1424*95b7b453SJohn Marino MAX (sizeof (re_token_t), 1425*95b7b453SJohn Marino MAX (sizeof (re_node_set), 1426*95b7b453SJohn Marino sizeof (Idx))); 1427*95b7b453SJohn Marino 1428*95b7b453SJohn Marino /* Avoid overflows. */ 1429*95b7b453SJohn Marino if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0)) 1430*95b7b453SJohn Marino return REG_MISSING; 1431*95b7b453SJohn Marino 1432*95b7b453SJohn Marino new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); 1433*95b7b453SJohn Marino if (BE (new_nodes == NULL, 0)) 1434*95b7b453SJohn Marino return REG_MISSING; 1435*95b7b453SJohn Marino dfa->nodes = new_nodes; 1436*95b7b453SJohn Marino new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc); 1437*95b7b453SJohn Marino new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc); 1438*95b7b453SJohn Marino new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); 1439*95b7b453SJohn Marino new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 1440*95b7b453SJohn Marino if (BE (new_nexts == NULL || new_indices == NULL 1441*95b7b453SJohn Marino || new_edests == NULL || new_eclosures == NULL, 0)) 1442*95b7b453SJohn Marino return REG_MISSING; 1443*95b7b453SJohn Marino dfa->nexts = new_nexts; 1444*95b7b453SJohn Marino dfa->org_indices = new_indices; 1445*95b7b453SJohn Marino dfa->edests = new_edests; 1446*95b7b453SJohn Marino dfa->eclosures = new_eclosures; 1447*95b7b453SJohn Marino dfa->nodes_alloc = new_nodes_alloc; 1448*95b7b453SJohn Marino } 1449*95b7b453SJohn Marino dfa->nodes[dfa->nodes_len] = token; 1450*95b7b453SJohn Marino dfa->nodes[dfa->nodes_len].constraint = 0; 1451*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 1452*95b7b453SJohn Marino { 1453*95b7b453SJohn Marino int type = token.type; 1454*95b7b453SJohn Marino dfa->nodes[dfa->nodes_len].accept_mb = 1455*95b7b453SJohn Marino (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; 1456*95b7b453SJohn Marino } 1457*95b7b453SJohn Marino #endif 1458*95b7b453SJohn Marino dfa->nexts[dfa->nodes_len] = REG_MISSING; 1459*95b7b453SJohn Marino re_node_set_init_empty (dfa->edests + dfa->nodes_len); 1460*95b7b453SJohn Marino re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 1461*95b7b453SJohn Marino return dfa->nodes_len++; 1462*95b7b453SJohn Marino } 1463*95b7b453SJohn Marino 1464*95b7b453SJohn Marino static inline re_hashval_t 1465*95b7b453SJohn Marino internal_function 1466*95b7b453SJohn Marino calc_state_hash (const re_node_set *nodes, unsigned int context) 1467*95b7b453SJohn Marino { 1468*95b7b453SJohn Marino re_hashval_t hash = nodes->nelem + context; 1469*95b7b453SJohn Marino Idx i; 1470*95b7b453SJohn Marino for (i = 0 ; i < nodes->nelem ; i++) 1471*95b7b453SJohn Marino hash += nodes->elems[i]; 1472*95b7b453SJohn Marino return hash; 1473*95b7b453SJohn Marino } 1474*95b7b453SJohn Marino 1475*95b7b453SJohn Marino /* Search for the state whose node_set is equivalent to NODES. 1476*95b7b453SJohn Marino Return the pointer to the state, if we found it in the DFA. 1477*95b7b453SJohn Marino Otherwise create the new one and return it. In case of an error 1478*95b7b453SJohn Marino return NULL and set the error code in ERR. 1479*95b7b453SJohn Marino Note: - We assume NULL as the invalid state, then it is possible that 1480*95b7b453SJohn Marino return value is NULL and ERR is REG_NOERROR. 1481*95b7b453SJohn Marino - We never return non-NULL value in case of any errors, it is for 1482*95b7b453SJohn Marino optimization. */ 1483*95b7b453SJohn Marino 1484*95b7b453SJohn Marino static re_dfastate_t * 1485*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 1486*95b7b453SJohn Marino re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa, 1487*95b7b453SJohn Marino const re_node_set *nodes) 1488*95b7b453SJohn Marino { 1489*95b7b453SJohn Marino re_hashval_t hash; 1490*95b7b453SJohn Marino re_dfastate_t *new_state; 1491*95b7b453SJohn Marino struct re_state_table_entry *spot; 1492*95b7b453SJohn Marino Idx i; 1493*95b7b453SJohn Marino #ifdef lint 1494*95b7b453SJohn Marino /* Suppress bogus uninitialized-variable warnings. */ 1495*95b7b453SJohn Marino *err = REG_NOERROR; 1496*95b7b453SJohn Marino #endif 1497*95b7b453SJohn Marino if (BE (nodes->nelem == 0, 0)) 1498*95b7b453SJohn Marino { 1499*95b7b453SJohn Marino *err = REG_NOERROR; 1500*95b7b453SJohn Marino return NULL; 1501*95b7b453SJohn Marino } 1502*95b7b453SJohn Marino hash = calc_state_hash (nodes, 0); 1503*95b7b453SJohn Marino spot = dfa->state_table + (hash & dfa->state_hash_mask); 1504*95b7b453SJohn Marino 1505*95b7b453SJohn Marino for (i = 0 ; i < spot->num ; i++) 1506*95b7b453SJohn Marino { 1507*95b7b453SJohn Marino re_dfastate_t *state = spot->array[i]; 1508*95b7b453SJohn Marino if (hash != state->hash) 1509*95b7b453SJohn Marino continue; 1510*95b7b453SJohn Marino if (re_node_set_compare (&state->nodes, nodes)) 1511*95b7b453SJohn Marino return state; 1512*95b7b453SJohn Marino } 1513*95b7b453SJohn Marino 1514*95b7b453SJohn Marino /* There are no appropriate state in the dfa, create the new one. */ 1515*95b7b453SJohn Marino new_state = create_ci_newstate (dfa, nodes, hash); 1516*95b7b453SJohn Marino if (BE (new_state == NULL, 0)) 1517*95b7b453SJohn Marino *err = REG_ESPACE; 1518*95b7b453SJohn Marino 1519*95b7b453SJohn Marino return new_state; 1520*95b7b453SJohn Marino } 1521*95b7b453SJohn Marino 1522*95b7b453SJohn Marino /* Search for the state whose node_set is equivalent to NODES and 1523*95b7b453SJohn Marino whose context is equivalent to CONTEXT. 1524*95b7b453SJohn Marino Return the pointer to the state, if we found it in the DFA. 1525*95b7b453SJohn Marino Otherwise create the new one and return it. In case of an error 1526*95b7b453SJohn Marino return NULL and set the error code in ERR. 1527*95b7b453SJohn Marino Note: - We assume NULL as the invalid state, then it is possible that 1528*95b7b453SJohn Marino return value is NULL and ERR is REG_NOERROR. 1529*95b7b453SJohn Marino - We never return non-NULL value in case of any errors, it is for 1530*95b7b453SJohn Marino optimization. */ 1531*95b7b453SJohn Marino 1532*95b7b453SJohn Marino static re_dfastate_t * 1533*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 1534*95b7b453SJohn Marino re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa, 1535*95b7b453SJohn Marino const re_node_set *nodes, unsigned int context) 1536*95b7b453SJohn Marino { 1537*95b7b453SJohn Marino re_hashval_t hash; 1538*95b7b453SJohn Marino re_dfastate_t *new_state; 1539*95b7b453SJohn Marino struct re_state_table_entry *spot; 1540*95b7b453SJohn Marino Idx i; 1541*95b7b453SJohn Marino #ifdef lint 1542*95b7b453SJohn Marino /* Suppress bogus uninitialized-variable warnings. */ 1543*95b7b453SJohn Marino *err = REG_NOERROR; 1544*95b7b453SJohn Marino #endif 1545*95b7b453SJohn Marino if (nodes->nelem == 0) 1546*95b7b453SJohn Marino { 1547*95b7b453SJohn Marino *err = REG_NOERROR; 1548*95b7b453SJohn Marino return NULL; 1549*95b7b453SJohn Marino } 1550*95b7b453SJohn Marino hash = calc_state_hash (nodes, context); 1551*95b7b453SJohn Marino spot = dfa->state_table + (hash & dfa->state_hash_mask); 1552*95b7b453SJohn Marino 1553*95b7b453SJohn Marino for (i = 0 ; i < spot->num ; i++) 1554*95b7b453SJohn Marino { 1555*95b7b453SJohn Marino re_dfastate_t *state = spot->array[i]; 1556*95b7b453SJohn Marino if (state->hash == hash 1557*95b7b453SJohn Marino && state->context == context 1558*95b7b453SJohn Marino && re_node_set_compare (state->entrance_nodes, nodes)) 1559*95b7b453SJohn Marino return state; 1560*95b7b453SJohn Marino } 1561*95b7b453SJohn Marino /* There are no appropriate state in `dfa', create the new one. */ 1562*95b7b453SJohn Marino new_state = create_cd_newstate (dfa, nodes, context, hash); 1563*95b7b453SJohn Marino if (BE (new_state == NULL, 0)) 1564*95b7b453SJohn Marino *err = REG_ESPACE; 1565*95b7b453SJohn Marino 1566*95b7b453SJohn Marino return new_state; 1567*95b7b453SJohn Marino } 1568*95b7b453SJohn Marino 1569*95b7b453SJohn Marino /* Finish initialization of the new state NEWSTATE, and using its hash value 1570*95b7b453SJohn Marino HASH put in the appropriate bucket of DFA's state table. Return value 1571*95b7b453SJohn Marino indicates the error code if failed. */ 1572*95b7b453SJohn Marino 1573*95b7b453SJohn Marino static reg_errcode_t 1574*95b7b453SJohn Marino __attribute_warn_unused_result__ 1575*95b7b453SJohn Marino register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, 1576*95b7b453SJohn Marino re_hashval_t hash) 1577*95b7b453SJohn Marino { 1578*95b7b453SJohn Marino struct re_state_table_entry *spot; 1579*95b7b453SJohn Marino reg_errcode_t err; 1580*95b7b453SJohn Marino Idx i; 1581*95b7b453SJohn Marino 1582*95b7b453SJohn Marino newstate->hash = hash; 1583*95b7b453SJohn Marino err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 1584*95b7b453SJohn Marino if (BE (err != REG_NOERROR, 0)) 1585*95b7b453SJohn Marino return REG_ESPACE; 1586*95b7b453SJohn Marino for (i = 0; i < newstate->nodes.nelem; i++) 1587*95b7b453SJohn Marino { 1588*95b7b453SJohn Marino Idx elem = newstate->nodes.elems[i]; 1589*95b7b453SJohn Marino if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1590*95b7b453SJohn Marino if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0)) 1591*95b7b453SJohn Marino return REG_ESPACE; 1592*95b7b453SJohn Marino } 1593*95b7b453SJohn Marino 1594*95b7b453SJohn Marino spot = dfa->state_table + (hash & dfa->state_hash_mask); 1595*95b7b453SJohn Marino if (BE (spot->alloc <= spot->num, 0)) 1596*95b7b453SJohn Marino { 1597*95b7b453SJohn Marino Idx new_alloc = 2 * spot->num + 2; 1598*95b7b453SJohn Marino re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *, 1599*95b7b453SJohn Marino new_alloc); 1600*95b7b453SJohn Marino if (BE (new_array == NULL, 0)) 1601*95b7b453SJohn Marino return REG_ESPACE; 1602*95b7b453SJohn Marino spot->array = new_array; 1603*95b7b453SJohn Marino spot->alloc = new_alloc; 1604*95b7b453SJohn Marino } 1605*95b7b453SJohn Marino spot->array[spot->num++] = newstate; 1606*95b7b453SJohn Marino return REG_NOERROR; 1607*95b7b453SJohn Marino } 1608*95b7b453SJohn Marino 1609*95b7b453SJohn Marino static void 1610*95b7b453SJohn Marino free_state (re_dfastate_t *state) 1611*95b7b453SJohn Marino { 1612*95b7b453SJohn Marino re_node_set_free (&state->non_eps_nodes); 1613*95b7b453SJohn Marino re_node_set_free (&state->inveclosure); 1614*95b7b453SJohn Marino if (state->entrance_nodes != &state->nodes) 1615*95b7b453SJohn Marino { 1616*95b7b453SJohn Marino re_node_set_free (state->entrance_nodes); 1617*95b7b453SJohn Marino re_free (state->entrance_nodes); 1618*95b7b453SJohn Marino } 1619*95b7b453SJohn Marino re_node_set_free (&state->nodes); 1620*95b7b453SJohn Marino re_free (state->word_trtable); 1621*95b7b453SJohn Marino re_free (state->trtable); 1622*95b7b453SJohn Marino re_free (state); 1623*95b7b453SJohn Marino } 1624*95b7b453SJohn Marino 1625*95b7b453SJohn Marino /* Create the new state which is independ of contexts. 1626*95b7b453SJohn Marino Return the new state if succeeded, otherwise return NULL. */ 1627*95b7b453SJohn Marino 1628*95b7b453SJohn Marino static re_dfastate_t * 1629*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 1630*95b7b453SJohn Marino create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1631*95b7b453SJohn Marino re_hashval_t hash) 1632*95b7b453SJohn Marino { 1633*95b7b453SJohn Marino Idx i; 1634*95b7b453SJohn Marino reg_errcode_t err; 1635*95b7b453SJohn Marino re_dfastate_t *newstate; 1636*95b7b453SJohn Marino 1637*95b7b453SJohn Marino newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1638*95b7b453SJohn Marino if (BE (newstate == NULL, 0)) 1639*95b7b453SJohn Marino return NULL; 1640*95b7b453SJohn Marino err = re_node_set_init_copy (&newstate->nodes, nodes); 1641*95b7b453SJohn Marino if (BE (err != REG_NOERROR, 0)) 1642*95b7b453SJohn Marino { 1643*95b7b453SJohn Marino re_free (newstate); 1644*95b7b453SJohn Marino return NULL; 1645*95b7b453SJohn Marino } 1646*95b7b453SJohn Marino 1647*95b7b453SJohn Marino newstate->entrance_nodes = &newstate->nodes; 1648*95b7b453SJohn Marino for (i = 0 ; i < nodes->nelem ; i++) 1649*95b7b453SJohn Marino { 1650*95b7b453SJohn Marino re_token_t *node = dfa->nodes + nodes->elems[i]; 1651*95b7b453SJohn Marino re_token_type_t type = node->type; 1652*95b7b453SJohn Marino if (type == CHARACTER && !node->constraint) 1653*95b7b453SJohn Marino continue; 1654*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 1655*95b7b453SJohn Marino newstate->accept_mb |= node->accept_mb; 1656*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 1657*95b7b453SJohn Marino 1658*95b7b453SJohn Marino /* If the state has the halt node, the state is a halt state. */ 1659*95b7b453SJohn Marino if (type == END_OF_RE) 1660*95b7b453SJohn Marino newstate->halt = 1; 1661*95b7b453SJohn Marino else if (type == OP_BACK_REF) 1662*95b7b453SJohn Marino newstate->has_backref = 1; 1663*95b7b453SJohn Marino else if (type == ANCHOR || node->constraint) 1664*95b7b453SJohn Marino newstate->has_constraint = 1; 1665*95b7b453SJohn Marino } 1666*95b7b453SJohn Marino err = register_state (dfa, newstate, hash); 1667*95b7b453SJohn Marino if (BE (err != REG_NOERROR, 0)) 1668*95b7b453SJohn Marino { 1669*95b7b453SJohn Marino free_state (newstate); 1670*95b7b453SJohn Marino newstate = NULL; 1671*95b7b453SJohn Marino } 1672*95b7b453SJohn Marino return newstate; 1673*95b7b453SJohn Marino } 1674*95b7b453SJohn Marino 1675*95b7b453SJohn Marino /* Create the new state which is depend on the context CONTEXT. 1676*95b7b453SJohn Marino Return the new state if succeeded, otherwise return NULL. */ 1677*95b7b453SJohn Marino 1678*95b7b453SJohn Marino static re_dfastate_t * 1679*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__ 1680*95b7b453SJohn Marino create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1681*95b7b453SJohn Marino unsigned int context, re_hashval_t hash) 1682*95b7b453SJohn Marino { 1683*95b7b453SJohn Marino Idx i, nctx_nodes = 0; 1684*95b7b453SJohn Marino reg_errcode_t err; 1685*95b7b453SJohn Marino re_dfastate_t *newstate; 1686*95b7b453SJohn Marino 1687*95b7b453SJohn Marino newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1688*95b7b453SJohn Marino if (BE (newstate == NULL, 0)) 1689*95b7b453SJohn Marino return NULL; 1690*95b7b453SJohn Marino err = re_node_set_init_copy (&newstate->nodes, nodes); 1691*95b7b453SJohn Marino if (BE (err != REG_NOERROR, 0)) 1692*95b7b453SJohn Marino { 1693*95b7b453SJohn Marino re_free (newstate); 1694*95b7b453SJohn Marino return NULL; 1695*95b7b453SJohn Marino } 1696*95b7b453SJohn Marino 1697*95b7b453SJohn Marino newstate->context = context; 1698*95b7b453SJohn Marino newstate->entrance_nodes = &newstate->nodes; 1699*95b7b453SJohn Marino 1700*95b7b453SJohn Marino for (i = 0 ; i < nodes->nelem ; i++) 1701*95b7b453SJohn Marino { 1702*95b7b453SJohn Marino re_token_t *node = dfa->nodes + nodes->elems[i]; 1703*95b7b453SJohn Marino re_token_type_t type = node->type; 1704*95b7b453SJohn Marino unsigned int constraint = node->constraint; 1705*95b7b453SJohn Marino 1706*95b7b453SJohn Marino if (type == CHARACTER && !constraint) 1707*95b7b453SJohn Marino continue; 1708*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N 1709*95b7b453SJohn Marino newstate->accept_mb |= node->accept_mb; 1710*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */ 1711*95b7b453SJohn Marino 1712*95b7b453SJohn Marino /* If the state has the halt node, the state is a halt state. */ 1713*95b7b453SJohn Marino if (type == END_OF_RE) 1714*95b7b453SJohn Marino newstate->halt = 1; 1715*95b7b453SJohn Marino else if (type == OP_BACK_REF) 1716*95b7b453SJohn Marino newstate->has_backref = 1; 1717*95b7b453SJohn Marino 1718*95b7b453SJohn Marino if (constraint) 1719*95b7b453SJohn Marino { 1720*95b7b453SJohn Marino if (newstate->entrance_nodes == &newstate->nodes) 1721*95b7b453SJohn Marino { 1722*95b7b453SJohn Marino newstate->entrance_nodes = re_malloc (re_node_set, 1); 1723*95b7b453SJohn Marino if (BE (newstate->entrance_nodes == NULL, 0)) 1724*95b7b453SJohn Marino { 1725*95b7b453SJohn Marino free_state (newstate); 1726*95b7b453SJohn Marino return NULL; 1727*95b7b453SJohn Marino } 1728*95b7b453SJohn Marino if (re_node_set_init_copy (newstate->entrance_nodes, nodes) 1729*95b7b453SJohn Marino != REG_NOERROR) 1730*95b7b453SJohn Marino return NULL; 1731*95b7b453SJohn Marino nctx_nodes = 0; 1732*95b7b453SJohn Marino newstate->has_constraint = 1; 1733*95b7b453SJohn Marino } 1734*95b7b453SJohn Marino 1735*95b7b453SJohn Marino if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1736*95b7b453SJohn Marino { 1737*95b7b453SJohn Marino re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1738*95b7b453SJohn Marino ++nctx_nodes; 1739*95b7b453SJohn Marino } 1740*95b7b453SJohn Marino } 1741*95b7b453SJohn Marino } 1742*95b7b453SJohn Marino err = register_state (dfa, newstate, hash); 1743*95b7b453SJohn Marino if (BE (err != REG_NOERROR, 0)) 1744*95b7b453SJohn Marino { 1745*95b7b453SJohn Marino free_state (newstate); 1746*95b7b453SJohn Marino newstate = NULL; 1747*95b7b453SJohn Marino } 1748*95b7b453SJohn Marino return newstate; 1749*95b7b453SJohn Marino } 1750