xref: /dflybsd-src/contrib/grep/lib/regex_internal.c (revision 95b7b45377d922df3f9cbf9a1f160c77aa3555fa)
1*95b7b453SJohn Marino /* -*- buffer-read-only: t -*- vi: set ro: */
2*95b7b453SJohn Marino /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3*95b7b453SJohn Marino /* Extended regular expression matching and search library.
4*95b7b453SJohn Marino    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
5*95b7b453SJohn Marino    Software Foundation, Inc.
6*95b7b453SJohn Marino    This file is part of the GNU C Library.
7*95b7b453SJohn Marino    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8*95b7b453SJohn Marino 
9*95b7b453SJohn Marino    This program is free software; you can redistribute it and/or modify
10*95b7b453SJohn Marino    it under the terms of the GNU General Public License as published by
11*95b7b453SJohn Marino    the Free Software Foundation; either version 3, or (at your option)
12*95b7b453SJohn Marino    any later version.
13*95b7b453SJohn Marino 
14*95b7b453SJohn Marino    This program is distributed in the hope that it will be useful,
15*95b7b453SJohn Marino    but WITHOUT ANY WARRANTY; without even the implied warranty of
16*95b7b453SJohn Marino    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17*95b7b453SJohn Marino    GNU General Public License for more details.
18*95b7b453SJohn Marino 
19*95b7b453SJohn Marino    You should have received a copy of the GNU General Public License along
20*95b7b453SJohn Marino    with this program; if not, write to the Free Software Foundation,
21*95b7b453SJohn Marino    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22*95b7b453SJohn Marino 
23*95b7b453SJohn Marino #include "verify.h"
24*95b7b453SJohn Marino #include "intprops.h"
25*95b7b453SJohn Marino static void re_string_construct_common (const char *str, Idx len,
26*95b7b453SJohn Marino 					re_string_t *pstr,
27*95b7b453SJohn Marino 					RE_TRANSLATE_TYPE trans, bool icase,
28*95b7b453SJohn Marino 					const re_dfa_t *dfa) internal_function;
29*95b7b453SJohn Marino static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
30*95b7b453SJohn Marino 					  const re_node_set *nodes,
31*95b7b453SJohn Marino 					  re_hashval_t hash) internal_function;
32*95b7b453SJohn Marino static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
33*95b7b453SJohn Marino 					  const re_node_set *nodes,
34*95b7b453SJohn Marino 					  unsigned int context,
35*95b7b453SJohn Marino 					  re_hashval_t hash) internal_function;
36*95b7b453SJohn Marino 
37*95b7b453SJohn Marino /* Functions for string operation.  */
38*95b7b453SJohn Marino 
39*95b7b453SJohn Marino /* This function allocate the buffers.  It is necessary to call
40*95b7b453SJohn Marino    re_string_reconstruct before using the object.  */
41*95b7b453SJohn Marino 
42*95b7b453SJohn Marino static reg_errcode_t
43*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
44*95b7b453SJohn Marino re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
45*95b7b453SJohn Marino 		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
46*95b7b453SJohn Marino {
47*95b7b453SJohn Marino   reg_errcode_t ret;
48*95b7b453SJohn Marino   Idx init_buf_len;
49*95b7b453SJohn Marino 
50*95b7b453SJohn Marino   /* Ensure at least one character fits into the buffers.  */
51*95b7b453SJohn Marino   if (init_len < dfa->mb_cur_max)
52*95b7b453SJohn Marino     init_len = dfa->mb_cur_max;
53*95b7b453SJohn Marino   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
54*95b7b453SJohn Marino   re_string_construct_common (str, len, pstr, trans, icase, dfa);
55*95b7b453SJohn Marino 
56*95b7b453SJohn Marino   ret = re_string_realloc_buffers (pstr, init_buf_len);
57*95b7b453SJohn Marino   if (BE (ret != REG_NOERROR, 0))
58*95b7b453SJohn Marino     return ret;
59*95b7b453SJohn Marino 
60*95b7b453SJohn Marino   pstr->word_char = dfa->word_char;
61*95b7b453SJohn Marino   pstr->word_ops_used = dfa->word_ops_used;
62*95b7b453SJohn Marino   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
63*95b7b453SJohn Marino   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
64*95b7b453SJohn Marino   pstr->valid_raw_len = pstr->valid_len;
65*95b7b453SJohn Marino   return REG_NOERROR;
66*95b7b453SJohn Marino }
67*95b7b453SJohn Marino 
68*95b7b453SJohn Marino /* This function allocate the buffers, and initialize them.  */
69*95b7b453SJohn Marino 
70*95b7b453SJohn Marino static reg_errcode_t
71*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
72*95b7b453SJohn Marino re_string_construct (re_string_t *pstr, const char *str, Idx len,
73*95b7b453SJohn Marino 		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
74*95b7b453SJohn Marino {
75*95b7b453SJohn Marino   reg_errcode_t ret;
76*95b7b453SJohn Marino   memset (pstr, '\0', sizeof (re_string_t));
77*95b7b453SJohn Marino   re_string_construct_common (str, len, pstr, trans, icase, dfa);
78*95b7b453SJohn Marino 
79*95b7b453SJohn Marino   if (len > 0)
80*95b7b453SJohn Marino     {
81*95b7b453SJohn Marino       ret = re_string_realloc_buffers (pstr, len + 1);
82*95b7b453SJohn Marino       if (BE (ret != REG_NOERROR, 0))
83*95b7b453SJohn Marino 	return ret;
84*95b7b453SJohn Marino     }
85*95b7b453SJohn Marino   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
86*95b7b453SJohn Marino 
87*95b7b453SJohn Marino   if (icase)
88*95b7b453SJohn Marino     {
89*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
90*95b7b453SJohn Marino       if (dfa->mb_cur_max > 1)
91*95b7b453SJohn Marino 	{
92*95b7b453SJohn Marino 	  while (1)
93*95b7b453SJohn Marino 	    {
94*95b7b453SJohn Marino 	      ret = build_wcs_upper_buffer (pstr);
95*95b7b453SJohn Marino 	      if (BE (ret != REG_NOERROR, 0))
96*95b7b453SJohn Marino 		return ret;
97*95b7b453SJohn Marino 	      if (pstr->valid_raw_len >= len)
98*95b7b453SJohn Marino 		break;
99*95b7b453SJohn Marino 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
100*95b7b453SJohn Marino 		break;
101*95b7b453SJohn Marino 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
102*95b7b453SJohn Marino 	      if (BE (ret != REG_NOERROR, 0))
103*95b7b453SJohn Marino 		return ret;
104*95b7b453SJohn Marino 	    }
105*95b7b453SJohn Marino 	}
106*95b7b453SJohn Marino       else
107*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N  */
108*95b7b453SJohn Marino 	build_upper_buffer (pstr);
109*95b7b453SJohn Marino     }
110*95b7b453SJohn Marino   else
111*95b7b453SJohn Marino     {
112*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
113*95b7b453SJohn Marino       if (dfa->mb_cur_max > 1)
114*95b7b453SJohn Marino 	build_wcs_buffer (pstr);
115*95b7b453SJohn Marino       else
116*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N  */
117*95b7b453SJohn Marino 	{
118*95b7b453SJohn Marino 	  if (trans != NULL)
119*95b7b453SJohn Marino 	    re_string_translate_buffer (pstr);
120*95b7b453SJohn Marino 	  else
121*95b7b453SJohn Marino 	    {
122*95b7b453SJohn Marino 	      pstr->valid_len = pstr->bufs_len;
123*95b7b453SJohn Marino 	      pstr->valid_raw_len = pstr->bufs_len;
124*95b7b453SJohn Marino 	    }
125*95b7b453SJohn Marino 	}
126*95b7b453SJohn Marino     }
127*95b7b453SJohn Marino 
128*95b7b453SJohn Marino   return REG_NOERROR;
129*95b7b453SJohn Marino }
130*95b7b453SJohn Marino 
131*95b7b453SJohn Marino /* Helper functions for re_string_allocate, and re_string_construct.  */
132*95b7b453SJohn Marino 
133*95b7b453SJohn Marino static reg_errcode_t
134*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
135*95b7b453SJohn Marino re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
136*95b7b453SJohn Marino {
137*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
138*95b7b453SJohn Marino   if (pstr->mb_cur_max > 1)
139*95b7b453SJohn Marino     {
140*95b7b453SJohn Marino       wint_t *new_wcs;
141*95b7b453SJohn Marino 
142*95b7b453SJohn Marino       /* Avoid overflow.  */
143*95b7b453SJohn Marino       size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
144*95b7b453SJohn Marino       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
145*95b7b453SJohn Marino 	return REG_ESPACE;
146*95b7b453SJohn Marino 
147*95b7b453SJohn Marino       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
148*95b7b453SJohn Marino       if (BE (new_wcs == NULL, 0))
149*95b7b453SJohn Marino 	return REG_ESPACE;
150*95b7b453SJohn Marino       pstr->wcs = new_wcs;
151*95b7b453SJohn Marino       if (pstr->offsets != NULL)
152*95b7b453SJohn Marino 	{
153*95b7b453SJohn Marino 	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
154*95b7b453SJohn Marino 	  if (BE (new_offsets == NULL, 0))
155*95b7b453SJohn Marino 	    return REG_ESPACE;
156*95b7b453SJohn Marino 	  pstr->offsets = new_offsets;
157*95b7b453SJohn Marino 	}
158*95b7b453SJohn Marino     }
159*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N  */
160*95b7b453SJohn Marino   if (pstr->mbs_allocated)
161*95b7b453SJohn Marino     {
162*95b7b453SJohn Marino       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
163*95b7b453SJohn Marino 					   new_buf_len);
164*95b7b453SJohn Marino       if (BE (new_mbs == NULL, 0))
165*95b7b453SJohn Marino 	return REG_ESPACE;
166*95b7b453SJohn Marino       pstr->mbs = new_mbs;
167*95b7b453SJohn Marino     }
168*95b7b453SJohn Marino   pstr->bufs_len = new_buf_len;
169*95b7b453SJohn Marino   return REG_NOERROR;
170*95b7b453SJohn Marino }
171*95b7b453SJohn Marino 
172*95b7b453SJohn Marino 
173*95b7b453SJohn Marino static void
174*95b7b453SJohn Marino internal_function
175*95b7b453SJohn Marino re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
176*95b7b453SJohn Marino 			    RE_TRANSLATE_TYPE trans, bool icase,
177*95b7b453SJohn Marino 			    const re_dfa_t *dfa)
178*95b7b453SJohn Marino {
179*95b7b453SJohn Marino   pstr->raw_mbs = (const unsigned char *) str;
180*95b7b453SJohn Marino   pstr->len = len;
181*95b7b453SJohn Marino   pstr->raw_len = len;
182*95b7b453SJohn Marino   pstr->trans = trans;
183*95b7b453SJohn Marino   pstr->icase = icase;
184*95b7b453SJohn Marino   pstr->mbs_allocated = (trans != NULL || icase);
185*95b7b453SJohn Marino   pstr->mb_cur_max = dfa->mb_cur_max;
186*95b7b453SJohn Marino   pstr->is_utf8 = dfa->is_utf8;
187*95b7b453SJohn Marino   pstr->map_notascii = dfa->map_notascii;
188*95b7b453SJohn Marino   pstr->stop = pstr->len;
189*95b7b453SJohn Marino   pstr->raw_stop = pstr->stop;
190*95b7b453SJohn Marino }
191*95b7b453SJohn Marino 
192*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
193*95b7b453SJohn Marino 
194*95b7b453SJohn Marino /* Build wide character buffer PSTR->WCS.
195*95b7b453SJohn Marino    If the byte sequence of the string are:
196*95b7b453SJohn Marino      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
197*95b7b453SJohn Marino    Then wide character buffer will be:
198*95b7b453SJohn Marino      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
199*95b7b453SJohn Marino    We use WEOF for padding, they indicate that the position isn't
200*95b7b453SJohn Marino    a first byte of a multibyte character.
201*95b7b453SJohn Marino 
202*95b7b453SJohn Marino    Note that this function assumes PSTR->VALID_LEN elements are already
203*95b7b453SJohn Marino    built and starts from PSTR->VALID_LEN.  */
204*95b7b453SJohn Marino 
205*95b7b453SJohn Marino static void
206*95b7b453SJohn Marino internal_function
207*95b7b453SJohn Marino build_wcs_buffer (re_string_t *pstr)
208*95b7b453SJohn Marino {
209*95b7b453SJohn Marino #ifdef _LIBC
210*95b7b453SJohn Marino   unsigned char buf[MB_LEN_MAX];
211*95b7b453SJohn Marino   assert (MB_LEN_MAX >= pstr->mb_cur_max);
212*95b7b453SJohn Marino #else
213*95b7b453SJohn Marino   unsigned char buf[64];
214*95b7b453SJohn Marino #endif
215*95b7b453SJohn Marino   mbstate_t prev_st;
216*95b7b453SJohn Marino   Idx byte_idx, end_idx, remain_len;
217*95b7b453SJohn Marino   size_t mbclen;
218*95b7b453SJohn Marino 
219*95b7b453SJohn Marino   /* Build the buffers from pstr->valid_len to either pstr->len or
220*95b7b453SJohn Marino      pstr->bufs_len.  */
221*95b7b453SJohn Marino   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
222*95b7b453SJohn Marino   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
223*95b7b453SJohn Marino     {
224*95b7b453SJohn Marino       wchar_t wc;
225*95b7b453SJohn Marino       const char *p;
226*95b7b453SJohn Marino 
227*95b7b453SJohn Marino       remain_len = end_idx - byte_idx;
228*95b7b453SJohn Marino       prev_st = pstr->cur_state;
229*95b7b453SJohn Marino       /* Apply the translation if we need.  */
230*95b7b453SJohn Marino       if (BE (pstr->trans != NULL, 0))
231*95b7b453SJohn Marino 	{
232*95b7b453SJohn Marino 	  int i, ch;
233*95b7b453SJohn Marino 
234*95b7b453SJohn Marino 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
235*95b7b453SJohn Marino 	    {
236*95b7b453SJohn Marino 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
237*95b7b453SJohn Marino 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
238*95b7b453SJohn Marino 	    }
239*95b7b453SJohn Marino 	  p = (const char *) buf;
240*95b7b453SJohn Marino 	}
241*95b7b453SJohn Marino       else
242*95b7b453SJohn Marino 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
243*95b7b453SJohn Marino       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
244*95b7b453SJohn Marino       if (BE (mbclen == (size_t) -2, 0))
245*95b7b453SJohn Marino 	{
246*95b7b453SJohn Marino 	  /* The buffer doesn't have enough space, finish to build.  */
247*95b7b453SJohn Marino 	  pstr->cur_state = prev_st;
248*95b7b453SJohn Marino 	  break;
249*95b7b453SJohn Marino 	}
250*95b7b453SJohn Marino       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
251*95b7b453SJohn Marino 	{
252*95b7b453SJohn Marino 	  /* We treat these cases as a singlebyte character.  */
253*95b7b453SJohn Marino 	  mbclen = 1;
254*95b7b453SJohn Marino 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
255*95b7b453SJohn Marino 	  if (BE (pstr->trans != NULL, 0))
256*95b7b453SJohn Marino 	    wc = pstr->trans[wc];
257*95b7b453SJohn Marino 	  pstr->cur_state = prev_st;
258*95b7b453SJohn Marino 	}
259*95b7b453SJohn Marino 
260*95b7b453SJohn Marino       /* Write wide character and padding.  */
261*95b7b453SJohn Marino       pstr->wcs[byte_idx++] = wc;
262*95b7b453SJohn Marino       /* Write paddings.  */
263*95b7b453SJohn Marino       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
264*95b7b453SJohn Marino 	pstr->wcs[byte_idx++] = WEOF;
265*95b7b453SJohn Marino     }
266*95b7b453SJohn Marino   pstr->valid_len = byte_idx;
267*95b7b453SJohn Marino   pstr->valid_raw_len = byte_idx;
268*95b7b453SJohn Marino }
269*95b7b453SJohn Marino 
270*95b7b453SJohn Marino /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
271*95b7b453SJohn Marino    but for REG_ICASE.  */
272*95b7b453SJohn Marino 
273*95b7b453SJohn Marino static reg_errcode_t
274*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
275*95b7b453SJohn Marino build_wcs_upper_buffer (re_string_t *pstr)
276*95b7b453SJohn Marino {
277*95b7b453SJohn Marino   mbstate_t prev_st;
278*95b7b453SJohn Marino   Idx src_idx, byte_idx, end_idx, remain_len;
279*95b7b453SJohn Marino   size_t mbclen;
280*95b7b453SJohn Marino #ifdef _LIBC
281*95b7b453SJohn Marino   char buf[MB_LEN_MAX];
282*95b7b453SJohn Marino   assert (MB_LEN_MAX >= pstr->mb_cur_max);
283*95b7b453SJohn Marino #else
284*95b7b453SJohn Marino   char buf[64];
285*95b7b453SJohn Marino #endif
286*95b7b453SJohn Marino 
287*95b7b453SJohn Marino   byte_idx = pstr->valid_len;
288*95b7b453SJohn Marino   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
289*95b7b453SJohn Marino 
290*95b7b453SJohn Marino   /* The following optimization assumes that ASCII characters can be
291*95b7b453SJohn Marino      mapped to wide characters with a simple cast.  */
292*95b7b453SJohn Marino   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
293*95b7b453SJohn Marino     {
294*95b7b453SJohn Marino       while (byte_idx < end_idx)
295*95b7b453SJohn Marino 	{
296*95b7b453SJohn Marino 	  wchar_t wc;
297*95b7b453SJohn Marino 
298*95b7b453SJohn Marino 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
299*95b7b453SJohn Marino 	      && mbsinit (&pstr->cur_state))
300*95b7b453SJohn Marino 	    {
301*95b7b453SJohn Marino 	      /* In case of a singlebyte character.  */
302*95b7b453SJohn Marino 	      pstr->mbs[byte_idx]
303*95b7b453SJohn Marino 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
304*95b7b453SJohn Marino 	      /* The next step uses the assumption that wchar_t is encoded
305*95b7b453SJohn Marino 		 ASCII-safe: all ASCII values can be converted like this.  */
306*95b7b453SJohn Marino 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
307*95b7b453SJohn Marino 	      ++byte_idx;
308*95b7b453SJohn Marino 	      continue;
309*95b7b453SJohn Marino 	    }
310*95b7b453SJohn Marino 
311*95b7b453SJohn Marino 	  remain_len = end_idx - byte_idx;
312*95b7b453SJohn Marino 	  prev_st = pstr->cur_state;
313*95b7b453SJohn Marino 	  mbclen = __mbrtowc (&wc,
314*95b7b453SJohn Marino 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
315*95b7b453SJohn Marino 			       + byte_idx), remain_len, &pstr->cur_state);
316*95b7b453SJohn Marino 	  if (BE (mbclen < (size_t) -2, 1))
317*95b7b453SJohn Marino 	    {
318*95b7b453SJohn Marino 	      wchar_t wcu = wc;
319*95b7b453SJohn Marino 	      if (iswlower (wc))
320*95b7b453SJohn Marino 		{
321*95b7b453SJohn Marino 		  size_t mbcdlen;
322*95b7b453SJohn Marino 
323*95b7b453SJohn Marino 		  wcu = towupper (wc);
324*95b7b453SJohn Marino 		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
325*95b7b453SJohn Marino 		  if (BE (mbclen == mbcdlen, 1))
326*95b7b453SJohn Marino 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
327*95b7b453SJohn Marino 		  else
328*95b7b453SJohn Marino 		    {
329*95b7b453SJohn Marino 		      src_idx = byte_idx;
330*95b7b453SJohn Marino 		      goto offsets_needed;
331*95b7b453SJohn Marino 		    }
332*95b7b453SJohn Marino 		}
333*95b7b453SJohn Marino 	      else
334*95b7b453SJohn Marino 		memcpy (pstr->mbs + byte_idx,
335*95b7b453SJohn Marino 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
336*95b7b453SJohn Marino 	      pstr->wcs[byte_idx++] = wcu;
337*95b7b453SJohn Marino 	      /* Write paddings.  */
338*95b7b453SJohn Marino 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
339*95b7b453SJohn Marino 		pstr->wcs[byte_idx++] = WEOF;
340*95b7b453SJohn Marino 	    }
341*95b7b453SJohn Marino 	  else if (mbclen == (size_t) -1 || mbclen == 0)
342*95b7b453SJohn Marino 	    {
343*95b7b453SJohn Marino 	      /* It is an invalid character or '\0'.  Just use the byte.  */
344*95b7b453SJohn Marino 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
345*95b7b453SJohn Marino 	      pstr->mbs[byte_idx] = ch;
346*95b7b453SJohn Marino 	      /* And also cast it to wide char.  */
347*95b7b453SJohn Marino 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
348*95b7b453SJohn Marino 	      if (BE (mbclen == (size_t) -1, 0))
349*95b7b453SJohn Marino 		pstr->cur_state = prev_st;
350*95b7b453SJohn Marino 	    }
351*95b7b453SJohn Marino 	  else
352*95b7b453SJohn Marino 	    {
353*95b7b453SJohn Marino 	      /* The buffer doesn't have enough space, finish to build.  */
354*95b7b453SJohn Marino 	      pstr->cur_state = prev_st;
355*95b7b453SJohn Marino 	      break;
356*95b7b453SJohn Marino 	    }
357*95b7b453SJohn Marino 	}
358*95b7b453SJohn Marino       pstr->valid_len = byte_idx;
359*95b7b453SJohn Marino       pstr->valid_raw_len = byte_idx;
360*95b7b453SJohn Marino       return REG_NOERROR;
361*95b7b453SJohn Marino     }
362*95b7b453SJohn Marino   else
363*95b7b453SJohn Marino     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
364*95b7b453SJohn Marino       {
365*95b7b453SJohn Marino 	wchar_t wc;
366*95b7b453SJohn Marino 	const char *p;
367*95b7b453SJohn Marino       offsets_needed:
368*95b7b453SJohn Marino 	remain_len = end_idx - byte_idx;
369*95b7b453SJohn Marino 	prev_st = pstr->cur_state;
370*95b7b453SJohn Marino 	if (BE (pstr->trans != NULL, 0))
371*95b7b453SJohn Marino 	  {
372*95b7b453SJohn Marino 	    int i, ch;
373*95b7b453SJohn Marino 
374*95b7b453SJohn Marino 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
375*95b7b453SJohn Marino 	      {
376*95b7b453SJohn Marino 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
377*95b7b453SJohn Marino 		buf[i] = pstr->trans[ch];
378*95b7b453SJohn Marino 	      }
379*95b7b453SJohn Marino 	    p = (const char *) buf;
380*95b7b453SJohn Marino 	  }
381*95b7b453SJohn Marino 	else
382*95b7b453SJohn Marino 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
383*95b7b453SJohn Marino 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
384*95b7b453SJohn Marino 	if (BE (mbclen < (size_t) -2, 1))
385*95b7b453SJohn Marino 	  {
386*95b7b453SJohn Marino 	    wchar_t wcu = wc;
387*95b7b453SJohn Marino 	    if (iswlower (wc))
388*95b7b453SJohn Marino 	      {
389*95b7b453SJohn Marino 		size_t mbcdlen;
390*95b7b453SJohn Marino 
391*95b7b453SJohn Marino 		wcu = towupper (wc);
392*95b7b453SJohn Marino 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
393*95b7b453SJohn Marino 		if (BE (mbclen == mbcdlen, 1))
394*95b7b453SJohn Marino 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
395*95b7b453SJohn Marino 		else if (mbcdlen != (size_t) -1)
396*95b7b453SJohn Marino 		  {
397*95b7b453SJohn Marino 		    size_t i;
398*95b7b453SJohn Marino 
399*95b7b453SJohn Marino 		    if (byte_idx + mbcdlen > pstr->bufs_len)
400*95b7b453SJohn Marino 		      {
401*95b7b453SJohn Marino 			pstr->cur_state = prev_st;
402*95b7b453SJohn Marino 			break;
403*95b7b453SJohn Marino 		      }
404*95b7b453SJohn Marino 
405*95b7b453SJohn Marino 		    if (pstr->offsets == NULL)
406*95b7b453SJohn Marino 		      {
407*95b7b453SJohn Marino 			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
408*95b7b453SJohn Marino 
409*95b7b453SJohn Marino 			if (pstr->offsets == NULL)
410*95b7b453SJohn Marino 			  return REG_ESPACE;
411*95b7b453SJohn Marino 		      }
412*95b7b453SJohn Marino 		    if (!pstr->offsets_needed)
413*95b7b453SJohn Marino 		      {
414*95b7b453SJohn Marino 			for (i = 0; i < (size_t) byte_idx; ++i)
415*95b7b453SJohn Marino 			  pstr->offsets[i] = i;
416*95b7b453SJohn Marino 			pstr->offsets_needed = 1;
417*95b7b453SJohn Marino 		      }
418*95b7b453SJohn Marino 
419*95b7b453SJohn Marino 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
420*95b7b453SJohn Marino 		    pstr->wcs[byte_idx] = wcu;
421*95b7b453SJohn Marino 		    pstr->offsets[byte_idx] = src_idx;
422*95b7b453SJohn Marino 		    for (i = 1; i < mbcdlen; ++i)
423*95b7b453SJohn Marino 		      {
424*95b7b453SJohn Marino 			pstr->offsets[byte_idx + i]
425*95b7b453SJohn Marino 			  = src_idx + (i < mbclen ? i : mbclen - 1);
426*95b7b453SJohn Marino 			pstr->wcs[byte_idx + i] = WEOF;
427*95b7b453SJohn Marino 		      }
428*95b7b453SJohn Marino 		    pstr->len += mbcdlen - mbclen;
429*95b7b453SJohn Marino 		    if (pstr->raw_stop > src_idx)
430*95b7b453SJohn Marino 		      pstr->stop += mbcdlen - mbclen;
431*95b7b453SJohn Marino 		    end_idx = (pstr->bufs_len > pstr->len)
432*95b7b453SJohn Marino 			      ? pstr->len : pstr->bufs_len;
433*95b7b453SJohn Marino 		    byte_idx += mbcdlen;
434*95b7b453SJohn Marino 		    src_idx += mbclen;
435*95b7b453SJohn Marino 		    continue;
436*95b7b453SJohn Marino 		  }
437*95b7b453SJohn Marino 		else
438*95b7b453SJohn Marino 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
439*95b7b453SJohn Marino 	      }
440*95b7b453SJohn Marino 	    else
441*95b7b453SJohn Marino 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
442*95b7b453SJohn Marino 
443*95b7b453SJohn Marino 	    if (BE (pstr->offsets_needed != 0, 0))
444*95b7b453SJohn Marino 	      {
445*95b7b453SJohn Marino 		size_t i;
446*95b7b453SJohn Marino 		for (i = 0; i < mbclen; ++i)
447*95b7b453SJohn Marino 		  pstr->offsets[byte_idx + i] = src_idx + i;
448*95b7b453SJohn Marino 	      }
449*95b7b453SJohn Marino 	    src_idx += mbclen;
450*95b7b453SJohn Marino 
451*95b7b453SJohn Marino 	    pstr->wcs[byte_idx++] = wcu;
452*95b7b453SJohn Marino 	    /* Write paddings.  */
453*95b7b453SJohn Marino 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
454*95b7b453SJohn Marino 	      pstr->wcs[byte_idx++] = WEOF;
455*95b7b453SJohn Marino 	  }
456*95b7b453SJohn Marino 	else if (mbclen == (size_t) -1 || mbclen == 0)
457*95b7b453SJohn Marino 	  {
458*95b7b453SJohn Marino 	    /* It is an invalid character or '\0'.  Just use the byte.  */
459*95b7b453SJohn Marino 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
460*95b7b453SJohn Marino 
461*95b7b453SJohn Marino 	    if (BE (pstr->trans != NULL, 0))
462*95b7b453SJohn Marino 	      ch = pstr->trans [ch];
463*95b7b453SJohn Marino 	    pstr->mbs[byte_idx] = ch;
464*95b7b453SJohn Marino 
465*95b7b453SJohn Marino 	    if (BE (pstr->offsets_needed != 0, 0))
466*95b7b453SJohn Marino 	      pstr->offsets[byte_idx] = src_idx;
467*95b7b453SJohn Marino 	    ++src_idx;
468*95b7b453SJohn Marino 
469*95b7b453SJohn Marino 	    /* And also cast it to wide char.  */
470*95b7b453SJohn Marino 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
471*95b7b453SJohn Marino 	    if (BE (mbclen == (size_t) -1, 0))
472*95b7b453SJohn Marino 	      pstr->cur_state = prev_st;
473*95b7b453SJohn Marino 	  }
474*95b7b453SJohn Marino 	else
475*95b7b453SJohn Marino 	  {
476*95b7b453SJohn Marino 	    /* The buffer doesn't have enough space, finish to build.  */
477*95b7b453SJohn Marino 	    pstr->cur_state = prev_st;
478*95b7b453SJohn Marino 	    break;
479*95b7b453SJohn Marino 	  }
480*95b7b453SJohn Marino       }
481*95b7b453SJohn Marino   pstr->valid_len = byte_idx;
482*95b7b453SJohn Marino   pstr->valid_raw_len = src_idx;
483*95b7b453SJohn Marino   return REG_NOERROR;
484*95b7b453SJohn Marino }
485*95b7b453SJohn Marino 
486*95b7b453SJohn Marino /* Skip characters until the index becomes greater than NEW_RAW_IDX.
487*95b7b453SJohn Marino    Return the index.  */
488*95b7b453SJohn Marino 
489*95b7b453SJohn Marino static Idx
490*95b7b453SJohn Marino internal_function
491*95b7b453SJohn Marino re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
492*95b7b453SJohn Marino {
493*95b7b453SJohn Marino   mbstate_t prev_st;
494*95b7b453SJohn Marino   Idx rawbuf_idx;
495*95b7b453SJohn Marino   size_t mbclen;
496*95b7b453SJohn Marino   wint_t wc = WEOF;
497*95b7b453SJohn Marino 
498*95b7b453SJohn Marino   /* Skip the characters which are not necessary to check.  */
499*95b7b453SJohn Marino   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
500*95b7b453SJohn Marino        rawbuf_idx < new_raw_idx;)
501*95b7b453SJohn Marino     {
502*95b7b453SJohn Marino       wchar_t wc2;
503*95b7b453SJohn Marino       Idx remain_len;
504*95b7b453SJohn Marino       remain_len = pstr->len - rawbuf_idx;
505*95b7b453SJohn Marino       prev_st = pstr->cur_state;
506*95b7b453SJohn Marino       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
507*95b7b453SJohn Marino 			  remain_len, &pstr->cur_state);
508*95b7b453SJohn Marino       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
509*95b7b453SJohn Marino 	{
510*95b7b453SJohn Marino 	  /* We treat these cases as a single byte character.  */
511*95b7b453SJohn Marino 	  if (mbclen == 0 || remain_len == 0)
512*95b7b453SJohn Marino 	    wc = L'\0';
513*95b7b453SJohn Marino 	  else
514*95b7b453SJohn Marino 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
515*95b7b453SJohn Marino 	  mbclen = 1;
516*95b7b453SJohn Marino 	  pstr->cur_state = prev_st;
517*95b7b453SJohn Marino 	}
518*95b7b453SJohn Marino       else
519*95b7b453SJohn Marino 	wc = wc2;
520*95b7b453SJohn Marino       /* Then proceed the next character.  */
521*95b7b453SJohn Marino       rawbuf_idx += mbclen;
522*95b7b453SJohn Marino     }
523*95b7b453SJohn Marino   *last_wc = wc;
524*95b7b453SJohn Marino   return rawbuf_idx;
525*95b7b453SJohn Marino }
526*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N  */
527*95b7b453SJohn Marino 
528*95b7b453SJohn Marino /* Build the buffer PSTR->MBS, and apply the translation if we need.
529*95b7b453SJohn Marino    This function is used in case of REG_ICASE.  */
530*95b7b453SJohn Marino 
531*95b7b453SJohn Marino static void
532*95b7b453SJohn Marino internal_function
533*95b7b453SJohn Marino build_upper_buffer (re_string_t *pstr)
534*95b7b453SJohn Marino {
535*95b7b453SJohn Marino   Idx char_idx, end_idx;
536*95b7b453SJohn Marino   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
537*95b7b453SJohn Marino 
538*95b7b453SJohn Marino   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
539*95b7b453SJohn Marino     {
540*95b7b453SJohn Marino       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
541*95b7b453SJohn Marino       if (BE (pstr->trans != NULL, 0))
542*95b7b453SJohn Marino 	ch = pstr->trans[ch];
543*95b7b453SJohn Marino       if (islower (ch))
544*95b7b453SJohn Marino 	pstr->mbs[char_idx] = toupper (ch);
545*95b7b453SJohn Marino       else
546*95b7b453SJohn Marino 	pstr->mbs[char_idx] = ch;
547*95b7b453SJohn Marino     }
548*95b7b453SJohn Marino   pstr->valid_len = char_idx;
549*95b7b453SJohn Marino   pstr->valid_raw_len = char_idx;
550*95b7b453SJohn Marino }
551*95b7b453SJohn Marino 
552*95b7b453SJohn Marino /* Apply TRANS to the buffer in PSTR.  */
553*95b7b453SJohn Marino 
554*95b7b453SJohn Marino static void
555*95b7b453SJohn Marino internal_function
556*95b7b453SJohn Marino re_string_translate_buffer (re_string_t *pstr)
557*95b7b453SJohn Marino {
558*95b7b453SJohn Marino   Idx buf_idx, end_idx;
559*95b7b453SJohn Marino   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
560*95b7b453SJohn Marino 
561*95b7b453SJohn Marino   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
562*95b7b453SJohn Marino     {
563*95b7b453SJohn Marino       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
564*95b7b453SJohn Marino       pstr->mbs[buf_idx] = pstr->trans[ch];
565*95b7b453SJohn Marino     }
566*95b7b453SJohn Marino 
567*95b7b453SJohn Marino   pstr->valid_len = buf_idx;
568*95b7b453SJohn Marino   pstr->valid_raw_len = buf_idx;
569*95b7b453SJohn Marino }
570*95b7b453SJohn Marino 
571*95b7b453SJohn Marino /* This function re-construct the buffers.
572*95b7b453SJohn Marino    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
573*95b7b453SJohn Marino    convert to upper case in case of REG_ICASE, apply translation.  */
574*95b7b453SJohn Marino 
575*95b7b453SJohn Marino static reg_errcode_t
576*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
577*95b7b453SJohn Marino re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
578*95b7b453SJohn Marino {
579*95b7b453SJohn Marino   Idx offset;
580*95b7b453SJohn Marino 
581*95b7b453SJohn Marino   if (BE (pstr->raw_mbs_idx <= idx, 0))
582*95b7b453SJohn Marino     offset = idx - pstr->raw_mbs_idx;
583*95b7b453SJohn Marino   else
584*95b7b453SJohn Marino     {
585*95b7b453SJohn Marino       /* Reset buffer.  */
586*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
587*95b7b453SJohn Marino       if (pstr->mb_cur_max > 1)
588*95b7b453SJohn Marino 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
589*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
590*95b7b453SJohn Marino       pstr->len = pstr->raw_len;
591*95b7b453SJohn Marino       pstr->stop = pstr->raw_stop;
592*95b7b453SJohn Marino       pstr->valid_len = 0;
593*95b7b453SJohn Marino       pstr->raw_mbs_idx = 0;
594*95b7b453SJohn Marino       pstr->valid_raw_len = 0;
595*95b7b453SJohn Marino       pstr->offsets_needed = 0;
596*95b7b453SJohn Marino       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
597*95b7b453SJohn Marino 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
598*95b7b453SJohn Marino       if (!pstr->mbs_allocated)
599*95b7b453SJohn Marino 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
600*95b7b453SJohn Marino       offset = idx;
601*95b7b453SJohn Marino     }
602*95b7b453SJohn Marino 
603*95b7b453SJohn Marino   if (BE (offset != 0, 1))
604*95b7b453SJohn Marino     {
605*95b7b453SJohn Marino       /* Should the already checked characters be kept?  */
606*95b7b453SJohn Marino       if (BE (offset < pstr->valid_raw_len, 1))
607*95b7b453SJohn Marino 	{
608*95b7b453SJohn Marino 	  /* Yes, move them to the front of the buffer.  */
609*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
610*95b7b453SJohn Marino 	  if (BE (pstr->offsets_needed, 0))
611*95b7b453SJohn Marino 	    {
612*95b7b453SJohn Marino 	      Idx low = 0, high = pstr->valid_len, mid;
613*95b7b453SJohn Marino 	      do
614*95b7b453SJohn Marino 		{
615*95b7b453SJohn Marino 		  mid = (high + low) / 2;
616*95b7b453SJohn Marino 		  if (pstr->offsets[mid] > offset)
617*95b7b453SJohn Marino 		    high = mid;
618*95b7b453SJohn Marino 		  else if (pstr->offsets[mid] < offset)
619*95b7b453SJohn Marino 		    low = mid + 1;
620*95b7b453SJohn Marino 		  else
621*95b7b453SJohn Marino 		    break;
622*95b7b453SJohn Marino 		}
623*95b7b453SJohn Marino 	      while (low < high);
624*95b7b453SJohn Marino 	      if (pstr->offsets[mid] < offset)
625*95b7b453SJohn Marino 		++mid;
626*95b7b453SJohn Marino 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
627*95b7b453SJohn Marino 							eflags);
628*95b7b453SJohn Marino 	      /* This can be quite complicated, so handle specially
629*95b7b453SJohn Marino 		 only the common and easy case where the character with
630*95b7b453SJohn Marino 		 different length representation of lower and upper
631*95b7b453SJohn Marino 		 case is present at or after offset.  */
632*95b7b453SJohn Marino 	      if (pstr->valid_len > offset
633*95b7b453SJohn Marino 		  && mid == offset && pstr->offsets[mid] == offset)
634*95b7b453SJohn Marino 		{
635*95b7b453SJohn Marino 		  memmove (pstr->wcs, pstr->wcs + offset,
636*95b7b453SJohn Marino 			   (pstr->valid_len - offset) * sizeof (wint_t));
637*95b7b453SJohn Marino 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
638*95b7b453SJohn Marino 		  pstr->valid_len -= offset;
639*95b7b453SJohn Marino 		  pstr->valid_raw_len -= offset;
640*95b7b453SJohn Marino 		  for (low = 0; low < pstr->valid_len; low++)
641*95b7b453SJohn Marino 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
642*95b7b453SJohn Marino 		}
643*95b7b453SJohn Marino 	      else
644*95b7b453SJohn Marino 		{
645*95b7b453SJohn Marino 		  /* Otherwise, just find out how long the partial multibyte
646*95b7b453SJohn Marino 		     character at offset is and fill it with WEOF/255.  */
647*95b7b453SJohn Marino 		  pstr->len = pstr->raw_len - idx + offset;
648*95b7b453SJohn Marino 		  pstr->stop = pstr->raw_stop - idx + offset;
649*95b7b453SJohn Marino 		  pstr->offsets_needed = 0;
650*95b7b453SJohn Marino 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
651*95b7b453SJohn Marino 		    --mid;
652*95b7b453SJohn Marino 		  while (mid < pstr->valid_len)
653*95b7b453SJohn Marino 		    if (pstr->wcs[mid] != WEOF)
654*95b7b453SJohn Marino 		      break;
655*95b7b453SJohn Marino 		    else
656*95b7b453SJohn Marino 		      ++mid;
657*95b7b453SJohn Marino 		  if (mid == pstr->valid_len)
658*95b7b453SJohn Marino 		    pstr->valid_len = 0;
659*95b7b453SJohn Marino 		  else
660*95b7b453SJohn Marino 		    {
661*95b7b453SJohn Marino 		      pstr->valid_len = pstr->offsets[mid] - offset;
662*95b7b453SJohn Marino 		      if (pstr->valid_len)
663*95b7b453SJohn Marino 			{
664*95b7b453SJohn Marino 			  for (low = 0; low < pstr->valid_len; ++low)
665*95b7b453SJohn Marino 			    pstr->wcs[low] = WEOF;
666*95b7b453SJohn Marino 			  memset (pstr->mbs, 255, pstr->valid_len);
667*95b7b453SJohn Marino 			}
668*95b7b453SJohn Marino 		    }
669*95b7b453SJohn Marino 		  pstr->valid_raw_len = pstr->valid_len;
670*95b7b453SJohn Marino 		}
671*95b7b453SJohn Marino 	    }
672*95b7b453SJohn Marino 	  else
673*95b7b453SJohn Marino #endif
674*95b7b453SJohn Marino 	    {
675*95b7b453SJohn Marino 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
676*95b7b453SJohn Marino 							eflags);
677*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
678*95b7b453SJohn Marino 	      if (pstr->mb_cur_max > 1)
679*95b7b453SJohn Marino 		memmove (pstr->wcs, pstr->wcs + offset,
680*95b7b453SJohn Marino 			 (pstr->valid_len - offset) * sizeof (wint_t));
681*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
682*95b7b453SJohn Marino 	      if (BE (pstr->mbs_allocated, 0))
683*95b7b453SJohn Marino 		memmove (pstr->mbs, pstr->mbs + offset,
684*95b7b453SJohn Marino 			 pstr->valid_len - offset);
685*95b7b453SJohn Marino 	      pstr->valid_len -= offset;
686*95b7b453SJohn Marino 	      pstr->valid_raw_len -= offset;
687*95b7b453SJohn Marino #if DEBUG
688*95b7b453SJohn Marino 	      assert (pstr->valid_len > 0);
689*95b7b453SJohn Marino #endif
690*95b7b453SJohn Marino 	    }
691*95b7b453SJohn Marino 	}
692*95b7b453SJohn Marino       else
693*95b7b453SJohn Marino 	{
694*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
695*95b7b453SJohn Marino 	  /* No, skip all characters until IDX.  */
696*95b7b453SJohn Marino 	  Idx prev_valid_len = pstr->valid_len;
697*95b7b453SJohn Marino 
698*95b7b453SJohn Marino 	  if (BE (pstr->offsets_needed, 0))
699*95b7b453SJohn Marino 	    {
700*95b7b453SJohn Marino 	      pstr->len = pstr->raw_len - idx + offset;
701*95b7b453SJohn Marino 	      pstr->stop = pstr->raw_stop - idx + offset;
702*95b7b453SJohn Marino 	      pstr->offsets_needed = 0;
703*95b7b453SJohn Marino 	    }
704*95b7b453SJohn Marino #endif
705*95b7b453SJohn Marino 	  pstr->valid_len = 0;
706*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
707*95b7b453SJohn Marino 	  if (pstr->mb_cur_max > 1)
708*95b7b453SJohn Marino 	    {
709*95b7b453SJohn Marino 	      Idx wcs_idx;
710*95b7b453SJohn Marino 	      wint_t wc = WEOF;
711*95b7b453SJohn Marino 
712*95b7b453SJohn Marino 	      if (pstr->is_utf8)
713*95b7b453SJohn Marino 		{
714*95b7b453SJohn Marino 		  const unsigned char *raw, *p, *end;
715*95b7b453SJohn Marino 
716*95b7b453SJohn Marino 		  /* Special case UTF-8.  Multi-byte chars start with any
717*95b7b453SJohn Marino 		     byte other than 0x80 - 0xbf.  */
718*95b7b453SJohn Marino 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
719*95b7b453SJohn Marino 		  end = raw + (offset - pstr->mb_cur_max);
720*95b7b453SJohn Marino 		  if (end < pstr->raw_mbs)
721*95b7b453SJohn Marino 		    end = pstr->raw_mbs;
722*95b7b453SJohn Marino 		  p = raw + offset - 1;
723*95b7b453SJohn Marino #ifdef _LIBC
724*95b7b453SJohn Marino 		  /* We know the wchar_t encoding is UCS4, so for the simple
725*95b7b453SJohn Marino 		     case, ASCII characters, skip the conversion step.  */
726*95b7b453SJohn Marino 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
727*95b7b453SJohn Marino 		    {
728*95b7b453SJohn Marino 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
729*95b7b453SJohn Marino 		      /* pstr->valid_len = 0; */
730*95b7b453SJohn Marino 		      wc = (wchar_t) *p;
731*95b7b453SJohn Marino 		    }
732*95b7b453SJohn Marino 		  else
733*95b7b453SJohn Marino #endif
734*95b7b453SJohn Marino 		    for (; p >= end; --p)
735*95b7b453SJohn Marino 		      if ((*p & 0xc0) != 0x80)
736*95b7b453SJohn Marino 			{
737*95b7b453SJohn Marino 			  mbstate_t cur_state;
738*95b7b453SJohn Marino 			  wchar_t wc2;
739*95b7b453SJohn Marino 			  Idx mlen = raw + pstr->len - p;
740*95b7b453SJohn Marino 			  size_t mbclen;
741*95b7b453SJohn Marino 
742*95b7b453SJohn Marino #if 0 /* dead code: buf is set but never used */
743*95b7b453SJohn Marino 			  unsigned char buf[6];
744*95b7b453SJohn Marino 			  if (BE (pstr->trans != NULL, 0))
745*95b7b453SJohn Marino 			    {
746*95b7b453SJohn Marino 			      int i = mlen < 6 ? mlen : 6;
747*95b7b453SJohn Marino 			      while (--i >= 0)
748*95b7b453SJohn Marino 				buf[i] = pstr->trans[p[i]];
749*95b7b453SJohn Marino 			    }
750*95b7b453SJohn Marino #endif
751*95b7b453SJohn Marino 			  /* XXX Don't use mbrtowc, we know which conversion
752*95b7b453SJohn Marino 			     to use (UTF-8 -> UCS4).  */
753*95b7b453SJohn Marino 			  memset (&cur_state, 0, sizeof (cur_state));
754*95b7b453SJohn Marino 			  mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
755*95b7b453SJohn Marino 					      &cur_state);
756*95b7b453SJohn Marino 			  if (raw + offset - p <= mbclen
757*95b7b453SJohn Marino 			      && mbclen < (size_t) -2)
758*95b7b453SJohn Marino 			    {
759*95b7b453SJohn Marino 			      memset (&pstr->cur_state, '\0',
760*95b7b453SJohn Marino 				      sizeof (mbstate_t));
761*95b7b453SJohn Marino 			      pstr->valid_len = mbclen - (raw + offset - p);
762*95b7b453SJohn Marino 			      wc = wc2;
763*95b7b453SJohn Marino 			    }
764*95b7b453SJohn Marino 			  break;
765*95b7b453SJohn Marino 			}
766*95b7b453SJohn Marino 		}
767*95b7b453SJohn Marino 
768*95b7b453SJohn Marino 	      if (wc == WEOF)
769*95b7b453SJohn Marino 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
770*95b7b453SJohn Marino 	      if (wc == WEOF)
771*95b7b453SJohn Marino 		pstr->tip_context
772*95b7b453SJohn Marino 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
773*95b7b453SJohn Marino 	      else
774*95b7b453SJohn Marino 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
775*95b7b453SJohn Marino 				      && IS_WIDE_WORD_CHAR (wc))
776*95b7b453SJohn Marino 				     ? CONTEXT_WORD
777*95b7b453SJohn Marino 				     : ((IS_WIDE_NEWLINE (wc)
778*95b7b453SJohn Marino 					 && pstr->newline_anchor)
779*95b7b453SJohn Marino 					? CONTEXT_NEWLINE : 0));
780*95b7b453SJohn Marino 	      if (BE (pstr->valid_len, 0))
781*95b7b453SJohn Marino 		{
782*95b7b453SJohn Marino 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
783*95b7b453SJohn Marino 		    pstr->wcs[wcs_idx] = WEOF;
784*95b7b453SJohn Marino 		  if (pstr->mbs_allocated)
785*95b7b453SJohn Marino 		    memset (pstr->mbs, 255, pstr->valid_len);
786*95b7b453SJohn Marino 		}
787*95b7b453SJohn Marino 	      pstr->valid_raw_len = pstr->valid_len;
788*95b7b453SJohn Marino 	    }
789*95b7b453SJohn Marino 	  else
790*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
791*95b7b453SJohn Marino 	    {
792*95b7b453SJohn Marino 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
793*95b7b453SJohn Marino 	      pstr->valid_raw_len = 0;
794*95b7b453SJohn Marino 	      if (pstr->trans)
795*95b7b453SJohn Marino 		c = pstr->trans[c];
796*95b7b453SJohn Marino 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
797*95b7b453SJohn Marino 				   ? CONTEXT_WORD
798*95b7b453SJohn Marino 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
799*95b7b453SJohn Marino 				      ? CONTEXT_NEWLINE : 0));
800*95b7b453SJohn Marino 	    }
801*95b7b453SJohn Marino 	}
802*95b7b453SJohn Marino       if (!BE (pstr->mbs_allocated, 0))
803*95b7b453SJohn Marino 	pstr->mbs += offset;
804*95b7b453SJohn Marino     }
805*95b7b453SJohn Marino   pstr->raw_mbs_idx = idx;
806*95b7b453SJohn Marino   pstr->len -= offset;
807*95b7b453SJohn Marino   pstr->stop -= offset;
808*95b7b453SJohn Marino 
809*95b7b453SJohn Marino   /* Then build the buffers.  */
810*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
811*95b7b453SJohn Marino   if (pstr->mb_cur_max > 1)
812*95b7b453SJohn Marino     {
813*95b7b453SJohn Marino       if (pstr->icase)
814*95b7b453SJohn Marino 	{
815*95b7b453SJohn Marino 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
816*95b7b453SJohn Marino 	  if (BE (ret != REG_NOERROR, 0))
817*95b7b453SJohn Marino 	    return ret;
818*95b7b453SJohn Marino 	}
819*95b7b453SJohn Marino       else
820*95b7b453SJohn Marino 	build_wcs_buffer (pstr);
821*95b7b453SJohn Marino     }
822*95b7b453SJohn Marino   else
823*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
824*95b7b453SJohn Marino     if (BE (pstr->mbs_allocated, 0))
825*95b7b453SJohn Marino       {
826*95b7b453SJohn Marino 	if (pstr->icase)
827*95b7b453SJohn Marino 	  build_upper_buffer (pstr);
828*95b7b453SJohn Marino 	else if (pstr->trans != NULL)
829*95b7b453SJohn Marino 	  re_string_translate_buffer (pstr);
830*95b7b453SJohn Marino       }
831*95b7b453SJohn Marino     else
832*95b7b453SJohn Marino       pstr->valid_len = pstr->len;
833*95b7b453SJohn Marino 
834*95b7b453SJohn Marino   pstr->cur_idx = 0;
835*95b7b453SJohn Marino   return REG_NOERROR;
836*95b7b453SJohn Marino }
837*95b7b453SJohn Marino 
838*95b7b453SJohn Marino static unsigned char
839*95b7b453SJohn Marino internal_function __attribute ((pure))
840*95b7b453SJohn Marino re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
841*95b7b453SJohn Marino {
842*95b7b453SJohn Marino   int ch;
843*95b7b453SJohn Marino   Idx off;
844*95b7b453SJohn Marino 
845*95b7b453SJohn Marino   /* Handle the common (easiest) cases first.  */
846*95b7b453SJohn Marino   if (BE (!pstr->mbs_allocated, 1))
847*95b7b453SJohn Marino     return re_string_peek_byte (pstr, idx);
848*95b7b453SJohn Marino 
849*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
850*95b7b453SJohn Marino   if (pstr->mb_cur_max > 1
851*95b7b453SJohn Marino       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
852*95b7b453SJohn Marino     return re_string_peek_byte (pstr, idx);
853*95b7b453SJohn Marino #endif
854*95b7b453SJohn Marino 
855*95b7b453SJohn Marino   off = pstr->cur_idx + idx;
856*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
857*95b7b453SJohn Marino   if (pstr->offsets_needed)
858*95b7b453SJohn Marino     off = pstr->offsets[off];
859*95b7b453SJohn Marino #endif
860*95b7b453SJohn Marino 
861*95b7b453SJohn Marino   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
862*95b7b453SJohn Marino 
863*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
864*95b7b453SJohn Marino   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
865*95b7b453SJohn Marino      this function returns CAPITAL LETTER I instead of first byte of
866*95b7b453SJohn Marino      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
867*95b7b453SJohn Marino      since peek_byte_case doesn't advance cur_idx in any way.  */
868*95b7b453SJohn Marino   if (pstr->offsets_needed && !isascii (ch))
869*95b7b453SJohn Marino     return re_string_peek_byte (pstr, idx);
870*95b7b453SJohn Marino #endif
871*95b7b453SJohn Marino 
872*95b7b453SJohn Marino   return ch;
873*95b7b453SJohn Marino }
874*95b7b453SJohn Marino 
875*95b7b453SJohn Marino static unsigned char
876*95b7b453SJohn Marino internal_function __attribute ((pure))
877*95b7b453SJohn Marino re_string_fetch_byte_case (re_string_t *pstr)
878*95b7b453SJohn Marino {
879*95b7b453SJohn Marino   if (BE (!pstr->mbs_allocated, 1))
880*95b7b453SJohn Marino     return re_string_fetch_byte (pstr);
881*95b7b453SJohn Marino 
882*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
883*95b7b453SJohn Marino   if (pstr->offsets_needed)
884*95b7b453SJohn Marino     {
885*95b7b453SJohn Marino       Idx off;
886*95b7b453SJohn Marino       int ch;
887*95b7b453SJohn Marino 
888*95b7b453SJohn Marino       /* For tr_TR.UTF-8 [[:islower:]] there is
889*95b7b453SJohn Marino 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
890*95b7b453SJohn Marino 	 in that case the whole multi-byte character and return
891*95b7b453SJohn Marino 	 the original letter.  On the other side, with
892*95b7b453SJohn Marino 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
893*95b7b453SJohn Marino 	 anything else would complicate things too much.  */
894*95b7b453SJohn Marino 
895*95b7b453SJohn Marino       if (!re_string_first_byte (pstr, pstr->cur_idx))
896*95b7b453SJohn Marino 	return re_string_fetch_byte (pstr);
897*95b7b453SJohn Marino 
898*95b7b453SJohn Marino       off = pstr->offsets[pstr->cur_idx];
899*95b7b453SJohn Marino       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
900*95b7b453SJohn Marino 
901*95b7b453SJohn Marino       if (! isascii (ch))
902*95b7b453SJohn Marino 	return re_string_fetch_byte (pstr);
903*95b7b453SJohn Marino 
904*95b7b453SJohn Marino       re_string_skip_bytes (pstr,
905*95b7b453SJohn Marino 			    re_string_char_size_at (pstr, pstr->cur_idx));
906*95b7b453SJohn Marino       return ch;
907*95b7b453SJohn Marino     }
908*95b7b453SJohn Marino #endif
909*95b7b453SJohn Marino 
910*95b7b453SJohn Marino   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
911*95b7b453SJohn Marino }
912*95b7b453SJohn Marino 
913*95b7b453SJohn Marino static void
914*95b7b453SJohn Marino internal_function
915*95b7b453SJohn Marino re_string_destruct (re_string_t *pstr)
916*95b7b453SJohn Marino {
917*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
918*95b7b453SJohn Marino   re_free (pstr->wcs);
919*95b7b453SJohn Marino   re_free (pstr->offsets);
920*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N  */
921*95b7b453SJohn Marino   if (pstr->mbs_allocated)
922*95b7b453SJohn Marino     re_free (pstr->mbs);
923*95b7b453SJohn Marino }
924*95b7b453SJohn Marino 
925*95b7b453SJohn Marino /* Return the context at IDX in INPUT.  */
926*95b7b453SJohn Marino 
927*95b7b453SJohn Marino static unsigned int
928*95b7b453SJohn Marino internal_function
929*95b7b453SJohn Marino re_string_context_at (const re_string_t *input, Idx idx, int eflags)
930*95b7b453SJohn Marino {
931*95b7b453SJohn Marino   int c;
932*95b7b453SJohn Marino   if (BE (! REG_VALID_INDEX (idx), 0))
933*95b7b453SJohn Marino     /* In this case, we use the value stored in input->tip_context,
934*95b7b453SJohn Marino        since we can't know the character in input->mbs[-1] here.  */
935*95b7b453SJohn Marino     return input->tip_context;
936*95b7b453SJohn Marino   if (BE (idx == input->len, 0))
937*95b7b453SJohn Marino     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
938*95b7b453SJohn Marino 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
939*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
940*95b7b453SJohn Marino   if (input->mb_cur_max > 1)
941*95b7b453SJohn Marino     {
942*95b7b453SJohn Marino       wint_t wc;
943*95b7b453SJohn Marino       Idx wc_idx = idx;
944*95b7b453SJohn Marino       while(input->wcs[wc_idx] == WEOF)
945*95b7b453SJohn Marino 	{
946*95b7b453SJohn Marino #ifdef DEBUG
947*95b7b453SJohn Marino 	  /* It must not happen.  */
948*95b7b453SJohn Marino 	  assert (REG_VALID_INDEX (wc_idx));
949*95b7b453SJohn Marino #endif
950*95b7b453SJohn Marino 	  --wc_idx;
951*95b7b453SJohn Marino 	  if (! REG_VALID_INDEX (wc_idx))
952*95b7b453SJohn Marino 	    return input->tip_context;
953*95b7b453SJohn Marino 	}
954*95b7b453SJohn Marino       wc = input->wcs[wc_idx];
955*95b7b453SJohn Marino       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
956*95b7b453SJohn Marino 	return CONTEXT_WORD;
957*95b7b453SJohn Marino       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
958*95b7b453SJohn Marino 	      ? CONTEXT_NEWLINE : 0);
959*95b7b453SJohn Marino     }
960*95b7b453SJohn Marino   else
961*95b7b453SJohn Marino #endif
962*95b7b453SJohn Marino     {
963*95b7b453SJohn Marino       c = re_string_byte_at (input, idx);
964*95b7b453SJohn Marino       if (bitset_contain (input->word_char, c))
965*95b7b453SJohn Marino 	return CONTEXT_WORD;
966*95b7b453SJohn Marino       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
967*95b7b453SJohn Marino     }
968*95b7b453SJohn Marino }
969*95b7b453SJohn Marino 
970*95b7b453SJohn Marino /* Functions for set operation.  */
971*95b7b453SJohn Marino 
972*95b7b453SJohn Marino static reg_errcode_t
973*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
974*95b7b453SJohn Marino re_node_set_alloc (re_node_set *set, Idx size)
975*95b7b453SJohn Marino {
976*95b7b453SJohn Marino   set->alloc = size;
977*95b7b453SJohn Marino   set->nelem = 0;
978*95b7b453SJohn Marino   set->elems = re_malloc (Idx, size);
979*95b7b453SJohn Marino   if (BE (set->elems == NULL, 0))
980*95b7b453SJohn Marino     return REG_ESPACE;
981*95b7b453SJohn Marino   return REG_NOERROR;
982*95b7b453SJohn Marino }
983*95b7b453SJohn Marino 
984*95b7b453SJohn Marino static reg_errcode_t
985*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
986*95b7b453SJohn Marino re_node_set_init_1 (re_node_set *set, Idx elem)
987*95b7b453SJohn Marino {
988*95b7b453SJohn Marino   set->alloc = 1;
989*95b7b453SJohn Marino   set->nelem = 1;
990*95b7b453SJohn Marino   set->elems = re_malloc (Idx, 1);
991*95b7b453SJohn Marino   if (BE (set->elems == NULL, 0))
992*95b7b453SJohn Marino     {
993*95b7b453SJohn Marino       set->alloc = set->nelem = 0;
994*95b7b453SJohn Marino       return REG_ESPACE;
995*95b7b453SJohn Marino     }
996*95b7b453SJohn Marino   set->elems[0] = elem;
997*95b7b453SJohn Marino   return REG_NOERROR;
998*95b7b453SJohn Marino }
999*95b7b453SJohn Marino 
1000*95b7b453SJohn Marino static reg_errcode_t
1001*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
1002*95b7b453SJohn Marino re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1003*95b7b453SJohn Marino {
1004*95b7b453SJohn Marino   set->alloc = 2;
1005*95b7b453SJohn Marino   set->elems = re_malloc (Idx, 2);
1006*95b7b453SJohn Marino   if (BE (set->elems == NULL, 0))
1007*95b7b453SJohn Marino     return REG_ESPACE;
1008*95b7b453SJohn Marino   if (elem1 == elem2)
1009*95b7b453SJohn Marino     {
1010*95b7b453SJohn Marino       set->nelem = 1;
1011*95b7b453SJohn Marino       set->elems[0] = elem1;
1012*95b7b453SJohn Marino     }
1013*95b7b453SJohn Marino   else
1014*95b7b453SJohn Marino     {
1015*95b7b453SJohn Marino       set->nelem = 2;
1016*95b7b453SJohn Marino       if (elem1 < elem2)
1017*95b7b453SJohn Marino 	{
1018*95b7b453SJohn Marino 	  set->elems[0] = elem1;
1019*95b7b453SJohn Marino 	  set->elems[1] = elem2;
1020*95b7b453SJohn Marino 	}
1021*95b7b453SJohn Marino       else
1022*95b7b453SJohn Marino 	{
1023*95b7b453SJohn Marino 	  set->elems[0] = elem2;
1024*95b7b453SJohn Marino 	  set->elems[1] = elem1;
1025*95b7b453SJohn Marino 	}
1026*95b7b453SJohn Marino     }
1027*95b7b453SJohn Marino   return REG_NOERROR;
1028*95b7b453SJohn Marino }
1029*95b7b453SJohn Marino 
1030*95b7b453SJohn Marino static reg_errcode_t
1031*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
1032*95b7b453SJohn Marino re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1033*95b7b453SJohn Marino {
1034*95b7b453SJohn Marino   dest->nelem = src->nelem;
1035*95b7b453SJohn Marino   if (src->nelem > 0)
1036*95b7b453SJohn Marino     {
1037*95b7b453SJohn Marino       dest->alloc = dest->nelem;
1038*95b7b453SJohn Marino       dest->elems = re_malloc (Idx, dest->alloc);
1039*95b7b453SJohn Marino       if (BE (dest->elems == NULL, 0))
1040*95b7b453SJohn Marino 	{
1041*95b7b453SJohn Marino 	  dest->alloc = dest->nelem = 0;
1042*95b7b453SJohn Marino 	  return REG_ESPACE;
1043*95b7b453SJohn Marino 	}
1044*95b7b453SJohn Marino       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1045*95b7b453SJohn Marino     }
1046*95b7b453SJohn Marino   else
1047*95b7b453SJohn Marino     re_node_set_init_empty (dest);
1048*95b7b453SJohn Marino   return REG_NOERROR;
1049*95b7b453SJohn Marino }
1050*95b7b453SJohn Marino 
1051*95b7b453SJohn Marino /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1052*95b7b453SJohn Marino    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1053*95b7b453SJohn Marino    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1054*95b7b453SJohn Marino 
1055*95b7b453SJohn Marino static reg_errcode_t
1056*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
1057*95b7b453SJohn Marino re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1058*95b7b453SJohn Marino 			   const re_node_set *src2)
1059*95b7b453SJohn Marino {
1060*95b7b453SJohn Marino   Idx i1, i2, is, id, delta, sbase;
1061*95b7b453SJohn Marino   if (src1->nelem == 0 || src2->nelem == 0)
1062*95b7b453SJohn Marino     return REG_NOERROR;
1063*95b7b453SJohn Marino 
1064*95b7b453SJohn Marino   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1065*95b7b453SJohn Marino      conservative estimate.  */
1066*95b7b453SJohn Marino   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1067*95b7b453SJohn Marino     {
1068*95b7b453SJohn Marino       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1069*95b7b453SJohn Marino       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1070*95b7b453SJohn Marino       if (BE (new_elems == NULL, 0))
1071*95b7b453SJohn Marino 	return REG_ESPACE;
1072*95b7b453SJohn Marino       dest->elems = new_elems;
1073*95b7b453SJohn Marino       dest->alloc = new_alloc;
1074*95b7b453SJohn Marino     }
1075*95b7b453SJohn Marino 
1076*95b7b453SJohn Marino   /* Find the items in the intersection of SRC1 and SRC2, and copy
1077*95b7b453SJohn Marino      into the top of DEST those that are not already in DEST itself.  */
1078*95b7b453SJohn Marino   sbase = dest->nelem + src1->nelem + src2->nelem;
1079*95b7b453SJohn Marino   i1 = src1->nelem - 1;
1080*95b7b453SJohn Marino   i2 = src2->nelem - 1;
1081*95b7b453SJohn Marino   id = dest->nelem - 1;
1082*95b7b453SJohn Marino   for (;;)
1083*95b7b453SJohn Marino     {
1084*95b7b453SJohn Marino       if (src1->elems[i1] == src2->elems[i2])
1085*95b7b453SJohn Marino 	{
1086*95b7b453SJohn Marino 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1087*95b7b453SJohn Marino 	  while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1088*95b7b453SJohn Marino 	    --id;
1089*95b7b453SJohn Marino 
1090*95b7b453SJohn Marino           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1091*95b7b453SJohn Marino             dest->elems[--sbase] = src1->elems[i1];
1092*95b7b453SJohn Marino 
1093*95b7b453SJohn Marino 	  if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1094*95b7b453SJohn Marino 	    break;
1095*95b7b453SJohn Marino 	}
1096*95b7b453SJohn Marino 
1097*95b7b453SJohn Marino       /* Lower the highest of the two items.  */
1098*95b7b453SJohn Marino       else if (src1->elems[i1] < src2->elems[i2])
1099*95b7b453SJohn Marino 	{
1100*95b7b453SJohn Marino 	  if (! REG_VALID_INDEX (--i2))
1101*95b7b453SJohn Marino 	    break;
1102*95b7b453SJohn Marino 	}
1103*95b7b453SJohn Marino       else
1104*95b7b453SJohn Marino 	{
1105*95b7b453SJohn Marino 	  if (! REG_VALID_INDEX (--i1))
1106*95b7b453SJohn Marino 	    break;
1107*95b7b453SJohn Marino 	}
1108*95b7b453SJohn Marino     }
1109*95b7b453SJohn Marino 
1110*95b7b453SJohn Marino   id = dest->nelem - 1;
1111*95b7b453SJohn Marino   is = dest->nelem + src1->nelem + src2->nelem - 1;
1112*95b7b453SJohn Marino   delta = is - sbase + 1;
1113*95b7b453SJohn Marino 
1114*95b7b453SJohn Marino   /* Now copy.  When DELTA becomes zero, the remaining
1115*95b7b453SJohn Marino      DEST elements are already in place; this is more or
1116*95b7b453SJohn Marino      less the same loop that is in re_node_set_merge.  */
1117*95b7b453SJohn Marino   dest->nelem += delta;
1118*95b7b453SJohn Marino   if (delta > 0 && REG_VALID_INDEX (id))
1119*95b7b453SJohn Marino     for (;;)
1120*95b7b453SJohn Marino       {
1121*95b7b453SJohn Marino 	if (dest->elems[is] > dest->elems[id])
1122*95b7b453SJohn Marino 	  {
1123*95b7b453SJohn Marino 	    /* Copy from the top.  */
1124*95b7b453SJohn Marino 	    dest->elems[id + delta--] = dest->elems[is--];
1125*95b7b453SJohn Marino 	    if (delta == 0)
1126*95b7b453SJohn Marino 	      break;
1127*95b7b453SJohn Marino 	  }
1128*95b7b453SJohn Marino 	else
1129*95b7b453SJohn Marino 	  {
1130*95b7b453SJohn Marino 	    /* Slide from the bottom.  */
1131*95b7b453SJohn Marino 	    dest->elems[id + delta] = dest->elems[id];
1132*95b7b453SJohn Marino 	    if (! REG_VALID_INDEX (--id))
1133*95b7b453SJohn Marino 	      break;
1134*95b7b453SJohn Marino 	  }
1135*95b7b453SJohn Marino       }
1136*95b7b453SJohn Marino 
1137*95b7b453SJohn Marino   /* Copy remaining SRC elements.  */
1138*95b7b453SJohn Marino   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1139*95b7b453SJohn Marino 
1140*95b7b453SJohn Marino   return REG_NOERROR;
1141*95b7b453SJohn Marino }
1142*95b7b453SJohn Marino 
1143*95b7b453SJohn Marino /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1144*95b7b453SJohn Marino    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1145*95b7b453SJohn Marino 
1146*95b7b453SJohn Marino static reg_errcode_t
1147*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
1148*95b7b453SJohn Marino re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1149*95b7b453SJohn Marino 			const re_node_set *src2)
1150*95b7b453SJohn Marino {
1151*95b7b453SJohn Marino   Idx i1, i2, id;
1152*95b7b453SJohn Marino   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1153*95b7b453SJohn Marino     {
1154*95b7b453SJohn Marino       dest->alloc = src1->nelem + src2->nelem;
1155*95b7b453SJohn Marino       dest->elems = re_malloc (Idx, dest->alloc);
1156*95b7b453SJohn Marino       if (BE (dest->elems == NULL, 0))
1157*95b7b453SJohn Marino 	return REG_ESPACE;
1158*95b7b453SJohn Marino     }
1159*95b7b453SJohn Marino   else
1160*95b7b453SJohn Marino     {
1161*95b7b453SJohn Marino       if (src1 != NULL && src1->nelem > 0)
1162*95b7b453SJohn Marino 	return re_node_set_init_copy (dest, src1);
1163*95b7b453SJohn Marino       else if (src2 != NULL && src2->nelem > 0)
1164*95b7b453SJohn Marino 	return re_node_set_init_copy (dest, src2);
1165*95b7b453SJohn Marino       else
1166*95b7b453SJohn Marino 	re_node_set_init_empty (dest);
1167*95b7b453SJohn Marino       return REG_NOERROR;
1168*95b7b453SJohn Marino     }
1169*95b7b453SJohn Marino   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1170*95b7b453SJohn Marino     {
1171*95b7b453SJohn Marino       if (src1->elems[i1] > src2->elems[i2])
1172*95b7b453SJohn Marino 	{
1173*95b7b453SJohn Marino 	  dest->elems[id++] = src2->elems[i2++];
1174*95b7b453SJohn Marino 	  continue;
1175*95b7b453SJohn Marino 	}
1176*95b7b453SJohn Marino       if (src1->elems[i1] == src2->elems[i2])
1177*95b7b453SJohn Marino 	++i2;
1178*95b7b453SJohn Marino       dest->elems[id++] = src1->elems[i1++];
1179*95b7b453SJohn Marino     }
1180*95b7b453SJohn Marino   if (i1 < src1->nelem)
1181*95b7b453SJohn Marino     {
1182*95b7b453SJohn Marino       memcpy (dest->elems + id, src1->elems + i1,
1183*95b7b453SJohn Marino 	     (src1->nelem - i1) * sizeof (Idx));
1184*95b7b453SJohn Marino       id += src1->nelem - i1;
1185*95b7b453SJohn Marino     }
1186*95b7b453SJohn Marino   else if (i2 < src2->nelem)
1187*95b7b453SJohn Marino     {
1188*95b7b453SJohn Marino       memcpy (dest->elems + id, src2->elems + i2,
1189*95b7b453SJohn Marino 	     (src2->nelem - i2) * sizeof (Idx));
1190*95b7b453SJohn Marino       id += src2->nelem - i2;
1191*95b7b453SJohn Marino     }
1192*95b7b453SJohn Marino   dest->nelem = id;
1193*95b7b453SJohn Marino   return REG_NOERROR;
1194*95b7b453SJohn Marino }
1195*95b7b453SJohn Marino 
1196*95b7b453SJohn Marino /* Calculate the union set of the sets DEST and SRC. And store it to
1197*95b7b453SJohn Marino    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1198*95b7b453SJohn Marino 
1199*95b7b453SJohn Marino static reg_errcode_t
1200*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
1201*95b7b453SJohn Marino re_node_set_merge (re_node_set *dest, const re_node_set *src)
1202*95b7b453SJohn Marino {
1203*95b7b453SJohn Marino   Idx is, id, sbase, delta;
1204*95b7b453SJohn Marino   if (src == NULL || src->nelem == 0)
1205*95b7b453SJohn Marino     return REG_NOERROR;
1206*95b7b453SJohn Marino   if (dest->alloc < 2 * src->nelem + dest->nelem)
1207*95b7b453SJohn Marino     {
1208*95b7b453SJohn Marino       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1209*95b7b453SJohn Marino       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1210*95b7b453SJohn Marino       if (BE (new_buffer == NULL, 0))
1211*95b7b453SJohn Marino 	return REG_ESPACE;
1212*95b7b453SJohn Marino       dest->elems = new_buffer;
1213*95b7b453SJohn Marino       dest->alloc = new_alloc;
1214*95b7b453SJohn Marino     }
1215*95b7b453SJohn Marino 
1216*95b7b453SJohn Marino   if (BE (dest->nelem == 0, 0))
1217*95b7b453SJohn Marino     {
1218*95b7b453SJohn Marino       dest->nelem = src->nelem;
1219*95b7b453SJohn Marino       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1220*95b7b453SJohn Marino       return REG_NOERROR;
1221*95b7b453SJohn Marino     }
1222*95b7b453SJohn Marino 
1223*95b7b453SJohn Marino   /* Copy into the top of DEST the items of SRC that are not
1224*95b7b453SJohn Marino      found in DEST.  Maybe we could binary search in DEST?  */
1225*95b7b453SJohn Marino   for (sbase = dest->nelem + 2 * src->nelem,
1226*95b7b453SJohn Marino        is = src->nelem - 1, id = dest->nelem - 1;
1227*95b7b453SJohn Marino        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1228*95b7b453SJohn Marino     {
1229*95b7b453SJohn Marino       if (dest->elems[id] == src->elems[is])
1230*95b7b453SJohn Marino 	is--, id--;
1231*95b7b453SJohn Marino       else if (dest->elems[id] < src->elems[is])
1232*95b7b453SJohn Marino 	dest->elems[--sbase] = src->elems[is--];
1233*95b7b453SJohn Marino       else /* if (dest->elems[id] > src->elems[is]) */
1234*95b7b453SJohn Marino 	--id;
1235*95b7b453SJohn Marino     }
1236*95b7b453SJohn Marino 
1237*95b7b453SJohn Marino   if (REG_VALID_INDEX (is))
1238*95b7b453SJohn Marino     {
1239*95b7b453SJohn Marino       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1240*95b7b453SJohn Marino       sbase -= is + 1;
1241*95b7b453SJohn Marino       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1242*95b7b453SJohn Marino     }
1243*95b7b453SJohn Marino 
1244*95b7b453SJohn Marino   id = dest->nelem - 1;
1245*95b7b453SJohn Marino   is = dest->nelem + 2 * src->nelem - 1;
1246*95b7b453SJohn Marino   delta = is - sbase + 1;
1247*95b7b453SJohn Marino   if (delta == 0)
1248*95b7b453SJohn Marino     return REG_NOERROR;
1249*95b7b453SJohn Marino 
1250*95b7b453SJohn Marino   /* Now copy.  When DELTA becomes zero, the remaining
1251*95b7b453SJohn Marino      DEST elements are already in place.  */
1252*95b7b453SJohn Marino   dest->nelem += delta;
1253*95b7b453SJohn Marino   for (;;)
1254*95b7b453SJohn Marino     {
1255*95b7b453SJohn Marino       if (dest->elems[is] > dest->elems[id])
1256*95b7b453SJohn Marino 	{
1257*95b7b453SJohn Marino 	  /* Copy from the top.  */
1258*95b7b453SJohn Marino 	  dest->elems[id + delta--] = dest->elems[is--];
1259*95b7b453SJohn Marino 	  if (delta == 0)
1260*95b7b453SJohn Marino 	    break;
1261*95b7b453SJohn Marino 	}
1262*95b7b453SJohn Marino       else
1263*95b7b453SJohn Marino 	{
1264*95b7b453SJohn Marino 	  /* Slide from the bottom.  */
1265*95b7b453SJohn Marino 	  dest->elems[id + delta] = dest->elems[id];
1266*95b7b453SJohn Marino 	  if (! REG_VALID_INDEX (--id))
1267*95b7b453SJohn Marino 	    {
1268*95b7b453SJohn Marino 	      /* Copy remaining SRC elements.  */
1269*95b7b453SJohn Marino 	      memcpy (dest->elems, dest->elems + sbase,
1270*95b7b453SJohn Marino 		      delta * sizeof (Idx));
1271*95b7b453SJohn Marino 	      break;
1272*95b7b453SJohn Marino 	    }
1273*95b7b453SJohn Marino 	}
1274*95b7b453SJohn Marino     }
1275*95b7b453SJohn Marino 
1276*95b7b453SJohn Marino   return REG_NOERROR;
1277*95b7b453SJohn Marino }
1278*95b7b453SJohn Marino 
1279*95b7b453SJohn Marino /* Insert the new element ELEM to the re_node_set* SET.
1280*95b7b453SJohn Marino    SET should not already have ELEM.
1281*95b7b453SJohn Marino    Return true if successful.  */
1282*95b7b453SJohn Marino 
1283*95b7b453SJohn Marino static bool
1284*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
1285*95b7b453SJohn Marino re_node_set_insert (re_node_set *set, Idx elem)
1286*95b7b453SJohn Marino {
1287*95b7b453SJohn Marino   Idx idx;
1288*95b7b453SJohn Marino   /* In case the set is empty.  */
1289*95b7b453SJohn Marino   if (set->alloc == 0)
1290*95b7b453SJohn Marino     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1291*95b7b453SJohn Marino 
1292*95b7b453SJohn Marino   if (BE (set->nelem, 0) == 0)
1293*95b7b453SJohn Marino     {
1294*95b7b453SJohn Marino       /* We already guaranteed above that set->alloc != 0.  */
1295*95b7b453SJohn Marino       set->elems[0] = elem;
1296*95b7b453SJohn Marino       ++set->nelem;
1297*95b7b453SJohn Marino       return true;
1298*95b7b453SJohn Marino     }
1299*95b7b453SJohn Marino 
1300*95b7b453SJohn Marino   /* Realloc if we need.  */
1301*95b7b453SJohn Marino   if (set->alloc == set->nelem)
1302*95b7b453SJohn Marino     {
1303*95b7b453SJohn Marino       Idx *new_elems;
1304*95b7b453SJohn Marino       set->alloc = set->alloc * 2;
1305*95b7b453SJohn Marino       new_elems = re_realloc (set->elems, Idx, set->alloc);
1306*95b7b453SJohn Marino       if (BE (new_elems == NULL, 0))
1307*95b7b453SJohn Marino 	return false;
1308*95b7b453SJohn Marino       set->elems = new_elems;
1309*95b7b453SJohn Marino     }
1310*95b7b453SJohn Marino 
1311*95b7b453SJohn Marino   /* Move the elements which follows the new element.  Test the
1312*95b7b453SJohn Marino      first element separately to skip a check in the inner loop.  */
1313*95b7b453SJohn Marino   if (elem < set->elems[0])
1314*95b7b453SJohn Marino     {
1315*95b7b453SJohn Marino       idx = 0;
1316*95b7b453SJohn Marino       for (idx = set->nelem; idx > 0; idx--)
1317*95b7b453SJohn Marino 	set->elems[idx] = set->elems[idx - 1];
1318*95b7b453SJohn Marino     }
1319*95b7b453SJohn Marino   else
1320*95b7b453SJohn Marino     {
1321*95b7b453SJohn Marino       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1322*95b7b453SJohn Marino 	set->elems[idx] = set->elems[idx - 1];
1323*95b7b453SJohn Marino     }
1324*95b7b453SJohn Marino 
1325*95b7b453SJohn Marino   /* Insert the new element.  */
1326*95b7b453SJohn Marino   set->elems[idx] = elem;
1327*95b7b453SJohn Marino   ++set->nelem;
1328*95b7b453SJohn Marino   return true;
1329*95b7b453SJohn Marino }
1330*95b7b453SJohn Marino 
1331*95b7b453SJohn Marino /* Insert the new element ELEM to the re_node_set* SET.
1332*95b7b453SJohn Marino    SET should not already have any element greater than or equal to ELEM.
1333*95b7b453SJohn Marino    Return true if successful.  */
1334*95b7b453SJohn Marino 
1335*95b7b453SJohn Marino static bool
1336*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
1337*95b7b453SJohn Marino re_node_set_insert_last (re_node_set *set, Idx elem)
1338*95b7b453SJohn Marino {
1339*95b7b453SJohn Marino   /* Realloc if we need.  */
1340*95b7b453SJohn Marino   if (set->alloc == set->nelem)
1341*95b7b453SJohn Marino     {
1342*95b7b453SJohn Marino       Idx *new_elems;
1343*95b7b453SJohn Marino       set->alloc = (set->alloc + 1) * 2;
1344*95b7b453SJohn Marino       new_elems = re_realloc (set->elems, Idx, set->alloc);
1345*95b7b453SJohn Marino       if (BE (new_elems == NULL, 0))
1346*95b7b453SJohn Marino 	return false;
1347*95b7b453SJohn Marino       set->elems = new_elems;
1348*95b7b453SJohn Marino     }
1349*95b7b453SJohn Marino 
1350*95b7b453SJohn Marino   /* Insert the new element.  */
1351*95b7b453SJohn Marino   set->elems[set->nelem++] = elem;
1352*95b7b453SJohn Marino   return true;
1353*95b7b453SJohn Marino }
1354*95b7b453SJohn Marino 
1355*95b7b453SJohn Marino /* Compare two node sets SET1 and SET2.
1356*95b7b453SJohn Marino    Return true if SET1 and SET2 are equivalent.  */
1357*95b7b453SJohn Marino 
1358*95b7b453SJohn Marino static bool
1359*95b7b453SJohn Marino internal_function __attribute ((pure))
1360*95b7b453SJohn Marino re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1361*95b7b453SJohn Marino {
1362*95b7b453SJohn Marino   Idx i;
1363*95b7b453SJohn Marino   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1364*95b7b453SJohn Marino     return false;
1365*95b7b453SJohn Marino   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1366*95b7b453SJohn Marino     if (set1->elems[i] != set2->elems[i])
1367*95b7b453SJohn Marino       return false;
1368*95b7b453SJohn Marino   return true;
1369*95b7b453SJohn Marino }
1370*95b7b453SJohn Marino 
1371*95b7b453SJohn Marino /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1372*95b7b453SJohn Marino 
1373*95b7b453SJohn Marino static Idx
1374*95b7b453SJohn Marino internal_function __attribute ((pure))
1375*95b7b453SJohn Marino re_node_set_contains (const re_node_set *set, Idx elem)
1376*95b7b453SJohn Marino {
1377*95b7b453SJohn Marino   __re_size_t idx, right, mid;
1378*95b7b453SJohn Marino   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1379*95b7b453SJohn Marino     return 0;
1380*95b7b453SJohn Marino 
1381*95b7b453SJohn Marino   /* Binary search the element.  */
1382*95b7b453SJohn Marino   idx = 0;
1383*95b7b453SJohn Marino   right = set->nelem - 1;
1384*95b7b453SJohn Marino   while (idx < right)
1385*95b7b453SJohn Marino     {
1386*95b7b453SJohn Marino       mid = (idx + right) / 2;
1387*95b7b453SJohn Marino       if (set->elems[mid] < elem)
1388*95b7b453SJohn Marino 	idx = mid + 1;
1389*95b7b453SJohn Marino       else
1390*95b7b453SJohn Marino 	right = mid;
1391*95b7b453SJohn Marino     }
1392*95b7b453SJohn Marino   return set->elems[idx] == elem ? idx + 1 : 0;
1393*95b7b453SJohn Marino }
1394*95b7b453SJohn Marino 
1395*95b7b453SJohn Marino static void
1396*95b7b453SJohn Marino internal_function
1397*95b7b453SJohn Marino re_node_set_remove_at (re_node_set *set, Idx idx)
1398*95b7b453SJohn Marino {
1399*95b7b453SJohn Marino   verify (! TYPE_SIGNED (Idx));
1400*95b7b453SJohn Marino   /* if (idx < 0)
1401*95b7b453SJohn Marino      return; */
1402*95b7b453SJohn Marino   if (idx >= set->nelem)
1403*95b7b453SJohn Marino     return;
1404*95b7b453SJohn Marino   --set->nelem;
1405*95b7b453SJohn Marino   for (; idx < set->nelem; idx++)
1406*95b7b453SJohn Marino     set->elems[idx] = set->elems[idx + 1];
1407*95b7b453SJohn Marino }
1408*95b7b453SJohn Marino 
1409*95b7b453SJohn Marino 
1410*95b7b453SJohn Marino /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1411*95b7b453SJohn Marino    Or return REG_MISSING if an error occurred.  */
1412*95b7b453SJohn Marino 
1413*95b7b453SJohn Marino static Idx
1414*95b7b453SJohn Marino internal_function
1415*95b7b453SJohn Marino re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1416*95b7b453SJohn Marino {
1417*95b7b453SJohn Marino   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1418*95b7b453SJohn Marino     {
1419*95b7b453SJohn Marino       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1420*95b7b453SJohn Marino       Idx *new_nexts, *new_indices;
1421*95b7b453SJohn Marino       re_node_set *new_edests, *new_eclosures;
1422*95b7b453SJohn Marino       re_token_t *new_nodes;
1423*95b7b453SJohn Marino       size_t max_object_size =
1424*95b7b453SJohn Marino 	MAX (sizeof (re_token_t),
1425*95b7b453SJohn Marino 	     MAX (sizeof (re_node_set),
1426*95b7b453SJohn Marino 		  sizeof (Idx)));
1427*95b7b453SJohn Marino 
1428*95b7b453SJohn Marino       /* Avoid overflows.  */
1429*95b7b453SJohn Marino       if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1430*95b7b453SJohn Marino 	return REG_MISSING;
1431*95b7b453SJohn Marino 
1432*95b7b453SJohn Marino       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1433*95b7b453SJohn Marino       if (BE (new_nodes == NULL, 0))
1434*95b7b453SJohn Marino 	return REG_MISSING;
1435*95b7b453SJohn Marino       dfa->nodes = new_nodes;
1436*95b7b453SJohn Marino       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1437*95b7b453SJohn Marino       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1438*95b7b453SJohn Marino       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1439*95b7b453SJohn Marino       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1440*95b7b453SJohn Marino       if (BE (new_nexts == NULL || new_indices == NULL
1441*95b7b453SJohn Marino 	      || new_edests == NULL || new_eclosures == NULL, 0))
1442*95b7b453SJohn Marino 	return REG_MISSING;
1443*95b7b453SJohn Marino       dfa->nexts = new_nexts;
1444*95b7b453SJohn Marino       dfa->org_indices = new_indices;
1445*95b7b453SJohn Marino       dfa->edests = new_edests;
1446*95b7b453SJohn Marino       dfa->eclosures = new_eclosures;
1447*95b7b453SJohn Marino       dfa->nodes_alloc = new_nodes_alloc;
1448*95b7b453SJohn Marino     }
1449*95b7b453SJohn Marino   dfa->nodes[dfa->nodes_len] = token;
1450*95b7b453SJohn Marino   dfa->nodes[dfa->nodes_len].constraint = 0;
1451*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
1452*95b7b453SJohn Marino   {
1453*95b7b453SJohn Marino   int type = token.type;
1454*95b7b453SJohn Marino   dfa->nodes[dfa->nodes_len].accept_mb =
1455*95b7b453SJohn Marino     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1456*95b7b453SJohn Marino   }
1457*95b7b453SJohn Marino #endif
1458*95b7b453SJohn Marino   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1459*95b7b453SJohn Marino   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1460*95b7b453SJohn Marino   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1461*95b7b453SJohn Marino   return dfa->nodes_len++;
1462*95b7b453SJohn Marino }
1463*95b7b453SJohn Marino 
1464*95b7b453SJohn Marino static inline re_hashval_t
1465*95b7b453SJohn Marino internal_function
1466*95b7b453SJohn Marino calc_state_hash (const re_node_set *nodes, unsigned int context)
1467*95b7b453SJohn Marino {
1468*95b7b453SJohn Marino   re_hashval_t hash = nodes->nelem + context;
1469*95b7b453SJohn Marino   Idx i;
1470*95b7b453SJohn Marino   for (i = 0 ; i < nodes->nelem ; i++)
1471*95b7b453SJohn Marino     hash += nodes->elems[i];
1472*95b7b453SJohn Marino   return hash;
1473*95b7b453SJohn Marino }
1474*95b7b453SJohn Marino 
1475*95b7b453SJohn Marino /* Search for the state whose node_set is equivalent to NODES.
1476*95b7b453SJohn Marino    Return the pointer to the state, if we found it in the DFA.
1477*95b7b453SJohn Marino    Otherwise create the new one and return it.  In case of an error
1478*95b7b453SJohn Marino    return NULL and set the error code in ERR.
1479*95b7b453SJohn Marino    Note: - We assume NULL as the invalid state, then it is possible that
1480*95b7b453SJohn Marino 	   return value is NULL and ERR is REG_NOERROR.
1481*95b7b453SJohn Marino 	 - We never return non-NULL value in case of any errors, it is for
1482*95b7b453SJohn Marino 	   optimization.  */
1483*95b7b453SJohn Marino 
1484*95b7b453SJohn Marino static re_dfastate_t *
1485*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
1486*95b7b453SJohn Marino re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1487*95b7b453SJohn Marino 		  const re_node_set *nodes)
1488*95b7b453SJohn Marino {
1489*95b7b453SJohn Marino   re_hashval_t hash;
1490*95b7b453SJohn Marino   re_dfastate_t *new_state;
1491*95b7b453SJohn Marino   struct re_state_table_entry *spot;
1492*95b7b453SJohn Marino   Idx i;
1493*95b7b453SJohn Marino #ifdef lint
1494*95b7b453SJohn Marino   /* Suppress bogus uninitialized-variable warnings.  */
1495*95b7b453SJohn Marino   *err = REG_NOERROR;
1496*95b7b453SJohn Marino #endif
1497*95b7b453SJohn Marino   if (BE (nodes->nelem == 0, 0))
1498*95b7b453SJohn Marino     {
1499*95b7b453SJohn Marino       *err = REG_NOERROR;
1500*95b7b453SJohn Marino       return NULL;
1501*95b7b453SJohn Marino     }
1502*95b7b453SJohn Marino   hash = calc_state_hash (nodes, 0);
1503*95b7b453SJohn Marino   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1504*95b7b453SJohn Marino 
1505*95b7b453SJohn Marino   for (i = 0 ; i < spot->num ; i++)
1506*95b7b453SJohn Marino     {
1507*95b7b453SJohn Marino       re_dfastate_t *state = spot->array[i];
1508*95b7b453SJohn Marino       if (hash != state->hash)
1509*95b7b453SJohn Marino 	continue;
1510*95b7b453SJohn Marino       if (re_node_set_compare (&state->nodes, nodes))
1511*95b7b453SJohn Marino 	return state;
1512*95b7b453SJohn Marino     }
1513*95b7b453SJohn Marino 
1514*95b7b453SJohn Marino   /* There are no appropriate state in the dfa, create the new one.  */
1515*95b7b453SJohn Marino   new_state = create_ci_newstate (dfa, nodes, hash);
1516*95b7b453SJohn Marino   if (BE (new_state == NULL, 0))
1517*95b7b453SJohn Marino     *err = REG_ESPACE;
1518*95b7b453SJohn Marino 
1519*95b7b453SJohn Marino   return new_state;
1520*95b7b453SJohn Marino }
1521*95b7b453SJohn Marino 
1522*95b7b453SJohn Marino /* Search for the state whose node_set is equivalent to NODES and
1523*95b7b453SJohn Marino    whose context is equivalent to CONTEXT.
1524*95b7b453SJohn Marino    Return the pointer to the state, if we found it in the DFA.
1525*95b7b453SJohn Marino    Otherwise create the new one and return it.  In case of an error
1526*95b7b453SJohn Marino    return NULL and set the error code in ERR.
1527*95b7b453SJohn Marino    Note: - We assume NULL as the invalid state, then it is possible that
1528*95b7b453SJohn Marino 	   return value is NULL and ERR is REG_NOERROR.
1529*95b7b453SJohn Marino 	 - We never return non-NULL value in case of any errors, it is for
1530*95b7b453SJohn Marino 	   optimization.  */
1531*95b7b453SJohn Marino 
1532*95b7b453SJohn Marino static re_dfastate_t *
1533*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
1534*95b7b453SJohn Marino re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1535*95b7b453SJohn Marino 			  const re_node_set *nodes, unsigned int context)
1536*95b7b453SJohn Marino {
1537*95b7b453SJohn Marino   re_hashval_t hash;
1538*95b7b453SJohn Marino   re_dfastate_t *new_state;
1539*95b7b453SJohn Marino   struct re_state_table_entry *spot;
1540*95b7b453SJohn Marino   Idx i;
1541*95b7b453SJohn Marino #ifdef lint
1542*95b7b453SJohn Marino   /* Suppress bogus uninitialized-variable warnings.  */
1543*95b7b453SJohn Marino   *err = REG_NOERROR;
1544*95b7b453SJohn Marino #endif
1545*95b7b453SJohn Marino   if (nodes->nelem == 0)
1546*95b7b453SJohn Marino     {
1547*95b7b453SJohn Marino       *err = REG_NOERROR;
1548*95b7b453SJohn Marino       return NULL;
1549*95b7b453SJohn Marino     }
1550*95b7b453SJohn Marino   hash = calc_state_hash (nodes, context);
1551*95b7b453SJohn Marino   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1552*95b7b453SJohn Marino 
1553*95b7b453SJohn Marino   for (i = 0 ; i < spot->num ; i++)
1554*95b7b453SJohn Marino     {
1555*95b7b453SJohn Marino       re_dfastate_t *state = spot->array[i];
1556*95b7b453SJohn Marino       if (state->hash == hash
1557*95b7b453SJohn Marino 	  && state->context == context
1558*95b7b453SJohn Marino 	  && re_node_set_compare (state->entrance_nodes, nodes))
1559*95b7b453SJohn Marino 	return state;
1560*95b7b453SJohn Marino     }
1561*95b7b453SJohn Marino   /* There are no appropriate state in `dfa', create the new one.  */
1562*95b7b453SJohn Marino   new_state = create_cd_newstate (dfa, nodes, context, hash);
1563*95b7b453SJohn Marino   if (BE (new_state == NULL, 0))
1564*95b7b453SJohn Marino     *err = REG_ESPACE;
1565*95b7b453SJohn Marino 
1566*95b7b453SJohn Marino   return new_state;
1567*95b7b453SJohn Marino }
1568*95b7b453SJohn Marino 
1569*95b7b453SJohn Marino /* Finish initialization of the new state NEWSTATE, and using its hash value
1570*95b7b453SJohn Marino    HASH put in the appropriate bucket of DFA's state table.  Return value
1571*95b7b453SJohn Marino    indicates the error code if failed.  */
1572*95b7b453SJohn Marino 
1573*95b7b453SJohn Marino static reg_errcode_t
1574*95b7b453SJohn Marino __attribute_warn_unused_result__
1575*95b7b453SJohn Marino register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1576*95b7b453SJohn Marino 		re_hashval_t hash)
1577*95b7b453SJohn Marino {
1578*95b7b453SJohn Marino   struct re_state_table_entry *spot;
1579*95b7b453SJohn Marino   reg_errcode_t err;
1580*95b7b453SJohn Marino   Idx i;
1581*95b7b453SJohn Marino 
1582*95b7b453SJohn Marino   newstate->hash = hash;
1583*95b7b453SJohn Marino   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1584*95b7b453SJohn Marino   if (BE (err != REG_NOERROR, 0))
1585*95b7b453SJohn Marino     return REG_ESPACE;
1586*95b7b453SJohn Marino   for (i = 0; i < newstate->nodes.nelem; i++)
1587*95b7b453SJohn Marino     {
1588*95b7b453SJohn Marino       Idx elem = newstate->nodes.elems[i];
1589*95b7b453SJohn Marino       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1590*95b7b453SJohn Marino 	if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1591*95b7b453SJohn Marino 	  return REG_ESPACE;
1592*95b7b453SJohn Marino     }
1593*95b7b453SJohn Marino 
1594*95b7b453SJohn Marino   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1595*95b7b453SJohn Marino   if (BE (spot->alloc <= spot->num, 0))
1596*95b7b453SJohn Marino     {
1597*95b7b453SJohn Marino       Idx new_alloc = 2 * spot->num + 2;
1598*95b7b453SJohn Marino       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1599*95b7b453SJohn Marino 					      new_alloc);
1600*95b7b453SJohn Marino       if (BE (new_array == NULL, 0))
1601*95b7b453SJohn Marino 	return REG_ESPACE;
1602*95b7b453SJohn Marino       spot->array = new_array;
1603*95b7b453SJohn Marino       spot->alloc = new_alloc;
1604*95b7b453SJohn Marino     }
1605*95b7b453SJohn Marino   spot->array[spot->num++] = newstate;
1606*95b7b453SJohn Marino   return REG_NOERROR;
1607*95b7b453SJohn Marino }
1608*95b7b453SJohn Marino 
1609*95b7b453SJohn Marino static void
1610*95b7b453SJohn Marino free_state (re_dfastate_t *state)
1611*95b7b453SJohn Marino {
1612*95b7b453SJohn Marino   re_node_set_free (&state->non_eps_nodes);
1613*95b7b453SJohn Marino   re_node_set_free (&state->inveclosure);
1614*95b7b453SJohn Marino   if (state->entrance_nodes != &state->nodes)
1615*95b7b453SJohn Marino     {
1616*95b7b453SJohn Marino       re_node_set_free (state->entrance_nodes);
1617*95b7b453SJohn Marino       re_free (state->entrance_nodes);
1618*95b7b453SJohn Marino     }
1619*95b7b453SJohn Marino   re_node_set_free (&state->nodes);
1620*95b7b453SJohn Marino   re_free (state->word_trtable);
1621*95b7b453SJohn Marino   re_free (state->trtable);
1622*95b7b453SJohn Marino   re_free (state);
1623*95b7b453SJohn Marino }
1624*95b7b453SJohn Marino 
1625*95b7b453SJohn Marino /* Create the new state which is independ of contexts.
1626*95b7b453SJohn Marino    Return the new state if succeeded, otherwise return NULL.  */
1627*95b7b453SJohn Marino 
1628*95b7b453SJohn Marino static re_dfastate_t *
1629*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
1630*95b7b453SJohn Marino create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1631*95b7b453SJohn Marino 		    re_hashval_t hash)
1632*95b7b453SJohn Marino {
1633*95b7b453SJohn Marino   Idx i;
1634*95b7b453SJohn Marino   reg_errcode_t err;
1635*95b7b453SJohn Marino   re_dfastate_t *newstate;
1636*95b7b453SJohn Marino 
1637*95b7b453SJohn Marino   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1638*95b7b453SJohn Marino   if (BE (newstate == NULL, 0))
1639*95b7b453SJohn Marino     return NULL;
1640*95b7b453SJohn Marino   err = re_node_set_init_copy (&newstate->nodes, nodes);
1641*95b7b453SJohn Marino   if (BE (err != REG_NOERROR, 0))
1642*95b7b453SJohn Marino     {
1643*95b7b453SJohn Marino       re_free (newstate);
1644*95b7b453SJohn Marino       return NULL;
1645*95b7b453SJohn Marino     }
1646*95b7b453SJohn Marino 
1647*95b7b453SJohn Marino   newstate->entrance_nodes = &newstate->nodes;
1648*95b7b453SJohn Marino   for (i = 0 ; i < nodes->nelem ; i++)
1649*95b7b453SJohn Marino     {
1650*95b7b453SJohn Marino       re_token_t *node = dfa->nodes + nodes->elems[i];
1651*95b7b453SJohn Marino       re_token_type_t type = node->type;
1652*95b7b453SJohn Marino       if (type == CHARACTER && !node->constraint)
1653*95b7b453SJohn Marino 	continue;
1654*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
1655*95b7b453SJohn Marino       newstate->accept_mb |= node->accept_mb;
1656*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
1657*95b7b453SJohn Marino 
1658*95b7b453SJohn Marino       /* If the state has the halt node, the state is a halt state.  */
1659*95b7b453SJohn Marino       if (type == END_OF_RE)
1660*95b7b453SJohn Marino 	newstate->halt = 1;
1661*95b7b453SJohn Marino       else if (type == OP_BACK_REF)
1662*95b7b453SJohn Marino 	newstate->has_backref = 1;
1663*95b7b453SJohn Marino       else if (type == ANCHOR || node->constraint)
1664*95b7b453SJohn Marino 	newstate->has_constraint = 1;
1665*95b7b453SJohn Marino     }
1666*95b7b453SJohn Marino   err = register_state (dfa, newstate, hash);
1667*95b7b453SJohn Marino   if (BE (err != REG_NOERROR, 0))
1668*95b7b453SJohn Marino     {
1669*95b7b453SJohn Marino       free_state (newstate);
1670*95b7b453SJohn Marino       newstate = NULL;
1671*95b7b453SJohn Marino     }
1672*95b7b453SJohn Marino   return newstate;
1673*95b7b453SJohn Marino }
1674*95b7b453SJohn Marino 
1675*95b7b453SJohn Marino /* Create the new state which is depend on the context CONTEXT.
1676*95b7b453SJohn Marino    Return the new state if succeeded, otherwise return NULL.  */
1677*95b7b453SJohn Marino 
1678*95b7b453SJohn Marino static re_dfastate_t *
1679*95b7b453SJohn Marino internal_function __attribute_warn_unused_result__
1680*95b7b453SJohn Marino create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1681*95b7b453SJohn Marino 		    unsigned int context, re_hashval_t hash)
1682*95b7b453SJohn Marino {
1683*95b7b453SJohn Marino   Idx i, nctx_nodes = 0;
1684*95b7b453SJohn Marino   reg_errcode_t err;
1685*95b7b453SJohn Marino   re_dfastate_t *newstate;
1686*95b7b453SJohn Marino 
1687*95b7b453SJohn Marino   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1688*95b7b453SJohn Marino   if (BE (newstate == NULL, 0))
1689*95b7b453SJohn Marino     return NULL;
1690*95b7b453SJohn Marino   err = re_node_set_init_copy (&newstate->nodes, nodes);
1691*95b7b453SJohn Marino   if (BE (err != REG_NOERROR, 0))
1692*95b7b453SJohn Marino     {
1693*95b7b453SJohn Marino       re_free (newstate);
1694*95b7b453SJohn Marino       return NULL;
1695*95b7b453SJohn Marino     }
1696*95b7b453SJohn Marino 
1697*95b7b453SJohn Marino   newstate->context = context;
1698*95b7b453SJohn Marino   newstate->entrance_nodes = &newstate->nodes;
1699*95b7b453SJohn Marino 
1700*95b7b453SJohn Marino   for (i = 0 ; i < nodes->nelem ; i++)
1701*95b7b453SJohn Marino     {
1702*95b7b453SJohn Marino       re_token_t *node = dfa->nodes + nodes->elems[i];
1703*95b7b453SJohn Marino       re_token_type_t type = node->type;
1704*95b7b453SJohn Marino       unsigned int constraint = node->constraint;
1705*95b7b453SJohn Marino 
1706*95b7b453SJohn Marino       if (type == CHARACTER && !constraint)
1707*95b7b453SJohn Marino 	continue;
1708*95b7b453SJohn Marino #ifdef RE_ENABLE_I18N
1709*95b7b453SJohn Marino       newstate->accept_mb |= node->accept_mb;
1710*95b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
1711*95b7b453SJohn Marino 
1712*95b7b453SJohn Marino       /* If the state has the halt node, the state is a halt state.  */
1713*95b7b453SJohn Marino       if (type == END_OF_RE)
1714*95b7b453SJohn Marino 	newstate->halt = 1;
1715*95b7b453SJohn Marino       else if (type == OP_BACK_REF)
1716*95b7b453SJohn Marino 	newstate->has_backref = 1;
1717*95b7b453SJohn Marino 
1718*95b7b453SJohn Marino       if (constraint)
1719*95b7b453SJohn Marino 	{
1720*95b7b453SJohn Marino 	  if (newstate->entrance_nodes == &newstate->nodes)
1721*95b7b453SJohn Marino 	    {
1722*95b7b453SJohn Marino 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1723*95b7b453SJohn Marino 	      if (BE (newstate->entrance_nodes == NULL, 0))
1724*95b7b453SJohn Marino 		{
1725*95b7b453SJohn Marino 		  free_state (newstate);
1726*95b7b453SJohn Marino 		  return NULL;
1727*95b7b453SJohn Marino 		}
1728*95b7b453SJohn Marino 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1729*95b7b453SJohn Marino 		  != REG_NOERROR)
1730*95b7b453SJohn Marino 		return NULL;
1731*95b7b453SJohn Marino 	      nctx_nodes = 0;
1732*95b7b453SJohn Marino 	      newstate->has_constraint = 1;
1733*95b7b453SJohn Marino 	    }
1734*95b7b453SJohn Marino 
1735*95b7b453SJohn Marino 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1736*95b7b453SJohn Marino 	    {
1737*95b7b453SJohn Marino 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1738*95b7b453SJohn Marino 	      ++nctx_nodes;
1739*95b7b453SJohn Marino 	    }
1740*95b7b453SJohn Marino 	}
1741*95b7b453SJohn Marino     }
1742*95b7b453SJohn Marino   err = register_state (dfa, newstate, hash);
1743*95b7b453SJohn Marino   if (BE (err != REG_NOERROR, 0))
1744*95b7b453SJohn Marino     {
1745*95b7b453SJohn Marino       free_state (newstate);
1746*95b7b453SJohn Marino       newstate = NULL;
1747*95b7b453SJohn Marino     }
1748*95b7b453SJohn Marino   return  newstate;
1749*95b7b453SJohn Marino }
1750