xref: /netbsd-src/external/gpl2/xcvs/dist/lib/regex_internal.c (revision 5a6c14c844c4c665da5632061aebde7bb2cb5766)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License along
17    with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 #include <sys/cdefs.h>
20 __RCSID("$NetBSD: regex_internal.c,v 1.2 2016/05/17 14:00:09 christos Exp $");
21 
22 
23 static void re_string_construct_common (const char *str, Idx len,
24 					re_string_t *pstr,
25 					REG_TRANSLATE_TYPE trans, bool icase,
26 					const re_dfa_t *dfa) internal_function;
27 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
28 					  const re_node_set *nodes,
29 					  re_hashval_t hash) internal_function;
30 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
31 					  const re_node_set *nodes,
32 					  unsigned int context,
33 					  re_hashval_t hash) internal_function;
34 
35 /* Functions for string operation.  */
36 
37 /* This function allocate the buffers.  It is necessary to call
38    re_string_reconstruct before using the object.  */
39 
40 static reg_errcode_t
41 internal_function
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)42 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
43 		    REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
44 {
45   reg_errcode_t ret;
46   Idx init_buf_len;
47 
48   /* Ensure at least one character fits into the buffers.  */
49   if (init_len < dfa->mb_cur_max)
50     init_len = dfa->mb_cur_max;
51   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
52   re_string_construct_common (str, len, pstr, trans, icase, dfa);
53 
54   ret = re_string_realloc_buffers (pstr, init_buf_len);
55   if (BE (ret != REG_NOERROR, 0))
56     return ret;
57 
58   pstr->word_char = dfa->word_char;
59   pstr->word_ops_used = dfa->word_ops_used;
60   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
61   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
62   pstr->valid_raw_len = pstr->valid_len;
63   return REG_NOERROR;
64 }
65 
66 /* This function allocate the buffers, and initialize them.  */
67 
68 static reg_errcode_t
69 internal_function
re_string_construct(re_string_t * pstr,const char * str,Idx len,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)70 re_string_construct (re_string_t *pstr, const char *str, Idx len,
71 		     REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
72 {
73   reg_errcode_t ret;
74   memset (pstr, '\0', sizeof (re_string_t));
75   re_string_construct_common (str, len, pstr, trans, icase, dfa);
76 
77   if (len > 0)
78     {
79       ret = re_string_realloc_buffers (pstr, len + 1);
80       if (BE (ret != REG_NOERROR, 0))
81 	return ret;
82     }
83   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
84 
85   if (icase)
86     {
87 #ifdef RE_ENABLE_I18N
88       if (dfa->mb_cur_max > 1)
89 	{
90 	  while (1)
91 	    {
92 	      ret = build_wcs_upper_buffer (pstr);
93 	      if (BE (ret != REG_NOERROR, 0))
94 		return ret;
95 	      if (pstr->valid_raw_len >= len)
96 		break;
97 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
98 		break;
99 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
100 	      if (BE (ret != REG_NOERROR, 0))
101 		return ret;
102 	    }
103 	}
104       else
105 #endif /* RE_ENABLE_I18N  */
106 	build_upper_buffer (pstr);
107     }
108   else
109     {
110 #ifdef RE_ENABLE_I18N
111       if (dfa->mb_cur_max > 1)
112 	build_wcs_buffer (pstr);
113       else
114 #endif /* RE_ENABLE_I18N  */
115 	{
116 	  if (trans != NULL)
117 	    re_string_translate_buffer (pstr);
118 	  else
119 	    {
120 	      pstr->valid_len = pstr->bufs_len;
121 	      pstr->valid_raw_len = pstr->bufs_len;
122 	    }
123 	}
124     }
125 
126   return REG_NOERROR;
127 }
128 
129 /* Helper functions for re_string_allocate, and re_string_construct.  */
130 
131 static reg_errcode_t
132 internal_function
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)133 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
134 {
135 #ifdef RE_ENABLE_I18N
136   if (pstr->mb_cur_max > 1)
137     {
138       wint_t *new_wcs = re_xrealloc (pstr->wcs, wint_t, new_buf_len);
139       if (BE (new_wcs == NULL, 0))
140 	return REG_ESPACE;
141       pstr->wcs = new_wcs;
142       if (pstr->offsets != NULL)
143 	{
144 	  Idx *new_offsets = re_xrealloc (pstr->offsets, Idx, new_buf_len);
145 	  if (BE (new_offsets == NULL, 0))
146 	    return REG_ESPACE;
147 	  pstr->offsets = new_offsets;
148 	}
149     }
150 #endif /* RE_ENABLE_I18N  */
151   if (pstr->mbs_allocated)
152     {
153       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
154 					   new_buf_len);
155       if (BE (new_mbs == NULL, 0))
156 	return REG_ESPACE;
157       pstr->mbs = new_mbs;
158     }
159   pstr->bufs_len = new_buf_len;
160   return REG_NOERROR;
161 }
162 
163 
164 static void
165 internal_function
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)166 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
167 			    REG_TRANSLATE_TYPE trans, bool icase,
168 			    const re_dfa_t *dfa)
169 {
170   pstr->raw_mbs = (const unsigned char *) str;
171   pstr->len = len;
172   pstr->raw_len = len;
173   pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
174   pstr->icase = icase;
175   pstr->mbs_allocated = (trans != NULL || icase);
176   pstr->mb_cur_max = dfa->mb_cur_max;
177   pstr->is_utf8 = dfa->is_utf8;
178   pstr->map_notascii = dfa->map_notascii;
179   pstr->stop = pstr->len;
180   pstr->raw_stop = pstr->stop;
181 }
182 
183 #ifdef RE_ENABLE_I18N
184 
185 /* Build wide character buffer PSTR->WCS.
186    If the byte sequence of the string are:
187      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
188    Then wide character buffer will be:
189      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
190    We use WEOF for padding, they indicate that the position isn't
191    a first byte of a multibyte character.
192 
193    Note that this function assumes PSTR->VALID_LEN elements are already
194    built and starts from PSTR->VALID_LEN.  */
195 
196 static void
197 internal_function
build_wcs_buffer(re_string_t * pstr)198 build_wcs_buffer (re_string_t *pstr)
199 {
200 #ifdef _LIBC
201   unsigned char buf[MB_LEN_MAX];
202   assert (MB_LEN_MAX >= pstr->mb_cur_max);
203 #else
204   unsigned char buf[64];
205 #endif
206   mbstate_t prev_st;
207   Idx byte_idx, end_idx, remain_len;
208   size_t mbclen;
209 
210   /* Build the buffers from pstr->valid_len to either pstr->len or
211      pstr->bufs_len.  */
212   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
213   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
214     {
215       wchar_t wc;
216       const char *p;
217 
218       remain_len = end_idx - byte_idx;
219       prev_st = pstr->cur_state;
220       /* Apply the translation if we need.  */
221       if (BE (pstr->trans != NULL, 0))
222 	{
223 	  int i, ch;
224 
225 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
226 	    {
227 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
228 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
229 	    }
230 	  p = (const char *) buf;
231 	}
232       else
233 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
234       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
235       if (BE (mbclen == (size_t) -2, 0))
236 	{
237 	  /* The buffer doesn't have enough space, finish to build.  */
238 	  pstr->cur_state = prev_st;
239 	  break;
240 	}
241       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
242 	{
243 	  /* We treat these cases as a singlebyte character.  */
244 	  mbclen = 1;
245 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
246 	  if (BE (pstr->trans != NULL, 0))
247 	    wc = pstr->trans[wc];
248 	  pstr->cur_state = prev_st;
249 	}
250 
251       /* Write wide character and padding.  */
252       pstr->wcs[byte_idx++] = wc;
253       /* Write paddings.  */
254       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
255 	pstr->wcs[byte_idx++] = WEOF;
256     }
257   pstr->valid_len = byte_idx;
258   pstr->valid_raw_len = byte_idx;
259 }
260 
261 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
262    but for REG_ICASE.  */
263 
264 static reg_errcode_t
265 internal_function
build_wcs_upper_buffer(re_string_t * pstr)266 build_wcs_upper_buffer (re_string_t *pstr)
267 {
268   mbstate_t prev_st;
269   Idx src_idx, byte_idx, end_idx, remain_len;
270   size_t mbclen;
271 #ifdef _LIBC
272   char buf[MB_LEN_MAX];
273   assert (MB_LEN_MAX >= pstr->mb_cur_max);
274 #else
275   char buf[64];
276 #endif
277 
278   byte_idx = pstr->valid_len;
279   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
280 
281   /* The following optimization assumes that ASCII characters can be
282      mapped to wide characters with a simple cast.  */
283   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
284     {
285       while (byte_idx < end_idx)
286 	{
287 	  wchar_t wc;
288 
289 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
290 	      && mbsinit (&pstr->cur_state))
291 	    {
292 	      /* In case of a singlebyte character.  */
293 	      pstr->mbs[byte_idx]
294 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
295 	      /* The next step uses the assumption that wchar_t is encoded
296 		 ASCII-safe: all ASCII values can be converted like this.  */
297 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
298 	      ++byte_idx;
299 	      continue;
300 	    }
301 
302 	  remain_len = end_idx - byte_idx;
303 	  prev_st = pstr->cur_state;
304 	  mbclen = mbrtowc (&wc,
305 			    ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
306 			     + byte_idx), remain_len, &pstr->cur_state);
307 	  if (BE ((size_t) (mbclen + 2) > 2, 1))
308 	    {
309 	      wchar_t wcu = wc;
310 	      if (iswlower (wc))
311 		{
312 		  size_t mbcdlen;
313 
314 		  wcu = towupper (wc);
315 		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
316 		  if (BE (mbclen == mbcdlen, 1))
317 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
318 		  else
319 		    {
320 		      src_idx = byte_idx;
321 		      goto offsets_needed;
322 		    }
323 		}
324 	      else
325 		memcpy (pstr->mbs + byte_idx,
326 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
327 	      pstr->wcs[byte_idx++] = wcu;
328 	      /* Write paddings.  */
329 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
330 		pstr->wcs[byte_idx++] = WEOF;
331 	    }
332 	  else if (mbclen == (size_t) -1 || mbclen == 0)
333 	    {
334 	      /* It is an invalid character or '\0'.  Just use the byte.  */
335 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
336 	      pstr->mbs[byte_idx] = ch;
337 	      /* And also cast it to wide char.  */
338 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
339 	      if (BE (mbclen == (size_t) -1, 0))
340 		pstr->cur_state = prev_st;
341 	    }
342 	  else
343 	    {
344 	      /* The buffer doesn't have enough space, finish to build.  */
345 	      pstr->cur_state = prev_st;
346 	      break;
347 	    }
348 	}
349       pstr->valid_len = byte_idx;
350       pstr->valid_raw_len = byte_idx;
351       return REG_NOERROR;
352     }
353   else
354     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
355       {
356 	wchar_t wc;
357 	const char *p;
358       offsets_needed:
359 	remain_len = end_idx - byte_idx;
360 	prev_st = pstr->cur_state;
361 	if (BE (pstr->trans != NULL, 0))
362 	  {
363 	    int i, ch;
364 
365 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
366 	      {
367 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
368 		buf[i] = pstr->trans[ch];
369 	      }
370 	    p = (const char *) buf;
371 	  }
372 	else
373 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
374 	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
375 	if (BE ((size_t) (mbclen + 2) > 2, 1))
376 	  {
377 	    wchar_t wcu = wc;
378 	    if (iswlower (wc))
379 	      {
380 		size_t mbcdlen;
381 
382 		wcu = towupper (wc);
383 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
384 		if (BE (mbclen == mbcdlen, 1))
385 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
386 		else if (mbcdlen != (size_t) -1)
387 		  {
388 		    size_t i;
389 
390 		    if (byte_idx + mbcdlen > pstr->bufs_len)
391 		      {
392 			pstr->cur_state = prev_st;
393 			break;
394 		      }
395 
396 		    if (pstr->offsets == NULL)
397 		      {
398 			pstr->offsets = re_xmalloc (Idx, pstr->bufs_len);
399 
400 			if (pstr->offsets == NULL)
401 			  return REG_ESPACE;
402 		      }
403 		    if (!pstr->offsets_needed)
404 		      {
405 			for (i = 0; i < (size_t) byte_idx; ++i)
406 			  pstr->offsets[i] = i;
407 			pstr->offsets_needed = 1;
408 		      }
409 
410 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
411 		    pstr->wcs[byte_idx] = wcu;
412 		    pstr->offsets[byte_idx] = src_idx;
413 		    for (i = 1; i < mbcdlen; ++i)
414 		      {
415 			pstr->offsets[byte_idx + i]
416 			  = src_idx + (i < mbclen ? i : mbclen - 1);
417 			pstr->wcs[byte_idx + i] = WEOF;
418 		      }
419 		    pstr->len += mbcdlen - mbclen;
420 		    if (pstr->raw_stop > src_idx)
421 		      pstr->stop += mbcdlen - mbclen;
422 		    end_idx = (pstr->bufs_len > pstr->len)
423 			      ? pstr->len : pstr->bufs_len;
424 		    byte_idx += mbcdlen;
425 		    src_idx += mbclen;
426 		    continue;
427 		  }
428                 else
429                   memcpy (pstr->mbs + byte_idx, p, mbclen);
430 	      }
431 	    else
432 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
433 
434 	    if (BE (pstr->offsets_needed != 0, 0))
435 	      {
436 		size_t i;
437 		for (i = 0; i < mbclen; ++i)
438 		  pstr->offsets[byte_idx + i] = src_idx + i;
439 	      }
440 	    src_idx += mbclen;
441 
442 	    pstr->wcs[byte_idx++] = wcu;
443 	    /* Write paddings.  */
444 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
445 	      pstr->wcs[byte_idx++] = WEOF;
446 	  }
447 	else if (mbclen == (size_t) -1 || mbclen == 0)
448 	  {
449 	    /* It is an invalid character or '\0'.  Just use the byte.  */
450 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
451 
452 	    if (BE (pstr->trans != NULL, 0))
453 	      ch = pstr->trans [ch];
454 	    pstr->mbs[byte_idx] = ch;
455 
456 	    if (BE (pstr->offsets_needed != 0, 0))
457 	      pstr->offsets[byte_idx] = src_idx;
458 	    ++src_idx;
459 
460 	    /* And also cast it to wide char.  */
461 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
462 	    if (BE (mbclen == (size_t) -1, 0))
463 	      pstr->cur_state = prev_st;
464 	  }
465 	else
466 	  {
467 	    /* The buffer doesn't have enough space, finish to build.  */
468 	    pstr->cur_state = prev_st;
469 	    break;
470 	  }
471       }
472   pstr->valid_len = byte_idx;
473   pstr->valid_raw_len = src_idx;
474   return REG_NOERROR;
475 }
476 
477 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
478    Return the index.  */
479 
480 static Idx
481 internal_function
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)482 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
483 {
484   mbstate_t prev_st;
485   Idx rawbuf_idx;
486   size_t mbclen;
487   wchar_t wc = 0;
488 
489   /* Skip the characters which are not necessary to check.  */
490   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
491        rawbuf_idx < new_raw_idx;)
492     {
493       Idx remain_len;
494       remain_len = pstr->len - rawbuf_idx;
495       prev_st = pstr->cur_state;
496       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
497 			remain_len, &pstr->cur_state);
498       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
499 	{
500 	  /* We treat these cases as a singlebyte character.  */
501 	  mbclen = 1;
502 	  pstr->cur_state = prev_st;
503 	}
504       /* Then proceed the next character.  */
505       rawbuf_idx += mbclen;
506     }
507   *last_wc = (wint_t) wc;
508   return rawbuf_idx;
509 }
510 #endif /* RE_ENABLE_I18N  */
511 
512 /* Build the buffer PSTR->MBS, and apply the translation if we need.
513    This function is used in case of REG_ICASE.  */
514 
515 static void
516 internal_function
build_upper_buffer(re_string_t * pstr)517 build_upper_buffer (re_string_t *pstr)
518 {
519   Idx char_idx, end_idx;
520   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
521 
522   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
523     {
524       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
525       if (BE (pstr->trans != NULL, 0))
526 	ch = pstr->trans[ch];
527       if (islower (ch))
528 	pstr->mbs[char_idx] = toupper (ch);
529       else
530 	pstr->mbs[char_idx] = ch;
531     }
532   pstr->valid_len = char_idx;
533   pstr->valid_raw_len = char_idx;
534 }
535 
536 /* Apply TRANS to the buffer in PSTR.  */
537 
538 static void
539 internal_function
re_string_translate_buffer(re_string_t * pstr)540 re_string_translate_buffer (re_string_t *pstr)
541 {
542   Idx buf_idx, end_idx;
543   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
544 
545   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
546     {
547       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
548       pstr->mbs[buf_idx] = pstr->trans[ch];
549     }
550 
551   pstr->valid_len = buf_idx;
552   pstr->valid_raw_len = buf_idx;
553 }
554 
555 /* This function re-construct the buffers.
556    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
557    convert to upper case in case of REG_ICASE, apply translation.  */
558 
559 static reg_errcode_t
560 internal_function
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)561 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
562 {
563   Idx offset;
564 
565   if (BE (pstr->raw_mbs_idx <= idx, 0))
566     offset = idx - pstr->raw_mbs_idx;
567   else
568     {
569       /* Reset buffer.  */
570 #ifdef RE_ENABLE_I18N
571       if (pstr->mb_cur_max > 1)
572 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
573 #endif /* RE_ENABLE_I18N */
574       pstr->len = pstr->raw_len;
575       pstr->stop = pstr->raw_stop;
576       pstr->valid_len = 0;
577       pstr->raw_mbs_idx = 0;
578       pstr->valid_raw_len = 0;
579       pstr->offsets_needed = 0;
580       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
581 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
582       if (!pstr->mbs_allocated)
583 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
584       offset = idx;
585     }
586 
587   if (BE (offset != 0, 1))
588     {
589       /* Are the characters which are already checked remain?  */
590       if (BE (offset < pstr->valid_raw_len, 1)
591 #ifdef RE_ENABLE_I18N
592 	  /* Handling this would enlarge the code too much.
593 	     Accept a slowdown in that case.  */
594 	  && pstr->offsets_needed == 0
595 #endif
596 	 )
597 	{
598 	  /* Yes, move them to the front of the buffer.  */
599 	  pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
600 #ifdef RE_ENABLE_I18N
601 	  if (pstr->mb_cur_max > 1)
602 	    memmove (pstr->wcs, pstr->wcs + offset,
603 		     (pstr->valid_len - offset) * sizeof (wint_t));
604 #endif /* RE_ENABLE_I18N */
605 	  if (BE (pstr->mbs_allocated, 0))
606 	    memmove (pstr->mbs, pstr->mbs + offset,
607 		     pstr->valid_len - offset);
608 	  pstr->valid_len -= offset;
609 	  pstr->valid_raw_len -= offset;
610 #if DEBUG
611 	  assert (pstr->valid_len > 0);
612 #endif
613 	}
614       else
615 	{
616 	  /* No, skip all characters until IDX.  */
617 #ifdef RE_ENABLE_I18N
618 	  if (BE (pstr->offsets_needed, 0))
619 	    {
620 	      pstr->len = pstr->raw_len - idx + offset;
621 	      pstr->stop = pstr->raw_stop - idx + offset;
622 	      pstr->offsets_needed = 0;
623 	    }
624 #endif
625 	  pstr->valid_len = 0;
626 	  pstr->valid_raw_len = 0;
627 #ifdef RE_ENABLE_I18N
628 	  if (pstr->mb_cur_max > 1)
629 	    {
630 	      Idx wcs_idx;
631 	      wint_t wc = WEOF;
632 
633 	      if (pstr->is_utf8)
634 		{
635 		  const unsigned char *raw, *p, *q, *end;
636 
637 		  /* Special case UTF-8.  Multi-byte chars start with any
638 		     byte other than 0x80 - 0xbf.  */
639 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
640 		  end = raw + (offset - pstr->mb_cur_max);
641 		  for (p = raw + offset - 1; p >= end; --p)
642 		    if ((*p & 0xc0) != 0x80)
643 		      {
644 			mbstate_t cur_state;
645 			wchar_t wc2;
646 			Idx mlen = raw + pstr->len - p;
647 			unsigned char buf[6];
648 			size_t mbclen;
649 
650 			q = p;
651 			if (BE (pstr->trans != NULL, 0))
652 			  {
653 			    int i = mlen < 6 ? mlen : 6;
654 			    while (--i >= 0)
655 			      buf[i] = pstr->trans[p[i]];
656 			    q = buf;
657 			  }
658 			/* XXX Don't use mbrtowc, we know which conversion
659 			   to use (UTF-8 -> UCS4).  */
660 			memset (&cur_state, 0, sizeof (cur_state));
661 			mbclen = mbrtowc (&wc2, (const char *) p, mlen,
662 					  &cur_state);
663 			if (raw + offset - p <= mbclen && mbclen < (size_t) -2)
664 			  {
665 			    memset (&pstr->cur_state, '\0',
666 				    sizeof (mbstate_t));
667 			    pstr->valid_len = mbclen - (raw + offset - p);
668 			    wc = wc2;
669 			  }
670 			break;
671 		      }
672 		}
673 
674 	      if (wc == WEOF)
675 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
676 	      if (BE (pstr->valid_len, 0))
677 		{
678 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
679 		    pstr->wcs[wcs_idx] = WEOF;
680 		  if (pstr->mbs_allocated)
681 		    memset (pstr->mbs, -1, pstr->valid_len);
682 		}
683 	      pstr->valid_raw_len = pstr->valid_len;
684 	      pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
685 				    && IS_WIDE_WORD_CHAR (wc))
686 				   ? CONTEXT_WORD
687 				   : ((IS_WIDE_NEWLINE (wc)
688 				       && pstr->newline_anchor)
689 				      ? CONTEXT_NEWLINE : 0));
690 	    }
691 	  else
692 #endif /* RE_ENABLE_I18N */
693 	    {
694 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
695 	      if (pstr->trans)
696 		c = pstr->trans[c];
697 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
698 				   ? CONTEXT_WORD
699 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
700 				      ? CONTEXT_NEWLINE : 0));
701 	    }
702 	}
703       if (!BE (pstr->mbs_allocated, 0))
704 	pstr->mbs += offset;
705     }
706   pstr->raw_mbs_idx = idx;
707   pstr->len -= offset;
708   pstr->stop -= offset;
709 
710   /* Then build the buffers.  */
711 #ifdef RE_ENABLE_I18N
712   if (pstr->mb_cur_max > 1)
713     {
714       if (pstr->icase)
715 	{
716 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
717 	  if (BE (ret != REG_NOERROR, 0))
718 	    return ret;
719 	}
720       else
721 	build_wcs_buffer (pstr);
722     }
723   else
724 #endif /* RE_ENABLE_I18N */
725   if (BE (pstr->mbs_allocated, 0))
726     {
727       if (pstr->icase)
728 	build_upper_buffer (pstr);
729       else if (pstr->trans != NULL)
730 	re_string_translate_buffer (pstr);
731     }
732   else
733     pstr->valid_len = pstr->len;
734 
735   pstr->cur_idx = 0;
736   return REG_NOERROR;
737 }
738 
739 static unsigned char
internal_function(pure)740 internal_function __attribute ((pure))
741 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
742 {
743   int ch;
744   Idx off;
745 
746   /* Handle the common (easiest) cases first.  */
747   if (BE (!pstr->mbs_allocated, 1))
748     return re_string_peek_byte (pstr, idx);
749 
750 #ifdef RE_ENABLE_I18N
751   if (pstr->mb_cur_max > 1
752       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
753     return re_string_peek_byte (pstr, idx);
754 #endif
755 
756   off = pstr->cur_idx + idx;
757 #ifdef RE_ENABLE_I18N
758   if (pstr->offsets_needed)
759     off = pstr->offsets[off];
760 #endif
761 
762   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
763 
764 #ifdef RE_ENABLE_I18N
765   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
766      this function returns CAPITAL LETTER I instead of first byte of
767      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
768      since peek_byte_case doesn't advance cur_idx in any way.  */
769   if (pstr->offsets_needed && !isascii (ch))
770     return re_string_peek_byte (pstr, idx);
771 #endif
772 
773   return ch;
774 }
775 
776 static unsigned char
internal_function(pure)777 internal_function __attribute ((pure))
778 re_string_fetch_byte_case (re_string_t *pstr)
779 {
780   if (BE (!pstr->mbs_allocated, 1))
781     return re_string_fetch_byte (pstr);
782 
783 #ifdef RE_ENABLE_I18N
784   if (pstr->offsets_needed)
785     {
786       Idx off;
787       int ch;
788 
789       /* For tr_TR.UTF-8 [[:islower:]] there is
790 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
791 	 in that case the whole multi-byte character and return
792 	 the original letter.  On the other side, with
793 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
794 	 anything else would complicate things too much.  */
795 
796       if (!re_string_first_byte (pstr, pstr->cur_idx))
797 	return re_string_fetch_byte (pstr);
798 
799       off = pstr->offsets[pstr->cur_idx];
800       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
801 
802       if (! isascii (ch))
803 	return re_string_fetch_byte (pstr);
804 
805       re_string_skip_bytes (pstr,
806 			    re_string_char_size_at (pstr, pstr->cur_idx));
807       return ch;
808     }
809 #endif
810 
811   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
812 }
813 
814 static void
815 internal_function
re_string_destruct(re_string_t * pstr)816 re_string_destruct (re_string_t *pstr)
817 {
818 #ifdef RE_ENABLE_I18N
819   re_free (pstr->wcs);
820   re_free (pstr->offsets);
821 #endif /* RE_ENABLE_I18N  */
822   if (pstr->mbs_allocated)
823     re_free (pstr->mbs);
824 }
825 
826 /* Return the context at IDX in INPUT.  */
827 
828 static unsigned int
829 internal_function
re_string_context_at(const re_string_t * input,Idx idx,int eflags)830 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
831 {
832   int c;
833   if (BE (! REG_VALID_INDEX (idx), 0))
834     /* In this case, we use the value stored in input->tip_context,
835        since we can't know the character in input->mbs[-1] here.  */
836     return input->tip_context;
837   if (BE (idx == input->len, 0))
838     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
839 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
840 #ifdef RE_ENABLE_I18N
841   if (input->mb_cur_max > 1)
842     {
843       wint_t wc;
844       Idx wc_idx = idx;
845       while(input->wcs[wc_idx] == WEOF)
846 	{
847 #ifdef DEBUG
848 	  /* It must not happen.  */
849 	  assert (REG_VALID_INDEX (wc_idx));
850 #endif
851 	  --wc_idx;
852 	  if (! REG_VALID_INDEX (wc_idx))
853 	    return input->tip_context;
854 	}
855       wc = input->wcs[wc_idx];
856       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
857 	return CONTEXT_WORD;
858       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
859 	      ? CONTEXT_NEWLINE : 0);
860     }
861   else
862 #endif
863     {
864       c = re_string_byte_at (input, idx);
865       if (bitset_contain (input->word_char, c))
866 	return CONTEXT_WORD;
867       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
868     }
869 }
870 
871 /* Functions for set operation.  */
872 
873 static reg_errcode_t
874 internal_function
re_node_set_alloc(re_node_set * set,Idx size)875 re_node_set_alloc (re_node_set *set, Idx size)
876 {
877   set->alloc = size;
878   set->nelem = 0;
879   set->elems = re_xmalloc (Idx, size);
880   if (BE (set->elems == NULL, 0))
881     return REG_ESPACE;
882   return REG_NOERROR;
883 }
884 
885 static reg_errcode_t
886 internal_function
re_node_set_init_1(re_node_set * set,Idx elem)887 re_node_set_init_1 (re_node_set *set, Idx elem)
888 {
889   set->alloc = 1;
890   set->nelem = 1;
891   set->elems = re_malloc (Idx, 1);
892   if (BE (set->elems == NULL, 0))
893     {
894       set->alloc = set->nelem = 0;
895       return REG_ESPACE;
896     }
897   set->elems[0] = elem;
898   return REG_NOERROR;
899 }
900 
901 static reg_errcode_t
902 internal_function
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)903 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
904 {
905   set->alloc = 2;
906   set->elems = re_malloc (Idx, 2);
907   if (BE (set->elems == NULL, 0))
908     return REG_ESPACE;
909   if (elem1 == elem2)
910     {
911       set->nelem = 1;
912       set->elems[0] = elem1;
913     }
914   else
915     {
916       set->nelem = 2;
917       if (elem1 < elem2)
918 	{
919 	  set->elems[0] = elem1;
920 	  set->elems[1] = elem2;
921 	}
922       else
923 	{
924 	  set->elems[0] = elem2;
925 	  set->elems[1] = elem1;
926 	}
927     }
928   return REG_NOERROR;
929 }
930 
931 static reg_errcode_t
932 internal_function
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)933 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
934 {
935   dest->nelem = src->nelem;
936   if (src->nelem > 0)
937     {
938       dest->alloc = dest->nelem;
939       dest->elems = re_malloc (Idx, dest->alloc);
940       if (BE (dest->elems == NULL, 0))
941 	{
942 	  dest->alloc = dest->nelem = 0;
943 	  return REG_ESPACE;
944 	}
945       memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
946     }
947   else
948     re_node_set_init_empty (dest);
949   return REG_NOERROR;
950 }
951 
952 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
953    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
954    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
955 
956 static reg_errcode_t
957 internal_function
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)958 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
959 			   const re_node_set *src2)
960 {
961   Idx i1, i2, is, id, delta, sbase;
962   if (src1->nelem == 0 || src2->nelem == 0)
963     return REG_NOERROR;
964 
965   /* We need dest->nelem + 2 * elems_in_intersection; this is a
966      conservative estimate.  */
967   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
968     {
969       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
970       Idx *new_elems;
971       if (sizeof (Idx) < 3
972 	  && (new_alloc < dest->alloc
973 	      || ((Idx) (src1->nelem + src2->nelem) < src1->nelem)))
974 	return REG_ESPACE;
975       new_elems = re_xrealloc (dest->elems, Idx, new_alloc);
976       if (BE (new_elems == NULL, 0))
977         return REG_ESPACE;
978       dest->elems = new_elems;
979       dest->alloc = new_alloc;
980     }
981 
982   /* Find the items in the intersection of SRC1 and SRC2, and copy
983      into the top of DEST those that are not already in DEST itself.  */
984   sbase = dest->nelem + src1->nelem + src2->nelem;
985   i1 = src1->nelem - 1;
986   i2 = src2->nelem - 1;
987   id = dest->nelem - 1;
988   for (;;)
989     {
990       if (src1->elems[i1] == src2->elems[i2])
991 	{
992 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
993 	  while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
994 	    --id;
995 
996           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
997             dest->elems[--sbase] = src1->elems[i1];
998 
999 	  if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1000 	    break;
1001 	}
1002 
1003       /* Lower the highest of the two items.  */
1004       else if (src1->elems[i1] < src2->elems[i2])
1005 	{
1006 	  if (! REG_VALID_INDEX (--i2))
1007 	    break;
1008 	}
1009       else
1010 	{
1011 	  if (! REG_VALID_INDEX (--i1))
1012 	    break;
1013 	}
1014     }
1015 
1016   id = dest->nelem - 1;
1017   is = dest->nelem + src1->nelem + src2->nelem - 1;
1018   delta = is - sbase + 1;
1019 
1020   /* Now copy.  When DELTA becomes zero, the remaining
1021      DEST elements are already in place; this is more or
1022      less the same loop that is in re_node_set_merge.  */
1023   dest->nelem += delta;
1024   if (delta > 0 && REG_VALID_INDEX (id))
1025     for (;;)
1026       {
1027         if (dest->elems[is] > dest->elems[id])
1028           {
1029             /* Copy from the top.  */
1030             dest->elems[id + delta--] = dest->elems[is--];
1031             if (delta == 0)
1032               break;
1033           }
1034         else
1035           {
1036             /* Slide from the bottom.  */
1037             dest->elems[id + delta] = dest->elems[id];
1038             if (! REG_VALID_INDEX (--id))
1039               break;
1040           }
1041       }
1042 
1043   /* Copy remaining SRC elements.  */
1044   memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]);
1045 
1046   return REG_NOERROR;
1047 }
1048 
1049 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1050    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1051 
1052 static reg_errcode_t
1053 internal_function
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1054 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1055 			const re_node_set *src2)
1056 {
1057   Idx i1, i2, id;
1058   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1059     {
1060       dest->alloc = src1->nelem + src2->nelem;
1061       if (sizeof (Idx) < 2 && dest->alloc < src1->nelem)
1062 	return REG_ESPACE;
1063       dest->elems = re_xmalloc (Idx, dest->alloc);
1064       if (BE (dest->elems == NULL, 0))
1065 	return REG_ESPACE;
1066     }
1067   else
1068     {
1069       if (src1 != NULL && src1->nelem > 0)
1070 	return re_node_set_init_copy (dest, src1);
1071       else if (src2 != NULL && src2->nelem > 0)
1072 	return re_node_set_init_copy (dest, src2);
1073       else
1074 	re_node_set_init_empty (dest);
1075       return REG_NOERROR;
1076     }
1077   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1078     {
1079       if (src1->elems[i1] > src2->elems[i2])
1080 	{
1081 	  dest->elems[id++] = src2->elems[i2++];
1082 	  continue;
1083 	}
1084       if (src1->elems[i1] == src2->elems[i2])
1085 	++i2;
1086       dest->elems[id++] = src1->elems[i1++];
1087     }
1088   if (i1 < src1->nelem)
1089     {
1090       memcpy (dest->elems + id, src1->elems + i1,
1091 	     (src1->nelem - i1) * sizeof dest->elems[0]);
1092       id += src1->nelem - i1;
1093     }
1094   else if (i2 < src2->nelem)
1095     {
1096       memcpy (dest->elems + id, src2->elems + i2,
1097 	     (src2->nelem - i2) * sizeof dest->elems[0]);
1098       id += src2->nelem - i2;
1099     }
1100   dest->nelem = id;
1101   return REG_NOERROR;
1102 }
1103 
1104 /* Calculate the union set of the sets DEST and SRC. And store it to
1105    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1106 
1107 static reg_errcode_t
1108 internal_function
re_node_set_merge(re_node_set * dest,const re_node_set * src)1109 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1110 {
1111   Idx is, id, sbase, delta;
1112   if (src == NULL || src->nelem == 0)
1113     return REG_NOERROR;
1114   if (sizeof (Idx) < 3
1115       && ((Idx) (2 * src->nelem) < src->nelem
1116 	  || (Idx) (2 * src->nelem + dest->nelem) < dest->nelem))
1117     return REG_ESPACE;
1118   if (dest->alloc < 2 * src->nelem + dest->nelem)
1119     {
1120       Idx new_alloc = src->nelem + dest->alloc;
1121       Idx *new_buffer;
1122       if (sizeof (Idx) < 4 && new_alloc < dest->alloc)
1123 	return REG_ESPACE;
1124       new_buffer = re_x2realloc (dest->elems, Idx, &new_alloc);
1125       if (BE (new_buffer == NULL, 0))
1126 	return REG_ESPACE;
1127       dest->elems = new_buffer;
1128       dest->alloc = new_alloc;
1129     }
1130 
1131   if (BE (dest->nelem == 0, 0))
1132     {
1133       dest->nelem = src->nelem;
1134       memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
1135       return REG_NOERROR;
1136     }
1137 
1138   /* Copy into the top of DEST the items of SRC that are not
1139      found in DEST.  Maybe we could binary search in DEST?  */
1140   for (sbase = dest->nelem + 2 * src->nelem,
1141        is = src->nelem - 1, id = dest->nelem - 1;
1142        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1143     {
1144       if (dest->elems[id] == src->elems[is])
1145         is--, id--;
1146       else if (dest->elems[id] < src->elems[is])
1147         dest->elems[--sbase] = src->elems[is--];
1148       else /* if (dest->elems[id] > src->elems[is]) */
1149         --id;
1150     }
1151 
1152   if (REG_VALID_INDEX (is))
1153     {
1154       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1155       sbase -= is + 1;
1156       memcpy (dest->elems + sbase, src->elems,
1157 	      (is + 1) * sizeof dest->elems[0]);
1158     }
1159 
1160   id = dest->nelem - 1;
1161   is = dest->nelem + 2 * src->nelem - 1;
1162   delta = is - sbase + 1;
1163   if (delta == 0)
1164     return REG_NOERROR;
1165 
1166   /* Now copy.  When DELTA becomes zero, the remaining
1167      DEST elements are already in place.  */
1168   dest->nelem += delta;
1169   for (;;)
1170     {
1171       if (dest->elems[is] > dest->elems[id])
1172         {
1173 	  /* Copy from the top.  */
1174           dest->elems[id + delta--] = dest->elems[is--];
1175 	  if (delta == 0)
1176 	    break;
1177 	}
1178       else
1179         {
1180           /* Slide from the bottom.  */
1181           dest->elems[id + delta] = dest->elems[id];
1182 	  if (! REG_VALID_INDEX (--id))
1183 	    {
1184 	      /* Copy remaining SRC elements.  */
1185 	      memcpy (dest->elems, dest->elems + sbase,
1186 	              delta * sizeof dest->elems[0]);
1187 	      break;
1188 	    }
1189 	}
1190     }
1191 
1192   return REG_NOERROR;
1193 }
1194 
1195 /* Insert the new element ELEM to the re_node_set* SET.
1196    SET should not already have ELEM.
1197    Return true if successful.  */
1198 
1199 static bool
1200 internal_function
re_node_set_insert(re_node_set * set,Idx elem)1201 re_node_set_insert (re_node_set *set, Idx elem)
1202 {
1203   Idx idx;
1204   /* In case the set is empty.  */
1205   if (set->alloc == 0)
1206     return re_node_set_init_1 (set, elem) == REG_NOERROR;
1207 
1208   if (BE (set->nelem, 0) == 0)
1209     {
1210       /* We already guaranteed above that set->alloc != 0.  */
1211       set->elems[0] = elem;
1212       ++set->nelem;
1213       return true;
1214     }
1215 
1216   /* Realloc if we need.  */
1217   if (set->alloc == set->nelem)
1218     {
1219       Idx *new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1220       if (BE (new_elems == NULL, 0))
1221 	return false;
1222       set->elems = new_elems;
1223     }
1224 
1225   /* Move the elements which follows the new element.  Test the
1226      first element separately to skip a check in the inner loop.  */
1227   if (elem < set->elems[0])
1228     {
1229       idx = 0;
1230       for (idx = set->nelem; idx > 0; idx--)
1231         set->elems[idx] = set->elems[idx - 1];
1232     }
1233   else
1234     {
1235       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1236         set->elems[idx] = set->elems[idx - 1];
1237     }
1238 
1239   /* Insert the new element.  */
1240   set->elems[idx] = elem;
1241   ++set->nelem;
1242   return true;
1243 }
1244 
1245 /* Insert the new element ELEM to the re_node_set* SET.
1246    SET should not already have any element greater than or equal to ELEM.
1247    Return true if successful.  */
1248 
1249 static bool
1250 internal_function
re_node_set_insert_last(re_node_set * set,Idx elem)1251 re_node_set_insert_last (re_node_set *set, Idx elem)
1252 {
1253   /* Realloc if we need.  */
1254   if (set->alloc == set->nelem)
1255     {
1256       Idx *new_elems;
1257       new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1258       if (BE (new_elems == NULL, 0))
1259 	return false;
1260       set->elems = new_elems;
1261     }
1262 
1263   /* Insert the new element.  */
1264   set->elems[set->nelem++] = elem;
1265   return true;
1266 }
1267 
1268 /* Compare two node sets SET1 and SET2.
1269    Return true if SET1 and SET2 are equivalent.  */
1270 
1271 static bool
internal_function(pure)1272 internal_function __attribute ((pure))
1273 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1274 {
1275   Idx i;
1276   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1277     return false;
1278   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1279     if (set1->elems[i] != set2->elems[i])
1280       return false;
1281   return true;
1282 }
1283 
1284 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1285 
1286 static Idx
internal_function(pure)1287 internal_function __attribute ((pure))
1288 re_node_set_contains (const re_node_set *set, Idx elem)
1289 {
1290   __re_size_t idx, right, mid;
1291   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1292     return 0;
1293 
1294   /* Binary search the element.  */
1295   idx = 0;
1296   right = set->nelem - 1;
1297   while (idx < right)
1298     {
1299       mid = (idx + right) / 2;
1300       if (set->elems[mid] < elem)
1301 	idx = mid + 1;
1302       else
1303 	right = mid;
1304     }
1305   return set->elems[idx] == elem ? idx + 1 : 0;
1306 }
1307 
1308 static void
1309 internal_function
re_node_set_remove_at(re_node_set * set,Idx idx)1310 re_node_set_remove_at (re_node_set *set, Idx idx)
1311 {
1312   if (idx < 0 || idx >= set->nelem)
1313     return;
1314   --set->nelem;
1315   for (; idx < set->nelem; idx++)
1316     set->elems[idx] = set->elems[idx + 1];
1317 }
1318 
1319 
1320 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1321    Or return REG_MISSING if an error occurred.  */
1322 
1323 static Idx
1324 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1325 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1326 {
1327   int type = token.type;
1328   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1329     {
1330       Idx new_nodes_alloc = dfa->nodes_alloc;
1331       Idx *new_nexts, *new_indices;
1332       re_node_set *new_edests, *new_eclosures;
1333 
1334       re_token_t *new_nodes = re_x2realloc (dfa->nodes, re_token_t,
1335 					    &new_nodes_alloc);
1336       if (BE (new_nodes == NULL, 0))
1337 	return REG_MISSING;
1338       dfa->nodes = new_nodes;
1339       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1340       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1341       new_edests = re_xrealloc (dfa->edests, re_node_set, new_nodes_alloc);
1342       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1343       if (BE (new_nexts == NULL || new_indices == NULL
1344 	      || new_edests == NULL || new_eclosures == NULL, 0))
1345 	return REG_MISSING;
1346       dfa->nexts = new_nexts;
1347       dfa->org_indices = new_indices;
1348       dfa->edests = new_edests;
1349       dfa->eclosures = new_eclosures;
1350       dfa->nodes_alloc = new_nodes_alloc;
1351     }
1352   dfa->nodes[dfa->nodes_len] = token;
1353   dfa->nodes[dfa->nodes_len].constraint = 0;
1354 #ifdef RE_ENABLE_I18N
1355   dfa->nodes[dfa->nodes_len].accept_mb =
1356     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1357 #endif
1358   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1359   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1360   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1361   return dfa->nodes_len++;
1362 }
1363 
1364 static inline re_hashval_t
1365 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1366 calc_state_hash (const re_node_set *nodes, unsigned int context)
1367 {
1368   re_hashval_t hash = nodes->nelem + context;
1369   Idx i;
1370   for (i = 0 ; i < nodes->nelem ; i++)
1371     hash += nodes->elems[i];
1372   return hash;
1373 }
1374 
1375 /* Search for the state whose node_set is equivalent to NODES.
1376    Return the pointer to the state, if we found it in the DFA.
1377    Otherwise create the new one and return it.  In case of an error
1378    return NULL and set the error code in ERR.
1379    Note: - We assume NULL as the invalid state, then it is possible that
1380 	   return value is NULL and ERR is REG_NOERROR.
1381 	 - We never return non-NULL value in case of any errors, it is for
1382 	   optimization.  */
1383 
1384 static re_dfastate_t*
1385 internal_function
re_acquire_state(reg_errcode_t * err,re_dfa_t * dfa,const re_node_set * nodes)1386 re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1387 {
1388   re_hashval_t hash;
1389   re_dfastate_t *new_state;
1390   struct re_state_table_entry *spot;
1391   Idx i;
1392 #ifdef lint
1393   /* Suppress bogus uninitialized-variable warnings.  */
1394   *err = REG_NOERROR;
1395 #endif
1396   if (BE (nodes->nelem == 0, 0))
1397     {
1398       *err = REG_NOERROR;
1399       return NULL;
1400     }
1401   hash = calc_state_hash (nodes, 0);
1402   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1403 
1404   for (i = 0 ; i < spot->num ; i++)
1405     {
1406       re_dfastate_t *state = spot->array[i];
1407       if (hash != state->hash)
1408 	continue;
1409       if (re_node_set_compare (&state->nodes, nodes))
1410 	return state;
1411     }
1412 
1413   /* There are no appropriate state in the dfa, create the new one.  */
1414   new_state = create_ci_newstate (dfa, nodes, hash);
1415   if (BE (new_state != NULL, 1))
1416     return new_state;
1417   else
1418     {
1419       *err = REG_ESPACE;
1420       return NULL;
1421     }
1422 }
1423 
1424 /* Search for the state whose node_set is equivalent to NODES and
1425    whose context is equivalent to CONTEXT.
1426    Return the pointer to the state, if we found it in the DFA.
1427    Otherwise create the new one and return it.  In case of an error
1428    return NULL and set the error code in ERR.
1429    Note: - We assume NULL as the invalid state, then it is possible that
1430 	   return value is NULL and ERR is REG_NOERROR.
1431 	 - We never return non-NULL value in case of any errors, it is for
1432 	   optimization.  */
1433 
1434 static re_dfastate_t*
1435 internal_function
re_acquire_state_context(reg_errcode_t * err,re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1436 re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1437 			  const re_node_set *nodes, unsigned int context)
1438 {
1439   re_hashval_t hash;
1440   re_dfastate_t *new_state;
1441   struct re_state_table_entry *spot;
1442   Idx i;
1443 #ifdef lint
1444   /* Suppress bogus uninitialized-variable warnings.  */
1445   *err = REG_NOERROR;
1446 #endif
1447   if (nodes->nelem == 0)
1448     {
1449       *err = REG_NOERROR;
1450       return NULL;
1451     }
1452   hash = calc_state_hash (nodes, context);
1453   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1454 
1455   for (i = 0 ; i < spot->num ; i++)
1456     {
1457       re_dfastate_t *state = spot->array[i];
1458       if (state->hash == hash
1459 	  && state->context == context
1460 	  && re_node_set_compare (state->entrance_nodes, nodes))
1461 	return state;
1462     }
1463   /* There are no appropriate state in `dfa', create the new one.  */
1464   new_state = create_cd_newstate (dfa, nodes, context, hash);
1465   if (BE (new_state != NULL, 1))
1466     return new_state;
1467   else
1468     {
1469       *err = REG_ESPACE;
1470       return NULL;
1471     }
1472 }
1473 
1474 /* Finish initialization of the new state NEWSTATE, and using its hash value
1475    HASH put in the appropriate bucket of DFA's state table.  Return value
1476    indicates the error code if failed.  */
1477 
1478 static reg_errcode_t
1479 internal_function
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1480 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash)
1481 {
1482   struct re_state_table_entry *spot;
1483   reg_errcode_t err;
1484   Idx i;
1485 
1486   newstate->hash = hash;
1487   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1488   if (BE (err != REG_NOERROR, 0))
1489     return REG_ESPACE;
1490   for (i = 0; i < newstate->nodes.nelem; i++)
1491     {
1492       Idx elem = newstate->nodes.elems[i];
1493       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1494 	{
1495 	  bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1496 	  if (BE (! ok, 0))
1497 	    return REG_ESPACE;
1498 	}
1499     }
1500 
1501   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1502   if (BE (spot->alloc <= spot->num, 0))
1503     {
1504       Idx new_alloc = spot->num;
1505       re_dfastate_t **new_array = re_x2realloc (spot->array, re_dfastate_t *,
1506 						&new_alloc);
1507       if (BE (new_array == NULL, 0))
1508 	return REG_ESPACE;
1509       spot->array = new_array;
1510       spot->alloc = new_alloc;
1511     }
1512   spot->array[spot->num++] = newstate;
1513   return REG_NOERROR;
1514 }
1515 
1516 /* Create the new state which is independ of contexts.
1517    Return the new state if succeeded, otherwise return NULL.  */
1518 
1519 static re_dfastate_t *
1520 internal_function
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1521 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1522 		    re_hashval_t hash)
1523 {
1524   Idx i;
1525   reg_errcode_t err;
1526   re_dfastate_t *newstate;
1527 
1528   newstate = re_calloc (re_dfastate_t, 1);
1529   if (BE (newstate == NULL, 0))
1530     return NULL;
1531   err = re_node_set_init_copy (&newstate->nodes, nodes);
1532   if (BE (err != REG_NOERROR, 0))
1533     {
1534       re_free (newstate);
1535       return NULL;
1536     }
1537 
1538   newstate->entrance_nodes = &newstate->nodes;
1539   for (i = 0 ; i < nodes->nelem ; i++)
1540     {
1541       re_token_t *node = dfa->nodes + nodes->elems[i];
1542       re_token_type_t type = node->type;
1543       if (type == CHARACTER && !node->constraint)
1544 	continue;
1545 #ifdef RE_ENABLE_I18N
1546       newstate->accept_mb |= node->accept_mb;
1547 #endif /* RE_ENABLE_I18N */
1548 
1549       /* If the state has the halt node, the state is a halt state.  */
1550       if (type == END_OF_RE)
1551 	newstate->halt = 1;
1552       else if (type == OP_BACK_REF)
1553 	newstate->has_backref = 1;
1554       else if (type == ANCHOR || node->constraint)
1555 	newstate->has_constraint = 1;
1556     }
1557   err = register_state (dfa, newstate, hash);
1558   if (BE (err != REG_NOERROR, 0))
1559     {
1560       free_state (newstate);
1561       newstate = NULL;
1562     }
1563   return newstate;
1564 }
1565 
1566 /* Create the new state which is depend on the context CONTEXT.
1567    Return the new state if succeeded, otherwise return NULL.  */
1568 
1569 static re_dfastate_t *
1570 internal_function
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1571 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1572 		    unsigned int context, re_hashval_t hash)
1573 {
1574   Idx i, nctx_nodes = 0;
1575   reg_errcode_t err;
1576   re_dfastate_t *newstate;
1577 
1578   newstate = re_calloc (re_dfastate_t, 1);
1579   if (BE (newstate == NULL, 0))
1580     return NULL;
1581   err = re_node_set_init_copy (&newstate->nodes, nodes);
1582   if (BE (err != REG_NOERROR, 0))
1583     {
1584       re_free (newstate);
1585       return NULL;
1586     }
1587 
1588   newstate->context = context;
1589   newstate->entrance_nodes = &newstate->nodes;
1590 
1591   for (i = 0 ; i < nodes->nelem ; i++)
1592     {
1593       unsigned int constraint = 0;
1594       re_token_t *node = dfa->nodes + nodes->elems[i];
1595       re_token_type_t type = node->type;
1596       if (node->constraint)
1597 	constraint = node->constraint;
1598 
1599       if (type == CHARACTER && !constraint)
1600 	continue;
1601 #ifdef RE_ENABLE_I18N
1602       newstate->accept_mb |= node->accept_mb;
1603 #endif /* RE_ENABLE_I18N */
1604 
1605       /* If the state has the halt node, the state is a halt state.  */
1606       if (type == END_OF_RE)
1607 	newstate->halt = 1;
1608       else if (type == OP_BACK_REF)
1609 	newstate->has_backref = 1;
1610       else if (type == ANCHOR)
1611 	constraint = node->opr.ctx_type;
1612 
1613       if (constraint)
1614 	{
1615 	  if (newstate->entrance_nodes == &newstate->nodes)
1616 	    {
1617 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1618 	      if (BE (newstate->entrance_nodes == NULL, 0))
1619 		{
1620 		  free_state (newstate);
1621 		  return NULL;
1622 		}
1623 	      re_node_set_init_copy (newstate->entrance_nodes, nodes);
1624 	      nctx_nodes = 0;
1625 	      newstate->has_constraint = 1;
1626 	    }
1627 
1628 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1629 	    {
1630 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1631 	      ++nctx_nodes;
1632 	    }
1633 	}
1634     }
1635   err = register_state (dfa, newstate, hash);
1636   if (BE (err != REG_NOERROR, 0))
1637     {
1638       free_state (newstate);
1639       newstate = NULL;
1640     }
1641   return  newstate;
1642 }
1643 
1644 static void
1645 internal_function
free_state(re_dfastate_t * state)1646 free_state (re_dfastate_t *state)
1647 {
1648   re_node_set_free (&state->non_eps_nodes);
1649   re_node_set_free (&state->inveclosure);
1650   if (state->entrance_nodes != &state->nodes)
1651     {
1652       re_node_set_free (state->entrance_nodes);
1653       re_free (state->entrance_nodes);
1654     }
1655   re_node_set_free (&state->nodes);
1656   re_free (state->word_trtable);
1657   re_free (state->trtable);
1658   re_free (state);
1659 }
1660