xref: /dflybsd-src/contrib/diffutils/lib/regex_internal.c (revision f3c0e43315ed177c30bea14922610325ba413b22)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2018 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU General Public
8    License as published by the Free Software Foundation; either
9    version 3 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    General Public License for more details.
15 
16    You should have received a copy of the GNU General Public
17    License along with the GNU C Library; if not, see
18    <https://www.gnu.org/licenses/>.  */
19 
/* Forward declarations of file-local (static) helpers.  The string
   helpers declared here are defined further down in this file; the
   create_*_newstate definitions are not visible in this part of the
   file and are presumably defined later.  build_wcs_buffer and
   build_wcs_upper_buffer exist only when RE_ENABLE_I18N is defined.  */
20 static void re_string_construct_common (const char *str, Idx len,
21 					re_string_t *pstr,
22 					RE_TRANSLATE_TYPE trans, bool icase,
23 					const re_dfa_t *dfa);
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 					  const re_node_set *nodes,
26 					  re_hashval_t hash);
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 					  const re_node_set *nodes,
29 					  unsigned int context,
30 					  re_hashval_t hash);
31 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 						Idx new_buf_len);
33 #ifdef RE_ENABLE_I18N
34 static void build_wcs_buffer (re_string_t *pstr);
35 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36 #endif /* RE_ENABLE_I18N */
37 static void build_upper_buffer (re_string_t *pstr);
38 static void re_string_translate_buffer (re_string_t *pstr);
/* Declared pure: the definition below only reads INPUT and returns a
   context value.  */
39 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40 					  int eflags) __attribute__ ((pure));
41 
42 /* Functions for string operation.  */
43 
44 /* This function allocate the buffers.  It is necessary to call
45    re_string_reconstruct before using the object.  */
46 
47 static reg_errcode_t
48 __attribute_warn_unused_result__
49 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
50 		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
51 {
52   reg_errcode_t ret;
53   Idx init_buf_len;
54 
55   /* Ensure at least one character fits into the buffers.  */
56   if (init_len < dfa->mb_cur_max)
57     init_len = dfa->mb_cur_max;
58   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
59   re_string_construct_common (str, len, pstr, trans, icase, dfa);
60 
61   ret = re_string_realloc_buffers (pstr, init_buf_len);
62   if (__glibc_unlikely (ret != REG_NOERROR))
63     return ret;
64 
65   pstr->word_char = dfa->word_char;
66   pstr->word_ops_used = dfa->word_ops_used;
67   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
69   pstr->valid_raw_len = pstr->valid_len;
70   return REG_NOERROR;
71 }
72 
73 /* This function allocate the buffers, and initialize them.  */
74 
75 static reg_errcode_t
76 __attribute_warn_unused_result__
77 re_string_construct (re_string_t *pstr, const char *str, Idx len,
78 		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
79 {
80   reg_errcode_t ret;
81   memset (pstr, '\0', sizeof (re_string_t));
82   re_string_construct_common (str, len, pstr, trans, icase, dfa);
83 
84   if (len > 0)
85     {
86       ret = re_string_realloc_buffers (pstr, len + 1);
87       if (__glibc_unlikely (ret != REG_NOERROR))
88 	return ret;
89     }
90   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
91 
92   if (icase)
93     {
94 #ifdef RE_ENABLE_I18N
95       if (dfa->mb_cur_max > 1)
96 	{
97 	  while (1)
98 	    {
99 	      ret = build_wcs_upper_buffer (pstr);
100 	      if (__glibc_unlikely (ret != REG_NOERROR))
101 		return ret;
102 	      if (pstr->valid_raw_len >= len)
103 		break;
104 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105 		break;
106 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
107 	      if (__glibc_unlikely (ret != REG_NOERROR))
108 		return ret;
109 	    }
110 	}
111       else
112 #endif /* RE_ENABLE_I18N  */
113 	build_upper_buffer (pstr);
114     }
115   else
116     {
117 #ifdef RE_ENABLE_I18N
118       if (dfa->mb_cur_max > 1)
119 	build_wcs_buffer (pstr);
120       else
121 #endif /* RE_ENABLE_I18N  */
122 	{
123 	  if (trans != NULL)
124 	    re_string_translate_buffer (pstr);
125 	  else
126 	    {
127 	      pstr->valid_len = pstr->bufs_len;
128 	      pstr->valid_raw_len = pstr->bufs_len;
129 	    }
130 	}
131     }
132 
133   return REG_NOERROR;
134 }
135 
136 /* Helper functions for re_string_allocate, and re_string_construct.  */
137 
138 static reg_errcode_t
139 __attribute_warn_unused_result__
140 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
141 {
142 #ifdef RE_ENABLE_I18N
143   if (pstr->mb_cur_max > 1)
144     {
145       wint_t *new_wcs;
146 
147       /* Avoid overflow in realloc.  */
148       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
150 			    < new_buf_len))
151 	return REG_ESPACE;
152 
153       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
154       if (__glibc_unlikely (new_wcs == NULL))
155 	return REG_ESPACE;
156       pstr->wcs = new_wcs;
157       if (pstr->offsets != NULL)
158 	{
159 	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
160 	  if (__glibc_unlikely (new_offsets == NULL))
161 	    return REG_ESPACE;
162 	  pstr->offsets = new_offsets;
163 	}
164     }
165 #endif /* RE_ENABLE_I18N  */
166   if (pstr->mbs_allocated)
167     {
168       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
169 					   new_buf_len);
170       if (__glibc_unlikely (new_mbs == NULL))
171 	return REG_ESPACE;
172       pstr->mbs = new_mbs;
173     }
174   pstr->bufs_len = new_buf_len;
175   return REG_NOERROR;
176 }
177 
178 
179 static void
180 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
181 			    RE_TRANSLATE_TYPE trans, bool icase,
182 			    const re_dfa_t *dfa)
183 {
184   pstr->raw_mbs = (const unsigned char *) str;
185   pstr->len = len;
186   pstr->raw_len = len;
187   pstr->trans = trans;
188   pstr->icase = icase;
189   pstr->mbs_allocated = (trans != NULL || icase);
190   pstr->mb_cur_max = dfa->mb_cur_max;
191   pstr->is_utf8 = dfa->is_utf8;
192   pstr->map_notascii = dfa->map_notascii;
193   pstr->stop = pstr->len;
194   pstr->raw_stop = pstr->stop;
195 }
196 
197 #ifdef RE_ENABLE_I18N
198 
199 /* Build wide character buffer PSTR->WCS.
200    If the byte sequence of the string are:
201      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
202    Then wide character buffer will be:
203      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
204    We use WEOF for padding, they indicate that the position isn't
205    a first byte of a multibyte character.
206 
207    Note that this function assumes PSTR->VALID_LEN elements are already
208    built and starts from PSTR->VALID_LEN.  */
209 
210 static void
211 build_wcs_buffer (re_string_t *pstr)
212 {
213 #ifdef _LIBC
214   unsigned char buf[MB_LEN_MAX];
215   assert (MB_LEN_MAX >= pstr->mb_cur_max);
216 #else
217   unsigned char buf[64];
218 #endif
219   mbstate_t prev_st;
220   Idx byte_idx, end_idx, remain_len;
221   size_t mbclen;
222 
223   /* Build the buffers from pstr->valid_len to either pstr->len or
224      pstr->bufs_len.  */
225   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
226   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
227     {
228       wchar_t wc;
229       const char *p;
230 
231       remain_len = end_idx - byte_idx;
232       prev_st = pstr->cur_state;
233       /* Apply the translation if we need.  */
234       if (__glibc_unlikely (pstr->trans != NULL))
235 	{
236 	  int i, ch;
237 
238 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
239 	    {
240 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
241 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
242 	    }
243 	  p = (const char *) buf;
244 	}
245       else
246 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
247       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
248       if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
249 			    || (mbclen == (size_t) -2
250 				&& pstr->bufs_len >= pstr->len)))
251 	{
252 	  /* We treat these cases as a singlebyte character.  */
253 	  mbclen = 1;
254 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
255 	  if (__glibc_unlikely (pstr->trans != NULL))
256 	    wc = pstr->trans[wc];
257 	  pstr->cur_state = prev_st;
258 	}
259       else if (__glibc_unlikely (mbclen == (size_t) -2))
260 	{
261 	  /* The buffer doesn't have enough space, finish to build.  */
262 	  pstr->cur_state = prev_st;
263 	  break;
264 	}
265 
266       /* Write wide character and padding.  */
267       pstr->wcs[byte_idx++] = wc;
268       /* Write paddings.  */
269       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
270 	pstr->wcs[byte_idx++] = WEOF;
271     }
272   pstr->valid_len = byte_idx;
273   pstr->valid_raw_len = byte_idx;
274 }
275 
276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
   but for REG_ICASE: every character is converted with __towupper and
   its (possibly re-encoded) bytes are stored in PSTR->MBS.

   Work resumes at PSTR->VALID_LEN and stops at the smaller of PSTR->LEN
   and PSTR->BUFS_LEN, or earlier when a character does not fit.  When the
   upper-case form of a character has a different byte length than the
   original, PSTR->OFFSETS is allocated on demand, PSTR->OFFSETS_NEEDED is
   set, and PSTR->LEN / PSTR->STOP are adjusted by the difference; from
   then on OFFSETS maps each buffer position to its raw position.

   Returns REG_NOERROR, or REG_ESPACE if PSTR->OFFSETS cannot be
   allocated.  */
278 
279 static reg_errcode_t
280 __attribute_warn_unused_result__
281 build_wcs_upper_buffer (re_string_t *pstr)
282 {
283   mbstate_t prev_st;
284   Idx src_idx, byte_idx, end_idx, remain_len;
285   size_t mbclen;
286 #ifdef _LIBC
287   char buf[MB_LEN_MAX];
288   assert (MB_LEN_MAX >= pstr->mb_cur_max);
289 #else
290   char buf[64];
291 #endif
292 
293   byte_idx = pstr->valid_len;
294   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
295 
296   /* The following optimization assumes that ASCII characters can be
297      mapped to wide characters with a simple cast.  */
  /* Fast path: no translation table and no offsets in use, so buffer
     position and raw position are the same index (byte_idx).  */
298   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
299     {
300       while (byte_idx < end_idx)
301 	{
302 	  wchar_t wc;
303 
304 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
305 	      && mbsinit (&pstr->cur_state))
306 	    {
307 	      /* In case of a singlebyte character.  */
308 	      pstr->mbs[byte_idx]
309 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
310 	      /* The next step uses the assumption that wchar_t is encoded
311 		 ASCII-safe: all ASCII values can be converted like this.  */
312 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
313 	      ++byte_idx;
314 	      continue;
315 	    }
316 
317 	  remain_len = end_idx - byte_idx;
318 	  prev_st = pstr->cur_state;
319 	  mbclen = __mbrtowc (&wc,
320 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
321 			       + byte_idx), remain_len, &pstr->cur_state);
322 	  if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
323 	    {
324 	      wchar_t wcu = __towupper (wc);
325 	      if (wcu != wc)
326 		{
327 		  size_t mbcdlen;
328 
329 		  mbcdlen = __wcrtomb (buf, wcu, &prev_st);
330 		  if (__glibc_likely (mbclen == mbcdlen))
331 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
332 		  else
333 		    {
		      /* The upper-case form has a different byte length:
			 leave the fast path and redo this character in
			 the general loop below, which handles offsets.  */
334 		      src_idx = byte_idx;
335 		      goto offsets_needed;
336 		    }
337 		}
338 	      else
339 		memcpy (pstr->mbs + byte_idx,
340 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
341 	      pstr->wcs[byte_idx++] = wcu;
342 	      /* Write paddings.  */
343 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
344 		pstr->wcs[byte_idx++] = WEOF;
345 	    }
346 	  else if (mbclen == (size_t) -1 || mbclen == 0
347 		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
348 	    {
349 	      /* It is an invalid character, an incomplete character
350 		 at the end of the string, or '\0'.  Just use the byte.  */
351 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
352 	      pstr->mbs[byte_idx] = ch;
353 	      /* And also cast it to wide char.  */
354 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
355 	      if (__glibc_unlikely (mbclen == (size_t) -1))
356 		pstr->cur_state = prev_st;
357 	    }
358 	  else
359 	    {
360 	      /* The buffer doesn't have enough space, finish to build.  */
361 	      pstr->cur_state = prev_st;
362 	      break;
363 	    }
364 	}
365       pstr->valid_len = byte_idx;
366       pstr->valid_raw_len = byte_idx;
367       return REG_NOERROR;
368     }
369   else
    /* General loop: byte_idx indexes the output buffers, src_idx the raw
       input; the two diverge once a character changed length.  */
370     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
371       {
372 	wchar_t wc;
373 	const char *p;
	/* Also the entry point for the jump out of the fast path above.  */
374       offsets_needed:
375 	remain_len = end_idx - byte_idx;
376 	prev_st = pstr->cur_state;
377 	if (__glibc_unlikely (pstr->trans != NULL))
378 	  {
379 	    int i, ch;
380 
381 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
382 	      {
383 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
384 		buf[i] = pstr->trans[ch];
385 	      }
386 	    p = (const char *) buf;
387 	  }
388 	else
389 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
390 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
391 	if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
392 	  {
393 	    wchar_t wcu = __towupper (wc);
394 	    if (wcu != wc)
395 	      {
396 		size_t mbcdlen;
397 
398 		mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
399 		if (__glibc_likely (mbclen == mbcdlen))
400 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
401 		else if (mbcdlen != (size_t) -1)
402 		  {
403 		    size_t i;
404 
		    /* The re-encoded character does not fit into the
		       current buffers: stop before it.  */
405 		    if (byte_idx + mbcdlen > pstr->bufs_len)
406 		      {
407 			pstr->cur_state = prev_st;
408 			break;
409 		      }
410 
411 		    if (pstr->offsets == NULL)
412 		      {
413 			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
414 
415 			if (pstr->offsets == NULL)
416 			  return REG_ESPACE;
417 		      }
		    /* First length change: every position before byte_idx
		       maps to itself.  */
418 		    if (!pstr->offsets_needed)
419 		      {
420 			for (i = 0; i < (size_t) byte_idx; ++i)
421 			  pstr->offsets[i] = i;
422 			pstr->offsets_needed = 1;
423 		      }
424 
425 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
426 		    pstr->wcs[byte_idx] = wcu;
427 		    pstr->offsets[byte_idx] = src_idx;
428 		    for (i = 1; i < mbcdlen; ++i)
429 		      {
430 			pstr->offsets[byte_idx + i]
431 			  = src_idx + (i < mbclen ? i : mbclen - 1);
432 			pstr->wcs[byte_idx + i] = WEOF;
433 		      }
		    /* The buffer string grows or shrinks by the length
		       difference; recompute the end of this pass.  */
434 		    pstr->len += mbcdlen - mbclen;
435 		    if (pstr->raw_stop > src_idx)
436 		      pstr->stop += mbcdlen - mbclen;
437 		    end_idx = (pstr->bufs_len > pstr->len)
438 			      ? pstr->len : pstr->bufs_len;
439 		    byte_idx += mbcdlen;
440 		    src_idx += mbclen;
441 		    continue;
442 		  }
443 		else
		  /* __wcrtomb failed for the upper-case form: keep the
		     original bytes.  */
444 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
445 	      }
446 	    else
447 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
448 
449 	    if (__glibc_unlikely (pstr->offsets_needed != 0))
450 	      {
451 		size_t i;
452 		for (i = 0; i < mbclen; ++i)
453 		  pstr->offsets[byte_idx + i] = src_idx + i;
454 	      }
455 	    src_idx += mbclen;
456 
457 	    pstr->wcs[byte_idx++] = wcu;
458 	    /* Write paddings.  */
459 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
460 	      pstr->wcs[byte_idx++] = WEOF;
461 	  }
462 	else if (mbclen == (size_t) -1 || mbclen == 0
463 		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
464 	  {
465 	    /* It is an invalid character or '\0'.  Just use the byte.  */
466 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
467 
468 	    if (__glibc_unlikely (pstr->trans != NULL))
469 	      ch = pstr->trans [ch];
470 	    pstr->mbs[byte_idx] = ch;
471 
472 	    if (__glibc_unlikely (pstr->offsets_needed != 0))
473 	      pstr->offsets[byte_idx] = src_idx;
474 	    ++src_idx;
475 
476 	    /* And also cast it to wide char.  */
477 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
478 	    if (__glibc_unlikely (mbclen == (size_t) -1))
479 	      pstr->cur_state = prev_st;
480 	  }
481 	else
482 	  {
483 	    /* The buffer doesn't have enough space, finish to build.  */
484 	    pstr->cur_state = prev_st;
485 	    break;
486 	  }
487       }
488   pstr->valid_len = byte_idx;
489   pstr->valid_raw_len = src_idx;
490   return REG_NOERROR;
491 }
492 
493 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
494    Return the index.  */
495 
496 static Idx
497 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
498 {
499   mbstate_t prev_st;
500   Idx rawbuf_idx;
501   size_t mbclen;
502   wint_t wc = WEOF;
503 
504   /* Skip the characters which are not necessary to check.  */
505   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
506        rawbuf_idx < new_raw_idx;)
507     {
508       wchar_t wc2;
509       Idx remain_len = pstr->raw_len - rawbuf_idx;
510       prev_st = pstr->cur_state;
511       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
512 			  remain_len, &pstr->cur_state);
513       if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
514 			    || mbclen == 0))
515 	{
516 	  /* We treat these cases as a single byte character.  */
517 	  if (mbclen == 0 || remain_len == 0)
518 	    wc = L'\0';
519 	  else
520 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
521 	  mbclen = 1;
522 	  pstr->cur_state = prev_st;
523 	}
524       else
525 	wc = wc2;
526       /* Then proceed the next character.  */
527       rawbuf_idx += mbclen;
528     }
529   *last_wc = wc;
530   return rawbuf_idx;
531 }
532 #endif /* RE_ENABLE_I18N  */
533 
534 /* Build the buffer PSTR->MBS, and apply the translation if we need.
535    This function is used in case of REG_ICASE.  */
536 
537 static void
538 build_upper_buffer (re_string_t *pstr)
539 {
540   Idx char_idx, end_idx;
541   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
542 
543   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
544     {
545       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
546       if (__glibc_unlikely (pstr->trans != NULL))
547 	ch = pstr->trans[ch];
548       pstr->mbs[char_idx] = toupper (ch);
549     }
550   pstr->valid_len = char_idx;
551   pstr->valid_raw_len = char_idx;
552 }
553 
554 /* Apply TRANS to the buffer in PSTR.  */
555 
556 static void
557 re_string_translate_buffer (re_string_t *pstr)
558 {
559   Idx buf_idx, end_idx;
560   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
561 
562   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
563     {
564       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
565       pstr->mbs[buf_idx] = pstr->trans[ch];
566     }
567 
568   pstr->valid_len = buf_idx;
569   pstr->valid_raw_len = buf_idx;
570 }
571 
572 /* Reconstruct the buffers of PSTR so that buffer position 0 corresponds
   to raw position IDX.
   Concretely: convert to wide characters when pstr->mb_cur_max > 1,
   convert to upper case for REG_ICASE, and apply the translation table.

   Already-built data at or after IDX is moved to the front of the
   buffers when possible; otherwise the data is discarded and rebuilt.
   If IDX lies before the current PSTR->RAW_MBS_IDX, the state is first
   reset to the start of the raw string.  PSTR->TIP_CONTEXT is set to the
   context of the character preceding IDX (EFLAGS supplies REG_NOTBOL and
   is passed on to re_string_context_at), and PSTR->CUR_IDX is reset
   to 0.

   Returns REG_NOERROR, or the error reported by
   build_wcs_upper_buffer.  */
575 
576 static reg_errcode_t
577 __attribute_warn_unused_result__
578 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
579 {
580   Idx offset;
581 
  /* OFFSET is the distance, in raw bytes, from the current buffer start
     to IDX.  */
582   if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
583     offset = idx - pstr->raw_mbs_idx;
584   else
585     {
586       /* Reset buffer.  */
587 #ifdef RE_ENABLE_I18N
588       if (pstr->mb_cur_max > 1)
589 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
590 #endif /* RE_ENABLE_I18N */
591       pstr->len = pstr->raw_len;
592       pstr->stop = pstr->raw_stop;
593       pstr->valid_len = 0;
594       pstr->raw_mbs_idx = 0;
595       pstr->valid_raw_len = 0;
596       pstr->offsets_needed = 0;
597       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
598 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
599       if (!pstr->mbs_allocated)
600 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
601       offset = idx;
602     }
603 
604   if (__glibc_likely (offset != 0))
605     {
606       /* Should the already checked characters be kept?  */
607       if (__glibc_likely (offset < pstr->valid_raw_len))
608 	{
609 	  /* Yes, move them to the front of the buffer.  */
610 #ifdef RE_ENABLE_I18N
611 	  if (__glibc_unlikely (pstr->offsets_needed))
612 	    {
	      /* Binary-search pstr->offsets for the buffer position whose
		 raw offset is OFFSET.  */
613 	      Idx low = 0, high = pstr->valid_len, mid;
614 	      do
615 		{
616 		  mid = (high + low) / 2;
617 		  if (pstr->offsets[mid] > offset)
618 		    high = mid;
619 		  else if (pstr->offsets[mid] < offset)
620 		    low = mid + 1;
621 		  else
622 		    break;
623 		}
624 	      while (low < high);
625 	      if (pstr->offsets[mid] < offset)
626 		++mid;
627 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
628 							eflags);
629 	      /* This can be quite complicated, so handle specially
630 		 only the common and easy case where the character with
631 		 different length representation of lower and upper
632 		 case is present at or after offset.  */
633 	      if (pstr->valid_len > offset
634 		  && mid == offset && pstr->offsets[mid] == offset)
635 		{
636 		  memmove (pstr->wcs, pstr->wcs + offset,
637 			   (pstr->valid_len - offset) * sizeof (wint_t));
638 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
639 		  pstr->valid_len -= offset;
640 		  pstr->valid_raw_len -= offset;
641 		  for (low = 0; low < pstr->valid_len; low++)
642 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
643 		}
644 	      else
645 		{
646 		  /* Otherwise, just find out how long the partial multibyte
647 		     character at offset is and fill it with WEOF/255.  */
648 		  pstr->len = pstr->raw_len - idx + offset;
649 		  pstr->stop = pstr->raw_stop - idx + offset;
650 		  pstr->offsets_needed = 0;
651 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
652 		    --mid;
653 		  while (mid < pstr->valid_len)
654 		    if (pstr->wcs[mid] != WEOF)
655 		      break;
656 		    else
657 		      ++mid;
658 		  if (mid == pstr->valid_len)
659 		    pstr->valid_len = 0;
660 		  else
661 		    {
662 		      pstr->valid_len = pstr->offsets[mid] - offset;
663 		      if (pstr->valid_len)
664 			{
665 			  for (low = 0; low < pstr->valid_len; ++low)
666 			    pstr->wcs[low] = WEOF;
667 			  memset (pstr->mbs, 255, pstr->valid_len);
668 			}
669 		    }
670 		  pstr->valid_raw_len = pstr->valid_len;
671 		}
672 	    }
673 	  else
674 #endif
675 	    {
	      /* Buffer and raw positions coincide: shift the valid data
		 down by OFFSET.  */
676 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
677 							eflags);
678 #ifdef RE_ENABLE_I18N
679 	      if (pstr->mb_cur_max > 1)
680 		memmove (pstr->wcs, pstr->wcs + offset,
681 			 (pstr->valid_len - offset) * sizeof (wint_t));
682 #endif /* RE_ENABLE_I18N */
683 	      if (__glibc_unlikely (pstr->mbs_allocated))
684 		memmove (pstr->mbs, pstr->mbs + offset,
685 			 pstr->valid_len - offset);
686 	      pstr->valid_len -= offset;
687 	      pstr->valid_raw_len -= offset;
688 #if defined DEBUG && DEBUG
689 	      assert (pstr->valid_len > 0);
690 #endif
691 	    }
692 	}
693       else
694 	{
695 #ifdef RE_ENABLE_I18N
696 	  /* No, skip all characters until IDX.  */
697 	  Idx prev_valid_len = pstr->valid_len;
698 
699 	  if (__glibc_unlikely (pstr->offsets_needed))
700 	    {
701 	      pstr->len = pstr->raw_len - idx + offset;
702 	      pstr->stop = pstr->raw_stop - idx + offset;
703 	      pstr->offsets_needed = 0;
704 	    }
705 #endif
706 	  pstr->valid_len = 0;
707 #ifdef RE_ENABLE_I18N
708 	  if (pstr->mb_cur_max > 1)
709 	    {
710 	      Idx wcs_idx;
	      /* WC will hold the character preceding IDX, if it can be
		 determined; it decides the tip context below.  */
711 	      wint_t wc = WEOF;
712 
713 	      if (pstr->is_utf8)
714 		{
715 		  const unsigned char *raw, *p, *end;
716 
717 		  /* Special case UTF-8.  Multi-byte chars start with any
718 		     byte other than 0x80 - 0xbf.  */
719 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
720 		  end = raw + (offset - pstr->mb_cur_max);
721 		  if (end < pstr->raw_mbs)
722 		    end = pstr->raw_mbs;
723 		  p = raw + offset - 1;
724 #ifdef _LIBC
725 		  /* We know the wchar_t encoding is UCS4, so for the simple
726 		     case, ASCII characters, skip the conversion step.  */
727 		  if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
728 		    {
729 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
730 		      /* pstr->valid_len = 0; */
731 		      wc = (wchar_t) *p;
732 		    }
733 		  else
734 #endif
		    /* Walk backwards to the nearest lead byte and decode
		       the character that starts there.  */
735 		    for (; p >= end; --p)
736 		      if ((*p & 0xc0) != 0x80)
737 			{
738 			  mbstate_t cur_state;
739 			  wchar_t wc2;
740 			  Idx mlen = raw + pstr->len - p;
741 			  unsigned char buf[6];
742 			  size_t mbclen;
743 
744 			  const unsigned char *pp = p;
745 			  if (__glibc_unlikely (pstr->trans != NULL))
746 			    {
747 			      int i = mlen < 6 ? mlen : 6;
748 			      while (--i >= 0)
749 				buf[i] = pstr->trans[p[i]];
750 			      pp = buf;
751 			    }
752 			  /* XXX Don't use mbrtowc, we know which conversion
753 			     to use (UTF-8 -> UCS4).  */
754 			  memset (&cur_state, 0, sizeof (cur_state));
755 			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
756 					      &cur_state);
			  /* Accept the character only if it extends to or
			     past IDX; any bytes past IDX become the
			     already-valid (padded) prefix.  */
757 			  if (raw + offset - p <= mbclen
758 			      && mbclen < (size_t) -2)
759 			    {
760 			      memset (&pstr->cur_state, '\0',
761 				      sizeof (mbstate_t));
762 			      pstr->valid_len = mbclen - (raw + offset - p);
763 			      wc = wc2;
764 			    }
765 			  break;
766 			}
767 		}
768 
	      /* No character was recovered above: scan forward from the
		 end of the valid raw data up to IDX instead.  */
769 	      if (wc == WEOF)
770 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
771 	      if (wc == WEOF)
772 		pstr->tip_context
773 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
774 	      else
775 		pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
776 				      && IS_WIDE_WORD_CHAR (wc))
777 				     ? CONTEXT_WORD
778 				     : ((IS_WIDE_NEWLINE (wc)
779 					 && pstr->newline_anchor)
780 					? CONTEXT_NEWLINE : 0));
	      /* Bytes of a character that started before IDX are marked
		 with WEOF / 255 filler.  */
781 	      if (__glibc_unlikely (pstr->valid_len))
782 		{
783 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
784 		    pstr->wcs[wcs_idx] = WEOF;
785 		  if (pstr->mbs_allocated)
786 		    memset (pstr->mbs, 255, pstr->valid_len);
787 		}
788 	      pstr->valid_raw_len = pstr->valid_len;
789 	    }
790 	  else
791 #endif /* RE_ENABLE_I18N */
792 	    {
	      /* Single-byte case: the context comes straight from the
		 (translated) byte preceding IDX.  */
793 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
794 	      pstr->valid_raw_len = 0;
795 	      if (pstr->trans)
796 		c = pstr->trans[c];
797 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
798 				   ? CONTEXT_WORD
799 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
800 				      ? CONTEXT_NEWLINE : 0));
801 	    }
802 	}
      /* Without a private byte buffer, MBS points into the raw string,
	 so it is simply advanced.  */
803       if (!__glibc_unlikely (pstr->mbs_allocated))
804 	pstr->mbs += offset;
805     }
806   pstr->raw_mbs_idx = idx;
807   pstr->len -= offset;
808   pstr->stop -= offset;
809 
810   /* Then build the buffers.  */
811 #ifdef RE_ENABLE_I18N
812   if (pstr->mb_cur_max > 1)
813     {
814       if (pstr->icase)
815 	{
816 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
817 	  if (__glibc_unlikely (ret != REG_NOERROR))
818 	    return ret;
819 	}
820       else
821 	build_wcs_buffer (pstr);
822     }
823   else
824 #endif /* RE_ENABLE_I18N */
825     if (__glibc_unlikely (pstr->mbs_allocated))
826       {
827 	if (pstr->icase)
828 	  build_upper_buffer (pstr);
829 	else if (pstr->trans != NULL)
830 	  re_string_translate_buffer (pstr);
831       }
832     else
      /* MBS aliases the raw string: everything up to LEN is usable.  */
833       pstr->valid_len = pstr->len;
834 
835   pstr->cur_idx = 0;
836   return REG_NOERROR;
837 }
838 
839 static unsigned char
840 __attribute__ ((pure))
841 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
842 {
843   int ch;
844   Idx off;
845 
846   /* Handle the common (easiest) cases first.  */
847   if (__glibc_likely (!pstr->mbs_allocated))
848     return re_string_peek_byte (pstr, idx);
849 
850 #ifdef RE_ENABLE_I18N
851   if (pstr->mb_cur_max > 1
852       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
853     return re_string_peek_byte (pstr, idx);
854 #endif
855 
856   off = pstr->cur_idx + idx;
857 #ifdef RE_ENABLE_I18N
858   if (pstr->offsets_needed)
859     off = pstr->offsets[off];
860 #endif
861 
862   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
863 
864 #ifdef RE_ENABLE_I18N
865   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
866      this function returns CAPITAL LETTER I instead of first byte of
867      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
868      since peek_byte_case doesn't advance cur_idx in any way.  */
869   if (pstr->offsets_needed && !isascii (ch))
870     return re_string_peek_byte (pstr, idx);
871 #endif
872 
873   return ch;
874 }
875 
876 static unsigned char
877 re_string_fetch_byte_case (re_string_t *pstr)
878 {
879   if (__glibc_likely (!pstr->mbs_allocated))
880     return re_string_fetch_byte (pstr);
881 
882 #ifdef RE_ENABLE_I18N
883   if (pstr->offsets_needed)
884     {
885       Idx off;
886       int ch;
887 
888       /* For tr_TR.UTF-8 [[:islower:]] there is
889 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
890 	 in that case the whole multi-byte character and return
891 	 the original letter.  On the other side, with
892 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
893 	 anything else would complicate things too much.  */
894 
895       if (!re_string_first_byte (pstr, pstr->cur_idx))
896 	return re_string_fetch_byte (pstr);
897 
898       off = pstr->offsets[pstr->cur_idx];
899       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
900 
901       if (! isascii (ch))
902 	return re_string_fetch_byte (pstr);
903 
904       re_string_skip_bytes (pstr,
905 			    re_string_char_size_at (pstr, pstr->cur_idx));
906       return ch;
907     }
908 #endif
909 
910   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
911 }
912 
913 static void
914 re_string_destruct (re_string_t *pstr)
915 {
916 #ifdef RE_ENABLE_I18N
917   re_free (pstr->wcs);
918   re_free (pstr->offsets);
919 #endif /* RE_ENABLE_I18N  */
920   if (pstr->mbs_allocated)
921     re_free (pstr->mbs);
922 }
923 
924 /* Return the context at IDX in INPUT.  */
925 
926 static unsigned int
927 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
928 {
929   int c;
930   if (__glibc_unlikely (idx < 0))
931     /* In this case, we use the value stored in input->tip_context,
932        since we can't know the character in input->mbs[-1] here.  */
933     return input->tip_context;
934   if (__glibc_unlikely (idx == input->len))
935     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
936 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
937 #ifdef RE_ENABLE_I18N
938   if (input->mb_cur_max > 1)
939     {
940       wint_t wc;
941       Idx wc_idx = idx;
942       while(input->wcs[wc_idx] == WEOF)
943 	{
944 #if defined DEBUG && DEBUG
945 	  /* It must not happen.  */
946 	  assert (wc_idx >= 0);
947 #endif
948 	  --wc_idx;
949 	  if (wc_idx < 0)
950 	    return input->tip_context;
951 	}
952       wc = input->wcs[wc_idx];
953       if (__glibc_unlikely (input->word_ops_used != 0)
954 	  && IS_WIDE_WORD_CHAR (wc))
955 	return CONTEXT_WORD;
956       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
957 	      ? CONTEXT_NEWLINE : 0);
958     }
959   else
960 #endif
961     {
962       c = re_string_byte_at (input, idx);
963       if (bitset_contain (input->word_char, c))
964 	return CONTEXT_WORD;
965       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
966     }
967 }
968 
969 /* Functions for set operation.  */
970 
971 static reg_errcode_t
972 __attribute_warn_unused_result__
973 re_node_set_alloc (re_node_set *set, Idx size)
974 {
975   set->alloc = size;
976   set->nelem = 0;
977   set->elems = re_malloc (Idx, size);
978   if (__glibc_unlikely (set->elems == NULL)
979       && (MALLOC_0_IS_NONNULL || size != 0))
980     return REG_ESPACE;
981   return REG_NOERROR;
982 }
983 
984 static reg_errcode_t
985 __attribute_warn_unused_result__
986 re_node_set_init_1 (re_node_set *set, Idx elem)
987 {
988   set->alloc = 1;
989   set->nelem = 1;
990   set->elems = re_malloc (Idx, 1);
991   if (__glibc_unlikely (set->elems == NULL))
992     {
993       set->alloc = set->nelem = 0;
994       return REG_ESPACE;
995     }
996   set->elems[0] = elem;
997   return REG_NOERROR;
998 }
999 
1000 static reg_errcode_t
1001 __attribute_warn_unused_result__
1002 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1003 {
1004   set->alloc = 2;
1005   set->elems = re_malloc (Idx, 2);
1006   if (__glibc_unlikely (set->elems == NULL))
1007     return REG_ESPACE;
1008   if (elem1 == elem2)
1009     {
1010       set->nelem = 1;
1011       set->elems[0] = elem1;
1012     }
1013   else
1014     {
1015       set->nelem = 2;
1016       if (elem1 < elem2)
1017 	{
1018 	  set->elems[0] = elem1;
1019 	  set->elems[1] = elem2;
1020 	}
1021       else
1022 	{
1023 	  set->elems[0] = elem2;
1024 	  set->elems[1] = elem1;
1025 	}
1026     }
1027   return REG_NOERROR;
1028 }
1029 
1030 static reg_errcode_t
1031 __attribute_warn_unused_result__
1032 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1033 {
1034   dest->nelem = src->nelem;
1035   if (src->nelem > 0)
1036     {
1037       dest->alloc = dest->nelem;
1038       dest->elems = re_malloc (Idx, dest->alloc);
1039       if (__glibc_unlikely (dest->elems == NULL))
1040 	{
1041 	  dest->alloc = dest->nelem = 0;
1042 	  return REG_ESPACE;
1043 	}
1044       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1045     }
1046   else
1047     re_node_set_init_empty (dest);
1048   return REG_NOERROR;
1049 }
1050 
/* Calculate the intersection of the sets SRC1 and SRC2 and merge it
   into DEST: every element present in both SRC1 and SRC2 that is not
   already in DEST is added to DEST.  Return REG_ESPACE if DEST cannot
   be grown, REG_NOERROR otherwise.  If either source set is empty DEST
   is left untouched.
   Note: We assume dest->elems is NULL, when dest->alloc is 0.
   NOTE(review): the backward scans below compare neighbouring items
   with < and >, so all three element arrays are presumably kept in
   ascending order -- confirm against the callers.  */

static reg_errcode_t
__attribute_warn_unused_result__
re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
			   const re_node_set *src2)
{
  Idx i1, i2, is, id, delta, sbase;
  if (src1->nelem == 0 || src2->nelem == 0)
    return REG_NOERROR;

  /* We need dest->nelem + 2 * elems_in_intersection; this is a
     conservative estimate.  */
  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
    {
      Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
      Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
      if (__glibc_unlikely (new_elems == NULL))
	return REG_ESPACE;
      dest->elems = new_elems;
      dest->alloc = new_alloc;
    }

  /* Find the items in the intersection of SRC1 and SRC2, and copy
     into the top of DEST those that are not already in DEST itself.
     SBASE starts one past the end of the scratch area and is
     decremented before each store, so the collected items occupy
     dest->elems[sbase .. dest->nelem + src1->nelem + src2->nelem - 1].  */
  sbase = dest->nelem + src1->nelem + src2->nelem;
  i1 = src1->nelem - 1;
  i2 = src2->nelem - 1;
  id = dest->nelem - 1;
  for (;;)
    {
      if (src1->elems[i1] == src2->elems[i2])
	{
	  /* Try to find the item in DEST.  Maybe we could binary search?  */
	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
	    --id;

	  /* Not present in DEST: stash it in the scratch area.  */
	  if (id < 0 || dest->elems[id] != src1->elems[i1])
            dest->elems[--sbase] = src1->elems[i1];

	  if (--i1 < 0 || --i2 < 0)
	    break;
	}

      /* Lower the highest of the two items.  */
      else if (src1->elems[i1] < src2->elems[i2])
	{
	  if (--i2 < 0)
	    break;
	}
      else
	{
	  if (--i1 < 0)
	    break;
	}
    }

  /* ID is the last original element of DEST, IS the last scratch item,
     and DELTA the number of items collected in the scratch area.  */
  id = dest->nelem - 1;
  is = dest->nelem + src1->nelem + src2->nelem - 1;
  delta = is - sbase + 1;

  /* Now copy.  When DELTA becomes zero, the remaining
     DEST elements are already in place; this is more or
     less the same loop that is in re_node_set_merge.  */
  dest->nelem += delta;
  if (delta > 0 && id >= 0)
    for (;;)
      {
	if (dest->elems[is] > dest->elems[id])
	  {
	    /* Copy from the top.  */
	    dest->elems[id + delta--] = dest->elems[is--];
	    if (delta == 0)
	      break;
	  }
	else
	  {
	    /* Slide from the bottom.  */
	    dest->elems[id + delta] = dest->elems[id];
	    if (--id < 0)
	      break;
	  }
      }

  /* Copy remaining SRC elements.  DELTA is the number of scratch items
     not yet placed (zero when the loop above placed them all), and
     they go to the very front of DEST.  */
  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));

  return REG_NOERROR;
}
1142 
1143 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1144    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1145 
1146 static reg_errcode_t
1147 __attribute_warn_unused_result__
1148 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1149 			const re_node_set *src2)
1150 {
1151   Idx i1, i2, id;
1152   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1153     {
1154       dest->alloc = src1->nelem + src2->nelem;
1155       dest->elems = re_malloc (Idx, dest->alloc);
1156       if (__glibc_unlikely (dest->elems == NULL))
1157 	return REG_ESPACE;
1158     }
1159   else
1160     {
1161       if (src1 != NULL && src1->nelem > 0)
1162 	return re_node_set_init_copy (dest, src1);
1163       else if (src2 != NULL && src2->nelem > 0)
1164 	return re_node_set_init_copy (dest, src2);
1165       else
1166 	re_node_set_init_empty (dest);
1167       return REG_NOERROR;
1168     }
1169   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1170     {
1171       if (src1->elems[i1] > src2->elems[i2])
1172 	{
1173 	  dest->elems[id++] = src2->elems[i2++];
1174 	  continue;
1175 	}
1176       if (src1->elems[i1] == src2->elems[i2])
1177 	++i2;
1178       dest->elems[id++] = src1->elems[i1++];
1179     }
1180   if (i1 < src1->nelem)
1181     {
1182       memcpy (dest->elems + id, src1->elems + i1,
1183 	     (src1->nelem - i1) * sizeof (Idx));
1184       id += src1->nelem - i1;
1185     }
1186   else if (i2 < src2->nelem)
1187     {
1188       memcpy (dest->elems + id, src2->elems + i2,
1189 	     (src2->nelem - i2) * sizeof (Idx));
1190       id += src2->nelem - i2;
1191     }
1192   dest->nelem = id;
1193   return REG_NOERROR;
1194 }
1195 
/* Calculate the union of the sets DEST and SRC and store it in DEST.
   SRC may be NULL or empty, in which case DEST is left untouched.
   Return REG_ESPACE if DEST cannot be grown, REG_NOERROR otherwise.
   NOTE(review): the backward scans below compare items with < and ==,
   so both element arrays are presumably kept in ascending order --
   confirm against the callers.  */

static reg_errcode_t
__attribute_warn_unused_result__
re_node_set_merge (re_node_set *dest, const re_node_set *src)
{
  Idx is, id, sbase, delta;
  if (src == NULL || src->nelem == 0)
    return REG_NOERROR;
  /* Make sure DEST has room for its own elements plus twice the size
     of SRC; the extra space above DEST's elements is used as a
     scratch area below.  */
  if (dest->alloc < 2 * src->nelem + dest->nelem)
    {
      Idx new_alloc = 2 * (src->nelem + dest->alloc);
      Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
      if (__glibc_unlikely (new_buffer == NULL))
	return REG_ESPACE;
      dest->elems = new_buffer;
      dest->alloc = new_alloc;
    }

  /* An empty DEST simply becomes a copy of SRC.  */
  if (__glibc_unlikely (dest->nelem == 0))
    {
      dest->nelem = src->nelem;
      memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
      return REG_NOERROR;
    }

  /* Copy into the top of DEST the items of SRC that are not
     found in DEST.  Maybe we could binary search in DEST?
     SBASE starts one past the end of the scratch area and is
     decremented before each store.  */
  for (sbase = dest->nelem + 2 * src->nelem,
       is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
    {
      if (dest->elems[id] == src->elems[is])
	is--, id--;
      else if (dest->elems[id] < src->elems[is])
	dest->elems[--sbase] = src->elems[is--];
      else /* if (dest->elems[id] > src->elems[is]) */
	--id;
    }

  if (is >= 0)
    {
      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
      sbase -= is + 1;
      memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
    }

  /* ID is the last original element of DEST, IS the last scratch item,
     and DELTA the number of items collected in the scratch area.  */
  id = dest->nelem - 1;
  is = dest->nelem + 2 * src->nelem - 1;
  delta = is - sbase + 1;
  if (delta == 0)
    return REG_NOERROR;

  /* Now copy.  When DELTA becomes zero, the remaining
     DEST elements are already in place.  */
  dest->nelem += delta;
  for (;;)
    {
      if (dest->elems[is] > dest->elems[id])
	{
	  /* Copy from the top.  */
	  dest->elems[id + delta--] = dest->elems[is--];
	  if (delta == 0)
	    break;
	}
      else
	{
	  /* Slide from the bottom.  */
	  dest->elems[id + delta] = dest->elems[id];
	  if (--id < 0)
	    {
	      /* Copy remaining SRC elements.  */
	      memcpy (dest->elems, dest->elems + sbase,
		      delta * sizeof (Idx));
	      break;
	    }
	}
    }

  return REG_NOERROR;
}
1277 
1278 /* Insert the new element ELEM to the re_node_set* SET.
1279    SET should not already have ELEM.
1280    Return true if successful.  */
1281 
1282 static bool
1283 __attribute_warn_unused_result__
1284 re_node_set_insert (re_node_set *set, Idx elem)
1285 {
1286   Idx idx;
1287   /* In case the set is empty.  */
1288   if (set->alloc == 0)
1289     return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1290 
1291   if (__glibc_unlikely (set->nelem) == 0)
1292     {
1293       /* We already guaranteed above that set->alloc != 0.  */
1294       set->elems[0] = elem;
1295       ++set->nelem;
1296       return true;
1297     }
1298 
1299   /* Realloc if we need.  */
1300   if (set->alloc == set->nelem)
1301     {
1302       Idx *new_elems;
1303       set->alloc = set->alloc * 2;
1304       new_elems = re_realloc (set->elems, Idx, set->alloc);
1305       if (__glibc_unlikely (new_elems == NULL))
1306 	return false;
1307       set->elems = new_elems;
1308     }
1309 
1310   /* Move the elements which follows the new element.  Test the
1311      first element separately to skip a check in the inner loop.  */
1312   if (elem < set->elems[0])
1313     {
1314       idx = 0;
1315       for (idx = set->nelem; idx > 0; idx--)
1316 	set->elems[idx] = set->elems[idx - 1];
1317     }
1318   else
1319     {
1320       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1321 	set->elems[idx] = set->elems[idx - 1];
1322     }
1323 
1324   /* Insert the new element.  */
1325   set->elems[idx] = elem;
1326   ++set->nelem;
1327   return true;
1328 }
1329 
1330 /* Insert the new element ELEM to the re_node_set* SET.
1331    SET should not already have any element greater than or equal to ELEM.
1332    Return true if successful.  */
1333 
1334 static bool
1335 __attribute_warn_unused_result__
1336 re_node_set_insert_last (re_node_set *set, Idx elem)
1337 {
1338   /* Realloc if we need.  */
1339   if (set->alloc == set->nelem)
1340     {
1341       Idx *new_elems;
1342       set->alloc = (set->alloc + 1) * 2;
1343       new_elems = re_realloc (set->elems, Idx, set->alloc);
1344       if (__glibc_unlikely (new_elems == NULL))
1345 	return false;
1346       set->elems = new_elems;
1347     }
1348 
1349   /* Insert the new element.  */
1350   set->elems[set->nelem++] = elem;
1351   return true;
1352 }
1353 
1354 /* Compare two node sets SET1 and SET2.
1355    Return true if SET1 and SET2 are equivalent.  */
1356 
1357 static bool
1358 __attribute__ ((pure))
1359 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1360 {
1361   Idx i;
1362   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1363     return false;
1364   for (i = set1->nelem ; --i >= 0 ; )
1365     if (set1->elems[i] != set2->elems[i])
1366       return false;
1367   return true;
1368 }
1369 
1370 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1371 
1372 static Idx
1373 __attribute__ ((pure))
1374 re_node_set_contains (const re_node_set *set, Idx elem)
1375 {
1376   __re_size_t idx, right, mid;
1377   if (set->nelem <= 0)
1378     return 0;
1379 
1380   /* Binary search the element.  */
1381   idx = 0;
1382   right = set->nelem - 1;
1383   while (idx < right)
1384     {
1385       mid = (idx + right) / 2;
1386       if (set->elems[mid] < elem)
1387 	idx = mid + 1;
1388       else
1389 	right = mid;
1390     }
1391   return set->elems[idx] == elem ? idx + 1 : 0;
1392 }
1393 
1394 static void
1395 re_node_set_remove_at (re_node_set *set, Idx idx)
1396 {
1397   if (idx < 0 || idx >= set->nelem)
1398     return;
1399   --set->nelem;
1400   for (; idx < set->nelem; idx++)
1401     set->elems[idx] = set->elems[idx + 1];
1402 }
1403 
1404 
1405 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1406    Or return -1 if an error occurred.  */
1407 
1408 static Idx
1409 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1410 {
1411   if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1412     {
1413       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1414       Idx *new_nexts, *new_indices;
1415       re_node_set *new_edests, *new_eclosures;
1416       re_token_t *new_nodes;
1417 
1418       /* Avoid overflows in realloc.  */
1419       const size_t max_object_size = MAX (sizeof (re_token_t),
1420 					  MAX (sizeof (re_node_set),
1421 					       sizeof (Idx)));
1422       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1423 			    < new_nodes_alloc))
1424 	return -1;
1425 
1426       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1427       if (__glibc_unlikely (new_nodes == NULL))
1428 	return -1;
1429       dfa->nodes = new_nodes;
1430       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1431       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1432       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1433       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1434       if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1435 			    || new_edests == NULL || new_eclosures == NULL))
1436 	{
1437 	   re_free (new_nexts);
1438 	   re_free (new_indices);
1439 	   re_free (new_edests);
1440 	   re_free (new_eclosures);
1441 	   return -1;
1442 	}
1443       dfa->nexts = new_nexts;
1444       dfa->org_indices = new_indices;
1445       dfa->edests = new_edests;
1446       dfa->eclosures = new_eclosures;
1447       dfa->nodes_alloc = new_nodes_alloc;
1448     }
1449   dfa->nodes[dfa->nodes_len] = token;
1450   dfa->nodes[dfa->nodes_len].constraint = 0;
1451 #ifdef RE_ENABLE_I18N
1452   dfa->nodes[dfa->nodes_len].accept_mb =
1453     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1454      || token.type == COMPLEX_BRACKET);
1455 #endif
1456   dfa->nexts[dfa->nodes_len] = -1;
1457   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1458   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1459   return dfa->nodes_len++;
1460 }
1461 
1462 static re_hashval_t
1463 calc_state_hash (const re_node_set *nodes, unsigned int context)
1464 {
1465   re_hashval_t hash = nodes->nelem + context;
1466   Idx i;
1467   for (i = 0 ; i < nodes->nelem ; i++)
1468     hash += nodes->elems[i];
1469   return hash;
1470 }
1471 
1472 /* Search for the state whose node_set is equivalent to NODES.
1473    Return the pointer to the state, if we found it in the DFA.
1474    Otherwise create the new one and return it.  In case of an error
1475    return NULL and set the error code in ERR.
1476    Note: - We assume NULL as the invalid state, then it is possible that
1477 	   return value is NULL and ERR is REG_NOERROR.
1478 	 - We never return non-NULL value in case of any errors, it is for
1479 	   optimization.  */
1480 
1481 static re_dfastate_t *
1482 __attribute_warn_unused_result__
1483 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1484 		  const re_node_set *nodes)
1485 {
1486   re_hashval_t hash;
1487   re_dfastate_t *new_state;
1488   struct re_state_table_entry *spot;
1489   Idx i;
1490 #if defined GCC_LINT || defined lint
1491   /* Suppress bogus uninitialized-variable warnings.  */
1492   *err = REG_NOERROR;
1493 #endif
1494   if (__glibc_unlikely (nodes->nelem == 0))
1495     {
1496       *err = REG_NOERROR;
1497       return NULL;
1498     }
1499   hash = calc_state_hash (nodes, 0);
1500   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1501 
1502   for (i = 0 ; i < spot->num ; i++)
1503     {
1504       re_dfastate_t *state = spot->array[i];
1505       if (hash != state->hash)
1506 	continue;
1507       if (re_node_set_compare (&state->nodes, nodes))
1508 	return state;
1509     }
1510 
1511   /* There are no appropriate state in the dfa, create the new one.  */
1512   new_state = create_ci_newstate (dfa, nodes, hash);
1513   if (__glibc_unlikely (new_state == NULL))
1514     *err = REG_ESPACE;
1515 
1516   return new_state;
1517 }
1518 
1519 /* Search for the state whose node_set is equivalent to NODES and
1520    whose context is equivalent to CONTEXT.
1521    Return the pointer to the state, if we found it in the DFA.
1522    Otherwise create the new one and return it.  In case of an error
1523    return NULL and set the error code in ERR.
1524    Note: - We assume NULL as the invalid state, then it is possible that
1525 	   return value is NULL and ERR is REG_NOERROR.
1526 	 - We never return non-NULL value in case of any errors, it is for
1527 	   optimization.  */
1528 
1529 static re_dfastate_t *
1530 __attribute_warn_unused_result__
1531 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1532 			  const re_node_set *nodes, unsigned int context)
1533 {
1534   re_hashval_t hash;
1535   re_dfastate_t *new_state;
1536   struct re_state_table_entry *spot;
1537   Idx i;
1538 #if defined GCC_LINT || defined lint
1539   /* Suppress bogus uninitialized-variable warnings.  */
1540   *err = REG_NOERROR;
1541 #endif
1542   if (nodes->nelem == 0)
1543     {
1544       *err = REG_NOERROR;
1545       return NULL;
1546     }
1547   hash = calc_state_hash (nodes, context);
1548   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1549 
1550   for (i = 0 ; i < spot->num ; i++)
1551     {
1552       re_dfastate_t *state = spot->array[i];
1553       if (state->hash == hash
1554 	  && state->context == context
1555 	  && re_node_set_compare (state->entrance_nodes, nodes))
1556 	return state;
1557     }
1558   /* There are no appropriate state in 'dfa', create the new one.  */
1559   new_state = create_cd_newstate (dfa, nodes, context, hash);
1560   if (__glibc_unlikely (new_state == NULL))
1561     *err = REG_ESPACE;
1562 
1563   return new_state;
1564 }
1565 
1566 /* Finish initialization of the new state NEWSTATE, and using its hash value
1567    HASH put in the appropriate bucket of DFA's state table.  Return value
1568    indicates the error code if failed.  */
1569 
1570 static reg_errcode_t
1571 __attribute_warn_unused_result__
1572 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1573 		re_hashval_t hash)
1574 {
1575   struct re_state_table_entry *spot;
1576   reg_errcode_t err;
1577   Idx i;
1578 
1579   newstate->hash = hash;
1580   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1581   if (__glibc_unlikely (err != REG_NOERROR))
1582     return REG_ESPACE;
1583   for (i = 0; i < newstate->nodes.nelem; i++)
1584     {
1585       Idx elem = newstate->nodes.elems[i];
1586       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1587 	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1588 	  return REG_ESPACE;
1589     }
1590 
1591   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1592   if (__glibc_unlikely (spot->alloc <= spot->num))
1593     {
1594       Idx new_alloc = 2 * spot->num + 2;
1595       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1596 					      new_alloc);
1597       if (__glibc_unlikely (new_array == NULL))
1598 	return REG_ESPACE;
1599       spot->array = new_array;
1600       spot->alloc = new_alloc;
1601     }
1602   spot->array[spot->num++] = newstate;
1603   return REG_NOERROR;
1604 }
1605 
1606 static void
1607 free_state (re_dfastate_t *state)
1608 {
1609   re_node_set_free (&state->non_eps_nodes);
1610   re_node_set_free (&state->inveclosure);
1611   if (state->entrance_nodes != &state->nodes)
1612     {
1613       re_node_set_free (state->entrance_nodes);
1614       re_free (state->entrance_nodes);
1615     }
1616   re_node_set_free (&state->nodes);
1617   re_free (state->word_trtable);
1618   re_free (state->trtable);
1619   re_free (state);
1620 }
1621 
1622 /* Create the new state which is independent of contexts.
1623    Return the new state if succeeded, otherwise return NULL.  */
1624 
1625 static re_dfastate_t *
1626 __attribute_warn_unused_result__
1627 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1628 		    re_hashval_t hash)
1629 {
1630   Idx i;
1631   reg_errcode_t err;
1632   re_dfastate_t *newstate;
1633 
1634   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1635   if (__glibc_unlikely (newstate == NULL))
1636     return NULL;
1637   err = re_node_set_init_copy (&newstate->nodes, nodes);
1638   if (__glibc_unlikely (err != REG_NOERROR))
1639     {
1640       re_free (newstate);
1641       return NULL;
1642     }
1643 
1644   newstate->entrance_nodes = &newstate->nodes;
1645   for (i = 0 ; i < nodes->nelem ; i++)
1646     {
1647       re_token_t *node = dfa->nodes + nodes->elems[i];
1648       re_token_type_t type = node->type;
1649       if (type == CHARACTER && !node->constraint)
1650 	continue;
1651 #ifdef RE_ENABLE_I18N
1652       newstate->accept_mb |= node->accept_mb;
1653 #endif /* RE_ENABLE_I18N */
1654 
1655       /* If the state has the halt node, the state is a halt state.  */
1656       if (type == END_OF_RE)
1657 	newstate->halt = 1;
1658       else if (type == OP_BACK_REF)
1659 	newstate->has_backref = 1;
1660       else if (type == ANCHOR || node->constraint)
1661 	newstate->has_constraint = 1;
1662     }
1663   err = register_state (dfa, newstate, hash);
1664   if (__glibc_unlikely (err != REG_NOERROR))
1665     {
1666       free_state (newstate);
1667       newstate = NULL;
1668     }
1669   return newstate;
1670 }
1671 
1672 /* Create the new state which is depend on the context CONTEXT.
1673    Return the new state if succeeded, otherwise return NULL.  */
1674 
1675 static re_dfastate_t *
1676 __attribute_warn_unused_result__
1677 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1678 		    unsigned int context, re_hashval_t hash)
1679 {
1680   Idx i, nctx_nodes = 0;
1681   reg_errcode_t err;
1682   re_dfastate_t *newstate;
1683 
1684   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1685   if (__glibc_unlikely (newstate == NULL))
1686     return NULL;
1687   err = re_node_set_init_copy (&newstate->nodes, nodes);
1688   if (__glibc_unlikely (err != REG_NOERROR))
1689     {
1690       re_free (newstate);
1691       return NULL;
1692     }
1693 
1694   newstate->context = context;
1695   newstate->entrance_nodes = &newstate->nodes;
1696 
1697   for (i = 0 ; i < nodes->nelem ; i++)
1698     {
1699       re_token_t *node = dfa->nodes + nodes->elems[i];
1700       re_token_type_t type = node->type;
1701       unsigned int constraint = node->constraint;
1702 
1703       if (type == CHARACTER && !constraint)
1704 	continue;
1705 #ifdef RE_ENABLE_I18N
1706       newstate->accept_mb |= node->accept_mb;
1707 #endif /* RE_ENABLE_I18N */
1708 
1709       /* If the state has the halt node, the state is a halt state.  */
1710       if (type == END_OF_RE)
1711 	newstate->halt = 1;
1712       else if (type == OP_BACK_REF)
1713 	newstate->has_backref = 1;
1714 
1715       if (constraint)
1716 	{
1717 	  if (newstate->entrance_nodes == &newstate->nodes)
1718 	    {
1719 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1720 	      if (__glibc_unlikely (newstate->entrance_nodes == NULL))
1721 		{
1722 		  free_state (newstate);
1723 		  return NULL;
1724 		}
1725 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1726 		  != REG_NOERROR)
1727 		return NULL;
1728 	      nctx_nodes = 0;
1729 	      newstate->has_constraint = 1;
1730 	    }
1731 
1732 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1733 	    {
1734 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1735 	      ++nctx_nodes;
1736 	    }
1737 	}
1738     }
1739   err = register_state (dfa, newstate, hash);
1740   if (__glibc_unlikely (err != REG_NOERROR))
1741     {
1742       free_state (newstate);
1743       newstate = NULL;
1744     }
1745   return  newstate;
1746 }
1747