1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 #include <sys/cdefs.h>
20 __RCSID("$NetBSD: regex_internal.c,v 1.2 2016/05/17 14:00:09 christos Exp $");
21
22
23 static void re_string_construct_common (const char *str, Idx len,
24 re_string_t *pstr,
25 REG_TRANSLATE_TYPE trans, bool icase,
26 const re_dfa_t *dfa) internal_function;
27 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 re_hashval_t hash) internal_function;
30 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
31 const re_node_set *nodes,
32 unsigned int context,
33 re_hashval_t hash) internal_function;
34
35 /* Functions for string operation. */
36
37 /* This function allocate the buffers. It is necessary to call
38 re_string_reconstruct before using the object. */
39
40 static reg_errcode_t
41 internal_function
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)42 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
43 REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
44 {
45 reg_errcode_t ret;
46 Idx init_buf_len;
47
48 /* Ensure at least one character fits into the buffers. */
49 if (init_len < dfa->mb_cur_max)
50 init_len = dfa->mb_cur_max;
51 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
52 re_string_construct_common (str, len, pstr, trans, icase, dfa);
53
54 ret = re_string_realloc_buffers (pstr, init_buf_len);
55 if (BE (ret != REG_NOERROR, 0))
56 return ret;
57
58 pstr->word_char = dfa->word_char;
59 pstr->word_ops_used = dfa->word_ops_used;
60 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
61 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
62 pstr->valid_raw_len = pstr->valid_len;
63 return REG_NOERROR;
64 }
65
66 /* This function allocate the buffers, and initialize them. */
67
68 static reg_errcode_t
69 internal_function
re_string_construct(re_string_t * pstr,const char * str,Idx len,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)70 re_string_construct (re_string_t *pstr, const char *str, Idx len,
71 REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
72 {
73 reg_errcode_t ret;
74 memset (pstr, '\0', sizeof (re_string_t));
75 re_string_construct_common (str, len, pstr, trans, icase, dfa);
76
77 if (len > 0)
78 {
79 ret = re_string_realloc_buffers (pstr, len + 1);
80 if (BE (ret != REG_NOERROR, 0))
81 return ret;
82 }
83 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
84
85 if (icase)
86 {
87 #ifdef RE_ENABLE_I18N
88 if (dfa->mb_cur_max > 1)
89 {
90 while (1)
91 {
92 ret = build_wcs_upper_buffer (pstr);
93 if (BE (ret != REG_NOERROR, 0))
94 return ret;
95 if (pstr->valid_raw_len >= len)
96 break;
97 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
98 break;
99 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
100 if (BE (ret != REG_NOERROR, 0))
101 return ret;
102 }
103 }
104 else
105 #endif /* RE_ENABLE_I18N */
106 build_upper_buffer (pstr);
107 }
108 else
109 {
110 #ifdef RE_ENABLE_I18N
111 if (dfa->mb_cur_max > 1)
112 build_wcs_buffer (pstr);
113 else
114 #endif /* RE_ENABLE_I18N */
115 {
116 if (trans != NULL)
117 re_string_translate_buffer (pstr);
118 else
119 {
120 pstr->valid_len = pstr->bufs_len;
121 pstr->valid_raw_len = pstr->bufs_len;
122 }
123 }
124 }
125
126 return REG_NOERROR;
127 }
128
129 /* Helper functions for re_string_allocate, and re_string_construct. */
130
131 static reg_errcode_t
132 internal_function
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)133 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
134 {
135 #ifdef RE_ENABLE_I18N
136 if (pstr->mb_cur_max > 1)
137 {
138 wint_t *new_wcs = re_xrealloc (pstr->wcs, wint_t, new_buf_len);
139 if (BE (new_wcs == NULL, 0))
140 return REG_ESPACE;
141 pstr->wcs = new_wcs;
142 if (pstr->offsets != NULL)
143 {
144 Idx *new_offsets = re_xrealloc (pstr->offsets, Idx, new_buf_len);
145 if (BE (new_offsets == NULL, 0))
146 return REG_ESPACE;
147 pstr->offsets = new_offsets;
148 }
149 }
150 #endif /* RE_ENABLE_I18N */
151 if (pstr->mbs_allocated)
152 {
153 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
154 new_buf_len);
155 if (BE (new_mbs == NULL, 0))
156 return REG_ESPACE;
157 pstr->mbs = new_mbs;
158 }
159 pstr->bufs_len = new_buf_len;
160 return REG_NOERROR;
161 }
162
163
164 static void
165 internal_function
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)166 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
167 REG_TRANSLATE_TYPE trans, bool icase,
168 const re_dfa_t *dfa)
169 {
170 pstr->raw_mbs = (const unsigned char *) str;
171 pstr->len = len;
172 pstr->raw_len = len;
173 pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
174 pstr->icase = icase;
175 pstr->mbs_allocated = (trans != NULL || icase);
176 pstr->mb_cur_max = dfa->mb_cur_max;
177 pstr->is_utf8 = dfa->is_utf8;
178 pstr->map_notascii = dfa->map_notascii;
179 pstr->stop = pstr->len;
180 pstr->raw_stop = pstr->stop;
181 }
182
183 #ifdef RE_ENABLE_I18N
184
185 /* Build wide character buffer PSTR->WCS.
186 If the byte sequence of the string are:
187 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
188 Then wide character buffer will be:
189 <wc1> , WEOF , <wc2> , WEOF , <wc3>
190 We use WEOF for padding, they indicate that the position isn't
191 a first byte of a multibyte character.
192
193 Note that this function assumes PSTR->VALID_LEN elements are already
194 built and starts from PSTR->VALID_LEN. */
195
196 static void
197 internal_function
build_wcs_buffer(re_string_t * pstr)198 build_wcs_buffer (re_string_t *pstr)
199 {
200 #ifdef _LIBC
201 unsigned char buf[MB_LEN_MAX];
202 assert (MB_LEN_MAX >= pstr->mb_cur_max);
203 #else
204 unsigned char buf[64];
205 #endif
206 mbstate_t prev_st;
207 Idx byte_idx, end_idx, remain_len;
208 size_t mbclen;
209
210 /* Build the buffers from pstr->valid_len to either pstr->len or
211 pstr->bufs_len. */
212 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
213 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
214 {
215 wchar_t wc;
216 const char *p;
217
218 remain_len = end_idx - byte_idx;
219 prev_st = pstr->cur_state;
220 /* Apply the translation if we need. */
221 if (BE (pstr->trans != NULL, 0))
222 {
223 int i, ch;
224
225 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
226 {
227 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
228 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
229 }
230 p = (const char *) buf;
231 }
232 else
233 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
234 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
235 if (BE (mbclen == (size_t) -2, 0))
236 {
237 /* The buffer doesn't have enough space, finish to build. */
238 pstr->cur_state = prev_st;
239 break;
240 }
241 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
242 {
243 /* We treat these cases as a singlebyte character. */
244 mbclen = 1;
245 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
246 if (BE (pstr->trans != NULL, 0))
247 wc = pstr->trans[wc];
248 pstr->cur_state = prev_st;
249 }
250
251 /* Write wide character and padding. */
252 pstr->wcs[byte_idx++] = wc;
253 /* Write paddings. */
254 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
255 pstr->wcs[byte_idx++] = WEOF;
256 }
257 pstr->valid_len = byte_idx;
258 pstr->valid_raw_len = byte_idx;
259 }
260
261 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
262 but for REG_ICASE. */
263
264 static reg_errcode_t
265 internal_function
build_wcs_upper_buffer(re_string_t * pstr)266 build_wcs_upper_buffer (re_string_t *pstr)
267 {
268 mbstate_t prev_st;
269 Idx src_idx, byte_idx, end_idx, remain_len;
270 size_t mbclen;
271 #ifdef _LIBC
272 char buf[MB_LEN_MAX];
273 assert (MB_LEN_MAX >= pstr->mb_cur_max);
274 #else
275 char buf[64];
276 #endif
277
278 byte_idx = pstr->valid_len;
279 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
280
281 /* The following optimization assumes that ASCII characters can be
282 mapped to wide characters with a simple cast. */
283 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
284 {
285 while (byte_idx < end_idx)
286 {
287 wchar_t wc;
288
289 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
290 && mbsinit (&pstr->cur_state))
291 {
292 /* In case of a singlebyte character. */
293 pstr->mbs[byte_idx]
294 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
295 /* The next step uses the assumption that wchar_t is encoded
296 ASCII-safe: all ASCII values can be converted like this. */
297 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
298 ++byte_idx;
299 continue;
300 }
301
302 remain_len = end_idx - byte_idx;
303 prev_st = pstr->cur_state;
304 mbclen = mbrtowc (&wc,
305 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
306 + byte_idx), remain_len, &pstr->cur_state);
307 if (BE ((size_t) (mbclen + 2) > 2, 1))
308 {
309 wchar_t wcu = wc;
310 if (iswlower (wc))
311 {
312 size_t mbcdlen;
313
314 wcu = towupper (wc);
315 mbcdlen = wcrtomb (buf, wcu, &prev_st);
316 if (BE (mbclen == mbcdlen, 1))
317 memcpy (pstr->mbs + byte_idx, buf, mbclen);
318 else
319 {
320 src_idx = byte_idx;
321 goto offsets_needed;
322 }
323 }
324 else
325 memcpy (pstr->mbs + byte_idx,
326 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
327 pstr->wcs[byte_idx++] = wcu;
328 /* Write paddings. */
329 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
330 pstr->wcs[byte_idx++] = WEOF;
331 }
332 else if (mbclen == (size_t) -1 || mbclen == 0)
333 {
334 /* It is an invalid character or '\0'. Just use the byte. */
335 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
336 pstr->mbs[byte_idx] = ch;
337 /* And also cast it to wide char. */
338 pstr->wcs[byte_idx++] = (wchar_t) ch;
339 if (BE (mbclen == (size_t) -1, 0))
340 pstr->cur_state = prev_st;
341 }
342 else
343 {
344 /* The buffer doesn't have enough space, finish to build. */
345 pstr->cur_state = prev_st;
346 break;
347 }
348 }
349 pstr->valid_len = byte_idx;
350 pstr->valid_raw_len = byte_idx;
351 return REG_NOERROR;
352 }
353 else
354 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
355 {
356 wchar_t wc;
357 const char *p;
358 offsets_needed:
359 remain_len = end_idx - byte_idx;
360 prev_st = pstr->cur_state;
361 if (BE (pstr->trans != NULL, 0))
362 {
363 int i, ch;
364
365 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
366 {
367 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
368 buf[i] = pstr->trans[ch];
369 }
370 p = (const char *) buf;
371 }
372 else
373 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
374 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
375 if (BE ((size_t) (mbclen + 2) > 2, 1))
376 {
377 wchar_t wcu = wc;
378 if (iswlower (wc))
379 {
380 size_t mbcdlen;
381
382 wcu = towupper (wc);
383 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
384 if (BE (mbclen == mbcdlen, 1))
385 memcpy (pstr->mbs + byte_idx, buf, mbclen);
386 else if (mbcdlen != (size_t) -1)
387 {
388 size_t i;
389
390 if (byte_idx + mbcdlen > pstr->bufs_len)
391 {
392 pstr->cur_state = prev_st;
393 break;
394 }
395
396 if (pstr->offsets == NULL)
397 {
398 pstr->offsets = re_xmalloc (Idx, pstr->bufs_len);
399
400 if (pstr->offsets == NULL)
401 return REG_ESPACE;
402 }
403 if (!pstr->offsets_needed)
404 {
405 for (i = 0; i < (size_t) byte_idx; ++i)
406 pstr->offsets[i] = i;
407 pstr->offsets_needed = 1;
408 }
409
410 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
411 pstr->wcs[byte_idx] = wcu;
412 pstr->offsets[byte_idx] = src_idx;
413 for (i = 1; i < mbcdlen; ++i)
414 {
415 pstr->offsets[byte_idx + i]
416 = src_idx + (i < mbclen ? i : mbclen - 1);
417 pstr->wcs[byte_idx + i] = WEOF;
418 }
419 pstr->len += mbcdlen - mbclen;
420 if (pstr->raw_stop > src_idx)
421 pstr->stop += mbcdlen - mbclen;
422 end_idx = (pstr->bufs_len > pstr->len)
423 ? pstr->len : pstr->bufs_len;
424 byte_idx += mbcdlen;
425 src_idx += mbclen;
426 continue;
427 }
428 else
429 memcpy (pstr->mbs + byte_idx, p, mbclen);
430 }
431 else
432 memcpy (pstr->mbs + byte_idx, p, mbclen);
433
434 if (BE (pstr->offsets_needed != 0, 0))
435 {
436 size_t i;
437 for (i = 0; i < mbclen; ++i)
438 pstr->offsets[byte_idx + i] = src_idx + i;
439 }
440 src_idx += mbclen;
441
442 pstr->wcs[byte_idx++] = wcu;
443 /* Write paddings. */
444 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
445 pstr->wcs[byte_idx++] = WEOF;
446 }
447 else if (mbclen == (size_t) -1 || mbclen == 0)
448 {
449 /* It is an invalid character or '\0'. Just use the byte. */
450 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
451
452 if (BE (pstr->trans != NULL, 0))
453 ch = pstr->trans [ch];
454 pstr->mbs[byte_idx] = ch;
455
456 if (BE (pstr->offsets_needed != 0, 0))
457 pstr->offsets[byte_idx] = src_idx;
458 ++src_idx;
459
460 /* And also cast it to wide char. */
461 pstr->wcs[byte_idx++] = (wchar_t) ch;
462 if (BE (mbclen == (size_t) -1, 0))
463 pstr->cur_state = prev_st;
464 }
465 else
466 {
467 /* The buffer doesn't have enough space, finish to build. */
468 pstr->cur_state = prev_st;
469 break;
470 }
471 }
472 pstr->valid_len = byte_idx;
473 pstr->valid_raw_len = src_idx;
474 return REG_NOERROR;
475 }
476
477 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
478 Return the index. */
479
480 static Idx
481 internal_function
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)482 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
483 {
484 mbstate_t prev_st;
485 Idx rawbuf_idx;
486 size_t mbclen;
487 wchar_t wc = 0;
488
489 /* Skip the characters which are not necessary to check. */
490 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
491 rawbuf_idx < new_raw_idx;)
492 {
493 Idx remain_len;
494 remain_len = pstr->len - rawbuf_idx;
495 prev_st = pstr->cur_state;
496 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
497 remain_len, &pstr->cur_state);
498 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
499 {
500 /* We treat these cases as a singlebyte character. */
501 mbclen = 1;
502 pstr->cur_state = prev_st;
503 }
504 /* Then proceed the next character. */
505 rawbuf_idx += mbclen;
506 }
507 *last_wc = (wint_t) wc;
508 return rawbuf_idx;
509 }
510 #endif /* RE_ENABLE_I18N */
511
512 /* Build the buffer PSTR->MBS, and apply the translation if we need.
513 This function is used in case of REG_ICASE. */
514
515 static void
516 internal_function
build_upper_buffer(re_string_t * pstr)517 build_upper_buffer (re_string_t *pstr)
518 {
519 Idx char_idx, end_idx;
520 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
521
522 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
523 {
524 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
525 if (BE (pstr->trans != NULL, 0))
526 ch = pstr->trans[ch];
527 if (islower (ch))
528 pstr->mbs[char_idx] = toupper (ch);
529 else
530 pstr->mbs[char_idx] = ch;
531 }
532 pstr->valid_len = char_idx;
533 pstr->valid_raw_len = char_idx;
534 }
535
536 /* Apply TRANS to the buffer in PSTR. */
537
538 static void
539 internal_function
re_string_translate_buffer(re_string_t * pstr)540 re_string_translate_buffer (re_string_t *pstr)
541 {
542 Idx buf_idx, end_idx;
543 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
544
545 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
546 {
547 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
548 pstr->mbs[buf_idx] = pstr->trans[ch];
549 }
550
551 pstr->valid_len = buf_idx;
552 pstr->valid_raw_len = buf_idx;
553 }
554
555 /* This function re-construct the buffers.
556 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
557 convert to upper case in case of REG_ICASE, apply translation. */
558
559 static reg_errcode_t
560 internal_function
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)561 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
562 {
563 Idx offset;
564
565 if (BE (pstr->raw_mbs_idx <= idx, 0))
566 offset = idx - pstr->raw_mbs_idx;
567 else
568 {
569 /* Reset buffer. */
570 #ifdef RE_ENABLE_I18N
571 if (pstr->mb_cur_max > 1)
572 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
573 #endif /* RE_ENABLE_I18N */
574 pstr->len = pstr->raw_len;
575 pstr->stop = pstr->raw_stop;
576 pstr->valid_len = 0;
577 pstr->raw_mbs_idx = 0;
578 pstr->valid_raw_len = 0;
579 pstr->offsets_needed = 0;
580 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
581 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
582 if (!pstr->mbs_allocated)
583 pstr->mbs = (unsigned char *) pstr->raw_mbs;
584 offset = idx;
585 }
586
587 if (BE (offset != 0, 1))
588 {
589 /* Are the characters which are already checked remain? */
590 if (BE (offset < pstr->valid_raw_len, 1)
591 #ifdef RE_ENABLE_I18N
592 /* Handling this would enlarge the code too much.
593 Accept a slowdown in that case. */
594 && pstr->offsets_needed == 0
595 #endif
596 )
597 {
598 /* Yes, move them to the front of the buffer. */
599 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
600 #ifdef RE_ENABLE_I18N
601 if (pstr->mb_cur_max > 1)
602 memmove (pstr->wcs, pstr->wcs + offset,
603 (pstr->valid_len - offset) * sizeof (wint_t));
604 #endif /* RE_ENABLE_I18N */
605 if (BE (pstr->mbs_allocated, 0))
606 memmove (pstr->mbs, pstr->mbs + offset,
607 pstr->valid_len - offset);
608 pstr->valid_len -= offset;
609 pstr->valid_raw_len -= offset;
610 #if DEBUG
611 assert (pstr->valid_len > 0);
612 #endif
613 }
614 else
615 {
616 /* No, skip all characters until IDX. */
617 #ifdef RE_ENABLE_I18N
618 if (BE (pstr->offsets_needed, 0))
619 {
620 pstr->len = pstr->raw_len - idx + offset;
621 pstr->stop = pstr->raw_stop - idx + offset;
622 pstr->offsets_needed = 0;
623 }
624 #endif
625 pstr->valid_len = 0;
626 pstr->valid_raw_len = 0;
627 #ifdef RE_ENABLE_I18N
628 if (pstr->mb_cur_max > 1)
629 {
630 Idx wcs_idx;
631 wint_t wc = WEOF;
632
633 if (pstr->is_utf8)
634 {
635 const unsigned char *raw, *p, *q, *end;
636
637 /* Special case UTF-8. Multi-byte chars start with any
638 byte other than 0x80 - 0xbf. */
639 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
640 end = raw + (offset - pstr->mb_cur_max);
641 for (p = raw + offset - 1; p >= end; --p)
642 if ((*p & 0xc0) != 0x80)
643 {
644 mbstate_t cur_state;
645 wchar_t wc2;
646 Idx mlen = raw + pstr->len - p;
647 unsigned char buf[6];
648 size_t mbclen;
649
650 q = p;
651 if (BE (pstr->trans != NULL, 0))
652 {
653 int i = mlen < 6 ? mlen : 6;
654 while (--i >= 0)
655 buf[i] = pstr->trans[p[i]];
656 q = buf;
657 }
658 /* XXX Don't use mbrtowc, we know which conversion
659 to use (UTF-8 -> UCS4). */
660 memset (&cur_state, 0, sizeof (cur_state));
661 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
662 &cur_state);
663 if (raw + offset - p <= mbclen && mbclen < (size_t) -2)
664 {
665 memset (&pstr->cur_state, '\0',
666 sizeof (mbstate_t));
667 pstr->valid_len = mbclen - (raw + offset - p);
668 wc = wc2;
669 }
670 break;
671 }
672 }
673
674 if (wc == WEOF)
675 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
676 if (BE (pstr->valid_len, 0))
677 {
678 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
679 pstr->wcs[wcs_idx] = WEOF;
680 if (pstr->mbs_allocated)
681 memset (pstr->mbs, -1, pstr->valid_len);
682 }
683 pstr->valid_raw_len = pstr->valid_len;
684 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
685 && IS_WIDE_WORD_CHAR (wc))
686 ? CONTEXT_WORD
687 : ((IS_WIDE_NEWLINE (wc)
688 && pstr->newline_anchor)
689 ? CONTEXT_NEWLINE : 0));
690 }
691 else
692 #endif /* RE_ENABLE_I18N */
693 {
694 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
695 if (pstr->trans)
696 c = pstr->trans[c];
697 pstr->tip_context = (bitset_contain (pstr->word_char, c)
698 ? CONTEXT_WORD
699 : ((IS_NEWLINE (c) && pstr->newline_anchor)
700 ? CONTEXT_NEWLINE : 0));
701 }
702 }
703 if (!BE (pstr->mbs_allocated, 0))
704 pstr->mbs += offset;
705 }
706 pstr->raw_mbs_idx = idx;
707 pstr->len -= offset;
708 pstr->stop -= offset;
709
710 /* Then build the buffers. */
711 #ifdef RE_ENABLE_I18N
712 if (pstr->mb_cur_max > 1)
713 {
714 if (pstr->icase)
715 {
716 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
717 if (BE (ret != REG_NOERROR, 0))
718 return ret;
719 }
720 else
721 build_wcs_buffer (pstr);
722 }
723 else
724 #endif /* RE_ENABLE_I18N */
725 if (BE (pstr->mbs_allocated, 0))
726 {
727 if (pstr->icase)
728 build_upper_buffer (pstr);
729 else if (pstr->trans != NULL)
730 re_string_translate_buffer (pstr);
731 }
732 else
733 pstr->valid_len = pstr->len;
734
735 pstr->cur_idx = 0;
736 return REG_NOERROR;
737 }
738
739 static unsigned char
internal_function(pure)740 internal_function __attribute ((pure))
741 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
742 {
743 int ch;
744 Idx off;
745
746 /* Handle the common (easiest) cases first. */
747 if (BE (!pstr->mbs_allocated, 1))
748 return re_string_peek_byte (pstr, idx);
749
750 #ifdef RE_ENABLE_I18N
751 if (pstr->mb_cur_max > 1
752 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
753 return re_string_peek_byte (pstr, idx);
754 #endif
755
756 off = pstr->cur_idx + idx;
757 #ifdef RE_ENABLE_I18N
758 if (pstr->offsets_needed)
759 off = pstr->offsets[off];
760 #endif
761
762 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
763
764 #ifdef RE_ENABLE_I18N
765 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
766 this function returns CAPITAL LETTER I instead of first byte of
767 DOTLESS SMALL LETTER I. The latter would confuse the parser,
768 since peek_byte_case doesn't advance cur_idx in any way. */
769 if (pstr->offsets_needed && !isascii (ch))
770 return re_string_peek_byte (pstr, idx);
771 #endif
772
773 return ch;
774 }
775
776 static unsigned char
internal_function(pure)777 internal_function __attribute ((pure))
778 re_string_fetch_byte_case (re_string_t *pstr)
779 {
780 if (BE (!pstr->mbs_allocated, 1))
781 return re_string_fetch_byte (pstr);
782
783 #ifdef RE_ENABLE_I18N
784 if (pstr->offsets_needed)
785 {
786 Idx off;
787 int ch;
788
789 /* For tr_TR.UTF-8 [[:islower:]] there is
790 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
791 in that case the whole multi-byte character and return
792 the original letter. On the other side, with
793 [[: DOTLESS SMALL LETTER I return [[:I, as doing
794 anything else would complicate things too much. */
795
796 if (!re_string_first_byte (pstr, pstr->cur_idx))
797 return re_string_fetch_byte (pstr);
798
799 off = pstr->offsets[pstr->cur_idx];
800 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
801
802 if (! isascii (ch))
803 return re_string_fetch_byte (pstr);
804
805 re_string_skip_bytes (pstr,
806 re_string_char_size_at (pstr, pstr->cur_idx));
807 return ch;
808 }
809 #endif
810
811 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
812 }
813
814 static void
815 internal_function
re_string_destruct(re_string_t * pstr)816 re_string_destruct (re_string_t *pstr)
817 {
818 #ifdef RE_ENABLE_I18N
819 re_free (pstr->wcs);
820 re_free (pstr->offsets);
821 #endif /* RE_ENABLE_I18N */
822 if (pstr->mbs_allocated)
823 re_free (pstr->mbs);
824 }
825
826 /* Return the context at IDX in INPUT. */
827
828 static unsigned int
829 internal_function
re_string_context_at(const re_string_t * input,Idx idx,int eflags)830 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
831 {
832 int c;
833 if (BE (! REG_VALID_INDEX (idx), 0))
834 /* In this case, we use the value stored in input->tip_context,
835 since we can't know the character in input->mbs[-1] here. */
836 return input->tip_context;
837 if (BE (idx == input->len, 0))
838 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
839 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
840 #ifdef RE_ENABLE_I18N
841 if (input->mb_cur_max > 1)
842 {
843 wint_t wc;
844 Idx wc_idx = idx;
845 while(input->wcs[wc_idx] == WEOF)
846 {
847 #ifdef DEBUG
848 /* It must not happen. */
849 assert (REG_VALID_INDEX (wc_idx));
850 #endif
851 --wc_idx;
852 if (! REG_VALID_INDEX (wc_idx))
853 return input->tip_context;
854 }
855 wc = input->wcs[wc_idx];
856 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
857 return CONTEXT_WORD;
858 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
859 ? CONTEXT_NEWLINE : 0);
860 }
861 else
862 #endif
863 {
864 c = re_string_byte_at (input, idx);
865 if (bitset_contain (input->word_char, c))
866 return CONTEXT_WORD;
867 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
868 }
869 }
870
871 /* Functions for set operation. */
872
873 static reg_errcode_t
874 internal_function
re_node_set_alloc(re_node_set * set,Idx size)875 re_node_set_alloc (re_node_set *set, Idx size)
876 {
877 set->alloc = size;
878 set->nelem = 0;
879 set->elems = re_xmalloc (Idx, size);
880 if (BE (set->elems == NULL, 0))
881 return REG_ESPACE;
882 return REG_NOERROR;
883 }
884
885 static reg_errcode_t
886 internal_function
re_node_set_init_1(re_node_set * set,Idx elem)887 re_node_set_init_1 (re_node_set *set, Idx elem)
888 {
889 set->alloc = 1;
890 set->nelem = 1;
891 set->elems = re_malloc (Idx, 1);
892 if (BE (set->elems == NULL, 0))
893 {
894 set->alloc = set->nelem = 0;
895 return REG_ESPACE;
896 }
897 set->elems[0] = elem;
898 return REG_NOERROR;
899 }
900
901 static reg_errcode_t
902 internal_function
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)903 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
904 {
905 set->alloc = 2;
906 set->elems = re_malloc (Idx, 2);
907 if (BE (set->elems == NULL, 0))
908 return REG_ESPACE;
909 if (elem1 == elem2)
910 {
911 set->nelem = 1;
912 set->elems[0] = elem1;
913 }
914 else
915 {
916 set->nelem = 2;
917 if (elem1 < elem2)
918 {
919 set->elems[0] = elem1;
920 set->elems[1] = elem2;
921 }
922 else
923 {
924 set->elems[0] = elem2;
925 set->elems[1] = elem1;
926 }
927 }
928 return REG_NOERROR;
929 }
930
931 static reg_errcode_t
932 internal_function
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)933 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
934 {
935 dest->nelem = src->nelem;
936 if (src->nelem > 0)
937 {
938 dest->alloc = dest->nelem;
939 dest->elems = re_malloc (Idx, dest->alloc);
940 if (BE (dest->elems == NULL, 0))
941 {
942 dest->alloc = dest->nelem = 0;
943 return REG_ESPACE;
944 }
945 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
946 }
947 else
948 re_node_set_init_empty (dest);
949 return REG_NOERROR;
950 }
951
952 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
953 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
954 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
955
956 static reg_errcode_t
957 internal_function
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)958 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
959 const re_node_set *src2)
960 {
961 Idx i1, i2, is, id, delta, sbase;
962 if (src1->nelem == 0 || src2->nelem == 0)
963 return REG_NOERROR;
964
965 /* We need dest->nelem + 2 * elems_in_intersection; this is a
966 conservative estimate. */
967 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
968 {
969 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
970 Idx *new_elems;
971 if (sizeof (Idx) < 3
972 && (new_alloc < dest->alloc
973 || ((Idx) (src1->nelem + src2->nelem) < src1->nelem)))
974 return REG_ESPACE;
975 new_elems = re_xrealloc (dest->elems, Idx, new_alloc);
976 if (BE (new_elems == NULL, 0))
977 return REG_ESPACE;
978 dest->elems = new_elems;
979 dest->alloc = new_alloc;
980 }
981
982 /* Find the items in the intersection of SRC1 and SRC2, and copy
983 into the top of DEST those that are not already in DEST itself. */
984 sbase = dest->nelem + src1->nelem + src2->nelem;
985 i1 = src1->nelem - 1;
986 i2 = src2->nelem - 1;
987 id = dest->nelem - 1;
988 for (;;)
989 {
990 if (src1->elems[i1] == src2->elems[i2])
991 {
992 /* Try to find the item in DEST. Maybe we could binary search? */
993 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
994 --id;
995
996 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
997 dest->elems[--sbase] = src1->elems[i1];
998
999 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1000 break;
1001 }
1002
1003 /* Lower the highest of the two items. */
1004 else if (src1->elems[i1] < src2->elems[i2])
1005 {
1006 if (! REG_VALID_INDEX (--i2))
1007 break;
1008 }
1009 else
1010 {
1011 if (! REG_VALID_INDEX (--i1))
1012 break;
1013 }
1014 }
1015
1016 id = dest->nelem - 1;
1017 is = dest->nelem + src1->nelem + src2->nelem - 1;
1018 delta = is - sbase + 1;
1019
1020 /* Now copy. When DELTA becomes zero, the remaining
1021 DEST elements are already in place; this is more or
1022 less the same loop that is in re_node_set_merge. */
1023 dest->nelem += delta;
1024 if (delta > 0 && REG_VALID_INDEX (id))
1025 for (;;)
1026 {
1027 if (dest->elems[is] > dest->elems[id])
1028 {
1029 /* Copy from the top. */
1030 dest->elems[id + delta--] = dest->elems[is--];
1031 if (delta == 0)
1032 break;
1033 }
1034 else
1035 {
1036 /* Slide from the bottom. */
1037 dest->elems[id + delta] = dest->elems[id];
1038 if (! REG_VALID_INDEX (--id))
1039 break;
1040 }
1041 }
1042
1043 /* Copy remaining SRC elements. */
1044 memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]);
1045
1046 return REG_NOERROR;
1047 }
1048
1049 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1050 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1051
1052 static reg_errcode_t
1053 internal_function
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1054 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1055 const re_node_set *src2)
1056 {
1057 Idx i1, i2, id;
1058 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1059 {
1060 dest->alloc = src1->nelem + src2->nelem;
1061 if (sizeof (Idx) < 2 && dest->alloc < src1->nelem)
1062 return REG_ESPACE;
1063 dest->elems = re_xmalloc (Idx, dest->alloc);
1064 if (BE (dest->elems == NULL, 0))
1065 return REG_ESPACE;
1066 }
1067 else
1068 {
1069 if (src1 != NULL && src1->nelem > 0)
1070 return re_node_set_init_copy (dest, src1);
1071 else if (src2 != NULL && src2->nelem > 0)
1072 return re_node_set_init_copy (dest, src2);
1073 else
1074 re_node_set_init_empty (dest);
1075 return REG_NOERROR;
1076 }
1077 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1078 {
1079 if (src1->elems[i1] > src2->elems[i2])
1080 {
1081 dest->elems[id++] = src2->elems[i2++];
1082 continue;
1083 }
1084 if (src1->elems[i1] == src2->elems[i2])
1085 ++i2;
1086 dest->elems[id++] = src1->elems[i1++];
1087 }
1088 if (i1 < src1->nelem)
1089 {
1090 memcpy (dest->elems + id, src1->elems + i1,
1091 (src1->nelem - i1) * sizeof dest->elems[0]);
1092 id += src1->nelem - i1;
1093 }
1094 else if (i2 < src2->nelem)
1095 {
1096 memcpy (dest->elems + id, src2->elems + i2,
1097 (src2->nelem - i2) * sizeof dest->elems[0]);
1098 id += src2->nelem - i2;
1099 }
1100 dest->nelem = id;
1101 return REG_NOERROR;
1102 }
1103
1104 /* Calculate the union set of the sets DEST and SRC. And store it to
1105 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1106
1107 static reg_errcode_t
1108 internal_function
re_node_set_merge(re_node_set * dest,const re_node_set * src)1109 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1110 {
1111 Idx is, id, sbase, delta;
1112 if (src == NULL || src->nelem == 0)
1113 return REG_NOERROR;
1114 if (sizeof (Idx) < 3
1115 && ((Idx) (2 * src->nelem) < src->nelem
1116 || (Idx) (2 * src->nelem + dest->nelem) < dest->nelem))
1117 return REG_ESPACE;
1118 if (dest->alloc < 2 * src->nelem + dest->nelem)
1119 {
1120 Idx new_alloc = src->nelem + dest->alloc;
1121 Idx *new_buffer;
1122 if (sizeof (Idx) < 4 && new_alloc < dest->alloc)
1123 return REG_ESPACE;
1124 new_buffer = re_x2realloc (dest->elems, Idx, &new_alloc);
1125 if (BE (new_buffer == NULL, 0))
1126 return REG_ESPACE;
1127 dest->elems = new_buffer;
1128 dest->alloc = new_alloc;
1129 }
1130
1131 if (BE (dest->nelem == 0, 0))
1132 {
1133 dest->nelem = src->nelem;
1134 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
1135 return REG_NOERROR;
1136 }
1137
1138 /* Copy into the top of DEST the items of SRC that are not
1139 found in DEST. Maybe we could binary search in DEST? */
1140 for (sbase = dest->nelem + 2 * src->nelem,
1141 is = src->nelem - 1, id = dest->nelem - 1;
1142 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1143 {
1144 if (dest->elems[id] == src->elems[is])
1145 is--, id--;
1146 else if (dest->elems[id] < src->elems[is])
1147 dest->elems[--sbase] = src->elems[is--];
1148 else /* if (dest->elems[id] > src->elems[is]) */
1149 --id;
1150 }
1151
1152 if (REG_VALID_INDEX (is))
1153 {
1154 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1155 sbase -= is + 1;
1156 memcpy (dest->elems + sbase, src->elems,
1157 (is + 1) * sizeof dest->elems[0]);
1158 }
1159
1160 id = dest->nelem - 1;
1161 is = dest->nelem + 2 * src->nelem - 1;
1162 delta = is - sbase + 1;
1163 if (delta == 0)
1164 return REG_NOERROR;
1165
1166 /* Now copy. When DELTA becomes zero, the remaining
1167 DEST elements are already in place. */
1168 dest->nelem += delta;
1169 for (;;)
1170 {
1171 if (dest->elems[is] > dest->elems[id])
1172 {
1173 /* Copy from the top. */
1174 dest->elems[id + delta--] = dest->elems[is--];
1175 if (delta == 0)
1176 break;
1177 }
1178 else
1179 {
1180 /* Slide from the bottom. */
1181 dest->elems[id + delta] = dest->elems[id];
1182 if (! REG_VALID_INDEX (--id))
1183 {
1184 /* Copy remaining SRC elements. */
1185 memcpy (dest->elems, dest->elems + sbase,
1186 delta * sizeof dest->elems[0]);
1187 break;
1188 }
1189 }
1190 }
1191
1192 return REG_NOERROR;
1193 }
1194
1195 /* Insert the new element ELEM to the re_node_set* SET.
1196 SET should not already have ELEM.
1197 Return true if successful. */
1198
1199 static bool
1200 internal_function
re_node_set_insert(re_node_set * set,Idx elem)1201 re_node_set_insert (re_node_set *set, Idx elem)
1202 {
1203 Idx idx;
1204 /* In case the set is empty. */
1205 if (set->alloc == 0)
1206 return re_node_set_init_1 (set, elem) == REG_NOERROR;
1207
1208 if (BE (set->nelem, 0) == 0)
1209 {
1210 /* We already guaranteed above that set->alloc != 0. */
1211 set->elems[0] = elem;
1212 ++set->nelem;
1213 return true;
1214 }
1215
1216 /* Realloc if we need. */
1217 if (set->alloc == set->nelem)
1218 {
1219 Idx *new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1220 if (BE (new_elems == NULL, 0))
1221 return false;
1222 set->elems = new_elems;
1223 }
1224
1225 /* Move the elements which follows the new element. Test the
1226 first element separately to skip a check in the inner loop. */
1227 if (elem < set->elems[0])
1228 {
1229 idx = 0;
1230 for (idx = set->nelem; idx > 0; idx--)
1231 set->elems[idx] = set->elems[idx - 1];
1232 }
1233 else
1234 {
1235 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1236 set->elems[idx] = set->elems[idx - 1];
1237 }
1238
1239 /* Insert the new element. */
1240 set->elems[idx] = elem;
1241 ++set->nelem;
1242 return true;
1243 }
1244
1245 /* Insert the new element ELEM to the re_node_set* SET.
1246 SET should not already have any element greater than or equal to ELEM.
1247 Return true if successful. */
1248
1249 static bool
1250 internal_function
re_node_set_insert_last(re_node_set * set,Idx elem)1251 re_node_set_insert_last (re_node_set *set, Idx elem)
1252 {
1253 /* Realloc if we need. */
1254 if (set->alloc == set->nelem)
1255 {
1256 Idx *new_elems;
1257 new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1258 if (BE (new_elems == NULL, 0))
1259 return false;
1260 set->elems = new_elems;
1261 }
1262
1263 /* Insert the new element. */
1264 set->elems[set->nelem++] = elem;
1265 return true;
1266 }
1267
1268 /* Compare two node sets SET1 and SET2.
1269 Return true if SET1 and SET2 are equivalent. */
1270
1271 static bool
internal_function(pure)1272 internal_function __attribute ((pure))
1273 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1274 {
1275 Idx i;
1276 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1277 return false;
1278 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1279 if (set1->elems[i] != set2->elems[i])
1280 return false;
1281 return true;
1282 }
1283
1284 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1285
1286 static Idx
internal_function(pure)1287 internal_function __attribute ((pure))
1288 re_node_set_contains (const re_node_set *set, Idx elem)
1289 {
1290 __re_size_t idx, right, mid;
1291 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1292 return 0;
1293
1294 /* Binary search the element. */
1295 idx = 0;
1296 right = set->nelem - 1;
1297 while (idx < right)
1298 {
1299 mid = (idx + right) / 2;
1300 if (set->elems[mid] < elem)
1301 idx = mid + 1;
1302 else
1303 right = mid;
1304 }
1305 return set->elems[idx] == elem ? idx + 1 : 0;
1306 }
1307
1308 static void
1309 internal_function
re_node_set_remove_at(re_node_set * set,Idx idx)1310 re_node_set_remove_at (re_node_set *set, Idx idx)
1311 {
1312 if (idx < 0 || idx >= set->nelem)
1313 return;
1314 --set->nelem;
1315 for (; idx < set->nelem; idx++)
1316 set->elems[idx] = set->elems[idx + 1];
1317 }
1318
1319
1320 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1321 Or return REG_MISSING if an error occurred. */
1322
1323 static Idx
1324 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1325 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1326 {
1327 int type = token.type;
1328 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1329 {
1330 Idx new_nodes_alloc = dfa->nodes_alloc;
1331 Idx *new_nexts, *new_indices;
1332 re_node_set *new_edests, *new_eclosures;
1333
1334 re_token_t *new_nodes = re_x2realloc (dfa->nodes, re_token_t,
1335 &new_nodes_alloc);
1336 if (BE (new_nodes == NULL, 0))
1337 return REG_MISSING;
1338 dfa->nodes = new_nodes;
1339 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1340 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1341 new_edests = re_xrealloc (dfa->edests, re_node_set, new_nodes_alloc);
1342 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1343 if (BE (new_nexts == NULL || new_indices == NULL
1344 || new_edests == NULL || new_eclosures == NULL, 0))
1345 return REG_MISSING;
1346 dfa->nexts = new_nexts;
1347 dfa->org_indices = new_indices;
1348 dfa->edests = new_edests;
1349 dfa->eclosures = new_eclosures;
1350 dfa->nodes_alloc = new_nodes_alloc;
1351 }
1352 dfa->nodes[dfa->nodes_len] = token;
1353 dfa->nodes[dfa->nodes_len].constraint = 0;
1354 #ifdef RE_ENABLE_I18N
1355 dfa->nodes[dfa->nodes_len].accept_mb =
1356 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1357 #endif
1358 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1359 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1360 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1361 return dfa->nodes_len++;
1362 }
1363
1364 static inline re_hashval_t
1365 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1366 calc_state_hash (const re_node_set *nodes, unsigned int context)
1367 {
1368 re_hashval_t hash = nodes->nelem + context;
1369 Idx i;
1370 for (i = 0 ; i < nodes->nelem ; i++)
1371 hash += nodes->elems[i];
1372 return hash;
1373 }
1374
1375 /* Search for the state whose node_set is equivalent to NODES.
1376 Return the pointer to the state, if we found it in the DFA.
1377 Otherwise create the new one and return it. In case of an error
1378 return NULL and set the error code in ERR.
1379 Note: - We assume NULL as the invalid state, then it is possible that
1380 return value is NULL and ERR is REG_NOERROR.
1381 - We never return non-NULL value in case of any errors, it is for
1382 optimization. */
1383
1384 static re_dfastate_t*
1385 internal_function
re_acquire_state(reg_errcode_t * err,re_dfa_t * dfa,const re_node_set * nodes)1386 re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1387 {
1388 re_hashval_t hash;
1389 re_dfastate_t *new_state;
1390 struct re_state_table_entry *spot;
1391 Idx i;
1392 #ifdef lint
1393 /* Suppress bogus uninitialized-variable warnings. */
1394 *err = REG_NOERROR;
1395 #endif
1396 if (BE (nodes->nelem == 0, 0))
1397 {
1398 *err = REG_NOERROR;
1399 return NULL;
1400 }
1401 hash = calc_state_hash (nodes, 0);
1402 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1403
1404 for (i = 0 ; i < spot->num ; i++)
1405 {
1406 re_dfastate_t *state = spot->array[i];
1407 if (hash != state->hash)
1408 continue;
1409 if (re_node_set_compare (&state->nodes, nodes))
1410 return state;
1411 }
1412
1413 /* There are no appropriate state in the dfa, create the new one. */
1414 new_state = create_ci_newstate (dfa, nodes, hash);
1415 if (BE (new_state != NULL, 1))
1416 return new_state;
1417 else
1418 {
1419 *err = REG_ESPACE;
1420 return NULL;
1421 }
1422 }
1423
1424 /* Search for the state whose node_set is equivalent to NODES and
1425 whose context is equivalent to CONTEXT.
1426 Return the pointer to the state, if we found it in the DFA.
1427 Otherwise create the new one and return it. In case of an error
1428 return NULL and set the error code in ERR.
1429 Note: - We assume NULL as the invalid state, then it is possible that
1430 return value is NULL and ERR is REG_NOERROR.
1431 - We never return non-NULL value in case of any errors, it is for
1432 optimization. */
1433
1434 static re_dfastate_t*
1435 internal_function
re_acquire_state_context(reg_errcode_t * err,re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1436 re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1437 const re_node_set *nodes, unsigned int context)
1438 {
1439 re_hashval_t hash;
1440 re_dfastate_t *new_state;
1441 struct re_state_table_entry *spot;
1442 Idx i;
1443 #ifdef lint
1444 /* Suppress bogus uninitialized-variable warnings. */
1445 *err = REG_NOERROR;
1446 #endif
1447 if (nodes->nelem == 0)
1448 {
1449 *err = REG_NOERROR;
1450 return NULL;
1451 }
1452 hash = calc_state_hash (nodes, context);
1453 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1454
1455 for (i = 0 ; i < spot->num ; i++)
1456 {
1457 re_dfastate_t *state = spot->array[i];
1458 if (state->hash == hash
1459 && state->context == context
1460 && re_node_set_compare (state->entrance_nodes, nodes))
1461 return state;
1462 }
1463 /* There are no appropriate state in `dfa', create the new one. */
1464 new_state = create_cd_newstate (dfa, nodes, context, hash);
1465 if (BE (new_state != NULL, 1))
1466 return new_state;
1467 else
1468 {
1469 *err = REG_ESPACE;
1470 return NULL;
1471 }
1472 }
1473
1474 /* Finish initialization of the new state NEWSTATE, and using its hash value
1475 HASH put in the appropriate bucket of DFA's state table. Return value
1476 indicates the error code if failed. */
1477
1478 static reg_errcode_t
1479 internal_function
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1480 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash)
1481 {
1482 struct re_state_table_entry *spot;
1483 reg_errcode_t err;
1484 Idx i;
1485
1486 newstate->hash = hash;
1487 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1488 if (BE (err != REG_NOERROR, 0))
1489 return REG_ESPACE;
1490 for (i = 0; i < newstate->nodes.nelem; i++)
1491 {
1492 Idx elem = newstate->nodes.elems[i];
1493 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1494 {
1495 bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1496 if (BE (! ok, 0))
1497 return REG_ESPACE;
1498 }
1499 }
1500
1501 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1502 if (BE (spot->alloc <= spot->num, 0))
1503 {
1504 Idx new_alloc = spot->num;
1505 re_dfastate_t **new_array = re_x2realloc (spot->array, re_dfastate_t *,
1506 &new_alloc);
1507 if (BE (new_array == NULL, 0))
1508 return REG_ESPACE;
1509 spot->array = new_array;
1510 spot->alloc = new_alloc;
1511 }
1512 spot->array[spot->num++] = newstate;
1513 return REG_NOERROR;
1514 }
1515
1516 /* Create the new state which is independ of contexts.
1517 Return the new state if succeeded, otherwise return NULL. */
1518
1519 static re_dfastate_t *
1520 internal_function
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1521 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1522 re_hashval_t hash)
1523 {
1524 Idx i;
1525 reg_errcode_t err;
1526 re_dfastate_t *newstate;
1527
1528 newstate = re_calloc (re_dfastate_t, 1);
1529 if (BE (newstate == NULL, 0))
1530 return NULL;
1531 err = re_node_set_init_copy (&newstate->nodes, nodes);
1532 if (BE (err != REG_NOERROR, 0))
1533 {
1534 re_free (newstate);
1535 return NULL;
1536 }
1537
1538 newstate->entrance_nodes = &newstate->nodes;
1539 for (i = 0 ; i < nodes->nelem ; i++)
1540 {
1541 re_token_t *node = dfa->nodes + nodes->elems[i];
1542 re_token_type_t type = node->type;
1543 if (type == CHARACTER && !node->constraint)
1544 continue;
1545 #ifdef RE_ENABLE_I18N
1546 newstate->accept_mb |= node->accept_mb;
1547 #endif /* RE_ENABLE_I18N */
1548
1549 /* If the state has the halt node, the state is a halt state. */
1550 if (type == END_OF_RE)
1551 newstate->halt = 1;
1552 else if (type == OP_BACK_REF)
1553 newstate->has_backref = 1;
1554 else if (type == ANCHOR || node->constraint)
1555 newstate->has_constraint = 1;
1556 }
1557 err = register_state (dfa, newstate, hash);
1558 if (BE (err != REG_NOERROR, 0))
1559 {
1560 free_state (newstate);
1561 newstate = NULL;
1562 }
1563 return newstate;
1564 }
1565
1566 /* Create the new state which is depend on the context CONTEXT.
1567 Return the new state if succeeded, otherwise return NULL. */
1568
1569 static re_dfastate_t *
1570 internal_function
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1571 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1572 unsigned int context, re_hashval_t hash)
1573 {
1574 Idx i, nctx_nodes = 0;
1575 reg_errcode_t err;
1576 re_dfastate_t *newstate;
1577
1578 newstate = re_calloc (re_dfastate_t, 1);
1579 if (BE (newstate == NULL, 0))
1580 return NULL;
1581 err = re_node_set_init_copy (&newstate->nodes, nodes);
1582 if (BE (err != REG_NOERROR, 0))
1583 {
1584 re_free (newstate);
1585 return NULL;
1586 }
1587
1588 newstate->context = context;
1589 newstate->entrance_nodes = &newstate->nodes;
1590
1591 for (i = 0 ; i < nodes->nelem ; i++)
1592 {
1593 unsigned int constraint = 0;
1594 re_token_t *node = dfa->nodes + nodes->elems[i];
1595 re_token_type_t type = node->type;
1596 if (node->constraint)
1597 constraint = node->constraint;
1598
1599 if (type == CHARACTER && !constraint)
1600 continue;
1601 #ifdef RE_ENABLE_I18N
1602 newstate->accept_mb |= node->accept_mb;
1603 #endif /* RE_ENABLE_I18N */
1604
1605 /* If the state has the halt node, the state is a halt state. */
1606 if (type == END_OF_RE)
1607 newstate->halt = 1;
1608 else if (type == OP_BACK_REF)
1609 newstate->has_backref = 1;
1610 else if (type == ANCHOR)
1611 constraint = node->opr.ctx_type;
1612
1613 if (constraint)
1614 {
1615 if (newstate->entrance_nodes == &newstate->nodes)
1616 {
1617 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1618 if (BE (newstate->entrance_nodes == NULL, 0))
1619 {
1620 free_state (newstate);
1621 return NULL;
1622 }
1623 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1624 nctx_nodes = 0;
1625 newstate->has_constraint = 1;
1626 }
1627
1628 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1629 {
1630 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1631 ++nctx_nodes;
1632 }
1633 }
1634 }
1635 err = register_state (dfa, newstate, hash);
1636 if (BE (err != REG_NOERROR, 0))
1637 {
1638 free_state (newstate);
1639 newstate = NULL;
1640 }
1641 return newstate;
1642 }
1643
1644 static void
1645 internal_function
free_state(re_dfastate_t * state)1646 free_state (re_dfastate_t *state)
1647 {
1648 re_node_set_free (&state->non_eps_nodes);
1649 re_node_set_free (&state->inveclosure);
1650 if (state->entrance_nodes != &state->nodes)
1651 {
1652 re_node_set_free (state->entrance_nodes);
1653 re_free (state->entrance_nodes);
1654 }
1655 re_node_set_free (&state->nodes);
1656 re_free (state->word_trtable);
1657 re_free (state->trtable);
1658 re_free (state);
1659 }
1660