1a7c91847Schristos /* Extended regular expression matching and search library.
2a7c91847Schristos Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3a7c91847Schristos This file is part of the GNU C Library.
4a7c91847Schristos Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5a7c91847Schristos
6a7c91847Schristos This program is free software; you can redistribute it and/or modify
7a7c91847Schristos it under the terms of the GNU General Public License as published by
8a7c91847Schristos the Free Software Foundation; either version 2, or (at your option)
9a7c91847Schristos any later version.
10a7c91847Schristos
11a7c91847Schristos This program is distributed in the hope that it will be useful,
12a7c91847Schristos but WITHOUT ANY WARRANTY; without even the implied warranty of
13a7c91847Schristos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14a7c91847Schristos GNU General Public License for more details.
15a7c91847Schristos
16a7c91847Schristos You should have received a copy of the GNU General Public License along
17a7c91847Schristos with this program; if not, write to the Free Software Foundation,
18a7c91847Schristos Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19*5a6c14c8Schristos #include <sys/cdefs.h>
20*5a6c14c8Schristos __RCSID("$NetBSD: regex_internal.c,v 1.2 2016/05/17 14:00:09 christos Exp $");
21*5a6c14c8Schristos
22a7c91847Schristos
23a7c91847Schristos static void re_string_construct_common (const char *str, Idx len,
24a7c91847Schristos re_string_t *pstr,
25a7c91847Schristos REG_TRANSLATE_TYPE trans, bool icase,
26a7c91847Schristos const re_dfa_t *dfa) internal_function;
27a7c91847Schristos static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
28a7c91847Schristos const re_node_set *nodes,
29a7c91847Schristos re_hashval_t hash) internal_function;
30a7c91847Schristos static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
31a7c91847Schristos const re_node_set *nodes,
32a7c91847Schristos unsigned int context,
33a7c91847Schristos re_hashval_t hash) internal_function;
34a7c91847Schristos
35a7c91847Schristos /* Functions for string operation. */
36a7c91847Schristos
37a7c91847Schristos /* This function allocate the buffers. It is necessary to call
38a7c91847Schristos re_string_reconstruct before using the object. */
39a7c91847Schristos
40a7c91847Schristos static reg_errcode_t
41a7c91847Schristos internal_function
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)42a7c91847Schristos re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
43a7c91847Schristos REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
44a7c91847Schristos {
45a7c91847Schristos reg_errcode_t ret;
46a7c91847Schristos Idx init_buf_len;
47a7c91847Schristos
48a7c91847Schristos /* Ensure at least one character fits into the buffers. */
49a7c91847Schristos if (init_len < dfa->mb_cur_max)
50a7c91847Schristos init_len = dfa->mb_cur_max;
51a7c91847Schristos init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
52a7c91847Schristos re_string_construct_common (str, len, pstr, trans, icase, dfa);
53a7c91847Schristos
54a7c91847Schristos ret = re_string_realloc_buffers (pstr, init_buf_len);
55a7c91847Schristos if (BE (ret != REG_NOERROR, 0))
56a7c91847Schristos return ret;
57a7c91847Schristos
58a7c91847Schristos pstr->word_char = dfa->word_char;
59a7c91847Schristos pstr->word_ops_used = dfa->word_ops_used;
60a7c91847Schristos pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
61a7c91847Schristos pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
62a7c91847Schristos pstr->valid_raw_len = pstr->valid_len;
63a7c91847Schristos return REG_NOERROR;
64a7c91847Schristos }
65a7c91847Schristos
66a7c91847Schristos /* This function allocate the buffers, and initialize them. */
67a7c91847Schristos
68a7c91847Schristos static reg_errcode_t
69a7c91847Schristos internal_function
re_string_construct(re_string_t * pstr,const char * str,Idx len,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)70a7c91847Schristos re_string_construct (re_string_t *pstr, const char *str, Idx len,
71a7c91847Schristos REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
72a7c91847Schristos {
73a7c91847Schristos reg_errcode_t ret;
74a7c91847Schristos memset (pstr, '\0', sizeof (re_string_t));
75a7c91847Schristos re_string_construct_common (str, len, pstr, trans, icase, dfa);
76a7c91847Schristos
77a7c91847Schristos if (len > 0)
78a7c91847Schristos {
79a7c91847Schristos ret = re_string_realloc_buffers (pstr, len + 1);
80a7c91847Schristos if (BE (ret != REG_NOERROR, 0))
81a7c91847Schristos return ret;
82a7c91847Schristos }
83a7c91847Schristos pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
84a7c91847Schristos
85a7c91847Schristos if (icase)
86a7c91847Schristos {
87a7c91847Schristos #ifdef RE_ENABLE_I18N
88a7c91847Schristos if (dfa->mb_cur_max > 1)
89a7c91847Schristos {
90a7c91847Schristos while (1)
91a7c91847Schristos {
92a7c91847Schristos ret = build_wcs_upper_buffer (pstr);
93a7c91847Schristos if (BE (ret != REG_NOERROR, 0))
94a7c91847Schristos return ret;
95a7c91847Schristos if (pstr->valid_raw_len >= len)
96a7c91847Schristos break;
97a7c91847Schristos if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
98a7c91847Schristos break;
99a7c91847Schristos ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
100a7c91847Schristos if (BE (ret != REG_NOERROR, 0))
101a7c91847Schristos return ret;
102a7c91847Schristos }
103a7c91847Schristos }
104a7c91847Schristos else
105a7c91847Schristos #endif /* RE_ENABLE_I18N */
106a7c91847Schristos build_upper_buffer (pstr);
107a7c91847Schristos }
108a7c91847Schristos else
109a7c91847Schristos {
110a7c91847Schristos #ifdef RE_ENABLE_I18N
111a7c91847Schristos if (dfa->mb_cur_max > 1)
112a7c91847Schristos build_wcs_buffer (pstr);
113a7c91847Schristos else
114a7c91847Schristos #endif /* RE_ENABLE_I18N */
115a7c91847Schristos {
116a7c91847Schristos if (trans != NULL)
117a7c91847Schristos re_string_translate_buffer (pstr);
118a7c91847Schristos else
119a7c91847Schristos {
120a7c91847Schristos pstr->valid_len = pstr->bufs_len;
121a7c91847Schristos pstr->valid_raw_len = pstr->bufs_len;
122a7c91847Schristos }
123a7c91847Schristos }
124a7c91847Schristos }
125a7c91847Schristos
126a7c91847Schristos return REG_NOERROR;
127a7c91847Schristos }
128a7c91847Schristos
129a7c91847Schristos /* Helper functions for re_string_allocate, and re_string_construct. */
130a7c91847Schristos
131a7c91847Schristos static reg_errcode_t
132a7c91847Schristos internal_function
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)133a7c91847Schristos re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
134a7c91847Schristos {
135a7c91847Schristos #ifdef RE_ENABLE_I18N
136a7c91847Schristos if (pstr->mb_cur_max > 1)
137a7c91847Schristos {
138a7c91847Schristos wint_t *new_wcs = re_xrealloc (pstr->wcs, wint_t, new_buf_len);
139a7c91847Schristos if (BE (new_wcs == NULL, 0))
140a7c91847Schristos return REG_ESPACE;
141a7c91847Schristos pstr->wcs = new_wcs;
142a7c91847Schristos if (pstr->offsets != NULL)
143a7c91847Schristos {
144a7c91847Schristos Idx *new_offsets = re_xrealloc (pstr->offsets, Idx, new_buf_len);
145a7c91847Schristos if (BE (new_offsets == NULL, 0))
146a7c91847Schristos return REG_ESPACE;
147a7c91847Schristos pstr->offsets = new_offsets;
148a7c91847Schristos }
149a7c91847Schristos }
150a7c91847Schristos #endif /* RE_ENABLE_I18N */
151a7c91847Schristos if (pstr->mbs_allocated)
152a7c91847Schristos {
153a7c91847Schristos unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
154a7c91847Schristos new_buf_len);
155a7c91847Schristos if (BE (new_mbs == NULL, 0))
156a7c91847Schristos return REG_ESPACE;
157a7c91847Schristos pstr->mbs = new_mbs;
158a7c91847Schristos }
159a7c91847Schristos pstr->bufs_len = new_buf_len;
160a7c91847Schristos return REG_NOERROR;
161a7c91847Schristos }
162a7c91847Schristos
163a7c91847Schristos
164a7c91847Schristos static void
165a7c91847Schristos internal_function
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)166a7c91847Schristos re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
167a7c91847Schristos REG_TRANSLATE_TYPE trans, bool icase,
168a7c91847Schristos const re_dfa_t *dfa)
169a7c91847Schristos {
170a7c91847Schristos pstr->raw_mbs = (const unsigned char *) str;
171a7c91847Schristos pstr->len = len;
172a7c91847Schristos pstr->raw_len = len;
173a7c91847Schristos pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
174a7c91847Schristos pstr->icase = icase;
175a7c91847Schristos pstr->mbs_allocated = (trans != NULL || icase);
176a7c91847Schristos pstr->mb_cur_max = dfa->mb_cur_max;
177a7c91847Schristos pstr->is_utf8 = dfa->is_utf8;
178a7c91847Schristos pstr->map_notascii = dfa->map_notascii;
179a7c91847Schristos pstr->stop = pstr->len;
180a7c91847Schristos pstr->raw_stop = pstr->stop;
181a7c91847Schristos }
182a7c91847Schristos
183a7c91847Schristos #ifdef RE_ENABLE_I18N
184a7c91847Schristos
185a7c91847Schristos /* Build wide character buffer PSTR->WCS.
186a7c91847Schristos If the byte sequence of the string are:
187a7c91847Schristos <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
188a7c91847Schristos Then wide character buffer will be:
189a7c91847Schristos <wc1> , WEOF , <wc2> , WEOF , <wc3>
190a7c91847Schristos We use WEOF for padding, they indicate that the position isn't
191a7c91847Schristos a first byte of a multibyte character.
192a7c91847Schristos
193a7c91847Schristos Note that this function assumes PSTR->VALID_LEN elements are already
194a7c91847Schristos built and starts from PSTR->VALID_LEN. */
195a7c91847Schristos
196a7c91847Schristos static void
197a7c91847Schristos internal_function
build_wcs_buffer(re_string_t * pstr)198a7c91847Schristos build_wcs_buffer (re_string_t *pstr)
199a7c91847Schristos {
200a7c91847Schristos #ifdef _LIBC
201a7c91847Schristos unsigned char buf[MB_LEN_MAX];
202a7c91847Schristos assert (MB_LEN_MAX >= pstr->mb_cur_max);
203a7c91847Schristos #else
204a7c91847Schristos unsigned char buf[64];
205a7c91847Schristos #endif
206a7c91847Schristos mbstate_t prev_st;
207a7c91847Schristos Idx byte_idx, end_idx, remain_len;
208a7c91847Schristos size_t mbclen;
209a7c91847Schristos
210a7c91847Schristos /* Build the buffers from pstr->valid_len to either pstr->len or
211a7c91847Schristos pstr->bufs_len. */
212a7c91847Schristos end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
213a7c91847Schristos for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
214a7c91847Schristos {
215a7c91847Schristos wchar_t wc;
216a7c91847Schristos const char *p;
217a7c91847Schristos
218a7c91847Schristos remain_len = end_idx - byte_idx;
219a7c91847Schristos prev_st = pstr->cur_state;
220a7c91847Schristos /* Apply the translation if we need. */
221a7c91847Schristos if (BE (pstr->trans != NULL, 0))
222a7c91847Schristos {
223a7c91847Schristos int i, ch;
224a7c91847Schristos
225a7c91847Schristos for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
226a7c91847Schristos {
227a7c91847Schristos ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
228a7c91847Schristos buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
229a7c91847Schristos }
230a7c91847Schristos p = (const char *) buf;
231a7c91847Schristos }
232a7c91847Schristos else
233a7c91847Schristos p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
234a7c91847Schristos mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
235a7c91847Schristos if (BE (mbclen == (size_t) -2, 0))
236a7c91847Schristos {
237a7c91847Schristos /* The buffer doesn't have enough space, finish to build. */
238a7c91847Schristos pstr->cur_state = prev_st;
239a7c91847Schristos break;
240a7c91847Schristos }
241a7c91847Schristos else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
242a7c91847Schristos {
243a7c91847Schristos /* We treat these cases as a singlebyte character. */
244a7c91847Schristos mbclen = 1;
245a7c91847Schristos wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
246a7c91847Schristos if (BE (pstr->trans != NULL, 0))
247a7c91847Schristos wc = pstr->trans[wc];
248a7c91847Schristos pstr->cur_state = prev_st;
249a7c91847Schristos }
250a7c91847Schristos
251a7c91847Schristos /* Write wide character and padding. */
252a7c91847Schristos pstr->wcs[byte_idx++] = wc;
253a7c91847Schristos /* Write paddings. */
254a7c91847Schristos for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
255a7c91847Schristos pstr->wcs[byte_idx++] = WEOF;
256a7c91847Schristos }
257a7c91847Schristos pstr->valid_len = byte_idx;
258a7c91847Schristos pstr->valid_raw_len = byte_idx;
259a7c91847Schristos }
260a7c91847Schristos
261a7c91847Schristos /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
262a7c91847Schristos but for REG_ICASE. */
263a7c91847Schristos
264a7c91847Schristos static reg_errcode_t
265a7c91847Schristos internal_function
build_wcs_upper_buffer(re_string_t * pstr)266a7c91847Schristos build_wcs_upper_buffer (re_string_t *pstr)
267a7c91847Schristos {
268a7c91847Schristos mbstate_t prev_st;
269a7c91847Schristos Idx src_idx, byte_idx, end_idx, remain_len;
270a7c91847Schristos size_t mbclen;
271a7c91847Schristos #ifdef _LIBC
272a7c91847Schristos char buf[MB_LEN_MAX];
273a7c91847Schristos assert (MB_LEN_MAX >= pstr->mb_cur_max);
274a7c91847Schristos #else
275a7c91847Schristos char buf[64];
276a7c91847Schristos #endif
277a7c91847Schristos
278a7c91847Schristos byte_idx = pstr->valid_len;
279a7c91847Schristos end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
280a7c91847Schristos
281a7c91847Schristos /* The following optimization assumes that ASCII characters can be
282a7c91847Schristos mapped to wide characters with a simple cast. */
283a7c91847Schristos if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
284a7c91847Schristos {
285a7c91847Schristos while (byte_idx < end_idx)
286a7c91847Schristos {
287a7c91847Schristos wchar_t wc;
288a7c91847Schristos
289a7c91847Schristos if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
290a7c91847Schristos && mbsinit (&pstr->cur_state))
291a7c91847Schristos {
292a7c91847Schristos /* In case of a singlebyte character. */
293a7c91847Schristos pstr->mbs[byte_idx]
294a7c91847Schristos = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
295a7c91847Schristos /* The next step uses the assumption that wchar_t is encoded
296a7c91847Schristos ASCII-safe: all ASCII values can be converted like this. */
297a7c91847Schristos pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
298a7c91847Schristos ++byte_idx;
299a7c91847Schristos continue;
300a7c91847Schristos }
301a7c91847Schristos
302a7c91847Schristos remain_len = end_idx - byte_idx;
303a7c91847Schristos prev_st = pstr->cur_state;
304a7c91847Schristos mbclen = mbrtowc (&wc,
305a7c91847Schristos ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
306a7c91847Schristos + byte_idx), remain_len, &pstr->cur_state);
307a7c91847Schristos if (BE ((size_t) (mbclen + 2) > 2, 1))
308a7c91847Schristos {
309a7c91847Schristos wchar_t wcu = wc;
310a7c91847Schristos if (iswlower (wc))
311a7c91847Schristos {
312a7c91847Schristos size_t mbcdlen;
313a7c91847Schristos
314a7c91847Schristos wcu = towupper (wc);
315a7c91847Schristos mbcdlen = wcrtomb (buf, wcu, &prev_st);
316a7c91847Schristos if (BE (mbclen == mbcdlen, 1))
317a7c91847Schristos memcpy (pstr->mbs + byte_idx, buf, mbclen);
318a7c91847Schristos else
319a7c91847Schristos {
320a7c91847Schristos src_idx = byte_idx;
321a7c91847Schristos goto offsets_needed;
322a7c91847Schristos }
323a7c91847Schristos }
324a7c91847Schristos else
325a7c91847Schristos memcpy (pstr->mbs + byte_idx,
326a7c91847Schristos pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
327a7c91847Schristos pstr->wcs[byte_idx++] = wcu;
328a7c91847Schristos /* Write paddings. */
329a7c91847Schristos for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
330a7c91847Schristos pstr->wcs[byte_idx++] = WEOF;
331a7c91847Schristos }
332a7c91847Schristos else if (mbclen == (size_t) -1 || mbclen == 0)
333a7c91847Schristos {
334a7c91847Schristos /* It is an invalid character or '\0'. Just use the byte. */
335a7c91847Schristos int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
336a7c91847Schristos pstr->mbs[byte_idx] = ch;
337a7c91847Schristos /* And also cast it to wide char. */
338a7c91847Schristos pstr->wcs[byte_idx++] = (wchar_t) ch;
339a7c91847Schristos if (BE (mbclen == (size_t) -1, 0))
340a7c91847Schristos pstr->cur_state = prev_st;
341a7c91847Schristos }
342a7c91847Schristos else
343a7c91847Schristos {
344a7c91847Schristos /* The buffer doesn't have enough space, finish to build. */
345a7c91847Schristos pstr->cur_state = prev_st;
346a7c91847Schristos break;
347a7c91847Schristos }
348a7c91847Schristos }
349a7c91847Schristos pstr->valid_len = byte_idx;
350a7c91847Schristos pstr->valid_raw_len = byte_idx;
351a7c91847Schristos return REG_NOERROR;
352a7c91847Schristos }
353a7c91847Schristos else
354a7c91847Schristos for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
355a7c91847Schristos {
356a7c91847Schristos wchar_t wc;
357a7c91847Schristos const char *p;
358a7c91847Schristos offsets_needed:
359a7c91847Schristos remain_len = end_idx - byte_idx;
360a7c91847Schristos prev_st = pstr->cur_state;
361a7c91847Schristos if (BE (pstr->trans != NULL, 0))
362a7c91847Schristos {
363a7c91847Schristos int i, ch;
364a7c91847Schristos
365a7c91847Schristos for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
366a7c91847Schristos {
367a7c91847Schristos ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
368a7c91847Schristos buf[i] = pstr->trans[ch];
369a7c91847Schristos }
370a7c91847Schristos p = (const char *) buf;
371a7c91847Schristos }
372a7c91847Schristos else
373a7c91847Schristos p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
374a7c91847Schristos mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
375a7c91847Schristos if (BE ((size_t) (mbclen + 2) > 2, 1))
376a7c91847Schristos {
377a7c91847Schristos wchar_t wcu = wc;
378a7c91847Schristos if (iswlower (wc))
379a7c91847Schristos {
380a7c91847Schristos size_t mbcdlen;
381a7c91847Schristos
382a7c91847Schristos wcu = towupper (wc);
383a7c91847Schristos mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
384a7c91847Schristos if (BE (mbclen == mbcdlen, 1))
385a7c91847Schristos memcpy (pstr->mbs + byte_idx, buf, mbclen);
386a7c91847Schristos else if (mbcdlen != (size_t) -1)
387a7c91847Schristos {
388a7c91847Schristos size_t i;
389a7c91847Schristos
390a7c91847Schristos if (byte_idx + mbcdlen > pstr->bufs_len)
391a7c91847Schristos {
392a7c91847Schristos pstr->cur_state = prev_st;
393a7c91847Schristos break;
394a7c91847Schristos }
395a7c91847Schristos
396a7c91847Schristos if (pstr->offsets == NULL)
397a7c91847Schristos {
398a7c91847Schristos pstr->offsets = re_xmalloc (Idx, pstr->bufs_len);
399a7c91847Schristos
400a7c91847Schristos if (pstr->offsets == NULL)
401a7c91847Schristos return REG_ESPACE;
402a7c91847Schristos }
403a7c91847Schristos if (!pstr->offsets_needed)
404a7c91847Schristos {
405a7c91847Schristos for (i = 0; i < (size_t) byte_idx; ++i)
406a7c91847Schristos pstr->offsets[i] = i;
407a7c91847Schristos pstr->offsets_needed = 1;
408a7c91847Schristos }
409a7c91847Schristos
410a7c91847Schristos memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
411a7c91847Schristos pstr->wcs[byte_idx] = wcu;
412a7c91847Schristos pstr->offsets[byte_idx] = src_idx;
413a7c91847Schristos for (i = 1; i < mbcdlen; ++i)
414a7c91847Schristos {
415a7c91847Schristos pstr->offsets[byte_idx + i]
416a7c91847Schristos = src_idx + (i < mbclen ? i : mbclen - 1);
417a7c91847Schristos pstr->wcs[byte_idx + i] = WEOF;
418a7c91847Schristos }
419a7c91847Schristos pstr->len += mbcdlen - mbclen;
420a7c91847Schristos if (pstr->raw_stop > src_idx)
421a7c91847Schristos pstr->stop += mbcdlen - mbclen;
422a7c91847Schristos end_idx = (pstr->bufs_len > pstr->len)
423a7c91847Schristos ? pstr->len : pstr->bufs_len;
424a7c91847Schristos byte_idx += mbcdlen;
425a7c91847Schristos src_idx += mbclen;
426a7c91847Schristos continue;
427a7c91847Schristos }
428a7c91847Schristos else
429a7c91847Schristos memcpy (pstr->mbs + byte_idx, p, mbclen);
430a7c91847Schristos }
431a7c91847Schristos else
432a7c91847Schristos memcpy (pstr->mbs + byte_idx, p, mbclen);
433a7c91847Schristos
434a7c91847Schristos if (BE (pstr->offsets_needed != 0, 0))
435a7c91847Schristos {
436a7c91847Schristos size_t i;
437a7c91847Schristos for (i = 0; i < mbclen; ++i)
438a7c91847Schristos pstr->offsets[byte_idx + i] = src_idx + i;
439a7c91847Schristos }
440a7c91847Schristos src_idx += mbclen;
441a7c91847Schristos
442a7c91847Schristos pstr->wcs[byte_idx++] = wcu;
443a7c91847Schristos /* Write paddings. */
444a7c91847Schristos for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
445a7c91847Schristos pstr->wcs[byte_idx++] = WEOF;
446a7c91847Schristos }
447a7c91847Schristos else if (mbclen == (size_t) -1 || mbclen == 0)
448a7c91847Schristos {
449a7c91847Schristos /* It is an invalid character or '\0'. Just use the byte. */
450a7c91847Schristos int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
451a7c91847Schristos
452a7c91847Schristos if (BE (pstr->trans != NULL, 0))
453a7c91847Schristos ch = pstr->trans [ch];
454a7c91847Schristos pstr->mbs[byte_idx] = ch;
455a7c91847Schristos
456a7c91847Schristos if (BE (pstr->offsets_needed != 0, 0))
457a7c91847Schristos pstr->offsets[byte_idx] = src_idx;
458a7c91847Schristos ++src_idx;
459a7c91847Schristos
460a7c91847Schristos /* And also cast it to wide char. */
461a7c91847Schristos pstr->wcs[byte_idx++] = (wchar_t) ch;
462a7c91847Schristos if (BE (mbclen == (size_t) -1, 0))
463a7c91847Schristos pstr->cur_state = prev_st;
464a7c91847Schristos }
465a7c91847Schristos else
466a7c91847Schristos {
467a7c91847Schristos /* The buffer doesn't have enough space, finish to build. */
468a7c91847Schristos pstr->cur_state = prev_st;
469a7c91847Schristos break;
470a7c91847Schristos }
471a7c91847Schristos }
472a7c91847Schristos pstr->valid_len = byte_idx;
473a7c91847Schristos pstr->valid_raw_len = src_idx;
474a7c91847Schristos return REG_NOERROR;
475a7c91847Schristos }
476a7c91847Schristos
477a7c91847Schristos /* Skip characters until the index becomes greater than NEW_RAW_IDX.
478a7c91847Schristos Return the index. */
479a7c91847Schristos
480a7c91847Schristos static Idx
481a7c91847Schristos internal_function
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)482a7c91847Schristos re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
483a7c91847Schristos {
484a7c91847Schristos mbstate_t prev_st;
485a7c91847Schristos Idx rawbuf_idx;
486a7c91847Schristos size_t mbclen;
487a7c91847Schristos wchar_t wc = 0;
488a7c91847Schristos
489a7c91847Schristos /* Skip the characters which are not necessary to check. */
490a7c91847Schristos for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
491a7c91847Schristos rawbuf_idx < new_raw_idx;)
492a7c91847Schristos {
493a7c91847Schristos Idx remain_len;
494a7c91847Schristos remain_len = pstr->len - rawbuf_idx;
495a7c91847Schristos prev_st = pstr->cur_state;
496a7c91847Schristos mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
497a7c91847Schristos remain_len, &pstr->cur_state);
498a7c91847Schristos if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
499a7c91847Schristos {
500a7c91847Schristos /* We treat these cases as a singlebyte character. */
501a7c91847Schristos mbclen = 1;
502a7c91847Schristos pstr->cur_state = prev_st;
503a7c91847Schristos }
504a7c91847Schristos /* Then proceed the next character. */
505a7c91847Schristos rawbuf_idx += mbclen;
506a7c91847Schristos }
507a7c91847Schristos *last_wc = (wint_t) wc;
508a7c91847Schristos return rawbuf_idx;
509a7c91847Schristos }
510a7c91847Schristos #endif /* RE_ENABLE_I18N */
511a7c91847Schristos
512a7c91847Schristos /* Build the buffer PSTR->MBS, and apply the translation if we need.
513a7c91847Schristos This function is used in case of REG_ICASE. */
514a7c91847Schristos
515a7c91847Schristos static void
516a7c91847Schristos internal_function
build_upper_buffer(re_string_t * pstr)517a7c91847Schristos build_upper_buffer (re_string_t *pstr)
518a7c91847Schristos {
519a7c91847Schristos Idx char_idx, end_idx;
520a7c91847Schristos end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
521a7c91847Schristos
522a7c91847Schristos for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
523a7c91847Schristos {
524a7c91847Schristos int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
525a7c91847Schristos if (BE (pstr->trans != NULL, 0))
526a7c91847Schristos ch = pstr->trans[ch];
527a7c91847Schristos if (islower (ch))
528a7c91847Schristos pstr->mbs[char_idx] = toupper (ch);
529a7c91847Schristos else
530a7c91847Schristos pstr->mbs[char_idx] = ch;
531a7c91847Schristos }
532a7c91847Schristos pstr->valid_len = char_idx;
533a7c91847Schristos pstr->valid_raw_len = char_idx;
534a7c91847Schristos }
535a7c91847Schristos
536a7c91847Schristos /* Apply TRANS to the buffer in PSTR. */
537a7c91847Schristos
538a7c91847Schristos static void
539a7c91847Schristos internal_function
re_string_translate_buffer(re_string_t * pstr)540a7c91847Schristos re_string_translate_buffer (re_string_t *pstr)
541a7c91847Schristos {
542a7c91847Schristos Idx buf_idx, end_idx;
543a7c91847Schristos end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
544a7c91847Schristos
545a7c91847Schristos for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
546a7c91847Schristos {
547a7c91847Schristos int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
548a7c91847Schristos pstr->mbs[buf_idx] = pstr->trans[ch];
549a7c91847Schristos }
550a7c91847Schristos
551a7c91847Schristos pstr->valid_len = buf_idx;
552a7c91847Schristos pstr->valid_raw_len = buf_idx;
553a7c91847Schristos }
554a7c91847Schristos
555a7c91847Schristos /* This function re-construct the buffers.
556a7c91847Schristos Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
557a7c91847Schristos convert to upper case in case of REG_ICASE, apply translation. */
558a7c91847Schristos
559a7c91847Schristos static reg_errcode_t
560a7c91847Schristos internal_function
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)561a7c91847Schristos re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
562a7c91847Schristos {
563a7c91847Schristos Idx offset;
564a7c91847Schristos
565a7c91847Schristos if (BE (pstr->raw_mbs_idx <= idx, 0))
566a7c91847Schristos offset = idx - pstr->raw_mbs_idx;
567a7c91847Schristos else
568a7c91847Schristos {
569a7c91847Schristos /* Reset buffer. */
570a7c91847Schristos #ifdef RE_ENABLE_I18N
571a7c91847Schristos if (pstr->mb_cur_max > 1)
572a7c91847Schristos memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
573a7c91847Schristos #endif /* RE_ENABLE_I18N */
574a7c91847Schristos pstr->len = pstr->raw_len;
575a7c91847Schristos pstr->stop = pstr->raw_stop;
576a7c91847Schristos pstr->valid_len = 0;
577a7c91847Schristos pstr->raw_mbs_idx = 0;
578a7c91847Schristos pstr->valid_raw_len = 0;
579a7c91847Schristos pstr->offsets_needed = 0;
580a7c91847Schristos pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
581a7c91847Schristos : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
582a7c91847Schristos if (!pstr->mbs_allocated)
583a7c91847Schristos pstr->mbs = (unsigned char *) pstr->raw_mbs;
584a7c91847Schristos offset = idx;
585a7c91847Schristos }
586a7c91847Schristos
587a7c91847Schristos if (BE (offset != 0, 1))
588a7c91847Schristos {
589a7c91847Schristos /* Are the characters which are already checked remain? */
590a7c91847Schristos if (BE (offset < pstr->valid_raw_len, 1)
591a7c91847Schristos #ifdef RE_ENABLE_I18N
592a7c91847Schristos /* Handling this would enlarge the code too much.
593a7c91847Schristos Accept a slowdown in that case. */
594a7c91847Schristos && pstr->offsets_needed == 0
595a7c91847Schristos #endif
596a7c91847Schristos )
597a7c91847Schristos {
598a7c91847Schristos /* Yes, move them to the front of the buffer. */
599a7c91847Schristos pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
600a7c91847Schristos #ifdef RE_ENABLE_I18N
601a7c91847Schristos if (pstr->mb_cur_max > 1)
602a7c91847Schristos memmove (pstr->wcs, pstr->wcs + offset,
603a7c91847Schristos (pstr->valid_len - offset) * sizeof (wint_t));
604a7c91847Schristos #endif /* RE_ENABLE_I18N */
605a7c91847Schristos if (BE (pstr->mbs_allocated, 0))
606a7c91847Schristos memmove (pstr->mbs, pstr->mbs + offset,
607a7c91847Schristos pstr->valid_len - offset);
608a7c91847Schristos pstr->valid_len -= offset;
609a7c91847Schristos pstr->valid_raw_len -= offset;
610a7c91847Schristos #if DEBUG
611a7c91847Schristos assert (pstr->valid_len > 0);
612a7c91847Schristos #endif
613a7c91847Schristos }
614a7c91847Schristos else
615a7c91847Schristos {
616a7c91847Schristos /* No, skip all characters until IDX. */
617a7c91847Schristos #ifdef RE_ENABLE_I18N
618a7c91847Schristos if (BE (pstr->offsets_needed, 0))
619a7c91847Schristos {
620a7c91847Schristos pstr->len = pstr->raw_len - idx + offset;
621a7c91847Schristos pstr->stop = pstr->raw_stop - idx + offset;
622a7c91847Schristos pstr->offsets_needed = 0;
623a7c91847Schristos }
624a7c91847Schristos #endif
625a7c91847Schristos pstr->valid_len = 0;
626a7c91847Schristos pstr->valid_raw_len = 0;
627a7c91847Schristos #ifdef RE_ENABLE_I18N
628a7c91847Schristos if (pstr->mb_cur_max > 1)
629a7c91847Schristos {
630a7c91847Schristos Idx wcs_idx;
631a7c91847Schristos wint_t wc = WEOF;
632a7c91847Schristos
633a7c91847Schristos if (pstr->is_utf8)
634a7c91847Schristos {
635a7c91847Schristos const unsigned char *raw, *p, *q, *end;
636a7c91847Schristos
637a7c91847Schristos /* Special case UTF-8. Multi-byte chars start with any
638a7c91847Schristos byte other than 0x80 - 0xbf. */
639a7c91847Schristos raw = pstr->raw_mbs + pstr->raw_mbs_idx;
640a7c91847Schristos end = raw + (offset - pstr->mb_cur_max);
641a7c91847Schristos for (p = raw + offset - 1; p >= end; --p)
642a7c91847Schristos if ((*p & 0xc0) != 0x80)
643a7c91847Schristos {
644a7c91847Schristos mbstate_t cur_state;
645a7c91847Schristos wchar_t wc2;
646a7c91847Schristos Idx mlen = raw + pstr->len - p;
647a7c91847Schristos unsigned char buf[6];
648a7c91847Schristos size_t mbclen;
649a7c91847Schristos
650a7c91847Schristos q = p;
651a7c91847Schristos if (BE (pstr->trans != NULL, 0))
652a7c91847Schristos {
653a7c91847Schristos int i = mlen < 6 ? mlen : 6;
654a7c91847Schristos while (--i >= 0)
655a7c91847Schristos buf[i] = pstr->trans[p[i]];
656a7c91847Schristos q = buf;
657a7c91847Schristos }
658a7c91847Schristos /* XXX Don't use mbrtowc, we know which conversion
659a7c91847Schristos to use (UTF-8 -> UCS4). */
660a7c91847Schristos memset (&cur_state, 0, sizeof (cur_state));
661a7c91847Schristos mbclen = mbrtowc (&wc2, (const char *) p, mlen,
662a7c91847Schristos &cur_state);
663a7c91847Schristos if (raw + offset - p <= mbclen && mbclen < (size_t) -2)
664a7c91847Schristos {
665a7c91847Schristos memset (&pstr->cur_state, '\0',
666a7c91847Schristos sizeof (mbstate_t));
667a7c91847Schristos pstr->valid_len = mbclen - (raw + offset - p);
668a7c91847Schristos wc = wc2;
669a7c91847Schristos }
670a7c91847Schristos break;
671a7c91847Schristos }
672a7c91847Schristos }
673a7c91847Schristos
674a7c91847Schristos if (wc == WEOF)
675a7c91847Schristos pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
676a7c91847Schristos if (BE (pstr->valid_len, 0))
677a7c91847Schristos {
678a7c91847Schristos for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
679a7c91847Schristos pstr->wcs[wcs_idx] = WEOF;
680a7c91847Schristos if (pstr->mbs_allocated)
681a7c91847Schristos memset (pstr->mbs, -1, pstr->valid_len);
682a7c91847Schristos }
683a7c91847Schristos pstr->valid_raw_len = pstr->valid_len;
684a7c91847Schristos pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
685a7c91847Schristos && IS_WIDE_WORD_CHAR (wc))
686a7c91847Schristos ? CONTEXT_WORD
687a7c91847Schristos : ((IS_WIDE_NEWLINE (wc)
688a7c91847Schristos && pstr->newline_anchor)
689a7c91847Schristos ? CONTEXT_NEWLINE : 0));
690a7c91847Schristos }
691a7c91847Schristos else
692a7c91847Schristos #endif /* RE_ENABLE_I18N */
693a7c91847Schristos {
694a7c91847Schristos int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
695a7c91847Schristos if (pstr->trans)
696a7c91847Schristos c = pstr->trans[c];
697a7c91847Schristos pstr->tip_context = (bitset_contain (pstr->word_char, c)
698a7c91847Schristos ? CONTEXT_WORD
699a7c91847Schristos : ((IS_NEWLINE (c) && pstr->newline_anchor)
700a7c91847Schristos ? CONTEXT_NEWLINE : 0));
701a7c91847Schristos }
702a7c91847Schristos }
703a7c91847Schristos if (!BE (pstr->mbs_allocated, 0))
704a7c91847Schristos pstr->mbs += offset;
705a7c91847Schristos }
706a7c91847Schristos pstr->raw_mbs_idx = idx;
707a7c91847Schristos pstr->len -= offset;
708a7c91847Schristos pstr->stop -= offset;
709a7c91847Schristos
710a7c91847Schristos /* Then build the buffers. */
711a7c91847Schristos #ifdef RE_ENABLE_I18N
712a7c91847Schristos if (pstr->mb_cur_max > 1)
713a7c91847Schristos {
714a7c91847Schristos if (pstr->icase)
715a7c91847Schristos {
716a7c91847Schristos reg_errcode_t ret = build_wcs_upper_buffer (pstr);
717a7c91847Schristos if (BE (ret != REG_NOERROR, 0))
718a7c91847Schristos return ret;
719a7c91847Schristos }
720a7c91847Schristos else
721a7c91847Schristos build_wcs_buffer (pstr);
722a7c91847Schristos }
723a7c91847Schristos else
724a7c91847Schristos #endif /* RE_ENABLE_I18N */
725a7c91847Schristos if (BE (pstr->mbs_allocated, 0))
726a7c91847Schristos {
727a7c91847Schristos if (pstr->icase)
728a7c91847Schristos build_upper_buffer (pstr);
729a7c91847Schristos else if (pstr->trans != NULL)
730a7c91847Schristos re_string_translate_buffer (pstr);
731a7c91847Schristos }
732a7c91847Schristos else
733a7c91847Schristos pstr->valid_len = pstr->len;
734a7c91847Schristos
735a7c91847Schristos pstr->cur_idx = 0;
736a7c91847Schristos return REG_NOERROR;
737a7c91847Schristos }
738a7c91847Schristos
739a7c91847Schristos static unsigned char
internal_function(pure)740a7c91847Schristos internal_function __attribute ((pure))
741a7c91847Schristos re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
742a7c91847Schristos {
743a7c91847Schristos int ch;
744a7c91847Schristos Idx off;
745a7c91847Schristos
746a7c91847Schristos /* Handle the common (easiest) cases first. */
747a7c91847Schristos if (BE (!pstr->mbs_allocated, 1))
748a7c91847Schristos return re_string_peek_byte (pstr, idx);
749a7c91847Schristos
750a7c91847Schristos #ifdef RE_ENABLE_I18N
751a7c91847Schristos if (pstr->mb_cur_max > 1
752a7c91847Schristos && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
753a7c91847Schristos return re_string_peek_byte (pstr, idx);
754a7c91847Schristos #endif
755a7c91847Schristos
756a7c91847Schristos off = pstr->cur_idx + idx;
757a7c91847Schristos #ifdef RE_ENABLE_I18N
758a7c91847Schristos if (pstr->offsets_needed)
759a7c91847Schristos off = pstr->offsets[off];
760a7c91847Schristos #endif
761a7c91847Schristos
762a7c91847Schristos ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
763a7c91847Schristos
764a7c91847Schristos #ifdef RE_ENABLE_I18N
765a7c91847Schristos /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
766a7c91847Schristos this function returns CAPITAL LETTER I instead of first byte of
767a7c91847Schristos DOTLESS SMALL LETTER I. The latter would confuse the parser,
768a7c91847Schristos since peek_byte_case doesn't advance cur_idx in any way. */
769a7c91847Schristos if (pstr->offsets_needed && !isascii (ch))
770a7c91847Schristos return re_string_peek_byte (pstr, idx);
771a7c91847Schristos #endif
772a7c91847Schristos
773a7c91847Schristos return ch;
774a7c91847Schristos }
775a7c91847Schristos
776a7c91847Schristos static unsigned char
internal_function(pure)777a7c91847Schristos internal_function __attribute ((pure))
778a7c91847Schristos re_string_fetch_byte_case (re_string_t *pstr)
779a7c91847Schristos {
780a7c91847Schristos if (BE (!pstr->mbs_allocated, 1))
781a7c91847Schristos return re_string_fetch_byte (pstr);
782a7c91847Schristos
783a7c91847Schristos #ifdef RE_ENABLE_I18N
784a7c91847Schristos if (pstr->offsets_needed)
785a7c91847Schristos {
786a7c91847Schristos Idx off;
787a7c91847Schristos int ch;
788a7c91847Schristos
789a7c91847Schristos /* For tr_TR.UTF-8 [[:islower:]] there is
790a7c91847Schristos [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
791a7c91847Schristos in that case the whole multi-byte character and return
792a7c91847Schristos the original letter. On the other side, with
793a7c91847Schristos [[: DOTLESS SMALL LETTER I return [[:I, as doing
794a7c91847Schristos anything else would complicate things too much. */
795a7c91847Schristos
796a7c91847Schristos if (!re_string_first_byte (pstr, pstr->cur_idx))
797a7c91847Schristos return re_string_fetch_byte (pstr);
798a7c91847Schristos
799a7c91847Schristos off = pstr->offsets[pstr->cur_idx];
800a7c91847Schristos ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
801a7c91847Schristos
802a7c91847Schristos if (! isascii (ch))
803a7c91847Schristos return re_string_fetch_byte (pstr);
804a7c91847Schristos
805a7c91847Schristos re_string_skip_bytes (pstr,
806a7c91847Schristos re_string_char_size_at (pstr, pstr->cur_idx));
807a7c91847Schristos return ch;
808a7c91847Schristos }
809a7c91847Schristos #endif
810a7c91847Schristos
811a7c91847Schristos return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
812a7c91847Schristos }
813a7c91847Schristos
814a7c91847Schristos static void
815a7c91847Schristos internal_function
re_string_destruct(re_string_t * pstr)816a7c91847Schristos re_string_destruct (re_string_t *pstr)
817a7c91847Schristos {
818a7c91847Schristos #ifdef RE_ENABLE_I18N
819a7c91847Schristos re_free (pstr->wcs);
820a7c91847Schristos re_free (pstr->offsets);
821a7c91847Schristos #endif /* RE_ENABLE_I18N */
822a7c91847Schristos if (pstr->mbs_allocated)
823a7c91847Schristos re_free (pstr->mbs);
824a7c91847Schristos }
825a7c91847Schristos
826a7c91847Schristos /* Return the context at IDX in INPUT. */
827a7c91847Schristos
828a7c91847Schristos static unsigned int
829a7c91847Schristos internal_function
re_string_context_at(const re_string_t * input,Idx idx,int eflags)830a7c91847Schristos re_string_context_at (const re_string_t *input, Idx idx, int eflags)
831a7c91847Schristos {
832a7c91847Schristos int c;
833a7c91847Schristos if (BE (! REG_VALID_INDEX (idx), 0))
834a7c91847Schristos /* In this case, we use the value stored in input->tip_context,
835a7c91847Schristos since we can't know the character in input->mbs[-1] here. */
836a7c91847Schristos return input->tip_context;
837a7c91847Schristos if (BE (idx == input->len, 0))
838a7c91847Schristos return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
839a7c91847Schristos : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
840a7c91847Schristos #ifdef RE_ENABLE_I18N
841a7c91847Schristos if (input->mb_cur_max > 1)
842a7c91847Schristos {
843a7c91847Schristos wint_t wc;
844a7c91847Schristos Idx wc_idx = idx;
845a7c91847Schristos while(input->wcs[wc_idx] == WEOF)
846a7c91847Schristos {
847a7c91847Schristos #ifdef DEBUG
848a7c91847Schristos /* It must not happen. */
849a7c91847Schristos assert (REG_VALID_INDEX (wc_idx));
850a7c91847Schristos #endif
851a7c91847Schristos --wc_idx;
852a7c91847Schristos if (! REG_VALID_INDEX (wc_idx))
853a7c91847Schristos return input->tip_context;
854a7c91847Schristos }
855a7c91847Schristos wc = input->wcs[wc_idx];
856a7c91847Schristos if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
857a7c91847Schristos return CONTEXT_WORD;
858a7c91847Schristos return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
859a7c91847Schristos ? CONTEXT_NEWLINE : 0);
860a7c91847Schristos }
861a7c91847Schristos else
862a7c91847Schristos #endif
863a7c91847Schristos {
864a7c91847Schristos c = re_string_byte_at (input, idx);
865a7c91847Schristos if (bitset_contain (input->word_char, c))
866a7c91847Schristos return CONTEXT_WORD;
867a7c91847Schristos return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
868a7c91847Schristos }
869a7c91847Schristos }
870a7c91847Schristos
871a7c91847Schristos /* Functions for set operation. */
872a7c91847Schristos
873a7c91847Schristos static reg_errcode_t
874a7c91847Schristos internal_function
re_node_set_alloc(re_node_set * set,Idx size)875a7c91847Schristos re_node_set_alloc (re_node_set *set, Idx size)
876a7c91847Schristos {
877a7c91847Schristos set->alloc = size;
878a7c91847Schristos set->nelem = 0;
879a7c91847Schristos set->elems = re_xmalloc (Idx, size);
880a7c91847Schristos if (BE (set->elems == NULL, 0))
881a7c91847Schristos return REG_ESPACE;
882a7c91847Schristos return REG_NOERROR;
883a7c91847Schristos }
884a7c91847Schristos
885a7c91847Schristos static reg_errcode_t
886a7c91847Schristos internal_function
re_node_set_init_1(re_node_set * set,Idx elem)887a7c91847Schristos re_node_set_init_1 (re_node_set *set, Idx elem)
888a7c91847Schristos {
889a7c91847Schristos set->alloc = 1;
890a7c91847Schristos set->nelem = 1;
891a7c91847Schristos set->elems = re_malloc (Idx, 1);
892a7c91847Schristos if (BE (set->elems == NULL, 0))
893a7c91847Schristos {
894a7c91847Schristos set->alloc = set->nelem = 0;
895a7c91847Schristos return REG_ESPACE;
896a7c91847Schristos }
897a7c91847Schristos set->elems[0] = elem;
898a7c91847Schristos return REG_NOERROR;
899a7c91847Schristos }
900a7c91847Schristos
901a7c91847Schristos static reg_errcode_t
902a7c91847Schristos internal_function
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)903a7c91847Schristos re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
904a7c91847Schristos {
905a7c91847Schristos set->alloc = 2;
906a7c91847Schristos set->elems = re_malloc (Idx, 2);
907a7c91847Schristos if (BE (set->elems == NULL, 0))
908a7c91847Schristos return REG_ESPACE;
909a7c91847Schristos if (elem1 == elem2)
910a7c91847Schristos {
911a7c91847Schristos set->nelem = 1;
912a7c91847Schristos set->elems[0] = elem1;
913a7c91847Schristos }
914a7c91847Schristos else
915a7c91847Schristos {
916a7c91847Schristos set->nelem = 2;
917a7c91847Schristos if (elem1 < elem2)
918a7c91847Schristos {
919a7c91847Schristos set->elems[0] = elem1;
920a7c91847Schristos set->elems[1] = elem2;
921a7c91847Schristos }
922a7c91847Schristos else
923a7c91847Schristos {
924a7c91847Schristos set->elems[0] = elem2;
925a7c91847Schristos set->elems[1] = elem1;
926a7c91847Schristos }
927a7c91847Schristos }
928a7c91847Schristos return REG_NOERROR;
929a7c91847Schristos }
930a7c91847Schristos
931a7c91847Schristos static reg_errcode_t
932a7c91847Schristos internal_function
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)933a7c91847Schristos re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
934a7c91847Schristos {
935a7c91847Schristos dest->nelem = src->nelem;
936a7c91847Schristos if (src->nelem > 0)
937a7c91847Schristos {
938a7c91847Schristos dest->alloc = dest->nelem;
939a7c91847Schristos dest->elems = re_malloc (Idx, dest->alloc);
940a7c91847Schristos if (BE (dest->elems == NULL, 0))
941a7c91847Schristos {
942a7c91847Schristos dest->alloc = dest->nelem = 0;
943a7c91847Schristos return REG_ESPACE;
944a7c91847Schristos }
945a7c91847Schristos memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
946a7c91847Schristos }
947a7c91847Schristos else
948a7c91847Schristos re_node_set_init_empty (dest);
949a7c91847Schristos return REG_NOERROR;
950a7c91847Schristos }
951a7c91847Schristos
952a7c91847Schristos /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
953a7c91847Schristos DEST. Return value indicate the error code or REG_NOERROR if succeeded.
954a7c91847Schristos Note: We assume dest->elems is NULL, when dest->alloc is 0. */
955a7c91847Schristos
956a7c91847Schristos static reg_errcode_t
957a7c91847Schristos internal_function
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)958a7c91847Schristos re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
959a7c91847Schristos const re_node_set *src2)
960a7c91847Schristos {
961a7c91847Schristos Idx i1, i2, is, id, delta, sbase;
962a7c91847Schristos if (src1->nelem == 0 || src2->nelem == 0)
963a7c91847Schristos return REG_NOERROR;
964a7c91847Schristos
965a7c91847Schristos /* We need dest->nelem + 2 * elems_in_intersection; this is a
966a7c91847Schristos conservative estimate. */
967a7c91847Schristos if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
968a7c91847Schristos {
969a7c91847Schristos Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
970a7c91847Schristos Idx *new_elems;
971a7c91847Schristos if (sizeof (Idx) < 3
972a7c91847Schristos && (new_alloc < dest->alloc
973a7c91847Schristos || ((Idx) (src1->nelem + src2->nelem) < src1->nelem)))
974a7c91847Schristos return REG_ESPACE;
975a7c91847Schristos new_elems = re_xrealloc (dest->elems, Idx, new_alloc);
976a7c91847Schristos if (BE (new_elems == NULL, 0))
977a7c91847Schristos return REG_ESPACE;
978a7c91847Schristos dest->elems = new_elems;
979a7c91847Schristos dest->alloc = new_alloc;
980a7c91847Schristos }
981a7c91847Schristos
982a7c91847Schristos /* Find the items in the intersection of SRC1 and SRC2, and copy
983a7c91847Schristos into the top of DEST those that are not already in DEST itself. */
984a7c91847Schristos sbase = dest->nelem + src1->nelem + src2->nelem;
985a7c91847Schristos i1 = src1->nelem - 1;
986a7c91847Schristos i2 = src2->nelem - 1;
987a7c91847Schristos id = dest->nelem - 1;
988a7c91847Schristos for (;;)
989a7c91847Schristos {
990a7c91847Schristos if (src1->elems[i1] == src2->elems[i2])
991a7c91847Schristos {
992a7c91847Schristos /* Try to find the item in DEST. Maybe we could binary search? */
993a7c91847Schristos while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
994a7c91847Schristos --id;
995a7c91847Schristos
996a7c91847Schristos if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
997a7c91847Schristos dest->elems[--sbase] = src1->elems[i1];
998a7c91847Schristos
999a7c91847Schristos if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1000a7c91847Schristos break;
1001a7c91847Schristos }
1002a7c91847Schristos
1003a7c91847Schristos /* Lower the highest of the two items. */
1004a7c91847Schristos else if (src1->elems[i1] < src2->elems[i2])
1005a7c91847Schristos {
1006a7c91847Schristos if (! REG_VALID_INDEX (--i2))
1007a7c91847Schristos break;
1008a7c91847Schristos }
1009a7c91847Schristos else
1010a7c91847Schristos {
1011a7c91847Schristos if (! REG_VALID_INDEX (--i1))
1012a7c91847Schristos break;
1013a7c91847Schristos }
1014a7c91847Schristos }
1015a7c91847Schristos
1016a7c91847Schristos id = dest->nelem - 1;
1017a7c91847Schristos is = dest->nelem + src1->nelem + src2->nelem - 1;
1018a7c91847Schristos delta = is - sbase + 1;
1019a7c91847Schristos
1020a7c91847Schristos /* Now copy. When DELTA becomes zero, the remaining
1021a7c91847Schristos DEST elements are already in place; this is more or
1022a7c91847Schristos less the same loop that is in re_node_set_merge. */
1023a7c91847Schristos dest->nelem += delta;
1024a7c91847Schristos if (delta > 0 && REG_VALID_INDEX (id))
1025a7c91847Schristos for (;;)
1026a7c91847Schristos {
1027a7c91847Schristos if (dest->elems[is] > dest->elems[id])
1028a7c91847Schristos {
1029a7c91847Schristos /* Copy from the top. */
1030a7c91847Schristos dest->elems[id + delta--] = dest->elems[is--];
1031a7c91847Schristos if (delta == 0)
1032a7c91847Schristos break;
1033a7c91847Schristos }
1034a7c91847Schristos else
1035a7c91847Schristos {
1036a7c91847Schristos /* Slide from the bottom. */
1037a7c91847Schristos dest->elems[id + delta] = dest->elems[id];
1038a7c91847Schristos if (! REG_VALID_INDEX (--id))
1039a7c91847Schristos break;
1040a7c91847Schristos }
1041a7c91847Schristos }
1042a7c91847Schristos
1043a7c91847Schristos /* Copy remaining SRC elements. */
1044a7c91847Schristos memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]);
1045a7c91847Schristos
1046a7c91847Schristos return REG_NOERROR;
1047a7c91847Schristos }
1048a7c91847Schristos
1049a7c91847Schristos /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1050a7c91847Schristos DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1051a7c91847Schristos
1052a7c91847Schristos static reg_errcode_t
1053a7c91847Schristos internal_function
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1054a7c91847Schristos re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1055a7c91847Schristos const re_node_set *src2)
1056a7c91847Schristos {
1057a7c91847Schristos Idx i1, i2, id;
1058a7c91847Schristos if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1059a7c91847Schristos {
1060a7c91847Schristos dest->alloc = src1->nelem + src2->nelem;
1061a7c91847Schristos if (sizeof (Idx) < 2 && dest->alloc < src1->nelem)
1062a7c91847Schristos return REG_ESPACE;
1063a7c91847Schristos dest->elems = re_xmalloc (Idx, dest->alloc);
1064a7c91847Schristos if (BE (dest->elems == NULL, 0))
1065a7c91847Schristos return REG_ESPACE;
1066a7c91847Schristos }
1067a7c91847Schristos else
1068a7c91847Schristos {
1069a7c91847Schristos if (src1 != NULL && src1->nelem > 0)
1070a7c91847Schristos return re_node_set_init_copy (dest, src1);
1071a7c91847Schristos else if (src2 != NULL && src2->nelem > 0)
1072a7c91847Schristos return re_node_set_init_copy (dest, src2);
1073a7c91847Schristos else
1074a7c91847Schristos re_node_set_init_empty (dest);
1075a7c91847Schristos return REG_NOERROR;
1076a7c91847Schristos }
1077a7c91847Schristos for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1078a7c91847Schristos {
1079a7c91847Schristos if (src1->elems[i1] > src2->elems[i2])
1080a7c91847Schristos {
1081a7c91847Schristos dest->elems[id++] = src2->elems[i2++];
1082a7c91847Schristos continue;
1083a7c91847Schristos }
1084a7c91847Schristos if (src1->elems[i1] == src2->elems[i2])
1085a7c91847Schristos ++i2;
1086a7c91847Schristos dest->elems[id++] = src1->elems[i1++];
1087a7c91847Schristos }
1088a7c91847Schristos if (i1 < src1->nelem)
1089a7c91847Schristos {
1090a7c91847Schristos memcpy (dest->elems + id, src1->elems + i1,
1091a7c91847Schristos (src1->nelem - i1) * sizeof dest->elems[0]);
1092a7c91847Schristos id += src1->nelem - i1;
1093a7c91847Schristos }
1094a7c91847Schristos else if (i2 < src2->nelem)
1095a7c91847Schristos {
1096a7c91847Schristos memcpy (dest->elems + id, src2->elems + i2,
1097a7c91847Schristos (src2->nelem - i2) * sizeof dest->elems[0]);
1098a7c91847Schristos id += src2->nelem - i2;
1099a7c91847Schristos }
1100a7c91847Schristos dest->nelem = id;
1101a7c91847Schristos return REG_NOERROR;
1102a7c91847Schristos }
1103a7c91847Schristos
1104a7c91847Schristos /* Calculate the union set of the sets DEST and SRC. And store it to
1105a7c91847Schristos DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1106a7c91847Schristos
1107a7c91847Schristos static reg_errcode_t
1108a7c91847Schristos internal_function
re_node_set_merge(re_node_set * dest,const re_node_set * src)1109a7c91847Schristos re_node_set_merge (re_node_set *dest, const re_node_set *src)
1110a7c91847Schristos {
1111a7c91847Schristos Idx is, id, sbase, delta;
1112a7c91847Schristos if (src == NULL || src->nelem == 0)
1113a7c91847Schristos return REG_NOERROR;
1114a7c91847Schristos if (sizeof (Idx) < 3
1115a7c91847Schristos && ((Idx) (2 * src->nelem) < src->nelem
1116a7c91847Schristos || (Idx) (2 * src->nelem + dest->nelem) < dest->nelem))
1117a7c91847Schristos return REG_ESPACE;
1118a7c91847Schristos if (dest->alloc < 2 * src->nelem + dest->nelem)
1119a7c91847Schristos {
1120a7c91847Schristos Idx new_alloc = src->nelem + dest->alloc;
1121a7c91847Schristos Idx *new_buffer;
1122a7c91847Schristos if (sizeof (Idx) < 4 && new_alloc < dest->alloc)
1123a7c91847Schristos return REG_ESPACE;
1124a7c91847Schristos new_buffer = re_x2realloc (dest->elems, Idx, &new_alloc);
1125a7c91847Schristos if (BE (new_buffer == NULL, 0))
1126a7c91847Schristos return REG_ESPACE;
1127a7c91847Schristos dest->elems = new_buffer;
1128a7c91847Schristos dest->alloc = new_alloc;
1129a7c91847Schristos }
1130a7c91847Schristos
1131a7c91847Schristos if (BE (dest->nelem == 0, 0))
1132a7c91847Schristos {
1133a7c91847Schristos dest->nelem = src->nelem;
1134a7c91847Schristos memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
1135a7c91847Schristos return REG_NOERROR;
1136a7c91847Schristos }
1137a7c91847Schristos
1138a7c91847Schristos /* Copy into the top of DEST the items of SRC that are not
1139a7c91847Schristos found in DEST. Maybe we could binary search in DEST? */
1140a7c91847Schristos for (sbase = dest->nelem + 2 * src->nelem,
1141a7c91847Schristos is = src->nelem - 1, id = dest->nelem - 1;
1142a7c91847Schristos REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1143a7c91847Schristos {
1144a7c91847Schristos if (dest->elems[id] == src->elems[is])
1145a7c91847Schristos is--, id--;
1146a7c91847Schristos else if (dest->elems[id] < src->elems[is])
1147a7c91847Schristos dest->elems[--sbase] = src->elems[is--];
1148a7c91847Schristos else /* if (dest->elems[id] > src->elems[is]) */
1149a7c91847Schristos --id;
1150a7c91847Schristos }
1151a7c91847Schristos
1152a7c91847Schristos if (REG_VALID_INDEX (is))
1153a7c91847Schristos {
1154a7c91847Schristos /* If DEST is exhausted, the remaining items of SRC must be unique. */
1155a7c91847Schristos sbase -= is + 1;
1156a7c91847Schristos memcpy (dest->elems + sbase, src->elems,
1157a7c91847Schristos (is + 1) * sizeof dest->elems[0]);
1158a7c91847Schristos }
1159a7c91847Schristos
1160a7c91847Schristos id = dest->nelem - 1;
1161a7c91847Schristos is = dest->nelem + 2 * src->nelem - 1;
1162a7c91847Schristos delta = is - sbase + 1;
1163a7c91847Schristos if (delta == 0)
1164a7c91847Schristos return REG_NOERROR;
1165a7c91847Schristos
1166a7c91847Schristos /* Now copy. When DELTA becomes zero, the remaining
1167a7c91847Schristos DEST elements are already in place. */
1168a7c91847Schristos dest->nelem += delta;
1169a7c91847Schristos for (;;)
1170a7c91847Schristos {
1171a7c91847Schristos if (dest->elems[is] > dest->elems[id])
1172a7c91847Schristos {
1173a7c91847Schristos /* Copy from the top. */
1174a7c91847Schristos dest->elems[id + delta--] = dest->elems[is--];
1175a7c91847Schristos if (delta == 0)
1176a7c91847Schristos break;
1177a7c91847Schristos }
1178a7c91847Schristos else
1179a7c91847Schristos {
1180a7c91847Schristos /* Slide from the bottom. */
1181a7c91847Schristos dest->elems[id + delta] = dest->elems[id];
1182a7c91847Schristos if (! REG_VALID_INDEX (--id))
1183a7c91847Schristos {
1184a7c91847Schristos /* Copy remaining SRC elements. */
1185a7c91847Schristos memcpy (dest->elems, dest->elems + sbase,
1186a7c91847Schristos delta * sizeof dest->elems[0]);
1187a7c91847Schristos break;
1188a7c91847Schristos }
1189a7c91847Schristos }
1190a7c91847Schristos }
1191a7c91847Schristos
1192a7c91847Schristos return REG_NOERROR;
1193a7c91847Schristos }
1194a7c91847Schristos
1195a7c91847Schristos /* Insert the new element ELEM to the re_node_set* SET.
1196a7c91847Schristos SET should not already have ELEM.
1197a7c91847Schristos Return true if successful. */
1198a7c91847Schristos
1199a7c91847Schristos static bool
1200a7c91847Schristos internal_function
re_node_set_insert(re_node_set * set,Idx elem)1201a7c91847Schristos re_node_set_insert (re_node_set *set, Idx elem)
1202a7c91847Schristos {
1203a7c91847Schristos Idx idx;
1204a7c91847Schristos /* In case the set is empty. */
1205a7c91847Schristos if (set->alloc == 0)
1206a7c91847Schristos return re_node_set_init_1 (set, elem) == REG_NOERROR;
1207a7c91847Schristos
1208a7c91847Schristos if (BE (set->nelem, 0) == 0)
1209a7c91847Schristos {
1210a7c91847Schristos /* We already guaranteed above that set->alloc != 0. */
1211a7c91847Schristos set->elems[0] = elem;
1212a7c91847Schristos ++set->nelem;
1213a7c91847Schristos return true;
1214a7c91847Schristos }
1215a7c91847Schristos
1216a7c91847Schristos /* Realloc if we need. */
1217a7c91847Schristos if (set->alloc == set->nelem)
1218a7c91847Schristos {
1219a7c91847Schristos Idx *new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1220a7c91847Schristos if (BE (new_elems == NULL, 0))
1221a7c91847Schristos return false;
1222a7c91847Schristos set->elems = new_elems;
1223a7c91847Schristos }
1224a7c91847Schristos
1225a7c91847Schristos /* Move the elements which follows the new element. Test the
1226a7c91847Schristos first element separately to skip a check in the inner loop. */
1227a7c91847Schristos if (elem < set->elems[0])
1228a7c91847Schristos {
1229a7c91847Schristos idx = 0;
1230a7c91847Schristos for (idx = set->nelem; idx > 0; idx--)
1231a7c91847Schristos set->elems[idx] = set->elems[idx - 1];
1232a7c91847Schristos }
1233a7c91847Schristos else
1234a7c91847Schristos {
1235a7c91847Schristos for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1236a7c91847Schristos set->elems[idx] = set->elems[idx - 1];
1237a7c91847Schristos }
1238a7c91847Schristos
1239a7c91847Schristos /* Insert the new element. */
1240a7c91847Schristos set->elems[idx] = elem;
1241a7c91847Schristos ++set->nelem;
1242a7c91847Schristos return true;
1243a7c91847Schristos }
1244a7c91847Schristos
1245a7c91847Schristos /* Insert the new element ELEM to the re_node_set* SET.
1246a7c91847Schristos SET should not already have any element greater than or equal to ELEM.
1247a7c91847Schristos Return true if successful. */
1248a7c91847Schristos
1249a7c91847Schristos static bool
1250a7c91847Schristos internal_function
re_node_set_insert_last(re_node_set * set,Idx elem)1251a7c91847Schristos re_node_set_insert_last (re_node_set *set, Idx elem)
1252a7c91847Schristos {
1253a7c91847Schristos /* Realloc if we need. */
1254a7c91847Schristos if (set->alloc == set->nelem)
1255a7c91847Schristos {
1256a7c91847Schristos Idx *new_elems;
1257a7c91847Schristos new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1258a7c91847Schristos if (BE (new_elems == NULL, 0))
1259a7c91847Schristos return false;
1260a7c91847Schristos set->elems = new_elems;
1261a7c91847Schristos }
1262a7c91847Schristos
1263a7c91847Schristos /* Insert the new element. */
1264a7c91847Schristos set->elems[set->nelem++] = elem;
1265a7c91847Schristos return true;
1266a7c91847Schristos }
1267a7c91847Schristos
1268a7c91847Schristos /* Compare two node sets SET1 and SET2.
1269a7c91847Schristos Return true if SET1 and SET2 are equivalent. */
1270a7c91847Schristos
1271a7c91847Schristos static bool
internal_function(pure)1272a7c91847Schristos internal_function __attribute ((pure))
1273a7c91847Schristos re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1274a7c91847Schristos {
1275a7c91847Schristos Idx i;
1276a7c91847Schristos if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1277a7c91847Schristos return false;
1278a7c91847Schristos for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1279a7c91847Schristos if (set1->elems[i] != set2->elems[i])
1280a7c91847Schristos return false;
1281a7c91847Schristos return true;
1282a7c91847Schristos }
1283a7c91847Schristos
1284a7c91847Schristos /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1285a7c91847Schristos
1286a7c91847Schristos static Idx
internal_function(pure)1287a7c91847Schristos internal_function __attribute ((pure))
1288a7c91847Schristos re_node_set_contains (const re_node_set *set, Idx elem)
1289a7c91847Schristos {
1290a7c91847Schristos __re_size_t idx, right, mid;
1291a7c91847Schristos if (! REG_VALID_NONZERO_INDEX (set->nelem))
1292a7c91847Schristos return 0;
1293a7c91847Schristos
1294a7c91847Schristos /* Binary search the element. */
1295a7c91847Schristos idx = 0;
1296a7c91847Schristos right = set->nelem - 1;
1297a7c91847Schristos while (idx < right)
1298a7c91847Schristos {
1299a7c91847Schristos mid = (idx + right) / 2;
1300a7c91847Schristos if (set->elems[mid] < elem)
1301a7c91847Schristos idx = mid + 1;
1302a7c91847Schristos else
1303a7c91847Schristos right = mid;
1304a7c91847Schristos }
1305a7c91847Schristos return set->elems[idx] == elem ? idx + 1 : 0;
1306a7c91847Schristos }
1307a7c91847Schristos
1308a7c91847Schristos static void
1309a7c91847Schristos internal_function
re_node_set_remove_at(re_node_set * set,Idx idx)1310a7c91847Schristos re_node_set_remove_at (re_node_set *set, Idx idx)
1311a7c91847Schristos {
1312a7c91847Schristos if (idx < 0 || idx >= set->nelem)
1313a7c91847Schristos return;
1314a7c91847Schristos --set->nelem;
1315a7c91847Schristos for (; idx < set->nelem; idx++)
1316a7c91847Schristos set->elems[idx] = set->elems[idx + 1];
1317a7c91847Schristos }
1318a7c91847Schristos
1319a7c91847Schristos
1320a7c91847Schristos /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1321a7c91847Schristos Or return REG_MISSING if an error occurred. */
1322a7c91847Schristos
1323a7c91847Schristos static Idx
1324a7c91847Schristos internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1325a7c91847Schristos re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1326a7c91847Schristos {
1327a7c91847Schristos int type = token.type;
1328a7c91847Schristos if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1329a7c91847Schristos {
1330a7c91847Schristos Idx new_nodes_alloc = dfa->nodes_alloc;
1331a7c91847Schristos Idx *new_nexts, *new_indices;
1332a7c91847Schristos re_node_set *new_edests, *new_eclosures;
1333a7c91847Schristos
1334a7c91847Schristos re_token_t *new_nodes = re_x2realloc (dfa->nodes, re_token_t,
1335a7c91847Schristos &new_nodes_alloc);
1336a7c91847Schristos if (BE (new_nodes == NULL, 0))
1337a7c91847Schristos return REG_MISSING;
1338a7c91847Schristos dfa->nodes = new_nodes;
1339a7c91847Schristos new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1340a7c91847Schristos new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1341a7c91847Schristos new_edests = re_xrealloc (dfa->edests, re_node_set, new_nodes_alloc);
1342a7c91847Schristos new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1343a7c91847Schristos if (BE (new_nexts == NULL || new_indices == NULL
1344a7c91847Schristos || new_edests == NULL || new_eclosures == NULL, 0))
1345a7c91847Schristos return REG_MISSING;
1346a7c91847Schristos dfa->nexts = new_nexts;
1347a7c91847Schristos dfa->org_indices = new_indices;
1348a7c91847Schristos dfa->edests = new_edests;
1349a7c91847Schristos dfa->eclosures = new_eclosures;
1350a7c91847Schristos dfa->nodes_alloc = new_nodes_alloc;
1351a7c91847Schristos }
1352a7c91847Schristos dfa->nodes[dfa->nodes_len] = token;
1353a7c91847Schristos dfa->nodes[dfa->nodes_len].constraint = 0;
1354a7c91847Schristos #ifdef RE_ENABLE_I18N
1355a7c91847Schristos dfa->nodes[dfa->nodes_len].accept_mb =
1356a7c91847Schristos (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1357a7c91847Schristos #endif
1358a7c91847Schristos dfa->nexts[dfa->nodes_len] = REG_MISSING;
1359a7c91847Schristos re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1360a7c91847Schristos re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1361a7c91847Schristos return dfa->nodes_len++;
1362a7c91847Schristos }
1363a7c91847Schristos
1364a7c91847Schristos static inline re_hashval_t
1365a7c91847Schristos internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1366a7c91847Schristos calc_state_hash (const re_node_set *nodes, unsigned int context)
1367a7c91847Schristos {
1368a7c91847Schristos re_hashval_t hash = nodes->nelem + context;
1369a7c91847Schristos Idx i;
1370a7c91847Schristos for (i = 0 ; i < nodes->nelem ; i++)
1371a7c91847Schristos hash += nodes->elems[i];
1372a7c91847Schristos return hash;
1373a7c91847Schristos }
1374a7c91847Schristos
1375a7c91847Schristos /* Search for the state whose node_set is equivalent to NODES.
1376a7c91847Schristos Return the pointer to the state, if we found it in the DFA.
1377a7c91847Schristos Otherwise create the new one and return it. In case of an error
1378a7c91847Schristos return NULL and set the error code in ERR.
1379a7c91847Schristos Note: - We assume NULL as the invalid state, then it is possible that
1380a7c91847Schristos return value is NULL and ERR is REG_NOERROR.
1381a7c91847Schristos - We never return non-NULL value in case of any errors, it is for
1382a7c91847Schristos optimization. */
1383a7c91847Schristos
1384a7c91847Schristos static re_dfastate_t*
1385a7c91847Schristos internal_function
re_acquire_state(reg_errcode_t * err,re_dfa_t * dfa,const re_node_set * nodes)1386a7c91847Schristos re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1387a7c91847Schristos {
1388a7c91847Schristos re_hashval_t hash;
1389a7c91847Schristos re_dfastate_t *new_state;
1390a7c91847Schristos struct re_state_table_entry *spot;
1391a7c91847Schristos Idx i;
1392a7c91847Schristos #ifdef lint
1393a7c91847Schristos /* Suppress bogus uninitialized-variable warnings. */
1394a7c91847Schristos *err = REG_NOERROR;
1395a7c91847Schristos #endif
1396a7c91847Schristos if (BE (nodes->nelem == 0, 0))
1397a7c91847Schristos {
1398a7c91847Schristos *err = REG_NOERROR;
1399a7c91847Schristos return NULL;
1400a7c91847Schristos }
1401a7c91847Schristos hash = calc_state_hash (nodes, 0);
1402a7c91847Schristos spot = dfa->state_table + (hash & dfa->state_hash_mask);
1403a7c91847Schristos
1404a7c91847Schristos for (i = 0 ; i < spot->num ; i++)
1405a7c91847Schristos {
1406a7c91847Schristos re_dfastate_t *state = spot->array[i];
1407a7c91847Schristos if (hash != state->hash)
1408a7c91847Schristos continue;
1409a7c91847Schristos if (re_node_set_compare (&state->nodes, nodes))
1410a7c91847Schristos return state;
1411a7c91847Schristos }
1412a7c91847Schristos
1413a7c91847Schristos /* There are no appropriate state in the dfa, create the new one. */
1414a7c91847Schristos new_state = create_ci_newstate (dfa, nodes, hash);
1415a7c91847Schristos if (BE (new_state != NULL, 1))
1416a7c91847Schristos return new_state;
1417a7c91847Schristos else
1418a7c91847Schristos {
1419a7c91847Schristos *err = REG_ESPACE;
1420a7c91847Schristos return NULL;
1421a7c91847Schristos }
1422a7c91847Schristos }
1423a7c91847Schristos
1424a7c91847Schristos /* Search for the state whose node_set is equivalent to NODES and
1425a7c91847Schristos whose context is equivalent to CONTEXT.
1426a7c91847Schristos Return the pointer to the state, if we found it in the DFA.
1427a7c91847Schristos Otherwise create the new one and return it. In case of an error
1428a7c91847Schristos return NULL and set the error code in ERR.
1429a7c91847Schristos Note: - We assume NULL as the invalid state, then it is possible that
1430a7c91847Schristos return value is NULL and ERR is REG_NOERROR.
1431a7c91847Schristos - We never return non-NULL value in case of any errors, it is for
1432a7c91847Schristos optimization. */
1433a7c91847Schristos
1434a7c91847Schristos static re_dfastate_t*
1435a7c91847Schristos internal_function
re_acquire_state_context(reg_errcode_t * err,re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1436a7c91847Schristos re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1437a7c91847Schristos const re_node_set *nodes, unsigned int context)
1438a7c91847Schristos {
1439a7c91847Schristos re_hashval_t hash;
1440a7c91847Schristos re_dfastate_t *new_state;
1441a7c91847Schristos struct re_state_table_entry *spot;
1442a7c91847Schristos Idx i;
1443a7c91847Schristos #ifdef lint
1444a7c91847Schristos /* Suppress bogus uninitialized-variable warnings. */
1445a7c91847Schristos *err = REG_NOERROR;
1446a7c91847Schristos #endif
1447a7c91847Schristos if (nodes->nelem == 0)
1448a7c91847Schristos {
1449a7c91847Schristos *err = REG_NOERROR;
1450a7c91847Schristos return NULL;
1451a7c91847Schristos }
1452a7c91847Schristos hash = calc_state_hash (nodes, context);
1453a7c91847Schristos spot = dfa->state_table + (hash & dfa->state_hash_mask);
1454a7c91847Schristos
1455a7c91847Schristos for (i = 0 ; i < spot->num ; i++)
1456a7c91847Schristos {
1457a7c91847Schristos re_dfastate_t *state = spot->array[i];
1458a7c91847Schristos if (state->hash == hash
1459a7c91847Schristos && state->context == context
1460a7c91847Schristos && re_node_set_compare (state->entrance_nodes, nodes))
1461a7c91847Schristos return state;
1462a7c91847Schristos }
1463a7c91847Schristos /* There are no appropriate state in `dfa', create the new one. */
1464a7c91847Schristos new_state = create_cd_newstate (dfa, nodes, context, hash);
1465a7c91847Schristos if (BE (new_state != NULL, 1))
1466a7c91847Schristos return new_state;
1467a7c91847Schristos else
1468a7c91847Schristos {
1469a7c91847Schristos *err = REG_ESPACE;
1470a7c91847Schristos return NULL;
1471a7c91847Schristos }
1472a7c91847Schristos }
1473a7c91847Schristos
1474a7c91847Schristos /* Finish initialization of the new state NEWSTATE, and using its hash value
1475a7c91847Schristos HASH put in the appropriate bucket of DFA's state table. Return value
1476a7c91847Schristos indicates the error code if failed. */
1477a7c91847Schristos
1478a7c91847Schristos static reg_errcode_t
1479a7c91847Schristos internal_function
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1480a7c91847Schristos register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash)
1481a7c91847Schristos {
1482a7c91847Schristos struct re_state_table_entry *spot;
1483a7c91847Schristos reg_errcode_t err;
1484a7c91847Schristos Idx i;
1485a7c91847Schristos
1486a7c91847Schristos newstate->hash = hash;
1487a7c91847Schristos err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1488a7c91847Schristos if (BE (err != REG_NOERROR, 0))
1489a7c91847Schristos return REG_ESPACE;
1490a7c91847Schristos for (i = 0; i < newstate->nodes.nelem; i++)
1491a7c91847Schristos {
1492a7c91847Schristos Idx elem = newstate->nodes.elems[i];
1493a7c91847Schristos if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1494a7c91847Schristos {
1495a7c91847Schristos bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1496a7c91847Schristos if (BE (! ok, 0))
1497a7c91847Schristos return REG_ESPACE;
1498a7c91847Schristos }
1499a7c91847Schristos }
1500a7c91847Schristos
1501a7c91847Schristos spot = dfa->state_table + (hash & dfa->state_hash_mask);
1502a7c91847Schristos if (BE (spot->alloc <= spot->num, 0))
1503a7c91847Schristos {
1504a7c91847Schristos Idx new_alloc = spot->num;
1505a7c91847Schristos re_dfastate_t **new_array = re_x2realloc (spot->array, re_dfastate_t *,
1506a7c91847Schristos &new_alloc);
1507a7c91847Schristos if (BE (new_array == NULL, 0))
1508a7c91847Schristos return REG_ESPACE;
1509a7c91847Schristos spot->array = new_array;
1510a7c91847Schristos spot->alloc = new_alloc;
1511a7c91847Schristos }
1512a7c91847Schristos spot->array[spot->num++] = newstate;
1513a7c91847Schristos return REG_NOERROR;
1514a7c91847Schristos }
1515a7c91847Schristos
1516a7c91847Schristos /* Create the new state which is independ of contexts.
1517a7c91847Schristos Return the new state if succeeded, otherwise return NULL. */
1518a7c91847Schristos
1519a7c91847Schristos static re_dfastate_t *
1520a7c91847Schristos internal_function
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1521a7c91847Schristos create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1522a7c91847Schristos re_hashval_t hash)
1523a7c91847Schristos {
1524a7c91847Schristos Idx i;
1525a7c91847Schristos reg_errcode_t err;
1526a7c91847Schristos re_dfastate_t *newstate;
1527a7c91847Schristos
1528a7c91847Schristos newstate = re_calloc (re_dfastate_t, 1);
1529a7c91847Schristos if (BE (newstate == NULL, 0))
1530a7c91847Schristos return NULL;
1531a7c91847Schristos err = re_node_set_init_copy (&newstate->nodes, nodes);
1532a7c91847Schristos if (BE (err != REG_NOERROR, 0))
1533a7c91847Schristos {
1534a7c91847Schristos re_free (newstate);
1535a7c91847Schristos return NULL;
1536a7c91847Schristos }
1537a7c91847Schristos
1538a7c91847Schristos newstate->entrance_nodes = &newstate->nodes;
1539a7c91847Schristos for (i = 0 ; i < nodes->nelem ; i++)
1540a7c91847Schristos {
1541a7c91847Schristos re_token_t *node = dfa->nodes + nodes->elems[i];
1542a7c91847Schristos re_token_type_t type = node->type;
1543a7c91847Schristos if (type == CHARACTER && !node->constraint)
1544a7c91847Schristos continue;
1545a7c91847Schristos #ifdef RE_ENABLE_I18N
1546a7c91847Schristos newstate->accept_mb |= node->accept_mb;
1547a7c91847Schristos #endif /* RE_ENABLE_I18N */
1548a7c91847Schristos
1549a7c91847Schristos /* If the state has the halt node, the state is a halt state. */
1550a7c91847Schristos if (type == END_OF_RE)
1551a7c91847Schristos newstate->halt = 1;
1552a7c91847Schristos else if (type == OP_BACK_REF)
1553a7c91847Schristos newstate->has_backref = 1;
1554a7c91847Schristos else if (type == ANCHOR || node->constraint)
1555a7c91847Schristos newstate->has_constraint = 1;
1556a7c91847Schristos }
1557a7c91847Schristos err = register_state (dfa, newstate, hash);
1558a7c91847Schristos if (BE (err != REG_NOERROR, 0))
1559a7c91847Schristos {
1560a7c91847Schristos free_state (newstate);
1561a7c91847Schristos newstate = NULL;
1562a7c91847Schristos }
1563a7c91847Schristos return newstate;
1564a7c91847Schristos }
1565a7c91847Schristos
1566a7c91847Schristos /* Create the new state which is depend on the context CONTEXT.
1567a7c91847Schristos Return the new state if succeeded, otherwise return NULL. */
1568a7c91847Schristos
1569a7c91847Schristos static re_dfastate_t *
1570a7c91847Schristos internal_function
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1571a7c91847Schristos create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1572a7c91847Schristos unsigned int context, re_hashval_t hash)
1573a7c91847Schristos {
1574a7c91847Schristos Idx i, nctx_nodes = 0;
1575a7c91847Schristos reg_errcode_t err;
1576a7c91847Schristos re_dfastate_t *newstate;
1577a7c91847Schristos
1578a7c91847Schristos newstate = re_calloc (re_dfastate_t, 1);
1579a7c91847Schristos if (BE (newstate == NULL, 0))
1580a7c91847Schristos return NULL;
1581a7c91847Schristos err = re_node_set_init_copy (&newstate->nodes, nodes);
1582a7c91847Schristos if (BE (err != REG_NOERROR, 0))
1583a7c91847Schristos {
1584a7c91847Schristos re_free (newstate);
1585a7c91847Schristos return NULL;
1586a7c91847Schristos }
1587a7c91847Schristos
1588a7c91847Schristos newstate->context = context;
1589a7c91847Schristos newstate->entrance_nodes = &newstate->nodes;
1590a7c91847Schristos
1591a7c91847Schristos for (i = 0 ; i < nodes->nelem ; i++)
1592a7c91847Schristos {
1593a7c91847Schristos unsigned int constraint = 0;
1594a7c91847Schristos re_token_t *node = dfa->nodes + nodes->elems[i];
1595a7c91847Schristos re_token_type_t type = node->type;
1596a7c91847Schristos if (node->constraint)
1597a7c91847Schristos constraint = node->constraint;
1598a7c91847Schristos
1599a7c91847Schristos if (type == CHARACTER && !constraint)
1600a7c91847Schristos continue;
1601a7c91847Schristos #ifdef RE_ENABLE_I18N
1602a7c91847Schristos newstate->accept_mb |= node->accept_mb;
1603a7c91847Schristos #endif /* RE_ENABLE_I18N */
1604a7c91847Schristos
1605a7c91847Schristos /* If the state has the halt node, the state is a halt state. */
1606a7c91847Schristos if (type == END_OF_RE)
1607a7c91847Schristos newstate->halt = 1;
1608a7c91847Schristos else if (type == OP_BACK_REF)
1609a7c91847Schristos newstate->has_backref = 1;
1610a7c91847Schristos else if (type == ANCHOR)
1611a7c91847Schristos constraint = node->opr.ctx_type;
1612a7c91847Schristos
1613a7c91847Schristos if (constraint)
1614a7c91847Schristos {
1615a7c91847Schristos if (newstate->entrance_nodes == &newstate->nodes)
1616a7c91847Schristos {
1617a7c91847Schristos newstate->entrance_nodes = re_malloc (re_node_set, 1);
1618a7c91847Schristos if (BE (newstate->entrance_nodes == NULL, 0))
1619a7c91847Schristos {
1620a7c91847Schristos free_state (newstate);
1621a7c91847Schristos return NULL;
1622a7c91847Schristos }
1623a7c91847Schristos re_node_set_init_copy (newstate->entrance_nodes, nodes);
1624a7c91847Schristos nctx_nodes = 0;
1625a7c91847Schristos newstate->has_constraint = 1;
1626a7c91847Schristos }
1627a7c91847Schristos
1628a7c91847Schristos if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1629a7c91847Schristos {
1630a7c91847Schristos re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1631a7c91847Schristos ++nctx_nodes;
1632a7c91847Schristos }
1633a7c91847Schristos }
1634a7c91847Schristos }
1635a7c91847Schristos err = register_state (dfa, newstate, hash);
1636a7c91847Schristos if (BE (err != REG_NOERROR, 0))
1637a7c91847Schristos {
1638a7c91847Schristos free_state (newstate);
1639a7c91847Schristos newstate = NULL;
1640a7c91847Schristos }
1641a7c91847Schristos return newstate;
1642a7c91847Schristos }
1643a7c91847Schristos
1644a7c91847Schristos static void
1645a7c91847Schristos internal_function
free_state(re_dfastate_t * state)1646a7c91847Schristos free_state (re_dfastate_t *state)
1647a7c91847Schristos {
1648a7c91847Schristos re_node_set_free (&state->non_eps_nodes);
1649a7c91847Schristos re_node_set_free (&state->inveclosure);
1650a7c91847Schristos if (state->entrance_nodes != &state->nodes)
1651a7c91847Schristos {
1652a7c91847Schristos re_node_set_free (state->entrance_nodes);
1653a7c91847Schristos re_free (state->entrance_nodes);
1654a7c91847Schristos }
1655a7c91847Schristos re_node_set_free (&state->nodes);
1656a7c91847Schristos re_free (state->word_trtable);
1657a7c91847Schristos re_free (state->trtable);
1658a7c91847Schristos re_free (state);
1659a7c91847Schristos }
1660