/* searchutils.c - helper subroutines for grep's matchers.
   Copyright 1992, 1998, 2000, 2007, 2009-2020 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
   02110-1301, USA.  */

#include <config.h>

#include <limits.h>

#define SEARCH_INLINE _GL_EXTERN_INLINE
#define SYSTEM_INLINE _GL_EXTERN_INLINE
#include "search.h"

25*09d4459fSDaniel Fojt /* For each byte B, sbwordchar[B] is true if B is a single-byte
26*09d4459fSDaniel Fojt character that is a word constituent, and is false otherwise. */
27*09d4459fSDaniel Fojt static bool sbwordchar[NCHAR];
28dc7c36e4SJohn Marino
/* Whether -w considers WC to be a word constituent.  WC qualifies if
   it is an underscore or if iswalnum reports it as alphanumeric.  */
static bool
wordchar (wint_t wc)
{
  return wc == L'_' || iswalnum (wc);
}

3695b7b453SJohn Marino void
wordinit(void)37*09d4459fSDaniel Fojt wordinit (void)
3895b7b453SJohn Marino {
39*09d4459fSDaniel Fojt for (int i = 0; i < NCHAR; i++)
40*09d4459fSDaniel Fojt sbwordchar[i] = wordchar (localeinfo.sbctowc[i]);
41*09d4459fSDaniel Fojt }
4295b7b453SJohn Marino
43*09d4459fSDaniel Fojt kwset_t
kwsinit(bool mb_trans)44*09d4459fSDaniel Fojt kwsinit (bool mb_trans)
4595b7b453SJohn Marino {
46*09d4459fSDaniel Fojt char *trans = NULL;
47*09d4459fSDaniel Fojt
48*09d4459fSDaniel Fojt if (match_icase && (MB_CUR_MAX == 1 || mb_trans))
49*09d4459fSDaniel Fojt {
50*09d4459fSDaniel Fojt trans = xmalloc (NCHAR);
51*09d4459fSDaniel Fojt if (MB_CUR_MAX == 1)
52*09d4459fSDaniel Fojt for (int i = 0; i < NCHAR; i++)
53680a9cb8SJohn Marino trans[i] = toupper (i);
5495b7b453SJohn Marino else
55*09d4459fSDaniel Fojt for (int i = 0; i < NCHAR; i++)
5695b7b453SJohn Marino {
57*09d4459fSDaniel Fojt wint_t wc = localeinfo.sbctowc[i];
58*09d4459fSDaniel Fojt wint_t uwc = towupper (wc);
59*09d4459fSDaniel Fojt if (uwc != wc)
6095b7b453SJohn Marino {
61680a9cb8SJohn Marino mbstate_t mbs = { 0 };
62*09d4459fSDaniel Fojt size_t len = wcrtomb (&trans[i], uwc, &mbs);
63*09d4459fSDaniel Fojt if (len != 1)
64*09d4459fSDaniel Fojt abort ();
65680a9cb8SJohn Marino }
66*09d4459fSDaniel Fojt else
67*09d4459fSDaniel Fojt trans[i] = i;
68*09d4459fSDaniel Fojt }
69*09d4459fSDaniel Fojt }
70*09d4459fSDaniel Fojt
71*09d4459fSDaniel Fojt return kwsalloc (trans);
72680a9cb8SJohn Marino }
73680a9cb8SJohn Marino
74680a9cb8SJohn Marino /* In the buffer *MB_START, return the number of bytes needed to go
75680a9cb8SJohn Marino back from CUR to the previous boundary, where a "boundary" is the
76680a9cb8SJohn Marino start of a multibyte character or is an error-encoding byte. The
77680a9cb8SJohn Marino buffer ends at END (i.e., one past the address of the buffer's last
78*09d4459fSDaniel Fojt byte). If CUR is already at a boundary, return 0. If CUR is no
79*09d4459fSDaniel Fojt larger than *MB_START, return CUR - *MB_START without modifying
80*09d4459fSDaniel Fojt *MB_START or *MBCLEN.
81680a9cb8SJohn Marino
82680a9cb8SJohn Marino When returning zero, set *MB_START to CUR. When returning a
83*09d4459fSDaniel Fojt positive value, set *MB_START to the next boundary after CUR,
84*09d4459fSDaniel Fojt or to END if there is no such boundary, and set *MBCLEN to the
85*09d4459fSDaniel Fojt length of the preceding character. */
86680a9cb8SJohn Marino ptrdiff_t
mb_goback(char const ** mb_start,size_t * mbclen,char const * cur,char const * end)87*09d4459fSDaniel Fojt mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
88*09d4459fSDaniel Fojt char const *end)
89680a9cb8SJohn Marino {
90680a9cb8SJohn Marino const char *p = *mb_start;
91680a9cb8SJohn Marino const char *p0 = p;
92*09d4459fSDaniel Fojt size_t clen;
9395b7b453SJohn Marino
94*09d4459fSDaniel Fojt if (cur <= p)
95*09d4459fSDaniel Fojt return cur - p;
9695b7b453SJohn Marino
97*09d4459fSDaniel Fojt if (localeinfo.using_utf8)
9895b7b453SJohn Marino {
99*09d4459fSDaniel Fojt p = cur;
100*09d4459fSDaniel Fojt clen = 1;
101*09d4459fSDaniel Fojt
102*09d4459fSDaniel Fojt if (cur < end && (*cur & 0xc0) == 0x80)
103*09d4459fSDaniel Fojt for (int i = 1; i <= 3; i++)
104*09d4459fSDaniel Fojt if ((cur[-i] & 0xc0) != 0x80)
105*09d4459fSDaniel Fojt {
106*09d4459fSDaniel Fojt mbstate_t mbs = { 0 };
107*09d4459fSDaniel Fojt clen = mb_clen (cur - i, end - (cur - i), &mbs);
108*09d4459fSDaniel Fojt if (i < clen && clen < (size_t) -2)
109*09d4459fSDaniel Fojt {
110*09d4459fSDaniel Fojt p0 = cur - i;
111*09d4459fSDaniel Fojt p = p0 + clen;
112*09d4459fSDaniel Fojt }
113*09d4459fSDaniel Fojt break;
114*09d4459fSDaniel Fojt }
115*09d4459fSDaniel Fojt }
116*09d4459fSDaniel Fojt else
117*09d4459fSDaniel Fojt {
118*09d4459fSDaniel Fojt mbstate_t mbs = { 0 };
119*09d4459fSDaniel Fojt do
120*09d4459fSDaniel Fojt {
121*09d4459fSDaniel Fojt clen = mb_clen (p, end - p, &mbs);
122680a9cb8SJohn Marino
123dc7c36e4SJohn Marino if ((size_t) -2 <= clen)
124680a9cb8SJohn Marino {
125dc7c36e4SJohn Marino /* An invalid sequence, or a truncated multibyte character.
126dc7c36e4SJohn Marino Treat it as a single byte character. */
127dc7c36e4SJohn Marino clen = 1;
128*09d4459fSDaniel Fojt memset (&mbs, 0, sizeof mbs);
12995b7b453SJohn Marino }
130680a9cb8SJohn Marino p0 = p;
131dc7c36e4SJohn Marino p += clen;
13295b7b453SJohn Marino }
133*09d4459fSDaniel Fojt while (p < cur);
134*09d4459fSDaniel Fojt }
13595b7b453SJohn Marino
136680a9cb8SJohn Marino *mb_start = p;
137*09d4459fSDaniel Fojt if (mbclen)
138*09d4459fSDaniel Fojt *mbclen = clen;
139680a9cb8SJohn Marino return p == cur ? 0 : cur - p0;
14095b7b453SJohn Marino }
141680a9cb8SJohn Marino
142*09d4459fSDaniel Fojt /* Examine the start of BUF (which goes to END) for word constituents.
143*09d4459fSDaniel Fojt If COUNTALL, examine as many as possible; otherwise, examine at most one.
144*09d4459fSDaniel Fojt Return the total number of bytes in the examined characters. */
145*09d4459fSDaniel Fojt static size_t
wordchars_count(char const * buf,char const * end,bool countall)146*09d4459fSDaniel Fojt wordchars_count (char const *buf, char const *end, bool countall)
147680a9cb8SJohn Marino {
148*09d4459fSDaniel Fojt size_t n = 0;
149*09d4459fSDaniel Fojt mbstate_t mbs = { 0 };
150*09d4459fSDaniel Fojt while (n < end - buf)
151*09d4459fSDaniel Fojt {
152*09d4459fSDaniel Fojt unsigned char b = buf[n];
153*09d4459fSDaniel Fojt if (sbwordchar[b])
154*09d4459fSDaniel Fojt n++;
155*09d4459fSDaniel Fojt else if (localeinfo.sbclen[b] != -2)
156*09d4459fSDaniel Fojt break;
157*09d4459fSDaniel Fojt else
158*09d4459fSDaniel Fojt {
159*09d4459fSDaniel Fojt wchar_t wc = 0;
160*09d4459fSDaniel Fojt size_t wcbytes = mbrtowc (&wc, buf + n, end - buf - n, &mbs);
161*09d4459fSDaniel Fojt if (!wordchar (wc))
162*09d4459fSDaniel Fojt break;
163*09d4459fSDaniel Fojt n += wcbytes + !wcbytes;
164*09d4459fSDaniel Fojt }
165*09d4459fSDaniel Fojt if (!countall)
166*09d4459fSDaniel Fojt break;
167*09d4459fSDaniel Fojt }
168*09d4459fSDaniel Fojt return n;
169680a9cb8SJohn Marino }
170680a9cb8SJohn Marino
/* Examine the start of BUF for the longest prefix containing just
   word constituents.  Return the total number of bytes in the prefix.
   The buffer ends at END.  */
size_t
wordchars_size (char const *buf, char const *end)
{
  return wordchars_count (buf, end, true);
}

/* If BUF starts with a word constituent, return the number of bytes
   used to represent it; otherwise, return zero.  The buffer ends at END.  */
size_t
wordchar_next (char const *buf, char const *end)
{
  return wordchars_count (buf, end, false);
}

188*09d4459fSDaniel Fojt /* In the buffer BUF, return nonzero if the character whose encoding
189*09d4459fSDaniel Fojt contains the byte before CUR is a word constituent. The buffer
190*09d4459fSDaniel Fojt ends at END. */
191*09d4459fSDaniel Fojt size_t
wordchar_prev(char const * buf,char const * cur,char const * end)192*09d4459fSDaniel Fojt wordchar_prev (char const *buf, char const *cur, char const *end)
193*09d4459fSDaniel Fojt {
194*09d4459fSDaniel Fojt if (buf == cur)
195*09d4459fSDaniel Fojt return 0;
196*09d4459fSDaniel Fojt unsigned char b = *--cur;
197*09d4459fSDaniel Fojt if (! localeinfo.multibyte
198*09d4459fSDaniel Fojt || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
199*09d4459fSDaniel Fojt return sbwordchar[b];
200*09d4459fSDaniel Fojt char const *p = buf;
201*09d4459fSDaniel Fojt cur -= mb_goback (&p, NULL, cur, end);
202*09d4459fSDaniel Fojt return wordchar_next (cur, end);
203680a9cb8SJohn Marino }
204