/* xref: /dflybsd-src/contrib/grep/src/searchutils.c (revision 91b9ed38d3db6a8a8ac5b66da1d43e6e331e259a) */
/* searchutils.c - helper subroutines for grep's matchers.
   Copyright 1992, 1998, 2000, 2007, 2009-2020 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
   02110-1301, USA.  */

#include <config.h>

#include <limits.h>

#define SEARCH_INLINE _GL_EXTERN_INLINE
#define SYSTEM_INLINE _GL_EXTERN_INLINE
#include "search.h"

25*09d4459fSDaniel Fojt /* For each byte B, sbwordchar[B] is true if B is a single-byte
26*09d4459fSDaniel Fojt    character that is a word constituent, and is false otherwise.  */
27*09d4459fSDaniel Fojt static bool sbwordchar[NCHAR];
28dc7c36e4SJohn Marino 
/* Whether -w considers WC to be a word constituent: true if WC is an
   underscore or is classified as alphanumeric by iswalnum, false
   otherwise.  */
static bool
wordchar (wint_t wc)
{
  return wc == L'_' || iswalnum (wc);
}

3695b7b453SJohn Marino void
wordinit(void)37*09d4459fSDaniel Fojt wordinit (void)
3895b7b453SJohn Marino {
39*09d4459fSDaniel Fojt   for (int i = 0; i < NCHAR; i++)
40*09d4459fSDaniel Fojt     sbwordchar[i] = wordchar (localeinfo.sbctowc[i]);
41*09d4459fSDaniel Fojt }
4295b7b453SJohn Marino 
43*09d4459fSDaniel Fojt kwset_t
kwsinit(bool mb_trans)44*09d4459fSDaniel Fojt kwsinit (bool mb_trans)
4595b7b453SJohn Marino {
46*09d4459fSDaniel Fojt   char *trans = NULL;
47*09d4459fSDaniel Fojt 
48*09d4459fSDaniel Fojt   if (match_icase && (MB_CUR_MAX == 1 || mb_trans))
49*09d4459fSDaniel Fojt     {
50*09d4459fSDaniel Fojt       trans = xmalloc (NCHAR);
51*09d4459fSDaniel Fojt       if (MB_CUR_MAX == 1)
52*09d4459fSDaniel Fojt         for (int i = 0; i < NCHAR; i++)
53680a9cb8SJohn Marino           trans[i] = toupper (i);
5495b7b453SJohn Marino       else
55*09d4459fSDaniel Fojt         for (int i = 0; i < NCHAR; i++)
5695b7b453SJohn Marino           {
57*09d4459fSDaniel Fojt             wint_t wc = localeinfo.sbctowc[i];
58*09d4459fSDaniel Fojt             wint_t uwc = towupper (wc);
59*09d4459fSDaniel Fojt             if (uwc != wc)
6095b7b453SJohn Marino               {
61680a9cb8SJohn Marino                 mbstate_t mbs = { 0 };
62*09d4459fSDaniel Fojt                 size_t len = wcrtomb (&trans[i], uwc, &mbs);
63*09d4459fSDaniel Fojt                 if (len != 1)
64*09d4459fSDaniel Fojt                   abort ();
65680a9cb8SJohn Marino               }
66*09d4459fSDaniel Fojt             else
67*09d4459fSDaniel Fojt               trans[i] = i;
68*09d4459fSDaniel Fojt           }
69*09d4459fSDaniel Fojt     }
70*09d4459fSDaniel Fojt 
71*09d4459fSDaniel Fojt   return kwsalloc (trans);
72680a9cb8SJohn Marino }
73680a9cb8SJohn Marino 
74680a9cb8SJohn Marino /* In the buffer *MB_START, return the number of bytes needed to go
75680a9cb8SJohn Marino    back from CUR to the previous boundary, where a "boundary" is the
76680a9cb8SJohn Marino    start of a multibyte character or is an error-encoding byte.  The
77680a9cb8SJohn Marino    buffer ends at END (i.e., one past the address of the buffer's last
78*09d4459fSDaniel Fojt    byte).  If CUR is already at a boundary, return 0.  If CUR is no
79*09d4459fSDaniel Fojt    larger than *MB_START, return CUR - *MB_START without modifying
80*09d4459fSDaniel Fojt    *MB_START or *MBCLEN.
81680a9cb8SJohn Marino 
82680a9cb8SJohn Marino    When returning zero, set *MB_START to CUR.  When returning a
83*09d4459fSDaniel Fojt    positive value, set *MB_START to the next boundary after CUR,
84*09d4459fSDaniel Fojt    or to END if there is no such boundary, and set *MBCLEN to the
85*09d4459fSDaniel Fojt    length of the preceding character.  */
86680a9cb8SJohn Marino ptrdiff_t
mb_goback(char const ** mb_start,size_t * mbclen,char const * cur,char const * end)87*09d4459fSDaniel Fojt mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
88*09d4459fSDaniel Fojt            char const *end)
89680a9cb8SJohn Marino {
90680a9cb8SJohn Marino   const char *p = *mb_start;
91680a9cb8SJohn Marino   const char *p0 = p;
92*09d4459fSDaniel Fojt   size_t clen;
9395b7b453SJohn Marino 
94*09d4459fSDaniel Fojt   if (cur <= p)
95*09d4459fSDaniel Fojt     return cur - p;
9695b7b453SJohn Marino 
97*09d4459fSDaniel Fojt   if (localeinfo.using_utf8)
9895b7b453SJohn Marino     {
99*09d4459fSDaniel Fojt       p = cur;
100*09d4459fSDaniel Fojt       clen = 1;
101*09d4459fSDaniel Fojt 
102*09d4459fSDaniel Fojt       if (cur < end && (*cur & 0xc0) == 0x80)
103*09d4459fSDaniel Fojt         for (int i = 1; i <= 3; i++)
104*09d4459fSDaniel Fojt           if ((cur[-i] & 0xc0) != 0x80)
105*09d4459fSDaniel Fojt             {
106*09d4459fSDaniel Fojt               mbstate_t mbs = { 0 };
107*09d4459fSDaniel Fojt               clen = mb_clen (cur - i, end - (cur - i), &mbs);
108*09d4459fSDaniel Fojt               if (i < clen && clen < (size_t) -2)
109*09d4459fSDaniel Fojt                 {
110*09d4459fSDaniel Fojt                   p0 = cur - i;
111*09d4459fSDaniel Fojt                   p = p0 + clen;
112*09d4459fSDaniel Fojt                 }
113*09d4459fSDaniel Fojt               break;
114*09d4459fSDaniel Fojt             }
115*09d4459fSDaniel Fojt     }
116*09d4459fSDaniel Fojt   else
117*09d4459fSDaniel Fojt     {
118*09d4459fSDaniel Fojt       mbstate_t mbs = { 0 };
119*09d4459fSDaniel Fojt       do
120*09d4459fSDaniel Fojt         {
121*09d4459fSDaniel Fojt           clen = mb_clen (p, end - p, &mbs);
122680a9cb8SJohn Marino 
123dc7c36e4SJohn Marino           if ((size_t) -2 <= clen)
124680a9cb8SJohn Marino             {
125dc7c36e4SJohn Marino               /* An invalid sequence, or a truncated multibyte character.
126dc7c36e4SJohn Marino                  Treat it as a single byte character.  */
127dc7c36e4SJohn Marino               clen = 1;
128*09d4459fSDaniel Fojt               memset (&mbs, 0, sizeof mbs);
12995b7b453SJohn Marino             }
130680a9cb8SJohn Marino           p0 = p;
131dc7c36e4SJohn Marino           p += clen;
13295b7b453SJohn Marino         }
133*09d4459fSDaniel Fojt       while (p < cur);
134*09d4459fSDaniel Fojt     }
13595b7b453SJohn Marino 
136680a9cb8SJohn Marino   *mb_start = p;
137*09d4459fSDaniel Fojt   if (mbclen)
138*09d4459fSDaniel Fojt     *mbclen = clen;
139680a9cb8SJohn Marino   return p == cur ? 0 : cur - p0;
14095b7b453SJohn Marino }
141680a9cb8SJohn Marino 
142*09d4459fSDaniel Fojt /* Examine the start of BUF (which goes to END) for word constituents.
143*09d4459fSDaniel Fojt    If COUNTALL, examine as many as possible; otherwise, examine at most one.
144*09d4459fSDaniel Fojt    Return the total number of bytes in the examined characters.  */
145*09d4459fSDaniel Fojt static size_t
wordchars_count(char const * buf,char const * end,bool countall)146*09d4459fSDaniel Fojt wordchars_count (char const *buf, char const *end, bool countall)
147680a9cb8SJohn Marino {
148*09d4459fSDaniel Fojt   size_t n = 0;
149*09d4459fSDaniel Fojt   mbstate_t mbs = { 0 };
150*09d4459fSDaniel Fojt   while (n < end - buf)
151*09d4459fSDaniel Fojt     {
152*09d4459fSDaniel Fojt       unsigned char b = buf[n];
153*09d4459fSDaniel Fojt       if (sbwordchar[b])
154*09d4459fSDaniel Fojt         n++;
155*09d4459fSDaniel Fojt       else if (localeinfo.sbclen[b] != -2)
156*09d4459fSDaniel Fojt         break;
157*09d4459fSDaniel Fojt       else
158*09d4459fSDaniel Fojt         {
159*09d4459fSDaniel Fojt           wchar_t wc = 0;
160*09d4459fSDaniel Fojt           size_t wcbytes = mbrtowc (&wc, buf + n, end - buf - n, &mbs);
161*09d4459fSDaniel Fojt           if (!wordchar (wc))
162*09d4459fSDaniel Fojt             break;
163*09d4459fSDaniel Fojt           n += wcbytes + !wcbytes;
164*09d4459fSDaniel Fojt         }
165*09d4459fSDaniel Fojt       if (!countall)
166*09d4459fSDaniel Fojt         break;
167*09d4459fSDaniel Fojt     }
168*09d4459fSDaniel Fojt   return n;
169680a9cb8SJohn Marino }
170680a9cb8SJohn Marino 
/* Examine the start of BUF for the longest prefix containing just
   word constituents.  Return the total number of bytes in the prefix,
   which is zero if BUF does not start with a word constituent.
   The buffer ends at END.  */
size_t
wordchars_size (char const *buf, char const *end)
{
  return wordchars_count (buf, end, true);
}

/* If BUF starts with a word constituent, return the number of bytes
   used to represent it; otherwise, return zero.  The buffer ends at END.  */
size_t
wordchar_next (char const *buf, char const *end)
{
  return wordchars_count (buf, end, false);
}

188*09d4459fSDaniel Fojt /* In the buffer BUF, return nonzero if the character whose encoding
189*09d4459fSDaniel Fojt    contains the byte before CUR is a word constituent.  The buffer
190*09d4459fSDaniel Fojt    ends at END.  */
191*09d4459fSDaniel Fojt size_t
wordchar_prev(char const * buf,char const * cur,char const * end)192*09d4459fSDaniel Fojt wordchar_prev (char const *buf, char const *cur, char const *end)
193*09d4459fSDaniel Fojt {
194*09d4459fSDaniel Fojt   if (buf == cur)
195*09d4459fSDaniel Fojt     return 0;
196*09d4459fSDaniel Fojt   unsigned char b = *--cur;
197*09d4459fSDaniel Fojt   if (! localeinfo.multibyte
198*09d4459fSDaniel Fojt       || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
199*09d4459fSDaniel Fojt     return sbwordchar[b];
200*09d4459fSDaniel Fojt   char const *p = buf;
201*09d4459fSDaniel Fojt   cur -= mb_goback (&p, NULL, cur, end);
202*09d4459fSDaniel Fojt   return wordchar_next (cur, end);
203680a9cb8SJohn Marino }
204