xref: /dflybsd-src/contrib/grep/lib/mbiter.h (revision 91b9ed38d3db6a8a8ac5b66da1d43e6e331e259a)
195b7b453SJohn Marino /* Iterating through multibyte strings: macros for multi-byte encodings.
2*09d4459fSDaniel Fojt    Copyright (C) 2001, 2005, 2007, 2009-2020 Free Software Foundation, Inc.
395b7b453SJohn Marino 
495b7b453SJohn Marino    This program is free software: you can redistribute it and/or modify
595b7b453SJohn Marino    it under the terms of the GNU General Public License as published by
695b7b453SJohn Marino    the Free Software Foundation; either version 3 of the License, or
795b7b453SJohn Marino    (at your option) any later version.
895b7b453SJohn Marino 
995b7b453SJohn Marino    This program is distributed in the hope that it will be useful,
1095b7b453SJohn Marino    but WITHOUT ANY WARRANTY; without even the implied warranty of
1195b7b453SJohn Marino    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1295b7b453SJohn Marino    GNU General Public License for more details.
1395b7b453SJohn Marino 
1495b7b453SJohn Marino    You should have received a copy of the GNU General Public License
15*09d4459fSDaniel Fojt    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
1695b7b453SJohn Marino 
1795b7b453SJohn Marino /* Written by Bruno Haible <bruno@clisp.org>.  */
1895b7b453SJohn Marino 
1995b7b453SJohn Marino /* The macros in this file implement forward iteration through a
2095b7b453SJohn Marino    multi-byte string.
2195b7b453SJohn Marino 
2295b7b453SJohn Marino    With these macros, an iteration loop that looks like
2395b7b453SJohn Marino 
2495b7b453SJohn Marino       char *iter;
2595b7b453SJohn Marino       for (iter = buf; iter < buf + buflen; iter++)
2695b7b453SJohn Marino         {
2795b7b453SJohn Marino           do_something (*iter);
2895b7b453SJohn Marino         }
2995b7b453SJohn Marino 
3095b7b453SJohn Marino    becomes
3195b7b453SJohn Marino 
3295b7b453SJohn Marino       mbi_iterator_t iter;
3395b7b453SJohn Marino       for (mbi_init (iter, buf, buflen); mbi_avail (iter); mbi_advance (iter))
3495b7b453SJohn Marino         {
3595b7b453SJohn Marino           do_something (mbi_cur_ptr (iter), mb_len (mbi_cur (iter)));
3695b7b453SJohn Marino         }
3795b7b453SJohn Marino 
3895b7b453SJohn Marino    The benefit of these macros over plain use of mbrtowc is:
3995b7b453SJohn Marino    - Handling of invalid multibyte sequences is possible without
4095b7b453SJohn Marino      making the code more complicated, while still preserving the
4195b7b453SJohn Marino      invalid multibyte sequences.
4295b7b453SJohn Marino 
4395b7b453SJohn Marino    mbi_iterator_t
4495b7b453SJohn Marino      is a type usable for variable declarations.
4595b7b453SJohn Marino 
4695b7b453SJohn Marino    mbi_init (iter, startptr, length)
4795b7b453SJohn Marino      initializes the iterator, starting at startptr and crossing length bytes.
4895b7b453SJohn Marino 
4995b7b453SJohn Marino    mbi_avail (iter)
50cf28ed85SJohn Marino      returns true if there are more multibyte characters available before
5195b7b453SJohn Marino      the end of string is reached. In this case, mbi_cur (iter) is
52cf28ed85SJohn Marino      initialized to the next multibyte character.
5395b7b453SJohn Marino 
5495b7b453SJohn Marino    mbi_advance (iter)
5595b7b453SJohn Marino      advances the iterator by one multibyte character.
5695b7b453SJohn Marino 
5795b7b453SJohn Marino    mbi_cur (iter)
5895b7b453SJohn Marino      returns the current multibyte character, of type mbchar_t.  All the
5995b7b453SJohn Marino      macros defined in mbchar.h can be used on it.
6095b7b453SJohn Marino 
6195b7b453SJohn Marino    mbi_cur_ptr (iter)
6295b7b453SJohn Marino      returns a pointer to the beginning of the current multibyte character.
6395b7b453SJohn Marino 
6495b7b453SJohn Marino    mbi_reloc (iter, ptrdiff)
6595b7b453SJohn Marino      relocates iterator when the string is moved by ptrdiff bytes.
6695b7b453SJohn Marino 
6795b7b453SJohn Marino    mbi_copy (&destiter, &srciter)
6895b7b453SJohn Marino      copies srciter to destiter.
6995b7b453SJohn Marino 
7095b7b453SJohn Marino    Here are the function prototypes of the macros.
7195b7b453SJohn Marino 
7295b7b453SJohn Marino    extern void          mbi_init (mbi_iterator_t iter,
7395b7b453SJohn Marino                                   const char *startptr, size_t length);
7495b7b453SJohn Marino    extern bool          mbi_avail (mbi_iterator_t iter);
7595b7b453SJohn Marino    extern void          mbi_advance (mbi_iterator_t iter);
7695b7b453SJohn Marino    extern mbchar_t      mbi_cur (mbi_iterator_t iter);
7795b7b453SJohn Marino    extern const char *  mbi_cur_ptr (mbi_iterator_t iter);
7895b7b453SJohn Marino    extern void          mbi_reloc (mbi_iterator_t iter, ptrdiff_t ptrdiff);
7995b7b453SJohn Marino    extern void          mbi_copy (mbi_iterator_t *new, const mbi_iterator_t *old);
8095b7b453SJohn Marino  */
8195b7b453SJohn Marino 
8295b7b453SJohn Marino #ifndef _MBITER_H
8395b7b453SJohn Marino #define _MBITER_H 1
8495b7b453SJohn Marino 
8595b7b453SJohn Marino #include <assert.h>
8695b7b453SJohn Marino #include <stdbool.h>
8795b7b453SJohn Marino #include <stddef.h>
8895b7b453SJohn Marino #include <string.h>
8995b7b453SJohn Marino 
9095b7b453SJohn Marino /* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
9195b7b453SJohn Marino    <wchar.h>.
9295b7b453SJohn Marino    BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
9395b7b453SJohn Marino    <wchar.h>.  */
9495b7b453SJohn Marino #include <stdio.h>
9595b7b453SJohn Marino #include <time.h>
9695b7b453SJohn Marino #include <wchar.h>
9795b7b453SJohn Marino 
9895b7b453SJohn Marino #include "mbchar.h"
9995b7b453SJohn Marino 
100680a9cb8SJohn Marino #ifndef _GL_INLINE_HEADER_BEGIN
101680a9cb8SJohn Marino  #error "Please include config.h first."
102680a9cb8SJohn Marino #endif
103680a9cb8SJohn Marino _GL_INLINE_HEADER_BEGIN
104680a9cb8SJohn Marino #ifndef MBITER_INLINE
105680a9cb8SJohn Marino # define MBITER_INLINE _GL_INLINE
106680a9cb8SJohn Marino #endif
107680a9cb8SJohn Marino 
10895b7b453SJohn Marino struct mbiter_multi
10995b7b453SJohn Marino {
11095b7b453SJohn Marino   const char *limit;    /* pointer to end of string */
11195b7b453SJohn Marino   bool in_shift;        /* true if next byte may not be interpreted as ASCII */
11295b7b453SJohn Marino   mbstate_t state;      /* if in_shift: current shift state */
11395b7b453SJohn Marino   bool next_done;       /* true if mbi_avail has already filled the following */
11495b7b453SJohn Marino   struct mbchar cur;    /* the current character:
11595b7b453SJohn Marino         const char *cur.ptr             pointer to current character
11695b7b453SJohn Marino         The following are only valid after mbi_avail.
11795b7b453SJohn Marino         size_t cur.bytes                number of bytes of current character
11895b7b453SJohn Marino         bool cur.wc_valid               true if wc is a valid wide character
11995b7b453SJohn Marino         wchar_t cur.wc                  if wc_valid: the current character
12095b7b453SJohn Marino         */
12195b7b453SJohn Marino };
12295b7b453SJohn Marino 
123680a9cb8SJohn Marino MBITER_INLINE void
mbiter_multi_next(struct mbiter_multi * iter)12495b7b453SJohn Marino mbiter_multi_next (struct mbiter_multi *iter)
12595b7b453SJohn Marino {
12695b7b453SJohn Marino   if (iter->next_done)
12795b7b453SJohn Marino     return;
12895b7b453SJohn Marino   if (iter->in_shift)
12995b7b453SJohn Marino     goto with_shift;
13095b7b453SJohn Marino   /* Handle most ASCII characters quickly, without calling mbrtowc().  */
13195b7b453SJohn Marino   if (is_basic (*iter->cur.ptr))
13295b7b453SJohn Marino     {
13395b7b453SJohn Marino       /* These characters are part of the basic character set.  ISO C 99
13495b7b453SJohn Marino          guarantees that their wide character code is identical to their
13595b7b453SJohn Marino          char code.  */
13695b7b453SJohn Marino       iter->cur.bytes = 1;
13795b7b453SJohn Marino       iter->cur.wc = *iter->cur.ptr;
13895b7b453SJohn Marino       iter->cur.wc_valid = true;
13995b7b453SJohn Marino     }
14095b7b453SJohn Marino   else
14195b7b453SJohn Marino     {
14295b7b453SJohn Marino       assert (mbsinit (&iter->state));
14395b7b453SJohn Marino       iter->in_shift = true;
14495b7b453SJohn Marino     with_shift:
14595b7b453SJohn Marino       iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
14695b7b453SJohn Marino                                  iter->limit - iter->cur.ptr, &iter->state);
14795b7b453SJohn Marino       if (iter->cur.bytes == (size_t) -1)
14895b7b453SJohn Marino         {
14995b7b453SJohn Marino           /* An invalid multibyte sequence was encountered.  */
15095b7b453SJohn Marino           iter->cur.bytes = 1;
15195b7b453SJohn Marino           iter->cur.wc_valid = false;
15295b7b453SJohn Marino           /* Whether to set iter->in_shift = false and reset iter->state
15395b7b453SJohn Marino              or not is not very important; the string is bogus anyway.  */
15495b7b453SJohn Marino         }
15595b7b453SJohn Marino       else if (iter->cur.bytes == (size_t) -2)
15695b7b453SJohn Marino         {
15795b7b453SJohn Marino           /* An incomplete multibyte character at the end.  */
15895b7b453SJohn Marino           iter->cur.bytes = iter->limit - iter->cur.ptr;
15995b7b453SJohn Marino           iter->cur.wc_valid = false;
16095b7b453SJohn Marino           /* Whether to set iter->in_shift = false and reset iter->state
16195b7b453SJohn Marino              or not is not important; the string end is reached anyway.  */
16295b7b453SJohn Marino         }
16395b7b453SJohn Marino       else
16495b7b453SJohn Marino         {
16595b7b453SJohn Marino           if (iter->cur.bytes == 0)
16695b7b453SJohn Marino             {
16795b7b453SJohn Marino               /* A null wide character was encountered.  */
16895b7b453SJohn Marino               iter->cur.bytes = 1;
16995b7b453SJohn Marino               assert (*iter->cur.ptr == '\0');
17095b7b453SJohn Marino               assert (iter->cur.wc == 0);
17195b7b453SJohn Marino             }
17295b7b453SJohn Marino           iter->cur.wc_valid = true;
17395b7b453SJohn Marino 
17495b7b453SJohn Marino           /* When in the initial state, we can go back treating ASCII
17595b7b453SJohn Marino              characters more quickly.  */
17695b7b453SJohn Marino           if (mbsinit (&iter->state))
17795b7b453SJohn Marino             iter->in_shift = false;
17895b7b453SJohn Marino         }
17995b7b453SJohn Marino     }
18095b7b453SJohn Marino   iter->next_done = true;
18195b7b453SJohn Marino }
18295b7b453SJohn Marino 
183680a9cb8SJohn Marino MBITER_INLINE void
mbiter_multi_reloc(struct mbiter_multi * iter,ptrdiff_t ptrdiff)18495b7b453SJohn Marino mbiter_multi_reloc (struct mbiter_multi *iter, ptrdiff_t ptrdiff)
18595b7b453SJohn Marino {
18695b7b453SJohn Marino   iter->cur.ptr += ptrdiff;
18795b7b453SJohn Marino   iter->limit += ptrdiff;
18895b7b453SJohn Marino }
18995b7b453SJohn Marino 
190680a9cb8SJohn Marino MBITER_INLINE void
mbiter_multi_copy(struct mbiter_multi * new_iter,const struct mbiter_multi * old_iter)19195b7b453SJohn Marino mbiter_multi_copy (struct mbiter_multi *new_iter, const struct mbiter_multi *old_iter)
19295b7b453SJohn Marino {
19395b7b453SJohn Marino   new_iter->limit = old_iter->limit;
19495b7b453SJohn Marino   if ((new_iter->in_shift = old_iter->in_shift))
19595b7b453SJohn Marino     memcpy (&new_iter->state, &old_iter->state, sizeof (mbstate_t));
19695b7b453SJohn Marino   else
19795b7b453SJohn Marino     memset (&new_iter->state, 0, sizeof (mbstate_t));
19895b7b453SJohn Marino   new_iter->next_done = old_iter->next_done;
19995b7b453SJohn Marino   mb_copy (&new_iter->cur, &old_iter->cur);
20095b7b453SJohn Marino }
20195b7b453SJohn Marino 
/* Iteration macros.
   Except for mbi_copy, which takes pointers, each macro takes the
   iterator object itself (an lvalue of type mbi_iterator_t) and may
   evaluate it more than once, so the argument must be free of side
   effects.  */
typedef struct mbiter_multi mbi_iterator_t;

/* Start iterating at STARTPTR, over LENGTH bytes, in the initial shift
   state and with no character decoded yet.  */
#define mbi_init(iter, startptr, length) \
  ((iter).cur.ptr = (startptr), (iter).limit = (iter).cur.ptr + (length), \
   (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
   (iter).next_done = false)

/* True if the end of the string has not been reached.  In that case the
   current character has been decoded by mbiter_multi_next.  */
#define mbi_avail(iter) \
  ((iter).cur.ptr < (iter).limit && (mbiter_multi_next (&(iter)), true))

/* Step past the current character.  Uses cur.bytes, which is only valid
   after mbi_avail.  */
#define mbi_advance(iter) \
  ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)

/* Access to the current character.  */
#define mbi_cur(iter) (iter).cur
#define mbi_cur_ptr(iter) (iter).cur.ptr

/* Relocation.  The arguments are parenthesized so that any expression
   may be passed, as with the other macros.  */
#define mbi_reloc(iter, ptrdiff) mbiter_multi_reloc (&(iter), (ptrdiff))

/* Copying an iterator.  */
#define mbi_copy mbiter_multi_copy
22295b7b453SJohn Marino 
223680a9cb8SJohn Marino _GL_INLINE_HEADER_END
224680a9cb8SJohn Marino 
22595b7b453SJohn Marino #endif /* _MBITER_H */
226