195b7b453SJohn Marino /* Iterating through multibyte strings: macros for multi-byte encodings.
2*09d4459fSDaniel Fojt Copyright (C) 2001, 2005, 2007, 2009-2020 Free Software Foundation, Inc.
395b7b453SJohn Marino
495b7b453SJohn Marino This program is free software: you can redistribute it and/or modify
595b7b453SJohn Marino it under the terms of the GNU General Public License as published by
695b7b453SJohn Marino the Free Software Foundation; either version 3 of the License, or
795b7b453SJohn Marino (at your option) any later version.
895b7b453SJohn Marino
995b7b453SJohn Marino This program is distributed in the hope that it will be useful,
1095b7b453SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
1195b7b453SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1295b7b453SJohn Marino GNU General Public License for more details.
1395b7b453SJohn Marino
1495b7b453SJohn Marino You should have received a copy of the GNU General Public License
15*09d4459fSDaniel Fojt along with this program. If not, see <https://www.gnu.org/licenses/>. */
1695b7b453SJohn Marino
1795b7b453SJohn Marino /* Written by Bruno Haible <bruno@clisp.org>. */
1895b7b453SJohn Marino
1995b7b453SJohn Marino /* The macros in this file implement forward iteration through a
2095b7b453SJohn Marino multi-byte string.
2195b7b453SJohn Marino
2295b7b453SJohn Marino With these macros, an iteration loop that looks like
2395b7b453SJohn Marino
2495b7b453SJohn Marino char *iter;
2595b7b453SJohn Marino for (iter = buf; iter < buf + buflen; iter++)
2695b7b453SJohn Marino {
2795b7b453SJohn Marino do_something (*iter);
2895b7b453SJohn Marino }
2995b7b453SJohn Marino
3095b7b453SJohn Marino becomes
3195b7b453SJohn Marino
3295b7b453SJohn Marino mbi_iterator_t iter;
3395b7b453SJohn Marino for (mbi_init (iter, buf, buflen); mbi_avail (iter); mbi_advance (iter))
3495b7b453SJohn Marino {
3595b7b453SJohn Marino do_something (mbi_cur_ptr (iter), mb_len (mbi_cur (iter)));
3695b7b453SJohn Marino }
3795b7b453SJohn Marino
3895b7b453SJohn Marino The benefit of these macros over plain use of mbrtowc is:
3995b7b453SJohn Marino - Handling of invalid multibyte sequences is possible without
4095b7b453SJohn Marino making the code more complicated, while still preserving the
4195b7b453SJohn Marino invalid multibyte sequences.
4295b7b453SJohn Marino
4395b7b453SJohn Marino mbi_iterator_t
4495b7b453SJohn Marino is a type usable for variable declarations.
4595b7b453SJohn Marino
4695b7b453SJohn Marino mbi_init (iter, startptr, length)
4795b7b453SJohn Marino initializes the iterator, starting at startptr and crossing length bytes.
4895b7b453SJohn Marino
4995b7b453SJohn Marino mbi_avail (iter)
50cf28ed85SJohn Marino returns true if there are more multibyte characters available before
5195b7b453SJohn Marino the end of string is reached. In this case, mbi_cur (iter) is
52cf28ed85SJohn Marino initialized to the next multibyte character.
5395b7b453SJohn Marino
5495b7b453SJohn Marino mbi_advance (iter)
5595b7b453SJohn Marino advances the iterator by one multibyte character.
5695b7b453SJohn Marino
5795b7b453SJohn Marino mbi_cur (iter)
5895b7b453SJohn Marino returns the current multibyte character, of type mbchar_t. All the
5995b7b453SJohn Marino macros defined in mbchar.h can be used on it.
6095b7b453SJohn Marino
   mbi_cur_ptr (iter)
     returns a pointer to the beginning of the current multibyte character.
6395b7b453SJohn Marino
6495b7b453SJohn Marino mbi_reloc (iter, ptrdiff)
6595b7b453SJohn Marino relocates iterator when the string is moved by ptrdiff bytes.
6695b7b453SJohn Marino
6795b7b453SJohn Marino mbi_copy (&destiter, &srciter)
6895b7b453SJohn Marino copies srciter to destiter.
6995b7b453SJohn Marino
7095b7b453SJohn Marino Here are the function prototypes of the macros.
7195b7b453SJohn Marino
7295b7b453SJohn Marino extern void mbi_init (mbi_iterator_t iter,
7395b7b453SJohn Marino const char *startptr, size_t length);
7495b7b453SJohn Marino extern bool mbi_avail (mbi_iterator_t iter);
7595b7b453SJohn Marino extern void mbi_advance (mbi_iterator_t iter);
7695b7b453SJohn Marino extern mbchar_t mbi_cur (mbi_iterator_t iter);
7795b7b453SJohn Marino extern const char * mbi_cur_ptr (mbi_iterator_t iter);
7895b7b453SJohn Marino extern void mbi_reloc (mbi_iterator_t iter, ptrdiff_t ptrdiff);
7995b7b453SJohn Marino extern void mbi_copy (mbi_iterator_t *new, const mbi_iterator_t *old);
8095b7b453SJohn Marino */
8195b7b453SJohn Marino
8295b7b453SJohn Marino #ifndef _MBITER_H
8395b7b453SJohn Marino #define _MBITER_H 1
8495b7b453SJohn Marino
8595b7b453SJohn Marino #include <assert.h>
8695b7b453SJohn Marino #include <stdbool.h>
8795b7b453SJohn Marino #include <stddef.h>
8895b7b453SJohn Marino #include <string.h>
8995b7b453SJohn Marino
9095b7b453SJohn Marino /* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
9195b7b453SJohn Marino <wchar.h>.
9295b7b453SJohn Marino BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
9395b7b453SJohn Marino <wchar.h>. */
9495b7b453SJohn Marino #include <stdio.h>
9595b7b453SJohn Marino #include <time.h>
9695b7b453SJohn Marino #include <wchar.h>
9795b7b453SJohn Marino
9895b7b453SJohn Marino #include "mbchar.h"
9995b7b453SJohn Marino
100680a9cb8SJohn Marino #ifndef _GL_INLINE_HEADER_BEGIN
101680a9cb8SJohn Marino #error "Please include config.h first."
102680a9cb8SJohn Marino #endif
103680a9cb8SJohn Marino _GL_INLINE_HEADER_BEGIN
104680a9cb8SJohn Marino #ifndef MBITER_INLINE
105680a9cb8SJohn Marino # define MBITER_INLINE _GL_INLINE
106680a9cb8SJohn Marino #endif
107680a9cb8SJohn Marino
10895b7b453SJohn Marino struct mbiter_multi
10995b7b453SJohn Marino {
11095b7b453SJohn Marino const char *limit; /* pointer to end of string */
11195b7b453SJohn Marino bool in_shift; /* true if next byte may not be interpreted as ASCII */
11295b7b453SJohn Marino mbstate_t state; /* if in_shift: current shift state */
11395b7b453SJohn Marino bool next_done; /* true if mbi_avail has already filled the following */
11495b7b453SJohn Marino struct mbchar cur; /* the current character:
11595b7b453SJohn Marino const char *cur.ptr pointer to current character
11695b7b453SJohn Marino The following are only valid after mbi_avail.
11795b7b453SJohn Marino size_t cur.bytes number of bytes of current character
11895b7b453SJohn Marino bool cur.wc_valid true if wc is a valid wide character
11995b7b453SJohn Marino wchar_t cur.wc if wc_valid: the current character
12095b7b453SJohn Marino */
12195b7b453SJohn Marino };
12295b7b453SJohn Marino
123680a9cb8SJohn Marino MBITER_INLINE void
mbiter_multi_next(struct mbiter_multi * iter)12495b7b453SJohn Marino mbiter_multi_next (struct mbiter_multi *iter)
12595b7b453SJohn Marino {
12695b7b453SJohn Marino if (iter->next_done)
12795b7b453SJohn Marino return;
12895b7b453SJohn Marino if (iter->in_shift)
12995b7b453SJohn Marino goto with_shift;
13095b7b453SJohn Marino /* Handle most ASCII characters quickly, without calling mbrtowc(). */
13195b7b453SJohn Marino if (is_basic (*iter->cur.ptr))
13295b7b453SJohn Marino {
13395b7b453SJohn Marino /* These characters are part of the basic character set. ISO C 99
13495b7b453SJohn Marino guarantees that their wide character code is identical to their
13595b7b453SJohn Marino char code. */
13695b7b453SJohn Marino iter->cur.bytes = 1;
13795b7b453SJohn Marino iter->cur.wc = *iter->cur.ptr;
13895b7b453SJohn Marino iter->cur.wc_valid = true;
13995b7b453SJohn Marino }
14095b7b453SJohn Marino else
14195b7b453SJohn Marino {
14295b7b453SJohn Marino assert (mbsinit (&iter->state));
14395b7b453SJohn Marino iter->in_shift = true;
14495b7b453SJohn Marino with_shift:
14595b7b453SJohn Marino iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
14695b7b453SJohn Marino iter->limit - iter->cur.ptr, &iter->state);
14795b7b453SJohn Marino if (iter->cur.bytes == (size_t) -1)
14895b7b453SJohn Marino {
14995b7b453SJohn Marino /* An invalid multibyte sequence was encountered. */
15095b7b453SJohn Marino iter->cur.bytes = 1;
15195b7b453SJohn Marino iter->cur.wc_valid = false;
15295b7b453SJohn Marino /* Whether to set iter->in_shift = false and reset iter->state
15395b7b453SJohn Marino or not is not very important; the string is bogus anyway. */
15495b7b453SJohn Marino }
15595b7b453SJohn Marino else if (iter->cur.bytes == (size_t) -2)
15695b7b453SJohn Marino {
15795b7b453SJohn Marino /* An incomplete multibyte character at the end. */
15895b7b453SJohn Marino iter->cur.bytes = iter->limit - iter->cur.ptr;
15995b7b453SJohn Marino iter->cur.wc_valid = false;
16095b7b453SJohn Marino /* Whether to set iter->in_shift = false and reset iter->state
16195b7b453SJohn Marino or not is not important; the string end is reached anyway. */
16295b7b453SJohn Marino }
16395b7b453SJohn Marino else
16495b7b453SJohn Marino {
16595b7b453SJohn Marino if (iter->cur.bytes == 0)
16695b7b453SJohn Marino {
16795b7b453SJohn Marino /* A null wide character was encountered. */
16895b7b453SJohn Marino iter->cur.bytes = 1;
16995b7b453SJohn Marino assert (*iter->cur.ptr == '\0');
17095b7b453SJohn Marino assert (iter->cur.wc == 0);
17195b7b453SJohn Marino }
17295b7b453SJohn Marino iter->cur.wc_valid = true;
17395b7b453SJohn Marino
17495b7b453SJohn Marino /* When in the initial state, we can go back treating ASCII
17595b7b453SJohn Marino characters more quickly. */
17695b7b453SJohn Marino if (mbsinit (&iter->state))
17795b7b453SJohn Marino iter->in_shift = false;
17895b7b453SJohn Marino }
17995b7b453SJohn Marino }
18095b7b453SJohn Marino iter->next_done = true;
18195b7b453SJohn Marino }
18295b7b453SJohn Marino
183680a9cb8SJohn Marino MBITER_INLINE void
mbiter_multi_reloc(struct mbiter_multi * iter,ptrdiff_t ptrdiff)18495b7b453SJohn Marino mbiter_multi_reloc (struct mbiter_multi *iter, ptrdiff_t ptrdiff)
18595b7b453SJohn Marino {
18695b7b453SJohn Marino iter->cur.ptr += ptrdiff;
18795b7b453SJohn Marino iter->limit += ptrdiff;
18895b7b453SJohn Marino }
18995b7b453SJohn Marino
190680a9cb8SJohn Marino MBITER_INLINE void
mbiter_multi_copy(struct mbiter_multi * new_iter,const struct mbiter_multi * old_iter)19195b7b453SJohn Marino mbiter_multi_copy (struct mbiter_multi *new_iter, const struct mbiter_multi *old_iter)
19295b7b453SJohn Marino {
19395b7b453SJohn Marino new_iter->limit = old_iter->limit;
19495b7b453SJohn Marino if ((new_iter->in_shift = old_iter->in_shift))
19595b7b453SJohn Marino memcpy (&new_iter->state, &old_iter->state, sizeof (mbstate_t));
19695b7b453SJohn Marino else
19795b7b453SJohn Marino memset (&new_iter->state, 0, sizeof (mbstate_t));
19895b7b453SJohn Marino new_iter->next_done = old_iter->next_done;
19995b7b453SJohn Marino mb_copy (&new_iter->cur, &old_iter->cur);
20095b7b453SJohn Marino }
20195b7b453SJohn Marino
/* Iteration macros.
   ITER is always an lvalue of type mbi_iterator_t (not a pointer to one),
   except for mbi_copy, which takes pointers.  Several macros evaluate
   ITER more than once, so it must not have side effects.  */
typedef struct mbiter_multi mbi_iterator_t;

/* Start iterating at STARTPTR over LENGTH bytes, in the initial shift
   state and with no character decoded yet.  */
#define mbi_init(iter, startptr, length) \
  ((iter).cur.ptr = (startptr), (iter).limit = (iter).cur.ptr + (length), \
   (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
   (iter).next_done = false)

/* True if the end of the string has not been reached.  In that case the
   current character has been decoded and mbi_cur (iter) is valid.  */
#define mbi_avail(iter) \
  ((iter).cur.ptr < (iter).limit && (mbiter_multi_next (&(iter)), true))

/* Step over the current character.  This uses cur.bytes, which is only
   valid after mbi_avail.  */
#define mbi_advance(iter) \
  ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)

/* Access to the current character.  */
#define mbi_cur(iter) (iter).cur
#define mbi_cur_ptr(iter) (iter).cur.ptr

/* Relocation.  The arguments are parenthesized like in the other macros
   so that any expression can be passed safely.  */
#define mbi_reloc(iter, ptrdiff) mbiter_multi_reloc (&(iter), (ptrdiff))

/* Copying an iterator.  */
#define mbi_copy mbiter_multi_copy
22295b7b453SJohn Marino
223680a9cb8SJohn Marino _GL_INLINE_HEADER_END
224680a9cb8SJohn Marino
22595b7b453SJohn Marino #endif /* _MBITER_H */
226