186d7f5d3SJohn Marino /* Iterating through multibyte strings: macros for multi-byte encodings.
286d7f5d3SJohn Marino Copyright (C) 2001, 2005 Free Software Foundation, Inc.
386d7f5d3SJohn Marino
486d7f5d3SJohn Marino This program is free software; you can redistribute it and/or modify
586d7f5d3SJohn Marino it under the terms of the GNU General Public License as published by
686d7f5d3SJohn Marino the Free Software Foundation; either version 2, or (at your option)
786d7f5d3SJohn Marino any later version.
886d7f5d3SJohn Marino
986d7f5d3SJohn Marino This program is distributed in the hope that it will be useful,
1086d7f5d3SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
1186d7f5d3SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1286d7f5d3SJohn Marino GNU General Public License for more details.
1386d7f5d3SJohn Marino
1486d7f5d3SJohn Marino You should have received a copy of the GNU General Public License
1586d7f5d3SJohn Marino along with this program; if not, write to the Free Software Foundation,
1686d7f5d3SJohn Marino Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
1786d7f5d3SJohn Marino
1886d7f5d3SJohn Marino /* Written by Bruno Haible <bruno@clisp.org>. */
1986d7f5d3SJohn Marino
2086d7f5d3SJohn Marino /* The macros in this file implement forward iteration through a
2186d7f5d3SJohn Marino multi-byte string, without knowing its length a-priori.
2286d7f5d3SJohn Marino
2386d7f5d3SJohn Marino With these macros, an iteration loop that looks like
2486d7f5d3SJohn Marino
2586d7f5d3SJohn Marino char *iter;
2686d7f5d3SJohn Marino for (iter = buf; *iter != '\0'; iter++)
2786d7f5d3SJohn Marino {
2886d7f5d3SJohn Marino do_something (*iter);
2986d7f5d3SJohn Marino }
3086d7f5d3SJohn Marino
3186d7f5d3SJohn Marino becomes
3286d7f5d3SJohn Marino
3386d7f5d3SJohn Marino mbui_iterator_t iter;
3486d7f5d3SJohn Marino for (mbui_init (iter, buf); mbui_avail (iter); mbui_advance (iter))
3586d7f5d3SJohn Marino {
3686d7f5d3SJohn Marino do_something (mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
3786d7f5d3SJohn Marino }
3886d7f5d3SJohn Marino
3986d7f5d3SJohn Marino The benefit of these macros over plain use of mbrtowc is:
4086d7f5d3SJohn Marino - Handling of invalid multibyte sequences is possible without
4186d7f5d3SJohn Marino making the code more complicated, while still preserving the
4286d7f5d3SJohn Marino invalid multibyte sequences.
4386d7f5d3SJohn Marino
4486d7f5d3SJohn Marino Compared to mbiter.h, the macros here don't need to know the string's
4586d7f5d3SJohn Marino length a-priori. The downside is that at each step, the look-ahead
4686d7f5d3SJohn Marino that guards against overrunning the terminating '\0' is more expensive.
4786d7f5d3SJohn Marino The mbui_* macros are therefore suitable when there is a high probability
4886d7f5d3SJohn Marino that only the first few multibyte characters need to be inspected.
4986d7f5d3SJohn Marino Whereas the mbi_* macros are better if usually the iteration runs
5086d7f5d3SJohn Marino through the entire string.
5186d7f5d3SJohn Marino
5286d7f5d3SJohn Marino mbui_iterator_t
5386d7f5d3SJohn Marino is a type usable for variable declarations.
5486d7f5d3SJohn Marino
5586d7f5d3SJohn Marino mbui_init (iter, startptr)
5686d7f5d3SJohn Marino initializes the iterator, starting at startptr.
5786d7f5d3SJohn Marino
   mbui_avail (iter)
     returns true if there are more multibyte characters available before
     the end of string is reached. In this case, mbui_cur (iter) is
     initialized to the next multibyte character.
6286d7f5d3SJohn Marino
6386d7f5d3SJohn Marino mbui_advance (iter)
6486d7f5d3SJohn Marino advances the iterator by one multibyte character.
6586d7f5d3SJohn Marino
6686d7f5d3SJohn Marino mbui_cur (iter)
6786d7f5d3SJohn Marino returns the current multibyte character, of type mbchar_t. All the
6886d7f5d3SJohn Marino macros defined in mbchar.h can be used on it.
6986d7f5d3SJohn Marino
   mbui_cur_ptr (iter)
     returns a pointer to the beginning of the current multibyte character.
7286d7f5d3SJohn Marino
7386d7f5d3SJohn Marino mbui_reloc (iter, ptrdiff)
7486d7f5d3SJohn Marino relocates iterator when the string is moved by ptrdiff bytes.
7586d7f5d3SJohn Marino
7686d7f5d3SJohn Marino Here are the function prototypes of the macros.
7786d7f5d3SJohn Marino
7886d7f5d3SJohn Marino extern void mbui_init (mbui_iterator_t iter, const char *startptr);
7986d7f5d3SJohn Marino extern bool mbui_avail (mbui_iterator_t iter);
8086d7f5d3SJohn Marino extern void mbui_advance (mbui_iterator_t iter);
8186d7f5d3SJohn Marino extern mbchar_t mbui_cur (mbui_iterator_t iter);
8286d7f5d3SJohn Marino extern const char * mbui_cur_ptr (mbui_iterator_t iter);
8386d7f5d3SJohn Marino extern void mbui_reloc (mbui_iterator_t iter, ptrdiff_t ptrdiff);
8486d7f5d3SJohn Marino */
8586d7f5d3SJohn Marino
8686d7f5d3SJohn Marino #ifndef _MBUITER_H
8786d7f5d3SJohn Marino #define _MBUITER_H 1
8886d7f5d3SJohn Marino
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
9286d7f5d3SJohn Marino
9386d7f5d3SJohn Marino /* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
9486d7f5d3SJohn Marino <wchar.h>.
9586d7f5d3SJohn Marino BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
9686d7f5d3SJohn Marino <wchar.h>. */
9786d7f5d3SJohn Marino #include <stdio.h>
9886d7f5d3SJohn Marino #include <time.h>
9986d7f5d3SJohn Marino #include <wchar.h>
10086d7f5d3SJohn Marino
10186d7f5d3SJohn Marino #include "mbchar.h"
10286d7f5d3SJohn Marino #include "strnlen1.h"
10386d7f5d3SJohn Marino
10486d7f5d3SJohn Marino struct mbuiter_multi
10586d7f5d3SJohn Marino {
10686d7f5d3SJohn Marino bool in_shift; /* true if next byte may not be interpreted as ASCII */
10786d7f5d3SJohn Marino mbstate_t state; /* if in_shift: current shift state */
10886d7f5d3SJohn Marino bool next_done; /* true if mbui_avail has already filled the following */
10986d7f5d3SJohn Marino struct mbchar cur; /* the current character:
11086d7f5d3SJohn Marino const char *cur.ptr pointer to current character
11186d7f5d3SJohn Marino The following are only valid after mbui_avail.
11286d7f5d3SJohn Marino size_t cur.bytes number of bytes of current character
11386d7f5d3SJohn Marino bool cur.wc_valid true if wc is a valid wide character
11486d7f5d3SJohn Marino wchar_t cur.wc if wc_valid: the current character
11586d7f5d3SJohn Marino */
11686d7f5d3SJohn Marino };
11786d7f5d3SJohn Marino
11886d7f5d3SJohn Marino static inline void
mbuiter_multi_next(struct mbuiter_multi * iter)11986d7f5d3SJohn Marino mbuiter_multi_next (struct mbuiter_multi *iter)
12086d7f5d3SJohn Marino {
12186d7f5d3SJohn Marino if (iter->next_done)
12286d7f5d3SJohn Marino return;
12386d7f5d3SJohn Marino if (iter->in_shift)
12486d7f5d3SJohn Marino goto with_shift;
12586d7f5d3SJohn Marino /* Handle most ASCII characters quickly, without calling mbrtowc(). */
12686d7f5d3SJohn Marino if (is_basic (*iter->cur.ptr))
12786d7f5d3SJohn Marino {
12886d7f5d3SJohn Marino /* These characters are part of the basic character set. ISO C 99
12986d7f5d3SJohn Marino guarantees that their wide character code is identical to their
13086d7f5d3SJohn Marino char code. */
13186d7f5d3SJohn Marino iter->cur.bytes = 1;
13286d7f5d3SJohn Marino iter->cur.wc = *iter->cur.ptr;
13386d7f5d3SJohn Marino iter->cur.wc_valid = true;
13486d7f5d3SJohn Marino }
13586d7f5d3SJohn Marino else
13686d7f5d3SJohn Marino {
13786d7f5d3SJohn Marino assert (mbsinit (&iter->state));
13886d7f5d3SJohn Marino iter->in_shift = true;
13986d7f5d3SJohn Marino with_shift:
14086d7f5d3SJohn Marino iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
14186d7f5d3SJohn Marino strnlen1 (iter->cur.ptr, MB_CUR_MAX),
14286d7f5d3SJohn Marino &iter->state);
14386d7f5d3SJohn Marino if (iter->cur.bytes == (size_t) -1)
14486d7f5d3SJohn Marino {
14586d7f5d3SJohn Marino /* An invalid multibyte sequence was encountered. */
14686d7f5d3SJohn Marino iter->cur.bytes = 1;
14786d7f5d3SJohn Marino iter->cur.wc_valid = false;
14886d7f5d3SJohn Marino /* Whether to set iter->in_shift = false and reset iter->state
14986d7f5d3SJohn Marino or not is not very important; the string is bogus anyway. */
15086d7f5d3SJohn Marino }
15186d7f5d3SJohn Marino else if (iter->cur.bytes == (size_t) -2)
15286d7f5d3SJohn Marino {
15386d7f5d3SJohn Marino /* An incomplete multibyte character at the end. */
15486d7f5d3SJohn Marino iter->cur.bytes = strlen (iter->cur.ptr);
15586d7f5d3SJohn Marino iter->cur.wc_valid = false;
15686d7f5d3SJohn Marino /* Whether to set iter->in_shift = false and reset iter->state
15786d7f5d3SJohn Marino or not is not important; the string end is reached anyway. */
15886d7f5d3SJohn Marino }
15986d7f5d3SJohn Marino else
16086d7f5d3SJohn Marino {
16186d7f5d3SJohn Marino if (iter->cur.bytes == 0)
16286d7f5d3SJohn Marino {
16386d7f5d3SJohn Marino /* A null wide character was encountered. */
16486d7f5d3SJohn Marino iter->cur.bytes = 1;
16586d7f5d3SJohn Marino assert (*iter->cur.ptr == '\0');
16686d7f5d3SJohn Marino assert (iter->cur.wc == 0);
16786d7f5d3SJohn Marino }
16886d7f5d3SJohn Marino iter->cur.wc_valid = true;
16986d7f5d3SJohn Marino
17086d7f5d3SJohn Marino /* When in the initial state, we can go back treating ASCII
17186d7f5d3SJohn Marino characters more quickly. */
17286d7f5d3SJohn Marino if (mbsinit (&iter->state))
17386d7f5d3SJohn Marino iter->in_shift = false;
17486d7f5d3SJohn Marino }
17586d7f5d3SJohn Marino }
17686d7f5d3SJohn Marino iter->next_done = true;
17786d7f5d3SJohn Marino }
17886d7f5d3SJohn Marino
17986d7f5d3SJohn Marino static inline void
mbuiter_multi_reloc(struct mbuiter_multi * iter,ptrdiff_t ptrdiff)18086d7f5d3SJohn Marino mbuiter_multi_reloc (struct mbuiter_multi *iter, ptrdiff_t ptrdiff)
18186d7f5d3SJohn Marino {
18286d7f5d3SJohn Marino iter->cur.ptr += ptrdiff;
18386d7f5d3SJohn Marino }
18486d7f5d3SJohn Marino
/* Iteration macros.
   All of them take the iterator itself (not a pointer to it) and may
   evaluate ITER more than once, so ITER must be an lvalue expression
   without side effects.  */

/* The iterator type, usable for variable declarations.  */
typedef struct mbuiter_multi mbui_iterator_t;

/* Start iterating at STARTPTR: initial conversion state, not in a shift
   sequence, and no character decoded yet.  */
#define mbui_init(iter, startptr) \
  ((iter).cur.ptr = (startptr), \
   (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
   (iter).next_done = false)

/* Decode the character at the current position if that has not been done
   yet, then yield the negation of mb_isnul on it, i.e. true as long as the
   end of the string has not been reached.  */
#define mbui_avail(iter) \
  (mbuiter_multi_next (&(iter)), !mb_isnul ((iter).cur))

/* Step over the current character.  It uses cur.bytes, which is only valid
   after mbui_avail, and marks the next character as not yet decoded.  */
#define mbui_advance(iter) \
  ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)

/* Access to the current character.  */
#define mbui_cur(iter) (iter).cur
#define mbui_cur_ptr(iter) (iter).cur.ptr

/* Relocation.  Both arguments are parenthesized, like in the other macros,
   so that any expression can be passed safely.  */
#define mbui_reloc(iter, ptrdiff) mbuiter_multi_reloc (&(iter), (ptrdiff))
20286d7f5d3SJohn Marino
20386d7f5d3SJohn Marino #endif /* _MBUITER_H */
204