xref: /dflybsd-src/contrib/cvs-1.12/lib/mbuiter.h (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
/* Iterating through multibyte strings: macros for multi-byte encodings.
   Copyright (C) 2001, 2005 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */

/* Written by Bruno Haible <bruno@clisp.org>.  */
1986d7f5d3SJohn Marino 
/* The macros in this file implement forward iteration through a
   multi-byte string, without knowing its length a-priori.

   With these macros, an iteration loop that looks like

      char *iter;
      for (iter = buf; *iter != '\0'; iter++)
        {
          do_something (*iter);
        }

   becomes

      mbui_iterator_t iter;
      for (mbui_init (iter, buf); mbui_avail (iter); mbui_advance (iter))
        {
          do_something (mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
        }

   The benefit of these macros over plain use of mbrtowc is:
   - Handling of invalid multibyte sequences is possible without
     making the code more complicated, while still preserving the
     invalid multibyte sequences.

   Compared to mbiter.h, the macros here don't need to know the string's
   length a-priori.  The downside is that at each step, the look-ahead
   that guards against overrunning the terminating '\0' is more expensive.
   The mbui_* macros are therefore suitable when there is a high probability
   that only the first few multibyte characters need to be inspected.
   Whereas the mbi_* macros are better if usually the iteration runs
   through the entire string.

   mbui_iterator_t
     is a type usable for variable declarations.

   mbui_init (iter, startptr)
     initializes the iterator, starting at startptr.

   mbui_avail (iter)
     returns true if there are more multibyte characters available before
     the end of string is reached. In this case, mbui_cur (iter) is
     initialized to the next multibyte character.

   mbui_advance (iter)
     advances the iterator by one multibyte character.

   mbui_cur (iter)
     returns the current multibyte character, of type mbchar_t.  All the
     macros defined in mbchar.h can be used on it.

   mbui_cur_ptr (iter)
     returns a pointer to the beginning of the current multibyte character.

   mbui_reloc (iter, ptrdiff)
     relocates iterator when the string is moved by ptrdiff bytes.

   Here are the function prototypes of the macros.

   extern void		mbui_init (mbui_iterator_t iter, const char *startptr);
   extern bool		mbui_avail (mbui_iterator_t iter);
   extern void		mbui_advance (mbui_iterator_t iter);
   extern mbchar_t	mbui_cur (mbui_iterator_t iter);
   extern const char *	mbui_cur_ptr (mbui_iterator_t iter);
   extern void		mbui_reloc (mbui_iterator_t iter, ptrdiff_t ptrdiff);
 */
8586d7f5d3SJohn Marino 
8686d7f5d3SJohn Marino #ifndef _MBUITER_H
8786d7f5d3SJohn Marino #define _MBUITER_H 1
8886d7f5d3SJohn Marino 
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
   <wchar.h>.
   BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
   <wchar.h>.  */
#include <stdio.h>
#include <time.h>
#include <wchar.h>

#include "mbchar.h"
#include "strnlen1.h"
10386d7f5d3SJohn Marino 
10486d7f5d3SJohn Marino struct mbuiter_multi
10586d7f5d3SJohn Marino {
10686d7f5d3SJohn Marino   bool in_shift;	/* true if next byte may not be interpreted as ASCII */
10786d7f5d3SJohn Marino   mbstate_t state;	/* if in_shift: current shift state */
10886d7f5d3SJohn Marino   bool next_done;	/* true if mbui_avail has already filled the following */
10986d7f5d3SJohn Marino   struct mbchar cur;	/* the current character:
11086d7f5d3SJohn Marino 	const char *cur.ptr		pointer to current character
11186d7f5d3SJohn Marino 	The following are only valid after mbui_avail.
11286d7f5d3SJohn Marino 	size_t cur.bytes		number of bytes of current character
11386d7f5d3SJohn Marino 	bool cur.wc_valid		true if wc is a valid wide character
11486d7f5d3SJohn Marino 	wchar_t cur.wc			if wc_valid: the current character
11586d7f5d3SJohn Marino 	*/
11686d7f5d3SJohn Marino };
11786d7f5d3SJohn Marino 
11886d7f5d3SJohn Marino static inline void
mbuiter_multi_next(struct mbuiter_multi * iter)11986d7f5d3SJohn Marino mbuiter_multi_next (struct mbuiter_multi *iter)
12086d7f5d3SJohn Marino {
12186d7f5d3SJohn Marino   if (iter->next_done)
12286d7f5d3SJohn Marino     return;
12386d7f5d3SJohn Marino   if (iter->in_shift)
12486d7f5d3SJohn Marino     goto with_shift;
12586d7f5d3SJohn Marino   /* Handle most ASCII characters quickly, without calling mbrtowc().  */
12686d7f5d3SJohn Marino   if (is_basic (*iter->cur.ptr))
12786d7f5d3SJohn Marino     {
12886d7f5d3SJohn Marino       /* These characters are part of the basic character set.  ISO C 99
12986d7f5d3SJohn Marino 	 guarantees that their wide character code is identical to their
13086d7f5d3SJohn Marino 	 char code.  */
13186d7f5d3SJohn Marino       iter->cur.bytes = 1;
13286d7f5d3SJohn Marino       iter->cur.wc = *iter->cur.ptr;
13386d7f5d3SJohn Marino       iter->cur.wc_valid = true;
13486d7f5d3SJohn Marino     }
13586d7f5d3SJohn Marino   else
13686d7f5d3SJohn Marino     {
13786d7f5d3SJohn Marino       assert (mbsinit (&iter->state));
13886d7f5d3SJohn Marino       iter->in_shift = true;
13986d7f5d3SJohn Marino     with_shift:
14086d7f5d3SJohn Marino       iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
14186d7f5d3SJohn Marino 				 strnlen1 (iter->cur.ptr, MB_CUR_MAX),
14286d7f5d3SJohn Marino 				 &iter->state);
14386d7f5d3SJohn Marino       if (iter->cur.bytes == (size_t) -1)
14486d7f5d3SJohn Marino 	{
14586d7f5d3SJohn Marino 	  /* An invalid multibyte sequence was encountered.  */
14686d7f5d3SJohn Marino 	  iter->cur.bytes = 1;
14786d7f5d3SJohn Marino 	  iter->cur.wc_valid = false;
14886d7f5d3SJohn Marino 	  /* Whether to set iter->in_shift = false and reset iter->state
14986d7f5d3SJohn Marino 	     or not is not very important; the string is bogus anyway.  */
15086d7f5d3SJohn Marino 	}
15186d7f5d3SJohn Marino       else if (iter->cur.bytes == (size_t) -2)
15286d7f5d3SJohn Marino 	{
15386d7f5d3SJohn Marino 	  /* An incomplete multibyte character at the end.  */
15486d7f5d3SJohn Marino 	  iter->cur.bytes = strlen (iter->cur.ptr);
15586d7f5d3SJohn Marino 	  iter->cur.wc_valid = false;
15686d7f5d3SJohn Marino 	  /* Whether to set iter->in_shift = false and reset iter->state
15786d7f5d3SJohn Marino 	     or not is not important; the string end is reached anyway.  */
15886d7f5d3SJohn Marino 	}
15986d7f5d3SJohn Marino       else
16086d7f5d3SJohn Marino 	{
16186d7f5d3SJohn Marino 	  if (iter->cur.bytes == 0)
16286d7f5d3SJohn Marino 	    {
16386d7f5d3SJohn Marino 	      /* A null wide character was encountered.  */
16486d7f5d3SJohn Marino 	      iter->cur.bytes = 1;
16586d7f5d3SJohn Marino 	      assert (*iter->cur.ptr == '\0');
16686d7f5d3SJohn Marino 	      assert (iter->cur.wc == 0);
16786d7f5d3SJohn Marino 	    }
16886d7f5d3SJohn Marino 	  iter->cur.wc_valid = true;
16986d7f5d3SJohn Marino 
17086d7f5d3SJohn Marino 	  /* When in the initial state, we can go back treating ASCII
17186d7f5d3SJohn Marino 	     characters more quickly.  */
17286d7f5d3SJohn Marino 	  if (mbsinit (&iter->state))
17386d7f5d3SJohn Marino 	    iter->in_shift = false;
17486d7f5d3SJohn Marino 	}
17586d7f5d3SJohn Marino     }
17686d7f5d3SJohn Marino   iter->next_done = true;
17786d7f5d3SJohn Marino }
17886d7f5d3SJohn Marino 
17986d7f5d3SJohn Marino static inline void
mbuiter_multi_reloc(struct mbuiter_multi * iter,ptrdiff_t ptrdiff)18086d7f5d3SJohn Marino mbuiter_multi_reloc (struct mbuiter_multi *iter, ptrdiff_t ptrdiff)
18186d7f5d3SJohn Marino {
18286d7f5d3SJohn Marino   iter->cur.ptr += ptrdiff;
18386d7f5d3SJohn Marino }
18486d7f5d3SJohn Marino 
/* Iteration macros.
   Each macro evaluates its ITER argument more than once, so ITER must be
   an lvalue expression without side effects.  */
typedef struct mbuiter_multi mbui_iterator_t;
/* Point ITER at STARTPTR, with a zeroed conversion state and no character
   decoded yet.  */
#define mbui_init(iter, startptr) \
  ((iter).cur.ptr = (startptr), \
   (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
   (iter).next_done = false)
/* Decode the current character if needed, then yield true unless it is
   the terminating NUL.  */
#define mbui_avail(iter) \
  (mbuiter_multi_next (&(iter)), !mb_isnul ((iter).cur))
/* Step past the current character; it must have been decoded by
   mbui_avail first, since (iter).cur.bytes is used as the step.  */
#define mbui_advance(iter) \
  ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)
19586d7f5d3SJohn Marino 
/* Access to the current character.
   mbui_cur yields the `cur' member of the iterator; mbui_cur_ptr yields
   the pointer to its first byte.  */
#define mbui_cur(iter) (iter).cur
#define mbui_cur_ptr(iter) (iter).cur.ptr
19986d7f5d3SJohn Marino 
/* Relocation.
   Forwards to mbuiter_multi_reloc with the address of ITER.  The arguments
   are parenthesized so that any expression may be passed safely.  */
#define mbui_reloc(iter, ptrdiff) mbuiter_multi_reloc (&(iter), (ptrdiff))
20286d7f5d3SJohn Marino 
20386d7f5d3SJohn Marino #endif /* _MBUITER_H */
204