xref: /openbsd-src/gnu/lib/libreadline/mbutil.c (revision 9704b281e65e1189747652d0ba55eee892cff5f7)
/* mbutil.c -- readline multibyte character utility functions */

/* Copyright (C) 2001 Free Software Foundation, Inc.

   This file is part of the GNU Readline Library, a library for
   reading lines of text with interactive input and history editing.

   The GNU Readline Library is free software; you can redistribute it
   and/or modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; either version 2, or
   (at your option) any later version.

   The GNU Readline Library is distributed in the hope that it will be
   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   The GNU General Public License is often shipped with GNU software, and
   is generally kept in a file called COPYING or LICENSE.  If you do not
   have a copy of the license, write to the Free Software Foundation,
   59 Temple Place, Suite 330, Boston, MA 02111 USA. */
22*15b117eaSkettenis #define READLINE_LIBRARY
23*15b117eaSkettenis 
24*15b117eaSkettenis #if defined (HAVE_CONFIG_H)
25*15b117eaSkettenis #  include <config.h>
26*15b117eaSkettenis #endif
27*15b117eaSkettenis 
28*15b117eaSkettenis #include <sys/types.h>
29*15b117eaSkettenis #include <fcntl.h>
30*15b117eaSkettenis #include "posixjmp.h"
31*15b117eaSkettenis 
32*15b117eaSkettenis #if defined (HAVE_UNISTD_H)
33*15b117eaSkettenis #  include <unistd.h>	   /* for _POSIX_VERSION */
34*15b117eaSkettenis #endif /* HAVE_UNISTD_H */
35*15b117eaSkettenis 
36*15b117eaSkettenis #if defined (HAVE_STDLIB_H)
37*15b117eaSkettenis #  include <stdlib.h>
38*15b117eaSkettenis #else
39*15b117eaSkettenis #  include "ansi_stdlib.h"
40*15b117eaSkettenis #endif /* HAVE_STDLIB_H */
41*15b117eaSkettenis 
42*15b117eaSkettenis #include <stdio.h>
43*15b117eaSkettenis #include <ctype.h>
44*15b117eaSkettenis 
45*15b117eaSkettenis /* System-specific feature definitions and include files. */
46*15b117eaSkettenis #include "rldefs.h"
47*15b117eaSkettenis #include "rlmbutil.h"
48*15b117eaSkettenis 
49*15b117eaSkettenis #if defined (TIOCSTAT_IN_SYS_IOCTL)
50*15b117eaSkettenis #  include <sys/ioctl.h>
51*15b117eaSkettenis #endif /* TIOCSTAT_IN_SYS_IOCTL */
52*15b117eaSkettenis 
53*15b117eaSkettenis /* Some standard library routines. */
54*15b117eaSkettenis #include "readline.h"
55*15b117eaSkettenis 
56*15b117eaSkettenis #include "rlprivate.h"
57*15b117eaSkettenis #include "xmalloc.h"
58*15b117eaSkettenis 
/* Byte-orientation flag.  Declared here so it can be shared between the
   readline and history libraries.  It starts out as 0 when the library
   is built with HANDLE_MULTIBYTE and as 1 otherwise. */
#if defined (HANDLE_MULTIBYTE)
int rl_byte_oriented = 0;
#else
int rl_byte_oriented = 1;
#endif
66*15b117eaSkettenis 
/* **************************************************************** */
/*								    */
/*		Multibyte Character Utility Functions		    */
/*								    */
/* **************************************************************** */
72*15b117eaSkettenis 
73*15b117eaSkettenis #if defined(HANDLE_MULTIBYTE)
74*15b117eaSkettenis 
/* Return the byte offset in STRING reached by moving COUNT characters
   forward from byte offset SEED.

   A negative SEED is treated as 0, and a COUNT <= 0 returns SEED
   unchanged.  If SEED falls in the middle of a multibyte character,
   moving to the next character boundary uses up one of the COUNT steps.
   A byte that cannot be decoded is counted as one single-byte character.
   The scan never moves past the terminating null byte.

   When FIND_NON_ZERO is nonzero, zero-width characters do not count
   toward COUNT, and any zero-width characters immediately following the
   final position are skipped as well.

   If _rl_adjust_point rejects SEED (it returns -1 when SEED is larger
   than the length of STRING), nothing is scanned and SEED + 1 is
   returned. */
static int
_rl_find_next_mbchar_internal (string, seed, count, find_non_zero)
     char *string;
     int seed, count, find_non_zero;
{
  size_t tmp, len;
  mbstate_t ps;
  int point, adjust;
  wchar_t wc;

  memset (&ps, 0, sizeof (mbstate_t));
  if (seed < 0)
    seed = 0;
  if (count <= 0)
    return seed;

  adjust = _rl_adjust_point (string, seed, &ps);
  /* SEED is not a usable offset into STRING; do not read from there.
     Treat the position as a single byte. */
  if (adjust < 0)
    return seed + 1;

  point = seed + adjust;
  /* A positive adjustment means SEED did not point at the first byte of
     a character, so correcting the point already consumed one step. */
  if (adjust > 0)
    count--;

  while (count > 0)
    {
      /* Stop at the terminating null.  Calling mbrtowc with a length of
         zero would report an incomplete character and push POINT past
         the end of the string. */
      len = strlen (string + point);
      if (len == 0)
        break;

      tmp = mbrtowc (&wc, string + point, len, &ps);
      if (tmp == (size_t)-1 || tmp == (size_t)-2)
        {
          /* Invalid bytes.  Assume one byte represents one character. */
          point++;
          count--;
          /* The conversion state is unusable after an error; reset it. */
          memset (&ps, 0, sizeof (mbstate_t));
        }
      else if (tmp == (size_t)0)
        break;                  /* found a null wide character */
      else
        {
          /* Valid character: always step over its bytes, but only count
             it when zero-width characters are not being ignored. */
          point += tmp;
          if (find_non_zero == 0 || wcwidth (wc) != 0)
            count--;
        }
    }

  if (find_non_zero)
    {
      /* Skip any zero-width characters that follow.  WC is examined only
         after a successful conversion, so a failed or empty conversion
         ends the scan instead of using a stale value. */
      for (;;)
        {
          len = strlen (string + point);
          if (len == 0)
            break;
          tmp = mbrtowc (&wc, string + point, len, &ps);
          if (tmp == (size_t)0 || tmp == (size_t)-1 || tmp == (size_t)-2)
            break;
          if (wcwidth (wc) != 0)
            break;
          point += tmp;
        }
    }

  return point;
}
140*15b117eaSkettenis 
/* Return the byte offset in STRING at which the character preceding byte
   offset SEED starts.  The string is scanned from its beginning, so the
   result is always a character boundary at or before SEED.

   A negative SEED returns 0; a SEED larger than the length of STRING
   returns that length.  A byte that cannot be decoded is treated as one
   single-byte character and is remembered as a possible result.

   When FIND_NON_ZERO is nonzero, zero-width characters are not
   remembered, so the result is the start of the closest preceding
   character whose width is not zero. */
static int
_rl_find_prev_mbchar_internal (string, seed, find_non_zero)
     char *string;
     int seed, find_non_zero;
{
  mbstate_t ps;
  int prev, point, length;
  size_t tmp;
  wchar_t wc;

  memset (&ps, 0, sizeof (mbstate_t));
  length = strlen (string);

  if (seed < 0)
    return 0;
  else if (length < seed)
    return length;

  prev = point = 0;
  while (point < seed)
    {
      tmp = mbrtowc (&wc, string + point, length - point, &ps);
      if (tmp == (size_t)-1 || tmp == (size_t)-2)
        {
          /* The bytes are invalid or too short to form a multibyte
             character, so assume the first byte represents a single
             character anyway. */
          tmp = 1;
          /* The conversion state is undefined after an error; clear it. */
          memset (&ps, 0, sizeof (mbstate_t));
          /* This byte is being counted as a character, so it must also
             be recorded as the most recent character start. */
          prev = point;
        }
      else if (tmp == 0)
        break;                  /* found a null wide character */
      else if (find_non_zero == 0 || wcwidth (wc) != 0)
        prev = point;

      point += tmp;
    }

  return prev;
}
191*15b117eaSkettenis 
/* Return the length in bytes of the multibyte character that starts at
   SRC, using and updating the conversion state PS (which may be null).

   Return values:
     > 0   number of bytes making up the character;
       0   SRC is the empty string, i.e. only the terminating null
           character is present;
      -1   the bytes at SRC do not form a valid multibyte character;
      -2   the bytes up to the end of SRC are an incomplete character.

   Whenever a value <= 0 is returned and PS is not null, *PS is reset to
   all zeros. */
int
_rl_get_char_len (src, ps)
     char *src;
     mbstate_t *ps;
{
  size_t tmp;

  /* The length handed to mbrlen below never includes the terminating
     null, so an empty string has to be recognized here; mbrlen with a
     length of zero would report an incomplete character instead. */
  if (*src == '\0')
    {
      if (ps)
        memset (ps, 0, sizeof (mbstate_t));
      return 0;
    }

  tmp = mbrlen ((const char *)src, strlen (src), ps);
  if (tmp == (size_t)(-2) || tmp == (size_t)(-1))
    {
      /* Too short or invalid: start over from a cleared conversion
         state. */
      if (ps)
        memset (ps, 0, sizeof (mbstate_t));
      return (tmp == (size_t)(-2)) ? -2 : -1;
    }

  return (int)tmp;
}
225*15b117eaSkettenis 
/* Compare the multibyte character at BUF1[POS1] with the one at
   BUF2[POS2].  PS1 and PS2 are the conversion states handed to
   _rl_get_char_len for the respective buffers.

   Return 1 if both characters have the same positive byte length and
   identical bytes.  Return 0 otherwise, including when the length of
   either character cannot be determined (_rl_get_char_len <= 0). */
int
_rl_compare_chars (buf1, pos1, ps1, buf2, pos2, ps2)
     char *buf1;
     int pos1;
     mbstate_t *ps1;
     char *buf2;
     int pos2;
     mbstate_t *ps2;
{
  int i, w1, w2;

  w1 = _rl_get_char_len (&buf1[pos1], ps1);
  if (w1 <= 0)
    return 0;

  /* The second buffer is examined only when the first character is
     usable. */
  w2 = _rl_get_char_len (&buf2[pos2], ps2);
  if (w2 <= 0 || w1 != w2)
    return 0;

  for (i = 0; i < w1; i++)
    if (buf1[pos1 + i] != buf2[pos2 + i])
      return 0;

  return 1;
}
251*15b117eaSkettenis 
/* Find the first character boundary in STRING at or after byte offset
   POINT by scanning from the start of the string, carrying the
   conversion state in PS (which may be null) as the scan proceeds.

   Returns the distance in bytes from POINT to that boundary
   (adjusted_point - POINT, always >= 0), or -1 if POINT is invalid
   (POINT < 0 or POINT greater than the length of STRING). */
int
_rl_adjust_point (string, point, ps)
     char *string;
     int point;
     mbstate_t *ps;
{
  size_t tmp;
  int length;
  int pos;

  length = strlen (string);
  if (point < 0 || length < point)
    return -1;

  pos = 0;
  while (pos < point)
    {
      tmp = mbrlen (string + pos, length - pos, ps);
      if (tmp == (size_t)-1 || tmp == (size_t)-2)
        {
          /* The bytes are invalid or too short to form a multibyte
             character, so assume the first byte represents a single
             character anyway. */
          pos++;
          /* The conversion state is undefined after an error; clear it. */
          if (ps)
            memset (ps, 0, sizeof (mbstate_t));
        }
      else if (tmp == 0)
        pos++;                  /* null character: step over one byte */
      else
        pos += tmp;
    }

  return (pos - point);
}
295*15b117eaSkettenis 
/* Return 1 if the LENGTH bytes of MBCHAR are found in STRING starting at
   byte offset SEED, and 0 otherwise.  END bounds the comparison: when
   fewer than LENGTH bytes lie between SEED and END the result is 0 and
   no bytes are examined. */
int
_rl_is_mbchar_matched (string, seed, end, mbchar, length)
     char *string;
     int seed, end;
     char *mbchar;
     int length;
{
  int i;
  char *candidate;

  if ((end - seed) < length)
    return 0;

  candidate = string + seed;
  for (i = 0; i < length; i++)
    if (candidate[i] != mbchar[i])
      return 0;

  return 1;
}
313*15b117eaSkettenis #endif /* HANDLE_MULTIBYTE */
314*15b117eaSkettenis 
/* Return the byte offset in STRING of the character COUNT characters
   after byte offset SEED.  FLAGS is handed on unchanged as the
   find-non-zero argument of _rl_find_next_mbchar_internal; if it is
   MB_FIND_NONZERO, we look for non-zero-width multibyte characters.
   Without HANDLE_MULTIBYTE every byte is one character and the result
   is simply SEED + COUNT. */
#undef _rl_find_next_mbchar
int
_rl_find_next_mbchar (string, seed, count, flags)
     char *string;
     int seed, count, flags;
{
#if defined (HANDLE_MULTIBYTE)
  return _rl_find_next_mbchar_internal (string, seed, count, flags);
#else
  return (seed + count);
#endif
}
330*15b117eaSkettenis 
/* Return the byte offset in STRING at which the character before byte
   offset SEED starts.  The returned offset is <= SEED.  FLAGS is handed
   on unchanged as the find-non-zero argument of
   _rl_find_prev_mbchar_internal; if it is MB_FIND_NONZERO, we look for
   non-zero-width multibyte characters.  Without HANDLE_MULTIBYTE the
   result is SEED - 1, except that a SEED of 0 is returned as is. */
#undef _rl_find_prev_mbchar
int
_rl_find_prev_mbchar (string, seed, flags)
     char *string;
     int seed, flags;
{
#if defined (HANDLE_MULTIBYTE)
  return _rl_find_prev_mbchar_internal (string, seed, flags);
#else
  return ((seed == 0) ? seed : seed - 1);
#endif
}
346